├── __init__.py ├── models ├── __init__.py ├── classifiers.py ├── temporal_aggregation.py └── video_classification.py ├── datasets ├── __init__.py ├── data.py └── reader_fns.py ├── external └── __init__.py ├── loss_fn ├── __init__.py ├── mse.py ├── multidim_xentropy.py └── simclr_infonce.py ├── common ├── __init__.py ├── cluster.py ├── sampler.py ├── scheduler.py ├── utils.py └── log.py ├── func ├── __init__.py └── train_eval_ops.py ├── .style.yapf ├── conf ├── model │ ├── temporal_aggregator_after_future_pred │ ├── classifier │ │ ├── linear.yaml │ │ └── mlp.yaml │ ├── backbone │ │ ├── r3d_18.yaml │ │ ├── avt_b.yaml │ │ ├── bn_inception.yaml │ │ ├── avt_b_in21k.yaml │ │ ├── r2plus1d_34.yaml │ │ ├── identity.yaml │ │ └── r2plus1d_18.yaml │ ├── future_predictor │ │ ├── avth.yaml │ │ ├── identity.yaml │ │ └── mlp.yaml │ └── temporal_aggregator │ │ ├── mean.yaml │ │ ├── identity.yaml │ │ ├── transformer.yaml │ │ └── rulstm.yaml ├── opt │ ├── optimizer │ │ ├── adam.yaml │ │ ├── adamW.yaml │ │ ├── adafactor.yaml │ │ └── sgd.yaml │ └── scheduler │ │ ├── cosine.yaml │ │ ├── reduce_lr_on_plateau.yaml │ │ └── warmup_multi_step.yaml ├── train_eval_op │ ├── cls_loss_acc_fn │ │ ├── no.yaml │ │ └── basic.yaml │ ├── basic.yaml │ ├── reg_criterion │ │ ├── mse.yaml │ │ └── simclr_infonce.yaml │ └── pred_future_feat.yaml ├── dataset │ ├── dundee50salads │ │ ├── annot_reader_fn │ │ │ ├── orig.yaml │ │ │ └── abu_farha.yaml │ │ ├── anticipation_train.yaml │ │ ├── anticipation_val.yaml │ │ └── common.yaml │ ├── egtea │ │ ├── common.yaml │ │ ├── anticipation_train.yaml │ │ └── anticipation_val.yaml │ ├── epic_kitchens │ │ ├── common.yaml │ │ ├── anticipation_test_s1.yaml │ │ ├── anticipation_test_s2.yaml │ │ ├── anticipation_train.yaml │ │ ├── anticipation_val.yaml │ │ └── anticipation_train_minus_val.yaml │ └── epic_kitchens100 │ │ ├── common.yaml │ │ ├── anticipation_test.yaml │ │ ├── anticipation_train.yaml │ │ ├── anticipation_val.yaml │ │ └── anticipation_train+val.yaml ├── data │ └── default.yaml └── config.yaml ├── .yapfignore ├── .gitignore ├── .gitmodules ├── sample_scripts └── resize_epic_256px.sh ├── docs ├── DATASETS.md └── MODELS.md ├── CONTRIBUTING.md ├── train_net.py ├── expts ├── 05_ek100_rustm_test_testonly.txt ├── 08_ek55_avt_tsn.txt ├── 10_ek55_avt_ig65m.txt ├── 10_ek55_avt_ig65m_forAR.txt ├── 08_ek55_avt_tsn_forAR.txt ├── 02_ek100_avt_tsn.txt ├── 03_ek100_avt_tsn_obj.txt ├── 02_ek100_avt_tsn_test_trainval.txt ├── 03_ek100_avt_tsn_obj_test_trainval.txt ├── 04_ek100_avt_ig65m_test_trainval.txt ├── 11_egtea_avt_tsn.txt ├── 02_ek100_avt_tsn_test_testonly.txt ├── 06_ek100_avt_tsnflow.txt ├── 06_ek100_avt_tsnflow_test_trainval.txt ├── 04_ek100_avt_ig65m.txt ├── 13_50s_avt.txt ├── 09_ek55_avt.txt ├── 09_ek55_avt_forAR.txt ├── 06_ek100_avt_tsnflow_test_testonly.txt ├── 04_ek100_avt_ig65m_test_testonly.txt ├── 07_ek100_avt_longer.txt ├── 07_ek100_avt_longer_test_testonly.txt ├── 07_ek100_avt_longer_test_trainval.txt ├── 01_ek100_avt.txt ├── 01_ek100_avt_test_testonly.txt ├── 01_ek100_avt_test_trainval.txt └── 12_egtea_avt.txt ├── CODE_OF_CONDUCT.md ├── env.yaml ├── README.md └── LICENSE /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /external/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /loss_fn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 1 | from .log import * 2 | -------------------------------------------------------------------------------- /func/__init__.py: -------------------------------------------------------------------------------- 1 | from . import train 2 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = google 3 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator_after_future_pred: -------------------------------------------------------------------------------- 1 | temporal_aggregator/ -------------------------------------------------------------------------------- /conf/opt/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.optim.Adam 4 | -------------------------------------------------------------------------------- /conf/opt/optimizer/adamW.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.optim.AdamW 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/cls_loss_acc_fn/no.yaml: -------------------------------------------------------------------------------- 1 | _target_: func.train_eval_ops.NoLossAccuracy 2 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | src/python/phyre/interface/* 2 | src/python/build/* 3 | src/viz/mpde_modules/* 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/basic.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: func.train_eval_ops.Basic 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/reg_criterion/mse.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.nn.MSELoss 4 | -------------------------------------------------------------------------------- /conf/model/classifier/linear.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.nn.Linear 4 | bias: true 5 | -------------------------------------------------------------------------------- /conf/model/backbone/r3d_18.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torchvision.models.video.resnet.r3d_18 4 | -------------------------------------------------------------------------------- 
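
The conf/ tree above is organized as Hydra config groups: each YAML names a class or function in `_target_` plus its default keyword arguments. As a minimal illustrative sketch (not code from this repo), assuming the training code relies on Hydra's standard `instantiate` utility, a config in the style of `conf/opt/optimizer/adam.yaml` or `conf/model/classifier/linear.yaml` can be turned into an object like this, with the remaining arguments (model parameters, learning rate, feature sizes) supplied at call time:

```python
# Sketch only: shows how `_target_`-style configs such as the ones above are
# commonly instantiated with Hydra. The extra kwargs passed here (params, lr,
# in_features, out_features) are assumptions about what the training code
# would supply at runtime, not values taken from the repo.
import torch.nn as nn
from hydra.utils import instantiate
from omegaconf import OmegaConf

opt_cfg = OmegaConf.create({'_target_': 'torch.optim.Adam'})        # adam.yaml
clf_cfg = OmegaConf.create({'_target_': 'torch.nn.Linear', 'bias': True})  # linear.yaml

classifier = instantiate(clf_cfg, in_features=2048, out_features=100)
optimizer = instantiate(opt_cfg, params=classifier.parameters(), lr=1e-4)
print(type(classifier), type(optimizer))
```
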
/conf/model/future_predictor/avth.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.future_prediction.AVTh 4 | -------------------------------------------------------------------------------- /conf/model/future_predictor/identity.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.future_prediction.Identity 4 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/mean.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.Mean 4 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/identity.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.Identity 4 | -------------------------------------------------------------------------------- /conf/opt/optimizer/adafactor.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: transformers.Adafactor 4 | warmup_init: false 5 | -------------------------------------------------------------------------------- /conf/opt/optimizer/sgd.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: torch.optim.SGD 4 | momentum: 0.9 5 | nesterov: false 6 | -------------------------------------------------------------------------------- /conf/model/classifier/mlp.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.classifiers.MLP 4 | nlayers: 2 5 | bias: true 6 | -------------------------------------------------------------------------------- /conf/model/future_predictor/mlp.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.future_prediction.MLP 4 | num_layers: 2 5 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/transformer.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.Transformer 4 | -------------------------------------------------------------------------------- /conf/model/backbone/avt_b.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: models.video_classification.TIMMModel 3 | model_type: vit_base_patch16_224 4 | -------------------------------------------------------------------------------- /conf/model/backbone/bn_inception.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: models.video_classification.BNInceptionVideo 3 | pretrained: null 4 | -------------------------------------------------------------------------------- /conf/train_eval_op/cls_loss_acc_fn/basic.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: func.train_eval_ops.BasicLossAccuracy 4 | balance_classes: false 5 | -------------------------------------------------------------------------------- 
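
`conf/model/classifier/mlp.yaml` above points at `models.classifiers.MLP` (its definition appears later in this dump under models/classifiers.py): a stack of Linear+ReLU layers followed by a final Linear projection. A minimal usage sketch, with illustrative feature sizes rather than values taken from the repo's configs:

```python
# Usage sketch for models.classifiers.MLP (defined under models/classifiers.py
# further down). nlayers=2 and bias=True mirror conf/model/classifier/mlp.yaml;
# the input/output dimensions below are illustrative only.
import torch
from models.classifiers import MLP

clf = MLP(in_features=2048, out_features=1000, nlayers=2, bias=True)
logits = clf(torch.randn(8, 2048))  # -> shape (8, 1000)
```
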
/conf/model/backbone/avt_b_in21k.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: models.video_classification.TIMMModel 3 | model_type: vit_base_patch16_224_in21k 4 | -------------------------------------------------------------------------------- /conf/model/backbone/r2plus1d_34.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: vmz.models.r2plus1d_34 3 | pretraining: "" 4 | use_pool1: false 5 | num_classes: null # Will be set in the code based on dataset 6 | -------------------------------------------------------------------------------- /conf/opt/scheduler/cosine.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: common.scheduler.CosineLR 4 | num_epochs: ${minus:${train.num_epochs},${opt.warmup.num_epochs}} 5 | eta_min: 0.0 # Min LR (default) 6 | -------------------------------------------------------------------------------- /conf/model/backbone/identity.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | # This backbone will just pass the input as output, useful when the 3 | # input is not video but pretrained features 4 | _target_: torch.nn.Identity 5 | -------------------------------------------------------------------------------- /conf/model/backbone/r2plus1d_18.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | _target_: torchvision.models.video.r2plus1d_18 3 | pretrained: false 4 | progress: False 5 | num_classes: null # Will be set in the code based on dataset 6 | -------------------------------------------------------------------------------- /conf/train_eval_op/reg_criterion/simclr_infonce.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: loss_fn.simclr_infonce.DistributedSimclrInfoNCELoss 4 | temperature: 0.1 5 | target_to_output_loss: true 6 | mil_type: sum 7 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/annot_reader_fn/orig.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: datasets.breakfast_50salads.read_orig_50salads_annotations 4 | annots_dir: ${dataset.dundee50salads.common.annots_dir} 5 | timestamps_dir: ${dataset.dundee50salads.common.timestamps_dir} 6 | -------------------------------------------------------------------------------- /conf/dataset/egtea/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | version: -1 4 | # RULSTM feats dirs 5 | rulstm_feats_dir: ${cwd}/DATA/external/rulstm/RULSTM/egtea/ 6 | rulstm_annot_dir: ${cwd}/external/rulstm/RULSTM/data/egtea/ 7 | label_type: action 8 | tau_a: 1.0 9 | tau_o: 2.5 10 | split: 1 11 | modality: rgb 12 | -------------------------------------------------------------------------------- /conf/opt/scheduler/reduce_lr_on_plateau.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: common.scheduler.ReduceLROnPlateau 4 | mode: "max" # Since I pass in the validation accuracy (top 1) 5 | patience: 10 6 | threshold: 1 # If the val acc stuck within 1% 7 | threshold_mode: "abs" 8 | cooldown: 3 9 | min_lr: 0.0000001 
10 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/annot_reader_fn/abu_farha.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: datasets.breakfast_50salads.read_abu_farha_annotations 4 | annots_dir: ${dataset.dundee50salads.common.annots_dir_abu_farha} 5 | bundle_entry_to_vname_fn: 6 | _target_: datasets.breakfast_50salads.bundle_entry_to_video_fname_50salads 7 | -------------------------------------------------------------------------------- /conf/opt/scheduler/warmup_multi_step.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: common.scheduler.WarmupMultiStepLR 4 | # Decrease lr on milestones 5 | milestone_epochs: [20, 30, 40] 6 | # Decrease LR by this factor 7 | gamma: 0.1 8 | warmup_factor: 0.00001 9 | warmup_epochs: 10 10 | warmup_method: "linear" 11 | last_epoch: -1 12 | -------------------------------------------------------------------------------- /conf/train_eval_op/pred_future_feat.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: func.train_eval_ops.PredFutureFeat 4 | reg_criterion: ${train_eval_op.pred_future_feat.reg_criterion} 5 | future_target: temp_agg_projected 6 | incur_loss_style: separately 7 | combine_future_losses: 8 | _target_: torch.min 9 | cumulative_future: false 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General exluded files 2 | *.swp 3 | *.o 4 | __pycache__ 5 | .DS_Store 6 | .ropeproject 7 | .nfs* 8 | .vscode 9 | *.egg-info/ 10 | *.pyc 11 | *.mypy_cache 12 | 13 | # Data directories 14 | DATA 15 | OUTPUTS 16 | 17 | # python notebook related 18 | */.cph_* 19 | *.ipynb_checkpoints 20 | 21 | # other codebases 22 | external/rulstm 23 | -------------------------------------------------------------------------------- /conf/model/temporal_aggregator/rulstm.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | _target_: models.temporal_aggregation.RULSTMAggregation 4 | intermediate_featdim: ${model.intermediate_featdim} 5 | dropout: ${model.dropout} 6 | # The following may be important to set if using with pre-trained model that 7 | # was trained for diff number of unrollings 8 | num_pad_feats: 0 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/epic-kitchens-100-annotations"] 2 | path = external/epic-kitchens-100-annotations 3 | url = git@github.com:epic-kitchens/epic-kitchens-100-annotations.git 4 | [submodule "external/epic-kitchens-55-annotations"] 5 | path = external/epic-kitchens-55-annotations 6 | url = git@github.com:epic-kitchens/epic-kitchens-55-annotations.git 7 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | version: 0.1 4 | data_dir: ${cwd}/DATA/videos/EpicKitchens/videos_ht256px 5 | annot_dir: ${cwd}/external/epic-kitchens-55-annotations/ 6 | rulstm_annot_dir: ${cwd}/external/rulstm/RULSTM/data/ek55/ 7 | 
rulstm_feats_dir: ${cwd}/DATA/external/rulstm/RULSTM/data_full/ 8 | label_type: action 9 | tau_a: 1.0 10 | tau_o: 2.5 11 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | version: 0.2 4 | # I manually removed the video/ directory in between by moving 5 | # videos out of the videos/ dir 6 | data_dir_extension: ${cwd}/DATA/videos/EpicKitchens100/videos_extension_ht256px 7 | # RULSTM feats dirs 8 | rulstm_feats_dir: ${cwd}/DATA/external/rulstm/RULSTM/ek100_data_full/ 9 | annot_dir: ${cwd}/external/epic-kitchens-100-annotations/ 10 | rulstm_annot_dir: ${cwd}/external/rulstm/RULSTM/data/ek100/ 11 | label_type: action 12 | tau_a: 1.0 13 | tau_o: 2.5 14 | -------------------------------------------------------------------------------- /loss_fn/mse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """Variants of MSE loss.""" 4 | import torch.nn as nn 5 | 6 | 7 | class NormedMSE(nn.MSELoss): 8 | def forward(self, inp, tgt, *args, **kwargs): 9 | """ 10 | Args: 11 | inp: (*, C) 12 | tgt: (*, C) 13 | Will normalize the input before the loss 14 | """ 15 | inp = nn.functional.normalize(inp, dim=-1, p=2) 16 | tgt = nn.functional.normalize(tgt, dim=-1, p=2) 17 | return super().forward(inp, tgt, *args, **kwargs) 18 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.breakfast_50salads.Breakfast50Salads 4 | which: 50Salads 5 | root: ${dataset.dundee50salads.common.videos_dir} 6 | splits_dir: ${dataset.dundee50salads.common.splits_dir} 7 | classes_fpath: ${dataset.dundee50salads.common.classes_fpath} 8 | is_train: true 9 | fold: ${dataset.dundee50salads.common.fold} 10 | sample_strategy: last_clip 11 | annot_reader_fn: ${dataset.dundee50salads.annot_reader_fn} 12 | conv_to_anticipate_fn: 13 | _target_: datasets.base_video_dataset.convert_to_anticipation 14 | tau_a: 1.0 15 | tau_o: 2.5 16 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.breakfast_50salads.Breakfast50Salads 4 | which: 50Salads 5 | root: ${dataset.dundee50salads.common.videos_dir} 6 | splits_dir: ${dataset.dundee50salads.common.splits_dir} 7 | classes_fpath: ${dataset.dundee50salads.common.classes_fpath} 8 | is_train: false 9 | fold: ${dataset.dundee50salads.common.fold} 10 | sample_strategy: last_clip 11 | annot_reader_fn: ${dataset.dundee50salads.annot_reader_fn} 12 | conv_to_anticipate_fn: 13 | _target_: datasets.base_video_dataset.convert_to_anticipation 14 | tau_a: 1.0 15 | tau_o: 2.5 16 | -------------------------------------------------------------------------------- /conf/dataset/dundee50salads/common.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | data_dir: ${cwd}/DATA/videos/50Salads/ 4 | splits_dir: ${cwd}/external/breakfast_50salad_anticipation_annotations/cvpr18_data/50s_splits/ 5 | annots_dir: 
${dataset.dundee50salads.common.data_dir}/activityAnnotations/ 6 | videos_dir: ${dataset.dundee50salads.common.data_dir}/rgb/ 7 | timestamps_dir: ${dataset.dundee50salads.common.data_dir}/timestamps/ 8 | classes_fpath: ${cwd}/external/breakfast_50salad_anticipation_annotations/annotations/50salads/mapping.txt 9 | annots_dir_abu_farha: ${cwd}/external/breakfast_50salad_anticipation_annotations/annotations/50salads/groundTruth/ 10 | fold: 1 11 | -------------------------------------------------------------------------------- /models/classifiers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class MLP(nn.Module): 7 | def __init__(self, in_features, out_features, nlayers, **kwargs): 8 | super().__init__() 9 | layers = [[nn.Linear(in_features, in_features, **kwargs), 10 | nn.ReLU()] for _ in range(nlayers - 1)] 11 | # flatten out the pairs 12 | layers = [item for sublist in layers for item in sublist] 13 | layers.append(nn.Linear(in_features, out_features)) 14 | self.cls = nn.Sequential(*layers) 15 | 16 | def forward(self, inp): 17 | return self.cls(inp) 18 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_test_s1.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/test 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_test_s1_timestamps.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "center_clip" 10 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 11 | conv_to_anticipate_fn: 12 | _target_: datasets.base_video_dataset.convert_to_anticipation 13 | tau_a: ${dataset.epic_kitchens.common.tau_a} 14 | tau_o: ${dataset.epic_kitchens.common.tau_o} 15 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_test_s2.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/test 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_test_s2_timestamps.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "center_clip" 10 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 11 | conv_to_anticipate_fn: 12 | _target_: datasets.base_video_dataset.convert_to_anticipation 13 | tau_a: ${dataset.epic_kitchens.common.tau_a} 14 | tau_o: ${dataset.epic_kitchens.common.tau_o} 15 | -------------------------------------------------------------------------------- /sample_scripts/resize_epic_256px.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | #!/bin/bash 4 | indir="/path/to/orig/videos/" 5 | outdir="/path/to/resulting/videos_ht256px/" 6 | 7 | cd $indir 8 | videos=$(find . 
-iname *.MP4) 9 | 10 | num_procs=32 # Run this many in parallel at max 11 | num_jobs="\j" # The prompt escape for number of jobs currently running 12 | for video in $videos; do 13 | while (( ${num_jobs@P} >= num_procs )); do 14 | wait -n 15 | done 16 | mkdir -p $(dirname ${outdir}/${video}) 17 | # from https://superuser.com/a/624564 18 | ffmpeg -y -i ${indir}/${video} -filter:v scale="trunc(oh*a/2)*2:256" -c:a copy ${outdir}/${video} & 19 | echo 'Converted ' ${video} 20 | done 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/train 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_train_action_labels.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "random_clip" 10 | # https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/data/training_videos.csv 11 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 12 | conv_to_anticipate_fn: 13 | _target_: datasets.base_video_dataset.convert_to_anticipation 14 | tau_a: ${dataset.epic_kitchens.common.tau_a} 15 | tau_o: ${dataset.epic_kitchens.common.tau_o} 16 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/train 5 | annotation_path: 6 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_train_action_labels.pkl 7 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 8 | label_type: ${dataset.epic_kitchens.common.label_type} 9 | sample_strategy: "center_clip" 10 | # https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/data/validation_videos.csv 11 | only_keep_videos: ${dataset.epic_kitchens.common.rulstm_annot_dir}/validation_videos.csv 12 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 13 | conv_to_anticipate_fn: 14 | _target_: datasets.base_video_dataset.convert_to_anticipation 15 | tau_a: ${dataset.epic_kitchens.common.tau_a} 16 | tau_o: ${dataset.epic_kitchens.common.tau_o} 17 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_test.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens100.common.data_dir_extension} 7 | annotation_path: 8 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_test_timestamps.pkl 9 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 10 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 
11 | label_type: ${dataset.epic_kitchens100.common.label_type} 12 | sample_strategy: "center_clip" 13 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 14 | conv_to_anticipate_fn: 15 | _target_: datasets.base_video_dataset.convert_to_anticipation 16 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 17 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 18 | -------------------------------------------------------------------------------- /conf/dataset/egtea/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.egtea.common.version} 5 | root: 6 | - ${cwd}/DATA/videos/EGTEA/101020/videos/ 7 | annotation_path: 8 | - ${dataset.egtea.common.rulstm_annot_dir}/training${dataset.egtea.common.split}.csv 9 | annotation_dir: ${dataset.egtea.common.rulstm_annot_dir} 10 | label_type: ${dataset.egtea.common.label_type} 11 | sample_strategy: random_clip 12 | action_labels_fpath: ${dataset.egtea.common.rulstm_annot_dir}/actions.csv 13 | conv_to_anticipate_fn: 14 | _target_: datasets.base_video_dataset.convert_to_anticipation 15 | tau_a: ${dataset.egtea.common.tau_a} 16 | tau_o: ${dataset.egtea.common.tau_o} 17 | reader_fn: # Setting it since for EGTEA I mostly use RULSTM features 18 | _target_: datasets.epic_kitchens.EpicRULSTMFeatsReader 19 | lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_${dataset.egtea.common.modality}_model_best_fcfull_hd/ 20 | read_type: normal 21 | -------------------------------------------------------------------------------- /conf/dataset/egtea/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.egtea.common.version} 5 | root: 6 | - ${cwd}/DATA/videos/EGTEA/101020/videos/ 7 | annotation_path: 8 | - ${dataset.egtea.common.rulstm_annot_dir}/validation${dataset.egtea.common.split}.csv 9 | annotation_dir: ${dataset.egtea.common.rulstm_annot_dir} 10 | label_type: ${dataset.egtea.common.label_type} 11 | sample_strategy: center_clip 12 | action_labels_fpath: ${dataset.egtea.common.rulstm_annot_dir}/actions.csv 13 | conv_to_anticipate_fn: 14 | _target_: datasets.base_video_dataset.convert_to_anticipation 15 | tau_a: ${dataset.egtea.common.tau_a} 16 | tau_o: ${dataset.egtea.common.tau_o} 17 | reader_fn: # Setting it since for EGTEA I mostly use RULSTM features 18 | _target_: datasets.epic_kitchens.EpicRULSTMFeatsReader 19 | lmdb_path: ${dataset.egtea.common.rulstm_feats_dir}/TSN-C_3_egtea_action_CE_s${dataset.egtea.common.split}_${dataset.egtea.common.modality}_model_best_fcfull_hd/ 20 | read_type: normal 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_train.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens.common.data_dir}/train # the old one 7 | # Need test too since some of epic-55 test was added to epic-100 train 8 | - ${dataset.epic_kitchens.common.data_dir}/test # the old one 9 | - ${dataset.epic_kitchens100.common.data_dir_extension} 10 | annotation_path: 11 | - 
${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_train.pkl 12 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 13 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 14 | label_type: ${dataset.epic_kitchens100.common.label_type} 15 | sample_strategy: "random_clip" 16 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 17 | conv_to_anticipate_fn: 18 | _target_: datasets.base_video_dataset.convert_to_anticipation 19 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 20 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens.common.data_dir}/train # the old one 7 | # Need test too since some of epic-55 test was added to epic-100 train 8 | - ${dataset.epic_kitchens.common.data_dir}/test # the old one 9 | - ${dataset.epic_kitchens100.common.data_dir_extension} 10 | annotation_path: 11 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_validation.pkl 12 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 13 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 14 | label_type: ${dataset.epic_kitchens100.common.label_type} 15 | sample_strategy: "center_clip" 16 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 17 | conv_to_anticipate_fn: 18 | _target_: datasets.base_video_dataset.convert_to_anticipation 19 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 20 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens100/anticipation_train+val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | version: ${dataset.epic_kitchens100.common.version} 5 | root: 6 | - ${dataset.epic_kitchens.common.data_dir}/train # the old one 7 | # Need test too since some of epic-55 test was added to epic-100 train 8 | - ${dataset.epic_kitchens.common.data_dir}/test # the old one 9 | - ${dataset.epic_kitchens100.common.data_dir_extension} 10 | annotation_path: 11 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_train.pkl 12 | - ${dataset.epic_kitchens100.common.annot_dir}/EPIC_100_validation.pkl 13 | annotation_dir: ${dataset.epic_kitchens100.common.annot_dir} 14 | rulstm_annotation_dir: ${dataset.epic_kitchens100.common.rulstm_annot_dir} # Needed during computing final outputs to get tail classes etc. 
15 | label_type: ${dataset.epic_kitchens100.common.label_type} 16 | sample_strategy: "random_clip" 17 | action_labels_fpath: ${dataset.epic_kitchens100.common.rulstm_annot_dir}/actions.csv 18 | conv_to_anticipate_fn: 19 | _target_: datasets.base_video_dataset.convert_to_anticipation 20 | tau_a: ${dataset.epic_kitchens100.common.tau_a} 21 | tau_o: ${dataset.epic_kitchens100.common.tau_o} 22 | -------------------------------------------------------------------------------- /docs/DATASETS.md: -------------------------------------------------------------------------------- 1 | ## EGTEA Gaze+ 2 | 3 | The annotations are used from RULSTM, so you'd need to set it up as described in the main README. 4 | 5 | ### To train models on pre-extracted TSN features 6 | 7 | 1. Download the features from [RULSTM](https://iplab.dmi.unict.it/sharing/rulstm/features/egtea.zip) 8 | 2. Unzip into `DATA/external/rulstm/RULSTM/egtea/` 9 | 10 | ### To train models on raw videos 11 | 12 | Download the videos from [here](https://www.dropbox.com/s/uwwj6wb1j4rsm02/video_links.txt) into `DATA/videos/EGTEA/101020/videos/` 13 | 14 | ## 50-Salads 15 | 16 | 1. Download videos from [here](https://cvip.computing.dundee.ac.uk/datasets/foodpreparation/50salads/data/) into `DATA/videos/50Salads/`. 17 | 2. Download annotations 18 | - The models in this paper use annotations from [here](https://github.com/yabufarha/anticipating-activities/issues/5#issuecomment-555916894) 19 | - Download the `cvpr18_data` folder to `external/breakfast_50salad_anticipation_annotations/cvpr18_data` 20 | - Additionally, [this](https://dl.fbaipublicfiles.com/avt/datasets/50salads/annotations.zip) annotations directory was shared by the authors of the above paper as well, download it at `external/breakfast_50salad_anticipation_annotations/annotations/`. Shared here for reproducibility of the code. 21 | -------------------------------------------------------------------------------- /conf/dataset/epic_kitchens/anticipation_train_minus_val.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_._name_ 2 | 3 | _target_: datasets.epic_kitchens.EPICKitchens 4 | root: ${dataset.epic_kitchens.common.data_dir}/train 5 | # Not using the rulstm/RULSTM/data/{training|validation}.csv here since 6 | # they are effectively the same as the original EPIC labels (to my 7 | # knowledge). 
8 | # wc -l rulstm/RULSTM/data/training.csv --> 23493 9 | # wc -l rulstm/RULSTM/data/validation.csv --> 4979 10 | # wc -l epic_annotations/EPIC_train_action_labels.csv --> 28473 11 | # which is the same as (sum - 1, for header) 12 | # So using the only_keep_videos to subselect for train/val 13 | annotation_path: 14 | - ${dataset.epic_kitchens.common.annot_dir}/EPIC_train_action_labels.pkl 15 | annotation_dir: ${dataset.epic_kitchens.common.annot_dir} 16 | label_type: ${dataset.epic_kitchens.common.label_type} 17 | sample_strategy: "random_clip" 18 | # https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/data/training_videos.csv 19 | only_keep_videos: ${dataset.epic_kitchens.common.rulstm_annot_dir}/training_videos.csv 20 | action_labels_fpath: ${dataset.epic_kitchens.common.rulstm_annot_dir}/actions.csv 21 | conv_to_anticipate_fn: 22 | _target_: datasets.base_video_dataset.convert_to_anticipation 23 | tau_a: ${dataset.epic_kitchens.common.tau_a} 24 | tau_o: ${dataset.epic_kitchens.common.tau_o} 25 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to AVT 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to AVT, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """Main training entry.""" 4 | 5 | import os 6 | import logging 7 | import random 8 | import subprocess 9 | 10 | import torch 11 | import hydra 12 | from omegaconf import DictConfig, OmegaConf 13 | 14 | import func 15 | 16 | 17 | OmegaConf.register_new_resolver('minus', lambda x, y: x - y) 18 | # Multiply and cast to integer 19 | OmegaConf.register_new_resolver('times_int', lambda x, y: int(x * y)) 20 | 21 | 22 | @hydra.main(config_path='conf', config_name='config') 23 | def main(cfg: DictConfig) -> None: 24 | # Since future runs might corrupt the stored hydra config, copy it over 25 | # for backup. 
26 | if not os.path.exists('.hydra.orig'): 27 | subprocess.call('cp -r .hydra .hydra.orig', shell=True) 28 | random.seed(cfg.seed) 29 | torch.manual_seed(cfg.seed) 30 | try: 31 | print(subprocess.check_output('nvidia-smi')) 32 | except subprocess.CalledProcessError: 33 | print('Could not run nvidia-smi..') 34 | # cudnn.deterministic = True # Makes it slow.. 35 | getattr(func, cfg.train.fn).main(cfg) 36 | 37 | 38 | if __name__ == "__main__": 39 | logging.basicConfig(format=('%(asctime)s %(levelname)-8s' 40 | ' {%(module)s:%(lineno)d} %(message)s'), 41 | level=logging.DEBUG, 42 | datefmt='%Y-%m-%d %H:%M:%S') 43 | torch.multiprocessing.set_start_method('spawn') 44 | main() # pylint: disable=no-value-for-parameter # Uses hydra 45 | -------------------------------------------------------------------------------- /expts/05_ek100_rustm_test_testonly.txt: -------------------------------------------------------------------------------- 1 | train.batch_size=128 2 | eval.batch_size=128 3 | train.num_epochs=0 4 | # Download the following model from RULSTM 5 | # https://iplab.dmi.unict.it/sharing/rulstm/ek100_models/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar 6 | train.init_from_model=[[temporal_aggregator,/path/to/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar],[classifiers.action,classifier.1.,/path/to/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar]] 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=rulstm 11 | model.temporal_aggregator.num_pad_feats=3 12 | model.dropout=0.8 13 | 14 | opt.lr_wd=[[backbone,0.0,0.0],[temporal_aggregator,0.01,0.0],[classifiers,0.01,0.0]] 15 | opt.bias_bn_wd_scale=0.0 16 | opt.optimizer.nesterov=true 17 | 18 | data_train.num_frames=11 19 | data_train.frame_rate=30 20 | data_eval.num_frames=11 21 | data_eval.frame_rate=30 22 | 23 | opt/scheduler=cosine 24 | 25 | dataset@dataset_train=epic_kitchens100/anticipation_train 26 | dataset@dataset_eval=epic_kitchens100/anticipation_test 27 | dataset_train.sample_strategy=last_clip 28 | dataset_eval.sample_strategy=last_clip 29 | dataset_train.conv_to_anticipate_fn.tau_o=2.5 30 | dataset_eval.conv_to_anticipate_fn.tau_o=2.5 31 | dataset.epic_kitchens100.common.label_type=action 32 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: exact_rulstm} 33 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 34 | 35 | # RULSTM data 36 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 37 | 38 | hydra.launcher.nodes=1 39 | hydra.launcher.gpus_per_node=1 40 | 41 | test_only=True 42 | -------------------------------------------------------------------------------- /expts/08_ek55_avt_tsn.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=2.0 2 | 3 | train.batch_size=32 4 | eval.batch_size=32 5 | train.num_epochs=20 6 | 7 | model/backbone=identity 8 | model.backbone_dim=1024 9 | model/temporal_aggregator=identity 10 | model/future_predictor=avth 11 | model.dropout=0.8 12 | +model.future_predictor.n_head=8 13 | +model.future_predictor.n_layer=12 14 | +model.future_predictor.output_len=1 15 | +model.future_predictor.inter_dim=2048 16 | +model.future_predictor.return_past_too=true 17 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 18 | +model.future_predictor.future_pred_loss_wt=1.0 19 | +model.future_predictor.avg_last_n=1 20 | 21 | 22 | opt.lr_wd=[[__all__,0.000005,0.0001]] 
23 | opt.bias_bn_wd_scale=1.0 24 | 25 | data_train.num_frames=10 26 | data_train.frame_rate=2 27 | data_eval.num_frames=${data_train.num_frames} 28 | data_eval.frame_rate=${data_train.frame_rate} 29 | 30 | opt/optimizer=adam 31 | opt/scheduler=cosine 32 | opt.warmup.num_epochs=5 33 | opt.scheduler.num_epochs=15 # total - 5 warmup 34 | 35 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 36 | dataset@dataset_eval=epic_kitchens/anticipation_val 37 | dataset_train.sample_strategy=last_clip 38 | dataset_eval.sample_strategy=last_clip 39 | dataset_train.conv_to_anticipate_fn.tau_o=5 40 | dataset_eval.conv_to_anticipate_fn.tau_o=5 41 | dataset.epic_kitchens.common.label_type=action 42 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens.common.rulstm_feats_dir}/rgb/, read_type: normal} 43 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 44 | 45 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 46 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 47 | 48 | hydra.launcher.nodes=1 49 | hydra.launcher.gpus_per_node=4 50 | -------------------------------------------------------------------------------- /expts/10_ek55_avt_ig65m.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=2.0 2 | 3 | train.batch_size=32 4 | eval.batch_size=32 5 | train.num_epochs=20 6 | 7 | model/backbone=identity 8 | model.backbone_dim=2048 9 | model/temporal_aggregator=identity 10 | model/future_predictor=avth 11 | model.dropout=0.8 12 | +model.future_predictor.n_head=8 13 | +model.future_predictor.n_layer=12 14 | +model.future_predictor.output_len=1 15 | +model.future_predictor.inter_dim=2048 16 | +model.future_predictor.return_past_too=true 17 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 18 | +model.future_predictor.future_pred_loss_wt=1.0 19 | +model.future_predictor.avg_last_n=1 20 | 21 | 22 | opt.lr_wd=[[__all__,0.000005,0.0001]] 23 | opt.bias_bn_wd_scale=1.0 24 | # opt.optimizer.nesterov=true 25 | 26 | data_train.num_frames=10 27 | data_train.frame_rate=2 28 | data_eval.num_frames=${data_train.num_frames} 29 | data_eval.frame_rate=${data_train.frame_rate} 30 | 31 | opt/optimizer=adam 32 | opt/scheduler=cosine 33 | 34 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 35 | dataset@dataset_eval=epic_kitchens/anticipation_val 36 | dataset_train.sample_strategy=last_clip 37 | dataset_eval.sample_strategy=last_clip 38 | dataset_train.conv_to_anticipate_fn.tau_o=5 39 | dataset_eval.conv_to_anticipate_fn.tau_o=5 40 | 41 | dataset.epic_kitchens.common.label_type=action 42 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek55/ig65m_ftEk55train_logits_25fps/rgb, read_type: normal, warn_if_using_closeby_frame: false} 43 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 44 | 45 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 46 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 47 | 48 | hydra.launcher.nodes=1 49 | hydra.launcher.gpus_per_node=4 50 | -------------------------------------------------------------------------------- /expts/10_ek55_avt_ig65m_forAR.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=2.0 2 | 3 | train.batch_size=32 4 | eval.batch_size=32 5 | train.num_epochs=100 6 | 7 | model/backbone=identity 8 | 
model.backbone_dim=2048 9 | model/temporal_aggregator=identity 10 | model/future_predictor=avth 11 | model.dropout=0.8 12 | +model.future_predictor.n_head=8 13 | +model.future_predictor.n_layer=12 14 | +model.future_predictor.output_len=1 15 | +model.future_predictor.inter_dim=2048 16 | +model.future_predictor.return_past_too=true 17 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 18 | +model.future_predictor.future_pred_loss_wt=1.0 19 | +model.future_predictor.avg_last_n=1 20 | 21 | 22 | opt.lr_wd=[[__all__,0.000005,0.0001]] 23 | opt.bias_bn_wd_scale=1.0 24 | # opt.optimizer.nesterov=true 25 | 26 | data_train.num_frames=10 27 | data_train.frame_rate=2 28 | data_eval.num_frames=${data_train.num_frames} 29 | data_eval.frame_rate=${data_train.frame_rate} 30 | 31 | opt/optimizer=adam 32 | opt/scheduler=cosine 33 | 34 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 35 | dataset@dataset_eval=epic_kitchens/anticipation_val 36 | dataset_train.sample_strategy=last_clip 37 | dataset_eval.sample_strategy=last_clip 38 | dataset_train.conv_to_anticipate_fn.tau_o=5 39 | dataset_eval.conv_to_anticipate_fn.tau_o=5 40 | 41 | dataset.epic_kitchens.common.label_type=action 42 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek55/ig65m_ftEk55train_logits_25fps/rgb, read_type: normal, warn_if_using_closeby_frame: false} 43 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 44 | 45 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 46 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 47 | 48 | hydra.launcher.nodes=1 49 | hydra.launcher.gpus_per_node=4 50 | -------------------------------------------------------------------------------- /expts/08_ek55_avt_tsn_forAR.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 38 | dataset@dataset_eval=epic_kitchens/anticipation_val 39 | dataset_train.sample_strategy=last_clip 40 | dataset_eval.sample_strategy=last_clip 41 | 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | 47 | dataset.epic_kitchens.common.label_type=action 48 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, 
lmdb_path: ${dataset.epic_kitchens.common.rulstm_feats_dir}/rgb/, read_type: normal} 49 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 50 | 51 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 52 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 53 | 54 | hydra.launcher.nodes=1 55 | hydra.launcher.gpus_per_node=2 56 | -------------------------------------------------------------------------------- /expts/02_ek100_avt_tsn.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train 39 | dataset@dataset_eval=epic_kitchens100/anticipation_val 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/03_ek100_avt_tsn_obj.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=352 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | 
opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train 39 | dataset@dataset_eval=epic_kitchens100/anticipation_val 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/02_ek100_avt_tsn_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 39 | dataset@dataset_eval=epic_kitchens100/anticipation_test 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- 
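
The expts/*.txt files in this directory are plain lists of Hydra command-line overrides for the config composed by train_net.py (blank lines and `#` comments are skipped; the `hydra.launcher.*` keys only take effect when a matching launcher plugin is configured). The repo may ship its own launch mechanism; as a hypothetical helper only, one of these files could be forwarded to train_net.py like this:

```python
# Hypothetical helper, not a script from this repo: read an expts/*.txt
# override file and pass each non-comment line as a single Hydra override
# argument to train_net.py.
import subprocess
import sys

expt_file = sys.argv[1]  # e.g. expts/02_ek100_avt_tsn.txt
with open(expt_file) as fh:
    overrides = [line.strip() for line in fh
                 if line.strip() and not line.strip().startswith('#')]
# Lines containing spaces (e.g. +dataset_train.reader_fn={...}) must remain a
# single argv entry each so Hydra parses them as one override.
subprocess.run([sys.executable, 'train_net.py'] + overrides, check=True)
```
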
/expts/03_ek100_avt_tsn_obj_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=352 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=20 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 39 | dataset@dataset_eval=epic_kitchens100/anticipation_test 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/obj/, read_type: normal} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/04_ek100_avt_ig65m_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=2048 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=2 14 | +model.future_predictor.n_layer=8 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=5 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 39 | 
dataset@dataset_eval=epic_kitchens100/anticipation_test 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_train.conv_to_anticipate_fn.tau_a=1 43 | dataset_train.conv_to_anticipate_fn.tau_o=10 44 | dataset_eval.conv_to_anticipate_fn.tau_a=1 45 | dataset_eval.conv_to_anticipate_fn.tau_o=10 46 | dataset.epic_kitchens100.common.label_type=action 47 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb/, read_type: normal, warn_if_using_closeby_frame: false} 48 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 49 | 50 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 51 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 52 | 53 | hydra.launcher.nodes=1 54 | hydra.launcher.gpus_per_node=2 55 | -------------------------------------------------------------------------------- /expts/11_egtea_avt_tsn.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=0.1 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=15 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.8 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=2 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=egtea/anticipation_train 38 | dataset@dataset_eval=egtea/anticipation_val 39 | +dataset@dataset_eval_train=egtea/anticipation_train 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_eval_train.sample_strategy=last_clip 43 | dataset_train.conv_to_anticipate_fn.tau_a=0.5 44 | dataset_train.conv_to_anticipate_fn.tau_o=10 45 | dataset_eval.conv_to_anticipate_fn.tau_a=0.5 46 | dataset_eval.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval_train.conv_to_anticipate_fn.tau_a=0.5 48 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 49 | dataset.egtea.common.label_type=action 50 | dataset.egtea.common.split=1 51 | dataset.egtea.common.modality=rgb 52 | 53 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 54 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 55 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 56 | 57 | hydra.launcher.nodes=1 58 | hydra.launcher.gpus_per_node=2 59 | -------------------------------------------------------------------------------- /common/cluster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
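# KmeansAssigner below is bidirectional: float features in -> nearest-centroid
# ids out (hard assignment via torch.cdist + argmin), long cluster ids in ->
# centroid features out. A hedged usage sketch, with a hypothetical centroids
# checkpoint path; the file is expected to hold a dict whose 'weight' entry is
# a KxC tensor (C=256 in this example):
#
#   assigner = KmeansAssigner('DATA/centroids.pth')
#   ids = assigner(torch.randn(8, 16, 256))  # float input -> (8, 16) long ids
#   feats = assigner(ids)                    # long input -> (8, 16, 256) centroid feats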
2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class KmeansAssigner(nn.Module): 8 | def __init__(self, centroids_fpath, norm=False): 9 | super().__init__() 10 | # NxC dimension 11 | # Not converting this to a linear layer as then the weights would get 12 | # overwritten during random init, and these cluster centers would be lost. 13 | self.register_buffer('centroids', 14 | torch.load(centroids_fpath)['weight']) 15 | self.norm = norm 16 | 17 | @property 18 | def num_clusters(self): 19 | return self.centroids.size(0) 20 | 21 | @staticmethod 22 | def feat2cluster(feats, centroids, norm): 23 | """ 24 | Compute the nearest-centroid index for the feats, w.r.t. the centroids. 25 | Args: 26 | feats *xC 27 | centroids KxC 28 | Returns: 29 | assignments * 30 | """ 31 | feats_flat = feats.flatten(0, -2) 32 | if norm: 33 | feats_flat = nn.functional.normalize(feats_flat, dim=-1, p=2) 34 | dists = torch.cdist(feats_flat.unsqueeze(0), centroids.unsqueeze(0)) 35 | assgns = torch.argmin(dists[0], dim=-1) 36 | assgns = assgns.reshape(feats.shape[:-1]) 37 | return assgns 38 | 39 | @staticmethod 40 | def cluster2feat(idx, centroids): 41 | """ 42 | Get the centroid features for cluster ids. 43 | Args: 44 | idx * 45 | centroids KxC 46 | Returns: 47 | features *xC 48 | """ 49 | idx_flat = idx.reshape((-1, )) 50 | feats = centroids[idx_flat, :] 51 | return feats.reshape(list(idx.shape) + [feats.size(-1)]) 52 | 53 | def forward(self, inp): 54 | """ 55 | If inp is torch.float, then find the nearest cluster assignments. 56 | If torch.long, return the corresponding centroid features. 57 | """ 58 | if inp.dtype == torch.long: 59 | return self.cluster2feat(inp, self.centroids) 60 | return self.feat2cluster(inp, self.centroids, self.norm) 61 | -------------------------------------------------------------------------------- /expts/02_ek100_avt_tsn_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[${cwd}/OUTPUTS/expts/02_ek100_avt_tsn.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=64 8 | eval.batch_size=64 9 | train.num_epochs=50 10 | 11 | model/backbone=identity 12 | model.backbone_dim=1024 13 | model/temporal_aggregator=identity 14 | model/future_predictor=avth 15 | model.dropout=0.2 16 | +model.future_predictor.n_head=4 17 | +model.future_predictor.n_layer=6 18 | +model.future_predictor.output_len=1 19 | +model.future_predictor.inter_dim=2048 20 | +model.future_predictor.return_past_too=true 21 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 22 | +model.future_predictor.future_pred_loss_wt=1.0 23 | +model.future_predictor.avg_last_n=1 24 | model.classifier_on_past=true 25 | 26 | 27 | opt.lr_wd=[[__all__,0.001,0.000001]] 28 | opt.bias_bn_wd_scale=1.0 29 | opt.optimizer.nesterov=true 30 | 31 | data_train.num_frames=10 32 | data_train.frame_rate=1 33 | data_train.subclips.num_frames=1 34 | data_train.subclips.stride=1 35 | data_eval=${data_train} 36 | 37 | opt/optimizer=sgd 38 | opt/scheduler=cosine 39 | opt.warmup.num_epochs=20 40 | 41 | dataset@dataset_train=epic_kitchens100/anticipation_train 42 | dataset@dataset_eval=epic_kitchens100/anticipation_test 43 | dataset_train.sample_strategy=last_clip 44 | dataset_eval.sample_strategy=last_clip 45 | dataset_train.conv_to_anticipate_fn.tau_a=1 46 | dataset_train.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval.conv_to_anticipate_fn.tau_a=1 48 | dataset_eval.conv_to_anticipate_fn.tau_o=10 49 | 
dataset.epic_kitchens100.common.label_type=action 50 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/rgb/, read_type: normal} 51 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 52 | 53 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 54 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 55 | 56 | hydra.launcher.nodes=1 57 | hydra.launcher.gpus_per_node=2 58 | -------------------------------------------------------------------------------- /conf/data/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _group_ 2 | 3 | # The top few options go into the dataset object, which loads data as per these 4 | num_frames: 16 5 | frame_rate: null # Null => use the default/natural frame rate of the video 6 | # Allow for an option to clip the long original clip into subclips. This is 7 | # useful when I want features for multiple past clips, so I just read and process a 8 | # really long one and then crop it up. By default, crop into 1 subclip -- same 9 | # as the input 10 | subclips: 11 | # TODO Need to use relative interpolation here 12 | num_frames: ${..num_frames} 13 | stride: ${..num_frames} 14 | # Load segmentation labels only if a classifier on the past is being applied 15 | load_seg_labels: ${model.classifier_on_past} 16 | # TODO: Get rid of the next 2 params; not sure what they are for 17 | train_bs_multiplier: 5 18 | val_clips_per_video: 1 19 | workers: 10 20 | # Scale the image to this size before cropping. 21 | # scale_w can be -1, in which case the shorter side will be scaled to 22 | # scale_h. 23 | scale_h: 128 24 | scale_w: 174 25 | # Height and width of the crop from the resized video above. Set to null for no 26 | # cropping. 27 | crop_size: 112 28 | # Mean/std for centering the image 29 | mean: [0.43216, 0.394666, 0.37645] 30 | std: [0.22803, 0.22145, 0.216989] 31 | # Augmentations. Note: keep these all at their defaults ("0") here so they are 32 | # not applied. Enable them in the expt txt file for training time, since 33 | # this ConfigGroup object will be copied for both train and test time. 34 | flip_p: 0.5 # Left-right flip 50% at train time. Not used during eval. 35 | scale_pix_val: 1.0 # Scale the pixel values by this number. Useful to scale from 0-1 values to 0-255. 36 | reverse_channels: false # Reverse channels, i.e. convert from RGB->BGR 37 | color_jitter_brightness: 0.0 38 | color_jitter_contrast: 0.0 39 | color_jitter_saturation: 0.0 40 | color_jitter_hue: 0.0 41 | # Whether to use the distributed sampler. Certain data loader settings, 42 | # such as the dense_sampler used for feature extraction, already sample 43 | # different clips for different workers, so set this to false when 44 | # using those. 45 | use_dist_sampler: true 46 | # Test time augmentations. Only used in the eval code 47 | eval_num_crops: 1 48 | eval_flip_crops: False 49 | -------------------------------------------------------------------------------- /loss_fn/multidim_xentropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
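# MultiDimCrossEntropy below takes logits of shape (*, C) and targets of shape
# (*), flattens the leading dimensions before calling nn.CrossEntropyLoss, and,
# when reduction='none', reshapes the per-element losses back to the target
# shape. A minimal shape sketch (sizes are illustrative only):
#
#   loss_fn = MultiDimCrossEntropy(reduction='none')
#   logits = torch.randn(4, 10, 100)   # batch x time x classes
#   target = torch.randint(100, (4, 10))
#   loss = loss_fn(logits, target)     # -> shape (4, 10)
#
# QuantizeAndCrossEntropy additionally projects continuous features onto a set
# of k-means centroids (see common.cluster.KmeansAssigner) and uses the target
# feature's nearest centroid as the class label.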
2 | 3 | 4 | """Cross entropy loss that works with multi-dim input.""" 5 | import torch 6 | import torch.nn as nn 7 | from common.cluster import KmeansAssigner 8 | 9 | 10 | class MultiDimCrossEntropy(nn.CrossEntropyLoss): 11 | def forward(self, inp, tgt, *args, **kwargs): 12 | """ 13 | Args: 14 | inp: (*, C) 15 | tgt: (*, ) 16 | Flattens the leading dimensions and then incurs the loss. 17 | """ 18 | assert inp.ndim == tgt.ndim + 1 19 | assert inp.shape[:-1] == tgt.shape 20 | res = super().forward(inp.reshape(-1, inp.size(-1)), tgt.reshape( 21 | (-1, )), *args, **kwargs) 22 | if torch.numel(res) == torch.numel(tgt): 23 | # Reduction was not done, so reshape back to orig shape 24 | res = res.reshape(tgt.shape) 25 | return res 26 | 27 | 28 | class QuantizeAndCrossEntropy(MultiDimCrossEntropy): 29 | """Given a set of cluster centers, project the features onto them before 30 | incurring the loss.""" 31 | def __init__(self, centroids_fpath, norm=True, *args, **kwargs): 32 | super().__init__(*args, **kwargs) 33 | self.assigner = KmeansAssigner(centroids_fpath) 34 | self.norm = norm 35 | 36 | def forward(self, inp, tgt): 37 | """ 38 | Args: 39 | inp: (*, C) 40 | tgt: (*, C) 41 | Flattens the leading dimensions, projects onto the centroids, and then incurs the loss. 42 | """ 43 | # Normalize L2 both target and input, since that's how I'm computing 44 | # centroids 45 | if self.norm: 46 | inp = nn.functional.normalize(inp, dim=-1, p=2) 47 | tgt = nn.functional.normalize(tgt, dim=-1, p=2) 48 | # assign the GT and predictions to the centroids 49 | inp_proj = torch.mm(inp.flatten(0, 1), 50 | self.assigner.centroids.t()).view( 51 | inp.shape[:-1] + self.assigner.centroids.shape[:1]) 52 | # the weights of the projection layer are the centroids, so pick from there 53 | tgt_proj_q = self.assigner(tgt) 54 | return super().forward(inp_proj, tgt_proj_q) 55 | -------------------------------------------------------------------------------- /expts/06_ek100_avt_tsnflow.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=epic_kitchens100/anticipation_train 38 | dataset@dataset_eval=epic_kitchens100/anticipation_val 39 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_train 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_eval_train.sample_strategy=last_clip 43 | dataset_train.conv_to_anticipate_fn.tau_a=1 44 | dataset_train.conv_to_anticipate_fn.tau_o=10 45 | 
dataset_eval.conv_to_anticipate_fn.tau_a=1 46 | dataset_eval.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 49 | dataset.epic_kitchens100.common.label_type=action 50 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/, read_type: normal} 51 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 52 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 53 | 54 | +dataset_train.conv_to_anticipate_fn.drop_style=rulstm 55 | +dataset_eval.conv_to_anticipate_fn.drop_style=rulstm 56 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=rulstm 57 | 58 | hydra.launcher.nodes=1 59 | hydra.launcher.gpus_per_node=2 60 | -------------------------------------------------------------------------------- /expts/06_ek100_avt_tsnflow_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=1024 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=4 14 | +model.future_predictor.n_layer=6 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | 37 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 38 | dataset@dataset_eval=epic_kitchens100/anticipation_test 39 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_val 40 | dataset_train.sample_strategy=last_clip 41 | dataset_eval.sample_strategy=last_clip 42 | dataset_eval_train.sample_strategy=last_clip 43 | dataset_train.conv_to_anticipate_fn.tau_a=1 44 | dataset_train.conv_to_anticipate_fn.tau_o=10 45 | dataset_eval.conv_to_anticipate_fn.tau_a=1 46 | dataset_eval.conv_to_anticipate_fn.tau_o=10 47 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 49 | dataset.epic_kitchens100.common.label_type=action 50 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/, read_type: normal} 51 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 52 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 53 | 54 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 55 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 56 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 57 | 58 | hydra.launcher.nodes=1 59 | hydra.launcher.gpus_per_node=2 60 | -------------------------------------------------------------------------------- /expts/04_ek100_avt_ig65m.txt: 
-------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | 4 | train.batch_size=64 5 | eval.batch_size=64 6 | train.num_epochs=50 7 | 8 | model/backbone=identity 9 | model.backbone_dim=2048 10 | model/temporal_aggregator=identity 11 | model/future_predictor=avth 12 | model.dropout=0.2 13 | +model.future_predictor.n_head=2 14 | +model.future_predictor.n_layer=8 15 | +model.future_predictor.output_len=1 16 | +model.future_predictor.inter_dim=2048 17 | +model.future_predictor.return_past_too=true 18 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 19 | +model.future_predictor.future_pred_loss_wt=1.0 20 | +model.future_predictor.avg_last_n=1 21 | model.classifier_on_past=true 22 | 23 | 24 | opt.lr_wd=[[__all__,0.001,0.000001]] 25 | opt.bias_bn_wd_scale=1.0 26 | opt.optimizer.nesterov=true 27 | 28 | data_train.num_frames=10 29 | data_train.frame_rate=1 30 | data_train.subclips.num_frames=1 31 | data_train.subclips.stride=1 32 | data_eval=${data_train} 33 | 34 | opt/optimizer=sgd 35 | opt/scheduler=cosine 36 | opt.warmup.num_epochs=5 37 | 38 | dataset@dataset_train=epic_kitchens100/anticipation_train 39 | dataset@dataset_eval=epic_kitchens100/anticipation_val 40 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_train 41 | dataset_train.sample_strategy=last_clip 42 | dataset_eval.sample_strategy=last_clip 43 | dataset_eval_train.sample_strategy=last_clip 44 | dataset_train.conv_to_anticipate_fn.tau_a=1 45 | dataset_train.conv_to_anticipate_fn.tau_o=10 46 | dataset_eval.conv_to_anticipate_fn.tau_a=1 47 | dataset_eval.conv_to_anticipate_fn.tau_o=10 48 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 49 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 50 | dataset.epic_kitchens100.common.label_type=action 51 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb, read_type: normal, warn_if_using_closeby_frame: false} 52 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 53 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 54 | 55 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 56 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 57 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 58 | 59 | hydra.launcher.nodes=1 60 | hydra.launcher.gpus_per_node=2 61 | -------------------------------------------------------------------------------- /datasets/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
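# get_dataset() below turns a Hydra dataset config plus a data config into a
# dataset object: it optionally loads cached metadata pointed to by a
# _precomputed_metadata_file key, instantiates the dataset with
# hydra.utils.instantiate (non-recursively, since sub-objects may need
# positional args), recomputes video clips when the dataset exposes
# video_clips, and finally caches the metadata if it was not already on disk.
# A hedged call sketch (the surrounding cfg/transform/logger variables are
# hypothetical):
#
#   dataset = get_dataset(cfg.dataset_train, cfg.data_train, transform, logger)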
2 | 3 | import os 4 | import torch 5 | from importlib import import_module 6 | from tqdm import tqdm 7 | 8 | import omegaconf 9 | import hydra 10 | 11 | from common import utils 12 | 13 | __all__ = [ 14 | "get_dataset", 15 | ] 16 | 17 | 18 | def get_dataset(dataset_cfg, data_cfg, transform, logger): 19 | # If there is _precomputed_metadata file passed in, load that in 20 | kwargs = {} 21 | precomp_metadata_fpath = None 22 | if '_precomputed_metadata_file' in dataset_cfg: 23 | precomp_metadata_fpath = dataset_cfg._precomputed_metadata_file 24 | # Remove from the config since otherwise can't init the obj 25 | with omegaconf.open_dict(dataset_cfg): 26 | del dataset_cfg['_precomputed_metadata_file'] 27 | if os.path.exists(precomp_metadata_fpath): 28 | _precomputed_metadata = torch.load(precomp_metadata_fpath) 29 | kwargs['_precomputed_metadata'] = _precomputed_metadata 30 | 31 | kwargs['transform'] = transform 32 | kwargs['frame_rate'] = data_cfg.frame_rate 33 | kwargs['frames_per_clip'] = data_cfg.num_frames 34 | # Have to call dict() here since relative interpolation somehow doesn't 35 | # work once I get the subclips object 36 | kwargs['subclips_options'] = dict(data_cfg.subclips) 37 | kwargs['load_seg_labels'] = data_cfg.load_seg_labels 38 | logger.info('Creating the dataset object...') 39 | # Not recursive since many of the sub-instantiations would need positional 40 | # arguments 41 | _dataset = hydra.utils.instantiate(dataset_cfg, 42 | _recursive_=False, 43 | **kwargs) 44 | try: 45 | logger.info('Computing clips...') 46 | _dataset.video_clips.compute_clips(data_cfg.num_frames, 47 | 1, 48 | frame_rate=data_cfg.frame_rate) 49 | logger.info('Done') 50 | except AttributeError: # if video_clips not in _dataset 51 | logger.warning('No video_clips present') 52 | logger.info(f'Created dataset with {len(_dataset)} elts') 53 | 54 | if precomp_metadata_fpath and not os.path.exists(precomp_metadata_fpath): 55 | utils.save_on_master(_dataset.metadata, precomp_metadata_fpath) 56 | return _dataset 57 | -------------------------------------------------------------------------------- /expts/13_50s_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=2 6 | eval.batch_size=2 7 | train.num_epochs=200 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.8 15 | +model.future_predictor.n_head=8 16 | +model.future_predictor.n_layer=8 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.avg_last_n=1 22 | model.classifier_on_past=true 23 | 24 | 25 | opt.lr_wd=[[__all__,0.0000005,0.0001]] 26 | opt.bias_bn_wd_scale=1.0 27 | # opt.optimizer.nesterov=true 28 | 29 | data_train.num_frames=10 30 | data_train.frame_rate=0.5 31 | data_train.subclips.num_frames=1 32 | data_train.subclips.stride=1 33 | data_eval.num_frames=${data_train.num_frames} 34 | data_eval.frame_rate=${data_train.frame_rate} 35 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 36 | 
data_eval.subclips.stride=${data_train.subclips.stride} 37 | data_train.mean=[0.5, 0.5, 0.5] 38 | data_train.std=[0.5, 0.5, 0.5] 39 | data_eval.mean=${data_train.mean} 40 | data_eval.std=${data_train.std} 41 | data_eval.eval_num_crops=3 42 | data_eval.eval_flip_crops=true 43 | 44 | opt/optimizer=adam 45 | opt/scheduler=cosine 46 | opt.warmup.num_epochs=20 47 | 48 | dataset/dundee50salads/annot_reader_fn=abu_farha 49 | dataset.dundee50salads.common.fold=1,2,3,4,5 50 | dataset@dataset_train=dundee50salads/anticipation_train 51 | dataset@dataset_eval=dundee50salads/anticipation_val 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | dataset_train.conv_to_anticipate_fn.tau_o=20 55 | dataset_eval.conv_to_anticipate_fn.tau_o=20 56 | 57 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 58 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 59 | 60 | data_train.scale_h=248-280 61 | data_train.scale_w=-1 62 | data_train.crop_size=224 63 | data_eval.scale_h=248 64 | data_eval.scale_w=-1 65 | data_eval.crop_size=224 66 | 67 | hydra.launcher.nodes=4 68 | hydra.launcher.gpus_per_node=8 69 | -------------------------------------------------------------------------------- /expts/09_ek55_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=35 8 | 9 | model/backbone=avt_b 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.8 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.0001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.scheduler.eta_min=0.000000005 48 | opt.warmup.num_epochs=5 49 | 50 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 51 | dataset@dataset_eval=epic_kitchens/anticipation_val 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | dataset_train.conv_to_anticipate_fn.tau_o=20 55 | dataset_eval.conv_to_anticipate_fn.tau_o=20 56 | dataset.epic_kitchens.common.label_type=action 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 
60 | 61 | data_train.scale_h=248-280 62 | data_train.scale_w=-1 63 | data_train.crop_size=224 64 | data_eval.scale_h=248 65 | data_eval.scale_w=-1 66 | data_eval.crop_size=224 67 | 68 | hydra.launcher.nodes=4 69 | hydra.launcher.gpus_per_node=8 70 | -------------------------------------------------------------------------------- /expts/09_ek55_avt_forAR.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=50 8 | 9 | model/backbone=avt_b 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.8 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.0001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.scheduler.eta_min=0.000000005 48 | opt.warmup.num_epochs=5 49 | 50 | dataset@dataset_train=epic_kitchens/anticipation_train_minus_val 51 | dataset@dataset_eval=epic_kitchens/anticipation_val 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | dataset_train.conv_to_anticipate_fn.tau_o=20 55 | dataset_eval.conv_to_anticipate_fn.tau_o=20 56 | dataset.epic_kitchens.common.label_type=action 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 60 | 61 | data_train.scale_h=248-280 62 | data_train.scale_w=-1 63 | data_train.crop_size=224 64 | data_eval.scale_h=248 65 | data_eval.scale_w=-1 66 | data_eval.crop_size=224 67 | 68 | hydra.launcher.nodes=4 69 | hydra.launcher.gpus_per_node=8 70 | -------------------------------------------------------------------------------- /datasets/reader_fns.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
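# Readers are nn.Modules whose forward(video_path, start, end, fps, df_row,
# ...) returns the decoded clip along with auxiliary outputs. DefaultReader
# wraps torchvision.io.read_video, while VideoAsLabelOnehotReader returns a
# one-hot encoding of the observed action labels instead of pixels.
# Experiments select a reader through Hydra overrides, e.g. (as in
# expts/12_egtea_avt.txt):
#
#   +dataset_train.reader_fn={_target_: datasets.reader_fns.DefaultReader}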
2 | 3 | """Implementation of reader functions.""" 4 | 5 | import logging 6 | from pathlib import Path 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torchvision 11 | 12 | from common.utils import get_video_info 13 | 14 | 15 | # An abstract class to keep track of all reader type classes 16 | class Reader(nn.Module): 17 | pass 18 | 19 | 20 | class DefaultReader(Reader): 21 | def forward(self, video_path, start, end, fps, df_row, **kwargs): 22 | del df_row, fps # Not needed here 23 | video_info = torchvision.io.read_video(video_path, start, end, 24 | **kwargs) 25 | # DEBUG see what is breaking 26 | logging.debug('Read %s from %s', video_info[0].shape, video_path) 27 | return video_info 28 | 29 | @staticmethod 30 | def get_frame_rate(video_path: Path) -> float: 31 | return get_video_info(video_path, ['fps'])['fps'] 32 | 33 | 34 | class VideoAsLabelOnehotReader(Reader): 35 | @staticmethod 36 | def get_frame_rate(video_path: Path) -> float: 37 | raise NotImplementedError('Not sure what it is here... TODO') 38 | 39 | def forward(self, 40 | video_path, 41 | start, 42 | end, 43 | fps, 44 | df_row, 45 | pts_unit='sec', 46 | num_classes=1000): 47 | """ 48 | Return the video as a 1-hot representation of the actual labels. 49 | Args: 50 | video_path 51 | start: start time in sec 52 | end: end time in sec 53 | fps: frame rate of this video 54 | df_row: The data frame row corresponding to this video. Includes 55 | labels 56 | num_classes: Total number of classes for the 1-hot representation. 57 | Could just be a large number, should work too. 58 | Returns: 59 | video_feature of shape T x 1 x 1 x num_classes 60 | """ 61 | del pts_unit, video_path, start, fps 62 | assert abs(end - 63 | df_row['end']) < 0.1, 'For now just supporting last_clip' 64 | labels = df_row['obs_action_class'][:, 1] 65 | # Convert to 1-hot, TxC shape 66 | feats = nn.functional.one_hot(torch.LongTensor(labels), num_classes) 67 | return feats.unsqueeze(1).unsqueeze(1).float(), {}, {} 68 | -------------------------------------------------------------------------------- /expts/06_ek100_avt_tsnflow_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[${cwd}/OUTPUTS/expts/06_ek100_avt_tsnflow.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=64 8 | eval.batch_size=64 9 | train.num_epochs=50 10 | 11 | model/backbone=identity 12 | model.backbone_dim=1024 13 | model/temporal_aggregator=identity 14 | model/future_predictor=avth 15 | model.dropout=0.2 16 | +model.future_predictor.n_head=4 17 | +model.future_predictor.n_layer=6 18 | +model.future_predictor.output_len=1 19 | # +model.future_predictor.avg_last_n=1 20 | +model.future_predictor.inter_dim=2048 21 | +model.future_predictor.return_past_too=true 22 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 23 | +model.future_predictor.future_pred_loss_wt=1.0 24 | +model.future_predictor.avg_last_n=1 25 | model.classifier_on_past=true 26 | 27 | 28 | opt.lr_wd=[[__all__,0.001,0.000001]] 29 | opt.bias_bn_wd_scale=1.0 30 | opt.optimizer.nesterov=true 31 | 32 | data_train.num_frames=10 33 | data_train.frame_rate=1 34 | data_train.subclips.num_frames=1 35 | data_train.subclips.stride=1 36 | data_eval=${data_train} 37 | 38 | opt/optimizer=sgd 39 | opt/scheduler=cosine 40 | 41 | dataset@dataset_train=epic_kitchens100/anticipation_train 42 | 
dataset@dataset_eval=epic_kitchens100/anticipation_test 43 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_train 44 | dataset_train.sample_strategy=last_clip 45 | dataset_eval.sample_strategy=last_clip 46 | dataset_eval_train.sample_strategy=last_clip 47 | dataset_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_train.conv_to_anticipate_fn.tau_o=10 49 | dataset_eval.conv_to_anticipate_fn.tau_a=1 50 | dataset_eval.conv_to_anticipate_fn.tau_o=10 51 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 52 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 53 | dataset.epic_kitchens100.common.label_type=action 54 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${dataset.epic_kitchens100.common.rulstm_feats_dir}/flow/, read_type: normal} 55 | +dataset_eval.reader_fn=${dataset_train.reader_fn} 56 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 60 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 61 | 62 | hydra.launcher.nodes=1 63 | hydra.launcher.gpus_per_node=2 64 | -------------------------------------------------------------------------------- /expts/04_ek100_avt_ig65m_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[${cwd}/OUTPUTS/expts/04_ek100_avt_ig65m.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=64 8 | eval.batch_size=64 9 | train.num_epochs=50 10 | 11 | model/backbone=identity 12 | model.backbone_dim=2048 13 | model/temporal_aggregator=identity 14 | model/future_predictor=avth 15 | model.dropout=0.2 16 | +model.future_predictor.n_head=2 17 | +model.future_predictor.n_layer=8 18 | +model.future_predictor.output_len=1 19 | +model.future_predictor.inter_dim=2048 20 | +model.future_predictor.return_past_too=true 21 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 22 | +model.future_predictor.future_pred_loss_wt=1.0 23 | +model.future_predictor.avg_last_n=1 24 | model.classifier_on_past=true 25 | 26 | 27 | opt.lr_wd=[[__all__,0.001,0.000001]] 28 | opt.bias_bn_wd_scale=1.0 29 | opt.optimizer.nesterov=true 30 | 31 | data_train.num_frames=10 32 | data_train.frame_rate=1 33 | data_train.subclips.num_frames=1 34 | data_train.subclips.stride=1 35 | data_eval=${data_train} 36 | 37 | opt/optimizer=sgd 38 | opt/scheduler=cosine 39 | opt.warmup.num_epochs=5 40 | 41 | dataset@dataset_train=epic_kitchens100/anticipation_train 42 | dataset@dataset_eval=epic_kitchens100/anticipation_test 43 | +dataset@dataset_eval_train=epic_kitchens100/anticipation_val 44 | dataset_train.sample_strategy=last_clip 45 | dataset_eval.sample_strategy=last_clip 46 | dataset_eval_train.sample_strategy=last_clip 47 | dataset_train.conv_to_anticipate_fn.tau_a=1 48 | dataset_train.conv_to_anticipate_fn.tau_o=10 49 | dataset_eval.conv_to_anticipate_fn.tau_a=1 50 | dataset_eval.conv_to_anticipate_fn.tau_o=10 51 | dataset_eval_train.conv_to_anticipate_fn.tau_a=1 52 | dataset_eval_train.conv_to_anticipate_fn.tau_o=10 53 | dataset.epic_kitchens100.common.label_type=action 54 | +dataset_train.reader_fn={_target_: datasets.epic_kitchens.EpicRULSTMFeatsReader, lmdb_path: ${cwd}/DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb, read_type: normal, warn_if_using_closeby_frame: false} 55 | 
+dataset_eval.reader_fn=${dataset_train.reader_fn} 56 | +dataset_eval_train.reader_fn=${dataset_train.reader_fn} 57 | 58 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 59 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 60 | +dataset_eval_train.conv_to_anticipate_fn.drop_style=correct 61 | 62 | hydra.launcher.nodes=1 63 | hydra.launcher.gpus_per_node=2 64 | -------------------------------------------------------------------------------- /expts/07_ek100_avt_longer.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=70 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=15 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | 49 | dataset@dataset_train=epic_kitchens100/anticipation_train 50 | dataset@dataset_eval=epic_kitchens100/anticipation_val 51 | 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | 55 | dataset_train.conv_to_anticipate_fn.tau_a=1 56 | dataset_train.conv_to_anticipate_fn.tau_o=15 57 | dataset_eval.conv_to_anticipate_fn.tau_a=1 58 | dataset_eval.conv_to_anticipate_fn.tau_o=15 59 | 60 | dataset.epic_kitchens100.common.label_type=action 61 | 62 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 63 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 64 | 65 | data_train.scale_h=248-280 66 | data_train.scale_w=-1 67 | data_train.crop_size=224 68 | data_eval.scale_h=248 69 | data_eval.scale_w=-1 70 | data_eval.crop_size=224 71 | 72 | hydra.launcher.nodes=4 73 | hydra.launcher.gpus_per_node=8 74 | -------------------------------------------------------------------------------- /expts/07_ek100_avt_longer_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | 
train.init_from_model=[[${cwd}/OUTPUTS/expts/07_ek100_avt_longer.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=3 8 | eval.batch_size=3 9 | train.num_epochs=70 10 | 11 | model/backbone=avt_b_in21k 12 | model.backbone_last_n_modules_to_drop=0 13 | model.backbone_dim=768 14 | model/temporal_aggregator=identity 15 | model/future_predictor=avth 16 | model.dropout=0.2 17 | +model.future_predictor.n_head=4 18 | +model.future_predictor.n_layer=6 19 | +model.future_predictor.output_len=1 20 | +model.future_predictor.inter_dim=2048 21 | +model.future_predictor.return_past_too=true 22 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 23 | +model.future_predictor.future_pred_loss_wt=1.0 24 | +model.future_predictor.avg_last_n=1 25 | model.classifier_on_past=true 26 | 27 | 28 | opt.lr_wd=[[__all__,0.0001,0.000001]] 29 | opt.bias_bn_wd_scale=1.0 30 | opt.optimizer.nesterov=true 31 | 32 | data_train.num_frames=15 33 | data_train.frame_rate=1 34 | data_train.subclips.num_frames=1 35 | data_train.subclips.stride=1 36 | data_eval.num_frames=${data_train.num_frames} 37 | data_eval.frame_rate=${data_train.frame_rate} 38 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 39 | data_eval.subclips.stride=${data_train.subclips.stride} 40 | data_train.mean=[0.5, 0.5, 0.5] 41 | data_train.std=[0.5, 0.5, 0.5] 42 | data_eval.mean=${data_train.mean} 43 | data_eval.std=${data_train.std} 44 | data_eval.eval_num_crops=3 45 | data_eval.eval_flip_crops=true 46 | 47 | opt/optimizer=sgd 48 | opt/scheduler=cosine 49 | opt.warmup.num_epochs=20 50 | 51 | dataset@dataset_train=epic_kitchens100/anticipation_train 52 | dataset@dataset_eval=epic_kitchens100/anticipation_test 53 | 54 | dataset_train.sample_strategy=last_clip 55 | dataset_eval.sample_strategy=last_clip 56 | 57 | dataset_train.conv_to_anticipate_fn.tau_a=1 58 | dataset_train.conv_to_anticipate_fn.tau_o=15 59 | dataset_eval.conv_to_anticipate_fn.tau_a=1 60 | dataset_eval.conv_to_anticipate_fn.tau_o=15 61 | 62 | dataset.epic_kitchens100.common.label_type=action 63 | 64 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 65 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 66 | 67 | data_train.scale_h=248-280 68 | data_train.scale_w=-1 69 | data_train.crop_size=224 70 | data_eval.scale_h=248 71 | data_eval.scale_w=-1 72 | data_eval.crop_size=224 73 | 74 | hydra.launcher.nodes=4 75 | hydra.launcher.gpus_per_node=8 76 | -------------------------------------------------------------------------------- /expts/07_ek100_avt_longer_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=70 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 
24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=15 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | 49 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 50 | dataset@dataset_eval=epic_kitchens100/anticipation_test 51 | 52 | dataset_train.sample_strategy=last_clip 53 | dataset_eval.sample_strategy=last_clip 54 | 55 | dataset_train.conv_to_anticipate_fn.tau_a=1 56 | dataset_train.conv_to_anticipate_fn.tau_o=15 57 | dataset_eval.conv_to_anticipate_fn.tau_a=1 58 | dataset_eval.conv_to_anticipate_fn.tau_o=15 59 | 60 | dataset.epic_kitchens100.common.label_type=action 61 | 62 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 63 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 64 | 65 | data_train.scale_h=248-280 66 | data_train.scale_w=-1 67 | data_train.crop_size=224 68 | data_eval.scale_h=248 69 | data_eval.scale_w=-1 70 | data_eval.crop_size=224 71 | 72 | hydra.launcher.nodes=8 73 | hydra.launcher.gpus_per_node=8 74 | -------------------------------------------------------------------------------- /expts/01_ek100_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model, ${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=50 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | 
opt.scheduler.num_epochs=30 49 | 50 | dataset@dataset_train=epic_kitchens100/anticipation_train 51 | dataset@dataset_eval=epic_kitchens100/anticipation_val 52 | 53 | dataset_train.sample_strategy=last_clip 54 | dataset_eval.sample_strategy=last_clip 55 | 56 | dataset_train.conv_to_anticipate_fn.tau_a=1 57 | dataset_train.conv_to_anticipate_fn.tau_o=10 58 | dataset_eval.conv_to_anticipate_fn.tau_a=1 59 | dataset_eval.conv_to_anticipate_fn.tau_o=10 60 | 61 | dataset.epic_kitchens100.common.label_type=action 62 | 63 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 64 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 65 | 66 | data_train.scale_h=248-280 67 | data_train.scale_w=-1 68 | data_train.crop_size=224 69 | data_eval.scale_h=248 70 | data_eval.scale_w=-1 71 | data_eval.crop_size=224 72 | 73 | hydra.launcher.nodes=4 74 | hydra.launcher.gpus_per_node=8 75 | -------------------------------------------------------------------------------- /expts/01_ek100_avt_test_testonly.txt: -------------------------------------------------------------------------------- 1 | test_only=true 2 | 3 | train.train_one_epoch_fn.loss_wts.feat=1.0 4 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 5 | train.init_from_model=[[OUTPUTS/expts/01_ek100_avt.txt/0/checkpoint.pth]] 6 | 7 | train.batch_size=3 8 | eval.batch_size=3 9 | train.num_epochs=50 10 | 11 | model/backbone=avt_b_in21k 12 | model.backbone_last_n_modules_to_drop=0 13 | model.backbone_dim=768 14 | model/temporal_aggregator=identity 15 | model/future_predictor=avth 16 | model.dropout=0.2 17 | +model.future_predictor.n_head=4 18 | +model.future_predictor.n_layer=6 19 | +model.future_predictor.output_len=1 20 | +model.future_predictor.inter_dim=2048 21 | +model.future_predictor.return_past_too=true 22 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 23 | +model.future_predictor.future_pred_loss_wt=1.0 24 | +model.future_predictor.avg_last_n=1 25 | model.classifier_on_past=true 26 | 27 | 28 | opt.lr_wd=[[__all__,0.0001,0.000001]] 29 | opt.bias_bn_wd_scale=1.0 30 | opt.optimizer.nesterov=true 31 | 32 | data_train.num_frames=10 33 | data_train.frame_rate=1 34 | data_train.subclips.num_frames=1 35 | data_train.subclips.stride=1 36 | data_eval.num_frames=${data_train.num_frames} 37 | data_eval.frame_rate=${data_train.frame_rate} 38 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 39 | data_eval.subclips.stride=${data_train.subclips.stride} 40 | data_train.mean=[0.5, 0.5, 0.5] 41 | data_train.std=[0.5, 0.5, 0.5] 42 | data_eval.mean=${data_train.mean} 43 | data_eval.std=${data_train.std} 44 | data_eval.eval_num_crops=3 45 | data_eval.eval_flip_crops=true 46 | 47 | opt/optimizer=sgd 48 | opt/scheduler=cosine 49 | opt.warmup.num_epochs=20 50 | opt.scheduler.num_epochs=30 51 | 52 | dataset@dataset_train=epic_kitchens100/anticipation_train 53 | dataset@dataset_eval=epic_kitchens100/anticipation_test 54 | 55 | dataset_train.sample_strategy=last_clip 56 | dataset_eval.sample_strategy=last_clip 57 | 58 | dataset_train.conv_to_anticipate_fn.tau_a=1 59 | dataset_train.conv_to_anticipate_fn.tau_o=10 60 | dataset_eval.conv_to_anticipate_fn.tau_a=1 61 | dataset_eval.conv_to_anticipate_fn.tau_o=10 62 | 63 | dataset.epic_kitchens100.common.label_type=action 64 | 65 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 66 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 67 | 68 | data_train.scale_h=248-280 69 | data_train.scale_w=-1 70 | data_train.crop_size=224 71 | data_eval.scale_h=248 72 | 
data_eval.scale_w=-1 73 | data_eval.crop_size=224 74 | 75 | hydra.launcher.nodes=4 76 | hydra.launcher.gpus_per_node=8 77 | -------------------------------------------------------------------------------- /expts/01_ek100_avt_test_trainval.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=1.0 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=100 8 | 9 | model/backbone=avt_b_in21k 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=6 17 | +model.future_predictor.output_len=1 18 | +model.future_predictor.inter_dim=2048 19 | +model.future_predictor.return_past_too=true 20 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 21 | +model.future_predictor.future_pred_loss_wt=1.0 22 | +model.future_predictor.avg_last_n=1 23 | model.classifier_on_past=true 24 | 25 | 26 | opt.lr_wd=[[__all__,0.0001,0.000001]] 27 | opt.bias_bn_wd_scale=1.0 28 | opt.optimizer.nesterov=true 29 | 30 | data_train.num_frames=10 31 | data_train.frame_rate=1 32 | data_train.subclips.num_frames=1 33 | data_train.subclips.stride=1 34 | data_eval.num_frames=${data_train.num_frames} 35 | data_eval.frame_rate=${data_train.frame_rate} 36 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 37 | data_eval.subclips.stride=${data_train.subclips.stride} 38 | data_train.mean=[0.5, 0.5, 0.5] 39 | data_train.std=[0.5, 0.5, 0.5] 40 | data_eval.mean=${data_train.mean} 41 | data_eval.std=${data_train.std} 42 | data_eval.eval_num_crops=3 43 | data_eval.eval_flip_crops=true 44 | 45 | opt/optimizer=sgd 46 | opt/scheduler=cosine 47 | opt.warmup.num_epochs=20 48 | opt.scheduler.num_epochs=30 49 | 50 | dataset@dataset_train=epic_kitchens100/anticipation_train+val 51 | dataset@dataset_eval=epic_kitchens100/anticipation_test 52 | 53 | dataset_train.sample_strategy=last_clip 54 | dataset_eval.sample_strategy=last_clip 55 | 56 | dataset_train.conv_to_anticipate_fn.tau_a=1 57 | dataset_train.conv_to_anticipate_fn.tau_o=10 58 | dataset_eval.conv_to_anticipate_fn.tau_a=1 59 | dataset_eval.conv_to_anticipate_fn.tau_o=10 60 | 61 | dataset.epic_kitchens100.common.label_type=action 62 | 63 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 64 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 65 | 66 | data_train.scale_h=248-280 67 | data_train.scale_w=-1 68 | data_train.crop_size=224 69 | data_eval.scale_h=248 70 | data_eval.scale_w=-1 71 | data_eval.crop_size=224 72 | 73 | hydra.launcher.nodes=4 74 | hydra.launcher.gpus_per_node=8 75 | -------------------------------------------------------------------------------- /expts/12_egtea_avt.txt: -------------------------------------------------------------------------------- 1 | train.train_one_epoch_fn.loss_wts.feat=1.0 2 | train.train_one_epoch_fn.loss_wts.past_cls_action=0.1 3 | train.init_from_model=[[backbone.model,${cwd}/DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth]] 4 | 5 | train.batch_size=3 6 | eval.batch_size=3 7 | train.num_epochs=10 8 | 9 | model/backbone=avt_b 10 | model.backbone_last_n_modules_to_drop=0 11 | model.backbone_dim=768 12 | model/temporal_aggregator=identity 13 | 
model/future_predictor=avth 14 | model.dropout=0.2 15 | +model.future_predictor.n_head=4 16 | +model.future_predictor.n_layer=2 17 | +model.future_predictor.output_len=1 18 | # +model.future_predictor.avg_last_n=1 19 | +model.future_predictor.inter_dim=2048 20 | +model.future_predictor.return_past_too=true 21 | +model.future_predictor.future_pred_loss={_target_: torch.nn.MSELoss} 22 | +model.future_predictor.future_pred_loss_wt=1.0 23 | +model.future_predictor.avg_last_n=1 24 | model.classifier_on_past=true 25 | 26 | 27 | opt.lr_wd=[[__all__,0.0005,0.000001]] 28 | opt.bias_bn_wd_scale=1.0 29 | opt.optimizer.nesterov=true 30 | 31 | data_train.num_frames=10 32 | data_train.frame_rate=1 33 | data_train.subclips.num_frames=1 34 | data_train.subclips.stride=1 35 | data_eval.num_frames=${data_train.num_frames} 36 | data_eval.frame_rate=${data_train.frame_rate} 37 | data_eval.subclips.num_frames=${data_train.subclips.num_frames} 38 | data_eval.subclips.stride=${data_train.subclips.stride} 39 | data_train.mean=[0.5, 0.5, 0.5] 40 | data_train.std=[0.5, 0.5, 0.5] 41 | data_eval.mean=${data_train.mean} 42 | data_eval.std=${data_train.std} 43 | data_eval.eval_num_crops=3 44 | data_eval.eval_flip_crops=true 45 | 46 | opt/optimizer=sgd 47 | opt/scheduler=cosine 48 | opt.warmup.num_epochs=5 49 | opt.scheduler.num_epochs=5 50 | 51 | dataset@dataset_train=egtea/anticipation_train 52 | dataset@dataset_eval=egtea/anticipation_val 53 | dataset_train.sample_strategy=last_clip 54 | dataset_eval.sample_strategy=last_clip 55 | dataset_train.conv_to_anticipate_fn.tau_a=0.5 56 | dataset_train.conv_to_anticipate_fn.tau_o=10 57 | dataset_eval.conv_to_anticipate_fn.tau_a=0.5 58 | dataset_eval.conv_to_anticipate_fn.tau_o=10 59 | dataset.egtea.common.label_type=action 60 | dataset.egtea.common.split=1 61 | dataset.egtea.common.modality=rgb 62 | 63 | # Remove the RULSTM reader and read from the frames 64 | ~dataset_train.reader_fn 65 | +dataset_train.reader_fn={_target_: datasets.reader_fns.DefaultReader} 66 | ~dataset_eval.reader_fn 67 | +dataset_eval.reader_fn={_target_: datasets.reader_fns.DefaultReader} 68 | 69 | +dataset_train.conv_to_anticipate_fn.drop_style=correct 70 | +dataset_eval.conv_to_anticipate_fn.drop_style=correct 71 | 72 | data_train.scale_h=248-280 73 | data_train.scale_w=-1 74 | data_train.crop_size=224 75 | data_eval.scale_h=248 76 | data_eval.scale_w=-1 77 | data_eval.crop_size=224 78 | 79 | hydra.launcher.nodes=4 80 | hydra.launcher.gpus_per_node=8 81 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /common/sampler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import math 4 | import torch 5 | from torch.utils.data import Sampler 6 | import torch.distributed as dist 7 | import torchvision.datasets.video_utils 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """ 12 | Extension of DistributedSampler, as discussed in 13 | https://github.com/pytorch/pytorch/issues/23430 14 | """ 15 | 16 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False): 17 | if num_replicas is None: 18 | if not dist.is_available(): 19 | raise RuntimeError("Requires distributed package to be available") 20 | num_replicas = dist.get_world_size() 21 | if rank is None: 22 | if not dist.is_available(): 23 | raise RuntimeError("Requires distributed package to be available") 24 | rank = dist.get_rank() 25 | self.dataset = dataset 26 | self.num_replicas = num_replicas 27 | self.rank = rank 28 | self.epoch = 0 29 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 30 | self.total_size = self.num_samples * self.num_replicas 31 | self.shuffle = shuffle 32 | 33 | def __iter__(self): 34 | # deterministically shuffle based on epoch 35 | g = torch.Generator() 36 | g.manual_seed(self.epoch) 37 | if self.shuffle: 38 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 39 | else: 40 | indices = list(range(len(self.dataset))) 41 | 42 | # add extra samples to make it evenly divisible 43 | indices += indices[:(self.total_size - len(indices))] 44 | assert len(indices) == self.total_size 45 | 46 | # subsample 47 | indices = indices[self.rank:self.total_size:self.num_replicas] 48 | assert len(indices) == self.num_samples 49 | 50 | if isinstance(self.dataset, Sampler): 51 | orig_indices = list(iter(self.dataset)) 52 | indices = [orig_indices[i] for i in indices] 53 | 54 | return iter(indices) 55 | 56 | def __len__(self): 57 | return self.num_samples 58 | 59 | def set_epoch(self, epoch): 60 | self.epoch = epoch 61 | 62 | 63 | class UniformClipSampler(torch.utils.data.Sampler): 64 | """ 65 | Samples at most `max_video_clips_per_video` clips for each video, equally spaced 66 | Arguments: 67 | video_clips (VideoClips): video clips to sample from 68 | max_clips_per_video (int): maximum number of clips to be sampled per video 69 | """ 70 | def __init__(self, video_clips, max_clips_per_video): 71 | if not isinstance(video_clips, torchvision.datasets.video_utils.VideoClips): 72 | raise TypeError("Expected video_clips to be an instance of VideoClips, " 73 | "got {}".format(type(video_clips))) 74 | self.video_clips = video_clips 75 | self.max_clips_per_video = max_clips_per_video 76 | 77 | def __iter__(self): 78 | idxs = [] 79 | s = 0 80 | # select at most max_clips_per_video for each video, uniformly spaced 81 | for c in self.video_clips.clips: 82 | length = len(c) 83 | step = max(length // self.max_clips_per_video, 1) 84 | sampled = torch.arange(length)[::step] + s 85 | s += length 86 | idxs.append(sampled) 87 | idxs 
= torch.cat(idxs).tolist() 88 | return iter(idxs) 89 | 90 | def __len__(self): 91 | return sum(min(len(c), self.max_clips_per_video) for c in self.video_clips.clips) 92 | 93 | 94 | class RandomClipSampler(torch.utils.data.Sampler): 95 | """ 96 | Samples at most `max_video_clips_per_video` clips for each video randomly 97 | 98 | Arguments: 99 | video_clips (VideoClips): video clips to sample from 100 | max_clips_per_video (int): maximum number of clips to be sampled per video 101 | """ 102 | def __init__(self, video_clips, max_clips_per_video): 103 | if not isinstance(video_clips, torchvision.datasets.video_utils.VideoClips): 104 | raise TypeError("Expected video_clips to be an instance of VideoClips, " 105 | "got {}".format(type(video_clips))) 106 | self.video_clips = video_clips 107 | self.max_clips_per_video = max_clips_per_video 108 | 109 | def __iter__(self): 110 | idxs = [] 111 | s = 0 112 | # select at most max_clips_per_video for each video, randomly 113 | for c in self.video_clips.clips: 114 | length = len(c) 115 | size = min(length, self.max_clips_per_video) 116 | sampled = torch.randperm(length)[:size] + s 117 | s += length 118 | idxs.append(sampled) 119 | idxs = torch.cat(idxs) 120 | # shuffle all clips randomly 121 | perm = torch.randperm(len(idxs)) 122 | idxs = idxs[perm].tolist() 123 | return iter(idxs) 124 | 125 | def __len__(self): 126 | return sum(min(len(c), self.max_clips_per_video) for c in self.video_clips.clips) 127 | -------------------------------------------------------------------------------- /common/scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from typing import Sequence 4 | 5 | import torch 6 | from bisect import bisect_right 7 | 8 | 9 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 10 | def __init__( 11 | self, 12 | optimizer: torch.optim.Optimizer, 13 | milestone_epochs: Sequence[int], 14 | gamma: float = 0.1, 15 | warmup_factor: float = 1.0 / 3, 16 | warmup_epochs: int = 5, 17 | warmup_method: str = 'linear', 18 | last_epoch: int = -1, 19 | iters_per_epoch: int = None, # Must be set by calling code 20 | world_size: int = None, 21 | ): 22 | del world_size 23 | if not milestone_epochs == sorted(milestone_epochs): 24 | raise ValueError( 25 | "Milestones should be a list of" 26 | " increasing integers. 
Got {}", 27 | milestone_epochs, 28 | ) 29 | 30 | if warmup_method not in ("constant", "linear"): 31 | raise ValueError( 32 | "Only 'constant' or 'linear' warmup_method accepted" 33 | "got {}".format(warmup_method)) 34 | self.milestones = [iters_per_epoch * m for m in milestone_epochs] 35 | self.gamma = gamma 36 | self.warmup_factor = warmup_factor 37 | self.warmup_iters = max(warmup_epochs * iters_per_epoch, 1) 38 | 39 | self.warmup_method = warmup_method 40 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 41 | 42 | def get_lr(self): 43 | warmup_factor = 1 44 | if self.last_epoch < self.warmup_iters: 45 | if self.warmup_method == "constant": 46 | warmup_factor = self.warmup_factor 47 | elif self.warmup_method == "linear": 48 | alpha = float(self.last_epoch) / self.warmup_iters 49 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 50 | return [ 51 | base_lr * warmup_factor * 52 | self.gamma**bisect_right(self.milestones, self.last_epoch) 53 | for base_lr in self.base_lrs 54 | ] 55 | 56 | 57 | class CosineLR(torch.optim.lr_scheduler.CosineAnnealingLR): 58 | def __init__(self, 59 | optimizer, 60 | num_epochs, 61 | iters_per_epoch=None, 62 | world_size=None, 63 | **kwargs): 64 | kwargs['eta_min'] *= world_size 65 | super().__init__(optimizer, 66 | T_max=num_epochs * iters_per_epoch, 67 | **kwargs) 68 | 69 | def get_lr(self, *args, **kwargs): 70 | if self.last_epoch < self.T_max: 71 | return super().get_lr(*args, **kwargs) 72 | else: 73 | # Adding this if I train the model longer than the T_max set in 74 | # this. Happens when I sweep over different amounts of warmup. 75 | return [0.0 for _ in self.optimizer.param_groups] 76 | 77 | 78 | class ReduceLROnPlateau(torch.optim.lr_scheduler.ReduceLROnPlateau): 79 | def __init__(self, 80 | optimizer, 81 | iters_per_epoch=None, 82 | world_size=None, 83 | **kwargs): 84 | del iters_per_epoch, world_size 85 | super().__init__(optimizer, **kwargs) 86 | 87 | 88 | class Warmup(torch.optim.lr_scheduler._LRScheduler): 89 | """Wrap the scheduler for warmup before it kicks in.""" 90 | def __init__( 91 | self, 92 | optimizer: torch.optim.Optimizer, 93 | scheduler: torch.optim.lr_scheduler._LRScheduler, 94 | init_lr_ratio: float = 0.0, 95 | num_epochs: int = 5, 96 | last_epoch: int = -1, 97 | iters_per_epoch: int = None, # Must be set by calling code 98 | world_size: int = None, 99 | ): 100 | """ 101 | Args: 102 | init_lr_ratio (float in [0, 1]): Ratio of the original LR to start 103 | from. If 0.1, it will start from 0.1 of the original LRs and go 104 | upto 1.0 of the original LRs in the epochs. By def start from 105 | 0 up. 106 | num_epochs (int): Num of epochs to take to warmup. 107 | last_epoch (int): Which was the last epoch to init from (not really 108 | used anymore since we store the state_dict when loading 109 | scheduler from disk.) 110 | """ 111 | del world_size 112 | self.base_scheduler = scheduler 113 | self.warmup_iters = max(num_epochs * iters_per_epoch, 1) 114 | if self.warmup_iters > 1: 115 | self.init_lr_ratio = init_lr_ratio 116 | else: 117 | self.init_lr_ratio = 1.0 # Don't go from 0 to 1 in 1 iteration 118 | super().__init__(optimizer, last_epoch) 119 | 120 | def get_lr(self): 121 | # Epoch is iters for me, since I step after each iteration 122 | # (not after each epoch) 123 | # Based on logic in step, this should only be called for the warmup 124 | # iters. 
After that it should go to the base scheduler 125 | assert self.last_epoch < self.warmup_iters # since it increments 126 | return [ 127 | el * (self.init_lr_ratio + (1 - self.init_lr_ratio) * 128 | (float(self.last_epoch) / self.warmup_iters)) 129 | for el in self.base_lrs 130 | ] 131 | 132 | def step(self, *args, **kwargs): 133 | if self.last_epoch < (self.warmup_iters - 1): 134 | super().step(*args, **kwargs) 135 | else: 136 | self.base_scheduler.step(*args, **kwargs) 137 | 138 | def state_dict(self): 139 | """Returns the state of the scheduler as a :class:`dict`. 140 | 141 | It contains an entry for every variable in self.__dict__ which 142 | is not the optimizer. 143 | """ 144 | base_sched_dict = self.base_scheduler.state_dict() 145 | other_stuff = { 146 | key: value 147 | for key, value in self.__dict__.items() if key not in [ 148 | 'base_scheduler', 'optimizer'] 149 | } 150 | return {'base_sched_dict': base_sched_dict, 'other_stuff': other_stuff} 151 | 152 | def load_state_dict(self, state_dict): 153 | """Loads the schedulers state. 154 | 155 | Arguments: 156 | state_dict (dict): scheduler state. Should be an object returned 157 | from a call to :meth:`state_dict`. 158 | """ 159 | self.base_scheduler.__dict__.update(state_dict['base_sched_dict']) 160 | self.__dict__.update(state_dict['other_stuff']) 161 | -------------------------------------------------------------------------------- /docs/MODELS.md: -------------------------------------------------------------------------------- 1 | 2 | ## EPIC-Kitchens-100 Test/challenge submission 3 | 4 | Any of the models can be trained/tested on train+val/test by changing the 5 | `dataset@dataset_train` and `dataset@dataset_eval` fields in the configs. 6 | Here we provide the configs that were used for the challenge submission. 
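For example (the override names below are copied from the provided `expts/01_ek100_avt_test_trainval.txt`, shown only to illustrate the pattern), switching an EK100 experiment to train on train+val and evaluate on the test split amounts to setting:

```
dataset@dataset_train=epic_kitchens100/anticipation_train+val
dataset@dataset_eval=epic_kitchens100/anticipation_test
```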
7 | 8 | | Backbone | Head | Train data | Config | Model | 9 | |----------|------|--------|-------|-------| 10 | | TSN (RGB) | RULSTM | train | `expts/05_ek100_rustm_test_testonly.txt` | [link](https://iplab.dmi.unict.it/sharing/rulstm/ek100_models/RULSTM-anticipation_0.25_6_8_rgb_mt5r_best.pth.tar) | 11 | | TSN (RGB) | AVT-h | train | `expts/02_ek100_avt_tsn_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/02_ek100_avt_tsn.txt/0/checkpoint.pth) | 12 | | TSN (RGB) | AVT-h | train + val | `expts/02_ek100_avt_tsn_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/02_ek100_avt_tsn_test_trainval.txt/0/checkpoint.pth) | 13 | | irCSN-152 (IG65M) | AVT-h | train | `expts/04_ek100_avt_ig65m_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/04_ek100_avt_ig65m.txt/0/checkpoint.pth) | 14 | | irCSN-152 (IG65M) | AVT-h | train + val | `expts/04_ek100_avt_ig65m_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/04_ek100_avt_ig65m_test_trainval.txt/0/checkpoint.pth) | 15 | | AVT-b (RGB) | AVT-h | train | `expts/01_ek100_avt_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/01_ek100_avt.txt/0/checkpoint.pth) | 16 | | AVT-b (RGB) | AVT-h | train + val | `expts/01_ek100_avt_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/01_ek100_avt_test_trainval.txt/0/checkpoint.pth) | 17 | | TSN (Flow) | AVT-h | train | `expts/06_ek100_avt_tsnflow_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/06_ek100_avt_tsnflow.txt/0/checkpoint.pth) | 18 | | TSN (Flow) | AVT-h | train + val | `expts/06_ek100_avt_tsnflow_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/06_ek100_avt_tsnflow_test_trainval.txt/0/checkpoint.pth) | 19 | | TSN (Obj) | AVT-h | train + val | `expts/03_ek100_avt_tsn_obj_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/03_ek100_avt_tsn_obj_test_trainval.txt/0/checkpoint.pth) | 20 | | AVT-b (RGB, longer) | AVT-h | train | `expts/07_ek100_avt_longer_test_testonly.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/07_ek100_avt_longer.txt/0/checkpoint.pth) | 21 | | AVT-b (RGB, longer) | AVT-h | train + val | `expts/07_ek100_avt_longer_test_trainval.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/07_ek100_avt_longer_test_trainval.txt/0/checkpoint.pth) | 22 | 23 | 24 | 25 | The predictions from all the above models were late fused and submitted 26 | for evaluation using the following script: 27 | 28 | 29 | ```python 30 | from notebooks.utils import * 31 | CFG_FILES = [ 32 | # RULSTM 33 | ('expts/05_ek100_rustm_test_testonly.txt', 0), 34 | # TSN + AVT-h (train and train+val models) 35 | ('expts/02_ek100_avt_tsn_test_testonly.txt', 0), 36 | ('expts/02_ek100_avt_tsn_test_trainval.txt', 0), 37 | # irCSN152/IG65M + AVT-h 38 | ('expts/04_ek100_avt_ig65m_test_testonly.txt', 0), 39 | ('expts/04_ek100_avt_ig65m_test_trainval.txt', 0), 40 | # AVT 41 | ('expts/01_ek100_avt_test_testonly.txt', 0), 42 | ('expts/01_ek100_avt_test_trainval.txt', 0), 43 | # Flow, obj AVT 44 | ('expts/06_ek100_avt_tsnflow_test_testonly.txt', 0), 45 | ('expts/06_ek100_avt_tsnflow_test_trainval.txt', 0), 46 | ('expts/03_ek100_avt_tsn_obj_test_trainval.txt', 0), 47 | # Longer AVT 48 | ('expts/07_ek100_avt_longer_test_testonly.txt', 0), 49 | ('expts/07_ek100_avt_longer_test_trainval.txt', 0), 50 | 51 | ] 52 | WTS = [1.0, # RULSTM 53 | # TSN + 
AVT-h 54 | 1.0, 1.0, 55 | # irCSN152/IG65M + AVT-h 56 | 1.0, 1.0, 57 | # AVT 58 | 0.5, 0.5, 59 | # Flow, obj AVT 60 | 0.5, 0.5, 0.5, 61 | # Longer AVT 62 | 1.5, 1.5] 63 | SLS = [2, 4, 4] 64 | 65 | package_results_for_submission_ek100(CFG_FILES, WTS, SLS) 66 | ``` 67 | 68 | It should obtain 16.74 on the challenge leaderboard. We also provide our 69 | final submission file [here](https://dl.fbaipublicfiles.com/avt/challenge_submissions/ek100.zip). 70 | 71 | ## EPIC-Kitchens-55 72 | 73 | | Backbone | Head | Top-1 | Top-5 | Config (for top-1/5) | Model (for top-1/5) | AR5 | Config (for AR5) | Model (for AR5) | 74 | |----------|------|------|--------|--------|-----|-----|-----|----| 75 | | TSN (RGB) | AVT-h | 13.1 | 28.1 | `expts/08_ek55_avt_tsn.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/08_ek55_avt_tsn.txt/0/checkpoint.pth)| 13.5 | `expts/08_ek55_avt_tsn_forAR.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/08_ek55_avt_tsn_forAR.txt/0/checkpoint.pth) | 76 | | AVT-b | AVT-h | 12.5 | 30.1 | `expts/09_ek55_avt.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/09_ek55_avt.txt/0/checkpoint.pth)| 13.6 | `expts/09_ek55_avt_forAR.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/09_ek55_avt_forAR.txt/0/checkpoint.pth) | 77 | | irCSN-152 (IG65M) | AVT-h | 14.4 | 31.7 | `expts/10_ek55_avt_ig65m.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/10_ek55_avt_ig65m.txt/0/checkpoint.pth)| 13.2 | `expts/10_ek55_avt_ig65m_forAR.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/10_ek55_avt_ig65m_forAR.txt/0/checkpoint.pth) | 78 | 79 | Our final test submission was generated by late-fusing AVT model with predictions from [prior work](https://arxiv.org/abs/2006.00830), and is available [here](https://dl.fbaipublicfiles.com/avt/challenge_submissions/ek55.zip). 80 | 81 | ## EGTEA Gaze+ 82 | 83 | | Backbone | Head | Top-1 (Act) | Class-mean (Act) | Config | Model | 84 | |----------|------|-------------|------------------|-------|-------| 85 | | TSN (RGB) | AVT-h | 39.8 | 28.3 | `expts/11_egtea_avt_tsn.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/11_egtea_avt_tsn.txt/0/checkpoint.pth) | 86 | | AVT-b | AVT-h | 43.0 | 35.2 | `expts/12_egtea_avt.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/12_egtea_avt.txt/0/checkpoint.pth) | 87 | 88 | 89 | ## 50-Salads 90 | 91 | | Backbone | Head | Top-1 (Act) | Config | Model | 92 | |----------|------|-------------|-------|-------| 93 | | AVT-b | AVT-h | 48.0 | `expts/13_50s_avt.txt` | [fold 1](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/0/checkpoint.pth) [fold 2](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/1/checkpoint.pth) [fold 3](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/2/checkpoint.pth) [fold 4](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/3/checkpoint.pth) [fold 5](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/13_50s_avt.txt/4/checkpoint.pth) | 94 | -------------------------------------------------------------------------------- /common/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
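# Illustrative usage sketch for the accuracy() helper defined below (shapes are
# hypothetical; leading dims are flattened internally, so 3D+ predictions work):
#   logits = torch.randn(4, 7, 100)          # (*, K) predictions, e.g. B x T x num_classes
#   labels = torch.randint(0, 100, (4, 7))   # (*,) integer targets
#   top1, top5 = accuracy(logits, labels, topk=(1, 5))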
2 | 3 | from __future__ import print_function 4 | from typing import List, Dict 5 | 6 | import errno 7 | import os 8 | from pathlib import Path 9 | import logging 10 | import submitit 11 | import cv2 12 | 13 | import torch 14 | import torch.distributed as dist 15 | 16 | 17 | def accuracy(output, target, topk=(1, )): 18 | """Computes the accuracy over the k top predictions 19 | for the specified values of k 20 | Args: 21 | output (*, K) predictions 22 | target (*, ) targets 23 | """ 24 | if torch.all(target < 0): 25 | return [ 26 | torch.zeros([], device=output.device) for _ in range(len(topk)) 27 | ] 28 | with torch.no_grad(): 29 | # flatten the initial dimensions, to deal with 3D+ input 30 | output = output.flatten(0, -2) 31 | target = target.flatten() 32 | # Now compute the accuracy 33 | maxk = max(topk) 34 | batch_size = target.size(0) 35 | 36 | _, pred = output.topk(maxk, 1, True, True) 37 | pred = pred.t() 38 | correct = pred.eq(target[None]) 39 | 40 | res = [] 41 | for k in topk: 42 | correct_k = correct[:k].flatten().sum(dtype=torch.float32) 43 | res.append(correct_k * (100.0 / batch_size)) 44 | return res 45 | 46 | 47 | def mkdir(path): 48 | try: 49 | os.makedirs(path) 50 | except OSError as e: 51 | if e.errno != errno.EEXIST: 52 | raise 53 | 54 | 55 | def setup_for_distributed(is_master, logger): 56 | """ 57 | This function disables printing when not in master process 58 | """ 59 | import builtins as __builtin__ 60 | builtin_print = __builtin__.print 61 | 62 | def print(*args, **kwargs): 63 | force = kwargs.pop('force', False) 64 | if is_master or force: 65 | builtin_print(*args, **kwargs) 66 | 67 | __builtin__.print = print 68 | if not is_master: 69 | # Don't print anything except FATAL 70 | logger.setLevel(logging.ERROR) 71 | logging.basicConfig(level=logging.ERROR) 72 | else: 73 | logger.setLevel(logging.INFO) 74 | logging.basicConfig(level=logging.INFO) 75 | 76 | 77 | def is_dist_avail_and_initialized(): 78 | if not dist.is_available(): 79 | return False 80 | if not dist.is_initialized(): 81 | return False 82 | return True 83 | 84 | 85 | def get_world_size(): 86 | if not is_dist_avail_and_initialized(): 87 | return 1 88 | return dist.get_world_size() 89 | 90 | 91 | def get_rank(): 92 | if not is_dist_avail_and_initialized(): 93 | return 0 94 | return dist.get_rank() 95 | 96 | 97 | def is_main_process(): 98 | return get_rank() == 0 99 | 100 | 101 | def save_on_master(*args, **kwargs): 102 | if is_main_process(): 103 | torch.save(*args, **kwargs) 104 | 105 | 106 | def init_distributed_mode(logger, dist_backend='nccl'): 107 | dist_info = dict( 108 | distributed=False, 109 | rank=0, 110 | world_size=1, 111 | gpu=0, 112 | dist_backend=dist_backend, 113 | dist_url=get_init_file(None).as_uri(), 114 | ) 115 | # If launched using submitit, get the job_env and set using those 116 | try: 117 | job_env = submitit.JobEnvironment() 118 | except RuntimeError: 119 | job_env = None 120 | if job_env is not None: 121 | dist_info['rank'] = job_env.global_rank 122 | dist_info['world_size'] = job_env.num_tasks 123 | dist_info['gpu'] = job_env.local_rank 124 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 125 | dist_info['rank'] = int(os.environ["RANK"]) 126 | dist_info['world_size'] = int(os.environ['WORLD_SIZE']) 127 | dist_info['gpu'] = int(os.environ['LOCAL_RANK']) 128 | elif 'SLURM_PROCID' in os.environ: 129 | dist_info['rank'] = int(os.environ['SLURM_PROCID']) 130 | dist_info['gpu'] = dist_info['rank'] % torch.cuda.device_count() 131 | elif 'rank' in dist_info: 132 | pass 133 | else: 
134 | print('Not using distributed mode') 135 | dist_info['distributed'] = False 136 | return dist_info 137 | 138 | dist_info['distributed'] = True 139 | 140 | torch.cuda.set_device(dist_info['gpu']) 141 | dist_info['dist_backend'] = dist_backend 142 | print('| distributed init (rank {}): {}'.format(dist_info['rank'], 143 | dist_info['dist_url']), 144 | flush=True) 145 | torch.distributed.init_process_group(backend=dist_info['dist_backend'], 146 | init_method=dist_info['dist_url'], 147 | world_size=dist_info['world_size'], 148 | rank=dist_info['rank']) 149 | setup_for_distributed(dist_info['rank'] == 0, logger) 150 | return dist_info 151 | 152 | 153 | def get_shared_folder(name) -> Path: 154 | # Since using hydra, which figures the out folder 155 | return Path('./').absolute() 156 | 157 | 158 | def get_init_file(name): 159 | # Init file must not exist, but it's parent dir must exist. 160 | os.makedirs(str(get_shared_folder(name)), exist_ok=True) 161 | init_file = get_shared_folder(name) / 'sync_file_init' 162 | return init_file 163 | 164 | 165 | def gather_tensors_from_all(tensor: torch.Tensor) -> List[torch.Tensor]: 166 | """ 167 | Wrapper over torch.distributed.all_gather for performing 168 | 'gather' of 'tensor' over all processes in both distributed / 169 | non-distributed scenarios. 170 | """ 171 | if tensor.ndim == 0: 172 | # 0 dim tensors cannot be gathered. so unsqueeze 173 | tensor = tensor.unsqueeze(0) 174 | 175 | if is_dist_avail_and_initialized(): 176 | gathered_tensors = [ 177 | torch.zeros_like(tensor) 178 | for _ in range(torch.distributed.get_world_size()) 179 | ] 180 | torch.distributed.all_gather(gathered_tensors, tensor) 181 | else: 182 | gathered_tensors = [tensor] 183 | 184 | return gathered_tensors 185 | 186 | 187 | def gather_from_all(tensor: torch.Tensor) -> torch.Tensor: 188 | gathered_tensors = gather_tensors_from_all(tensor) 189 | gathered_tensor = torch.cat(gathered_tensors, 0) 190 | return gathered_tensor 191 | 192 | 193 | def get_video_info(video_path: Path, props: List[str]) -> Dict[str, float]: 194 | """ 195 | Given the video, return the properties asked for 196 | """ 197 | output = {} 198 | cam = cv2.VideoCapture(str(video_path)) 199 | if 'fps' in props: 200 | output['fps'] = cam.get(cv2.CAP_PROP_FPS) 201 | if 'len' in props: 202 | fps = cam.get(cv2.CAP_PROP_FPS) 203 | if fps <= 0: 204 | output['len'] = 0 205 | else: 206 | output['len'] = (cam.get(cv2.CAP_PROP_FRAME_COUNT) / fps) 207 | cam.release() 208 | return output 209 | -------------------------------------------------------------------------------- /loss_fn/simclr_infonce.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
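# Illustrative usage sketch for the losses defined below (shapes and values are
# hypothetical; the B x K x C target form is the MIL-NCE case, with K positive
# targets per batch element):
#   loss_fn = DistributedSimclrInfoNCELoss(temperature=0.1)
#   output = torch.randn(8, 128)      # B x C predicted features
#   target = torch.randn(8, 4, 128)   # B x K x C positive target features
#   loss = loss_fn(output, target)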
2 | 3 | """The SimCLR InfoNCE loss.""" 4 | import torch 5 | import torch.nn as nn 6 | 7 | from common import utils 8 | 9 | LARGE_NUM = 1e9 10 | 11 | 12 | class MILCrossEntropyLoss(nn.Module): 13 | def __init__(self, mil_type='sum', reduction='mean'): 14 | super().__init__() 15 | self.mil_type = mil_type 16 | self.reduction = reduction 17 | 18 | def forward(self, *args, **kwargs): 19 | if self.mil_type == 'sum': 20 | return self.forward_sum(*args, **kwargs) 21 | elif self.mil_type == 'max': 22 | return self.forward_max(*args, **kwargs) 23 | else: 24 | raise NotImplementedError(f'Unknown type {self.mil_type}') 25 | 26 | def forward_sum(self, pred, labels_onehot): 27 | """ 28 | Args: 29 | pred: BxC is the output 30 | labels: BxC is 1s for positive, and 0s for negatives 31 | Based on https://github.com/antoine77340/MIL-NCE_HowTo100M/blob/master/loss.py 32 | Or the MIL-NCE paper Eq 1 (https://arxiv.org/pdf/1912.06430.pdf) 33 | """ 34 | assert pred.shape == labels_onehot.shape 35 | # In the MILNCE code there is a sum, followed by logsumexp. I think 36 | # using the labels to select the positive samples and then doing 37 | # logsumexp will have the same effect. 38 | pos_pred = pred[labels_onehot.bool()].reshape((pred.size(0), -1)) 39 | numerator = torch.logsumexp(pos_pred, dim=1) 40 | denominotor = torch.logsumexp(pred, dim=1) 41 | loss = denominotor - numerator 42 | if self.reduction == 'mean': 43 | loss = torch.mean(loss) 44 | elif self.reduction == 'none': 45 | pass 46 | else: 47 | raise NotImplementedError(f'Unknown reduction {self.reduction}') 48 | return loss 49 | 50 | def forward_max(self, pred, labels_onehot): 51 | """ 52 | Args: 53 | pred: BxC is the output 54 | labels: BxC is 1s for positive, and 0s for negatives 55 | Based on Appendix A (https://arxiv.org/pdf/1912.06430.pdf) 56 | """ 57 | assert pred.shape == labels_onehot.shape 58 | # Do max before, and then logsumexp. Works since exp is monotonic fn 59 | # so the max with exp or without will be the same. 60 | pos_pred = pred[labels_onehot.bool()].reshape((pred.size(0), -1)) 61 | pos_pred = torch.max(pos_pred, dim=1, keepdim=True)[0] 62 | neg_pred = pred[~labels_onehot.bool()].reshape((pred.size(0), -1)) 63 | numerator = torch.logsumexp(pos_pred, dim=1) 64 | denominotor = torch.logsumexp(torch.cat([pos_pred, neg_pred], dim=1), 65 | dim=1) 66 | return torch.mean(denominotor - numerator) 67 | 68 | 69 | class DistributedSimclrInfoNCELoss(nn.Module): 70 | def __init__(self, 71 | temperature: float = 0.1, 72 | target_to_output_loss=True, 73 | mil_type='sum', 74 | reduction='mean'): 75 | super().__init__() 76 | self.temperature = temperature 77 | self.criterion = MILCrossEntropyLoss(mil_type, reduction=reduction) 78 | # This defines whether the reverse part of the loss, from target to 79 | # the output features, is incurred. 80 | self.target_to_output_loss = target_to_output_loss 81 | 82 | def forward(self, output: torch.Tensor, 83 | target: torch.Tensor) -> torch.Tensor: 84 | """ 85 | Args: 86 | output: BxC 87 | target: BxC or BxKxC <-- In case of MIL NCE, K is the number of 88 | positives for each batch element. 
89 | Following https://github.com/google-research/simclr/blob/master/objective.py 90 | """ 91 | # Normalize first, before the gather -- so that all the features I get 92 | # are normalized 93 | output = nn.functional.normalize(output, dim=-1, p=2) 94 | target = nn.functional.normalize(target, dim=-1, p=2) 95 | # To be consistent with MIL-NCE input, convert K to batch dim, 96 | # and repeat the output to same value for each repeated target 97 | elt_for_back_loss = 0 98 | if target.ndim == 3: 99 | num_matching = target.size(1) 100 | target_flat = target.reshape((-1, target.size(-1))) 101 | # Keep the first one for the back loss 102 | target = target[:, elt_for_back_loss] 103 | else: 104 | num_matching = 1 105 | target_flat = target 106 | # Gather all the outputs and all the targets 107 | output_all = self.gather_embeddings(output) 108 | target_flat_all = self.gather_embeddings(target_flat) 109 | batch_size = output.size(0) 110 | replica_id = utils.get_rank() 111 | # -> (B, B_full * num_matching) 112 | labels_onehot = torch.zeros((batch_size, output_all.size(0)), 113 | dtype=output.dtype, 114 | device=output.device) 115 | extra_zeros = torch.zeros((batch_size, output_all.size(0)), 116 | dtype=output.dtype, 117 | device=output.device) 118 | ones_diag = torch.eye(batch_size, 119 | batch_size, 120 | dtype=output.dtype, 121 | device=output.device) 122 | labels_onehot[:, replica_id * batch_size:(replica_id + 1) * 123 | batch_size] = ones_diag 124 | labels_onehot_interleaved = labels_onehot.repeat_interleave( 125 | num_matching, dim=1) 126 | # (B, C) * (B_full, C) -> (B, B_full) 127 | logits_aa = torch.mm(output, output_all.t() / self.temperature) 128 | # (B, C) * (B_full * num_matching, C) -> (B, B_full * num_matching) 129 | logits_ab = torch.mm(output, target_flat_all.t() / self.temperature) 130 | logits_aa = logits_aa - labels_onehot * LARGE_NUM 131 | loss = self.criterion( 132 | torch.cat([logits_ab, logits_aa], 1), 133 | torch.cat([labels_onehot_interleaved, extra_zeros], 1)) 134 | if self.target_to_output_loss: 135 | # Keep only the first prediction, since that is what I will incur 136 | # reverse loss with 137 | target_all = target_flat_all[elt_for_back_loss::num_matching] 138 | logits_bb = torch.mm(target, target_all.t() / self.temperature) 139 | logits_bb = logits_bb - labels_onehot * LARGE_NUM 140 | logits_ba = torch.mm(target, output_all.t() / self.temperature) 141 | loss = loss + self.criterion( 142 | torch.cat([logits_ba, logits_bb], 1), 143 | torch.cat([labels_onehot, extra_zeros], 1)) 144 | return loss 145 | 146 | def gather_embeddings(self, embedding: torch.Tensor) -> torch.Tensor: 147 | """ 148 | Do a gather over all embeddings, so we can compute the loss. 149 | Final shape is like: (batch_size * num_gpus) x embedding_dim 150 | """ 151 | if torch.distributed.is_available( 152 | ) and torch.distributed.is_initialized(): 153 | # gather all embeddings. 154 | embedding_gathered = utils.gather_from_all(embedding) 155 | else: 156 | embedding_gathered = embedding 157 | return embedding_gathered 158 | 159 | 160 | class MultiDimDistributedSimclrInfoNCELoss(DistributedSimclrInfoNCELoss): 161 | """ 162 | Fold in the initial dimensions and run simple NCE. 
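    For example (shapes illustrative): an output and target of shape (B, T, C)
    are flattened to (B*T, C) before the SimCLR/MIL-NCE loss above is applied.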
163 | """ 164 | def forward(self, output: torch.Tensor, target: torch.Tensor, *args, 165 | **kwargs) -> torch.Tensor: 166 | return super().forward(output.flatten(0, -2), target.flatten(0, -2), 167 | *args, **kwargs) 168 | -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | expt_name: "default" 2 | # Just set to multiple values to run the same config multiple times. Just there 3 | # to take into account random variation 4 | run_id: 0 5 | seed: 42 6 | # A common place, so can be overriden in notebooks, which don't support ":" 7 | # interpolation 8 | cwd: ${hydra:runtime.cwd} 9 | 10 | sync_bn: false 11 | 12 | test_only: false 13 | 14 | # Set this to force data parallel training. Num nodes should be 1. 15 | data_parallel: false 16 | 17 | dist_backend: nccl 18 | 19 | pytorch: 20 | # This only works with the compiled version of torchvision, and might have 21 | # some memory issues? 22 | video_backend: "video_reader" 23 | 24 | train: 25 | fn: 'train' # Which file in func/ directory to use for training 26 | batch_size: 16 27 | # This can have structure as follows: 28 | # :: <>... 29 | # By default also supports just the 30 | # But the more complex structure can be used to init separate parts of model 31 | # using diff checkpoints. By default if only 2 elements are specified with :, 32 | # module_name_in_ckpt is assumed to be null 33 | init_from_model: null 34 | # Total epochs to train for 35 | num_epochs: 45 36 | # Evaluate within training, every these many epochs 37 | eval_freq: 1 38 | # Shuffle data at train time 39 | shuffle_data: true 40 | # Store the best performing checkpoint 41 | store_best: false 42 | train_one_epoch_fn: 43 | _target_: func.train.train_one_epoch 44 | print_freq: 10 45 | print_large_freq: 1000 # How often to write images/videos summary 46 | grad_clip_params: ${opt.grad_clip} # DO NOT CHANGE HERE, change in opt 47 | # Set the following to store models every so many epochs. By default 48 | # will only store the last checkpoint and the best checkpoint. 49 | save_freq: null 50 | # Num of minutes to save at, same as above -- must set save_intermediate 51 | # true to save like this 52 | save_freq_min: 60 # At least save every 60 mins 53 | # Whether or not to save the intermediate models 54 | save_intermediates: false 55 | loss_wts: 56 | cls_action: 1.0 57 | cls_verb: 1.0 58 | cls_noun: 1.0 59 | pred: 1.0 60 | feat: 1.0 61 | # Past predictions, default 0 to be backward compatible 62 | past_cls_action: 0.0 63 | past_cls_verb: 0.0 64 | past_cls_noun: 0.0 65 | 66 | 67 | eval: 68 | batch_size: null # Will automatically figure out from train if null 69 | eval_fn: 70 | _target_: func.train.evaluate 71 | store: true 72 | store_endpoint: logits 73 | only_run_featext: false 74 | 75 | model: 76 | backbone_dim: 2048 77 | # Use the backbone dim if null. Don't use the interpolation since the 78 | # backbone dim might be updated in the code 79 | intermediate_featdim: null 80 | backbone_last_n_modules_to_drop: 2 # Avg pool and linear layer 81 | dropout: 0.0 82 | # Set to a number to project the temp_agg and future features to this 83 | # dimension using a MLP before applying the NCE loss. 84 | # Note this is also applied when doing L2 regression loss, so the name is a 85 | # bit of a misnomer. 
86 | project_dim_for_nce: null 87 | # Set to true to also add a regression head -- that is used for dense 88 | # anticipation when predicting the duration of an action 89 | add_regression_head: False 90 | bn: 91 | eps: 0.001 92 | mom: 0.1 93 | # Set this to true if you want to have the same temporal aggregated feat 94 | # dim as from the original backbone (backbone_dim). This will add a linear 95 | # layer to get that. It's useful when training future predictive models, 96 | # with future feat avg as the target. 97 | same_temp_agg_dim: false 98 | # Set this to true to use the class mappings to get the other predictions 99 | # eg, verb/noun from action, instead of adding additional linear layers 100 | # Only applicable when predicting multiple output classes 101 | use_cls_mappings: false 102 | # Apply the classifier on the past predictions too 103 | classifier_on_past: false 104 | 105 | opt: 106 | # Not using an overall LR anymore, since everything is now defined per 107 | # module. 108 | # Use a list format to specify per-layer LRs and WD. The first element is 109 | # module_name ("__all__" => all params), LR and WD. 110 | # Note that if there is any overlap between parameters, those params 111 | # will get updated that many number of times as they appear in the list. 112 | # It WILL NOT take the last options as highest precedence. (TODO future) 113 | # The first term can also be a list, to give it a bunch of modules to set 114 | # the same LR and WD for. 115 | lr_wd: [[__all__, 0.1, 0.0001]] 116 | # Set this to true to also scale the LR by the batch size (normally it will 117 | # be scaled by the #replicas, so the LR is specified per given batch size). 118 | # This allows to further specify a LR per batch element (useful when doing 119 | # sweeps over batch size). 120 | scale_lr_by_bs: false 121 | # Set this to true to only train the last classifier layer. 122 | # Also, will set all BN layers to not compute mean/var at runtime. 123 | classifier_only: false 124 | bias_bn_wd_scale: 1.0 # Scale the WD for bias and BN layers by this amount 125 | grad_clip: 126 | max_norm: null # By default, no clipping 127 | norm_type: 2 128 | warmup: 129 | _target_: common.scheduler.Warmup 130 | init_lr_ratio: 0.0 # Warmup from this ratio of the orig LRs 131 | num_epochs: 0 # Warmup for this many epochs (will take out of total epochs) 132 | 133 | moco: 134 | _target_: moco.moco.builder.MoCo 135 | dim: 128 136 | K: 65536 137 | m: 0.999 138 | T: 0.2 # From moco-v2 139 | mlp: true # From moco-v2 140 | 141 | defaults: 142 | - train_eval_op: basic 143 | - train_eval_op/cls_loss_acc_fn: basic 144 | - train_eval_op/reg_criterion: mse 145 | - opt/optimizer: sgd 146 | - model/backbone: r2plus1d_34 147 | - model/temporal_aggregator: mean 148 | - model/future_predictor: identity 149 | - model/temporal_aggregator_after_future_pred: identity 150 | - model/classifier: linear 151 | - opt/scheduler: warmup_multi_step 152 | # Any keys with dataset_train prefix, like dataset_train2, etc, will all 153 | # be used for training by concatentating all those datasets. So you can 154 | # use multiple datasets in training by adding 155 | # +dataseset_train2=hmdb51/train to the command line config. 156 | # Note that this only works with standard datasets, ConcatDataset can't 157 | # handle overly customized datasets as we use in EpicKitchens 158 | - dataset@dataset_train: epic_kitchens100/anticipation_train 159 | # Any keys with the dataset_eval prefix, will all be evaluated on separately. 
160 | # The postfix will be used to identify which dataset the results are on. 161 | # So, you can use > 1 evaluation datasets that way, by adding it in the 162 | # command line config, like +dataset_eval2=hmdb51/val 163 | - dataset@dataset_eval: epic_kitchens100/anticipation_val 164 | - data@data_train: default 165 | - data@data_eval: default 166 | # Load any common dataset files, that will be used to create other dataset 167 | # elements. 168 | - dataset/epic_kitchens/common 169 | - dataset/epic_kitchens100/common 170 | - dataset/dundee50salads/common 171 | - dataset/dundee50salads/annot_reader_fn: orig 172 | - dataset/egtea/common 173 | # Overrides 174 | - override hydra/launcher: submitit_slurm 175 | - override hydra/job_logging: colorlog 176 | - override hydra/hydra_logging: colorlog 177 | 178 | hydra: 179 | job: 180 | name: "AVT" 181 | launcher: 182 | # All params in https://github.com/facebookresearch/hydra/blob/master/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py 183 | timeout_min: 2880 184 | cpus_per_task: 10 185 | gpus_per_node: 8 186 | tasks_per_node: ${hydra.launcher.gpus_per_node} 187 | # This is the memory requested per node. So all GPUs on a given 188 | # node will share this memory 189 | mem_gb: 450 190 | nodes: 2 191 | # Use these parameters through + options in hydra 192 | # partition: learnfair 193 | # max_num_timeout: 3 194 | # constraint: ${hydra.launcher.gpu_type} # Any, or could say [volta|pascal] 195 | # comment: "" 196 | run: 197 | dir: ./outputs/ # Specified in the launch script 198 | sweep: 199 | dir: ${hydra.run.dir} 200 | # Output sub directory for sweep runs. 201 | subdir: ${hydra.job.num} # ${hydra.job.override_dirname} 202 | -------------------------------------------------------------------------------- /models/temporal_aggregation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | """ 4 | Implementation of the temporal aggregation algorithms. 5 | Input: (B, C, T) 6 | Output: (B, C) 7 | """ 8 | import math 9 | import torch 10 | import torch.nn as nn 11 | import logging 12 | import warnings 13 | 14 | try: 15 | from external.rulstm.RULSTM.models import RULSTM 16 | except ImportError: 17 | RULSTM = object 18 | logging.warning('No RULSTM found.') 19 | 20 | 21 | class Identity(nn.Identity): 22 | def __init__(self, in_features): 23 | super().__init__() 24 | self.in_features = in_features 25 | 26 | def forward(self, *args, **kwargs): 27 | return super().forward(*args, **kwargs), {} 28 | 29 | @property 30 | def output_dim(self): 31 | return self.in_features 32 | 33 | 34 | class Mean(nn.Module): 35 | def __init__(self, in_features): 36 | super().__init__() 37 | self.in_features = in_features 38 | 39 | def forward(self, feats): 40 | """ 41 | feats: B, T, C dimensional input 42 | """ 43 | return torch.mean(feats, dim=1), {} 44 | 45 | @property 46 | def output_dim(self): 47 | return self.in_features 48 | 49 | 50 | class PositionalEncoding(nn.Module): 51 | """For now, just using simple pos encoding from language. 
52 | https://pytorch.org/tutorials/beginner/transformer_tutorial.html 53 | """ 54 | def __init__(self, d_model, dropout=0.1, max_len=5000): 55 | super(PositionalEncoding, self).__init__() 56 | self.dropout = nn.Dropout(p=dropout) 57 | 58 | pe = torch.zeros(max_len, d_model) 59 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 60 | div_term = torch.exp( 61 | torch.arange(0, d_model, 2).float() * 62 | (-math.log(10000.0) / d_model)) 63 | pe[:, 0::2] = torch.sin(position * div_term) 64 | pe[:, 1::2] = torch.cos(position * div_term) 65 | pe = pe.unsqueeze(0).transpose(0, 1) 66 | self.register_buffer('pe', pe) 67 | 68 | def forward(self, x): 69 | x = x + self.pe[:x.size(0), :] 70 | return self.dropout(x) 71 | 72 | 73 | class Transformer(nn.Module): 74 | """ Using a transformer encoder and simple decoder. """ 75 | def __init__(self, 76 | in_features, 77 | inter_rep=512, 78 | nheads=8, 79 | nlayers=6, 80 | agg_style='mean', 81 | cloze_loss_ratio=0.0, 82 | cloze_loss_wt=0.0): 83 | super().__init__() 84 | self.in_features = in_features 85 | self.inter_rep = inter_rep 86 | self.downproject = nn.Linear(in_features, inter_rep) 87 | layer = nn.TransformerEncoderLayer(d_model=inter_rep, nhead=nheads) 88 | # Don't think I'll ever consider longer than 1000 features? 89 | self.pos_encoder = PositionalEncoding(inter_rep, max_len=1000) 90 | self.transformer_encoder = nn.TransformerEncoder( 91 | layer, num_layers=nlayers, norm=nn.LayerNorm(inter_rep)) 92 | self.agg_style = agg_style 93 | self.cloze_loss_ratio = cloze_loss_ratio 94 | self.cloze_loss_wt = cloze_loss_wt 95 | self.cloze_loss_fn = nn.MSELoss(reduction='none') 96 | # The embedding for the [MASK] token 97 | if self.cloze_loss_ratio > 0: 98 | self.extra_embeddings = nn.Embedding(1, in_features) 99 | 100 | def forward(self, feats): 101 | """ 102 | Args: 103 | feats (B, T, C) 104 | Returns: 105 | aggregated features (B, C') 106 | """ 107 | # Convert to the format used by transformer: T, B, C 108 | feats = feats.transpose(0, 1) 109 | kwargs = {} 110 | if self.training and self.cloze_loss_ratio > 0: 111 | # Mask out certain positions, so when doing attention these 112 | # positions will be ignored 113 | key_padding_mask = torch.rand((feats.size(0), feats.size(1)), 114 | device=feats.device) 115 | # Get close_ratio amount as True, so those will be ignored 116 | key_padding_mask = key_padding_mask <= self.cloze_loss_ratio 117 | # Set the features to MASK embedding, for the ones that are masked 118 | key_padding_mask_rep = key_padding_mask.unsqueeze(-1).expand( 119 | -1, -1, feats.size(2)) 120 | # Set the masked elements to 0, and add the MASK embedding 121 | replaced_feats = ( 122 | feats * (~key_padding_mask_rep) + 123 | key_padding_mask_rep * self.extra_embeddings( 124 | torch.tensor([0], dtype=torch.long, 125 | device=feats.device)).unsqueeze(0)) 126 | feats = replaced_feats 127 | # Transpose since the function takes in B, T 128 | kwargs['src_key_padding_mask'] = key_padding_mask.t() 129 | feats = self.pos_encoder(self.downproject(feats)) 130 | feats_encoded = self.transformer_encoder(feats, **kwargs) 131 | aux_losses = {} 132 | if self.training and self.cloze_loss_ratio > 0: 133 | dist = self.cloze_loss_fn(feats_encoded, feats) 134 | dist_masked_elts = self.cloze_loss_wt * torch.mean( 135 | torch.mean(dist, dim=-1) * key_padding_mask) 136 | aux_losses['tx_mlm'] = dist_masked_elts 137 | if self.agg_style == 'mean': 138 | res = torch.mean(feats_encoded, dim=[0]) 139 | elif self.agg_style == 'last': 140 | res = feats_encoded[-1] 141 | 
else: 142 | raise NotImplementedError(f'Unknown agg style {self.agg_style}') 143 | return res, aux_losses 144 | 145 | @property 146 | def output_dim(self): 147 | return self.inter_rep 148 | 149 | 150 | class RULSTMAggregation(RULSTM): 151 | def __init__(self, 152 | in_features: int, 153 | intermediate_featdim: int = 1024, 154 | dropout: float = 0.8, 155 | num_pad_feats: int = 0): 156 | """ 157 | Args: 158 | num_pad_feats (int): Pad the features with zero feats for this 159 | many times on the time axis. This is because the unrolling 160 | LSTM unrolls forward as many times as input, and since original 161 | models were trained for 14 steps unrolling (upto 0.25s 162 | before the action), and I usually test for 11 steps (1s before 163 | action), need to pad 3 times to get the same output when 164 | testing pre-trained models. 165 | """ 166 | super().__init__(1, in_features, intermediate_featdim, dropout) 167 | # Remove the classifier, since the outside code will deal with that 168 | self.classifier = nn.Sequential() 169 | self.output_dim = intermediate_featdim 170 | self.num_pad_feats = num_pad_feats 171 | # Ignore warnings because it UserWarning: RNN module weights are not 172 | # part of single contiguous chunk of memory. This means they need to be 173 | # compacted at every call, possibly greatly increasing memory usage. 174 | # To compact weights again call flatten_parameters(). 175 | # Not sure how to fix this, adding the flatten didn't really fix 176 | # Happens only with DataParallel, not DDP 177 | # Using https://github.com/pytorch/pytorch/issues/24155#issuecomment-604474511 178 | # Just ignoring the warning 179 | warnings.filterwarnings('ignore') 180 | 181 | def forward(self, feats): 182 | """ 183 | Args: 184 | feats (B, T, C) 185 | Returns: 186 | aggregated (B, C) 187 | """ 188 | if self.num_pad_feats > 0: 189 | empty_feats = torch.zeros( 190 | (feats.size(0), self.num_pad_feats, feats.size(-1)), 191 | dtype=feats.dtype, 192 | device=feats.device) 193 | feats = torch.cat([feats, empty_feats], dim=1) 194 | res = super().forward(feats) 195 | # Return output corresponding to the last input frame. Note that in 196 | # original RULSTM they do -4 since they predict 3 steps further into 197 | # the anticipation time, whereas I stop when the anticipation time 198 | # starts here. 199 | # Subtract num_pad_feat as that would mean it predicted further into 200 | # the future 201 | return res[:, -1 - self.num_pad_feats, :], {} 202 | -------------------------------------------------------------------------------- /models/video_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | """ 3 | Model architectures. 
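Includes R(2+1)D and channel-separated (CSN) video backbones, plus frame-level
wrappers (BNInception, TIMM ViT models) that apply a 2D model to every frame.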
4 | """ 5 | import torch.nn as nn 6 | 7 | from torchvision.models.video.resnet import ( 8 | BasicBlock, 9 | Bottleneck, 10 | R2Plus1dStem, 11 | _video_resnet, 12 | ) 13 | from pretrainedmodels import bninception 14 | import timm 15 | 16 | __all__ = [ 17 | 'r2plus1d_34', 18 | 'r2plus1d_152', 19 | 'ir_csn_152', 20 | 'ip_csn_152', 21 | 'ip_csn_50', 22 | 'BNInceptionVideo', 23 | ] 24 | 25 | 26 | class BasicStem_Pool(nn.Sequential): 27 | def __init__(self): 28 | super(BasicStem_Pool, self).__init__( 29 | nn.Conv3d( 30 | 3, 31 | 64, 32 | kernel_size=(3, 7, 7), 33 | stride=(1, 2, 2), 34 | padding=(1, 3, 3), 35 | bias=False, 36 | ), 37 | nn.BatchNorm3d(64), 38 | nn.ReLU(inplace=True), 39 | nn.MaxPool3d(kernel_size=(1, 3, 3), 40 | stride=(1, 2, 2), 41 | padding=(0, 1, 1)), 42 | ) 43 | 44 | 45 | class Conv3DDepthwise(nn.Conv3d): 46 | def __init__(self, 47 | in_planes, 48 | out_planes, 49 | midplanes=None, 50 | stride=1, 51 | padding=1): 52 | 53 | assert in_planes == out_planes 54 | super(Conv3DDepthwise, self).__init__( 55 | in_channels=in_planes, 56 | out_channels=out_planes, 57 | kernel_size=(3, 3, 3), 58 | stride=stride, 59 | padding=padding, 60 | groups=in_planes, 61 | bias=False, 62 | ) 63 | 64 | @staticmethod 65 | def get_downsample_stride(stride): 66 | return (stride, stride, stride) 67 | 68 | 69 | class IPConv3DDepthwise(nn.Sequential): 70 | def __init__(self, in_planes, out_planes, midplanes, stride=1, padding=1): 71 | 72 | assert in_planes == out_planes 73 | super(IPConv3DDepthwise, self).__init__( 74 | nn.Conv3d(in_planes, out_planes, kernel_size=1, bias=False), 75 | nn.BatchNorm3d(out_planes), 76 | # nn.ReLU(inplace=True), 77 | Conv3DDepthwise(out_planes, out_planes, None, stride), 78 | ) 79 | 80 | @staticmethod 81 | def get_downsample_stride(stride): 82 | return (stride, stride, stride) 83 | 84 | 85 | class Conv2Plus1D(nn.Sequential): 86 | def __init__(self, in_planes, out_planes, midplanes, stride=1, padding=1): 87 | 88 | midplanes = (in_planes * out_planes * 3 * 3 * 89 | 3) // (in_planes * 3 * 3 + 3 * out_planes) 90 | super(Conv2Plus1D, self).__init__( 91 | nn.Conv3d( 92 | in_planes, 93 | midplanes, 94 | kernel_size=(1, 3, 3), 95 | stride=(1, stride, stride), 96 | padding=(0, padding, padding), 97 | bias=False, 98 | ), 99 | nn.BatchNorm3d(midplanes), 100 | nn.ReLU(inplace=True), 101 | nn.Conv3d( 102 | midplanes, 103 | out_planes, 104 | kernel_size=(3, 1, 1), 105 | stride=(stride, 1, 1), 106 | padding=(padding, 0, 0), 107 | bias=False, 108 | ), 109 | ) 110 | 111 | @staticmethod 112 | def get_downsample_stride(stride): 113 | return (stride, stride, stride) 114 | 115 | 116 | def _set_bn_params(model, bn_eps=1e-3, bn_mom=0.1): 117 | """ 118 | Set the BN parameters to the defaults: Du's models were trained 119 | with 1e-3 and 0.9 for eps and momentum resp. 
120 | Ref: https://github.com/facebookresearch/VMZ/blob/f4089e2164f67a98bc5bed4f97dc722bdbcd268e/lib/models/r3d_model.py#L208 121 | """ 122 | for module in model.modules(): 123 | if isinstance(module, nn.BatchNorm3d): 124 | module.eps = bn_eps 125 | module.momentum = bn_mom 126 | 127 | 128 | def r2plus1d_34(pretrained=False, 129 | progress=False, 130 | bn_eps=1e-3, 131 | bn_mom=0.1, 132 | **kwargs): 133 | model = _video_resnet("r2plus1d_34", 134 | False, 135 | False, 136 | block=BasicBlock, 137 | conv_makers=[Conv2Plus1D] * 4, 138 | layers=[3, 4, 6, 3], 139 | stem=R2Plus1dStem, 140 | **kwargs) 141 | _set_bn_params(model, bn_eps, bn_mom) 142 | return model 143 | 144 | 145 | def r2plus1d_152(pretrained=False, 146 | progress=False, 147 | bn_eps=1e-3, 148 | bn_mom=0.1, 149 | **kwargs): 150 | model = _video_resnet("r2plus1d_152", 151 | False, 152 | False, 153 | block=Bottleneck, 154 | conv_makers=[Conv2Plus1D] * 4, 155 | layers=[3, 8, 36, 3], 156 | stem=R2Plus1dStem, 157 | **kwargs) 158 | _set_bn_params(model, bn_eps, bn_mom) 159 | return model 160 | 161 | 162 | def ir_csn_152(pretrained=False, 163 | progress=False, 164 | bn_eps=1e-3, 165 | bn_mom=0.1, 166 | **kwargs): 167 | model = _video_resnet("ir_csn_152", 168 | False, 169 | False, 170 | block=Bottleneck, 171 | conv_makers=[Conv3DDepthwise] * 4, 172 | layers=[3, 8, 36, 3], 173 | stem=BasicStem_Pool, 174 | **kwargs) 175 | _set_bn_params(model, bn_eps, bn_mom) 176 | return model 177 | 178 | 179 | def ip_csn_152(pretrained=False, 180 | progress=False, 181 | bn_eps=1e-3, 182 | bn_mom=0.1, 183 | **kwargs): 184 | model = _video_resnet("ip_csn_152", 185 | False, 186 | False, 187 | block=Bottleneck, 188 | conv_makers=[IPConv3DDepthwise] * 4, 189 | layers=[3, 8, 36, 3], 190 | stem=BasicStem_Pool, 191 | **kwargs) 192 | _set_bn_params(model, bn_eps, bn_mom) 193 | return model 194 | 195 | 196 | def ip_csn_50(pretrained=False, 197 | progress=False, 198 | bn_eps=0.3, 199 | bn_mom=0.1, 200 | **kwargs): 201 | model = _video_resnet("ip_csn_50", 202 | False, 203 | False, 204 | block=Bottleneck, 205 | conv_makers=[IPConv3DDepthwise] * 4, 206 | layers=[3, 8, 6, 3], 207 | stem=BasicStem_Pool, 208 | **kwargs) 209 | _set_bn_params(model, bn_eps, bn_mom) 210 | return model 211 | 212 | 213 | def process_each_frame(model, video, *args, **kwargs): 214 | """ 215 | Pass in each frame separately 216 | Args: 217 | video (B, C, T, H, W) 218 | Returns: 219 | feats: (B, C', T, 1, 1) 220 | """ 221 | batch_size = video.size(0) 222 | time_dim = video.size(2) 223 | video_flat = video.transpose(1, 2).flatten(0, 1) 224 | feats_flat = model(video_flat, *args, **kwargs) 225 | return feats_flat.view((batch_size, time_dim) + 226 | feats_flat.shape[1:]).transpose( 227 | 1, 2).unsqueeze(-1).unsqueeze(-1) 228 | 229 | 230 | class FrameLevelModel(nn.Module): 231 | """Runs a frame level model on all the frames.""" 232 | def __init__(self, num_classes: int, model: nn.Module = None): 233 | del num_classes 234 | super().__init__() 235 | self.model = model 236 | 237 | def forward(self, video, *args, **kwargs): 238 | return process_each_frame(self.model, video, *args, **kwargs) 239 | 240 | 241 | class BNInceptionVideo(FrameLevelModel): 242 | def __init__(self, *args, **kwargs): 243 | super().__init__(*args, **kwargs) 244 | self.model = bninception(*args, **kwargs) 245 | self.model.last_linear = nn.Identity() 246 | self.model.global_pool = nn.AdaptiveAvgPool2d(1) 247 | 248 | 249 | class TIMMModel(FrameLevelModel): 250 | def __init__(self, 251 | num_classes, 252 | 
model_type='vit_base_patch16_224', 253 | drop_cls=True): 254 | super().__init__(num_classes) 255 | model = timm.create_model(model_type, 256 | num_classes=0 if drop_cls else num_classes) 257 | self.model = model 258 | -------------------------------------------------------------------------------- /common/log.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | from collections import defaultdict, deque 4 | import datetime 5 | import time 6 | import logging 7 | 8 | import torch 9 | import torch.distributed as dist 10 | 11 | from common.utils import is_dist_avail_and_initialized, is_main_process 12 | __all__ = [ 13 | 'SmoothedValue', 'MetricLogger', 'get_default_loggers', 14 | 'get_default_loggers' 15 | ] 16 | EPS = 0.000001 17 | 18 | 19 | class SmoothedValue(object): 20 | """Track a series of values and provide access to smoothed values over a 21 | window or the global series average. 22 | """ 23 | def __init__(self, window_size=20, fmt=None): 24 | if fmt is None: 25 | fmt = "{median:.4f} ({global_avg:.4f})" 26 | self.deque = deque(maxlen=window_size) 27 | self.total = 0.0 28 | self.count = 0 29 | self.fmt = fmt 30 | self.ws = window_size 31 | 32 | def reset(self): 33 | self.__init__(window_size=self.ws, fmt=self.fmt) 34 | 35 | def update(self, value, n=1): 36 | self.deque.append(value) 37 | self.count += n 38 | self.total += value * n 39 | 40 | def synchronize_between_processes(self): 41 | """ 42 | Warning: does not synchronize the deque! 43 | """ 44 | if not is_dist_avail_and_initialized(): 45 | return 46 | t = torch.tensor([self.count, self.total], 47 | dtype=torch.float64, 48 | device='cuda') 49 | dist.barrier() 50 | dist.all_reduce(t) 51 | t = t.tolist() 52 | self.count = int(t[0]) 53 | self.total = t[1] 54 | 55 | @property 56 | def median(self): 57 | d = torch.tensor(list(self.deque)) 58 | return d.median().item() 59 | 60 | @property 61 | def avg(self): 62 | d = torch.tensor(list(self.deque), dtype=torch.float32) 63 | return d.mean().item() 64 | 65 | @property 66 | def global_avg(self): 67 | return self.total / (self.count + EPS) 68 | 69 | @property 70 | def max(self): 71 | return max(self.deque) 72 | 73 | @property 74 | def value(self): 75 | return self.deque[-1] 76 | 77 | def __str__(self): 78 | return self.fmt.format(median=self.median, 79 | avg=self.avg, 80 | global_avg=self.global_avg, 81 | max=self.max, 82 | value=self.value) 83 | 84 | 85 | class MetricLogger(object): 86 | def __init__(self, 87 | delimiter="\t", 88 | writer=None, 89 | stat_set="train", 90 | epoch=0, 91 | logger=None): 92 | self.meters = defaultdict(SmoothedValue) 93 | self.delimiter = delimiter 94 | self.metric_set = stat_set 95 | self.epoch = epoch 96 | self.logger = logger.info if logger is not None else logging.info 97 | 98 | self.writer = writer 99 | self.writer_step = 0 100 | # Adding all logs from this to raw/ header, so I can plot other metrics 101 | # cleanly 102 | self.tbd_header = 'metric_logger/' 103 | 104 | self.meters["iter_time"] = SmoothedValue(fmt='{avg:.4f}') 105 | self.meters["data_time"] = SmoothedValue(fmt='{avg:.4f}') 106 | 107 | def update(self, **kwargs): 108 | for k, v in kwargs.items(): 109 | if isinstance(v, torch.Tensor): 110 | v = v.item() 111 | assert isinstance(v, (float, int)) 112 | self.meters[k].update(v) 113 | 114 | def __getattr__(self, attr): 115 | if attr in self.meters: 116 | return self.meters[attr] 117 | if attr in self.__dict__: 118 | return self.__dict__[attr] 119 | raise 
AttributeError("'{}' object has no attribute '{}'".format( 120 | type(self).__name__, attr)) 121 | 122 | def __str__(self): 123 | loss_str = [] 124 | for name, meter in self.meters.items(): 125 | loss_str.append("{}: {}".format(name, str(meter))) 126 | return self.delimiter.join(loss_str) 127 | 128 | def synchronize_between_processes(self): 129 | for meter in self.meters.values(): 130 | meter.synchronize_between_processes() 131 | 132 | def add_meter(self, name, meter): 133 | self.meters[name] = meter 134 | 135 | def reset_meters(self): 136 | self.logger("Logging: resseting all meters") 137 | for name, meter in self.meters.items(): 138 | meter.reset() 139 | self.logger( 140 | "Logging: resseting all meters done, updating epoch to %d".format( 141 | self.epoch + 1)) 142 | self.epoch += 1 143 | 144 | def log_every(self, iterable, print_freq, header=None): 145 | i = 0 146 | if not header: 147 | header = '' 148 | start_time = time.time() 149 | end = time.time() 150 | 151 | space_fmt = ':' + str(len(str(len(iterable)))) + 'd' 152 | if torch.cuda.is_available(): 153 | log_msg = self.delimiter.join([ 154 | header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 155 | 'time: {time}', 'data: {data}', 'max mem: {memory:.0f}' 156 | ]) 157 | else: 158 | log_msg = self.delimiter.join([ 159 | header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 160 | 'time: {time}', 'data: {data}' 161 | ]) 162 | MB = 1024.0 * 1024.0 163 | for obj in iterable: 164 | self.meters["data_time"].update(time.time() - end) 165 | yield obj 166 | self.meters["iter_time"].update(time.time() - end) 167 | if i % print_freq == 0: 168 | self._write_meters() 169 | eta_seconds = self.meters["iter_time"].global_avg * ( 170 | len(iterable) - i) 171 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 172 | if torch.cuda.is_available(): 173 | self.logger( 174 | log_msg.format( 175 | i, 176 | len(iterable), 177 | eta=eta_string, 178 | meters=str(self), 179 | time=str(self.meters["iter_time"]), 180 | data=str(self.meters["data_time"]), 181 | memory=torch.cuda.max_memory_allocated() / MB)) 182 | else: 183 | self.logger( 184 | log_msg.format(i, 185 | len(iterable), 186 | eta=eta_string, 187 | meters=str(self), 188 | time=str(self.meters["iter_time"]), 189 | data=str(self.meters["data_time"]))) 190 | i += 1 191 | end = time.time() 192 | total_time = time.time() - start_time 193 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 194 | self.logger('{} Total time: {}'.format(header, total_time_str)) 195 | self._write_epoch(total_time_str) 196 | 197 | def _write_meters(self): 198 | if self.writer is not None: 199 | for name, meter in self.meters.items(): 200 | self.writer.add_scalar( 201 | f"{self.tbd_header}iter/{self.metric_set}_{name}", 202 | meter.avg, self.writer_step) 203 | 204 | self.writer_step += 1 205 | 206 | def _write_epoch(self, total_time_string): 207 | if self.writer is not None: 208 | for name, meter in self.meters.items(): 209 | self.writer.add_scalar( 210 | f"{self.tbd_header}epoch/{self.metric_set}_{name}", 211 | meter.avg, self.epoch) 212 | 213 | self.writer.add_text( 214 | f"{self.tbd_header}epoch/{self.metric_set}_totaltime", 215 | total_time_string, self.epoch) 216 | 217 | 218 | def setup_tbx(save_dir, SummaryWriter): 219 | if not is_main_process(): 220 | return None 221 | 222 | writer = SummaryWriter(save_dir) 223 | return writer 224 | 225 | 226 | def get_default_loggers(writer, epoch, logger): 227 | stat_loggers = dict() 228 | stat_loggers["train"] = MetricLogger(delimiter=" ", 
229 | writer=writer, 230 | stat_set="train", 231 | epoch=epoch, 232 | logger=logger) 233 | stat_loggers["train"].add_meter( 234 | 'lr', SmoothedValue(window_size=1, fmt='{value}')) 235 | stat_loggers["train"].add_meter( 236 | 'clips/s', SmoothedValue(window_size=10, fmt='{value:.3f}')) 237 | 238 | stat_loggers["val"] = MetricLogger(delimiter=" ", 239 | writer=writer, 240 | stat_set="val", 241 | epoch=epoch, 242 | logger=logger) 243 | 244 | return stat_loggers 245 | -------------------------------------------------------------------------------- /env.yaml: -------------------------------------------------------------------------------- 1 | name: avt 2 | channels: 3 | - huggingface 4 | - iopath 5 | - pytorch 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - _libgcc_mutex=0.1=main 10 | - attrs=19.3.0=py_0 11 | - av=7.0.1=py37h82f89c2_2 12 | - backcall=0.1.0=py_0 13 | - blas=1.0=mkl 14 | - bleach=3.1.5=pyh9f0ad1d_0 15 | - blessed=1.17.8=py37hc8dfbb8_0 16 | - brotlipy=0.7.0=py37h8f50634_1000 17 | - bzip2=1.0.8=h516909a_2 18 | - ca-certificates=2021.4.13=h06a4308_1 19 | - cairo=1.16.0=hcf35c78_1003 20 | - certifi=2020.12.5=py37h06a4308_0 21 | - cffi=1.14.0=py37h2e261b9_0 22 | - chardet=3.0.4=py37hc8dfbb8_1006 23 | - cmake=3.3.1=0 24 | - cryptography=2.9.2=py37hb09aad4_0 25 | - cudatoolkit=11.0.221=h6bb024c_0 26 | - dataclasses=0.7=py37_0 27 | - dbus=1.13.6=he372182_0 28 | - decorator=4.4.2=py_0 29 | - defusedxml=0.6.0=py_0 30 | - entrypoints=0.3=py37hc8dfbb8_1001 31 | - expat=2.2.9=he1b5a44_2 32 | - faiss-cpu=1.6.3=py37h6bb024c_0 33 | - ffmpeg=4.2=h167e202_0 34 | - fftw=3.3.8=nompi_h7f3a6c3_1110 35 | - filelock=3.0.12=pyhd3eb1b0_1 36 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 37 | - font-ttf-inconsolata=2.001=hab24e00_0 38 | - font-ttf-source-code-pro=2.030=hab24e00_0 39 | - font-ttf-ubuntu=0.83=hab24e00_0 40 | - fontconfig=2.13.1=h86ecdb6_1001 41 | - fonts-conda-forge=1=0 42 | - freetype=2.9.1=h8a8886c_1 43 | - fribidi=1.0.9=h516909a_0 44 | - future=0.18.2=py37_1 45 | - gdk-pixbuf=2.38.2=h3f25603_3 46 | - gettext=0.19.8.1=hc5be6a0_1002 47 | - ghostscript=9.22=hf484d3e_1001 48 | - giflib=5.2.1=h516909a_2 49 | - glib=2.64.3=h6f030ca_0 50 | - gmp=6.2.0=he1b5a44_2 51 | - gnutls=3.6.5=hd3a4fd2_1002 52 | - gobject-introspection=1.64.1=py37h619baee_1 53 | - graphite2=1.3.13=he1b5a44_1001 54 | - graphviz=2.42.3=h0511662_0 55 | - gst-plugins-base=1.14.5=h0935bb2_2 56 | - gstreamer=1.14.5=h36ae1b5_2 57 | - harfbuzz=2.4.0=h9f30f68_3 58 | - hdf5=1.10.6=nompi_h3c11f04_100 59 | - icu=64.2=he1b5a44_1 60 | - idna=2.9=py_1 61 | - imageio=2.8.0=py_0 62 | - imageio-ffmpeg=0.4.2=py_0 63 | - imagemagick=7.0.10_6=pl526ha9fe49d_0 64 | - importlib-metadata=1.6.1=py37hc8dfbb8_0 65 | - importlib_metadata=1.6.1=0 66 | - inquirer=2.7.0=py_0 67 | - intel-openmp=2020.0=166 68 | - iopath=0.1.8=py37 69 | - ipykernel=5.3.0=py37h43977f1_0 70 | - ipython=7.15.0=py37hc8dfbb8_0 71 | - ipython_genutils=0.2.0=py_1 72 | - ipywidgets=7.5.1=py_0 73 | - isort=4.3.21=py37_0 74 | - jasper=1.900.1=h07fcdf6_1006 75 | - jbig=2.1=h516909a_2002 76 | - jedi=0.17.0=py37hc8dfbb8_0 77 | - jinja2=2.11.2=pyh9f0ad1d_0 78 | - jpeg=9c=h14c3975_1001 79 | - jsonschema=3.2.0=py37hc8dfbb8_1 80 | - jupyter_client=6.1.3=py_0 81 | - jupyter_core=4.6.3=py37hc8dfbb8_1 82 | - lame=3.100=h14c3975_1001 83 | - lazy-object-proxy=1.4.3=py37h27cfd23_2 84 | - ld_impl_linux-64=2.33.1=h53a641e_7 85 | - libblas=3.8.0=15_mkl 86 | - libcblas=3.8.0=15_mkl 87 | - libclang=9.0.1=default_hde54327_0 88 | - libcroco=0.6.13=h8d621e5_1 89 | - 
libedit=3.1.20181209=hc058e9b_0 90 | - libffi=3.2.1=hd88cf55_4 91 | - libgcc-ng=9.1.0=hdf63c60_0 92 | - libgfortran-ng=7.3.0=hdf63c60_0 93 | - libiconv=1.15=h516909a_1006 94 | - liblapack=3.8.0=15_mkl 95 | - liblapacke=3.8.0=15_mkl 96 | - libllvm9=9.0.1=he513fc3_1 97 | - libopencv=4.2.0=py37_6 98 | - libpng=1.6.37=hbc83047_0 99 | - libprotobuf=3.13.0.1=hd408876_0 100 | - librsvg=2.49.2=h33a7fed_0 101 | - libsodium=1.0.17=h516909a_0 102 | - libstdcxx-ng=9.1.0=hdf63c60_0 103 | - libtiff=4.1.0=h2733197_0 104 | - libtool=2.4.6=h14c3975_1002 105 | - libuuid=2.32.1=h14c3975_1000 106 | - libuv=1.40.0=h7b6447c_0 107 | - libwebp=1.0.2=h56121f0_5 108 | - libxcb=1.13=h14c3975_1002 109 | - libxkbcommon=0.10.0=he1b5a44_0 110 | - libxml2=2.9.10=hee79883_0 111 | - lmdb=0.9.24=h516909a_0 112 | - markupsafe=1.1.1=py37h8f50634_1 113 | - mistune=0.8.4=py37h8f50634_1001 114 | - mkl=2020.2=256 115 | - mkl-include=2020.2=256 116 | - mkl-service=2.3.0=py37he904b0f_0 117 | - mkl_fft=1.0.15=py37ha843d7b_0 118 | - mkl_random=1.1.0=py37hd6b4f25_0 119 | - nbconvert=5.6.1=py37hc8dfbb8_1 120 | - nbformat=5.0.6=py_0 121 | - ncurses=6.2=he6710b0_0 122 | - nettle=3.4.1=h1bed415_1002 123 | - ninja=1.10.2=py37hff7bd54_0 124 | - nltk=3.4.4=py_0 125 | - notebook=6.0.3=py37hc8dfbb8_0 126 | - nspr=4.25=he1b5a44_0 127 | - nss=3.47=he751ad9_0 128 | - numpy=1.18.1=py37h4f9e942_0 129 | - numpy-base=1.18.1=py37hde5b4d6_1 130 | - olefile=0.46=py37_0 131 | - opencv=4.2.0=py37_6 132 | - openh264=1.8.0=hdbcaa40_1000 133 | - openjpeg=2.3.1=h981e76c_3 134 | - openssl=1.1.1k=h27cfd23_0 135 | - packaging=20.4=pyh9f0ad1d_0 136 | - pandas=1.0.3=py37h0da4684_1 137 | - pandoc=2.9.2.1=0 138 | - pandocfilters=1.4.2=py_1 139 | - pango=1.42.4=h7062337_4 140 | - parso=0.7.0=pyh9f0ad1d_0 141 | - pcre=8.44=he1b5a44_0 142 | - perl=5.26.2=h516909a_1006 143 | - pexpect=4.8.0=py37hc8dfbb8_1 144 | - pickleshare=0.7.5=py37hc8dfbb8_1001 145 | - pip=20.0.2=py37_1 146 | - pixman=0.38.0=h516909a_1003 147 | - pkg-config=0.29.2=h516909a_1006 148 | - proglog=0.1.9=py_0 149 | - prometheus_client=0.8.0=pyh9f0ad1d_0 150 | - prompt-toolkit=3.0.5=py_0 151 | - pthread-stubs=0.4=h14c3975_1001 152 | - ptyprocess=0.6.0=py_1001 153 | - py-opencv=4.2.0=py37h43977f1_6 154 | - pycparser=2.20=py_0 155 | - pygments=2.6.1=py_0 156 | - pyopenssl=19.1.0=py_1 157 | - pyparsing=2.4.7=pyh9f0ad1d_0 158 | - pyrsistent=0.16.0=py37h8f50634_0 159 | - pysocks=1.7.1=py37hc8dfbb8_1 160 | - python=3.7.7=hcf32534_0_cpython 161 | - python-dateutil=2.8.1=py_0 162 | - python-editor=1.0.4=py_0 163 | - python-lmdb=0.96=py37he1b5a44_0 164 | - python_abi=3.7=1_cp37m 165 | - pytorch=1.7.1=py3.7_cuda11.0.221_cudnn8.0.5_0 166 | - pytz=2020.1=pyh9f0ad1d_0 167 | - pyzmq=19.0.1=py37hac76be4_0 168 | - qt=5.12.5=hd8c4c69_1 169 | - readchar=2.0.0=py_0 170 | - readline=8.0=h7b6447c_0 171 | - requests=2.25.1=pyhd3eb1b0_0 172 | - send2trash=1.5.0=py_0 173 | - setuptools=51.3.3=py37h06a4308_4 174 | - six=1.15.0=py37h06a4308_0 175 | - sqlite=3.31.1=h62c20be_1 176 | - terminado=0.8.3=py37hc8dfbb8_1 177 | - testpath=0.4.4=py_0 178 | - tk=8.6.8=hbc83047_0 179 | - toml=0.10.1=py_0 180 | - tornado=6.0.4=py37h8f50634_1 181 | - tqdm=4.54.1=pyhd8ed1ab_1 182 | - traitlets=4.3.3=py37hc8dfbb8_1 183 | - transformers=4.2.2=pyh7b7c402_0 184 | - typed-ast=1.4.2=py37h27cfd23_1 185 | - typing_extensions=3.7.4.3=py_0 186 | - urllib3=1.25.9=py_0 187 | - wcwidth=0.2.4=pyh9f0ad1d_0 188 | - webencodings=0.5.1=py_1 189 | - wheel=0.34.2=py37_0 190 | - widgetsnbextension=3.5.1=py37_0 191 | - x264=1!152.20180806=h14c3975_0 192 | - 
xorg-kbproto=1.0.7=h14c3975_1002 193 | - xorg-libice=1.0.10=h516909a_0 194 | - xorg-libsm=1.2.3=h84519dc_1000 195 | - xorg-libx11=1.6.9=h516909a_0 196 | - xorg-libxau=1.0.9=h14c3975_0 197 | - xorg-libxdmcp=1.1.3=h516909a_0 198 | - xorg-libxext=1.3.4=h516909a_0 199 | - xorg-libxpm=3.5.13=h516909a_0 200 | - xorg-libxrender=0.9.10=h516909a_1002 201 | - xorg-libxt=1.1.5=h516909a_1003 202 | - xorg-renderproto=0.11.1=h14c3975_1002 203 | - xorg-xextproto=7.3.0=h14c3975_1002 204 | - xorg-xproto=7.0.31=h14c3975_1007 205 | - xz=5.2.5=h7b6447c_0 206 | - yaml=0.1.7=had09818_2 207 | - zeromq=4.3.2=he1b5a44_2 208 | - zipp=3.1.0=py_0 209 | - zlib=1.2.11=h7b6447c_3 210 | - zstd=1.3.7=h0b5b093_0 211 | - pip: 212 | - absl-py==0.9.0 213 | - addict==2.2.1 214 | - antlr4-python3-runtime==4.8 215 | - astor==0.8.1 216 | - astroid==2.4.1 217 | - async-generator==1.10 218 | - azure-core==1.10.0 219 | - azure-identity==1.5.0 220 | - azure-storage-blob==12.7.1 221 | - cachetools==4.1.0 222 | - click==7.1.1 223 | - cloudpickle==1.3.0 224 | - colorlog==4.1.0 225 | - cycler==0.10.0 226 | - cython==0.29.23 227 | - dask==2.15.0 228 | - dask-jobqueue==0.7.1 229 | - distributed==2.15.0 230 | - fairtask==1.1.0 231 | - fairtask-slurm==0.3.0 232 | - flask==1.1.2 233 | - ftfy==5.8 234 | - gast==0.2.2 235 | - google-auth==1.14.1 236 | - google-auth-oauthlib==0.4.1 237 | - google-pasta==0.2.0 238 | - grpcio==1.28.1 239 | - h5py==2.10.0 240 | - heapdict==1.0.1 241 | - hydra-colorlog==1.0.0 242 | - hydra-core==1.1.0.dev4 243 | - hydra-submitit-launcher==1.1.0 244 | - importlib-resources==2.0.1 245 | - isodate==0.6.0 246 | - itsdangerous==1.1.0 247 | - joblib==0.16.0 248 | - keras-applications==1.0.8 249 | - keras-preprocessing==1.1.0 250 | - kiwisolver==1.2.0 251 | - markdown==3.2.1 252 | - matplotlib==3.3.0 253 | - mccabe==0.6.1 254 | - microsoftvision==1.0.5 255 | - mmcv==1.0.4 256 | - moviepy==1.0.3 257 | - msal==1.8.0 258 | - msal-extensions==0.3.0 259 | - msgpack==1.0.0 260 | - msrest==0.6.21 261 | - munch==2.5.0 262 | - oauthlib==3.1.0 263 | - omegaconf==2.1.0.dev22 264 | - opencv-contrib-python==4.3.0.36 265 | - opencv-python==4.3.0.36 266 | - opt-einsum==3.2.1 267 | - parameterized==0.8.1 268 | - pillow==7.0.0 269 | - portalocker==1.7.1 270 | - pretrainedmodels==0.7.4 271 | - protobuf==3.11.3 272 | - psutil==5.7.0 273 | - pyarrow==1.0.1 274 | - pyasn1==0.4.8 275 | - pyasn1-modules==0.2.8 276 | - pycocotools==2.0.2 277 | - pydantic==1.5.1 278 | - pydot==1.4.2 279 | - pyjwt==1.7.1 280 | - pylint==2.5.2 281 | - pytorchvideo==0.1.0 282 | - pyyaml==5.3.1 283 | - regex==2020.7.14 284 | - requests-oauthlib==1.3.0 285 | - rsa==4.0 286 | - sacremoses==0.0.43 287 | - scikit-learn==0.23.2 288 | - scipy==1.5.2 289 | - seaborn==0.11.1 290 | - sentencepiece==0.1.91 291 | - simplejson==3.17.2 292 | - sklearn==0.0 293 | - sortedcontainers==2.1.0 294 | - submitit==1.2.1 295 | - tabulate==0.8.9 296 | - tblib==1.6.0 297 | - tensorboard==2.0.0 298 | - tensorflow==2.1.0 299 | - tensorflow-estimator==2.1.0 300 | - termcolor==1.1.0 301 | - threadpoolctl==2.1.0 302 | - timm==0.4.12 303 | - tokenizers==0.9.4 304 | - toolz==0.10.0 305 | - torchvision==0.8.2 306 | - typing-extensions==3.7.4.2 307 | - werkzeug==1.0.1 308 | - wrapt==1.12.1 309 | - yacs==0.1.8 310 | - yapf==0.31.0 311 | - zict==2.0.0 312 | prefix: ~/.conda/envs/avt 313 | -------------------------------------------------------------------------------- /func/train_eval_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, 
Inc. and its affiliates. 2 | 3 | """ 4 | Modular implementation of the basic train ops 5 | """ 6 | from typing import Dict, Union, Tuple 7 | import torch 8 | import torch.nn as nn 9 | import hydra 10 | from hydra.types import TargetConf 11 | 12 | from common import utils 13 | 14 | from datasets.base_video_dataset import FUTURE_PREFIX 15 | from models.base_model import PAST_LOGITS_PREFIX 16 | from loss_fn.multidim_xentropy import MultiDimCrossEntropy 17 | 18 | 19 | class NoLossAccuracy(nn.Module): 20 | def __init__(self, *args, **kwargs): 21 | super().__init__() 22 | 23 | def forward(self, *args, **kwargs): 24 | return {}, {} 25 | 26 | 27 | class BasicLossAccuracy(nn.Module): 28 | def __init__(self, dataset, device, balance_classes=False): 29 | super().__init__() 30 | kwargs = {'ignore_index': -1} 31 | if balance_classes: 32 | assert dataset.class_balanced_sampling is False, ( 33 | 'Do not re-weight the losses, and do balanced sampling') 34 | weight = torch.zeros((len(dataset.classes, )), 35 | device=device, 36 | dtype=torch.float) 37 | for cls_id, count in dataset.classes_counts.items(): 38 | weight[cls_id] = count 39 | weight = weight / torch.sum(weight) # To get ratios for non -1 cls 40 | weight = 1 / (weight + 0.00001) 41 | kwargs['weight'] = weight 42 | kwargs['reduction'] = 'none' # to get batch level output 43 | self.cls_criterion = MultiDimCrossEntropy(**kwargs) 44 | 45 | def forward(self, outputs, target, target_subclips): 46 | """ 47 | Args: 48 | outputs['logits'] torch.Tensor (B, num_classes) or 49 | (B, T, num_classes) 50 | Latter in case of dense prediction 51 | target: {type: (B) or (B, T')}; latter in case of dense prediction 52 | target_subclips: {type: (B, #clips, T)}: The target for each input 53 | frame 54 | """ 55 | losses = {} 56 | accuracies = {} 57 | for tgt_type, tgt_val in target.items(): 58 | logits = outputs[f'logits/{tgt_type}'] 59 | assert logits.ndim == tgt_val.ndim + 1 60 | loss = self.cls_criterion(logits, tgt_val) 61 | dataset_max_classes = logits.size(-1) 62 | acc1, acc5 = utils.accuracy(logits, 63 | tgt_val, 64 | topk=(1, min(5, dataset_max_classes))) 65 | # Don't use / in loss since I use the config to set weights, and 66 | # can't use / there. 67 | losses[f'cls_{tgt_type}'] = loss 68 | accuracies[f'acc1/{tgt_type}'] = acc1 69 | accuracies[f'acc5/{tgt_type}'] = acc5 70 | # Incur past losses 71 | past_logits_key = f'{PAST_LOGITS_PREFIX}logits/{tgt_type}' 72 | # If this key exists, means we asked for classifier on the last 73 | # layer, so the loss should be incurred. 74 | if past_logits_key in outputs and target_subclips is not None: 75 | past_logits = outputs[past_logits_key] 76 | # Take mode over the frames to get the subclip level loss 77 | past_target = torch.mode(target_subclips[tgt_type], -1)[0] 78 | assert past_logits.shape[:-1] == past_target.shape, ( 79 | f'The subclips should be set such that the past logits ' 80 | f'and past targets match in shape. 
Currently they are ' 81 | f'{past_logits.shape} and {past_target.shape}') 82 | losses[f'past_cls_{tgt_type}'] = self.cls_criterion( 83 | past_logits, past_target) 84 | # Else likely not using subclips, so no way to do this loss 85 | return losses, accuracies 86 | 87 | 88 | class Basic: 89 | def __init__(self, 90 | model, 91 | device, 92 | dataset, 93 | cls_loss_acc_fn: TargetConf, 94 | reg_criterion: TargetConf = None): 95 | super().__init__() 96 | self.model = model 97 | self.device = device 98 | self.cls_loss_acc_fn = hydra.utils.instantiate(cls_loss_acc_fn, 99 | dataset, device) 100 | del reg_criterion # not used here 101 | 102 | def _basic_preproc(self, data, train_mode): 103 | if not isinstance(data, dict): 104 | video, target = data 105 | # Make a dict so that later code can use it 106 | data = {} 107 | data['video'] = video 108 | data['target'] = target 109 | data['idx'] = -torch.ones_like(target) 110 | 111 | if train_mode: 112 | self.model.train() 113 | else: 114 | self.model.eval() 115 | return data 116 | 117 | def __call__( 118 | self, 119 | data: Union[Dict[str, torch.Tensor], # If dict 120 | Tuple[torch.Tensor, torch.Tensor]], # vid, target 121 | train_mode: bool = True): 122 | """ 123 | Args: 124 | data (dict): Dictionary of all the data from the data loader 125 | """ 126 | data = self._basic_preproc(data, train_mode) 127 | video = data['video'].to(self.device, non_blocking=True) 128 | target = {} 129 | target_subclips = {} 130 | for key in data['target'].keys(): 131 | target[key] = data['target'][key].to(self.device, 132 | non_blocking=True) 133 | outputs, aux_losses = self.model(video, 134 | target_shape=next( 135 | iter(target.values())).shape) 136 | if 'target_subclips' in data: 137 | for key in data['target_subclips'].keys(): 138 | target_subclips[key] = data['target_subclips'][key].to( 139 | self.device, non_blocking=True) 140 | else: 141 | target_subclips = None 142 | losses, accuracies = self.cls_loss_acc_fn(outputs, target, 143 | target_subclips) 144 | losses.update(aux_losses) 145 | return data, outputs, losses, accuracies 146 | 147 | 148 | class PredFutureFeat(Basic): 149 | def __init__(self, 150 | *args, 151 | reg_criterion: TargetConf = None, 152 | future_target: str = 'temp_agg_projected', 153 | incur_loss_style: str = 'separately', 154 | combine_future_losses: TargetConf = {'_target_': 'torch.min'}, 155 | cumulative_future: bool = False, 156 | **kwargs): 157 | ''' 158 | Args: 159 | incur_loss_style (str): Defines how to incur losses for multiple 160 | futures. Could do 'separately', and then combine using 161 | `combine_future_losses`. Or 'together', such as for MIL-NCE. 
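With 'separately', a loss is computed via `reg_criterion` between the model's `future_projected` output and each sampled future clip's `future_target` features, and the per-future losses are combined with `combine_future_losses` (torch.min by default). With 'together', the features of all sampled futures are stacked and passed to `reg_criterion` in a single call.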
162 | ''' 163 | super().__init__(*args, **kwargs) 164 | self.reg_criterion = hydra.utils.instantiate(reg_criterion) 165 | self.future_target = future_target 166 | self.incur_loss_style = incur_loss_style 167 | self.combine_future_losses = combine_future_losses 168 | self.cumulative_future = cumulative_future 169 | 170 | def __call__( 171 | self, 172 | data: Union[Dict[str, torch.Tensor], # If dict 173 | Tuple[torch.Tensor, torch.Tensor]], # vid, target 174 | train_mode: bool = True): 175 | data = self._basic_preproc(data, train_mode) 176 | video = data['video'].to(self.device, non_blocking=True) 177 | target = { 178 | key: val.to(self.device, non_blocking=True) 179 | for key, val in data['target'].items() 180 | } 181 | batch_size = video.size(0) 182 | if train_mode: 183 | # At test time, I don't sample the extra future video, since 184 | # that is only used during training 185 | all_videos = [video] 186 | nfutures = len( 187 | [key for key in data.keys() if key.startswith(FUTURE_PREFIX)]) 188 | for i in range(nfutures): 189 | future_vid = data[f'{FUTURE_PREFIX}_{i}_video'].to( 190 | self.device, non_blocking=True) 191 | all_videos.append(future_vid) 192 | video = torch.cat(all_videos, dim=0) # Add to batch dim 193 | outputs_full, aux_losses = self.model(video) 194 | # Just the actual video for outputs 195 | outputs = {key: val[:batch_size] for key, val in outputs_full.items()} 196 | # if self.cls_loss_wt != 0: 197 | # Doing this makes some layers not have gradients and it gives errors, 198 | # so just leaving it here for now. The gradient should be 0 anyway 199 | losses, accuracies = self.cls_loss_acc_fn(outputs, target) 200 | losses.update(aux_losses) 201 | losses['cls'] = losses['cls'] 202 | if train_mode: 203 | # Incur the regression losses, for each of the futures 204 | reg_losses = [] 205 | if self.incur_loss_style == 'separately': 206 | for i in range(nfutures): 207 | future_feats = outputs_full[self.future_target][ 208 | (i + 1) * batch_size:(i + 2) * batch_size] 209 | if self.cumulative_future: 210 | future_feats = torch.cumsum(future_feats, 0) 211 | # Divide by the position to get mean of features until then 212 | future_feats = future_feats / (torch.range( 213 | 1, 214 | future_feats.size(0), 215 | device=future_feats.device, 216 | dtype=future_feats.dtype).unsqueeze(1)) 217 | loss = self.reg_criterion(outputs['future_projected'], 218 | future_feats) 219 | reg_losses.append(loss) 220 | final_reg_loss = hydra.utils.call(self.combine_future_losses, 221 | torch.stack(reg_losses)) 222 | elif self.incur_loss_style == 'together': 223 | future_feats = outputs_full[self.future_target][batch_size:] 224 | future_feats = future_feats.reshape( 225 | (-1, batch_size, future_feats.size(-1))).transpose(0, 1) 226 | final_reg_loss = self.reg_criterion( 227 | outputs['future_projected'], future_feats) 228 | else: 229 | raise NotImplementedError(self.incur_loss_style) 230 | losses['reg'] = final_reg_loss 231 | return data, outputs, losses, accuracies 232 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Anticipative Video Transformer 2 | 3 |

Ranked first in the Action Anticipation task of the CVPR 2021 EPIC-Kitchens Challenge! (entry: AVT-FB-UT)

4 | 5 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-100)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-100?p=anticipative-video-transformer)
6 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-100-test)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-100-test?p=anticipative-video-transformer)
7 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-55-seen)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-55-seen?p=anticipative-video-transformer)
8 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/anticipative-video-transformer/action-anticipation-on-epic-kitchens-55-1)](https://paperswithcode.com/sota/action-anticipation-on-epic-kitchens-55-1?p=anticipative-video-transformer)
9 | 10 | 11 | 12 | 13 | [[project page](https://facebookresearch.github.io/AVT/)] [[paper](https://arxiv.org/abs/2106.02036)] 14 | 15 | If this code helps with your work, please cite: 16 | 17 | R. Girdhar and K. Grauman. **Anticipative Video Transformer.** IEEE/CVF International Conference on Computer Vision (ICCV), 2021. 18 | 19 | ```bibtex 20 | @inproceedings{girdhar2021anticipative, 21 | title = {{Anticipative Video Transformer}}, 22 | author = {Girdhar, Rohit and Grauman, Kristen}, 23 | booktitle = {ICCV}, 24 | year = 2021 25 | } 26 | ``` 27 | 28 | ## Installation 29 | 30 | The code was tested on a `Ubuntu 20.04` cluster 31 | with each server consisting of 8 V100 16GB GPUs. 32 | 33 | First clone the repo and set up the required packages in a conda environment. 34 | You might need to make minor modifications here if some packages are no longer 35 | available. In most cases they should be replaceable by more recent versions. 36 | 37 | ```bash 38 | $ git clone --recursive git@github.com:facebookresearch/AVT.git 39 | $ conda env create -f env.yaml python=3.7.7 40 | $ conda activate avt 41 | ``` 42 | 43 | ### Set up RULSTM codebase 44 | 45 | If you plan to use EPIC-Kitchens datasets, 46 | you might need the train/test splits and evaluation code from RULSTM. This is also needed 47 | if you want to extract RULSTM predictions for test submissions. 48 | 49 | ```bash 50 | $ cd external 51 | $ git clone git@github.com:fpv-iplab/rulstm.git; cd rulstm 52 | $ git checkout 57842b27d6264318be2cb0beb9e2f8c2819ad9bc 53 | $ cd ../.. 54 | ``` 55 | 56 | ## Datasets 57 | 58 | The code expects the data in the `DATA/` folder. You can also symlink it to 59 | a different folder on a faster/larger drive. Inside it will contain following folders: 60 | 1) `videos/` which will contain raw videos 61 | 2) `external/` which will contain pre-extracted features from prior work 62 | 3) `extracted_features/` which will contain other extracted features 63 | 4) `pretrained/` which contains pretrained models, eg from TIMM 64 | 65 | The paths to these datasets are set 66 | in files like [`conf/dataset/epic_kitchens100/common.yaml`](conf/dataset/epic_kitchens100/common.yaml) 67 | so you can also update the paths there instead. 68 | 69 | ### EPIC-Kitchens 70 | 71 | To train only the AVT-h on top of pre-extracted features, you can download the 72 | features from RULSTM into `DATA/external/rulstm/RULSTM/data_full` for [EK55](https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/scripts/download_data_ek55_full.sh) and 73 | `DATA/external/rulstm/RULSTM/ek100_data_full` 74 | for [EK100](https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/scripts/download_data_ek100_full.sh). 75 | If you plan to train models on features extracted from a irCSN-152 model 76 | finetuned from IG65M features, you can download our pre-extracted features 77 | from [here](https://dl.fbaipublicfiles.com/avt/datasets/ek100/ig65m_ftEk100_logits_10fps1s/rgb/data.mdb) into `DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb/` or [here](https://dl.fbaipublicfiles.com/avt/datasets/ek55/ig65m_ftEk55train_logits_25fps/rgb/data.mdb) into `DATA/extracted_features/ek55/ig65m_ftEk55train_logits_25fps/rgb/`. 78 | 79 | To train AVT end-to-end, you need to download the raw videos from [EPIC-Kitchens](https://data.bris.ac.uk/data/dataset/2g1n6qdydwa9u22shpxqzp0t8m). 
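As a concrete example of placing the pre-extracted irCSN-152 features mentioned above (a sketch using the URLs and destination paths quoted above; adjust if your `DATA/` folder is symlinked elsewhere), the EK100 features could be fetched with:

```bash
$ mkdir -p DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb
$ wget https://dl.fbaipublicfiles.com/avt/datasets/ek100/ig65m_ftEk100_logits_10fps1s/rgb/data.mdb \
    -O DATA/extracted_features/ek100/ig65m_ftEk100_logits_10fps1s/rgb/data.mdb
```

The EK55 features follow the same pattern, using the `ek55/ig65m_ftEk55train_logits_25fps/rgb/` path. The raw videos are organized as described next.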
They can be organized as you wish, but this 80 | is how my folders are organized (since I first downloaded EK55 and then the remaining 81 | new videos for EK100): 82 | 83 | ``` 84 | DATA 85 | ├── videos 86 | │ ├── EpicKitchens 87 | │ │ └── videos_ht256px 88 | │ │ ├── train 89 | │ │ │ ├── P01 90 | │ │ │ │ ├── P01_01.MP4 91 | │ │ │ │ ├── P01_03.MP4 92 | │ │ │ │ ├── ... 93 | │ │ └── test 94 | │ │ ├── P01 95 | │ │ │ ├── P01_11.MP4 96 | │ │ │ ├── P01_12.MP4 97 | │ │ │ ├── ... 98 | │ │ ... 99 | │ ├── EpicKitchens100 100 | │ │ └── videos_extension_ht256px 101 | │ │ ├── P01 102 | │ │ │ ├── P01_101.MP4 103 | │ │ │ ├── P01_102.MP4 104 | │ │ │ ├── ... 105 | │ │ ... 106 | │ ├── EGTEA/101020/videos/ 107 | │ │ ├── OP01-R01-PastaSalad.mp4 108 | │ │ ... 109 | │ └── 50Salads/rgb/ 110 | │ ├── rgb-01-1.avi 111 | │ ... 112 | ├── external 113 | │ └── rulstm 114 | │ └── RULSTM 115 | │ ├── egtea 116 | │ │ ├── TSN-C_3_egtea_action_CE_flow_model_best_fcfull_hd 117 | │ │ ... 118 | │ ├── data_full # (EK55) 119 | │ │ ├── rgb 120 | │ │ ├── obj 121 | │ │ └── flow 122 | │ └── ek100_data_full 123 | │ ├── rgb 124 | │ ├── obj 125 | │ └── flow 126 | └── extracted_features 127 | ├── ek100 128 | │ └── ig65m_ftEk100_logits_10fps1s 129 | │ └── rgb 130 | └── ek55 131 | └── ig65m_ftEk55train_logits_25fps 132 | └── rgb 133 | ``` 134 | 135 | If you use a different organization, you would need to edit the train/val 136 | dataset files, such as [`conf/dataset/epic_kitchens100/anticipation_train.yaml`](conf/dataset/epic_kitchens100/anticipation_train.yaml). Sometimes the values are overriden 137 | in the TXT config files, so might need to change there too. The `root` property takes a list of 138 | folders where the videos can be found, and it will search through all of them 139 | in order for a given video. Note that we resized the EPIC videos to 140 | 256px height for faster processing; you can use [`sample_scripts/resize_epic_256px.sh`](sample_scripts/resize_epic_256px.sh) script for the same. 141 | 142 | Please see [`docs/DATASETS.md`](docs/DATASETS.md) for setting up other datasets. 143 | 144 | ## Training and evaluating models 145 | 146 | If you want to train AVT models, you would need pre-trained models from 147 | [`timm`](https://github.com/rwightman/pytorch-image-models/tree/8257b86550b8453b658e386498d4e643d6bf8d38). 148 | We have experiments that use the following models: 149 | 150 | ```bash 151 | $ mkdir DATA/pretrained/TIMM/ 152 | $ wget https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth -O DATA/pretrained/TIMM/jx_vit_base_patch16_224_in21k-e5005f0a.pth 153 | $ wget https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth -O DATA/pretrained/TIMM/jx_vit_base_p16_224-80ecf9dd.pth 154 | ``` 155 | 156 | The code uses [`hydra 1.0`](https://hydra.cc/) for configuration with [`submitit`](https://github.com/facebookincubator/submitit) plugin for jobs 157 | via SLURM. We provide a `launch.py` script that is a wrapper around the 158 | training scripts and can run jobs locally or launch distributed jobs. The 159 | configuration overrides for a specific experiment is defined by a TXT file. 160 | You can run a config by: 161 | 162 | ```bash 163 | $ python launch.py -c expts/01_ek100_avt.txt 164 | ``` 165 | where `expts/01_ek100_avt.txt` can be replaced by any TXT config file. 166 | 167 | By default, the launcher will launch the job to a SLURM cluster. 
However, 168 | you can run it locally using one of the following options: 169 | 170 | 1. `-g` to run locally in debug mode with 1 GPU and 0 workers. This will allow you to place 171 | `pdb.set_trace()` to debug interactively. 172 | 2. `-l` to run locally using as many GPUs as are available on the local machine. 173 | 174 | This will run the training, which will run validation every few epochs. You can 175 | also run only testing using the `-t` flag. When running testing for a pre-trained model, 176 | don't forget to set the checkpoint to load weights from, using something like this in the 177 | TXT experiment config: 178 | ``` 179 | train.init_from_model=[[path/to/checkpoint.pth]] 180 | ``` 181 | 182 | The outputs will be stored in `OUTPUTS/`. This will include 183 | tensorboard files that you can use to visualize the training progress. 184 | 185 | ## Model Zoo 186 | 187 | 188 | ### EPIC-Kitchens-100 189 | 190 | 191 | | Backbone | Head | Class-mean
Recall@5 (Actions) | Config | Model | 192 | |----------|------|-------------------------------|--------|-----| 193 | | AVT-b (IN21K) | AVT-h | 14.9 | `expts/01_ek100_avt.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/01_ek100_avt.txt/0/checkpoint.pth)| 194 | | TSN (RGB) | AVT-h | 13.6 | `expts/02_ek100_avt_tsn.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/02_ek100_avt_tsn.txt/0/checkpoint.pth)| 195 | | TSN (Obj) | AVT-h | 8.7 | `expts/03_ek100_avt_tsn_obj.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/03_ek100_avt_tsn_obj.txt/0/checkpoint.pth)| 196 | | irCSN152 (IG65M) | AVT-h | 12.8 | `expts/04_ek100_avt_ig65m.txt` | [link](https://dl.fbaipublicfiles.com/avt/checkpoints/expts/04_ek100_avt_ig65m.txt/0/checkpoint.pth)| 197 | 198 | 199 | ### Late fusing predictions 200 | 201 | For comparison to methods that use multiple modalities, you can late fuse 202 | predictions from multiple models using functions from `notebooks/utils.py`. 203 | For example, to compute the late fused performance reported in Table 3 (val) 204 | as `AVT+` (obtains 15.9 recall@5 for actions): 205 | 206 | ```python 207 | from notebooks.utils import * 208 | CFG_FILES = [ 209 | ('expts/01_ek100_avt.txt', 0), 210 | ('expts/03_ek100_avt_tsn_obj.txt', 0), 211 | ] 212 | WTS = [2.5, 0.5] 213 | print_accuracies_epic(get_epic_marginalize_late_fuse(CFG_FILES, weights=WTS)[0]) 214 | ``` 215 | 216 | Please see [`docs/MODELS.md`](docs/MODELS.md) for test submission and models on other datasets. 217 | 218 | ## License 219 | 220 | This codebase is released under the license terms specified in the [LICENSE](LICENSE) file. Any imported libraries, datasets or other code follows the license terms set by respective authors. 221 | 222 | 223 | ## Acknowledgements 224 | 225 | The codebase was built on top of [`facebookresearch/VMZ`](https://github.com/facebookresearch/VMZ). Many thanks to [Antonino Furnari](https://github.com/fpv-iplab/rulstm), [Fadime Sener](https://cg.cs.uni-bonn.de/en/publications/paper-details/sener-2020-temporal/) and [Miao Liu](https://github.com/2020aptx4869lm/Forecasting-Human-Object-Interaction-in-FPV) for help with prior work. 226 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------