├── General-Bench-Closeset └── .gitkeep ├── General-Bench-Openset └── .gitkeep ├── video_generation_evaluation ├── toolkit │ ├── cli │ │ ├── __init__.py │ │ └── vbench.py │ ├── launch │ │ └── __init__.py │ ├── third_party │ │ ├── __init__.py │ │ ├── amt │ │ │ ├── __init__.py │ │ │ ├── datasets │ │ │ │ └── __init__.py │ │ │ ├── losses │ │ │ │ └── __init__.py │ │ │ ├── metrics │ │ │ │ └── __init__.py │ │ │ ├── networks │ │ │ │ ├── __init__.py │ │ │ │ └── blocks │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── multi_flow.py │ │ │ ├── trainers │ │ │ │ ├── __init__.py │ │ │ │ └── logger.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── build_utils.py │ │ │ │ └── dist_utils.py │ │ │ ├── benchmarks │ │ │ │ ├── __init__.py │ │ │ │ ├── speed_parameters.py │ │ │ │ ├── gopro.py │ │ │ │ ├── adobe240.py │ │ │ │ ├── ucf101.py │ │ │ │ ├── vimeo90k.py │ │ │ │ ├── vimeo90k_tta.py │ │ │ │ └── snu_film.py │ │ │ ├── flow_generation │ │ │ │ ├── __init__.py │ │ │ │ ├── liteflownet │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── correlation │ │ │ │ │ │ └── README.md │ │ │ │ │ └── README.md │ │ │ │ └── gen_flow.py │ │ │ ├── scripts │ │ │ │ ├── benchmark_arbitrary.sh │ │ │ │ ├── train.sh │ │ │ │ └── benchmark_fixed.sh │ │ │ ├── environment.yaml │ │ │ ├── cfgs │ │ │ │ ├── AMT-S_gopro.yaml │ │ │ │ ├── AMT-G.yaml │ │ │ │ ├── AMT-L.yaml │ │ │ │ ├── AMT-S.yaml │ │ │ │ └── IFRNet.yaml │ │ │ └── train.py │ │ ├── umt │ │ │ ├── __init__.py │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ └── masking_generator.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ └── extract_clip │ │ │ │ │ └── extract.ipynb │ │ │ └── functional.py │ │ ├── RAFT │ │ │ ├── __init__.py │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ ├── utils_core │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── utils.py │ │ │ │ └── corr.py │ │ │ ├── download_models.sh │ │ │ ├── RAFT.png │ │ │ ├── alt_cuda_corr │ │ │ │ ├── setup.py │ │ │ │ └── correlation.cpp │ │ │ ├── LICENSE │ │ │ └── README.md │ │ ├── ViCLIP │ │ │ └── __init__.py │ │ ├── grit_src │ │ │ ├── __init__.py │ │ │ ├── grit │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── datasets │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── vg.py │ │ │ │ │ ├── transforms │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── custom_augmentation_impl.py │ │ │ │ │ └── custom_build_augmentation.py │ │ │ │ ├── modeling │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── text │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── load_text_token.py │ │ │ │ │ ├── backbone │ │ │ │ │ │ └── __init__.py │ │ │ │ │ ├── meta_arch │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── grit.py │ │ │ │ │ └── roi_heads │ │ │ │ │ │ └── __init__.py │ │ │ │ ├── __init__.py │ │ │ │ └── config.py │ │ │ ├── centernet2 │ │ │ │ ├── __init__.py │ │ │ │ ├── centernet │ │ │ │ │ ├── modeling │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── backbone │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── fpn_p5.py │ │ │ │ │ │ ├── layers │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── ml_nms.py │ │ │ │ │ │ │ └── heatmap_focal_loss.py │ │ │ │ │ │ ├── meta_arch │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── centernet_detector.py │ │ │ │ │ │ ├── roi_heads │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── fed_loss.py │ │ │ │ │ │ └── dense_heads │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── config.py │ │ │ │ ├── configs │ │ │ │ │ ├── CenterNet2_R50_1x.yaml │ │ │ │ │ ├── CenterNet-FPN_R50_1x.yaml │ │ │ │ │ ├── CenterNet2-F_R50_1x.yaml │ │ │ │ │ ├── CenterNet-S4_DLA_8x.yaml │ │ │ │ │ ├── O365_CenterNet2_R50_1x.yaml │ │ │ │ │ ├── LVIS_CenterNet2_R50_1x.yaml │ │ │ │ │ ├── 
LVIS_CenterNet2_R50_Fed_1x.yaml │ │ │ │ │ ├── CenterNet2_X101-DCN_2x.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P5_640_16x.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P5_640_16x_ST.yaml │ │ │ │ │ ├── CenterNet2_R2-101-DCN_896_4x.yaml │ │ │ │ │ ├── Base-CenterNet-FPN.yaml │ │ │ │ │ ├── CenterNet2_DLA-fcosBiFPN-P5_640_16x_ST.yaml │ │ │ │ │ ├── CenterNet2_R2-101-DCN-BiFPN_1280_4x.yaml │ │ │ │ │ ├── CenterNet2_R2-101-DCN-BiFPN_4x+4x_1560_ST.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P3_24x.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P3_4x.yaml │ │ │ │ │ ├── Base_S4_DLA.yaml │ │ │ │ │ ├── nuImages_CenterNet2_DLA_640_8x.yaml │ │ │ │ │ └── Base-CenterNet2.yaml │ │ │ │ └── .gitignore │ │ │ └── configs │ │ │ │ ├── GRiT_B_DenseCap.yaml │ │ │ │ ├── GRiT_B_ObjectDet.yaml │ │ │ │ ├── GRiT_L_ObjectDet.yaml │ │ │ │ ├── GRiT_H_ObjectDet.yaml │ │ │ │ ├── GRiT_B_DenseCap_ObjectDet.yaml │ │ │ │ └── Base.yaml │ │ ├── tag2Text │ │ │ ├── __init__.py │ │ │ ├── config_swinB_384.json │ │ │ ├── med_config.json │ │ │ └── q2l_config.json │ │ └── grit_model.py │ ├── temporal_flickering.py │ ├── imaging_quality.py │ ├── fvd.py │ ├── background_consistency.py │ ├── scene.py │ ├── overall_consistency.py │ ├── temporal_style.py │ ├── subject_consistency.py │ ├── object_class.py │ └── appearance_style.py ├── pretrained │ ├── amt_model │ │ ├── download.sh │ │ └── AMT-S.yaml │ ├── grit_model │ │ └── model_path.txt │ ├── viclip_model │ │ └── model_path.txt │ ├── caption_model │ │ └── model_path.txt │ ├── pyiqa_model │ │ └── model_path.txt │ ├── README.md │ ├── aesthetic_model │ │ └── model_path.txt │ ├── umt_model │ │ └── model_path.txt │ ├── raft_model │ │ └── download.sh │ └── clip_model │ │ └── model_path.txt ├── competitions │ ├── configs │ │ ├── clip_length_0.5.yaml │ │ ├── clip_length_1.0.yaml │ │ ├── clip_length_mix.yaml │ │ ├── clip_length_short.yaml │ │ ├── slow_fast_params.yaml │ │ ├── subject_mapping_table.yaml │ │ └── background_mapping_table.yaml │ ├── requirements.txt │ ├── competition_utils.py │ └── clip_score.py ├── requirements.txt └── README.md ├── processors ├── __init__.py ├── ._audio_processor.py ├── ._image_processor.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── .___init__.cpython-38.pyc │ ├── .___init__.cpython-39.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-312.pyc │ ├── nlp_processor.cpython-38.pyc │ ├── nlp_processor.cpython-39.pyc │ ├── audio_processor.cpython-311.pyc │ ├── audio_processor.cpython-312.pyc │ ├── audio_processor.cpython-38.pyc │ ├── audio_processor.cpython-39.pyc │ ├── image_processor.cpython-311.pyc │ ├── image_processor.cpython-312.pyc │ ├── image_processor.cpython-38.pyc │ ├── image_processor.cpython-39.pyc │ ├── nlp_processor.cpython-311.pyc │ ├── nlp_processor.cpython-312.pyc │ ├── video_processor.cpython-311.pyc │ ├── video_processor.cpython-312.pyc │ ├── video_processor.cpython-38.pyc │ ├── video_processor.cpython-39.pyc │ ├── ._video_processor.cpython-39.pyc │ ├── three_d_processor.cpython-311.pyc │ ├── three_d_processor.cpython-312.pyc │ ├── three_d_processor.cpython-38.pyc │ ├── three_d_processor.cpython-39.pyc │ └── pseudo_audio_processor.cpython-39.pyc ├── three_d_processor.py ├── audio_processor.py ├── video_processor.py └── image_processor.py ├── outcome ├── test_result.xlsx └── Qwen2.5-7B-Instruct_result.xlsx ├── utils ├── ._special_metrix.py ├── __pycache__ │ ├── data_types.cpython-311.pyc │ ├── data_types.cpython-312.pyc │ ├── data_types.cpython-38.pyc │ ├── data_types.cpython-39.pyc │ ├── ._special_metrix.cpython-39.pyc │ ├── 
base_processor.cpython-311.pyc │ ├── base_processor.cpython-312.pyc │ ├── base_processor.cpython-38.pyc │ ├── base_processor.cpython-39.pyc │ ├── special_metrix.cpython-310.pyc │ ├── special_metrix.cpython-311.pyc │ ├── special_metrix.cpython-312.pyc │ ├── special_metrix.cpython-38.pyc │ └── special_metrix.cpython-39.pyc ├── data_types.py ├── base_processor.py └── special_metrix.py ├── references ├── sota_result.xlsx └── template_result.xlsx ├── run.sh └── README_ZH.md /General-Bench-Closeset/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /General-Bench-Openset/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/launch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/ViCLIP/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /processors/__init__.py: -------------------------------------------------------------------------------- 1 | """Processors package for different modalities.""" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/losses/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/utils_core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/networks/blocks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/liteflownet/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /outcome/test_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/outcome/test_result.xlsx -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/._special_metrix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/._special_metrix.py -------------------------------------------------------------------------------- /references/sota_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/references/sota_result.xlsx 
-------------------------------------------------------------------------------- /processors/._audio_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/._audio_processor.py -------------------------------------------------------------------------------- /processors/._image_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/._image_processor.py -------------------------------------------------------------------------------- /references/template_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/references/template_result.xlsx -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('third_party/grit_src') 3 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_dataset, build_pretraining_dataset -------------------------------------------------------------------------------- /outcome/Qwen2.5-7B-Instruct_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/outcome/Qwen2.5-7B-Instruct_result.xlsx -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-39.pyc -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/amt_model/download.sh: 
-------------------------------------------------------------------------------- 1 | wget https://huggingface.co/lalala125/AMT/resolve/main/amt-s.pth -P ~/.cache/amt_model 2 | -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/.___init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/.___init__.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/.___init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/.___init__.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/._special_metrix.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/._special_metrix.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/base_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/base_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/base_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-38.pyc -------------------------------------------------------------------------------- 
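Note on the surrounding entries: the `processors/__pycache__`, `utils/__pycache__`, and `._*` blocks listed here are committed CPython bytecode caches and macOS AppleDouble copies rather than source files. If you want to purge them from a local checkout, a minimal sketch follows (a hypothetical helper, not part of this repository; it assumes it is run from the repository root):

import shutil
from pathlib import Path

def clean(repo_root: str = ".") -> None:
    """Remove committed bytecode caches and macOS AppleDouble files."""
    root = Path(repo_root)
    # Drop every __pycache__ directory (and the .pyc files inside it).
    for cache_dir in list(root.rglob("__pycache__")):
        shutil.rmtree(cache_dir, ignore_errors=True)
    # Drop macOS resource-fork copies such as ._image_processor.py.
    for apple_double in list(root.rglob("._*")):
        if apple_double.is_file():
            apple_double.unlink()

if __name__ == "__main__":
    clean()
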
/utils/__pycache__/base_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-38.pyc 
-------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/._video_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/._video_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-39.pyc -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip 3 | unzip models.zip 4 | -------------------------------------------------------------------------------- /processors/__pycache__/pseudo_audio_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/pseudo_audio_processor.cpython-39.pyc -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/grit_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://datarelease.blob.core.windows.net/grit/models/grit_b_densecap_objectdet.pth -P ~/.cache/toolkit/grit_model 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/RAFT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/video_generation_evaluation/toolkit/third_party/RAFT/RAFT.png 
-------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet-FPN_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet-FPN.yaml" 2 | MODEL: 3 | CENTERNET: 4 | MORE_POS: True -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2-F_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NAME: CustomROIHeads -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/viclip_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/OpenGVLab/VBench_Used_Models/resolve/main/ViClip-InternVid-10M-FLT.pth -P ~/.cache/toolkit/ViCLIP 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/caption_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/spaces/xinyu1205/recognize-anything/resolve/main/tag2text_swin_14m.pth -P ~/.cache/toolkit/caption_model 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/pyiqa_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://github.com/chaofengc/IQA-PyTorch/releases/download/v0.1-weights/musiq_spaq_ckpt-358bb6af.pth -P ~/.cache/toolkit/pyiqa_model 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/scripts/benchmark_arbitrary.sh: -------------------------------------------------------------------------------- 1 | CFG=$1 2 | CKPT=$2 3 | 4 | python benchmarks/gopro.py -c $CFG -p $CKPT 5 | python benchmarks/adobe240.py -c $CFG -p $CKPT -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet-S4_DLA_8x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_S4_DLA.yaml" 2 | SOLVER: 3 | MAX_ITER: 90000 4 | BASE_LR: 0.08 5 | IMS_PER_BATCH: 128 -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/README.md: -------------------------------------------------------------------------------- 1 | ## Pre-Trained Models 2 | Please download the pre-trained weights according to the guidance in the `model_path.txt` file for each model (see each folder). 
3 | 4 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/aesthetic_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://github.com/LAION-AI/aesthetic-predictor/raw/main/sa_0_4_vit_l_14_linear.pth -P ~/.cache/toolkit/aesthetic_model/emb_reader 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/umt_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/OpenGVLab/VBench_Used_Models/resolve/main/l16_ptk710_ftk710_ftk400_f16_res224.pth -P ~/.cache/toolkit/umt_model/ 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/scripts/train.sh: -------------------------------------------------------------------------------- 1 | NUM_GPU=$1 2 | CFG=$2 3 | PORT=$3 4 | python -m torch.distributed.launch \ 5 | --nproc_per_node $NUM_GPU \ 6 | --master_port $PORT train.py -c $CFG -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/.gitignore: -------------------------------------------------------------------------------- 1 | # compilation and distribution 2 | __pycache__ 3 | _ext 4 | *.pyc 5 | *.pyd 6 | *.so 7 | centernet.egg-info/ 8 | build/ 9 | dist/ 10 | wheels/ 11 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/scripts/benchmark_fixed.sh: -------------------------------------------------------------------------------- 1 | CFG=$1 2 | CKPT=$2 3 | 4 | python benchmarks/vimeo90k.py -c $CFG -p $CKPT 5 | python benchmarks/ucf101.py -c $CFG -p $CKPT 6 | python benchmarks/snu_film.py -c $CFG -p $CKPT 7 | python benchmarks/xiph.py -c $CFG -p $CKPT -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/raft_model/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CACHE_DIR=~/.cache/toolkit 3 | wget -P $CACHE_DIR/raft_model/ https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip 4 | unzip -d ${CACHE_DIR}/raft_model/ $CACHE_DIR/raft_model/models.zip 5 | rm -r $CACHE_DIR/raft_model/models.zip 6 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling.meta_arch import grit 2 | from .modeling.roi_heads import grit_roi_heads 3 | from .modeling.backbone import vit 4 | 5 | from .data.datasets import object365 6 | from .data.datasets import vg 7 | from .data.datasets import grit_coco -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/config_swinB_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth", 3 | "vision_width": 1024, 4 | "image_res": 384, 5 | "window_size": 12, 6 | "embed_dim": 128, 7 | "depths": [ 2, 2, 18, 2 ], 8 | "num_heads": [ 4, 8, 16, 32 ] 9 | } 10 |
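The pretrained/README.md above directs users to follow the wget guidance in each model's model_path.txt. A helper that runs those commands in one pass might look like the following sketch (not part of the repository; it assumes the layout shown in the tree, that each non-empty line of model_path.txt is a complete shell command, and that wget is installed):

import subprocess
from pathlib import Path

PRETRAINED = Path("video_generation_evaluation/pretrained")  # adjust to your checkout location

def fetch_all(pretrained_dir: Path = PRETRAINED) -> None:
    """Run every download command listed in pretrained/*/model_path.txt."""
    for txt in sorted(pretrained_dir.glob("*/model_path.txt")):
        for raw in txt.read_text().splitlines():
            cmd = raw.strip()
            if not cmd:
                continue  # skip blank lines
            print(f"[{txt.parent.name}] {cmd}")
            # shell=True so that '~' in the -P target expands as it would in a terminal
            subprocess.run(cmd, shell=True, check=True)

if __name__ == "__main__":
    fetch_all()

The amt_model and raft_model folders ship download.sh scripts instead of model_path.txt, so those two downloads (shown in their own blocks in this dump) would still need to be run separately.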
-------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/clip_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt -P ~/.cache/toolkit/clip_model 2 | wget https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt -P ~/.cache/toolkit/clip_model 3 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/liteflownet/correlation/README.md: -------------------------------------------------------------------------------- 1 | This is an adaptation of the FlowNet2 implementation in order to compute cost volumes. Should you be making use of this work, please make sure to adhere to the licensing terms of the original authors. Should you be making use or modify this particular implementation, please acknowledge it appropriately. -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/O365_CenterNet2_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 365 5 | CENTERNET: 6 | NUM_CLASSES: 365 7 | DATASETS: 8 | TRAIN: ("objects365_train",) 9 | TEST: ("objects365_val",) 10 | DATALOADER: 11 | SAMPLER_TRAIN: "ClassAwareSampler" 12 | TEST: 13 | DETECTIONS_PER_IMAGE: 300 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/environment.yaml: -------------------------------------------------------------------------------- 1 | name: amt 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=3.8.5 8 | - pip=20.3 9 | - cudatoolkit=11.3 10 | - pytorch=1.11.0 11 | - torchvision=0.12.0 12 | - numpy=1.21.5 13 | - pip: 14 | - opencv-python==4.1.2.30 15 | - imageio==2.19.3 16 | - omegaconf==2.3.0 17 | - Pillow==9.4.0 18 | - tqdm==4.64.1 19 | - wandb==0.12.21 -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_0.5.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 0.5 2 | background_consistency: 0.5 3 | motion_smoothness: 0.5 4 | temporal_flickering: 0.5 5 | dynamic_degree: 0.5 6 | imaging_quality: 0.5 7 | aesthetic_quality: 0.5 8 | 9 | object_class: 0.5 10 | multiple_objects: 0.5 11 | human_action: 0.5 12 | color: 0.5 13 | spatial_relationship: 0.5 14 | scene: 0.5 15 | appearance_style: 0.5 16 | temporal_style: 0.5 17 | overall_consistency: 0.5 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_1.0.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 1.0 2 | background_consistency: 1.0 3 | motion_smoothness: 1.0 4 | temporal_flickering: 1.0 5 | dynamic_degree: 1.0 6 | imaging_quality: 1.0 7 | aesthetic_quality: 1.0 8 | 9 | object_class: 1.0 10 | multiple_objects: 1.0 11 | human_action: 1.0 12 | color: 1.0 13 | spatial_relationship: 1.0 14 | scene: 1.0 15 | appearance_style: 1.0 16 | 
temporal_style: 1.0 17 | overall_consistency: 1.0 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_mix.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 2.0 2 | background_consistency: 2.0 3 | motion_smoothness: 2.0 4 | temporal_flickering: 2.0 5 | dynamic_degree: 2.0 6 | imaging_quality: 2.0 7 | aesthetic_quality: 2.0 8 | 9 | object_class: 2.0 10 | multiple_objects: 2.0 11 | human_action: 10.0 12 | color: 2.0 13 | spatial_relationship: 2.0 14 | scene: 2.0 15 | appearance_style: 2.0 16 | temporal_style: 10.0 17 | overall_consistency: 10.0 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_short.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 2.0 2 | background_consistency: 2.0 3 | motion_smoothness: 2.0 4 | temporal_flickering: 2.0 5 | dynamic_degree: 2.0 6 | imaging_quality: 2.0 7 | aesthetic_quality: 2.0 8 | 9 | object_class: 2.0 10 | multiple_objects: 2.0 11 | human_action: 2.0 12 | color: 2.0 13 | spatial_relationship: 2.0 14 | scene: 2.0 15 | appearance_style: 2.0 16 | temporal_style: 2.0 17 | overall_consistency: 2.0 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/slow_fast_params.yaml: -------------------------------------------------------------------------------- 1 | w_inclip_sb: 0.7 2 | w_clip2clip_sb: 0.3 3 | inclip_mean_sb: 0.9206531487463249 4 | inclip_std_sb: 0.06767633012297831 5 | clip2clip_mean_sb: 0.782773956831079 6 | clip2clip_std_sb: 0.15702951463645903 7 | 8 | 9 | w_inclip_bg: 0.8 10 | w_clip2clip_bg: 0.2 11 | inclip_mean_bg: 0.9461633887475777 12 | inclip_std_bg: 0.02029563684589086 13 | clip2clip_mean_bg: 0.8817304710164493 14 | clip2clip_std_bg: 0.0888072561860013 -------------------------------------------------------------------------------- /video_generation_evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow 2 | numpy<2.0.0 3 | matplotlib 4 | timm>=0.9 5 | wheel 6 | cython 7 | tensorboard 8 | scipy 9 | opencv-python 10 | scikit-learn 11 | scikit-image 12 | openai-clip 13 | decord 14 | requests 15 | pyyaml 16 | easydict 17 | pyiqa==0.1.10 18 | lvis 19 | fairscale>=0.4.4 20 | fvcore 21 | easydict 22 | urllib3 23 | boto3 24 | omegaconf 25 | transformers==4.33.2 26 | pycocoevalcap 27 | # detectron2@git+https://github.com/facebookresearch/detectron2.git 28 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/alt_cuda_corr/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | setup( 6 | name='correlation', 7 | ext_modules=[ 8 | CUDAExtension('alt_cuda_corr', 9 | sources=['correlation.cpp', 'correlation_kernel.cu'], 10 | extra_compile_args={'cxx': [], 'nvcc': ['-O3']}), 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | 16 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/LVIS_CenterNet2_R50_1x.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 1203 5 | SCORE_THRESH_TEST: 0.02 6 | NMS_THRESH_TEST: 0.5 7 | CENTERNET: 8 | NUM_CLASSES: 1203 9 | 10 | DATASETS: 11 | TRAIN: ("lvis_v1_train",) 12 | TEST: ("lvis_v1_val",) 13 | DATALOADER: 14 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 15 | REPEAT_THRESHOLD: 0.001 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow==9.5.0 2 | numpy 3 | matplotlib 4 | timm==0.9.12 5 | torch==1.13.1 6 | torchvision>=0.13 7 | tensorboard 8 | scipy==1.10.1 9 | opencv-python 10 | scikit-learn 11 | requests 12 | scikit-image 13 | pyyaml 14 | easydict 15 | lvis 16 | fairscale==0.4.4 17 | openai-clip 18 | fvcore 19 | easydict 20 | decord==0.6.0 21 | pyiqa==0.1.8 22 | transformers==4.33.2 23 | pycocoevalcap 24 | wheel 25 | cython 26 | urllib3 27 | boto3 28 | omegaconf 29 | pyav 30 | av 31 | moviepy -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import clip_b16, clip_l14, clip_l14_336 2 | # from .modeling_finetune import vit_base_patch16_224, vit_base_patch16_384, vit_large_patch16_224, vit_large_patch16_384 3 | from .modeling_finetune import vit_large_patch16_224 4 | from .modeling_pretrain_umt import pretrain_umt_base_patch16_224, pretrain_umt_large_patch16_224 5 | from .modeling_pretrain import pretrain_videomae_base_patch16_224, pretrain_videomae_large_patch16_224, pretrain_videomae_huge_patch16_224 6 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/utils/build_utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import sys 4 | CUR_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.append(os.path.join(CUR_DIR, "../")) 6 | 7 | 8 | def base_build_fn(module, cls, params): 9 | return getattr(importlib.import_module( 10 | module, package=None), cls)(**params) 11 | 12 | 13 | def build_from_cfg(config): 14 | module, cls = config['name'].rsplit(".", 1) 15 | params = config.get('params', {}) 16 | return base_build_fn(module, cls, params) 17 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/LVIS_CenterNet2_R50_Fed_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 1203 5 | SCORE_THRESH_TEST: 0.02 6 | NMS_THRESH_TEST: 0.5 7 | CENTERNET: 8 | NUM_CLASSES: 1203 9 | ROI_BOX_HEAD: 10 | USE_SIGMOID_CE: True 11 | USE_FED_LOSS: True 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | DATALOADER: 16 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 17 | REPEAT_THRESHOLD: 0.001 18 | TEST: 19 | DETECTIONS_PER_IMAGE: 300 20 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_B_DenseCap.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["DenseCap"] 4 | TEST_TASK: "DenseCap" 5 | MASK_ON: False 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: False 8 | BEAM_SIZE: 1 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone 12 | VIT_LAYERS: 12 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.7 15 | DATASETS: 16 | TRAIN: ("vg_train",) 17 | TEST: ("vg_test",) 18 | DATALOADER: 19 | DATASET_BS: 2 20 | OUTPUT_DIR: "./output/GRiT_B_DenseCap" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_B_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet"] 4 | TEST_TASK: "ObjectDet" 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: True 8 | BEAM_SIZE: 3 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone 12 | VIT_LAYERS: 12 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.7 15 | DATASETS: 16 | TRAIN: ("GRiT_coco2017_train",) 17 | TEST: ("coco_2017_val",) 18 | DATALOADER: 19 | DATASET_BS: 2 20 | OUTPUT_DIR: "./output/GRiT_B_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_L_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet"] 4 | TEST_TASK: "ObjectDet" 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: True 8 | BEAM_SIZE: 3 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone_large 12 | VIT_LAYERS: 24 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.8 15 | DATASETS: 16 | TRAIN: ("GRiT_coco2017_train",) 17 | TEST: ("coco_2017_val",) 18 | DATALOADER: 19 | DATASET_BS: 1 20 | OUTPUT_DIR: "./output/GRiT_L_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } 22 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_H_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet"] 4 | TEST_TASK: "ObjectDet" 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: True 8 | BEAM_SIZE: 3 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone_huge 12 | VIT_LAYERS: 32 13 | SOLVER: 14 | MAX_ITER: 135000 15 | 
VIT_LAYER_DECAY_RATE: 0.9 16 | DATASETS: 17 | TRAIN: ("GRiT_coco2017_train",) 18 | TEST: ("coco_2017_val",) 19 | DATALOADER: 20 | DATASET_BS: 1 21 | OUTPUT_DIR: "./output/GRiT_H_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/cli/vbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import subprocess 4 | 5 | vbench_cmd = ['evaluate', 'static_filter'] 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser(prog="toolkit", formatter_class=argparse.RawTextHelpFormatter) 9 | subparsers = parser.add_subparsers(title='toolkit subcommands') 10 | 11 | for cmd in vbench_cmd: 12 | module = importlib.import_module(f'toolkit.cli.{cmd}') 13 | module.register_subparsers(subparsers) 14 | parser.set_defaults(func=help) 15 | args = parser.parse_args() 16 | args.func(args) 17 | 18 | def help(args): 19 | subprocess.run(['toolkit', '-h'], check=True) 20 | -------------------------------------------------------------------------------- /utils/data_types.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Union, Literal 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | 5 | class TaskType(Enum): 6 | COMPREHENSION = "comprehension" 7 | GENERATION = "generation" 8 | 9 | class ModalityType(Enum): 10 | IMAGE = "Image" 11 | VIDEO = "Video" 12 | AUDIO = "Audio" 13 | NLP = "NLP" 14 | THREE_D = "3D" 15 | 16 | @dataclass 17 | class TaskResult: 18 | task_name: str 19 | metric: str 20 | score: float 21 | task_type: TaskType = TaskType.COMPREHENSION # Default to comprehension task 22 | 23 | # Store results for all modalities 24 | ModalityResults = Dict[ModalityType, Dict[TaskType, List[TaskResult]]] -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/q2l_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 4, 15 | "num_hidden_layers": 2, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "add_tag_cross_attention": false 22 | } 23 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling.meta_arch.centernet_detector import CenterNetDetector 2 | from .modeling.dense_heads.centernet import CenterNet 3 | from .modeling.roi_heads.custom_roi_heads import CustomROIHeads, CustomCascadeROIHeads 4 | 5 | from .modeling.backbone.fpn_p5 import build_p67_resnet_fpn_backbone 6 | from .modeling.backbone.dla import build_dla_backbone 7 | from .modeling.backbone.dlafpn import build_dla_fpn3_backbone 8 | from .modeling.backbone.bifpn import build_resnet_bifpn_backbone 9 | from .modeling.backbone.bifpn_fcos import build_fcos_resnet_bifpn_backbone 10 | from .modeling.backbone.res2net import 
build_p67_res2net_fpn_backbone 11 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_X101-DCN_2x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | CENTERNET: 4 | USE_DEFORMABLE: True 5 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 6 | PIXEL_STD: [57.375, 57.120, 58.395] 7 | RESNETS: 8 | STRIDE_IN_1X1: False 9 | NUM_GROUPS: 32 10 | WIDTH_PER_GROUP: 8 11 | DEPTH: 101 12 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 13 | DEFORM_MODULATED: True 14 | ROI_HEADS: 15 | IN_FEATURES: ["p3", "p4"] 16 | SOLVER: 17 | STEPS: (120000, 160000) 18 | MAX_ITER: 180000 19 | CHECKPOINT_PERIOD: 40000 20 | INPUT: 21 | MIN_SIZE_TRAIN: (480, 960) 22 | MIN_SIZE_TRAIN_SAMPLING: "range" 23 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet", "DenseCap"] 4 | TEST_TASK: "DenseCap" # DenseCap or ObjectDet: Choose one for testing 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: False 8 | BEAM_SIZE: 1 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone 12 | VIT_LAYERS: 12 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.7 15 | DATASETS: 16 | TRAIN: ("GRiT_coco2017_train", "vg_train") 17 | TEST: ("coco_2017_test-dev",) 18 | DATALOADER: 19 | DATASET_RATIO: [1, 1] 20 | DATASET_BS: 2 21 | DATASET_INPUT_SIZE: [1024, 1024] 22 | DATASET_INPUT_SCALE: [[0.1, 2.0], [0.1, 2.0]] 23 | OUTPUT_DIR: "./output/GRiT_B_DenseCap_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P5_640_16x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p37_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 5 8 | NUM_BIFPN: 3 9 | CENTERNET: 10 | POST_NMS_TOPK_TEST: 128 11 | WEIGHTS: '' 12 | PIXEL_MEAN: [123.675, 116.280, 103.530] 13 | PIXEL_STD: [58.395, 57.12, 57.375] 14 | FPN: 15 | IN_FEATURES: ["dla3", "dla4", "dla5"] 16 | SOLVER: 17 | LR_SCHEDULER_NAME: "WarmupCosineLR" 18 | MAX_ITER: 360000 19 | BASE_LR: 0.08 20 | IMS_PER_BATCH: 64 21 | CHECKPOINT_PERIOD: 90000 22 | TEST: 23 | EVAL_PERIOD: 7500 24 | INPUT: 25 | FORMAT: RGB 26 | CUSTOM_AUG: EfficientDetResizeCrop 27 | TRAIN_SIZE: 640 28 | MIN_SIZE_TEST: 608 29 | MAX_SIZE_TEST: 900 30 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P5_640_16x_ST.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p37_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 5 8 | NUM_BIFPN: 3 9 | CENTERNET: 10 | POST_NMS_TOPK_TEST: 128 11 | WEIGHTS: '' 12 | PIXEL_MEAN: [123.675, 116.280, 103.530] 13 | PIXEL_STD: [58.395, 57.12, 57.375] 14 | FPN: 15 | IN_FEATURES: ["dla3", "dla4", "dla5"] 16 | SOLVER: 17 | 
LR_SCHEDULER_NAME: "WarmupCosineLR" 18 | MAX_ITER: 360000 19 | BASE_LR: 0.08 20 | IMS_PER_BATCH: 64 21 | TEST: 22 | EVAL_PERIOD: 7500 23 | INPUT: 24 | FORMAT: RGB 25 | CUSTOM_AUG: EfficientDetResizeCrop 26 | TRAIN_SIZE: 640 27 | MIN_SIZE_TEST: 608 28 | MAX_SIZE_TEST: 900 29 | DATASETS: 30 | TRAIN: ("coco_2017_train","coco_un_yolov4_55_0.5",) 31 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R2-101-DCN_896_4x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p67_res2net_fpn_backbone" 5 | WEIGHTS: "output/r2_101.pkl" 6 | RESNETS: 7 | DEPTH: 101 8 | WIDTH_PER_GROUP: 26 9 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 10 | DEFORM_MODULATED: True 11 | PIXEL_MEAN: [123.675, 116.280, 103.530] 12 | PIXEL_STD: [58.395, 57.12, 57.375] 13 | CENTERNET: 14 | USE_DEFORMABLE: True 15 | ROI_HEADS: 16 | IN_FEATURES: ["p3", "p4"] 17 | INPUT: 18 | FORMAT: RGB 19 | TEST: 20 | EVAL_PERIOD: 7500 21 | SOLVER: 22 | MAX_ITER: 180000 23 | CHECKPOINT_PERIOD: 600000 24 | LR_SCHEDULER_NAME: "WarmupCosineLR" 25 | BASE_LR: 0.04 26 | IMS_PER_BATCH: 32 27 | INPUT: 28 | CUSTOM_AUG: EfficientDetResizeCrop 29 | TRAIN_SIZE: 896 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/Base-CenterNet-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "CenterNetDetector" 3 | PROPOSAL_GENERATOR: 4 | NAME: "CenterNet" 5 | BACKBONE: 6 | NAME: "build_p67_resnet_fpn_backbone" 7 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | RESNETS: 9 | DEPTH: 50 10 | OUT_FEATURES: ["res3", "res4", "res5"] 11 | FPN: 12 | IN_FEATURES: ["res3", "res4", "res5"] 13 | DATASETS: 14 | TRAIN: ("coco_2017_train",) 15 | TEST: ("coco_2017_val",) 16 | SOLVER: 17 | IMS_PER_BATCH: 16 18 | BASE_LR: 0.01 19 | STEPS: (60000, 80000) 20 | MAX_ITER: 90000 21 | CHECKPOINT_PERIOD: 1000000000 22 | WARMUP_ITERS: 4000 23 | WARMUP_FACTOR: 0.00025 24 | CLIP_GRADIENTS: 25 | ENABLED: True 26 | INPUT: 27 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 28 | OUTPUT_DIR: "./output/CenterNet2/auto" 29 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-fcosBiFPN-P5_640_16x_ST.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p37_fcos_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 5 8 | NUM_BIFPN: 3 9 | CENTERNET: 10 | POST_NMS_TOPK_TEST: 128 11 | WEIGHTS: '' 12 | PIXEL_MEAN: [123.675, 116.280, 103.530] 13 | PIXEL_STD: [58.395, 57.12, 57.375] 14 | FPN: 15 | IN_FEATURES: ["dla3", "dla4", "dla5"] 16 | TEST: 17 | EVAL_PERIOD: 7500 18 | SOLVER: 19 | LR_SCHEDULER_NAME: "WarmupCosineLR" 20 | MAX_ITER: 360000 21 | BASE_LR: 0.08 22 | IMS_PER_BATCH: 64 23 | INPUT: 24 | FORMAT: RGB 25 | CUSTOM_AUG: EfficientDetResizeCrop 26 | TRAIN_SIZE: 640 27 | MIN_SIZE_TEST: 608 28 | MAX_SIZE_TEST: 900 29 | DATASETS: 30 | TRAIN: ("coco_2017_train","coco_un_yolov4_55_0.5",) 31 | -------------------------------------------------------------------------------- /utils/base_processor.py: 
-------------------------------------------------------------------------------- 1 | from typing import List 2 | from .data_types import ModalityType, TaskType, TaskResult 3 | 4 | """Base modality processor""" 5 | 6 | class BaseModalityProcessor: 7 | def __init__(self, modality: ModalityType, 8 | dataset_dir: str, 9 | pred_json_file: str): 10 | self.modality = modality 11 | self.dataset_dir = dataset_dir 12 | self.pred_json_file = pred_json_file 13 | 14 | def process_comprehension(self) -> List[TaskResult]: 15 | """Process comprehension tasks, optional implementation""" 16 | return [] 17 | 18 | def process_generation(self) -> List[TaskResult]: 19 | """Process generation tasks, optional implementation""" 20 | return [] 21 | 22 | def process(self) -> List[TaskResult]: 23 | """Process tasks without type distinction (e.g., NLP tasks)""" 24 | return [] -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R2-101-DCN-BiFPN_1280_4x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_res2net_bifpn_backbone" 5 | BIFPN: 6 | NUM_BIFPN: 7 7 | OUT_CHANNELS: 288 8 | WEIGHTS: "output/r2_101.pkl" 9 | RESNETS: 10 | DEPTH: 101 11 | WIDTH_PER_GROUP: 26 12 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 13 | DEFORM_MODULATED: True 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.12, 57.375] 16 | CENTERNET: 17 | USE_DEFORMABLE: True 18 | ROI_HEADS: 19 | IN_FEATURES: ["p3", "p4"] 20 | INPUT: 21 | FORMAT: RGB 22 | TEST: 23 | EVAL_PERIOD: 7500 24 | SOLVER: 25 | MAX_ITER: 180000 26 | CHECKPOINT_PERIOD: 60000 27 | LR_SCHEDULER_NAME: "WarmupCosineLR" 28 | BASE_LR: 0.04 29 | IMS_PER_BATCH: 32 30 | INPUT: 31 | CUSTOM_AUG: EfficientDetResizeCrop 32 | TRAIN_SIZE: 1280 33 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R2-101-DCN-BiFPN_4x+4x_1560_ST.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_res2net_bifpn_backbone" 5 | BIFPN: 6 | NUM_BIFPN: 7 7 | OUT_CHANNELS: 288 8 | WEIGHTS: "output/r2_101.pkl" 9 | RESNETS: 10 | DEPTH: 101 11 | WIDTH_PER_GROUP: 26 12 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 13 | DEFORM_MODULATED: True 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.12, 57.375] 16 | CENTERNET: 17 | USE_DEFORMABLE: True 18 | ROI_HEADS: 19 | IN_FEATURES: ["p3", "p4"] 20 | TEST: 21 | EVAL_PERIOD: 7500 22 | SOLVER: 23 | MAX_ITER: 180000 24 | CHECKPOINT_PERIOD: 7500 25 | LR_SCHEDULER_NAME: "WarmupCosineLR" 26 | BASE_LR: 0.04 27 | IMS_PER_BATCH: 32 28 | DATASETS: 29 | TRAIN: "('coco_2017_train', 'coco_un_yolov4_55_0.5')" 30 | INPUT: 31 | FORMAT: RGB 32 | CUSTOM_AUG: EfficientDetResizeCrop 33 | TRAIN_SIZE: 1280 34 | TEST_SIZE: 1560 35 | TEST_INPUT_TYPE: 'square' 36 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P3_24x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p35_fcos_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | 
NUM_LEVELS: 3 8 | NUM_BIFPN: 4 9 | DLA: 10 | NUM_LAYERS: 34 11 | NORM: "SyncBN" 12 | FPN: 13 | IN_FEATURES: ["dla3", "dla4", "dla5"] 14 | ROI_HEADS: 15 | IN_FEATURES: ["p3", "p4", "p5"] 16 | CENTERNET: 17 | POST_NMS_TOPK_TEST: 128 18 | FPN_STRIDES: [8, 16, 32] 19 | IN_FEATURES: ['p3', 'p4', 'p5'] 20 | SOI: [[0, 64], [48, 192], [128, 1000000]] 21 | DATASETS: 22 | TRAIN: ("coco_2017_train",) 23 | TEST: ("coco_2017_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.02 27 | STEPS: (300000, 340000) 28 | MAX_ITER: 360000 29 | CHECKPOINT_PERIOD: 100000 30 | WARMUP_ITERS: 4000 31 | WARMUP_FACTOR: 0.00025 32 | INPUT: 33 | MIN_SIZE_TRAIN: (256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608) 34 | MAX_SIZE_TRAIN: 900 35 | MAX_SIZE_TEST: 736 36 | MIN_SIZE_TEST: 512 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P3_4x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p35_fcos_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 3 8 | NUM_BIFPN: 4 9 | DLA: 10 | NUM_LAYERS: 34 11 | NORM: "SyncBN" 12 | FPN: 13 | IN_FEATURES: ["dla3", "dla4", "dla5"] 14 | ROI_HEADS: 15 | IN_FEATURES: ["p3", "p4", "p5"] 16 | CENTERNET: 17 | POST_NMS_TOPK_TEST: 128 18 | FPN_STRIDES: [8, 16, 32] 19 | IN_FEATURES: ['p3', 'p4', 'p5'] 20 | SOI: [[0, 64], [48, 192], [128, 1000000]] 21 | DATASETS: 22 | TRAIN: ("coco_2017_train",) 23 | TEST: ("coco_2017_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.02 27 | STEPS: (300000, 340000) 28 | MAX_ITER: 360000 29 | CHECKPOINT_PERIOD: 100000 30 | WARMUP_ITERS: 4000 31 | WARMUP_FACTOR: 0.00025 32 | INPUT: 33 | MIN_SIZE_TRAIN: (256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608) 34 | MAX_SIZE_TRAIN: 900 35 | MAX_SIZE_TEST: 736 36 | MIN_SIZE_TEST: 512 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/Base_S4_DLA.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "CenterNetDetector" 3 | PROPOSAL_GENERATOR: 4 | NAME: "CenterNet" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | BACKBONE: 7 | NAME: "build_dla_backbone" 8 | DLA: 9 | NORM: "BN" 10 | CENTERNET: 11 | IN_FEATURES: ["dla2"] 12 | FPN_STRIDES: [4] 13 | SOI: [[0, 1000000]] 14 | NUM_CLS_CONVS: 1 15 | NUM_BOX_CONVS: 1 16 | REG_WEIGHT: 1. 
17 | MORE_POS: True 18 | HM_FOCAL_ALPHA: 0.25 19 | DATASETS: 20 | TRAIN: ("coco_2017_train",) 21 | TEST: ("coco_2017_val",) 22 | SOLVER: 23 | LR_SCHEDULER_NAME: "WarmupCosineLR" 24 | MAX_ITER: 90000 25 | BASE_LR: 0.04 26 | IMS_PER_BATCH: 64 27 | WEIGHT_DECAY: 0.0001 28 | CHECKPOINT_PERIOD: 1000000 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | INPUT: 32 | CUSTOM_AUG: EfficientDetResizeCrop 33 | TRAIN_SIZE: 640 34 | MIN_SIZE_TEST: 608 35 | MAX_SIZE_TEST: 900 36 | TEST: 37 | EVAL_PERIOD: 7500 38 | DATALOADER: 39 | NUM_WORKERS: 8 40 | OUTPUT_DIR: "output/CenterNet2/auto" 41 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/nuImages_CenterNet2_DLA_640_8x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | ROI_MASK_HEAD: 5 | NAME: "MaskRCNNConvUpsampleHead" 6 | NUM_CONV: 4 7 | POOLER_RESOLUTION: 14 8 | ROI_HEADS: 9 | NUM_CLASSES: 10 10 | IN_FEATURES: ["dla2"] 11 | BACKBONE: 12 | NAME: "build_dla_backbone" 13 | DLA: 14 | NORM: "BN" 15 | CENTERNET: 16 | IN_FEATURES: ["dla2"] 17 | FPN_STRIDES: [4] 18 | SOI: [[0, 1000000]] 19 | NUM_CLS_CONVS: 1 20 | NUM_BOX_CONVS: 1 21 | REG_WEIGHT: 1. 22 | MORE_POS: True 23 | HM_FOCAL_ALPHA: 0.25 24 | POST_NMS_TOPK_TEST: 128 25 | WEIGHTS: '' 26 | PIXEL_MEAN: [123.675, 116.280, 103.530] 27 | PIXEL_STD: [58.395, 57.12, 57.375] 28 | SOLVER: 29 | MAX_ITER: 180000 30 | STEPS: (120000, 160000) 31 | BASE_LR: 0.08 32 | IMS_PER_BATCH: 64 33 | INPUT: 34 | FORMAT: RGB 35 | CUSTOM_AUG: EfficientDetResizeCrop 36 | TRAIN_SIZE: 640 37 | MIN_SIZE_TEST: 608 38 | MAX_SIZE_TEST: 900 39 | MASK_FORMAT: bitmask 40 | DATASETS: 41 | TRAIN: ("nuimages_train",) 42 | TEST: ("nuimages_val",) 43 | -------------------------------------------------------------------------------- /video_generation_evaluation/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 🛠 Installation 3 | 4 | ### Install with pip 5 | ```bash 6 | pip install vbench 7 | pip install -r requirements.txt 8 | ``` 9 | 10 | To evaluate some video generation ability aspects, you need to install [detectron2](https://github.com/facebookresearch/detectron2) via: 11 | ``` 12 | pip install detectron2@git+https://github.com/facebookresearch/detectron2.git 13 | ``` 14 | 15 | If there is an error during [detectron2](https://github.com/facebookresearch/detectron2) installation, see [here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html). 
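A quick way to confirm both optional dependencies are importable before running any metrics (a minimal sketch, assuming `vbench` and detectron2 were installed into the active environment as described above):

```python
# Sanity check: both packages should import cleanly after the steps above.
import vbench       # VBench metric suite installed via pip
import detectron2   # only needed for the GRiT/detection-based metrics

print("vbench and detectron2 are importable; detectron2 version:", detectron2.__version__)
```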
16 | 17 | ### Third-party models 18 | Download the required pretrained models by following the instructions [here](pretrained/README.md) 19 | 20 | 21 | 22 | 23 | ## 🚀 Usage 24 | Configure the model and task type (T2V for text-to-video or I2V for image-to-video) in ``video_generation_evaluate_kit.py``, then run: 25 | ``` 26 | python video_generation_evaluate_kit.py 27 | ``` 28 | This script will automatically: 29 | 30 | - Generate video outputs 🖥️ 31 | 32 | - Evaluate model performance across metrics 📊 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/layers/ml_nms.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import batched_nms 2 | 3 | 4 | def ml_nms(boxlist, nms_thresh, max_proposals=-1, 5 | score_field="scores", label_field="labels"): 6 | """ 7 | Performs non-maximum suppression on a boxlist, with scores specified 8 | in a boxlist field via score_field. 9 | Arguments: 10 | boxlist(BoxList) 11 | nms_thresh (float) 12 | max_proposals (int): if > 0, then only the top max_proposals are kept 13 | after non-maximum suppression 14 | score_field (str) 15 | """ 16 | if nms_thresh <= 0: 17 | return boxlist 18 | if boxlist.has('pred_boxes'): 19 | boxes = boxlist.pred_boxes.tensor 20 | labels = boxlist.pred_classes 21 | else: 22 | boxes = boxlist.proposal_boxes.tensor 23 | labels = boxlist.proposal_boxes.tensor.new_zeros( 24 | len(boxlist.proposal_boxes.tensor)) 25 | scores = boxlist.scores 26 | 27 | keep = batched_nms(boxes, scores, labels, nms_thresh) 28 | if max_proposals > 0: 29 | keep = keep[: max_proposals] 30 | boxlist = boxlist[keep] 31 | return boxlist 32 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/roi_heads/fed_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import numpy as np 4 | from torch.nn import functional as F 5 | 6 | def load_class_freq( 7 | path='datasets/lvis/lvis_v1_train_cat_info.json', 8 | freq_weight=0.5): 9 | cat_info = json.load(open(path, 'r')) 10 | cat_info = torch.tensor( 11 | [c['image_count'] for c in sorted(cat_info, key=lambda x: x['id'])]) 12 | freq_weight = cat_info.float() ** freq_weight 13 | return freq_weight 14 | 15 | def get_fed_loss_inds( 16 | gt_classes, num_sample_cats=50, C=1203, \ 17 | weight=None, fed_cls_inds=-1): 18 | appeared = torch.unique(gt_classes) # C' 19 | prob = appeared.new_ones(C + 1).float() 20 | prob[-1] = 0 21 | if len(appeared) < num_sample_cats: 22 | if weight is not None: 23 | prob[:C] = weight.float().clone() 24 | prob[appeared] = 0 25 | if fed_cls_inds > 0: 26 | prob[fed_cls_inds:] = 0 27 | more_appeared = torch.multinomial( 28 | prob, num_sample_cats - len(appeared), 29 | replacement=False) 30 | appeared = torch.cat([appeared, more_appeared]) 31 | return appeared -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-S_gopro.yaml: -------------------------------------------------------------------------------- 1 | exp_name: wofloloss_400epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 400 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name:
networks.AMT-S.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 3 18 | 19 | data: 20 | train: 21 | name: datasets.gopro_datasets.GoPro_Train_Dataset 22 | params: 23 | dataset_dir: data/GOPRO 24 | val: 25 | name: datasets.gopro_datasets.GoPro_Test_Dataset 26 | params: 27 | dataset_dir: data/GOPRO 28 | train_loader: 29 | batch_size: 24 30 | num_workers: 12 31 | val_loader: 32 | batch_size: 24 33 | num_workers: 3 34 | 35 | logger: 36 | use_wandb: false 37 | resume_id: null 38 | 39 | losses: 40 | - { 41 | name: losses.loss.CharbonnierLoss, 42 | nickname: l_rec, 43 | params: { 44 | loss_weight: 1.0, 45 | keys: [imgt_pred, imgt] 46 | } 47 | } 48 | - { 49 | name: losses.loss.TernaryLoss, 50 | nickname: l_ter, 51 | params: { 52 | loss_weight: 1.0, 53 | keys: [imgt_pred, imgt] 54 | } 55 | } 56 | 57 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/competition_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torchvision.io as tvio 3 | import torch 4 | 5 | def transform_to_videos(input_path, output_path, frame_rate): 6 | if not os.path.exists(output_path): 7 | os.makedirs(output_path) 8 | 9 | for root, dirs, files in os.walk(input_path): 10 | for directory in dirs: 11 | 12 | dir_path = os.path.join(root, directory) 13 | image_files = [f for f in os.listdir(dir_path) if f.endswith('.png')] 14 | if not image_files: 15 | continue # Skip if there are no image files in the directory 16 | 17 | image_files.sort() 18 | 19 | frames = [] 20 | for image_file in image_files: 21 | image_path = os.path.join(dir_path, image_file) 22 | frame = tvio.read_image(image_path) 23 | frames.append(frame) 24 | frames = torch.stack(frames).permute(0, 2, 3, 1) 25 | 26 | # Write the frames to video 27 | video_path = os.path.join(output_path, f"{directory}.mp4") 28 | tvio.write_video(video_path, frames, fps=frame_rate) 29 | 30 | print(f"Videos are saved in '{output_path}'") 31 | 32 | 33 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/speed_parameters.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import torch 4 | import argparse 5 | from omegaconf import OmegaConf 6 | 7 | sys.path.append('.') 8 | from utils.build_utils import build_from_cfg 9 | 10 | parser = argparse.ArgumentParser( 11 | prog = 'AMT', 12 | description = 'Speed¶meter benchmark', 13 | ) 14 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 15 | args = parser.parse_args() 16 | 17 | cfg_path = args.config 18 | network_cfg = OmegaConf.load(cfg_path).network 19 | model = build_from_cfg(network_cfg) 20 | model = model.cuda() 21 | model.eval() 22 | 23 | img0 = torch.randn(1, 3, 256, 448).cuda() 24 | img1 = torch.randn(1, 3, 256, 448).cuda() 25 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).cuda() 26 | 27 | with torch.no_grad(): 28 | for i in range(100): 29 | out = model(img0, img1, embt, eval=True) 30 | torch.cuda.synchronize() 31 | time_stamp = time.time() 32 | for i in range(1000): 33 | out = model(img0, img1, embt, eval=True) 34 | torch.cuda.synchronize() 35 | print('Time: {:.5f}s'.format((time.time() - time_stamp) / 1)) 36 | 37 | total = sum([param.nelement() for param in model.parameters()]) 38 | print('Parameters: {:.2f}M'.format(total / 1e6)) 39 | 
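# Usage note (not from the upstream script): the loop above warms up with 100 forward
# passes and then times 1000 more, so the printed 'Time' is the total wall-clock time
# for 1000 runs as written, not a per-iteration average. A typical invocation, assuming
# it is launched from the amt/ third-party directory with a CUDA device available:
#
#   python benchmarks/speed_parameters.py -c cfgs/AMT-S.yaml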
-------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/dense_heads/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch import nn 4 | from detectron2.utils.comm import get_world_size 5 | from detectron2.structures import pairwise_iou, Boxes 6 | # from .data import CenterNetCrop 7 | import torch.nn.functional as F 8 | import numpy as np 9 | from detectron2.structures import Boxes, ImageList, Instances 10 | 11 | __all__ = ['reduce_sum', '_transpose'] 12 | 13 | INF = 1000000000 14 | 15 | def _transpose(training_targets, num_loc_list): 16 | ''' 17 | This function is used to transpose image first training targets to 18 | level first ones 19 | :return: level first training targets 20 | ''' 21 | for im_i in range(len(training_targets)): 22 | training_targets[im_i] = torch.split( 23 | training_targets[im_i], num_loc_list, dim=0) 24 | 25 | targets_level_first = [] 26 | for targets_per_level in zip(*training_targets): 27 | targets_level_first.append( 28 | torch.cat(targets_per_level, dim=0)) 29 | return targets_level_first 30 | 31 | 32 | def reduce_sum(tensor): 33 | world_size = get_world_size() 34 | if world_size < 2: 35 | return tensor 36 | tensor = tensor.clone() 37 | torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) 38 | return tensor -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/amt_model/AMT-S.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-S.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 3 18 | 19 | data: 20 | train: 21 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 22 | params: 23 | dataset_dir: data/vimeo_triplet 24 | val: 25 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 26 | params: 27 | dataset_dir: data/vimeo_triplet 28 | train_loader: 29 | batch_size: 24 30 | num_workers: 12 31 | val_loader: 32 | batch_size: 24 33 | num_workers: 3 34 | 35 | logger: 36 | use_wandb: false 37 | resume_id: null 38 | 39 | losses: 40 | - { 41 | name: losses.loss.CharbonnierLoss, 42 | nickname: l_rec, 43 | params: { 44 | loss_weight: 1.0, 45 | keys: [imgt_pred, imgt] 46 | } 47 | } 48 | - { 49 | name: losses.loss.TernaryLoss, 50 | nickname: l_ter, 51 | params: { 52 | loss_weight: 1.0, 53 | keys: [imgt_pred, imgt] 54 | } 55 | } 56 | - { 57 | name: losses.loss.MultipleFlowLoss, 58 | nickname: l_flo, 59 | params: { 60 | loss_weight: 0.002, 61 | keys: [flow0_pred, flow1_pred, flow] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-G.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr1p5e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 1.5e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-G.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 5 18 | data: 19 | train: 
20 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 21 | params: 22 | dataset_dir: data/vimeo_triplet 23 | val: 24 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 25 | params: 26 | dataset_dir: data/vimeo_triplet 27 | train_loader: 28 | batch_size: 24 29 | num_workers: 12 30 | val_loader: 31 | batch_size: 24 32 | num_workers: 3 33 | 34 | logger: 35 | use_wandb: true 36 | resume_id: null 37 | 38 | losses: 39 | - { 40 | name: losses.loss.CharbonnierLoss, 41 | nickname: l_rec, 42 | params: { 43 | loss_weight: 1.0, 44 | keys: [imgt_pred, imgt] 45 | } 46 | } 47 | - { 48 | name: losses.loss.TernaryLoss, 49 | nickname: l_ter, 50 | params: { 51 | loss_weight: 1.0, 52 | keys: [imgt_pred, imgt] 53 | } 54 | } 55 | - { 56 | name: losses.loss.MultipleFlowLoss, 57 | nickname: l_flo, 58 | params: { 59 | loss_weight: 0.005, 60 | keys: [flow0_pred, flow1_pred, flow] 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-L.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-L.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 5 18 | data: 19 | train: 20 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 21 | params: 22 | dataset_dir: data/vimeo_triplet 23 | val: 24 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 25 | params: 26 | dataset_dir: data/vimeo_triplet 27 | train_loader: 28 | batch_size: 24 29 | num_workers: 12 30 | val_loader: 31 | batch_size: 24 32 | num_workers: 3 33 | 34 | logger: 35 | use_wandb: true 36 | resume_id: null 37 | 38 | losses: 39 | - { 40 | name: losses.loss.CharbonnierLoss, 41 | nickname: l_rec, 42 | params: { 43 | loss_weight: 1.0, 44 | keys: [imgt_pred, imgt] 45 | } 46 | } 47 | - { 48 | name: losses.loss.TernaryLoss, 49 | nickname: l_ter, 50 | params: { 51 | loss_weight: 1.0, 52 | keys: [imgt_pred, imgt] 53 | } 54 | } 55 | - { 56 | name: losses.loss.MultipleFlowLoss, 57 | nickname: l_flo, 58 | params: { 59 | loss_weight: 0.002, 60 | keys: [flow0_pred, flow1_pred, flow] 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-S.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-S.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 3 18 | 19 | data: 20 | train: 21 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 22 | params: 23 | dataset_dir: data/vimeo_triplet 24 | val: 25 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 26 | params: 27 | dataset_dir: data/vimeo_triplet 28 | train_loader: 29 | batch_size: 24 30 | num_workers: 12 31 | val_loader: 32 | batch_size: 24 33 | num_workers: 3 34 | 35 | logger: 36 | use_wandb: false 37 | resume_id: null 38 | 39 | losses: 40 | - { 41 | name: losses.loss.CharbonnierLoss, 42 | nickname: l_rec, 43 | params: { 44 | loss_weight: 1.0, 45 | keys: [imgt_pred, imgt] 46 | } 47 | } 
48 | - { 49 | name: losses.loss.TernaryLoss, 50 | nickname: l_ter, 51 | params: { 52 | loss_weight: 1.0, 53 | keys: [imgt_pred, imgt] 54 | } 55 | } 56 | - { 57 | name: losses.loss.MultipleFlowLoss, 58 | nickname: l_flo, 59 | params: { 60 | loss_weight: 0.002, 61 | keys: [flow0_pred, flow1_pred, flow] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/IFRNet.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_geoloss1e-2_300epoch_bs24_lr1e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 1e-4 6 | lr_min: 1e-5 7 | weight_decay: 1e-6 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.IFRNet.Model 14 | 15 | data: 16 | train: 17 | name: datasets.datasets.Vimeo90K_Train_Dataset 18 | params: 19 | dataset_dir: data/vimeo_triplet 20 | val: 21 | name: datasets.datasets.Vimeo90K_Test_Dataset 22 | params: 23 | dataset_dir: data/vimeo_triplet 24 | train_loader: 25 | batch_size: 24 26 | num_workers: 12 27 | val_loader: 28 | batch_size: 24 29 | num_workers: 3 30 | 31 | logger: 32 | use_wandb: true 33 | resume_id: null 34 | 35 | losses: 36 | - { 37 | name: losses.loss.CharbonnierLoss, 38 | nickname: l_rec, 39 | params: { 40 | loss_weight: 1.0, 41 | keys: [imgt_pred, imgt] 42 | } 43 | } 44 | - { 45 | name: losses.loss.TernaryLoss, 46 | nickname: l_ter, 47 | params: { 48 | loss_weight: 1.0, 49 | keys: [imgt_pred, imgt] 50 | } 51 | } 52 | - { 53 | name: losses.loss.IFRFlowLoss, 54 | nickname: l_flo, 55 | params: { 56 | loss_weight: 0.01, 57 | keys: [flow0_pred, flow1_pred, flow] 58 | } 59 | } 60 | - { 61 | name: losses.loss.GeometryLoss, 62 | nickname: l_geo, 63 | params: { 64 | loss_weight: 0.01, 65 | keys: [ft_pred, ft_gt] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, princeton-vl 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/alt_cuda_corr/correlation.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <vector> 3 | 4 | // CUDA forward declarations 5 | std::vector<torch::Tensor> corr_cuda_forward( 6 | torch::Tensor fmap1, 7 | torch::Tensor fmap2, 8 | torch::Tensor coords, 9 | int radius); 10 | 11 | std::vector<torch::Tensor> corr_cuda_backward( 12 | torch::Tensor fmap1, 13 | torch::Tensor fmap2, 14 | torch::Tensor coords, 15 | torch::Tensor corr_grad, 16 | int radius); 17 | 18 | // C++ interface 19 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 20 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 21 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 22 | 23 | std::vector<torch::Tensor> corr_forward( 24 | torch::Tensor fmap1, 25 | torch::Tensor fmap2, 26 | torch::Tensor coords, 27 | int radius) { 28 | CHECK_INPUT(fmap1); 29 | CHECK_INPUT(fmap2); 30 | CHECK_INPUT(coords); 31 | 32 | return corr_cuda_forward(fmap1, fmap2, coords, radius); 33 | } 34 | 35 | 36 | std::vector<torch::Tensor> corr_backward( 37 | torch::Tensor fmap1, 38 | torch::Tensor fmap2, 39 | torch::Tensor coords, 40 | torch::Tensor corr_grad, 41 | int radius) { 42 | CHECK_INPUT(fmap1); 43 | CHECK_INPUT(fmap2); 44 | CHECK_INPUT(coords); 45 | CHECK_INPUT(corr_grad); 46 | 47 | return corr_cuda_backward(fmap1, fmap2, coords, corr_grad, radius); 48 | } 49 | 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", &corr_forward, "CORR forward"); 53 | m.def("backward", &corr_backward, "CORR backward"); 54 | } -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/Base-CenterNet2.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | PROPOSAL_GENERATOR: 4 | NAME: "CenterNet" 5 | BACKBONE: 6 | NAME: "build_p67_resnet_fpn_backbone" 7 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | RESNETS: 9 | DEPTH: 50 10 | OUT_FEATURES: ["res3", "res4", "res5"] 11 | FPN: 12 | IN_FEATURES: ["res3", "res4", "res5"] 13 | ROI_HEADS: 14 | NAME: CustomCascadeROIHeads 15 | IN_FEATURES: ["p3", "p4", "p5", "p6", "p7"] 16 | IOU_THRESHOLDS: [0.6] 17 | NMS_THRESH_TEST: 0.7 18 | ROI_BOX_CASCADE_HEAD: 19 | IOUS: [0.6, 0.7, 0.8] 20 | ROI_BOX_HEAD: 21 | NAME: "FastRCNNConvFCHead" 22 | NUM_FC: 2 23 | POOLER_RESOLUTION: 7 24 | CLS_AGNOSTIC_BBOX_REG: True 25 | MULT_PROPOSAL_SCORE: True 26 | CENTERNET: 27 | REG_WEIGHT: 1.
28 | NOT_NORM_REG: True 29 | ONLY_PROPOSAL: True 30 | WITH_AGN_HM: True 31 | INFERENCE_TH: 0.0001 32 | PRE_NMS_TOPK_TRAIN: 4000 33 | POST_NMS_TOPK_TRAIN: 2000 34 | PRE_NMS_TOPK_TEST: 1000 35 | POST_NMS_TOPK_TEST: 256 36 | NMS_TH_TRAIN: 0.9 37 | NMS_TH_TEST: 0.9 38 | POS_WEIGHT: 0.5 39 | NEG_WEIGHT: 0.5 40 | IGNORE_HIGH_FP: 0.85 41 | DATASETS: 42 | TRAIN: ("coco_2017_train",) 43 | TEST: ("coco_2017_val",) 44 | SOLVER: 45 | IMS_PER_BATCH: 16 46 | BASE_LR: 0.02 47 | STEPS: (60000, 80000) 48 | MAX_ITER: 90000 49 | CHECKPOINT_PERIOD: 1000000000 50 | WARMUP_ITERS: 4000 51 | WARMUP_FACTOR: 0.00025 52 | CLIP_GRADIENTS: 53 | ENABLED: True 54 | INPUT: 55 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 56 | OUTPUT_DIR: "./output/CenterNet2/auto" 57 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def get_world_size(): 6 | """Find OMPI world size without calling mpi functions 7 | :rtype: int 8 | """ 9 | if os.environ.get('PMI_SIZE') is not None: 10 | return int(os.environ.get('PMI_SIZE') or 1) 11 | elif os.environ.get('OMPI_COMM_WORLD_SIZE') is not None: 12 | return int(os.environ.get('OMPI_COMM_WORLD_SIZE') or 1) 13 | else: 14 | return torch.cuda.device_count() 15 | 16 | 17 | def get_global_rank(): 18 | """Find OMPI world rank without calling mpi functions 19 | :rtype: int 20 | """ 21 | if os.environ.get('PMI_RANK') is not None: 22 | return int(os.environ.get('PMI_RANK') or 0) 23 | elif os.environ.get('OMPI_COMM_WORLD_RANK') is not None: 24 | return int(os.environ.get('OMPI_COMM_WORLD_RANK') or 0) 25 | else: 26 | return 0 27 | 28 | 29 | def get_local_rank(): 30 | """Find OMPI local rank without calling mpi functions 31 | :rtype: int 32 | """ 33 | if os.environ.get('MPI_LOCALRANKID') is not None: 34 | return int(os.environ.get('MPI_LOCALRANKID') or 0) 35 | elif os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') is not None: 36 | return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') or 0) 37 | else: 38 | return 0 39 | 40 | 41 | def get_master_ip(): 42 | if os.environ.get('AZ_BATCH_MASTER_NODE') is not None: 43 | return os.environ.get('AZ_BATCH_MASTER_NODE').split(':')[0] 44 | elif os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE') is not None: 45 | return os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE') 46 | else: 47 | return "127.0.0.1" 48 | 49 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from .grit_src.image_dense_captions import image_caption_api, init_demo, dense_pred_to_caption, dense_pred_to_caption_only_name,dense_pred_to_caption_tuple 5 | from detectron2.data.detection_utils import read_image 6 | 7 | class DenseCaptioning(): 8 | def __init__(self, device): 9 | self.device = device 10 | self.demo = None 11 | 12 | 13 | def initialize_model(self, model_weight): 14 | self.demo = init_demo(self.device, model_weight=model_weight) 15 | 16 | def initialize_model_det(self, model_weight): 17 | self.demo = init_demo(self.device, model_weight = model_weight, task="ObjectDet") 18 | 19 | def image_dense_caption(self, image_src): 20 | dense_caption = image_caption_api(image_src, self.device) 21 | print('\033[1;35m' + '*' * 100 + '\033[0m') 22 | print("Step2, Dense Caption:\n") 
23 | print(dense_caption) 24 | print('\033[1;35m' + '*' * 100 + '\033[0m') 25 | return dense_caption 26 | 27 | def run_caption_api(self,image_src): 28 | img = read_image(image_src, format="BGR") 29 | print(img.shape) 30 | predictions, visualized_output = self.demo.run_on_image(img) 31 | new_caption = dense_pred_to_caption_only_name(predictions) 32 | return new_caption 33 | 34 | def run_caption_tensor(self,img): 35 | predictions, visualized_output = self.demo.run_on_image(img) 36 | new_caption = dense_pred_to_caption_tuple(predictions) 37 | return new_caption, visualized_output 38 | 39 | def run_det_tensor(self,img): 40 | predictions, visualized_output = self.demo.run_on_image(img) 41 | return predictions, visualized_output 42 | 43 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/custom_build_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from detectron2.data import transforms as T 3 | from .transforms.custom_augmentation_impl import EfficientDetResizeCrop 4 | 5 | 6 | def build_custom_augmentation(cfg, is_train, scale=None, size=None, \ 7 | min_size=None, max_size=None): 8 | """ 9 | Create a list of default :class:`Augmentation` from config. 10 | Now it includes resizing and flipping. 11 | 12 | Returns: 13 | list[Augmentation] 14 | """ 15 | if cfg.INPUT.CUSTOM_AUG == 'ResizeShortestEdge': 16 | if is_train: 17 | min_size = cfg.INPUT.MIN_SIZE_TRAIN if min_size is None else min_size 18 | max_size = cfg.INPUT.MAX_SIZE_TRAIN if max_size is None else max_size 19 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 20 | else: 21 | min_size = cfg.INPUT.MIN_SIZE_TEST 22 | max_size = cfg.INPUT.MAX_SIZE_TEST 23 | sample_style = "choice" 24 | augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)] 25 | elif cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop': 26 | if is_train: 27 | scale = cfg.INPUT.SCALE_RANGE if scale is None else scale 28 | size = cfg.INPUT.TRAIN_SIZE if size is None else size 29 | else: 30 | scale = (1, 1) 31 | size = cfg.INPUT.TEST_SIZE 32 | augmentation = [EfficientDetResizeCrop(size, scale)] 33 | else: 34 | assert 0, cfg.INPUT.CUSTOM_AUG 35 | 36 | if is_train: 37 | augmentation.append(T.RandomFlip()) 38 | return augmentation 39 | 40 | 41 | build_custom_transform_gen = build_custom_augmentation 42 | """ 43 | Alias for backward-compatibility. 
44 | """ -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | 3 | 4 | def add_grit_config(cfg): 5 | _C = cfg 6 | 7 | _C.MODEL.BEAM_SIZE = 1 8 | _C.MODEL.TRAIN_TASK = ["ObjectDet", "DenseCap"] 9 | _C.MODEL.TEST_TASK = "DenseCap" # This can be varied if the model is jointly trained on multiple tasks 10 | 11 | _C.MODEL.ROI_BOX_HEAD.USE_BIAS = 0.0 # >= 0: not use 12 | _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False 13 | 14 | _C.MODEL.ROI_HEADS.MASK_WEIGHT = 1.0 15 | _C.MODEL.ROI_HEADS.OBJECT_FEAT_POOLER_RES = 14 16 | _C.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False 17 | 18 | # Backbones 19 | _C.MODEL.VIT_LAYERS = 12 20 | 21 | # Text Decoder 22 | _C.TEXT_DECODER = CN() 23 | _C.TEXT_DECODER.VOCAB_SIZE = 30522 24 | _C.TEXT_DECODER.HIDDEN_SIZE = 768 25 | _C.TEXT_DECODER.NUM_LAYERS = 6 26 | _C.TEXT_DECODER.ATTENTION_HEADS = 12 27 | _C.TEXT_DECODER.FEEDFORWARD_SIZE = 768 * 4 28 | 29 | # Multi-dataset dataloader 30 | _C.DATALOADER.DATASET_RATIO = [1, 1] # sample ratio 31 | _C.DATALOADER.DATASET_BS = 1 32 | _C.DATALOADER.DATASET_INPUT_SIZE = [1024, 1024] 33 | _C.DATALOADER.DATASET_INPUT_SCALE = [(0.1, 2.0), (0.1, 2.0)] 34 | _C.DATALOADER.DATASET_MIN_SIZES = [(640, 800), (640, 800)] 35 | _C.DATALOADER.DATASET_MAX_SIZES = [1333, 1333] 36 | 37 | _C.SOLVER.USE_CUSTOM_SOLVER = True 38 | _C.SOLVER.OPTIMIZER = 'ADAMW' 39 | _C.SOLVER.VIT_LAYER_DECAY = True 40 | _C.SOLVER.VIT_LAYER_DECAY_RATE = 0.7 41 | 42 | _C.INPUT.CUSTOM_AUG = 'EfficientDetResizeCrop' 43 | _C.INPUT.TRAIN_SIZE = 1024 44 | _C.INPUT.TEST_SIZE = 1024 45 | _C.INPUT.SCALE_RANGE = (0.1, 2.) 
46 | # 'default' for fixed short / long edge 47 | _C.INPUT.TEST_INPUT_TYPE = 'default' 48 | 49 | _C.FIND_UNUSED_PARAM = True 50 | _C.USE_ACT_CHECKPOINT = True 51 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/datasets/masking_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class TubeMaskingGenerator: 5 | def __init__(self, input_size, mask_ratio): 6 | self.frames, self.height, self.width = input_size 7 | self.num_patches_per_frame = self.height * self.width 8 | self.total_patches = self.frames * self.num_patches_per_frame 9 | self.num_masks_per_frame = int(mask_ratio * self.num_patches_per_frame) 10 | self.total_masks = self.frames * self.num_masks_per_frame 11 | 12 | def __repr__(self): 13 | repr_str = "Maks: total patches {}, mask patches {}".format( 14 | self.total_patches, self.total_masks 15 | ) 16 | return repr_str 17 | 18 | def __call__(self): 19 | mask_per_frame = np.hstack([ 20 | np.zeros(self.num_patches_per_frame - self.num_masks_per_frame), 21 | np.ones(self.num_masks_per_frame), 22 | ]) 23 | np.random.shuffle(mask_per_frame) 24 | mask = np.tile(mask_per_frame, (self.frames, 1)).flatten() 25 | return mask 26 | 27 | 28 | class RandomMaskingGenerator: 29 | def __init__(self, input_size, mask_ratio): 30 | if not isinstance(input_size, tuple): 31 | input_size = (input_size, ) * 3 32 | 33 | self.frames, self.height, self.width = input_size 34 | 35 | self.num_patches = self.frames * self.height * self.width # 8x14x14 36 | self.num_mask = int(mask_ratio * self.num_patches) 37 | 38 | def __repr__(self): 39 | repr_str = "Maks: total patches {}, mask patches {}".format( 40 | self.num_patches, self.num_mask) 41 | return repr_str 42 | 43 | def __call__(self): 44 | mask = np.hstack([ 45 | np.zeros(self.num_patches - self.num_mask), 46 | np.ones(self.num_mask), 47 | ]) 48 | np.random.shuffle(mask) 49 | return mask # [196*8] 50 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/gopro.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | from omegaconf import OmegaConf 7 | 8 | sys.path.append('.') 9 | from utils.build_utils import build_from_cfg 10 | from datasets.gopro_datasets import GoPro_Test_Dataset 11 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 12 | 13 | parser = argparse.ArgumentParser( 14 | prog = 'AMT', 15 | description = 'GOPRO evaluation', 16 | ) 17 | parser.add_argument('-c', '--config', default='cfgs/AMT-S_gopro.yaml') 18 | parser.add_argument('-p', '--ckpt', default='pretrained/gopro_amt-s.pth',) 19 | parser.add_argument('-r', '--root', default='data/GOPRO',) 20 | args = parser.parse_args() 21 | 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | cfg_path = args.config 24 | ckpt_path = args.ckpt 25 | root = args.root 26 | 27 | network_cfg = OmegaConf.load(cfg_path).network 28 | network_name = network_cfg.name 29 | model = build_from_cfg(network_cfg) 30 | ckpt = torch.load(ckpt_path) 31 | model.load_state_dict(ckpt['state_dict']) 32 | model = model.to(device) 33 | model.eval() 34 | 35 | dataset = GoPro_Test_Dataset(dataset_dir=root) 36 | 37 | psnr_list = [] 38 | ssim_list = [] 39 | pbar = tqdm.tqdm(dataset, total=len(dataset)) 40 | for data in pbar: 
41 | input_dict = {} 42 | for k, v in data.items(): 43 | input_dict[k] = v.to(device).unsqueeze(0) 44 | with torch.no_grad(): 45 | imgt_pred = model(**input_dict)['imgt_pred'] 46 | psnr = calculate_psnr(imgt_pred, input_dict['imgt']) 47 | ssim = calculate_ssim(imgt_pred, input_dict['imgt']) 48 | psnr_list.append(psnr) 49 | ssim_list.append(ssim) 50 | avg_psnr = np.mean(psnr_list) 51 | avg_ssim = np.mean(ssim_list) 52 | desc_str = f'[{network_name}/GOPRO] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 53 | pbar.set_description_str(desc_str) 54 | 55 | 56 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/adobe240.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | from omegaconf import OmegaConf 7 | 8 | sys.path.append('.') 9 | from utils.build_utils import build_from_cfg 10 | from datasets.adobe_datasets import Adobe240_Dataset 11 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 12 | 13 | parser = argparse.ArgumentParser( 14 | prog = 'AMT', 15 | description = 'Adobe240 evaluation', 16 | ) 17 | parser.add_argument('-c', '--config', default='cfgs/AMT-S_gopro.yaml') 18 | parser.add_argument('-p', '--ckpt', default='pretrained/gopro_amt-s.pth',) 19 | parser.add_argument('-r', '--root', default='data/Adobe240/test_frames',) 20 | args = parser.parse_args() 21 | 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | cfg_path = args.config 24 | ckpt_path = args.ckpt 25 | root = args.root 26 | 27 | network_cfg = OmegaConf.load(cfg_path).network 28 | network_name = network_cfg.name 29 | model = build_from_cfg(network_cfg) 30 | ckpt = torch.load(ckpt_path) 31 | model.load_state_dict(ckpt['state_dict']) 32 | model = model.to(device) 33 | model.eval() 34 | 35 | dataset = Adobe240_Dataset(dataset_dir=root, augment=False) 36 | 37 | psnr_list = [] 38 | ssim_list = [] 39 | pbar = tqdm.tqdm(dataset, total=len(dataset)) 40 | for data in pbar: 41 | input_dict = {} 42 | for k, v in data.items(): 43 | input_dict[k] = v.to(device).unsqueeze(0) 44 | with torch.no_grad(): 45 | imgt_pred = model(**input_dict)['imgt_pred'] 46 | psnr = calculate_psnr(imgt_pred, input_dict['imgt']) 47 | ssim = calculate_ssim(imgt_pred, input_dict['imgt']) 48 | psnr_list.append(psnr) 49 | ssim_list.append(ssim) 50 | avg_psnr = np.mean(psnr_list) 51 | avg_ssim = np.mean(ssim_list) 52 | desc_str = f'[{network_name}/Adobe240] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 53 | pbar.set_description_str(desc_str) 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | DATASET_DIR=General-Bench-Openset 4 | NLP_MODEL_NAME=Qwen/Qwen2.5-7B-Instruct 5 | AUDIO_MODEL_NAME=Qwen/Qwen2-Audio-7B-Instruct 6 | VIDEO_MODEL_NAME=Qwen/Qwen2.5-VL-3B-Instruct 7 | IMAGE_MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct 8 | 3D_MODEL_NAME=Qwen/Qwen2.5-3B-Instruct 9 | 10 | # 解析 step 参数 11 | STEP="123" 12 | for arg in "$@"; do 13 | case $arg in 14 | --step=*) 15 | STEP="${arg#*=}" 16 | ;; 17 | --step) 18 | shift 19 | STEP="$1" 20 | ;; 21 | esac 22 | done 23 | 24 | contains_step() { 25 | case "$STEP" in 26 | *$1*) return 0 ;; 27 | *) return 1 ;; 28 | esac 29 | } 30 | 31 | # Step1: Generate predictions for NLP, Image, Audio, Video, 3D tasks 32 | 
if contains_step 1; then 33 | # NLP 34 | python predictors/nlp_predictor.py --dataset_dir ${DATASET_DIR}/nlp --model_name ${NLP_MODEL_NAME} 35 | 36 | # Audio 37 | python predictors/audio_predict_comprehension.py -m Qwen/Qwen2-Audio-7B-Instruct -d ${DATASET_DIR}/audio/comprehension/ -o ${DATASET_DIR}/audio/predictions/comprehension/ -t AccentClassification AccentSexClassification 38 | python predictors/audio_predict_generation.py -m SpeechGPT -d ${DATASET_DIR}/audio/generation/ -o ${DATASET_DIR}/audio/predictions/generation/ -t SingleCaptionToAudio VideoToAudio ImageToSpeech 39 | 40 | # Video 41 | python predictors/video_comprehension_tasks.py 42 | python predictors/video_comprehension_flow_matching_tracking.py 43 | python predictors/video_comprehension_qa_caption.py 44 | python predictors/video_translation_restoration_superresolution_objectdetection.py 45 | python predictors/video_generation_evaluate_kit.py 46 | fi 47 | 48 | MODEL_NAME=Qwen2.5-7B-Instruct 49 | # Step2: Obtain the score for each task 50 | if contains_step 2; then 51 | python register.py -d ${DATASET_DIR} -t references/template_result.xlsx -o outcome -m ${MODEL_NAME} -p prediction.json 52 | fi 53 | 54 | MODEL_NAME=Qwen2.5-7B-Instruct 55 | # Step3: Obtain the Level score 56 | if contains_step 3; then 57 | python ranker.py -p outcome/${MODEL_NAME}_result.xlsx -m ${MODEL_NAME} 58 | fi -------------------------------------------------------------------------------- /utils/special_metrix.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | def _sigmoid(x): 5 | return 1 / (1 + math.exp(-x)) 6 | 7 | 8 | def _2_sigmoid_minus_1(x): 9 | return 2 * _sigmoid(x) - 1 10 | 11 | def _tanh(x): 12 | return math.tanh(x) 13 | 14 | 15 | # mapping param for special metrix 16 | special_metric_dict = { 17 | # with T 18 | 'MAE': 50, 19 | 'RMS': 50, 20 | 'MSE': 5, 21 | 'RMSE': 5, 22 | 'ABSREL': 0.1, 23 | 'EPE': 1, 24 | 'FID': 25, 25 | 'FVD': 100, 26 | 'FAD': 10, 27 | 'PSNR': 1 / 20, # higher is better 28 | 'SAD': 10, 29 | 'RTE': 0.5, 30 | 'CD': 1, 31 | 'MCD': 5, 32 | # without T 33 | 'WER': None, 34 | 'MS-SSIM': None, 35 | 'MOS': None, 36 | } 37 | 38 | HIGHER_IS_BETTER = [ 39 | 'PSNR', 40 | ] 41 | 42 | def map_function_for_special(metrix: str, score: float) -> float: 43 | """ 44 | Score mapping function for special metrics. 45 | >>> metrix: metrix name, str, e.g., 'MAE'. 46 | >>> score: task score, float, e.g., 5.3. 47 | return: mapped scores, float. 48 | """ 49 | metrix = metrix.upper() 50 | T = special_metric_dict[metrix] 51 | 52 | assert score > 0, f'score should be > 0, but found: {score}' 53 | 54 | if metrix in HIGHER_IS_BETTER: 55 | y = _tanh(T * score) 56 | elif metrix == 'WER': 57 | y = 1 - score 58 | elif metrix == 'MS-SSIM': 59 | y = (score + 1) / 2 60 | elif metrix == 'MOS': 61 | y = (score - 1) / 4 62 | else: # lower is better 63 | y = _2_sigmoid_minus_1(T / score) 64 | 65 | return y * 100 # Convert to percentage scale 66 | 67 | # • Normalizing WER: 68 | # y = 1 − x, where x ∈ [0, 1], y ∈ [0, 1]. 69 | # • Normalizing MS-SSIM: 70 | # y = (x + 1) / 2 , where x ∈ [−1, 1], y ∈ [0, 1]. 71 | # • Normalizing MOS: 72 | # y = x − 1 / 4 , where x ∈ [1, 5], y ∈ [0, 1]. 
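# A few worked examples implied by the mapping table and formulas above (values rounded):
#   PSNR = 30  (higher is better, T = 1/20):  y = tanh(30 / 20) * 100              ≈ 90.5
#   FID  = 25  (lower is better,  T = 25):    y = (2 * sigmoid(25 / 25) - 1) * 100 ≈ 46.2
#   MOS  = 4.2 (no T, range [1, 5]):          y = ((4.2 - 1) / 4) * 100            = 80.0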
73 | 74 | if __name__ == '__main__': 75 | r = random.random() 76 | print(f"{r = }") 77 | print(f"{_sigmoid(r) = }") 78 | print(f"{_2_sigmoid_minus_1(r) = }") 79 | print(f"{_tanh(r) = }") 80 | print(f"{_tanh(r / 2) = }") -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/ucf101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tqdm 4 | import torch 5 | import argparse 6 | import numpy as np 7 | import os.path as osp 8 | from omegaconf import OmegaConf 9 | 10 | sys.path.append('.') 11 | from utils.utils import read, img2tensor 12 | from utils.build_utils import build_from_cfg 13 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 14 | 15 | parser = argparse.ArgumentParser( 16 | prog = 'AMT', 17 | description = 'UCF101 evaluation', 18 | ) 19 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 20 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth') 21 | parser.add_argument('-r', '--root', default='data/ucf101_interp_ours') 22 | args = parser.parse_args() 23 | 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | cfg_path = args.config 26 | ckpt_path = args.ckpt 27 | root = args.root 28 | 29 | network_cfg = OmegaConf.load(cfg_path).network 30 | network_name = network_cfg.name 31 | model = build_from_cfg(network_cfg) 32 | ckpt = torch.load(ckpt_path) 33 | model.load_state_dict(ckpt['state_dict']) 34 | model = model.to(device) 35 | model.eval() 36 | 37 | dirs = sorted(os.listdir(root)) 38 | psnr_list = [] 39 | ssim_list = [] 40 | pbar = tqdm.tqdm(dirs, total=len(dirs)) 41 | for d in pbar: 42 | dir_path = osp.join(root, d) 43 | I0 = img2tensor(read(osp.join(dir_path, 'frame_00.png'))).to(device) 44 | I1 = img2tensor(read(osp.join(dir_path, 'frame_01_gt.png'))).to(device) 45 | I2 = img2tensor(read(osp.join(dir_path, 'frame_02.png'))).to(device) 46 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 47 | 48 | I1_pred = model(I0, I2, embt, eval=True)['imgt_pred'] 49 | 50 | psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy() 51 | ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy() 52 | 53 | psnr_list.append(psnr) 54 | ssim_list.append(ssim) 55 | 56 | avg_psnr = np.mean(psnr_list) 57 | avg_ssim = np.mean(ssim_list) 58 | desc_str = f'[{network_name}/UCF101] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 59 | pbar.set_description_str(desc_str) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/Base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GRiT" 3 | MASK_ON: True 4 | PROPOSAL_GENERATOR: 5 | NAME: "CenterNet" 6 | FPN: 7 | IN_FEATURES: ["layer3", "layer4", "layer5"] 8 | PIXEL_MEAN: [123.675, 116.280, 103.530] 9 | PIXEL_STD: [58.395, 57.12, 57.375] 10 | ROI_HEADS: 11 | NAME: GRiTROIHeadsAndTextDecoder 12 | IN_FEATURES: ["p3", "p4", "p5"] 13 | IOU_THRESHOLDS: [0.6] 14 | NUM_CLASSES: 1 15 | SCORE_THRESH_TEST: 0.02 16 | NMS_THRESH_TEST: 0.5 17 | OBJECT_FEAT_POOLER_RES: 14 18 | ROI_BOX_CASCADE_HEAD: 19 | IOUS: [0.6, 0.7, 0.8] 20 | ROI_BOX_HEAD: 21 | NAME: "FastRCNNConvFCHead" 22 | NUM_FC: 2 23 | POOLER_RESOLUTION: 7 24 | CLS_AGNOSTIC_BBOX_REG: True 25 | MULT_PROPOSAL_SCORE: True 26 | ROI_MASK_HEAD: 27 | NAME: "MaskRCNNConvUpsampleHead" 28 | NUM_CONV: 4 29 | POOLER_RESOLUTION: 14 30 | 
CLS_AGNOSTIC_MASK: True 31 | CENTERNET: 32 | NUM_CLASSES: 1 33 | REG_WEIGHT: 1. 34 | NOT_NORM_REG: True 35 | ONLY_PROPOSAL: True 36 | WITH_AGN_HM: True 37 | INFERENCE_TH: 0.0001 38 | PRE_NMS_TOPK_TRAIN: 4000 39 | POST_NMS_TOPK_TRAIN: 2000 40 | PRE_NMS_TOPK_TEST: 1000 41 | POST_NMS_TOPK_TEST: 256 42 | NMS_TH_TRAIN: 0.9 43 | NMS_TH_TEST: 0.9 44 | POS_WEIGHT: 0.5 45 | NEG_WEIGHT: 0.5 46 | IGNORE_HIGH_FP: 0.85 47 | DATASETS: 48 | TRAIN: ("coco_2017_train",) 49 | TEST: ("coco_2017_val",) 50 | DATALOADER: 51 | SAMPLER_TRAIN: "MultiDatasetSampler" 52 | DATASET_RATIO: [1] 53 | DATASET_INPUT_SIZE: [1024] 54 | DATASET_INPUT_SCALE: [[0.1, 2.0]] 55 | FILTER_EMPTY_ANNOTATIONS: False 56 | NUM_WORKERS: 8 57 | TEST: 58 | DETECTIONS_PER_IMAGE: 256 59 | SOLVER: 60 | LR_SCHEDULER_NAME: "WarmupCosineLR" 61 | CHECKPOINT_PERIOD: 10000 62 | WARMUP_ITERS: 1000 63 | WARMUP_FACTOR: 0.001 64 | USE_CUSTOM_SOLVER: True 65 | OPTIMIZER: "ADAMW" 66 | MAX_ITER: 180000 67 | IMS_PER_BATCH: 64 68 | BASE_LR: 0.00008 69 | VIT_LAYER_DECAY: True 70 | CLIP_GRADIENTS: 71 | ENABLED: True 72 | INPUT: 73 | FORMAT: RGB 74 | CUSTOM_AUG: EfficientDetResizeCrop 75 | TRAIN_SIZE: 640 76 | USE_ACT_CHECKPOINT: True 77 | VERSION: 2 -------------------------------------------------------------------------------- /README_ZH.md: -------------------------------------------------------------------------------- 1 | # GenBench 评分系统 - 用户使用说明 2 | 3 |
4 | English | 中文 5 |
6 | 7 | --- 8 | 9 | 本系统用于评估大模型在 General-Bench 多模态任务集上的表现。用户只需一条命令即可完成预测、评分和最终得分计算。 10 | 11 | ## 环境准备 12 | 13 | - Python 3.9 及以上 14 | - 推荐提前安装依赖(如 pandas, numpy, openpyxl 等) 15 | - Video Generation评测,需要按照video_generation_evaluation/README.md中的步骤安装依赖 16 | - Video Comprehension评测,需要按照[sa2va](https://github.com/magic-research/Sa2VA)中的README.md中的步骤安装依赖。 17 | 18 | ## 数据集下载 19 | 20 | - **Open Set(公开数据集)**:请从 [HuggingFace General-Bench-Openset](https://huggingface.co/datasets/General-Level/General-Bench-Openset) 下载全部数据,解压后放入 `General-Bench-Openset/` 目录。 21 | - **Close Set(私有数据集)**:请从 [HuggingFace General-Bench-Closeset](https://huggingface.co/datasets/General-Level/General-Bench-Closeset) 下载全部数据,解压后放入 `General-Bench-Closeset/` 目录。 22 | 23 | ## 一键运行 24 | 25 | 请直接运行主脚本 `run.sh`,即可完成全部流程: 26 | 27 | ```bash 28 | bash run.sh 29 | ``` 30 | 31 | 该命令将依次完成: 32 | 1. 生成各模态预测结果 33 | 2. 计算各任务得分 34 | 3. 计算最终 Level 得分 35 | 36 | ## 分步运行(可选) 37 | 38 | 如只需运行部分步骤,可使用 `--step` 参数: 39 | 40 | - 只运行第1步(生成预测): 41 | ```bash 42 | bash run.sh --step 1 43 | ``` 44 | - 只运行第1、2步: 45 | ```bash 46 | bash run.sh --step 12 47 | ``` 48 | - 只运行第2、3步: 49 | ```bash 50 | bash run.sh --step 23 51 | ``` 52 | - 不加参数默认全部执行(等价于 `--step 123`) 53 | 54 | - 步骤1:生成预测结果prediction.json,存在每一个数据集的annotation.json同级目录下 55 | - 步骤2:计算每个任务的得分,存在outcome/{model_name}_result.xlsx中 56 | - 步骤3:计算相关模型的Level得分 57 | 58 | > **注意:** 59 | > - 使用 **Close Set(私有数据集)** 时,只需运行 step1(即 `bash run.sh --step 1`),并将生成的 prediction.json 提交到系统。 60 | > - 使用 **Open Set(公开数据集)** 时,需依次运行 step1、step2、step3(即 `bash run.sh --step 123`),完成全部评测流程。 61 | 62 | ## 结果查看 63 | 64 | - 预测结果(prediction.json)会输出到每个任务对应的数据集文件夹下,与 annotation.json 同级。 65 | - 评分结果(如 Qwen2.5-7B-Instruct_result.xlsx)会输出到 outcome/ 目录。 66 | - 最终 Level 得分会直接在终端打印输出。 67 | 68 | ## 目录说明 69 | 70 | - `General-Bench-Openset/`:公开数据集目录 71 | - `General-Bench-Closeset/`:私有数据集目录 72 | - `outcome/`:输出结果目录 73 | - `references/`:参考模板目录 74 | - `run.sh`:主运行脚本(推荐用户只用此脚本) 75 | 76 | ## 常见问题 77 | 78 | - 如遇依赖缺失,请根据报错信息安装相应 Python 包。 79 | - 如需自定义模型或数据路径,可编辑 `run.sh` 脚本中的相关变量。 80 | 81 | --- 82 | 83 | 如需进一步帮助,请联系系统维护者或查阅详细开发文档。 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/trainers/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import wandb 3 | import shutil 4 | import logging 5 | import os.path as osp 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | def mv_archived_logger(name): 10 | timestamp = time.strftime("%Y-%m-%d_%H:%M:%S_", time.localtime()) 11 | basename = 'archived_' + timestamp + osp.basename(name) 12 | archived_name = osp.join(osp.dirname(name), basename) 13 | shutil.move(name, archived_name) 14 | 15 | 16 | class CustomLogger: 17 | def __init__(self, common_cfg, tb_cfg=None, wandb_cfg=None, rank=0): 18 | global global_logger 19 | self.rank = rank 20 | 21 | if self.rank == 0: 22 | self.logger = logging.getLogger('VFI') 23 | self.logger.setLevel(logging.INFO) 24 | format_str = logging.Formatter(common_cfg['format']) 25 | 26 | console_handler = logging.StreamHandler() 27 | console_handler.setFormatter(format_str) 28 | 29 | if osp.exists(common_cfg['filename']): 30 | mv_archived_logger(common_cfg['filename']) 31 | 32 | file_handler = logging.FileHandler(common_cfg['filename'], 33 | common_cfg['filemode']) 34 | file_handler.setFormatter(format_str) 35 | 36 | self.logger.addHandler(console_handler) 37 | self.logger.addHandler(file_handler) 38 | self.tb_logger = None 39 | 40 
| self.enable_wandb = False 41 | 42 | if wandb_cfg is not None: 43 | self.enable_wandb = True 44 | wandb.init(**wandb_cfg) 45 | 46 | if tb_cfg is not None: 47 | self.tb_logger = SummaryWriter(**tb_cfg) 48 | 49 | global_logger = self 50 | 51 | def __call__(self, msg=None, level=logging.INFO, tb_msg=None): 52 | if self.rank != 0: 53 | return 54 | if msg is not None: 55 | self.logger.log(level, msg) 56 | 57 | if self.tb_logger is not None and tb_msg is not None: 58 | self.tb_logger.add_scalar(*tb_msg) 59 | 60 | def close(self): 61 | if self.rank == 0 and self.enable_wandb: 62 | wandb.finish() 63 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/transforms/custom_augmentation_impl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py 4 | # Modified by Xingyi Zhou 5 | # The original code is under Apache-2.0 License 6 | import numpy as np 7 | from PIL import Image 8 | 9 | from detectron2.data.transforms.augmentation import Augmentation 10 | from .custom_transform import EfficientDetResizeCropTransform 11 | 12 | __all__ = [ 13 | "EfficientDetResizeCrop", 14 | ] 15 | 16 | 17 | class EfficientDetResizeCrop(Augmentation): 18 | """ 19 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 20 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 21 | """ 22 | 23 | def __init__( 24 | self, size, scale, interp=Image.BILINEAR 25 | ): 26 | """ 27 | """ 28 | super().__init__() 29 | self.target_size = (size, size) 30 | self.scale = scale 31 | self.interp = interp 32 | 33 | def get_transform(self, img): 34 | # Select a random scale factor. 35 | scale_factor = np.random.uniform(*self.scale) 36 | scaled_target_height = scale_factor * self.target_size[0] 37 | scaled_target_width = scale_factor * self.target_size[1] 38 | # Recompute the accurate scale_factor using rounded scaled image size. 
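# The effective scale is the smaller of the two per-axis ratios computed below, so the whole image fits inside the randomly scaled target; any leftover extent is removed by the random crop offset.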
39 | width, height = img.shape[1], img.shape[0] 40 | img_scale_y = scaled_target_height / height 41 | img_scale_x = scaled_target_width / width 42 | img_scale = min(img_scale_y, img_scale_x) 43 | 44 | # Select non-zero random offset (x, y) if scaled image is larger than target size 45 | scaled_h = int(height * img_scale) 46 | scaled_w = int(width * img_scale) 47 | offset_y = scaled_h - self.target_size[0] 48 | offset_x = scaled_w - self.target_size[1] 49 | offset_y = int(max(0.0, float(offset_y)) * np.random.uniform(0, 1)) 50 | offset_x = int(max(0.0, float(offset_x)) * np.random.uniform(0, 1)) 51 | return EfficientDetResizeCropTransform( 52 | scaled_h, scaled_w, offset_y, offset_x, img_scale, self.target_size, self.interp) 53 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/vimeo90k.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | import os.path as osp 7 | from omegaconf import OmegaConf 8 | 9 | sys.path.append('.') 10 | from utils.utils import read, img2tensor 11 | from utils.build_utils import build_from_cfg 12 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 13 | 14 | parser = argparse.ArgumentParser( 15 | prog = 'AMT', 16 | description = 'Vimeo90K evaluation', 17 | ) 18 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 19 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth',) 20 | parser.add_argument('-r', '--root', default='data/vimeo_triplet',) 21 | args = parser.parse_args() 22 | 23 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 24 | cfg_path = args.config 25 | ckpt_path = args.ckpt 26 | root = args.root 27 | 28 | network_cfg = OmegaConf.load(cfg_path).network 29 | network_name = network_cfg.name 30 | model = build_from_cfg(network_cfg) 31 | ckpt = torch.load(ckpt_path) 32 | model.load_state_dict(ckpt['state_dict']) 33 | model = model.to(device) 34 | model.eval() 35 | 36 | with open(osp.join(root, 'tri_testlist.txt'), 'r') as fr: 37 | file_list = fr.readlines() 38 | 39 | psnr_list = [] 40 | ssim_list = [] 41 | 42 | pbar = tqdm.tqdm(file_list, total=len(file_list)) 43 | for name in pbar: 44 | name = str(name).strip() 45 | if(len(name) <= 1): 46 | continue 47 | dir_path = osp.join(root, 'sequences', name) 48 | I0 = img2tensor(read(osp.join(dir_path, 'im1.png'))).to(device) 49 | I1 = img2tensor(read(osp.join(dir_path, 'im2.png'))).to(device) 50 | I2 = img2tensor(read(osp.join(dir_path, 'im3.png'))).to(device) 51 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 52 | 53 | I1_pred = model(I0, I2, embt, 54 | scale_factor=1.0, eval=True)['imgt_pred'] 55 | 56 | psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy() 57 | ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy() 58 | 59 | psnr_list.append(psnr) 60 | ssim_list.append(ssim) 61 | avg_psnr = np.mean(psnr_list) 62 | avg_ssim = np.mean(ssim_list) 63 | desc_str = f'[{network_name}/Vimeo90K] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 64 | pbar.set_description_str(desc_str) 65 | 66 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/clip_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | import clip 4 | 5 | import torch 6 | import torch.nn.functional as F 7 
| 8 | from vbench2_beta_long.utils import reorganize_clips_results 9 | from toolkit.utils import load_dimension_info, clip_transform, read_frames_decord_by_fps 10 | import logging 11 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 12 | logger = logging.getLogger(__name__) 13 | 14 | def clip_alignment(clip_model, video_dict, preprocess, device): 15 | sim = [] 16 | video_results = [] 17 | 18 | image_transform = clip_transform(224) 19 | for info in tqdm(video_dict): 20 | 21 | query = info["prompt"] 22 | text = clip.tokenize([query], truncate=True).to(device) 23 | text_feature = clip_model.encode_text(text) 24 | text_feature = F.normalize(text_feature, dim=-1) 25 | 26 | video_list = info["video_list"] 27 | for video_path in video_list: 28 | with torch.no_grad(): 29 | images = read_frames_decord_by_fps(video_path, num_frames=8, sample="middle") 30 | images = image_transform(images) 31 | images = images.to(device) 32 | 33 | image_features = clip_model.encode_image(images) 34 | image_features = F.normalize(image_features, dim=-1, p=2) 35 | 36 | video_sim = image_features @ text_feature.T 37 | video_sim = np.mean(video_sim.cpu().tolist()) 38 | sim.append(video_sim) 39 | 40 | video_results.append({'video_path': video_path, 'video_results': video_sim}) 41 | 42 | avg_sim = np.mean(sim) 43 | 44 | return avg_sim, video_results 45 | 46 | 47 | def compute_clip_score(json_dir, device, submodules_list, **kwargs): 48 | 49 | clip_model, preprocess = clip.load("ViT-B/32", device=device) 50 | logger.info("Initialize CLIP success") 51 | 52 | _, video_dict = load_dimension_info(json_dir, dimension='clip_score', lang='en') 53 | all_results, video_results = clip_alignment(clip_model, video_dict, preprocess, device) 54 | return all_results, video_results 55 | 56 | 57 | def compute_long_clip_score(json_dir, device, submodules_list, **kwargs): 58 | all_results, detailed_results = compute_clip_score(json_dir, device, submodules_list, **kwargs) 59 | 60 | return reorganize_clips_results(detailed_results) 61 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/temporal_flickering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | import cv2 4 | from toolkit.utils import load_dimension_info 5 | 6 | from .distributed import ( 7 | get_world_size, 8 | get_rank, 9 | all_gather, 10 | barrier, 11 | distribute_list_to_rank, 12 | gather_list_of_dict, 13 | ) 14 | 15 | 16 | def get_frames(video_path): 17 | frames = [] 18 | video = cv2.VideoCapture(video_path) 19 | while video.isOpened(): 20 | success, frame = video.read() 21 | if success: 22 | frames.append(frame) 23 | else: 24 | break 25 | video.release() 26 | assert frames != [] 27 | return frames 28 | 29 | 30 | def mae_seq(frames): 31 | ssds = [] 32 | for i in range(len(frames)-1): 33 | ssds.append(calculate_mae(frames[i], frames[i+1])) 34 | return np.array(ssds) 35 | 36 | 37 | def calculate_mae(img1, img2): 38 | """Computing the mean absolute error (MAE) between two images.""" 39 | if img1.shape != img2.shape: 40 | print("Images don't have the same shape.") 41 | return 42 | return np.mean(cv2.absdiff(np.array(img1, dtype=np.float32), np.array(img2, dtype=np.float32))) 43 | 44 | 45 | def cal_score(video_path): 46 | """please ensure the video is static""" 47 | frames = get_frames(video_path) 48 | score_seq = mae_seq(frames) 49 | return (255.0 - 
np.mean(score_seq).item())/255.0 50 | 51 | 52 | def temporal_flickering(video_list): 53 | sim = [] 54 | video_results = [] 55 | for video_path in tqdm(video_list, disable=get_rank() > 0): 56 | try: 57 | score_per_video = cal_score(video_path) 58 | except AssertionError: 59 | continue 60 | video_results.append({'video_path': video_path, 'video_results': score_per_video}) 61 | sim.append(score_per_video) 62 | avg_score = np.mean(sim) 63 | return avg_score, video_results 64 | 65 | 66 | def compute_temporal_flickering(json_dir, device, submodules_list, **kwargs): 67 | video_list, _ = load_dimension_info(json_dir, dimension='temporal_flickering', lang='en') 68 | video_list = distribute_list_to_rank(video_list) 69 | all_results, video_results = temporal_flickering(video_list) 70 | if get_world_size() > 1: 71 | video_results = gather_list_of_dict(video_results) 72 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 73 | return all_results, video_results 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from shutil import copyfile 4 | import torch.distributed as dist 5 | import torch 6 | import importlib 7 | import datetime 8 | from utils.dist_utils import ( 9 | get_world_size, 10 | ) 11 | from omegaconf import OmegaConf 12 | from utils.utils import seed_all 13 | parser = argparse.ArgumentParser(description='VFI') 14 | parser.add_argument('-c', '--config', type=str) 15 | parser.add_argument('-p', '--port', default='23455', type=str) 16 | parser.add_argument('--local_rank', default='0') 17 | 18 | args = parser.parse_args() 19 | 20 | 21 | def main_worker(rank, config): 22 | if 'local_rank' not in config: 23 | config['local_rank'] = config['global_rank'] = rank 24 | if torch.cuda.is_available(): 25 | print(f'Rank {rank} is available') 26 | config['device'] = f"cuda:{rank}" 27 | if config['distributed']: 28 | dist.init_process_group(backend='nccl', 29 | timeout=datetime.timedelta(seconds=5400)) 30 | else: 31 | config['device'] = 'cpu' 32 | 33 | cfg_name = os.path.basename(args.config).split('.')[0] 34 | config['exp_name'] = cfg_name + '_' + config['exp_name'] 35 | config['save_dir'] = os.path.join(config['save_dir'], config['exp_name']) 36 | 37 | if (not config['distributed']) or rank == 0: 38 | os.makedirs(config['save_dir'], exist_ok=True) 39 | os.makedirs(f'{config["save_dir"]}/ckpts', exist_ok=True) 40 | config_path = os.path.join(config['save_dir'], 41 | args.config.split('/')[-1]) 42 | if not os.path.isfile(config_path): 43 | copyfile(args.config, config_path) 44 | print('[**] create folder {}'.format(config['save_dir'])) 45 | 46 | trainer_name = config.get('trainer_type', 'base_trainer') 47 | print(f'using GPU {rank} for training') 48 | if rank == 0: 49 | print(trainer_name) 50 | trainer_pack = importlib.import_module('trainers.' 
+ trainer_name) 51 | trainer = trainer_pack.Trainer(config) 52 | 53 | trainer.train() 54 | 55 | 56 | if __name__ == "__main__": 57 | torch.backends.cudnn.benchmark = True 58 | cfg = OmegaConf.load(args.config) 59 | seed_all(cfg.seed) 60 | rank = int(args.local_rank) 61 | torch.cuda.set_device(torch.device(f'cuda:{rank}')) 62 | # setting distributed configurations 63 | cfg['world_size'] = get_world_size() 64 | cfg['local_rank'] = rank 65 | if rank == 0: 66 | print('world_size: ', cfg['world_size']) 67 | main_worker(rank, cfg) 68 | 69 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/vimeo90k_tta.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | import os.path as osp 7 | from omegaconf import OmegaConf 8 | 9 | sys.path.append('.') 10 | from utils.utils import read, img2tensor 11 | from utils.build_utils import build_from_cfg 12 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 13 | 14 | parser = argparse.ArgumentParser( 15 | prog = 'AMT', 16 | description = 'Vimeo90K evaluation (with Test-Time Augmentation)', 17 | ) 18 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 19 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth',) 20 | parser.add_argument('-r', '--root', default='data/vimeo_triplet',) 21 | args = parser.parse_args() 22 | 23 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 24 | cfg_path = args.config 25 | ckpt_path = args.ckpt 26 | root = args.root 27 | 28 | network_cfg = OmegaConf.load(cfg_path).network 29 | network_name = network_cfg.name 30 | model = build_from_cfg(network_cfg) 31 | ckpt = torch.load(ckpt_path) 32 | model.load_state_dict(ckpt['state_dict']) 33 | model = model.to(device) 34 | model.eval() 35 | 36 | with open(osp.join(root, 'tri_testlist.txt'), 'r') as fr: 37 | file_list = fr.readlines() 38 | 39 | psnr_list = [] 40 | ssim_list = [] 41 | 42 | pbar = tqdm.tqdm(file_list, total=len(file_list)) 43 | for name in pbar: 44 | name = str(name).strip() 45 | if(len(name) <= 1): 46 | continue 47 | dir_path = osp.join(root, 'sequences', name) 48 | I0 = img2tensor(read(osp.join(dir_path, 'im1.png'))).to(device) 49 | I1 = img2tensor(read(osp.join(dir_path, 'im2.png'))).to(device) 50 | I2 = img2tensor(read(osp.join(dir_path, 'im3.png'))).to(device) 51 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 52 | 53 | I1_pred1 = model(I0, I2, embt, 54 | scale_factor=1.0, eval=True)['imgt_pred'] 55 | I1_pred2 = model(torch.flip(I0, [2]), torch.flip(I2, [2]), embt, 56 | scale_factor=1.0, eval=True)['imgt_pred'] 57 | I1_pred = I1_pred1 / 2 + torch.flip(I1_pred2, [2]) / 2 58 | psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy() 59 | ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy() 60 | 61 | psnr_list.append(psnr) 62 | ssim_list.append(ssim) 63 | avg_psnr = np.mean(psnr_list) 64 | avg_ssim = np.mean(ssim_list) 65 | desc_str = f'[{network_name}/Vimeo90K] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 66 | pbar.set_description_str(desc_str) 67 | 68 | -------------------------------------------------------------------------------- /processors/three_d_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import
BaseModalityProcessor 4 | 5 | class ThreeDProcessor(BaseModalityProcessor): 6 | """3D模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, pred_json_file) 9 | 10 | def process_comprehension(self) -> List[TaskResult]: 11 | """处理3D理解类任务 12 | 13 | 需要返回一个TaskResult列表,每个TaskResult包含: 14 | - task_name: 任务名称,例如 "3d_object_detection", "point_cloud_segmentation" 等 15 | - metric: 评估指标,例如 "mAP", "IoU" 等 16 | - score: 评估分数 17 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 18 | 示例格式: 19 | return [ 20 | TaskResult( 21 | task_name="3d_object_detection", 22 | metric="mAP", 23 | score=0.76 24 | ), 25 | TaskResult( 26 | task_name="point_cloud_segmentation", 27 | metric="IoU", 28 | score=0.82 29 | ) 30 | ] 31 | """ 32 | return [] 33 | 34 | def process_generation(self) -> List[TaskResult]: 35 | """处理3D生成类任务 36 | 37 | 需要返回一个TaskResult列表,每个TaskResult包含: 38 | - task_name: 任务名称,例如 "3d_reconstruction", "mesh_generation" 等 39 | - metric: 评估指标,例如 "CD", "F1" 等 40 | - score: 评估分数 41 | - task_type: 这里需要指定为 TaskType.GENERATION 42 | 43 | 示例格式: 44 | return [ 45 | TaskResult( 46 | task_name="3d_reconstruction", 47 | metric="CD", 48 | score=0.15, 49 | task_type=TaskType.GENERATION 50 | ), 51 | TaskResult( 52 | task_name="mesh_generation", 53 | metric="F1", 54 | score=0.88, 55 | task_type=TaskType.GENERATION 56 | ) 57 | ] 58 | """ 59 | return [] 60 | 61 | # 使用示例 62 | if __name__ == "__main__": 63 | processor = ThreeDProcessor(ModalityType.THREE_D, "") 64 | 65 | # 测试理解任务 66 | print("\n理解类任务结果:") 67 | for task in processor.process_comprehension(): 68 | print(f"任务: {task.task_name}") 69 | print(f"指标: {task.metric}") 70 | print(f"分数: {task.score}") 71 | print("-" * 20) 72 | 73 | # 测试生成任务 74 | print("\n生成类任务结果:") 75 | for task in processor.process_generation(): 76 | print(f"任务: {task.task_name}") 77 | print(f"指标: {task.metric}") 78 | print(f"分数: {task.score}") 79 | print("-" * 20) -------------------------------------------------------------------------------- /processors/audio_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import BaseModalityProcessor 4 | 5 | class AudioProcessor(BaseModalityProcessor): 6 | """音频模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, pred_json_file) 9 | 10 | def process_comprehension(self) -> List[TaskResult]: 11 | """处理音频理解类任务 12 | 13 | 需要返回一个TaskResult列表,每个TaskResult包含: 14 | - task_name: 任务名称,例如 "speech_recognition", "audio_classification" 等 15 | - metric: 评估指标,例如 "WER", "accuracy" 等 16 | - score: 评估分数 17 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 18 | 19 | 示例格式: 20 | return [ 21 | TaskResult( 22 | task_name="speech_recognition", 23 | metric="WER", 24 | score=0.15 25 | ), 26 | TaskResult( 27 | task_name="audio_classification", 28 | metric="accuracy", 29 | score=0.92 30 | ) 31 | ] 32 | """ 33 | return [] 34 | 35 | def process_generation(self) -> List[TaskResult]: 36 | """处理音频生成类任务 37 | 38 | 需要返回一个TaskResult列表,每个TaskResult包含: 39 | - task_name: 任务名称,例如 "speech_synthesis", "audio_generation" 等 40 | - metric: 评估指标,例如 "MOS", "FAD" 等 41 | - score: 评估分数 42 | - task_type: 需要指定为 TaskType.GENERATION 43 | 44 | 示例格式: 45 | return [ 46 | TaskResult( 47 | task_name="speech_synthesis", 48 | metric="MOS", 49 | score=4.2, 50 | task_type=TaskType.GENERATION 51 | ), 52 | 
TaskResult( 53 | task_name="audio_generation", 54 | metric="FAD", 55 | score=12.5, 56 | task_type=TaskType.GENERATION 57 | ) 58 | ] 59 | """ 60 | return [] 61 | 62 | # 使用示例 63 | if __name__ == "__main__": 64 | processor = AudioProcessor(ModalityType.AUDIO, "") 65 | 66 | # 测试理解任务 67 | print("\n理解类任务结果:") 68 | for task in processor.process_comprehension(): 69 | print(f"任务: {task.task_name}") 70 | print(f"指标: {task.metric}") 71 | print(f"分数: {task.score}") 72 | print("-" * 20) 73 | 74 | # 测试生成任务 75 | print("\n生成类任务结果:") 76 | for task in processor.process_generation(): 77 | print(f"任务: {task.task_name}") 78 | print(f"指标: {task.metric}") 79 | print(f"分数: {task.score}") 80 | print("-" * 20) -------------------------------------------------------------------------------- /processors/video_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import BaseModalityProcessor 4 | 5 | class VideoProcessor(BaseModalityProcessor): 6 | """视频模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, pred_json_file) 9 | 10 | def process_comprehension(self) -> List[TaskResult]: 11 | """处理视频理解类任务 12 | 13 | 需要返回一个TaskResult列表,每个TaskResult包含: 14 | - task_name: 任务名称,例如 "action_recognition", "video_classification" 等 15 | - metric: 评估指标,例如 "accuracy", "mAP" 等 16 | - score: 评估分数 17 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 18 | 19 | 示例格式: 20 | return [ 21 | TaskResult( 22 | task_name="action_recognition", 23 | metric="accuracy", 24 | score=0.88 25 | ), 26 | TaskResult( 27 | task_name="video_classification", 28 | metric="accuracy", 29 | score=0.92 30 | ) 31 | ] 32 | """ 33 | return [] 34 | 35 | def process_generation(self) -> List[TaskResult]: 36 | """处理视频生成类任务 37 | 38 | 需要返回一个TaskResult列表,每个TaskResult包含: 39 | - task_name: 任务名称,例如 "video_generation", "video_prediction" 等 40 | - metric: 评估指标,例如 "FVD", "PSNR" 等 41 | - score: 评估分数 42 | - task_type: 需要指定为 TaskType.GENERATION 43 | 44 | 示例格式: 45 | return [ 46 | TaskResult( 47 | task_name="video_generation", 48 | metric="FVD", 49 | score=45.2, 50 | task_type=TaskType.GENERATION 51 | ), 52 | TaskResult( 53 | task_name="video_prediction", 54 | metric="PSNR", 55 | score=25.8, 56 | task_type=TaskType.GENERATION 57 | ) 58 | ] 59 | """ 60 | return [] 61 | 62 | # 使用示例 63 | if __name__ == "__main__": 64 | processor = VideoProcessor(ModalityType.VIDEO, "") 65 | 66 | # 测试理解任务 67 | print("\n理解类任务结果:") 68 | for task in processor.process_comprehension(): 69 | print(f"任务: {task.task_name}") 70 | print(f"指标: {task.metric}") 71 | print(f"分数: {task.score}") 72 | print("-" * 20) 73 | 74 | # 测试生成任务 75 | print("\n生成类任务结果:") 76 | for task in processor.process_generation(): 77 | print(f"任务: {task.task_name}") 78 | print(f"指标: {task.metric}") 79 | print(f"分数: {task.score}") 80 | print("-" * 20) -------------------------------------------------------------------------------- /processors/image_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import BaseModalityProcessor 4 | 5 | class ImageProcessor(BaseModalityProcessor): 6 | """图像模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, 
pred_json_file) 9 | 10 | def process_1(self): 11 | return [] 12 | 13 | def process_comprehension(self) -> List[TaskResult]: 14 | """处理图像理解类任务 15 | 16 | 需要返回一个TaskResult列表,每个TaskResult包含: 17 | - task_name: 任务名称,例如 "image_classification", "object_detection" 等 18 | - metric: 评估指标,例如 "accuracy", "mAP" 等 19 | - score: 评估分数 20 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 21 | 22 | 示例格式: 23 | return [ 24 | TaskResult( 25 | task_name="image_classification", 26 | metric="accuracy", 27 | score=0.95 28 | ), 29 | TaskResult( 30 | task_name="object_detection", 31 | metric="mAP", 32 | score=0.82 33 | ) 34 | ] 35 | """ 36 | return [] 37 | 38 | def process_generation(self) -> List[TaskResult]: 39 | """处理图像生成类任务 40 | 41 | 需要返回一个TaskResult列表,每个TaskResult包含: 42 | - task_name: 任务名称,例如 "image_generation", "image_editing" 等 43 | - metric: 评估指标,例如 "FID", "IS" 等 44 | - score: 评估分数 45 | - task_type: 需要指定为 TaskType.GENERATION 46 | 47 | 示例格式: 48 | return [ 49 | TaskResult( 50 | task_name="image_generation", 51 | metric="FID", 52 | score=15.2, 53 | task_type=TaskType.GENERATION 54 | ), 55 | TaskResult( 56 | task_name="image_editing", 57 | metric="PSNR", 58 | score=28.5, 59 | task_type=TaskType.GENERATION 60 | ) 61 | ] 62 | """ 63 | return [] 64 | 65 | # 使用示例 66 | if __name__ == "__main__": 67 | processor = ImageProcessor(ModalityType.IMAGE, "") 68 | 69 | # 测试理解任务 70 | print("\n理解类任务结果:") 71 | for task in processor.process_comprehension(): 72 | print(f"任务: {task.task_name}") 73 | print(f"指标: {task.metric}") 74 | print(f"分数: {task.score}") 75 | print("-" * 20) 76 | 77 | # 测试生成任务 78 | print("\n生成类任务结果:") 79 | for task in processor.process_generation(): 80 | print(f"任务: {task.task_name}") 81 | print(f"指标: {task.metric}") 82 | print(f"分数: {task.score}") 83 | print("-" * 20) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/snu_film.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tqdm 4 | import torch 5 | import argparse 6 | import numpy as np 7 | import os.path as osp 8 | from omegaconf import OmegaConf 9 | 10 | sys.path.append('.') 11 | from utils.build_utils import build_from_cfg 12 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 13 | from utils.utils import InputPadder, read, img2tensor 14 | 15 | 16 | def parse_path(path): 17 | path_list = path.split('/') 18 | new_path = osp.join(*path_list[-3:]) 19 | return new_path 20 | 21 | parser = argparse.ArgumentParser( 22 | prog = 'AMT', 23 | description = 'SNU-FILM evaluation', 24 | ) 25 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 26 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth') 27 | parser.add_argument('-r', '--root', default='data/SNU_FILM') 28 | args = parser.parse_args() 29 | 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | cfg_path = args.config 32 | ckpt_path = args.ckpt 33 | root = args.root 34 | 35 | network_cfg = OmegaConf.load(cfg_path).network 36 | network_name = network_cfg.name 37 | model = build_from_cfg(network_cfg) 38 | ckpt = torch.load(ckpt_path) 39 | model.load_state_dict(ckpt['state_dict']) 40 | model = model.to(device) 41 | model.eval() 42 | 43 | divisor = 20; scale_factor = 0.8 44 | splits = ['easy', 'medium', 'hard', 'extreme'] 45 | for split in splits: 46 | with open(os.path.join(root, f'test-{split}.txt'), "r") as fr: 47 | file_list = [l.strip().split(' ') for l in fr.readlines()] 48 | 
pbar = tqdm.tqdm(file_list, total=len(file_list)) 49 | 50 | psnr_list = []; ssim_list = [] 51 | for name in pbar: 52 | img0 = img2tensor(read(osp.join(root, parse_path(name[0])))).to(device) 53 | imgt = img2tensor(read(osp.join(root, parse_path(name[1])))).to(device) 54 | img1 = img2tensor(read(osp.join(root, parse_path(name[2])))).to(device) 55 | padder = InputPadder(img0.shape, divisor) 56 | img0, img1 = padder.pad(img0, img1) 57 | 58 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 59 | imgt_pred = model(img0, img1, embt, scale_factor=scale_factor, eval=True)['imgt_pred'] 60 | imgt_pred = padder.unpad(imgt_pred) 61 | 62 | psnr = calculate_psnr(imgt_pred, imgt).detach().cpu().numpy() 63 | ssim = calculate_ssim(imgt_pred, imgt).detach().cpu().numpy() 64 | 65 | psnr_list.append(psnr) 66 | ssim_list.append(ssim) 67 | avg_psnr = np.mean(psnr_list) 68 | avg_ssim = np.mean(ssim_list) 69 | desc_str = f'[{network_name}/SNU-FILM] [{split}] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 70 | pbar.set_description_str(desc_str) 71 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/backbone/fpn_p5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import math 3 | import fvcore.nn.weight_init as weight_init 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 8 | 9 | from detectron2.modeling.backbone import Backbone 10 | from detectron2.modeling.backbone.fpn import FPN 11 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 12 | from detectron2.modeling.backbone.resnet import build_resnet_backbone 13 | 14 | 15 | class LastLevelP6P7_P5(nn.Module): 16 | """ 17 | This module is used in RetinaNet to generate extra layers, P6 and P7 from 18 | C5 feature. 19 | """ 20 | 21 | def __init__(self, in_channels, out_channels): 22 | super().__init__() 23 | self.num_levels = 2 24 | self.in_feature = "p5" 25 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 26 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 27 | for module in [self.p6, self.p7]: 28 | weight_init.c2_xavier_fill(module) 29 | 30 | def forward(self, c5): 31 | p6 = self.p6(c5) 32 | p7 = self.p7(F.relu(p6)) 33 | return [p6, p7] 34 | 35 | 36 | @BACKBONE_REGISTRY.register() 37 | def build_p67_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 38 | """ 39 | Args: 40 | cfg: a detectron2 CfgNode 41 | 42 | Returns: 43 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 44 | """ 45 | bottom_up = build_resnet_backbone(cfg, input_shape) 46 | in_features = cfg.MODEL.FPN.IN_FEATURES 47 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 48 | backbone = FPN( 49 | bottom_up=bottom_up, 50 | in_features=in_features, 51 | out_channels=out_channels, 52 | norm=cfg.MODEL.FPN.NORM, 53 | top_block=LastLevelP6P7_P5(out_channels, out_channels), 54 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 55 | ) 56 | return backbone 57 | 58 | @BACKBONE_REGISTRY.register() 59 | def build_p35_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 60 | """ 61 | Args: 62 | cfg: a detectron2 CfgNode 63 | 64 | Returns: 65 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 
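Unlike `build_p67_resnet_fpn_backbone` above, no extra P6/P7 levels are appended (top_block=None), so only the P3-P5 feature maps are exposed.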
66 | """ 67 | bottom_up = build_resnet_backbone(cfg, input_shape) 68 | in_features = cfg.MODEL.FPN.IN_FEATURES 69 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 70 | backbone = FPN( 71 | bottom_up=bottom_up, 72 | in_features=in_features, 73 | out_channels=out_channels, 74 | norm=cfg.MODEL.FPN.NORM, 75 | top_block=None, 76 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 77 | ) 78 | return backbone 79 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/utils_core/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy import interpolate 5 | 6 | 7 | class InputPadder: 8 | """ Pads images such that dimensions are divisible by 8 """ 9 | def __init__(self, dims, mode='sintel'): 10 | self.ht, self.wd = dims[-2:] 11 | pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 12 | pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 13 | if mode == 'sintel': 14 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] 15 | else: 16 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] 17 | 18 | def pad(self, *inputs): 19 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 20 | 21 | def unpad(self,x): 22 | ht, wd = x.shape[-2:] 23 | c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] 24 | return x[..., c[0]:c[1], c[2]:c[3]] 25 | 26 | def forward_interpolate(flow): 27 | flow = flow.detach().cpu().numpy() 28 | dx, dy = flow[0], flow[1] 29 | 30 | ht, wd = dx.shape 31 | x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) 32 | 33 | x1 = x0 + dx 34 | y1 = y0 + dy 35 | 36 | x1 = x1.reshape(-1) 37 | y1 = y1.reshape(-1) 38 | dx = dx.reshape(-1) 39 | dy = dy.reshape(-1) 40 | 41 | valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) 42 | x1 = x1[valid] 43 | y1 = y1[valid] 44 | dx = dx[valid] 45 | dy = dy[valid] 46 | 47 | flow_x = interpolate.griddata( 48 | (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) 49 | 50 | flow_y = interpolate.griddata( 51 | (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) 52 | 53 | flow = np.stack([flow_x, flow_y], axis=0) 54 | return torch.from_numpy(flow).float() 55 | 56 | 57 | def bilinear_sampler(img, coords, mode='bilinear', mask=False): 58 | """ Wrapper for grid_sample, uses pixel coordinates """ 59 | H, W = img.shape[-2:] 60 | xgrid, ygrid = coords.split([1,1], dim=-1) 61 | xgrid = 2*xgrid/(W-1) - 1 62 | ygrid = 2*ygrid/(H-1) - 1 63 | 64 | grid = torch.cat([xgrid, ygrid], dim=-1) 65 | img = F.grid_sample(img, grid, align_corners=True) 66 | 67 | if mask: 68 | mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) 69 | return img, mask.float() 70 | 71 | return img 72 | 73 | 74 | def coords_grid(batch, ht, wd, device): 75 | coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device)) 76 | coords = torch.stack(coords[::-1], dim=0).float() 77 | return coords[None].repeat(batch, 1, 1, 1) 78 | 79 | 80 | def upflow8(flow, mode='bilinear'): 81 | new_size = (8 * flow.shape[2], 8 * flow.shape[3]) 82 | return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) 83 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/meta_arch/grit.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | import torch 3 | 
from detectron2.config import configurable 4 | from detectron2.structures import ImageList, Instances, Boxes 5 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 6 | from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN 7 | 8 | 9 | @META_ARCH_REGISTRY.register() 10 | class GRiT(GeneralizedRCNN): 11 | @configurable 12 | def __init__( 13 | self, 14 | **kwargs): 15 | super().__init__(**kwargs) 16 | assert self.proposal_generator is not None 17 | 18 | @classmethod 19 | def from_config(cls, cfg): 20 | ret = super().from_config(cfg) 21 | return ret 22 | 23 | def inference( 24 | self, 25 | batched_inputs: Tuple[Dict[str, torch.Tensor]], 26 | detected_instances: Optional[List[Instances]] = None, 27 | do_postprocess: bool = True, 28 | ): 29 | assert not self.training 30 | assert detected_instances is None 31 | 32 | images = self.preprocess_image(batched_inputs) 33 | features = self.backbone(images.tensor) 34 | proposals, _ = self.proposal_generator(images, features, None) 35 | results, _ = self.roi_heads(features, proposals) 36 | results_det, _ = self.roi_heads.forward_object(features, proposals) 37 | # results_det.get 38 | for idx in range(len(results)): 39 | obj_type = results_det[idx].get("pred_object_descriptions") 40 | results[idx].set('det_obj',obj_type) 41 | if do_postprocess: 42 | assert not torch.jit.is_scripting(), \ 43 | "Scripting is not supported for postprocess." 44 | return GRiT._postprocess( 45 | results, batched_inputs, images.image_sizes) 46 | else: 47 | return results 48 | 49 | def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): 50 | if not self.training: 51 | return self.inference(batched_inputs) 52 | 53 | images = self.preprocess_image(batched_inputs) 54 | 55 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 56 | 57 | targets_task = batched_inputs[0]['task'] 58 | for anno_per_image in batched_inputs: 59 | assert targets_task == anno_per_image['task'] 60 | 61 | features = self.backbone(images.tensor) 62 | proposals, proposal_losses = self.proposal_generator( 63 | images, features, gt_instances) 64 | proposals, roihead_textdecoder_losses = self.roi_heads( 65 | features, proposals, gt_instances, targets_task=targets_task) 66 | 67 | losses = {} 68 | losses.update(roihead_textdecoder_losses) 69 | losses.update(proposal_losses) 70 | 71 | return losses 72 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/models/extract_clip/extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import clip.clip as clip\n", 10 | "import os\n", 11 | "import torch\n", 12 | "from collections import OrderedDict" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 10, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "path = 'your_model_path/clip_visual_encoder'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "model, _ = clip.load(\"ViT-B/16\", device='cpu')\n", 31 | "new_state_dict = OrderedDict()\n", 32 | "for k, v in model.state_dict().items():\n", 33 | " if 'visual.' 
in k:\n", 34 | " new_state_dict[k[7:]] = v\n", 35 | "torch.save(new_state_dict, os.path.join(path, 'vit_b16.pth'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 15, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model, _ = clip.load(\"ViT-L/14\", device='cpu')\n", 45 | "new_state_dict = OrderedDict()\n", 46 | "for k, v in model.state_dict().items():\n", 47 | " if 'visual.' in k:\n", 48 | " new_state_dict[k[7:]] = v\n", 49 | "torch.save(new_state_dict, os.path.join(path, 'vit_l14.pth'))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "model, _ = clip.load(\"ViT-L/14@336px\", device='cpu')\n", 59 | "new_state_dict = OrderedDict()\n", 60 | "for k, v in model.state_dict().items():\n", 61 | " if 'visual.' in k:\n", 62 | " new_state_dict[k[7:]] = v\n", 63 | "torch.save(new_state_dict, os.path.join(path, 'vit_l14_336.pth'))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3.7.13 ('torch1.9')", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.7.13" 91 | }, 92 | "orig_nbformat": 4, 93 | "vscode": { 94 | "interpreter": { 95 | "hash": "c30e0be9d1dabfc31a056b9daab5ce1d15284c0e9e5af7f56f8931344ec84c24" 96 | } 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/subject_mapping_table.yaml: -------------------------------------------------------------------------------- 1 | 0.0: 0.0 2 | 0.01: 0.655812085783768 3 | 0.02: 0.706856949045235 4 | 0.03: 0.731659342416906 5 | 0.04: 0.73660992057736 6 | 0.05: 0.749101188592094 7 | 0.06: 0.761032814753647 8 | 0.07: 0.774597183768173 9 | 0.08: 0.784555729997569 10 | 0.09: 0.792953568694271 11 | 0.1: 0.802689699298385 12 | 0.11: 0.808076071440993 13 | 0.12: 0.816204790771909 14 | 0.13: 0.824219815909538 15 | 0.14: 0.830472157111834 16 | 0.15: 0.835419531889346 17 | 0.16: 0.83907681617532 18 | 0.17: 0.841978081155746 19 | 0.18: 0.84679192068861 20 | 0.19: 0.850625540675788 21 | 0.2: 0.852853044011848 22 | 0.21: 0.854691139482507 23 | 0.22: 0.858132224563246 24 | 0.23: 0.863729405870906 25 | 0.24: 0.866102417035313 26 | 0.25: 0.870585293424396 27 | 0.26: 0.872331870277398 28 | 0.27: 0.874960548804337 29 | 0.28: 0.878698116066965 30 | 0.29: 0.88170792606262 31 | 0.3: 0.885683841036798 32 | 0.31: 0.887194775904732 33 | 0.32: 0.890181215752347 34 | 0.33: 0.8940085858716 35 | 0.34: 0.896727529739295 36 | 0.35: 0.899204109394038 37 | 0.36: 0.901872688917701 38 | 0.37: 0.902930005754908 39 | 0.38: 0.904255123199727 40 | 0.39: 0.906709500890894 41 | 0.4: 0.909197403281584 42 | 0.41: 0.911998758637682 43 | 0.42: 0.914120648767612 44 | 0.43: 0.917820970919085 45 | 0.44: 0.920037992613574 46 | 0.45: 0.922367310037017 47 | 0.46: 0.923878218312373 48 | 0.47: 0.92612833568708 49 | 0.48: 0.928554265517505 50 | 0.49: 0.931094522914667 51 | 0.5: 0.932674917380015 52 | 0.51: 0.933938855974875 53 | 0.52: 0.935219359871336 54 | 0.53: 
0.93807406531488 55 | 0.54: 0.939675705126034 56 | 0.55: 0.941552521922844 57 | 0.56: 0.944195698642471 58 | 0.57: 0.946289318094669 59 | 0.58: 0.947781123820032 60 | 0.59: 0.949137334918494 61 | 0.6: 0.951897174598649 62 | 0.61: 0.953055388977942 63 | 0.62: 0.954985032256127 64 | 0.63: 0.956199606401013 65 | 0.64: 0.957250230848176 66 | 0.65: 0.958689000129844 67 | 0.66: 0.960455895301363 68 | 0.67: 0.961342514244196 69 | 0.68: 0.962936044827203 70 | 0.69: 0.964827439510959 71 | 0.7: 0.966785529778715 72 | 0.71: 0.968174134640714 73 | 0.72: 0.969813944137392 74 | 0.73: 0.971409261937727 75 | 0.74: 0.972530004578652 76 | 0.75: 0.973668488824432 77 | 0.76: 0.974642341870362 78 | 0.77: 0.976008729176383 79 | 0.78: 0.977155875644753 80 | 0.79: 0.978418810979857 81 | 0.8: 0.979501010595634 82 | 0.81: 0.980594016861641 83 | 0.82: 0.981990506802626 84 | 0.83: 0.983434155927019 85 | 0.84: 0.98433502683478 86 | 0.85: 0.985466305825542 87 | 0.86: 0.986316598986252 88 | 0.87: 0.987193187882002 89 | 0.88: 0.98770020514925 90 | 0.89: 0.988262855586541 91 | 0.9: 0.988710454351168 92 | 0.91: 0.989251092021853 93 | 0.92: 0.989782759199991 94 | 0.93: 0.990371501103215 95 | 0.94: 0.991172390892083 96 | 0.95: 0.992180427851925 97 | 0.96: 0.992921150016265 98 | 0.97: 0.99326859591264 99 | 0.98: 0.994591460602974 100 | 0.99: 0.995516073547993 101 | 1.0: 1.0 -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/background_mapping_table.yaml: -------------------------------------------------------------------------------- 1 | 0.0: 0.0 2 | 0.01: 0.873691544930448 3 | 0.02: 0.88392356992722 4 | 0.03: 0.888340769126807 5 | 0.04: 0.894395017892299 6 | 0.05: 0.899626435216563 7 | 0.06: 0.903145754159405 8 | 0.07: 0.905965662216789 9 | 0.08: 0.907634139293668 10 | 0.09: 0.909681980518171 11 | 0.1: 0.912059260929028 12 | 0.11: 0.914872300522044 13 | 0.12: 0.916864571230313 14 | 0.13: 0.91899572410357 15 | 0.14: 0.920360080000968 16 | 0.15: 0.921301105005809 17 | 0.16: 0.922499725160567 18 | 0.17: 0.923335310160083 19 | 0.18: 0.924364064416312 20 | 0.19: 0.925033502674768 21 | 0.2: 0.926479836367157 22 | 0.21: 0.927276633706106 23 | 0.22: 0.927840039415505 24 | 0.23: 0.928488115842048 25 | 0.24: 0.929855989179899 26 | 0.25: 0.93043699722034 27 | 0.26: 0.930961847243739 28 | 0.27: 0.931837518457107 29 | 0.28: 0.932535174404531 30 | 0.29: 0.933476108636716 31 | 0.3: 0.934152037140137 32 | 0.31: 0.934940306892267 33 | 0.32: 0.935567840962271 34 | 0.33: 0.936222006721211 35 | 0.34: 0.936694266597276 36 | 0.35: 0.937215165488639 37 | 0.36: 0.937728512599245 38 | 0.37: 0.938159241463336 39 | 0.38: 0.938786767968952 40 | 0.39: 0.939348915468468 41 | 0.4: 0.939684244791667 42 | 0.41: 0.940032821879841 43 | 0.42: 0.940740896511102 44 | 0.43: 0.941350394558482 45 | 0.44: 0.941967580545604 46 | 0.45: 0.942834956146721 47 | 0.46: 0.943218163003486 48 | 0.47: 0.944092961790763 49 | 0.48: 0.944922112017493 50 | 0.49: 0.945415133617351 51 | 0.5: 0.946057962880035 52 | 0.51: 0.946612672064614 53 | 0.52: 0.947050138277014 54 | 0.53: 0.947583230961948 55 | 0.54: 0.948510612332171 56 | 0.55: 0.949047688928156 57 | 0.56: 0.94972291646495 58 | 0.57: 0.950246513321392 59 | 0.58: 0.950660608096114 60 | 0.59: 0.951255542174994 61 | 0.6: 0.951911455307578 62 | 0.61: 0.952366960064065 63 | 0.62: 0.952950734149077 64 | 0.63: 0.953568790040828 65 | 0.64: 0.954187246845146 66 | 0.65: 0.954717288560225 67 | 0.66: 0.955338935014846 68 | 0.67: 0.95590276685144 69 | 0.68: 
0.956451298452427 70 | 0.69: 0.957104193394171 71 | 0.7: 0.957455075099245 72 | 0.71: 0.957910428567971 73 | 0.72: 0.958549581538052 74 | 0.73: 0.959168784695327 75 | 0.74: 0.959610176825136 76 | 0.75: 0.960120447751259 77 | 0.76: 0.960917058501969 78 | 0.77: 0.961979166666667 79 | 0.78: 0.962551626948586 80 | 0.79: 0.963566142505003 81 | 0.8: 0.964157551579041 82 | 0.81: 0.964602080408437 83 | 0.82: 0.964906362961529 84 | 0.83: 0.965452531951975 85 | 0.84: 0.966266180226084 86 | 0.85: 0.967015800998096 87 | 0.86: 0.968036075575297 88 | 0.87: 0.969119242996385 89 | 0.88: 0.969973438912019 90 | 0.89: 0.970532389196844 91 | 0.9: 0.971717108527789 92 | 0.91: 0.972427724793442 93 | 0.92: 0.973225634097437 94 | 0.93: 0.974180063197941 95 | 0.94: 0.975258326374096 96 | 0.95: 0.976684089973857 97 | 0.96: 0.978594319850568 98 | 0.97: 0.980095581086206 99 | 0.98: 0.981866938883779 100 | 0.99: 0.985895411744772 101 | 1.0: 1.0 102 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/gen_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import argparse 5 | import numpy as np 6 | import os.path as osp 7 | import torch.nn.functional as F 8 | 9 | sys.path.append('.') 10 | from utils.utils import read, write 11 | from flow_generation.liteflownet.run import estimate 12 | 13 | parser = argparse.ArgumentParser( 14 | prog = 'AMT', 15 | description = 'Flow generation', 16 | ) 17 | parser.add_argument('-r', '--root', default='data/vimeo_triplet') 18 | args = parser.parse_args() 19 | 20 | vimeo90k_dir = args.root 21 | vimeo90k_sequences_dir = osp.join(vimeo90k_dir, 'sequences') 22 | vimeo90k_flow_dir = osp.join(vimeo90k_dir, 'flow') 23 | 24 | def pred_flow(img1, img2): 25 | img1 = torch.from_numpy(img1).float().permute(2, 0, 1) / 255.0 26 | img2 = torch.from_numpy(img2).float().permute(2, 0, 1) / 255.0 27 | 28 | flow = estimate(img1, img2) 29 | 30 | flow = flow.permute(1, 2, 0).cpu().numpy() 31 | return flow 32 | 33 | print('Built Flow Path') 34 | if not osp.exists(vimeo90k_flow_dir): 35 | os.makedirs(vimeo90k_flow_dir) 36 | 37 | for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)): 38 | vimeo90k_sequences_path_dir = osp.join(vimeo90k_sequences_dir, sequences_path) 39 | vimeo90k_flow_path_dir = osp.join(vimeo90k_flow_dir, sequences_path) 40 | if not osp.exists(vimeo90k_flow_path_dir): 41 | os.mkdir(vimeo90k_flow_path_dir) 42 | 43 | for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)): 44 | vimeo90k_flow_id_dir = osp.join(vimeo90k_flow_path_dir, sequences_id) 45 | if not osp.exists(vimeo90k_flow_id_dir): 46 | os.mkdir(vimeo90k_flow_id_dir) 47 | 48 | for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)): 49 | vimeo90k_sequences_path_dir = os.path.join(vimeo90k_sequences_dir, sequences_path) 50 | vimeo90k_flow_path_dir = os.path.join(vimeo90k_flow_dir, sequences_path) 51 | 52 | for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)): 53 | vimeo90k_sequences_id_dir = os.path.join(vimeo90k_sequences_path_dir, sequences_id) 54 | vimeo90k_flow_id_dir = os.path.join(vimeo90k_flow_path_dir, sequences_id) 55 | 56 | img0_path = vimeo90k_sequences_id_dir + '/im1.png' 57 | imgt_path = vimeo90k_sequences_id_dir + '/im2.png' 58 | img1_path = vimeo90k_sequences_id_dir + '/im3.png' 59 | flow_t0_path = vimeo90k_flow_id_dir + '/flow_t0.flo' 60 | flow_t1_path = vimeo90k_flow_id_dir + 
'/flow_t1.flo' 61 | 62 | img0 = read(img0_path) 63 | imgt = read(imgt_path) 64 | img1 = read(img1_path) 65 | 66 | flow_t0 = pred_flow(imgt, img0) 67 | flow_t1 = pred_flow(imgt, img1) 68 | 69 | write(flow_t0_path, flow_t0) 70 | write(flow_t1_path, flow_t1) 71 | 72 | print('Written Sequences {}'.format(sequences_path)) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/meta_arch/centernet_detector.py: -------------------------------------------------------------------------------- 1 | import math 2 | import json 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | 7 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 8 | from detectron2.modeling import build_backbone, build_proposal_generator 9 | from detectron2.modeling import detector_postprocess 10 | from detectron2.structures import ImageList 11 | 12 | @META_ARCH_REGISTRY.register() 13 | class CenterNetDetector(nn.Module): 14 | def __init__(self, cfg): 15 | super().__init__() 16 | self.mean, self.std = cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD 17 | self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) 18 | self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) 19 | 20 | self.backbone = build_backbone(cfg) 21 | self.proposal_generator = build_proposal_generator( 22 | cfg, self.backbone.output_shape()) # TODO: change to a more precise name 23 | 24 | 25 | def forward(self, batched_inputs): 26 | if not self.training: 27 | return self.inference(batched_inputs) 28 | images = self.preprocess_image(batched_inputs) 29 | features = self.backbone(images.tensor) 30 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 31 | 32 | _, proposal_losses = self.proposal_generator( 33 | images, features, gt_instances) 34 | return proposal_losses 35 | 36 | 37 | @property 38 | def device(self): 39 | return self.pixel_mean.device 40 | 41 | 42 | @torch.no_grad() 43 | def inference(self, batched_inputs, do_postprocess=True): 44 | images = self.preprocess_image(batched_inputs) 45 | inp = images.tensor 46 | features = self.backbone(inp) 47 | proposals, _ = self.proposal_generator(images, features, None) 48 | 49 | processed_results = [] 50 | for results_per_image, input_per_image, image_size in zip( 51 | proposals, batched_inputs, images.image_sizes): 52 | if do_postprocess: 53 | height = input_per_image.get("height", image_size[0]) 54 | width = input_per_image.get("width", image_size[1]) 55 | r = detector_postprocess(results_per_image, height, width) 56 | processed_results.append({"instances": r}) 57 | else: 58 | r = results_per_image 59 | processed_results.append(r) 60 | return processed_results 61 | 62 | def preprocess_image(self, batched_inputs): 63 | """ 64 | Normalize, pad and batch the input images. 
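Each image is shifted and scaled by the configured pixel mean/std, then the batch is padded to the backbone's size divisibility via ImageList.from_tensors.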
65 | """ 66 | images = [x["image"].to(self.device) for x in batched_inputs] 67 | images = [(x - self.pixel_mean) / self.pixel_std for x in images] 68 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 69 | return images 70 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/imaging_quality.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | from torchvision import transforms 4 | from pyiqa.archs.musiq_arch import MUSIQ 5 | from toolkit.utils import load_video, load_dimension_info 6 | 7 | from .distributed import ( 8 | get_world_size, 9 | get_rank, 10 | all_gather, 11 | barrier, 12 | distribute_list_to_rank, 13 | gather_list_of_dict, 14 | ) 15 | 16 | 17 | def transform(images, preprocess_mode='shorter'): 18 | if preprocess_mode.startswith('shorter'): 19 | _, _, h, w = images.size() 20 | if min(h,w) > 512: 21 | scale = 512./min(h,w) 22 | images = transforms.Resize(size=( int(scale * h), int(scale * w) ), antialias=False)(images) 23 | if preprocess_mode == 'shorter_centercrop': 24 | images = transforms.CenterCrop(512)(images) 25 | 26 | elif preprocess_mode == 'longer': 27 | _, _, h, w = images.size() 28 | if max(h,w) > 512: 29 | scale = 512./max(h,w) 30 | images = transforms.Resize(size=( int(scale * h), int(scale * w) ), antialias=False)(images) 31 | 32 | elif preprocess_mode == 'None': 33 | return images / 255. 34 | 35 | else: 36 | raise ValueError("Please recheck imaging_quality_mode") 37 | return images / 255. 38 | 39 | def technical_quality(model, video_list, device, **kwargs): 40 | if 'imaging_quality_preprocessing_mode' not in kwargs: 41 | preprocess_mode = 'longer' 42 | else: 43 | preprocess_mode = kwargs['imaging_quality_preprocessing_mode'] 44 | video_results = [] 45 | for video_path in tqdm(video_list, disable=get_rank() > 0): 46 | images = load_video(video_path) 47 | images = transform(images, preprocess_mode) 48 | acc_score_video = 0. 49 | for i in range(len(images)): 50 | frame = images[i].unsqueeze(0).to(device) 51 | score = model(frame) 52 | acc_score_video += float(score) 53 | video_results.append({'video_path': video_path, 'video_results': acc_score_video/len(images)}) 54 | average_score = sum([o['video_results'] for o in video_results]) / len(video_results) 55 | average_score = average_score / 100. 56 | return average_score, video_results 57 | 58 | 59 | def compute_imaging_quality(json_dir, device, submodules_list, **kwargs): 60 | model_path = submodules_list['model_path'] 61 | 62 | model = MUSIQ(pretrained_model_path=model_path) 63 | model.to(device) 64 | model.training = False 65 | 66 | video_list, _ = load_dimension_info(json_dir, dimension='imaging_quality', lang='en') 67 | video_list = distribute_list_to_rank(video_list) 68 | all_results, video_results = technical_quality(model, video_list, device, **kwargs) 69 | if get_world_size() > 1: 70 | video_results = gather_list_of_dict(video_results) 71 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 72 | all_results = all_results / 100. 
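# Entries in video_results keep MUSIQ's native 0-100 scale; the gathered average above is rescaled to [0, 1] so both branches return comparable values.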
73 | return all_results, video_results 74 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/README.md: -------------------------------------------------------------------------------- 1 | # RAFT 2 | This repository contains the source code for our paper: 3 | 4 | [RAFT: Recurrent All Pairs Field Transforms for Optical Flow](https://arxiv.org/pdf/2003.12039.pdf)
5 | ECCV 2020
6 | Zachary Teed and Jia Deng
7 | 8 | 9 | 10 | ## Requirements 11 | The code has been tested with PyTorch 1.6 and Cuda 10.1. 12 | ```Shell 13 | conda create --name raft 14 | conda activate raft 15 | conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 matplotlib tensorboard scipy opencv -c pytorch 16 | ``` 17 | 18 | ## Demos 19 | Pretrained models can be downloaded by running 20 | ```Shell 21 | ./download_models.sh 22 | ``` 23 | or downloaded from [google drive](https://drive.google.com/drive/folders/1sWDsfuZ3Up38EUQt7-JDTT1HcGHuJgvT?usp=sharing) 24 | 25 | You can demo a trained model on a sequence of frames 26 | ```Shell 27 | python demo.py --model=models/raft-things.pth --path=demo-frames 28 | ``` 29 | 30 | ## Required Data 31 | To evaluate/train RAFT, you will need to download the required datasets. 32 | * [FlyingChairs](https://lmb.informatik.uni-freiburg.de/resources/datasets/FlyingChairs.en.html#flyingchairs) 33 | * [FlyingThings3D](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html) 34 | * [Sintel](http://sintel.is.tue.mpg.de/) 35 | * [KITTI](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=flow) 36 | * [HD1K](http://hci-benchmark.iwr.uni-heidelberg.de/) (optional) 37 | 38 | 39 | By default `datasets.py` will search for the datasets in these locations. You can create symbolic links to wherever the datasets were downloaded in the `datasets` folder 40 | 41 | ```Shell 42 | ├── datasets 43 | ├── Sintel 44 | ├── test 45 | ├── training 46 | ├── KITTI 47 | ├── testing 48 | ├── training 49 | ├── devkit 50 | ├── FlyingChairs_release 51 | ├── data 52 | ├── FlyingThings3D 53 | ├── frames_cleanpass 54 | ├── frames_finalpass 55 | ├── optical_flow 56 | ``` 57 | 58 | ## Evaluation 59 | You can evaluate a trained model using `evaluate.py` 60 | ```Shell 61 | python evaluate.py --model=models/raft-things.pth --dataset=sintel --mixed_precision 62 | ``` 63 | 64 | ## Training 65 | We used the following training schedule in our paper (2 GPUs). Training logs will be written to the `runs` which can be visualized using tensorboard 66 | ```Shell 67 | ./train_standard.sh 68 | ``` 69 | 70 | If you have a RTX GPU, training can be accelerated using mixed precision. You can expect similiar results in this setting (1 GPU) 71 | ```Shell 72 | ./train_mixed.sh 73 | ``` 74 | 75 | ## (Optional) Efficent Implementation 76 | You can optionally use our alternate (efficent) implementation by compiling the provided cuda extension 77 | ```Shell 78 | cd alt_cuda_corr && python setup.py install && cd .. 79 | ``` 80 | and running `demo.py` and `evaluate.py` with the `--alternate_corr` flag Note, this implementation is somewhat slower than all-pairs, but uses significantly less GPU memory during the forward pass. 81 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/layers/heatmap_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | # TODO: merge these two function 5 | def heatmap_focal_loss( 6 | inputs, 7 | targets, 8 | pos_inds, 9 | labels, 10 | alpha: float = -1, 11 | beta: float = 4, 12 | gamma: float = 2, 13 | reduction: str = 'sum', 14 | sigmoid_clamp: float = 1e-4, 15 | ignore_high_fp: float = -1., 16 | ): 17 | """ 18 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 
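Concretely, this is the penalty-reduced pixel-wise focal loss used for CornerNet/CenterNet heatmaps: positives contribute -log(p) * (1 - p)**gamma and negatives contribute -log(1 - p) * p**gamma * (1 - y)**beta, where y is the target heatmap and p the clamped sigmoid prediction.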
19 | Args: 20 | inputs: (sum_l N*Hl*Wl, C) 21 | targets: (sum_l N*Hl*Wl, C) 22 | pos_inds: N 23 | labels: N 24 | Returns: 25 | Loss tensor with the reduction option applied. 26 | """ 27 | pred = torch.clamp(inputs.sigmoid_(), min=sigmoid_clamp, max=1-sigmoid_clamp) 28 | neg_weights = torch.pow(1 - targets, beta) 29 | pos_pred_pix = pred[pos_inds] # N x C 30 | pos_pred = pos_pred_pix.gather(1, labels.unsqueeze(1)) 31 | pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) 32 | neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights 33 | 34 | if ignore_high_fp > 0: 35 | not_high_fp = (pred < ignore_high_fp).float() 36 | neg_loss = not_high_fp * neg_loss 37 | 38 | if reduction == "sum": 39 | pos_loss = pos_loss.sum() 40 | neg_loss = neg_loss.sum() 41 | 42 | if alpha >= 0: 43 | pos_loss = alpha * pos_loss 44 | neg_loss = (1 - alpha) * neg_loss 45 | 46 | return - pos_loss, - neg_loss 47 | 48 | heatmap_focal_loss_jit = torch.jit.script(heatmap_focal_loss) 49 | # heatmap_focal_loss_jit = heatmap_focal_loss 50 | 51 | def binary_heatmap_focal_loss( 52 | inputs, 53 | targets, 54 | pos_inds, 55 | alpha: float = -1, 56 | beta: float = 4, 57 | gamma: float = 2, 58 | sigmoid_clamp: float = 1e-4, 59 | ignore_high_fp: float = -1., 60 | ): 61 | """ 62 | Args: 63 | inputs: (sum_l N*Hl*Wl,) 64 | targets: (sum_l N*Hl*Wl,) 65 | pos_inds: N 66 | Returns: 67 | Loss tensor with the reduction option applied. 68 | """ 69 | pred = torch.clamp(inputs.sigmoid_(), min=sigmoid_clamp, max=1-sigmoid_clamp) 70 | neg_weights = torch.pow(1 - targets, beta) 71 | for i, ind in enumerate(pos_inds): 72 | if ind >= pred.shape[0]: 73 | print('%'*100) 74 | print(pred.shape, ind, pos_inds) 75 | pos_inds[i] = pred.shape[0] - 1 76 | pos_pred = pred[pos_inds] # N 77 | pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) 78 | neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights 79 | if ignore_high_fp > 0: 80 | not_high_fp = (pred < ignore_high_fp).float() 81 | neg_loss = not_high_fp * neg_loss 82 | 83 | pos_loss = - pos_loss.sum() 84 | neg_loss = - neg_loss.sum() 85 | 86 | if alpha >= 0: 87 | pos_loss = alpha * pos_loss 88 | neg_loss = (1 - alpha) * neg_loss 89 | 90 | return pos_loss, neg_loss 91 | 92 | # binary_heatmap_focal_loss_jit = torch.jit.script(binary_heatmap_focal_loss) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/fvd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from torchvision import transforms 4 | import av 5 | import numpy as np 6 | from tqdm import tqdm 7 | from numpy import cov 8 | from numpy import mean 9 | 10 | 11 | class I3DFeatureExtractor(torch.nn.Module): 12 | def __init__(self): 13 | super(I3DFeatureExtractor, self).__init__() 14 | self.model = torchvision.models.video.r3d_18(pretrained=True) 15 | self.model.fc = torch.nn.Identity() 16 | 17 | def forward(self, x): 18 | return self.model(x) 19 | 20 | def extract_features(video_path, model, device, transform): 21 | try: 22 | container = av.open(video_path) 23 | frames = [] 24 | for frame in container.decode(video=0): 25 | img = frame.to_rgb().to_ndarray() 26 | img = transform(img) 27 | frames.append(img) 28 | if len(frames) == 16: 29 | break 30 | if len(frames) < 16: 31 | while len(frames) < 16: 32 | frames.append(frames[-1]) 33 | video_tensor = torch.stack(frames).permute(1, 0, 2, 3).unsqueeze(0).to(device) 34 | with torch.no_grad(): 35 | features = 
model(video_tensor) 36 | return features.cpu().numpy().flatten() 37 | except Exception as e: 38 | print(f"Error processing {video_path}: {e}") 39 | return None 40 | 41 | def get_dataset_features(video_dir, model, device): 42 | transform = transforms.Compose([ 43 | transforms.ToPILImage(), 44 | transforms.Resize((224, 224)), 45 | transforms.ToTensor(), 46 | transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], 47 | std=[0.22803, 0.22145, 0.216989]), 48 | ]) 49 | features = [] 50 | for video_file in tqdm(os.listdir(video_dir)): 51 | video_path = os.path.join(video_dir, video_file) 52 | feature = extract_features(video_path, model, model.device, transform) 53 | if feature is not None: 54 | features.append(feature) 55 | return np.array(features) 56 | 57 | import os 58 | 59 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 60 | model = I3DFeatureExtractor().to(device) 61 | model.eval() 62 | 63 | real_video_dir = './FVD/real_videos/architecture' 64 | real_features = get_dataset_features(real_video_dir, model, device) 65 | 66 | generated_video_dir = './sampled_videos/cogvideox-5b/architecture' 67 | generated_features = get_dataset_features(generated_video_dir, model, device) 68 | 69 | mu_real = mean(real_features, axis=0) 70 | mu_generated = mean(generated_features, axis=0) 71 | 72 | sigma_real = cov(real_features, rowvar=False) 73 | sigma_generated = cov(generated_features, rowvar=False) 74 | 75 | from scipy.linalg import sqrtm 76 | 77 | def calculate_fvd(mu1, sigma1, mu2, sigma2): 78 | diff = mu1 - mu2 79 | covmean, _ = sqrtm(sigma1.dot(sigma2), disp=False) 80 | if np.iscomplexobj(covmean): 81 | covmean = covmean.real 82 | fvd = diff.dot(diff) + np.trace(sigma1 + sigma2 - 2 * covmean) 83 | return fvd 84 | 85 | fvd_value = calculate_fvd(mu_real, sigma_real, mu_generated, sigma_generated) 86 | print(f"FVD: {fvd_value}") -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/liteflownet/README.md: -------------------------------------------------------------------------------- 1 | # pytorch-liteflownet 2 | This is a personal reimplementation of LiteFlowNet [1] using PyTorch. Should you be making use of this work, please cite the paper accordingly. Also, make sure to adhere to the licensing terms of the authors. Should you be making use of this particular implementation, please acknowledge it appropriately [2]. 3 | 4 | Paper 5 | 6 | For the original Caffe version of this work, please see: https://github.com/twhui/LiteFlowNet 7 |
8 | Other optical flow implementations from me: [pytorch-pwc](https://github.com/sniklaus/pytorch-pwc), [pytorch-unflow](https://github.com/sniklaus/pytorch-unflow), [pytorch-spynet](https://github.com/sniklaus/pytorch-spynet) 9 | 10 | ## setup 11 | The correlation layer is implemented in CUDA using CuPy, which is why CuPy is a required dependency. It can be installed using `pip install cupy` or alternatively using one of the provided [binary packages](https://docs.cupy.dev/en/stable/install.html#installing-cupy) as outlined in the CuPy repository. If you would like to use Docker, you can take a look at [this](https://github.com/sniklaus/pytorch-liteflownet/pull/43) pull request to get started. 12 | 13 | ## usage 14 | To run it on your own pair of images, use the following command. You can choose between three models, please make sure to see their paper / the code for more details. 15 | 16 | ``` 17 | python run.py --model default --one ./images/one.png --two ./images/two.png --out ./out.flo 18 | ``` 19 | 20 | I am afraid that I cannot guarantee that this reimplementation is correct. However, it produced results pretty much identical to the implementation of the original authors in the examples that I tried. There are some numerical deviations that stem from differences in the `DownsampleLayer` of Caffe and the `torch.nn.functional.interpolate` function of PyTorch. Please feel free to contribute to this repository by submitting issues and pull requests. 21 | 22 | ## comparison 23 |

*(Comparison image)*

24 | 25 | ## license 26 | As stated in the licensing terms of the authors of the paper, their material is provided for research purposes only. Please make sure to further consult their licensing terms. 27 | 28 | ## references 29 | ``` 30 | [1] @inproceedings{Hui_CVPR_2018, 31 | author = {Tak-Wai Hui and Xiaoou Tang and Chen Change Loy}, 32 | title = {{LiteFlowNet}: A Lightweight Convolutional Neural Network for Optical Flow Estimation}, 33 | booktitle = {IEEE Conference on Computer Vision and Pattern Recognition}, 34 | year = {2018} 35 | } 36 | ``` 37 | 38 | ``` 39 | [2] @misc{pytorch-liteflownet, 40 | author = {Simon Niklaus}, 41 | title = {A Reimplementation of {LiteFlowNet} Using {PyTorch}}, 42 | year = {2019}, 43 | howpublished = {\url{https://github.com/sniklaus/pytorch-liteflownet}} 44 | } 45 | ``` -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/functional.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import cv2 3 | import numpy as np 4 | import PIL 5 | import torch 6 | 7 | 8 | def _is_tensor_clip(clip): 9 | return torch.is_tensor(clip) and clip.ndimension() == 4 10 | 11 | 12 | def crop_clip(clip, min_h, min_w, h, w): 13 | if isinstance(clip[0], np.ndarray): 14 | cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip] 15 | 16 | elif isinstance(clip[0], PIL.Image.Image): 17 | cropped = [ 18 | img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip 19 | ] 20 | else: 21 | raise TypeError('Expected numpy.ndarray or PIL.Image' + 22 | 'but got list of {0}'.format(type(clip[0]))) 23 | return cropped 24 | 25 | 26 | def resize_clip(clip, size, interpolation='bilinear'): 27 | if isinstance(clip[0], np.ndarray): 28 | if isinstance(size, numbers.Number): 29 | im_h, im_w, im_c = clip[0].shape 30 | # Min spatial dim already matches minimal size 31 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 32 | and im_h == size): 33 | return clip 34 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 35 | size = (new_w, new_h) 36 | else: 37 | size = size[0], size[1] 38 | if interpolation == 'bilinear': 39 | np_inter = cv2.INTER_LINEAR 40 | else: 41 | np_inter = cv2.INTER_NEAREST 42 | scaled = [ 43 | cv2.resize(img, size, interpolation=np_inter) for img in clip 44 | ] 45 | elif isinstance(clip[0], PIL.Image.Image): 46 | if isinstance(size, numbers.Number): 47 | im_w, im_h = clip[0].size 48 | # Min spatial dim already matches minimal size 49 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 50 | and im_h == size): 51 | return clip 52 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 53 | size = (new_w, new_h) 54 | else: 55 | size = size[1], size[0] 56 | if interpolation == 'bilinear': 57 | pil_inter = PIL.Image.BILINEAR 58 | else: 59 | pil_inter = PIL.Image.NEAREST 60 | scaled = [img.resize(size, pil_inter) for img in clip] 61 | else: 62 | raise TypeError('Expected numpy.ndarray or PIL.Image' + 63 | 'but got list of {0}'.format(type(clip[0]))) 64 | return scaled 65 | 66 | 67 | def get_resize_sizes(im_h, im_w, size): 68 | if im_w < im_h: 69 | ow = size 70 | oh = int(size * im_h / im_w) 71 | else: 72 | oh = size 73 | ow = int(size * im_w / im_h) 74 | return oh, ow 75 | 76 | 77 | def normalize(clip, mean, std, inplace=False): 78 | if not _is_tensor_clip(clip): 79 | raise TypeError('tensor is not a torch clip.') 80 | 81 | if not inplace: 82 | clip = clip.clone() 83 | 84 | dtype = clip.dtype 85 | mean = torch.as_tensor(mean, dtype=dtype, 
device=clip.device) 86 | std = torch.as_tensor(std, dtype=dtype, device=clip.device) 87 | clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) 88 | 89 | return clip 90 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/networks/blocks/multi_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from toolkit.third_party.amt.utils.flow_utils import warp 4 | from toolkit.third_party.amt.networks.blocks.ifrnet import ( 5 | convrelu, resize, 6 | ResBlock, 7 | ) 8 | 9 | 10 | def multi_flow_combine(comb_block, img0, img1, flow0, flow1, 11 | mask=None, img_res=None, mean=None): 12 | ''' 13 | A parallel implementation of multiple flow field warping 14 | comb_block: An nn.Seqential object. 15 | img shape: [b, c, h, w] 16 | flow shape: [b, 2*num_flows, h, w] 17 | mask (opt): 18 | If 'mask' is None, the function conduct a simple average. 19 | img_res (opt): 20 | If 'img_res' is None, the function adds zero instead. 21 | mean (opt): 22 | If 'mean' is None, the function adds zero instead. 23 | ''' 24 | b, c, h, w = flow0.shape 25 | num_flows = c // 2 26 | flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w) 27 | flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w) 28 | 29 | mask = mask.reshape(b, num_flows, 1, h, w 30 | ).reshape(-1, 1, h, w) if mask is not None else None 31 | img_res = img_res.reshape(b, num_flows, 3, h, w 32 | ).reshape(-1, 3, h, w) if img_res is not None else 0 33 | img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w) 34 | img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w) 35 | mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1 36 | ) if mean is not None else 0 37 | 38 | img0_warp = warp(img0, flow0) 39 | img1_warp = warp(img1, flow1) 40 | img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res 41 | img_warps = img_warps.reshape(b, num_flows, 3, h, w) 42 | imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w)) 43 | return imgt_pred 44 | 45 | 46 | class MultiFlowDecoder(nn.Module): 47 | def __init__(self, in_ch, skip_ch, num_flows=3): 48 | super(MultiFlowDecoder, self).__init__() 49 | self.num_flows = num_flows 50 | self.convblock = nn.Sequential( 51 | convrelu(in_ch*3+4, in_ch*3), 52 | ResBlock(in_ch*3, skip_ch), 53 | nn.ConvTranspose2d(in_ch*3, 8*num_flows, 4, 2, 1, bias=True) 54 | ) 55 | 56 | def forward(self, ft_, f0, f1, flow0, flow1): 57 | n = self.num_flows 58 | f0_warp = warp(f0, flow0) 59 | f1_warp = warp(f1, flow1) 60 | out = self.convblock(torch.cat([ft_, f0_warp, f1_warp, flow0, flow1], 1)) 61 | delta_flow0, delta_flow1, mask, img_res = torch.split(out, [2*n, 2*n, n, 3*n], 1) 62 | mask = torch.sigmoid(mask) 63 | 64 | flow0 = delta_flow0 + 2.0 * resize(flow0, scale_factor=2.0 65 | ).repeat(1, self.num_flows, 1, 1) 66 | flow1 = delta_flow1 + 2.0 * resize(flow1, scale_factor=2.0 67 | ).repeat(1, self.num_flows, 1, 1) 68 | 69 | return flow0, flow1, mask, img_res 70 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/background_consistency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import numpy as np 5 | import clip 6 | from PIL import Image 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from toolkit.utils import 
load_video, load_dimension_info, clip_transform 11 | from tqdm import tqdm 12 | 13 | from .distributed import ( 14 | get_world_size, 15 | get_rank, 16 | all_gather, 17 | barrier, 18 | distribute_list_to_rank, 19 | gather_list_of_dict, 20 | ) 21 | 22 | 23 | def background_consistency(clip_model, preprocess, video_list, device, read_frame=False): 24 | sim = 0.0 25 | cnt = 0 26 | video_results = [] 27 | image_transform = clip_transform(224) 28 | for video_path in tqdm(video_list, disable=get_rank() > 0): 29 | video_sim = 0.0 30 | cnt_per_video = 0 31 | if read_frame: 32 | video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_') 33 | tmp_paths = [os.path.join(video_path, f) for f in sorted(os.listdir(video_path))] 34 | images = [] 35 | for tmp_path in tmp_paths: 36 | images.append(preprocess(Image.open(tmp_path))) 37 | images = torch.stack(images) 38 | else: 39 | images = load_video(video_path) 40 | images = image_transform(images) 41 | images = images.to(device) 42 | image_features = clip_model.encode_image(images) 43 | image_features = F.normalize(image_features, dim=-1, p=2) 44 | for i in range(len(image_features)): 45 | image_feature = image_features[i].unsqueeze(0) 46 | if i == 0: 47 | first_image_feature = image_feature 48 | else: 49 | sim_pre = max(0.0, F.cosine_similarity(former_image_feature, image_feature).item()) 50 | sim_fir = max(0.0, F.cosine_similarity(first_image_feature, image_feature).item()) 51 | cur_sim = (sim_pre + sim_fir) / 2 52 | video_sim += cur_sim 53 | cnt += 1 54 | cnt_per_video += 1 55 | former_image_feature = image_feature 56 | sim_per_image = video_sim / (len(image_features) - 1) 57 | sim += video_sim 58 | video_results.append({ 59 | 'video_path': video_path, 60 | 'video_results': sim_per_image, 61 | 'video_sim': video_sim, 62 | 'cnt_per_video': cnt_per_video}) 63 | # sim_per_video = sim / (len(video_list) - 1) 64 | sim_per_frame = sim / cnt 65 | return sim_per_frame, video_results 66 | 67 | 68 | def compute_background_consistency(json_dir, device, submodules_list, **kwargs): 69 | vit_path, read_frame = submodules_list[0], submodules_list[1] 70 | clip_model, preprocess = clip.load(vit_path, device=device) 71 | video_list, _ = load_dimension_info(json_dir, dimension='background_consistency', lang='en') 72 | video_list = distribute_list_to_rank(video_list) 73 | all_results, video_results = background_consistency(clip_model, preprocess, video_list, device, read_frame) 74 | if get_world_size() > 1: 75 | video_results = gather_list_of_dict(video_results) 76 | sim = sum([d['video_sim'] for d in video_results]) 77 | cnt = sum([d['cnt_per_video'] for d in video_results]) 78 | all_results = sim / cnt 79 | return all_results, video_results 80 | 81 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/scene.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import numpy as np 6 | from tqdm import tqdm 7 | from toolkit.utils import load_video, load_dimension_info, tag2text_transform 8 | from toolkit.third_party.tag2Text.tag2text import tag2text_caption 9 | 10 | import logging 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 22 | logger = logging.getLogger(__name__) 23 | 24 | 
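# Scene dimension: caption sampled frames with Tag2Text and count a frame as correct when every word of the
# expected scene description appears in its caption; the final score is the fraction of such frames.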
25 | def get_caption(model, image_arrays): 26 | caption, tag_predict = model.generate(image_arrays, tag_input = None, return_tag_predict = True) 27 | return caption 28 | 29 | def check_generate(key_info, predictions): 30 | cur_cnt = 0 31 | key = key_info['scene'] 32 | for pred in predictions: 33 | q_flag = [q in pred for q in key.split(' ')] 34 | if len(q_flag) == sum(q_flag): 35 | cur_cnt +=1 36 | return cur_cnt 37 | 38 | def scene(model, video_dict, device): 39 | success_frame_count, frame_count = 0, 0 40 | video_results = [] 41 | transform = tag2text_transform(384) 42 | for info in tqdm(video_dict, disable=get_rank() > 0): 43 | if 'auxiliary_info' not in info: 44 | raise "Auxiliary info is not in json, please check your json." 45 | scene_info = info['auxiliary_info']['scene'] 46 | for video_path in info['video_list']: 47 | video_array = load_video(video_path, num_frames=16, return_tensor=False, width=384, height=384) 48 | video_tensor_list = [] 49 | for i in video_array: 50 | video_tensor_list.append(transform(i).to(device).unsqueeze(0)) 51 | video_tensor = torch.cat(video_tensor_list) 52 | cur_video_pred = get_caption(model, video_tensor) 53 | cur_success_frame_count = check_generate(scene_info, cur_video_pred) 54 | cur_success_frame_rate = cur_success_frame_count/len(cur_video_pred) 55 | success_frame_count += cur_success_frame_count 56 | frame_count += len(cur_video_pred) 57 | video_results.append({ 58 | 'video_path': video_path, 59 | 'video_results': cur_success_frame_rate, 60 | 'success_frame_count': cur_success_frame_count, 61 | 'frame_count': len(cur_video_pred)}) 62 | success_rate = success_frame_count / frame_count 63 | return success_rate, video_results 64 | 65 | 66 | def compute_scene(json_dir, device, submodules_dict, **kwargs): 67 | model = tag2text_caption(**submodules_dict) 68 | model.eval() 69 | model = model.to(device) 70 | logger.info("Initialize caption model success") 71 | _, prompt_dict_ls = load_dimension_info(json_dir, dimension='scene', lang='en') 72 | prompt_dict_ls = distribute_list_to_rank(prompt_dict_ls) 73 | all_results, video_results = scene(model, prompt_dict_ls, device) 74 | if get_world_size() > 1: 75 | video_results = gather_list_of_dict(video_results) 76 | success_frame_count = sum([d['success_frame_count'] for d in video_results]) 77 | frame_count = sum([d['frame_count'] for d in video_results]) 78 | all_results = success_frame_count / frame_count 79 | return all_results, video_results 80 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/corr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from .utils_core.utils import bilinear_sampler, coords_grid 4 | 5 | try: 6 | import alt_cuda_corr 7 | except: 8 | # alt_cuda_corr is not compiled 9 | pass 10 | 11 | 12 | class CorrBlock: 13 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 14 | self.num_levels = num_levels 15 | self.radius = radius 16 | self.corr_pyramid = [] 17 | 18 | # all pairs correlation 19 | corr = CorrBlock.corr(fmap1, fmap2) 20 | 21 | batch, h1, w1, dim, h2, w2 = corr.shape 22 | corr = corr.reshape(batch*h1*w1, dim, h2, w2) 23 | 24 | self.corr_pyramid.append(corr) 25 | for i in range(self.num_levels-1): 26 | corr = F.avg_pool2d(corr, 2, stride=2) 27 | self.corr_pyramid.append(corr) 28 | 29 | def __call__(self, coords): 30 | r = self.radius 31 | coords = coords.permute(0, 2, 3, 1) 32 | batch, h1, w1, 
_ = coords.shape 33 | 34 | out_pyramid = [] 35 | for i in range(self.num_levels): 36 | corr = self.corr_pyramid[i] 37 | dx = torch.linspace(-r, r, 2*r+1, device=coords.device) 38 | dy = torch.linspace(-r, r, 2*r+1, device=coords.device) 39 | delta = torch.stack(torch.meshgrid(dy, dx), axis=-1) 40 | 41 | centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i 42 | delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2) 43 | coords_lvl = centroid_lvl + delta_lvl 44 | 45 | corr = bilinear_sampler(corr, coords_lvl) 46 | corr = corr.view(batch, h1, w1, -1) 47 | out_pyramid.append(corr) 48 | 49 | out = torch.cat(out_pyramid, dim=-1) 50 | return out.permute(0, 3, 1, 2).contiguous().float() 51 | 52 | @staticmethod 53 | def corr(fmap1, fmap2): 54 | batch, dim, ht, wd = fmap1.shape 55 | fmap1 = fmap1.view(batch, dim, ht*wd) 56 | fmap2 = fmap2.view(batch, dim, ht*wd) 57 | 58 | corr = torch.matmul(fmap1.transpose(1, 2), fmap2) 59 | corr = corr.view(batch, ht, wd, 1, ht, wd) 60 | return corr / torch.sqrt(torch.tensor(dim).float()) 61 | 62 | 63 | class AlternateCorrBlock: 64 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 65 | self.num_levels = num_levels 66 | self.radius = radius 67 | 68 | self.pyramid = [(fmap1, fmap2)] 69 | for i in range(self.num_levels): 70 | fmap1 = F.avg_pool2d(fmap1, 2, stride=2) 71 | fmap2 = F.avg_pool2d(fmap2, 2, stride=2) 72 | self.pyramid.append((fmap1, fmap2)) 73 | 74 | def __call__(self, coords): 75 | coords = coords.permute(0, 2, 3, 1) 76 | B, H, W, _ = coords.shape 77 | dim = self.pyramid[0][0].shape[1] 78 | 79 | corr_list = [] 80 | for i in range(self.num_levels): 81 | r = self.radius 82 | fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous() 83 | fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() 84 | 85 | coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() 86 | corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) 87 | corr_list.append(corr.squeeze(1)) 88 | 89 | corr = torch.stack(corr_list, dim=1) 90 | corr = corr.reshape(B, -1, H, W) 91 | return corr / torch.sqrt(torch.tensor(dim).float()) 92 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/overall_consistency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | 5 | import torch 6 | import clip 7 | from tqdm import tqdm 8 | from toolkit.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, CACHE_DIR 9 | from toolkit.third_party.ViCLIP.viclip import ViCLIP 10 | from toolkit.third_party.ViCLIP.simple_tokenizer import SimpleTokenizer 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | 22 | def get_text_features(model, input_text, tokenizer, text_feature_dict={}): 23 | if input_text in text_feature_dict: 24 | return text_feature_dict[input_text] 25 | text_template= f"{input_text}" 26 | with torch.no_grad(): 27 | text_features = model.encode_text(text_template).float() 28 | text_features /= text_features.norm(dim=-1, keepdim=True) 29 | text_feature_dict[input_text] = text_features 30 | return text_features 31 | 32 | def get_vid_features(model, input_frames): 33 | with torch.no_grad(): 34 | clip_feat = model.encode_vision(input_frames,test=True).float() 35 | clip_feat /= clip_feat.norm(dim=-1, keepdim=True) 36 | return clip_feat 37 | 38 | def 
get_predict_label(clip_feature, text_feats_tensor, top=5): 39 | label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) 40 | top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) 41 | return top_probs, top_labels 42 | 43 | def overall_consistency(clip_model, video_dict, tokenizer, device, sample="middle"): 44 | sim = [] 45 | video_results = [] 46 | image_transform = clip_transform(224) 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | query = info['prompt'] 49 | # text = clip.tokenize([query]).to(device) 50 | video_list = info['video_list'] 51 | for video_path in video_list: 52 | cur_video = [] 53 | with torch.no_grad(): 54 | images = read_frames_decord_by_fps(video_path, num_frames=8, sample=sample) 55 | images = image_transform(images) 56 | images = images.to(device) 57 | clip_feat = get_vid_features(clip_model,images.unsqueeze(0)) 58 | text_feat = get_text_features(clip_model, query, tokenizer) 59 | logit_per_text = clip_feat @ text_feat.T 60 | score_per_video = float(logit_per_text[0][0].cpu()) 61 | sim.append(score_per_video) 62 | video_results.append({'video_path': video_path, 'video_results': score_per_video}) 63 | avg_score = np.mean(sim) 64 | return avg_score, video_results 65 | 66 | def compute_overall_consistency(json_dir, device, submodules_list, **kwargs): 67 | tokenizer = SimpleTokenizer(os.path.join(CACHE_DIR, "ViCLIP/bpe_simple_vocab_16e6.txt.gz")) 68 | viclip = ViCLIP(tokenizer= tokenizer, **submodules_list).to(device) 69 | _, video_dict = load_dimension_info(json_dir, dimension='overall_consistency', lang='en') 70 | video_dict = distribute_list_to_rank(video_dict) 71 | all_results, video_results = overall_consistency(viclip, video_dict, tokenizer, device) 72 | if get_world_size() > 1: 73 | video_results = gather_list_of_dict(video_results) 74 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 75 | return all_results, video_results 76 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/datasets/vg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from fvcore.common.timer import Timer 4 | from detectron2.structures import BoxMode 5 | from fvcore.common.file_io import PathManager 6 | from detectron2.data import DatasetCatalog, MetadataCatalog 7 | from lvis import LVIS 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | __all__ = ["load_vg_json", "register_vg_instances"] 12 | 13 | 14 | def register_vg_instances(name, metadata, json_file, image_root): 15 | """ 16 | """ 17 | DatasetCatalog.register(name, lambda: load_vg_json( 18 | json_file, image_root, name)) 19 | MetadataCatalog.get(name).set( 20 | json_file=json_file, image_root=image_root, 21 | evaluator_type="vg", **metadata 22 | ) 23 | 24 | 25 | def get_vg_meta(): 26 | categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}] 27 | vg_categories = sorted(categories, key=lambda x: x["id"]) 28 | thing_classes = [k["name"] for k in vg_categories] 29 | meta = {"thing_classes": thing_classes} 30 | return meta 31 | 32 | 33 | def load_vg_json(json_file, image_root, dataset_name=None): 34 | 35 | json_file = PathManager.get_local_path(json_file) 36 | 37 | timer = Timer() 38 | lvis_api = LVIS(json_file) 39 | if timer.seconds() > 1: 40 | logger.info("Loading {} takes {:.2f} seconds.".format( 41 | json_file, timer.seconds())) 42 | 43 | img_ids = sorted(lvis_api.imgs.keys()) 44 | 
imgs = lvis_api.load_imgs(img_ids) 45 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 46 | 47 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 48 | assert len(set(ann_ids)) == len(ann_ids), \ 49 | "Annotation ids in '{}' are not unique".format(json_file) 50 | 51 | imgs_anns = list(zip(imgs, anns)) 52 | logger.info("Loaded {} images in the LVIS v1 format from {}".format( 53 | len(imgs_anns), json_file)) 54 | 55 | dataset_dicts = [] 56 | 57 | for (img_dict, anno_dict_list) in imgs_anns: 58 | record = {} 59 | if "file_name" in img_dict: 60 | file_name = img_dict["file_name"] 61 | record["file_name"] = os.path.join(image_root, file_name) 62 | 63 | record["height"] = int(img_dict["height"]) 64 | record["width"] = int(img_dict["width"]) 65 | image_id = record["image_id"] = img_dict["id"] 66 | 67 | objs = [] 68 | for anno in anno_dict_list: 69 | assert anno["image_id"] == image_id 70 | if anno.get('iscrowd', 0) > 0: 71 | continue 72 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 73 | obj["category_id"] = 0 74 | obj["object_description"] = anno["caption"] 75 | 76 | objs.append(obj) 77 | record["annotations"] = objs 78 | if len(record["annotations"]) == 0: 79 | continue 80 | record["task"] = "DenseCap" 81 | dataset_dicts.append(record) 82 | 83 | return dataset_dicts 84 | 85 | 86 | _CUSTOM_SPLITS_LVIS = { 87 | "vg_train": ("vg/images", "vg/annotations/train.json"), 88 | "vg_test": ("vg/images", "vg/annotations/test.json"), 89 | } 90 | 91 | 92 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items(): 93 | register_vg_instances( 94 | key, 95 | get_vg_meta(), 96 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 97 | os.path.join("datasets", image_root), 98 | ) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/temporal_style.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | 5 | import torch 6 | import clip 7 | from tqdm import tqdm 8 | from toolkit.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, CACHE_DIR 9 | from toolkit.third_party.ViCLIP.viclip import ViCLIP 10 | from toolkit.third_party.ViCLIP.simple_tokenizer import SimpleTokenizer 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | 22 | def get_text_features(model, input_text, tokenizer, text_feature_dict={}): 23 | if input_text in text_feature_dict: 24 | return text_feature_dict[input_text] 25 | text_template= f"{input_text}" 26 | with torch.no_grad(): 27 | text_features = model.encode_text(text_template).float() 28 | text_features /= text_features.norm(dim=-1, keepdim=True) 29 | text_feature_dict[input_text] = text_features 30 | return text_features 31 | 32 | def get_vid_features(model, input_frames): 33 | with torch.no_grad(): 34 | clip_feat = model.encode_vision(input_frames,test=True).float() 35 | clip_feat /= clip_feat.norm(dim=-1, keepdim=True) 36 | return clip_feat 37 | 38 | def get_predict_label(clip_feature, text_feats_tensor, top=5): 39 | label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) 40 | top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) 41 | return top_probs, top_labels 42 | 43 | def temporal_style(clip_model, video_dict, tokenizer, device, sample="middle"): 44 | sim = [] 45 | 
video_results = [] 46 | image_transform = clip_transform(224) 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | query = info['prompt'] 49 | # text = clip.tokenize([query]).to(device) 50 | video_list = info['video_list'] 51 | for video_path in video_list: 52 | cur_video = [] 53 | with torch.no_grad(): 54 | # images = load_video(video_path, num_frames=8) 55 | images = read_frames_decord_by_fps(video_path, num_frames=8, sample=sample) 56 | images = image_transform(images) 57 | images = images.to(device) 58 | clip_feat = get_vid_features(clip_model,images.unsqueeze(0)) 59 | text_feat = get_text_features(clip_model, query, tokenizer) 60 | logit_per_text = clip_feat @ text_feat.T 61 | score_per_video = float(logit_per_text[0][0].cpu()) 62 | sim.append(score_per_video) 63 | video_results.append({'video_path': video_path, 'video_results': score_per_video}) 64 | avg_score = np.mean(sim) 65 | return avg_score, video_results 66 | 67 | def compute_temporal_style(json_dir, device, submodules_list, **kwargs): 68 | tokenizer = SimpleTokenizer(os.path.join(CACHE_DIR, "ViCLIP/bpe_simple_vocab_16e6.txt.gz")) 69 | viclip = ViCLIP(tokenizer= tokenizer, **submodules_list).to(device) 70 | _, video_dict = load_dimension_info(json_dir, dimension='temporal_style', lang='en') 71 | video_dict = distribute_list_to_rank(video_dict) 72 | all_results, video_results = temporal_style(viclip, video_dict, tokenizer, device) 73 | if get_world_size() > 1: 74 | video_results = gather_list_of_dict(video_results) 75 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 76 | return all_results, video_results 77 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/subject_consistency.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import cv2 4 | import json 5 | import numpy as np 6 | from PIL import Image 7 | from tqdm import tqdm 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torchvision.transforms as transforms 13 | 14 | from toolkit.utils import load_video, load_dimension_info, dino_transform, dino_transform_Image 15 | import logging 16 | 17 | from .distributed import ( 18 | get_world_size, 19 | get_rank, 20 | all_gather, 21 | barrier, 22 | distribute_list_to_rank, 23 | gather_list_of_dict, 24 | ) 25 | 26 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def subject_consistency(model, video_list, device, read_frame): 31 | sim = 0.0 32 | cnt = 0 33 | video_results = [] 34 | if read_frame: 35 | image_transform = dino_transform_Image(224) 36 | else: 37 | image_transform = dino_transform(224) 38 | for video_path in tqdm(video_list, disable=get_rank() > 0): 39 | video_sim = 0.0 40 | if read_frame: 41 | video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_') 42 | tmp_paths = [os.path.join(video_path, f) for f in sorted(os.listdir(video_path))] 43 | images = [] 44 | for tmp_path in tmp_paths: 45 | images.append(image_transform(Image.open(tmp_path))) 46 | else: 47 | images = load_video(video_path) 48 | images = image_transform(images) 49 | for i in range(len(images)): 50 | with torch.no_grad(): 51 | image = images[i].unsqueeze(0) 52 | image = image.to(device) 53 | image_features = model(image) 54 | image_features = F.normalize(image_features, dim=-1, p=2) 55 | if i == 0: 56 | 
first_image_features = image_features 57 | else: 58 | sim_pre = max(0.0, F.cosine_similarity(former_image_features, image_features).item()) 59 | sim_fir = max(0.0, F.cosine_similarity(first_image_features, image_features).item()) 60 | cur_sim = (sim_pre + sim_fir) / 2 61 | video_sim += cur_sim 62 | cnt += 1 63 | former_image_features = image_features 64 | sim_per_images = video_sim / (len(images) - 1) 65 | sim += video_sim 66 | video_results.append({'video_path': video_path, 'video_results': sim_per_images}) 67 | # sim_per_video = sim / (len(video_list) - 1) 68 | sim_per_frame = sim / cnt 69 | return sim_per_frame, video_results 70 | 71 | 72 | def compute_subject_consistency(json_dir, device, submodules_list, **kwargs): 73 | dino_model = torch.hub.load('facebookresearch/dino', 'dino_vitb16', pretrained=True) 74 | dino_model = dino_model.to(device) 75 | read_frame = submodules_list['read_frame'] 76 | logger.info("Initialize DINO success") 77 | video_list, _ = load_dimension_info(json_dir, dimension='subject_consistency', lang='en') 78 | video_list = distribute_list_to_rank(video_list) 79 | all_results, video_results = subject_consistency(dino_model, video_list, device, read_frame) 80 | if get_world_size() > 1: 81 | video_results = gather_list_of_dict(video_results) 82 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 83 | return all_results, video_results 84 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/object_class.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import numpy as np 6 | from tqdm import tqdm 7 | from toolkit.utils import load_video, load_dimension_info 8 | from toolkit.third_party.grit_model import DenseCaptioning 9 | from torchvision import transforms 10 | import logging 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | 22 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 23 | logger = logging.getLogger(__name__) 24 | 25 | def get_dect_from_grit(model, image_arrays): 26 | pred = [] 27 | if type(image_arrays) is not list: 28 | image_arrays = image_arrays.numpy() 29 | with torch.no_grad(): 30 | for frame in image_arrays: 31 | try: 32 | pred.append(set(model.run_caption_tensor(frame)[0][0][2])) 33 | except: 34 | pred.append(set()) 35 | return pred 36 | 37 | def check_generate(key_info, predictions): 38 | cur_cnt = 0 39 | for pred in predictions: 40 | if key_info in pred: 41 | cur_cnt+=1 42 | return cur_cnt 43 | 44 | def object_class(model, video_dict, device): 45 | success_frame_count, frame_count = 0,0 46 | video_results = [] 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | if 'auxiliary_info' not in info: 49 | raise "Auxiliary info is not in json, please check your json." 
50 | object_info = info['auxiliary_info']['object'] 51 | for video_path in info['video_list']: 52 | video_tensor = load_video(video_path, num_frames=16) 53 | _, _, h, w = video_tensor.size() 54 | if min(h,w) > 768: 55 | scale = 720./min(h,w) 56 | output_tensor = transforms.Resize(size=( int(scale * h), int(scale * w) ),)(video_tensor) 57 | video_tensor=output_tensor 58 | cur_video_pred = get_dect_from_grit(model, video_tensor.permute(0,2,3,1)) 59 | cur_success_frame_count = check_generate(object_info, cur_video_pred) 60 | cur_success_frame_rate = cur_success_frame_count/len(cur_video_pred) 61 | success_frame_count += cur_success_frame_count 62 | frame_count += len(cur_video_pred) 63 | video_results.append({ 64 | 'video_path': video_path, 65 | 'video_results': cur_success_frame_rate, 66 | 'success_frame_count': cur_success_frame_count, 67 | 'frame_count': len(cur_video_pred)}) 68 | success_rate = success_frame_count / frame_count 69 | return success_rate, video_results 70 | 71 | 72 | def compute_object_class(json_dir, device, submodules_dict, **kwargs): 73 | dense_caption_model = DenseCaptioning(device) 74 | dense_caption_model.initialize_model_det(**submodules_dict) 75 | logger.info("Initialize detection model success") 76 | _, prompt_dict_ls = load_dimension_info(json_dir, dimension='object_class', lang='en') 77 | prompt_dict_ls = distribute_list_to_rank(prompt_dict_ls) 78 | all_results, video_results = object_class(dense_caption_model, prompt_dict_ls, device) 79 | if get_world_size() > 1: 80 | video_results = gather_list_of_dict(video_results) 81 | success_frame_count = sum([d['success_frame_count'] for d in video_results]) 82 | frame_count = sum([d['frame_count'] for d in video_results]) 83 | all_results = success_frame_count / frame_count 84 | return all_results, video_results 85 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | 3 | def add_centernet_config(cfg): 4 | _C = cfg 5 | 6 | _C.MODEL.CENTERNET = CN() 7 | _C.MODEL.CENTERNET.NUM_CLASSES = 80 8 | _C.MODEL.CENTERNET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"] 9 | _C.MODEL.CENTERNET.FPN_STRIDES = [8, 16, 32, 64, 128] 10 | _C.MODEL.CENTERNET.PRIOR_PROB = 0.01 11 | _C.MODEL.CENTERNET.INFERENCE_TH = 0.05 12 | _C.MODEL.CENTERNET.CENTER_NMS = False 13 | _C.MODEL.CENTERNET.NMS_TH_TRAIN = 0.6 14 | _C.MODEL.CENTERNET.NMS_TH_TEST = 0.6 15 | _C.MODEL.CENTERNET.PRE_NMS_TOPK_TRAIN = 1000 16 | _C.MODEL.CENTERNET.POST_NMS_TOPK_TRAIN = 100 17 | _C.MODEL.CENTERNET.PRE_NMS_TOPK_TEST = 1000 18 | _C.MODEL.CENTERNET.POST_NMS_TOPK_TEST = 100 19 | _C.MODEL.CENTERNET.NORM = "GN" 20 | _C.MODEL.CENTERNET.USE_DEFORMABLE = False 21 | _C.MODEL.CENTERNET.NUM_CLS_CONVS = 4 22 | _C.MODEL.CENTERNET.NUM_BOX_CONVS = 4 23 | _C.MODEL.CENTERNET.NUM_SHARE_CONVS = 0 24 | _C.MODEL.CENTERNET.LOC_LOSS_TYPE = 'giou' 25 | _C.MODEL.CENTERNET.SIGMOID_CLAMP = 1e-4 26 | _C.MODEL.CENTERNET.HM_MIN_OVERLAP = 0.8 27 | _C.MODEL.CENTERNET.MIN_RADIUS = 4 28 | _C.MODEL.CENTERNET.SOI = [[0, 80], [64, 160], [128, 320], [256, 640], [512, 10000000]] 29 | _C.MODEL.CENTERNET.POS_WEIGHT = 1. 30 | _C.MODEL.CENTERNET.NEG_WEIGHT = 1. 31 | _C.MODEL.CENTERNET.REG_WEIGHT = 2. 
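# Hyper-parameters of the heatmap focal loss (see modeling/layers/heatmap_focal_loss.py):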
32 | _C.MODEL.CENTERNET.HM_FOCAL_BETA = 4 33 | _C.MODEL.CENTERNET.HM_FOCAL_ALPHA = 0.25 34 | _C.MODEL.CENTERNET.LOSS_GAMMA = 2.0 35 | _C.MODEL.CENTERNET.WITH_AGN_HM = False 36 | _C.MODEL.CENTERNET.ONLY_PROPOSAL = False 37 | _C.MODEL.CENTERNET.AS_PROPOSAL = False 38 | _C.MODEL.CENTERNET.IGNORE_HIGH_FP = -1. 39 | _C.MODEL.CENTERNET.MORE_POS = False 40 | _C.MODEL.CENTERNET.MORE_POS_THRESH = 0.2 41 | _C.MODEL.CENTERNET.MORE_POS_TOPK = 9 42 | _C.MODEL.CENTERNET.NOT_NORM_REG = True 43 | _C.MODEL.CENTERNET.NOT_NMS = False 44 | _C.MODEL.CENTERNET.NO_REDUCE = False 45 | 46 | _C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False 47 | _C.MODEL.ROI_BOX_HEAD.PRIOR_PROB = 0.01 48 | _C.MODEL.ROI_BOX_HEAD.USE_EQL_LOSS = False 49 | _C.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = \ 50 | 'datasets/lvis/lvis_v1_train_cat_info.json' 51 | _C.MODEL.ROI_BOX_HEAD.EQL_FREQ_CAT = 200 52 | _C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False 53 | _C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CAT = 50 54 | _C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT = 0.5 55 | _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False 56 | 57 | _C.MODEL.BIFPN = CN() 58 | _C.MODEL.BIFPN.NUM_LEVELS = 5 59 | _C.MODEL.BIFPN.NUM_BIFPN = 6 60 | _C.MODEL.BIFPN.NORM = 'GN' 61 | _C.MODEL.BIFPN.OUT_CHANNELS = 160 62 | _C.MODEL.BIFPN.SEPARABLE_CONV = False 63 | 64 | _C.MODEL.DLA = CN() 65 | _C.MODEL.DLA.OUT_FEATURES = ['dla2'] 66 | _C.MODEL.DLA.USE_DLA_UP = True 67 | _C.MODEL.DLA.NUM_LAYERS = 34 68 | _C.MODEL.DLA.MS_OUTPUT = False 69 | _C.MODEL.DLA.NORM = 'BN' 70 | _C.MODEL.DLA.DLAUP_IN_FEATURES = ['dla3', 'dla4', 'dla5'] 71 | _C.MODEL.DLA.DLAUP_NODE = 'conv' 72 | 73 | _C.SOLVER.RESET_ITER = False 74 | _C.SOLVER.TRAIN_ITER = -1 75 | 76 | _C.INPUT.CUSTOM_AUG = '' 77 | _C.INPUT.TRAIN_SIZE = 640 78 | _C.INPUT.TEST_SIZE = 640 79 | _C.INPUT.SCALE_RANGE = (0.1, 2.) 
80 | # 'default' for fixed short/ long edge, 'square' for max size=INPUT.SIZE 81 | _C.INPUT.TEST_INPUT_TYPE = 'default' 82 | 83 | _C.DEBUG = False 84 | _C.SAVE_DEBUG = False 85 | _C.SAVE_PTH = False 86 | _C.VIS_THRESH = 0.3 87 | _C.DEBUG_SHOW_NAME = False 88 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/text/load_text_token.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LoadTextTokens(object): 5 | def __init__(self, tokenizer, max_text_len=40, padding='do_not_pad'): 6 | self.tokenizer = tokenizer 7 | self.max_text_len = max_text_len 8 | self.padding = padding 9 | 10 | def descriptions_to_text_tokens(self, target, begin_token): 11 | target_encoding = self.tokenizer( 12 | target, padding=self.padding, 13 | add_special_tokens=False, 14 | truncation=True, max_length=self.max_text_len) 15 | 16 | need_predict = [1] * len(target_encoding['input_ids']) 17 | payload = target_encoding['input_ids'] 18 | if len(payload) > self.max_text_len - 2: 19 | payload = payload[-(self.max_text_len - 2):] 20 | need_predict = payload[-(self.max_text_len - 2):] 21 | 22 | input_ids = [begin_token] + payload + [self.tokenizer.sep_token_id] 23 | 24 | need_predict = [0] + need_predict + [1] 25 | data = { 26 | 'text_tokens': torch.tensor(input_ids), 27 | 'text_lengths': len(input_ids), 28 | 'need_predict': torch.tensor(need_predict), 29 | } 30 | 31 | return data 32 | 33 | def __call__(self, object_descriptions, box_features, begin_token): 34 | text_tokens = [] 35 | text_lengths = [] 36 | need_predict = [] 37 | for description in object_descriptions: 38 | tokens = self.descriptions_to_text_tokens(description, begin_token) 39 | text_tokens.append(tokens['text_tokens']) 40 | text_lengths.append(tokens['text_lengths']) 41 | need_predict.append(tokens['need_predict']) 42 | 43 | text_tokens = torch.cat(self.collate(text_tokens), dim=0).to(box_features.device) 44 | text_lengths = torch.tensor(text_lengths).to(box_features.device) 45 | need_predict = torch.cat(self.collate(need_predict), dim=0).to(box_features.device) 46 | 47 | assert text_tokens.dim() == 2 and need_predict.dim() == 2 48 | data = {'text_tokens': text_tokens, 49 | 'text_lengths': text_lengths, 50 | 'need_predict': need_predict} 51 | 52 | return data 53 | 54 | def collate(self, batch): 55 | if all(isinstance(b, torch.Tensor) for b in batch) and len(batch) > 0: 56 | if not all(b.shape == batch[0].shape for b in batch[1:]): 57 | assert all(len(b.shape) == len(batch[0].shape) for b in batch[1:]) 58 | shape = torch.tensor([b.shape for b in batch]) 59 | max_shape = tuple(shape.max(dim=0)[0].tolist()) 60 | batch2 = [] 61 | for b in batch: 62 | if any(c < m for c, m in zip(b.shape, max_shape)): 63 | b2 = torch.zeros(max_shape, dtype=b.dtype, device=b.device) 64 | if b.dim() == 1: 65 | b2[:b.shape[0]] = b 66 | elif b.dim() == 2: 67 | b2[:b.shape[0], :b.shape[1]] = b 68 | elif b.dim() == 3: 69 | b2[:b.shape[0], :b.shape[1], :b.shape[2]] = b 70 | else: 71 | raise NotImplementedError 72 | b = b2 73 | batch2.append(b[None, ...]) 74 | else: 75 | batch2 = [] 76 | for b in batch: 77 | batch2.append(b[None, ...]) 78 | return batch2 79 | else: 80 | raise NotImplementedError 81 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/appearance_style.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | import torch 7 | import clip 8 | from PIL import Image 9 | from toolkit.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, clip_transform_Image 10 | 11 | from .distributed import ( 12 | get_world_size, 13 | get_rank, 14 | all_gather, 15 | barrier, 16 | distribute_list_to_rank, 17 | gather_list_of_dict, 18 | ) 19 | 20 | 21 | def get_text_features(model, input_text, tokenizer, text_feature_dict={}): 22 | if input_text in text_feature_dict: 23 | return text_feature_dict[input_text] 24 | text_template= f"{input_text}" 25 | with torch.no_grad(): 26 | text_features = model.encode_text(text_template).float() 27 | text_features /= text_features.norm(dim=-1, keepdim=True) 28 | text_feature_dict[input_text] = text_features 29 | return text_features 30 | 31 | def get_vid_features(model, input_frames): 32 | with torch.no_grad(): 33 | clip_feat = model.encode_vision(input_frames,test=True).float() 34 | clip_feat /= clip_feat.norm(dim=-1, keepdim=True) 35 | return clip_feat 36 | 37 | def get_predict_label(clip_feature, text_feats_tensor, top=5): 38 | label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) 39 | top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) 40 | return top_probs, top_labels 41 | 42 | def appearance_style(clip_model, video_dict, device, sample="rand"): 43 | sim = 0.0 44 | cnt = 0 45 | video_results = [] 46 | image_transform = clip_transform_Image(224) 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | if 'auxiliary_info' not in info: 49 | raise "Auxiliary info is not in json, please check your json." 50 | query = info['auxiliary_info']['appearance_style'] 51 | text = clip.tokenize([query]).to(device) 52 | video_list = info['video_list'] 53 | for video_path in video_list: 54 | cur_video = [] 55 | with torch.no_grad(): 56 | video_arrays = load_video(video_path, return_tensor=False) 57 | images = [Image.fromarray(i) for i in video_arrays] 58 | for image in images: 59 | image = image_transform(image) 60 | image = image.to(device) 61 | logits_per_image, logits_per_text = clip_model(image.unsqueeze(0), text) 62 | cur_sim = float(logits_per_text[0][0].cpu()) 63 | cur_sim = cur_sim / 100 64 | cur_video.append(cur_sim) 65 | sim += cur_sim 66 | cnt +=1 67 | video_sim = np.mean(cur_video) 68 | video_results.append({ 69 | 'video_path': video_path, 70 | 'video_results': video_sim, 71 | 'frame_results': cur_video, 72 | 'cur_sim': cur_sim}) 73 | sim_per_frame = sim / cnt 74 | return sim_per_frame, video_results 75 | 76 | def compute_appearance_style(json_dir, device, submodules_list, **kwargs): 77 | clip_model, preprocess = clip.load(device=device, **submodules_list) 78 | _, video_dict = load_dimension_info(json_dir, dimension='appearance_style', lang='en') 79 | video_dict = distribute_list_to_rank(video_dict) 80 | all_results, video_results = appearance_style(clip_model, video_dict, device) 81 | if get_world_size() > 1: 82 | video_results = gather_list_of_dict(video_results) 83 | all_results = sum([d['cur_sim'] for d in video_results]) / len(video_results) 84 | return all_results, video_results 85 | --------------------------------------------------------------------------------
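The per-dimension scorers above share one calling convention: each `compute_<dimension>(json_dir, device, submodules, **kwargs)` entry point loads its video or prompt list with `load_dimension_info`, shards it across ranks when running distributed, and returns an overall score together with per-video results. The sketch below illustrates that convention for `compute_background_consistency` in a single process; the JSON path, the CLIP weight identifier, and the assumption that `video_generation_evaluation` is on `PYTHONPATH` (so the `toolkit` imports resolve) are illustrative placeholders, not values shipped with the toolkit.

```python
# Minimal single-process driver sketch (all paths/arguments below are assumptions).
import torch

from toolkit.background_consistency import compute_background_consistency

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# compute_background_consistency expects a positional list:
#   submodules[0] is handed to clip.load(), submodules[1] is the read_frame flag.
submodules = ["ViT-B/32", False]                      # assumed CLIP weights; decode frames from the video file
json_dir = "./background_consistency_full_info.json"  # assumed per-dimension info JSON

overall, per_video = compute_background_consistency(json_dir, device, submodules)
print(f"background_consistency: {overall:.4f} ({len(per_video)} videos scored)")
```

The other dimensions differ mainly in what they expect in `submodules`: a positional list here, keyword dictionaries for most of the other `compute_*` entry points shown above.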