├── General-Bench-Closeset └── .gitkeep ├── General-Bench-Openset └── .gitkeep ├── video_generation_evaluation ├── toolkit │ ├── cli │ │ ├── __init__.py │ │ └── vbench.py │ ├── launch │ │ └── __init__.py │ ├── third_party │ │ ├── __init__.py │ │ ├── amt │ │ │ ├── __init__.py │ │ │ ├── datasets │ │ │ │ └── __init__.py │ │ │ ├── losses │ │ │ │ └── __init__.py │ │ │ ├── metrics │ │ │ │ └── __init__.py │ │ │ ├── networks │ │ │ │ ├── __init__.py │ │ │ │ └── blocks │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── multi_flow.py │ │ │ ├── trainers │ │ │ │ ├── __init__.py │ │ │ │ └── logger.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── build_utils.py │ │ │ │ └── dist_utils.py │ │ │ ├── benchmarks │ │ │ │ ├── __init__.py │ │ │ │ ├── speed_parameters.py │ │ │ │ ├── gopro.py │ │ │ │ ├── adobe240.py │ │ │ │ ├── ucf101.py │ │ │ │ ├── vimeo90k.py │ │ │ │ ├── vimeo90k_tta.py │ │ │ │ └── snu_film.py │ │ │ ├── flow_generation │ │ │ │ ├── __init__.py │ │ │ │ ├── liteflownet │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── correlation │ │ │ │ │ │ └── README.md │ │ │ │ │ └── README.md │ │ │ │ └── gen_flow.py │ │ │ ├── scripts │ │ │ │ ├── benchmark_arbitrary.sh │ │ │ │ ├── train.sh │ │ │ │ └── benchmark_fixed.sh │ │ │ ├── environment.yaml │ │ │ ├── cfgs │ │ │ │ ├── AMT-S_gopro.yaml │ │ │ │ ├── AMT-G.yaml │ │ │ │ ├── AMT-L.yaml │ │ │ │ ├── AMT-S.yaml │ │ │ │ └── IFRNet.yaml │ │ │ └── train.py │ │ ├── umt │ │ │ ├── __init__.py │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ └── masking_generator.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ └── extract_clip │ │ │ │ │ └── extract.ipynb │ │ │ └── functional.py │ │ ├── RAFT │ │ │ ├── __init__.py │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ ├── utils_core │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── utils.py │ │ │ │ └── corr.py │ │ │ ├── download_models.sh │ │ │ ├── RAFT.png │ │ │ ├── alt_cuda_corr │ │ │ │ ├── setup.py │ │ │ │ └── correlation.cpp │ │ │ ├── LICENSE │ │ │ └── README.md │ │ ├── ViCLIP │ │ │ └── __init__.py │ │ ├── grit_src │ │ │ ├── __init__.py │ │ │ ├── grit │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── datasets │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── vg.py │ │ │ │ │ ├── transforms │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── custom_augmentation_impl.py │ │ │ │ │ └── custom_build_augmentation.py │ │ │ │ ├── modeling │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── text │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── load_text_token.py │ │ │ │ │ ├── backbone │ │ │ │ │ │ └── __init__.py │ │ │ │ │ ├── meta_arch │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── grit.py │ │ │ │ │ └── roi_heads │ │ │ │ │ │ └── __init__.py │ │ │ │ ├── __init__.py │ │ │ │ └── config.py │ │ │ ├── centernet2 │ │ │ │ ├── __init__.py │ │ │ │ ├── centernet │ │ │ │ │ ├── modeling │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── backbone │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── fpn_p5.py │ │ │ │ │ │ ├── layers │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── ml_nms.py │ │ │ │ │ │ │ └── heatmap_focal_loss.py │ │ │ │ │ │ ├── meta_arch │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── centernet_detector.py │ │ │ │ │ │ ├── roi_heads │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── fed_loss.py │ │ │ │ │ │ └── dense_heads │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── config.py │ │ │ │ ├── configs │ │ │ │ │ ├── CenterNet2_R50_1x.yaml │ │ │ │ │ ├── CenterNet-FPN_R50_1x.yaml │ │ │ │ │ ├── CenterNet2-F_R50_1x.yaml │ │ │ │ │ ├── CenterNet-S4_DLA_8x.yaml │ │ │ │ │ ├── O365_CenterNet2_R50_1x.yaml │ │ │ │ │ ├── LVIS_CenterNet2_R50_1x.yaml │ │ │ │ │ ├── 
LVIS_CenterNet2_R50_Fed_1x.yaml │ │ │ │ │ ├── CenterNet2_X101-DCN_2x.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P5_640_16x.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P5_640_16x_ST.yaml │ │ │ │ │ ├── CenterNet2_R2-101-DCN_896_4x.yaml │ │ │ │ │ ├── Base-CenterNet-FPN.yaml │ │ │ │ │ ├── CenterNet2_DLA-fcosBiFPN-P5_640_16x_ST.yaml │ │ │ │ │ ├── CenterNet2_R2-101-DCN-BiFPN_1280_4x.yaml │ │ │ │ │ ├── CenterNet2_R2-101-DCN-BiFPN_4x+4x_1560_ST.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P3_24x.yaml │ │ │ │ │ ├── CenterNet2_DLA-BiFPN-P3_4x.yaml │ │ │ │ │ ├── Base_S4_DLA.yaml │ │ │ │ │ ├── nuImages_CenterNet2_DLA_640_8x.yaml │ │ │ │ │ └── Base-CenterNet2.yaml │ │ │ │ └── .gitignore │ │ │ └── configs │ │ │ │ ├── GRiT_B_DenseCap.yaml │ │ │ │ ├── GRiT_B_ObjectDet.yaml │ │ │ │ ├── GRiT_L_ObjectDet.yaml │ │ │ │ ├── GRiT_H_ObjectDet.yaml │ │ │ │ ├── GRiT_B_DenseCap_ObjectDet.yaml │ │ │ │ └── Base.yaml │ │ ├── tag2Text │ │ │ ├── __init__.py │ │ │ ├── config_swinB_384.json │ │ │ ├── med_config.json │ │ │ └── q2l_config.json │ │ └── grit_model.py │ ├── temporal_flickering.py │ ├── imaging_quality.py │ ├── fvd.py │ ├── background_consistency.py │ ├── scene.py │ ├── overall_consistency.py │ ├── temporal_style.py │ ├── subject_consistency.py │ ├── object_class.py │ └── appearance_style.py ├── pretrained │ ├── amt_model │ │ ├── download.sh │ │ └── AMT-S.yaml │ ├── grit_model │ │ └── model_path.txt │ ├── viclip_model │ │ └── model_path.txt │ ├── caption_model │ │ └── model_path.txt │ ├── pyiqa_model │ │ └── model_path.txt │ ├── README.md │ ├── aesthetic_model │ │ └── model_path.txt │ ├── umt_model │ │ └── model_path.txt │ ├── raft_model │ │ └── download.sh │ └── clip_model │ │ └── model_path.txt ├── competitions │ ├── configs │ │ ├── clip_length_0.5.yaml │ │ ├── clip_length_1.0.yaml │ │ ├── clip_length_mix.yaml │ │ ├── clip_length_short.yaml │ │ ├── slow_fast_params.yaml │ │ ├── subject_mapping_table.yaml │ │ └── background_mapping_table.yaml │ ├── requirements.txt │ ├── competition_utils.py │ └── clip_score.py ├── requirements.txt └── README.md ├── processors ├── __init__.py ├── ._audio_processor.py ├── ._image_processor.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── .___init__.cpython-38.pyc │ ├── .___init__.cpython-39.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-312.pyc │ ├── nlp_processor.cpython-38.pyc │ ├── nlp_processor.cpython-39.pyc │ ├── audio_processor.cpython-311.pyc │ ├── audio_processor.cpython-312.pyc │ ├── audio_processor.cpython-38.pyc │ ├── audio_processor.cpython-39.pyc │ ├── image_processor.cpython-311.pyc │ ├── image_processor.cpython-312.pyc │ ├── image_processor.cpython-38.pyc │ ├── image_processor.cpython-39.pyc │ ├── nlp_processor.cpython-311.pyc │ ├── nlp_processor.cpython-312.pyc │ ├── video_processor.cpython-311.pyc │ ├── video_processor.cpython-312.pyc │ ├── video_processor.cpython-38.pyc │ ├── video_processor.cpython-39.pyc │ ├── ._video_processor.cpython-39.pyc │ ├── three_d_processor.cpython-311.pyc │ ├── three_d_processor.cpython-312.pyc │ ├── three_d_processor.cpython-38.pyc │ ├── three_d_processor.cpython-39.pyc │ └── pseudo_audio_processor.cpython-39.pyc ├── three_d_processor.py ├── audio_processor.py ├── video_processor.py └── image_processor.py ├── outcome ├── test_result.xlsx └── Qwen2.5-7B-Instruct_result.xlsx ├── utils ├── ._special_metrix.py ├── __pycache__ │ ├── data_types.cpython-311.pyc │ ├── data_types.cpython-312.pyc │ ├── data_types.cpython-38.pyc │ ├── data_types.cpython-39.pyc │ ├── ._special_metrix.cpython-39.pyc │ ├── 
base_processor.cpython-311.pyc │ ├── base_processor.cpython-312.pyc │ ├── base_processor.cpython-38.pyc │ ├── base_processor.cpython-39.pyc │ ├── special_metrix.cpython-310.pyc │ ├── special_metrix.cpython-311.pyc │ ├── special_metrix.cpython-312.pyc │ ├── special_metrix.cpython-38.pyc │ └── special_metrix.cpython-39.pyc ├── data_types.py ├── base_processor.py └── special_metrix.py ├── references ├── sota_result.xlsx └── template_result.xlsx ├── run.sh └── README_ZH.md /General-Bench-Closeset/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /General-Bench-Openset/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/launch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/ViCLIP/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /processors/__init__.py: -------------------------------------------------------------------------------- 1 | """Processors package for different modalities.""" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/losses/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/utils_core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/networks/blocks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/liteflownet/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/meta_arch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /outcome/test_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/outcome/test_result.xlsx -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/dense_heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/._special_metrix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/._special_metrix.py -------------------------------------------------------------------------------- /references/sota_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/references/sota_result.xlsx 
-------------------------------------------------------------------------------- /processors/._audio_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/._audio_processor.py -------------------------------------------------------------------------------- /processors/._image_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/._image_processor.py -------------------------------------------------------------------------------- /references/template_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/references/template_result.xlsx -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('third_party/grit_src') 3 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_dataset, build_pretraining_dataset -------------------------------------------------------------------------------- /outcome/Qwen2.5-7B-Instruct_result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/outcome/Qwen2.5-7B-Instruct_result.xlsx -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_types.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/data_types.cpython-39.pyc -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/amt_model/download.sh: 
-------------------------------------------------------------------------------- 1 | wget https://huggingface.co/lalala125/AMT/resolve/main/amt-s.pth -P ~/.cache/amt_model 2 | -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/.___init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/.___init__.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/.___init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/.___init__.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/._special_metrix.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/._special_metrix.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/base_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/base_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/base_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-38.pyc -------------------------------------------------------------------------------- 
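Note on the surrounding entries: the `processors/__pycache__`, `utils/__pycache__`, and `._*` blocks listed here are committed CPython bytecode caches and macOS AppleDouble copies rather than source files. If you want to purge them from a local checkout, a minimal sketch follows (a hypothetical helper, not part of this repository; it assumes it is run from the repository root):

import shutil
from pathlib import Path

def clean(repo_root: str = ".") -> None:
    """Remove committed bytecode caches and macOS AppleDouble files."""
    root = Path(repo_root)
    # Drop every __pycache__ directory (and the .pyc files inside it).
    for cache_dir in list(root.rglob("__pycache__")):
        shutil.rmtree(cache_dir, ignore_errors=True)
    # Drop macOS resource-fork copies such as ._image_processor.py.
    for apple_double in list(root.rglob("._*")):
        if apple_double.is_file():
            apple_double.unlink()

if __name__ == "__main__":
    clean()
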
/utils/__pycache__/base_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/base_processor.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-310.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-312.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/special_metrix.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/utils/__pycache__/special_metrix.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/audio_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/audio_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/image_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/image_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/nlp_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/nlp_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-38.pyc 
-------------------------------------------------------------------------------- /processors/__pycache__/video_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/video_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/._video_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/._video_processor.cpython-39.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-311.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-312.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-38.pyc -------------------------------------------------------------------------------- /processors/__pycache__/three_d_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/three_d_processor.cpython-39.pyc -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip 3 | unzip models.zip 4 | -------------------------------------------------------------------------------- /processors/__pycache__/pseudo_audio_processor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/processors/__pycache__/pseudo_audio_processor.cpython-39.pyc -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/grit_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://datarelease.blob.core.windows.net/grit/models/grit_b_densecap_objectdet.pth -P ~/.cache/toolkit/grit_model 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/RAFT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/path2generalist/General-Level/HEAD/video_generation_evaluation/toolkit/third_party/RAFT/RAFT.png 
-------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet-FPN_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet-FPN.yaml" 2 | MODEL: 3 | CENTERNET: 4 | MORE_POS: True -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2-F_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NAME: CustomROIHeads -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/viclip_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/OpenGVLab/VBench_Used_Models/resolve/main/ViClip-InternVid-10M-FLT.pth -P ~/.cache/toolkit/ViCLIP 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/caption_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/spaces/xinyu1205/recognize-anything/resolve/main/tag2text_swin_14m.pth -P ~/.cache/toolkit/caption_model 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/pyiqa_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://github.com/chaofengc/IQA-PyTorch/releases/download/v0.1-weights/musiq_spaq_ckpt-358bb6af.pth -P ~/.cache/toolkit/pyiqa_model 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/scripts/benchmark_arbitrary.sh: -------------------------------------------------------------------------------- 1 | CFG=$1 2 | CKPT=$2 3 | 4 | python benchmarks/gopro.py -c $CFG -p $CKPT 5 | python benchmarks/adobe240.py -c $CFG -p $CKPT -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet-S4_DLA_8x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base_S4_DLA.yaml" 2 | SOLVER: 3 | MAX_ITER: 90000 4 | BASE_LR: 0.08 5 | IMS_PER_BATCH: 128 -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/README.md: -------------------------------------------------------------------------------- 1 | ## Pre-Trained Models 2 | Please download the pre-trained weights according to the guidance in the `model_path.txt` file for each model (see each folder). 
3 | 4 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/aesthetic_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://github.com/LAION-AI/aesthetic-predictor/raw/main/sa_0_4_vit_l_14_linear.pth -P ~/.cache/toolkit/aesthetic_model/emb_reader 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/umt_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/OpenGVLab/VBench_Used_Models/resolve/main/l16_ptk710_ftk710_ftk400_f16_res224.pth -P ~/.cache/toolkit/umt_model/ 2 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/scripts/train.sh: -------------------------------------------------------------------------------- 1 | NUM_GPU=$1 2 | CFG=$2 3 | PORT=$3 4 | python -m torch.distributed.launch \ 5 | --nproc_per_node $NUM_GPU \ 6 | --master_port $PORT train.py -c $CFG -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/.gitignore: -------------------------------------------------------------------------------- 1 | # compilation and distribution 2 | __pycache__ 3 | _ext 4 | *.pyc 5 | *.pyd 6 | *.so 7 | centernet.egg-info/ 8 | build/ 9 | dist/ 10 | wheels/ 11 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/scripts/benchmark_fixed.sh: -------------------------------------------------------------------------------- 1 | CFG=$1 2 | CKPT=$2 3 | 4 | python benchmarks/vimeo90k.py -c $CFG -p $CKPT 5 | python benchmarks/ucf101.py -c $CFG -p $CKPT 6 | python benchmarks/snu_film.py -c $CFG -p $CKPT 7 | python benchmarks/xiph.py -c $CFG -p $CKPT -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/raft_model/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CACHE_DIR=~/.cache/toolkit 3 | wget -P $CACHE_DIR/raft_model/ https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip 4 | unzip -d ${CACHE_DIR}/raft_model/ $CACHE_DIR/raft_model/models.zip 5 | rm -r $CACHE_DIR/raft_model/models.zip 6 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling.meta_arch import grit 2 | from .modeling.roi_heads import grit_roi_heads 3 | from .modeling.backbone import vit 4 | 5 | from .data.datasets import object365 6 | from .data.datasets import vg 7 | from .data.datasets import grit_coco -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/config_swinB_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth", 3 | "vision_width": 1024, 4 | "image_res": 384, 5 | "window_size": 12, 6 | "embed_dim": 128, 7 | "depths": [ 2, 2, 18, 2 ], 8 | "num_heads": [ 4, 8, 16, 32 ] 9 | } 10 |
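The pretrained/README.md above directs users to follow the wget guidance in each model's model_path.txt. A helper that runs those commands in one pass might look like the following sketch (not part of the repository; it assumes the layout shown in the tree, that each non-empty line of model_path.txt is a complete shell command, and that wget is installed):

import subprocess
from pathlib import Path

PRETRAINED = Path("video_generation_evaluation/pretrained")  # adjust to your checkout location

def fetch_all(pretrained_dir: Path = PRETRAINED) -> None:
    """Run every download command listed in pretrained/*/model_path.txt."""
    for txt in sorted(pretrained_dir.glob("*/model_path.txt")):
        for raw in txt.read_text().splitlines():
            cmd = raw.strip()
            if not cmd:
                continue  # skip blank lines
            print(f"[{txt.parent.name}] {cmd}")
            # shell=True so that '~' in the -P target expands as it would in a terminal
            subprocess.run(cmd, shell=True, check=True)

if __name__ == "__main__":
    fetch_all()

The amt_model and raft_model folders ship download.sh scripts instead of model_path.txt, so those two downloads (shown in their own blocks in this dump) would still need to be run separately.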
-------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/clip_model/model_path.txt: -------------------------------------------------------------------------------- 1 | wget https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt -P ~/.cache/toolkit/clip_model 2 | wget https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt -P ~/.cache/toolkit/clip_model 3 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/liteflownet/correlation/README.md: -------------------------------------------------------------------------------- 1 | This is an adaptation of the FlowNet2 implementation in order to compute cost volumes. Should you be making use of this work, please make sure to adhere to the licensing terms of the original authors. Should you be making use or modify this particular implementation, please acknowledge it appropriately. -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/O365_CenterNet2_R50_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 365 5 | CENTERNET: 6 | NUM_CLASSES: 365 7 | DATASETS: 8 | TRAIN: ("objects365_train",) 9 | TEST: ("objects365_val",) 10 | DATALOADER: 11 | SAMPLER_TRAIN: "ClassAwareSampler" 12 | TEST: 13 | DETECTIONS_PER_IMAGE: 300 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/environment.yaml: -------------------------------------------------------------------------------- 1 | name: amt 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=3.8.5 8 | - pip=20.3 9 | - cudatoolkit=11.3 10 | - pytorch=1.11.0 11 | - torchvision=0.12.0 12 | - numpy=1.21.5 13 | - pip: 14 | - opencv-python==4.1.2.30 15 | - imageio==2.19.3 16 | - omegaconf==2.3.0 17 | - Pillow==9.4.0 18 | - tqdm==4.64.1 19 | - wandb==0.12.21 -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_0.5.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 0.5 2 | background_consistency: 0.5 3 | motion_smoothness: 0.5 4 | temporal_flickering: 0.5 5 | dynamic_degree: 0.5 6 | imaging_quality: 0.5 7 | aesthetic_quality: 0.5 8 | 9 | object_class: 0.5 10 | multiple_objects: 0.5 11 | human_action: 0.5 12 | color: 0.5 13 | spatial_relationship: 0.5 14 | scene: 0.5 15 | appearance_style: 0.5 16 | temporal_style: 0.5 17 | overall_consistency: 0.5 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_1.0.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 1.0 2 | background_consistency: 1.0 3 | motion_smoothness: 1.0 4 | temporal_flickering: 1.0 5 | dynamic_degree: 1.0 6 | imaging_quality: 1.0 7 | aesthetic_quality: 1.0 8 | 9 | object_class: 1.0 10 | multiple_objects: 1.0 11 | human_action: 1.0 12 | color: 1.0 13 | spatial_relationship: 1.0 14 | scene: 1.0 15 | appearance_style: 1.0 16 | 
temporal_style: 1.0 17 | overall_consistency: 1.0 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_mix.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 2.0 2 | background_consistency: 2.0 3 | motion_smoothness: 2.0 4 | temporal_flickering: 2.0 5 | dynamic_degree: 2.0 6 | imaging_quality: 2.0 7 | aesthetic_quality: 2.0 8 | 9 | object_class: 2.0 10 | multiple_objects: 2.0 11 | human_action: 10.0 12 | color: 2.0 13 | spatial_relationship: 2.0 14 | scene: 2.0 15 | appearance_style: 2.0 16 | temporal_style: 10.0 17 | overall_consistency: 10.0 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/clip_length_short.yaml: -------------------------------------------------------------------------------- 1 | subject_consistency: 2.0 2 | background_consistency: 2.0 3 | motion_smoothness: 2.0 4 | temporal_flickering: 2.0 5 | dynamic_degree: 2.0 6 | imaging_quality: 2.0 7 | aesthetic_quality: 2.0 8 | 9 | object_class: 2.0 10 | multiple_objects: 2.0 11 | human_action: 2.0 12 | color: 2.0 13 | spatial_relationship: 2.0 14 | scene: 2.0 15 | appearance_style: 2.0 16 | temporal_style: 2.0 17 | overall_consistency: 2.0 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/slow_fast_params.yaml: -------------------------------------------------------------------------------- 1 | w_inclip_sb: 0.7 2 | w_clip2clip_sb: 0.3 3 | inclip_mean_sb: 0.9206531487463249 4 | inclip_std_sb: 0.06767633012297831 5 | clip2clip_mean_sb: 0.782773956831079 6 | clip2clip_std_sb: 0.15702951463645903 7 | 8 | 9 | w_inclip_bg: 0.8 10 | w_clip2clip_bg: 0.2 11 | inclip_mean_bg: 0.9461633887475777 12 | inclip_std_bg: 0.02029563684589086 13 | clip2clip_mean_bg: 0.8817304710164493 14 | clip2clip_std_bg: 0.0888072561860013 -------------------------------------------------------------------------------- /video_generation_evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow 2 | numpy<2.0.0 3 | matplotlib 4 | timm>=0.9 5 | wheel 6 | cython 7 | tensorboard 8 | scipy 9 | opencv-python 10 | scikit-learn 11 | scikit-image 12 | openai-clip 13 | decord 14 | requests 15 | pyyaml 16 | easydict 17 | pyiqa==0.1.10 18 | lvis 19 | fairscale>=0.4.4 20 | fvcore 21 | easydict 22 | urllib3 23 | boto3 24 | omegaconf 25 | transformers==4.33.2 26 | pycocoevalcap 27 | # detectron2@git+https://github.com/facebookresearch/detectron2.git 28 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/alt_cuda_corr/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | setup( 6 | name='correlation', 7 | ext_modules=[ 8 | CUDAExtension('alt_cuda_corr', 9 | sources=['correlation.cpp', 'correlation_kernel.cu'], 10 | extra_compile_args={'cxx': [], 'nvcc': ['-O3']}), 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | 16 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/LVIS_CenterNet2_R50_1x.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 1203 5 | SCORE_THRESH_TEST: 0.02 6 | NMS_THRESH_TEST: 0.5 7 | CENTERNET: 8 | NUM_CLASSES: 1203 9 | 10 | DATASETS: 11 | TRAIN: ("lvis_v1_train",) 12 | TEST: ("lvis_v1_val",) 13 | DATALOADER: 14 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 15 | REPEAT_THRESHOLD: 0.001 16 | TEST: 17 | DETECTIONS_PER_IMAGE: 300 18 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow==9.5.0 2 | numpy 3 | matplotlib 4 | timm==0.9.12 5 | torch==1.13.1 6 | torchvision>=0.13 7 | tensorboard 8 | scipy==1.10.1 9 | opencv-python 10 | scikit-learn 11 | requests 12 | scikit-image 13 | pyyaml 14 | easydict 15 | lvis 16 | fairscale==0.4.4 17 | openai-clip 18 | fvcore 19 | easydict 20 | decord==0.6.0 21 | pyiqa==0.1.8 22 | transformers==4.33.2 23 | pycocoevalcap 24 | wheel 25 | cython 26 | urllib3 27 | boto3 28 | omegaconf 29 | pyav 30 | av 31 | moviepy -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import clip_b16, clip_l14, clip_l14_336 2 | # from .modeling_finetune import vit_base_patch16_224, vit_base_patch16_384, vit_large_patch16_224, vit_large_patch16_384 3 | from .modeling_finetune import vit_large_patch16_224 4 | from .modeling_pretrain_umt import pretrain_umt_base_patch16_224, pretrain_umt_large_patch16_224 5 | from .modeling_pretrain import pretrain_videomae_base_patch16_224, pretrain_videomae_large_patch16_224, pretrain_videomae_huge_patch16_224 6 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/utils/build_utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import sys 4 | CUR_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.append(os.path.join(CUR_DIR, "../")) 6 | 7 | 8 | def base_build_fn(module, cls, params): 9 | return getattr(importlib.import_module( 10 | module, package=None), cls)(**params) 11 | 12 | 13 | def build_from_cfg(config): 14 | module, cls = config['name'].rsplit(".", 1) 15 | params = config.get('params', {}) 16 | return base_build_fn(module, cls, params) 17 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/LVIS_CenterNet2_R50_Fed_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | ROI_HEADS: 4 | NUM_CLASSES: 1203 5 | SCORE_THRESH_TEST: 0.02 6 | NMS_THRESH_TEST: 0.5 7 | CENTERNET: 8 | NUM_CLASSES: 1203 9 | ROI_BOX_HEAD: 10 | USE_SIGMOID_CE: True 11 | USE_FED_LOSS: True 12 | DATASETS: 13 | TRAIN: ("lvis_v1_train",) 14 | TEST: ("lvis_v1_val",) 15 | DATALOADER: 16 | SAMPLER_TRAIN: "RepeatFactorTrainingSampler" 17 | REPEAT_THRESHOLD: 0.001 18 | TEST: 19 | DETECTIONS_PER_IMAGE: 300 20 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_B_DenseCap.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["DenseCap"] 4 | TEST_TASK: "DenseCap" 5 | MASK_ON: False 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: False 8 | BEAM_SIZE: 1 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone 12 | VIT_LAYERS: 12 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.7 15 | DATASETS: 16 | TRAIN: ("vg_train",) 17 | TEST: ("vg_test",) 18 | DATALOADER: 19 | DATASET_BS: 2 20 | OUTPUT_DIR: "./output/GRiT_B_DenseCap" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_B_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet"] 4 | TEST_TASK: "ObjectDet" 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: True 8 | BEAM_SIZE: 3 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone 12 | VIT_LAYERS: 12 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.7 15 | DATASETS: 16 | TRAIN: ("GRiT_coco2017_train",) 17 | TEST: ("coco_2017_val",) 18 | DATALOADER: 19 | DATASET_BS: 2 20 | OUTPUT_DIR: "./output/GRiT_B_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_L_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet"] 4 | TEST_TASK: "ObjectDet" 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: True 8 | BEAM_SIZE: 3 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone_large 12 | VIT_LAYERS: 24 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.8 15 | DATASETS: 16 | TRAIN: ("GRiT_coco2017_train",) 17 | TEST: ("coco_2017_val",) 18 | DATALOADER: 19 | DATASET_BS: 1 20 | OUTPUT_DIR: "./output/GRiT_L_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } 22 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_H_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet"] 4 | TEST_TASK: "ObjectDet" 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: True 8 | BEAM_SIZE: 3 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone_huge 12 | VIT_LAYERS: 32 13 | SOLVER: 14 | MAX_ITER: 135000 15 | 
VIT_LAYER_DECAY_RATE: 0.9 16 | DATASETS: 17 | TRAIN: ("GRiT_coco2017_train",) 18 | TEST: ("coco_2017_val",) 19 | DATALOADER: 20 | DATASET_BS: 1 21 | OUTPUT_DIR: "./output/GRiT_H_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/cli/vbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import subprocess 4 | 5 | vbench_cmd = ['evaluate', 'static_filter'] 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser(prog="toolkit", formatter_class=argparse.RawTextHelpFormatter) 9 | subparsers = parser.add_subparsers(title='toolkit subcommands') 10 | 11 | for cmd in vbench_cmd: 12 | module = importlib.import_module(f'toolkit.cli.{cmd}') 13 | module.register_subparsers(subparsers) 14 | parser.set_defaults(func=help) 15 | args = parser.parse_args() 16 | args.func(args) 17 | 18 | def help(args): 19 | subprocess.run(['toolkit', '-h'], check=True) 20 | -------------------------------------------------------------------------------- /utils/data_types.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Union, Literal 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | 5 | class TaskType(Enum): 6 | COMPREHENSION = "comprehension" 7 | GENERATION = "generation" 8 | 9 | class ModalityType(Enum): 10 | IMAGE = "Image" 11 | VIDEO = "Video" 12 | AUDIO = "Audio" 13 | NLP = "NLP" 14 | THREE_D = "3D" 15 | 16 | @dataclass 17 | class TaskResult: 18 | task_name: str 19 | metric: str 20 | score: float 21 | task_type: TaskType = TaskType.COMPREHENSION # Default to comprehension task 22 | 23 | # Store results for all modalities 24 | ModalityResults = Dict[ModalityType, Dict[TaskType, List[TaskResult]]] -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/tag2Text/q2l_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 4, 15 | "num_hidden_layers": 2, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "add_tag_cross_attention": false 22 | } 23 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling.meta_arch.centernet_detector import CenterNetDetector 2 | from .modeling.dense_heads.centernet import CenterNet 3 | from .modeling.roi_heads.custom_roi_heads import CustomROIHeads, CustomCascadeROIHeads 4 | 5 | from .modeling.backbone.fpn_p5 import build_p67_resnet_fpn_backbone 6 | from .modeling.backbone.dla import build_dla_backbone 7 | from .modeling.backbone.dlafpn import build_dla_fpn3_backbone 8 | from .modeling.backbone.bifpn import build_resnet_bifpn_backbone 9 | from .modeling.backbone.bifpn_fcos import build_fcos_resnet_bifpn_backbone 10 | from .modeling.backbone.res2net import 
build_p67_res2net_fpn_backbone 11 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_X101-DCN_2x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | CENTERNET: 4 | USE_DEFORMABLE: True 5 | WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" 6 | PIXEL_STD: [57.375, 57.120, 58.395] 7 | RESNETS: 8 | STRIDE_IN_1X1: False 9 | NUM_GROUPS: 32 10 | WIDTH_PER_GROUP: 8 11 | DEPTH: 101 12 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 13 | DEFORM_MODULATED: True 14 | ROI_HEADS: 15 | IN_FEATURES: ["p3", "p4"] 16 | SOLVER: 17 | STEPS: (120000, 160000) 18 | MAX_ITER: 180000 19 | CHECKPOINT_PERIOD: 40000 20 | INPUT: 21 | MIN_SIZE_TRAIN: (480, 960) 22 | MIN_SIZE_TRAIN_SAMPLING: "range" 23 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base.yaml" 2 | MODEL: 3 | TRAIN_TASK: ["ObjectDet", "DenseCap"] 4 | TEST_TASK: "DenseCap" # DenseCap or ObjectDet: Choose one for testing 5 | MASK_ON: True 6 | ROI_HEADS: 7 | SOFT_NMS_ENABLED: False 8 | BEAM_SIZE: 1 9 | WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" 10 | BACKBONE: 11 | NAME: build_vit_fpn_backbone 12 | VIT_LAYERS: 12 13 | SOLVER: 14 | VIT_LAYER_DECAY_RATE: 0.7 15 | DATASETS: 16 | TRAIN: ("GRiT_coco2017_train", "vg_train") 17 | TEST: ("coco_2017_test-dev",) 18 | DATALOADER: 19 | DATASET_RATIO: [1, 1] 20 | DATASET_BS: 2 21 | DATASET_INPUT_SIZE: [1024, 1024] 22 | DATASET_INPUT_SCALE: [[0.1, 2.0], [0.1, 2.0]] 23 | OUTPUT_DIR: "./output/GRiT_B_DenseCap_ObjectDet" -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P5_640_16x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p37_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 5 8 | NUM_BIFPN: 3 9 | CENTERNET: 10 | POST_NMS_TOPK_TEST: 128 11 | WEIGHTS: '' 12 | PIXEL_MEAN: [123.675, 116.280, 103.530] 13 | PIXEL_STD: [58.395, 57.12, 57.375] 14 | FPN: 15 | IN_FEATURES: ["dla3", "dla4", "dla5"] 16 | SOLVER: 17 | LR_SCHEDULER_NAME: "WarmupCosineLR" 18 | MAX_ITER: 360000 19 | BASE_LR: 0.08 20 | IMS_PER_BATCH: 64 21 | CHECKPOINT_PERIOD: 90000 22 | TEST: 23 | EVAL_PERIOD: 7500 24 | INPUT: 25 | FORMAT: RGB 26 | CUSTOM_AUG: EfficientDetResizeCrop 27 | TRAIN_SIZE: 640 28 | MIN_SIZE_TEST: 608 29 | MAX_SIZE_TEST: 900 30 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P5_640_16x_ST.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p37_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 5 8 | NUM_BIFPN: 3 9 | CENTERNET: 10 | POST_NMS_TOPK_TEST: 128 11 | WEIGHTS: '' 12 | PIXEL_MEAN: [123.675, 116.280, 103.530] 13 | PIXEL_STD: [58.395, 57.12, 57.375] 14 | FPN: 15 | IN_FEATURES: ["dla3", "dla4", "dla5"] 16 | SOLVER: 17 | 
LR_SCHEDULER_NAME: "WarmupCosineLR" 18 | MAX_ITER: 360000 19 | BASE_LR: 0.08 20 | IMS_PER_BATCH: 64 21 | TEST: 22 | EVAL_PERIOD: 7500 23 | INPUT: 24 | FORMAT: RGB 25 | CUSTOM_AUG: EfficientDetResizeCrop 26 | TRAIN_SIZE: 640 27 | MIN_SIZE_TEST: 608 28 | MAX_SIZE_TEST: 900 29 | DATASETS: 30 | TRAIN: ("coco_2017_train","coco_un_yolov4_55_0.5",) 31 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R2-101-DCN_896_4x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p67_res2net_fpn_backbone" 5 | WEIGHTS: "output/r2_101.pkl" 6 | RESNETS: 7 | DEPTH: 101 8 | WIDTH_PER_GROUP: 26 9 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 10 | DEFORM_MODULATED: True 11 | PIXEL_MEAN: [123.675, 116.280, 103.530] 12 | PIXEL_STD: [58.395, 57.12, 57.375] 13 | CENTERNET: 14 | USE_DEFORMABLE: True 15 | ROI_HEADS: 16 | IN_FEATURES: ["p3", "p4"] 17 | INPUT: 18 | FORMAT: RGB 19 | TEST: 20 | EVAL_PERIOD: 7500 21 | SOLVER: 22 | MAX_ITER: 180000 23 | CHECKPOINT_PERIOD: 600000 24 | LR_SCHEDULER_NAME: "WarmupCosineLR" 25 | BASE_LR: 0.04 26 | IMS_PER_BATCH: 32 27 | INPUT: 28 | CUSTOM_AUG: EfficientDetResizeCrop 29 | TRAIN_SIZE: 896 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/Base-CenterNet-FPN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "CenterNetDetector" 3 | PROPOSAL_GENERATOR: 4 | NAME: "CenterNet" 5 | BACKBONE: 6 | NAME: "build_p67_resnet_fpn_backbone" 7 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | RESNETS: 9 | DEPTH: 50 10 | OUT_FEATURES: ["res3", "res4", "res5"] 11 | FPN: 12 | IN_FEATURES: ["res3", "res4", "res5"] 13 | DATASETS: 14 | TRAIN: ("coco_2017_train",) 15 | TEST: ("coco_2017_val",) 16 | SOLVER: 17 | IMS_PER_BATCH: 16 18 | BASE_LR: 0.01 19 | STEPS: (60000, 80000) 20 | MAX_ITER: 90000 21 | CHECKPOINT_PERIOD: 1000000000 22 | WARMUP_ITERS: 4000 23 | WARMUP_FACTOR: 0.00025 24 | CLIP_GRADIENTS: 25 | ENABLED: True 26 | INPUT: 27 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 28 | OUTPUT_DIR: "./output/CenterNet2/auto" 29 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-fcosBiFPN-P5_640_16x_ST.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p37_fcos_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 5 8 | NUM_BIFPN: 3 9 | CENTERNET: 10 | POST_NMS_TOPK_TEST: 128 11 | WEIGHTS: '' 12 | PIXEL_MEAN: [123.675, 116.280, 103.530] 13 | PIXEL_STD: [58.395, 57.12, 57.375] 14 | FPN: 15 | IN_FEATURES: ["dla3", "dla4", "dla5"] 16 | TEST: 17 | EVAL_PERIOD: 7500 18 | SOLVER: 19 | LR_SCHEDULER_NAME: "WarmupCosineLR" 20 | MAX_ITER: 360000 21 | BASE_LR: 0.08 22 | IMS_PER_BATCH: 64 23 | INPUT: 24 | FORMAT: RGB 25 | CUSTOM_AUG: EfficientDetResizeCrop 26 | TRAIN_SIZE: 640 27 | MIN_SIZE_TEST: 608 28 | MAX_SIZE_TEST: 900 29 | DATASETS: 30 | TRAIN: ("coco_2017_train","coco_un_yolov4_55_0.5",) 31 | -------------------------------------------------------------------------------- /utils/base_processor.py: 
-------------------------------------------------------------------------------- 1 | from typing import List 2 | from .data_types import ModalityType, TaskType, TaskResult 3 | 4 | """Base modality processor""" 5 | 6 | class BaseModalityProcessor: 7 | def __init__(self, modality: ModalityType, 8 | dataset_dir: str, 9 | pred_json_file: str): 10 | self.modality = modality 11 | self.dataset_dir = dataset_dir 12 | self.pred_json_file = pred_json_file 13 | 14 | def process_comprehension(self) -> List[TaskResult]: 15 | """Process comprehension tasks, optional implementation""" 16 | return [] 17 | 18 | def process_generation(self) -> List[TaskResult]: 19 | """Process generation tasks, optional implementation""" 20 | return [] 21 | 22 | def process(self) -> List[TaskResult]: 23 | """Process tasks without type distinction (e.g., NLP tasks)""" 24 | return [] -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R2-101-DCN-BiFPN_1280_4x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_res2net_bifpn_backbone" 5 | BIFPN: 6 | NUM_BIFPN: 7 7 | OUT_CHANNELS: 288 8 | WEIGHTS: "output/r2_101.pkl" 9 | RESNETS: 10 | DEPTH: 101 11 | WIDTH_PER_GROUP: 26 12 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 13 | DEFORM_MODULATED: True 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.12, 57.375] 16 | CENTERNET: 17 | USE_DEFORMABLE: True 18 | ROI_HEADS: 19 | IN_FEATURES: ["p3", "p4"] 20 | INPUT: 21 | FORMAT: RGB 22 | TEST: 23 | EVAL_PERIOD: 7500 24 | SOLVER: 25 | MAX_ITER: 180000 26 | CHECKPOINT_PERIOD: 60000 27 | LR_SCHEDULER_NAME: "WarmupCosineLR" 28 | BASE_LR: 0.04 29 | IMS_PER_BATCH: 32 30 | INPUT: 31 | CUSTOM_AUG: EfficientDetResizeCrop 32 | TRAIN_SIZE: 1280 33 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_R2-101-DCN-BiFPN_4x+4x_1560_ST.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_res2net_bifpn_backbone" 5 | BIFPN: 6 | NUM_BIFPN: 7 7 | OUT_CHANNELS: 288 8 | WEIGHTS: "output/r2_101.pkl" 9 | RESNETS: 10 | DEPTH: 101 11 | WIDTH_PER_GROUP: 26 12 | DEFORM_ON_PER_STAGE: [False, False, True, True] # on Res4, Res5 13 | DEFORM_MODULATED: True 14 | PIXEL_MEAN: [123.675, 116.280, 103.530] 15 | PIXEL_STD: [58.395, 57.12, 57.375] 16 | CENTERNET: 17 | USE_DEFORMABLE: True 18 | ROI_HEADS: 19 | IN_FEATURES: ["p3", "p4"] 20 | TEST: 21 | EVAL_PERIOD: 7500 22 | SOLVER: 23 | MAX_ITER: 180000 24 | CHECKPOINT_PERIOD: 7500 25 | LR_SCHEDULER_NAME: "WarmupCosineLR" 26 | BASE_LR: 0.04 27 | IMS_PER_BATCH: 32 28 | DATASETS: 29 | TRAIN: "('coco_2017_train', 'coco_un_yolov4_55_0.5')" 30 | INPUT: 31 | FORMAT: RGB 32 | CUSTOM_AUG: EfficientDetResizeCrop 33 | TRAIN_SIZE: 1280 34 | TEST_SIZE: 1560 35 | TEST_INPUT_TYPE: 'square' 36 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P3_24x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p35_fcos_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | 
NUM_LEVELS: 3 8 | NUM_BIFPN: 4 9 | DLA: 10 | NUM_LAYERS: 34 11 | NORM: "SyncBN" 12 | FPN: 13 | IN_FEATURES: ["dla3", "dla4", "dla5"] 14 | ROI_HEADS: 15 | IN_FEATURES: ["p3", "p4", "p5"] 16 | CENTERNET: 17 | POST_NMS_TOPK_TEST: 128 18 | FPN_STRIDES: [8, 16, 32] 19 | IN_FEATURES: ['p3', 'p4', 'p5'] 20 | SOI: [[0, 64], [48, 192], [128, 1000000]] 21 | DATASETS: 22 | TRAIN: ("coco_2017_train",) 23 | TEST: ("coco_2017_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.02 27 | STEPS: (300000, 340000) 28 | MAX_ITER: 360000 29 | CHECKPOINT_PERIOD: 100000 30 | WARMUP_ITERS: 4000 31 | WARMUP_FACTOR: 0.00025 32 | INPUT: 33 | MIN_SIZE_TRAIN: (256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608) 34 | MAX_SIZE_TRAIN: 900 35 | MAX_SIZE_TEST: 736 36 | MIN_SIZE_TEST: 512 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/CenterNet2_DLA-BiFPN-P3_4x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | BACKBONE: 4 | NAME: "build_p35_fcos_dla_bifpn_backbone" 5 | BIFPN: 6 | OUT_CHANNELS: 160 7 | NUM_LEVELS: 3 8 | NUM_BIFPN: 4 9 | DLA: 10 | NUM_LAYERS: 34 11 | NORM: "SyncBN" 12 | FPN: 13 | IN_FEATURES: ["dla3", "dla4", "dla5"] 14 | ROI_HEADS: 15 | IN_FEATURES: ["p3", "p4", "p5"] 16 | CENTERNET: 17 | POST_NMS_TOPK_TEST: 128 18 | FPN_STRIDES: [8, 16, 32] 19 | IN_FEATURES: ['p3', 'p4', 'p5'] 20 | SOI: [[0, 64], [48, 192], [128, 1000000]] 21 | DATASETS: 22 | TRAIN: ("coco_2017_train",) 23 | TEST: ("coco_2017_val",) 24 | SOLVER: 25 | IMS_PER_BATCH: 16 26 | BASE_LR: 0.02 27 | STEPS: (300000, 340000) 28 | MAX_ITER: 360000 29 | CHECKPOINT_PERIOD: 100000 30 | WARMUP_ITERS: 4000 31 | WARMUP_FACTOR: 0.00025 32 | INPUT: 33 | MIN_SIZE_TRAIN: (256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608) 34 | MAX_SIZE_TRAIN: 900 35 | MAX_SIZE_TEST: 736 36 | MIN_SIZE_TEST: 512 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/Base_S4_DLA.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "CenterNetDetector" 3 | PROPOSAL_GENERATOR: 4 | NAME: "CenterNet" 5 | PIXEL_STD: [57.375, 57.120, 58.395] 6 | BACKBONE: 7 | NAME: "build_dla_backbone" 8 | DLA: 9 | NORM: "BN" 10 | CENTERNET: 11 | IN_FEATURES: ["dla2"] 12 | FPN_STRIDES: [4] 13 | SOI: [[0, 1000000]] 14 | NUM_CLS_CONVS: 1 15 | NUM_BOX_CONVS: 1 16 | REG_WEIGHT: 1. 
17 | MORE_POS: True 18 | HM_FOCAL_ALPHA: 0.25 19 | DATASETS: 20 | TRAIN: ("coco_2017_train",) 21 | TEST: ("coco_2017_val",) 22 | SOLVER: 23 | LR_SCHEDULER_NAME: "WarmupCosineLR" 24 | MAX_ITER: 90000 25 | BASE_LR: 0.04 26 | IMS_PER_BATCH: 64 27 | WEIGHT_DECAY: 0.0001 28 | CHECKPOINT_PERIOD: 1000000 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | INPUT: 32 | CUSTOM_AUG: EfficientDetResizeCrop 33 | TRAIN_SIZE: 640 34 | MIN_SIZE_TEST: 608 35 | MAX_SIZE_TEST: 900 36 | TEST: 37 | EVAL_PERIOD: 7500 38 | DATALOADER: 39 | NUM_WORKERS: 8 40 | OUTPUT_DIR: "output/CenterNet2/auto" 41 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/nuImages_CenterNet2_DLA_640_8x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-CenterNet2.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | ROI_MASK_HEAD: 5 | NAME: "MaskRCNNConvUpsampleHead" 6 | NUM_CONV: 4 7 | POOLER_RESOLUTION: 14 8 | ROI_HEADS: 9 | NUM_CLASSES: 10 10 | IN_FEATURES: ["dla2"] 11 | BACKBONE: 12 | NAME: "build_dla_backbone" 13 | DLA: 14 | NORM: "BN" 15 | CENTERNET: 16 | IN_FEATURES: ["dla2"] 17 | FPN_STRIDES: [4] 18 | SOI: [[0, 1000000]] 19 | NUM_CLS_CONVS: 1 20 | NUM_BOX_CONVS: 1 21 | REG_WEIGHT: 1. 22 | MORE_POS: True 23 | HM_FOCAL_ALPHA: 0.25 24 | POST_NMS_TOPK_TEST: 128 25 | WEIGHTS: '' 26 | PIXEL_MEAN: [123.675, 116.280, 103.530] 27 | PIXEL_STD: [58.395, 57.12, 57.375] 28 | SOLVER: 29 | MAX_ITER: 180000 30 | STEPS: (120000, 160000) 31 | BASE_LR: 0.08 32 | IMS_PER_BATCH: 64 33 | INPUT: 34 | FORMAT: RGB 35 | CUSTOM_AUG: EfficientDetResizeCrop 36 | TRAIN_SIZE: 640 37 | MIN_SIZE_TEST: 608 38 | MAX_SIZE_TEST: 900 39 | MASK_FORMAT: bitmask 40 | DATASETS: 41 | TRAIN: ("nuimages_train",) 42 | TEST: ("nuimages_val",) 43 | -------------------------------------------------------------------------------- /video_generation_evaluation/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 🛠 Installation 3 | 4 | ### Install with pip 5 | ```bash 6 | pip install vbench 7 | pip install -r requirements.txt 8 | ``` 9 | 10 | To evaluate some video generation ability aspects, you need to install [detectron2](https://github.com/facebookresearch/detectron2) via: 11 | ``` 12 | pip install detectron2@git+https://github.com/facebookresearch/detectron2.git 13 | ``` 14 | 15 | If there is an error during [detectron2](https://github.com/facebookresearch/detectron2) installation, see [here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html). 
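A quick way to confirm both optional dependencies are importable before running any metrics (a minimal sketch, assuming `vbench` and detectron2 were installed into the active environment as described above):

```python
# Sanity check: both packages should import cleanly after the steps above.
import vbench       # VBench metric suite installed via pip
import detectron2   # only needed for the GRiT/detection-based metrics

print("vbench and detectron2 are importable; detectron2 version:", detectron2.__version__)
```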
16 | 17 | ### Third-party models 18 | Download the required pretrained models by following the instructions [here](pretrained/README.md) 19 | 20 | 21 | 22 | 23 | ## 🚀 Usage 24 | Configure the model and task type (T2V for text-to-video or I2V for image-to-video) in ``video_generation_evaluate_kit.py``, then run: 25 | ``` 26 | python video_generation_evaluate_kit.py 27 | ``` 28 | This script will automatically: 29 | 30 | - Generate video outputs 🖥️ 31 | 32 | - Evaluate model performance across metrics 📊 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/layers/ml_nms.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers import batched_nms 2 | 3 | 4 | def ml_nms(boxlist, nms_thresh, max_proposals=-1, 5 | score_field="scores", label_field="labels"): 6 | """ 7 | Performs non-maximum suppression on a boxlist, with scores specified 8 | in a boxlist field via score_field. 9 | Arguments: 10 | boxlist(BoxList) 11 | nms_thresh (float) 12 | max_proposals (int): if > 0, then only the top max_proposals are kept 13 | after non-maximum suppression 14 | score_field (str) 15 | """ 16 | if nms_thresh <= 0: 17 | return boxlist 18 | if boxlist.has('pred_boxes'): 19 | boxes = boxlist.pred_boxes.tensor 20 | labels = boxlist.pred_classes 21 | else: 22 | boxes = boxlist.proposal_boxes.tensor 23 | labels = boxlist.proposal_boxes.tensor.new_zeros( 24 | len(boxlist.proposal_boxes.tensor)) 25 | scores = boxlist.scores 26 | 27 | keep = batched_nms(boxes, scores, labels, nms_thresh) 28 | if max_proposals > 0: 29 | keep = keep[: max_proposals] 30 | boxlist = boxlist[keep] 31 | return boxlist 32 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/roi_heads/fed_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import numpy as np 4 | from torch.nn import functional as F 5 | 6 | def load_class_freq( 7 | path='datasets/lvis/lvis_v1_train_cat_info.json', 8 | freq_weight=0.5): 9 | cat_info = json.load(open(path, 'r')) 10 | cat_info = torch.tensor( 11 | [c['image_count'] for c in sorted(cat_info, key=lambda x: x['id'])]) 12 | freq_weight = cat_info.float() ** freq_weight 13 | return freq_weight 14 | 15 | def get_fed_loss_inds( 16 | gt_classes, num_sample_cats=50, C=1203, \ 17 | weight=None, fed_cls_inds=-1): 18 | appeared = torch.unique(gt_classes) # C' 19 | prob = appeared.new_ones(C + 1).float() 20 | prob[-1] = 0 21 | if len(appeared) < num_sample_cats: 22 | if weight is not None: 23 | prob[:C] = weight.float().clone() 24 | prob[appeared] = 0 25 | if fed_cls_inds > 0: 26 | prob[fed_cls_inds:] = 0 27 | more_appeared = torch.multinomial( 28 | prob, num_sample_cats - len(appeared), 29 | replacement=False) 30 | appeared = torch.cat([appeared, more_appeared]) 31 | return appeared -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-S_gopro.yaml: -------------------------------------------------------------------------------- 1 | exp_name: wofloloss_400epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 400 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name:
networks.AMT-S.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 3 18 | 19 | data: 20 | train: 21 | name: datasets.gopro_datasets.GoPro_Train_Dataset 22 | params: 23 | dataset_dir: data/GOPRO 24 | val: 25 | name: datasets.gopro_datasets.GoPro_Test_Dataset 26 | params: 27 | dataset_dir: data/GOPRO 28 | train_loader: 29 | batch_size: 24 30 | num_workers: 12 31 | val_loader: 32 | batch_size: 24 33 | num_workers: 3 34 | 35 | logger: 36 | use_wandb: false 37 | resume_id: null 38 | 39 | losses: 40 | - { 41 | name: losses.loss.CharbonnierLoss, 42 | nickname: l_rec, 43 | params: { 44 | loss_weight: 1.0, 45 | keys: [imgt_pred, imgt] 46 | } 47 | } 48 | - { 49 | name: losses.loss.TernaryLoss, 50 | nickname: l_ter, 51 | params: { 52 | loss_weight: 1.0, 53 | keys: [imgt_pred, imgt] 54 | } 55 | } 56 | 57 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/competition_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torchvision.io as tvio 3 | import torch 4 | 5 | def transform_to_videos(input_path, output_path, frame_rate): 6 | if not os.path.exists(output_path): 7 | os.makedirs(output_path) 8 | 9 | for root, dirs, files in os.walk(input_path): 10 | for directory in dirs: 11 | 12 | dir_path = os.path.join(root, directory) 13 | image_files = [f for f in os.listdir(dir_path) if f.endswith('.png')] 14 | if not image_files: 15 | continue # Skip if there are no image files in the directory 16 | 17 | image_files.sort() 18 | 19 | frames = [] 20 | for image_file in image_files: 21 | image_path = os.path.join(dir_path, image_file) 22 | frame = tvio.read_image(image_path) 23 | frames.append(frame) 24 | frames = torch.stack(frames).permute(0, 2, 3, 1) 25 | 26 | # Write the frames to video 27 | video_path = os.path.join(output_path, f"{directory}.mp4") 28 | tvio.write_video(video_path, frames, fps=frame_rate) 29 | 30 | print(f"Videos are saved in '{output_path}'") 31 | 32 | 33 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/speed_parameters.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import torch 4 | import argparse 5 | from omegaconf import OmegaConf 6 | 7 | sys.path.append('.') 8 | from utils.build_utils import build_from_cfg 9 | 10 | parser = argparse.ArgumentParser( 11 | prog = 'AMT', 12 | description = 'Speed¶meter benchmark', 13 | ) 14 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 15 | args = parser.parse_args() 16 | 17 | cfg_path = args.config 18 | network_cfg = OmegaConf.load(cfg_path).network 19 | model = build_from_cfg(network_cfg) 20 | model = model.cuda() 21 | model.eval() 22 | 23 | img0 = torch.randn(1, 3, 256, 448).cuda() 24 | img1 = torch.randn(1, 3, 256, 448).cuda() 25 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).cuda() 26 | 27 | with torch.no_grad(): 28 | for i in range(100): 29 | out = model(img0, img1, embt, eval=True) 30 | torch.cuda.synchronize() 31 | time_stamp = time.time() 32 | for i in range(1000): 33 | out = model(img0, img1, embt, eval=True) 34 | torch.cuda.synchronize() 35 | print('Time: {:.5f}s'.format((time.time() - time_stamp) / 1)) 36 | 37 | total = sum([param.nelement() for param in model.parameters()]) 38 | print('Parameters: {:.2f}M'.format(total / 1e6)) 39 | 
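# Usage note (not from the upstream script): the loop above warms up with 100 forward
# passes and then times 1000 more, so the printed 'Time' is the total wall-clock time
# for 1000 runs as written, not a per-iteration average. A typical invocation, assuming
# it is launched from the amt/ third-party directory with a CUDA device available:
#
#   python benchmarks/speed_parameters.py -c cfgs/AMT-S.yaml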
-------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/dense_heads/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch import nn 4 | from detectron2.utils.comm import get_world_size 5 | from detectron2.structures import pairwise_iou, Boxes 6 | # from .data import CenterNetCrop 7 | import torch.nn.functional as F 8 | import numpy as np 9 | from detectron2.structures import Boxes, ImageList, Instances 10 | 11 | __all__ = ['reduce_sum', '_transpose'] 12 | 13 | INF = 1000000000 14 | 15 | def _transpose(training_targets, num_loc_list): 16 | ''' 17 | This function is used to transpose image first training targets to 18 | level first ones 19 | :return: level first training targets 20 | ''' 21 | for im_i in range(len(training_targets)): 22 | training_targets[im_i] = torch.split( 23 | training_targets[im_i], num_loc_list, dim=0) 24 | 25 | targets_level_first = [] 26 | for targets_per_level in zip(*training_targets): 27 | targets_level_first.append( 28 | torch.cat(targets_per_level, dim=0)) 29 | return targets_level_first 30 | 31 | 32 | def reduce_sum(tensor): 33 | world_size = get_world_size() 34 | if world_size < 2: 35 | return tensor 36 | tensor = tensor.clone() 37 | torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) 38 | return tensor -------------------------------------------------------------------------------- /video_generation_evaluation/pretrained/amt_model/AMT-S.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-S.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 3 18 | 19 | data: 20 | train: 21 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 22 | params: 23 | dataset_dir: data/vimeo_triplet 24 | val: 25 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 26 | params: 27 | dataset_dir: data/vimeo_triplet 28 | train_loader: 29 | batch_size: 24 30 | num_workers: 12 31 | val_loader: 32 | batch_size: 24 33 | num_workers: 3 34 | 35 | logger: 36 | use_wandb: false 37 | resume_id: null 38 | 39 | losses: 40 | - { 41 | name: losses.loss.CharbonnierLoss, 42 | nickname: l_rec, 43 | params: { 44 | loss_weight: 1.0, 45 | keys: [imgt_pred, imgt] 46 | } 47 | } 48 | - { 49 | name: losses.loss.TernaryLoss, 50 | nickname: l_ter, 51 | params: { 52 | loss_weight: 1.0, 53 | keys: [imgt_pred, imgt] 54 | } 55 | } 56 | - { 57 | name: losses.loss.MultipleFlowLoss, 58 | nickname: l_flo, 59 | params: { 60 | loss_weight: 0.002, 61 | keys: [flow0_pred, flow1_pred, flow] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-G.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr1p5e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 1.5e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-G.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 5 18 | data: 19 | train: 
20 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 21 | params: 22 | dataset_dir: data/vimeo_triplet 23 | val: 24 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 25 | params: 26 | dataset_dir: data/vimeo_triplet 27 | train_loader: 28 | batch_size: 24 29 | num_workers: 12 30 | val_loader: 31 | batch_size: 24 32 | num_workers: 3 33 | 34 | logger: 35 | use_wandb: true 36 | resume_id: null 37 | 38 | losses: 39 | - { 40 | name: losses.loss.CharbonnierLoss, 41 | nickname: l_rec, 42 | params: { 43 | loss_weight: 1.0, 44 | keys: [imgt_pred, imgt] 45 | } 46 | } 47 | - { 48 | name: losses.loss.TernaryLoss, 49 | nickname: l_ter, 50 | params: { 51 | loss_weight: 1.0, 52 | keys: [imgt_pred, imgt] 53 | } 54 | } 55 | - { 56 | name: losses.loss.MultipleFlowLoss, 57 | nickname: l_flo, 58 | params: { 59 | loss_weight: 0.005, 60 | keys: [flow0_pred, flow1_pred, flow] 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-L.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-L.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 5 18 | data: 19 | train: 20 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 21 | params: 22 | dataset_dir: data/vimeo_triplet 23 | val: 24 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 25 | params: 26 | dataset_dir: data/vimeo_triplet 27 | train_loader: 28 | batch_size: 24 29 | num_workers: 12 30 | val_loader: 31 | batch_size: 24 32 | num_workers: 3 33 | 34 | logger: 35 | use_wandb: true 36 | resume_id: null 37 | 38 | losses: 39 | - { 40 | name: losses.loss.CharbonnierLoss, 41 | nickname: l_rec, 42 | params: { 43 | loss_weight: 1.0, 44 | keys: [imgt_pred, imgt] 45 | } 46 | } 47 | - { 48 | name: losses.loss.TernaryLoss, 49 | nickname: l_ter, 50 | params: { 51 | loss_weight: 1.0, 52 | keys: [imgt_pred, imgt] 53 | } 54 | } 55 | - { 56 | name: losses.loss.MultipleFlowLoss, 57 | nickname: l_flo, 58 | params: { 59 | loss_weight: 0.002, 60 | keys: [flow0_pred, flow1_pred, flow] 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/AMT-S.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_300epoch_bs24_lr2e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 2e-4 6 | lr_min: 2e-5 7 | weight_decay: 0.0 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.AMT-S.Model 14 | params: 15 | corr_radius: 3 16 | corr_lvls: 4 17 | num_flows: 3 18 | 19 | data: 20 | train: 21 | name: datasets.vimeo_datasets.Vimeo90K_Train_Dataset 22 | params: 23 | dataset_dir: data/vimeo_triplet 24 | val: 25 | name: datasets.vimeo_datasets.Vimeo90K_Test_Dataset 26 | params: 27 | dataset_dir: data/vimeo_triplet 28 | train_loader: 29 | batch_size: 24 30 | num_workers: 12 31 | val_loader: 32 | batch_size: 24 33 | num_workers: 3 34 | 35 | logger: 36 | use_wandb: false 37 | resume_id: null 38 | 39 | losses: 40 | - { 41 | name: losses.loss.CharbonnierLoss, 42 | nickname: l_rec, 43 | params: { 44 | loss_weight: 1.0, 45 | keys: [imgt_pred, imgt] 46 | } 47 | } 
48 | - { 49 | name: losses.loss.TernaryLoss, 50 | nickname: l_ter, 51 | params: { 52 | loss_weight: 1.0, 53 | keys: [imgt_pred, imgt] 54 | } 55 | } 56 | - { 57 | name: losses.loss.MultipleFlowLoss, 58 | nickname: l_flo, 59 | params: { 60 | loss_weight: 0.002, 61 | keys: [flow0_pred, flow1_pred, flow] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/cfgs/IFRNet.yaml: -------------------------------------------------------------------------------- 1 | exp_name: floloss1e-2_geoloss1e-2_300epoch_bs24_lr1e-4 2 | seed: 2023 3 | epochs: 300 4 | distributed: true 5 | lr: 1e-4 6 | lr_min: 1e-5 7 | weight_decay: 1e-6 8 | resume_state: null 9 | save_dir: work_dir 10 | eval_interval: 1 11 | 12 | network: 13 | name: networks.IFRNet.Model 14 | 15 | data: 16 | train: 17 | name: datasets.datasets.Vimeo90K_Train_Dataset 18 | params: 19 | dataset_dir: data/vimeo_triplet 20 | val: 21 | name: datasets.datasets.Vimeo90K_Test_Dataset 22 | params: 23 | dataset_dir: data/vimeo_triplet 24 | train_loader: 25 | batch_size: 24 26 | num_workers: 12 27 | val_loader: 28 | batch_size: 24 29 | num_workers: 3 30 | 31 | logger: 32 | use_wandb: true 33 | resume_id: null 34 | 35 | losses: 36 | - { 37 | name: losses.loss.CharbonnierLoss, 38 | nickname: l_rec, 39 | params: { 40 | loss_weight: 1.0, 41 | keys: [imgt_pred, imgt] 42 | } 43 | } 44 | - { 45 | name: losses.loss.TernaryLoss, 46 | nickname: l_ter, 47 | params: { 48 | loss_weight: 1.0, 49 | keys: [imgt_pred, imgt] 50 | } 51 | } 52 | - { 53 | name: losses.loss.IFRFlowLoss, 54 | nickname: l_flo, 55 | params: { 56 | loss_weight: 0.01, 57 | keys: [flow0_pred, flow1_pred, flow] 58 | } 59 | } 60 | - { 61 | name: losses.loss.GeometryLoss, 62 | nickname: l_geo, 63 | params: { 64 | loss_weight: 0.01, 65 | keys: [ft_pred, ft_gt] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, princeton-vl 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/alt_cuda_corr/correlation.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <vector> 3 | 4 | // CUDA forward declarations 5 | std::vector<torch::Tensor> corr_cuda_forward( 6 | torch::Tensor fmap1, 7 | torch::Tensor fmap2, 8 | torch::Tensor coords, 9 | int radius); 10 | 11 | std::vector<torch::Tensor> corr_cuda_backward( 12 | torch::Tensor fmap1, 13 | torch::Tensor fmap2, 14 | torch::Tensor coords, 15 | torch::Tensor corr_grad, 16 | int radius); 17 | 18 | // C++ interface 19 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 20 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 21 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 22 | 23 | std::vector<torch::Tensor> corr_forward( 24 | torch::Tensor fmap1, 25 | torch::Tensor fmap2, 26 | torch::Tensor coords, 27 | int radius) { 28 | CHECK_INPUT(fmap1); 29 | CHECK_INPUT(fmap2); 30 | CHECK_INPUT(coords); 31 | 32 | return corr_cuda_forward(fmap1, fmap2, coords, radius); 33 | } 34 | 35 | 36 | std::vector<torch::Tensor> corr_backward( 37 | torch::Tensor fmap1, 38 | torch::Tensor fmap2, 39 | torch::Tensor coords, 40 | torch::Tensor corr_grad, 41 | int radius) { 42 | CHECK_INPUT(fmap1); 43 | CHECK_INPUT(fmap2); 44 | CHECK_INPUT(coords); 45 | CHECK_INPUT(corr_grad); 46 | 47 | return corr_cuda_backward(fmap1, fmap2, coords, corr_grad, radius); 48 | } 49 | 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", &corr_forward, "CORR forward"); 53 | m.def("backward", &corr_backward, "CORR backward"); 54 | } -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/configs/Base-CenterNet2.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | PROPOSAL_GENERATOR: 4 | NAME: "CenterNet" 5 | BACKBONE: 6 | NAME: "build_p67_resnet_fpn_backbone" 7 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 8 | RESNETS: 9 | DEPTH: 50 10 | OUT_FEATURES: ["res3", "res4", "res5"] 11 | FPN: 12 | IN_FEATURES: ["res3", "res4", "res5"] 13 | ROI_HEADS: 14 | NAME: CustomCascadeROIHeads 15 | IN_FEATURES: ["p3", "p4", "p5", "p6", "p7"] 16 | IOU_THRESHOLDS: [0.6] 17 | NMS_THRESH_TEST: 0.7 18 | ROI_BOX_CASCADE_HEAD: 19 | IOUS: [0.6, 0.7, 0.8] 20 | ROI_BOX_HEAD: 21 | NAME: "FastRCNNConvFCHead" 22 | NUM_FC: 2 23 | POOLER_RESOLUTION: 7 24 | CLS_AGNOSTIC_BBOX_REG: True 25 | MULT_PROPOSAL_SCORE: True 26 | CENTERNET: 27 | REG_WEIGHT: 1.
28 | NOT_NORM_REG: True 29 | ONLY_PROPOSAL: True 30 | WITH_AGN_HM: True 31 | INFERENCE_TH: 0.0001 32 | PRE_NMS_TOPK_TRAIN: 4000 33 | POST_NMS_TOPK_TRAIN: 2000 34 | PRE_NMS_TOPK_TEST: 1000 35 | POST_NMS_TOPK_TEST: 256 36 | NMS_TH_TRAIN: 0.9 37 | NMS_TH_TEST: 0.9 38 | POS_WEIGHT: 0.5 39 | NEG_WEIGHT: 0.5 40 | IGNORE_HIGH_FP: 0.85 41 | DATASETS: 42 | TRAIN: ("coco_2017_train",) 43 | TEST: ("coco_2017_val",) 44 | SOLVER: 45 | IMS_PER_BATCH: 16 46 | BASE_LR: 0.02 47 | STEPS: (60000, 80000) 48 | MAX_ITER: 90000 49 | CHECKPOINT_PERIOD: 1000000000 50 | WARMUP_ITERS: 4000 51 | WARMUP_FACTOR: 0.00025 52 | CLIP_GRADIENTS: 53 | ENABLED: True 54 | INPUT: 55 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 56 | OUTPUT_DIR: "./output/CenterNet2/auto" 57 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | 5 | def get_world_size(): 6 | """Find OMPI world size without calling mpi functions 7 | :rtype: int 8 | """ 9 | if os.environ.get('PMI_SIZE') is not None: 10 | return int(os.environ.get('PMI_SIZE') or 1) 11 | elif os.environ.get('OMPI_COMM_WORLD_SIZE') is not None: 12 | return int(os.environ.get('OMPI_COMM_WORLD_SIZE') or 1) 13 | else: 14 | return torch.cuda.device_count() 15 | 16 | 17 | def get_global_rank(): 18 | """Find OMPI world rank without calling mpi functions 19 | :rtype: int 20 | """ 21 | if os.environ.get('PMI_RANK') is not None: 22 | return int(os.environ.get('PMI_RANK') or 0) 23 | elif os.environ.get('OMPI_COMM_WORLD_RANK') is not None: 24 | return int(os.environ.get('OMPI_COMM_WORLD_RANK') or 0) 25 | else: 26 | return 0 27 | 28 | 29 | def get_local_rank(): 30 | """Find OMPI local rank without calling mpi functions 31 | :rtype: int 32 | """ 33 | if os.environ.get('MPI_LOCALRANKID') is not None: 34 | return int(os.environ.get('MPI_LOCALRANKID') or 0) 35 | elif os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') is not None: 36 | return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') or 0) 37 | else: 38 | return 0 39 | 40 | 41 | def get_master_ip(): 42 | if os.environ.get('AZ_BATCH_MASTER_NODE') is not None: 43 | return os.environ.get('AZ_BATCH_MASTER_NODE').split(':')[0] 44 | elif os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE') is not None: 45 | return os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE') 46 | else: 47 | return "127.0.0.1" 48 | 49 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from .grit_src.image_dense_captions import image_caption_api, init_demo, dense_pred_to_caption, dense_pred_to_caption_only_name,dense_pred_to_caption_tuple 5 | from detectron2.data.detection_utils import read_image 6 | 7 | class DenseCaptioning(): 8 | def __init__(self, device): 9 | self.device = device 10 | self.demo = None 11 | 12 | 13 | def initialize_model(self, model_weight): 14 | self.demo = init_demo(self.device, model_weight=model_weight) 15 | 16 | def initialize_model_det(self, model_weight): 17 | self.demo = init_demo(self.device, model_weight = model_weight, task="ObjectDet") 18 | 19 | def image_dense_caption(self, image_src): 20 | dense_caption = image_caption_api(image_src, self.device) 21 | print('\033[1;35m' + '*' * 100 + '\033[0m') 22 | print("Step2, Dense Caption:\n") 
23 | print(dense_caption) 24 | print('\033[1;35m' + '*' * 100 + '\033[0m') 25 | return dense_caption 26 | 27 | def run_caption_api(self,image_src): 28 | img = read_image(image_src, format="BGR") 29 | print(img.shape) 30 | predictions, visualized_output = self.demo.run_on_image(img) 31 | new_caption = dense_pred_to_caption_only_name(predictions) 32 | return new_caption 33 | 34 | def run_caption_tensor(self,img): 35 | predictions, visualized_output = self.demo.run_on_image(img) 36 | new_caption = dense_pred_to_caption_tuple(predictions) 37 | return new_caption, visualized_output 38 | 39 | def run_det_tensor(self,img): 40 | predictions, visualized_output = self.demo.run_on_image(img) 41 | return predictions, visualized_output 42 | 43 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/custom_build_augmentation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from detectron2.data import transforms as T 3 | from .transforms.custom_augmentation_impl import EfficientDetResizeCrop 4 | 5 | 6 | def build_custom_augmentation(cfg, is_train, scale=None, size=None, \ 7 | min_size=None, max_size=None): 8 | """ 9 | Create a list of default :class:`Augmentation` from config. 10 | Now it includes resizing and flipping. 11 | 12 | Returns: 13 | list[Augmentation] 14 | """ 15 | if cfg.INPUT.CUSTOM_AUG == 'ResizeShortestEdge': 16 | if is_train: 17 | min_size = cfg.INPUT.MIN_SIZE_TRAIN if min_size is None else min_size 18 | max_size = cfg.INPUT.MAX_SIZE_TRAIN if max_size is None else max_size 19 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 20 | else: 21 | min_size = cfg.INPUT.MIN_SIZE_TEST 22 | max_size = cfg.INPUT.MAX_SIZE_TEST 23 | sample_style = "choice" 24 | augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)] 25 | elif cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop': 26 | if is_train: 27 | scale = cfg.INPUT.SCALE_RANGE if scale is None else scale 28 | size = cfg.INPUT.TRAIN_SIZE if size is None else size 29 | else: 30 | scale = (1, 1) 31 | size = cfg.INPUT.TEST_SIZE 32 | augmentation = [EfficientDetResizeCrop(size, scale)] 33 | else: 34 | assert 0, cfg.INPUT.CUSTOM_AUG 35 | 36 | if is_train: 37 | augmentation.append(T.RandomFlip()) 38 | return augmentation 39 | 40 | 41 | build_custom_transform_gen = build_custom_augmentation 42 | """ 43 | Alias for backward-compatibility. 
44 | """ -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | 3 | 4 | def add_grit_config(cfg): 5 | _C = cfg 6 | 7 | _C.MODEL.BEAM_SIZE = 1 8 | _C.MODEL.TRAIN_TASK = ["ObjectDet", "DenseCap"] 9 | _C.MODEL.TEST_TASK = "DenseCap" # This can be varied if the model is jointly trained on multiple tasks 10 | 11 | _C.MODEL.ROI_BOX_HEAD.USE_BIAS = 0.0 # >= 0: not use 12 | _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False 13 | 14 | _C.MODEL.ROI_HEADS.MASK_WEIGHT = 1.0 15 | _C.MODEL.ROI_HEADS.OBJECT_FEAT_POOLER_RES = 14 16 | _C.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False 17 | 18 | # Backbones 19 | _C.MODEL.VIT_LAYERS = 12 20 | 21 | # Text Decoder 22 | _C.TEXT_DECODER = CN() 23 | _C.TEXT_DECODER.VOCAB_SIZE = 30522 24 | _C.TEXT_DECODER.HIDDEN_SIZE = 768 25 | _C.TEXT_DECODER.NUM_LAYERS = 6 26 | _C.TEXT_DECODER.ATTENTION_HEADS = 12 27 | _C.TEXT_DECODER.FEEDFORWARD_SIZE = 768 * 4 28 | 29 | # Multi-dataset dataloader 30 | _C.DATALOADER.DATASET_RATIO = [1, 1] # sample ratio 31 | _C.DATALOADER.DATASET_BS = 1 32 | _C.DATALOADER.DATASET_INPUT_SIZE = [1024, 1024] 33 | _C.DATALOADER.DATASET_INPUT_SCALE = [(0.1, 2.0), (0.1, 2.0)] 34 | _C.DATALOADER.DATASET_MIN_SIZES = [(640, 800), (640, 800)] 35 | _C.DATALOADER.DATASET_MAX_SIZES = [1333, 1333] 36 | 37 | _C.SOLVER.USE_CUSTOM_SOLVER = True 38 | _C.SOLVER.OPTIMIZER = 'ADAMW' 39 | _C.SOLVER.VIT_LAYER_DECAY = True 40 | _C.SOLVER.VIT_LAYER_DECAY_RATE = 0.7 41 | 42 | _C.INPUT.CUSTOM_AUG = 'EfficientDetResizeCrop' 43 | _C.INPUT.TRAIN_SIZE = 1024 44 | _C.INPUT.TEST_SIZE = 1024 45 | _C.INPUT.SCALE_RANGE = (0.1, 2.) 
46 | # 'default' for fixed short / long edge 47 | _C.INPUT.TEST_INPUT_TYPE = 'default' 48 | 49 | _C.FIND_UNUSED_PARAM = True 50 | _C.USE_ACT_CHECKPOINT = True 51 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/datasets/masking_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class TubeMaskingGenerator: 5 | def __init__(self, input_size, mask_ratio): 6 | self.frames, self.height, self.width = input_size 7 | self.num_patches_per_frame = self.height * self.width 8 | self.total_patches = self.frames * self.num_patches_per_frame 9 | self.num_masks_per_frame = int(mask_ratio * self.num_patches_per_frame) 10 | self.total_masks = self.frames * self.num_masks_per_frame 11 | 12 | def __repr__(self): 13 | repr_str = "Maks: total patches {}, mask patches {}".format( 14 | self.total_patches, self.total_masks 15 | ) 16 | return repr_str 17 | 18 | def __call__(self): 19 | mask_per_frame = np.hstack([ 20 | np.zeros(self.num_patches_per_frame - self.num_masks_per_frame), 21 | np.ones(self.num_masks_per_frame), 22 | ]) 23 | np.random.shuffle(mask_per_frame) 24 | mask = np.tile(mask_per_frame, (self.frames, 1)).flatten() 25 | return mask 26 | 27 | 28 | class RandomMaskingGenerator: 29 | def __init__(self, input_size, mask_ratio): 30 | if not isinstance(input_size, tuple): 31 | input_size = (input_size, ) * 3 32 | 33 | self.frames, self.height, self.width = input_size 34 | 35 | self.num_patches = self.frames * self.height * self.width # 8x14x14 36 | self.num_mask = int(mask_ratio * self.num_patches) 37 | 38 | def __repr__(self): 39 | repr_str = "Maks: total patches {}, mask patches {}".format( 40 | self.num_patches, self.num_mask) 41 | return repr_str 42 | 43 | def __call__(self): 44 | mask = np.hstack([ 45 | np.zeros(self.num_patches - self.num_mask), 46 | np.ones(self.num_mask), 47 | ]) 48 | np.random.shuffle(mask) 49 | return mask # [196*8] 50 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/gopro.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | from omegaconf import OmegaConf 7 | 8 | sys.path.append('.') 9 | from utils.build_utils import build_from_cfg 10 | from datasets.gopro_datasets import GoPro_Test_Dataset 11 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 12 | 13 | parser = argparse.ArgumentParser( 14 | prog = 'AMT', 15 | description = 'GOPRO evaluation', 16 | ) 17 | parser.add_argument('-c', '--config', default='cfgs/AMT-S_gopro.yaml') 18 | parser.add_argument('-p', '--ckpt', default='pretrained/gopro_amt-s.pth',) 19 | parser.add_argument('-r', '--root', default='data/GOPRO',) 20 | args = parser.parse_args() 21 | 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | cfg_path = args.config 24 | ckpt_path = args.ckpt 25 | root = args.root 26 | 27 | network_cfg = OmegaConf.load(cfg_path).network 28 | network_name = network_cfg.name 29 | model = build_from_cfg(network_cfg) 30 | ckpt = torch.load(ckpt_path) 31 | model.load_state_dict(ckpt['state_dict']) 32 | model = model.to(device) 33 | model.eval() 34 | 35 | dataset = GoPro_Test_Dataset(dataset_dir=root) 36 | 37 | psnr_list = [] 38 | ssim_list = [] 39 | pbar = tqdm.tqdm(dataset, total=len(dataset)) 40 | for data in pbar: 
41 | input_dict = {} 42 | for k, v in data.items(): 43 | input_dict[k] = v.to(device).unsqueeze(0) 44 | with torch.no_grad(): 45 | imgt_pred = model(**input_dict)['imgt_pred'] 46 | psnr = calculate_psnr(imgt_pred, input_dict['imgt']) 47 | ssim = calculate_ssim(imgt_pred, input_dict['imgt']) 48 | psnr_list.append(psnr) 49 | ssim_list.append(ssim) 50 | avg_psnr = np.mean(psnr_list) 51 | avg_ssim = np.mean(ssim_list) 52 | desc_str = f'[{network_name}/GOPRO] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 53 | pbar.set_description_str(desc_str) 54 | 55 | 56 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/adobe240.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | from omegaconf import OmegaConf 7 | 8 | sys.path.append('.') 9 | from utils.build_utils import build_from_cfg 10 | from datasets.adobe_datasets import Adobe240_Dataset 11 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 12 | 13 | parser = argparse.ArgumentParser( 14 | prog = 'AMT', 15 | description = 'Adobe240 evaluation', 16 | ) 17 | parser.add_argument('-c', '--config', default='cfgs/AMT-S_gopro.yaml') 18 | parser.add_argument('-p', '--ckpt', default='pretrained/gopro_amt-s.pth',) 19 | parser.add_argument('-r', '--root', default='data/Adobe240/test_frames',) 20 | args = parser.parse_args() 21 | 22 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 23 | cfg_path = args.config 24 | ckpt_path = args.ckpt 25 | root = args.root 26 | 27 | network_cfg = OmegaConf.load(cfg_path).network 28 | network_name = network_cfg.name 29 | model = build_from_cfg(network_cfg) 30 | ckpt = torch.load(ckpt_path) 31 | model.load_state_dict(ckpt['state_dict']) 32 | model = model.to(device) 33 | model.eval() 34 | 35 | dataset = Adobe240_Dataset(dataset_dir=root, augment=False) 36 | 37 | psnr_list = [] 38 | ssim_list = [] 39 | pbar = tqdm.tqdm(dataset, total=len(dataset)) 40 | for data in pbar: 41 | input_dict = {} 42 | for k, v in data.items(): 43 | input_dict[k] = v.to(device).unsqueeze(0) 44 | with torch.no_grad(): 45 | imgt_pred = model(**input_dict)['imgt_pred'] 46 | psnr = calculate_psnr(imgt_pred, input_dict['imgt']) 47 | ssim = calculate_ssim(imgt_pred, input_dict['imgt']) 48 | psnr_list.append(psnr) 49 | ssim_list.append(ssim) 50 | avg_psnr = np.mean(psnr_list) 51 | avg_ssim = np.mean(ssim_list) 52 | desc_str = f'[{network_name}/Adobe240] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 53 | pbar.set_description_str(desc_str) 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | DATASET_DIR=General-Bench-Openset 4 | NLP_MODEL_NAME=Qwen/Qwen2.5-7B-Instruct 5 | AUDIO_MODEL_NAME=Qwen/Qwen2-Audio-7B-Instruct 6 | VIDEO_MODEL_NAME=Qwen/Qwen2.5-VL-3B-Instruct 7 | IMAGE_MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct 8 | 3D_MODEL_NAME=Qwen/Qwen2.5-3B-Instruct 9 | 10 | # 解析 step 参数 11 | STEP="123" 12 | for arg in "$@"; do 13 | case $arg in 14 | --step=*) 15 | STEP="${arg#*=}" 16 | ;; 17 | --step) 18 | shift 19 | STEP="$1" 20 | ;; 21 | esac 22 | done 23 | 24 | contains_step() { 25 | case "$STEP" in 26 | *$1*) return 0 ;; 27 | *) return 1 ;; 28 | esac 29 | } 30 | 31 | # Step1: Generate predictions for NLP, Image, Audio, Video, 3D tasks 32 | 
if contains_step 1; then 33 | # NLP 34 | python predictors/nlp_predictor.py --dataset_dir ${DATASET_DIR}/nlp --model_name ${NLP_MODEL_NAME} 35 | 36 | # Audio 37 | python predictors/audio_predict_comprehension.py -m Qwen/Qwen2-Audio-7B-Instruct -d ${DATASET_DIR}/audio/comprehension/ -o ${DATASET_DIR}/audio/predictions/comprehension/ -t AccentClassification AccentSexClassification 38 | python predictors/audio_predict_generation.py -m SpeechGPT -d ${DATASET_DIR}/audio/generation/ -o ${DATASET_DIR}/audio/predictions/generation/ -t SingleCaptionToAudio VideoToAudio ImageToSpeech 39 | 40 | # Video 41 | python predictors/video_comprehension_tasks.py 42 | python predictors/video_comprehension_flow_matching_tracking.py 43 | python predictors/video_comprehension_qa_caption.py 44 | python predictors/video_translation_restoration_superresolution_objectdetection.py 45 | python predictors/video_generation_evaluate_kit.py 46 | fi 47 | 48 | MODEL_NAME=Qwen2.5-7B-Instruct 49 | # Step2: Obtain the score for each task 50 | if contains_step 2; then 51 | python register.py -d ${DATASET_DIR} -t references/template_result.xlsx -o outcome -m ${MODEL_NAME} -p prediction.json 52 | fi 53 | 54 | MODEL_NAME=Qwen2.5-7B-Instruct 55 | # Step3: Obtain the Level score 56 | if contains_step 3; then 57 | python ranker.py -p outcome/${MODEL_NAME}_result.xlsx -m ${MODEL_NAME} 58 | fi -------------------------------------------------------------------------------- /utils/special_metrix.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | def _sigmoid(x): 5 | return 1 / (1 + math.exp(-x)) 6 | 7 | 8 | def _2_sigmoid_minus_1(x): 9 | return 2 * _sigmoid(x) - 1 10 | 11 | def _tanh(x): 12 | return math.tanh(x) 13 | 14 | 15 | # mapping param for special metrix 16 | special_metric_dict = { 17 | # with T 18 | 'MAE': 50, 19 | 'RMS': 50, 20 | 'MSE': 5, 21 | 'RMSE': 5, 22 | 'ABSREL': 0.1, 23 | 'EPE': 1, 24 | 'FID': 25, 25 | 'FVD': 100, 26 | 'FAD': 10, 27 | 'PSNR': 1 / 20, # higher is better 28 | 'SAD': 10, 29 | 'RTE': 0.5, 30 | 'CD': 1, 31 | 'MCD': 5, 32 | # without T 33 | 'WER': None, 34 | 'MS-SSIM': None, 35 | 'MOS': None, 36 | } 37 | 38 | HIGHER_IS_BETTER = [ 39 | 'PSNR', 40 | ] 41 | 42 | def map_function_for_special(metrix: str, score: float) -> float: 43 | """ 44 | Score mapping function for special metrics. 45 | >>> metrix: metrix name, str, e.g., 'MAE'. 46 | >>> score: task score, float, e.g., 5.3. 47 | return: mapped scores, float. 48 | """ 49 | metrix = metrix.upper() 50 | T = special_metric_dict[metrix] 51 | 52 | assert score > 0, f'score should be > 0, but found: {score}' 53 | 54 | if metrix in HIGHER_IS_BETTER: 55 | y = _tanh(T * score) 56 | elif metrix == 'WER': 57 | y = 1 - score 58 | elif metrix == 'MS-SSIM': 59 | y = (score + 1) / 2 60 | elif metrix == 'MOS': 61 | y = (score - 1) / 4 62 | else: # lower is better 63 | y = _2_sigmoid_minus_1(T / score) 64 | 65 | return y * 100 # Convert to percentage scale 66 | 67 | # • Normalizing WER: 68 | # y = 1 − x, where x ∈ [0, 1], y ∈ [0, 1]. 69 | # • Normalizing MS-SSIM: 70 | # y = (x + 1) / 2 , where x ∈ [−1, 1], y ∈ [0, 1]. 71 | # • Normalizing MOS: 72 | # y = x − 1 / 4 , where x ∈ [1, 5], y ∈ [0, 1]. 
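# A few worked examples implied by the mapping table and formulas above (values rounded):
#   PSNR = 30  (higher is better, T = 1/20):  y = tanh(30 / 20) * 100              ≈ 90.5
#   FID  = 25  (lower is better,  T = 25):    y = (2 * sigmoid(25 / 25) - 1) * 100 ≈ 46.2
#   MOS  = 4.2 (no T, range [1, 5]):          y = ((4.2 - 1) / 4) * 100            = 80.0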
73 | 74 | if __name__ == '__main__': 75 | r = random.random() 76 | print(f"{r = }") 77 | print(f"{_sigmoid(r) = }") 78 | print(f"{_2_sigmoid_minus_1(r) = }") 79 | print(f"{_tanh(r) = }") 80 | print(f"{_tanh(r / 2) = }") -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/ucf101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tqdm 4 | import torch 5 | import argparse 6 | import numpy as np 7 | import os.path as osp 8 | from omegaconf import OmegaConf 9 | 10 | sys.path.append('.') 11 | from utils.utils import read, img2tensor 12 | from utils.build_utils import build_from_cfg 13 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 14 | 15 | parser = argparse.ArgumentParser( 16 | prog = 'AMT', 17 | description = 'UCF101 evaluation', 18 | ) 19 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 20 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth') 21 | parser.add_argument('-r', '--root', default='data/ucf101_interp_ours') 22 | args = parser.parse_args() 23 | 24 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 25 | cfg_path = args.config 26 | ckpt_path = args.ckpt 27 | root = args.root 28 | 29 | network_cfg = OmegaConf.load(cfg_path).network 30 | network_name = network_cfg.name 31 | model = build_from_cfg(network_cfg) 32 | ckpt = torch.load(ckpt_path) 33 | model.load_state_dict(ckpt['state_dict']) 34 | model = model.to(device) 35 | model.eval() 36 | 37 | dirs = sorted(os.listdir(root)) 38 | psnr_list = [] 39 | ssim_list = [] 40 | pbar = tqdm.tqdm(dirs, total=len(dirs)) 41 | for d in pbar: 42 | dir_path = osp.join(root, d) 43 | I0 = img2tensor(read(osp.join(dir_path, 'frame_00.png'))).to(device) 44 | I1 = img2tensor(read(osp.join(dir_path, 'frame_01_gt.png'))).to(device) 45 | I2 = img2tensor(read(osp.join(dir_path, 'frame_02.png'))).to(device) 46 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 47 | 48 | I1_pred = model(I0, I2, embt, eval=True)['imgt_pred'] 49 | 50 | psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy() 51 | ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy() 52 | 53 | psnr_list.append(psnr) 54 | ssim_list.append(ssim) 55 | 56 | avg_psnr = np.mean(psnr_list) 57 | avg_ssim = np.mean(ssim_list) 58 | desc_str = f'[{network_name}/UCF101] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 59 | pbar.set_description_str(desc_str) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/configs/Base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GRiT" 3 | MASK_ON: True 4 | PROPOSAL_GENERATOR: 5 | NAME: "CenterNet" 6 | FPN: 7 | IN_FEATURES: ["layer3", "layer4", "layer5"] 8 | PIXEL_MEAN: [123.675, 116.280, 103.530] 9 | PIXEL_STD: [58.395, 57.12, 57.375] 10 | ROI_HEADS: 11 | NAME: GRiTROIHeadsAndTextDecoder 12 | IN_FEATURES: ["p3", "p4", "p5"] 13 | IOU_THRESHOLDS: [0.6] 14 | NUM_CLASSES: 1 15 | SCORE_THRESH_TEST: 0.02 16 | NMS_THRESH_TEST: 0.5 17 | OBJECT_FEAT_POOLER_RES: 14 18 | ROI_BOX_CASCADE_HEAD: 19 | IOUS: [0.6, 0.7, 0.8] 20 | ROI_BOX_HEAD: 21 | NAME: "FastRCNNConvFCHead" 22 | NUM_FC: 2 23 | POOLER_RESOLUTION: 7 24 | CLS_AGNOSTIC_BBOX_REG: True 25 | MULT_PROPOSAL_SCORE: True 26 | ROI_MASK_HEAD: 27 | NAME: "MaskRCNNConvUpsampleHead" 28 | NUM_CONV: 4 29 | POOLER_RESOLUTION: 14 30 | 
CLS_AGNOSTIC_MASK: True 31 | CENTERNET: 32 | NUM_CLASSES: 1 33 | REG_WEIGHT: 1. 34 | NOT_NORM_REG: True 35 | ONLY_PROPOSAL: True 36 | WITH_AGN_HM: True 37 | INFERENCE_TH: 0.0001 38 | PRE_NMS_TOPK_TRAIN: 4000 39 | POST_NMS_TOPK_TRAIN: 2000 40 | PRE_NMS_TOPK_TEST: 1000 41 | POST_NMS_TOPK_TEST: 256 42 | NMS_TH_TRAIN: 0.9 43 | NMS_TH_TEST: 0.9 44 | POS_WEIGHT: 0.5 45 | NEG_WEIGHT: 0.5 46 | IGNORE_HIGH_FP: 0.85 47 | DATASETS: 48 | TRAIN: ("coco_2017_train",) 49 | TEST: ("coco_2017_val",) 50 | DATALOADER: 51 | SAMPLER_TRAIN: "MultiDatasetSampler" 52 | DATASET_RATIO: [1] 53 | DATASET_INPUT_SIZE: [1024] 54 | DATASET_INPUT_SCALE: [[0.1, 2.0]] 55 | FILTER_EMPTY_ANNOTATIONS: False 56 | NUM_WORKERS: 8 57 | TEST: 58 | DETECTIONS_PER_IMAGE: 256 59 | SOLVER: 60 | LR_SCHEDULER_NAME: "WarmupCosineLR" 61 | CHECKPOINT_PERIOD: 10000 62 | WARMUP_ITERS: 1000 63 | WARMUP_FACTOR: 0.001 64 | USE_CUSTOM_SOLVER: True 65 | OPTIMIZER: "ADAMW" 66 | MAX_ITER: 180000 67 | IMS_PER_BATCH: 64 68 | BASE_LR: 0.00008 69 | VIT_LAYER_DECAY: True 70 | CLIP_GRADIENTS: 71 | ENABLED: True 72 | INPUT: 73 | FORMAT: RGB 74 | CUSTOM_AUG: EfficientDetResizeCrop 75 | TRAIN_SIZE: 640 76 | USE_ACT_CHECKPOINT: True 77 | VERSION: 2 -------------------------------------------------------------------------------- /README_ZH.md: -------------------------------------------------------------------------------- 1 | # GenBench 评分系统 - 用户使用说明 2 | 3 |
4 | English | 中文 5 |
6 | 7 | --- 8 | 9 | 本系统用于评估大模型在 General-Bench 多模态任务集上的表现。用户只需一条命令即可完成预测、评分和最终得分计算。 10 | 11 | ## 环境准备 12 | 13 | - Python 3.9 及以上 14 | - 推荐提前安装依赖(如 pandas, numpy, openpyxl 等) 15 | - Video Generation评测,需要按照video_generation_evaluation/README.md中的步骤安装依赖 16 | - Video Comprehension评测,需要按照[sa2va](https://github.com/magic-research/Sa2VA)中的README.md中的步骤安装依赖。 17 | 18 | ## 数据集下载 19 | 20 | - **Open Set(公开数据集)**:请从 [HuggingFace General-Bench-Openset](https://huggingface.co/datasets/General-Level/General-Bench-Openset) 下载全部数据,解压后放入 `General-Bench-Openset/` 目录。 21 | - **Close Set(私有数据集)**:请从 [HuggingFace General-Bench-Closeset](https://huggingface.co/datasets/General-Level/General-Bench-Closeset) 下载全部数据,解压后放入 `General-Bench-Closeset/` 目录。 22 | 23 | ## 一键运行 24 | 25 | 请直接运行主脚本 `run.sh`,即可完成全部流程: 26 | 27 | ```bash 28 | bash run.sh 29 | ``` 30 | 31 | 该命令将依次完成: 32 | 1. 生成各模态预测结果 33 | 2. 计算各任务得分 34 | 3. 计算最终 Level 得分 35 | 36 | ## 分步运行(可选) 37 | 38 | 如只需运行部分步骤,可使用 `--step` 参数: 39 | 40 | - 只运行第1步(生成预测): 41 | ```bash 42 | bash run.sh --step 1 43 | ``` 44 | - 只运行第1、2步: 45 | ```bash 46 | bash run.sh --step 12 47 | ``` 48 | - 只运行第2、3步: 49 | ```bash 50 | bash run.sh --step 23 51 | ``` 52 | - 不加参数默认全部执行(等价于 `--step 123`) 53 | 54 | - 步骤1:生成预测结果prediction.json,存在每一个数据集的annotation.json同级目录下 55 | - 步骤2:计算每个任务的得分,存在outcome/{model_name}_result.xlsx中 56 | - 步骤3:计算相关模型的Level得分 57 | 58 | > **注意:** 59 | > - 使用 **Close Set(私有数据集)** 时,只需运行 step1(即 `bash run.sh --step 1`),并将生成的 prediction.json 提交到系统。 60 | > - 使用 **Open Set(公开数据集)** 时,需依次运行 step1、step2、step3(即 `bash run.sh --step 123`),完成全部评测流程。 61 | 62 | ## 结果查看 63 | 64 | - 预测结果(prediction.json)会输出到每个任务对应的数据集文件夹下,与 annotation.json 同级。 65 | - 评分结果(如 Qwen2.5-7B-Instruct_result.xlsx)会输出到 outcome/ 目录。 66 | - 最终 Level 得分会直接在终端打印输出。 67 | 68 | ## 目录说明 69 | 70 | - `General-Bench-Openset/`:公开数据集目录 71 | - `General-Bench-Closeset/`:私有数据集目录 72 | - `outcome/`:输出结果目录 73 | - `references/`:参考模板目录 74 | - `run.sh`:主运行脚本(推荐用户只用此脚本) 75 | 76 | ## 常见问题 77 | 78 | - 如遇依赖缺失,请根据报错信息安装相应 Python 包。 79 | - 如需自定义模型或数据路径,可编辑 `run.sh` 脚本中的相关变量。 80 | 81 | --- 82 | 83 | 如需进一步帮助,请联系系统维护者或查阅详细开发文档。 -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/trainers/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import wandb 3 | import shutil 4 | import logging 5 | import os.path as osp 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | 9 | def mv_archived_logger(name): 10 | timestamp = time.strftime("%Y-%m-%d_%H:%M:%S_", time.localtime()) 11 | basename = 'archived_' + timestamp + osp.basename(name) 12 | archived_name = osp.join(osp.dirname(name), basename) 13 | shutil.move(name, archived_name) 14 | 15 | 16 | class CustomLogger: 17 | def __init__(self, common_cfg, tb_cfg=None, wandb_cfg=None, rank=0): 18 | global global_logger 19 | self.rank = rank 20 | 21 | if self.rank == 0: 22 | self.logger = logging.getLogger('VFI') 23 | self.logger.setLevel(logging.INFO) 24 | format_str = logging.Formatter(common_cfg['format']) 25 | 26 | console_handler = logging.StreamHandler() 27 | console_handler.setFormatter(format_str) 28 | 29 | if osp.exists(common_cfg['filename']): 30 | mv_archived_logger(common_cfg['filename']) 31 | 32 | file_handler = logging.FileHandler(common_cfg['filename'], 33 | common_cfg['filemode']) 34 | file_handler.setFormatter(format_str) 35 | 36 | self.logger.addHandler(console_handler) 37 | self.logger.addHandler(file_handler) 38 | self.tb_logger = None 39 | 40 
| self.enable_wandb = False 41 | 42 | if wandb_cfg is not None: 43 | self.enable_wandb = True 44 | wandb.init(**wandb_cfg) 45 | 46 | if tb_cfg is not None: 47 | self.tb_logger = SummaryWriter(**tb_cfg) 48 | 49 | global_logger = self 50 | 51 | def __call__(self, msg=None, level=logging.INFO, tb_msg=None): 52 | if self.rank != 0: 53 | return 54 | if msg is not None: 55 | self.logger.log(level, msg) 56 | 57 | if self.tb_logger is not None and tb_msg is not None: 58 | self.tb_logger.add_scalar(*tb_msg) 59 | 60 | def close(self): 61 | if self.rank == 0 and self.enable_wandb: 62 | wandb.finish() 63 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/transforms/custom_augmentation_impl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py 4 | # Modified by Xingyi Zhou 5 | # The original code is under Apache-2.0 License 6 | import numpy as np 7 | from PIL import Image 8 | 9 | from detectron2.data.transforms.augmentation import Augmentation 10 | from .custom_transform import EfficientDetResizeCropTransform 11 | 12 | __all__ = [ 13 | "EfficientDetResizeCrop", 14 | ] 15 | 16 | 17 | class EfficientDetResizeCrop(Augmentation): 18 | """ 19 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 20 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 21 | """ 22 | 23 | def __init__( 24 | self, size, scale, interp=Image.BILINEAR 25 | ): 26 | """ 27 | """ 28 | super().__init__() 29 | self.target_size = (size, size) 30 | self.scale = scale 31 | self.interp = interp 32 | 33 | def get_transform(self, img): 34 | # Select a random scale factor. 35 | scale_factor = np.random.uniform(*self.scale) 36 | scaled_target_height = scale_factor * self.target_size[0] 37 | scaled_target_width = scale_factor * self.target_size[1] 38 | # Recompute the accurate scale_factor using rounded scaled image size. 
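# The effective scale is the smaller of the two per-axis ratios computed below, so the whole image fits inside the randomly scaled target; any leftover extent is removed by the random crop offset.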
39 | width, height = img.shape[1], img.shape[0] 40 | img_scale_y = scaled_target_height / height 41 | img_scale_x = scaled_target_width / width 42 | img_scale = min(img_scale_y, img_scale_x) 43 | 44 | # Select non-zero random offset (x, y) if scaled image is larger than target size 45 | scaled_h = int(height * img_scale) 46 | scaled_w = int(width * img_scale) 47 | offset_y = scaled_h - self.target_size[0] 48 | offset_x = scaled_w - self.target_size[1] 49 | offset_y = int(max(0.0, float(offset_y)) * np.random.uniform(0, 1)) 50 | offset_x = int(max(0.0, float(offset_x)) * np.random.uniform(0, 1)) 51 | return EfficientDetResizeCropTransform( 52 | scaled_h, scaled_w, offset_y, offset_x, img_scale, self.target_size, self.interp) 53 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/vimeo90k.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | import os.path as osp 7 | from omegaconf import OmegaConf 8 | 9 | sys.path.append('.') 10 | from utils.utils import read, img2tensor 11 | from utils.build_utils import build_from_cfg 12 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 13 | 14 | parser = argparse.ArgumentParser( 15 | prog = 'AMT', 16 | description = 'Vimeo90K evaluation', 17 | ) 18 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 19 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth',) 20 | parser.add_argument('-r', '--root', default='data/vimeo_triplet',) 21 | args = parser.parse_args() 22 | 23 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 24 | cfg_path = args.config 25 | ckpt_path = args.ckpt 26 | root = args.root 27 | 28 | network_cfg = OmegaConf.load(cfg_path).network 29 | network_name = network_cfg.name 30 | model = build_from_cfg(network_cfg) 31 | ckpt = torch.load(ckpt_path) 32 | model.load_state_dict(ckpt['state_dict']) 33 | model = model.to(device) 34 | model.eval() 35 | 36 | with open(osp.join(root, 'tri_testlist.txt'), 'r') as fr: 37 | file_list = fr.readlines() 38 | 39 | psnr_list = [] 40 | ssim_list = [] 41 | 42 | pbar = tqdm.tqdm(file_list, total=len(file_list)) 43 | for name in pbar: 44 | name = str(name).strip() 45 | if(len(name) <= 1): 46 | continue 47 | dir_path = osp.join(root, 'sequences', name) 48 | I0 = img2tensor(read(osp.join(dir_path, 'im1.png'))).to(device) 49 | I1 = img2tensor(read(osp.join(dir_path, 'im2.png'))).to(device) 50 | I2 = img2tensor(read(osp.join(dir_path, 'im3.png'))).to(device) 51 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 52 | 53 | I1_pred = model(I0, I2, embt, 54 | scale_factor=1.0, eval=True)['imgt_pred'] 55 | 56 | psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy() 57 | ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy() 58 | 59 | psnr_list.append(psnr) 60 | ssim_list.append(ssim) 61 | avg_psnr = np.mean(psnr_list) 62 | avg_ssim = np.mean(ssim_list) 63 | desc_str = f'[{network_name}/Vimeo90K] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 64 | pbar.set_description_str(desc_str) 65 | 66 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/clip_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | import clip 4 | 5 | import torch 6 | import torch.nn.functional as F 7 
| 8 | from vbench2_beta_long.utils import reorganize_clips_results 9 | from toolkit.utils import load_dimension_info, clip_transform, read_frames_decord_by_fps 10 | import logging 11 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 12 | logger = logging.getLogger(__name__) 13 | 14 | def clip_alignment(clip_model, video_dict, preprocess, device): 15 | sim = [] 16 | video_results = [] 17 | 18 | image_transform = clip_transform(224) 19 | for info in tqdm(video_dict): 20 | 21 | query = info["prompt"] 22 | text = clip.tokenize([query], truncate=True).to(device) 23 | text_feature = clip_model.encode_text(text) 24 | text_feature = F.normalize(text_feature, dim=-1) 25 | 26 | video_list = info["video_list"] 27 | for video_path in video_list: 28 | with torch.no_grad(): 29 | images = read_frames_decord_by_fps(video_path, num_frames=8, sample="middle") 30 | images = image_transform(images) 31 | images = images.to(device) 32 | 33 | image_features = clip_model.encode_image(images) 34 | image_features = F.normalize(image_features, dim=-1, p=2) 35 | 36 | video_sim = image_features @ text_feature.T 37 | video_sim = np.mean(video_sim.cpu().tolist()) 38 | sim.append(video_sim) 39 | 40 | video_results.append({'video_path': video_path, 'video_results': video_sim}) 41 | 42 | avg_sim = np.mean(sim) 43 | 44 | return avg_sim, video_results 45 | 46 | 47 | def compute_clip_score(json_dir, device, submodules_list, **kwargs): 48 | 49 | clip_model, preprocess = clip.load("ViT-B/32", device=device) 50 | logger.info("Initialize CLIP success") 51 | 52 | _, video_dict = load_dimension_info(json_dir, dimension='clip_score', lang='en') 53 | all_results, video_results = clip_alignment(clip_model, video_dict, preprocess, device) 54 | return all_results, video_results 55 | 56 | 57 | def compute_long_clip_score(json_dir, device, submodules_list, **kwargs): 58 | all_results, detailed_results = compute_clip_score(json_dir, device, submodules_list, **kwargs) 59 | 60 | return reorganize_clips_results(detailed_results) 61 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/temporal_flickering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | import cv2 4 | from toolkit.utils import load_dimension_info 5 | 6 | from .distributed import ( 7 | get_world_size, 8 | get_rank, 9 | all_gather, 10 | barrier, 11 | distribute_list_to_rank, 12 | gather_list_of_dict, 13 | ) 14 | 15 | 16 | def get_frames(video_path): 17 | frames = [] 18 | video = cv2.VideoCapture(video_path) 19 | while video.isOpened(): 20 | success, frame = video.read() 21 | if success: 22 | frames.append(frame) 23 | else: 24 | break 25 | video.release() 26 | assert frames != [] 27 | return frames 28 | 29 | 30 | def mae_seq(frames): 31 | ssds = [] 32 | for i in range(len(frames)-1): 33 | ssds.append(calculate_mae(frames[i], frames[i+1])) 34 | return np.array(ssds) 35 | 36 | 37 | def calculate_mae(img1, img2): 38 | """Computing the mean absolute error (MAE) between two images.""" 39 | if img1.shape != img2.shape: 40 | print("Images don't have the same shape.") 41 | return 42 | return np.mean(cv2.absdiff(np.array(img1, dtype=np.float32), np.array(img2, dtype=np.float32))) 43 | 44 | 45 | def cal_score(video_path): 46 | """please ensure the video is static""" 47 | frames = get_frames(video_path) 48 | score_seq = mae_seq(frames) 49 | return (255.0 - 
np.mean(score_seq).item())/255.0 50 | 51 | 52 | def temporal_flickering(video_list): 53 | sim = [] 54 | video_results = [] 55 | for video_path in tqdm(video_list, disable=get_rank() > 0): 56 | try: 57 | score_per_video = cal_score(video_path) 58 | except AssertionError: 59 | continue 60 | video_results.append({'video_path': video_path, 'video_results': score_per_video}) 61 | sim.append(score_per_video) 62 | avg_score = np.mean(sim) 63 | return avg_score, video_results 64 | 65 | 66 | def compute_temporal_flickering(json_dir, device, submodules_list, **kwargs): 67 | video_list, _ = load_dimension_info(json_dir, dimension='temporal_flickering', lang='en') 68 | video_list = distribute_list_to_rank(video_list) 69 | all_results, video_results = temporal_flickering(video_list) 70 | if get_world_size() > 1: 71 | video_results = gather_list_of_dict(video_results) 72 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 73 | return all_results, video_results 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from shutil import copyfile 4 | import torch.distributed as dist 5 | import torch 6 | import importlib 7 | import datetime 8 | from utils.dist_utils import ( 9 | get_world_size, 10 | ) 11 | from omegaconf import OmegaConf 12 | from utils.utils import seed_all 13 | parser = argparse.ArgumentParser(description='VFI') 14 | parser.add_argument('-c', '--config', type=str) 15 | parser.add_argument('-p', '--port', default='23455', type=str) 16 | parser.add_argument('--local_rank', default='0') 17 | 18 | args = parser.parse_args() 19 | 20 | 21 | def main_worker(rank, config): 22 | if 'local_rank' not in config: 23 | config['local_rank'] = config['global_rank'] = rank 24 | if torch.cuda.is_available(): 25 | print(f'Rank {rank} is available') 26 | config['device'] = f"cuda:{rank}" 27 | if config['distributed']: 28 | dist.init_process_group(backend='nccl', 29 | timeout=datetime.timedelta(seconds=5400)) 30 | else: 31 | config['device'] = 'cpu' 32 | 33 | cfg_name = os.path.basename(args.config).split('.')[0] 34 | config['exp_name'] = cfg_name + '_' + config['exp_name'] 35 | config['save_dir'] = os.path.join(config['save_dir'], config['exp_name']) 36 | 37 | if (not config['distributed']) or rank == 0: 38 | os.makedirs(config['save_dir'], exist_ok=True) 39 | os.makedirs(f'{config["save_dir"]}/ckpts', exist_ok=True) 40 | config_path = os.path.join(config['save_dir'], 41 | args.config.split('/')[-1]) 42 | if not os.path.isfile(config_path): 43 | copyfile(args.config, config_path) 44 | print('[**] create folder {}'.format(config['save_dir'])) 45 | 46 | trainer_name = config.get('trainer_type', 'base_trainer') 47 | print(f'using GPU {rank} for training') 48 | if rank == 0: 49 | print(trainer_name) 50 | trainer_pack = importlib.import_module('trainers.' 
+ trainer_name) 51 | trainer = trainer_pack.Trainer(config) 52 | 53 | trainer.train() 54 | 55 | 56 | if __name__ == "__main__": 57 | torch.backends.cudnn.benchmark = True 58 | cfg = OmegaConf.load(args.config) 59 | seed_all(cfg.seed) 60 | rank = int(args.local_rank) 61 | torch.cuda.set_device(torch.device(f'cuda:{rank}')) 62 | # setting distributed configurations 63 | cfg['world_size'] = get_world_size() 64 | cfg['local_rank'] = rank 65 | if rank == 0: 66 | print('world_size: ', cfg['world_size']) 67 | main_worker(rank, cfg) 68 | 69 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/vimeo90k_tta.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tqdm 3 | import torch 4 | import argparse 5 | import numpy as np 6 | import os.path as osp 7 | from omegaconf import OmegaConf 8 | 9 | sys.path.append('.') 10 | from utils.utils import read, img2tensor 11 | from utils.build_utils import build_from_cfg 12 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 13 | 14 | parser = argparse.ArgumentParser( 15 | prog = 'AMT', 16 | description = 'Vimeo90K evaluation (with Test-Time Augmentation)', 17 | ) 18 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 19 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth',) 20 | parser.add_argument('-r', '--root', default='data/vimeo_triplet',) 21 | args = parser.parse_args() 22 | 23 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 24 | cfg_path = args.config 25 | ckpt_path = args.ckpt 26 | root = args.root 27 | 28 | network_cfg = OmegaConf.load(cfg_path).network 29 | network_name = network_cfg.name 30 | model = build_from_cfg(network_cfg) 31 | ckpt = torch.load(ckpt_path) 32 | model.load_state_dict(ckpt['state_dict']) 33 | model = model.to(device) 34 | model.eval() 35 | 36 | with open(osp.join(root, 'tri_testlist.txt'), 'r') as fr: 37 | file_list = fr.readlines() 38 | 39 | psnr_list = [] 40 | ssim_list = [] 41 | 42 | pbar = tqdm.tqdm(file_list, total=len(file_list)) 43 | for name in pbar: 44 | name = str(name).strip() 45 | if(len(name) <= 1): 46 | continue 47 | dir_path = osp.join(root, 'sequences', name) 48 | I0 = img2tensor(read(osp.join(dir_path, 'im1.png'))).to(device) 49 | I1 = img2tensor(read(osp.join(dir_path, 'im2.png'))).to(device) 50 | I2 = img2tensor(read(osp.join(dir_path, 'im3.png'))).to(device) 51 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 52 | 53 | I1_pred1 = model(I0, I2, embt, 54 | scale_factor=1.0, eval=True)['imgt_pred'] 55 | I1_pred2 = model(torch.flip(I0, [2]), torch.flip(I2, [2]), embt, 56 | scale_factor=1.0, eval=True)['imgt_pred'] 57 | I1_pred = I1_pred1 / 2 + torch.flip(I1_pred2, [2]) / 2 58 | psnr = calculate_psnr(I1_pred, I1).detach().cpu().numpy() 59 | ssim = calculate_ssim(I1_pred, I1).detach().cpu().numpy() 60 | 61 | psnr_list.append(psnr) 62 | ssim_list.append(ssim) 63 | avg_psnr = np.mean(psnr_list) 64 | avg_ssim = np.mean(ssim_list) 65 | desc_str = f'[{network_name}/Vimeo90K] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 66 | pbar.set_description_str(desc_str) 67 | 68 | -------------------------------------------------------------------------------- /processors/three_d_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import
BaseModalityProcessor 4 | 5 | class ThreeDProcessor(BaseModalityProcessor): 6 | """3D模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, pred_json_file) 9 | 10 | def process_comprehension(self) -> List[TaskResult]: 11 | """处理3D理解类任务 12 | 13 | 需要返回一个TaskResult列表,每个TaskResult包含: 14 | - task_name: 任务名称,例如 "3d_object_detection", "point_cloud_segmentation" 等 15 | - metric: 评估指标,例如 "mAP", "IoU" 等 16 | - score: 评估分数 17 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 18 | 示例格式: 19 | return [ 20 | TaskResult( 21 | task_name="3d_object_detection", 22 | metric="mAP", 23 | score=0.76 24 | ), 25 | TaskResult( 26 | task_name="point_cloud_segmentation", 27 | metric="IoU", 28 | score=0.82 29 | ) 30 | ] 31 | """ 32 | return [] 33 | 34 | def process_generation(self) -> List[TaskResult]: 35 | """处理3D生成类任务 36 | 37 | 需要返回一个TaskResult列表,每个TaskResult包含: 38 | - task_name: 任务名称,例如 "3d_reconstruction", "mesh_generation" 等 39 | - metric: 评估指标,例如 "CD", "F1" 等 40 | - score: 评估分数 41 | - task_type: 这里需要指定为 TaskType.GENERATION 42 | 43 | 示例格式: 44 | return [ 45 | TaskResult( 46 | task_name="3d_reconstruction", 47 | metric="CD", 48 | score=0.15, 49 | task_type=TaskType.GENERATION 50 | ), 51 | TaskResult( 52 | task_name="mesh_generation", 53 | metric="F1", 54 | score=0.88, 55 | task_type=TaskType.GENERATION 56 | ) 57 | ] 58 | """ 59 | return [] 60 | 61 | # 使用示例 62 | if __name__ == "__main__": 63 | processor = ThreeDProcessor(ModalityType.THREE_D, "") 64 | 65 | # 测试理解任务 66 | print("\n理解类任务结果:") 67 | for task in processor.process_comprehension(): 68 | print(f"任务: {task.task_name}") 69 | print(f"指标: {task.metric}") 70 | print(f"分数: {task.score}") 71 | print("-" * 20) 72 | 73 | # 测试生成任务 74 | print("\n生成类任务结果:") 75 | for task in processor.process_generation(): 76 | print(f"任务: {task.task_name}") 77 | print(f"指标: {task.metric}") 78 | print(f"分数: {task.score}") 79 | print("-" * 20) -------------------------------------------------------------------------------- /processors/audio_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import BaseModalityProcessor 4 | 5 | class AudioProcessor(BaseModalityProcessor): 6 | """音频模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, pred_json_file) 9 | 10 | def process_comprehension(self) -> List[TaskResult]: 11 | """处理音频理解类任务 12 | 13 | 需要返回一个TaskResult列表,每个TaskResult包含: 14 | - task_name: 任务名称,例如 "speech_recognition", "audio_classification" 等 15 | - metric: 评估指标,例如 "WER", "accuracy" 等 16 | - score: 评估分数 17 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 18 | 19 | 示例格式: 20 | return [ 21 | TaskResult( 22 | task_name="speech_recognition", 23 | metric="WER", 24 | score=0.15 25 | ), 26 | TaskResult( 27 | task_name="audio_classification", 28 | metric="accuracy", 29 | score=0.92 30 | ) 31 | ] 32 | """ 33 | return [] 34 | 35 | def process_generation(self) -> List[TaskResult]: 36 | """处理音频生成类任务 37 | 38 | 需要返回一个TaskResult列表,每个TaskResult包含: 39 | - task_name: 任务名称,例如 "speech_synthesis", "audio_generation" 等 40 | - metric: 评估指标,例如 "MOS", "FAD" 等 41 | - score: 评估分数 42 | - task_type: 需要指定为 TaskType.GENERATION 43 | 44 | 示例格式: 45 | return [ 46 | TaskResult( 47 | task_name="speech_synthesis", 48 | metric="MOS", 49 | score=4.2, 50 | task_type=TaskType.GENERATION 51 | ), 52 | 
TaskResult( 53 | task_name="audio_generation", 54 | metric="FAD", 55 | score=12.5, 56 | task_type=TaskType.GENERATION 57 | ) 58 | ] 59 | """ 60 | return [] 61 | 62 | # 使用示例 63 | if __name__ == "__main__": 64 | processor = AudioProcessor(ModalityType.AUDIO, "") 65 | 66 | # 测试理解任务 67 | print("\n理解类任务结果:") 68 | for task in processor.process_comprehension(): 69 | print(f"任务: {task.task_name}") 70 | print(f"指标: {task.metric}") 71 | print(f"分数: {task.score}") 72 | print("-" * 20) 73 | 74 | # 测试生成任务 75 | print("\n生成类任务结果:") 76 | for task in processor.process_generation(): 77 | print(f"任务: {task.task_name}") 78 | print(f"指标: {task.metric}") 79 | print(f"分数: {task.score}") 80 | print("-" * 20) -------------------------------------------------------------------------------- /processors/video_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import BaseModalityProcessor 4 | 5 | class VideoProcessor(BaseModalityProcessor): 6 | """视频模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, pred_json_file) 9 | 10 | def process_comprehension(self) -> List[TaskResult]: 11 | """处理视频理解类任务 12 | 13 | 需要返回一个TaskResult列表,每个TaskResult包含: 14 | - task_name: 任务名称,例如 "action_recognition", "video_classification" 等 15 | - metric: 评估指标,例如 "accuracy", "mAP" 等 16 | - score: 评估分数 17 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 18 | 19 | 示例格式: 20 | return [ 21 | TaskResult( 22 | task_name="action_recognition", 23 | metric="accuracy", 24 | score=0.88 25 | ), 26 | TaskResult( 27 | task_name="video_classification", 28 | metric="accuracy", 29 | score=0.92 30 | ) 31 | ] 32 | """ 33 | return [] 34 | 35 | def process_generation(self) -> List[TaskResult]: 36 | """处理视频生成类任务 37 | 38 | 需要返回一个TaskResult列表,每个TaskResult包含: 39 | - task_name: 任务名称,例如 "video_generation", "video_prediction" 等 40 | - metric: 评估指标,例如 "FVD", "PSNR" 等 41 | - score: 评估分数 42 | - task_type: 需要指定为 TaskType.GENERATION 43 | 44 | 示例格式: 45 | return [ 46 | TaskResult( 47 | task_name="video_generation", 48 | metric="FVD", 49 | score=45.2, 50 | task_type=TaskType.GENERATION 51 | ), 52 | TaskResult( 53 | task_name="video_prediction", 54 | metric="PSNR", 55 | score=25.8, 56 | task_type=TaskType.GENERATION 57 | ) 58 | ] 59 | """ 60 | return [] 61 | 62 | # 使用示例 63 | if __name__ == "__main__": 64 | processor = VideoProcessor(ModalityType.VIDEO, "") 65 | 66 | # 测试理解任务 67 | print("\n理解类任务结果:") 68 | for task in processor.process_comprehension(): 69 | print(f"任务: {task.task_name}") 70 | print(f"指标: {task.metric}") 71 | print(f"分数: {task.score}") 72 | print("-" * 20) 73 | 74 | # 测试生成任务 75 | print("\n生成类任务结果:") 76 | for task in processor.process_generation(): 77 | print(f"任务: {task.task_name}") 78 | print(f"指标: {task.metric}") 79 | print(f"分数: {task.score}") 80 | print("-" * 20) -------------------------------------------------------------------------------- /processors/image_processor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from utils.data_types import ModalityType, TaskType, TaskResult 3 | from utils.base_processor import BaseModalityProcessor 4 | 5 | class ImageProcessor(BaseModalityProcessor): 6 | """图像模态处理器""" 7 | def __init__(self, modality: ModalityType, dataset_dir: str, pred_json_file: str): 8 | super().__init__(modality, dataset_dir, 
pred_json_file) 9 | 10 | def process_1(self): 11 | return [] 12 | 13 | def process_comprehension(self) -> List[TaskResult]: 14 | """处理图像理解类任务 15 | 16 | 需要返回一个TaskResult列表,每个TaskResult包含: 17 | - task_name: 任务名称,例如 "image_classification", "object_detection" 等 18 | - metric: 评估指标,例如 "accuracy", "mAP" 等 19 | - score: 评估分数 20 | - task_type: 默认为 TaskType.COMPREHENSION,不需要指定 21 | 22 | 示例格式: 23 | return [ 24 | TaskResult( 25 | task_name="image_classification", 26 | metric="accuracy", 27 | score=0.95 28 | ), 29 | TaskResult( 30 | task_name="object_detection", 31 | metric="mAP", 32 | score=0.82 33 | ) 34 | ] 35 | """ 36 | return [] 37 | 38 | def process_generation(self) -> List[TaskResult]: 39 | """处理图像生成类任务 40 | 41 | 需要返回一个TaskResult列表,每个TaskResult包含: 42 | - task_name: 任务名称,例如 "image_generation", "image_editing" 等 43 | - metric: 评估指标,例如 "FID", "IS" 等 44 | - score: 评估分数 45 | - task_type: 需要指定为 TaskType.GENERATION 46 | 47 | 示例格式: 48 | return [ 49 | TaskResult( 50 | task_name="image_generation", 51 | metric="FID", 52 | score=15.2, 53 | task_type=TaskType.GENERATION 54 | ), 55 | TaskResult( 56 | task_name="image_editing", 57 | metric="PSNR", 58 | score=28.5, 59 | task_type=TaskType.GENERATION 60 | ) 61 | ] 62 | """ 63 | return [] 64 | 65 | # 使用示例 66 | if __name__ == "__main__": 67 | processor = ImageProcessor(ModalityType.IMAGE, "") 68 | 69 | # 测试理解任务 70 | print("\n理解类任务结果:") 71 | for task in processor.process_comprehension(): 72 | print(f"任务: {task.task_name}") 73 | print(f"指标: {task.metric}") 74 | print(f"分数: {task.score}") 75 | print("-" * 20) 76 | 77 | # 测试生成任务 78 | print("\n生成类任务结果:") 79 | for task in processor.process_generation(): 80 | print(f"任务: {task.task_name}") 81 | print(f"指标: {task.metric}") 82 | print(f"分数: {task.score}") 83 | print("-" * 20) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/benchmarks/snu_film.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tqdm 4 | import torch 5 | import argparse 6 | import numpy as np 7 | import os.path as osp 8 | from omegaconf import OmegaConf 9 | 10 | sys.path.append('.') 11 | from utils.build_utils import build_from_cfg 12 | from metrics.psnr_ssim import calculate_psnr, calculate_ssim 13 | from utils.utils import InputPadder, read, img2tensor 14 | 15 | 16 | def parse_path(path): 17 | path_list = path.split('/') 18 | new_path = osp.join(*path_list[-3:]) 19 | return new_path 20 | 21 | parser = argparse.ArgumentParser( 22 | prog = 'AMT', 23 | description = 'SNU-FILM evaluation', 24 | ) 25 | parser.add_argument('-c', '--config', default='cfgs/AMT-S.yaml') 26 | parser.add_argument('-p', '--ckpt', default='pretrained/amt-s.pth') 27 | parser.add_argument('-r', '--root', default='data/SNU_FILM') 28 | args = parser.parse_args() 29 | 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | cfg_path = args.config 32 | ckpt_path = args.ckpt 33 | root = args.root 34 | 35 | network_cfg = OmegaConf.load(cfg_path).network 36 | network_name = network_cfg.name 37 | model = build_from_cfg(network_cfg) 38 | ckpt = torch.load(ckpt_path) 39 | model.load_state_dict(ckpt['state_dict']) 40 | model = model.to(device) 41 | model.eval() 42 | 43 | divisor = 20; scale_factor = 0.8 44 | splits = ['easy', 'medium', 'hard', 'extreme'] 45 | for split in splits: 46 | with open(os.path.join(root, f'test-{split}.txt'), "r") as fr: 47 | file_list = [l.strip().split(' ') for l in fr.readlines()] 48 | 
pbar = tqdm.tqdm(file_list, total=len(file_list)) 49 | 50 | psnr_list = []; ssim_list = [] 51 | for name in pbar: 52 | img0 = img2tensor(read(osp.join(root, parse_path(name[0])))).to(device) 53 | imgt = img2tensor(read(osp.join(root, parse_path(name[1])))).to(device) 54 | img1 = img2tensor(read(osp.join(root, parse_path(name[2])))).to(device) 55 | padder = InputPadder(img0.shape, divisor) 56 | img0, img1 = padder.pad(img0, img1) 57 | 58 | embt = torch.tensor(1/2).float().view(1, 1, 1, 1).to(device) 59 | imgt_pred = model(img0, img1, embt, scale_factor=scale_factor, eval=True)['imgt_pred'] 60 | imgt_pred = padder.unpad(imgt_pred) 61 | 62 | psnr = calculate_psnr(imgt_pred, imgt).detach().cpu().numpy() 63 | ssim = calculate_ssim(imgt_pred, imgt).detach().cpu().numpy() 64 | 65 | psnr_list.append(psnr) 66 | ssim_list.append(ssim) 67 | avg_psnr = np.mean(psnr_list) 68 | avg_ssim = np.mean(ssim_list) 69 | desc_str = f'[{network_name}/SNU-FILM] [{split}] psnr: {avg_psnr:.02f}, ssim: {avg_ssim:.04f}' 70 | pbar.set_description_str(desc_str) 71 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/backbone/fpn_p5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import math 3 | import fvcore.nn.weight_init as weight_init 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 8 | 9 | from detectron2.modeling.backbone import Backbone 10 | from detectron2.modeling.backbone.fpn import FPN 11 | from detectron2.modeling.backbone.build import BACKBONE_REGISTRY 12 | from detectron2.modeling.backbone.resnet import build_resnet_backbone 13 | 14 | 15 | class LastLevelP6P7_P5(nn.Module): 16 | """ 17 | This module is used in RetinaNet to generate extra layers, P6 and P7 from 18 | C5 feature. 19 | """ 20 | 21 | def __init__(self, in_channels, out_channels): 22 | super().__init__() 23 | self.num_levels = 2 24 | self.in_feature = "p5" 25 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 26 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 27 | for module in [self.p6, self.p7]: 28 | weight_init.c2_xavier_fill(module) 29 | 30 | def forward(self, c5): 31 | p6 = self.p6(c5) 32 | p7 = self.p7(F.relu(p6)) 33 | return [p6, p7] 34 | 35 | 36 | @BACKBONE_REGISTRY.register() 37 | def build_p67_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 38 | """ 39 | Args: 40 | cfg: a detectron2 CfgNode 41 | 42 | Returns: 43 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 44 | """ 45 | bottom_up = build_resnet_backbone(cfg, input_shape) 46 | in_features = cfg.MODEL.FPN.IN_FEATURES 47 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 48 | backbone = FPN( 49 | bottom_up=bottom_up, 50 | in_features=in_features, 51 | out_channels=out_channels, 52 | norm=cfg.MODEL.FPN.NORM, 53 | top_block=LastLevelP6P7_P5(out_channels, out_channels), 54 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 55 | ) 56 | return backbone 57 | 58 | @BACKBONE_REGISTRY.register() 59 | def build_p35_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): 60 | """ 61 | Args: 62 | cfg: a detectron2 CfgNode 63 | 64 | Returns: 65 | backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 
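Unlike `build_p67_resnet_fpn_backbone` above, no extra P6/P7 levels are appended (top_block=None), so only the P3-P5 feature maps are exposed.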
66 | """ 67 | bottom_up = build_resnet_backbone(cfg, input_shape) 68 | in_features = cfg.MODEL.FPN.IN_FEATURES 69 | out_channels = cfg.MODEL.FPN.OUT_CHANNELS 70 | backbone = FPN( 71 | bottom_up=bottom_up, 72 | in_features=in_features, 73 | out_channels=out_channels, 74 | norm=cfg.MODEL.FPN.NORM, 75 | top_block=None, 76 | fuse_type=cfg.MODEL.FPN.FUSE_TYPE, 77 | ) 78 | return backbone 79 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/utils_core/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy import interpolate 5 | 6 | 7 | class InputPadder: 8 | """ Pads images such that dimensions are divisible by 8 """ 9 | def __init__(self, dims, mode='sintel'): 10 | self.ht, self.wd = dims[-2:] 11 | pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 12 | pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 13 | if mode == 'sintel': 14 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] 15 | else: 16 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] 17 | 18 | def pad(self, *inputs): 19 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 20 | 21 | def unpad(self,x): 22 | ht, wd = x.shape[-2:] 23 | c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] 24 | return x[..., c[0]:c[1], c[2]:c[3]] 25 | 26 | def forward_interpolate(flow): 27 | flow = flow.detach().cpu().numpy() 28 | dx, dy = flow[0], flow[1] 29 | 30 | ht, wd = dx.shape 31 | x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) 32 | 33 | x1 = x0 + dx 34 | y1 = y0 + dy 35 | 36 | x1 = x1.reshape(-1) 37 | y1 = y1.reshape(-1) 38 | dx = dx.reshape(-1) 39 | dy = dy.reshape(-1) 40 | 41 | valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) 42 | x1 = x1[valid] 43 | y1 = y1[valid] 44 | dx = dx[valid] 45 | dy = dy[valid] 46 | 47 | flow_x = interpolate.griddata( 48 | (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) 49 | 50 | flow_y = interpolate.griddata( 51 | (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) 52 | 53 | flow = np.stack([flow_x, flow_y], axis=0) 54 | return torch.from_numpy(flow).float() 55 | 56 | 57 | def bilinear_sampler(img, coords, mode='bilinear', mask=False): 58 | """ Wrapper for grid_sample, uses pixel coordinates """ 59 | H, W = img.shape[-2:] 60 | xgrid, ygrid = coords.split([1,1], dim=-1) 61 | xgrid = 2*xgrid/(W-1) - 1 62 | ygrid = 2*ygrid/(H-1) - 1 63 | 64 | grid = torch.cat([xgrid, ygrid], dim=-1) 65 | img = F.grid_sample(img, grid, align_corners=True) 66 | 67 | if mask: 68 | mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) 69 | return img, mask.float() 70 | 71 | return img 72 | 73 | 74 | def coords_grid(batch, ht, wd, device): 75 | coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device)) 76 | coords = torch.stack(coords[::-1], dim=0).float() 77 | return coords[None].repeat(batch, 1, 1, 1) 78 | 79 | 80 | def upflow8(flow, mode='bilinear'): 81 | new_size = (8 * flow.shape[2], 8 * flow.shape[3]) 82 | return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) 83 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/meta_arch/grit.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | import torch 3 | 
from detectron2.config import configurable 4 | from detectron2.structures import ImageList, Instances, Boxes 5 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 6 | from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN 7 | 8 | 9 | @META_ARCH_REGISTRY.register() 10 | class GRiT(GeneralizedRCNN): 11 | @configurable 12 | def __init__( 13 | self, 14 | **kwargs): 15 | super().__init__(**kwargs) 16 | assert self.proposal_generator is not None 17 | 18 | @classmethod 19 | def from_config(cls, cfg): 20 | ret = super().from_config(cfg) 21 | return ret 22 | 23 | def inference( 24 | self, 25 | batched_inputs: Tuple[Dict[str, torch.Tensor]], 26 | detected_instances: Optional[List[Instances]] = None, 27 | do_postprocess: bool = True, 28 | ): 29 | assert not self.training 30 | assert detected_instances is None 31 | 32 | images = self.preprocess_image(batched_inputs) 33 | features = self.backbone(images.tensor) 34 | proposals, _ = self.proposal_generator(images, features, None) 35 | results, _ = self.roi_heads(features, proposals) 36 | results_det, _ = self.roi_heads.forward_object(features, proposals) 37 | # results_det.get 38 | for idx in range(len(results)): 39 | obj_type = results_det[idx].get("pred_object_descriptions") 40 | results[idx].set('det_obj',obj_type) 41 | if do_postprocess: 42 | assert not torch.jit.is_scripting(), \ 43 | "Scripting is not supported for postprocess." 44 | return GRiT._postprocess( 45 | results, batched_inputs, images.image_sizes) 46 | else: 47 | return results 48 | 49 | def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): 50 | if not self.training: 51 | return self.inference(batched_inputs) 52 | 53 | images = self.preprocess_image(batched_inputs) 54 | 55 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 56 | 57 | targets_task = batched_inputs[0]['task'] 58 | for anno_per_image in batched_inputs: 59 | assert targets_task == anno_per_image['task'] 60 | 61 | features = self.backbone(images.tensor) 62 | proposals, proposal_losses = self.proposal_generator( 63 | images, features, gt_instances) 64 | proposals, roihead_textdecoder_losses = self.roi_heads( 65 | features, proposals, gt_instances, targets_task=targets_task) 66 | 67 | losses = {} 68 | losses.update(roihead_textdecoder_losses) 69 | losses.update(proposal_losses) 70 | 71 | return losses 72 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/models/extract_clip/extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import clip.clip as clip\n", 10 | "import os\n", 11 | "import torch\n", 12 | "from collections import OrderedDict" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 10, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "path = 'your_model_path/clip_visual_encoder'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "model, _ = clip.load(\"ViT-B/16\", device='cpu')\n", 31 | "new_state_dict = OrderedDict()\n", 32 | "for k, v in model.state_dict().items():\n", 33 | " if 'visual.' 
in k:\n", 34 | " new_state_dict[k[7:]] = v\n", 35 | "torch.save(new_state_dict, os.path.join(path, 'vit_b16.pth'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 15, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model, _ = clip.load(\"ViT-L/14\", device='cpu')\n", 45 | "new_state_dict = OrderedDict()\n", 46 | "for k, v in model.state_dict().items():\n", 47 | " if 'visual.' in k:\n", 48 | " new_state_dict[k[7:]] = v\n", 49 | "torch.save(new_state_dict, os.path.join(path, 'vit_l14.pth'))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "model, _ = clip.load(\"ViT-L/14@336px\", device='cpu')\n", 59 | "new_state_dict = OrderedDict()\n", 60 | "for k, v in model.state_dict().items():\n", 61 | " if 'visual.' in k:\n", 62 | " new_state_dict[k[7:]] = v\n", 63 | "torch.save(new_state_dict, os.path.join(path, 'vit_l14_336.pth'))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3.7.13 ('torch1.9')", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.7.13" 91 | }, 92 | "orig_nbformat": 4, 93 | "vscode": { 94 | "interpreter": { 95 | "hash": "c30e0be9d1dabfc31a056b9daab5ce1d15284c0e9e5af7f56f8931344ec84c24" 96 | } 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/subject_mapping_table.yaml: -------------------------------------------------------------------------------- 1 | 0.0: 0.0 2 | 0.01: 0.655812085783768 3 | 0.02: 0.706856949045235 4 | 0.03: 0.731659342416906 5 | 0.04: 0.73660992057736 6 | 0.05: 0.749101188592094 7 | 0.06: 0.761032814753647 8 | 0.07: 0.774597183768173 9 | 0.08: 0.784555729997569 10 | 0.09: 0.792953568694271 11 | 0.1: 0.802689699298385 12 | 0.11: 0.808076071440993 13 | 0.12: 0.816204790771909 14 | 0.13: 0.824219815909538 15 | 0.14: 0.830472157111834 16 | 0.15: 0.835419531889346 17 | 0.16: 0.83907681617532 18 | 0.17: 0.841978081155746 19 | 0.18: 0.84679192068861 20 | 0.19: 0.850625540675788 21 | 0.2: 0.852853044011848 22 | 0.21: 0.854691139482507 23 | 0.22: 0.858132224563246 24 | 0.23: 0.863729405870906 25 | 0.24: 0.866102417035313 26 | 0.25: 0.870585293424396 27 | 0.26: 0.872331870277398 28 | 0.27: 0.874960548804337 29 | 0.28: 0.878698116066965 30 | 0.29: 0.88170792606262 31 | 0.3: 0.885683841036798 32 | 0.31: 0.887194775904732 33 | 0.32: 0.890181215752347 34 | 0.33: 0.8940085858716 35 | 0.34: 0.896727529739295 36 | 0.35: 0.899204109394038 37 | 0.36: 0.901872688917701 38 | 0.37: 0.902930005754908 39 | 0.38: 0.904255123199727 40 | 0.39: 0.906709500890894 41 | 0.4: 0.909197403281584 42 | 0.41: 0.911998758637682 43 | 0.42: 0.914120648767612 44 | 0.43: 0.917820970919085 45 | 0.44: 0.920037992613574 46 | 0.45: 0.922367310037017 47 | 0.46: 0.923878218312373 48 | 0.47: 0.92612833568708 49 | 0.48: 0.928554265517505 50 | 0.49: 0.931094522914667 51 | 0.5: 0.932674917380015 52 | 0.51: 0.933938855974875 53 | 0.52: 0.935219359871336 54 | 0.53: 
0.93807406531488 55 | 0.54: 0.939675705126034 56 | 0.55: 0.941552521922844 57 | 0.56: 0.944195698642471 58 | 0.57: 0.946289318094669 59 | 0.58: 0.947781123820032 60 | 0.59: 0.949137334918494 61 | 0.6: 0.951897174598649 62 | 0.61: 0.953055388977942 63 | 0.62: 0.954985032256127 64 | 0.63: 0.956199606401013 65 | 0.64: 0.957250230848176 66 | 0.65: 0.958689000129844 67 | 0.66: 0.960455895301363 68 | 0.67: 0.961342514244196 69 | 0.68: 0.962936044827203 70 | 0.69: 0.964827439510959 71 | 0.7: 0.966785529778715 72 | 0.71: 0.968174134640714 73 | 0.72: 0.969813944137392 74 | 0.73: 0.971409261937727 75 | 0.74: 0.972530004578652 76 | 0.75: 0.973668488824432 77 | 0.76: 0.974642341870362 78 | 0.77: 0.976008729176383 79 | 0.78: 0.977155875644753 80 | 0.79: 0.978418810979857 81 | 0.8: 0.979501010595634 82 | 0.81: 0.980594016861641 83 | 0.82: 0.981990506802626 84 | 0.83: 0.983434155927019 85 | 0.84: 0.98433502683478 86 | 0.85: 0.985466305825542 87 | 0.86: 0.986316598986252 88 | 0.87: 0.987193187882002 89 | 0.88: 0.98770020514925 90 | 0.89: 0.988262855586541 91 | 0.9: 0.988710454351168 92 | 0.91: 0.989251092021853 93 | 0.92: 0.989782759199991 94 | 0.93: 0.990371501103215 95 | 0.94: 0.991172390892083 96 | 0.95: 0.992180427851925 97 | 0.96: 0.992921150016265 98 | 0.97: 0.99326859591264 99 | 0.98: 0.994591460602974 100 | 0.99: 0.995516073547993 101 | 1.0: 1.0 -------------------------------------------------------------------------------- /video_generation_evaluation/competitions/configs/background_mapping_table.yaml: -------------------------------------------------------------------------------- 1 | 0.0: 0.0 2 | 0.01: 0.873691544930448 3 | 0.02: 0.88392356992722 4 | 0.03: 0.888340769126807 5 | 0.04: 0.894395017892299 6 | 0.05: 0.899626435216563 7 | 0.06: 0.903145754159405 8 | 0.07: 0.905965662216789 9 | 0.08: 0.907634139293668 10 | 0.09: 0.909681980518171 11 | 0.1: 0.912059260929028 12 | 0.11: 0.914872300522044 13 | 0.12: 0.916864571230313 14 | 0.13: 0.91899572410357 15 | 0.14: 0.920360080000968 16 | 0.15: 0.921301105005809 17 | 0.16: 0.922499725160567 18 | 0.17: 0.923335310160083 19 | 0.18: 0.924364064416312 20 | 0.19: 0.925033502674768 21 | 0.2: 0.926479836367157 22 | 0.21: 0.927276633706106 23 | 0.22: 0.927840039415505 24 | 0.23: 0.928488115842048 25 | 0.24: 0.929855989179899 26 | 0.25: 0.93043699722034 27 | 0.26: 0.930961847243739 28 | 0.27: 0.931837518457107 29 | 0.28: 0.932535174404531 30 | 0.29: 0.933476108636716 31 | 0.3: 0.934152037140137 32 | 0.31: 0.934940306892267 33 | 0.32: 0.935567840962271 34 | 0.33: 0.936222006721211 35 | 0.34: 0.936694266597276 36 | 0.35: 0.937215165488639 37 | 0.36: 0.937728512599245 38 | 0.37: 0.938159241463336 39 | 0.38: 0.938786767968952 40 | 0.39: 0.939348915468468 41 | 0.4: 0.939684244791667 42 | 0.41: 0.940032821879841 43 | 0.42: 0.940740896511102 44 | 0.43: 0.941350394558482 45 | 0.44: 0.941967580545604 46 | 0.45: 0.942834956146721 47 | 0.46: 0.943218163003486 48 | 0.47: 0.944092961790763 49 | 0.48: 0.944922112017493 50 | 0.49: 0.945415133617351 51 | 0.5: 0.946057962880035 52 | 0.51: 0.946612672064614 53 | 0.52: 0.947050138277014 54 | 0.53: 0.947583230961948 55 | 0.54: 0.948510612332171 56 | 0.55: 0.949047688928156 57 | 0.56: 0.94972291646495 58 | 0.57: 0.950246513321392 59 | 0.58: 0.950660608096114 60 | 0.59: 0.951255542174994 61 | 0.6: 0.951911455307578 62 | 0.61: 0.952366960064065 63 | 0.62: 0.952950734149077 64 | 0.63: 0.953568790040828 65 | 0.64: 0.954187246845146 66 | 0.65: 0.954717288560225 67 | 0.66: 0.955338935014846 68 | 0.67: 0.95590276685144 69 | 0.68: 
0.956451298452427 70 | 0.69: 0.957104193394171 71 | 0.7: 0.957455075099245 72 | 0.71: 0.957910428567971 73 | 0.72: 0.958549581538052 74 | 0.73: 0.959168784695327 75 | 0.74: 0.959610176825136 76 | 0.75: 0.960120447751259 77 | 0.76: 0.960917058501969 78 | 0.77: 0.961979166666667 79 | 0.78: 0.962551626948586 80 | 0.79: 0.963566142505003 81 | 0.8: 0.964157551579041 82 | 0.81: 0.964602080408437 83 | 0.82: 0.964906362961529 84 | 0.83: 0.965452531951975 85 | 0.84: 0.966266180226084 86 | 0.85: 0.967015800998096 87 | 0.86: 0.968036075575297 88 | 0.87: 0.969119242996385 89 | 0.88: 0.969973438912019 90 | 0.89: 0.970532389196844 91 | 0.9: 0.971717108527789 92 | 0.91: 0.972427724793442 93 | 0.92: 0.973225634097437 94 | 0.93: 0.974180063197941 95 | 0.94: 0.975258326374096 96 | 0.95: 0.976684089973857 97 | 0.96: 0.978594319850568 98 | 0.97: 0.980095581086206 99 | 0.98: 0.981866938883779 100 | 0.99: 0.985895411744772 101 | 1.0: 1.0 102 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/gen_flow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import argparse 5 | import numpy as np 6 | import os.path as osp 7 | import torch.nn.functional as F 8 | 9 | sys.path.append('.') 10 | from utils.utils import read, write 11 | from flow_generation.liteflownet.run import estimate 12 | 13 | parser = argparse.ArgumentParser( 14 | prog = 'AMT', 15 | description = 'Flow generation', 16 | ) 17 | parser.add_argument('-r', '--root', default='data/vimeo_triplet') 18 | args = parser.parse_args() 19 | 20 | vimeo90k_dir = args.root 21 | vimeo90k_sequences_dir = osp.join(vimeo90k_dir, 'sequences') 22 | vimeo90k_flow_dir = osp.join(vimeo90k_dir, 'flow') 23 | 24 | def pred_flow(img1, img2): 25 | img1 = torch.from_numpy(img1).float().permute(2, 0, 1) / 255.0 26 | img2 = torch.from_numpy(img2).float().permute(2, 0, 1) / 255.0 27 | 28 | flow = estimate(img1, img2) 29 | 30 | flow = flow.permute(1, 2, 0).cpu().numpy() 31 | return flow 32 | 33 | print('Built Flow Path') 34 | if not osp.exists(vimeo90k_flow_dir): 35 | os.makedirs(vimeo90k_flow_dir) 36 | 37 | for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)): 38 | vimeo90k_sequences_path_dir = osp.join(vimeo90k_sequences_dir, sequences_path) 39 | vimeo90k_flow_path_dir = osp.join(vimeo90k_flow_dir, sequences_path) 40 | if not osp.exists(vimeo90k_flow_path_dir): 41 | os.mkdir(vimeo90k_flow_path_dir) 42 | 43 | for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)): 44 | vimeo90k_flow_id_dir = osp.join(vimeo90k_flow_path_dir, sequences_id) 45 | if not osp.exists(vimeo90k_flow_id_dir): 46 | os.mkdir(vimeo90k_flow_id_dir) 47 | 48 | for sequences_path in sorted(os.listdir(vimeo90k_sequences_dir)): 49 | vimeo90k_sequences_path_dir = os.path.join(vimeo90k_sequences_dir, sequences_path) 50 | vimeo90k_flow_path_dir = os.path.join(vimeo90k_flow_dir, sequences_path) 51 | 52 | for sequences_id in sorted(os.listdir(vimeo90k_sequences_path_dir)): 53 | vimeo90k_sequences_id_dir = os.path.join(vimeo90k_sequences_path_dir, sequences_id) 54 | vimeo90k_flow_id_dir = os.path.join(vimeo90k_flow_path_dir, sequences_id) 55 | 56 | img0_path = vimeo90k_sequences_id_dir + '/im1.png' 57 | imgt_path = vimeo90k_sequences_id_dir + '/im2.png' 58 | img1_path = vimeo90k_sequences_id_dir + '/im3.png' 59 | flow_t0_path = vimeo90k_flow_id_dir + '/flow_t0.flo' 60 | flow_t1_path = vimeo90k_flow_id_dir + 
'/flow_t1.flo' 61 | 62 | img0 = read(img0_path) 63 | imgt = read(imgt_path) 64 | img1 = read(img1_path) 65 | 66 | flow_t0 = pred_flow(imgt, img0) 67 | flow_t1 = pred_flow(imgt, img1) 68 | 69 | write(flow_t0_path, flow_t0) 70 | write(flow_t1_path, flow_t1) 71 | 72 | print('Written Sequences {}'.format(sequences_path)) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/meta_arch/centernet_detector.py: -------------------------------------------------------------------------------- 1 | import math 2 | import json 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | 7 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 8 | from detectron2.modeling import build_backbone, build_proposal_generator 9 | from detectron2.modeling import detector_postprocess 10 | from detectron2.structures import ImageList 11 | 12 | @META_ARCH_REGISTRY.register() 13 | class CenterNetDetector(nn.Module): 14 | def __init__(self, cfg): 15 | super().__init__() 16 | self.mean, self.std = cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD 17 | self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) 18 | self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) 19 | 20 | self.backbone = build_backbone(cfg) 21 | self.proposal_generator = build_proposal_generator( 22 | cfg, self.backbone.output_shape()) # TODO: change to a more precise name 23 | 24 | 25 | def forward(self, batched_inputs): 26 | if not self.training: 27 | return self.inference(batched_inputs) 28 | images = self.preprocess_image(batched_inputs) 29 | features = self.backbone(images.tensor) 30 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 31 | 32 | _, proposal_losses = self.proposal_generator( 33 | images, features, gt_instances) 34 | return proposal_losses 35 | 36 | 37 | @property 38 | def device(self): 39 | return self.pixel_mean.device 40 | 41 | 42 | @torch.no_grad() 43 | def inference(self, batched_inputs, do_postprocess=True): 44 | images = self.preprocess_image(batched_inputs) 45 | inp = images.tensor 46 | features = self.backbone(inp) 47 | proposals, _ = self.proposal_generator(images, features, None) 48 | 49 | processed_results = [] 50 | for results_per_image, input_per_image, image_size in zip( 51 | proposals, batched_inputs, images.image_sizes): 52 | if do_postprocess: 53 | height = input_per_image.get("height", image_size[0]) 54 | width = input_per_image.get("width", image_size[1]) 55 | r = detector_postprocess(results_per_image, height, width) 56 | processed_results.append({"instances": r}) 57 | else: 58 | r = results_per_image 59 | processed_results.append(r) 60 | return processed_results 61 | 62 | def preprocess_image(self, batched_inputs): 63 | """ 64 | Normalize, pad and batch the input images. 
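Each image is shifted and scaled by the configured pixel mean/std, then the batch is padded to the backbone's size divisibility via ImageList.from_tensors.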
65 | """ 66 | images = [x["image"].to(self.device) for x in batched_inputs] 67 | images = [(x - self.pixel_mean) / self.pixel_std for x in images] 68 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 69 | return images 70 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/imaging_quality.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | from torchvision import transforms 4 | from pyiqa.archs.musiq_arch import MUSIQ 5 | from toolkit.utils import load_video, load_dimension_info 6 | 7 | from .distributed import ( 8 | get_world_size, 9 | get_rank, 10 | all_gather, 11 | barrier, 12 | distribute_list_to_rank, 13 | gather_list_of_dict, 14 | ) 15 | 16 | 17 | def transform(images, preprocess_mode='shorter'): 18 | if preprocess_mode.startswith('shorter'): 19 | _, _, h, w = images.size() 20 | if min(h,w) > 512: 21 | scale = 512./min(h,w) 22 | images = transforms.Resize(size=( int(scale * h), int(scale * w) ), antialias=False)(images) 23 | if preprocess_mode == 'shorter_centercrop': 24 | images = transforms.CenterCrop(512)(images) 25 | 26 | elif preprocess_mode == 'longer': 27 | _, _, h, w = images.size() 28 | if max(h,w) > 512: 29 | scale = 512./max(h,w) 30 | images = transforms.Resize(size=( int(scale * h), int(scale * w) ), antialias=False)(images) 31 | 32 | elif preprocess_mode == 'None': 33 | return images / 255. 34 | 35 | else: 36 | raise ValueError("Please recheck imaging_quality_mode") 37 | return images / 255. 38 | 39 | def technical_quality(model, video_list, device, **kwargs): 40 | if 'imaging_quality_preprocessing_mode' not in kwargs: 41 | preprocess_mode = 'longer' 42 | else: 43 | preprocess_mode = kwargs['imaging_quality_preprocessing_mode'] 44 | video_results = [] 45 | for video_path in tqdm(video_list, disable=get_rank() > 0): 46 | images = load_video(video_path) 47 | images = transform(images, preprocess_mode) 48 | acc_score_video = 0. 49 | for i in range(len(images)): 50 | frame = images[i].unsqueeze(0).to(device) 51 | score = model(frame) 52 | acc_score_video += float(score) 53 | video_results.append({'video_path': video_path, 'video_results': acc_score_video/len(images)}) 54 | average_score = sum([o['video_results'] for o in video_results]) / len(video_results) 55 | average_score = average_score / 100. 56 | return average_score, video_results 57 | 58 | 59 | def compute_imaging_quality(json_dir, device, submodules_list, **kwargs): 60 | model_path = submodules_list['model_path'] 61 | 62 | model = MUSIQ(pretrained_model_path=model_path) 63 | model.to(device) 64 | model.training = False 65 | 66 | video_list, _ = load_dimension_info(json_dir, dimension='imaging_quality', lang='en') 67 | video_list = distribute_list_to_rank(video_list) 68 | all_results, video_results = technical_quality(model, video_list, device, **kwargs) 69 | if get_world_size() > 1: 70 | video_results = gather_list_of_dict(video_results) 71 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 72 | all_results = all_results / 100. 
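# Entries in video_results keep MUSIQ's native 0-100 scale; the gathered average above is rescaled to [0, 1] so both branches return comparable values.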
73 | return all_results, video_results 74 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/README.md: -------------------------------------------------------------------------------- 1 | # RAFT 2 | This repository contains the source code for our paper: 3 | 4 | [RAFT: Recurrent All Pairs Field Transforms for Optical Flow](https://arxiv.org/pdf/2003.12039.pdf)
5 | ECCV 2020
6 | Zachary Teed and Jia Deng
7 | 8 | 9 | 10 | ## Requirements 11 | The code has been tested with PyTorch 1.6 and Cuda 10.1. 12 | ```Shell 13 | conda create --name raft 14 | conda activate raft 15 | conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 matplotlib tensorboard scipy opencv -c pytorch 16 | ``` 17 | 18 | ## Demos 19 | Pretrained models can be downloaded by running 20 | ```Shell 21 | ./download_models.sh 22 | ``` 23 | or downloaded from [google drive](https://drive.google.com/drive/folders/1sWDsfuZ3Up38EUQt7-JDTT1HcGHuJgvT?usp=sharing) 24 | 25 | You can demo a trained model on a sequence of frames 26 | ```Shell 27 | python demo.py --model=models/raft-things.pth --path=demo-frames 28 | ``` 29 | 30 | ## Required Data 31 | To evaluate/train RAFT, you will need to download the required datasets. 32 | * [FlyingChairs](https://lmb.informatik.uni-freiburg.de/resources/datasets/FlyingChairs.en.html#flyingchairs) 33 | * [FlyingThings3D](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html) 34 | * [Sintel](http://sintel.is.tue.mpg.de/) 35 | * [KITTI](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=flow) 36 | * [HD1K](http://hci-benchmark.iwr.uni-heidelberg.de/) (optional) 37 | 38 | 39 | By default `datasets.py` will search for the datasets in these locations. You can create symbolic links to wherever the datasets were downloaded in the `datasets` folder 40 | 41 | ```Shell 42 | ├── datasets 43 | ├── Sintel 44 | ├── test 45 | ├── training 46 | ├── KITTI 47 | ├── testing 48 | ├── training 49 | ├── devkit 50 | ├── FlyingChairs_release 51 | ├── data 52 | ├── FlyingThings3D 53 | ├── frames_cleanpass 54 | ├── frames_finalpass 55 | ├── optical_flow 56 | ``` 57 | 58 | ## Evaluation 59 | You can evaluate a trained model using `evaluate.py` 60 | ```Shell 61 | python evaluate.py --model=models/raft-things.pth --dataset=sintel --mixed_precision 62 | ``` 63 | 64 | ## Training 65 | We used the following training schedule in our paper (2 GPUs). Training logs will be written to the `runs` which can be visualized using tensorboard 66 | ```Shell 67 | ./train_standard.sh 68 | ``` 69 | 70 | If you have a RTX GPU, training can be accelerated using mixed precision. You can expect similiar results in this setting (1 GPU) 71 | ```Shell 72 | ./train_mixed.sh 73 | ``` 74 | 75 | ## (Optional) Efficent Implementation 76 | You can optionally use our alternate (efficent) implementation by compiling the provided cuda extension 77 | ```Shell 78 | cd alt_cuda_corr && python setup.py install && cd .. 79 | ``` 80 | and running `demo.py` and `evaluate.py` with the `--alternate_corr` flag Note, this implementation is somewhat slower than all-pairs, but uses significantly less GPU memory during the forward pass. 81 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/modeling/layers/heatmap_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | # TODO: merge these two function 5 | def heatmap_focal_loss( 6 | inputs, 7 | targets, 8 | pos_inds, 9 | labels, 10 | alpha: float = -1, 11 | beta: float = 4, 12 | gamma: float = 2, 13 | reduction: str = 'sum', 14 | sigmoid_clamp: float = 1e-4, 15 | ignore_high_fp: float = -1., 16 | ): 17 | """ 18 | Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. 
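Concretely, this is the penalty-reduced pixel-wise focal loss used for CornerNet/CenterNet heatmaps: positives contribute -log(p) * (1 - p)**gamma and negatives contribute -log(1 - p) * p**gamma * (1 - y)**beta, where y is the target heatmap and p the clamped sigmoid prediction.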
19 | Args: 20 | inputs: (sum_l N*Hl*Wl, C) 21 | targets: (sum_l N*Hl*Wl, C) 22 | pos_inds: N 23 | labels: N 24 | Returns: 25 | Loss tensor with the reduction option applied. 26 | """ 27 | pred = torch.clamp(inputs.sigmoid_(), min=sigmoid_clamp, max=1-sigmoid_clamp) 28 | neg_weights = torch.pow(1 - targets, beta) 29 | pos_pred_pix = pred[pos_inds] # N x C 30 | pos_pred = pos_pred_pix.gather(1, labels.unsqueeze(1)) 31 | pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) 32 | neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights 33 | 34 | if ignore_high_fp > 0: 35 | not_high_fp = (pred < ignore_high_fp).float() 36 | neg_loss = not_high_fp * neg_loss 37 | 38 | if reduction == "sum": 39 | pos_loss = pos_loss.sum() 40 | neg_loss = neg_loss.sum() 41 | 42 | if alpha >= 0: 43 | pos_loss = alpha * pos_loss 44 | neg_loss = (1 - alpha) * neg_loss 45 | 46 | return - pos_loss, - neg_loss 47 | 48 | heatmap_focal_loss_jit = torch.jit.script(heatmap_focal_loss) 49 | # heatmap_focal_loss_jit = heatmap_focal_loss 50 | 51 | def binary_heatmap_focal_loss( 52 | inputs, 53 | targets, 54 | pos_inds, 55 | alpha: float = -1, 56 | beta: float = 4, 57 | gamma: float = 2, 58 | sigmoid_clamp: float = 1e-4, 59 | ignore_high_fp: float = -1., 60 | ): 61 | """ 62 | Args: 63 | inputs: (sum_l N*Hl*Wl,) 64 | targets: (sum_l N*Hl*Wl,) 65 | pos_inds: N 66 | Returns: 67 | Loss tensor with the reduction option applied. 68 | """ 69 | pred = torch.clamp(inputs.sigmoid_(), min=sigmoid_clamp, max=1-sigmoid_clamp) 70 | neg_weights = torch.pow(1 - targets, beta) 71 | for i, ind in enumerate(pos_inds): 72 | if ind >= pred.shape[0]: 73 | print('%'*100) 74 | print(pred.shape, ind, pos_inds) 75 | pos_inds[i] = pred.shape[0] - 1 76 | pos_pred = pred[pos_inds] # N 77 | pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) 78 | neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights 79 | if ignore_high_fp > 0: 80 | not_high_fp = (pred < ignore_high_fp).float() 81 | neg_loss = not_high_fp * neg_loss 82 | 83 | pos_loss = - pos_loss.sum() 84 | neg_loss = - neg_loss.sum() 85 | 86 | if alpha >= 0: 87 | pos_loss = alpha * pos_loss 88 | neg_loss = (1 - alpha) * neg_loss 89 | 90 | return pos_loss, neg_loss 91 | 92 | # binary_heatmap_focal_loss_jit = torch.jit.script(binary_heatmap_focal_loss) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/fvd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from torchvision import transforms 4 | import av 5 | import numpy as np 6 | from tqdm import tqdm 7 | from numpy import cov 8 | from numpy import mean 9 | 10 | 11 | class I3DFeatureExtractor(torch.nn.Module): 12 | def __init__(self): 13 | super(I3DFeatureExtractor, self).__init__() 14 | self.model = torchvision.models.video.r3d_18(pretrained=True) 15 | self.model.fc = torch.nn.Identity() 16 | 17 | def forward(self, x): 18 | return self.model(x) 19 | 20 | def extract_features(video_path, model, device, transform): 21 | try: 22 | container = av.open(video_path) 23 | frames = [] 24 | for frame in container.decode(video=0): 25 | img = frame.to_rgb().to_ndarray() 26 | img = transform(img) 27 | frames.append(img) 28 | if len(frames) == 16: 29 | break 30 | if len(frames) < 16: 31 | while len(frames) < 16: 32 | frames.append(frames[-1]) 33 | video_tensor = torch.stack(frames).permute(1, 0, 2, 3).unsqueeze(0).to(device) 34 | with torch.no_grad(): 35 | features = 
model(video_tensor) 36 | return features.cpu().numpy().flatten() 37 | except Exception as e: 38 | print(f"Error processing {video_path}: {e}") 39 | return None 40 | 41 | def get_dataset_features(video_dir, model, device): 42 | transform = transforms.Compose([ 43 | transforms.ToPILImage(), 44 | transforms.Resize((224, 224)), 45 | transforms.ToTensor(), 46 | transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], 47 | std=[0.22803, 0.22145, 0.216989]), 48 | ]) 49 | features = [] 50 | for video_file in tqdm(os.listdir(video_dir)): 51 | video_path = os.path.join(video_dir, video_file) 52 | feature = extract_features(video_path, model, model.device, transform) 53 | if feature is not None: 54 | features.append(feature) 55 | return np.array(features) 56 | 57 | import os 58 | 59 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 60 | model = I3DFeatureExtractor().to(device) 61 | model.eval() 62 | 63 | real_video_dir = './FVD/real_videos/architecture' 64 | real_features = get_dataset_features(real_video_dir, model, device) 65 | 66 | generated_video_dir = './sampled_videos/cogvideox-5b/architecture' 67 | generated_features = get_dataset_features(generated_video_dir, model, device) 68 | 69 | mu_real = mean(real_features, axis=0) 70 | mu_generated = mean(generated_features, axis=0) 71 | 72 | sigma_real = cov(real_features, rowvar=False) 73 | sigma_generated = cov(generated_features, rowvar=False) 74 | 75 | from scipy.linalg import sqrtm 76 | 77 | def calculate_fvd(mu1, sigma1, mu2, sigma2): 78 | diff = mu1 - mu2 79 | covmean, _ = sqrtm(sigma1.dot(sigma2), disp=False) 80 | if np.iscomplexobj(covmean): 81 | covmean = covmean.real 82 | fvd = diff.dot(diff) + np.trace(sigma1 + sigma2 - 2 * covmean) 83 | return fvd 84 | 85 | fvd_value = calculate_fvd(mu_real, sigma_real, mu_generated, sigma_generated) 86 | print(f"FVD: {fvd_value}") -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/flow_generation/liteflownet/README.md: -------------------------------------------------------------------------------- 1 | # pytorch-liteflownet 2 | This is a personal reimplementation of LiteFlowNet [1] using PyTorch. Should you be making use of this work, please cite the paper accordingly. Also, make sure to adhere to the licensing terms of the authors. Should you be making use of this particular implementation, please acknowledge it appropriately [2]. 3 | 4 | Paper 5 | 6 | For the original Caffe version of this work, please see: https://github.com/twhui/LiteFlowNet 7 |
8 | Other optical flow implementations from me: [pytorch-pwc](https://github.com/sniklaus/pytorch-pwc), [pytorch-unflow](https://github.com/sniklaus/pytorch-unflow), [pytorch-spynet](https://github.com/sniklaus/pytorch-spynet) 9 | 10 | ## setup 11 | The correlation layer is implemented in CUDA using CuPy, which is why CuPy is a required dependency. It can be installed using `pip install cupy` or alternatively using one of the provided [binary packages](https://docs.cupy.dev/en/stable/install.html#installing-cupy) as outlined in the CuPy repository. If you would like to use Docker, you can take a look at [this](https://github.com/sniklaus/pytorch-liteflownet/pull/43) pull request to get started. 12 | 13 | ## usage 14 | To run it on your own pair of images, use the following command. You can choose between three models, please make sure to see their paper / the code for more details. 15 | 16 | ``` 17 | python run.py --model default --one ./images/one.png --two ./images/two.png --out ./out.flo 18 | ``` 19 | 20 | I am afraid that I cannot guarantee that this reimplementation is correct. However, it produced results pretty much identical to the implementation of the original authors in the examples that I tried. There are some numerical deviations that stem from differences in the `DownsampleLayer` of Caffe and the `torch.nn.functional.interpolate` function of PyTorch. Please feel free to contribute to this repository by submitting issues and pull requests. 21 | 22 | ## comparison 23 |

*(Comparison image)*

24 | 25 | ## license 26 | As stated in the licensing terms of the authors of the paper, their material is provided for research purposes only. Please make sure to further consult their licensing terms. 27 | 28 | ## references 29 | ``` 30 | [1] @inproceedings{Hui_CVPR_2018, 31 | author = {Tak-Wai Hui and Xiaoou Tang and Chen Change Loy}, 32 | title = {{LiteFlowNet}: A Lightweight Convolutional Neural Network for Optical Flow Estimation}, 33 | booktitle = {IEEE Conference on Computer Vision and Pattern Recognition}, 34 | year = {2018} 35 | } 36 | ``` 37 | 38 | ``` 39 | [2] @misc{pytorch-liteflownet, 40 | author = {Simon Niklaus}, 41 | title = {A Reimplementation of {LiteFlowNet} Using {PyTorch}}, 42 | year = {2019}, 43 | howpublished = {\url{https://github.com/sniklaus/pytorch-liteflownet}} 44 | } 45 | ``` -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/umt/functional.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import cv2 3 | import numpy as np 4 | import PIL 5 | import torch 6 | 7 | 8 | def _is_tensor_clip(clip): 9 | return torch.is_tensor(clip) and clip.ndimension() == 4 10 | 11 | 12 | def crop_clip(clip, min_h, min_w, h, w): 13 | if isinstance(clip[0], np.ndarray): 14 | cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip] 15 | 16 | elif isinstance(clip[0], PIL.Image.Image): 17 | cropped = [ 18 | img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip 19 | ] 20 | else: 21 | raise TypeError('Expected numpy.ndarray or PIL.Image' + 22 | 'but got list of {0}'.format(type(clip[0]))) 23 | return cropped 24 | 25 | 26 | def resize_clip(clip, size, interpolation='bilinear'): 27 | if isinstance(clip[0], np.ndarray): 28 | if isinstance(size, numbers.Number): 29 | im_h, im_w, im_c = clip[0].shape 30 | # Min spatial dim already matches minimal size 31 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 32 | and im_h == size): 33 | return clip 34 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 35 | size = (new_w, new_h) 36 | else: 37 | size = size[0], size[1] 38 | if interpolation == 'bilinear': 39 | np_inter = cv2.INTER_LINEAR 40 | else: 41 | np_inter = cv2.INTER_NEAREST 42 | scaled = [ 43 | cv2.resize(img, size, interpolation=np_inter) for img in clip 44 | ] 45 | elif isinstance(clip[0], PIL.Image.Image): 46 | if isinstance(size, numbers.Number): 47 | im_w, im_h = clip[0].size 48 | # Min spatial dim already matches minimal size 49 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 50 | and im_h == size): 51 | return clip 52 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 53 | size = (new_w, new_h) 54 | else: 55 | size = size[1], size[0] 56 | if interpolation == 'bilinear': 57 | pil_inter = PIL.Image.BILINEAR 58 | else: 59 | pil_inter = PIL.Image.NEAREST 60 | scaled = [img.resize(size, pil_inter) for img in clip] 61 | else: 62 | raise TypeError('Expected numpy.ndarray or PIL.Image' + 63 | 'but got list of {0}'.format(type(clip[0]))) 64 | return scaled 65 | 66 | 67 | def get_resize_sizes(im_h, im_w, size): 68 | if im_w < im_h: 69 | ow = size 70 | oh = int(size * im_h / im_w) 71 | else: 72 | oh = size 73 | ow = int(size * im_w / im_h) 74 | return oh, ow 75 | 76 | 77 | def normalize(clip, mean, std, inplace=False): 78 | if not _is_tensor_clip(clip): 79 | raise TypeError('tensor is not a torch clip.') 80 | 81 | if not inplace: 82 | clip = clip.clone() 83 | 84 | dtype = clip.dtype 85 | mean = torch.as_tensor(mean, dtype=dtype, 
device=clip.device) 86 | std = torch.as_tensor(std, dtype=dtype, device=clip.device) 87 | clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) 88 | 89 | return clip 90 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/amt/networks/blocks/multi_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from toolkit.third_party.amt.utils.flow_utils import warp 4 | from toolkit.third_party.amt.networks.blocks.ifrnet import ( 5 | convrelu, resize, 6 | ResBlock, 7 | ) 8 | 9 | 10 | def multi_flow_combine(comb_block, img0, img1, flow0, flow1, 11 | mask=None, img_res=None, mean=None): 12 | ''' 13 | A parallel implementation of multiple flow field warping 14 | comb_block: An nn.Seqential object. 15 | img shape: [b, c, h, w] 16 | flow shape: [b, 2*num_flows, h, w] 17 | mask (opt): 18 | If 'mask' is None, the function conduct a simple average. 19 | img_res (opt): 20 | If 'img_res' is None, the function adds zero instead. 21 | mean (opt): 22 | If 'mean' is None, the function adds zero instead. 23 | ''' 24 | b, c, h, w = flow0.shape 25 | num_flows = c // 2 26 | flow0 = flow0.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w) 27 | flow1 = flow1.reshape(b, num_flows, 2, h, w).reshape(-1, 2, h, w) 28 | 29 | mask = mask.reshape(b, num_flows, 1, h, w 30 | ).reshape(-1, 1, h, w) if mask is not None else None 31 | img_res = img_res.reshape(b, num_flows, 3, h, w 32 | ).reshape(-1, 3, h, w) if img_res is not None else 0 33 | img0 = torch.stack([img0] * num_flows, 1).reshape(-1, 3, h, w) 34 | img1 = torch.stack([img1] * num_flows, 1).reshape(-1, 3, h, w) 35 | mean = torch.stack([mean] * num_flows, 1).reshape(-1, 1, 1, 1 36 | ) if mean is not None else 0 37 | 38 | img0_warp = warp(img0, flow0) 39 | img1_warp = warp(img1, flow1) 40 | img_warps = mask * img0_warp + (1 - mask) * img1_warp + mean + img_res 41 | img_warps = img_warps.reshape(b, num_flows, 3, h, w) 42 | imgt_pred = img_warps.mean(1) + comb_block(img_warps.view(b, -1, h, w)) 43 | return imgt_pred 44 | 45 | 46 | class MultiFlowDecoder(nn.Module): 47 | def __init__(self, in_ch, skip_ch, num_flows=3): 48 | super(MultiFlowDecoder, self).__init__() 49 | self.num_flows = num_flows 50 | self.convblock = nn.Sequential( 51 | convrelu(in_ch*3+4, in_ch*3), 52 | ResBlock(in_ch*3, skip_ch), 53 | nn.ConvTranspose2d(in_ch*3, 8*num_flows, 4, 2, 1, bias=True) 54 | ) 55 | 56 | def forward(self, ft_, f0, f1, flow0, flow1): 57 | n = self.num_flows 58 | f0_warp = warp(f0, flow0) 59 | f1_warp = warp(f1, flow1) 60 | out = self.convblock(torch.cat([ft_, f0_warp, f1_warp, flow0, flow1], 1)) 61 | delta_flow0, delta_flow1, mask, img_res = torch.split(out, [2*n, 2*n, n, 3*n], 1) 62 | mask = torch.sigmoid(mask) 63 | 64 | flow0 = delta_flow0 + 2.0 * resize(flow0, scale_factor=2.0 65 | ).repeat(1, self.num_flows, 1, 1) 66 | flow1 = delta_flow1 + 2.0 * resize(flow1, scale_factor=2.0 67 | ).repeat(1, self.num_flows, 1, 1) 68 | 69 | return flow0, flow1, mask, img_res 70 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/background_consistency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | import numpy as np 5 | import clip 6 | from PIL import Image 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from toolkit.utils import 
load_video, load_dimension_info, clip_transform 11 | from tqdm import tqdm 12 | 13 | from .distributed import ( 14 | get_world_size, 15 | get_rank, 16 | all_gather, 17 | barrier, 18 | distribute_list_to_rank, 19 | gather_list_of_dict, 20 | ) 21 | 22 | 23 | def background_consistency(clip_model, preprocess, video_list, device, read_frame=False): 24 | sim = 0.0 25 | cnt = 0 26 | video_results = [] 27 | image_transform = clip_transform(224) 28 | for video_path in tqdm(video_list, disable=get_rank() > 0): 29 | video_sim = 0.0 30 | cnt_per_video = 0 31 | if read_frame: 32 | video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_') 33 | tmp_paths = [os.path.join(video_path, f) for f in sorted(os.listdir(video_path))] 34 | images = [] 35 | for tmp_path in tmp_paths: 36 | images.append(preprocess(Image.open(tmp_path))) 37 | images = torch.stack(images) 38 | else: 39 | images = load_video(video_path) 40 | images = image_transform(images) 41 | images = images.to(device) 42 | image_features = clip_model.encode_image(images) 43 | image_features = F.normalize(image_features, dim=-1, p=2) 44 | for i in range(len(image_features)): 45 | image_feature = image_features[i].unsqueeze(0) 46 | if i == 0: 47 | first_image_feature = image_feature 48 | else: 49 | sim_pre = max(0.0, F.cosine_similarity(former_image_feature, image_feature).item()) 50 | sim_fir = max(0.0, F.cosine_similarity(first_image_feature, image_feature).item()) 51 | cur_sim = (sim_pre + sim_fir) / 2 52 | video_sim += cur_sim 53 | cnt += 1 54 | cnt_per_video += 1 55 | former_image_feature = image_feature 56 | sim_per_image = video_sim / (len(image_features) - 1) 57 | sim += video_sim 58 | video_results.append({ 59 | 'video_path': video_path, 60 | 'video_results': sim_per_image, 61 | 'video_sim': video_sim, 62 | 'cnt_per_video': cnt_per_video}) 63 | # sim_per_video = sim / (len(video_list) - 1) 64 | sim_per_frame = sim / cnt 65 | return sim_per_frame, video_results 66 | 67 | 68 | def compute_background_consistency(json_dir, device, submodules_list, **kwargs): 69 | vit_path, read_frame = submodules_list[0], submodules_list[1] 70 | clip_model, preprocess = clip.load(vit_path, device=device) 71 | video_list, _ = load_dimension_info(json_dir, dimension='background_consistency', lang='en') 72 | video_list = distribute_list_to_rank(video_list) 73 | all_results, video_results = background_consistency(clip_model, preprocess, video_list, device, read_frame) 74 | if get_world_size() > 1: 75 | video_results = gather_list_of_dict(video_results) 76 | sim = sum([d['video_sim'] for d in video_results]) 77 | cnt = sum([d['cnt_per_video'] for d in video_results]) 78 | all_results = sim / cnt 79 | return all_results, video_results 80 | 81 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/scene.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import numpy as np 6 | from tqdm import tqdm 7 | from toolkit.utils import load_video, load_dimension_info, tag2text_transform 8 | from toolkit.third_party.tag2Text.tag2text import tag2text_caption 9 | 10 | import logging 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 22 | logger = logging.getLogger(__name__) 23 | 24 | 
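# Scene dimension: caption sampled frames with Tag2Text and count a frame as correct when every word of the
# expected scene description appears in its caption; the final score is the fraction of such frames.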
25 | def get_caption(model, image_arrays): 26 | caption, tag_predict = model.generate(image_arrays, tag_input = None, return_tag_predict = True) 27 | return caption 28 | 29 | def check_generate(key_info, predictions): 30 | cur_cnt = 0 31 | key = key_info['scene'] 32 | for pred in predictions: 33 | q_flag = [q in pred for q in key.split(' ')] 34 | if len(q_flag) == sum(q_flag): 35 | cur_cnt +=1 36 | return cur_cnt 37 | 38 | def scene(model, video_dict, device): 39 | success_frame_count, frame_count = 0, 0 40 | video_results = [] 41 | transform = tag2text_transform(384) 42 | for info in tqdm(video_dict, disable=get_rank() > 0): 43 | if 'auxiliary_info' not in info: 44 | raise "Auxiliary info is not in json, please check your json." 45 | scene_info = info['auxiliary_info']['scene'] 46 | for video_path in info['video_list']: 47 | video_array = load_video(video_path, num_frames=16, return_tensor=False, width=384, height=384) 48 | video_tensor_list = [] 49 | for i in video_array: 50 | video_tensor_list.append(transform(i).to(device).unsqueeze(0)) 51 | video_tensor = torch.cat(video_tensor_list) 52 | cur_video_pred = get_caption(model, video_tensor) 53 | cur_success_frame_count = check_generate(scene_info, cur_video_pred) 54 | cur_success_frame_rate = cur_success_frame_count/len(cur_video_pred) 55 | success_frame_count += cur_success_frame_count 56 | frame_count += len(cur_video_pred) 57 | video_results.append({ 58 | 'video_path': video_path, 59 | 'video_results': cur_success_frame_rate, 60 | 'success_frame_count': cur_success_frame_count, 61 | 'frame_count': len(cur_video_pred)}) 62 | success_rate = success_frame_count / frame_count 63 | return success_rate, video_results 64 | 65 | 66 | def compute_scene(json_dir, device, submodules_dict, **kwargs): 67 | model = tag2text_caption(**submodules_dict) 68 | model.eval() 69 | model = model.to(device) 70 | logger.info("Initialize caption model success") 71 | _, prompt_dict_ls = load_dimension_info(json_dir, dimension='scene', lang='en') 72 | prompt_dict_ls = distribute_list_to_rank(prompt_dict_ls) 73 | all_results, video_results = scene(model, prompt_dict_ls, device) 74 | if get_world_size() > 1: 75 | video_results = gather_list_of_dict(video_results) 76 | success_frame_count = sum([d['success_frame_count'] for d in video_results]) 77 | frame_count = sum([d['frame_count'] for d in video_results]) 78 | all_results = success_frame_count / frame_count 79 | return all_results, video_results 80 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/RAFT/core/corr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from .utils_core.utils import bilinear_sampler, coords_grid 4 | 5 | try: 6 | import alt_cuda_corr 7 | except: 8 | # alt_cuda_corr is not compiled 9 | pass 10 | 11 | 12 | class CorrBlock: 13 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 14 | self.num_levels = num_levels 15 | self.radius = radius 16 | self.corr_pyramid = [] 17 | 18 | # all pairs correlation 19 | corr = CorrBlock.corr(fmap1, fmap2) 20 | 21 | batch, h1, w1, dim, h2, w2 = corr.shape 22 | corr = corr.reshape(batch*h1*w1, dim, h2, w2) 23 | 24 | self.corr_pyramid.append(corr) 25 | for i in range(self.num_levels-1): 26 | corr = F.avg_pool2d(corr, 2, stride=2) 27 | self.corr_pyramid.append(corr) 28 | 29 | def __call__(self, coords): 30 | r = self.radius 31 | coords = coords.permute(0, 2, 3, 1) 32 | batch, h1, w1, 
_ = coords.shape 33 | 34 | out_pyramid = [] 35 | for i in range(self.num_levels): 36 | corr = self.corr_pyramid[i] 37 | dx = torch.linspace(-r, r, 2*r+1, device=coords.device) 38 | dy = torch.linspace(-r, r, 2*r+1, device=coords.device) 39 | delta = torch.stack(torch.meshgrid(dy, dx), axis=-1) 40 | 41 | centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i 42 | delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2) 43 | coords_lvl = centroid_lvl + delta_lvl 44 | 45 | corr = bilinear_sampler(corr, coords_lvl) 46 | corr = corr.view(batch, h1, w1, -1) 47 | out_pyramid.append(corr) 48 | 49 | out = torch.cat(out_pyramid, dim=-1) 50 | return out.permute(0, 3, 1, 2).contiguous().float() 51 | 52 | @staticmethod 53 | def corr(fmap1, fmap2): 54 | batch, dim, ht, wd = fmap1.shape 55 | fmap1 = fmap1.view(batch, dim, ht*wd) 56 | fmap2 = fmap2.view(batch, dim, ht*wd) 57 | 58 | corr = torch.matmul(fmap1.transpose(1, 2), fmap2) 59 | corr = corr.view(batch, ht, wd, 1, ht, wd) 60 | return corr / torch.sqrt(torch.tensor(dim).float()) 61 | 62 | 63 | class AlternateCorrBlock: 64 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 65 | self.num_levels = num_levels 66 | self.radius = radius 67 | 68 | self.pyramid = [(fmap1, fmap2)] 69 | for i in range(self.num_levels): 70 | fmap1 = F.avg_pool2d(fmap1, 2, stride=2) 71 | fmap2 = F.avg_pool2d(fmap2, 2, stride=2) 72 | self.pyramid.append((fmap1, fmap2)) 73 | 74 | def __call__(self, coords): 75 | coords = coords.permute(0, 2, 3, 1) 76 | B, H, W, _ = coords.shape 77 | dim = self.pyramid[0][0].shape[1] 78 | 79 | corr_list = [] 80 | for i in range(self.num_levels): 81 | r = self.radius 82 | fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous() 83 | fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() 84 | 85 | coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() 86 | corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) 87 | corr_list.append(corr.squeeze(1)) 88 | 89 | corr = torch.stack(corr_list, dim=1) 90 | corr = corr.reshape(B, -1, H, W) 91 | return corr / torch.sqrt(torch.tensor(dim).float()) 92 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/overall_consistency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | 5 | import torch 6 | import clip 7 | from tqdm import tqdm 8 | from toolkit.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, CACHE_DIR 9 | from toolkit.third_party.ViCLIP.viclip import ViCLIP 10 | from toolkit.third_party.ViCLIP.simple_tokenizer import SimpleTokenizer 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | 22 | def get_text_features(model, input_text, tokenizer, text_feature_dict={}): 23 | if input_text in text_feature_dict: 24 | return text_feature_dict[input_text] 25 | text_template= f"{input_text}" 26 | with torch.no_grad(): 27 | text_features = model.encode_text(text_template).float() 28 | text_features /= text_features.norm(dim=-1, keepdim=True) 29 | text_feature_dict[input_text] = text_features 30 | return text_features 31 | 32 | def get_vid_features(model, input_frames): 33 | with torch.no_grad(): 34 | clip_feat = model.encode_vision(input_frames,test=True).float() 35 | clip_feat /= clip_feat.norm(dim=-1, keepdim=True) 36 | return clip_feat 37 | 38 | def 
get_predict_label(clip_feature, text_feats_tensor, top=5): 39 | label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) 40 | top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) 41 | return top_probs, top_labels 42 | 43 | def overall_consistency(clip_model, video_dict, tokenizer, device, sample="middle"): 44 | sim = [] 45 | video_results = [] 46 | image_transform = clip_transform(224) 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | query = info['prompt'] 49 | # text = clip.tokenize([query]).to(device) 50 | video_list = info['video_list'] 51 | for video_path in video_list: 52 | cur_video = [] 53 | with torch.no_grad(): 54 | images = read_frames_decord_by_fps(video_path, num_frames=8, sample=sample) 55 | images = image_transform(images) 56 | images = images.to(device) 57 | clip_feat = get_vid_features(clip_model,images.unsqueeze(0)) 58 | text_feat = get_text_features(clip_model, query, tokenizer) 59 | logit_per_text = clip_feat @ text_feat.T 60 | score_per_video = float(logit_per_text[0][0].cpu()) 61 | sim.append(score_per_video) 62 | video_results.append({'video_path': video_path, 'video_results': score_per_video}) 63 | avg_score = np.mean(sim) 64 | return avg_score, video_results 65 | 66 | def compute_overall_consistency(json_dir, device, submodules_list, **kwargs): 67 | tokenizer = SimpleTokenizer(os.path.join(CACHE_DIR, "ViCLIP/bpe_simple_vocab_16e6.txt.gz")) 68 | viclip = ViCLIP(tokenizer= tokenizer, **submodules_list).to(device) 69 | _, video_dict = load_dimension_info(json_dir, dimension='overall_consistency', lang='en') 70 | video_dict = distribute_list_to_rank(video_dict) 71 | all_results, video_results = overall_consistency(viclip, video_dict, tokenizer, device) 72 | if get_world_size() > 1: 73 | video_results = gather_list_of_dict(video_results) 74 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 75 | return all_results, video_results 76 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/data/datasets/vg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from fvcore.common.timer import Timer 4 | from detectron2.structures import BoxMode 5 | from fvcore.common.file_io import PathManager 6 | from detectron2.data import DatasetCatalog, MetadataCatalog 7 | from lvis import LVIS 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | __all__ = ["load_vg_json", "register_vg_instances"] 12 | 13 | 14 | def register_vg_instances(name, metadata, json_file, image_root): 15 | """ 16 | """ 17 | DatasetCatalog.register(name, lambda: load_vg_json( 18 | json_file, image_root, name)) 19 | MetadataCatalog.get(name).set( 20 | json_file=json_file, image_root=image_root, 21 | evaluator_type="vg", **metadata 22 | ) 23 | 24 | 25 | def get_vg_meta(): 26 | categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}] 27 | vg_categories = sorted(categories, key=lambda x: x["id"]) 28 | thing_classes = [k["name"] for k in vg_categories] 29 | meta = {"thing_classes": thing_classes} 30 | return meta 31 | 32 | 33 | def load_vg_json(json_file, image_root, dataset_name=None): 34 | 35 | json_file = PathManager.get_local_path(json_file) 36 | 37 | timer = Timer() 38 | lvis_api = LVIS(json_file) 39 | if timer.seconds() > 1: 40 | logger.info("Loading {} takes {:.2f} seconds.".format( 41 | json_file, timer.seconds())) 42 | 43 | img_ids = sorted(lvis_api.imgs.keys()) 44 | 
imgs = lvis_api.load_imgs(img_ids) 45 | anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] 46 | 47 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 48 | assert len(set(ann_ids)) == len(ann_ids), \ 49 | "Annotation ids in '{}' are not unique".format(json_file) 50 | 51 | imgs_anns = list(zip(imgs, anns)) 52 | logger.info("Loaded {} images in the LVIS v1 format from {}".format( 53 | len(imgs_anns), json_file)) 54 | 55 | dataset_dicts = [] 56 | 57 | for (img_dict, anno_dict_list) in imgs_anns: 58 | record = {} 59 | if "file_name" in img_dict: 60 | file_name = img_dict["file_name"] 61 | record["file_name"] = os.path.join(image_root, file_name) 62 | 63 | record["height"] = int(img_dict["height"]) 64 | record["width"] = int(img_dict["width"]) 65 | image_id = record["image_id"] = img_dict["id"] 66 | 67 | objs = [] 68 | for anno in anno_dict_list: 69 | assert anno["image_id"] == image_id 70 | if anno.get('iscrowd', 0) > 0: 71 | continue 72 | obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} 73 | obj["category_id"] = 0 74 | obj["object_description"] = anno["caption"] 75 | 76 | objs.append(obj) 77 | record["annotations"] = objs 78 | if len(record["annotations"]) == 0: 79 | continue 80 | record["task"] = "DenseCap" 81 | dataset_dicts.append(record) 82 | 83 | return dataset_dicts 84 | 85 | 86 | _CUSTOM_SPLITS_LVIS = { 87 | "vg_train": ("vg/images", "vg/annotations/train.json"), 88 | "vg_test": ("vg/images", "vg/annotations/test.json"), 89 | } 90 | 91 | 92 | for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items(): 93 | register_vg_instances( 94 | key, 95 | get_vg_meta(), 96 | os.path.join("datasets", json_file) if "://" not in json_file else json_file, 97 | os.path.join("datasets", image_root), 98 | ) -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/temporal_style.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | 5 | import torch 6 | import clip 7 | from tqdm import tqdm 8 | from toolkit.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, CACHE_DIR 9 | from toolkit.third_party.ViCLIP.viclip import ViCLIP 10 | from toolkit.third_party.ViCLIP.simple_tokenizer import SimpleTokenizer 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | 22 | def get_text_features(model, input_text, tokenizer, text_feature_dict={}): 23 | if input_text in text_feature_dict: 24 | return text_feature_dict[input_text] 25 | text_template= f"{input_text}" 26 | with torch.no_grad(): 27 | text_features = model.encode_text(text_template).float() 28 | text_features /= text_features.norm(dim=-1, keepdim=True) 29 | text_feature_dict[input_text] = text_features 30 | return text_features 31 | 32 | def get_vid_features(model, input_frames): 33 | with torch.no_grad(): 34 | clip_feat = model.encode_vision(input_frames,test=True).float() 35 | clip_feat /= clip_feat.norm(dim=-1, keepdim=True) 36 | return clip_feat 37 | 38 | def get_predict_label(clip_feature, text_feats_tensor, top=5): 39 | label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) 40 | top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) 41 | return top_probs, top_labels 42 | 43 | def temporal_style(clip_model, video_dict, tokenizer, device, sample="middle"): 44 | sim = [] 45 | 
video_results = [] 46 | image_transform = clip_transform(224) 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | query = info['prompt'] 49 | # text = clip.tokenize([query]).to(device) 50 | video_list = info['video_list'] 51 | for video_path in video_list: 52 | cur_video = [] 53 | with torch.no_grad(): 54 | # images = load_video(video_path, num_frames=8) 55 | images = read_frames_decord_by_fps(video_path, num_frames=8, sample=sample) 56 | images = image_transform(images) 57 | images = images.to(device) 58 | clip_feat = get_vid_features(clip_model,images.unsqueeze(0)) 59 | text_feat = get_text_features(clip_model, query, tokenizer) 60 | logit_per_text = clip_feat @ text_feat.T 61 | score_per_video = float(logit_per_text[0][0].cpu()) 62 | sim.append(score_per_video) 63 | video_results.append({'video_path': video_path, 'video_results': score_per_video}) 64 | avg_score = np.mean(sim) 65 | return avg_score, video_results 66 | 67 | def compute_temporal_style(json_dir, device, submodules_list, **kwargs): 68 | tokenizer = SimpleTokenizer(os.path.join(CACHE_DIR, "ViCLIP/bpe_simple_vocab_16e6.txt.gz")) 69 | viclip = ViCLIP(tokenizer= tokenizer, **submodules_list).to(device) 70 | _, video_dict = load_dimension_info(json_dir, dimension='temporal_style', lang='en') 71 | video_dict = distribute_list_to_rank(video_dict) 72 | all_results, video_results = temporal_style(viclip, video_dict, tokenizer, device) 73 | if get_world_size() > 1: 74 | video_results = gather_list_of_dict(video_results) 75 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 76 | return all_results, video_results 77 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/subject_consistency.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import cv2 4 | import json 5 | import numpy as np 6 | from PIL import Image 7 | from tqdm import tqdm 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torchvision.transforms as transforms 13 | 14 | from toolkit.utils import load_video, load_dimension_info, dino_transform, dino_transform_Image 15 | import logging 16 | 17 | from .distributed import ( 18 | get_world_size, 19 | get_rank, 20 | all_gather, 21 | barrier, 22 | distribute_list_to_rank, 23 | gather_list_of_dict, 24 | ) 25 | 26 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | def subject_consistency(model, video_list, device, read_frame): 31 | sim = 0.0 32 | cnt = 0 33 | video_results = [] 34 | if read_frame: 35 | image_transform = dino_transform_Image(224) 36 | else: 37 | image_transform = dino_transform(224) 38 | for video_path in tqdm(video_list, disable=get_rank() > 0): 39 | video_sim = 0.0 40 | if read_frame: 41 | video_path = video_path[:-4].replace('videos', 'frames').replace(' ', '_') 42 | tmp_paths = [os.path.join(video_path, f) for f in sorted(os.listdir(video_path))] 43 | images = [] 44 | for tmp_path in tmp_paths: 45 | images.append(image_transform(Image.open(tmp_path))) 46 | else: 47 | images = load_video(video_path) 48 | images = image_transform(images) 49 | for i in range(len(images)): 50 | with torch.no_grad(): 51 | image = images[i].unsqueeze(0) 52 | image = image.to(device) 53 | image_features = model(image) 54 | image_features = F.normalize(image_features, dim=-1, p=2) 55 | if i == 0: 56 | 
first_image_features = image_features 57 | else: 58 | sim_pre = max(0.0, F.cosine_similarity(former_image_features, image_features).item()) 59 | sim_fir = max(0.0, F.cosine_similarity(first_image_features, image_features).item()) 60 | cur_sim = (sim_pre + sim_fir) / 2 61 | video_sim += cur_sim 62 | cnt += 1 63 | former_image_features = image_features 64 | sim_per_images = video_sim / (len(images) - 1) 65 | sim += video_sim 66 | video_results.append({'video_path': video_path, 'video_results': sim_per_images}) 67 | # sim_per_video = sim / (len(video_list) - 1) 68 | sim_per_frame = sim / cnt 69 | return sim_per_frame, video_results 70 | 71 | 72 | def compute_subject_consistency(json_dir, device, submodules_list, **kwargs): 73 | dino_model = torch.hub.load('facebookresearch/dino', 'dino_vitb16', pretrained=True) 74 | dino_model = dino_model.to(device) 75 | read_frame = submodules_list['read_frame'] 76 | logger.info("Initialize DINO success") 77 | video_list, _ = load_dimension_info(json_dir, dimension='subject_consistency', lang='en') 78 | video_list = distribute_list_to_rank(video_list) 79 | all_results, video_results = subject_consistency(dino_model, video_list, device, read_frame) 80 | if get_world_size() > 1: 81 | video_results = gather_list_of_dict(video_results) 82 | all_results = sum([d['video_results'] for d in video_results]) / len(video_results) 83 | return all_results, video_results 84 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/object_class.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import numpy as np 6 | from tqdm import tqdm 7 | from toolkit.utils import load_video, load_dimension_info 8 | from toolkit.third_party.grit_model import DenseCaptioning 9 | from torchvision import transforms 10 | import logging 11 | 12 | from .distributed import ( 13 | get_world_size, 14 | get_rank, 15 | all_gather, 16 | barrier, 17 | distribute_list_to_rank, 18 | gather_list_of_dict, 19 | ) 20 | 21 | 22 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 23 | logger = logging.getLogger(__name__) 24 | 25 | def get_dect_from_grit(model, image_arrays): 26 | pred = [] 27 | if type(image_arrays) is not list: 28 | image_arrays = image_arrays.numpy() 29 | with torch.no_grad(): 30 | for frame in image_arrays: 31 | try: 32 | pred.append(set(model.run_caption_tensor(frame)[0][0][2])) 33 | except: 34 | pred.append(set()) 35 | return pred 36 | 37 | def check_generate(key_info, predictions): 38 | cur_cnt = 0 39 | for pred in predictions: 40 | if key_info in pred: 41 | cur_cnt+=1 42 | return cur_cnt 43 | 44 | def object_class(model, video_dict, device): 45 | success_frame_count, frame_count = 0,0 46 | video_results = [] 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | if 'auxiliary_info' not in info: 49 | raise "Auxiliary info is not in json, please check your json." 
50 | object_info = info['auxiliary_info']['object'] 51 | for video_path in info['video_list']: 52 | video_tensor = load_video(video_path, num_frames=16) 53 | _, _, h, w = video_tensor.size() 54 | if min(h,w) > 768: 55 | scale = 720./min(h,w) 56 | output_tensor = transforms.Resize(size=( int(scale * h), int(scale * w) ),)(video_tensor) 57 | video_tensor=output_tensor 58 | cur_video_pred = get_dect_from_grit(model, video_tensor.permute(0,2,3,1)) 59 | cur_success_frame_count = check_generate(object_info, cur_video_pred) 60 | cur_success_frame_rate = cur_success_frame_count/len(cur_video_pred) 61 | success_frame_count += cur_success_frame_count 62 | frame_count += len(cur_video_pred) 63 | video_results.append({ 64 | 'video_path': video_path, 65 | 'video_results': cur_success_frame_rate, 66 | 'success_frame_count': cur_success_frame_count, 67 | 'frame_count': len(cur_video_pred)}) 68 | success_rate = success_frame_count / frame_count 69 | return success_rate, video_results 70 | 71 | 72 | def compute_object_class(json_dir, device, submodules_dict, **kwargs): 73 | dense_caption_model = DenseCaptioning(device) 74 | dense_caption_model.initialize_model_det(**submodules_dict) 75 | logger.info("Initialize detection model success") 76 | _, prompt_dict_ls = load_dimension_info(json_dir, dimension='object_class', lang='en') 77 | prompt_dict_ls = distribute_list_to_rank(prompt_dict_ls) 78 | all_results, video_results = object_class(dense_caption_model, prompt_dict_ls, device) 79 | if get_world_size() > 1: 80 | video_results = gather_list_of_dict(video_results) 81 | success_frame_count = sum([d['success_frame_count'] for d in video_results]) 82 | frame_count = sum([d['frame_count'] for d in video_results]) 83 | all_results = success_frame_count / frame_count 84 | return all_results, video_results 85 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/centernet2/centernet/config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | 3 | def add_centernet_config(cfg): 4 | _C = cfg 5 | 6 | _C.MODEL.CENTERNET = CN() 7 | _C.MODEL.CENTERNET.NUM_CLASSES = 80 8 | _C.MODEL.CENTERNET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"] 9 | _C.MODEL.CENTERNET.FPN_STRIDES = [8, 16, 32, 64, 128] 10 | _C.MODEL.CENTERNET.PRIOR_PROB = 0.01 11 | _C.MODEL.CENTERNET.INFERENCE_TH = 0.05 12 | _C.MODEL.CENTERNET.CENTER_NMS = False 13 | _C.MODEL.CENTERNET.NMS_TH_TRAIN = 0.6 14 | _C.MODEL.CENTERNET.NMS_TH_TEST = 0.6 15 | _C.MODEL.CENTERNET.PRE_NMS_TOPK_TRAIN = 1000 16 | _C.MODEL.CENTERNET.POST_NMS_TOPK_TRAIN = 100 17 | _C.MODEL.CENTERNET.PRE_NMS_TOPK_TEST = 1000 18 | _C.MODEL.CENTERNET.POST_NMS_TOPK_TEST = 100 19 | _C.MODEL.CENTERNET.NORM = "GN" 20 | _C.MODEL.CENTERNET.USE_DEFORMABLE = False 21 | _C.MODEL.CENTERNET.NUM_CLS_CONVS = 4 22 | _C.MODEL.CENTERNET.NUM_BOX_CONVS = 4 23 | _C.MODEL.CENTERNET.NUM_SHARE_CONVS = 0 24 | _C.MODEL.CENTERNET.LOC_LOSS_TYPE = 'giou' 25 | _C.MODEL.CENTERNET.SIGMOID_CLAMP = 1e-4 26 | _C.MODEL.CENTERNET.HM_MIN_OVERLAP = 0.8 27 | _C.MODEL.CENTERNET.MIN_RADIUS = 4 28 | _C.MODEL.CENTERNET.SOI = [[0, 80], [64, 160], [128, 320], [256, 640], [512, 10000000]] 29 | _C.MODEL.CENTERNET.POS_WEIGHT = 1. 30 | _C.MODEL.CENTERNET.NEG_WEIGHT = 1. 31 | _C.MODEL.CENTERNET.REG_WEIGHT = 2. 
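# Hyper-parameters of the heatmap focal loss (see modeling/layers/heatmap_focal_loss.py):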
32 | _C.MODEL.CENTERNET.HM_FOCAL_BETA = 4 33 | _C.MODEL.CENTERNET.HM_FOCAL_ALPHA = 0.25 34 | _C.MODEL.CENTERNET.LOSS_GAMMA = 2.0 35 | _C.MODEL.CENTERNET.WITH_AGN_HM = False 36 | _C.MODEL.CENTERNET.ONLY_PROPOSAL = False 37 | _C.MODEL.CENTERNET.AS_PROPOSAL = False 38 | _C.MODEL.CENTERNET.IGNORE_HIGH_FP = -1. 39 | _C.MODEL.CENTERNET.MORE_POS = False 40 | _C.MODEL.CENTERNET.MORE_POS_THRESH = 0.2 41 | _C.MODEL.CENTERNET.MORE_POS_TOPK = 9 42 | _C.MODEL.CENTERNET.NOT_NORM_REG = True 43 | _C.MODEL.CENTERNET.NOT_NMS = False 44 | _C.MODEL.CENTERNET.NO_REDUCE = False 45 | 46 | _C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False 47 | _C.MODEL.ROI_BOX_HEAD.PRIOR_PROB = 0.01 48 | _C.MODEL.ROI_BOX_HEAD.USE_EQL_LOSS = False 49 | _C.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = \ 50 | 'datasets/lvis/lvis_v1_train_cat_info.json' 51 | _C.MODEL.ROI_BOX_HEAD.EQL_FREQ_CAT = 200 52 | _C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False 53 | _C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CAT = 50 54 | _C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT = 0.5 55 | _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False 56 | 57 | _C.MODEL.BIFPN = CN() 58 | _C.MODEL.BIFPN.NUM_LEVELS = 5 59 | _C.MODEL.BIFPN.NUM_BIFPN = 6 60 | _C.MODEL.BIFPN.NORM = 'GN' 61 | _C.MODEL.BIFPN.OUT_CHANNELS = 160 62 | _C.MODEL.BIFPN.SEPARABLE_CONV = False 63 | 64 | _C.MODEL.DLA = CN() 65 | _C.MODEL.DLA.OUT_FEATURES = ['dla2'] 66 | _C.MODEL.DLA.USE_DLA_UP = True 67 | _C.MODEL.DLA.NUM_LAYERS = 34 68 | _C.MODEL.DLA.MS_OUTPUT = False 69 | _C.MODEL.DLA.NORM = 'BN' 70 | _C.MODEL.DLA.DLAUP_IN_FEATURES = ['dla3', 'dla4', 'dla5'] 71 | _C.MODEL.DLA.DLAUP_NODE = 'conv' 72 | 73 | _C.SOLVER.RESET_ITER = False 74 | _C.SOLVER.TRAIN_ITER = -1 75 | 76 | _C.INPUT.CUSTOM_AUG = '' 77 | _C.INPUT.TRAIN_SIZE = 640 78 | _C.INPUT.TEST_SIZE = 640 79 | _C.INPUT.SCALE_RANGE = (0.1, 2.) 
80 | # 'default' for fixed short/ long edge, 'square' for max size=INPUT.SIZE 81 | _C.INPUT.TEST_INPUT_TYPE = 'default' 82 | 83 | _C.DEBUG = False 84 | _C.SAVE_DEBUG = False 85 | _C.SAVE_PTH = False 86 | _C.VIS_THRESH = 0.3 87 | _C.DEBUG_SHOW_NAME = False 88 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/third_party/grit_src/grit/modeling/text/load_text_token.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LoadTextTokens(object): 5 | def __init__(self, tokenizer, max_text_len=40, padding='do_not_pad'): 6 | self.tokenizer = tokenizer 7 | self.max_text_len = max_text_len 8 | self.padding = padding 9 | 10 | def descriptions_to_text_tokens(self, target, begin_token): 11 | target_encoding = self.tokenizer( 12 | target, padding=self.padding, 13 | add_special_tokens=False, 14 | truncation=True, max_length=self.max_text_len) 15 | 16 | need_predict = [1] * len(target_encoding['input_ids']) 17 | payload = target_encoding['input_ids'] 18 | if len(payload) > self.max_text_len - 2: 19 | payload = payload[-(self.max_text_len - 2):] 20 | need_predict = payload[-(self.max_text_len - 2):] 21 | 22 | input_ids = [begin_token] + payload + [self.tokenizer.sep_token_id] 23 | 24 | need_predict = [0] + need_predict + [1] 25 | data = { 26 | 'text_tokens': torch.tensor(input_ids), 27 | 'text_lengths': len(input_ids), 28 | 'need_predict': torch.tensor(need_predict), 29 | } 30 | 31 | return data 32 | 33 | def __call__(self, object_descriptions, box_features, begin_token): 34 | text_tokens = [] 35 | text_lengths = [] 36 | need_predict = [] 37 | for description in object_descriptions: 38 | tokens = self.descriptions_to_text_tokens(description, begin_token) 39 | text_tokens.append(tokens['text_tokens']) 40 | text_lengths.append(tokens['text_lengths']) 41 | need_predict.append(tokens['need_predict']) 42 | 43 | text_tokens = torch.cat(self.collate(text_tokens), dim=0).to(box_features.device) 44 | text_lengths = torch.tensor(text_lengths).to(box_features.device) 45 | need_predict = torch.cat(self.collate(need_predict), dim=0).to(box_features.device) 46 | 47 | assert text_tokens.dim() == 2 and need_predict.dim() == 2 48 | data = {'text_tokens': text_tokens, 49 | 'text_lengths': text_lengths, 50 | 'need_predict': need_predict} 51 | 52 | return data 53 | 54 | def collate(self, batch): 55 | if all(isinstance(b, torch.Tensor) for b in batch) and len(batch) > 0: 56 | if not all(b.shape == batch[0].shape for b in batch[1:]): 57 | assert all(len(b.shape) == len(batch[0].shape) for b in batch[1:]) 58 | shape = torch.tensor([b.shape for b in batch]) 59 | max_shape = tuple(shape.max(dim=0)[0].tolist()) 60 | batch2 = [] 61 | for b in batch: 62 | if any(c < m for c, m in zip(b.shape, max_shape)): 63 | b2 = torch.zeros(max_shape, dtype=b.dtype, device=b.device) 64 | if b.dim() == 1: 65 | b2[:b.shape[0]] = b 66 | elif b.dim() == 2: 67 | b2[:b.shape[0], :b.shape[1]] = b 68 | elif b.dim() == 3: 69 | b2[:b.shape[0], :b.shape[1], :b.shape[2]] = b 70 | else: 71 | raise NotImplementedError 72 | b = b2 73 | batch2.append(b[None, ...]) 74 | else: 75 | batch2 = [] 76 | for b in batch: 77 | batch2.append(b[None, ...]) 78 | return batch2 79 | else: 80 | raise NotImplementedError 81 | -------------------------------------------------------------------------------- /video_generation_evaluation/toolkit/appearance_style.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | import torch 7 | import clip 8 | from PIL import Image 9 | from toolkit.utils import load_video, load_dimension_info, clip_transform, read_frames_decord_by_fps, clip_transform_Image 10 | 11 | from .distributed import ( 12 | get_world_size, 13 | get_rank, 14 | all_gather, 15 | barrier, 16 | distribute_list_to_rank, 17 | gather_list_of_dict, 18 | ) 19 | 20 | 21 | def get_text_features(model, input_text, tokenizer, text_feature_dict={}): 22 | if input_text in text_feature_dict: 23 | return text_feature_dict[input_text] 24 | text_template= f"{input_text}" 25 | with torch.no_grad(): 26 | text_features = model.encode_text(text_template).float() 27 | text_features /= text_features.norm(dim=-1, keepdim=True) 28 | text_feature_dict[input_text] = text_features 29 | return text_features 30 | 31 | def get_vid_features(model, input_frames): 32 | with torch.no_grad(): 33 | clip_feat = model.encode_vision(input_frames,test=True).float() 34 | clip_feat /= clip_feat.norm(dim=-1, keepdim=True) 35 | return clip_feat 36 | 37 | def get_predict_label(clip_feature, text_feats_tensor, top=5): 38 | label_probs = (100.0 * clip_feature @ text_feats_tensor.T).softmax(dim=-1) 39 | top_probs, top_labels = label_probs.cpu().topk(top, dim=-1) 40 | return top_probs, top_labels 41 | 42 | def appearance_style(clip_model, video_dict, device, sample="rand"): 43 | sim = 0.0 44 | cnt = 0 45 | video_results = [] 46 | image_transform = clip_transform_Image(224) 47 | for info in tqdm(video_dict, disable=get_rank() > 0): 48 | if 'auxiliary_info' not in info: 49 | raise "Auxiliary info is not in json, please check your json." 50 | query = info['auxiliary_info']['appearance_style'] 51 | text = clip.tokenize([query]).to(device) 52 | video_list = info['video_list'] 53 | for video_path in video_list: 54 | cur_video = [] 55 | with torch.no_grad(): 56 | video_arrays = load_video(video_path, return_tensor=False) 57 | images = [Image.fromarray(i) for i in video_arrays] 58 | for image in images: 59 | image = image_transform(image) 60 | image = image.to(device) 61 | logits_per_image, logits_per_text = clip_model(image.unsqueeze(0), text) 62 | cur_sim = float(logits_per_text[0][0].cpu()) 63 | cur_sim = cur_sim / 100 64 | cur_video.append(cur_sim) 65 | sim += cur_sim 66 | cnt +=1 67 | video_sim = np.mean(cur_video) 68 | video_results.append({ 69 | 'video_path': video_path, 70 | 'video_results': video_sim, 71 | 'frame_results': cur_video, 72 | 'cur_sim': cur_sim}) 73 | sim_per_frame = sim / cnt 74 | return sim_per_frame, video_results 75 | 76 | def compute_appearance_style(json_dir, device, submodules_list, **kwargs): 77 | clip_model, preprocess = clip.load(device=device, **submodules_list) 78 | _, video_dict = load_dimension_info(json_dir, dimension='appearance_style', lang='en') 79 | video_dict = distribute_list_to_rank(video_dict) 80 | all_results, video_results = appearance_style(clip_model, video_dict, device) 81 | if get_world_size() > 1: 82 | video_results = gather_list_of_dict(video_results) 83 | all_results = sum([d['cur_sim'] for d in video_results]) / len(video_results) 84 | return all_results, video_results 85 | --------------------------------------------------------------------------------
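The per-dimension scorers above share one calling convention: each `compute_<dimension>(json_dir, device, submodules, **kwargs)` entry point loads its video or prompt list with `load_dimension_info`, shards it across ranks when running distributed, and returns an overall score together with per-video results. The sketch below illustrates that convention for `compute_background_consistency` in a single process; the JSON path, the CLIP weight identifier, and the assumption that `video_generation_evaluation` is on `PYTHONPATH` (so the `toolkit` imports resolve) are illustrative placeholders, not values shipped with the toolkit.

```python
# Minimal single-process driver sketch (all paths/arguments below are assumptions).
import torch

from toolkit.background_consistency import compute_background_consistency

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# compute_background_consistency expects a positional list:
#   submodules[0] is handed to clip.load(), submodules[1] is the read_frame flag.
submodules = ["ViT-B/32", False]                      # assumed CLIP weights; decode frames from the video file
json_dir = "./background_consistency_full_info.json"  # assumed per-dimension info JSON

overall, per_video = compute_background_consistency(json_dir, device, submodules)
print(f"background_consistency: {overall:.4f} ({len(per_video)} videos scored)")
```

The other dimensions differ mainly in what they expect in `submodules`: a positional list here, keyword dictionaries for most of the other `compute_*` entry points shown above.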