├── IQA ├── fastvqa │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── head.cpython-38.pyc │ │ │ ├── resnet.cpython-38.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── swin_v1.cpython-38.pyc │ │ │ ├── swin_v2.cpython-38.pyc │ │ │ ├── evaluator.cpython-38.pyc │ │ │ ├── conv_backbone.cpython-38.pyc │ │ │ ├── swin_backbone.cpython-38.pyc │ │ │ └── xclip_backbone.cpython-38.pyc │ │ ├── evaluator.py │ │ ├── head.py │ │ ├── swin_v1.py │ │ └── swin_v2.py │ └── __pycache__ │ │ └── __init__.cpython-38.pyc ├── options │ └── fast-sama-iqa.yml └── demo_train_iqa_baseline.py ├── method.png ├── VQA ├── fastvqa │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ └── __init__.cpython-39.pyc │ ├── models │ │ ├── __pycache__ │ │ │ ├── head.cpython-37.pyc │ │ │ ├── head.cpython-38.pyc │ │ │ ├── head.cpython-39.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── backbone.cpython-38.pyc │ │ │ ├── swin_v1.cpython-38.pyc │ │ │ ├── evaluator.cpython-37.pyc │ │ │ ├── evaluator.cpython-38.pyc │ │ │ ├── evaluator.cpython-39.pyc │ │ │ ├── backbone_v0_1.cpython-38.pyc │ │ │ ├── conv_backbone.cpython-37.pyc │ │ │ ├── conv_backbone.cpython-38.pyc │ │ │ ├── conv_backbone.cpython-39.pyc │ │ │ ├── swin_backbone.cpython-37.pyc │ │ │ ├── swin_backbone.cpython-38.pyc │ │ │ ├── swin_backbone.cpython-39.pyc │ │ │ ├── xclip_backbone.cpython-37.pyc │ │ │ ├── xclip_backbone.cpython-38.pyc │ │ │ └── swin_backbone_scale.cpython-38.pyc │ │ ├── __init__.py │ │ ├── evaluator.py │ │ └── head.py │ ├── datasets │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── basic_datasets.cpython-37.pyc │ │ │ ├── basic_datasets.cpython-38.pyc │ │ │ ├── basic_datasets.cpython-39.pyc │ │ │ ├── fusion_datasets.cpython-37.pyc │ │ │ ├── fusion_datasets.cpython-38.pyc │ │ │ ├── fusion_datasets.cpython-39.pyc │ │ │ ├── inference_dataset.cpython-38.pyc │ │ │ └── fusion_datasets_TEST.cpython-38.pyc │ │ └── __init__.py │ └── version.py ├── pretrained_weights │ └── README.md ├── options │ ├── fast-SAMA-test.yml │ ├── fast-SAMA-finetune.yml │ └── fast-SAMA-train.yml ├── demo_test.py ├── demo_finetune.py ├── demo_train.py └── examplar_data_labels │ └── LIVE_VQC │ └── mylabels.txt └── README.md /IQA/fastvqa/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import * 2 | -------------------------------------------------------------------------------- /method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/method.png -------------------------------------------------------------------------------- /VQA/fastvqa/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * 2 | from .models import * 3 | -------------------------------------------------------------------------------- /IQA/fastvqa/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluator import IQAModel 2 | 3 | __all__ = [ 4 | "IQAModel", 5 | ] 6 | -------------------------------------------------------------------------------- /VQA/pretrained_weights/README.md: -------------------------------------------------------------------------------- 1 | put the pretrained weights in this folder, and set the configuration in 
the `.yml` file. 2 | -------------------------------------------------------------------------------- /IQA/fastvqa/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/head.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/head.cpython-38.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/resnet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/resnet.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/head.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/head.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/head.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/head.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/head.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/head.cpython-39.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/swin_v1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/swin_v1.cpython-38.pyc 
-------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/swin_v2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/swin_v2.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/backbone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/backbone.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/swin_v1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/swin_v1.cpython-38.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/evaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/evaluator.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/evaluator.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/evaluator.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/evaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/evaluator.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/evaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/evaluator.cpython-39.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/conv_backbone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/conv_backbone.cpython-38.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/swin_backbone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/swin_backbone.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/backbone_v0_1.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/backbone_v0_1.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/conv_backbone.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/conv_backbone.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/conv_backbone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/conv_backbone.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/conv_backbone.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/conv_backbone.cpython-39.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/swin_backbone.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/swin_backbone.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/swin_backbone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/swin_backbone.cpython-38.pyc -------------------------------------------------------------------------------- 
/VQA/fastvqa/models/__pycache__/swin_backbone.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/swin_backbone.cpython-39.pyc -------------------------------------------------------------------------------- /IQA/fastvqa/models/__pycache__/xclip_backbone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/IQA/fastvqa/models/__pycache__/xclip_backbone.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/basic_datasets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/basic_datasets.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/basic_datasets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/basic_datasets.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/basic_datasets.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/basic_datasets.cpython-39.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/xclip_backbone.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/xclip_backbone.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/xclip_backbone.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/xclip_backbone.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .fusion_datasets import FusionDataset, FineTuneDataset 3 | 4 | __all__ = [ 5 | "FusionDataset", 6 | "FineTuneDataset" 7 | ] -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/fusion_datasets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/fusion_datasets.cpython-37.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/fusion_datasets.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/fusion_datasets.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/fusion_datasets.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/fusion_datasets.cpython-39.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/inference_dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/inference_dataset.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__pycache__/swin_backbone_scale.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/models/__pycache__/swin_backbone_scale.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/datasets/__pycache__/fusion_datasets_TEST.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sissuire/SAMA/HEAD/VQA/fastvqa/datasets/__pycache__/fusion_datasets_TEST.cpython-38.pyc -------------------------------------------------------------------------------- /VQA/fastvqa/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .swin_backbone import SwinTransformer3D as VQABackbone 2 | from .swin_backbone import SwinTransformer2D as IQABackbone 3 | from .head import VQAHead, IQAHead, VARHead 4 | 5 | from .evaluator import DiViDeAddEvaluator 6 | 7 | __all__ = [ 8 | "VQABackbone", 9 | "IQABackbone", 10 | "VQAHead", 11 | "IQAHead", 12 | "VARHead", 13 | "DiViDeAddEvaluator" 14 | ] 15 | -------------------------------------------------------------------------------- /VQA/fastvqa/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.1.0" 2 | 3 | 4 | def parse_version_info(version_str): 5 | version_info = [] 6 | for x in version_str.split("."): 7 | if x.isdigit(): 8 | version_info.append(int(x)) 9 | elif x.find("rc") != -1: 10 | patch_version = x.split("rc") 11 | version_info.append(int(patch_version[0])) 12 | version_info.append(f"rc{patch_version[1]}") 13 | return tuple(version_info) 14 | 15 | 16 | version_info = parse_version_info(__version__) 17 | -------------------------------------------------------------------------------- /IQA/fastvqa/models/evaluator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .swin_v1 import SwinTransformer as ImageEncoder_v1 5 | from .swin_v2 import SwinTransformerV2 as ImageEncoder 6 | from .head import VQAHead, IQAHead, VARHead, VQAHeadMLP, HyperHead 7 | 8 | 9 | class IQAModel(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | # self.backbone = ImageEncoder_v1() 13 | self.backbone = ImageEncoder() 14 | self.vqa_head = VQAHeadMLP() 15 | 16 | def forward(self, x): 17 | f = self.backbone(x) 18 | scores = self.vqa_head(f) 19 | return scores.flatten(1).mean(1) 20 | 21 | -------------------------------------------------------------------------------- /IQA/options/fast-sama-iqa.yml: -------------------------------------------------------------------------------- 1 | # Swin-Image-Encoder for IQA 2 | 3 | name: SAMA-IQA-sama-koniq 4 | 5 | stype: sama # [fragment, sama, sama-spm] 6 | 7 | num_epochs: 50 8 | l_num_epochs: 0 9 | warmup_epochs: 5 10 | constant_epochs: 150 11 | ema: true 
12 | save_model: true 13 | batch_size: 64 # 64 14 | num_workers: 8 15 | test_batch_size: 64 #64 16 | test_num_workers: 8 17 | num_splits: 10 18 | 19 | data: 20 | database: koniq 21 | data_info: PATH_TO_DATA/koniq/koniq10k_scores.csv 22 | data_prefix: PATH_TO_DATA/data/koniq/1024x768 23 | 24 | fwin_h: 8 25 | fwin_w: 8 26 | fsize_h: 32 27 | fsize_w: 32 28 | 29 | 30 | # data: 31 | # database: spaq 32 | # data_info: PATH_TO_DATA/spaq/spaq_info.txt 33 | # data_prefix: PATH_TO_DATA/spaq/TestImage 34 | 35 | # fwin_h: 8 36 | # fwin_w: 8 37 | # fsize_h: 32 38 | # fsize_w: 32 39 | 40 | 41 | # model: # discard 42 | # backbone_type: swin_image_v2 43 | # head_in_channels: 768 44 | # head_hidden_channels: 128 45 | 46 | optimizer: 47 | lr: !!float 1e-3 48 | backbone_lr_mult: !!float 1e-1 49 | wd: 0.05 50 | 51 | load_path: PATH_TO_MODEL/swinv2_tiny_patch4_window8_256.pth 52 | -------------------------------------------------------------------------------- /VQA/options/fast-SAMA-test.yml: -------------------------------------------------------------------------------- 1 | 2 | name: SAMA-VQA-sama 3 | test_batch_size: 1 4 | test_num_workers: 2 5 | 6 | stype: sama # [sama, sama-c, sama-mix, sama-swm, sama-spm, fragments==others] 7 | 8 | data: 9 | val-ltest: 10 | type: FusionDataset 11 | args: 12 | phase: test 13 | anno_file: ./examplar_data_labels/LSVQ/labels_mytest.txt 14 | data_prefix: PATH_TO_DATA/LSVQ 15 | sample_types: 16 | fragments: 17 | fragments_h: 7 18 | fragments_w: 7 19 | fsize_h: 32 20 | fsize_w: 32 21 | aligned: 32 22 | clip_len: 32 23 | frame_interval: 2 24 | num_clips: 4 25 | model: 26 | type: DiViDeAddEvaluator 27 | args: 28 | backbone: 29 | fragments: 30 | checkpoint: false 31 | pretrained: 32 | backbone_size: swin_tiny_grpb 33 | backbone_preserve_keys: fragments 34 | divide_head: false 35 | vqa_head: 36 | in_channels: 768 37 | hidden_channels: 64 38 | 39 | optimizer: 40 | lr: !!float 1e-3 41 | backbone_lr_mult: !!float 1e-1 42 | wd: 0.05 43 | 44 | load_path: ./pretrained_weights/SAMA-baseline_val-ltest_s_dev_v0.0.pth 45 | -------------------------------------------------------------------------------- /VQA/options/fast-SAMA-finetune.yml: -------------------------------------------------------------------------------- 1 | 2 | name: SAMA-baseline-finetune-youtube 3 | split_seed: 10 4 | 5 | num_epochs: 30 6 | l_num_epochs: 0 7 | warmup_epochs: 2.5 8 | ema: true 9 | save_model: true 10 | train_batch_size: 12 11 | train_num_workers: 6 12 | test_batch_size: 1 13 | test_num_workers: 6 14 | 15 | stype: sama # [sama, sama-c, sama-mix, sama-swm, sama-spm, fragments==others] 16 | 17 | data: 18 | # database: livevqc 19 | # type: FineTuneDataset 20 | # anno_file: ./examplar_data_labels/LIVE_VQC/mylabels.txt 21 | # data_prefix: PATH_TO_DATA/LIVE-VQC 22 | # train: 23 | # sample_types: 24 | # fragments: 25 | # fragments_h: 7 26 | # fragments_w: 7 27 | # fsize_h: 32 28 | # fsize_w: 32 29 | # aligned: 32 30 | # clip_len: 32 31 | # frame_interval: 2 32 | # num_clips: 1 33 | # test: 34 | # sample_types: 35 | # fragments: 36 | # fragments_h: 7 37 | # fragments_w: 7 38 | # fsize_h: 32 39 | # fsize_w: 32 40 | # aligned: 32 41 | # clip_len: 32 42 | # frame_interval: 2 43 | # num_clips: 4 44 | 45 | 46 | # database: kv1k 47 | # type: FineTuneDataset 48 | # anno_file: ./examplar_data_labels/KoNViD/mylabels.txt 49 | # data_prefix: PATH_TO_DATA/KoNViD 50 | # train: 51 | # sample_types: 52 | # fragments: 53 | # fragments_h: 7 54 | # fragments_w: 7 55 | # fsize_h: 32 56 | # fsize_w: 32 57 | # aligned: 32 58 | # clip_len: 
32 59 | # frame_interval: 2 60 | # num_clips: 1 61 | # test: 62 | # sample_types: 63 | # fragments: 64 | # fragments_h: 7 65 | # fragments_w: 7 66 | # fsize_h: 32 67 | # fsize_w: 32 68 | # aligned: 32 69 | # clip_len: 32 70 | # frame_interval: 2 71 | # num_clips: 4 72 | 73 | database: youtube 74 | type: FineTuneDataset 75 | anno_file: ./examplar_data_labels/YouTubeUGC/mylabels.txt 76 | data_prefix: PATH_TO_DATA/YouTube 77 | train: 78 | sample_types: 79 | fragments: 80 | fragments_h: 7 81 | fragments_w: 7 82 | fsize_h: 32 83 | fsize_w: 32 84 | aligned: 32 85 | clip_len: 32 86 | frame_interval: 2 87 | num_clips: 1 88 | test: 89 | sample_types: 90 | fragments: 91 | fragments_h: 7 92 | fragments_w: 7 93 | fsize_h: 32 94 | fsize_w: 32 95 | aligned: 32 96 | clip_len: 32 97 | frame_interval: 2 98 | num_clips: 4 99 | 100 | model: 101 | type: DiViDeAddEvaluator 102 | args: 103 | backbone: 104 | fragments: 105 | checkpoint: false 106 | pretrained: 107 | backbone_size: swin_tiny_grpb 108 | backbone_preserve_keys: fragments 109 | divide_head: false 110 | vqa_head: 111 | in_channels: 768 112 | hidden_channels: 64 113 | 114 | optimizer: 115 | lr: !!float 1e-3 116 | backbone_lr_mult: !!float 1e-1 117 | wd: 0.05 118 | 119 | load_path: PATH_TO_MODEL/pretrained_weights/SAMA-baseline_val-ltest_s_dev_v0.0.pth 120 | 121 | -------------------------------------------------------------------------------- /VQA/options/fast-SAMA-train.yml: -------------------------------------------------------------------------------- 1 | 2 | name: SAMA-VQA-sama 3 | num_epochs: 30 4 | l_num_epochs: 0 5 | warmup_epochs: 2.5 6 | ema: true 7 | save_model: true 8 | train_batch_size: 12 9 | train_num_workers: 6 10 | test_batch_size: 1 11 | test_num_workers: 6 12 | 13 | stype: sama # [sama, sama-c, sama-mix, sama-swm, sama-spm, fragments==others] 14 | 15 | data: 16 | train: 17 | type: FusionDataset 18 | args: 19 | phase: train 20 | anno_file: ./examplar_data_labels/LSVQ/labels_mytrain.txt 21 | data_prefix: PATH_TO_DATA/LSVQ 22 | sample_types: 23 | fragments: 24 | fragments_h: 7 25 | fragments_w: 7 26 | fsize_h: 32 27 | fsize_w: 32 28 | aligned: 32 29 | clip_len: 32 30 | frame_interval: 2 31 | num_clips: 1 32 | val-livevqc: 33 | type: FusionDataset 34 | args: 35 | phase: test 36 | anno_file: ./examplar_data_labels/LIVE_VQC/mylabels.txt 37 | data_prefix: PATH_TO_DATA/LIVE-VQC 38 | sample_types: 39 | fragments: 40 | fragments_h: 7 41 | fragments_w: 7 42 | fsize_h: 32 43 | fsize_w: 32 44 | aligned: 32 45 | clip_len: 32 46 | frame_interval: 2 47 | num_clips: 4 48 | val-kv1k: 49 | type: FusionDataset 50 | args: 51 | phase: test 52 | anno_file: ./examplar_data_labels/KoNViD/mylabels.txt 53 | data_prefix: PATH_TO_DATA/KoNViD 54 | sample_types: 55 | fragments: 56 | fragments_h: 7 57 | fragments_w: 7 58 | fsize_h: 32 59 | fsize_w: 32 60 | aligned: 32 61 | clip_len: 32 62 | frame_interval: 2 63 | num_clips: 4 64 | val-ltest: 65 | type: FusionDataset 66 | args: 67 | phase: test 68 | anno_file: ./examplar_data_labels/LSVQ/labels_mytest.txt 69 | data_prefix: PATH_TO_DATA/LSVQ 70 | sample_types: 71 | fragments: 72 | fragments_h: 7 73 | fragments_w: 7 74 | fsize_h: 32 75 | fsize_w: 32 76 | aligned: 32 77 | clip_len: 32 78 | frame_interval: 2 79 | num_clips: 4 80 | val-l1080p: 81 | type: FusionDataset 82 | args: 83 | phase: test 84 | anno_file: ./examplar_data_labels/LSVQ/labels_mytest_1080p.txt 85 | data_prefix: PATH_TO_DATA/LSVQ 86 | sample_types: 87 | fragments: 88 | fragments_h: 7 89 | fragments_w: 7 90 | fsize_h: 32 91 | fsize_w: 32 92 | aligned: 
32 93 | clip_len: 32 94 | frame_interval: 2 95 | num_clips: 4 96 | model: 97 | type: DiViDeAddEvaluator 98 | args: 99 | backbone: 100 | fragments: 101 | checkpoint: false 102 | pretrained: 103 | backbone_size: swin_tiny_grpb 104 | backbone_preserve_keys: fragments 105 | divide_head: false 106 | vqa_head: 107 | in_channels: 768 108 | hidden_channels: 64 109 | 110 | optimizer: 111 | lr: !!float 1e-3 112 | backbone_lr_mult: !!float 1e-1 113 | wd: 0.05 114 | 115 | load_path: PATH_TO_MODEL/swin_tiny_patch244_window877_kinetics400_1k.pth 116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SAMA Overview 2 | 3 | PyTorch implementation of "**Scaling and Masking: A New Paradigm of Data Sampling for Image and Video Quality Assessment**" ([arXiv](https://arxiv.org/abs/2401.02614)/[AAAI](https://ojs.aaai.org/index.php/AAAI/article/view/28170)), which has been accepted by **AAAI-2024**. 4 | 5 | This code is modified from [FAST-VQA](https://github.com/VQAssessment/FAST-VQA-and-FasterVQA). 6 | 7 | ![](method.png) 8 | 9 | ## Usage 10 | 11 | ### IQA 12 | For image quality assessment (IQA), please refer to [IQA/demo_train_iqa_baseline.py](https://github.com/Sissuire/SAMA/blob/main/IQA/demo_train_iqa_baseline.py). 13 | 14 | ### VQA 15 | For video quality assessment (VQA), please refer to [VQA/demo_train.py](https://github.com/Sissuire/SAMA/blob/main/VQA/demo_train.py) for training, and to [VQA/demo_finetune.py](https://github.com/Sissuire/SAMA/blob/main/VQA/demo_finetune.py) for finetuning. We also provide the [training log](https://github.com/Sissuire/SAMA/blob/main/VQA/log.FAST.SAMA.out) for VQA. 16 | 17 | The main idea/contribution lies in the data sampling, which can be found in [IQA](https://github.com/Sissuire/SAMA/blob/b8fdfa390999908bf6c0da284973bb1f2eb646d8/IQA/demo_train_iqa_baseline.py#L166C13-L166C13) and [VQA](https://github.com/Sissuire/SAMA/blob/b8fdfa390999908bf6c0da284973bb1f2eb646d8/VQA/fastvqa/datasets/fusion_datasets.py#L211). 18 | 19 | Make sure the configuration has been properly set in 20 | - [fast-sama-iqa.yml](https://github.com/Sissuire/SAMA/blob/main/IQA/options/fast-sama-iqa.yml) for IQA training; 21 | - [fast-SAMA-train.yml](https://github.com/Sissuire/SAMA/blob/main/VQA/options/fast-SAMA-train.yml) for VQA training on LSVQ; 22 | - and [fast-SAMA-finetune.yml](https://github.com/Sissuire/SAMA/blob/main/VQA/options/fast-SAMA-finetune.yml) for VQA finetuning. 23 | 24 | Please also prepare the pretrained models: [video-swin](https://github.com/SwinTransformer/storage/releases/download/v1.0.4/swin_tiny_patch244_window877_kinetics400_1k.pth) for VQA and [swin-v2](https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pth) for IQA. 25 | 26 | #### Testing with pretrained model on videos 27 | 28 | We have provided the pretrained weights (trained on the LSVQ training set): [GoogleDrive](https://drive.google.com/drive/folders/1adB3aB8gBMx7c38tEfgls-i6QNZJI8nF?usp=sharing) / [BaiDu](https://pan.baidu.com/s/1KTicZ2WX8BN7GTgr9PX6ZQ?pwd=xyns) (Code:xyns). Please download the weights and put them in the `./VQA/pretrained_weights` folder.
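With the weights in place, the test script can be launched from the `VQA` folder; it reads its settings from the option file passed via `-o`/`--opt` (default: `./options/fast-SAMA-test.yml`). A minimal invocation, assuming the `data_prefix` entries in the option file have been pointed to your local data, would look like:

```
python demo_test.py -o ./options/fast-SAMA-test.yml
```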
29 | 30 | To test on your own dataset or video files, please construct the dataset annotation following the examples in `./VQA/examplar_data_labels`, and set the configuration in [fast-SAMA-test.yml](https://github.com/Sissuire/SAMA/blob/main/VQA/options/fast-SAMA-test.yml). Run [demo_test.py](https://github.com/Sissuire/SAMA/blob/main/VQA/demo_test.py) to check the details. 31 | 32 | ### Environment 33 | Different environments may induce slight fluctuations in performance. 34 | 35 | ``` 36 | Python 3.8.10 37 | PyTorch 1.7.0 38 | ``` 39 | 40 | For installation, please refer to [FAST-VQA](https://github.com/VQAssessment/FAST-VQA-and-FasterVQA). 41 | 42 | ### Citation 43 | If you are interested in the work or find the code helpful, please cite our work: 44 | ``` 45 | @article{sama2024, 46 | title={Scaling and Masking: A New Paradigm of Data Sampling for Image and Video Quality Assessment}, 47 | volume={38}, 48 | number={4}, 49 | journal={Proceedings of the AAAI Conference on Artificial Intelligence}, 50 | author={Liu, Yongxu and Quan, Yinghui and Xiao, Guoyao and Li, Aobo and Wu, Jinjian}, 51 | year={2024}, 52 | month={Mar.}, 53 | pages={3792-3801}, 54 | url={https://ojs.aaai.org/index.php/AAAI/article/view/28170}, 55 | DOI={10.1609/aaai.v38i4.28170} 56 | } 57 | ``` 58 | 59 | ### Contact 60 | 61 | Feel free to contact me via `yongxu.liu@xidian.edu.cn` if you have any questions or find any bugs. 62 | 63 | ### License 64 | 65 | Copyright (c) [2024] [Yongxu Liu] 66 | 67 | Permission to use, copy, or modify this software and its documentation for educational and research purposes only and without fee is here granted. This program shall not be used, rewritten, or adapted as the basis of a commercial software or hardware product without first obtaining permission of the authors. The authors make no representations about the suitability of this software for any purpose. It is provided "as is" without express or implied warranty.
68 | -------------------------------------------------------------------------------- /VQA/fastvqa/models/evaluator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial, reduce 4 | from .swin_backbone import SwinTransformer3D as VideoBackbone 5 | from .head import VQAHead, IQAHead, VARHead 6 | 7 | 8 | class DiViDeAddEvaluator(nn.Module): 9 | def __init__( 10 | self, 11 | backbone_size="divided", 12 | backbone_preserve_keys = 'fragments,resize', 13 | multi=False, 14 | layer=-1, 15 | backbone=dict(resize={"window_size": (4,4,4)}, fragments={"window_size": (4,4,4)}), 16 | divide_head=False, 17 | vqa_head=dict(in_channels=768), 18 | var=False, 19 | ): 20 | self.backbone_preserve_keys = backbone_preserve_keys.split(",") 21 | self.multi = multi 22 | self.layer = layer 23 | super().__init__() 24 | for key, hypers in backbone.items(): 25 | print(backbone_size) 26 | if key not in self.backbone_preserve_keys: 27 | continue 28 | if backbone_size=="divided": 29 | t_backbone_size = hypers["type"] 30 | else: 31 | t_backbone_size = backbone_size 32 | if t_backbone_size == 'swin_tiny_grpb': 33 | # to reproduce fast-vqa 34 | b = VideoBackbone() 35 | elif t_backbone_size == 'swin_tiny_grpb_m': 36 | # to reproduce fast-vqa-m 37 | b = VideoBackbone(window_size=(4,4,4), frag_biases=[0,0,0,0]) 38 | else: 39 | raise NotImplementedError 40 | print("Setting backbone:", key+"_backbone") 41 | setattr(self, key+"_backbone", b) 42 | if divide_head: 43 | print(divide_head) 44 | for key in backbone: 45 | if key not in self.backbone_preserve_keys: 46 | continue 47 | if var: 48 | b = VARHead(**vqa_head) 49 | print(b) 50 | else: 51 | b = VQAHead(**vqa_head) 52 | print("Setting head:", key+"_head") 53 | setattr(self, key+"_head", b) 54 | else: 55 | if var: 56 | self.vqa_head = VARHead(**vqa_head) 57 | print(b) 58 | else: 59 | self.vqa_head = VQAHead(**vqa_head) 60 | 61 | def forward(self, vclips, inference=True, return_pooled_feats=False, reduce_scores=True, pooled=False, **kwargs): 62 | if inference: 63 | self.eval() 64 | with torch.no_grad(): 65 | 66 | scores = [] 67 | feats = {} 68 | for key in vclips: 69 | feat = getattr(self, key.split("_")[0]+"_backbone")(vclips[key], multi=self.multi, layer=self.layer, **kwargs) 70 | if hasattr(self, key.split("_")[0]+"_head"): 71 | scores += [getattr(self, key.split("_")[0]+"_head")(feat)] 72 | else: 73 | scores += [getattr(self, "vqa_head")(feat)] 74 | if return_pooled_feats: 75 | feats[key] = feat.mean((-3,-2,-1)) 76 | if reduce_scores: 77 | if len(scores) > 1: 78 | scores = reduce(lambda x,y:x+y, scores) 79 | else: 80 | scores = scores[0] 81 | if pooled: 82 | scores = torch.mean(scores, (1,2,3,4)) 83 | self.train() 84 | if return_pooled_feats: 85 | return scores, feats 86 | return scores 87 | else: 88 | self.train() 89 | scores = [] 90 | feats = {} 91 | for key in vclips: 92 | feat = getattr(self, key.split("_")[0]+"_backbone")(vclips[key], multi=self.multi, layer=self.layer, **kwargs) 93 | if hasattr(self, key.split("_")[0]+"_head"): 94 | scores += [getattr(self, key.split("_")[0]+"_head")(feat)] 95 | else: 96 | scores += [getattr(self, "vqa_head")(feat)] 97 | if return_pooled_feats: 98 | feats[key] = feat.mean((-3,-2,-1)) 99 | if reduce_scores: 100 | if len(scores) > 1: 101 | scores = reduce(lambda x,y:x+y, scores) 102 | else: 103 | scores = scores[0] 104 | if pooled: 105 | print(scores.shape) 106 | scores = torch.mean(scores, (1,2,3,4)) 107 | print(scores.shape) 108 | 
109 | if return_pooled_feats: 110 | return scores, feats 111 | return scores 112 | -------------------------------------------------------------------------------- /VQA/fastvqa/models/head.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torchvision.ops import roi_pool, roi_align 4 | from torch.nn import functional as F 5 | import numpy as np 6 | import math 7 | 8 | class VQAHead(nn.Module): 9 | """MLP Regression Head for VQA. 10 | Args: 11 | in_channels: input channels for MLP 12 | hidden_channels: hidden channels for MLP 13 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 14 | """ 15 | 16 | def __init__( 17 | self, in_channels=768, hidden_channels=64, dropout_ratio=0.5, **kwargs 18 | ): 19 | super().__init__() 20 | self.dropout_ratio = dropout_ratio 21 | self.in_channels = in_channels 22 | self.hidden_channels = hidden_channels 23 | if self.dropout_ratio != 0: 24 | self.dropout = nn.Dropout(p=self.dropout_ratio) 25 | else: 26 | self.dropout = None 27 | self.fc_hid = nn.Conv3d(self.in_channels, self.hidden_channels, (1, 1, 1)) 28 | self.fc_last = nn.Conv3d(self.hidden_channels, 1, (1, 1, 1)) 29 | self.gelu = nn.GELU() 30 | 31 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 32 | 33 | def forward(self, x, rois=None): 34 | x = self.dropout(x) 35 | qlt_score = self.fc_last(self.dropout(self.gelu(self.fc_hid(x)))) 36 | return qlt_score 37 | 38 | 39 | class VQAHead_samaw(nn.Module): 40 | """MLP Regression Head for VQA. 41 | Args: 42 | in_channels: input channels for MLP 43 | hidden_channels: hidden channels for MLP 44 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 45 | """ 46 | 47 | def __init__( 48 | self, in_channels=768, hidden_channels=64, dropout_ratio=0.5, **kwargs 49 | ): 50 | super().__init__() 51 | self.dropout_ratio = dropout_ratio 52 | self.in_channels = in_channels 53 | self.hidden_channels = hidden_channels 54 | if self.dropout_ratio != 0: 55 | self.dropout = nn.Dropout(p=self.dropout_ratio) 56 | else: 57 | self.dropout = None 58 | self.fc_hid = nn.Conv3d(self.in_channels, self.hidden_channels, (1, 1, 1)) 59 | self.fc_last = nn.Conv3d(self.hidden_channels, 1, (1, 1, 1)) 60 | self.gelu = nn.GELU() 61 | self.fc_scale_hid = nn.Conv3d(self.in_channels, self.hidden_channels, (1, 1, 1)) 62 | self.fc_scale_last = nn.Conv3d(self.hidden_channels, 1, (1, 1, 1)) 63 | self.softmax = nn.Softmax(-3) 64 | 65 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 66 | 67 | def forward(self, x, rois=None): 68 | x = self.dropout(x) 69 | qlt_score = self.fc_last(self.dropout(self.gelu(self.fc_hid(x)))) 70 | 71 | x_mean = torch.mean(x, dim=(3, 4), keepdim=True) 72 | qlt_weight = self.fc_scale_last(self.dropout(self.gelu(self.fc_scale_hid(self.dropout(x_mean))))) 73 | qlt_weight = self.softmax(qlt_weight) 74 | return qlt_score * qlt_weight 75 | 76 | 77 | class VARHead(nn.Module): 78 | """MLP Regression Head for Video Action Recognition. 
79 | Args: 80 | in_channels: input channels for MLP 81 | hidden_channels: hidden channels for MLP 82 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 83 | """ 84 | 85 | def __init__( 86 | self, in_channels=768, out_channels=400, dropout_ratio=0.5, **kwargs 87 | ): 88 | super().__init__() 89 | self.dropout_ratio = dropout_ratio 90 | self.in_channels = in_channels 91 | self.out_channels = out_channels 92 | if self.dropout_ratio != 0: 93 | self.dropout = nn.Dropout(p=self.dropout_ratio) 94 | else: 95 | self.dropout = None 96 | self.fc = nn.Conv3d(self.in_channels, self.out_channels, (1, 1, 1)) 97 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 98 | 99 | def forward(self, x, rois=None): 100 | x = self.dropout(x) 101 | x = self.avg_pool(x) 102 | out = self.fc(x) 103 | return out 104 | 105 | 106 | class IQAHead(nn.Module): 107 | """MLP Regression Head for IQA. 108 | Args: 109 | in_channels: input channels for MLP 110 | hidden_channels: hidden channels for MLP 111 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 112 | """ 113 | 114 | def __init__( 115 | self, in_channels=768, hidden_channels=64, dropout_ratio=0.5, **kwargs 116 | ): 117 | super().__init__() 118 | self.dropout_ratio = dropout_ratio 119 | self.in_channels = in_channels 120 | self.hidden_channels = hidden_channels 121 | if self.dropout_ratio != 0: 122 | self.dropout = nn.Dropout(p=self.dropout_ratio) 123 | else: 124 | self.dropout = None 125 | self.fc_hid = nn.Linear(self.in_channels, self.hidden_channels) 126 | self.fc_last = nn.Linear(self.hidden_channels, 1) 127 | self.gelu = nn.GELU() 128 | 129 | def forward(self, x): 130 | x = self.dropout(x) 131 | qlt_score = self.fc_last(self.dropout(self.gelu(self.fc_hid(x)))) 132 | return qlt_score 133 | -------------------------------------------------------------------------------- /IQA/fastvqa/models/head.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torchvision.ops import roi_pool, roi_align 4 | from torch.nn import functional as F 5 | import numpy as np 6 | import math 7 | 8 | 9 | class VQAHeadMLP(nn.Module): 10 | """MLP Regression Head for VQA. 
11 | Args: 12 | in_channels: input channels for MLP 13 | hidden_channels: hidden channels for MLP 14 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 15 | """ 16 | 17 | def __init__( 18 | self, in_channels=768, hidden_channels=64, target=1, dropout_ratio=0.5): 19 | super().__init__() 20 | 21 | self.dropout_ratio = dropout_ratio 22 | self.in_channels = in_channels 23 | self.hidden_channels = hidden_channels 24 | self.dropout = nn.Dropout(p=self.dropout_ratio) if dropout_ratio > 0 else nn.Identity() 25 | self.fc1 = nn.Linear(self.in_channels, self.hidden_channels) 26 | self.fc2 = nn.Linear(self.hidden_channels, target) 27 | self.gelu = nn.GELU() 28 | 29 | 30 | def forward(self, x, rois=None): 31 | x = self.dropout(x) 32 | qlt_score = self.fc2(self.dropout(self.gelu(self.fc1(x)))) 33 | return qlt_score 34 | 35 | 36 | class HyperHead(nn.Module): 37 | def __init__( 38 | self, in_channels=768, hidden_channels=64, dropout_ratio=0.5): 39 | super().__init__() 40 | 41 | self.dropout_ratio = dropout_ratio 42 | self.in_channels = in_channels 43 | self.hidden_channels = hidden_channels 44 | 45 | self.dropout = nn.Dropout(dropout_ratio) if dropout_ratio > 0 else nn.Identity() 46 | self.fc11 = nn.Linear(self.in_channels, self.hidden_channels) 47 | self.fc12 = nn.Linear(self.hidden_channels, 1) 48 | 49 | self.fc21 = nn.Linear(self.in_channels, hidden_channels) 50 | self.fc22 = nn.Linear(hidden_channels, 50) 51 | self.fc32 = nn.Linear(hidden_channels, 2) 52 | self.gelu = nn.GELU() 53 | 54 | 55 | def forward(self, x): 56 | x = self.dropout(x) 57 | relative_score = self.fc12(self.dropout(self.gelu(self.fc11(x)))) # [b, 1] 58 | 59 | f = self.dropout(self.gelu(self.fc21(x))) 60 | cls = self.fc22(f) # [b, N] 61 | 62 | wb = self.fc32(f) # [b, 2] 63 | scores = relative_score * wb[:, :1] + wb[:, 1:] 64 | return scores, cls 65 | 66 | 67 | 68 | class VQAHead(nn.Module): 69 | """MLP Regression Head for VQA. 70 | Args: 71 | in_channels: input channels for MLP 72 | hidden_channels: hidden channels for MLP 73 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 74 | """ 75 | 76 | def __init__( 77 | self, in_channels=768, hidden_channels=64, dropout_ratio=0.5, **kwargs 78 | ): 79 | super().__init__() 80 | self.dropout_ratio = dropout_ratio 81 | self.in_channels = in_channels 82 | self.hidden_channels = hidden_channels 83 | if self.dropout_ratio != 0: 84 | self.dropout = nn.Dropout(p=self.dropout_ratio) 85 | else: 86 | self.dropout = None 87 | self.fc_hid = nn.Conv3d(self.in_channels, self.hidden_channels, (1, 1, 1)) 88 | self.fc_last = nn.Conv3d(self.hidden_channels, 1, (1, 1, 1)) 89 | self.gelu = nn.GELU() 90 | 91 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 92 | 93 | def forward(self, x, rois=None): 94 | x = self.dropout(x) 95 | qlt_score = self.fc_last(self.dropout(self.gelu(self.fc_hid(x)))) 96 | return qlt_score 97 | 98 | class VARHead(nn.Module): 99 | """MLP Regression Head for Video Action Recognition. 
100 | Args: 101 | in_channels: input channels for MLP 102 | hidden_channels: hidden channels for MLP 103 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 104 | """ 105 | 106 | def __init__( 107 | self, in_channels=768, out_channels=400, dropout_ratio=0.5, **kwargs 108 | ): 109 | super().__init__() 110 | self.dropout_ratio = dropout_ratio 111 | self.in_channels = in_channels 112 | self.out_channels = out_channels 113 | if self.dropout_ratio != 0: 114 | self.dropout = nn.Dropout(p=self.dropout_ratio) 115 | else: 116 | self.dropout = None 117 | self.fc = nn.Conv3d(self.in_channels, self.out_channels, (1, 1, 1)) 118 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 119 | 120 | def forward(self, x, rois=None): 121 | x = self.dropout(x) 122 | x = self.avg_pool(x) 123 | out = self.fc(x) 124 | return out 125 | 126 | 127 | class IQAHead(nn.Module): 128 | """MLP Regression Head for IQA. 129 | Args: 130 | in_channels: input channels for MLP 131 | hidden_channels: hidden channels for MLP 132 | dropout_ratio: the dropout ratio for features before the MLP (default 0.5) 133 | """ 134 | 135 | def __init__( 136 | self, in_channels=768, hidden_channels=64, dropout_ratio=0.5, **kwargs 137 | ): 138 | super().__init__() 139 | self.dropout_ratio = dropout_ratio 140 | self.in_channels = in_channels 141 | self.hidden_channels = hidden_channels 142 | if self.dropout_ratio != 0: 143 | self.dropout = nn.Dropout(p=self.dropout_ratio) 144 | else: 145 | self.dropout = None 146 | self.fc_hid = nn.Linear(self.in_channels, self.hidden_channels) 147 | self.fc_last = nn.Linear(self.hidden_channels, 1) 148 | self.gelu = nn.GELU() 149 | 150 | def forward(self, x): 151 | x = self.dropout(x) 152 | qlt_score = self.fc_last(self.dropout(self.gelu(self.fc_hid(x)))) 153 | return qlt_score 154 | -------------------------------------------------------------------------------- /VQA/demo_test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------- 2 | # SAMA, AAAI 2024 3 | # Testing code for VQA. 
4 | # This code is modified from FAST-VQA [ECCV, 2022] 5 | # ------------------------------------------------- 6 | import torch 7 | import random 8 | import os.path as osp 9 | import fastvqa.models as models 10 | import fastvqa.datasets as datasets 11 | import os 12 | import argparse 13 | import sys 14 | 15 | from scipy.stats import spearmanr, pearsonr 16 | from scipy.stats.stats import kendalltau as kendallr 17 | import numpy as np 18 | 19 | import timeit 20 | import math 21 | 22 | import yaml 23 | 24 | from functools import reduce 25 | from thop import profile 26 | import warnings 27 | 28 | warnings.filterwarnings("ignore") 29 | 30 | from torch.utils.tensorboard import SummaryWriter 31 | 32 | 33 | def rescale(pr, gt=None): 34 | if gt is None: 35 | pr = (pr - np.mean(pr)) / np.std(pr) 36 | else: 37 | pr = ((pr - np.mean(pr)) / np.std(pr)) * np.std(gt) + np.mean(gt) 38 | return pr 39 | 40 | sample_types=["resize", "fragments", "crop", "arp_resize", "arp_fragments"] 41 | 42 | 43 | 44 | def inference_set(inf_loader, model, device): 45 | 46 | results = [] 47 | 48 | tic = timeit.default_timer() 49 | gt_labels, pr_labels = [], [] 50 | 51 | for i, data in enumerate(inf_loader): 52 | result = dict() 53 | video, video_up = {}, {} 54 | for key in sample_types: 55 | if key in data: 56 | video[key] = data[key].to(device) 57 | ## Reshape into clips 58 | b, c, t, h, w = video[key].shape 59 | video[key] = video[key].reshape(b, c, data["num_clips"][key], t // data["num_clips"][key], h, w).permute(0,2,1,3,4,5).reshape(b * data["num_clips"][key], c, t // data["num_clips"][key], h, w) 60 | 61 | with torch.no_grad(): 62 | result["pr_labels"] = model(video).cpu().numpy() 63 | 64 | result["gt_label"] = data["gt_label"].item() 65 | 66 | results.append(result) 67 | 68 | ## generate the demo video for video quality localization 69 | gt_labels = [r["gt_label"] for r in results] 70 | pr_labels = [np.mean(r["pr_labels"][:]) for r in results] 71 | pr_labels = rescale(pr_labels, gt_labels) 72 | 73 | s = spearmanr(gt_labels, pr_labels)[0] 74 | p = pearsonr(gt_labels, pr_labels)[0] 75 | k = kendallr(gt_labels, pr_labels)[0] 76 | r = np.sqrt(((gt_labels - pr_labels) ** 2).mean()) 77 | 78 | torch.cuda.empty_cache() 79 | 80 | toc = timeit.default_timer() 81 | minutes = int((toc - tic) / 60) 82 | seconds = int((toc - tic) % 60) 83 | 84 | print( 85 | f"For {len(gt_labels)} videos, \nthe accuracy of the model is as follows:\n SROCC: {s:.4f} best: {best_s:.4f} \n PLCC: {p:.4f} best: {best_p:.4f} \n KROCC: {k:.4f} best: {best_k:.4f} \n RMSE: {r:.4f} best: {best_r:.4f}." 
86 | ) 87 | print('time elapsed {:02d}m {:02d}s.'.format(minutes, seconds)) 88 | 89 | return s, p, k, r 90 | 91 | 92 | 93 | def main(): 94 | 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument("-o", "--opt", type=str, 97 | default="./options/fast-SAMA-test.yml", help="the option file") 98 | 99 | args = parser.parse_args() 100 | with open(args.opt, "r") as f: 101 | opt = yaml.safe_load(f) 102 | print(opt) 103 | 104 | ## adaptively choose the device 105 | device = "cuda" if torch.cuda.is_available() else "cpu" 106 | 107 | if sys.gettrace(): 108 | print('in DEBUGE mode.') 109 | opt["name"] = "DEBUG" 110 | opt['test_num_workers']=0 111 | 112 | ## defining model and loading checkpoint 113 | 114 | print('using device: {}'.format(device)) 115 | model = getattr(models, opt["model"]["type"])(**opt["model"]["args"]).to(device) 116 | 117 | 118 | stype = opt['stype'] if opt['stype'] in ['sama', 'sama-c', 'sama-mix', 'sama+spm', 'sama+swm'] else 'fragments' 119 | 120 | val_datasets = {} 121 | for key in opt["data"]: 122 | if key.startswith("val"): 123 | val_datasets[key] = getattr(datasets, opt["data"][key]["type"])(opt["data"][key]["args"], stype=stype) 124 | print('dataset=[{}], with {} samples.'.format(key, len(val_datasets[key]))) 125 | 126 | val_loaders = {} 127 | for key, val_dataset in val_datasets.items(): 128 | val_loaders[key] = torch.utils.data.DataLoader(val_dataset, 129 | batch_size=opt["test_batch_size"], 130 | num_workers=opt["test_num_workers"], 131 | pin_memory=False, 132 | shuffle=False, 133 | drop_last=False) 134 | 135 | if "load_path" in opt: 136 | state_dict = torch.load(opt["load_path"], map_location=device) 137 | print(model.load_state_dict(state_dict['state_dict'] , strict=False)) 138 | 139 | 140 | print(f"evaluation ..") 141 | 142 | bests = {} 143 | for key in val_loaders: 144 | bests[key] = inference_set( 145 | val_loaders[key], 146 | model, 147 | device 148 | ) 149 | 150 | for key in val_loaders: 151 | print( 152 | f"""For the finetuning process on {key} with {len(val_datasets[key])} videos, 153 | the best validation accuracy of the model-s is as follows: 154 | SROCC: {bests[key][0]:.4f} 155 | PLCC: {bests[key][1]:.4f} 156 | KROCC: {bests[key][2]:.4f} 157 | RMSE: {bests[key][3]:.4f}.""" 158 | ) 159 | 160 | 161 | 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /VQA/demo_finetune.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------- 2 | # SAMA, AAAI 2024 3 | # Finetuning code for VQA. 
4 | # This code is modified from FAST-VQA [ECCV, 2022] 5 | # ------------------------------------------------- 6 | import torch 7 | import random 8 | import os.path as osp 9 | import fastvqa.models as models 10 | import fastvqa.datasets as datasets 11 | import os 12 | import argparse 13 | import sys 14 | 15 | from scipy.stats import spearmanr, pearsonr 16 | from scipy.stats.stats import kendalltau as kendallr 17 | import numpy as np 18 | 19 | import timeit 20 | import math 21 | 22 | import yaml 23 | 24 | from functools import reduce 25 | from thop import profile 26 | import warnings 27 | 28 | warnings.filterwarnings("ignore") 29 | 30 | from torch.utils.tensorboard import SummaryWriter 31 | 32 | 33 | def train_test_split(dataset_path, ann_file, ratio=0.8, seed=42): 34 | random.seed(seed) 35 | video_infos = [] 36 | with open(ann_file, "r") as fin: 37 | for line in fin.readlines(): 38 | line_split = line.strip().split(",") 39 | filename, _, _, label = line_split 40 | label = float(label) 41 | filename = osp.join(dataset_path, filename) 42 | video_infos.append(dict(filename=filename, label=label)) 43 | random.shuffle(video_infos) 44 | return ( 45 | video_infos[: int(ratio * len(video_infos))], 46 | video_infos[int(ratio * len(video_infos)) :], 47 | ) 48 | 49 | 50 | def rank_loss(y_pred, y): 51 | ranking_loss = torch.nn.functional.relu( 52 | (y_pred - y_pred.t()) * torch.sign((y.t() - y)) 53 | ) 54 | scale = 1 + torch.max(ranking_loss) 55 | return ( 56 | torch.sum(ranking_loss) / y_pred.shape[0] / (y_pred.shape[0] - 1) / scale 57 | ).float() 58 | 59 | def plcc_loss(y_pred, y): 60 | sigma_hat, m_hat = torch.std_mean(y_pred, unbiased=False) 61 | y_pred = (y_pred - m_hat) / (sigma_hat + 1e-8) 62 | sigma, m = torch.std_mean(y, unbiased=False) 63 | y = (y - m) / (sigma + 1e-8) 64 | loss0 = torch.nn.functional.mse_loss(y_pred, y) / 4 65 | rho = torch.mean(y_pred * y) 66 | loss1 = torch.nn.functional.mse_loss(rho * y_pred, y) / 4 67 | return ((loss0 + loss1) / 2).float() 68 | 69 | def rescaled_l2_loss(y_pred, y): 70 | y_pred_rs = (y_pred - y_pred.mean()) / y_pred.std() 71 | y_rs = (y - y.mean()) / (y.std() + eps) 72 | return torch.nn.functional.mse_loss(y_pred_rs, y_rs) 73 | 74 | def rplcc_loss(y_pred, y, eps=1e-8): 75 | ## Literally (1 - PLCC) / 2 76 | cov = torch.cov(y_pred, y) 77 | std = (torch.std(y_pred) + eps) * (torch.std(y) + eps) 78 | return (1 - cov / std) / 2 79 | 80 | def self_similarity_loss(f, f_hat, f_hat_detach=False): 81 | if f_hat_detach: 82 | f_hat = f_hat.detach() 83 | return 1 - torch.nn.functional.cosine_similarity(f, f_hat, dim=1).mean() 84 | 85 | def contrastive_similarity_loss(f, f_hat, f_hat_detach=False, eps=1e-8): 86 | if f_hat_detach: 87 | f_hat = f_hat.detach() 88 | intra_similarity = torch.nn.functional.cosine_similarity(f, f_hat, dim=1).mean() 89 | cross_similarity = torch.nn.functional.cosine_similarity(f, f_hat, dim=0).mean() 90 | return (1 - intra_similarity) / (1 - cross_similarity + eps) 91 | 92 | def rescale(pr, gt=None): 93 | if gt is None: 94 | pr = (pr - np.mean(pr)) / np.std(pr) 95 | else: 96 | pr = ((pr - np.mean(pr)) / np.std(pr)) * np.std(gt) + np.mean(gt) 97 | return pr 98 | 99 | sample_types=["resize", "fragments", "crop", "arp_resize", "arp_fragments"] 100 | 101 | 102 | 103 | 104 | def finetune_epoch(ft_loader, model, model_ema, optimizer, scheduler, device, epoch=-1, writer=None, 105 | need_upsampled=True, need_feat=True, need_fused=False, need_separate_sup=False): 106 | model.train() 107 | tic = timeit.default_timer() 108 | train_labels, pred_labels = 
[], [] 109 | epoch_loss = 0 110 | 111 | for i, data in enumerate(ft_loader): 112 | optimizer.zero_grad() 113 | video = {} 114 | for key in sample_types: 115 | if key in data: 116 | video[key] = data[key].to(device) 117 | 118 | y = data["gt_label"].float().detach().to(device).unsqueeze(-1) 119 | scores = model(video, inference=False, reduce_scores=False) 120 | if len(scores) > 1: 121 | y_pred = reduce(lambda x,y:x+y, scores) 122 | else: 123 | y_pred = scores[0] 124 | y_pred = y_pred.mean((-2, -1)).sum(-1) 125 | 126 | frame_inds = data["frame_inds"] 127 | 128 | # Plain Supervised Loss 129 | p_loss, r_loss = plcc_loss(y_pred, y), rank_loss(y_pred, y) 130 | 131 | loss = p_loss + 0.3 * r_loss 132 | epoch_loss += loss.item() 133 | 134 | loss.backward() 135 | optimizer.step() 136 | scheduler.step() 137 | 138 | pred_labels.extend(list(y_pred.view(-1).detach().cpu().numpy())) 139 | train_labels.extend(list(y.view(-1).detach().cpu().numpy())) 140 | 141 | #ft_loader.dataset.refresh_hypers() 142 | 143 | 144 | if model_ema is not None: 145 | model_params = dict(model.named_parameters()) 146 | model_ema_params = dict(model_ema.named_parameters()) 147 | for k in model_params.keys(): 148 | model_ema_params[k].data.mul_(0.999).add_( 149 | model_params[k].data, alpha=1 - 0.999) 150 | 151 | 152 | train_srcc = spearmanr(train_labels, pred_labels)[0] 153 | 154 | writer.add_scalar('train_srcc', train_srcc, epoch) 155 | writer.add_scalar('train_total_loss', epoch_loss, epoch) 156 | 157 | toc = timeit.default_timer() 158 | 159 | minutes = int((toc - tic) / 60) 160 | seconds = int((toc - tic) % 60) 161 | print('Epoch-{:02d}, training SRCC={:.4f}, time elapsed {:02d}m {:02d}s.'.format(epoch, train_srcc, minutes, seconds)) 162 | print('backbone_lr = {:.2e}, head_lr = {:.2e}'.format(optimizer.state_dict()['param_groups'][0]['lr'], 163 | optimizer.state_dict()['param_groups'][-1]['lr'])) 164 | 165 | model.eval() 166 | 167 | 168 | def profile_inference(inf_set, model, device): 169 | video = {} 170 | data = inf_set[0] 171 | for key in sample_types: 172 | if key in data: 173 | video[key] = data[key].to(device).unsqueeze(0) 174 | with torch.no_grad(): 175 | flops, params = profile(model, (video, )) 176 | print(f"The FLOps of the Variant is {flops/1e9:.1f}G, with Params {params/1e6:.2f}M.") 177 | 178 | def inference_set(inf_loader, model, device, best_, epoch, writer=None, save_model=False, suffix='s', save_name="divide"): 179 | 180 | results = [] 181 | 182 | tic = timeit.default_timer() 183 | gt_labels, pr_labels = [], [] 184 | 185 | best_s, best_p, best_k, best_r = best_ 186 | 187 | for i, data in enumerate(inf_loader): 188 | result = dict() 189 | video, video_up = {}, {} 190 | for key in sample_types: 191 | if key in data: 192 | video[key] = data[key].to(device) 193 | ## Reshape into clips 194 | b, c, t, h, w = video[key].shape 195 | video[key] = video[key].reshape(b, c, data["num_clips"][key], t // data["num_clips"][key], h, w).permute(0,2,1,3,4,5).reshape(b * data["num_clips"][key], c, t // data["num_clips"][key], h, w) 196 | 197 | with torch.no_grad(): 198 | result["pr_labels"] = model(video).cpu().numpy() 199 | 200 | result["gt_label"] = data["gt_label"].item() 201 | 202 | results.append(result) 203 | 204 | ## generate the demo video for video quality localization 205 | gt_labels = [r["gt_label"] for r in results] 206 | pr_labels = [np.mean(r["pr_labels"][:]) for r in results] 207 | pr_labels = rescale(pr_labels, gt_labels) 208 | 209 | s = spearmanr(gt_labels, pr_labels)[0] 210 | p = pearsonr(gt_labels, 
pr_labels)[0] 211 | k = kendallr(gt_labels, pr_labels)[0] 212 | r = np.sqrt(((gt_labels - pr_labels) ** 2).mean()) 213 | 214 | writer.add_scalar('val_{}_srcc'.format(suffix), s, epoch) 215 | writer.add_scalar('val_{}_plcc'.format(suffix), p, epoch) 216 | writer.add_scalar('val_{}_krcc'.format(suffix), k, epoch) 217 | writer.add_scalar('val_{}_rmse'.format(suffix), r, epoch) 218 | 219 | torch.cuda.empty_cache() 220 | 221 | if s + p > best_s + best_p and save_model: 222 | state_dict = model.state_dict() 223 | torch.save( 224 | {"state_dict": state_dict, 225 | "validation_results": best_}, 226 | f"pretrained_weights/{save_name}_{suffix}_dev_v0.0.pth") 227 | 228 | best_s, best_p, best_k, best_r = ( 229 | max(best_s, s), 230 | max(best_p, p), 231 | max(best_k, k), 232 | min(best_r, r), 233 | ) 234 | 235 | 236 | writer.add_scalar('val_{}_best_srcc'.format(suffix), best_s, epoch) 237 | writer.add_scalar('val_{}_best_plcc'.format(suffix), best_p, epoch) 238 | writer.add_scalar('val_{}_best_krcc'.format(suffix), best_k, epoch) 239 | writer.add_scalar('val_{}_best_rmse'.format(suffix), best_r, epoch) 240 | 241 | toc = timeit.default_timer() 242 | minutes = int((toc - tic) / 60) 243 | seconds = int((toc - tic) % 60) 244 | 245 | print( 246 | f"For {len(gt_labels)} videos, \nthe accuracy of the model: [{suffix}] is as follows:\n SROCC: {s:.4f} best: {best_s:.4f} \n PLCC: {p:.4f} best: {best_p:.4f} \n KROCC: {k:.4f} best: {best_k:.4f} \n RMSE: {r:.4f} best: {best_r:.4f}." 247 | ) 248 | print('time elapsed {:02d}m {:02d}s.'.format(minutes, seconds)) 249 | 250 | return best_s, best_p, best_k, best_r 251 | 252 | # torch.save(results, f'{args.save_dir}/results_{dataset.lower()}_s{32}*{32}_ens{args.famount}.pkl') 253 | 254 | 255 | def main(): 256 | 257 | parser = argparse.ArgumentParser() 258 | parser.add_argument( 259 | "-o", "--opt", type=str, default="./options/fast-SAMA-finetune.yml", help="the option file" 260 | ) 261 | 262 | args = parser.parse_args() 263 | with open(args.opt, "r") as f: 264 | opt = yaml.safe_load(f) 265 | print(opt) 266 | 267 | 268 | ## adaptively choose the device 269 | 270 | # os.environ['CUDA_VISIBLE_DEVICES']='6' 271 | device = "cuda" if torch.cuda.is_available() else "cpu" 272 | 273 | if sys.gettrace(): 274 | print('in DEBUGE mode.') 275 | opt["name"] = "DEBUG" 276 | opt['train_num_workers']=0 277 | opt['test_num_workers']=0 278 | 279 | 280 | if opt.get("split_seed", -1) > 0: 281 | num_splits = 10 282 | else: 283 | num_splits = 1 284 | 285 | stype = opt['stype'] if opt['stype'] in ['sama', 'sama-c', 'sama-mix', 'sama+spm', 'sama+swm'] else 'fragments' 286 | 287 | for split in range(num_splits): 288 | print(f"""\n==================== SPLIT-{split:02d} ====================""") 289 | 290 | key = opt["data"]["database"] 291 | ann_file = opt["data"]["anno_file"] 292 | data_prefix = opt["data"]["data_prefix"] 293 | video_infos = [] 294 | with open(ann_file, "r") as fin: 295 | for line in fin: 296 | line_split = line.strip().split(",") 297 | fileid, _, _, label = line_split 298 | label = float(label) 299 | filename = osp.join(data_prefix, fileid) 300 | video_infos.append(dict(filename=filename, label=label, fileid=fileid)) 301 | video_infos = np.asarray(video_infos) 302 | 303 | index_current = np.arange(len(video_infos)) 304 | random.Random(split * 123).shuffle(index_current) # shuffle with certain seed 305 | pos_train_end = int(0.8 * len(video_infos)) 306 | trainindex = index_current[:pos_train_end] 307 | evalindex = index_current[pos_train_end:] 308 | 309 | train_datasets, 
train_loaders, val_datasets, val_loaders = {}, {}, {}, {} 310 | 311 | val_datasets[key] = getattr(datasets, opt["data"]["type"])(video_infos[evalindex], 312 | opt["data"]["test"], 313 | stype=stype, 314 | is_train=False) 315 | val_loaders[key] = torch.utils.data.DataLoader(val_datasets[key], 316 | batch_size=opt["test_batch_size"], 317 | num_workers=opt["test_num_workers"], 318 | pin_memory=False, 319 | shuffle=False, 320 | drop_last=False) 321 | 322 | train_datasets[key] = getattr(datasets, opt["data"]["type"])(video_infos[trainindex], 323 | opt["data"]["train"], 324 | stype=stype, 325 | is_train=True) 326 | train_loaders[key] = torch.utils.data.DataLoader(train_datasets[key], 327 | batch_size=opt["train_batch_size"], 328 | num_workers=opt["train_num_workers"], 329 | shuffle=True) 330 | print('dataset=[{}], with {} samples.'.format(key, len(train_datasets[key]))) 331 | print('dataset=[{}], with {} samples.'.format(key, len(val_datasets[key]))) 332 | 333 | ## defining model and loading checkpoint 334 | print('using device: {}'.format(device)) 335 | model = getattr(models, opt["model"]["type"])(**opt["model"]["args"]).to(device) 336 | if "load_path" in opt: 337 | state_dict = torch.load(opt["load_path"], map_location=device)["state_dict"] 338 | print(model.load_state_dict(state_dict, strict=True)) 339 | 340 | if opt["ema"]: 341 | from copy import deepcopy 342 | model_ema = deepcopy(model) 343 | else: 344 | model_ema = None 345 | 346 | #profile_inference(val_dataset, model, device) 347 | 348 | # finetune the model 349 | param_groups=[] 350 | 351 | for key, value in dict(model.named_children()).items(): 352 | if "backbone" in key: 353 | param_groups += [{"params": value.parameters(), "lr": opt["optimizer"]["lr"] * opt["optimizer"]["backbone_lr_mult"]}] 354 | else: 355 | param_groups += [{"params": value.parameters(), "lr": opt["optimizer"]["lr"]}] 356 | 357 | optimizer = torch.optim.AdamW(lr=opt["optimizer"]["lr"], 358 | params=param_groups, 359 | weight_decay=opt["optimizer"]["wd"]) 360 | warmup_iter = 0 361 | for train_loader in train_loaders.values(): 362 | warmup_iter += int(opt["warmup_epochs"] * len(train_loader)) 363 | max_iter = int((opt["num_epochs"] + opt["l_num_epochs"]) * len(train_loader)) 364 | lr_lambda = ( 365 | lambda cur_iter: cur_iter / warmup_iter 366 | if cur_iter <= warmup_iter 367 | else 0.5 * (1 + math.cos(math.pi * (cur_iter - warmup_iter) / max_iter)) 368 | ) 369 | 370 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) 371 | 372 | bests = {} 373 | # bests_n = {} 374 | for key in val_loaders: 375 | bests[key] = -1,-1,-1,1000 376 | # bests_n[key] = -1,-1,-1,1000 377 | 378 | os.makedirs('./tensorboard/', exist_ok=True) 379 | os.makedirs('./pretrained_weights/', exist_ok=True) 380 | writer = SummaryWriter('./tensorboard/{}'.format(opt['name'])) 381 | 382 | for epoch in range(opt["num_epochs"]): 383 | print(f"Finetune Epoch {epoch}:") 384 | 385 | for key, train_loader in train_loaders.items(): 386 | finetune_epoch( 387 | train_loader, model, model_ema, optimizer, scheduler, device, epoch, writer, 388 | opt.get("need_upsampled", False), opt.get("need_feat", False), opt.get("need_fused", False), 389 | ) 390 | 391 | 392 | print(f"evaluation ..") 393 | 394 | for key in val_loaders: 395 | bests[key] = inference_set( 396 | val_loaders[key], 397 | model_ema if model_ema is not None else model, 398 | device, bests[key], epoch, writer, 399 | save_model=opt["save_model"], save_name=opt["name"], 400 | suffix=key+"_s", 401 | ) 402 | if opt["num_epochs"] > 
0: 403 | for key in val_loaders: 404 | print( 405 | f"""For the finetuning process on {key} with {len(val_datasets[key])} videos, 406 | the best validation accuracy of the model-s is as follows: 407 | SROCC: {bests[key][0]:.4f} 408 | PLCC: {bests[key][1]:.4f} 409 | KROCC: {bests[key][2]:.4f} 410 | RMSE: {bests[key][3]:.4f}.""" 411 | ) 412 | 413 | 414 | if __name__ == "__main__": 415 | main() 416 | -------------------------------------------------------------------------------- /VQA/demo_train.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------- 2 | # SAMA, AAAI 2024 3 | # Training code for VQA. 4 | # This code is modified from FAST-VQA [ECCV, 2022] 5 | # ------------------------------------------------- 6 | import torch 7 | import random 8 | import os.path as osp 9 | import fastvqa.models as models 10 | import fastvqa.datasets as datasets 11 | import os 12 | import argparse 13 | import sys 14 | 15 | from scipy.stats import spearmanr, pearsonr 16 | from scipy.stats.stats import kendalltau as kendallr 17 | import numpy as np 18 | 19 | import timeit 20 | import math 21 | 22 | import yaml 23 | 24 | from functools import reduce 25 | from thop import profile 26 | import warnings 27 | 28 | warnings.filterwarnings("ignore") 29 | 30 | from torch.utils.tensorboard import SummaryWriter 31 | 32 | 33 | def train_test_split(dataset_path, ann_file, ratio=0.8, seed=42): 34 | random.seed(seed) 35 | video_infos = [] 36 | with open(ann_file, "r") as fin: 37 | for line in fin.readlines(): 38 | line_split = line.strip().split(",") 39 | filename, _, _, label = line_split 40 | label = float(label) 41 | filename = osp.join(dataset_path, filename) 42 | video_infos.append(dict(filename=filename, label=label)) 43 | random.shuffle(video_infos) 44 | return ( 45 | video_infos[: int(ratio * len(video_infos))], 46 | video_infos[int(ratio * len(video_infos)) :], 47 | ) 48 | 49 | 50 | def rank_loss(y_pred, y): 51 | ranking_loss = torch.nn.functional.relu( 52 | (y_pred - y_pred.t()) * torch.sign((y.t() - y)) 53 | ) 54 | scale = 1 + torch.max(ranking_loss) 55 | return ( 56 | torch.sum(ranking_loss) / y_pred.shape[0] / (y_pred.shape[0] - 1) / scale 57 | ).float() 58 | 59 | def plcc_loss(y_pred, y): 60 | sigma_hat, m_hat = torch.std_mean(y_pred, unbiased=False) 61 | y_pred = (y_pred - m_hat) / (sigma_hat + 1e-8) 62 | sigma, m = torch.std_mean(y, unbiased=False) 63 | y = (y - m) / (sigma + 1e-8) 64 | loss0 = torch.nn.functional.mse_loss(y_pred, y) / 4 65 | rho = torch.mean(y_pred * y) 66 | loss1 = torch.nn.functional.mse_loss(rho * y_pred, y) / 4 67 | return ((loss0 + loss1) / 2).float() 68 | 69 | def rescaled_l2_loss(y_pred, y): 70 | y_pred_rs = (y_pred - y_pred.mean()) / y_pred.std() 71 | y_rs = (y - y.mean()) / (y.std() + eps) 72 | return torch.nn.functional.mse_loss(y_pred_rs, y_rs) 73 | 74 | def rplcc_loss(y_pred, y, eps=1e-8): 75 | ## Literally (1 - PLCC) / 2 76 | cov = torch.cov(y_pred, y) 77 | std = (torch.std(y_pred) + eps) * (torch.std(y) + eps) 78 | return (1 - cov / std) / 2 79 | 80 | def self_similarity_loss(f, f_hat, f_hat_detach=False): 81 | if f_hat_detach: 82 | f_hat = f_hat.detach() 83 | return 1 - torch.nn.functional.cosine_similarity(f, f_hat, dim=1).mean() 84 | 85 | def contrastive_similarity_loss(f, f_hat, f_hat_detach=False, eps=1e-8): 86 | if f_hat_detach: 87 | f_hat = f_hat.detach() 88 | intra_similarity = torch.nn.functional.cosine_similarity(f, f_hat, dim=1).mean() 89 | cross_similarity = 
torch.nn.functional.cosine_similarity(f, f_hat, dim=0).mean() 90 | return (1 - intra_similarity) / (1 - cross_similarity + eps) 91 | 92 | def rescale(pr, gt=None): 93 | if gt is None: 94 | pr = (pr - np.mean(pr)) / np.std(pr) 95 | else: 96 | pr = ((pr - np.mean(pr)) / np.std(pr)) * np.std(gt) + np.mean(gt) 97 | return pr 98 | 99 | sample_types=["resize", "fragments", "crop", "arp_resize", "arp_fragments"] 100 | 101 | 102 | 103 | 104 | def finetune_epoch(ft_loader, model, model_ema, optimizer, scheduler, device, epoch=-1, writer=None, 105 | need_upsampled=True, need_feat=True, need_fused=False, need_separate_sup=False): 106 | model.train() 107 | tic = timeit.default_timer() 108 | train_labels, pred_labels = [], [] 109 | epoch_loss = 0 110 | 111 | for i, data in enumerate(ft_loader): 112 | optimizer.zero_grad() 113 | video = {} 114 | for key in sample_types: 115 | if key in data: 116 | video[key] = data[key].to(device) 117 | 118 | y = data["gt_label"].float().detach().to(device).unsqueeze(-1) 119 | scores = model(video, inference=False, reduce_scores=False) 120 | if len(scores) > 1: 121 | y_pred = reduce(lambda x,y:x+y, scores) 122 | else: 123 | y_pred = scores[0] 124 | y_pred = y_pred.mean((-2, -1)).sum(-1) 125 | 126 | frame_inds = data["frame_inds"] 127 | 128 | # Plain Supervised Loss 129 | p_loss, r_loss = plcc_loss(y_pred, y), rank_loss(y_pred, y) 130 | 131 | loss = p_loss + 0.3 * r_loss 132 | epoch_loss += loss.item() 133 | 134 | loss.backward() 135 | optimizer.step() 136 | scheduler.step() 137 | 138 | pred_labels.extend(list(y_pred.view(-1).detach().cpu().numpy())) 139 | train_labels.extend(list(y.view(-1).detach().cpu().numpy())) 140 | 141 | #ft_loader.dataset.refresh_hypers() 142 | 143 | 144 | if model_ema is not None: 145 | model_params = dict(model.named_parameters()) 146 | model_ema_params = dict(model_ema.named_parameters()) 147 | for k in model_params.keys(): 148 | model_ema_params[k].data.mul_(0.999).add_( 149 | model_params[k].data, alpha=1 - 0.999) 150 | 151 | 152 | train_srcc = spearmanr(train_labels, pred_labels)[0] 153 | 154 | writer.add_scalar('train_srcc', train_srcc, epoch) 155 | writer.add_scalar('train_total_loss', epoch_loss, epoch) 156 | 157 | toc = timeit.default_timer() 158 | 159 | minutes = int((toc - tic) / 60) 160 | seconds = int((toc - tic) % 60) 161 | print('Epoch-{:02d}, training SRCC={:.4f}, time elapsed {:02d}m {:02d}s.'.format(epoch, train_srcc, minutes, seconds)) 162 | print('backbone_lr = {:.2e}, head_lr = {:.2e}'.format(optimizer.state_dict()['param_groups'][0]['lr'], 163 | optimizer.state_dict()['param_groups'][-1]['lr'])) 164 | 165 | model.eval() 166 | 167 | 168 | def profile_inference(inf_set, model, device): 169 | video = {} 170 | data = inf_set[0] 171 | for key in sample_types: 172 | if key in data: 173 | video[key] = data[key].to(device).unsqueeze(0) 174 | with torch.no_grad(): 175 | flops, params = profile(model, (video, )) 176 | print(f"The FLOps of the Variant is {flops/1e9:.1f}G, with Params {params/1e6:.2f}M.") 177 | 178 | 179 | def inference_set(inf_loader, model, device, best_, epoch, writer=None, save_model=False, suffix='s', save_name="divide"): 180 | 181 | results = [] 182 | 183 | tic = timeit.default_timer() 184 | gt_labels, pr_labels = [], [] 185 | 186 | best_s, best_p, best_k, best_r = best_ 187 | 188 | for i, data in enumerate(inf_loader): 189 | result = dict() 190 | video, video_up = {}, {} 191 | for key in sample_types: 192 | if key in data: 193 | video[key] = data[key].to(device) 194 | ## Reshape into clips 195 | b, c, t, h, w = 
video[key].shape 196 | video[key] = video[key].reshape(b, c, data["num_clips"][key], t // data["num_clips"][key], h, w).permute(0,2,1,3,4,5).reshape(b * data["num_clips"][key], c, t // data["num_clips"][key], h, w) 197 | 198 | with torch.no_grad(): 199 | result["pr_labels"] = model(video).cpu().numpy() 200 | 201 | result["gt_label"] = data["gt_label"].item() 202 | 203 | results.append(result) 204 | 205 | ## generate the demo video for video quality localization 206 | gt_labels = [r["gt_label"] for r in results] 207 | pr_labels = [np.mean(r["pr_labels"][:]) for r in results] 208 | pr_labels = rescale(pr_labels, gt_labels) 209 | 210 | s = spearmanr(gt_labels, pr_labels)[0] 211 | p = pearsonr(gt_labels, pr_labels)[0] 212 | k = kendallr(gt_labels, pr_labels)[0] 213 | r = np.sqrt(((gt_labels - pr_labels) ** 2).mean()) 214 | 215 | writer.add_scalar('val_{}_srcc'.format(suffix), s, epoch) 216 | writer.add_scalar('val_{}_plcc'.format(suffix), p, epoch) 217 | writer.add_scalar('val_{}_krcc'.format(suffix), k, epoch) 218 | writer.add_scalar('val_{}_rmse'.format(suffix), r, epoch) 219 | 220 | torch.cuda.empty_cache() 221 | 222 | if s + p > best_s + best_p and save_model: 223 | state_dict = model.state_dict() 224 | torch.save( 225 | {"state_dict": state_dict, 226 | "validation_results": best_}, 227 | f"pretrained_weights/{save_name}_{suffix}_dev_v0.0.pth") 228 | 229 | best_s, best_p, best_k, best_r = ( 230 | max(best_s, s), 231 | max(best_p, p), 232 | max(best_k, k), 233 | min(best_r, r), 234 | ) 235 | 236 | 237 | writer.add_scalar('val_{}_best_srcc'.format(suffix), best_s, epoch) 238 | writer.add_scalar('val_{}_best_plcc'.format(suffix), best_p, epoch) 239 | writer.add_scalar('val_{}_best_krcc'.format(suffix), best_k, epoch) 240 | writer.add_scalar('val_{}_best_rmse'.format(suffix), best_r, epoch) 241 | 242 | toc = timeit.default_timer() 243 | minutes = int((toc - tic) / 60) 244 | seconds = int((toc - tic) % 60) 245 | 246 | print( 247 | f"For {len(gt_labels)} videos, \nthe accuracy of the model: [{suffix}] is as follows:\n SROCC: {s:.4f} best: {best_s:.4f} \n PLCC: {p:.4f} best: {best_p:.4f} \n KROCC: {k:.4f} best: {best_k:.4f} \n RMSE: {r:.4f} best: {best_r:.4f}." 
248 | ) 249 | print('time elapsed {:02d}m {:02d}s.'.format(minutes, seconds)) 250 | 251 | return best_s, best_p, best_k, best_r 252 | 253 | # torch.save(results, f'{args.save_dir}/results_{dataset.lower()}_s{32}*{32}_ens{args.famount}.pkl') 254 | 255 | 256 | def main(): 257 | 258 | parser = argparse.ArgumentParser() 259 | parser.add_argument("-o", "--opt", type=str, 260 | default="./options/fast-SAMA-train.yml", help="the option file") 261 | 262 | args = parser.parse_args() 263 | with open(args.opt, "r") as f: 264 | opt = yaml.safe_load(f) 265 | print(opt) 266 | 267 | ## adaptively choose the device 268 | device = "cuda" if torch.cuda.is_available() else "cpu" 269 | 270 | if sys.gettrace(): 271 | print('in DEBUGE mode.') 272 | opt["name"] = "DEBUG" 273 | opt['train_num_workers']=0 274 | opt['test_num_workers']=0 275 | 276 | ## defining model and loading checkpoint 277 | 278 | bests_ = [] 279 | print('using device: {}'.format(device)) 280 | model = getattr(models, opt["model"]["type"])(**opt["model"]["args"]).to(device) 281 | 282 | if opt.get("split_seed", -1) > 0: 283 | num_splits = 10 284 | else: 285 | num_splits = 1 286 | 287 | stype = opt['stype'] if opt['stype'] in ['sama', 'sama-c', 'sama-mix', 'sama+spm', 'sama+swm'] else 'fragments' 288 | for split in range(num_splits): 289 | 290 | val_datasets = {} 291 | for key in opt["data"]: 292 | if key.startswith("val"): 293 | val_datasets[key] = getattr(datasets, opt["data"][key]["type"])(opt["data"][key]["args"], stype=stype) 294 | print('dataset=[{}], with {} samples.'.format(key, len(val_datasets[key]))) 295 | 296 | 297 | val_loaders = {} 298 | for key, val_dataset in val_datasets.items(): 299 | val_loaders[key] = torch.utils.data.DataLoader(val_dataset, 300 | batch_size=opt["test_batch_size"], 301 | num_workers=opt["test_num_workers"], 302 | pin_memory=False, 303 | shuffle=False, 304 | drop_last=False) 305 | 306 | train_datasets = {} 307 | for key in opt["data"]: 308 | if key.startswith("train"): 309 | train_dataset = getattr(datasets, opt["data"][key]["type"])(opt["data"][key]["args"], stype=stype) 310 | train_datasets[key] = train_dataset 311 | print('dataset=[{}], with {} samples.'.format(key, len(train_datasets[key]))) 312 | 313 | train_loaders = {} 314 | for key, train_dataset in train_datasets.items(): 315 | train_loaders[key] = torch.utils.data.DataLoader(train_dataset, 316 | batch_size=opt["train_batch_size"], 317 | num_workers=opt["train_num_workers"], 318 | shuffle=True) 319 | 320 | 321 | if "load_path_aux" in opt: 322 | state_dict = torch.load(opt["load_path"], map_location=device)["state_dict"] 323 | aux_state_dict = torch.load(opt["load_path_aux"], map_location=device)["state_dict"] 324 | 325 | from collections import OrderedDict 326 | 327 | fusion_state_dict = OrderedDict() 328 | for k, v in state_dict.items(): 329 | if "head" in k: 330 | continue 331 | if k.startswith("vqa_head"): 332 | ki = k.replace("vqa", "fragments") 333 | else: 334 | ki = k 335 | fusion_state_dict[ki] = v 336 | 337 | for k, v in aux_state_dict.items(): 338 | if "head" in k: 339 | continue 340 | if k.startswith("frag"): 341 | continue 342 | if k.startswith("vqa_head"): 343 | ki = k.replace("vqa", "resize") 344 | else: 345 | ki = k 346 | fusion_state_dict[ki] = v 347 | state_dict = fusion_state_dict 348 | print(model.load_state_dict(state_dict)) 349 | 350 | elif "load_path" in opt: 351 | state_dict = torch.load(opt["load_path"], map_location=device) 352 | 353 | if "state_dict" in state_dict: 354 | ### migrate training weights from mmaction 355 | 
state_dict = state_dict["state_dict"] 356 | from collections import OrderedDict 357 | 358 | i_state_dict = OrderedDict() 359 | for key in state_dict.keys(): 360 | if "head" in key: 361 | continue 362 | if "cls" in key: 363 | tkey = key.replace("cls", "vqa") 364 | elif "backbone" in key: 365 | i_state_dict[key] = state_dict[key] 366 | i_state_dict["fragments_"+key] = state_dict[key] 367 | i_state_dict["resize_"+key] = state_dict[key] 368 | else: 369 | i_state_dict[key] = state_dict[key] 370 | 371 | t_state_dict = model.state_dict() 372 | for key, value in t_state_dict.items(): 373 | if key in i_state_dict and i_state_dict[key].shape != value.shape: 374 | i_state_dict.pop(key) 375 | 376 | print(model.load_state_dict(i_state_dict, strict=False)) 377 | 378 | if opt["ema"]: 379 | from copy import deepcopy 380 | model_ema = deepcopy(model) 381 | else: 382 | model_ema = None 383 | 384 | #profile_inference(val_dataset, model, device) 385 | 386 | # finetune the model 387 | param_groups=[] 388 | 389 | for key, value in dict(model.named_children()).items(): 390 | if "backbone" in key: 391 | param_groups += [{"params": value.parameters(), "lr": opt["optimizer"]["lr"] * opt["optimizer"]["backbone_lr_mult"]}] 392 | else: 393 | param_groups += [{"params": value.parameters(), "lr": opt["optimizer"]["lr"]}] 394 | 395 | optimizer = torch.optim.AdamW(lr=opt["optimizer"]["lr"], 396 | params=param_groups, 397 | weight_decay=opt["optimizer"]["wd"]) 398 | warmup_iter = 0 399 | for train_loader in train_loaders.values(): 400 | warmup_iter += int(opt["warmup_epochs"] * len(train_loader)) 401 | max_iter = int((opt["num_epochs"] + opt["l_num_epochs"]) * len(train_loader)) 402 | lr_lambda = ( 403 | lambda cur_iter: cur_iter / warmup_iter 404 | if cur_iter <= warmup_iter 405 | else 0.5 * (1 + math.cos(math.pi * (cur_iter - warmup_iter) / max_iter))) 406 | 407 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) 408 | 409 | bests = {} 410 | for key in val_loaders: 411 | bests[key] = -1,-1,-1,1000 412 | 413 | os.makedirs('./tensorboard/', exist_ok=True) 414 | os.makedirs('./pretrained_weights/', exist_ok=True) 415 | writer = SummaryWriter('./tensorboard/{}'.format(opt['name'])) 416 | 417 | for epoch in range(opt["num_epochs"]): 418 | print(f"Finetune Epoch {epoch}:") 419 | 420 | for key, train_loader in train_loaders.items(): 421 | finetune_epoch( 422 | train_loader, model, model_ema, optimizer, scheduler, device, epoch, writer, 423 | opt.get("need_upsampled", False), opt.get("need_feat", False), opt.get("need_fused", False), 424 | ) 425 | 426 | state_dict = model.state_dict() 427 | torch.save({"state_dict": state_dict}, 'pretrained_weights/{}_Epoch_{:02d}.pth'.format(opt["name"], epoch)) 428 | 429 | print(f"evaluation ..") 430 | 431 | for key in val_loaders: 432 | bests[key] = inference_set( 433 | val_loaders[key], 434 | model_ema if model_ema is not None else model, 435 | device, bests[key], epoch, writer, 436 | save_model=opt["save_model"], save_name=opt["name"], 437 | suffix=key+"_s", 438 | ) 439 | 440 | 441 | if opt["num_epochs"] > 0: 442 | for key in val_loaders: 443 | print( 444 | f"""For the finetuning process on {key} with {len(val_datasets[key])} videos, 445 | the best validation accuracy of the model-s is as follows: 446 | SROCC: {bests[key][0]:.4f} 447 | PLCC: {bests[key][1]:.4f} 448 | KROCC: {bests[key][2]:.4f} 449 | RMSE: {bests[key][3]:.4f}.""" 450 | ) 451 | 452 | 453 | 454 | if __name__ == "__main__": 455 | main() 456 | 
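Both training scripts above read the annotation file as plain comma-separated text with four fields per line, of which only the first (the file name) and the last (the MOS) are kept; the middle two fields are discarded, and each split is reshuffled reproducibly via random.Random(split * 123). A minimal sketch of that parsing with made-up example lines (the file names, middle fields, scores, and data_prefix below are placeholders, not real data):

import os.path as osp

# Hypothetical annotation lines in the layout the scripts expect; only the first and
# last comma-separated fields are used, the two middle fields are ignored by the loaders.
lines = [
    "A001.mp4,1920,1080,62.1148",
    "A002.mp4,1280,720,35.6025",
]

data_prefix = "/path/to/LIVE_VQC"   # placeholder data root
video_infos = []
for line in lines:
    fileid, _, _, label = line.strip().split(",")
    video_infos.append(dict(filename=osp.join(data_prefix, fileid),
                            label=float(label),
                            fileid=fileid))
print(video_infos[0])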
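Two of the loss helpers defined in both scripts, rescaled_l2_loss and rplcc_loss, are not called by the training loop and would fail if they were: the former reads an eps that is never defined in its scope, and torch.cov expects the two variables stacked into a single 2-D tensor rather than passed as separate arguments. A corrected, self-contained sketch, under the assumption that eps was meant to be a keyword argument defaulting to 1e-8 (torch.cov requires a reasonably recent PyTorch):

import torch

def rescaled_l2_loss(y_pred, y, eps=1e-8):
    # z-normalise both vectors before the MSE; eps guards against zero variance
    y_pred_rs = (y_pred - y_pred.mean()) / (y_pred.std() + eps)
    y_rs = (y - y.mean()) / (y.std() + eps)
    return torch.nn.functional.mse_loss(y_pred_rs, y_rs)

def rplcc_loss(y_pred, y, eps=1e-8):
    # literally (1 - PLCC) / 2; torch.cov takes the variables stacked as rows of one matrix
    cov = torch.cov(torch.stack([y_pred.flatten(), y.flatten()]))[0, 1]
    std = (torch.std(y_pred) + eps) * (torch.std(y) + eps)
    return (1 - cov / std) / 2

if __name__ == "__main__":
    y = torch.rand(16)
    y_pred = y + 0.05 * torch.randn(16)
    print(float(rescaled_l2_loss(y_pred, y)), float(rplcc_loss(y_pred, y)))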
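When ema is enabled, finetune_epoch keeps an exponential moving average of the weights with decay 0.999, and inference_set then evaluates that shadow copy instead of the live model. The same update rule in isolation (the toy Linear module is purely illustrative):

import torch
from copy import deepcopy

model = torch.nn.Linear(4, 1)        # stand-in for the VQA model, illustration only
model_ema = deepcopy(model)

decay = 0.999                        # same constant used in finetune_epoch
with torch.no_grad():
    ema_params = dict(model_ema.named_parameters())
    for name, p in model.named_parameters():
        # shadow <- decay * shadow + (1 - decay) * live weights
        ema_params[name].data.mul_(decay).add_(p.data, alpha=1 - decay)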
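During validation, inference_set receives each sampled view as one (B, C, T, H, W) tensor and folds the temporal axis into the batch so every clip is scored independently. A shape-only sketch of that reshape/permute with made-up sizes, tagging each frame so it is clear where it lands:

import torch

# Hypothetical sizes for illustration only.
b, c, t, h, w, num_clips = 2, 3, 32, 4, 4, 4
t_clip = t // num_clips

# Tag every frame with its temporal index so we can track it through the reshape.
video = torch.arange(t).float().view(1, 1, t, 1, 1).expand(b, c, t, h, w)

clips = (video.reshape(b, c, num_clips, t_clip, h, w)
              .permute(0, 2, 1, 3, 4, 5)
              .reshape(b * num_clips, c, t_clip, h, w))

# Clip k of sample n sits at index n * num_clips + k and holds frames [k*t_clip, (k+1)*t_clip).
print(clips.shape)           # torch.Size([8, 3, 8, 4, 4])
print(clips[1, 0, :, 0, 0])  # tensor([ 8.,  9., 10., 11., 12., 13., 14., 15.])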
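The optimizer in both scripts uses per-parameter-group learning rates (the backbone group gets lr * backbone_lr_mult) together with a LambdaLR multiplier that warms up linearly and then follows a cosine decay measured from the end of warmup. The scheduling rule in isolation, with made-up option values standing in for the YAML settings:

import math

# Illustrative values only; in the scripts these come from the option file.
warmup_epochs, num_epochs, l_num_epochs, iters_per_epoch = 2.5, 30, 0, 200

warmup_iter = int(warmup_epochs * iters_per_epoch)
max_iter = int((num_epochs + l_num_epochs) * iters_per_epoch)

# Same rule the scripts hand to torch.optim.lr_scheduler.LambdaLR:
# linear warmup, then cosine decay starting at the end of warmup.
lr_lambda = (lambda cur_iter: cur_iter / warmup_iter
             if cur_iter <= warmup_iter
             else 0.5 * (1 + math.cos(math.pi * (cur_iter - warmup_iter) / max_iter)))

for it in (0, warmup_iter // 2, warmup_iter, max_iter // 2, max_iter - 1):
    print(it, round(lr_lambda(it), 4))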
-------------------------------------------------------------------------------- /IQA/fastvqa/models/swin_v1.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Swin Transformer 3 | # Copyright (c) 2021 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.utils.checkpoint as checkpoint 10 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 11 | 12 | try: 13 | import os, sys 14 | 15 | kernel_path = os.path.abspath(os.path.join('..')) 16 | sys.path.append(kernel_path) 17 | from kernels.window_process.window_process import WindowProcess, WindowProcessReverse 18 | 19 | except: 20 | WindowProcess = None 21 | WindowProcessReverse = None 22 | print("[Warning] Fused window process have not been installed. Please refer to get_started.md for installation.") 23 | 24 | 25 | class Mlp(nn.Module): 26 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x): 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | 43 | 44 | def window_partition(x, window_size): 45 | """ 46 | Args: 47 | x: (B, H, W, C) 48 | window_size (int): window size 49 | Returns: 50 | windows: (num_windows*B, window_size, window_size, C) 51 | """ 52 | B, H, W, C = x.shape 53 | x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) 54 | windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) 55 | return windows 56 | 57 | 58 | def window_reverse(windows, window_size, H, W): 59 | """ 60 | Args: 61 | windows: (num_windows*B, window_size, window_size, C) 62 | window_size (int): Window size 63 | H (int): Height of image 64 | W (int): Width of image 65 | Returns: 66 | x: (B, H, W, C) 67 | """ 68 | B = int(windows.shape[0] / (H * W / window_size / window_size)) 69 | x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) 70 | x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) 71 | return x 72 | 73 | 74 | class WindowAttention(nn.Module): 75 | r""" Window based multi-head self attention (W-MSA) module with relative position bias. 76 | It supports both of shifted and non-shifted window. 77 | Args: 78 | dim (int): Number of input channels. 79 | window_size (tuple[int]): The height and width of the window. 80 | num_heads (int): Number of attention heads. 81 | qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True 82 | qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set 83 | attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 84 | proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 85 | """ 86 | 87 | def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): 88 | 89 | super().__init__() 90 | self.dim = dim 91 | self.window_size = window_size # Wh, Ww 92 | self.num_heads = num_heads 93 | head_dim = dim // num_heads 94 | self.scale = qk_scale or head_dim ** -0.5 95 | 96 | # define a parameter table of relative position bias 97 | self.relative_position_bias_table = nn.Parameter( 98 | torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH 99 | 100 | # get pair-wise relative position index for each token inside the window 101 | coords_h = torch.arange(self.window_size[0]) 102 | coords_w = torch.arange(self.window_size[1]) 103 | coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww 104 | coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww 105 | relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww 106 | relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 107 | relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 108 | relative_coords[:, :, 1] += self.window_size[1] - 1 109 | relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 110 | relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww 111 | self.register_buffer("relative_position_index", relative_position_index) 112 | 113 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 114 | self.attn_drop = nn.Dropout(attn_drop) 115 | self.proj = nn.Linear(dim, dim) 116 | self.proj_drop = nn.Dropout(proj_drop) 117 | 118 | trunc_normal_(self.relative_position_bias_table, std=.02) 119 | self.softmax = nn.Softmax(dim=-1) 120 | 121 | def forward(self, x, mask=None): 122 | """ 123 | Args: 124 | x: input features with shape of (num_windows*B, N, C) 125 | mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None 126 | """ 127 | B_, N, C = x.shape 128 | qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 129 | q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) 130 | 131 | q = q * self.scale 132 | attn = (q @ k.transpose(-2, -1)) 133 | 134 | relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( 135 | self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH 136 | relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww 137 | attn = attn + relative_position_bias.unsqueeze(0) 138 | 139 | if mask is not None: 140 | nW = mask.shape[0] 141 | attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) 142 | attn = attn.view(-1, self.num_heads, N, N) 143 | attn = self.softmax(attn) 144 | else: 145 | attn = self.softmax(attn) 146 | 147 | attn = self.attn_drop(attn) 148 | 149 | x = (attn @ v).transpose(1, 2).reshape(B_, N, C) 150 | x = self.proj(x) 151 | x = self.proj_drop(x) 152 | return x 153 | 154 | def extra_repr(self) -> str: 155 | return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' 156 | 157 | def flops(self, N): 158 | # calculate flops for 1 window with token length of N 159 | flops = 0 160 | # qkv = self.qkv(x) 161 | flops += N * self.dim * 3 * self.dim 162 | # attn = (q @ k.transpose(-2, -1)) 163 | flops += self.num_heads * N * (self.dim // self.num_heads) * N 164 | # x = (attn @ v) 165 | flops += self.num_heads * N * N * (self.dim // 
self.num_heads) 166 | # x = self.proj(x) 167 | flops += N * self.dim * self.dim 168 | return flops 169 | 170 | 171 | class SwinTransformerBlock(nn.Module): 172 | r""" Swin Transformer Block. 173 | Args: 174 | dim (int): Number of input channels. 175 | input_resolution (tuple[int]): Input resulotion. 176 | num_heads (int): Number of attention heads. 177 | window_size (int): Window size. 178 | shift_size (int): Shift size for SW-MSA. 179 | mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 180 | qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True 181 | qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. 182 | drop (float, optional): Dropout rate. Default: 0.0 183 | attn_drop (float, optional): Attention dropout rate. Default: 0.0 184 | drop_path (float, optional): Stochastic depth rate. Default: 0.0 185 | act_layer (nn.Module, optional): Activation layer. Default: nn.GELU 186 | norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm 187 | fused_window_process (bool, optional): If True, use one kernel to fused window shift & window partition for acceleration, similar for the reversed part. Default: False 188 | """ 189 | 190 | def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, 191 | mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., 192 | act_layer=nn.GELU, norm_layer=nn.LayerNorm, 193 | fused_window_process=False): 194 | super().__init__() 195 | self.dim = dim 196 | self.input_resolution = input_resolution 197 | self.num_heads = num_heads 198 | self.window_size = window_size 199 | self.shift_size = shift_size 200 | self.mlp_ratio = mlp_ratio 201 | if min(self.input_resolution) <= self.window_size: 202 | # if window size is larger than input resolution, we don't partition windows 203 | self.shift_size = 0 204 | self.window_size = min(self.input_resolution) 205 | assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" 206 | 207 | self.norm1 = norm_layer(dim) 208 | self.attn = WindowAttention( 209 | dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, 210 | qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) 211 | 212 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 213 | self.norm2 = norm_layer(dim) 214 | mlp_hidden_dim = int(dim * mlp_ratio) 215 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) 216 | 217 | if self.shift_size > 0: 218 | # calculate attention mask for SW-MSA 219 | H, W = self.input_resolution 220 | img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 221 | h_slices = (slice(0, -self.window_size), 222 | slice(-self.window_size, -self.shift_size), 223 | slice(-self.shift_size, None)) 224 | w_slices = (slice(0, -self.window_size), 225 | slice(-self.window_size, -self.shift_size), 226 | slice(-self.shift_size, None)) 227 | cnt = 0 228 | for h in h_slices: 229 | for w in w_slices: 230 | img_mask[:, h, w, :] = cnt 231 | cnt += 1 232 | 233 | mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 234 | mask_windows = mask_windows.view(-1, self.window_size * self.window_size) 235 | attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) 236 | attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) 237 | else: 238 | attn_mask = None 239 | 240 | self.register_buffer("attn_mask", attn_mask) 241 | self.fused_window_process = fused_window_process 242 | 243 | def forward(self, x): 244 | H, W = self.input_resolution 245 | B, L, C = x.shape 246 | assert L == H * W, "input feature has wrong size" 247 | 248 | shortcut = x 249 | x = self.norm1(x) 250 | x = x.view(B, H, W, C) 251 | 252 | # cyclic shift 253 | if self.shift_size > 0: 254 | if not self.fused_window_process: 255 | shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) 256 | # partition windows 257 | x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C 258 | else: 259 | x_windows = WindowProcess.apply(x, B, H, W, C, -self.shift_size, self.window_size) 260 | else: 261 | shifted_x = x 262 | # partition windows 263 | x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C 264 | 265 | x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C 266 | 267 | # W-MSA/SW-MSA 268 | attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C 269 | 270 | # merge windows 271 | attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) 272 | 273 | # reverse cyclic shift 274 | if self.shift_size > 0: 275 | if not self.fused_window_process: 276 | shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C 277 | x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) 278 | else: 279 | x = WindowProcessReverse.apply(attn_windows, B, H, W, C, self.shift_size, self.window_size) 280 | else: 281 | shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C 282 | x = shifted_x 283 | x = x.view(B, H * W, C) 284 | x = shortcut + self.drop_path(x) 285 | 286 | # FFN 287 | x = x + self.drop_path(self.mlp(self.norm2(x))) 288 | 289 | return x 290 | 291 | def extra_repr(self) -> str: 292 | return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ 293 | f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" 294 | 295 | def flops(self): 296 | flops = 0 297 | H, W = self.input_resolution 298 | # norm1 299 | flops += self.dim * H * W 300 | # W-MSA/SW-MSA 301 | nW = H * W / self.window_size / self.window_size 302 | flops += nW * 
self.attn.flops(self.window_size * self.window_size) 303 | # mlp 304 | flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio 305 | # norm2 306 | flops += self.dim * H * W 307 | return flops 308 | 309 | 310 | class PatchMerging(nn.Module): 311 | r""" Patch Merging Layer. 312 | Args: 313 | input_resolution (tuple[int]): Resolution of input feature. 314 | dim (int): Number of input channels. 315 | norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm 316 | """ 317 | 318 | def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): 319 | super().__init__() 320 | self.input_resolution = input_resolution 321 | self.dim = dim 322 | self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) 323 | self.norm = norm_layer(4 * dim) 324 | 325 | def forward(self, x): 326 | """ 327 | x: B, H*W, C 328 | """ 329 | H, W = self.input_resolution 330 | B, L, C = x.shape 331 | assert L == H * W, "input feature has wrong size" 332 | assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." 333 | 334 | x = x.view(B, H, W, C) 335 | 336 | x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C 337 | x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C 338 | x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C 339 | x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C 340 | x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C 341 | x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C 342 | 343 | x = self.norm(x) 344 | x = self.reduction(x) 345 | 346 | return x 347 | 348 | def extra_repr(self) -> str: 349 | return f"input_resolution={self.input_resolution}, dim={self.dim}" 350 | 351 | def flops(self): 352 | H, W = self.input_resolution 353 | flops = H * W * self.dim 354 | flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim 355 | return flops 356 | 357 | 358 | class BasicLayer(nn.Module): 359 | """ A basic Swin Transformer layer for one stage. 360 | Args: 361 | dim (int): Number of input channels. 362 | input_resolution (tuple[int]): Input resolution. 363 | depth (int): Number of blocks. 364 | num_heads (int): Number of attention heads. 365 | window_size (int): Local window size. 366 | mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 367 | qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True 368 | qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. 369 | drop (float, optional): Dropout rate. Default: 0.0 370 | attn_drop (float, optional): Attention dropout rate. Default: 0.0 371 | drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 372 | norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm 373 | downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None 374 | use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 375 | fused_window_process (bool, optional): If True, use one kernel to fused window shift & window partition for acceleration, similar for the reversed part. 
Default: False 376 | """ 377 | 378 | def __init__(self, dim, input_resolution, depth, num_heads, window_size, 379 | mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., 380 | drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, 381 | fused_window_process=False): 382 | 383 | super().__init__() 384 | self.dim = dim 385 | self.input_resolution = input_resolution 386 | self.depth = depth 387 | self.use_checkpoint = use_checkpoint 388 | 389 | # build blocks 390 | self.blocks = nn.ModuleList([ 391 | SwinTransformerBlock(dim=dim, input_resolution=input_resolution, 392 | num_heads=num_heads, window_size=window_size, 393 | shift_size=0 if (i % 2 == 0) else window_size // 2, 394 | mlp_ratio=mlp_ratio, 395 | qkv_bias=qkv_bias, qk_scale=qk_scale, 396 | drop=drop, attn_drop=attn_drop, 397 | drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, 398 | norm_layer=norm_layer, 399 | fused_window_process=fused_window_process) 400 | for i in range(depth)]) 401 | 402 | # patch merging layer 403 | if downsample is not None: 404 | self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) 405 | else: 406 | self.downsample = None 407 | 408 | def forward(self, x): 409 | for blk in self.blocks: 410 | if self.use_checkpoint: 411 | x = checkpoint.checkpoint(blk, x) 412 | else: 413 | x = blk(x) 414 | if self.downsample is not None: 415 | x = self.downsample(x) 416 | return x 417 | 418 | def extra_repr(self) -> str: 419 | return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" 420 | 421 | def flops(self): 422 | flops = 0 423 | for blk in self.blocks: 424 | flops += blk.flops() 425 | if self.downsample is not None: 426 | flops += self.downsample.flops() 427 | return flops 428 | 429 | 430 | class PatchEmbed(nn.Module): 431 | r""" Image to Patch Embedding 432 | Args: 433 | img_size (int): Image size. Default: 224. 434 | patch_size (int): Patch token size. Default: 4. 435 | in_chans (int): Number of input image channels. Default: 3. 436 | embed_dim (int): Number of linear projection output channels. Default: 96. 437 | norm_layer (nn.Module, optional): Normalization layer. Default: None 438 | """ 439 | 440 | def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): 441 | super().__init__() 442 | img_size = to_2tuple(img_size) 443 | patch_size = to_2tuple(patch_size) 444 | patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] 445 | self.img_size = img_size 446 | self.patch_size = patch_size 447 | self.patches_resolution = patches_resolution 448 | self.num_patches = patches_resolution[0] * patches_resolution[1] 449 | 450 | self.in_chans = in_chans 451 | self.embed_dim = embed_dim 452 | 453 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) 454 | if norm_layer is not None: 455 | self.norm = norm_layer(embed_dim) 456 | else: 457 | self.norm = None 458 | 459 | def forward(self, x): 460 | B, C, H, W = x.shape 461 | # FIXME look at relaxing size constraints 462 | assert H == self.img_size[0] and W == self.img_size[1], \ 463 | f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
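        # The single line below is the entire patch-embedding step: a Conv2d whose kernel
        # size equals its stride cuts the image into non-overlapping patch_size x patch_size
        # patches and linearly projects each one to embed_dim channels, and
        # flatten(2).transpose(1, 2) turns the resulting (B, embed_dim, Ph, Pw) map into the
        # (B, Ph*Pw, embed_dim) token sequence consumed by the Swin layers.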
464 | x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C 465 | if self.norm is not None: 466 | x = self.norm(x) 467 | return x 468 | 469 | def flops(self): 470 | Ho, Wo = self.patches_resolution 471 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 472 | if self.norm is not None: 473 | flops += Ho * Wo * self.embed_dim 474 | return flops 475 | 476 | 477 | class SwinTransformer(nn.Module): 478 | r""" Swin Transformer 479 | A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - 480 | https://arxiv.org/pdf/2103.14030 481 | Args: 482 | img_size (int | tuple(int)): Input image size. Default 224 483 | patch_size (int | tuple(int)): Patch size. Default: 4 484 | in_chans (int): Number of input image channels. Default: 3 485 | num_classes (int): Number of classes for classification head. Default: 1000 486 | embed_dim (int): Patch embedding dimension. Default: 96 487 | depths (tuple(int)): Depth of each Swin Transformer layer. 488 | num_heads (tuple(int)): Number of attention heads in different layers. 489 | window_size (int): Window size. Default: 7 490 | mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 491 | qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True 492 | qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None 493 | drop_rate (float): Dropout rate. Default: 0 494 | attn_drop_rate (float): Attention dropout rate. Default: 0 495 | drop_path_rate (float): Stochastic depth rate. Default: 0.1 496 | norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. 497 | ape (bool): If True, add absolute position embedding to the patch embedding. Default: False 498 | patch_norm (bool): If True, add normalization after patch embedding. Default: True 499 | use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False 500 | fused_window_process (bool, optional): If True, use one kernel to fused window shift & window partition for acceleration, similar for the reversed part. 
Default: False 501 | """ 502 | 503 | def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, 504 | embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], 505 | window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, 506 | drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, 507 | norm_layer=nn.LayerNorm, ape=False, patch_norm=True, 508 | use_checkpoint=False, fused_window_process=False, **kwargs): 509 | super().__init__() 510 | 511 | self.num_classes = num_classes 512 | self.num_layers = len(depths) 513 | self.embed_dim = embed_dim 514 | self.ape = ape 515 | self.patch_norm = patch_norm 516 | self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) 517 | self.mlp_ratio = mlp_ratio 518 | 519 | # split image into non-overlapping patches 520 | self.patch_embed = PatchEmbed( 521 | img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, 522 | norm_layer=norm_layer if self.patch_norm else None) 523 | num_patches = self.patch_embed.num_patches 524 | patches_resolution = self.patch_embed.patches_resolution 525 | self.patches_resolution = patches_resolution 526 | 527 | # absolute position embedding 528 | if self.ape: 529 | self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) 530 | trunc_normal_(self.absolute_pos_embed, std=.02) 531 | 532 | self.pos_drop = nn.Dropout(p=drop_rate) 533 | 534 | # stochastic depth 535 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule 536 | 537 | # build layers 538 | self.layers = nn.ModuleList() 539 | for i_layer in range(self.num_layers): 540 | layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), 541 | input_resolution=(patches_resolution[0] // (2 ** i_layer), 542 | patches_resolution[1] // (2 ** i_layer)), 543 | depth=depths[i_layer], 544 | num_heads=num_heads[i_layer], 545 | window_size=window_size, 546 | mlp_ratio=self.mlp_ratio, 547 | qkv_bias=qkv_bias, qk_scale=qk_scale, 548 | drop=drop_rate, attn_drop=attn_drop_rate, 549 | drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], 550 | norm_layer=norm_layer, 551 | downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, 552 | use_checkpoint=use_checkpoint, 553 | fused_window_process=fused_window_process) 554 | self.layers.append(layer) 555 | 556 | self.norm = norm_layer(self.num_features) 557 | self.avgpool = nn.AdaptiveAvgPool1d(1) 558 | self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() 559 | 560 | self.apply(self._init_weights) 561 | 562 | def _init_weights(self, m): 563 | if isinstance(m, nn.Linear): 564 | trunc_normal_(m.weight, std=.02) 565 | if isinstance(m, nn.Linear) and m.bias is not None: 566 | nn.init.constant_(m.bias, 0) 567 | elif isinstance(m, nn.LayerNorm): 568 | nn.init.constant_(m.bias, 0) 569 | nn.init.constant_(m.weight, 1.0) 570 | 571 | @torch.jit.ignore 572 | def no_weight_decay(self): 573 | return {'absolute_pos_embed'} 574 | 575 | @torch.jit.ignore 576 | def no_weight_decay_keywords(self): 577 | return {'relative_position_bias_table'} 578 | 579 | def forward_features(self, x): 580 | # x = self.patch_embed(x) 581 | if self.ape: 582 | x = x + self.absolute_pos_embed 583 | x = self.pos_drop(x) 584 | 585 | for i, layer in enumerate(self.layers): 586 | x = layer(x) 587 | 588 | x = self.norm(x) # B L C 589 | x = self.avgpool(x.transpose(1, 2)) # B C 1 590 | x = torch.flatten(x, 1) 591 | return x 592 | 593 | def forward(self, x): 594 | x = self.forward_features(x) 595 | # x = self.head(x) 596 | 
return x 597 | 598 | def flops(self): 599 | flops = 0 600 | flops += self.patch_embed.flops() 601 | for i, layer in enumerate(self.layers): 602 | flops += layer.flops() 603 | flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) 604 | flops += self.num_features * self.num_classes 605 | return flops -------------------------------------------------------------------------------- /IQA/fastvqa/models/swin_v2.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Swin Transformer V2 3 | # Copyright (c) 2022 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.utils.checkpoint as checkpoint 11 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 12 | import numpy as np 13 | 14 | 15 | class Mlp(nn.Module): 16 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): 17 | super().__init__() 18 | out_features = out_features or in_features 19 | hidden_features = hidden_features or in_features 20 | self.fc1 = nn.Linear(in_features, hidden_features) 21 | self.act = act_layer() 22 | self.fc2 = nn.Linear(hidden_features, out_features) 23 | self.drop = nn.Dropout(drop) 24 | 25 | def forward(self, x): 26 | x = self.fc1(x) 27 | x = self.act(x) 28 | x = self.drop(x) 29 | x = self.fc2(x) 30 | x = self.drop(x) 31 | return x 32 | 33 | 34 | def window_partition(x, window_size): 35 | """ 36 | Args: 37 | x: (B, H, W, C) 38 | window_size (int): window size 39 | Returns: 40 | windows: (num_windows*B, window_size, window_size, C) 41 | """ 42 | B, H, W, C = x.shape 43 | x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) 44 | windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) 45 | return windows 46 | 47 | 48 | def window_reverse(windows, window_size, H, W): 49 | """ 50 | Args: 51 | windows: (num_windows*B, window_size, window_size, C) 52 | window_size (int): Window size 53 | H (int): Height of image 54 | W (int): Width of image 55 | Returns: 56 | x: (B, H, W, C) 57 | """ 58 | B = int(windows.shape[0] / (H * W / window_size / window_size)) 59 | x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) 60 | x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) 61 | return x 62 | 63 | 64 | class WindowAttention(nn.Module): 65 | r""" Window based multi-head self attention (W-MSA) module with relative position bias. 66 | It supports both of shifted and non-shifted window. 67 | Args: 68 | dim (int): Number of input channels. 69 | window_size (tuple[int]): The height and width of the window. 70 | num_heads (int): Number of attention heads. 71 | qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True 72 | attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 73 | proj_drop (float, optional): Dropout ratio of output. Default: 0.0 74 | pretrained_window_size (tuple[int]): The height and width of the window in pre-training. 
75 | """ 76 | 77 | def __init__(self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0., 78 | pretrained_window_size=[0, 0]): 79 | 80 | super().__init__() 81 | self.dim = dim 82 | self.window_size = window_size # Wh, Ww 83 | self.pretrained_window_size = pretrained_window_size 84 | self.num_heads = num_heads 85 | 86 | self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True) 87 | 88 | # mlp to generate continuous relative position bias 89 | self.cpb_mlp = nn.Sequential(nn.Linear(2, 512, bias=True), 90 | nn.ReLU(inplace=True), 91 | nn.Linear(512, num_heads, bias=False)) 92 | 93 | # get relative_coords_table 94 | relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32) 95 | relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32) 96 | relative_coords_table = torch.stack( 97 | torch.meshgrid([relative_coords_h, 98 | relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2 99 | if pretrained_window_size[0] > 0: 100 | relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1) 101 | relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1) 102 | else: 103 | relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1) 104 | relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1) 105 | relative_coords_table *= 8 # normalize to -8, 8 106 | relative_coords_table = torch.sign(relative_coords_table) * torch.log2( 107 | torch.abs(relative_coords_table) + 1.0) / np.log2(8) 108 | 109 | self.register_buffer("relative_coords_table", relative_coords_table) 110 | 111 | # get pair-wise relative position index for each token inside the window 112 | coords_h = torch.arange(self.window_size[0]) 113 | coords_w = torch.arange(self.window_size[1]) 114 | coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww 115 | coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww 116 | relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww 117 | relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 118 | relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 119 | relative_coords[:, :, 1] += self.window_size[1] - 1 120 | relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 121 | relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww 122 | self.register_buffer("relative_position_index", relative_position_index) 123 | 124 | self.qkv = nn.Linear(dim, dim * 3, bias=False) 125 | if qkv_bias: 126 | self.q_bias = nn.Parameter(torch.zeros(dim)) 127 | self.v_bias = nn.Parameter(torch.zeros(dim)) 128 | else: 129 | self.q_bias = None 130 | self.v_bias = None 131 | self.attn_drop = nn.Dropout(attn_drop) 132 | self.proj = nn.Linear(dim, dim) 133 | self.proj_drop = nn.Dropout(proj_drop) 134 | self.softmax = nn.Softmax(dim=-1) 135 | 136 | def forward(self, x, mask=None): 137 | """ 138 | Args: 139 | x: input features with shape of (num_windows*B, N, C) 140 | mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None 141 | """ 142 | B_, N, C = x.shape 143 | qkv_bias = None 144 | if self.q_bias is not None: 145 | qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) 146 | qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) 147 | qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) 148 | q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript 
happy (cannot use tensor as tuple) 149 | 150 | # cosine attention 151 | attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)) 152 | logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01))).exp() 153 | attn = attn * logit_scale 154 | 155 | relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads) 156 | relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view( 157 | self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH 158 | relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww 159 | relative_position_bias = 16 * torch.sigmoid(relative_position_bias) 160 | attn = attn + relative_position_bias.unsqueeze(0) 161 | 162 | if mask is not None: 163 | nW = mask.shape[0] 164 | attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) 165 | attn = attn.view(-1, self.num_heads, N, N) 166 | attn = self.softmax(attn) 167 | else: 168 | attn = self.softmax(attn) 169 | 170 | attn = self.attn_drop(attn) 171 | 172 | x = (attn @ v).transpose(1, 2).reshape(B_, N, C) 173 | x = self.proj(x) 174 | x = self.proj_drop(x) 175 | return x 176 | 177 | def extra_repr(self) -> str: 178 | return f'dim={self.dim}, window_size={self.window_size}, ' \ 179 | f'pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}' 180 | 181 | def flops(self, N): 182 | # calculate flops for 1 window with token length of N 183 | flops = 0 184 | # qkv = self.qkv(x) 185 | flops += N * self.dim * 3 * self.dim 186 | # attn = (q @ k.transpose(-2, -1)) 187 | flops += self.num_heads * N * (self.dim // self.num_heads) * N 188 | # x = (attn @ v) 189 | flops += self.num_heads * N * N * (self.dim // self.num_heads) 190 | # x = self.proj(x) 191 | flops += N * self.dim * self.dim 192 | return flops 193 | 194 | 195 | class SwinTransformerBlock(nn.Module): 196 | r""" Swin Transformer Block. 197 | Args: 198 | dim (int): Number of input channels. 199 | input_resolution (tuple[int]): Input resulotion. 200 | num_heads (int): Number of attention heads. 201 | window_size (int): Window size. 202 | shift_size (int): Shift size for SW-MSA. 203 | mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 204 | qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True 205 | drop (float, optional): Dropout rate. Default: 0.0 206 | attn_drop (float, optional): Attention dropout rate. Default: 0.0 207 | drop_path (float, optional): Stochastic depth rate. Default: 0.0 208 | act_layer (nn.Module, optional): Activation layer. Default: nn.GELU 209 | norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm 210 | pretrained_window_size (int): Window size in pre-training. 
211 | """ 212 | 213 | def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, 214 | mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0., 215 | act_layer=nn.GELU, norm_layer=nn.LayerNorm, pretrained_window_size=0): 216 | super().__init__() 217 | self.dim = dim 218 | self.input_resolution = input_resolution 219 | self.num_heads = num_heads 220 | self.window_size = window_size 221 | self.shift_size = shift_size 222 | self.mlp_ratio = mlp_ratio 223 | if min(self.input_resolution) <= self.window_size: 224 | # if window size is larger than input resolution, we don't partition windows 225 | self.shift_size = 0 226 | self.window_size = min(self.input_resolution) 227 | assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" 228 | 229 | self.norm1 = norm_layer(dim) 230 | self.attn = WindowAttention( 231 | dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, 232 | qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, 233 | pretrained_window_size=to_2tuple(pretrained_window_size)) 234 | 235 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 236 | self.norm2 = norm_layer(dim) 237 | mlp_hidden_dim = int(dim * mlp_ratio) 238 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) 239 | 240 | if self.shift_size > 0: 241 | # calculate attention mask for SW-MSA 242 | H, W = self.input_resolution 243 | img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 244 | h_slices = (slice(0, -self.window_size), 245 | slice(-self.window_size, -self.shift_size), 246 | slice(-self.shift_size, None)) 247 | w_slices = (slice(0, -self.window_size), 248 | slice(-self.window_size, -self.shift_size), 249 | slice(-self.shift_size, None)) 250 | cnt = 0 251 | for h in h_slices: 252 | for w in w_slices: 253 | img_mask[:, h, w, :] = cnt 254 | cnt += 1 255 | 256 | mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 257 | mask_windows = mask_windows.view(-1, self.window_size * self.window_size) 258 | attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) 259 | attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) 260 | else: 261 | attn_mask = None 262 | 263 | self.register_buffer("attn_mask", attn_mask) 264 | 265 | def forward(self, x): 266 | H, W = self.input_resolution 267 | B, L, C = x.shape 268 | assert L == H * W, "input feature has wrong size" 269 | 270 | shortcut = x 271 | x = x.view(B, H, W, C) 272 | 273 | # cyclic shift 274 | if self.shift_size > 0: 275 | shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) 276 | else: 277 | shifted_x = x 278 | 279 | # partition windows 280 | x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C 281 | x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C 282 | 283 | # W-MSA/SW-MSA 284 | attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C 285 | 286 | # merge windows 287 | attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) 288 | shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C 289 | 290 | # reverse cyclic shift 291 | if self.shift_size > 0: 292 | x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) 293 | else: 294 | x = shifted_x 295 | x = x.view(B, H * W, C) 296 | x = shortcut + self.drop_path(self.norm1(x)) 
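        # res-post-norm: norm1 normalises the attention output *inside* the residual branch (post-normalization, in contrast to the pre-norm ordering of Swin-V1); together with BasicLayer._init_respostnorm(), which zero-initialises norm1/norm2, each block therefore starts out as an identity mapping.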
297 | 298 | # FFN 299 | x = x + self.drop_path(self.norm2(self.mlp(x))) 300 | 301 | return x 302 | 303 | def extra_repr(self) -> str: 304 | return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ 305 | f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" 306 | 307 | def flops(self): 308 | flops = 0 309 | H, W = self.input_resolution 310 | # norm1 311 | flops += self.dim * H * W 312 | # W-MSA/SW-MSA 313 | nW = H * W / self.window_size / self.window_size 314 | flops += nW * self.attn.flops(self.window_size * self.window_size) 315 | # mlp 316 | flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio 317 | # norm2 318 | flops += self.dim * H * W 319 | return flops 320 | 321 | 322 | class PatchMerging(nn.Module): 323 | r""" Patch Merging Layer. 324 | Args: 325 | input_resolution (tuple[int]): Resolution of input feature. 326 | dim (int): Number of input channels. 327 | norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm 328 | """ 329 | 330 | def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): 331 | super().__init__() 332 | self.input_resolution = input_resolution 333 | self.dim = dim 334 | self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) 335 | self.norm = norm_layer(2 * dim) 336 | 337 | def forward(self, x): 338 | """ 339 | x: B, H*W, C 340 | """ 341 | H, W = self.input_resolution 342 | B, L, C = x.shape 343 | assert L == H * W, "input feature has wrong size" 344 | assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." 345 | 346 | x = x.view(B, H, W, C) 347 | 348 | x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C 349 | x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C 350 | x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C 351 | x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C 352 | x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C 353 | x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C 354 | 355 | x = self.reduction(x) 356 | x = self.norm(x) 357 | 358 | return x 359 | 360 | def extra_repr(self) -> str: 361 | return f"input_resolution={self.input_resolution}, dim={self.dim}" 362 | 363 | def flops(self): 364 | H, W = self.input_resolution 365 | flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim 366 | flops += H * W * self.dim // 2 367 | return flops 368 | 369 | 370 | class BasicLayer(nn.Module): 371 | """ A basic Swin Transformer layer for one stage. 372 | Args: 373 | dim (int): Number of input channels. 374 | input_resolution (tuple[int]): Input resolution. 375 | depth (int): Number of blocks. 376 | num_heads (int): Number of attention heads. 377 | window_size (int): Local window size. 378 | mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 379 | qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True 380 | drop (float, optional): Dropout rate. Default: 0.0 381 | attn_drop (float, optional): Attention dropout rate. Default: 0.0 382 | drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 383 | norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm 384 | downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None 385 | use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 386 | pretrained_window_size (int): Local window size in pre-training. 
387 | """ 388 | 389 | def __init__(self, dim, input_resolution, depth, num_heads, window_size, 390 | mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., 391 | drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, 392 | pretrained_window_size=0): 393 | 394 | super().__init__() 395 | self.dim = dim 396 | self.input_resolution = input_resolution 397 | self.depth = depth 398 | self.use_checkpoint = use_checkpoint 399 | 400 | # build blocks 401 | self.blocks = nn.ModuleList([ 402 | SwinTransformerBlock(dim=dim, input_resolution=input_resolution, 403 | num_heads=num_heads, window_size=window_size, 404 | shift_size=0 if (i % 2 == 0) else window_size // 2, 405 | mlp_ratio=mlp_ratio, 406 | qkv_bias=qkv_bias, 407 | drop=drop, attn_drop=attn_drop, 408 | drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, 409 | norm_layer=norm_layer, 410 | pretrained_window_size=pretrained_window_size) 411 | for i in range(depth)]) 412 | 413 | # patch merging layer 414 | if downsample is not None: 415 | self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) 416 | else: 417 | self.downsample = None 418 | 419 | def forward(self, x): 420 | for blk in self.blocks: 421 | if self.use_checkpoint: 422 | x = checkpoint.checkpoint(blk, x) 423 | else: 424 | x = blk(x) 425 | if self.downsample is not None: 426 | x = self.downsample(x) 427 | return x 428 | 429 | def extra_repr(self) -> str: 430 | return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" 431 | 432 | def flops(self): 433 | flops = 0 434 | for blk in self.blocks: 435 | flops += blk.flops() 436 | if self.downsample is not None: 437 | flops += self.downsample.flops() 438 | return flops 439 | 440 | def _init_respostnorm(self): 441 | for blk in self.blocks: 442 | nn.init.constant_(blk.norm1.bias, 0) 443 | nn.init.constant_(blk.norm1.weight, 0) 444 | nn.init.constant_(blk.norm2.bias, 0) 445 | nn.init.constant_(blk.norm2.weight, 0) 446 | 447 | 448 | class PatchEmbed(nn.Module): 449 | r""" Image to Patch Embedding 450 | Args: 451 | img_size (int): Image size. Default: 224. 452 | patch_size (int): Patch token size. Default: 4. 453 | in_chans (int): Number of input image channels. Default: 3. 454 | embed_dim (int): Number of linear projection output channels. Default: 96. 455 | norm_layer (nn.Module, optional): Normalization layer. Default: None 456 | """ 457 | 458 | def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): 459 | super().__init__() 460 | img_size = to_2tuple(img_size) 461 | patch_size = to_2tuple(patch_size) 462 | patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] 463 | self.img_size = img_size 464 | self.patch_size = patch_size 465 | self.patches_resolution = patches_resolution 466 | self.num_patches = patches_resolution[0] * patches_resolution[1] 467 | 468 | self.in_chans = in_chans 469 | self.embed_dim = embed_dim 470 | 471 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) 472 | if norm_layer is not None: 473 | self.norm = norm_layer(embed_dim) 474 | else: 475 | self.norm = None 476 | 477 | def forward(self, x): 478 | B, C, H, W = x.shape 479 | # FIXME look at relaxing size constraints 480 | assert H == self.img_size[0] and W == self.img_size[1], \ 481 | f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
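        # self.proj is a Conv2d with kernel_size == stride == patch_size, so each non-overlapping patch is embedded into an embed_dim vector; flatten(2) + transpose(1, 2) below turns the feature map into a token sequence of shape (B, Ph*Pw, embed_dim).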
482 | x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C 483 | if self.norm is not None: 484 | x = self.norm(x) 485 | return x 486 | 487 | def flops(self): 488 | Ho, Wo = self.patches_resolution 489 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 490 | if self.norm is not None: 491 | flops += Ho * Wo * self.embed_dim 492 | return flops 493 | 494 | 495 | class SwinTransformerV2(nn.Module): 496 | r""" Swin Transformer V2 497 | A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution` - 498 | https://arxiv.org/abs/2111.09883 499 | Args: 500 | img_size (int | tuple(int)): Input image size. Default 256 501 | patch_size (int | tuple(int)): Patch size. Default: 4 502 | in_chans (int): Number of input image channels. Default: 3 503 | num_classes (int): Number of classes for classification head. Default: 1000 504 | embed_dim (int): Patch embedding dimension. Default: 96 505 | depths (tuple(int)): Depth of each Swin Transformer layer. 506 | num_heads (tuple(int)): Number of attention heads in different layers. 507 | window_size (int): Window size. Default: 8 508 | mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 509 | qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True 510 | drop_rate (float): Dropout rate. Default: 0 511 | attn_drop_rate (float): Attention dropout rate. Default: 0 512 | drop_path_rate (float): Stochastic depth rate. Default: 0.2 513 | norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. 514 | ape (bool): If True, add absolute position embedding to the patch embedding. Default: False 515 | patch_norm (bool): If True, add normalization after patch embedding. Default: True 516 | use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False 517 | pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer. 
518 | """ 519 | 520 | def __init__(self, img_size=256, patch_size=4, in_chans=3, num_classes=1000, 521 | embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], 522 | window_size=8, mlp_ratio=4., qkv_bias=True, 523 | drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, 524 | norm_layer=nn.LayerNorm, ape=False, patch_norm=True, 525 | use_checkpoint=False, pretrained_window_sizes=[0, 0, 0, 0], **kwargs): 526 | super().__init__() 527 | 528 | self.num_classes = num_classes 529 | self.num_layers = len(depths) 530 | self.embed_dim = embed_dim 531 | self.ape = ape 532 | self.patch_norm = patch_norm 533 | self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) 534 | self.mlp_ratio = mlp_ratio 535 | 536 | # split image into non-overlapping patches 537 | self.patch_embed = PatchEmbed( 538 | img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, 539 | norm_layer=norm_layer if self.patch_norm else None) 540 | num_patches = self.patch_embed.num_patches 541 | patches_resolution = self.patch_embed.patches_resolution 542 | self.patches_resolution = patches_resolution 543 | 544 | # absolute position embedding 545 | if self.ape: 546 | self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) 547 | trunc_normal_(self.absolute_pos_embed, std=.02) 548 | 549 | self.pos_drop = nn.Dropout(p=drop_rate) 550 | 551 | # stochastic depth 552 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule 553 | 554 | # build layers 555 | self.layers = nn.ModuleList() 556 | for i_layer in range(self.num_layers): 557 | layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), 558 | input_resolution=(patches_resolution[0] // (2 ** i_layer), 559 | patches_resolution[1] // (2 ** i_layer)), 560 | depth=depths[i_layer], 561 | num_heads=num_heads[i_layer], 562 | window_size=window_size, 563 | mlp_ratio=self.mlp_ratio, 564 | qkv_bias=qkv_bias, 565 | drop=drop_rate, attn_drop=attn_drop_rate, 566 | drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], 567 | norm_layer=norm_layer, 568 | downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, 569 | use_checkpoint=use_checkpoint, 570 | pretrained_window_size=pretrained_window_sizes[i_layer]) 571 | self.layers.append(layer) 572 | 573 | self.norm = norm_layer(self.num_features) 574 | self.avgpool = nn.AdaptiveAvgPool1d(1) 575 | self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() 576 | 577 | self.apply(self._init_weights) 578 | for bly in self.layers: 579 | bly._init_respostnorm() 580 | 581 | def _init_weights(self, m): 582 | if isinstance(m, nn.Linear): 583 | trunc_normal_(m.weight, std=.02) 584 | if isinstance(m, nn.Linear) and m.bias is not None: 585 | nn.init.constant_(m.bias, 0) 586 | elif isinstance(m, nn.LayerNorm): 587 | nn.init.constant_(m.bias, 0) 588 | nn.init.constant_(m.weight, 1.0) 589 | 590 | @torch.jit.ignore 591 | def no_weight_decay(self): 592 | return {'absolute_pos_embed'} 593 | 594 | @torch.jit.ignore 595 | def no_weight_decay_keywords(self): 596 | return {"cpb_mlp", "logit_scale", 'relative_position_bias_table'} 597 | 598 | def forward_features(self, x): 599 | x = self.patch_embed(x) 600 | if self.ape: 601 | x = x + self.absolute_pos_embed 602 | x = self.pos_drop(x) 603 | 604 | for layer in self.layers: 605 | x = layer(x) 606 | 607 | x = self.norm(x) # B L C 608 | # x = self.avgpool(x.transpose(1, 2)) # B C 1 609 | # x = torch.flatten(x, 1) 610 | return x 611 | 612 | def forward(self, x): 613 | x = 
self.forward_features(x) 614 | # x = self.head(x) 615 | return x 616 | 617 | def flops(self): 618 | flops = 0 619 | flops += self.patch_embed.flops() 620 | for i, layer in enumerate(self.layers): 621 | flops += layer.flops() 622 | flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) 623 | flops += self.num_features * self.num_classes 624 | return flops -------------------------------------------------------------------------------- /IQA/demo_train_iqa_baseline.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------- 2 | # SAMA, AAAI 2024 3 | # Training code for IQA. 4 | # ------------------------------------------------- 5 | import torch 6 | import random 7 | import os 8 | import os.path as osp 9 | import fastvqa.models as models 10 | import sys 11 | import argparse 12 | import torch.nn as nn 13 | 14 | from scipy.stats import spearmanr, pearsonr 15 | from scipy.stats import kendalltau as kendallr 16 | import numpy as np 17 | from torchvision import transforms 18 | 19 | import yaml 20 | import timeit 21 | from PIL import Image 22 | 23 | from thop import profile 24 | import warnings 25 | 26 | warnings.filterwarnings("ignore") 27 | 28 | from torch.utils.tensorboard import SummaryWriter 29 | 30 | 31 | class ImageDataset(torch.utils.data.Dataset): 32 | def __init__(self, files, labels, 33 | data_args={"fwin_h": 8, "fwin_w": 8, "fsize_h": 32, "fsize_w": 32}, 34 | stype="fragment", 35 | is_train=True): 36 | 37 | super().__init__() 38 | 39 | self.files = files 40 | self.labels = labels 41 | self.is_train = is_train 42 | self.length = len(files) 43 | 44 | self.fwin_h = data_args['fwin_h'] 45 | self.fwin_w = data_args['fwin_w'] 46 | self.fsize_h = data_args['fsize_h'] 47 | self.fsize_w = data_args['fsize_w'] 48 | 49 | self.minh = self.fwin_h * self.fsize_h 50 | self.minw = self.fwin_w * self.fsize_w 51 | self.minsize = max(self.minh, self.minw) 52 | 53 | self.stype = stype if stype in ["sama", "sama-spm"] else "fragment" 54 | print("processing data with [{}]".format(self.stype)) 55 | 56 | if self.is_train: 57 | self.transform = transforms.Compose([ 58 | transforms.ToTensor(), 59 | transforms.RandomHorizontalFlip(0.5), 60 | transforms.RandomRotation(45), 61 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) 62 | ]) 63 | else: 64 | self.transform = transforms.Compose([ 65 | transforms.ToTensor(), 66 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) 67 | ]) 68 | 69 | def get_spatial_fragments(self, img, fragments_h=8, fragments_w=8, fsize_h=32, fsize_w=32): 70 | size_h = fragments_h * fsize_h 71 | size_w = fragments_w * fsize_w 72 | 73 | res_h, res_w = img.shape[-2:] 74 | ratio = min(res_h / size_h, res_w / size_w) 75 | if ratio < 1: 76 | img = torch.nn.functional.interpolate(img.unsqueeze(0), scale_factor=1 / ratio, mode="bilinear", align_corners=False) 77 | img = img[0] 78 | size = size_h, size_w 79 | 80 | ## make sure that sampling will not run out of the picture 81 | hgrids = torch.LongTensor([min(res_h // fragments_h * i, res_h - fsize_h) for i in range(fragments_h)]) 82 | wgrids = torch.LongTensor([min(res_w // fragments_w * i, res_w - fsize_w) for i in range(fragments_w)]) 83 | hlength, wlength = res_h // fragments_h, res_w // fragments_w 84 | 85 | if self.is_train: 86 | if hlength > fsize_h: 87 | rnd_h = torch.randint(hlength - fsize_h, (len(hgrids), len(wgrids))) 88 | else: 89 | rnd_h = torch.zeros((len(hgrids), 
len(wgrids))).int() 90 | if wlength > fsize_w: 91 | rnd_w = torch.randint(wlength - fsize_w, (len(hgrids), len(wgrids))) 92 | else: 93 | rnd_w = torch.zeros((len(hgrids), len(wgrids))).int() 94 | else: 95 | rnd_h = torch.ones((len(hgrids), len(wgrids))).int() * int((hlength - fsize_h) / 2) 96 | rnd_w = torch.ones((len(hgrids), len(wgrids))).int() * int((wlength - fsize_w) / 2) 97 | 98 | t_img = torch.zeros(img.shape[:-2] + size).to(img.device) 99 | 100 | for i, hs in enumerate(hgrids): 101 | for j, ws in enumerate(wgrids): 102 | h_s, h_e = i * fsize_h, (i + 1) * fsize_h 103 | w_s, w_e = j * fsize_w, (j + 1) * fsize_w 104 | 105 | h_so, h_eo = hs + rnd_h[i][j], hs + rnd_h[i][j] + fsize_h 106 | w_so, w_eo = ws + rnd_w[i][j], ws + rnd_w[i][j] + fsize_w 107 | t_img[:, h_s:h_e, w_s:w_e] = img[:, h_so:h_eo, w_so:w_eo] 108 | return t_img 109 | 110 | 111 | def get_spatial_fragments_spm(self, img, fragments_h=8, fragments_w=8, fsize_h=32, fsize_w=32): 112 | size_h = fragments_h * fsize_h 113 | size_w = fragments_w * fsize_w 114 | 115 | res_h, res_w = img.shape[-2:] 116 | ratio = min(res_h / size_h, res_w / size_w) 117 | if ratio < 1: 118 | res_h, res_w = round(res_h / ratio), round(res_w / ratio) 119 | img = torch.nn.functional.interpolate(img.unsqueeze(0), size=(res_h, res_w), mode="bilinear", align_corners=False) 120 | img = img[0] 121 | ratio = min(res_h / size_h, res_w / size_w) 122 | size = size_h, size_w 123 | 124 | img_scale, hgrids, wgrids = [], [], [] 125 | rnd_h, rnd_w = [], [] 126 | if self.is_train: 127 | rnd_rh, rnd_rw = torch.rand((fragments_h, fragments_w)), torch.rand((fragments_h, fragments_w)) 128 | else: 129 | rnd_rh, rnd_rw = torch.ones((fragments_h, fragments_w)) * 0.5, torch.ones((fragments_h, fragments_w)) * 0.5 130 | 131 | factors = [1, 1 / ratio] 132 | for scale in factors: 133 | this_h, this_w = round(res_h * scale), round(res_w * scale) 134 | img_scale.append(torch.nn.functional.interpolate(img.unsqueeze(0), size=(this_h, this_w), mode="bilinear", align_corners=False)[0]) 135 | 136 | hgrids.append(torch.LongTensor([min(this_h // fragments_h * i, this_h - fsize_h) for i in range(fragments_h)])) 137 | wgrids.append(torch.LongTensor([min(this_w // fragments_w * i, this_w - fsize_w) for i in range(fragments_w)])) 138 | 139 | hlength, wlength = this_h // fragments_h, this_w // fragments_w 140 | rnd_h.append((rnd_rh[:, :] * (hlength - fsize_h)).int()) 141 | rnd_w.append((rnd_rw[:, :] * (wlength - fsize_w)).int()) 142 | 143 | target_imgs = torch.zeros((2, ) + img.shape[:-2] + size).to(img.device) 144 | for k, scale in enumerate(factors): 145 | for i, hs in enumerate(hgrids[k]): 146 | for j, ws in enumerate(wgrids[k]): 147 | h_s, h_e = i * fsize_h, (i + 1) * fsize_h 148 | w_s, w_e = j * fsize_w, (j + 1) * fsize_w 149 | 150 | h_so = hs + rnd_h[k][i][j] 151 | h_eo = h_so + fsize_h 152 | w_so = ws + rnd_w[k][i][j] 153 | w_eo = w_so + fsize_w 154 | target_imgs[k, :, h_s:h_e, w_s:w_e] = img_scale[k][:, h_so:h_eo, w_so:w_eo] # 32 * 32 155 | 156 | # patch-based mask [4, 4] 157 | mask = torch.zeros((1, size_h, size_w)) 158 | for i in range(size_w // 8): # patch size = 4 159 | for j in range(size_h // 8): 160 | mask[:, j*8:j*8+4, i*8:i*8+4] = 1 161 | mask[:, j*8+4:j*8+8, i*8+4:i*8+8] = 1 162 | 163 | out_img = mask * target_imgs[0] + (1 - mask) * target_imgs[1] 164 | return out_img 165 | 166 | def get_spatial_fragments_swm(self, img, fragments_h=8, fragments_w=8, fsize_h=32, fsize_w=32): 167 | size_h = fragments_h * fsize_h 168 | size_w = fragments_w * fsize_w 169 | 170 | res_h, res_w = 
img.shape[-2:] 171 | ratio = min(res_h / size_h, res_w / size_w) 172 | if ratio < 1: 173 | res_h, res_w = round(res_h / ratio), round(res_w / ratio) 174 | img = torch.nn.functional.interpolate(img.unsqueeze(0), size=(res_h, res_w), mode="bilinear", align_corners=False) 175 | img = img[0] 176 | ratio = min(res_h / size_h, res_w / size_w) 177 | size = size_h, size_w 178 | 179 | img_scale, hgrids, wgrids = [], [], [] 180 | rnd_h, rnd_w = [], [] 181 | if self.is_train: 182 | rnd_rh, rnd_rw = torch.rand((fragments_h, fragments_w)), torch.rand((fragments_h, fragments_w)) 183 | else: 184 | rnd_rh, rnd_rw = torch.ones((fragments_h, fragments_w)) * 0.5, torch.ones((fragments_h, fragments_w)) * 0.5 185 | 186 | factors = [1, 1 / ratio] 187 | for scale in factors: 188 | this_h, this_w = round(res_h * scale), round(res_w * scale) 189 | img_scale.append(torch.nn.functional.interpolate(img.unsqueeze(0), size=(this_h, this_w), mode="bilinear", align_corners=False)[0]) 190 | 191 | hgrids.append(torch.LongTensor([min(this_h // fragments_h * i, this_h - fsize_h) for i in range(fragments_h)])) 192 | wgrids.append(torch.LongTensor([min(this_w // fragments_w * i, this_w - fsize_w) for i in range(fragments_w)])) 193 | 194 | hlength, wlength = this_h // fragments_h, this_w // fragments_w 195 | rnd_h.append((rnd_rh[:, :] * (hlength - fsize_h)).int()) 196 | rnd_w.append((rnd_rw[:, :] * (wlength - fsize_w)).int()) 197 | 198 | target_imgs = torch.zeros((2, ) + img.shape[:-2] + size).to(img.device) 199 | for k, scale in enumerate(factors): 200 | for i, hs in enumerate(hgrids[k]): 201 | for j, ws in enumerate(wgrids[k]): 202 | h_s, h_e = i * fsize_h, (i + 1) * fsize_h 203 | w_s, w_e = j * fsize_w, (j + 1) * fsize_w 204 | 205 | h_so = hs + rnd_h[k][i][j] 206 | h_eo = h_so + fsize_h 207 | w_so = ws + rnd_w[k][i][j] 208 | w_eo = w_so + fsize_w 209 | target_imgs[k, :, h_s:h_e, w_s:w_e] = img_scale[k][:, h_so:h_eo, w_so:w_eo] # 32 * 32 210 | 211 | # window-based mask [32, 32] 212 | mask = torch.zeros((1, size_h, size_w)) 213 | for i in range(fragments_h): # window 214 | for j in range(fragments_w): 215 | if (i + j) % 2 == 0: 216 | mask[:, j*32:j*32+32, i*32:i*32+32] = 1 217 | 218 | out_img = mask * target_imgs[0] + (1 - mask) * target_imgs[1] 219 | return out_img 220 | 221 | def __getitem__(self, index): 222 | filename = self.files[index] 223 | label = float(self.labels[index]) 224 | 225 | img = Image.open(filename).convert('RGB') 226 | width, height = img.size 227 | 228 | if min(width, height) < self.minsize: 229 | scale_factor = self.minsize / min(width, height) 230 | img = img.resize((int(width * scale_factor), int(height * scale_factor)), Image.BILINEAR) 231 | 232 | img = self.transform(img) 233 | 234 | if self.stype == "fragment": 235 | data = self.get_spatial_fragments(img, self.fwin_h, self.fwin_w, self.fsize_h, self.fsize_w) 236 | elif self.stype == "sama-spm": 237 | data = self.get_spatial_fragments_spm(img, self.fwin_h, self.fwin_w, self.fsize_h, self.fsize_w) 238 | elif self.stype == "sama": 239 | data = self.get_spatial_fragments_swm(img, self.fwin_h, self.fwin_w, self.fsize_h, self.fsize_w) 240 | else: 241 | raise NotImplementedError 242 | 243 | return data, label 244 | 245 | def __len__(self): 246 | return self.length 247 | 248 | 249 | 250 | def train_test_split(dataset_path, ann_file, ratio=0.8, seed=42): 251 | random.seed(seed) 252 | video_infos = [] 253 | with open(ann_file, "r") as fin: 254 | for line in fin.readlines(): 255 | line_split = line.strip().split(",") 256 | filename, _, _, label = line_split 
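            # each annotation line holds four comma-separated fields; only the first (file name) and the last (quality label) are used here, the two middle fields are ignored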
257 | label = float(label) 258 | filename = osp.join(dataset_path, filename) 259 | video_infos.append(dict(filename=filename, label=label)) 260 | random.shuffle(video_infos) 261 | return ( 262 | video_infos[: int(ratio * len(video_infos))], 263 | video_infos[int(ratio * len(video_infos)) :], 264 | ) 265 | 266 | 267 | def rank_loss(y_pred, y): 268 | ranking_loss = torch.nn.functional.relu( 269 | (y_pred - y_pred.t()) * torch.sign((y.t() - y)) 270 | ) 271 | scale = 1 + torch.max(ranking_loss) 272 | return ( 273 | torch.sum(ranking_loss) / y_pred.shape[0] / (y_pred.shape[0] - 1) / scale 274 | ).float() 275 | 276 | 277 | def plcc_loss(y_pred, y): 278 | sigma_hat, m_hat = torch.std_mean(y_pred, unbiased=False) 279 | y_pred = (y_pred - m_hat) / (sigma_hat + 1e-8) 280 | sigma, m = torch.std_mean(y, unbiased=False) 281 | y = (y - m) / (sigma + 1e-8) 282 | loss0 = torch.nn.functional.mse_loss(y_pred, y) / 4 283 | rho = torch.mean(y_pred * y) 284 | loss1 = torch.nn.functional.mse_loss(rho * y_pred, y) / 4 285 | return ((loss0 + loss1) / 2).float() 286 | 287 | def rescaled_l2_loss(y_pred, y, eps=1e-8): 288 | y_pred_rs = (y_pred - y_pred.mean()) / (y_pred.std() + eps) 289 | y_rs = (y - y.mean()) / (y.std() + eps) 290 | return torch.nn.functional.mse_loss(y_pred_rs, y_rs) 291 | 292 | def rplcc_loss(y_pred, y, eps=1e-8): 293 | sigma_hat, m_hat = torch.std_mean(y_pred, unbiased=False) 294 | y_pred = (y_pred - m_hat) / (sigma_hat + 1e-8) 295 | 296 | sigma, m = torch.std_mean(y, unbiased=False) 297 | y = (y - m) / (sigma + 1e-8) 298 | 299 | rho = torch.mean(y_pred.reshape(y.shape) * y) 300 | return 1 - rho 301 | 302 | def self_similarity_loss(f, f_hat, f_hat_detach=False): 303 | if f_hat_detach: 304 | f_hat = f_hat.detach() 305 | return 1 - torch.nn.functional.cosine_similarity(f, f_hat, dim=1).mean() 306 | 307 | def contrastive_similarity_loss(f, f_hat, f_hat_detach=False, eps=1e-8): 308 | if f_hat_detach: 309 | f_hat = f_hat.detach() 310 | intra_similarity = torch.nn.functional.cosine_similarity(f, f_hat, dim=1).mean() 311 | cross_similarity = torch.nn.functional.cosine_similarity(f, f_hat, dim=0).mean() 312 | return (1 - intra_similarity) / (1 - cross_similarity + eps) 313 | 314 | def rescale(pr, gt=None): 315 | if gt is None: 316 | pr = (pr - np.mean(pr)) / np.std(pr) 317 | else: 318 | pr = ((pr - np.mean(pr)) / np.std(pr)) * np.std(gt) + np.mean(gt) 319 | return pr 320 | 321 | sample_types=["resize", "diamond_fragments", "fragments", "crop", "arp_resize", "arp_fragments"] 322 | 323 | 324 | def finetune_epoch(ft_loader, model, model_ema, optimizer, scheduler, device, epoch=-1, split=-1, writer=None): 325 | 326 | model.train() 327 | 328 | tic = timeit.default_timer() 329 | 330 | criterion = nn.SmoothL1Loss() 331 | train_labels, pred_labels = [], [] 332 | plcc_loss_total, rank_loss_total, loss_total = 0, 0, 0 333 | for i, (data, label) in enumerate(ft_loader): 334 | optimizer.zero_grad() 335 | 336 | data = data.to(device) 337 | label = label.to(device).float() 338 | 339 | scores = model(data) 340 | scores = scores.view(label.shape) 341 | 342 | # Plain Supervised Loss 343 | # p_loss, r_loss = plcc_loss(scores, label), rank_loss(scores, label) 344 | 345 | loss = criterion(scores, label) # + 0.5 * rplcc_loss(scores, label) 346 | # loss = p_loss + 0.3 * r_loss + 0.3 * criterion(scores, label) 347 | 348 | 349 | # plcc_loss_total += p_loss.item() 350 | # rank_loss_total += r_loss.item() 351 | loss_total += loss.item() 352 | 353 | loss.backward() 354 | optimizer.step() 355 | scheduler.step() 356 | 357 | 
#ft_loader.dataset.refresh_hypers() 358 | 359 | pred_labels.extend(list(scores.view(-1).detach().cpu().numpy())) 360 | train_labels.extend(list(label.view(-1).detach().cpu().numpy())) 361 | 362 | if model_ema is not None: 363 | model_params = dict(model.named_parameters()) 364 | model_ema_params = dict(model_ema.named_parameters()) 365 | for k in model_params.keys(): 366 | model_ema_params[k].data.mul_(0.999).add_( 367 | model_params[k].data, alpha=1 - 0.999) 368 | 369 | train_srcc = spearmanr(train_labels, pred_labels)[0] 370 | writer.add_scalar('train_srcc', train_srcc, epoch) 371 | 372 | writer.add_scalar('train_plcc_loss', plcc_loss_total, epoch) 373 | writer.add_scalar('train_rank_loss', rank_loss_total, epoch) 374 | writer.add_scalar('train_total_loss', loss_total, epoch) 375 | 376 | toc = timeit.default_timer() 377 | 378 | minutes = int((toc - tic) / 60) 379 | seconds = int((toc - tic) % 60) 380 | print('Epoch-{:02d}, training SRCC={:.4f}, time elapsed {:02d}m {:02d}s.'.format(epoch, train_srcc, minutes, seconds)) 381 | print('backbone_lr = {:.2e}, head_lr = {:.2e}'.format(optimizer.state_dict()['param_groups'][0]['lr'], 382 | optimizer.state_dict()['param_groups'][-1]['lr'])) 383 | model.eval() 384 | 385 | 386 | def profile_inference(inf_set, model, device): 387 | video = {} 388 | data = inf_set[0] 389 | for key in sample_types: 390 | if key in data: 391 | video[key] = data[key].to(device).unsqueeze(0) 392 | with torch.no_grad(): 393 | flops, params = profile(model, (video, )) 394 | print(f"The FLOps of the Variant is {flops/1e9:.1f}G, with Params {params/1e6:.2f}M.") 395 | 396 | 397 | def inference_set(inf_loader, model, device, best_, epoch, split=-1, save_model=False, suffix='s', save_name="divide", writer=None): 398 | 399 | model.eval() 400 | 401 | tic = timeit.default_timer() 402 | gt_labels, pr_labels = [], [] 403 | 404 | best_s, best_p, best_k, best_r = best_ 405 | 406 | with torch.no_grad(): 407 | for i, (data, label) in enumerate(inf_loader): 408 | 409 | data = data.to(device) 410 | label = label.to(device) 411 | 412 | scores = model(data) 413 | scores = scores.view(label.shape) 414 | 415 | pr_labels.extend(list(scores.cpu().numpy())) 416 | gt_labels.extend(list(label.cpu().numpy())) 417 | 418 | pr_labels = rescale(pr_labels, gt_labels) 419 | 420 | s = spearmanr(gt_labels, pr_labels)[0] 421 | p = pearsonr(gt_labels, pr_labels)[0] 422 | k = kendallr(gt_labels, pr_labels)[0] 423 | r = np.sqrt(((gt_labels - pr_labels) ** 2).mean()) 424 | 425 | writer.add_scalar('val_{}_srcc'.format(suffix), s, epoch) 426 | writer.add_scalar('val_{}_plcc'.format(suffix), p, epoch) 427 | writer.add_scalar('val_{}_krcc'.format(suffix), k, epoch) 428 | writer.add_scalar('val_{}_rmse'.format(suffix), r, epoch) 429 | 430 | # del results, result #, video, video_up 431 | torch.cuda.empty_cache() 432 | 433 | if s + p > best_s + best_p and save_model: 434 | state_dict = model.state_dict() 435 | torch.save( 436 | { 437 | "state_dict": state_dict, 438 | "validation_results": best_, 439 | }, 440 | f"pretrained_weights/{save_name}_{suffix}_dev.pth", 441 | ) 442 | 443 | best_s, best_p, best_k, best_r = ( 444 | max(best_s, s), 445 | max(best_p, p), 446 | max(best_k, k), 447 | min(best_r, r), 448 | ) 449 | 450 | writer.add_scalar('val_{}_best_srcc'.format(suffix), best_s, epoch) 451 | writer.add_scalar('val_{}_best_plcc'.format(suffix), best_p, epoch) 452 | writer.add_scalar('val_{}_best_krcc'.format(suffix), best_k, epoch) 453 | writer.add_scalar('val_{}_best_rmse'.format(suffix), best_r, epoch) 454 | 455 | 
toc = timeit.default_timer() 456 | 457 | minutes = int((toc - tic) / 60) 458 | seconds = int((toc - tic) % 60) 459 | 460 | print( 461 | f"For {len(gt_labels)} images, \nthe accuracy of the model: [{suffix}] is as follows:\n SROCC: {s:.4f} best: {best_s:.4f} \n PLCC: {p:.4f} best: {best_p:.4f} \n KROCC: {k:.4f} best: {best_k:.4f} \n RMSE: {r:.4f} best: {best_r:.4f}." 462 | ) 463 | print('time elapsed {:02d}m {:02d}s.'.format(minutes, seconds)) 464 | 465 | return best_s, best_p, best_k, best_r 466 | 467 | 468 | def read_info(info_file, prefix): 469 | name, mos = [], [] 470 | import os.path as osp 471 | if info_file[-3:] == "txt": 472 | with open(info_file, 'r') as f: 473 | for line in f: 474 | dis, score = line.split() 475 | name.append(osp.join(prefix, dis)) 476 | mos.append(float(score)) 477 | name = np.stack(name) 478 | mos = np.stack(mos).astype(np.float32) 479 | 480 | elif info_file[-3:] == "csv": 481 | import pandas as pd 482 | d = pd.read_csv(info_file) 483 | mos = np.asarray(d['MOS_zscore'].to_list()).astype(np.float32) 484 | name = d['image_name'].to_list() 485 | for i in range(len(name)): 486 | name[i] = osp.join(prefix, name[i]) 487 | name = np.asarray(name) 488 | elif info_file[-3:] == "pkl": 489 | import pickle 490 | with open(info_file, 'rb') as f: 491 | d = pickle.load(f) 492 | for i, ifile in enumerate(d['files']): 493 | name.append(osp.join(prefix, ifile)) 494 | name = np.asarray(name) 495 | mos = np.asarray(d['labels']) 496 | else: 497 | raise NotImplementedError 498 | 499 | return name, mos 500 | 501 | 502 | def main(): 503 | 504 | parser = argparse.ArgumentParser() 505 | parser.add_argument( 506 | "-o", "--opt", type=str, default="./options/fast-sama-iqa.yml", help="the option file" 507 | ) 508 | 509 | args = parser.parse_args() 510 | 511 | if sys.gettrace(): 512 | print('in DEBUG mode.') 513 | args.opt = './options/fast-sama-iqa.yml' 514 | 515 | with open(args.opt, "r") as f: 516 | opt = yaml.safe_load(f) 517 | 518 | if sys.gettrace(): 519 | opt['num_workers'] = 0 520 | opt['test_num_workers'] = 0 521 | opt['name'] = 'DEBUG' 522 | 523 | print(opt) 524 | 525 | database = opt["data"]["database"] 526 | files, labels = read_info(opt["data"]["data_info"], opt["data"]["data_prefix"]) 527 | 528 | num_samples = len(files) 529 | num_repeat = opt["num_splits"] 530 | if opt["data"]["database"] == "kadid": 531 | ref_idx = np.arange(81).repeat(5*25).reshape(-1) 532 | index_all = np.zeros((num_repeat, 81), dtype=np.int) 533 | for ii in range(num_repeat): 534 | index_current = np.arange(81) 535 | random.Random(ii * 123).shuffle(index_current) 536 | index_all[ii] = index_current 537 | else: 538 | index_all = np.zeros((num_repeat, num_samples), dtype=np.int) 539 | for ii in range(num_repeat): 540 | index_current = np.asarray(range(num_samples)) 541 | random.Random(ii * 123).shuffle(index_current) # shuffle with certain seed 542 | index_all[ii] = index_current 543 | np.savetxt('rand_index_{}.txt'.format(database), index_all, fmt='%d') 544 | 545 | # ------------------ fix seed ----------------------- 546 | seed = 44442 547 | torch.manual_seed(seed) # 548 | torch.backends.cudnn.deterministic = True 549 | torch.backends.cudnn.benchmark = False 550 | np.random.seed(seed) 551 | random.seed(seed) 552 | # --------------------------------------------------- 553 | 554 | os.makedirs('./pretrained_weights/', exist_ok=True) 555 | os.makedirs('./tf-logs/', exist_ok=True) 556 | torch.utils.backcompat.broadcast_warning.enabled = True 557 | device = "cuda" if torch.cuda.is_available() else "cpu" 558 | # 
if sys.gettrace(): 559 | # device = 'cpu' 560 | 561 | # best_eval = {'koniq': [], 'livec': []} 562 | best_eval = {database: []} 563 | for split in range(num_repeat): 564 | print(f"""\n==================== SPLIT-{split:02d} ====================""") 565 | writer = SummaryWriter('./tf-logs/{}-split-{:02d}'.format(opt['name'], split)) 566 | 567 | index = index_all[split] 568 | 569 | pos_train_end = int(0.8 * num_samples) 570 | if opt["data"]["database"] == "kadid": 571 | eval_ref_idx = index[:int(0.2 * 81)] 572 | trainindex, evalindex = [], [] 573 | for iii in range(len(files)): 574 | if ref_idx[iii] in eval_ref_idx: 575 | evalindex.append(iii) 576 | else: 577 | trainindex.append(iii) 578 | trainindex = np.asarray(trainindex) 579 | evalindex = np.asarray(evalindex) 580 | else: 581 | trainindex = index[:pos_train_end] # the first 80% 582 | evalindex = index[pos_train_end:] 583 | 584 | trainindex.sort() 585 | evalindex.sort() 586 | 587 | train_dataset = ImageDataset(files[trainindex], labels[trainindex], data_args=opt["data"], stype=opt["stype"], is_train=True) 588 | eval_datasets = {} 589 | eval_datasets[database] = ImageDataset(files[evalindex], labels[evalindex], data_args=opt["data"], stype=opt["stype"], is_train=False) 590 | # eval_datasets['livec'] = ImageDataset(files_livec, labels_livec, data_args=opt["data"], is_train=False) 591 | 592 | train_loader = torch.utils.data.DataLoader( 593 | train_dataset, batch_size=opt["batch_size"], num_workers=opt["num_workers"], shuffle=True) 594 | eval_loaders = {} 595 | for key, idataset in eval_datasets.items(): 596 | eval_loaders[key] = torch.utils.data.DataLoader( 597 | idataset, batch_size=opt["test_batch_size"], num_workers=opt["test_num_workers"], 598 | pin_memory=True, shuffle=False, drop_last=False) 599 | 600 | model = getattr(models, "IQAModel")().to(device) 601 | 602 | if "load_path" in opt: 603 | state_dict = torch.load(opt["load_path"], map_location=device) 604 | if 'pretrained_weights' in opt["load_path"] and "state_dict" in state_dict: 605 | i_state_dict = state_dict['state_dict'] 606 | 607 | elif "model" in state_dict: 608 | ### migrate training weights from swin-transformer-v1 609 | state_dict = state_dict["model"] 610 | from collections import OrderedDict 611 | 612 | i_state_dict = OrderedDict() 613 | for key in state_dict.keys(): 614 | tkey = 'backbone.' 
+ key 615 | i_state_dict[tkey] = state_dict[key] 616 | 617 | elif "state_dict" in state_dict: 618 | ### migrate training weights from mmaction 619 | state_dict = state_dict["state_dict"] 620 | from collections import OrderedDict 621 | 622 | i_state_dict = OrderedDict() 623 | for key in state_dict.keys(): 624 | if "head" in key: 625 | continue 626 | if "cls" in key: 627 | tkey = key.replace("cls", "vqa") 628 | elif "backbone" in key: 629 | i_state_dict[key] = state_dict[key] 630 | i_state_dict["fragments_"+key] = state_dict[key] 631 | i_state_dict["resize_"+key] = state_dict[key] 632 | else: 633 | i_state_dict[key] = state_dict[key] 634 | t_state_dict = model.state_dict() 635 | for key, value in t_state_dict.items(): 636 | if key in i_state_dict and i_state_dict[key].shape != value.shape: 637 | i_state_dict.pop(key) 638 | 639 | print(model.load_state_dict(i_state_dict, strict=False)) 640 | 641 | #print(model) 642 | 643 | if opt["ema"]: 644 | from copy import deepcopy 645 | model_ema = deepcopy(model) 646 | else: 647 | model_ema = None 648 | 649 | #profile_inference(val_dataset, model, device) 650 | 651 | # finetune the model 652 | 653 | param_groups=[] 654 | for key, value in dict(model.named_children()).items(): 655 | if "backbone" in key: 656 | param_groups += [{"params": value.parameters(), "lr": opt["optimizer"]["lr"] * opt["optimizer"]["backbone_lr_mult"]}] 657 | else: 658 | param_groups += [{"params": value.parameters(), "lr": opt["optimizer"]["lr"]}] 659 | 660 | optimizer = torch.optim.AdamW(lr=opt["optimizer"]["lr"], 661 | params=param_groups, 662 | weight_decay=opt["optimizer"]["wd"]) 663 | 664 | warmup_iter = int(opt["warmup_epochs"] * len(train_loader)) 665 | max_iter = int((opt["num_epochs"] + opt["l_num_epochs"]) * len(train_loader)) 666 | warmup_iter = max(1, warmup_iter) 667 | lr_lambda = ( 668 | lambda cur_iter: max(1e-2, cur_iter / warmup_iter) 669 | # lambda cur_iter: cur_iter / warmup_iter 670 | if cur_iter <= warmup_iter 671 | else 1 672 | # else max(1e-1, min(1, 1 - 0.9 * (cur_iter / len(train_loader) - opt["constant_epochs"]) / (opt["num_epochs"] - 20 - opt["constant_epochs"]))) 673 | # else 0.5 * (1 + math.cos(math.pi * (cur_iter - warmup_iter) / max_iter)) 674 | ) 675 | # lr_lambda = (lambda x: x) 676 | scheduler = torch.optim.lr_scheduler.LambdaLR( 677 | optimizer, lr_lambda=lr_lambda, 678 | ) 679 | 680 | bests = {} 681 | bests_n = {} 682 | for key in eval_loaders.keys(): 683 | bests[key] = -1,-1,-1,1000 684 | bests_n[key] = -1,-1,-1,1000 685 | 686 | for epoch in range(opt["num_epochs"]): 687 | print(f"Finetune Epoch {epoch}:") 688 | finetune_epoch( 689 | train_loader, model, model_ema, optimizer, scheduler, device, epoch, split, 690 | writer=writer) 691 | 692 | print(f"evaluation ..") 693 | # ----------------------------- reduce time consumption 694 | for key in eval_loaders: 695 | bests[key] = inference_set( 696 | eval_loaders[key], 697 | model_ema if model_ema is not None else model, 698 | device, bests[key], epoch, split, 699 | save_model=opt["save_model"], save_name=opt["name"], 700 | suffix=key+"_s", 701 | writer=writer 702 | ) 703 | 704 | 705 | if opt["num_epochs"] > 0: 706 | for key in eval_loaders: 707 | print( 708 | f"""SPLIT-{split:02d}, for the finetuning process on {key} with {len(eval_datasets[key])} images, 709 | the best validation accuracy of the model-s is as follows: 710 | SROCC: {bests[key][0]:.4f} 711 | PLCC: {bests[key][1]:.4f} 712 | KROCC: {bests[key][2]:.4f} 713 | RMSE: {bests[key][3]:.4f}.""" 714 | ) 715 | 716 | 717 | 
best_eval[key].append([bests[key][0], bests[key][1], bests[key][2], bests[key][3]]) 718 | 719 | print('\n ============================================== ') 720 | print(np.median(best_eval[database], 0)) 721 | 722 | 723 | 724 | if __name__ == "__main__": 725 | main() 726 | -------------------------------------------------------------------------------- /VQA/examplar_data_labels/LIVE_VQC/mylabels.txt: -------------------------------------------------------------------------------- 1 | all-videos/A001.mp4, 10.002367, 30.0, 80.232 2 | all-videos/A002.mp4, 10.04, 29.97002997002997, 57.3005 3 | all-videos/A003.mp4, 10.008333, 29.97002997002997, 78.1558 4 | all-videos/A004.mp4, 9.966821999999999, 30.0, 59.3179 5 | all-videos/A005.mp4, 10.033332999999999, 30.0, 47.1778 6 | all-videos/A006.mp4, 10.030721999999999, 30.0, 84.6517 7 | all-videos/A007.mp4, 10.008333, 29.97002997002997, 50.8675 8 | all-videos/A008.mp4, 10.006667, 29.97002997002997, 46.8902 9 | all-videos/A009.mp4, 10.006667, 29.97002997002997, 80.7925 10 | all-videos/A010.mp4, 10.0306, 30.0, 31.2785 11 | all-videos/A011.mp4, 10.006611, 30.0, 87.2287 12 | all-videos/A012.mp4, 10.033389, 29.916666666666668, 47.7826 13 | all-videos/A013.mp4, 10.04, 29.97002997002997, 83.4083 14 | all-videos/A014.mp4, 10.006667, 29.97002997002997, 69.9652 15 | all-videos/A015.mp4, 10.003910999999999, 30.0, 78.3957 16 | all-videos/A016.mp4, 10.000855999999999, 29.916666666666668, 37.8421 17 | all-videos/A017.mp4, 10.030332999999999, 30.0, 65.4778 18 | all-videos/A018.mp4, 10.006667, 29.97002997002997, 76.0205 19 | all-videos/A019.mp4, 10.04, 29.97002997002997, 70.843 20 | all-videos/A020.mp4, 10.005455999999999, 30.0, 77.7258 21 | all-videos/A021.mp4, 10.016733, 29.850746268656717, 37.6011 22 | all-videos/A022.mp4, 10.031666999999999, 30.0, 38.674 23 | all-videos/A023.mp4, 10.006667, 29.97002997002997, 69.0588 24 | all-videos/A024.mp4, 10.006667, 29.97002997002997, 71.962 25 | all-videos/A025.mp4, 10.024867, 30.0, 42.0856 26 | all-videos/A026.mp4, 10.008066999999999, 30.0, 74.5397 27 | all-videos/A027.mp4, 10.006667, 29.97002997002997, 56.3388 28 | all-videos/A028.mp4, 10.023678, 29.916666666666668, 41.5304 29 | all-videos/A029.mp4, 10.004999999999999, 29.97002997002997, 75.3861 30 | all-videos/A030.mp4, 10.031478, 30.0, 81.5278 31 | all-videos/A031.mp4, 10.023522, 30.0, 62.1042 32 | all-videos/A032.mp4, 10.04, 29.97002997002997, 74.3189 33 | all-videos/A033.mp4, 10.017267, 30.0, 71.7097 34 | all-videos/A034.mp4, 10.04, 29.97002997002997, 80.4271 35 | all-videos/A035.mp4, 10.04, 29.97002997002997, 85.0968 36 | all-videos/A036.mp4, 10.028767, 30.0, 46.6608 37 | all-videos/A037.mp4, 10.04, 29.97002997002997, 77.5858 38 | all-videos/A038.mp4, 10.025063, 29.925, 51.9195 39 | all-videos/A039.mp4, 10.025599999999999, 29.916666666666668, 53.006 40 | all-videos/A040.mp4, 10.032022, 30.0, 50.3763 41 | all-videos/A041.mp4, 10.004622, 30.0, 35.1412 42 | all-videos/A042.mp4, 10.030278, 30.0, 76.8312 43 | all-videos/A043.mp4, 10.031666999999999, 30.0, 38.1554 44 | all-videos/A044.mp4, 10.033332999999999, 30.0, 53.4333 45 | all-videos/A045.mp4, 10.001510999999999, 30.0, 81.9773 46 | all-videos/A046.mp4, 10.030344, 30.0, 69.1902 47 | all-videos/A047.mp4, 10.030356, 30.0, 58.4895 48 | all-videos/A048.mp4, 10.006667, 29.97002997002997, 72.3462 49 | all-videos/A049.mp4, 10.006667, 29.97002997002997, 72.2176 50 | all-videos/A050.mp4, 10.002177999999999, 30.0, 48.0519 51 | all-videos/A051.mp4, 10.008333, 29.97002997002997, 80.2807 52 | all-videos/A052.mp4, 10.006667, 
29.97002997002997, 44.8256 53 | all-videos/A053.mp4, 10.030433, 30.0, 67.0884 54 | all-videos/A054.mp4, 10.027743, 29.917, 76.1667 55 | all-videos/A055.mp4, 10.034666999999999, 30.0, 53.8092 56 | all-videos/A056.mp4, 10.0, 30.0, 52.3594 57 | all-videos/A057.mp4, 10.004999999999999, 29.97002997002997, 66.4892 58 | all-videos/A058.mp4, 10.000656, 29.850746268656717, 57.0 59 | all-videos/A059.mp4, 10.011099999999999, 30.0, 74.2873 60 | all-videos/A060.mp4, 10.008578, 30.0, 63.1839 61 | all-videos/A061.mp4, 10.002078, 30.0, 56.4521 62 | all-videos/A062.mp4, 10.026188999999999, 30.0, 47.5187 63 | all-videos/A063.mp4, 10.031378, 30.0, 44.8971 64 | all-videos/A064.mp4, 10.012844, 30.0, 49.8497 65 | all-videos/A065.mp4, 10.030622, 30.0, 81.8222 66 | all-videos/A066.mp4, 10.006667, 29.97002997002997, 74.7647 67 | all-videos/A067.mp4, 10.04, 29.97002997002997, 77.4551 68 | all-videos/A068.mp4, 10.005644, 30.0, 39.123 69 | all-videos/A069.mp4, 10.0, 30.0, 49.4444 70 | all-videos/A070.mp4, 10.006667, 29.97002997002997, 65.4608 71 | all-videos/A071.mp4, 10.006667, 29.97002997002997, 74.1071 72 | all-videos/A072.mp4, 10.08, 25.0, 20.4406 73 | all-videos/A073.mp4, 10.04, 29.97002997002997, 91.1313 74 | all-videos/A074.mp4, 10.004999999999999, 29.97002997002997, 57.3204 75 | all-videos/A075.mp4, 10.030356, 30.0, 58.1867 76 | all-videos/A076.mp4, 10.030332999999999, 30.0, 64.7835 77 | all-videos/A077.mp4, 10.04, 29.97002997002997, 74.4483 78 | all-videos/A078.mp4, 9.971478, 30.0, 38.8777 79 | all-videos/A079.mp4, 9.999977999999999, 30.0, 81.0943 80 | all-videos/A080.mp4, 10.015233, 30.0, 39.2086 81 | all-videos/A081.mp4, 10.006667, 29.97002997002997, 75.7143 82 | all-videos/A082.mp4, 10.008333, 29.97002997002997, 87.0859 83 | all-videos/A083.mp4, 10.023052999999999, 29.931, 46.3077 84 | all-videos/A084.mp4, 10.003357, 29.79, 55.3835 85 | all-videos/A085.mp4, 10.030367, 30.0, 70.8486 86 | all-videos/A086.mp4, 10.006667, 29.97002997002997, 35.2749 87 | all-videos/A087.mp4, 10.019810999999999, 29.916666666666668, 56.6415 88 | all-videos/A088.mp4, 10.027033, 29.416666666666668, 43.4892 89 | all-videos/A089.mp4, 10.019771, 29.841, 46.9444 90 | all-videos/A090.mp4, 10.006667, 29.97002997002997, 86.1709 91 | all-videos/A091.mp4, 10.021115, 29.837, 46.2238 92 | all-videos/A092.mp4, 10.026022, 29.833333333333332, 71.095 93 | all-videos/A093.mp4, 10.005956, 30.0, 66.9632 94 | all-videos/A094.mp4, 10.031467, 30.0, 52.3441 95 | all-videos/A095.mp4, 10.006667, 29.97002997002997, 71.5491 96 | all-videos/A096.mp4, 10.008333, 29.97002997002997, 66.5815 97 | all-videos/A097.mp4, 10.041667, 29.97002997002997, 66.4593 98 | all-videos/A098.mp4, 10.030356, 30.0, 85.8091 99 | all-videos/A099.mp4, 10.030510999999999, 30.0, 46.1517 100 | all-videos/A100.mp4, 10.024967, 30.0, 46.6273 101 | all-videos/A101.mp4, 10.008333, 29.97002997002997, 69.3657 102 | all-videos/A102.mp4, 10.026043999999999, 29.850746268656717, 62.987 103 | all-videos/A103.mp4, 10.016499999999999, 29.850746268656717, 71.6203 104 | all-videos/A104.mp4, 10.021355999999999, 30.0, 50.385 105 | all-videos/A105.mp4, 10.006667, 29.97002997002997, 37.8824 106 | all-videos/A106.mp4, 9.984587999999999, 29.833333333333332, 72.642 107 | all-videos/A107.mp4, 10.006667, 29.97002997002997, 81.6325 108 | all-videos/A108.mp4, 10.005556, 120.0, 35.8462 109 | all-videos/A109.mp4, 10.003021, 29.791, 36.5086 110 | all-videos/A110.mp4, 10.04, 29.97002997002997, 87.8585 111 | all-videos/B001.mp4, 10.019677999999999, 11.083333333333334, 27.2485 112 | all-videos/B002.mp4, 10.031467, 
30.0, 73.4245 113 | all-videos/B003.mp4, 10.031367, 30.0, 49.1111 114 | all-videos/B004.mp4, 10.006321999999999, 30.0, 60.6719 115 | all-videos/B005.mp4, 10.038333, 29.97002997002997, 47.2959 116 | all-videos/B006.mp4, 9.998232999999999, 30.0, 44.4845 117 | all-videos/B007.mp4, 9.998267, 30.0, 37.858 118 | all-videos/B008.mp4, 10.030766999999999, 30.0, 75.0204 119 | all-videos/B009.mp4, 9.998066999999999, 30.0, 39.4971 120 | all-videos/B010.mp4, 10.002689, 29.916666666666668, 33.9518 121 | all-videos/B011.mp4, 10.031644, 29.916666666666668, 36.0333 122 | all-videos/B012.mp4, 10.024211, 29.916666666666668, 33.9801 123 | all-videos/B013.mp4, 10.025466999999999, 29.916666666666668, 58.0795 124 | all-videos/B014.mp4, 10.030267, 30.0, 62.3243 125 | all-videos/B015.mp4, 10.033332999999999, 30.0, 56.9636 126 | all-videos/B016.mp4, 10.04, 29.97002997002997, 52.1065 127 | all-videos/B017.mp4, 10.01, 29.97002997002997, 65.3693 128 | all-videos/B018.mp4, 10.001944, 30.0, 54.024 129 | all-videos/B019.mp4, 10.021422, 30.0, 66.1658 130 | all-videos/B020.mp4, 10.033332999999999, 30.0, 69.2781 131 | all-videos/B021.mp4, 10.038333, 29.97002997002997, 84.9903 132 | all-videos/B022.mp4, 10.038333, 29.97002997002997, 84.532 133 | all-videos/B023.mp4, 10.038333, 29.97002997002997, 70.8629 134 | all-videos/B024.mp4, 10.038333, 29.97002997002997, 67.6078 135 | all-videos/B025.mp4, 10.038333, 29.97002997002997, 86.53 136 | all-videos/B026.mp4, 10.04, 29.97002997002997, 72.0794 137 | all-videos/B027.mp4, 10.04, 29.97002997002997, 89.2689 138 | all-videos/B028.mp4, 10.04, 29.97002997002997, 78.8442 139 | all-videos/B029.mp4, 10.04, 29.97002997002997, 90.8788 140 | all-videos/B030.mp4, 10.04, 29.97002997002997, 23.7704 141 | all-videos/B031.mp4, 10.04, 29.97002997002997, 69.9155 142 | all-videos/B032.mp4, 10.04, 29.97002997002997, 62.1639 143 | all-videos/B033.mp4, 10.04, 29.97002997002997, 67.869 144 | all-videos/B034.mp4, 10.04, 29.97002997002997, 71.4773 145 | all-videos/B035.mp4, 10.04, 29.97002997002997, 43.9005 146 | all-videos/B036.mp4, 10.04, 29.97002997002997, 82.8776 147 | all-videos/B037.mp4, 10.04, 29.97002997002997, 47.7402 148 | all-videos/B038.mp4, 10.04, 29.97002997002997, 67.4641 149 | all-videos/B039.mp4, 10.04, 29.97002997002997, 61.2217 150 | all-videos/B040.mp4, 10.04, 29.97002997002997, 67.0729 151 | all-videos/B041.mp4, 10.04, 29.97002997002997, 45.799 152 | all-videos/B042.mp4, 10.04, 29.97002997002997, 62.8774 153 | all-videos/B043.mp4, 10.04, 29.97002997002997, 38.3 154 | all-videos/B044.mp4, 10.04, 29.97002997002997, 87.1329 155 | all-videos/B045.mp4, 10.04, 29.97002997002997, 64.1244 156 | all-videos/B046.mp4, 10.04, 29.97002997002997, 75.0229 157 | all-videos/B047.mp4, 10.04, 29.97002997002997, 63.7714 158 | all-videos/B048.mp4, 10.04, 29.97002997002997, 65.9427 159 | all-videos/B049.mp4, 10.04, 29.97002997002997, 78.5928 160 | all-videos/B050.mp4, 10.04, 29.97002997002997, 77.2019 161 | all-videos/B051.mp4, 10.04, 29.97002997002997, 46.0611 162 | all-videos/B052.mp4, 10.04, 29.97002997002997, 58.6099 163 | all-videos/B053.mp4, 10.04, 29.97002997002997, 74.9505 164 | all-videos/B054.mp4, 10.003333, 120.0, 27.2126 165 | all-videos/B055.mp4, 10.04, 29.97002997002997, 64.193 166 | all-videos/B056.mp4, 10.038333, 29.97002997002997, 62.6744 167 | all-videos/B057.mp4, 10.04, 29.97002997002997, 59.9947 168 | all-videos/B058.mp4, 10.04, 29.97002997002997, 58.367 169 | all-videos/B059.mp4, 10.04, 29.97002997002997, 52.0 170 | all-videos/B060.mp4, 10.04, 29.97002997002997, 59.4882 171 | 
all-videos/B061.mp4, 10.04, 29.97002997002997, 63.7011 172 | all-videos/B062.mp4, 10.04, 29.97002997002997, 56.8361 173 | all-videos/B063.mp4, 10.038333, 29.97002997002997, 44.152 174 | all-videos/B064.mp4, 10.04, 29.97002997002997, 60.0559 175 | all-videos/B065.mp4, 10.04, 29.97002997002997, 58.7207 176 | all-videos/B066.mp4, 10.04, 29.97002997002997, 57.5947 177 | all-videos/B067.mp4, 10.0, 30.0, 70.6193 178 | all-videos/B068.mp4, 10.0, 30.0, 76.6719 179 | all-videos/B069.mp4, 10.0, 30.0, 68.4948 180 | all-videos/B070.mp4, 10.0, 30.0, 56.4128 181 | all-videos/B071.mp4, 10.0, 30.0, 81.3516 182 | all-videos/B072.mp4, 10.0, 30.0, 82.573 183 | all-videos/B073.mp4, 10.0, 30.0, 66.0347 184 | all-videos/B074.mp4, 10.0, 30.0, 83.0838 185 | all-videos/B075.mp4, 10.0, 30.0, 55.4301 186 | all-videos/B076.mp4, 10.0, 30.0, 85.2071 187 | all-videos/B077.mp4, 10.0, 30.0, 81.9227 188 | all-videos/B078.mp4, 10.0, 30.0, 87.6489 189 | all-videos/B079.mp4, 10.0, 30.0, 73.5323 190 | all-videos/B080.mp4, 10.0, 30.0, 78.8136 191 | all-videos/B081.mp4, 10.033332999999999, 30.0, 46.3822 192 | all-videos/B082.mp4, 10.027854999999999, 29.916666666666668, 39.3422 193 | all-videos/B083.mp4, 10.01001, 29.97, 47.1299 194 | all-videos/B084.mp4, 10.0, 30.0, 27.3091 195 | all-videos/B085.mp4, 10.033332999999999, 30.0, 78.5556 196 | all-videos/B086.mp4, 10.033332999999999, 30.0, 80.3536 197 | all-videos/B087.mp4, 10.0, 30.0, 72.178 198 | all-videos/B088.mp4, 10.0, 30.0, 52.8177 199 | all-videos/B089.mp4, 10.01, 29.97002997002997, 74.2079 200 | all-videos/B090.mp4, 10.016807, 29.75, 26.0497 201 | all-videos/B091.mp4, 10.022345999999999, 29.833333333333332, 71.7738 202 | all-videos/B092.mp4, 10.033332999999999, 30.0, 75.2238 203 | all-videos/B093.mp4, 10.016499999999999, 29.850746268656717, 70.0112 204 | all-videos/B094.mp4, 10.033332999999999, 30.0, 72.7368 205 | all-videos/B095.mp4, 10.033332999999999, 30.0, 72.7588 206 | all-videos/B096.mp4, 10.018093, 29.846, 57.6856 207 | all-videos/B097.mp4, 10.028329, 29.416666666666668, 60.4053 208 | all-videos/B098.mp4, 10.01, 29.97002997002997, 80.199 209 | all-videos/B099.mp4, 10.022345999999999, 29.833333333333332, 69.5399 210 | all-videos/B100.mp4, 10.01, 29.97002997002997, 80.2398 211 | all-videos/B101.mp4, 10.01, 29.97002997002997, 59.2431 212 | all-videos/B102.mp4, 10.01, 29.97002997002997, 88.0219 213 | all-videos/B103.mp4, 10.033332999999999, 30.0, 25.2484 214 | all-videos/B104.mp4, 10.033332999999999, 30.0, 71.3464 215 | all-videos/B105.mp4, 10.01, 29.97002997002997, 70.9 216 | all-videos/B106.mp4, 10.01, 29.97002997002997, 57.0578 217 | all-videos/B107.mp4, 10.01, 29.97002997002997, 76.7188 218 | all-videos/B108.mp4, 10.0, 30.0, 85.7418 219 | all-videos/B109.mp4, 10.01, 29.97002997002997, 79.3295 220 | all-videos/B110.mp4, 10.01, 29.97002997002997, 77.6527 221 | all-videos/B111.mp4, 10.0, 30.0, 57.7553 222 | all-videos/B112.mp4, 10.033332999999999, 30.0, 56.3086 223 | all-videos/B113.mp4, 10.033332999999999, 30.0, 73.7705 224 | all-videos/B114.mp4, 10.01, 29.97002997002997, 85.8129 225 | all-videos/B115.mp4, 10.033332999999999, 30.0, 39.1942 226 | all-videos/B116.mp4, 10.016499999999999, 29.850746268656717, 65.6578 227 | all-videos/B117.mp4, 10.0, 30.0, 70.7436 228 | all-videos/B118.mp4, 10.041667, 24.0, 71.8769 229 | all-videos/B119.mp4, 10.033332999999999, 30.0, 69.3909 230 | all-videos/B120.mp4, 10.043367, 29.97002997002997, 82.9815 231 | all-videos/B121.mp4, 10.033332999999999, 30.0, 54.3529 232 | all-videos/B122.mp4, 10.01, 29.97002997002997, 59.2645 233 | 
all-videos/B123.mp4, 10.033332999999999, 30.0, 76.6535 234 | all-videos/B124.mp4, 10.033332999999999, 30.0, 55.3564 235 | all-videos/B125.mp4, 10.033332999999999, 30.0, 52.9389 236 | all-videos/B126.mp4, 10.033332999999999, 30.0, 73.5285 237 | all-videos/B127.mp4, 10.01, 29.97002997002997, 58.382 238 | all-videos/B128.mp4, 10.01, 29.97002997002997, 59.2601 239 | all-videos/B129.mp4, 10.01, 29.97002997002997, 38.2353 240 | all-videos/B130.mp4, 10.01001, 29.97, 52.7733 241 | all-videos/B131.mp4, 10.01, 29.97002997002997, 37.062 242 | all-videos/B132.mp4, 10.036451, 20.027, 42.6774 243 | all-videos/B133.mp4, 10.01, 29.97002997002997, 81.2515 244 | all-videos/B134.mp4, 10.033332999999999, 30.0, 69.35 245 | all-videos/B135.mp4, 10.043367, 29.97002997002997, 79.775 246 | all-videos/B136.mp4, 10.01, 29.97002997002997, 71.8718 247 | all-videos/B137.mp4, 10.033332999999999, 30.0, 47.1258 248 | all-videos/B138.mp4, 10.0, 30.0, 79.3155 249 | all-videos/B139.mp4, 10.033332999999999, 30.0, 68.8052 250 | all-videos/B140.mp4, 10.043367, 29.97002997002997, 68.4787 251 | all-videos/B141.mp4, 10.01, 29.97002997002997, 86.3333 252 | all-videos/B142.mp4, 10.033332999999999, 30.0, 64.9667 253 | all-videos/B143.mp4, 10.033332999999999, 30.0, 64.0795 254 | all-videos/B144.mp4, 10.01, 29.97002997002997, 46.1283 255 | all-videos/B145.mp4, 10.043367, 29.97002997002997, 87.7442 256 | all-videos/B146.mp4, 10.089385, 29.833333333333332, 62.9054 257 | all-videos/B147.mp4, 10.022345999999999, 29.833333333333332, 73.0254 258 | all-videos/B148.mp4, 10.027854999999999, 29.916666666666668, 27.5249 259 | all-videos/B149.mp4, 10.033332999999999, 30.0, 51.5759 260 | all-videos/B150.mp4, 10.016499999999999, 29.850746268656717, 58.6421 261 | all-videos/B151.mp4, 10.027854999999999, 29.916666666666668, 71.6954 262 | all-videos/B152.mp4, 10.075641, 23.919073800308954, 70.9887 263 | all-videos/B153.mp4, 10.012811, 29.662, 62.9021 264 | all-videos/B154.mp4, 10.031096, 29.90700104493208, 49.736 265 | all-videos/B155.mp4, 10.01, 29.97002997002997, 43.4969 266 | all-videos/B156.mp4, 10.0, 30.0, 69.2067 267 | all-videos/B157.mp4, 10.0, 30.0, 75.9554 268 | all-videos/B158.mp4, 10.043367, 29.97002997002997, 76.8757 269 | all-videos/B159.mp4, 10.066666999999999, 30.0, 35.9529 270 | all-videos/B160.mp4, 10.022122, 29.834, 34.3556 271 | all-videos/B161.mp4, 9.566666999999999, 30.0, 71.6517 272 | all-videos/B162.mp4, 10.050419999999999, 29.75, 51.3829 273 | all-videos/B163.mp4, 10.005035999999999, 29.785, 45.5813 274 | all-videos/B164.mp4, 10.043367, 29.97002997002997, 83.1737 275 | all-videos/B165.mp4, 10.01, 29.97002997002997, 67.3736 276 | all-videos/B166.mp4, 10.033332999999999, 30.0, 57.8166 277 | all-videos/B167.mp4, 10.0, 30.0, 14.8662 278 | all-videos/B168.mp4, 10.01, 29.97002997002997, 56.3245 279 | all-videos/B169.mp4, 10.0, 30.0, 54.0769 280 | all-videos/B170.mp4, 10.027854999999999, 29.916666666666668, 35.8132 281 | all-videos/B171.mp4, 10.01, 29.97002997002997, 89.7447 282 | all-videos/B172.mp4, 10.01, 29.97002997002997, 75.619 283 | all-videos/B173.mp4, 10.0, 30.0, 66.0171 284 | all-videos/B174.mp4, 10.0, 30.0, 17.9012 285 | all-videos/B175.mp4, 10.027854999999999, 29.916666666666668, 38.6221 286 | all-videos/B176.mp4, 10.01, 29.97002997002997, 68.9807 287 | all-videos/B177.mp4, 10.033332999999999, 30.0, 40.7351 288 | all-videos/B178.mp4, 10.033332999999999, 30.0, 61.3542 289 | all-videos/B179.mp4, 10.033332999999999, 30.0, 56.5635 290 | all-videos/B180.mp4, 10.01, 29.97002997002997, 72.9946 291 | all-videos/B181.mp4, 10.055866, 
29.833333333333332, 38.5263 292 | all-videos/B182.mp4, 10.01, 29.97002997002997, 82.9684 293 | all-videos/B183.mp4, 10.043367, 29.97002997002997, 85.2941 294 | all-videos/B184.mp4, 10.033332999999999, 30.0, 57.0256 295 | all-videos/B185.mp4, 10.0, 30.0, 65.4143 296 | all-videos/B186.mp4, 10.01, 29.97002997002997, 64.5275 297 | all-videos/B187.mp4, 10.01, 29.97002997002997, 90.199 298 | all-videos/B188.mp4, 10.0, 30.0, 70.3474 299 | all-videos/B189.mp4, 10.043367, 29.97002997002997, 49.8683 300 | all-videos/B190.mp4, 10.01001, 29.97, 72.1381 301 | all-videos/B191.mp4, 10.043367, 29.97002997002997, 82.8227 302 | all-videos/B192.mp4, 10.066666999999999, 30.0, 55.6981 303 | all-videos/B193.mp4, 10.01, 29.97002997002997, 70.5926 304 | all-videos/B194.mp4, 10.0, 30.0, 55.5238 305 | all-videos/B195.mp4, 10.027854999999999, 29.916666666666668, 70.2275 306 | all-videos/B196.mp4, 10.033332999999999, 30.0, 50.7012 307 | all-videos/B197.mp4, 10.004363999999999, 29.787, 36.4242 308 | all-videos/B198.mp4, 10.043367, 29.97002997002997, 84.7438 309 | all-videos/B199.mp4, 10.01001, 29.97, 76.1538 310 | all-videos/B200.mp4, 10.01, 29.97002997002997, 36.2614 311 | all-videos/B201.mp4, 10.0, 30.0, 49.4186 312 | all-videos/B202.mp4, 10.01, 29.97002997002997, 78.8095 313 | all-videos/B203.mp4, 10.01, 29.97002997002997, 78.539 314 | all-videos/B204.mp4, 10.01, 29.97002997002997, 72.3743 315 | all-videos/B205.mp4, 10.0, 30.0, 63.2195 316 | all-videos/B206.mp4, 10.027854999999999, 29.916666666666668, 80.9784 317 | all-videos/B207.mp4, 10.01, 29.97002997002997, 86.5106 318 | all-videos/B208.mp4, 10.019802, 25.25, 74.1111 319 | all-videos/B209.mp4, 10.033332999999999, 30.0, 74.7753 320 | all-videos/B210.mp4, 10.01, 29.97002997002997, 29.4167 321 | all-videos/B211.mp4, 10.01, 29.97002997002997, 80.995 322 | all-videos/B212.mp4, 10.028329, 29.416666666666668, 71.6895 323 | all-videos/B213.mp4, 10.01, 29.97002997002997, 85.9296 324 | all-videos/B214.mp4, 10.01, 29.97002997002997, 45.2158 325 | all-videos/B215.mp4, 10.0, 30.0, 49.1789 326 | all-videos/B216.mp4, 10.033332999999999, 30.0, 85.8934 327 | all-videos/B217.mp4, 10.01, 29.97002997002997, 37.7778 328 | all-videos/B218.mp4, 10.043367, 29.97002997002997, 89.0446 329 | all-videos/B219.mp4, 10.043367, 29.97002997002997, 78.0149 330 | all-videos/B220.mp4, 10.01, 29.97002997002997, 70.117 331 | all-videos/B221.mp4, 10.0, 30.0, 80.4144 332 | all-videos/B222.mp4, 10.083333, 24.0, 34.2513 333 | all-videos/B223.mp4, 10.01, 29.97002997002997, 60.6067 334 | all-videos/B224.mp4, 10.01, 29.97002997002997, 81.0055 335 | all-videos/B225.mp4, 10.033332999999999, 30.0, 80.4624 336 | all-videos/B226.mp4, 10.022345999999999, 29.833333333333332, 62.5157 337 | all-videos/B227.mp4, 10.01, 29.97002997002997, 90.2448 338 | all-videos/B228.mp4, 10.033332999999999, 30.0, 71.1987 339 | all-videos/B229.mp4, 10.01, 29.97002997002997, 73.8866 340 | all-videos/B230.mp4, 10.03009, 29.91, 73.6337 341 | all-videos/B231.mp4, 10.01, 29.97002997002997, 65.0412 342 | all-videos/B232.mp4, 10.0, 29.8, 65.1549 343 | all-videos/B233.mp4, 10.016499999999999, 29.850746268656717, 50.8201 344 | all-videos/B234.mp4, 10.033332999999999, 30.0, 80.4612 345 | all-videos/B235.mp4, 10.0, 30.0, 68.4103 346 | all-videos/B236.mp4, 10.020107, 29.84, 53.3737 347 | all-videos/B237.mp4, 10.033332999999999, 30.0, 70.1209 348 | all-videos/B238.mp4, 10.016499999999999, 29.850746268656717, 60.5058 349 | all-videos/B239.mp4, 10.0, 30.0, 67.5337 350 | all-videos/B240.mp4, 10.086454999999999, 29.842, 67.1576 351 | 
all-videos/B241.mp4, 10.027854999999999, 29.916666666666668, 66.459 352 | all-videos/B242.mp4, 10.033332999999999, 30.0, 71.6724 353 | all-videos/B243.mp4, 10.016499999999999, 29.850746268656717, 47.8021 354 | all-videos/B244.mp4, 10.033332999999999, 30.0, 63.5397 355 | all-videos/B245.mp4, 10.033332999999999, 30.0, 82.8444 356 | all-videos/B246.mp4, 10.033332999999999, 30.0, 56.3813 357 | all-videos/B247.mp4, 10.033332999999999, 30.0, 76.2626 358 | all-videos/B248.mp4, 10.01, 29.97002997002997, 62.3922 359 | all-videos/B249.mp4, 10.0, 30.0, 69.7869 360 | all-videos/B250.mp4, 10.016499999999999, 29.850746268656717, 61.2128 361 | all-videos/B251.mp4, 10.01, 29.97002997002997, 82.7214 362 | all-videos/B252.mp4, 10.015691, 29.953, 73.2762 363 | all-videos/B253.mp4, 10.033332999999999, 30.0, 72.1818 364 | all-videos/B254.mp4, 10.033332999999999, 30.0, 54.569 365 | all-videos/B255.mp4, 10.022345999999999, 29.833333333333332, 62.2424 366 | all-videos/B256.mp4, 10.0, 30.0, 59.5503 367 | all-videos/B257.mp4, 10.01, 29.97002997002997, 82.2251 368 | all-videos/B258.mp4, 10.01, 29.97002997002997, 85.4278 369 | all-videos/B259.mp4, 10.01, 29.97002997002997, 84.8989 370 | all-videos/B260.mp4, 10.041667, 24.0, 6.22368 371 | all-videos/B261.mp4, 10.033332999999999, 30.0, 78.0125 372 | all-videos/B262.mp4, 10.01, 29.97002997002997, 51.9461 373 | all-videos/B263.mp4, 10.033332999999999, 30.0, 64.4931 374 | all-videos/B264.mp4, 10.033332999999999, 30.0, 78.114 375 | all-videos/B265.mp4, 9.7, 30.0, 74.4638 376 | all-videos/B266.mp4, 10.01, 29.97002997002997, 88.24 377 | all-videos/B267.mp4, 10.01, 29.97002997002997, 62.3061 378 | all-videos/B268.mp4, 10.027854999999999, 29.916666666666668, 25.0861 379 | all-videos/B269.mp4, 10.022345999999999, 29.833333333333332, 66.5404 380 | all-videos/B270.mp4, 10.043367, 29.97002997002997, 90.3281 381 | all-videos/B271.mp4, 10.0, 30.0, 84.2073 382 | all-videos/B272.mp4, 10.041667, 24.0, 49.6404 383 | all-videos/B273.mp4, 10.066666999999999, 30.0, 53.4568 384 | all-videos/B274.mp4, 10.033332999999999, 30.0, 54.9497 385 | all-videos/B275.mp4, 10.043367, 29.97002997002997, 79.0355 386 | all-videos/B276.mp4, 10.043367, 29.97002997002997, 83.5027 387 | all-videos/B277.mp4, 10.01, 29.97002997002997, 28.6053 388 | all-videos/B278.mp4, 10.01, 29.97002997002997, 70.3646 389 | all-videos/B279.mp4, 10.033332999999999, 30.0, 67.9827 390 | all-videos/B280.mp4, 10.0, 30.0, 69.5136 391 | all-videos/B281.mp4, 10.01, 29.97002997002997, 50.288 392 | all-videos/B282.mp4, 9.994429, 29.916666666666668, 70.8817 393 | all-videos/B283.mp4, 10.027009, 29.62, 64.2865 394 | all-videos/B284.mp4, 10.01, 29.97002997002997, 71.3248 395 | all-videos/B285.mp4, 10.01, 29.97002997002997, 73.6837 396 | all-videos/B286.mp4, 10.033332999999999, 30.0, 38.6012 397 | all-videos/B287.mp4, 10.0, 30.0, 70.2771 398 | all-videos/B288.mp4, 10.043367, 29.97002997002997, 82.4922 399 | all-videos/B289.mp4, 10.01, 29.97002997002997, 84.8406 400 | all-videos/B290.mp4, 10.033332999999999, 30.0, 62.4348 401 | all-videos/B291.mp4, 10.0, 30.0, 71.1256 402 | all-videos/B292.mp4, 10.033332999999999, 30.0, 81.9686 403 | all-videos/B293.mp4, 10.066666999999999, 30.0, 78.2485 404 | all-videos/B294.mp4, 10.0, 30.0, 66.3431 405 | all-videos/B295.mp4, 10.01, 29.97002997002997, 54.7539 406 | all-videos/B296.mp4, 10.021044, 29.937, 63.203 407 | all-videos/B297.mp4, 10.033332999999999, 30.0, 61.4341 408 | all-videos/B298.mp4, 10.033332999999999, 30.0, 42.3916 409 | all-videos/B299.mp4, 10.033332999999999, 30.0, 72.21 410 | 
all-videos/B300.mp4, 10.033332999999999, 30.0, 27.691 411 | all-videos/B301.mp4, 10.0, 30.0, 49.1242 412 | all-videos/B302.mp4, 10.01, 29.97002997002997, 72.25 413 | all-videos/B303.mp4, 10.041667, 24.0, 18.8824 414 | all-videos/B304.mp4, 10.043367, 29.97002997002997, 91.7312 415 | all-videos/B305.mp4, 10.01, 29.97002997002997, 66.0914 416 | all-videos/B306.mp4, 10.033332999999999, 30.0, 79.1263 417 | all-videos/B307.mp4, 10.022345999999999, 29.833333333333332, 62.3543 418 | all-videos/B308.mp4, 10.033332999999999, 30.0, 82.8098 419 | all-videos/B309.mp4, 10.0, 30.0, 75.774 420 | all-videos/B310.mp4, 10.033332999999999, 30.0, 85.1746 421 | all-videos/B311.mp4, 10.033332999999999, 30.0, 88.8258 422 | all-videos/B312.mp4, 10.033332999999999, 30.0, 61.1696 423 | all-videos/B313.mp4, 10.033332999999999, 30.0, 89.2074 424 | all-videos/B314.mp4, 10.01, 29.97002997002997, 94.2865 425 | all-videos/B315.mp4, 10.033332999999999, 30.0, 85.593 426 | all-videos/B316.mp4, 10.033332999999999, 30.0, 89.1236 427 | all-videos/C001.mp4, 10.04, 30.0, 84.2191 428 | all-videos/D001.mp4, 10.020211, 16.666666666666668, 32.7293 429 | all-videos/E001.mp4, 10.0314, 30.0, 51.6061 430 | all-videos/F001.mp4, 10.026644, 30.020013342228154, 69.3289 431 | all-videos/F002.mp4, 10.007033, 30.0, 29.0101 432 | all-videos/F003.mp4, 10.011688999999999, 30.0, 40.2153 433 | all-videos/F004.mp4, 10.001610999999999, 30.0, 18.6163 434 | all-videos/F005.mp4, 10.004222, 30.0, 56.6974 435 | all-videos/F006.mp4, 10.01, 29.97002997002997, 59.6044 436 | all-videos/F007.mp4, 10.008333, 29.97002997002997, 42.7267 437 | all-videos/G001.mp4, 10.0, 30.0, 72.0884 438 | all-videos/G002.mp4, 10.033332999999999, 30.0, 47.5172 439 | all-videos/G003.mp4, 10.033332999999999, 30.0, 73.1593 440 | all-videos/G004.mp4, 10.0, 30.0, 76.068 441 | all-videos/G005.mp4, 10.033332999999999, 30.0, 85.2933 442 | all-videos/G006.mp4, 10.0, 30.0, 72.881 443 | all-videos/G007.mp4, 10.01, 29.97002997002997, 87.2169 444 | all-videos/G008.mp4, 10.01, 29.97002997002997, 65.8113 445 | all-videos/G009.mp4, 10.01, 29.97002997002997, 72.7931 446 | all-videos/G010.mp4, 10.01, 29.97002997002997, 68.4129 447 | all-videos/G011.mp4, 10.01, 29.97002997002997, 24.7697 448 | all-videos/G012.mp4, 10.01, 29.97002997002997, 51.3923 449 | all-videos/G013.mp4, 10.036451, 20.027, 73.2679 450 | all-videos/G014.mp4, 10.01, 29.97002997002997, 56.096 451 | all-videos/G015.mp4, 10.01, 29.97002997002997, 89.2513 452 | all-videos/G016.mp4, 10.01, 29.97002997002997, 80.5959 453 | all-videos/G017.mp4, 10.0, 30.0, 75.1832 454 | all-videos/G018.mp4, 10.0, 30.0, 83.5848 455 | all-videos/G019.mp4, 10.0, 30.0, 71.6599 456 | all-videos/G020.mp4, 10.033332999999999, 30.0, 60.6335 457 | all-videos/G021.mp4, 10.0, 30.0, 41.2703 458 | all-videos/G022.mp4, 10.0, 30.0, 58.5622 459 | all-videos/G023.mp4, 10.0, 30.0, 58.9406 460 | all-videos/G024.mp4, 10.033332999999999, 30.0, 59.6292 461 | all-videos/G025.mp4, 10.0, 30.0, 40.5028 462 | all-videos/G026.mp4, 10.033332999999999, 30.0, 54.4118 463 | all-videos/G027.mp4, 10.033332999999999, 30.0, 45.7368 464 | all-videos/G028.mp4, 10.033332999999999, 30.0, 61.3883 465 | all-videos/G029.mp4, 10.033332999999999, 30.0, 50.4835 466 | all-videos/G030.mp4, 10.01, 29.97002997002997, 77.7895 467 | all-videos/G031.mp4, 10.01, 29.97002997002997, 70.6823 468 | all-videos/G032.mp4, 10.01, 29.97002997002997, 53.278 469 | all-videos/G033.mp4, 10.01, 29.97002997002997, 60.6505 470 | all-videos/G034.mp4, 10.01, 29.97002997002997, 82.4354 471 | all-videos/G035.mp4, 10.01, 
29.97002997002997, 72.2569 472 | all-videos/G036.mp4, 10.01, 29.97002997002997, 78.6898 473 | all-videos/G037.mp4, 10.01, 29.97002997002997, 78.2783 474 | all-videos/G038.mp4, 10.01, 29.97002997002997, 72.125 475 | all-videos/G039.mp4, 10.01, 29.97002997002997, 73.5789 476 | all-videos/G040.mp4, 10.01, 29.97002997002997, 80.9306 477 | all-videos/G041.mp4, 10.016499999999999, 29.850746268656717, 43.7181 478 | all-videos/G042.mp4, 10.022345999999999, 29.833333333333332, 70.191 479 | all-videos/G043.mp4, 10.022345999999999, 29.833333333333332, 77.4279 480 | all-videos/G044.mp4, 10.036432999999999, 29.791459781529294, 74.9635 481 | all-videos/G045.mp4, 10.016499999999999, 29.850746268656717, 62.4847 482 | all-videos/G046.mp4, 10.016499999999999, 29.850746268656717, 78.8212 483 | all-videos/G047.mp4, 10.066666999999999, 30.0, 70.1804 484 | all-videos/G048.mp4, 10.066666999999999, 30.0, 78.9434 485 | all-videos/G049.mp4, 9.994429, 29.916666666666668, 44.5829 486 | all-videos/G050.mp4, 10.033332999999999, 30.0, 79.9171 487 | all-videos/G051.mp4, 10.066666999999999, 30.0, 75.6131 488 | all-videos/G052.mp4, 10.066666999999999, 30.0, 67.6825 489 | all-videos/G053.mp4, 10.033332999999999, 30.0, 74.3118 490 | all-videos/G054.mp4, 10.0, 30.0, 56.0 491 | all-videos/G055.mp4, 10.066666999999999, 30.0, 63.478 492 | all-videos/G056.mp4, 10.01, 29.97002997002997, 62.0506 493 | all-videos/G057.mp4, 10.033332999999999, 30.0, 54.5354 494 | all-videos/G058.mp4, 10.033332999999999, 30.0, 65.1421 495 | all-videos/G059.mp4, 10.033332999999999, 30.0, 38.0054 496 | all-videos/G060.mp4, 10.033332999999999, 30.0, 71.4348 497 | all-videos/G061.mp4, 10.033332999999999, 30.0, 73.8883 498 | all-videos/G062.mp4, 10.041667, 24.0, 49.3832 499 | all-videos/G063.mp4, 10.115701999999999, 20.166666666666668, 29.0845 500 | all-videos/G064.mp4, 10.016529, 20.166666666666668, 11.3333 501 | all-videos/G065.mp4, 10.020619, 24.25, 68.6814 502 | all-videos/G066.mp4, 10.097999999999999, 20.2020202020202, 11.9079 503 | all-videos/G067.mp4, 10.022345999999999, 29.833333333333332, 57.2944 504 | all-videos/G068.mp4, 10.016499999999999, 29.850746268656717, 59.4638 505 | all-videos/G069.mp4, 10.022345999999999, 29.833333333333332, 66.1244 506 | all-videos/G070.mp4, 10.066666999999999, 30.0, 79.5904 507 | all-videos/G071.mp4, 10.0, 30.0, 48.3714 508 | all-videos/G072.mp4, 10.01, 29.97002997002997, 47.7432 509 | all-videos/G073.mp4, 10.0, 30.0, 18.6333 510 | all-videos/G074.mp4, 10.01, 29.97002997002997, 83.1826 511 | all-videos/G075.mp4, 10.049915, 29.851, 64.293 512 | all-videos/G076.mp4, 10.020443, 29.839, 80.0361 513 | all-videos/G077.mp4, 10.022345999999999, 29.833333333333332, 85.2383 514 | all-videos/G078.mp4, 10.022345999999999, 29.833333333333332, 81.5829 515 | all-videos/G079.mp4, 10.033332999999999, 30.0, 15.3576 516 | all-videos/G080.mp4, 10.027854999999999, 29.916666666666668, 26.2575 517 | all-videos/G081.mp4, 10.041667, 24.0, 46.7325 518 | all-videos/G082.mp4, 10.043367, 29.97002997002997, 78.019 519 | all-videos/G083.mp4, 10.01, 29.97002997002997, 78.7753 520 | all-videos/G084.mp4, 10.0, 30.0, 59.3198 521 | all-videos/G085.mp4, 10.01, 29.97002997002997, 76.4971 522 | all-videos/G086.mp4, 10.01, 29.97002997002997, 41.3556 523 | all-videos/G087.mp4, 10.01, 29.97002997002997, 71.1184 524 | all-videos/G088.mp4, 10.01, 29.97002997002997, 56.0826 525 | all-videos/G089.mp4, 10.041667, 24.0, 31.2638 526 | all-videos/G090.mp4, 10.043367, 29.97002997002997, 75.4464 527 | all-videos/G091.mp4, 10.01, 29.97002997002997, 84.2475 528 | 
all-videos/G092.mp4, 10.01, 29.97002997002997, 67.8351 529 | all-videos/G093.mp4, 10.01, 29.97002997002997, 66.1105 530 | all-videos/G094.mp4, 10.01, 29.97002997002997, 66.865 531 | all-videos/G095.mp4, 10.01, 29.97002997002997, 79.2192 532 | all-videos/G096.mp4, 10.01, 29.97002997002997, 86.6243 533 | all-videos/G097.mp4, 10.01, 29.97002997002997, 84.6976 534 | all-videos/G098.mp4, 10.01, 29.97002997002997, 35.8556 535 | all-videos/G099.mp4, 10.01, 29.97002997002997, 80.5632 536 | all-videos/G100.mp4, 10.0, 30.0, 80.0573 537 | all-videos/G101.mp4, 10.01, 29.97002997002997, 63.0345 538 | all-videos/G102.mp4, 10.01, 29.97002997002997, 67.75 539 | all-videos/G103.mp4, 10.01, 29.97002997002997, 81.3545 540 | all-videos/G104.mp4, 10.01, 29.97002997002997, 86.1474 541 | all-videos/G105.mp4, 10.033332999999999, 30.0, 45.0703 542 | all-videos/G106.mp4, 10.043367, 29.97002997002997, 23.4251 543 | all-videos/G107.mp4, 10.033332999999999, 30.0, 49.2717 544 | all-videos/G108.mp4, 10.041667, 24.0, 58.5988 545 | all-videos/G109.mp4, 10.0, 30.0, 79.2459 546 | all-videos/G110.mp4, 10.0, 30.0, 83.5217 547 | all-videos/G111.mp4, 10.033332999999999, 30.0, 55.0282 548 | all-videos/G112.mp4, 10.1, 30.0, 74.3393 549 | all-videos/G113.mp4, 10.043367, 29.97002997002997, 74.4923 550 | all-videos/G114.mp4, 10.033332999999999, 30.0, 65.0532 551 | all-videos/G115.mp4, 10.043367, 29.97002997002997, 77.8551 552 | all-videos/G116.mp4, 10.043367, 29.97002997002997, 88.4254 553 | all-videos/G117.mp4, 10.043367, 29.97002997002997, 71.7751 554 | all-videos/G118.mp4, 10.043367, 29.97002997002997, 65.9 555 | all-videos/G119.mp4, 10.043367, 29.97002997002997, 76.8795 556 | all-videos/H001.mp4, 10.0, 30.0, 63.6974 557 | all-videos/H002.mp4, 10.010714, 29.868, 69.1073 558 | all-videos/I001.mp4, 10.01, 29.97002997002997, 72.3462 559 | all-videos/I002.mp4, 10.022122, 29.834, 63.4225 560 | all-videos/I003.mp4, 10.018032, 29.946, 69.5414 561 | all-videos/J001.mp4, 10.011974, 30.064, 71.2778 562 | all-videos/J002.mp4, 10.022197, 29.734, 62.36 563 | all-videos/J003.mp4, 10.003988, 30.088, 60.4424 564 | all-videos/J004.mp4, 10.028224999999999, 30.115, 62.645 565 | all-videos/J005.mp4, 10.034182, 27.207, 24.1117 566 | all-videos/K001.mp4, 10.0, 30.0, 33.0114 567 | all-videos/K002.mp4, 10.016694, 29.95, 53.9415 568 | all-videos/L001.mp4, 10.006667, 29.97002997002997, 76.5463 569 | all-videos/M001.mp4, 10.0, 30.0, 36.9665 570 | all-videos/M002.mp4, 10.0, 30.0, 69.3081 571 | all-videos/M003.mp4, 10.0, 30.0, 21.1084 572 | all-videos/N001.mp4, 10.004999999999999, 120.0, 67.445 573 | all-videos/O001.mp4, 10.036667, 24.0, 27.8 574 | all-videos/O002.mp4, 10.04, 120.0, 31.0169 575 | all-videos/P001.mp4, 10.0, 30.0, 64.399 576 | all-videos/P002.mp4, 10.0, 30.0, 60.1489 577 | all-videos/P003.mp4, 10.0, 30.0, 61.749 578 | all-videos/P004.mp4, 10.0, 30.0, 63.4194 579 | all-videos/P005.mp4, 10.0, 30.0, 74.1421 580 | all-videos/P006.mp4, 10.0, 30.0, 54.2271 581 | all-videos/P007.mp4, 10.0, 30.0, 38.2139 582 | all-videos/P008.mp4, 10.0, 30.0, 38.4545 583 | all-videos/P009.mp4, 10.0, 30.0, 55.9663 584 | all-videos/Q001.mp4, 10.026644, 30.020013342228154, 61.1466 585 | all-videos/R001.mp4, 10.026644, 30.020013342228154, 72.4848 586 | --------------------------------------------------------------------------------
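Note on the label file above: each comma-separated row appears to hold the relative video path, the clip duration in seconds, the nominal frame rate, and the subjective quality score (MOS) used as the regression target by the demo scripts. The following is a minimal parsing sketch under that assumed column order; the `VideoLabel`/`load_labels` names are illustrative and not taken from the repository code.

# sketch_parse_labels.py -- assumes columns: path, duration (s), fps, MOS
import csv
from dataclasses import dataclass
from typing import List


@dataclass
class VideoLabel:
    path: str        # e.g. "all-videos/B003.mp4"
    duration: float  # clip length in seconds
    fps: float       # nominal frame rate
    mos: float       # mean opinion score (quality label)


def load_labels(label_file: str) -> List[VideoLabel]:
    """Read a comma-separated label file into a list of VideoLabel records."""
    records = []
    with open(label_file, newline="") as f:
        for row in csv.reader(f):
            if len(row) != 4:
                continue  # skip blank or truncated rows
            path, dur, fps, mos = (field.strip() for field in row)
            records.append(VideoLabel(path, float(dur), float(fps), float(mos)))
    return records


if __name__ == "__main__":
    # Hypothetical invocation; point this at wherever mylabels.txt lives.
    labels = load_labels("VQA/examplar_data_labels/LIVE_VQC/mylabels.txt")
    print(f"{len(labels)} videos, first MOS = {labels[0].mos:.2f}")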