├── code └── MultiModal-DeepFake-main │ ├── models │ ├── __init__.py │ ├── METER │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── heads.cpython-38.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── bert_model.cpython-38.pyc │ │ │ ├── clip_model.cpython-38.pyc │ │ │ ├── dist_utils.cpython-38.pyc │ │ │ ├── meter_utils.cpython-38.pyc │ │ │ ├── objectives.cpython-38.pyc │ │ │ ├── meter_module.cpython-38.pyc │ │ │ ├── swin_helpers.cpython-38.pyc │ │ │ └── swin_transformer.cpython-38.pyc │ │ ├── heads.py │ │ ├── dist_utils.py │ │ ├── clip_model.py │ │ ├── meter_utils.py │ │ └── meter_module.py │ ├── interaction.py │ ├── box_ops.py │ └── consist_modeling.py │ ├── tools │ ├── __init__.py │ ├── loss.py │ ├── config.py │ ├── schedulers.py │ ├── env.py │ ├── utils.py │ └── multilabel_metrics.py │ ├── scheduler │ ├── __init__.py │ ├── lr_sched.py │ ├── step_lr.py │ ├── scheduler_factory.py │ ├── cosine_lr.py │ ├── tanh_lr.py │ ├── plateau_lr.py │ └── scheduler.py │ ├── requirements.txt │ ├── train.sh │ ├── test.sh │ ├── optim │ ├── __init__.py │ ├── novograd.py │ ├── sgdp.py │ ├── nadam.py │ ├── lookahead.py │ ├── adamp.py │ ├── nvnovograd.py │ ├── adamw.py │ ├── radam.py │ ├── rmsprop_tf.py │ ├── adahessian.py │ ├── optim_factory.py │ └── adafactor.py │ ├── configs │ ├── test.yaml │ ├── train.yaml │ └── METER.yaml │ └── dataset │ ├── __init__.py │ ├── dataset.py │ ├── utils.py │ └── randaugment.py ├── .gitignore ├── LICENSE └── README.md /code/MultiModal-DeepFake-main/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__init__.py: -------------------------------------------------------------------------------- 1 | from .meter_module import METERTransformerSS 2 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/heads.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/heads.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/bert_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/bert_model.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/clip_model.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/clip_model.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/dist_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/dist_utils.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_utils.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/objectives.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/objectives.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_module.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_module.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_helpers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_helpers.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_transformer.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .cosine_lr import CosineLRScheduler 2 | from .plateau_lr import PlateauLRScheduler 3 | from .step_lr import StepLRScheduler 4 | from .tanh_lr import TanhLRScheduler 5 | from .scheduler_factory import create_scheduler 6 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.4.12 2 | transformers==4.8.1 3 | Pillow==8.3.2 4 | numpy==1.21.1 5 | opencv-python==4.5.5.62 6 | scipy==1.8.0 7 | scikit_image==0.19.2 8 | matplotlib==3.4.3 9 | scikit-learn 10 | tensorboard 11 | setuptools==59.5.0 12 | einops==0.8.0 13 | seaborn==0.13.2 -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/train.sh: -------------------------------------------------------------------------------- 1 | EXPID=$(date 
+"%Y%m%d_%H%M%S") 2 | 3 | HOST='localhost' 4 | PORT='1' 5 | 6 | NUM_GPU=8 7 | python train.py 8 | --config 'configs/train.yaml' \ 9 | --output_dir './results' \ 10 | --launcher pytorch \ 11 | --rank 0 \ 12 | --log_num ${EXPID} \ 13 | --dist-url tcp://localhost:23459 \ 14 | --world_size $NUM_GPU \ 15 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/test.sh: -------------------------------------------------------------------------------- 1 | EXPID=CSCL 2 | 3 | HOST='localhost' 4 | PORT='1' 5 | 6 | NUM_GPU=1 7 | 8 | python test.py \ 9 | --config 'configs/test.yaml' \ 10 | --output_dir './results' \ 11 | --launcher pytorch \ 12 | --rank 0 \ 13 | --log_num ${EXPID} \ 14 | --dist-url tcp://localhost:23459 \ 15 | --world_size $NUM_GPU \ 16 | --test_epoch 49 \ 17 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .adamp import AdamP 2 | from .adamw import AdamW 3 | from .adafactor import Adafactor 4 | from .adahessian import Adahessian 5 | from .lookahead import Lookahead 6 | from .nadam import Nadam 7 | from .novograd import NovoGrad 8 | from .nvnovograd import NvNovoGrad 9 | from .radam import RAdam 10 | from .rmsprop_tf import RMSpropTF 11 | from .sgdp import SGDP 12 | 13 | from .optim_factory import create_optimizer -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/configs/test.yaml: -------------------------------------------------------------------------------- 1 | train_file: ["../../datasets/DGM4/metadata/trainval.json"] # use train and val set together 2 | val_file: ["../../datasets/DGM4/metadata/test.json"] 3 | 4 | image_res: 256 5 | vision_width: 768 6 | embed_dim: 256 7 | batch_size_train: 64 8 | batch_size_val: 256 9 | max_words: 50 10 | 11 | loss_BIC_wgt: 1 12 | loss_bbox_wgt: 0.1 13 | loss_giou_wgt: 0.1 14 | loss_MLC_wgt: 1 15 | Loss_sim_wgt: 10 16 | 17 | optimizer: {opt: adamW, lr: 0.00001, lr_img: 0.00005, weight_decay: 0.02} 18 | schedular: {sched: cosine, lr: 0.00001, epochs: 50, min_lr: 0.000001, decay_rate: 1, warmup_lr: 0.000001, warmup_epochs: 10, cooldown_epochs: 0} 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/configs/train.yaml: -------------------------------------------------------------------------------- 1 | train_file: ["../../datasets/DGM4/metadata/trainval.json"] # use train and val set together 2 | val_file: ["../../datasets/DGM4/metadata/test.json"] 3 | 4 | image_res: 256 5 | vision_width: 768 6 | embed_dim: 256 7 | batch_size_train: 32 8 | batch_size_val: 64 9 | max_words: 50 10 | 11 | loss_BIC_wgt: 1 12 | loss_bbox_wgt: 0.1 13 | loss_giou_wgt: 0.1 14 | loss_MLC_wgt: 1 15 | Loss_sim_wgt: 10 16 | 17 | optimizer: {opt: adamW, lr: 0.00001, lr_img: 0.00005, weight_decay: 0.02} 18 | schedular: {sched: cosine, lr: 0.00001, epochs: 50, min_lr: 0.000001, decay_rate: 1, warmup_lr: 0.000001, warmup_epochs: 10, cooldown_epochs: 0} 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/configs/METER.yaml: -------------------------------------------------------------------------------- 1 | # ViT-B/16 + RoBERTa 2 | 3 | # Image setting 4 | train_transform_keys: ["clip"] 5 | 
val_transform_keys: ["clip"] 6 | image_size: 256 7 | patch_size: 16 8 | draw_false_image: 1 9 | image_only: False 10 | resolution_before: 256 11 | 12 | # Text Setting 13 | vqav2_label_size: 3129 14 | max_text_len: 50 15 | tokenizer: "roberta-base" 16 | vocab_size: 50265 17 | whole_word_masking: False # note that whole_word_masking does not work for RoBERTa 18 | mlm_prob: 0.15 19 | draw_false_text: 0 20 | 21 | # Transformer Setting 22 | num_top_layer: 6 23 | input_image_embed_size: 768 24 | input_text_embed_size: 768 25 | vit: 'ViT-B/16' 26 | hidden_size: 768 27 | num_heads: 12 28 | num_layers: 6 29 | mlp_ratio: 4 30 | drop_rate: 0.1 31 | load_path: "meter_clip16_224_roberta_pretrain.ckpt" -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class GeneralizedCELoss(nn.Module): 9 | 10 | def __init__(self, q=0.7): 11 | super(GeneralizedCELoss, self).__init__() 12 | self.q = q 13 | 14 | def forward(self, logits, targets): 15 | # p = F.softmax(logits, dim=1) 16 | p = torch.sigmoid(logits) # element-wise probabilities for the multi-label setting 17 | if np.isnan(p.mean().item()): 18 | raise NameError('GCE_p') 19 | # Yg = torch.gather(p, 1, torch.unsqueeze(targets, 1)) 20 | # Yg = torch.gather(p, 1, targets) 21 | # modify gradient of cross entropy: GCE-style weight p^q * q down-weights low-confidence terms 22 | loss_weight = (p.detach()**self.q)*self.q 23 | if np.isnan(loss_weight.mean().item()): 24 | raise NameError('GCE_loss_weight') 25 | 26 | loss = F.binary_cross_entropy_with_logits(logits, targets.float(), reduction='none') * loss_weight 27 | 28 | return loss 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | code/MultiModal-DeepFake-main/bert-base-uncased 3 | code/MultiModal-DeepFake-main/ALBEF_4M.pth 4 | code/MultiModal-DeepFake-main/nohup.out 5 | code/MultiModal-DeepFake-main/results 6 | code/MultiModal-DeepFake-main/pic_ours 7 | code/MultiModal-DeepFake-main/pic_baseline 8 | code/MultiModal-DeepFake-main/pic_consist 9 | code/MultiModal-DeepFake-main/datasets.zip 10 | code/MultiModal-DeepFake-main/deit_base_patch16_224-b5f2ef4d.pth 11 | code/MultiModal-DeepFake-main/__pycache__/*.pyc 12 | code/MultiModal-DeepFake-main/dataset/__pycache__/*.pyc 13 | code/MultiModal-DeepFake-main/models/__pycache__/*.pyc 14 | code/MultiModal-DeepFake-main/optim/__pycache__/*.pyc 15 | code/MultiModal-DeepFake-main/scheduler/__pycache__/*.pyc 16 | code/MultiModal-DeepFake-main/tools/__pycache__/*.pyc 17 | nohup.out 18 | dataset.zip 19 | code/MultiModal-DeepFake-main/meter_clip16_224_roberta_pretrain.ckpt 20 | code/MultiModal-DeepFake-main/roberta-base/ 21 | code/MultiModal-DeepFake-main/ViT-B-16.pt 22 | code/MultiModal-DeepFake-main/pic_prnu -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/lr_sched.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
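#
# Hedged schedule sketch of adjust_learning_rate() below: a linear warmup
# from 0 to args.lr over cfg['schedular']['warmup_epochs'] epochs, then a
# half-cycle cosine decay toward cfg['schedular']['min_lr'], i.e.
#   lr(e) = min_lr + (args.lr - min_lr) * 0.5 * (1 + cos(pi * (e - warmup) / (epochs - warmup)))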
6 | 7 | import math 8 | 9 | def adjust_learning_rate(optimizer, epoch, args, cfg): 10 | """Decay the learning rate with half-cycle cosine after warmup""" 11 | if epoch < cfg['schedular']['warmup_epochs']: 12 | lr = args.lr * epoch / cfg['schedular']['warmup_epochs'] 13 | else: 14 | lr = cfg['schedular']['min_lr'] + (args.lr - cfg['schedular']['min_lr']) * 0.5 * \ 15 | (1. + math.cos(math.pi * (epoch - cfg['schedular']['warmup_epochs']) / (cfg['schedular']['epochs'] - cfg['schedular']['warmup_epochs']))) 16 | for param_group in optimizer.param_groups: 17 | if "lr_scale" in param_group: 18 | param_group["lr"] = lr * param_group["lr_scale"] 19 | else: 20 | param_group["lr"] = lr 21 | return lr 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Yiheng Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | DEFAULTS = { 4 | "network": "dpn", 5 | "encoder": "dpn92", 6 | "model_params": {}, 7 | "optimizer": { 8 | "batch_size": 32, 9 | "type": "SGD", # supported: SGD, Adam 10 | "momentum": 0.9, 11 | "weight_decay": 0, 12 | "clip": 1., 13 | "learning_rate": 0.1, 14 | "classifier_lr": -1, 15 | "nesterov": True, 16 | "schedule": { 17 | "type": "constant", # supported: constant, step, multistep, exponential, linear, poly 18 | "mode": "epoch", # supported: epoch, step 19 | "epochs": 10, 20 | "params": {} 21 | } 22 | }, 23 | # "normalize": { 24 | # "mean": [0.485, 0.456, 0.406], 25 | # "std": [0.229, 0.224, 0.225] 26 | # } 27 | } 28 | 29 | 30 | def _merge(src, dst): 31 | for k, v in src.items(): 32 | if k in dst: 33 | if isinstance(v, dict): 34 | _merge(src[k], dst[k]) 35 | else: 36 | dst[k] = v 37 | 38 | 39 | def load_config(config_file, defaults=DEFAULTS): 40 | with open(config_file, "r") as fd: 41 | config = json.load(fd) 42 | _merge(defaults, config) 43 | return config 44 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from transformers.models.bert.modeling_bert import BertPredictionHeadTransform 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, hidden_size): 10 | super().__init__() 11 | self.dense = nn.Linear(hidden_size, hidden_size) 12 | self.activation = nn.Tanh() 13 | 14 | def forward(self, hidden_states): 15 | first_token_tensor = hidden_states[:, 0] 16 | pooled_output = self.dense(first_token_tensor) 17 | pooled_output = self.activation(pooled_output) 18 | return pooled_output 19 | 20 | 21 | class ITMHead(nn.Module): 22 | def __init__(self, hidden_size): 23 | super().__init__() 24 | self.fc = nn.Linear(hidden_size, 2) 25 | 26 | def forward(self, x): 27 | x = self.fc(x) 28 | return x 29 | 30 | 31 | class MLMHead(nn.Module): 32 | def __init__(self, config, weight=None): 33 | super().__init__() 34 | self.transform = BertPredictionHeadTransform(config) 35 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 36 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 37 | if weight is not None: 38 | self.decoder.weight = weight 39 | 40 | def forward(self, x): 41 | x = self.transform(x) 42 | x = self.decoder(x) + self.bias 43 | return x 44 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/schedulers.py: -------------------------------------------------------------------------------- 1 | from bisect import bisect_right 2 | 3 | from torch.optim.lr_scheduler import _LRScheduler 4 | from pdb import set_trace as st 5 | 6 | class LRStepScheduler(_LRScheduler): 7 | def __init__(self, optimizer, steps, last_epoch=-1): 8 | self.lr_steps = steps 9 | super().__init__(optimizer, last_epoch) 10 | 11 | def get_lr(self): 12 | pos = max(bisect_right([x for x, y in self.lr_steps], self.last_epoch) - 1, 0) 13 | return [self.lr_steps[pos][1] if self.lr_steps[pos][0] <= self.last_epoch else base_lr for base_lr in self.base_lrs] 14 | 15 | 16 | class PolyLR(_LRScheduler): 17 | """Sets the learning rate of each parameter group 
according to poly learning rate policy 18 | """ 19 | def __init__(self, optimizer, max_iter=90000, power=0.9, last_epoch=-1): 20 | self.max_iter = max_iter 21 | self.power = power 22 | super(PolyLR, self).__init__(optimizer, last_epoch) 23 | def get_lr(self): 24 | self.last_epoch = (self.last_epoch + 1) % self.max_iter 25 | return [base_lr * ((1 - float(self.last_epoch) / self.max_iter) ** (self.power)) for base_lr in self.base_lrs] 26 | 27 | class ExponentialLRScheduler(_LRScheduler): 28 | """Decays the learning rate of each parameter group by gamma every epoch. 29 | When last_epoch=-1, sets initial lr as lr. 30 | 31 | Args: 32 | optimizer (Optimizer): Wrapped optimizer. 33 | gamma (float): Multiplicative factor of learning rate decay. 34 | last_epoch (int): The index of last epoch. Default: -1. 35 | """ 36 | 37 | def __init__(self, optimizer, gamma, last_epoch=-1): 38 | self.gamma = gamma 39 | super(ExponentialLRScheduler, self).__init__(optimizer, last_epoch) 40 | 41 | def get_lr(self): 42 | if self.last_epoch <= 0: 43 | return self.base_lrs 44 | return [base_lr * self.gamma**self.last_epoch for base_lr in self.base_lrs] 45 | 46 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/interaction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pdb 4 | 5 | class Self_Interaction_block(nn.Module): 6 | def __init__(self, num_head, hidden_dim, input_dim, output_dim): 7 | super().__init__() 8 | 9 | self.self_attn = nn.MultiheadAttention(input_dim, num_head, dropout=0.0, batch_first=True) 10 | self.FFN = nn.Sequential(nn.Linear(input_dim, hidden_dim), 11 | nn.ReLU(), 12 | nn.Linear(hidden_dim, output_dim)) 13 | 14 | self.norm1 = nn.LayerNorm(input_dim) 15 | self.norm2 = nn.LayerNorm(input_dim) 16 | self.dropout1 = nn.Dropout(0.1) 17 | self.dropout2 = nn.Dropout(0.1) 18 | 19 | def forward(self, query, query_padding_mask, attn_mask): 20 | 21 | feat_after_self = query + self.dropout1(self.self_attn(query=query, 22 | key=query, 23 | value=query, 24 | key_padding_mask=query_padding_mask, 25 | attn_mask=attn_mask)[0]) 26 | feat_after_self = self.norm1(feat_after_self) 27 | output = feat_after_self + self.dropout2(self.FFN(feat_after_self)) 28 | output = self.norm2(output) 29 | return output 30 | 31 | class Self_Interaction(nn.Module): 32 | def __init__(self, num_head, hidden_dim, input_dim, output_dim, layers=3): 33 | super().__init__() 34 | 35 | self.layers = nn.ModuleList() 36 | for i in range(layers): 37 | self.layers.append(Self_Interaction_block(num_head, hidden_dim, input_dim, output_dim)) 38 | 39 | def forward(self, query, query_padding_mask, query_pos_emb=None, attn_mask=None): 40 | if query_pos_emb is not None: 41 | for i in range(len(self.layers)): 42 | query = self.layers[i](query + query_pos_emb, query_padding_mask, attn_mask) 43 | else: 44 | for i in range(len(self.layers)): 45 | query = self.layers[i](query, query_padding_mask, attn_mask) 46 | return query -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/step_lr.py: -------------------------------------------------------------------------------- 1 | """ Step Scheduler 2 | 3 | Basic step LR schedule with warmup, noise. 
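After warmup (a linear ramp from warmup_lr_init to the base lr), the schedule
reduces to lr(t) = base_lr * decay_rate ** (t // decay_t); see _get_lr below.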
4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import math 8 | import torch 9 | 10 | from .scheduler import Scheduler 11 | 12 | 13 | class StepLRScheduler(Scheduler): 14 | """ 15 | """ 16 | 17 | def __init__(self, 18 | optimizer: torch.optim.Optimizer, 19 | decay_t: float, 20 | decay_rate: float = 1., 21 | warmup_t=0, 22 | warmup_lr_init=0, 23 | t_in_epochs=True, 24 | noise_range_t=None, 25 | noise_pct=0.67, 26 | noise_std=1.0, 27 | noise_seed=42, 28 | initialize=True, 29 | ) -> None: 30 | super().__init__( 31 | optimizer, param_group_field="lr", 32 | noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 33 | initialize=initialize) 34 | 35 | self.decay_t = decay_t 36 | self.decay_rate = decay_rate 37 | self.warmup_t = warmup_t 38 | self.warmup_lr_init = warmup_lr_init 39 | self.t_in_epochs = t_in_epochs 40 | if self.warmup_t: 41 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 42 | super().update_groups(self.warmup_lr_init) 43 | else: 44 | self.warmup_steps = [1 for _ in self.base_values] 45 | 46 | def _get_lr(self, t): 47 | if t < self.warmup_t: 48 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 49 | else: 50 | lrs = [v * (self.decay_rate ** (t // self.decay_t)) for v in self.base_values] 51 | return lrs 52 | 53 | def get_epoch_values(self, epoch: int): 54 | if self.t_in_epochs: 55 | return self._get_lr(epoch) 56 | else: 57 | return None 58 | 59 | def get_update_values(self, num_updates: int): 60 | if not self.t_in_epochs: 61 | return self._get_lr(num_updates) 62 | else: 63 | return None 64 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
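Boxes are handled either as (cx, cy, w, h), normalized to [0, 1] by the image
size, or as corner-format (x0, y0, x1, y1); for example, box_cxcywh_to_xyxy
below maps (0.5, 0.5, 0.2, 0.4) -> (0.4, 0.3, 0.6, 0.7).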
4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | import math 8 | 9 | def box_cxcywh_to_xyxy(x): # 这个用了 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2, test=False): 25 | 26 | area1 = box_area(boxes1) 27 | area2 = box_area(boxes2) 28 | 29 | # lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | # rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 32 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 33 | 34 | wh = (rb - lt).clamp(min=0) # [N,2] 35 | # inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 36 | inter = wh[:, 0] * wh[:, 1] # [N] 37 | 38 | # union = area1[:, None] + area2 - inter 39 | union = area1 + area2 - inter 40 | 41 | iou = inter / union 42 | 43 | if test: 44 | zero_lines = boxes2==torch.zeros_like(boxes2) 45 | zero_lines_idx = torch.where(zero_lines[:,0]==True)[0] 46 | 47 | for idx in zero_lines_idx: 48 | if all(boxes1[idx,:] < 1e-4): 49 | iou[idx]=1 50 | 51 | return iou, union 52 | 53 | 54 | def generalized_box_iou(boxes1, boxes2): 55 | """ 56 | Generalized IoU from https://giou.stanford.edu/ 57 | 58 | The boxes should be in [x0, y0, x1, y1] format 59 | 60 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 61 | and M = len(boxes2) 62 | """ 63 | iou, union = box_iou(boxes1, boxes2) 64 | 65 | # lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 66 | # rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 67 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 68 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 69 | 70 | wh = (rb - lt).clamp(min=0) # [N,M,2] 71 | # area = wh[:, :, 0] * wh[:, :, 1] 72 | area = wh[:, 0] * wh[:, 1] 73 | 74 | return iou - (area - union) / area 75 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torchvision import transforms 4 | from PIL import Image 5 | 6 | from dataset.dataset import DGM4_Dataset 7 | from dataset.randaugment import RandomAugment 8 | 9 | def create_dataset(config): 10 | 11 | normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) 12 | 13 | train_transform = transforms.Compose([ 14 | RandomAugment(2, 7, isPIL=True, augs=['Identity', 'AutoContrast', 'Equalize', 'Brightness', 'Sharpness']), 15 | transforms.ToTensor(), 16 | normalize, 17 | ]) 18 | 19 | test_transform = transforms.Compose([ 20 | transforms.Resize((config['image_res'],config['image_res']),interpolation=Image.BICUBIC), 21 | transforms.ToTensor(), 22 | normalize, 23 | ]) 24 | 25 | train_dataset = DGM4_Dataset(config=config, ann_file=config['train_file'], transform=train_transform, max_words=config['max_words'], is_train=True) 26 | val_dataset = DGM4_Dataset(config=config, ann_file=config['val_file'], transform=test_transform, max_words=config['max_words'], is_train=False) 27 | return train_dataset, val_dataset 28 | 29 | def create_sampler(datasets, shuffles, num_tasks, global_rank): 30 | samplers = [] 31 | for 
dataset,shuffle in zip(datasets,shuffles): 32 | sampler = torch.utils.data.DistributedSampler(dataset, num_replicas=num_tasks, rank=global_rank, shuffle=shuffle) 33 | samplers.append(sampler) 34 | return samplers 35 | 36 | 37 | def create_loader(datasets, samplers, batch_size, num_workers, is_trains, collate_fns): 38 | loaders = [] 39 | for dataset,sampler,bs,n_worker,is_train,collate_fn in zip(datasets,samplers,batch_size,num_workers,is_trains,collate_fns): 40 | if is_train: 41 | shuffle = (sampler is None) 42 | drop_last = True 43 | else: 44 | shuffle = False 45 | drop_last = False 46 | loader = DataLoader( 47 | dataset, 48 | batch_size=bs, 49 | num_workers=n_worker, 50 | pin_memory=True, 51 | sampler=sampler, 52 | shuffle=shuffle, 53 | collate_fn=collate_fn, 54 | drop_last=drop_last, 55 | ) 56 | loaders.append(loader) 57 | return loaders -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def init_dist(args): 8 | """Initialize the distributed computing environment.""" 9 | 10 | args.ngpus_per_node = torch.cuda.device_count() 11 | if args.launcher == 'pytorch': 12 | _init_dist_pytorch(args) 13 | elif args.launcher == 'mpi': 14 | _init_dist_mpi(args) 15 | elif args.launcher == 'slurm': 16 | _init_dist_slurm(args) 17 | else: 18 | raise ValueError('Invalid launcher type: {}'.format(args.launcher)) 19 | 20 | 21 | def _init_dist_pytorch(args, **kwargs): 22 | """Set up environment.""" 23 | # TODO: use local_rank instead of rank % num_gpus 24 | 25 | args.rank = args.rank * args.ngpus_per_node + args.gpu 26 | args.world_size = args.world_size 27 | 28 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 29 | world_size=args.world_size, rank=args.rank) 30 | torch.cuda.set_device(args.gpu) 31 | print(f"{args.dist_url}, ws:{args.world_size}, rank:{args.rank}") 32 | 33 | if args.rank % args.ngpus_per_node == 0: 34 | args.log = True 35 | else: 36 | args.log = False 37 | 38 | 39 | def _init_dist_slurm(args, port=23333, **kwargs): 40 | """Set up slurm environment.""" 41 | rank = int(os.environ['SLURM_PROCID']) 42 | world_size = int(os.environ['SLURM_NTASKS']) 43 | local_rank = int(os.environ['SLURM_LOCALID']) 44 | node_list = str(os.environ['SLURM_NODELIST']) 45 | num_gpus = torch.cuda.device_count() 46 | 47 | node_parts = re.findall('[0-9]+', node_list) 48 | host_ip = '{}.{}.{}.{}'.format(node_parts[1], node_parts[2], node_parts[3], node_parts[4]) 49 | init_method = 'tcp://{}:{}'.format(host_ip, port) 50 | 51 | print(f"{init_method}, rank: {rank}, local rank: {local_rank}") 52 | 53 | dist.init_process_group(backend=args.dist_backend, 54 | init_method=init_method, 55 | world_size=world_size, 56 | rank=rank) 57 | 58 | torch.cuda.set_device(local_rank) 59 | args.rank = rank 60 | args.world_size = world_size 61 | args.ngpus_per_node = num_gpus 62 | args.gpu = local_rank 63 | 64 | if args.rank == 0: 65 | args.log = True 66 | else: 67 | args.log = False 68 | 69 | 70 | def _init_dist_mpi(backend, **kwargs): 71 | raise NotImplementedError 72 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/novograd.py: -------------------------------------------------------------------------------- 1 | """NovoGrad Optimizer. 
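Rough per-tensor update sketch (as implemented in step() below, with g2 = ||g||^2):
    v_t = beta2 * v_{t-1} + (1 - beta2) * g2          (scalar second moment per tensor)
    m_t = beta1 * m_{t-1} + g_t / (sqrt(v_t) + eps) + weight_decay * p
    p  <- p - lr * sqrt(1 - beta2^t) / (1 - beta1^t) * m_t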
2 | Original impl by Masashi Kimura (Convergence Lab): https://github.com/convergence-lab/novograd 3 | Paper: `Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks` 4 | - https://arxiv.org/abs/1905.11286 5 | """ 6 | 7 | import torch 8 | from torch.optim.optimizer import Optimizer 9 | import math 10 | 11 | 12 | class NovoGrad(Optimizer): 13 | def __init__(self, params, grad_averaging=False, lr=0.1, betas=(0.95, 0.98), eps=1e-8, weight_decay=0): 14 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 15 | super(NovoGrad, self).__init__(params, defaults) 16 | self._lr = lr 17 | self._beta1 = betas[0] 18 | self._beta2 = betas[1] 19 | self._eps = eps 20 | self._wd = weight_decay 21 | self._grad_averaging = grad_averaging 22 | 23 | self._momentum_initialized = False 24 | 25 | def step(self, closure=None): 26 | loss = None 27 | if closure is not None: 28 | loss = closure() 29 | 30 | if not self._momentum_initialized: 31 | for group in self.param_groups: 32 | for p in group['params']: 33 | if p.grad is None: 34 | continue 35 | state = self.state[p] 36 | grad = p.grad.data 37 | if grad.is_sparse: 38 | raise RuntimeError('NovoGrad does not support sparse gradients') 39 | 40 | v = torch.norm(grad)**2 41 | m = grad/(torch.sqrt(v) + self._eps) + self._wd * p.data 42 | state['step'] = 0 43 | state['v'] = v 44 | state['m'] = m 45 | state['grad_ema'] = None 46 | self._momentum_initialized = True 47 | 48 | for group in self.param_groups: 49 | for p in group['params']: 50 | if p.grad is None: 51 | continue 52 | state = self.state[p] 53 | state['step'] += 1 54 | 55 | step, v, m = state['step'], state['v'], state['m'] 56 | grad_ema = state['grad_ema'] 57 | 58 | grad = p.grad.data 59 | g2 = torch.norm(grad)**2 60 | grad_ema = g2 if grad_ema is None else grad_ema * \ 61 | self._beta2 + g2 * (1. - self._beta2) 62 | grad *= 1.0 / (torch.sqrt(grad_ema) + self._eps) 63 | 64 | if self._grad_averaging: 65 | grad *= (1. - self._beta1) 66 | 67 | g2 = torch.norm(grad)**2 68 | v = self._beta2*v + (1. - self._beta2)*g2 69 | m = self._beta1*m + (grad / (torch.sqrt(v) + self._eps) + self._wd * p.data) 70 | bias_correction1 = 1 - self._beta1 ** step 71 | bias_correction2 = 1 - self._beta2 ** step 72 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 73 | 74 | state['v'], state['m'] = v, m 75 | state['grad_ema'] = grad_ema 76 | p.data.add_(-step_size, m) 77 | return loss 78 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/sgdp.py: -------------------------------------------------------------------------------- 1 | """ 2 | SGDP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/sgdp.py 3 | 4 | Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 5 | Code: https://github.com/clovaai/AdamP 6 | 7 | Copyright (c) 2020-present NAVER Corp. 
8 | MIT license 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.optim.optimizer import Optimizer, required 14 | import math 15 | 16 | class SGDP(Optimizer): 17 | def __init__(self, params, lr=required, momentum=0, dampening=0, 18 | weight_decay=0, nesterov=False, eps=1e-8, delta=0.1, wd_ratio=0.1): 19 | defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, 20 | nesterov=nesterov, eps=eps, delta=delta, wd_ratio=wd_ratio) 21 | super(SGDP, self).__init__(params, defaults) 22 | 23 | def _channel_view(self, x): 24 | return x.view(x.size(0), -1) 25 | 26 | def _layer_view(self, x): 27 | return x.view(1, -1) 28 | 29 | def _cosine_similarity(self, x, y, eps, view_func): 30 | x = view_func(x) 31 | y = view_func(y) 32 | 33 | x_norm = x.norm(dim=1).add_(eps) 34 | y_norm = y.norm(dim=1).add_(eps) 35 | dot = (x * y).sum(dim=1) 36 | 37 | return dot.abs() / x_norm / y_norm 38 | 39 | def _projection(self, p, grad, perturb, delta, wd_ratio, eps): 40 | wd = 1 41 | expand_size = [-1] + [1] * (len(p.shape) - 1) 42 | for view_func in [self._channel_view, self._layer_view]: 43 | 44 | cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) 45 | 46 | if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): 47 | p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) 48 | perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) 49 | wd = wd_ratio 50 | 51 | return perturb, wd 52 | 53 | return perturb, wd 54 | 55 | def step(self, closure=None): 56 | loss = None 57 | if closure is not None: 58 | loss = closure() 59 | 60 | for group in self.param_groups: 61 | weight_decay = group['weight_decay'] 62 | momentum = group['momentum'] 63 | dampening = group['dampening'] 64 | nesterov = group['nesterov'] 65 | 66 | for p in group['params']: 67 | if p.grad is None: 68 | continue 69 | grad = p.grad.data 70 | state = self.state[p] 71 | 72 | # State initialization 73 | if len(state) == 0: 74 | state['momentum'] = torch.zeros_like(p.data) 75 | 76 | # SGD 77 | buf = state['momentum'] 78 | buf.mul_(momentum).add_(1 - dampening, grad) 79 | if nesterov: 80 | d_p = grad + momentum * buf 81 | else: 82 | d_p = buf 83 | 84 | # Projection 85 | wd_ratio = 1 86 | if len(p.shape) > 1: 87 | d_p, wd_ratio = self._projection(p, grad, d_p, group['delta'], group['wd_ratio'], group['eps']) 88 | 89 | # Weight decay 90 | if weight_decay != 0: 91 | p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio / (1-momentum)) 92 | 93 | # Step 94 | p.data.add_(-group['lr'], d_p) 95 | 96 | return loss 97 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/scheduler_factory.py: -------------------------------------------------------------------------------- 1 | """ Scheduler Factory 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | from .cosine_lr import CosineLRScheduler 5 | from .tanh_lr import TanhLRScheduler 6 | from .step_lr import StepLRScheduler 7 | from .plateau_lr import PlateauLRScheduler 8 | 9 | 10 | def create_scheduler(args, optimizer): 11 | num_epochs = args.epochs 12 | 13 | if getattr(args, 'lr_noise', None) is not None: 14 | lr_noise = getattr(args, 'lr_noise') 15 | if isinstance(lr_noise, (list, tuple)): 16 | noise_range = [n * num_epochs for n in lr_noise] 17 | if len(noise_range) == 1: 18 | noise_range = noise_range[0] 19 | else: 20 | noise_range = lr_noise * num_epochs 21 | else: 22 | noise_range = None 23 | 24 | lr_scheduler = None 25 
| if args.sched == 'cosine': 26 | lr_scheduler = CosineLRScheduler( 27 | optimizer, 28 | t_initial=num_epochs, 29 | t_mul=getattr(args, 'lr_cycle_mul', 1.), 30 | lr_min=args.min_lr, 31 | decay_rate=args.decay_rate, 32 | warmup_lr_init=args.warmup_lr, 33 | warmup_t=args.warmup_epochs, 34 | cycle_limit=getattr(args, 'lr_cycle_limit', 1), 35 | t_in_epochs=True, 36 | noise_range_t=noise_range, 37 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 38 | noise_std=getattr(args, 'lr_noise_std', 1.), 39 | noise_seed=getattr(args, 'seed', 42), 40 | ) 41 | num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs 42 | elif args.sched == 'tanh': 43 | lr_scheduler = TanhLRScheduler( 44 | optimizer, 45 | t_initial=num_epochs, 46 | t_mul=getattr(args, 'lr_cycle_mul', 1.), 47 | lr_min=args.min_lr, 48 | warmup_lr_init=args.warmup_lr, 49 | warmup_t=args.warmup_epochs, 50 | cycle_limit=getattr(args, 'lr_cycle_limit', 1), 51 | t_in_epochs=True, 52 | noise_range_t=noise_range, 53 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 54 | noise_std=getattr(args, 'lr_noise_std', 1.), 55 | noise_seed=getattr(args, 'seed', 42), 56 | ) 57 | num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs 58 | elif args.sched == 'step': 59 | lr_scheduler = StepLRScheduler( 60 | optimizer, 61 | decay_t=args.decay_epochs, 62 | decay_rate=args.decay_rate, 63 | warmup_lr_init=args.warmup_lr, 64 | warmup_t=args.warmup_epochs, 65 | noise_range_t=noise_range, 66 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 67 | noise_std=getattr(args, 'lr_noise_std', 1.), 68 | noise_seed=getattr(args, 'seed', 42), 69 | ) 70 | elif args.sched == 'plateau': 71 | mode = 'min' if 'loss' in getattr(args, 'eval_metric', '') else 'max' 72 | lr_scheduler = PlateauLRScheduler( 73 | optimizer, 74 | decay_rate=args.decay_rate, 75 | patience_t=args.patience_epochs, 76 | lr_min=args.min_lr, 77 | mode=mode, 78 | warmup_lr_init=args.warmup_lr, 79 | warmup_t=args.warmup_epochs, 80 | cooldown_t=0, 81 | noise_range_t=noise_range, 82 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 83 | noise_std=getattr(args, 'lr_noise_std', 1.), 84 | noise_seed=getattr(args, 'seed', 42), 85 | ) 86 | elif args.sched == 'cosine_in_step': 87 | import scheduler.lr_sched as lr_scheduler 88 | 89 | return lr_scheduler, num_epochs 90 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | from distutils.command.config import config 2 | import json 3 | import os 4 | import random 5 | 6 | from torch.utils.data import Dataset 7 | import torch 8 | from PIL import Image 9 | from PIL import ImageFile 10 | ImageFile.LOAD_TRUNCATED_IMAGES = True 11 | Image.MAX_IMAGE_PIXELS = None 12 | 13 | from dataset.utils import pre_caption 14 | import os 15 | from torchvision.transforms.functional import hflip, resize 16 | 17 | import math 18 | import random 19 | from random import random as rand 20 | 21 | class DGM4_Dataset(Dataset): 22 | def __init__(self, config, ann_file, transform, max_words=30, is_train=True): 23 | 24 | self.root_dir = '../../datasets' 25 | self.ann = [] 26 | for f in ann_file: 27 | self.ann += json.load(open(f,'r')) 28 | if 'dataset_division' in config: 29 | self.ann = self.ann[:int(len(self.ann)/config['dataset_division'])] 30 | 31 | self.transform = transform 32 | self.max_words = max_words 33 | self.image_res = config['image_res'] 34 | 35 | self.is_train = is_train 36 | 37 | def __len__(self): 38 | return 
len(self.ann) 39 | 40 | def get_bbox(self, bbox): 41 | xmin, ymin, xmax, ymax = bbox 42 | w = xmax - xmin 43 | h = ymax - ymin 44 | return int(xmin), int(ymin), int(w), int(h) 45 | 46 | def __getitem__(self, index): 47 | 48 | ann = self.ann[index] 49 | img_dir = ann['image'] 50 | image_dir_all = f'{self.root_dir}/{img_dir}' 51 | 52 | try: 53 | image = Image.open(image_dir_all).convert('RGB') 54 | except Warning: 55 | raise ValueError("### Warning: fakenews_dataset Image.open") 56 | 57 | W, H = image.size 58 | has_bbox = False 59 | try: 60 | x, y, w, h = self.get_bbox(ann['fake_image_box']) 61 | has_bbox = True 62 | except: 63 | fake_image_box = torch.tensor([0, 0, 0, 0], dtype=torch.float) 64 | 65 | do_hflip = False 66 | if self.is_train: 67 | if rand() < 0.5: 68 | # flipped applied 69 | image = hflip(image) 70 | do_hflip = True 71 | 72 | image = resize(image, [self.image_res, self.image_res], interpolation=Image.BICUBIC) 73 | image = self.transform(image) 74 | 75 | if has_bbox: 76 | # flipped applied 77 | if do_hflip: 78 | x = (W - x) - w # W is w0 79 | 80 | # resize applied 81 | x = self.image_res / W * x 82 | w = self.image_res / W * w 83 | y = self.image_res / H * y 84 | h = self.image_res / H * h 85 | 86 | center_x = x + 1 / 2 * w 87 | center_y = y + 1 / 2 * h 88 | 89 | fake_image_box = torch.tensor([center_x / self.image_res, 90 | center_y / self.image_res, 91 | w / self.image_res, 92 | h / self.image_res], 93 | dtype=torch.float) 94 | 95 | label = ann['fake_cls'] 96 | caption = pre_caption(ann['text'], self.max_words) 97 | fake_text_pos = ann['fake_text_pos'] 98 | 99 | fake_text_pos_list = torch.zeros(self.max_words) 100 | 101 | for i in fake_text_pos: 102 | if i 2 | 3 |

Unleashing the Potential of Consistency Learning for Detecting and Grounding Multi-Modal Media Manipulation

4 |

Yiheng Li, Yang Yang, Zichang Tan, Huan Liu, Weihua Chen, Xu Zhou and Zhen Lei

5 |

MAIS&CASIA, UCAS, Sangfor, BJTU and Alibaba
6 | 7 | 8 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2506.05890) 9 | 10 | ## Introduction 11 | 12 | This repository is an official implementation of CSCL. 13 | 14 | ## News 15 | - [2025/6/9] The camera-ready version is released. 16 | - [2025/6/9] Code and weights are released. 17 | - [2025/2/27] CSCL is accepted by CVPR 2025🎉🎉. 18 | 19 | ## Environment Setting 20 | ``` 21 | conda create -n CSCL python=3.8 22 | conda activate CSCL 23 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/cu111/torch_stable.html 24 | pip install -r code/MultiModal-DeepFake-main/requirements.txt 25 | ``` 26 | ## Data Preparation 27 | Here are the required pre-trained models: 28 | 29 | Download meter_clip16_224_roberta_pretrain.ckpt: [link](https://drive.usercontent.google.com/download?id=1x4qm2rlYKxpYF3F_xI5ZKurTtFnndq3l&export=download&authuser=0&confirm=t&uuid=9356ef04-1b7b-444c-80be-4bf21fab8bda&at=AIrpjvOLjj-J08OdrxQf_rCxV7Zp:1739190851890) 30 | 31 | Download ViT-B-16.pt: [link](https://drive.usercontent.google.com/download?id=1GL3kOw-lmbD5abJCaaktLrODqxMllpd6&export=download&authuser=0&confirm=t&uuid=5a286816-fa87-4fd0-a75d-825ec03966e4&at=AIrpjvMiXdIVW3BRne33Y_-pvh1D:1739190843518) 32 | 33 | Download roberta-base: [link](https://huggingface.co/FacebookAI/roberta-base/tree/main) 34 | 35 | Download the DGM4 dataset: [link](https://huggingface.co/datasets/rshaojimmy/DGM4) 36 | 37 | The folder structure: 38 | ``` 39 | ./ 40 | ├── code 41 | │ └── MultiModal-Deepfake (this GitHub repo) 42 | │ ├── configs 43 | │ │ └──... 44 | │ ├── dataset 45 | │ │ └──... 46 | │ ├── models 47 | │ │ └──... 48 | │ ... 49 | │ ├── roberta-base 50 | │ ├── ViT-B-16.pt 51 | │ └── meter_clip16_224_roberta_pretrain.ckpt 52 | └── datasets 53 | └── DGM4 54 | ├── manipulation 55 | ├── origin 56 | └── metadata 57 | ``` 58 | 59 | Our pre-trained CSCL model: [link](https://drive.usercontent.google.com/download?id=1ZW4akTzcB9QjsS6FcX4zQ5l2YOjl7zNy&export=download&authuser=0&confirm=t&uuid=e8e37fa5-46fd-48bb-be4b-be765ca86059&at=AIrpjvM1Jjby7_AjinIBFS9d61TL:1739189602615) (96.34 AUC, 92.48 mAP, 84.07 IoUm, 76.62 F1) (We train on the combined train and val sets and evaluate on the test set.) 60 | 61 | Make a folder ./results/CSCL/ and put the pre-trained model in it. 62 | 63 | ## Train 64 | ``` 65 | sh train.sh 66 | ``` 67 | ## Inference 68 | 69 | Evaluation 70 | ``` 71 | sh test.sh 72 | ``` 73 | Visualization 74 | ``` 75 | use the visualize_res function in utils.py (refer to test.py for details). 76 | ``` 77 | Evaluation on text or image subset 78 | ``` 79 | refer to line 136 in test.py. 
80 | ``` 81 | ## Acknowledgements 82 | We thank these great works and open-source codebases: 83 | [DGM4](https://github.com/rshaojimmy/MultiModal-DeepFake?tab=readme-ov-file) and [METER](https://github.com/zdou0830/METER). 84 | 85 | ## Citation 86 | If you find our work useful, please give this repo a star and cite it as: 87 | ```bibtex 88 | @inproceedings{li2025unleashing, 89 | title={Unleashing the Potential of Consistency Learning for Detecting and Grounding Multi-Modal Media Manipulation}, 90 | author={Li, Yiheng and Yang, Yang and Tan, Zichang and Liu, Huan and Chen, Weihua and Zhou, Xu and Lei, Zhen}, 91 | booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference}, 92 | pages={9242--9252}, 93 | year={2025} 94 | } 95 | ``` 96 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/nadam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Optimizer 3 | 4 | 5 | class Nadam(Optimizer): 6 | """Implements Nadam algorithm (a variant of Adam based on Nesterov momentum). 7 | 8 | It has been proposed in `Incorporating Nesterov Momentum into Adam`__. 9 | 10 | Arguments: 11 | params (iterable): iterable of parameters to optimize or dicts defining 12 | parameter groups 13 | lr (float, optional): learning rate (default: 2e-3) 14 | betas (Tuple[float, float], optional): coefficients used for computing 15 | running averages of gradient and its square 16 | eps (float, optional): term added to the denominator to improve 17 | numerical stability (default: 1e-8) 18 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 19 | schedule_decay (float, optional): momentum schedule decay (default: 4e-3) 20 | 21 | __ http://cs229.stanford.edu/proj2015/054_report.pdf 22 | __ http://www.cs.toronto.edu/~fritz/absps/momentum.pdf 23 | 24 | Originally taken from: https://github.com/pytorch/pytorch/pull/1408 25 | NOTE: Has potential issues but does work well on some problems. 26 | """ 27 | 28 | def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, 29 | weight_decay=0, schedule_decay=4e-3): 30 | defaults = dict(lr=lr, betas=betas, eps=eps, 31 | weight_decay=weight_decay, schedule_decay=schedule_decay) 32 | super(Nadam, self).__init__(params, defaults) 33 | 34 | def step(self, closure=None): 35 | """Performs a single optimization step. 36 | 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | # State initialization 53 | if len(state) == 0: 54 | state['step'] = 0 55 | state['m_schedule'] = 1. 56 | state['exp_avg'] = grad.new().resize_as_(grad).zero_() 57 | state['exp_avg_sq'] = grad.new().resize_as_(grad).zero_() 58 | 59 | # Warming momentum schedule 60 | m_schedule = state['m_schedule'] 61 | schedule_decay = group['schedule_decay'] 62 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 63 | beta1, beta2 = group['betas'] 64 | eps = group['eps'] 65 | state['step'] += 1 66 | t = state['step'] 67 | 68 | if group['weight_decay'] != 0: 69 | grad = grad.add(group['weight_decay'], p.data) 70 | 71 | momentum_cache_t = beta1 * \ 72 | (1.
- 0.5 * (0.96 ** (t * schedule_decay))) 73 | momentum_cache_t_1 = beta1 * \ 74 | (1. - 0.5 * (0.96 ** ((t + 1) * schedule_decay))) 75 | m_schedule_new = m_schedule * momentum_cache_t 76 | m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1 77 | state['m_schedule'] = m_schedule_new 78 | 79 | # Decay the first and second moment running average coefficient 80 | exp_avg.mul_(beta1).add_(1. - beta1, grad) 81 | exp_avg_sq.mul_(beta2).addcmul_(1. - beta2, grad, grad) 82 | exp_avg_sq_prime = exp_avg_sq / (1. - beta2 ** t) 83 | denom = exp_avg_sq_prime.sqrt_().add_(eps) 84 | 85 | p.data.addcdiv_(-group['lr'] * (1. - momentum_cache_t) / (1. - m_schedule_new), grad, denom) 86 | p.data.addcdiv_(-group['lr'] * momentum_cache_t_1 / (1. - m_schedule_next), exp_avg, denom) 87 | 88 | return loss 89 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/lookahead.py: -------------------------------------------------------------------------------- 1 | """ Lookahead Optimizer Wrapper. 2 | Implementation modified from: https://github.com/alphadl/lookahead.pytorch 3 | Paper: `Lookahead Optimizer: k steps forward, 1 step back` - https://arxiv.org/abs/1907.08610 4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import torch 8 | from torch.optim.optimizer import Optimizer 9 | from collections import defaultdict 10 | 11 | 12 | class Lookahead(Optimizer): 13 | def __init__(self, base_optimizer, alpha=0.5, k=6): 14 | if not 0.0 <= alpha <= 1.0: 15 | raise ValueError(f'Invalid slow update rate: {alpha}') 16 | if not 1 <= k: 17 | raise ValueError(f'Invalid lookahead steps: {k}') 18 | defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) 19 | self.base_optimizer = base_optimizer 20 | self.param_groups = self.base_optimizer.param_groups 21 | self.defaults = base_optimizer.defaults 22 | self.defaults.update(defaults) 23 | self.state = defaultdict(dict) 24 | # manually add our defaults to the param groups 25 | for name, default in defaults.items(): 26 | for group in self.param_groups: 27 | group.setdefault(name, default) 28 | 29 | def update_slow(self, group): 30 | for fast_p in group["params"]: 31 | if fast_p.grad is None: 32 | continue 33 | param_state = self.state[fast_p] 34 | if 'slow_buffer' not in param_state: 35 | param_state['slow_buffer'] = torch.empty_like(fast_p.data) 36 | param_state['slow_buffer'].copy_(fast_p.data) 37 | slow = param_state['slow_buffer'] 38 | slow.add_(group['lookahead_alpha'], fast_p.data - slow) 39 | fast_p.data.copy_(slow) 40 | 41 | def sync_lookahead(self): 42 | for group in self.param_groups: 43 | self.update_slow(group) 44 | 45 | def step(self, closure=None): 46 | #assert id(self.param_groups) == id(self.base_optimizer.param_groups) 47 | loss = self.base_optimizer.step(closure) 48 | for group in self.param_groups: 49 | group['lookahead_step'] += 1 50 | if group['lookahead_step'] % group['lookahead_k'] == 0: 51 | self.update_slow(group) 52 | return loss 53 | 54 | def state_dict(self): 55 | fast_state_dict = self.base_optimizer.state_dict() 56 | slow_state = { 57 | (id(k) if isinstance(k, torch.Tensor) else k): v 58 | for k, v in self.state.items() 59 | } 60 | fast_state = fast_state_dict['state'] 61 | param_groups = fast_state_dict['param_groups'] 62 | return { 63 | 'state': fast_state, 64 | 'slow_state': slow_state, 65 | 'param_groups': param_groups, 66 | } 67 | 68 | def load_state_dict(self, state_dict): 69 | fast_state_dict = { 70 | 'state': state_dict['state'], 71 | 
'param_groups': state_dict['param_groups'], 72 | } 73 | self.base_optimizer.load_state_dict(fast_state_dict) 74 | 75 | # We want to restore the slow state, but share param_groups reference 76 | # with base_optimizer. This is a bit redundant but least code 77 | slow_state_new = False 78 | if 'slow_state' not in state_dict: 79 | print('Loading state_dict from optimizer without Lookahead applied.') 80 | state_dict['slow_state'] = defaultdict(dict) 81 | slow_state_new = True 82 | slow_state_dict = { 83 | 'state': state_dict['slow_state'], 84 | 'param_groups': state_dict['param_groups'], # this is pointless but saves code 85 | } 86 | super(Lookahead, self).load_state_dict(slow_state_dict) 87 | self.param_groups = self.base_optimizer.param_groups # make both ref same container 88 | if slow_state_new: 89 | # reapply defaults to catch missing lookahead specific ones 90 | for name, default in self.defaults.items(): 91 | for group in self.param_groups: 92 | group.setdefault(name, default) 93 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py 3 | 4 | Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 5 | Code: https://github.com/clovaai/AdamP 6 | 7 | Copyright (c) 2020-present NAVER Corp. 8 | MIT license 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.optim.optimizer import Optimizer, required 14 | import math 15 | 16 | class AdamP(Optimizer): 17 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 18 | weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False): 19 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 20 | delta=delta, wd_ratio=wd_ratio, nesterov=nesterov) 21 | super(AdamP, self).__init__(params, defaults) 22 | 23 | def _channel_view(self, x): 24 | return x.view(x.size(0), -1) 25 | 26 | def _layer_view(self, x): 27 | return x.view(1, -1) 28 | 29 | def _cosine_similarity(self, x, y, eps, view_func): 30 | x = view_func(x) 31 | y = view_func(y) 32 | 33 | x_norm = x.norm(dim=1).add_(eps) 34 | y_norm = y.norm(dim=1).add_(eps) 35 | dot = (x * y).sum(dim=1) 36 | 37 | return dot.abs() / x_norm / y_norm 38 | 39 | def _projection(self, p, grad, perturb, delta, wd_ratio, eps): 40 | wd = 1 41 | expand_size = [-1] + [1] * (len(p.shape) - 1) 42 | for view_func in [self._channel_view, self._layer_view]: 43 | 44 | cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) 45 | 46 | if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): 47 | p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) 48 | perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) 49 | wd = wd_ratio 50 | 51 | return perturb, wd 52 | 53 | return perturb, wd 54 | 55 | def step(self, closure=None): 56 | loss = None 57 | if closure is not None: 58 | loss = closure() 59 | 60 | for group in self.param_groups: 61 | for p in group['params']: 62 | if p.grad is None: 63 | continue 64 | 65 | grad = p.grad.data 66 | beta1, beta2 = group['betas'] 67 | nesterov = group['nesterov'] 68 | 69 | state = self.state[p] 70 | 71 | # State initialization 72 | if len(state) == 0: 73 | state['step'] = 0 74 | state['exp_avg'] = torch.zeros_like(p.data) 75 | state['exp_avg_sq'] = torch.zeros_like(p.data) 76 | 77 | 
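# Sketch of what follows: the standard Adam moments are computed below, and
# _projection() then removes the component of the update parallel to p
# (channel- or layer-wise) whenever grad is nearly orthogonal to p, which
# curbs the weight-norm growth analyzed in the AdamP paper.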
# Adam 78 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 79 | 80 | state['step'] += 1 81 | bias_correction1 = 1 - beta1 ** state['step'] 82 | bias_correction2 = 1 - beta2 ** state['step'] 83 | 84 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 85 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 86 | 87 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 88 | step_size = group['lr'] / bias_correction1 89 | 90 | if nesterov: 91 | perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom 92 | else: 93 | perturb = exp_avg / denom 94 | 95 | # Projection 96 | wd_ratio = 1 97 | if len(p.shape) > 1: 98 | perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], group['eps']) 99 | 100 | # Weight decay 101 | if group['weight_decay'] > 0: 102 | p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio) 103 | 104 | # Step 105 | p.data.add_(-step_size, perturb) 106 | 107 | return loss 108 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/cosine_lr.py: -------------------------------------------------------------------------------- 1 | """ Cosine Scheduler 2 | 3 | Cosine LR schedule with warmup, cycle/restarts, noise. 4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import logging 8 | import math 9 | import numpy as np 10 | import torch 11 | 12 | from .scheduler import Scheduler 13 | 14 | from pdb import set_trace as breakpoint 15 | 16 | _logger = logging.getLogger(__name__) 17 | 18 | 19 | class CosineLRScheduler(Scheduler): 20 | """ 21 | Cosine decay with restarts. 22 | This is described in the paper https://arxiv.org/abs/1608.03983. 23 | 24 | Inspiration from 25 | https://github.com/allenai/allennlp/blob/master/allennlp/training/learning_rate_schedulers/cosine.py 26 | """ 27 | 28 | def __init__(self, 29 | optimizer: torch.optim.Optimizer, 30 | t_initial: int, 31 | t_mul: float = 1., 32 | lr_min: float = 0., 33 | decay_rate: float = 1., 34 | warmup_t=0, 35 | warmup_lr_init=0, 36 | warmup_prefix=True, 37 | cycle_limit=0, 38 | t_in_epochs=True, 39 | noise_range_t=None, 40 | noise_pct=0.67, 41 | noise_std=1.0, 42 | noise_seed=42, 43 | initialize=True) -> None: 44 | super().__init__( 45 | optimizer, param_group_field="lr", 46 | noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 47 | initialize=initialize) 48 | 49 | assert t_initial > 0 50 | assert lr_min >= 0 51 | if t_initial == 1 and t_mul == 1 and decay_rate == 1: 52 | _logger.warning("Cosine annealing scheduler will have no effect on the learning " 53 | "rate since t_initial = t_mul = eta_mul = 1.") 54 | self.t_initial = t_initial 55 | self.t_mul = t_mul 56 | self.lr_min = lr_min 57 | self.decay_rate = decay_rate 58 | self.cycle_limit = cycle_limit 59 | self.warmup_t = warmup_t 60 | self.warmup_lr_init = warmup_lr_init 61 | self.warmup_prefix = warmup_prefix 62 | self.t_in_epochs = t_in_epochs 63 | if self.warmup_t: 64 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 65 | super().update_groups(self.warmup_lr_init) 66 | else: 67 | self.warmup_steps = [1 for _ in self.base_values] 68 | 69 | def _get_lr(self, t): 70 | if t < self.warmup_t: 71 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 72 | else: 73 | if self.warmup_prefix: 74 | t = t - self.warmup_t 75 | 76 | if self.t_mul != 1: 77 | i = math.floor(math.log(1 - t / self.t_initial * (1 - self.t_mul), self.t_mul)) 78 | t_i = 
self.t_mul ** i * self.t_initial 79 | t_curr = t - (1 - self.t_mul ** i) / (1 - self.t_mul) * self.t_initial 80 | else: 81 | i = t // self.t_initial 82 | t_i = self.t_initial 83 | t_curr = t - (self.t_initial * i) 84 | 85 | gamma = self.decay_rate ** i 86 | lr_min = self.lr_min * gamma 87 | lr_max_values = [v * gamma for v in self.base_values] 88 | 89 | if self.cycle_limit == 0 or (self.cycle_limit > 0 and i < self.cycle_limit): 90 | lrs = [ 91 | lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * t_curr / t_i)) for lr_max in lr_max_values 92 | ] 93 | else: 94 | lrs = [self.lr_min for _ in self.base_values] 95 | 96 | return lrs 97 | 98 | def get_epoch_values(self, epoch: int): 99 | if self.t_in_epochs: 100 | return self._get_lr(epoch) 101 | else: 102 | return None 103 | 104 | def get_update_values(self, num_updates: int): 105 | if not self.t_in_epochs: 106 | return self._get_lr(num_updates) 107 | else: 108 | return None 109 | 110 | def get_cycle_length(self, cycles=0): 111 | if not cycles: 112 | cycles = self.cycle_limit 113 | cycles = max(1, cycles) 114 | if self.t_mul == 1.0: 115 | return self.t_initial * cycles 116 | else: 117 | return int(math.floor(-self.t_initial * (self.t_mul ** cycles - 1) / (1 - self.t_mul))) 118 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/tanh_lr.py: -------------------------------------------------------------------------------- 1 | """ TanH Scheduler 2 | 3 | TanH schedule with warmup, cycle/restarts, noise. 4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import logging 8 | import math 9 | import numpy as np 10 | import torch 11 | 12 | from .scheduler import Scheduler 13 | 14 | 15 | _logger = logging.getLogger(__name__) 16 | 17 | 18 | class TanhLRScheduler(Scheduler): 19 | """ 20 | Hyperbolic-Tangent decay with restarts.
21 | This is described in the paper https://arxiv.org/abs/1806.01593 22 | """ 23 | 24 | def __init__(self, 25 | optimizer: torch.optim.Optimizer, 26 | t_initial: int, 27 | lb: float = -6., 28 | ub: float = 4., 29 | t_mul: float = 1., 30 | lr_min: float = 0., 31 | decay_rate: float = 1., 32 | warmup_t=0, 33 | warmup_lr_init=0, 34 | warmup_prefix=False, 35 | cycle_limit=0, 36 | t_in_epochs=True, 37 | noise_range_t=None, 38 | noise_pct=0.67, 39 | noise_std=1.0, 40 | noise_seed=42, 41 | initialize=True) -> None: 42 | super().__init__( 43 | optimizer, param_group_field="lr", 44 | noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 45 | initialize=initialize) 46 | 47 | assert t_initial > 0 48 | assert lr_min >= 0 49 | assert lb < ub 50 | assert cycle_limit >= 0 51 | assert warmup_t >= 0 52 | assert warmup_lr_init >= 0 53 | self.lb = lb 54 | self.ub = ub 55 | self.t_initial = t_initial 56 | self.t_mul = t_mul 57 | self.lr_min = lr_min 58 | self.decay_rate = decay_rate 59 | self.cycle_limit = cycle_limit 60 | self.warmup_t = warmup_t 61 | self.warmup_lr_init = warmup_lr_init 62 | self.warmup_prefix = warmup_prefix 63 | self.t_in_epochs = t_in_epochs 64 | if self.warmup_t: 65 | t_v = self.base_values if self.warmup_prefix else self._get_lr(self.warmup_t) 66 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in t_v] 67 | super().update_groups(self.warmup_lr_init) 68 | else: 69 | self.warmup_steps = [1 for _ in self.base_values] 70 | 71 | def _get_lr(self, t): 72 | if t < self.warmup_t: 73 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 74 | else: 75 | if self.warmup_prefix: 76 | t = t - self.warmup_t 77 | 78 | if self.t_mul != 1: 79 | i = math.floor(math.log(1 - t / self.t_initial * (1 - self.t_mul), self.t_mul)) 80 | t_i = self.t_mul ** i * self.t_initial 81 | t_curr = t - (1 - self.t_mul ** i) / (1 - self.t_mul) * self.t_initial 82 | else: 83 | i = t // self.t_initial 84 | t_i = self.t_initial 85 | t_curr = t - (self.t_initial * i) 86 | 87 | if self.cycle_limit == 0 or (self.cycle_limit > 0 and i < self.cycle_limit): 88 | gamma = self.decay_rate ** i 89 | lr_min = self.lr_min * gamma 90 | lr_max_values = [v * gamma for v in self.base_values] 91 | 92 | tr = t_curr / t_i 93 | lrs = [ 94 | lr_min + 0.5 * (lr_max - lr_min) * (1 - math.tanh(self.lb * (1. - tr) + self.ub * tr)) 95 | for lr_max in lr_max_values 96 | ] 97 | else: 98 | lrs = [self.lr_min * (self.decay_rate ** self.cycle_limit) for _ in self.base_values] 99 | return lrs 100 | 101 | def get_epoch_values(self, epoch: int): 102 | if self.t_in_epochs: 103 | return self._get_lr(epoch) 104 | else: 105 | return None 106 | 107 | def get_update_values(self, num_updates: int): 108 | if not self.t_in_epochs: 109 | return self._get_lr(num_updates) 110 | else: 111 | return None 112 | 113 | def get_cycle_length(self, cycles=0): 114 | if not cycles: 115 | cycles = self.cycle_limit 116 | cycles = max(1, cycles) 117 | if self.t_mul == 1.0: 118 | return self.t_initial * cycles 119 | else: 120 | return int(math.floor(-self.t_initial * (self.t_mul ** cycles - 1) / (1 - self.t_mul))) 121 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/plateau_lr.py: -------------------------------------------------------------------------------- 1 | """ Plateau Scheduler 2 | 3 | Adapts PyTorch plateau scheduler and allows application of noise, warmup. 
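    [annotation, not part of the original source: the wrapper below delegates the
    plateau logic to torch.optim.lr_scheduler.ReduceLROnPlateau and layers linear
    warmup and the package's shared LR-noise mechanism on top of it.]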
4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import torch 8 | 9 | from .scheduler import Scheduler 10 | 11 | 12 | class PlateauLRScheduler(Scheduler): 13 | """Decay the LR by a factor every time the validation loss plateaus.""" 14 | 15 | def __init__(self, 16 | optimizer, 17 | decay_rate=0.1, 18 | patience_t=10, 19 | verbose=True, 20 | threshold=1e-4, 21 | cooldown_t=0, 22 | warmup_t=0, 23 | warmup_lr_init=0, 24 | lr_min=0, 25 | mode='max', 26 | noise_range_t=None, 27 | noise_type='normal', 28 | noise_pct=0.67, 29 | noise_std=1.0, 30 | noise_seed=None, 31 | initialize=True, 32 | ): 33 | super().__init__(optimizer, 'lr', initialize=initialize) 34 | 35 | self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 36 | self.optimizer, 37 | patience=patience_t, 38 | factor=decay_rate, 39 | verbose=verbose, 40 | threshold=threshold, 41 | cooldown=cooldown_t, 42 | mode=mode, 43 | min_lr=lr_min 44 | ) 45 | 46 | self.noise_range = noise_range_t 47 | self.noise_pct = noise_pct 48 | self.noise_type = noise_type 49 | self.noise_std = noise_std 50 | self.noise_seed = noise_seed if noise_seed is not None else 42 51 | self.warmup_t = warmup_t 52 | self.warmup_lr_init = warmup_lr_init 53 | if self.warmup_t: 54 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 55 | super().update_groups(self.warmup_lr_init) 56 | else: 57 | self.warmup_steps = [1 for _ in self.base_values] 58 | self.restore_lr = None 59 | 60 | def state_dict(self): 61 | return { 62 | 'best': self.lr_scheduler.best, 63 | 'last_epoch': self.lr_scheduler.last_epoch, 64 | } 65 | 66 | def load_state_dict(self, state_dict): 67 | self.lr_scheduler.best = state_dict['best'] 68 | if 'last_epoch' in state_dict: 69 | self.lr_scheduler.last_epoch = state_dict['last_epoch'] 70 | 71 | # override the base class step fn completely 72 | def step(self, epoch, metric=None): 73 | if epoch <= self.warmup_t: 74 | lrs = [self.warmup_lr_init + epoch * s for s in self.warmup_steps] 75 | super().update_groups(lrs) 76 | else: 77 | if self.restore_lr is not None: 78 | # restore actual LR from before our last noise perturbation before stepping base 79 | for i, param_group in enumerate(self.optimizer.param_groups): 80 | param_group['lr'] = self.restore_lr[i] 81 | self.restore_lr = None 82 | 83 | self.lr_scheduler.step(metric, epoch) # step the base scheduler 84 | 85 | if self.noise_range is not None: 86 | if isinstance(self.noise_range, (list, tuple)): 87 | apply_noise = self.noise_range[0] <= epoch < self.noise_range[1] 88 | else: 89 | apply_noise = epoch >= self.noise_range 90 | if apply_noise: 91 | self._apply_noise(epoch) 92 | 93 | def _apply_noise(self, epoch): 94 | g = torch.Generator() 95 | g.manual_seed(self.noise_seed + epoch) 96 | if self.noise_type == 'normal': 97 | while True: 98 | # resample if noise out of percent limit, brute force but shouldn't spin much 99 | noise = torch.randn(1, generator=g).item() 100 | if abs(noise) < self.noise_pct: 101 | break 102 | else: 103 | noise = 2 * (torch.rand(1, generator=g).item() - 0.5) * self.noise_pct 104 | 105 | # apply the noise on top of previous LR, cache the old value so we can restore for normal 106 | # stepping of base scheduler 107 | restore_lr = [] 108 | for i, param_group in enumerate(self.optimizer.param_groups): 109 | old_lr = float(param_group['lr']) 110 | restore_lr.append(old_lr) 111 | new_lr = old_lr + old_lr * noise 112 | param_group['lr'] = new_lr 113 | self.restore_lr = restore_lr 114 | 
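A minimal usage sketch for the schedulers in this directory (an annotation, not part of the repository). It assumes the repo root is on PYTHONPATH so that `scheduler.cosine_lr` is importable; the tiny model and the loop are invented for illustration:

import torch
from scheduler.cosine_lr import CosineLRScheduler

model = torch.nn.Linear(8, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)

# 5 warmup epochs ramping 1e-6 -> 0.1, then a 25-epoch cosine decay to 1e-5
sched = CosineLRScheduler(opt, t_initial=25, lr_min=1e-5,
                          warmup_t=5, warmup_lr_init=1e-6)

for epoch in range(30):
    # ... one epoch of training with `opt` goes here ...
    sched.step(epoch + 1)  # end-of-epoch call computes the next epoch's LR
    print(epoch, opt.param_groups[0]["lr"])

# PlateauLRScheduler is driven by a monitored metric instead of the epoch index:
#   sched = PlateauLRScheduler(opt, decay_rate=0.5, patience_t=3, mode="min")
#   sched.step(epoch, metric=val_loss)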
-------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/scheduler.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | import torch 4 | 5 | 6 | class Scheduler: 7 | """ Parameter Scheduler Base Class 8 | A scheduler base class that can be used to schedule any optimizer parameter groups. 9 | 10 | Unlike the builtin PyTorch schedulers, this is intended to be consistently called 11 | * At the END of each epoch, before incrementing the epoch count, to calculate next epoch's value 12 | * At the END of each optimizer update, after incrementing the update count, to calculate next update's value 13 | 14 | The schedulers built on this should try to remain as stateless as possible (for simplicity). 15 | 16 | This family of schedulers is attempting to avoid the confusion of the meaning of 'last_epoch' 17 | and -1 values for special behaviour. All epoch and update counts must be tracked in the training 18 | code and explicitly passed in to the schedulers on the corresponding step or step_update call. 19 | 20 | Based on ideas from: 21 | * https://github.com/pytorch/fairseq/tree/master/fairseq/optim/lr_scheduler 22 | * https://github.com/allenai/allennlp/tree/master/allennlp/training/learning_rate_schedulers 23 | """ 24 | 25 | def __init__(self, 26 | optimizer: torch.optim.Optimizer, 27 | param_group_field: str, 28 | noise_range_t=None, 29 | noise_type='normal', 30 | noise_pct=0.67, 31 | noise_std=1.0, 32 | noise_seed=None, 33 | initialize: bool = True) -> None: 34 | self.optimizer = optimizer 35 | self.param_group_field = param_group_field 36 | self._initial_param_group_field = f"initial_{param_group_field}" 37 | if initialize: 38 | for i, group in enumerate(self.optimizer.param_groups): 39 | if param_group_field not in group: 40 | raise KeyError(f"{param_group_field} missing from param_groups[{i}]") 41 | group.setdefault(self._initial_param_group_field, group[param_group_field]) 42 | else: 43 | for i, group in enumerate(self.optimizer.param_groups): 44 | if self._initial_param_group_field not in group: 45 | raise KeyError(f"{self._initial_param_group_field} missing from param_groups[{i}]") 46 | self.base_values = [group[self._initial_param_group_field] for group in self.optimizer.param_groups] 47 | self.metric = None # any point to having this for all? 
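        # [annotation, not part of the original source] The fields below configure
        # the optional LR-noise perturbation applied by _add_noise() further down:
        # noise_range_t is either a scalar threshold or a [start, end) pair of
        # epochs/updates during which noise is applied, noise_pct bounds the noise
        # magnitude, and noise_seed makes the perturbation reproducible.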
48 | self.noise_range_t = noise_range_t 49 | self.noise_pct = noise_pct 50 | self.noise_type = noise_type 51 | self.noise_std = noise_std 52 | self.noise_seed = noise_seed if noise_seed is not None else 42 53 | self.update_groups(self.base_values) 54 | 55 | def state_dict(self) -> Dict[str, Any]: 56 | return {key: value for key, value in self.__dict__.items() if key != 'optimizer'} 57 | 58 | def load_state_dict(self, state_dict: Dict[str, Any]) -> None: 59 | self.__dict__.update(state_dict) 60 | 61 | def get_epoch_values(self, epoch: int): 62 | return None 63 | 64 | def get_update_values(self, num_updates: int): 65 | return None 66 | 67 | def step(self, epoch: int, metric: float = None) -> None: 68 | self.metric = metric 69 | values = self.get_epoch_values(epoch) 70 | if values is not None: 71 | values = self._add_noise(values, epoch) 72 | self.update_groups(values) 73 | 74 | def step_update(self, num_updates: int, metric: float = None): 75 | self.metric = metric 76 | values = self.get_update_values(num_updates) 77 | if values is not None: 78 | values = self._add_noise(values, num_updates) 79 | self.update_groups(values) 80 | 81 | def update_groups(self, values): 82 | if not isinstance(values, (list, tuple)): 83 | values = [values] * len(self.optimizer.param_groups) 84 | for param_group, value in zip(self.optimizer.param_groups, values): 85 | param_group[self.param_group_field] = value 86 | 87 | def _add_noise(self, lrs, t): 88 | if self.noise_range_t is not None: 89 | if isinstance(self.noise_range_t, (list, tuple)): 90 | apply_noise = self.noise_range_t[0] <= t < self.noise_range_t[1] 91 | else: 92 | apply_noise = t >= self.noise_range_t 93 | if apply_noise: 94 | g = torch.Generator() 95 | g.manual_seed(self.noise_seed + t) 96 | if self.noise_type == 'normal': 97 | while True: 98 | # resample if noise out of percent limit, brute force but shouldn't spin much 99 | noise = torch.randn(1, generator=g).item() 100 | if abs(noise) < self.noise_pct: 101 | break 102 | else: 103 | noise = 2 * (torch.rand(1, generator=g).item() - 0.5) * self.noise_pct 104 | lrs = [v + v * noise for v in lrs] 105 | return lrs 106 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/nvnovograd.py: -------------------------------------------------------------------------------- 1 | """ Nvidia NovoGrad Optimizer. 2 | Original impl by Nvidia from Jasper example: 3 | - https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper 4 | Paper: `Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks` 5 | - https://arxiv.org/abs/1905.11286 6 | """ 7 | 8 | import torch 9 | from torch.optim.optimizer import Optimizer 10 | import math 11 | 12 | 13 | class NvNovoGrad(Optimizer): 14 | """ 15 | Implements Novograd algorithm. 
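    [annotation, not part of the original source: unlike Adam, NovoGrad keeps a
    single scalar second moment per parameter tensor (an EMA of the squared
    gradient norm), normalizes the gradient by it before applying momentum, and
    adds weight decay to the normalized gradient; this is the layer-wise
    adaptivity that the paper title refers to.]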
16 | 17 | Args: 18 | params (iterable): iterable of parameters to optimize or dicts defining 19 | parameter groups 20 | lr (float, optional): learning rate (default: 1e-3) 21 | betas (Tuple[float, float], optional): coefficients used for computing 22 | running averages of gradient and its square (default: (0.95, 0.98)) 23 | eps (float, optional): term added to the denominator to improve 24 | numerical stability (default: 1e-8) 25 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 26 | grad_averaging: gradient averaging 27 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 28 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 29 | (default: False) 30 | """ 31 | 32 | def __init__(self, params, lr=1e-3, betas=(0.95, 0.98), eps=1e-8, 33 | weight_decay=0, grad_averaging=False, amsgrad=False): 34 | if not 0.0 <= lr: 35 | raise ValueError("Invalid learning rate: {}".format(lr)) 36 | if not 0.0 <= eps: 37 | raise ValueError("Invalid epsilon value: {}".format(eps)) 38 | if not 0.0 <= betas[0] < 1.0: 39 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 40 | if not 0.0 <= betas[1] < 1.0: 41 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 42 | defaults = dict(lr=lr, betas=betas, eps=eps, 43 | weight_decay=weight_decay, 44 | grad_averaging=grad_averaging, 45 | amsgrad=amsgrad) 46 | 47 | super(NvNovoGrad, self).__init__(params, defaults) 48 | 49 | def __setstate__(self, state): 50 | super(NvNovoGrad, self).__setstate__(state) 51 | for group in self.param_groups: 52 | group.setdefault('amsgrad', False) 53 | 54 | def step(self, closure=None): 55 | """Performs a single optimization step. 56 | 57 | Arguments: 58 | closure (callable, optional): A closure that reevaluates the model 59 | and returns the loss. 60 | """ 61 | loss = None 62 | if closure is not None: 63 | loss = closure() 64 | 65 | for group in self.param_groups: 66 | for p in group['params']: 67 | if p.grad is None: 68 | continue 69 | grad = p.grad.data 70 | if grad.is_sparse: 71 | raise RuntimeError('Sparse gradients are not supported.') 72 | amsgrad = group['amsgrad'] 73 | 74 | state = self.state[p] 75 | 76 | # State initialization 77 | if len(state) == 0: 78 | state['step'] = 0 79 | # Exponential moving average of gradient values 80 | state['exp_avg'] = torch.zeros_like(p.data) 81 | # Exponential moving average of squared gradient values 82 | state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 83 | if amsgrad: 84 | # Maintains max of all exp. moving avg. of sq. grad. values 85 | state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 86 | 87 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 88 | if amsgrad: 89 | max_exp_avg_sq = state['max_exp_avg_sq'] 90 | beta1, beta2 = group['betas'] 91 | 92 | state['step'] += 1 93 | 94 | norm = torch.sum(torch.pow(grad, 2)) 95 | 96 | if exp_avg_sq == 0: 97 | exp_avg_sq.copy_(norm) 98 | else: 99 | exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) 100 | 101 | if amsgrad: 102 | # Maintains the maximum of all 2nd moment running avg. till now 103 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 104 | # Use the max. for normalizing running avg. 
of gradient 105 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 106 | else: 107 | denom = exp_avg_sq.sqrt().add_(group['eps']) 108 | 109 | grad.div_(denom) 110 | if group['weight_decay'] != 0: 111 | grad.add_(group['weight_decay'], p.data) 112 | if group['grad_averaging']: 113 | grad.mul_(1 - beta1) 114 | exp_avg.mul_(beta1).add_(grad) 115 | 116 | p.data.add_(-group['lr'], exp_avg) 117 | 118 | return loss 119 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adamw.py: -------------------------------------------------------------------------------- 1 | """ AdamW Optimizer 2 | Impl copied from PyTorch master 3 | """ 4 | import math 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | 8 | 9 | class AdamW(Optimizer): 10 | r"""Implements AdamW algorithm. 11 | 12 | The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. 13 | The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. 14 | 15 | Arguments: 16 | params (iterable): iterable of parameters to optimize or dicts defining 17 | parameter groups 18 | lr (float, optional): learning rate (default: 1e-3) 19 | betas (Tuple[float, float], optional): coefficients used for computing 20 | running averages of gradient and its square (default: (0.9, 0.999)) 21 | eps (float, optional): term added to the denominator to improve 22 | numerical stability (default: 1e-8) 23 | weight_decay (float, optional): weight decay coefficient (default: 1e-2) 24 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 25 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 26 | (default: False) 27 | 28 | .. _Adam\: A Method for Stochastic Optimization: 29 | https://arxiv.org/abs/1412.6980 30 | .. _Decoupled Weight Decay Regularization: 31 | https://arxiv.org/abs/1711.05101 32 | .. _On the Convergence of Adam and Beyond: 33 | https://openreview.net/forum?id=ryQu7f-RZ 34 | """ 35 | 36 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 37 | weight_decay=1e-2, amsgrad=False): 38 | if not 0.0 <= lr: 39 | raise ValueError("Invalid learning rate: {}".format(lr)) 40 | if not 0.0 <= eps: 41 | raise ValueError("Invalid epsilon value: {}".format(eps)) 42 | if not 0.0 <= betas[0] < 1.0: 43 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 44 | if not 0.0 <= betas[1] < 1.0: 45 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 46 | defaults = dict(lr=lr, betas=betas, eps=eps, 47 | weight_decay=weight_decay, amsgrad=amsgrad) 48 | super(AdamW, self).__init__(params, defaults) 49 | 50 | def __setstate__(self, state): 51 | super(AdamW, self).__setstate__(state) 52 | for group in self.param_groups: 53 | group.setdefault('amsgrad', False) 54 | 55 | def step(self, closure=None): 56 | """Performs a single optimization step. 57 | 58 | Arguments: 59 | closure (callable, optional): A closure that reevaluates the model 60 | and returns the loss. 
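        [annotation, not part of the original source] Minimal usage, with
        ``model``, ``inputs`` and ``targets`` assumed to exist::

            optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
            optimizer.zero_grad()
            torch.nn.functional.cross_entropy(model(inputs), targets).backward()
            optimizer.step()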
61 | """ 62 | loss = None 63 | if closure is not None: 64 | loss = closure() 65 | 66 | for group in self.param_groups: 67 | for p in group['params']: 68 | if p.grad is None: 69 | continue 70 | 71 | # Perform stepweight decay 72 | p.data.mul_(1 - group['lr'] * group['weight_decay']) 73 | 74 | # Perform optimization step 75 | grad = p.grad.data 76 | if grad.is_sparse: 77 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 78 | amsgrad = group['amsgrad'] 79 | 80 | state = self.state[p] 81 | 82 | # State initialization 83 | if len(state) == 0: 84 | state['step'] = 0 85 | # Exponential moving average of gradient values 86 | state['exp_avg'] = torch.zeros_like(p.data) 87 | # Exponential moving average of squared gradient values 88 | state['exp_avg_sq'] = torch.zeros_like(p.data) 89 | if amsgrad: 90 | # Maintains max of all exp. moving avg. of sq. grad. values 91 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 92 | 93 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 94 | if amsgrad: 95 | max_exp_avg_sq = state['max_exp_avg_sq'] 96 | beta1, beta2 = group['betas'] 97 | 98 | state['step'] += 1 99 | bias_correction1 = 1 - beta1 ** state['step'] 100 | bias_correction2 = 1 - beta2 ** state['step'] 101 | 102 | # Decay the first and second moment running average coefficient 103 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 104 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 105 | if amsgrad: 106 | # Maintains the maximum of all 2nd moment running avg. till now 107 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 108 | # Use the max. for normalizing running avg. of gradient 109 | denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 110 | else: 111 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 112 | 113 | step_size = group['lr'] / bias_correction1 114 | 115 | p.data.addcdiv_(-step_size, exp_avg, denom) 116 | 117 | return loss 118 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/radam.py: -------------------------------------------------------------------------------- 1 | """RAdam Optimizer. 
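[annotation, not part of the original source: RAdam "rectifies" Adam's adaptive step during the first iterations, when the second-moment estimate still has intractable variance, by falling back to plain bias-corrected momentum SGD until the approximated SMA length N_sma reaches 5, then switching to a variance-corrected Adam step.]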
2 | Implementation lifted from: https://github.com/LiyuanLucasLiu/RAdam 3 | Paper: `On the Variance of the Adaptive Learning Rate and Beyond` - https://arxiv.org/abs/1908.03265 4 | """ 5 | import math 6 | import torch 7 | from torch.optim.optimizer import Optimizer, required 8 | 9 | 10 | class RAdam(Optimizer): 11 | 12 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 13 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 14 | self.buffer = [[None, None, None] for ind in range(10)] 15 | super(RAdam, self).__init__(params, defaults) 16 | 17 | def __setstate__(self, state): 18 | super(RAdam, self).__setstate__(state) 19 | 20 | def step(self, closure=None): 21 | 22 | loss = None 23 | if closure is not None: 24 | loss = closure() 25 | 26 | for group in self.param_groups: 27 | 28 | for p in group['params']: 29 | if p.grad is None: 30 | continue 31 | grad = p.grad.data.float() 32 | if grad.is_sparse: 33 | raise RuntimeError('RAdam does not support sparse gradients') 34 | 35 | p_data_fp32 = p.data.float() 36 | 37 | state = self.state[p] 38 | 39 | if len(state) == 0: 40 | state['step'] = 0 41 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 42 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 43 | else: 44 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 45 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 46 | 47 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 48 | beta1, beta2 = group['betas'] 49 | 50 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 51 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 52 | 53 | state['step'] += 1 54 | buffered = self.buffer[int(state['step'] % 10)] 55 | if state['step'] == buffered[0]: 56 | N_sma, step_size = buffered[1], buffered[2] 57 | else: 58 | buffered[0] = state['step'] 59 | beta2_t = beta2 ** state['step'] 60 | N_sma_max = 2 / (1 - beta2) - 1 61 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 62 | buffered[1] = N_sma 63 | 64 | # more conservative since it's an approximated value 65 | if N_sma >= 5: 66 | step_size = group['lr'] * math.sqrt( 67 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( 68 | N_sma_max - 2)) / (1 - beta1 ** state['step']) 69 | else: 70 | step_size = group['lr'] / (1 - beta1 ** state['step']) 71 | buffered[2] = step_size 72 | 73 | if group['weight_decay'] != 0: 74 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 75 | 76 | # more conservative since it's an approximated value 77 | if N_sma >= 5: 78 | denom = exp_avg_sq.sqrt().add_(group['eps']) 79 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 80 | else: 81 | p_data_fp32.add_(-step_size, exp_avg) 82 | 83 | p.data.copy_(p_data_fp32) 84 | 85 | return loss 86 | 87 | 88 | class PlainRAdam(Optimizer): 89 | 90 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 91 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 92 | 93 | super(PlainRAdam, self).__init__(params, defaults) 94 | 95 | def __setstate__(self, state): 96 | super(PlainRAdam, self).__setstate__(state) 97 | 98 | def step(self, closure=None): 99 | 100 | loss = None 101 | if closure is not None: 102 | loss = closure() 103 | 104 | for group in self.param_groups: 105 | 106 | for p in group['params']: 107 | if p.grad is None: 108 | continue 109 | grad = p.grad.data.float() 110 | if grad.is_sparse: 111 | raise RuntimeError('RAdam does not support sparse gradients') 112 | 113 | p_data_fp32 = 
p.data.float() 114 | 115 | state = self.state[p] 116 | 117 | if len(state) == 0: 118 | state['step'] = 0 119 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 120 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 121 | else: 122 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 123 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 124 | 125 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 126 | beta1, beta2 = group['betas'] 127 | 128 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 129 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 130 | 131 | state['step'] += 1 132 | beta2_t = beta2 ** state['step'] 133 | N_sma_max = 2 / (1 - beta2) - 1 134 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 135 | 136 | if group['weight_decay'] != 0: 137 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 138 | 139 | # more conservative since it's an approximated value 140 | if N_sma >= 5: 141 | step_size = group['lr'] * math.sqrt( 142 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( 143 | N_sma_max - 2)) / (1 - beta1 ** state['step']) 144 | denom = exp_avg_sq.sqrt().add_(group['eps']) 145 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 146 | else: 147 | step_size = group['lr'] / (1 - beta1 ** state['step']) 148 | p_data_fp32.add_(-step_size, exp_avg) 149 | 150 | p.data.copy_(p_data_fp32) 151 | 152 | return loss 153 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/rmsprop_tf.py: -------------------------------------------------------------------------------- 1 | """ RMSProp modified to behave like Tensorflow impl 2 | 3 | Originally cut & paste from PyTorch RMSProp 4 | https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py 5 | Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE 6 | 7 | Modifications Copyright 2020 Ross Wightman 8 | """ 9 | 10 | import torch 11 | from torch.optim import Optimizer 12 | 13 | 14 | class RMSpropTF(Optimizer): 15 | """Implements RMSprop algorithm (TensorFlow style epsilon) 16 | 17 | NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt 18 | and a few other modifications to closer match Tensorflow for matching hyper-params. 19 | 20 | Noteworthy changes include: 21 | 1. Epsilon applied inside square-root 22 | 2. square_avg initialized to ones 23 | 3. LR scaling of update accumulated in momentum buffer 24 | 25 | Proposed by G. Hinton in his 26 | `course `_. 27 | 28 | The centered version first appears in `Generating Sequences 29 | With Recurrent Neural Networks `_. 
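    [annotation, not part of the original source: these differences matter chiefly
    when reusing hyper-parameters tuned on TensorFlow models; in particular, eps
    inside the square root damps the update much more strongly early in training
    than PyTorch's outside-the-sqrt eps, so comparatively large TF-style eps
    values keep their intended effect here.]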
30 | 31 | Arguments: 32 | params (iterable): iterable of parameters to optimize or dicts defining 33 | parameter groups 34 | lr (float, optional): learning rate (default: 1e-2) 35 | momentum (float, optional): momentum factor (default: 0) 36 | alpha (float, optional): smoothing (decay) constant (default: 0.9) 37 | eps (float, optional): term added to the denominator to improve 38 | numerical stability (default: 1e-10) 39 | centered (bool, optional) : if ``True``, compute the centered RMSProp, 40 | the gradient is normalized by an estimation of its variance 41 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 42 | decoupled_decay (bool, optional): decoupled weight decay as per https://arxiv.org/abs/1711.05101 43 | lr_in_momentum (bool, optional): learning rate scaling is included in the momentum buffer 44 | update as per defaults in Tensorflow 45 | 46 | """ 47 | 48 | def __init__(self, params, lr=1e-2, alpha=0.9, eps=1e-10, weight_decay=0, momentum=0., centered=False, 49 | decoupled_decay=False, lr_in_momentum=True): 50 | if not 0.0 <= lr: 51 | raise ValueError("Invalid learning rate: {}".format(lr)) 52 | if not 0.0 <= eps: 53 | raise ValueError("Invalid epsilon value: {}".format(eps)) 54 | if not 0.0 <= momentum: 55 | raise ValueError("Invalid momentum value: {}".format(momentum)) 56 | if not 0.0 <= weight_decay: 57 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 58 | if not 0.0 <= alpha: 59 | raise ValueError("Invalid alpha value: {}".format(alpha)) 60 | 61 | defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay, 62 | decoupled_decay=decoupled_decay, lr_in_momentum=lr_in_momentum) 63 | super(RMSpropTF, self).__init__(params, defaults) 64 | 65 | def __setstate__(self, state): 66 | super(RMSpropTF, self).__setstate__(state) 67 | for group in self.param_groups: 68 | group.setdefault('momentum', 0) 69 | group.setdefault('centered', False) 70 | 71 | def step(self, closure=None): 72 | """Performs a single optimization step. 73 | 74 | Arguments: 75 | closure (callable, optional): A closure that reevaluates the model 76 | and returns the loss. 77 | """ 78 | loss = None 79 | if closure is not None: 80 | loss = closure() 81 | 82 | for group in self.param_groups: 83 | for p in group['params']: 84 | if p.grad is None: 85 | continue 86 | grad = p.grad.data 87 | if grad.is_sparse: 88 | raise RuntimeError('RMSprop does not support sparse gradients') 89 | state = self.state[p] 90 | 91 | # State initialization 92 | if len(state) == 0: 93 | state['step'] = 0 94 | state['square_avg'] = torch.ones_like(p.data) # PyTorch inits to zero 95 | if group['momentum'] > 0: 96 | state['momentum_buffer'] = torch.zeros_like(p.data) 97 | if group['centered']: 98 | state['grad_avg'] = torch.zeros_like(p.data) 99 | 100 | square_avg = state['square_avg'] 101 | one_minus_alpha = 1. 
- group['alpha'] 102 | 103 | state['step'] += 1 104 | 105 | if group['weight_decay'] != 0: 106 | if 'decoupled_decay' in group and group['decoupled_decay']: 107 | p.data.add_(-group['weight_decay'], p.data) 108 | else: 109 | grad = grad.add(group['weight_decay'], p.data) 110 | 111 | # Tensorflow order of ops for updating squared avg 112 | square_avg.add_(one_minus_alpha, grad.pow(2) - square_avg) 113 | # square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad) # PyTorch original 114 | 115 | if group['centered']: 116 | grad_avg = state['grad_avg'] 117 | grad_avg.add_(one_minus_alpha, grad - grad_avg) 118 | # grad_avg.mul_(alpha).add_(1 - alpha, grad) # PyTorch original 119 | avg = square_avg.addcmul(-1, grad_avg, grad_avg).add(group['eps']).sqrt_() # eps moved in sqrt 120 | else: 121 | avg = square_avg.add(group['eps']).sqrt_() # eps moved in sqrt 122 | 123 | if group['momentum'] > 0: 124 | buf = state['momentum_buffer'] 125 | # Tensorflow accumulates the LR scaling in the momentum buffer 126 | if 'lr_in_momentum' in group and group['lr_in_momentum']: 127 | buf.mul_(group['momentum']).addcdiv_(group['lr'], grad, avg) 128 | p.data.add_(-buf) 129 | else: 130 | # PyTorch scales the param update by LR 131 | buf.mul_(group['momentum']).addcdiv_(grad, avg) 132 | p.data.add_(-group['lr'], buf) 133 | else: 134 | p.data.addcdiv_(-group['lr'], grad, avg) 135 | 136 | return loss 137 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adahessian.py: -------------------------------------------------------------------------------- 1 | """ AdaHessian Optimizer 2 | 3 | Lifted from https://github.com/davda54/ada-hessian/blob/master/ada_hessian.py 4 | Originally licensed MIT, Copyright 2020, David Samuel 5 | """ 6 | import torch 7 | 8 | 9 | class Adahessian(torch.optim.Optimizer): 10 | """ 11 | Implements the AdaHessian algorithm from "ADAHESSIAN: An Adaptive Second OrderOptimizer for Machine Learning" 12 | 13 | Arguments: 14 | params (iterable): iterable of parameters to optimize or dicts defining parameter groups 15 | lr (float, optional): learning rate (default: 0.1) 16 | betas ((float, float), optional): coefficients used for computing running averages of gradient and the 17 | squared hessian trace (default: (0.9, 0.999)) 18 | eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) 19 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0.0) 20 | hessian_power (float, optional): exponent of the hessian trace (default: 1.0) 21 | update_each (int, optional): compute the hessian trace approximation only after *this* number of steps 22 | (to save time) (default: 1) 23 | n_samples (int, optional): how many times to sample `z` for the approximation of the hessian trace (default: 1) 24 | """ 25 | 26 | def __init__(self, params, lr=0.1, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, 27 | hessian_power=1.0, update_each=1, n_samples=1, avg_conv_kernel=False): 28 | if not 0.0 <= lr: 29 | raise ValueError(f"Invalid learning rate: {lr}") 30 | if not 0.0 <= eps: 31 | raise ValueError(f"Invalid epsilon value: {eps}") 32 | if not 0.0 <= betas[0] < 1.0: 33 | raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") 34 | if not 0.0 <= betas[1] < 1.0: 35 | raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") 36 | if not 0.0 <= hessian_power <= 1.0: 37 | raise ValueError(f"Invalid Hessian power value: {hessian_power}") 38 | 39 | self.n_samples = n_samples 40 | 
self.update_each = update_each 41 | self.avg_conv_kernel = avg_conv_kernel 42 | 43 | # use a separate generator that deterministically generates the same `z`s across all GPUs in case of distributed training 44 | self.seed = 2147483647 45 | self.generator = torch.Generator().manual_seed(self.seed) 46 | 47 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, hessian_power=hessian_power) 48 | super(Adahessian, self).__init__(params, defaults) 49 | 50 | for p in self.get_params(): 51 | p.hess = 0.0 52 | self.state[p]["hessian step"] = 0 53 | 54 | @property 55 | def is_second_order(self): 56 | return True 57 | 58 | def get_params(self): 59 | """ 60 | Gets all parameters in all param_groups with gradients 61 | """ 62 | 63 | return (p for group in self.param_groups for p in group['params'] if p.requires_grad) 64 | 65 | def zero_hessian(self): 66 | """ 67 | Zeros out the accumalated hessian traces. 68 | """ 69 | 70 | for p in self.get_params(): 71 | if not isinstance(p.hess, float) and self.state[p]["hessian step"] % self.update_each == 0: 72 | p.hess.zero_() 73 | 74 | @torch.no_grad() 75 | def set_hessian(self): 76 | """ 77 | Computes the Hutchinson approximation of the hessian trace and accumulates it for each trainable parameter. 78 | """ 79 | 80 | params = [] 81 | for p in filter(lambda p: p.grad is not None, self.get_params()): 82 | if self.state[p]["hessian step"] % self.update_each == 0: # compute the trace only each `update_each` step 83 | params.append(p) 84 | self.state[p]["hessian step"] += 1 85 | 86 | if len(params) == 0: 87 | return 88 | 89 | if self.generator.device != params[0].device: # hackish way of casting the generator to the right device 90 | self.generator = torch.Generator(params[0].device).manual_seed(self.seed) 91 | 92 | grads = [p.grad for p in params] 93 | 94 | for i in range(self.n_samples): 95 | # Rademacher distribution {-1.0, 1.0} 96 | zs = [torch.randint(0, 2, p.size(), generator=self.generator, device=p.device) * 2.0 - 1.0 for p in params] 97 | h_zs = torch.autograd.grad( 98 | grads, params, grad_outputs=zs, only_inputs=True, retain_graph=i < self.n_samples - 1) 99 | for h_z, z, p in zip(h_zs, zs, params): 100 | p.hess += h_z * z / self.n_samples # approximate the expected values of z*(H@z) 101 | 102 | @torch.no_grad() 103 | def step(self, closure=None): 104 | """ 105 | Performs a single optimization step. 
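        [annotation, not part of the original source: each call first refreshes the
        Hutchinson estimate of the Hessian diagonal via zero_hessian()/set_hessian()
        (Rademacher probes differentiated through the existing gradient graph), then
        applies an AdamW-style update preconditioned by
        exp_hessian_diag_sq ** hessian_power; because of the grad-of-grad, the
        preceding loss.backward() must be run with create_graph=True.]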
106 | Arguments: 107 | closure (callable, optional) -- a closure that reevaluates the model and returns the loss (default: None) 108 | """ 109 | 110 | loss = None 111 | if closure is not None: 112 | loss = closure() 113 | 114 | self.zero_hessian() 115 | self.set_hessian() 116 | 117 | for group in self.param_groups: 118 | for p in group['params']: 119 | if p.grad is None or p.hess is None: 120 | continue 121 | 122 | if self.avg_conv_kernel and p.dim() == 4: 123 | p.hess = torch.abs(p.hess).mean(dim=[2, 3], keepdim=True).expand_as(p.hess).clone() 124 | 125 | # Perform correct stepweight decay as in AdamW 126 | p.mul_(1 - group['lr'] * group['weight_decay']) 127 | 128 | state = self.state[p] 129 | 130 | # State initialization 131 | if len(state) == 1: 132 | state['step'] = 0 133 | # Exponential moving average of gradient values 134 | state['exp_avg'] = torch.zeros_like(p) 135 | # Exponential moving average of Hessian diagonal square values 136 | state['exp_hessian_diag_sq'] = torch.zeros_like(p) 137 | 138 | exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq'] 139 | beta1, beta2 = group['betas'] 140 | state['step'] += 1 141 | 142 | # Decay the first and second moment running average coefficient 143 | exp_avg.mul_(beta1).add_(p.grad, alpha=1 - beta1) 144 | exp_hessian_diag_sq.mul_(beta2).addcmul_(p.hess, p.hess, value=1 - beta2) 145 | 146 | bias_correction1 = 1 - beta1 ** state['step'] 147 | bias_correction2 = 1 - beta2 ** state['step'] 148 | 149 | k = group['hessian_power'] 150 | denom = (exp_hessian_diag_sq / bias_correction2).pow_(k / 2).add_(group['eps']) 151 | 152 | # make update 153 | step_size = group['lr'] / bias_correction1 154 | p.addcdiv_(exp_avg, denom, value=-step_size) 155 | 156 | return loss 157 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/optim_factory.py: -------------------------------------------------------------------------------- 1 | """ Optimizer Factory w/ Custom Weight Decay 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | import torch 5 | from torch import optim as optim 6 | 7 | from .adafactor import Adafactor 8 | from .adahessian import Adahessian 9 | from .adamp import AdamP 10 | from .lookahead import Lookahead 11 | from .nadam import Nadam 12 | from .novograd import NovoGrad 13 | from .nvnovograd import NvNovoGrad 14 | from .radam import RAdam 15 | from .rmsprop_tf import RMSpropTF 16 | from .sgdp import SGDP 17 | 18 | try: 19 | from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD 20 | has_apex = True 21 | except ImportError: 22 | has_apex = False 23 | 24 | 25 | def add_weight_decay(model, weight_decay=1e-5, skip_list=()): 26 | decay = [] 27 | no_decay = [] 28 | for name, param in model.named_parameters(): 29 | if not param.requires_grad: 30 | continue # frozen weights 31 | if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: 32 | no_decay.append(param) 33 | else: 34 | decay.append(param) 35 | return [ 36 | {'params': no_decay, 'weight_decay': 0.}, 37 | {'params': decay, 'weight_decay': weight_decay}] 38 | 39 | def add_weight_lr(args, model, weight_decay=1e-5, skip_list=()): 40 | decay = [] 41 | no_decay = [] 42 | 43 | lr_vis = [] 44 | lr_text = [] 45 | for name, param in model.named_parameters(): 46 | if not param.requires_grad: 47 | continue # frozen weights 48 | # if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: 49 | # no_decay.append(param) 50 | # else: 51 | # decay.append(param) 
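    # [annotation, not part of the original source] In place of the usual
    # decay/no-decay split (left commented out above), this helper builds two LR
    # groups: parameters whose name contains "text_encoder" get args.lr_text, and
    # everything else gets args.lr. add_weight_lr_img below performs the analogous
    # split for the visual branch via args.lr_img.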
52 | 53 | if "text_encoder" in name: 54 | lr_text.append(param) 55 | else: 56 | lr_vis.append(param) 57 | 58 | return [{'params': lr_vis, 'lr': args.lr}, 59 | {'params': lr_text, 'lr': args.lr_text}] 60 | 61 | def add_weight_lr_img(args, model, weight_decay=1e-5, skip_list=()): 62 | decay = [] 63 | no_decay = [] 64 | 65 | lr_vis = [] 66 | lr_text = [] 67 | for name, param in model.named_parameters(): 68 | if not param.requires_grad: 69 | continue # frozen weights 70 | # if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: 71 | # no_decay.append(param) 72 | # else: 73 | # decay.append(param) 74 | 75 | if ("visual_encoder" in name) or ("bbox_head" in name) or ("aggregator" in name) or ("cls_token_local" in name) or ("it_cross_attn" in name) or ("norm_layer_aggr" in name) or ("norm_layer_it_cross_atten" in name): 76 | lr_vis.append(param) 77 | else: 78 | lr_text.append(param) 79 | 80 | return [{'params': lr_vis, 'lr': args.lr_img}, 81 | {'params': lr_text, 'lr': args.lr}] 82 | 83 | 84 | def create_optimizer(args, model, filter_bias_and_bn=True): 85 | opt_lower = args.opt.lower() 86 | weight_decay = args.weight_decay 87 | 88 | if weight_decay and filter_bias_and_bn: 89 | skip = {} 90 | if hasattr(model, 'no_weight_decay'): 91 | skip = model.no_weight_decay() 92 | if 'lr_text' in args: 93 | parameters = add_weight_lr(args, model, weight_decay, skip) 94 | if 'lr_img' in args: 95 | parameters = add_weight_lr_img(args, model, weight_decay, skip) 96 | else: 97 | parameters = add_weight_decay(model, weight_decay, skip) 98 | weight_decay = 0. 99 | else: 100 | parameters = model.parameters() 101 | 102 | if 'fused' in opt_lower: 103 | assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' 104 | 105 | opt_args = dict(lr=args.lr, weight_decay=weight_decay) 106 | if hasattr(args, 'opt_eps') and args.opt_eps is not None: 107 | opt_args['eps'] = args.opt_eps 108 | if hasattr(args, 'opt_betas') and args.opt_betas is not None: 109 | opt_args['betas'] = args.opt_betas 110 | if hasattr(args, 'opt_args') and args.opt_args is not None: 111 | opt_args.update(args.opt_args) 112 | 113 | opt_split = opt_lower.split('_') 114 | opt_lower = opt_split[-1] 115 | if opt_lower == 'sgd' or opt_lower == 'nesterov': 116 | opt_args.pop('eps', None) 117 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) 118 | elif opt_lower == 'momentum': 119 | opt_args.pop('eps', None) 120 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) 121 | elif opt_lower == 'adam': 122 | optimizer = optim.Adam(parameters, **opt_args) 123 | elif opt_lower == 'adamw': 124 | optimizer = optim.AdamW(parameters, **opt_args) 125 | elif opt_lower == 'nadam': 126 | optimizer = Nadam(parameters, **opt_args) 127 | elif opt_lower == 'radam': 128 | optimizer = RAdam(parameters, **opt_args) 129 | elif opt_lower == 'adamp': 130 | optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) 131 | elif opt_lower == 'sgdp': 132 | optimizer = SGDP(parameters, momentum=args.momentum, nesterov=True, **opt_args) 133 | elif opt_lower == 'adadelta': 134 | optimizer = optim.Adadelta(parameters, **opt_args) 135 | elif opt_lower == 'adafactor': 136 | if not args.lr: 137 | opt_args['lr'] = None 138 | optimizer = Adafactor(parameters, **opt_args) 139 | elif opt_lower == 'adahessian': 140 | optimizer = Adahessian(parameters, **opt_args) 141 | elif opt_lower == 'rmsprop': 142 | optimizer = optim.RMSprop(parameters, alpha=0.9, 
momentum=args.momentum, **opt_args) 143 | elif opt_lower == 'rmsproptf': 144 | optimizer = RMSpropTF(parameters, alpha=0.9, momentum=args.momentum, **opt_args) 145 | elif opt_lower == 'novograd': 146 | optimizer = NovoGrad(parameters, **opt_args) 147 | elif opt_lower == 'nvnovograd': 148 | optimizer = NvNovoGrad(parameters, **opt_args) 149 | elif opt_lower == 'fusedsgd': 150 | opt_args.pop('eps', None) 151 | optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) 152 | elif opt_lower == 'fusedmomentum': 153 | opt_args.pop('eps', None) 154 | optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) 155 | elif opt_lower == 'fusedadam': 156 | optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) 157 | elif opt_lower == 'fusedadamw': 158 | optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) 159 | elif opt_lower == 'fusedlamb': 160 | optimizer = FusedLAMB(parameters, **opt_args) 161 | elif opt_lower == 'fusednovograd': 162 | opt_args.setdefault('betas', (0.95, 0.98)) 163 | optimizer = FusedNovoGrad(parameters, **opt_args) 164 | else: 165 | assert False and "Invalid optimizer" 166 | raise ValueError 167 | 168 | if len(opt_split) > 1: 169 | if opt_split[0] == 'lookahead': 170 | optimizer = Lookahead(optimizer) 171 | 172 | return optimizer 173 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | # from apex.optimizers import FusedAdam, FusedSGD 3 | # from timm.optim import AdamW 4 | import torch 5 | from torch import optim 6 | from torch.optim import lr_scheduler 7 | from torch.optim.rmsprop import RMSprop 8 | # from torch.optim.adamw import AdamW 9 | from torch.optim.lr_scheduler import MultiStepLR, CyclicLR, StepLR 10 | 11 | from tools.schedulers import ExponentialLRScheduler, PolyLR, LRStepScheduler 12 | 13 | cv2.ocl.setUseOpenCL(False) 14 | cv2.setNumThreads(0) 15 | 16 | import numpy as np 17 | from typing import Optional, List 18 | from torch import Tensor 19 | import json 20 | import os 21 | import torch.distributed as dist 22 | 23 | class AverageMeter(object): 24 | """Computes and stores the average and current value""" 25 | 26 | def __init__(self): 27 | self.reset() 28 | 29 | def reset(self): 30 | self.val = 0 31 | self.avg = 0 32 | self.sum = 0 33 | self.count = 0 34 | 35 | def update(self, val, n=1): 36 | self.val = val 37 | self.sum += val * n 38 | self.count += n 39 | self.avg = self.sum / self.count 40 | 41 | def create_optimizer(optimizer_config, model, sum_steps, master_params=None): 42 | """Creates optimizer and schedule from configuration 43 | 44 | Parameters 45 | ---------- 46 | optimizer_config : dict 47 | Dictionary containing the configuration options for the optimizer. 48 | model : Model 49 | The network model. 50 | 51 | Returns 52 | ------- 53 | optimizer : Optimizer 54 | The optimizer. 55 | scheduler : LRScheduler 56 | The learning rate scheduler. 
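    [annotation, not part of the original source: recognized optimizer_config["type"]
    values in the body below are "SGD", "Adam" and "RmsProp"; schedule types are
    "step", "clr", "multistep", "exponential", "poly", "constant" and "linear", and
    an optional "classifier_lr" entry places non-encoder parameters into their own
    learning-rate group.]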
57 | """ 58 | if optimizer_config.get("classifier_lr", -1) != -1: 59 | # Separate classifier parameters from all others 60 | net_params = [] 61 | classifier_params = [] 62 | for k, v in model.named_parameters(): 63 | if not v.requires_grad: 64 | continue 65 | if k.find("encoder") != -1: 66 | net_params.append(v) 67 | else: 68 | classifier_params.append(v) 69 | params = [ 70 | {"params": net_params}, 71 | {"params": classifier_params, "lr": optimizer_config["classifier_lr"]}, 72 | ] 73 | else: 74 | if master_params: 75 | params = master_params 76 | else: 77 | params = model.parameters() 78 | 79 | if optimizer_config["type"] == "SGD": 80 | optimizer = optim.SGD(params, 81 | lr=optimizer_config["learning_rate"], 82 | momentum=optimizer_config["momentum"], 83 | weight_decay=optimizer_config["weight_decay"], 84 | nesterov=optimizer_config["nesterov"]) 85 | elif optimizer_config["type"] == "Adam": 86 | optimizer = optim.Adam(params, 87 | lr=optimizer_config["learning_rate"], 88 | weight_decay=optimizer_config["weight_decay"]) 89 | # elif optimizer_config["type"] == "FusedAdam": 90 | # optimizer = FusedAdam(params, 91 | # lr=optimizer_config["learning_rate"], 92 | # weight_decay=optimizer_config["weight_decay"]) 93 | # elif optimizer_config["type"] == "AdamW": 94 | # optimizer = AdamW(params, 95 | # lr=optimizer_config["learning_rate"], 96 | # weight_decay=optimizer_config["weight_decay"]) 97 | elif optimizer_config["type"] == "RmsProp": 98 | optimizer = RMSprop(params, 99 | lr=optimizer_config["learning_rate"], 100 | weight_decay=optimizer_config["weight_decay"]) 101 | else: 102 | raise KeyError("unrecognized optimizer {}".format(optimizer_config["type"])) 103 | 104 | if optimizer_config["schedule"]["type"] == "step": 105 | # scheduler = LRStepScheduler(optimizer, **optimizer_config["schedule"]["params"]) 106 | scheduler = StepLR(optimizer, **optimizer_config["schedule"]["params"]) 107 | elif optimizer_config["schedule"]["type"] == "clr": 108 | scheduler = CyclicLR(optimizer, **optimizer_config["schedule"]["params"]) 109 | elif optimizer_config["schedule"]["type"] == "multistep": 110 | scheduler = MultiStepLR(optimizer, **optimizer_config["schedule"]["params"]) 111 | elif optimizer_config["schedule"]["type"] == "exponential": 112 | scheduler = ExponentialLRScheduler(optimizer, **optimizer_config["schedule"]["params"]) 113 | elif optimizer_config["schedule"]["type"] == "poly": 114 | scheduler = PolyLR(optimizer, max_iter = sum_steps) 115 | # scheduler = PolyLR(optimizer, **optimizer_config["schedule"]["params"]) 116 | elif optimizer_config["schedule"]["type"] == "constant": 117 | scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0) 118 | elif optimizer_config["schedule"]["type"] == "linear": 119 | def linear_lr(it): 120 | return it * optimizer_config["schedule"]["params"]["alpha"] + optimizer_config["schedule"]["params"]["beta"] 121 | 122 | scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr) 123 | 124 | return optimizer, scheduler 125 | 126 | 127 | 128 | 129 | def read_json(file_name): 130 | with open(file_name) as handle: 131 | out = json.load(handle) 132 | return out 133 | 134 | def nested_tensor_from_tensor_list(imgsize, tensor_list: List[Tensor]): 135 | # TODO make this more general 136 | if tensor_list[0].ndim == 3: 137 | # TODO make it support different-sized images 138 | max_size = [3, imgsize, imgsize] 139 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 140 | batch_shape = [len(tensor_list)] + max_size 141 | b, c, h, w = batch_shape 142 | 
dtype = tensor_list[0].dtype 143 | device = tensor_list[0].device 144 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 145 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 146 | for img, pad_img, m in zip(tensor_list, tensor, mask): 147 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 148 | m[: img.shape[1], :img.shape[2]] = False 149 | else: 150 | raise ValueError('not supported') 151 | return NestedTensor(tensor, mask) 152 | 153 | 154 | class NestedTensor(object): 155 | def __init__(self, tensors, mask: Optional[Tensor]): 156 | self.tensors = tensors 157 | self.mask = mask 158 | 159 | def to(self, device): 160 | cast_tensor = self.tensors.cuda(device) 161 | mask = self.mask 162 | if mask is not None: 163 | assert mask is not None 164 | cast_mask = mask.cuda(device) 165 | else: 166 | cast_mask = None 167 | return NestedTensor(cast_tensor, cast_mask) 168 | 169 | def decompose(self): 170 | return self.tensors, self.mask 171 | 172 | def __repr__(self): 173 | return str(self.tensors) 174 | 175 | 176 | def is_dist_avail_and_initialized(): 177 | if not dist.is_available(): 178 | return False 179 | if not dist.is_initialized(): 180 | return False 181 | return True 182 | 183 | 184 | def get_rank(): 185 | if not is_dist_avail_and_initialized(): 186 | return 0 187 | return dist.get_rank() 188 | 189 | 190 | def is_main_process(): 191 | return get_rank() == 0 192 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def pre_question(question,max_ques_words): 4 | question = re.sub( 5 | r"([,.'!?\"()*#:;~])", 6 | '', 7 | question.lower(), 8 | ).replace('-', ' ').replace('/', ' ') 9 | question = question.rstrip(' ') 10 | 11 | #truncate question 12 | question_words = question.split(' ') 13 | if len(question_words)>max_ques_words: 14 | question = ' '.join(question_words[:max_ques_words]) 15 | 16 | return question 17 | 18 | 19 | def pre_caption(caption,max_words): 20 | caption = re.sub( 21 | r"([,.'!?\"()*#:;~])", 22 | '', 23 | caption.lower(), 24 | ).replace('-', ' ').replace('/', ' ').replace('', 'person') 25 | 26 | caption = re.sub( 27 | r"\s{2,}", 28 | ' ', 29 | caption, 30 | ) 31 | caption = caption.rstrip('\n') 32 | caption = caption.strip(' ') 33 | 34 | #truncate caption 35 | caption_words = caption.split(' ') 36 | if len(caption_words)>max_words: 37 | caption = ' '.join(caption_words[:max_words]) 38 | 39 | return caption 40 | 41 | 42 | # from vqaTools.vqaEval import VQAEval 43 | # from refTools.evaluation.refEvaluation import RefEvaluation 44 | 45 | import json 46 | import os 47 | import numpy as np 48 | import torch 49 | import torch.distributed as dist 50 | import torch.nn.functional as F 51 | 52 | import utils 53 | from tqdm import tqdm 54 | 55 | 56 | def vqa_eval(vqa, result_file, test_ques_path): 57 | vqaRes = vqa.loadRes(result_file, test_ques_path) 58 | # create vqaEval object by taking vqa and vqaRes 59 | vqaEval = VQAEval(vqa, vqaRes, n=2) # n is precision of accuracy (number of places after decimal), default is 2 60 | # evaluate results 61 | vqaEval.evaluate() 62 | 63 | # print accuracies 64 | print("\n") 65 | print("Overall Accuracy is: %.02f\n" % (vqaEval.accuracy['overall'])) 66 | print("Per Answer Type Accuracy is the following:") 67 | for ansType in vqaEval.accuracy['perAnswerType']: 68 | print("%s : %.02f" % (ansType, 
vqaEval.accuracy['perAnswerType'][ansType])) 69 | print("\n") 70 | 71 | return vqaEval 72 | 73 | 74 | 75 | def collect_result(result, result_dir, filename, is_json=True, is_list=True): 76 | if is_json: 77 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,utils.get_rank())) 78 | final_result_file = os.path.join(result_dir, '%s.json'%filename) 79 | json.dump(result,open(result_file,'w')) 80 | else: 81 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,utils.get_rank())) 82 | final_result_file = os.path.join(result_dir, '%s.pth'%filename) 83 | torch.save(result,result_file) 84 | 85 | dist.barrier() 86 | 87 | result = None 88 | if utils.is_main_process(): 89 | # combine results from all processes 90 | if is_list: 91 | result = [] 92 | else: 93 | result = {} 94 | for rank in range(utils.get_world_size()): 95 | if is_json: 96 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,rank)) 97 | res = json.load(open(result_file,'r')) 98 | else: 99 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,rank)) 100 | res = torch.load(result_file) 101 | if is_list: 102 | result += res 103 | else: 104 | result.update(res) 105 | 106 | return result 107 | 108 | 109 | def save_result(result, result_dir, filename, is_json=True, is_list=True): 110 | if is_json: 111 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,utils.get_rank())) 112 | final_result_file = os.path.join(result_dir, '%s.json'%filename) 113 | json.dump(result,open(result_file,'w')) 114 | else: 115 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,utils.get_rank())) 116 | final_result_file = os.path.join(result_dir, '%s.pth'%filename) 117 | torch.save(result,result_file) 118 | 119 | dist.barrier() 120 | 121 | if utils.is_main_process(): 122 | # combine results from all processes 123 | if is_list: 124 | result = [] 125 | else: 126 | result = {} 127 | for rank in range(utils.get_world_size()): 128 | if is_json: 129 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,rank)) 130 | res = json.load(open(result_file,'r')) 131 | else: 132 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,rank)) 133 | res = torch.load(result_file) 134 | if is_list: 135 | result += res 136 | else: 137 | result.update(res) 138 | if is_json: 139 | json.dump(result,open(final_result_file,'w')) 140 | else: 141 | torch.save(result,final_result_file) 142 | 143 | print('result file saved to %s'%final_result_file) 144 | dist.barrier() 145 | return final_result_file 146 | 147 | 148 | 149 | def grounding_eval(results,dets,cocos,refer,alpha,mask_size=24): 150 | 151 | correct_A_d, correct_B_d, correct_val_d = 0, 0, 0 152 | correct_A, correct_B, correct_val = 0, 0, 0 153 | num_A,num_B,num_val = 0,0,0 154 | 155 | for res in tqdm(results): 156 | 157 | ref_id = res['ref_id'] 158 | ref = refer.Refs[ref_id] 159 | ref_box = refer.refToAnn[ref_id]['bbox'] 160 | image = refer.Imgs[ref['image_id']] 161 | 162 | mask = res['pred'].cuda().view(1,1,mask_size,mask_size) 163 | mask = F.interpolate(mask,size = (image['height'],image['width']), mode='bicubic').squeeze() 164 | 165 | # rank detection boxes 166 | max_score = 0 167 | for det in dets[str(ref['image_id'])]: 168 | score = mask[int(det[1]):int(det[1]+det[3]),int(det[0]):int(det[0]+det[2])] 169 | area = det[2]*det[3] 170 | score = score.sum() / area**alpha 171 | if score>max_score: 172 | pred_box = det[:4] 173 | max_score = score 174 | 175 | IoU_det = computeIoU(ref_box, pred_box) 176 | 177 | if ref['split']=='testA': 
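            # Accuracy is tallied separately per split (testA / testB / val):
            # the highest-scoring detection box counts as correct when its IoU
            # with the ground-truth box is at least 0.5 (see eval_result below).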
178 | num_A += 1 179 | if IoU_det >= 0.5: 180 | correct_A_d += 1 181 | elif ref['split']=='testB': 182 | num_B += 1 183 | if IoU_det >= 0.5: 184 | correct_B_d += 1 185 | elif ref['split']=='val': 186 | num_val += 1 187 | if IoU_det >= 0.5: 188 | correct_val_d += 1 189 | 190 | eval_result = {'val_d':correct_val_d/num_val,'testA_d':correct_A_d/num_A,'testB_d':correct_B_d/num_B} 191 | 192 | for metric, acc in eval_result.items(): 193 | print(f'{metric}: {acc:.3f}') 194 | 195 | return eval_result 196 | 197 | 198 | 199 | # IoU function 200 | def computeIoU(box1, box2): 201 | # each box is of [x1, y1, w, h] 202 | inter_x1 = max(box1[0], box2[0]) 203 | inter_y1 = max(box1[1], box2[1]) 204 | inter_x2 = min(box1[0]+box1[2]-1, box2[0]+box2[2]-1) 205 | inter_y2 = min(box1[1]+box1[3]-1, box2[1]+box2[3]-1) 206 | 207 | if inter_x1 < inter_x2 and inter_y1 < inter_y2: 208 | inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1) 209 | else: 210 | inter = 0 211 | union = box1[2]*box1[3] + box2[2]*box2[3] - inter 212 | return float(inter)/union 213 | 214 | 215 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adafactor.py: -------------------------------------------------------------------------------- 1 | """ Adafactor Optimizer 2 | 3 | Lifted from https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py 4 | 5 | Original header/copyright below. 6 | 7 | """ 8 | # Copyright (c) Facebook, Inc. and its affiliates. 9 | # 10 | # This source code is licensed under the MIT license found in the 11 | # LICENSE file in the root directory of this source tree. 12 | import torch 13 | import math 14 | 15 | 16 | class Adafactor(torch.optim.Optimizer): 17 | """Implements Adafactor algorithm. 18 | This implementation is based on: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` 19 | (see https://arxiv.org/abs/1804.04235) 20 | 21 | Note that this optimizer internally adjusts the learning rate depending on the 22 | *scale_parameter*, *relative_step* and *warmup_init* options. 23 | 24 | To use a manual (external) learning rate schedule you should set `scale_parameter=False` and 25 | `relative_step=False`. 
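    In this implementation `relative_step` is not a constructor argument: it is
    inferred as ``lr is None``, so passing an explicit `lr` is what disables the
    relative-step schedule. A minimal sketch (illustrative only; `model` stands
    for any torch.nn.Module):

        optimizer = Adafactor(model.parameters(), lr=1e-3, scale_parameter=False)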
26 | 27 | Arguments: 28 | params (iterable): iterable of parameters to optimize or dicts defining parameter groups 29 | lr (float, optional): external learning rate (default: None) 30 | eps (tuple[float, float]): regularization constants for square gradient 31 | and parameter scale respectively (default: (1e-30, 1e-3)) 32 | clip_threshold (float): threshold of root mean square of final gradient update (default: 1.0) 33 | decay_rate (float): coefficient used to compute running averages of square gradient (default: -0.8) 34 | beta1 (float): coefficient used for computing running averages of gradient (default: None) 35 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 36 | scale_parameter (bool): if True, learning rate is scaled by root mean square of parameter (default: True) 37 | relative_step (bool): if True, time-dependent learning rate is computed 38 | instead of external learning rate (default: True) 39 | warmup_init (bool): time-dependent learning rate computation depends on 40 | whether warm-up initialization is being used (default: False) 41 | """ 42 | 43 | def __init__(self, params, lr=None, eps=1e-30, eps_scale=1e-3, clip_threshold=1.0, 44 | decay_rate=-0.8, betas=None, weight_decay=0.0, scale_parameter=True, warmup_init=False): 45 | relative_step = lr is None 46 | if warmup_init and not relative_step: 47 | raise ValueError('warmup_init requires relative_step=True') 48 | 49 | beta1 = None if betas is None else betas[0] # make it compat with standard betas arg 50 | defaults = dict(lr=lr, eps=eps, eps_scale=eps_scale, clip_threshold=clip_threshold, decay_rate=decay_rate, 51 | beta1=beta1, weight_decay=weight_decay, scale_parameter=scale_parameter, 52 | relative_step=relative_step, warmup_init=warmup_init) 53 | super(Adafactor, self).__init__(params, defaults) 54 | 55 | @staticmethod 56 | def _get_lr(param_group, param_state): 57 | if param_group['relative_step']: 58 | min_step = 1e-6 * param_state['step'] if param_group['warmup_init'] else 1e-2 59 | lr_t = min(min_step, 1.0 / math.sqrt(param_state['step'])) 60 | param_scale = 1.0 61 | if param_group['scale_parameter']: 62 | param_scale = max(param_group['eps_scale'], param_state['RMS']) 63 | param_group['lr'] = lr_t * param_scale 64 | return param_group['lr'] 65 | 66 | @staticmethod 67 | def _get_options(param_group, param_shape): 68 | factored = len(param_shape) >= 2 69 | use_first_moment = param_group['beta1'] is not None 70 | return factored, use_first_moment 71 | 72 | @staticmethod 73 | def _rms(tensor): 74 | return tensor.norm(2) / (tensor.numel() ** 0.5) 75 | 76 | def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col): 77 | r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1) 78 | c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt() 79 | return torch.mul(r_factor, c_factor) 80 | 81 | def step(self, closure=None): 82 | """Performs a single optimization step. 83 | Arguments: 84 | closure (callable, optional): A closure that reevaluates the model and returns the loss. 
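        Example (illustrative sketch; `model`, `inputs`, `targets` and
        `loss_fn` are placeholders, not names defined by this module):

            optimizer.zero_grad()
            loss = loss_fn(model(inputs), targets)
            loss.backward()
            optimizer.step()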
85 | """ 86 | loss = None 87 | if closure is not None: 88 | loss = closure() 89 | 90 | for group in self.param_groups: 91 | for p in group['params']: 92 | if p.grad is None: 93 | continue 94 | grad = p.grad.data 95 | if grad.dtype in {torch.float16, torch.bfloat16}: 96 | grad = grad.float() 97 | if grad.is_sparse: 98 | raise RuntimeError('Adafactor does not support sparse gradients.') 99 | 100 | state = self.state[p] 101 | grad_shape = grad.shape 102 | 103 | factored, use_first_moment = self._get_options(group, grad_shape) 104 | # State Initialization 105 | if len(state) == 0: 106 | state['step'] = 0 107 | 108 | if use_first_moment: 109 | # Exponential moving average of gradient values 110 | state['exp_avg'] = torch.zeros_like(grad) 111 | if factored: 112 | state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1]).to(grad) 113 | state['exp_avg_sq_col'] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad) 114 | else: 115 | state['exp_avg_sq'] = torch.zeros_like(grad) 116 | 117 | state['RMS'] = 0 118 | else: 119 | if use_first_moment: 120 | state['exp_avg'] = state['exp_avg'].to(grad) 121 | if factored: 122 | state['exp_avg_sq_row'] = state['exp_avg_sq_row'].to(grad) 123 | state['exp_avg_sq_col'] = state['exp_avg_sq_col'].to(grad) 124 | else: 125 | state['exp_avg_sq'] = state['exp_avg_sq'].to(grad) 126 | 127 | p_data_fp32 = p.data 128 | if p.data.dtype in {torch.float16, torch.bfloat16}: 129 | p_data_fp32 = p_data_fp32.float() 130 | 131 | state['step'] += 1 132 | state['RMS'] = self._rms(p_data_fp32) 133 | lr_t = self._get_lr(group, state) 134 | 135 | beta2t = 1.0 - math.pow(state['step'], group['decay_rate']) 136 | update = grad ** 2 + group['eps'] 137 | if factored: 138 | exp_avg_sq_row = state['exp_avg_sq_row'] 139 | exp_avg_sq_col = state['exp_avg_sq_col'] 140 | 141 | exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1)) 142 | exp_avg_sq_col.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-2)) 143 | #exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=1.0 - beta2t) # pytorch 1.6+ 144 | #exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=1.0 - beta2t) 145 | 146 | # Approximation of exponential moving average of square of gradient 147 | update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col) 148 | update.mul_(grad) 149 | else: 150 | exp_avg_sq = state['exp_avg_sq'] 151 | 152 | exp_avg_sq.mul_(beta2t).add_(1.0 - beta2t, update) 153 | #exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t) # pytorch 1.6+ 154 | update = exp_avg_sq.rsqrt().mul_(grad) 155 | 156 | update.div_((self._rms(update) / group['clip_threshold']).clamp_(min=1.0)) 157 | update.mul_(lr_t) 158 | 159 | if use_first_moment: 160 | exp_avg = state['exp_avg'] 161 | exp_avg.mul_(group["beta1"]).add_(1 - group["beta1"], update) 162 | #exp_avg.mul_(group['beta1']).add_(update, alpha=1 - group['beta1']) # pytorch 1.6+ 163 | update = exp_avg 164 | 165 | if group['weight_decay'] != 0: 166 | p_data_fp32.add_(-group["weight_decay"] * lr_t, p_data_fp32) 167 | #p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * lr_t) # pytorch 1.6+ 168 | 169 | p_data_fp32.add_(-update) 170 | 171 | if p.data.dtype in {torch.float16, torch.bfloat16}: 172 | p.data.copy_(p_data_fp32) 173 | 174 | return loss -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | """ 3 | This file contains primitives for multi-gpu communication. 4 | This is useful when doing distributed training. 5 | """ 6 | 7 | import functools 8 | import logging 9 | import numpy as np 10 | import pickle 11 | import torch 12 | import torch.distributed as dist 13 | 14 | import torch 15 | 16 | _LOCAL_PROCESS_GROUP = None 17 | """ 18 | A torch process group which only includes processes that on the same machine as the current process. 19 | This variable is set when processes are spawned by `launch()` in "engine/launch.py". 20 | """ 21 | 22 | 23 | def get_world_size() -> int: 24 | if not dist.is_available(): 25 | return 1 26 | if not dist.is_initialized(): 27 | return 1 28 | return dist.get_world_size() 29 | 30 | 31 | def get_rank() -> int: 32 | if not dist.is_available(): 33 | return 0 34 | if not dist.is_initialized(): 35 | return 0 36 | return dist.get_rank() 37 | 38 | 39 | def get_local_rank() -> int: 40 | """ 41 | Returns: 42 | The rank of the current process within the local (per-machine) process group. 43 | """ 44 | if not dist.is_available(): 45 | return 0 46 | if not dist.is_initialized(): 47 | return 0 48 | assert _LOCAL_PROCESS_GROUP is not None 49 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 50 | 51 | 52 | def get_local_size() -> int: 53 | """ 54 | Returns: 55 | The size of the per-machine process group, 56 | i.e. the number of processes per machine. 57 | """ 58 | if not dist.is_available(): 59 | return 1 60 | if not dist.is_initialized(): 61 | return 1 62 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 63 | 64 | 65 | def is_main_process() -> bool: 66 | return get_rank() == 0 67 | 68 | 69 | def synchronize(): 70 | """ 71 | Helper function to synchronize (barrier) among all processes when 72 | using distributed training 73 | """ 74 | if not dist.is_available(): 75 | return 76 | if not dist.is_initialized(): 77 | return 78 | world_size = dist.get_world_size() 79 | if world_size == 1: 80 | return 81 | dist.barrier() 82 | 83 | 84 | @functools.lru_cache() 85 | def _get_global_gloo_group(): 86 | """ 87 | Return a process group based on gloo backend, containing all the ranks 88 | The result is cached. 89 | """ 90 | if dist.get_backend() == "nccl": 91 | return dist.new_group(backend="gloo") 92 | else: 93 | return dist.group.WORLD 94 | 95 | 96 | def _serialize_to_tensor(data, group): 97 | backend = dist.get_backend(group) 98 | assert backend in ["gloo", "nccl"] 99 | device = torch.device("cpu" if backend == "gloo" else "cuda") 100 | 101 | buffer = pickle.dumps(data) 102 | if len(buffer) > 1024 ** 3: 103 | logger = logging.getLogger(__name__) 104 | logger.warning( 105 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( 106 | get_rank(), len(buffer) / (1024 ** 3), device 107 | ) 108 | ) 109 | storage = torch.ByteStorage.from_buffer(buffer) 110 | tensor = torch.ByteTensor(storage).to(device=device) 111 | return tensor 112 | 113 | 114 | def _pad_to_largest_tensor(tensor, group): 115 | """ 116 | Returns: 117 | list[int]: size of the tensor, on each rank 118 | Tensor: padded tensor that has the max size 119 | """ 120 | world_size = dist.get_world_size(group=group) 121 | assert ( 122 | world_size >= 1 123 | ), "comm.gather/all_gather must be called from ranks within the given group!" 
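    # all_gather can only exchange same-shaped tensors, so each rank first
    # shares its byte count and then pads its payload up to the global maximum.
    # Illustrative example: if two ranks hold 10- and 7-byte buffers,
    # size_list becomes [10, 7] and the 7-byte tensor is padded to 10 bytes;
    # callers recover the original payload later via tobytes()[:size].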
124 | local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) 125 | size_list = [ 126 | torch.zeros([1], dtype=torch.int64, device=tensor.device) 127 | for _ in range(world_size) 128 | ] 129 | dist.all_gather(size_list, local_size, group=group) 130 | size_list = [int(size.item()) for size in size_list] 131 | 132 | max_size = max(size_list) 133 | 134 | # we pad the tensor because torch all_gather does not support 135 | # gathering tensors of different shapes 136 | if local_size != max_size: 137 | padding = torch.zeros( 138 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device 139 | ) 140 | tensor = torch.cat((tensor, padding), dim=0) 141 | return size_list, tensor 142 | 143 | 144 | def all_gather(data, group=None): 145 | """ 146 | Run all_gather on arbitrary picklable data (not necessarily tensors). 147 | 148 | Args: 149 | data: any picklable object 150 | group: a torch process group. By default, will use a group which 151 | contains all ranks on gloo backend. 152 | 153 | Returns: 154 | list[data]: list of data gathered from each rank 155 | """ 156 | if get_world_size() == 1: 157 | return [data] 158 | if group is None: 159 | group = _get_global_gloo_group() 160 | if dist.get_world_size(group) == 1: 161 | return [data] 162 | 163 | tensor = _serialize_to_tensor(data, group) 164 | 165 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 166 | max_size = max(size_list) 167 | 168 | # receiving Tensor from all ranks 169 | tensor_list = [ 170 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 171 | for _ in size_list 172 | ] 173 | dist.all_gather(tensor_list, tensor, group=group) 174 | 175 | data_list = [] 176 | for size, tensor in zip(size_list, tensor_list): 177 | buffer = tensor.cpu().numpy().tobytes()[:size] 178 | data_list.append(pickle.loads(buffer)) 179 | 180 | return data_list 181 | 182 | 183 | def gather(data, dst=0, group=None): 184 | """ 185 | Run gather on arbitrary picklable data (not necessarily tensors). 186 | 187 | Args: 188 | data: any picklable object 189 | dst (int): destination rank 190 | group: a torch process group. By default, will use a group which 191 | contains all ranks on gloo backend. 192 | 193 | Returns: 194 | list[data]: on dst, a list of data gathered from each rank. Otherwise, 195 | an empty list. 196 | """ 197 | if get_world_size() == 1: 198 | return [data] 199 | if group is None: 200 | group = _get_global_gloo_group() 201 | if dist.get_world_size(group=group) == 1: 202 | return [data] 203 | rank = dist.get_rank(group=group) 204 | 205 | tensor = _serialize_to_tensor(data, group) 206 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 207 | 208 | # receiving Tensor from all ranks 209 | if rank == dst: 210 | max_size = max(size_list) 211 | tensor_list = [ 212 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 213 | for _ in size_list 214 | ] 215 | dist.gather(tensor, tensor_list, dst=dst, group=group) 216 | 217 | data_list = [] 218 | for size, tensor in zip(size_list, tensor_list): 219 | buffer = tensor.cpu().numpy().tobytes()[:size] 220 | data_list.append(pickle.loads(buffer)) 221 | return data_list 222 | else: 223 | dist.gather(tensor, [], dst=dst, group=group) 224 | return [] 225 | 226 | 227 | def shared_random_seed(): 228 | """ 229 | Returns: 230 | int: a random number that is the same across all workers. 231 | If workers need a shared RNG, they can use this shared seed to 232 | create one. 233 | 234 | All workers must call this function, otherwise it will deadlock. 
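    Illustrative use (hypothetical caller; `np` is already imported above):

        seed = shared_random_seed()
        rng = np.random.RandomState(seed)  # identical stream on every worker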
235 | """ 236 | ints = np.random.randint(2 ** 31) 237 | all_ints = all_gather(ints) 238 | return all_ints[0] 239 | 240 | 241 | def reduce_dict(input_dict, average=True): 242 | """ 243 | Reduce the values in the dictionary from all processes so that process with rank 244 | 0 has the reduced results. 245 | 246 | Args: 247 | input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor. 248 | average (bool): whether to do average or sum 249 | 250 | Returns: 251 | a dict with the same keys as input_dict, after reduction. 252 | """ 253 | world_size = get_world_size() 254 | if world_size < 2: 255 | return input_dict 256 | with torch.no_grad(): 257 | names = [] 258 | values = [] 259 | # sort the keys so that they are consistent across processes 260 | for k in sorted(input_dict.keys()): 261 | names.append(k) 262 | values.append(input_dict[k]) 263 | values = torch.stack(values, dim=0) 264 | dist.reduce(values, dst=0) 265 | if dist.get_rank() == 0 and average: 266 | # only main process gets accumulated, so only divide by 267 | # world_size in this case 268 | values /= world_size 269 | reduced_dict = {k: v for k, v in zip(names, values)} 270 | return reduced_dict 271 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/randaugment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | ## aug functions 6 | def identity_func(img): 7 | return img 8 | 9 | 10 | def autocontrast_func(img, cutoff=0): 11 | ''' 12 | same output as PIL.ImageOps.autocontrast 13 | ''' 14 | n_bins = 256 15 | 16 | def tune_channel(ch): 17 | n = ch.size 18 | cut = cutoff * n // 100 19 | if cut == 0: 20 | high, low = ch.max(), ch.min() 21 | else: 22 | hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) 23 | low = np.argwhere(np.cumsum(hist) > cut) 24 | low = 0 if low.shape[0] == 0 else low[0] 25 | high = np.argwhere(np.cumsum(hist[::-1]) > cut) 26 | high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - high[0] 27 | if high <= low: 28 | table = np.arange(n_bins) 29 | else: 30 | scale = (n_bins - 1) / (high - low) 31 | offset = -low * scale 32 | table = np.arange(n_bins) * scale + offset 33 | table[table < 0] = 0 34 | table[table > n_bins - 1] = n_bins - 1 35 | table = table.clip(0, 255).astype(np.uint8) 36 | return table[ch] 37 | 38 | channels = [tune_channel(ch) for ch in cv2.split(img)] 39 | out = cv2.merge(channels) 40 | return out 41 | 42 | 43 | def equalize_func(img): 44 | ''' 45 | same output as PIL.ImageOps.equalize 46 | PIL's implementation is different from cv2.equalize 47 | ''' 48 | n_bins = 256 49 | 50 | def tune_channel(ch): 51 | hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) 52 | non_zero_hist = hist[hist != 0].reshape(-1) 53 | step = np.sum(non_zero_hist[:-1]) // (n_bins - 1) 54 | if step == 0: return ch 55 | n = np.empty_like(hist) 56 | n[0] = step // 2 57 | n[1:] = hist[:-1] 58 | table = (np.cumsum(n) // step).clip(0, 255).astype(np.uint8) 59 | return table[ch] 60 | 61 | channels = [tune_channel(ch) for ch in cv2.split(img)] 62 | out = cv2.merge(channels) 63 | return out 64 | 65 | 66 | def rotate_func(img, degree, fill=(0, 0, 0)): 67 | ''' 68 | like PIL, rotate by degree, not radians 69 | ''' 70 | H, W = img.shape[0], img.shape[1] 71 | center = W / 2, H / 2 72 | M = cv2.getRotationMatrix2D(center, degree, 1) 73 | out = cv2.warpAffine(img, M, (W, H), borderValue=fill) 74 | return out 75 | 76 | 77 | def solarize_func(img, 
thresh=128):
78 |     '''
79 |         same output as PIL.ImageOps.solarize
80 |     '''
81 |     table = np.array([el if el < thresh else 255 - el for el in range(256)])
82 |     table = table.clip(0, 255).astype(np.uint8)
83 |     out = table[img]
84 |     return out
85 | 
86 | 
87 | def color_func(img, factor):
88 |     '''
89 |         same output as PIL.ImageEnhance.Color
90 |     '''
91 |     ## implementation according to PIL definition, quite slow
92 |     # degenerate = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[:, :, np.newaxis]
93 |     # out = blend(degenerate, img, factor)
94 |     # M = (
95 |     #     np.eye(3) * factor
96 |     #     + np.float32([0.114, 0.587, 0.299]).reshape(3, 1) * (1. - factor)
97 |     # )[np.newaxis, np.newaxis, :]
98 |     M = (
99 |         np.float32([
100 |             [0.886, -0.114, -0.114],
101 |             [-0.587, 0.413, -0.587],
102 |             [-0.299, -0.299, 0.701]]) * factor
103 |         + np.float32([[0.114], [0.587], [0.299]])
104 |     )
105 |     out = np.matmul(img, M).clip(0, 255).astype(np.uint8)
106 |     return out
107 | 
108 | 
109 | def contrast_func(img, factor):
110 |     """
111 |         same output as PIL.ImageEnhance.Contrast
112 |     """
113 |     mean = np.sum(np.mean(img, axis=(0, 1)) * np.array([0.114, 0.587, 0.299]))
114 |     table = np.array([(
115 |         el - mean) * factor + mean
116 |         for el in range(256)
117 |     ]).clip(0, 255).astype(np.uint8)
118 |     out = table[img]
119 |     return out
120 | 
121 | 
122 | def brightness_func(img, factor):
123 |     '''
124 |         same output as PIL.ImageEnhance.Brightness
125 |     '''
126 |     table = (np.arange(256, dtype=np.float32) * factor).clip(0, 255).astype(np.uint8)
127 |     out = table[img]
128 |     return out
129 | 
130 | 
131 | def sharpness_func(img, factor):
132 |     '''
133 |     The differences between this result and PIL's are all on the 4 boundaries;
134 |     the center areas are the same
135 |     '''
136 |     kernel = np.ones((3, 3), dtype=np.float32)
137 |     kernel[1][1] = 5
138 |     kernel /= 13
139 |     degenerate = cv2.filter2D(img, -1, kernel)
140 |     if factor == 0.0:
141 |         out = degenerate
142 |     elif factor == 1.0:
143 |         out = img
144 |     else:
145 |         out = img.astype(np.float32)
146 |         degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :]
147 |         out[1:-1, 1:-1, :] = degenerate + factor * (out[1:-1, 1:-1, :] - degenerate)
148 |         out = out.astype(np.uint8)
149 |     return out
150 | 
151 | 
152 | def shear_x_func(img, factor, fill=(0, 0, 0)):
153 |     H, W = img.shape[0], img.shape[1]
154 |     M = np.float32([[1, factor, 0], [0, 1, 0]])
155 |     out = cv2.warpAffine(img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8)
156 |     return out
157 | 
158 | 
159 | def translate_x_func(img, offset, fill=(0, 0, 0)):
160 |     '''
161 |         same output as PIL.Image.transform
162 |     '''
163 |     H, W = img.shape[0], img.shape[1]
164 |     M = np.float32([[1, 0, -offset], [0, 1, 0]])
165 |     out = cv2.warpAffine(img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8)
166 |     return out
167 | 
168 | 
169 | def translate_y_func(img, offset, fill=(0, 0, 0)):
170 |     '''
171 |         same output as PIL.Image.transform
172 |     '''
173 |     H, W = img.shape[0], img.shape[1]
174 |     M = np.float32([[1, 0, 0], [0, 1, -offset]])
175 |     out = cv2.warpAffine(img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8)
176 |     return out
177 | 
178 | 
179 | def posterize_func(img, bits):
180 |     '''
181 |         same output as PIL.ImageOps.posterize
182 |     '''
183 |     out = np.bitwise_and(img, np.uint8(255 << (8 - bits)))
184 |     return out
185 | 
186 | 
187 | def shear_y_func(img, factor, fill=(0, 0, 0)):
188 |     H, W = img.shape[0], img.shape[1]
189 |     M = np.float32([[1, 0, 0], [factor, 1, 0]])
190 |     out = cv2.warpAffine(img, M, (W, H),
borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8) 191 | return out 192 | 193 | 194 | def cutout_func(img, pad_size, replace=(0, 0, 0)): 195 | replace = np.array(replace, dtype=np.uint8) 196 | H, W = img.shape[0], img.shape[1] 197 | rh, rw = np.random.random(2) 198 | pad_size = pad_size // 2 199 | ch, cw = int(rh * H), int(rw * W) 200 | x1, x2 = max(ch - pad_size, 0), min(ch + pad_size, H) 201 | y1, y2 = max(cw - pad_size, 0), min(cw + pad_size, W) 202 | out = img.copy() 203 | out[x1:x2, y1:y2, :] = replace 204 | return out 205 | 206 | 207 | ### level to args 208 | def enhance_level_to_args(MAX_LEVEL): 209 | def level_to_args(level): 210 | return ((level / MAX_LEVEL) * 1.8 + 0.1,) 211 | return level_to_args 212 | 213 | 214 | def shear_level_to_args(MAX_LEVEL, replace_value): 215 | def level_to_args(level): 216 | level = (level / MAX_LEVEL) * 0.3 217 | if np.random.random() > 0.5: level = -level 218 | return (level, replace_value) 219 | 220 | return level_to_args 221 | 222 | 223 | def translate_level_to_args(translate_const, MAX_LEVEL, replace_value): 224 | def level_to_args(level): 225 | level = (level / MAX_LEVEL) * float(translate_const) 226 | if np.random.random() > 0.5: level = -level 227 | return (level, replace_value) 228 | 229 | return level_to_args 230 | 231 | 232 | def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value): 233 | def level_to_args(level): 234 | level = int((level / MAX_LEVEL) * cutout_const) 235 | return (level, replace_value) 236 | 237 | return level_to_args 238 | 239 | 240 | def solarize_level_to_args(MAX_LEVEL): 241 | def level_to_args(level): 242 | level = int((level / MAX_LEVEL) * 256) 243 | return (level, ) 244 | return level_to_args 245 | 246 | 247 | def none_level_to_args(level): 248 | return () 249 | 250 | 251 | def posterize_level_to_args(MAX_LEVEL): 252 | def level_to_args(level): 253 | level = int((level / MAX_LEVEL) * 4) 254 | return (level, ) 255 | return level_to_args 256 | 257 | 258 | def rotate_level_to_args(MAX_LEVEL, replace_value): 259 | def level_to_args(level): 260 | level = (level / MAX_LEVEL) * 30 261 | if np.random.random() < 0.5: 262 | level = -level 263 | return (level, replace_value) 264 | 265 | return level_to_args 266 | 267 | 268 | func_dict = { 269 | 'Identity': identity_func, 270 | 'AutoContrast': autocontrast_func, 271 | 'Equalize': equalize_func, 272 | 'Rotate': rotate_func, 273 | 'Solarize': solarize_func, 274 | 'Color': color_func, 275 | 'Contrast': contrast_func, 276 | 'Brightness': brightness_func, 277 | 'Sharpness': sharpness_func, 278 | 'ShearX': shear_x_func, 279 | 'TranslateX': translate_x_func, 280 | 'TranslateY': translate_y_func, 281 | 'Posterize': posterize_func, 282 | 'ShearY': shear_y_func, 283 | } 284 | 285 | translate_const = 10 286 | MAX_LEVEL = 10 287 | replace_value = (128, 128, 128) 288 | arg_dict = { 289 | 'Identity': none_level_to_args, 290 | 'AutoContrast': none_level_to_args, 291 | 'Equalize': none_level_to_args, 292 | 'Rotate': rotate_level_to_args(MAX_LEVEL, replace_value), 293 | 'Solarize': solarize_level_to_args(MAX_LEVEL), 294 | 'Color': enhance_level_to_args(MAX_LEVEL), 295 | 'Contrast': enhance_level_to_args(MAX_LEVEL), 296 | 'Brightness': enhance_level_to_args(MAX_LEVEL), 297 | 'Sharpness': enhance_level_to_args(MAX_LEVEL), 298 | 'ShearX': shear_level_to_args(MAX_LEVEL, replace_value), 299 | 'TranslateX': translate_level_to_args( 300 | translate_const, MAX_LEVEL, replace_value 301 | ), 302 | 'TranslateY': translate_level_to_args( 303 | translate_const, MAX_LEVEL, replace_value 304 | ), 
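    # Each entry maps an op name to a function that turns an integer level in
    # [0, MAX_LEVEL] into that op's positional args, e.g. at level 10 Rotate
    # draws +/-30 degrees and ShearX/ShearY draw a factor of +/-0.3.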
305 | 'Posterize': posterize_level_to_args(MAX_LEVEL), 306 | 'ShearY': shear_level_to_args(MAX_LEVEL, replace_value), 307 | } 308 | 309 | 310 | class RandomAugment(object): 311 | 312 | def __init__(self, N=2, M=10, isPIL=False, augs=[]): 313 | self.N = N 314 | self.M = M 315 | self.isPIL = isPIL 316 | if augs: 317 | self.augs = augs 318 | else: 319 | self.augs = list(arg_dict.keys()) 320 | 321 | def get_random_ops(self): 322 | sampled_ops = np.random.choice(self.augs, self.N) 323 | return [(op, 0.5, self.M) for op in sampled_ops] 324 | 325 | def __call__(self, img): 326 | if self.isPIL: 327 | img = np.array(img) 328 | ops = self.get_random_ops() 329 | for name, prob, level in ops: 330 | if np.random.random() > prob: 331 | continue 332 | args = arg_dict[name](level) 333 | img = func_dict[name](img, *args) 334 | return img 335 | 336 | 337 | if __name__ == '__main__': 338 | a = RandomAugment() 339 | img = np.random.randn(32, 32, 3) 340 | a(img) -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/clip_model.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import Tuple, Union 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn 8 | 9 | 10 | class LayerNorm(nn.LayerNorm): 11 | """Subclass torch's LayerNorm to handle fp16.""" 12 | 13 | def forward(self, x: torch.Tensor): 14 | orig_type = x.dtype 15 | ret = super().forward(x.type(torch.float32)) 16 | return ret.type(orig_type) 17 | 18 | 19 | class QuickGELU(nn.Module): 20 | def forward(self, x: torch.Tensor): 21 | return x * torch.sigmoid(1.702 * x) 22 | 23 | 24 | class ResidualAttentionBlock(nn.Module): 25 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): 26 | super().__init__() 27 | 28 | self.attn = nn.MultiheadAttention(d_model, n_head) 29 | self.ln_1 = LayerNorm(d_model) 30 | self.mlp = nn.Sequential(OrderedDict([ 31 | ("c_fc", nn.Linear(d_model, d_model * 4)), 32 | ("gelu", QuickGELU()), 33 | ("c_proj", nn.Linear(d_model * 4, d_model)) 34 | ])) 35 | self.ln_2 = LayerNorm(d_model) 36 | self.attn_mask = attn_mask 37 | 38 | def attention(self, x: torch.Tensor, x_mask:torch.Tensor): 39 | if x_mask is not None: 40 | x_mask = x_mask.to(dtype=torch.bool, device=x.device) 41 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 42 | return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask, key_padding_mask=x_mask)[0] 43 | 44 | def forward(self, x: torch.Tensor, x_mask:torch.Tensor=None): 45 | x = x + self.attention(self.ln_1(x), x_mask) 46 | x = x + self.mlp(self.ln_2(x)) 47 | return x 48 | 49 | 50 | class Transformer(nn.Module): 51 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None): 52 | super().__init__() 53 | self.width = width 54 | self.layers = layers 55 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers-1)]) 56 | 57 | def forward(self, x: torch.Tensor, x_mask: torch.Tensor=None): 58 | for block in self.resblocks: 59 | x = block(x, x_mask) 60 | return x 61 | 62 | 63 | class VisualTransformer(nn.Module): 64 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int, resolution_after: int): 65 | super().__init__() 66 | self.input_resolution = input_resolution 67 | self.output_dim = 
output_dim 68 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) 69 | 70 | scale = width ** -0.5 71 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 72 | self.positional_embedding = nn.Parameter(scale * torch.randn((resolution_after // patch_size) ** 2 + 1, width)) 73 | self.ln_pre = LayerNorm(width) 74 | 75 | self.transformer = Transformer(width, layers, heads) 76 | self.ln_post = LayerNorm(width) 77 | 78 | def forward(self, x: torch.Tensor, x_mask): 79 | x = self.conv1(x) # shape = [*, width, grid, grid] 80 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 81 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 82 | t=self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) 83 | x = torch.cat([t, x], dim=1) # shape = [*, grid ** 2 + 1, width] 84 | x = x + self.positional_embedding.to(x.dtype) 85 | x = self.ln_pre(x) 86 | 87 | x = x.permute(1, 0, 2) # NLD -> LND 88 | x = self.transformer(x, x_mask) 89 | x = x.permute(1, 0, 2) # LND -> NLD 90 | 91 | x = self.ln_post(x) 92 | 93 | return x 94 | 95 | 96 | class CLIP(nn.Module): 97 | def __init__(self, 98 | embed_dim: int, 99 | # vision 100 | image_resolution: int, 101 | vision_layers: Union[Tuple[int, int, int, int], int], 102 | vision_width: int, 103 | vision_patch_size: int, 104 | # text 105 | context_length: int, 106 | vocab_size: int, 107 | transformer_width: int, 108 | transformer_heads: int, 109 | transformer_layers: int, 110 | resolution_after=224, 111 | ): 112 | super().__init__() 113 | 114 | self.context_length = context_length 115 | 116 | vision_heads = vision_width // 64 117 | self.visual = VisualTransformer( 118 | input_resolution=image_resolution, 119 | patch_size=vision_patch_size, 120 | width=vision_width, 121 | layers=vision_layers, 122 | heads=vision_heads, 123 | output_dim=embed_dim, 124 | resolution_after=resolution_after, 125 | ) 126 | 127 | self.vocab_size = vocab_size 128 | self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) 129 | self.ln_final = LayerNorm(transformer_width) 130 | 131 | self.initialize_parameters() 132 | 133 | def initialize_parameters(self): 134 | nn.init.normal_(self.positional_embedding, std=0.01) 135 | 136 | proj_std = (self.visual.transformer.width ** -0.5) * ((2 * self.visual.transformer.layers) ** -0.5) 137 | attn_std = self.visual.transformer.width ** -0.5 138 | fc_std = (2 * self.visual.transformer.width) ** -0.5 139 | for block in self.visual.transformer.resblocks: 140 | nn.init.normal_(block.attn.in_proj_weight, std=attn_std) 141 | nn.init.normal_(block.attn.out_proj.weight, std=proj_std) 142 | nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) 143 | nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) 144 | 145 | @property 146 | def dtype(self): 147 | return self.visual.conv1.weight.dtype 148 | 149 | def forward(self, image, image_mask=None): 150 | return self.visual(image.type(self.dtype), image_mask) 151 | 152 | 153 | _MODELS = { 154 | "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", 155 | "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", 156 | } 157 | import os 158 | import hashlib 159 | import urllib 160 | from tqdm import tqdm 161 | import warnings 162 | def _download(url: str, root: str = 
os.path.expanduser("~/.cache/clip")):
163 |     os.makedirs(root, exist_ok=True)
164 |     filename = os.path.basename(url)
165 | 
166 |     expected_sha256 = url.split("/")[-2]
167 |     download_target = os.path.join(root, filename)
168 | 
169 |     if os.path.exists(download_target) and not os.path.isfile(download_target):
170 |         raise RuntimeError(f"{download_target} exists and is not a regular file")
171 | 
172 |     if os.path.isfile(download_target):
173 |         if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
174 |             return download_target
175 |         else:
176 |             warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
177 | 
178 |     with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
179 |         with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
180 |             while True:
181 |                 buffer = source.read(8192)
182 |                 if not buffer:
183 |                     break
184 | 
185 |                 output.write(buffer)
186 |                 loop.update(len(buffer))
187 | 
188 |     if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
189 |         raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not match")
190 | 
191 |     return download_target
192 | 
193 | def adapt_position_encoding(model, patch_size=32, after=384,
194 |                             suffix='visual.positional_embedding'):
195 |     keys = [k for k in model if k.endswith(suffix)]
196 |     assert len(keys) == 1
197 |     key = keys[0]
198 |     origin_pos_embed = model[key]
199 |     origin_dim2 = False
200 |     if len(origin_pos_embed.shape) == 2:
201 |         origin_dim2 = True
202 |         origin_pos_embed = origin_pos_embed.unsqueeze(0)
203 |     grid_before = int(np.sqrt(origin_pos_embed.shape[1] - 1))
204 |     before = int(grid_before*patch_size)
205 |     assert (before % patch_size) == 0
206 |     grid_after = after // patch_size
207 |     assert (after % patch_size) == 0
208 |     embed_dim = origin_pos_embed.shape[-1]
209 | 
210 |     pos_embed = origin_pos_embed[0, 1:, :].reshape((grid_before, grid_before, embed_dim))
211 |     new_size = (grid_after, grid_after)
212 |     pos_embed = torch.nn.functional.interpolate(pos_embed.permute((2, 0, 1)).unsqueeze(0), size=new_size, mode='bicubic')
213 |     pos_embed = pos_embed.squeeze(0).permute((1, 2, 0)).reshape((-1, embed_dim))
214 |     pos_embed = torch.cat((origin_pos_embed[0, 0:1, :], pos_embed), dim=0).unsqueeze(0)
215 |     assert pos_embed.shape == (1, grid_after * grid_after + 1, embed_dim)
216 |     if origin_dim2:
217 |         assert pos_embed.shape[0] == 1
218 |         pos_embed = pos_embed.squeeze(0)
219 |     model[key] = pos_embed
220 |     return model
221 | 
222 | 
223 | def build_model(name, resolution_after=224):
224 | 
225 |     # if name in _MODELS:
226 |     #     model_path = _download(_MODELS[name])
227 |     # elif os.path.isfile(name):
228 |     #     model_path = name
229 |     # else:
230 |     #     raise RuntimeError(f"Model {name} not found; available models = {available_models()}"
231 |     #     )
232 |     model_path = 'ViT-B-16.pt'
233 |     try:
234 |         model = torch.jit.load(model_path, map_location="cpu")
235 |         state_dict = None
236 |     except RuntimeError:
237 |         # not a TorchScript archive; fall back to loading a plain state dict
238 |         warnings.warn(f"File {model_path} is not a JIT archive.
Loading as a state dict instead") 239 | jit = False 240 | state_dict = torch.load(model_path, map_location="cpu") 241 | state_dict = state_dict or model.state_dict() 242 | vit = "visual.proj" in state_dict 243 | 244 | vision_width = state_dict["visual.conv1.weight"].shape[0] 245 | vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]) 246 | vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] 247 | grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) 248 | image_resolution = vision_patch_size * grid_size 249 | 250 | embed_dim = state_dict["text_projection"].shape[1] 251 | context_length = state_dict["positional_embedding"].shape[0] 252 | vocab_size = state_dict["token_embedding.weight"].shape[0] 253 | transformer_width = state_dict["ln_final.weight"].shape[0] 254 | transformer_heads = transformer_width // 64 255 | transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks"))) 256 | 257 | model = CLIP( 258 | embed_dim, 259 | image_resolution, vision_layers, vision_width, vision_patch_size, 260 | context_length, vocab_size, transformer_width, transformer_heads, transformer_layers, 261 | resolution_after, 262 | ) 263 | 264 | for key in ["input_resolution", "context_length", "vocab_size"]: 265 | if key in state_dict: 266 | del state_dict[key] 267 | 268 | model_dict = model.state_dict() 269 | pretrained_dict = state_dict 270 | if resolution_after != image_resolution: 271 | pretrained_dict = adapt_position_encoding(pretrained_dict, after=resolution_after, patch_size=vision_patch_size) 272 | # 1. filter out unnecessary keys 273 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 274 | # 2. overwrite entries in the existing state dict 275 | model_dict.update(pretrained_dict) 276 | # 3. 
load the new state dict 277 | model.load_state_dict(model_dict) 278 | return model 279 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/multilabel_metrics.py: -------------------------------------------------------------------------------- 1 | import math 2 | from urllib.request import urlretrieve 3 | import torch 4 | import numpy as np 5 | import pdb 6 | 7 | def get_multi_label(label, image): 8 | multi_label = torch.zeros([len(label), 4], dtype=torch.long).to(image.device) 9 | # origin cls = [0, 0, 0, 0] 10 | real_label_pos = np.where(np.array(label) == 'orig')[0].tolist() 11 | multi_label[real_label_pos,:] = torch.tensor([0, 0, 0, 0]).to(image.device) 12 | # face_swap cls = [1, 0, 0, 0] 13 | pos = np.where(np.array(label) == 'face_swap')[0].tolist() 14 | multi_label[pos,:] = torch.tensor([1, 0, 0, 0]).to(image.device) 15 | # face_attribute cls = [0, 1, 0, 0] 16 | pos = np.where(np.array(label) == 'face_attribute')[0].tolist() 17 | multi_label[pos,:] = torch.tensor([0, 1, 0, 0]).to(image.device) 18 | # text_swap cls = [0, 0, 1, 0] 19 | pos = np.where(np.array(label) == 'text_swap')[0].tolist() 20 | multi_label[pos,:] = torch.tensor([0, 0, 1, 0]).to(image.device) 21 | # text_attribute cls = [0, 0, 0, 1] 22 | pos = np.where(np.array(label) == 'text_attribute')[0].tolist() 23 | multi_label[pos,:] = torch.tensor([0, 0, 0, 1]).to(image.device) 24 | # face_swap&text_swap cls = [1, 0, 1, 0] 25 | pos = np.where(np.array(label) == 'face_swap&text_swap')[0].tolist() 26 | multi_label[pos,:] = torch.tensor([1, 0, 1, 0]).to(image.device) 27 | # face_swap&text_attribute cls = [1, 0, 0, 1] 28 | pos = np.where(np.array(label) == 'face_swap&text_attribute')[0].tolist() 29 | multi_label[pos,:] = torch.tensor([1, 0, 0, 1]).to(image.device) 30 | # face_attribute&text_swap cls = [0, 1, 1, 0] 31 | pos = np.where(np.array(label) == 'face_attribute&text_swap')[0].tolist() 32 | multi_label[pos,:] = torch.tensor([0, 1, 1, 0]).to(image.device) 33 | # face_attribute&text_attribute cls = [0, 1, 0, 1] 34 | pos = np.where(np.array(label) == 'face_attribute&text_attribute')[0].tolist() 35 | multi_label[pos,:] = torch.tensor([0, 1, 0, 1]).to(image.device) 36 | 37 | return multi_label, real_label_pos 38 | 39 | 40 | def get_multi_label_TS(label, image): 41 | TS_pos = [] 42 | 43 | multi_label = torch.zeros([len(label), 4], dtype=torch.long).to(image.device) 44 | # origin cls = [0, 0, 0, 0] 45 | real_label_pos = np.where(np.array(label) == 'orig')[0].tolist() 46 | multi_label[real_label_pos,:] = torch.tensor([0, 0, 0, 0]).to(image.device) 47 | # face_swap cls = [1, 0, 0, 0] 48 | pos = np.where(np.array(label) == 'face_swap')[0].tolist() 49 | multi_label[pos,:] = torch.tensor([1, 0, 0, 0]).to(image.device) 50 | # face_attribute cls = [0, 1, 0, 0] 51 | pos = np.where(np.array(label) == 'face_attribute')[0].tolist() 52 | multi_label[pos,:] = torch.tensor([0, 1, 0, 0]).to(image.device) 53 | # text_swap cls = [0, 0, 1, 0] 54 | pos = np.where(np.array(label) == 'text_swap')[0].tolist() 55 | multi_label[pos,:] = torch.tensor([0, 0, 1, 0]).to(image.device) 56 | TS_pos.extend(pos) 57 | # text_attribute cls = [0, 0, 0, 1] 58 | pos = np.where(np.array(label) == 'text_attribute')[0].tolist() 59 | multi_label[pos,:] = torch.tensor([0, 0, 0, 1]).to(image.device) 60 | # face_swap&text_swap cls = [1, 0, 1, 0] 61 | pos = np.where(np.array(label) == 'face_swap&text_swap')[0].tolist() 62 | multi_label[pos,:] = torch.tensor([1, 0, 1, 0]).to(image.device) 63 | 
TS_pos.extend(pos) 64 | # face_swap&text_attribute cls = [1, 0, 0, 1] 65 | pos = np.where(np.array(label) == 'face_swap&text_attribute')[0].tolist() 66 | multi_label[pos,:] = torch.tensor([1, 0, 0, 1]).to(image.device) 67 | # face_attribute&text_swap cls = [0, 1, 1, 0] 68 | pos = np.where(np.array(label) == 'face_attribute&text_swap')[0].tolist() 69 | multi_label[pos,:] = torch.tensor([0, 1, 1, 0]).to(image.device) 70 | TS_pos.extend(pos) 71 | # face_attribute&text_attribute cls = [0, 1, 0, 1] 72 | pos = np.where(np.array(label) == 'face_attribute&text_attribute')[0].tolist() 73 | multi_label[pos,:] = torch.tensor([0, 1, 0, 1]).to(image.device) 74 | 75 | return multi_label, real_label_pos, TS_pos 76 | 77 | # def get_multi_label(label, image): 78 | # multi_label = torch.zeros([len(label), 3], dtype=torch.long).to(image.device) 79 | # # origin cls = [0, 0, 0] 80 | # real_label_pos = np.where(np.array(label) == 'orig')[0].tolist() 81 | # multi_label[real_label_pos,:] = torch.tensor([0, 0, 0]).to(image.device) 82 | # # face_swap cls = [1, 0, 0] 83 | # pos = np.where(np.array(label) == 'face_swap')[0].tolist() 84 | # multi_label[pos,:] = torch.tensor([1, 0, 0]).to(image.device) 85 | # # face_attribute cls = [0, 1, 0] 86 | # pos = np.where(np.array(label) == 'face_attribute')[0].tolist() 87 | # multi_label[pos,:] = torch.tensor([0, 1, 0]).to(image.device) 88 | # # text_attribute cls = [0, 0, 1] 89 | # pos = np.where(np.array(label) == 'text_attribute')[0].tolist() 90 | # multi_label[pos,:] = torch.tensor([0, 0, 1]).to(image.device) 91 | # # face_swap&text_attribute cls = [1, 0, 1] 92 | # pos = np.where(np.array(label) == 'face_swap&text_attribute')[0].tolist() 93 | # multi_label[pos,:] = torch.tensor([1, 0, 1]).to(image.device) 94 | # # face_attribute&text_attribute cls = [0, 1, 1] 95 | # pos = np.where(np.array(label) == 'face_attribute&text_attribute')[0].tolist() 96 | # multi_label[pos,:] = torch.tensor([0, 1, 1]).to(image.device) 97 | 98 | # return multi_label, real_label_pos 99 | 100 | 101 | 102 | class AveragePrecisionMeter(object): 103 | """ 104 | The APMeter measures the average precision per class. 105 | The APMeter is designed to operate on `NxK` Tensors `output` and 106 | `target`, and optionally a `Nx1` Tensor weight where (1) the `output` 107 | contains model output scores for `N` examples and `K` classes that ought to 108 | be higher when the model is more convinced that the example should be 109 | positively labeled, and smaller when the model believes the example should 110 | be negatively labeled (for instance, the output of a sigmoid function); (2) 111 | the `target` contains only values 0 (for negative examples) and 1 112 | (for positive examples); and (3) the `weight` ( > 0) represents weight for 113 | each sample. 114 | """ 115 | 116 | def __init__(self, difficult_examples=False): 117 | super(AveragePrecisionMeter, self).__init__() 118 | self.reset() 119 | self.difficult_examples = difficult_examples 120 | 121 | def reset(self): 122 | """Resets the meter with empty member variables""" 123 | self.scores = torch.FloatTensor(torch.FloatStorage()) 124 | self.targets = torch.LongTensor(torch.LongStorage()) 125 | 126 | def add(self, output, target): 127 | """ 128 | Args: 129 | output (Tensor): NxK tensor that for each of the N examples 130 | indicates the probability of the example belonging to each of 131 | the K classes, according to the model. 
The probabilities should 132 | sum to one over all classes 133 | target (Tensor): binary NxK tensort that encodes which of the K 134 | classes are associated with the N-th input 135 | (eg: a row [0, 1, 0, 1] indicates that the example is 136 | associated with classes 2 and 4) 137 | weight (optional, Tensor): Nx1 tensor representing the weight for 138 | each example (each weight > 0) 139 | """ 140 | if not torch.is_tensor(output): 141 | output = torch.from_numpy(output) 142 | if not torch.is_tensor(target): 143 | target = torch.from_numpy(target) 144 | 145 | if output.dim() == 1: 146 | output = output.view(-1, 1) 147 | else: 148 | assert output.dim() == 2, \ 149 | 'wrong output size (should be 1D or 2D with one column \ 150 | per class)' 151 | if target.dim() == 1: 152 | target = target.view(-1, 1) 153 | else: 154 | assert target.dim() == 2, \ 155 | 'wrong target size (should be 1D or 2D with one column \ 156 | per class)' 157 | if self.scores.numel() > 0: 158 | assert target.size(1) == self.targets.size(1), \ 159 | 'dimensions for output should match previously added examples.' 160 | 161 | # make sure storage is of sufficient size 162 | if self.scores.storage().size() < self.scores.numel() + output.numel(): 163 | new_size = math.ceil(self.scores.storage().size() * 1.5) 164 | self.scores.storage().resize_(int(new_size + output.numel())) 165 | self.targets.storage().resize_(int(new_size + output.numel())) 166 | 167 | # store scores and targets 168 | offset = self.scores.size(0) if self.scores.dim() > 0 else 0 169 | self.scores.resize_(offset + output.size(0), output.size(1)) 170 | self.targets.resize_(offset + target.size(0), target.size(1)) 171 | self.scores.narrow(0, offset, output.size(0)).copy_(output) 172 | self.targets.narrow(0, offset, target.size(0)).copy_(target) 173 | 174 | def value(self): 175 | """Returns the model's average precision for each class 176 | Return: 177 | ap (FloatTensor): 1xK tensor, with avg precision for each class k 178 | """ 179 | 180 | if self.scores.numel() == 0: 181 | return 0 182 | ap = torch.zeros(self.scores.size(1)) 183 | rg = torch.arange(1, self.scores.size(0)).float() 184 | # compute average precision for each class 185 | for k in range(self.scores.size(1)): 186 | # sort scores 187 | scores = self.scores[:, k] 188 | targets = self.targets[:, k] 189 | # compute average precision 190 | ap[k] = AveragePrecisionMeter.average_precision(scores, targets, self.difficult_examples) 191 | return ap 192 | 193 | @staticmethod 194 | def average_precision(output, target, difficult_examples=True): 195 | 196 | # sort examples 197 | sorted, indices = torch.sort(output, dim=0, descending=True) 198 | 199 | # Computes prec@i 200 | pos_count = 0. 201 | total_count = 0. 202 | precision_at_i = 0. 
203 |         for i in indices:
204 |             label = target[i]
205 |             if difficult_examples and label == 0:
206 |                 continue
207 |             if label == 1:
208 |                 pos_count += 1
209 |             total_count += 1
210 |             if label == 1:
211 |                 precision_at_i += pos_count / total_count
212 |         precision_at_i /= pos_count
213 |         return precision_at_i
214 | 
215 |     def overall(self):
216 |         if self.scores.numel() == 0:
217 |             return 0
218 |         scores = self.scores.cpu().numpy()
219 |         targets = self.targets.cpu().numpy()
220 |         targets[targets == -1] = 0
221 |         return self.evaluation(scores, targets)
222 | 
223 |     def overall_topk(self, k):
224 |         targets = self.targets.cpu().numpy()
225 |         targets[targets == -1] = 0
226 |         n, c = self.scores.size()
227 |         scores = np.zeros((n, c)) - 1
228 |         index = self.scores.topk(k, 1, True, True)[1].cpu().numpy()
229 |         tmp = self.scores.cpu().numpy()
230 |         for i in range(n):
231 |             for ind in index[i]:
232 |                 scores[i, ind] = 1 if tmp[i, ind] >= 0 else -1
233 |         return self.evaluation(scores, targets)
234 | 
235 | 
236 |     def evaluation(self, scores_, targets_):
237 |         n, n_class = scores_.shape
238 |         Nc, Np, Ng = np.zeros(n_class), np.zeros(n_class), np.zeros(n_class)
239 |         for k in range(n_class):
240 |             scores = scores_[:, k]
241 |             targets = targets_[:, k]
242 |             targets[targets == -1] = 0
243 |             Ng[k] = np.sum(targets == 1)
244 |             Np[k] = np.sum(scores >= 0)
245 |             Nc[k] = np.sum(targets * (scores >= 0))
246 |         Np[Np == 0] = 1
247 |         OP = np.sum(Nc) / np.sum(Np)
248 |         OR = np.sum(Nc) / np.sum(Ng)
249 |         OF1 = (2 * OP * OR) / (OP + OR)
250 | 
251 |         CP = np.sum(Nc / Np) / n_class
252 |         CR = np.sum(Nc / Ng) / n_class
253 |         CF1 = (2 * CP * CR) / (CP + CR)
254 |         return OP, OR, OF1, CP, CR, CF1
255 | 
256 | 
257 | 
--------------------------------------------------------------------------------
/code/MultiModal-DeepFake-main/models/METER/meter_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import random
3 | 
4 | from transformers.optimization import AdamW
5 | from transformers import (
6 |     get_polynomial_decay_schedule_with_warmup,
7 |     get_cosine_schedule_with_warmup,
8 | )
9 | from .dist_utils import all_gather
10 | from .objectives import compute_irtr_recall
11 | from ..gadgets.my_metrics import Accuracy, VQAScore, Scalar
12 | 
13 | 
14 | def set_metrics(pl_module):
15 |     for split in ["train", "val"]:
16 |         for k, v in pl_module.hparams.config["loss_names"].items():
17 |             if v <= 0:
18 |                 continue
19 |             if k == "vqa":
20 |                 setattr(pl_module, f"{split}_vqa_score", VQAScore())
21 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
22 |             elif k == "nlvr2":
23 |                 if split == "train":
24 |                     setattr(pl_module, f"train_{k}_accuracy", Accuracy())
25 |                     setattr(pl_module, f"train_{k}_loss", Scalar())
26 |                 else:
27 |                     setattr(pl_module, f"dev_{k}_accuracy", Accuracy())
28 |                     setattr(pl_module, f"dev_{k}_loss", Scalar())
29 |                     setattr(pl_module, f"test_{k}_accuracy", Accuracy())
30 |                     setattr(pl_module, f"test_{k}_loss", Scalar())
31 |             elif k == "snli":
32 |                 if split == "train":
33 |                     setattr(pl_module, f"train_{k}_accuracy", Accuracy())
34 |                     setattr(pl_module, f"train_{k}_loss", Scalar())
35 |                 else:
36 |                     setattr(pl_module, f"dev_{k}_accuracy", Accuracy())
37 |                     setattr(pl_module, f"dev_{k}_loss", Scalar())
38 |                     setattr(pl_module, f"test_{k}_accuracy", Accuracy())
39 |                     setattr(pl_module, f"test_{k}_loss", Scalar())
40 |             elif k == "irtr":
41 |                 setattr(pl_module, f"{split}_irtr_loss", Scalar())
42 |             elif k == "mppd" or k == "mpfr":
43 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
44 |             elif k == "itm":
45 |                 setattr(pl_module, f"{split}_{k}_accuracy", Accuracy())
46 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
47 |             else:
48 |                 setattr(pl_module, f"{split}_{k}_accuracy", Accuracy())
49 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
50 | 
51 | 
52 | def epoch_wrapup(pl_module):
53 |     phase = "train" if pl_module.training else "val"
54 |     the_metric = 0
55 | 
56 |     if pl_module.hparams.config["get_recall_metric"] and not pl_module.training:
57 |         (ir_r1, ir_r5, ir_r10, tr_r1, tr_r5, tr_r10) = compute_irtr_recall(pl_module)
58 |         print((ir_r1, ir_r5, ir_r10, tr_r1, tr_r5, tr_r10), pl_module.global_step)
59 |         pl_module.logger.experiment.add_scalar(
60 |             "recalls/ir_r1", ir_r1, pl_module.global_step
61 |         )
62 |         pl_module.logger.experiment.add_scalar(
63 |             "recalls/ir_r5", ir_r5, pl_module.global_step
64 |         )
65 |         pl_module.logger.experiment.add_scalar(
66 |             "recalls/ir_r10", ir_r10, pl_module.global_step
67 |         )
68 |         pl_module.logger.experiment.add_scalar(
69 |             "recalls/tr_r1", tr_r1, pl_module.global_step
70 |         )
71 |         pl_module.logger.experiment.add_scalar(
72 |             "recalls/tr_r5", tr_r5, pl_module.global_step
73 |         )
74 |         pl_module.logger.experiment.add_scalar(
75 |             "recalls/tr_r10", tr_r10, pl_module.global_step
76 |         )
77 |         the_metric += ir_r1.item() + tr_r1.item()
78 | 
79 |     for loss_name, v in pl_module.hparams.config["loss_names"].items():
80 |         if v <= 0:
81 |             continue
82 | 
83 |         value = 0
84 | 
85 |         if loss_name == "vqa":
86 |             value = getattr(pl_module, f"{phase}_{loss_name}_score").compute()
87 |             pl_module.log(f"{loss_name}/{phase}/score_epoch", value)
88 |             getattr(pl_module, f"{phase}_{loss_name}_score").reset()
89 |             pl_module.log(
90 |                 f"{loss_name}/{phase}/loss_epoch",
91 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
92 |             )
93 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
94 |         elif loss_name == "nlvr2" or loss_name == 'snli':
95 |             if phase == "train":
96 |                 value = getattr(pl_module, f"train_{loss_name}_accuracy").compute()
97 |                 pl_module.log(f"{loss_name}/train/accuracy_epoch", value)
98 |                 getattr(pl_module, f"train_{loss_name}_accuracy").reset()
99 |                 pl_module.log(
100 |                     f"{loss_name}/train/loss_epoch",
101 |                     getattr(pl_module, f"train_{loss_name}_loss").compute(),
102 |                 )
103 |                 getattr(pl_module, f"train_{loss_name}_loss").reset()
104 |             else:
105 |                 value = getattr(pl_module, f"test_{loss_name}_accuracy").compute()
106 |                 pl_module.log(f"{loss_name}/test/accuracy_epoch", value)
107 |                 getattr(pl_module, f"test_{loss_name}_accuracy").reset()
108 |                 pl_module.log(
109 |                     f"{loss_name}/test/loss_epoch",
110 |                     getattr(pl_module, f"test_{loss_name}_loss").compute(),
111 |                 )
112 |                 getattr(pl_module, f"test_{loss_name}_loss").reset()
113 | 
114 |                 value = getattr(pl_module, f"dev_{loss_name}_accuracy").compute()
115 |                 pl_module.log(f"{loss_name}/dev/accuracy_epoch", value)
116 |                 getattr(pl_module, f"dev_{loss_name}_accuracy").reset()
117 |                 pl_module.log(
118 |                     f"{loss_name}/dev/loss_epoch",
119 |                     getattr(pl_module, f"dev_{loss_name}_loss").compute(),
120 |                 )
121 |                 getattr(pl_module, f"dev_{loss_name}_loss").reset()
122 |         elif loss_name == "irtr":
123 |             pl_module.log(
124 |                 f"{loss_name}/{phase}/irtr_loss_epoch",
125 |                 getattr(pl_module, f"{phase}_irtr_loss").compute(),
126 |             )
127 |             getattr(pl_module, f"{phase}_irtr_loss").reset()
128 |         elif loss_name == "mppd" or loss_name == "mpfr":
129 |             pl_module.log(
130 |                 f"{loss_name}/{phase}/loss_epoch",
131 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
132 |             )
133 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
134 |         elif loss_name == "itm":
135 |             value = getattr(pl_module, f"{phase}_{loss_name}_accuracy").compute()
136 |             pl_module.log(f"{loss_name}/{phase}/accuracy_epoch", value)
137 |             getattr(pl_module, f"{phase}_{loss_name}_accuracy").reset()
138 |             pl_module.log(
139 |                 f"{loss_name}/{phase}/loss_epoch",
140 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
141 |             )
142 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
143 |         else:
144 |             value = getattr(pl_module, f"{phase}_{loss_name}_accuracy").compute()
145 |             pl_module.log(f"{loss_name}/{phase}/accuracy_epoch", value)
146 |             getattr(pl_module, f"{phase}_{loss_name}_accuracy").reset()
147 |             pl_module.log(
148 |                 f"{loss_name}/{phase}/loss_epoch",
149 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
150 |             )
151 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
152 | 
153 |         the_metric += value
154 | 
155 |     pl_module.log(f"{phase}/the_metric", the_metric)
156 | 
157 | 
158 | def check_non_acc_grad(pl_module):
159 |     if pl_module.token_type_embeddings.weight.grad is None:
160 |         return True
161 |     else:
162 |         grad = pl_module.token_type_embeddings.weight.grad
163 |         return (grad.sum() == 0).item()
164 | 
165 | 
166 | def set_task(pl_module):
167 |     pl_module.current_tasks = [
168 |         k for k, v in pl_module.hparams.config["loss_names"].items() if v > 0
169 |     ]
170 |     return
171 | 
172 | def set_schedule(pl_module):
173 |     lr = pl_module.hparams.config["learning_rate"]
174 |     wd = pl_module.hparams.config["weight_decay"]
175 | 
176 |     no_decay = [
177 |         "bias",
178 |         "LayerNorm.bias",
179 |         "LayerNorm.weight",
180 |         "norm.bias",
181 |         "norm.weight",
182 |         "norm1.bias",
183 |         "norm1.weight",
184 |         "norm2.bias",
185 |         "norm2.weight",
186 |     ]
187 |     head_names = ["vqa_classifier", "nlvr2_classifier", "mlm_score", "itm_score", "snli_classifier"]
188 |     cross_modal_names = ['cross_modal']
189 |     lr_mult_head = pl_module.hparams.config["lr_mult_head"]
190 |     lr_mult_cross_modal = pl_module.hparams.config["lr_mult_cross_modal"]
191 |     end_lr = pl_module.hparams.config["end_lr"]
192 |     decay_power = pl_module.hparams.config["decay_power"]
193 |     optim_type = pl_module.hparams.config["optim_type"]
194 |     optimizer_grouped_parameters = [
195 |         {
196 |             "params": [
197 |                 p
198 |                 for n, p in pl_module.named_parameters()
199 |                 if not any(nd in n for nd in no_decay)
200 |                 and not any(bb in n for bb in head_names)
201 |                 and not any(ht in n for ht in cross_modal_names)
202 |             ],
203 |             "weight_decay": wd,
204 |             "lr": lr,
205 |         },
206 |         {
207 |             "params": [
208 |                 p
209 |                 for n, p in pl_module.named_parameters()
210 |                 if any(nd in n for nd in no_decay)
211 |                 and not any(bb in n for bb in head_names)
212 |                 and not any(ht in n for ht in cross_modal_names)
213 |             ],
214 |             "weight_decay": 0.0,
215 |             "lr": lr,
216 |         },
217 |         {
218 |             "params": [
219 |                 p
220 |                 for n, p in pl_module.named_parameters()
221 |                 if not any(nd in n for nd in no_decay)
222 |                 and any(bb in n for bb in head_names)
223 |                 and not any(ht in n for ht in cross_modal_names)
224 |             ],
225 |             "weight_decay": wd,
226 |             "lr": lr * lr_mult_head,
227 |         },
228 |         {
229 |             "params": [
230 |                 p
231 |                 for n, p in pl_module.named_parameters()
232 |                 if any(nd in n for nd in no_decay) and any(bb in n for bb in head_names)
233 |                 and not any(ht in n for ht in cross_modal_names)
234 |             ],
235 |             "weight_decay": 0.0,
236 |             "lr": lr * lr_mult_head,
237 |         },
238 |         {
239 |             "params": [
240 |                 p
241 |                 for n, p in pl_module.named_parameters()
242 |                 if not any(nd in n for nd in no_decay)
243 |                 and not any(bb in n for bb in head_names)
244 |                 and any(ht in n for ht in cross_modal_names)
245 |             ],
246 |             "weight_decay": wd,
247 |             "lr": lr * lr_mult_cross_modal,
248 |         },
249 |         {
250 |             "params": [
251 |                 p
252 |                 for n, p in pl_module.named_parameters()
253 |                 if any(nd in n for nd in no_decay)
254 |                 and not any(bb in n for bb in head_names)
255 |                 and any(ht in n for ht in cross_modal_names)
256 |             ],
257 |             "weight_decay": 0.0,
258 |             "lr": lr * lr_mult_cross_modal,
259 |         },
260 |     ]
261 | 
262 |     if optim_type == "adamw":
263 |         optimizer = AdamW(
264 |             optimizer_grouped_parameters, lr=lr, eps=1e-8, betas=(0.9, 0.98)
265 |         )
266 |     elif optim_type == "adam":
267 |         optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=lr)
268 |     elif optim_type == "sgd":
269 |         optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=lr, momentum=0.9)
270 | 
271 |     if pl_module.trainer.max_steps is None:
272 |         max_steps = (
273 |             len(pl_module.trainer.datamodule.train_dataloader())
274 |             * pl_module.trainer.max_epochs
275 |             // pl_module.trainer.accumulate_grad_batches
276 |         )
277 |     else:
278 |         max_steps = pl_module.trainer.max_steps
279 | 
280 |     warmup_steps = pl_module.hparams.config["warmup_steps"]
281 |     if isinstance(pl_module.hparams.config["warmup_steps"], float):
282 |         warmup_steps = int(max_steps * warmup_steps)
283 | 
284 |     if decay_power == "cosine":
285 |         scheduler = get_cosine_schedule_with_warmup(
286 |             optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps,
287 |         )
288 |     else:
289 |         scheduler = get_polynomial_decay_schedule_with_warmup(
290 |             optimizer,
291 |             num_warmup_steps=warmup_steps,
292 |             num_training_steps=max_steps,
293 |             lr_end=end_lr,
294 |             power=decay_power,
295 |         )
296 | 
297 |     sched = {"scheduler": scheduler, "interval": "step"}
298 | 
299 |     return (
300 |         [optimizer],
301 |         [sched],
302 |     )
303 | 
--------------------------------------------------------------------------------
/code/MultiModal-DeepFake-main/models/METER/meter_module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | # import pytorch_lightning as pl
4 | 
5 | import numpy as np
6 | 
7 | from transformers.models.bert.modeling_bert import BertConfig, BertEmbeddings, BertModel, BertEncoder, BertLayer
8 | from .bert_model import BertCrossLayer, BertAttention
9 | from . import swin_transformer as swin
10 | from . import heads, objectives
11 | # from . import meter_utils
12 | from .clip_model import build_model, adapt_position_encoding
13 | from .swin_helpers import swin_adapt_position_encoding
14 | from transformers import RobertaConfig, RobertaModel
15 | import pdb
16 | class METERTransformerSS(nn.Module):
17 |     def __init__(self, config):
18 |         super().__init__()
19 |         # self.save_hyperparameters()
20 |         self.current_tasks = list()
21 |         self.is_clip = ('swin' not in config['vit'])
22 | 
23 |         if 'roberta' in config['tokenizer']:
24 |             bert_config = RobertaConfig(
25 |                 vocab_size=config["vocab_size"],
26 |                 hidden_size=config["hidden_size"],
27 |                 num_hidden_layers=config["num_layers"],
28 |                 num_attention_heads=config["num_heads"],
29 |                 intermediate_size=config["hidden_size"] * config["mlp_ratio"],
30 |                 max_position_embeddings=config["max_text_len"],
31 |                 hidden_dropout_prob=config["drop_rate"],
32 |                 attention_probs_dropout_prob=config["drop_rate"],
33 |             )
34 |         else:
35 |             bert_config = BertConfig(
36 |                 vocab_size=config["vocab_size"],
37 |                 hidden_size=config["hidden_size"],
38 |                 num_hidden_layers=config["num_layers"],
39 |                 num_attention_heads=config["num_heads"],
40 |                 intermediate_size=config["hidden_size"] * config["mlp_ratio"],
41 |                 max_position_embeddings=config["max_text_len"],
42 |                 hidden_dropout_prob=config["drop_rate"],
43 |                 attention_probs_dropout_prob=config["drop_rate"],
44 |             )
45 | 
46 |         resolution_after = config['image_size']
47 | 
48 |         self.cross_modal_text_transform = nn.Linear(config['input_text_embed_size'], config['hidden_size'])
49 |         self.cross_modal_text_transform.apply(objectives.init_weights)
50 |         self.cross_modal_image_transform = nn.Linear(config['input_image_embed_size'], config['hidden_size'])
51 |         self.cross_modal_image_transform.apply(objectives.init_weights)
52 | 
53 |         self.token_type_embeddings = nn.Embedding(2, config["hidden_size"])
54 |         self.token_type_embeddings.apply(objectives.init_weights)
55 | 
56 |         if torch.distributed.is_initialized():
57 |             if torch.distributed.get_rank() == 0:
58 |                 if self.is_clip:
59 |                     build_model(config['vit'], resolution_after=resolution_after)
60 |                 else:
61 |                     getattr(swin, config["vit"])(  # plain nn.Module: use the local config, not self.hparams
62 |                         pretrained=True, config=config,
63 |                     )
64 | 
65 |                 if 'roberta' in config['tokenizer']:
66 |                     RobertaModel.from_pretrained(config['tokenizer'])
67 |                 else:
68 |                     BertModel.from_pretrained(config['tokenizer'])
69 | 
70 |             torch.distributed.barrier()
71 | 
72 |         if self.is_clip:
73 |             self.vit_model = build_model(config['vit'], resolution_after=resolution_after)
74 |         else:
75 |             self.vit_model = getattr(swin, config["vit"])(
76 |                 pretrained=True, config=config,
77 |             )
78 |             self.avgpool = nn.AdaptiveAvgPool1d(1)
79 | 
80 |         if 'roberta' in config['tokenizer']:
81 |             self.text_transformer = RobertaModel.from_pretrained(config['tokenizer'])
82 |         else:
83 |             self.text_transformer = BertModel.from_pretrained(config['tokenizer'])
84 | 
85 |         self.cross_modal_image_layers = nn.ModuleList([BertCrossLayer(bert_config) for _ in range(config['num_top_layer'])])
86 |         self.cross_modal_image_layers.apply(objectives.init_weights)
87 |         self.cross_modal_text_layers = nn.ModuleList([BertCrossLayer(bert_config) for _ in range(config['num_top_layer'])])
88 |         self.cross_modal_text_layers.apply(objectives.init_weights)
89 | 
90 |         self.cross_modal_image_pooler = heads.Pooler(config["hidden_size"])
91 |         self.cross_modal_image_pooler.apply(objectives.init_weights)
92 |         self.cross_modal_text_pooler = heads.Pooler(config["hidden_size"])
93 |         self.cross_modal_text_pooler.apply(objectives.init_weights)
94 | 
95 |         ckpt = torch.load(config["load_path"], map_location="cpu")
96 |         state_dict = ckpt["state_dict"]
97 | 
98 |         del state_dict['vit_model.visual.positional_embedding']
99 |         msg = self.load_state_dict(state_dict, strict=False)
100 |         print(msg)
101 |         # if config["loss_names"]["mlm"] > 0:
102 |         #     self.mlm_score = heads.MLMHead(bert_config)
103 |         #     self.mlm_score.apply(objectives.init_weights)
104 | 
105 |         # if config["loss_names"]["itm"] > 0:
106 |         #     self.itm_score = heads.ITMHead(config["hidden_size"]*2)
107 |         #     self.itm_score.apply(objectives.init_weights)
108 | 
109 |         # hs = self.hparams.config["hidden_size"]
110 | 
111 |         # if self.hparams.config["loss_names"]["vqa"] > 0:
112 |         #     vs = self.hparams.config["vqav2_label_size"]
113 |         #     self.vqa_classifier = nn.Sequential(
114 |         #         nn.Linear(hs * 2, hs * 2),
115 |         #         nn.LayerNorm(hs * 2),
116 |         #         nn.GELU(),
117 |         #         nn.Linear(hs * 2, vs),
118 |         #     )
119 |         #     self.vqa_classifier.apply(objectives.init_weights)
120 | 
121 |         # ===================== Downstream ===================== #
122 |         # if (
123 |         #     self.hparams.config["load_path"] != ""
124 |         #     and not self.hparams.config["test_only"]
125 |         # ):
126 |         #     ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
127 |         #     state_dict = ckpt["state_dict"]
128 |         #     if self.is_clip:
129 |         #         state_dict = adapt_position_encoding(state_dict, after=resolution_after, patch_size=self.hparams.config['patch_size'])
130 |         #     else:
131 |         #         state_dict = swin_adapt_position_encoding(state_dict, after=resolution_after, before=config['resolution_before'])
132 |         #     self.load_state_dict(state_dict, strict=False)
133 | 
134 | 
135 |         # if self.hparams.config["loss_names"]["nlvr2"] > 0:
136 |         #     self.nlvr2_classifier = nn.Sequential(
137 |         #         nn.Linear(hs * 4, hs * 2),
138 |         #         nn.LayerNorm(hs * 2),
139 |         #         nn.GELU(),
140 |         #         nn.Linear(hs * 2, 2),
141 |         #     )
142 |         #     self.nlvr2_classifier.apply(objectives.init_weights)
143 |         #     emb_data = self.token_type_embeddings.weight.data
144 |         #     self.token_type_embeddings = nn.Embedding(3, hs)
145 |         #     self.token_type_embeddings.apply(objectives.init_weights)
146 |         #     self.token_type_embeddings.weight.data[0, :] = emb_data[0, :]
147 |         #     self.token_type_embeddings.weight.data[1, :] = emb_data[1, :]
148 |         #     self.token_type_embeddings.weight.data[2, :] = emb_data[1, :]
149 | 
150 |         # if self.hparams.config["loss_names"]["snli"] > 0:
151 |         #     self.snli_classifier = nn.Sequential(
152 |         #         nn.Linear(hs * 2, hs * 2),
153 |         #         nn.LayerNorm(hs * 2),
154 |         #         nn.GELU(),
155 |         #         nn.Linear(hs * 2, 3),
156 |         #     )
157 |         #     self.snli_classifier.apply(objectives.init_weights)
158 | 
159 |         # if self.hparams.config["loss_names"]["irtr"] > 0:
160 |         #     self.rank_output = nn.Linear(hs, 1)
161 |         #     self.rank_output.weight.data = self.itm_score.fc.weight.data[1:, :]
162 |         #     self.rank_output.bias.data = self.itm_score.fc.bias.data[1:]
163 |         #     self.margin = 0.2
164 |         #     for p in self.itm_score.parameters():
165 |         #         p.requires_grad = False
166 | 
167 |         # meter_utils.set_metrics(self)
168 |         # self.current_tasks = list()
169 | 
170 |         # # ===================== load downstream (test_only) ======================
171 | 
172 |         # if self.hparams.config["load_path"] != "" and self.hparams.config["test_only"]:
173 |         #     ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
174 |         #     state_dict = ckpt["state_dict"]
175 |         #     if self.is_clip:
176 |         #         state_dict = adapt_position_encoding(state_dict, after=resolution_after, patch_size=self.hparams.config['patch_size'])
177 |         #     else:
178 |         #         state_dict = swin_adapt_position_encoding(state_dict, after=resolution_after, before=config['resolution_before'])
179 |         #     self.load_state_dict(state_dict, strict=False)
180 | 
181 |     def infer(
182 |         self,
183 |         batch,
184 |         mask_text=False,
185 |         mask_image=False,
186 |         image_token_type_idx=1,
187 |         img=None,
188 |     ):
189 |         if img is None:
190 |             if f"image_{image_token_type_idx - 1}" in batch:
191 |                 imgkey = f"image_{image_token_type_idx - 1}"
192 |             else:
193 |                 imgkey = "image"
194 |             img = batch[imgkey][0]
195 | 
196 |         do_mlm = "_mlm" if mask_text else ""
197 |         text_ids = batch[f"text_ids{do_mlm}"]
198 |         text_masks = batch["text_masks"]
199 | 
200 |         #### text encoder ####
201 |         text_embeds = self.text_transformer.embeddings(input_ids=text_ids)
202 |         device = text_embeds.device
203 |         input_shape = text_masks.size()
204 |         extend_text_masks = self.text_transformer.get_extended_attention_mask(text_masks, input_shape, device)
205 |         for layer in self.text_transformer.encoder.layer:
206 |             text_embeds = layer(text_embeds, extend_text_masks)[0]
207 |         text_embeds = self.cross_modal_text_transform(text_embeds)
208 | 
209 |         #### img encoder ####
210 |         image_embeds = self.vit_model(img)
211 |         image_embeds = self.cross_modal_image_transform(image_embeds)
212 |         image_masks = torch.ones((image_embeds.size(0), image_embeds.size(1)), dtype=torch.long, device=device)
213 |         extend_image_masks = self.text_transformer.get_extended_attention_mask(image_masks, image_masks.size(), device)
214 | 
215 |         text_embeds, image_embeds = (
216 |             text_embeds + self.token_type_embeddings(torch.zeros_like(text_masks)),
217 |             image_embeds
218 |             + self.token_type_embeddings(
219 |                 torch.full_like(image_masks, image_token_type_idx)
220 |             ),
221 |         )
222 | 
223 |         #### interaction ####
224 |         x, y = text_embeds, image_embeds
225 |         for text_layer, image_layer in zip(self.cross_modal_text_layers, self.cross_modal_image_layers):
226 |             x1 = text_layer(x, y, extend_text_masks, extend_image_masks)
227 |             y1 = image_layer(y, x, extend_image_masks, extend_text_masks)
228 |             x, y = x1[0], y1[0]
229 | 
230 |         text_feats, image_feats = x, y
231 |         cls_feats_text = self.cross_modal_text_pooler(x)
232 |         if self.is_clip:
233 |             cls_feats_image = self.cross_modal_image_pooler(y)
234 |         else:
235 |             avg_image_feats = self.avgpool(image_feats.transpose(1, 2)).view(image_feats.size(0), 1, -1)
236 |             cls_feats_image = self.cross_modal_image_pooler(avg_image_feats)
237 |         cls_feats = torch.cat([cls_feats_text, cls_feats_image], dim=-1)
238 | 
239 |         ret = {
240 |             "text_feats": text_feats,
241 |             "image_feats": image_feats,
242 |             "cls_feats": cls_feats,
243 |             "text_ids": text_ids,
244 |             "text_masks": text_masks,
245 |         }
246 | 
247 | 
248 |         return ret
249 | 
250 |     def forward(self, batch):
251 |         ret = dict()
252 |         if len(self.current_tasks) == 0:
253 |             ret.update(self.infer(batch))
254 |             return ret
255 | 
256 |         # Masked Language Modeling
257 |         if "mlm" in self.current_tasks:
258 |             ret.update(objectives.compute_mlm(self, batch))
259 | 
260 |         # Image Text Matching
261 |         if "itm" in self.current_tasks:
262 |             ret.update(objectives.compute_itm(self, batch))
263 | 
264 |         # Visual Question Answering
265 |         if "vqa" in self.current_tasks:
266 |             ret.update(objectives.compute_vqa(self, batch))
267 | 
268 |         # Natural Language for Visual Reasoning 2
269 |         if "nlvr2" in self.current_tasks:
270 |             ret.update(objectives.compute_nlvr2(self, batch))
271 | 
272 |         # SNLI Visual Entailment
273 |         if "snli" in self.current_tasks:
274 |             ret.update(objectives.compute_snli(self, batch))
275 | 
276 |         # Image Retrieval and Text Retrieval
277 |         if "irtr" in self.current_tasks:
278 |             ret.update(objectives.compute_irtr(self, batch))
279 | 
280 |         return ret
281 | 
--------------------------------------------------------------------------------
/code/MultiModal-DeepFake-main/models/consist_modeling.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import pdb
4 | from .interaction import Self_Interaction
5 | from timm.models.layers import trunc_normal_
6 | 
7 | def calculate_patch_labels(images, boxes, fake_text_pos, num_patches=(16, 16)):
8 |     # get the image dimensions
9 |     _, height, width = images.shape[1:4]
10 | 
11 |     # compute the size of each patch
12 |     patch_height = height // num_patches[0]
13 |     patch_width = width // num_patches[1]
14 | 
15 |     # convert boxes to a tensor
16 |     # boxes = torch.tensor(boxes)  # shape: [N, 4]
17 | 
18 |     # compute the box coordinates (boxes are normalized (cx, cy, w, h))
19 |     box_x1 = (boxes[:, 0] * width).int()
20 |     box_y1 = (boxes[:, 1] * height).int()
21 |     box_w = (boxes[:, 2] * width).int()
22 |     box_h = (boxes[:, 3] * height).int()
23 | 
24 |     # box_x2 = box_x1 + box_w
25 |     # box_y2 = box_y1 + box_h
26 | 
27 |     box_x2 = box_x1 + 0.5 * box_w
28 |     box_y2 = box_y1 + 0.5 * box_h
29 | 
30 |     box_x1 = box_x1 - 0.5 * box_w
31 |     box_y1 = box_y1 - 0.5 * box_h
32 | 
33 |     # compute the patch coordinates
34 |     patch_x1 = torch.arange(0, width, patch_width).view(1, -1).expand(boxes.size(0), -1).to(boxes.device)
35 |     patch_y1 = torch.arange(0, height, patch_height).view(1, -1).expand(boxes.size(0), -1).to(boxes.device)
36 |     patch_x2 = patch_x1 + patch_width
37 |     patch_y2 = patch_y1 + patch_height
38 | 
39 |     # compute the area of each patch
40 |     patch_area = patch_width * patch_height
41 | 
42 |     # compute the intersection region
43 |     inter_x1 = torch.max(patch_x1, box_x1.view(-1, 1))
44 |     inter_y1 = torch.max(patch_y1, box_y1.view(-1, 1))
45 |     inter_x2 = torch.min(patch_x2, box_x2.view(-1, 1))
46 |     inter_y2 = torch.min(patch_y2, box_y2.view(-1, 1))
47 | 
48 |     # compute the area of the intersection region
49 | 
50 |     inter_area = torch.max(torch.tensor(0), inter_x2 - inter_x1).unsqueeze(1) * torch.max(torch.tensor(0), inter_y2 - inter_y1).unsqueeze(2)
51 | 
52 |     # criterion: does the intersection cover more than half of the patch area
53 |     labels = (inter_area > (patch_area / 2)).int()
54 | 
55 |     labels_extented = labels.view(images.shape[0], -1, 1)
56 | 
57 |     consistency_matrix = (labels_extented == labels_extented.transpose(2, 1)).int()
58 | 
59 |     labels_extented_it = labels.view(images.shape[0], 1, -1)
60 |     fake_text_pos_extented = fake_text_pos.view(images.shape[0], -1, 1)
61 | 
62 |     consistency_matrix_it = ((labels_extented_it + fake_text_pos_extented) < 1).int()
63 | 
64 |     return consistency_matrix, consistency_matrix_it, labels.view(images.shape[0], -1)
65 | 
66 | def get_sscore_label(img, fake_img_box, fake_text_pos, len_edge=16):
67 |     consistency_matrix, consistency_matrix_it, labels = calculate_patch_labels(img, fake_img_box, fake_text_pos, (len_edge, len_edge))
68 | 
69 |     patch_score = consistency_matrix.sum(dim=-1) / (len_edge * len_edge)
70 |     img_score = patch_score.sum(dim=-1) / (len_edge * len_edge)
71 | 
72 |     return consistency_matrix, labels, patch_score, img_score, consistency_matrix_it
73 | 
74 | def get_sscore_label_text(fake_text_pos):
75 | 
76 |     fake_text_pos_extend = fake_text_pos.unsqueeze(-1)
77 |     sim_matrix = ((fake_text_pos_extend == fake_text_pos_extend.transpose(2, 1))).int()
78 |     matrix_mask = ((fake_text_pos_extend + fake_text_pos_extend.transpose(2, 1)) >= 0)
79 |     for i in range(fake_text_pos.shape[0]):
80 |         sim_matrix[i].fill_diagonal_(1)
81 |     return sim_matrix, matrix_mask
82 | 
83 | class Intra_Modal_Modeling(nn.Module):
84 | 
85 |     def __init__(self, num_head, hidden_dim, input_dim, output_dim, tok_num):
86 |         super().__init__()
87 | 
88 |         self.correlation_model = Self_Interaction(num_head, hidden_dim, input_dim, output_dim, layers=3)
89 |         self.consist_encoder = nn.Sequential(nn.Linear(output_dim, 256),
90 |                                              nn.LayerNorm(256),
91 |                                              nn.GELU(),
92 |                                              nn.Linear(256, 128),
93 |                                              nn.LayerNorm(128),
94 |                                              nn.GELU(),
95 |                                              nn.Linear(128, 64))
96 |         self.token_number = tok_num
97 |         self.aggregator = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
98 |         self.aggregator_mlp = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
99 |         self.aggregator_2 = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
100 |         self.aggregator_mlp_2 = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
101 |         self.num_head = 4
102 | 
103 |     def build_mlp(self, input_dim, output_dim):
104 |         return nn.Sequential(
105 |             nn.Linear(input_dim, input_dim * 2),
106 |             nn.LayerNorm(input_dim * 2),
107 |             nn.GELU(),
108 |             nn.Linear(input_dim * 2, input_dim * 2),
109 |             nn.LayerNorm(input_dim * 2),
110 |             nn.GELU(),
111 |             nn.Linear(input_dim * 2, output_dim)
112 |         )
113 | 
114 |     def forward(self, feats, mask, pos_emb, matrix_mask=None):
115 | 
116 |         B, N, C = feats.shape
117 |         feats = self.correlation_model(feats, mask, pos_emb)
118 |         consist_feats = self.consist_encoder(feats)
119 | 
120 |         norms = torch.norm(consist_feats, p=2, dim=2, keepdim=True)
121 |         normalized_vectors = consist_feats / norms
122 |         similarity_matrix = torch.bmm(normalized_vectors, normalized_vectors.transpose(1, 2))
123 |         similarity_matrix = torch.clamp((similarity_matrix + 1) / 2, 0, 1)
124 | 
125 |         if mask.sum() > 0:  # for text inputs
126 |             similarity_matrix_unsim = similarity_matrix.clone()
127 |             similarity_matrix_unsim[~matrix_mask] = 2
128 | 
129 |             similarity_matrix_sim = similarity_matrix.clone()
130 |             similarity_matrix_sim[~matrix_mask] = -1
131 |             diagonal_mask = torch.eye(N, device=feats.device).unsqueeze(0).expand(B, N, N)
132 |             similarity_matrix_sim = similarity_matrix_sim - diagonal_mask
133 | 
134 |         else:  # for image inputs
135 |             similarity_matrix_unsim = similarity_matrix.clone()
136 |             similarity_matrix_sim = similarity_matrix.clone()
137 |             diagonal_mask = torch.eye(N, device=feats.device).unsqueeze(0).expand(B, N, N)
138 |             similarity_matrix_sim = similarity_matrix_sim - diagonal_mask  # ignore self-similarity
139 | 
140 |         unsim_feats_index = torch.topk(similarity_matrix_unsim, self.token_number, dim=-1, largest=False)[1]
141 |         unsim_attn_mask = torch.ones([B, N, N], dtype=torch.bool).to(unsim_feats_index.device)
142 |         batch_indices = torch.arange(B).view(B, 1, 1)  # shape (B, 1, 1), broadcasts to (B, N, m)
143 |         row_indices = torch.arange(N).view(1, N, 1)  # shape (1, N, 1), broadcasts to (B, N, m)
144 |         unsim_attn_mask[batch_indices, row_indices, unsim_feats_index] = False
145 |         unsim_attn_mask = unsim_attn_mask.repeat(self.num_head, 1, 1)
146 | 
147 |         sim_feats_index = torch.topk(similarity_matrix_sim, self.token_number, dim=-1, largest=True)[1]
148 |         sim_attn_mask = torch.ones([B, N, N], dtype=torch.bool).to(sim_feats_index.device)
149 |         batch_indices = torch.arange(B).view(B, 1, 1)  # shape (B, 1, 1), broadcasts to (B, N, m)
150 |         row_indices = torch.arange(N).view(1, N, 1)  # shape (1, N, 1), broadcasts to (B, N, m)
151 |         sim_attn_mask[batch_indices, row_indices, sim_feats_index] = False
152 |         sim_attn_mask = sim_attn_mask.repeat(self.num_head, 1, 1)
153 | 
154 |         feats = feats + self.aggregator_mlp(self.aggregator(query=feats,
155 |                                                             key=feats,
156 |                                                             value=feats,
157 |                                                             attn_mask=sim_attn_mask)[0])
158 | 
159 |         feats = feats + self.aggregator_mlp_2(self.aggregator_2(query=feats,
160 |                                                                  key=feats,
161 |                                                                  value=feats,
162 |                                                                  attn_mask=unsim_attn_mask)[0])
163 | 
164 |         return feats, similarity_matrix, consist_feats
165 | 
166 | 
167 | class Extra_Modal_Modeling(nn.Module):
168 | 
169 |     def __init__(self, num_head, output_dim, tok_num):
170 |         super().__init__()
171 | 
172 |         self.feat_encoder = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
173 |         self.cross_encoder = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
174 |         self.token_number = tok_num
175 | 
176 |         self.consist_encoder_feat = nn.Sequential(nn.Linear(output_dim, 256),
177 |                                                   nn.LayerNorm(256),
178 |                                                   nn.GELU(),
179 |                                                   nn.Linear(256, 128),
180 |                                                   nn.LayerNorm(128),
181 |                                                   nn.GELU(),
182 |                                                   nn.Linear(128, 64))
183 | 
184 |         self.consist_encoder_cross = nn.Sequential(nn.Linear(output_dim, 256),
185 |                                                    nn.LayerNorm(256),
186 |                                                    nn.GELU(),
187 |                                                    nn.Linear(256, 128),
188 |                                                    nn.LayerNorm(128),
189 |                                                    nn.GELU(),
190 |                                                    nn.Linear(128, 64))
191 | 
192 |         self.cls_token_cross = nn.Parameter(torch.zeros(1, 1, output_dim))
193 |         self.aggregator_cross = nn.MultiheadAttention(output_dim, num_head, dropout=0.0, batch_first=True)
194 |         self.norm_layer_cross = nn.LayerNorm(output_dim)
195 | 
196 |         self.cls_token_feat = nn.Parameter(torch.zeros(1, 1, output_dim))
197 |         self.aggregator_feat = nn.MultiheadAttention(output_dim, num_head, dropout=0.0, batch_first=True)
198 |         self.norm_layer_feat = nn.LayerNorm(output_dim)
199 | 
200 |         self.aggregator = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
201 |         self.aggregator_mlp = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
202 |         self.aggregator_2 = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
203 |         self.aggregator_mlp_2 = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
204 | 
205 |         trunc_normal_(self.cls_token_cross, std=.02)
206 |         trunc_normal_(self.cls_token_feat, std=.02)
207 | 
208 |     def build_mlp(self, input_dim, output_dim):
209 |         return nn.Sequential(
210 |             nn.Linear(input_dim, input_dim * 2),
211 |             nn.LayerNorm(input_dim * 2),
212 |             nn.GELU(),
213 |             nn.Linear(input_dim * 2, input_dim * 2),
214 |             nn.LayerNorm(input_dim * 2),
215 |             nn.GELU(),
216 |             nn.Linear(input_dim * 2, output_dim)
217 |         )
218 | 
219 |     def forward(self, feats, global_feature, cross_feat, feats_mask, cross_mask):
220 | 
221 |         bs, _, _ = feats.shape
222 | 
223 |         feats = self.feat_encoder(feats)
224 |         cross_feat = self.cross_encoder(cross_feat)
225 | 
226 |         cls_token_cross = self.cls_token_cross.expand(bs, -1, -1)
227 |         feat_aggr_cross = self.aggregator_cross(query=self.norm_layer_cross(cls_token_cross),
228 |                                                 key=self.norm_layer_cross(cross_feat),
229 |                                                 value=self.norm_layer_cross(cross_feat),
230 |                                                 key_padding_mask=cross_mask)[0]
231 | 
232 |         feats_consist = self.consist_encoder_feat(feats)
233 |         cross_feats_consist = self.consist_encoder_feat(feat_aggr_cross)
234 | 
235 |         norms_feat = torch.norm(feats_consist, p=2, dim=2, keepdim=True)
236 |         norms_cross = torch.norm(cross_feats_consist, p=2, dim=2, keepdim=True)
237 |         sim_matrix = torch.bmm(feats_consist / norms_feat, (cross_feats_consist / norms_cross).transpose(1, 2))
238 |         sim_matrix = torch.clamp((sim_matrix + 1) / 2, 0, 1).squeeze()
239 | 
240 |         cls_token = self.cls_token_feat.expand(bs, -1, -1)
241 |         global_feats_mask = torch.zeros(feats_mask.shape[0], 1).bool().to(feats_mask.device)
242 |         feat_aggr = self.aggregator_feat(query=self.norm_layer_feat(cls_token),
243 |                                          key=self.norm_layer_feat(torch.cat([global_feature, feats], dim=1)),
244 |                                          value=self.norm_layer_feat(torch.cat([global_feature, feats], dim=1)),
245 |                                          key_padding_mask=torch.cat([global_feats_mask, feats_mask], dim=1))[0]
246 | 
247 |         if feats_mask.sum() > 0:  # for text inputs
248 |             sim_score = sim_matrix.clone()
249 |             sim_score[feats_mask] = -1
250 | 
251 |             unsim_score = sim_matrix.clone()
252 |             unsim_score[feats_mask] = 2
253 | 
254 |         else:  # for image inputs
255 |             sim_score = sim_matrix.clone()
256 |             unsim_score = sim_matrix.clone()
257 | 
258 |         unsim_index = torch.topk(unsim_score, self.token_number, dim=-1, largest=False)[1]
259 |         unsim_patch = feats[torch.arange(feats.shape[0]).unsqueeze(1), unsim_index]
260 | 
261 |         sim_index = torch.topk(sim_score, self.token_number, dim=-1, largest=True)[1]
262 |         sim_patch = feats[torch.arange(feats.shape[0]).unsqueeze(1), sim_index]
263 | 
264 |         feat_aggr = feat_aggr + self.aggregator_mlp(self.aggregator(query=feat_aggr,
265 |                                                                     key=sim_patch,
266 |                                                                     value=sim_patch)[0])
267 | 
268 |         feat_aggr = feat_aggr + self.aggregator_mlp_2(self.aggregator_2(query=feat_aggr,
269 |                                                                         key=unsim_patch,
270 |                                                                         value=unsim_patch)[0])
271 | 
272 |         return feat_aggr, sim_matrix, feats_consist
273 | 
--------------------------------------------------------------------------------
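A quick way to sanity-check the OP/OR/OF1 (overall, micro-averaged) and CP/CR/CF1 (per-class, macro-averaged) definitions in tools/multilabel_metrics.py is to replay the evaluation() logic on a toy score matrix. The snippet below is an illustrative, self-contained replica rather than part of the repository; like the original, it treats a score >= 0 as a positive prediction and assumes every class has at least one ground-truth positive.

import numpy as np

def toy_evaluation(scores, targets):
    Nc = ((scores >= 0) & (targets == 1)).sum(axis=0).astype(float)  # true positives per class
    Np = (scores >= 0).sum(axis=0).astype(float)                     # predicted positives per class
    Ng = (targets == 1).sum(axis=0).astype(float)                    # ground-truth positives per class
    Np[Np == 0] = 1                                                  # same guard as evaluation()
    OP, OR = Nc.sum() / Np.sum(), Nc.sum() / Ng.sum()                # overall (micro-averaged)
    CP, CR = (Nc / Np).mean(), (Nc / Ng).mean()                      # per-class (macro-averaged)
    return OP, OR, 2 * OP * OR / (OP + OR), CP, CR, 2 * CP * CR / (CP + CR)

scores = np.array([[0.3, -0.2], [-0.1, 0.4]])   # score >= 0 means "predicted positive"
targets = np.array([[1, 0], [0, 1]])
print(toy_evaluation(scores, targets))          # perfect predictions -> six 1.0 values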
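The six optimizer_grouped_parameters entries built by set_schedule() are the Cartesian product of {weight decay, no weight decay} and {backbone, task head, cross-modal layer}, with the head and cross-modal groups getting lr scaled by lr_mult_head and lr_mult_cross_modal. A minimal sketch of that routing follows; the pattern lists are abbreviated from the real ones and the parameter names are made up for illustration.

no_decay = ["bias", "LayerNorm.weight"]        # abbreviated from set_schedule()'s real list
head_names = ["itm_score", "snli_classifier"]  # abbreviated
cross_modal_names = ["cross_modal"]

def group_of(name):
    nd = any(k in name for k in no_decay)
    if any(k in name for k in head_names):
        kind = "head"            # lr * lr_mult_head
    elif any(k in name for k in cross_modal_names):
        kind = "cross_modal"     # lr * lr_mult_cross_modal
    else:
        kind = "backbone"        # base lr
    return kind, ("no_decay" if nd else "decay")

for n in ["text_transformer.encoder.layer.0.attention.self.query.weight",
          "cross_modal_text_layers.0.output.LayerNorm.weight",
          "itm_score.fc.bias"]:
    print(n, "->", group_of(n))
# backbone/decay, cross_modal/no_decay, head/no_decay: every name lands in
# exactly one of the six groups (the real code enforces exclusivity with
# explicit `not any(...)` guards in each list comprehension).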
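The interaction loop in METERTransformerSS.infer() advances the text and image streams in lockstep: at every depth, text tokens cross-attend over the image tokens while image tokens cross-attend over the text tokens, and both outputs replace their inputs together. A shape-level sketch of that loop, with plain nn.MultiheadAttention standing in for BertCrossLayer and made-up sizes (2 samples, 40 text tokens, 197 image tokens):

import torch
import torch.nn as nn

hidden, heads, depth = 768, 12, 6   # made-up sizes
text_xattn = nn.ModuleList([nn.MultiheadAttention(hidden, heads, batch_first=True) for _ in range(depth)])
img_xattn = nn.ModuleList([nn.MultiheadAttention(hidden, heads, batch_first=True) for _ in range(depth)])

x = torch.randn(2, 40, hidden)      # text tokens
y = torch.randn(2, 197, hidden)     # image tokens (CLS + 14 x 14 patches)
for t_layer, i_layer in zip(text_xattn, img_xattn):
    x1, _ = t_layer(x, y, y)        # text queries attend over image keys/values
    y1, _ = i_layer(y, x, x)        # image queries attend over text keys/values
    x, y = x1, y1                   # both streams update from the same pre-layer state
# x plays the role of text_feats and y of image_feats; infer() then pools each
# stream and concatenates the two pooled vectors into cls_feats.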
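In calculate_patch_labels(), fake_img_box is a normalized (cx, cy, w, h) box and a patch is labelled fake exactly when the box covers more than half of that patch's area. Here is a worked example of the same rule, written with an explicit loop instead of the broadcasting used in models/consist_modeling.py (illustrative only; a 224 x 224 input and 16 x 16 grid are assumed):

import torch

H = W = 224
P = 16                                      # 16 x 16 patch grid -> 14 x 14 pixel patches
box = torch.tensor([0.5, 0.5, 0.25, 0.25])  # normalized (cx, cy, w, h), centered box

cx, cy, bw, bh = (box * torch.tensor([W, H, W, H])).tolist()
x1, y1, x2, y2 = cx - bw / 2, cy - bh / 2, cx + bw / 2, cy + bh / 2  # corners: 84 .. 140

ph, pw = H // P, W // P
labels = torch.zeros(P, P, dtype=torch.int)
for r in range(P):
    for c in range(P):
        px1, py1 = c * pw, r * ph
        ix = max(0.0, min(px1 + pw, x2) - max(px1, x1))   # horizontal overlap
        iy = max(0.0, min(py1 + ph, y2) - max(py1, y1))   # vertical overlap
        labels[r, c] = int(ix * iy > (pw * ph) / 2)       # > half the patch covered

print(int(labels.sum()))  # 16: the 4 x 4 block of patches fully under the box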
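Intra_Modal_Modeling restricts each of its two aggregation passes to the k most similar (or k least similar) tokens per query by giving nn.MultiheadAttention a boolean attn_mask in which True means "blocked". A minimal sketch of that mask construction with made-up sizes; the module additionally depresses the diagonal and handles text padding before taking the top-k, and the mask repeat mirrors the module's own `.repeat(num_head, 1, 1)`:

import torch

B, N, k, num_head = 2, 6, 2, 4
sim = torch.rand(B, N, N)                    # stand-in for the learned similarity matrix
sim = sim - torch.eye(N).expand(B, N, N)     # depress the diagonal so self is never picked

topk_idx = sim.topk(k, dim=-1, largest=True).indices   # (B, N, k) most similar tokens per query
mask = torch.ones(B, N, N, dtype=torch.bool)           # start fully blocked
b = torch.arange(B).view(B, 1, 1)                      # broadcasts against (B, N, k)
r = torch.arange(N).view(1, N, 1)
mask[b, r, topk_idx] = False                           # unblock only the top-k keys per query
mask = mask.repeat(num_head, 1, 1)                     # (B * num_head, N, N) for MultiheadAttention

assert (~mask).sum().item() == B * num_head * N * k    # exactly k visible keys per query row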
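Extra_Modal_Modeling, by contrast, gathers the selected patches themselves rather than masking attention: each sample contributes its own top-k rows via advanced indexing. A small sketch of that indexing pattern, with made-up shapes:

import torch

B, N, C, k = 2, 5, 8, 3                      # made-up shapes
feats = torch.randn(B, N, C)                 # per-patch features
sim_score = torch.rand(B, N)                 # per-patch similarity to the cross-modal query

idx = sim_score.topk(k, dim=-1, largest=True).indices   # (B, k)
picked = feats[torch.arange(B).unsqueeze(1), idx]       # (B, k, C) via advanced indexing
assert picked.shape == (B, k, C)
# Row b of `picked` holds sample b's k highest-scoring patches; the module feeds
# these (and, separately, the k lowest-scoring ones) as keys/values to its
# cls-token attention blocks.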