├── code └── MultiModal-DeepFake-main │ ├── models │ ├── __init__.py │ ├── METER │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── heads.cpython-38.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── bert_model.cpython-38.pyc │ │ │ ├── clip_model.cpython-38.pyc │ │ │ ├── dist_utils.cpython-38.pyc │ │ │ ├── meter_utils.cpython-38.pyc │ │ │ ├── objectives.cpython-38.pyc │ │ │ ├── meter_module.cpython-38.pyc │ │ │ ├── swin_helpers.cpython-38.pyc │ │ │ └── swin_transformer.cpython-38.pyc │ │ ├── heads.py │ │ ├── dist_utils.py │ │ ├── clip_model.py │ │ ├── meter_utils.py │ │ └── meter_module.py │ ├── interaction.py │ ├── box_ops.py │ └── consist_modeling.py │ ├── tools │ ├── __init__.py │ ├── loss.py │ ├── config.py │ ├── schedulers.py │ ├── env.py │ ├── utils.py │ └── multilabel_metrics.py │ ├── scheduler │ ├── __init__.py │ ├── lr_sched.py │ ├── step_lr.py │ ├── scheduler_factory.py │ ├── cosine_lr.py │ ├── tanh_lr.py │ ├── plateau_lr.py │ └── scheduler.py │ ├── requirements.txt │ ├── train.sh │ ├── test.sh │ ├── optim │ ├── __init__.py │ ├── novograd.py │ ├── sgdp.py │ ├── nadam.py │ ├── lookahead.py │ ├── adamp.py │ ├── nvnovograd.py │ ├── adamw.py │ ├── radam.py │ ├── rmsprop_tf.py │ ├── adahessian.py │ ├── optim_factory.py │ └── adafactor.py │ ├── configs │ ├── test.yaml │ ├── train.yaml │ └── METER.yaml │ └── dataset │ ├── __init__.py │ ├── dataset.py │ ├── utils.py │ └── randaugment.py ├── .gitignore ├── LICENSE └── README.md /code/MultiModal-DeepFake-main/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__init__.py: -------------------------------------------------------------------------------- 1 | from .meter_module import METERTransformerSS 2 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/heads.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/heads.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/bert_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/bert_model.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/clip_model.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/clip_model.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/dist_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/dist_utils.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_utils.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/objectives.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/objectives.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_module.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/meter_module.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_helpers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_helpers.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liyih/CSCL/HEAD/code/MultiModal-DeepFake-main/models/METER/__pycache__/swin_transformer.cpython-38.pyc -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .cosine_lr import CosineLRScheduler 2 | from .plateau_lr import PlateauLRScheduler 3 | from .step_lr import StepLRScheduler 4 | from .tanh_lr import TanhLRScheduler 5 | from .scheduler_factory import create_scheduler 6 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.4.12 2 | transformers==4.8.1 3 | Pillow==8.3.2 4 | numpy==1.21.1 5 | opencv-python==4.5.5.62 6 | scipy==1.8.0 7 | scikit_image==0.19.2 8 | matplotlib==3.4.3 9 | scikit-learn 10 | tensorboard 11 | setuptools==59.5.0 12 | einops==0.8.0 13 | seaborn==0.13.2 -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/train.sh: -------------------------------------------------------------------------------- 1 | EXPID=$(date 
+"%Y%m%d_%H%M%S") 2 | 3 | HOST='localhost' 4 | PORT='1' 5 | 6 | NUM_GPU=8 7 | python train.py 8 | --config 'configs/train.yaml' \ 9 | --output_dir './results' \ 10 | --launcher pytorch \ 11 | --rank 0 \ 12 | --log_num ${EXPID} \ 13 | --dist-url tcp://localhost:23459 \ 14 | --world_size $NUM_GPU \ 15 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/test.sh: -------------------------------------------------------------------------------- 1 | EXPID=CSCL 2 | 3 | HOST='localhost' 4 | PORT='1' 5 | 6 | NUM_GPU=1 7 | 8 | python test.py \ 9 | --config 'configs/test.yaml' \ 10 | --output_dir './results' \ 11 | --launcher pytorch \ 12 | --rank 0 \ 13 | --log_num ${EXPID} \ 14 | --dist-url tcp://localhost:23459 \ 15 | --world_size $NUM_GPU \ 16 | --test_epoch 49 \ 17 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .adamp import AdamP 2 | from .adamw import AdamW 3 | from .adafactor import Adafactor 4 | from .adahessian import Adahessian 5 | from .lookahead import Lookahead 6 | from .nadam import Nadam 7 | from .novograd import NovoGrad 8 | from .nvnovograd import NvNovoGrad 9 | from .radam import RAdam 10 | from .rmsprop_tf import RMSpropTF 11 | from .sgdp import SGDP 12 | 13 | from .optim_factory import create_optimizer -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/configs/test.yaml: -------------------------------------------------------------------------------- 1 | train_file: ["../../datasets/DGM4/metadata/trainval.json"] # use train and val set together 2 | val_file: ["../../datasets/DGM4/metadata/test.json"] 3 | 4 | image_res: 256 5 | vision_width: 768 6 | embed_dim: 256 7 | batch_size_train: 64 8 | batch_size_val: 256 9 | max_words: 50 10 | 11 | loss_BIC_wgt: 1 12 | loss_bbox_wgt: 0.1 13 | loss_giou_wgt: 0.1 14 | loss_MLC_wgt: 1 15 | Loss_sim_wgt: 10 16 | 17 | optimizer: {opt: adamW, lr: 0.00001, lr_img: 0.00005, weight_decay: 0.02} 18 | schedular: {sched: cosine, lr: 0.00001, epochs: 50, min_lr: 0.000001, decay_rate: 1, warmup_lr: 0.000001, warmup_epochs: 10, cooldown_epochs: 0} 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/configs/train.yaml: -------------------------------------------------------------------------------- 1 | train_file: ["../../datasets/DGM4/metadata/trainval.json"] # use train and val set together 2 | val_file: ["../../datasets/DGM4/metadata/test.json"] 3 | 4 | image_res: 256 5 | vision_width: 768 6 | embed_dim: 256 7 | batch_size_train: 32 8 | batch_size_val: 64 9 | max_words: 50 10 | 11 | loss_BIC_wgt: 1 12 | loss_bbox_wgt: 0.1 13 | loss_giou_wgt: 0.1 14 | loss_MLC_wgt: 1 15 | Loss_sim_wgt: 10 16 | 17 | optimizer: {opt: adamW, lr: 0.00001, lr_img: 0.00005, weight_decay: 0.02} 18 | schedular: {sched: cosine, lr: 0.00001, epochs: 50, min_lr: 0.000001, decay_rate: 1, warmup_lr: 0.000001, warmup_epochs: 10, cooldown_epochs: 0} 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/configs/METER.yaml: -------------------------------------------------------------------------------- 1 | # ViT-B/16 + RoBERTa 2 | 3 | # Image setting 4 | train_transform_keys: ["clip"] 5 | 
val_transform_keys: ["clip"] 6 | image_size: 256 7 | patch_size: 16 8 | draw_false_image: 1 9 | image_only: False 10 | resolution_before: 256 11 | 12 | # Text Setting 13 | vqav2_label_size: 3129 14 | max_text_len: 50 15 | tokenizer: "roberta-base" 16 | vocab_size: 50265 17 | whole_word_masking: False # note that whole_word_masking does not work for RoBERTa 18 | mlm_prob: 0.15 19 | draw_false_text: 0 20 | 21 | # Transformer Setting 22 | num_top_layer: 6 23 | input_image_embed_size: 768 24 | input_text_embed_size: 768 25 | vit: 'ViT-B/16' 26 | hidden_size: 768 27 | num_heads: 12 28 | num_layers: 6 29 | mlp_ratio: 4 30 | drop_rate: 0.1 31 | load_path: "meter_clip16_224_roberta_pretrain.ckpt" -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | 7 | 8 | class GeneralizedCELoss(nn.Module): 9 | 10 | def __init__(self, q=0.7): 11 | super(GeneralizedCELoss, self).__init__() 12 | self.q = q 13 | 14 | def forward(self, logits, targets): 15 | # p = F.softmax(logits, dim=1) 16 | p = torch.sigmoid(logits) # element-wise probabilities for the multi-label setting 17 | if np.isnan(p.mean().item()): 18 | raise NameError('GCE_p') 19 | # Yg = torch.gather(p, 1, torch.unsqueeze(targets, 1)) 20 | # Yg = torch.gather(p, 1, targets) 21 | # modify gradient of cross entropy: GCE-style weight p^q * q down-weights low-confidence terms 22 | loss_weight = (p.detach()**self.q)*self.q 23 | if np.isnan(loss_weight.mean().item()): 24 | raise NameError('GCE_loss_weight') 25 | 26 | loss = F.binary_cross_entropy_with_logits(logits, targets.float(), reduction='none') * loss_weight 27 | 28 | return loss 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | code/MultiModal-DeepFake-main/bert-base-uncased 3 | code/MultiModal-DeepFake-main/ALBEF_4M.pth 4 | code/MultiModal-DeepFake-main/nohup.out 5 | code/MultiModal-DeepFake-main/results 6 | code/MultiModal-DeepFake-main/pic_ours 7 | code/MultiModal-DeepFake-main/pic_baseline 8 | code/MultiModal-DeepFake-main/pic_consist 9 | code/MultiModal-DeepFake-main/datasets.zip 10 | code/MultiModal-DeepFake-main/deit_base_patch16_224-b5f2ef4d.pth 11 | code/MultiModal-DeepFake-main/__pycache__/*.pyc 12 | code/MultiModal-DeepFake-main/dataset/__pycache__/*.pyc 13 | code/MultiModal-DeepFake-main/models/__pycache__/*.pyc 14 | code/MultiModal-DeepFake-main/optim/__pycache__/*.pyc 15 | code/MultiModal-DeepFake-main/scheduler/__pycache__/*.pyc 16 | code/MultiModal-DeepFake-main/tools/__pycache__/*.pyc 17 | nohup.out 18 | dataset.zip 19 | code/MultiModal-DeepFake-main/meter_clip16_224_roberta_pretrain.ckpt 20 | code/MultiModal-DeepFake-main/roberta-base/ 21 | code/MultiModal-DeepFake-main/ViT-B-16.pt 22 | code/MultiModal-DeepFake-main/pic_prnu -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/lr_sched.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
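#
# Hedged schedule sketch of adjust_learning_rate() below: a linear warmup
# from 0 to args.lr over cfg['schedular']['warmup_epochs'] epochs, then a
# half-cycle cosine decay toward cfg['schedular']['min_lr'], i.e.
#   lr(e) = min_lr + (args.lr - min_lr) * 0.5 * (1 + cos(pi * (e - warmup) / (epochs - warmup)))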
6 | 7 | import math 8 | 9 | def adjust_learning_rate(optimizer, epoch, args, cfg): 10 | """Decay the learning rate with half-cycle cosine after warmup""" 11 | if epoch < cfg['schedular']['warmup_epochs']: 12 | lr = args.lr * epoch / cfg['schedular']['warmup_epochs'] 13 | else: 14 | lr = cfg['schedular']['min_lr'] + (args.lr - cfg['schedular']['min_lr']) * 0.5 * \ 15 | (1. + math.cos(math.pi * (epoch - cfg['schedular']['warmup_epochs']) / (cfg['schedular']['epochs'] - cfg['schedular']['warmup_epochs']))) 16 | for param_group in optimizer.param_groups: 17 | if "lr_scale" in param_group: 18 | param_group["lr"] = lr * param_group["lr_scale"] 19 | else: 20 | param_group["lr"] = lr 21 | return lr 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Yiheng Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | DEFAULTS = { 4 | "network": "dpn", 5 | "encoder": "dpn92", 6 | "model_params": {}, 7 | "optimizer": { 8 | "batch_size": 32, 9 | "type": "SGD", # supported: SGD, Adam 10 | "momentum": 0.9, 11 | "weight_decay": 0, 12 | "clip": 1., 13 | "learning_rate": 0.1, 14 | "classifier_lr": -1, 15 | "nesterov": True, 16 | "schedule": { 17 | "type": "constant", # supported: constant, step, multistep, exponential, linear, poly 18 | "mode": "epoch", # supported: epoch, step 19 | "epochs": 10, 20 | "params": {} 21 | } 22 | }, 23 | # "normalize": { 24 | # "mean": [0.485, 0.456, 0.406], 25 | # "std": [0.229, 0.224, 0.225] 26 | # } 27 | } 28 | 29 | 30 | def _merge(src, dst): 31 | for k, v in src.items(): 32 | if k in dst: 33 | if isinstance(v, dict): 34 | _merge(src[k], dst[k]) 35 | else: 36 | dst[k] = v 37 | 38 | 39 | def load_config(config_file, defaults=DEFAULTS): 40 | with open(config_file, "r") as fd: 41 | config = json.load(fd) 42 | _merge(defaults, config) 43 | return config 44 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from transformers.models.bert.modeling_bert import BertPredictionHeadTransform 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, hidden_size): 10 | super().__init__() 11 | self.dense = nn.Linear(hidden_size, hidden_size) 12 | self.activation = nn.Tanh() 13 | 14 | def forward(self, hidden_states): 15 | first_token_tensor = hidden_states[:, 0] 16 | pooled_output = self.dense(first_token_tensor) 17 | pooled_output = self.activation(pooled_output) 18 | return pooled_output 19 | 20 | 21 | class ITMHead(nn.Module): 22 | def __init__(self, hidden_size): 23 | super().__init__() 24 | self.fc = nn.Linear(hidden_size, 2) 25 | 26 | def forward(self, x): 27 | x = self.fc(x) 28 | return x 29 | 30 | 31 | class MLMHead(nn.Module): 32 | def __init__(self, config, weight=None): 33 | super().__init__() 34 | self.transform = BertPredictionHeadTransform(config) 35 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 36 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 37 | if weight is not None: 38 | self.decoder.weight = weight 39 | 40 | def forward(self, x): 41 | x = self.transform(x) 42 | x = self.decoder(x) + self.bias 43 | return x 44 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/schedulers.py: -------------------------------------------------------------------------------- 1 | from bisect import bisect_right 2 | 3 | from torch.optim.lr_scheduler import _LRScheduler 4 | from pdb import set_trace as st 5 | 6 | class LRStepScheduler(_LRScheduler): 7 | def __init__(self, optimizer, steps, last_epoch=-1): 8 | self.lr_steps = steps 9 | super().__init__(optimizer, last_epoch) 10 | 11 | def get_lr(self): 12 | pos = max(bisect_right([x for x, y in self.lr_steps], self.last_epoch) - 1, 0) 13 | return [self.lr_steps[pos][1] if self.lr_steps[pos][0] <= self.last_epoch else base_lr for base_lr in self.base_lrs] 14 | 15 | 16 | class PolyLR(_LRScheduler): 17 | """Sets the learning rate of each parameter group 
according to poly learning rate policy 18 | """ 19 | def __init__(self, optimizer, max_iter=90000, power=0.9, last_epoch=-1): 20 | self.max_iter = max_iter 21 | self.power = power 22 | super(PolyLR, self).__init__(optimizer, last_epoch) 23 | def get_lr(self): 24 | self.last_epoch = (self.last_epoch + 1) % self.max_iter 25 | return [base_lr * ((1 - float(self.last_epoch) / self.max_iter) ** (self.power)) for base_lr in self.base_lrs] 26 | 27 | class ExponentialLRScheduler(_LRScheduler): 28 | """Decays the learning rate of each parameter group by gamma every epoch. 29 | When last_epoch=-1, sets initial lr as lr. 30 | 31 | Args: 32 | optimizer (Optimizer): Wrapped optimizer. 33 | gamma (float): Multiplicative factor of learning rate decay. 34 | last_epoch (int): The index of last epoch. Default: -1. 35 | """ 36 | 37 | def __init__(self, optimizer, gamma, last_epoch=-1): 38 | self.gamma = gamma 39 | super(ExponentialLRScheduler, self).__init__(optimizer, last_epoch) 40 | 41 | def get_lr(self): 42 | if self.last_epoch <= 0: 43 | return self.base_lrs 44 | return [base_lr * self.gamma**self.last_epoch for base_lr in self.base_lrs] 45 | 46 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/interaction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import pdb 4 | 5 | class Self_Interaction_block(nn.Module): 6 | def __init__(self, num_head, hidden_dim, input_dim, output_dim): 7 | super().__init__() 8 | 9 | self.self_attn = nn.MultiheadAttention(input_dim, num_head, dropout=0.0, batch_first=True) 10 | self.FFN = nn.Sequential(nn.Linear(input_dim, hidden_dim), 11 | nn.ReLU(), 12 | nn.Linear(hidden_dim, output_dim)) 13 | 14 | self.norm1 = nn.LayerNorm(input_dim) 15 | self.norm2 = nn.LayerNorm(input_dim) 16 | self.dropout1 = nn.Dropout(0.1) 17 | self.dropout2 = nn.Dropout(0.1) 18 | 19 | def forward(self, query, query_padding_mask, attn_mask): 20 | 21 | feat_after_self = query + self.dropout1(self.self_attn(query=query, 22 | key=query, 23 | value=query, 24 | key_padding_mask=query_padding_mask, 25 | attn_mask=attn_mask)[0]) 26 | feat_after_self = self.norm1(feat_after_self) 27 | output = feat_after_self + self.dropout2(self.FFN(feat_after_self)) 28 | output = self.norm2(output) 29 | return output 30 | 31 | class Self_Interaction(nn.Module): 32 | def __init__(self, num_head, hidden_dim, input_dim, output_dim, layers=3): 33 | super().__init__() 34 | 35 | self.layers = nn.ModuleList() 36 | for i in range(layers): 37 | self.layers.append(Self_Interaction_block(num_head, hidden_dim, input_dim, output_dim)) 38 | 39 | def forward(self, query, query_padding_mask, query_pos_emb=None, attn_mask=None): 40 | if query_pos_emb is not None: 41 | for i in range(len(self.layers)): 42 | query = self.layers[i](query + query_pos_emb, query_padding_mask, attn_mask) 43 | else: 44 | for i in range(len(self.layers)): 45 | query = self.layers[i](query, query_padding_mask, attn_mask) 46 | return query -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/step_lr.py: -------------------------------------------------------------------------------- 1 | """ Step Scheduler 2 | 3 | Basic step LR schedule with warmup, noise. 
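After warmup (a linear ramp from warmup_lr_init to the base lr), the schedule
reduces to lr(t) = base_lr * decay_rate ** (t // decay_t); see _get_lr below.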
4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import math 8 | import torch 9 | 10 | from .scheduler import Scheduler 11 | 12 | 13 | class StepLRScheduler(Scheduler): 14 | """ 15 | """ 16 | 17 | def __init__(self, 18 | optimizer: torch.optim.Optimizer, 19 | decay_t: float, 20 | decay_rate: float = 1., 21 | warmup_t=0, 22 | warmup_lr_init=0, 23 | t_in_epochs=True, 24 | noise_range_t=None, 25 | noise_pct=0.67, 26 | noise_std=1.0, 27 | noise_seed=42, 28 | initialize=True, 29 | ) -> None: 30 | super().__init__( 31 | optimizer, param_group_field="lr", 32 | noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 33 | initialize=initialize) 34 | 35 | self.decay_t = decay_t 36 | self.decay_rate = decay_rate 37 | self.warmup_t = warmup_t 38 | self.warmup_lr_init = warmup_lr_init 39 | self.t_in_epochs = t_in_epochs 40 | if self.warmup_t: 41 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 42 | super().update_groups(self.warmup_lr_init) 43 | else: 44 | self.warmup_steps = [1 for _ in self.base_values] 45 | 46 | def _get_lr(self, t): 47 | if t < self.warmup_t: 48 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 49 | else: 50 | lrs = [v * (self.decay_rate ** (t // self.decay_t)) for v in self.base_values] 51 | return lrs 52 | 53 | def get_epoch_values(self, epoch: int): 54 | if self.t_in_epochs: 55 | return self._get_lr(epoch) 56 | else: 57 | return None 58 | 59 | def get_update_values(self, num_updates: int): 60 | if not self.t_in_epochs: 61 | return self._get_lr(num_updates) 62 | else: 63 | return None 64 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
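Boxes are handled either as (cx, cy, w, h), normalized to [0, 1] by the image
size, or as corner-format (x0, y0, x1, y1); for example, box_cxcywh_to_xyxy
below maps (0.5, 0.5, 0.2, 0.4) -> (0.4, 0.3, 0.6, 0.7).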
4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | import math 8 | 9 | def box_cxcywh_to_xyxy(x): # 这个用了 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 12 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 13 | return torch.stack(b, dim=-1) 14 | 15 | 16 | def box_xyxy_to_cxcywh(x): 17 | x0, y0, x1, y1 = x.unbind(-1) 18 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 19 | (x1 - x0), (y1 - y0)] 20 | return torch.stack(b, dim=-1) 21 | 22 | 23 | # modified from torchvision to also return the union 24 | def box_iou(boxes1, boxes2, test=False): 25 | 26 | area1 = box_area(boxes1) 27 | area2 = box_area(boxes2) 28 | 29 | # lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | # rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 32 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 33 | 34 | wh = (rb - lt).clamp(min=0) # [N,2] 35 | # inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 36 | inter = wh[:, 0] * wh[:, 1] # [N] 37 | 38 | # union = area1[:, None] + area2 - inter 39 | union = area1 + area2 - inter 40 | 41 | iou = inter / union 42 | 43 | if test: 44 | zero_lines = boxes2==torch.zeros_like(boxes2) 45 | zero_lines_idx = torch.where(zero_lines[:,0]==True)[0] 46 | 47 | for idx in zero_lines_idx: 48 | if all(boxes1[idx,:] < 1e-4): 49 | iou[idx]=1 50 | 51 | return iou, union 52 | 53 | 54 | def generalized_box_iou(boxes1, boxes2): 55 | """ 56 | Generalized IoU from https://giou.stanford.edu/ 57 | 58 | The boxes should be in [x0, y0, x1, y1] format 59 | 60 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 61 | and M = len(boxes2) 62 | """ 63 | iou, union = box_iou(boxes1, boxes2) 64 | 65 | # lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 66 | # rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 67 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 68 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 69 | 70 | wh = (rb - lt).clamp(min=0) # [N,M,2] 71 | # area = wh[:, :, 0] * wh[:, :, 1] 72 | area = wh[:, 0] * wh[:, 1] 73 | 74 | return iou - (area - union) / area 75 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torchvision import transforms 4 | from PIL import Image 5 | 6 | from dataset.dataset import DGM4_Dataset 7 | from dataset.randaugment import RandomAugment 8 | 9 | def create_dataset(config): 10 | 11 | normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) 12 | 13 | train_transform = transforms.Compose([ 14 | RandomAugment(2, 7, isPIL=True, augs=['Identity', 'AutoContrast', 'Equalize', 'Brightness', 'Sharpness']), 15 | transforms.ToTensor(), 16 | normalize, 17 | ]) 18 | 19 | test_transform = transforms.Compose([ 20 | transforms.Resize((config['image_res'],config['image_res']),interpolation=Image.BICUBIC), 21 | transforms.ToTensor(), 22 | normalize, 23 | ]) 24 | 25 | train_dataset = DGM4_Dataset(config=config, ann_file=config['train_file'], transform=train_transform, max_words=config['max_words'], is_train=True) 26 | val_dataset = DGM4_Dataset(config=config, ann_file=config['val_file'], transform=test_transform, max_words=config['max_words'], is_train=False) 27 | return train_dataset, val_dataset 28 | 29 | def create_sampler(datasets, shuffles, num_tasks, global_rank): 30 | samplers = [] 31 | for 
dataset,shuffle in zip(datasets,shuffles): 32 | sampler = torch.utils.data.DistributedSampler(dataset, num_replicas=num_tasks, rank=global_rank, shuffle=shuffle) 33 | samplers.append(sampler) 34 | return samplers 35 | 36 | 37 | def create_loader(datasets, samplers, batch_size, num_workers, is_trains, collate_fns): 38 | loaders = [] 39 | for dataset,sampler,bs,n_worker,is_train,collate_fn in zip(datasets,samplers,batch_size,num_workers,is_trains,collate_fns): 40 | if is_train: 41 | shuffle = (sampler is None) 42 | drop_last = True 43 | else: 44 | shuffle = False 45 | drop_last = False 46 | loader = DataLoader( 47 | dataset, 48 | batch_size=bs, 49 | num_workers=n_worker, 50 | pin_memory=True, 51 | sampler=sampler, 52 | shuffle=shuffle, 53 | collate_fn=collate_fn, 54 | drop_last=drop_last, 55 | ) 56 | loaders.append(loader) 57 | return loaders -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def init_dist(args): 8 | """Initialize the distributed computing environment.""" 9 | 10 | args.ngpus_per_node = torch.cuda.device_count() 11 | if args.launcher == 'pytorch': 12 | _init_dist_pytorch(args) 13 | elif args.launcher == 'mpi': 14 | _init_dist_mpi(args) 15 | elif args.launcher == 'slurm': 16 | _init_dist_slurm(args) 17 | else: 18 | raise ValueError('Invalid launcher type: {}'.format(args.launcher)) 19 | 20 | 21 | def _init_dist_pytorch(args, **kwargs): 22 | """Set up environment.""" 23 | # TODO: use local_rank instead of rank % num_gpus 24 | 25 | args.rank = args.rank * args.ngpus_per_node + args.gpu 26 | args.world_size = args.world_size 27 | 28 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 29 | world_size=args.world_size, rank=args.rank) 30 | torch.cuda.set_device(args.gpu) 31 | print(f"{args.dist_url}, ws:{args.world_size}, rank:{args.rank}") 32 | 33 | if args.rank % args.ngpus_per_node == 0: 34 | args.log = True 35 | else: 36 | args.log = False 37 | 38 | 39 | def _init_dist_slurm(args, port=23333, **kwargs): 40 | """Set up slurm environment.""" 41 | rank = int(os.environ['SLURM_PROCID']) 42 | world_size = int(os.environ['SLURM_NTASKS']) 43 | local_rank = int(os.environ['SLURM_LOCALID']) 44 | node_list = str(os.environ['SLURM_NODELIST']) 45 | num_gpus = torch.cuda.device_count() 46 | 47 | node_parts = re.findall('[0-9]+', node_list) 48 | host_ip = '{}.{}.{}.{}'.format(node_parts[1], node_parts[2], node_parts[3], node_parts[4]) 49 | init_method = 'tcp://{}:{}'.format(host_ip, port) 50 | 51 | print(f"{init_method}, rank: {rank}, local rank: {local_rank}") 52 | 53 | dist.init_process_group(backend=args.dist_backend, 54 | init_method=init_method, 55 | world_size=world_size, 56 | rank=rank) 57 | 58 | torch.cuda.set_device(local_rank) 59 | args.rank = rank 60 | args.world_size = world_size 61 | args.ngpus_per_node = num_gpus 62 | args.gpu = local_rank 63 | 64 | if args.rank == 0: 65 | args.log = True 66 | else: 67 | args.log = False 68 | 69 | 70 | def _init_dist_mpi(backend, **kwargs): 71 | raise NotImplementedError 72 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/novograd.py: -------------------------------------------------------------------------------- 1 | """NovoGrad Optimizer. 
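Rough per-tensor update sketch (as implemented in step() below, with g2 = ||g||^2):
    v_t = beta2 * v_{t-1} + (1 - beta2) * g2          (scalar second moment per tensor)
    m_t = beta1 * m_{t-1} + g_t / (sqrt(v_t) + eps) + weight_decay * p
    p  <- p - lr * sqrt(1 - beta2^t) / (1 - beta1^t) * m_t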
2 | Original impl by Masashi Kimura (Convergence Lab): https://github.com/convergence-lab/novograd 3 | Paper: `Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks` 4 | - https://arxiv.org/abs/1905.11286 5 | """ 6 | 7 | import torch 8 | from torch.optim.optimizer import Optimizer 9 | import math 10 | 11 | 12 | class NovoGrad(Optimizer): 13 | def __init__(self, params, grad_averaging=False, lr=0.1, betas=(0.95, 0.98), eps=1e-8, weight_decay=0): 14 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 15 | super(NovoGrad, self).__init__(params, defaults) 16 | self._lr = lr 17 | self._beta1 = betas[0] 18 | self._beta2 = betas[1] 19 | self._eps = eps 20 | self._wd = weight_decay 21 | self._grad_averaging = grad_averaging 22 | 23 | self._momentum_initialized = False 24 | 25 | def step(self, closure=None): 26 | loss = None 27 | if closure is not None: 28 | loss = closure() 29 | 30 | if not self._momentum_initialized: 31 | for group in self.param_groups: 32 | for p in group['params']: 33 | if p.grad is None: 34 | continue 35 | state = self.state[p] 36 | grad = p.grad.data 37 | if grad.is_sparse: 38 | raise RuntimeError('NovoGrad does not support sparse gradients') 39 | 40 | v = torch.norm(grad)**2 41 | m = grad/(torch.sqrt(v) + self._eps) + self._wd * p.data 42 | state['step'] = 0 43 | state['v'] = v 44 | state['m'] = m 45 | state['grad_ema'] = None 46 | self._momentum_initialized = True 47 | 48 | for group in self.param_groups: 49 | for p in group['params']: 50 | if p.grad is None: 51 | continue 52 | state = self.state[p] 53 | state['step'] += 1 54 | 55 | step, v, m = state['step'], state['v'], state['m'] 56 | grad_ema = state['grad_ema'] 57 | 58 | grad = p.grad.data 59 | g2 = torch.norm(grad)**2 60 | grad_ema = g2 if grad_ema is None else grad_ema * \ 61 | self._beta2 + g2 * (1. - self._beta2) 62 | grad *= 1.0 / (torch.sqrt(grad_ema) + self._eps) 63 | 64 | if self._grad_averaging: 65 | grad *= (1. - self._beta1) 66 | 67 | g2 = torch.norm(grad)**2 68 | v = self._beta2*v + (1. - self._beta2)*g2 69 | m = self._beta1*m + (grad / (torch.sqrt(v) + self._eps) + self._wd * p.data) 70 | bias_correction1 = 1 - self._beta1 ** step 71 | bias_correction2 = 1 - self._beta2 ** step 72 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 73 | 74 | state['v'], state['m'] = v, m 75 | state['grad_ema'] = grad_ema 76 | p.data.add_(-step_size, m) 77 | return loss 78 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/sgdp.py: -------------------------------------------------------------------------------- 1 | """ 2 | SGDP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/sgdp.py 3 | 4 | Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 5 | Code: https://github.com/clovaai/AdamP 6 | 7 | Copyright (c) 2020-present NAVER Corp. 
8 | MIT license 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.optim.optimizer import Optimizer, required 14 | import math 15 | 16 | class SGDP(Optimizer): 17 | def __init__(self, params, lr=required, momentum=0, dampening=0, 18 | weight_decay=0, nesterov=False, eps=1e-8, delta=0.1, wd_ratio=0.1): 19 | defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, 20 | nesterov=nesterov, eps=eps, delta=delta, wd_ratio=wd_ratio) 21 | super(SGDP, self).__init__(params, defaults) 22 | 23 | def _channel_view(self, x): 24 | return x.view(x.size(0), -1) 25 | 26 | def _layer_view(self, x): 27 | return x.view(1, -1) 28 | 29 | def _cosine_similarity(self, x, y, eps, view_func): 30 | x = view_func(x) 31 | y = view_func(y) 32 | 33 | x_norm = x.norm(dim=1).add_(eps) 34 | y_norm = y.norm(dim=1).add_(eps) 35 | dot = (x * y).sum(dim=1) 36 | 37 | return dot.abs() / x_norm / y_norm 38 | 39 | def _projection(self, p, grad, perturb, delta, wd_ratio, eps): 40 | wd = 1 41 | expand_size = [-1] + [1] * (len(p.shape) - 1) 42 | for view_func in [self._channel_view, self._layer_view]: 43 | 44 | cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) 45 | 46 | if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): 47 | p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) 48 | perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) 49 | wd = wd_ratio 50 | 51 | return perturb, wd 52 | 53 | return perturb, wd 54 | 55 | def step(self, closure=None): 56 | loss = None 57 | if closure is not None: 58 | loss = closure() 59 | 60 | for group in self.param_groups: 61 | weight_decay = group['weight_decay'] 62 | momentum = group['momentum'] 63 | dampening = group['dampening'] 64 | nesterov = group['nesterov'] 65 | 66 | for p in group['params']: 67 | if p.grad is None: 68 | continue 69 | grad = p.grad.data 70 | state = self.state[p] 71 | 72 | # State initialization 73 | if len(state) == 0: 74 | state['momentum'] = torch.zeros_like(p.data) 75 | 76 | # SGD 77 | buf = state['momentum'] 78 | buf.mul_(momentum).add_(1 - dampening, grad) 79 | if nesterov: 80 | d_p = grad + momentum * buf 81 | else: 82 | d_p = buf 83 | 84 | # Projection 85 | wd_ratio = 1 86 | if len(p.shape) > 1: 87 | d_p, wd_ratio = self._projection(p, grad, d_p, group['delta'], group['wd_ratio'], group['eps']) 88 | 89 | # Weight decay 90 | if weight_decay != 0: 91 | p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio / (1-momentum)) 92 | 93 | # Step 94 | p.data.add_(-group['lr'], d_p) 95 | 96 | return loss 97 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/scheduler_factory.py: -------------------------------------------------------------------------------- 1 | """ Scheduler Factory 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | from .cosine_lr import CosineLRScheduler 5 | from .tanh_lr import TanhLRScheduler 6 | from .step_lr import StepLRScheduler 7 | from .plateau_lr import PlateauLRScheduler 8 | 9 | 10 | def create_scheduler(args, optimizer): 11 | num_epochs = args.epochs 12 | 13 | if getattr(args, 'lr_noise', None) is not None: 14 | lr_noise = getattr(args, 'lr_noise') 15 | if isinstance(lr_noise, (list, tuple)): 16 | noise_range = [n * num_epochs for n in lr_noise] 17 | if len(noise_range) == 1: 18 | noise_range = noise_range[0] 19 | else: 20 | noise_range = lr_noise * num_epochs 21 | else: 22 | noise_range = None 23 | 24 | lr_scheduler = None 25 
| if args.sched == 'cosine': 26 | lr_scheduler = CosineLRScheduler( 27 | optimizer, 28 | t_initial=num_epochs, 29 | t_mul=getattr(args, 'lr_cycle_mul', 1.), 30 | lr_min=args.min_lr, 31 | decay_rate=args.decay_rate, 32 | warmup_lr_init=args.warmup_lr, 33 | warmup_t=args.warmup_epochs, 34 | cycle_limit=getattr(args, 'lr_cycle_limit', 1), 35 | t_in_epochs=True, 36 | noise_range_t=noise_range, 37 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 38 | noise_std=getattr(args, 'lr_noise_std', 1.), 39 | noise_seed=getattr(args, 'seed', 42), 40 | ) 41 | num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs 42 | elif args.sched == 'tanh': 43 | lr_scheduler = TanhLRScheduler( 44 | optimizer, 45 | t_initial=num_epochs, 46 | t_mul=getattr(args, 'lr_cycle_mul', 1.), 47 | lr_min=args.min_lr, 48 | warmup_lr_init=args.warmup_lr, 49 | warmup_t=args.warmup_epochs, 50 | cycle_limit=getattr(args, 'lr_cycle_limit', 1), 51 | t_in_epochs=True, 52 | noise_range_t=noise_range, 53 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 54 | noise_std=getattr(args, 'lr_noise_std', 1.), 55 | noise_seed=getattr(args, 'seed', 42), 56 | ) 57 | num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs 58 | elif args.sched == 'step': 59 | lr_scheduler = StepLRScheduler( 60 | optimizer, 61 | decay_t=args.decay_epochs, 62 | decay_rate=args.decay_rate, 63 | warmup_lr_init=args.warmup_lr, 64 | warmup_t=args.warmup_epochs, 65 | noise_range_t=noise_range, 66 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 67 | noise_std=getattr(args, 'lr_noise_std', 1.), 68 | noise_seed=getattr(args, 'seed', 42), 69 | ) 70 | elif args.sched == 'plateau': 71 | mode = 'min' if 'loss' in getattr(args, 'eval_metric', '') else 'max' 72 | lr_scheduler = PlateauLRScheduler( 73 | optimizer, 74 | decay_rate=args.decay_rate, 75 | patience_t=args.patience_epochs, 76 | lr_min=args.min_lr, 77 | mode=mode, 78 | warmup_lr_init=args.warmup_lr, 79 | warmup_t=args.warmup_epochs, 80 | cooldown_t=0, 81 | noise_range_t=noise_range, 82 | noise_pct=getattr(args, 'lr_noise_pct', 0.67), 83 | noise_std=getattr(args, 'lr_noise_std', 1.), 84 | noise_seed=getattr(args, 'seed', 42), 85 | ) 86 | elif args.sched == 'cosine_in_step': 87 | import scheduler.lr_sched as lr_scheduler 88 | 89 | return lr_scheduler, num_epochs 90 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | from distutils.command.config import config 2 | import json 3 | import os 4 | import random 5 | 6 | from torch.utils.data import Dataset 7 | import torch 8 | from PIL import Image 9 | from PIL import ImageFile 10 | ImageFile.LOAD_TRUNCATED_IMAGES = True 11 | Image.MAX_IMAGE_PIXELS = None 12 | 13 | from dataset.utils import pre_caption 14 | import os 15 | from torchvision.transforms.functional import hflip, resize 16 | 17 | import math 18 | import random 19 | from random import random as rand 20 | 21 | class DGM4_Dataset(Dataset): 22 | def __init__(self, config, ann_file, transform, max_words=30, is_train=True): 23 | 24 | self.root_dir = '../../datasets' 25 | self.ann = [] 26 | for f in ann_file: 27 | self.ann += json.load(open(f,'r')) 28 | if 'dataset_division' in config: 29 | self.ann = self.ann[:int(len(self.ann)/config['dataset_division'])] 30 | 31 | self.transform = transform 32 | self.max_words = max_words 33 | self.image_res = config['image_res'] 34 | 35 | self.is_train = is_train 36 | 37 | def __len__(self): 38 | return 
len(self.ann) 39 | 40 | def get_bbox(self, bbox): 41 | xmin, ymin, xmax, ymax = bbox 42 | w = xmax - xmin 43 | h = ymax - ymin 44 | return int(xmin), int(ymin), int(w), int(h) 45 | 46 | def __getitem__(self, index): 47 | 48 | ann = self.ann[index] 49 | img_dir = ann['image'] 50 | image_dir_all = f'{self.root_dir}/{img_dir}' 51 | 52 | try: 53 | image = Image.open(image_dir_all).convert('RGB') 54 | except Warning: 55 | raise ValueError("### Warning: fakenews_dataset Image.open") 56 | 57 | W, H = image.size 58 | has_bbox = False 59 | try: 60 | x, y, w, h = self.get_bbox(ann['fake_image_box']) 61 | has_bbox = True 62 | except: 63 | fake_image_box = torch.tensor([0, 0, 0, 0], dtype=torch.float) 64 | 65 | do_hflip = False 66 | if self.is_train: 67 | if rand() < 0.5: 68 | # flipped applied 69 | image = hflip(image) 70 | do_hflip = True 71 | 72 | image = resize(image, [self.image_res, self.image_res], interpolation=Image.BICUBIC) 73 | image = self.transform(image) 74 | 75 | if has_bbox: 76 | # flipped applied 77 | if do_hflip: 78 | x = (W - x) - w # W is w0 79 | 80 | # resize applied 81 | x = self.image_res / W * x 82 | w = self.image_res / W * w 83 | y = self.image_res / H * y 84 | h = self.image_res / H * h 85 | 86 | center_x = x + 1 / 2 * w 87 | center_y = y + 1 / 2 * h 88 | 89 | fake_image_box = torch.tensor([center_x / self.image_res, 90 | center_y / self.image_res, 91 | w / self.image_res, 92 | h / self.image_res], 93 | dtype=torch.float) 94 | 95 | label = ann['fake_cls'] 96 | caption = pre_caption(ann['text'], self.max_words) 97 | fake_text_pos = ann['fake_text_pos'] 98 | 99 | fake_text_pos_list = torch.zeros(self.max_words) 100 | 101 | for i in fake_text_pos: 102 | if i 2 | 3 |

Unleashing the Potential of Consistency Learning for Detecting and Grounding Multi-Modal Media Manipulation

4 |

Yiheng Li, Yang Yang, Zichang Tan, Huan Liu, Weihua Chen, Xu Zhou and Zhen Lei

5 |

MAIS&CASIA, UCAS, Sangfor, BJTU and Alibaba
6 | 7 | 8 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2506.05890) 9 | 10 | ## Introduction 11 | 12 | This repository is an official implementation of CSCL. 13 | 14 | ## News 15 | - [2025/6/9] The camera-ready version is released. 16 | - [2025/6/9] Code and weights are released. 17 | - [2025/2/27] CSCL is accepted by CVPR 2025🎉🎉. 18 | 19 | ## Environment Setting 20 | ``` 21 | conda create -n CSCL python=3.8 22 | conda activate CSCL 23 | pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/cu111/torch_stable.html 24 | pip install -r code/MultiModal-DeepFake-main/requirements.txt 25 | ``` 26 | ## Data Preparation 27 | Here are the required pre-trained models: 28 | 29 | Download meter_clip16_224_roberta_pretrain.ckpt: [link](https://drive.usercontent.google.com/download?id=1x4qm2rlYKxpYF3F_xI5ZKurTtFnndq3l&export=download&authuser=0&confirm=t&uuid=9356ef04-1b7b-444c-80be-4bf21fab8bda&at=AIrpjvOLjj-J08OdrxQf_rCxV7Zp:1739190851890) 30 | 31 | Download ViT-B-16.pt: [link](https://drive.usercontent.google.com/download?id=1GL3kOw-lmbD5abJCaaktLrODqxMllpd6&export=download&authuser=0&confirm=t&uuid=5a286816-fa87-4fd0-a75d-825ec03966e4&at=AIrpjvMiXdIVW3BRne33Y_-pvh1D:1739190843518) 32 | 33 | Download roberta-base: [link](https://huggingface.co/FacebookAI/roberta-base/tree/main) 34 | 35 | Download the DGM4 dataset: [link](https://huggingface.co/datasets/rshaojimmy/DGM4) 36 | 37 | The folder structure: 38 | ``` 39 | ./ 40 | ├── code 41 | │ └── MultiModal-Deepfake (this GitHub repo) 42 | │ ├── configs 43 | │ │ └──... 44 | │ ├── dataset 45 | │ │ └──... 46 | │ ├── models 47 | │ │ └──... 48 | │ ... 49 | │ ├── roberta-base 50 | │ ├── ViT-B-16.pt 51 | │ └── meter_clip16_224_roberta_pretrain.ckpt 52 | └── datasets 53 | └── DGM4 54 | ├── manipulation 55 | ├── origin 56 | └── metadata 57 | ``` 58 | 59 | Our pre-trained CSCL model: [link](https://drive.usercontent.google.com/download?id=1ZW4akTzcB9QjsS6FcX4zQ5l2YOjl7zNy&export=download&authuser=0&confirm=t&uuid=e8e37fa5-46fd-48bb-be4b-be765ca86059&at=AIrpjvM1Jjby7_AjinIBFS9d61TL:1739189602615) (96.34 AUC, 92.48 mAP, 84.07 IoUm, 76.62 F1) (We train on the combined train and val sets and evaluate on the test set.) 60 | 61 | Make a folder ./results/CSCL/ and put the pre-trained model in it. 62 | 63 | ## Train 64 | ``` 65 | sh train.sh 66 | ``` 67 | ## Inference 68 | 69 | Evaluation 70 | ``` 71 | sh test.sh 72 | ``` 73 | Visualization 74 | ``` 75 | use the visualize_res function in utils.py (refer to test.py for details). 76 | ``` 77 | Evaluation on text or image subset 78 | ``` 79 | refer to line 136 in test.py. 
80 | ``` 81 | ## Acknowledgements 82 | We thank these great works and open-source codebases: 83 | [DGM4](https://github.com/rshaojimmy/MultiModal-DeepFake?tab=readme-ov-file) and [METER](https://github.com/zdou0830/METER). 84 | 85 | ## Citation 86 | If you find our work useful, please give this repo a star and cite it as: 87 | ```bibtex 88 | @inproceedings{li2025unleashing, 89 | title={Unleashing the Potential of Consistency Learning for Detecting and Grounding Multi-Modal Media Manipulation}, 90 | author={Li, Yiheng and Yang, Yang and Tan, Zichang and Liu, Huan and Chen, Weihua and Zhou, Xu and Lei, Zhen}, 91 | booktitle={Proceedings of the Computer Vision and Pattern Recognition Conference}, 92 | pages={9242--9252}, 93 | year={2025} 94 | } 95 | ``` 96 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/nadam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Optimizer 3 | 4 | 5 | class Nadam(Optimizer): 6 | """Implements Nadam algorithm (a variant of Adam based on Nesterov momentum). 7 | 8 | It has been proposed in `Incorporating Nesterov Momentum into Adam`__. 9 | 10 | Arguments: 11 | params (iterable): iterable of parameters to optimize or dicts defining 12 | parameter groups 13 | lr (float, optional): learning rate (default: 2e-3) 14 | betas (Tuple[float, float], optional): coefficients used for computing 15 | running averages of gradient and its square 16 | eps (float, optional): term added to the denominator to improve 17 | numerical stability (default: 1e-8) 18 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 19 | schedule_decay (float, optional): momentum schedule decay (default: 4e-3) 20 | 21 | __ http://cs229.stanford.edu/proj2015/054_report.pdf 22 | __ http://www.cs.toronto.edu/~fritz/absps/momentum.pdf 23 | 24 | Originally taken from: https://github.com/pytorch/pytorch/pull/1408 25 | NOTE: Has potential issues but does work well on some problems. 26 | """ 27 | 28 | def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, 29 | weight_decay=0, schedule_decay=4e-3): 30 | defaults = dict(lr=lr, betas=betas, eps=eps, 31 | weight_decay=weight_decay, schedule_decay=schedule_decay) 32 | super(Nadam, self).__init__(params, defaults) 33 | 34 | def step(self, closure=None): 35 | """Performs a single optimization step. 36 | 37 | Arguments: 38 | closure (callable, optional): A closure that reevaluates the model 39 | and returns the loss. 40 | """ 41 | loss = None 42 | if closure is not None: 43 | loss = closure() 44 | 45 | for group in self.param_groups: 46 | for p in group['params']: 47 | if p.grad is None: 48 | continue 49 | grad = p.grad.data 50 | state = self.state[p] 51 | 52 | # State initialization 53 | if len(state) == 0: 54 | state['step'] = 0 55 | state['m_schedule'] = 1. 56 | state['exp_avg'] = grad.new().resize_as_(grad).zero_() 57 | state['exp_avg_sq'] = grad.new().resize_as_(grad).zero_() 58 | 59 | # Warming momentum schedule 60 | m_schedule = state['m_schedule'] 61 | schedule_decay = group['schedule_decay'] 62 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 63 | beta1, beta2 = group['betas'] 64 | eps = group['eps'] 65 | state['step'] += 1 66 | t = state['step'] 67 | 68 | if group['weight_decay'] != 0: 69 | grad = grad.add(group['weight_decay'], p.data) 70 | 71 | momentum_cache_t = beta1 * \ 72 | (1.
- 0.5 * (0.96 ** (t * schedule_decay))) 73 | momentum_cache_t_1 = beta1 * \ 74 | (1. - 0.5 * (0.96 ** ((t + 1) * schedule_decay))) 75 | m_schedule_new = m_schedule * momentum_cache_t 76 | m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1 77 | state['m_schedule'] = m_schedule_new 78 | 79 | # Decay the first and second moment running average coefficient 80 | exp_avg.mul_(beta1).add_(1. - beta1, grad) 81 | exp_avg_sq.mul_(beta2).addcmul_(1. - beta2, grad, grad) 82 | exp_avg_sq_prime = exp_avg_sq / (1. - beta2 ** t) 83 | denom = exp_avg_sq_prime.sqrt_().add_(eps) 84 | 85 | p.data.addcdiv_(-group['lr'] * (1. - momentum_cache_t) / (1. - m_schedule_new), grad, denom) 86 | p.data.addcdiv_(-group['lr'] * momentum_cache_t_1 / (1. - m_schedule_next), exp_avg, denom) 87 | 88 | return loss 89 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/lookahead.py: -------------------------------------------------------------------------------- 1 | """ Lookahead Optimizer Wrapper. 2 | Implementation modified from: https://github.com/alphadl/lookahead.pytorch 3 | Paper: `Lookahead Optimizer: k steps forward, 1 step back` - https://arxiv.org/abs/1907.08610 4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import torch 8 | from torch.optim.optimizer import Optimizer 9 | from collections import defaultdict 10 | 11 | 12 | class Lookahead(Optimizer): 13 | def __init__(self, base_optimizer, alpha=0.5, k=6): 14 | if not 0.0 <= alpha <= 1.0: 15 | raise ValueError(f'Invalid slow update rate: {alpha}') 16 | if not 1 <= k: 17 | raise ValueError(f'Invalid lookahead steps: {k}') 18 | defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) 19 | self.base_optimizer = base_optimizer 20 | self.param_groups = self.base_optimizer.param_groups 21 | self.defaults = base_optimizer.defaults 22 | self.defaults.update(defaults) 23 | self.state = defaultdict(dict) 24 | # manually add our defaults to the param groups 25 | for name, default in defaults.items(): 26 | for group in self.param_groups: 27 | group.setdefault(name, default) 28 | 29 | def update_slow(self, group): 30 | for fast_p in group["params"]: 31 | if fast_p.grad is None: 32 | continue 33 | param_state = self.state[fast_p] 34 | if 'slow_buffer' not in param_state: 35 | param_state['slow_buffer'] = torch.empty_like(fast_p.data) 36 | param_state['slow_buffer'].copy_(fast_p.data) 37 | slow = param_state['slow_buffer'] 38 | slow.add_(group['lookahead_alpha'], fast_p.data - slow) 39 | fast_p.data.copy_(slow) 40 | 41 | def sync_lookahead(self): 42 | for group in self.param_groups: 43 | self.update_slow(group) 44 | 45 | def step(self, closure=None): 46 | #assert id(self.param_groups) == id(self.base_optimizer.param_groups) 47 | loss = self.base_optimizer.step(closure) 48 | for group in self.param_groups: 49 | group['lookahead_step'] += 1 50 | if group['lookahead_step'] % group['lookahead_k'] == 0: 51 | self.update_slow(group) 52 | return loss 53 | 54 | def state_dict(self): 55 | fast_state_dict = self.base_optimizer.state_dict() 56 | slow_state = { 57 | (id(k) if isinstance(k, torch.Tensor) else k): v 58 | for k, v in self.state.items() 59 | } 60 | fast_state = fast_state_dict['state'] 61 | param_groups = fast_state_dict['param_groups'] 62 | return { 63 | 'state': fast_state, 64 | 'slow_state': slow_state, 65 | 'param_groups': param_groups, 66 | } 67 | 68 | def load_state_dict(self, state_dict): 69 | fast_state_dict = { 70 | 'state': state_dict['state'], 71 | 
'param_groups': state_dict['param_groups'], 72 | } 73 | self.base_optimizer.load_state_dict(fast_state_dict) 74 | 75 | # We want to restore the slow state, but share param_groups reference 76 | # with base_optimizer. This is a bit redundant but least code 77 | slow_state_new = False 78 | if 'slow_state' not in state_dict: 79 | print('Loading state_dict from optimizer without Lookahead applied.') 80 | state_dict['slow_state'] = defaultdict(dict) 81 | slow_state_new = True 82 | slow_state_dict = { 83 | 'state': state_dict['slow_state'], 84 | 'param_groups': state_dict['param_groups'], # this is pointless but saves code 85 | } 86 | super(Lookahead, self).load_state_dict(slow_state_dict) 87 | self.param_groups = self.base_optimizer.param_groups # make both ref same container 88 | if slow_state_new: 89 | # reapply defaults to catch missing lookahead specific ones 90 | for name, default in self.defaults.items(): 91 | for group in self.param_groups: 92 | group.setdefault(name, default) 93 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py 3 | 4 | Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 5 | Code: https://github.com/clovaai/AdamP 6 | 7 | Copyright (c) 2020-present NAVER Corp. 8 | MIT license 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.optim.optimizer import Optimizer, required 14 | import math 15 | 16 | class AdamP(Optimizer): 17 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 18 | weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False): 19 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 20 | delta=delta, wd_ratio=wd_ratio, nesterov=nesterov) 21 | super(AdamP, self).__init__(params, defaults) 22 | 23 | def _channel_view(self, x): 24 | return x.view(x.size(0), -1) 25 | 26 | def _layer_view(self, x): 27 | return x.view(1, -1) 28 | 29 | def _cosine_similarity(self, x, y, eps, view_func): 30 | x = view_func(x) 31 | y = view_func(y) 32 | 33 | x_norm = x.norm(dim=1).add_(eps) 34 | y_norm = y.norm(dim=1).add_(eps) 35 | dot = (x * y).sum(dim=1) 36 | 37 | return dot.abs() / x_norm / y_norm 38 | 39 | def _projection(self, p, grad, perturb, delta, wd_ratio, eps): 40 | wd = 1 41 | expand_size = [-1] + [1] * (len(p.shape) - 1) 42 | for view_func in [self._channel_view, self._layer_view]: 43 | 44 | cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) 45 | 46 | if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): 47 | p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) 48 | perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) 49 | wd = wd_ratio 50 | 51 | return perturb, wd 52 | 53 | return perturb, wd 54 | 55 | def step(self, closure=None): 56 | loss = None 57 | if closure is not None: 58 | loss = closure() 59 | 60 | for group in self.param_groups: 61 | for p in group['params']: 62 | if p.grad is None: 63 | continue 64 | 65 | grad = p.grad.data 66 | beta1, beta2 = group['betas'] 67 | nesterov = group['nesterov'] 68 | 69 | state = self.state[p] 70 | 71 | # State initialization 72 | if len(state) == 0: 73 | state['step'] = 0 74 | state['exp_avg'] = torch.zeros_like(p.data) 75 | state['exp_avg_sq'] = torch.zeros_like(p.data) 76 | 77 | 
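# Sketch of what follows: the standard Adam moments are computed below, and
# _projection() then removes the component of the update parallel to p
# (channel- or layer-wise) whenever grad is nearly orthogonal to p, which
# curbs the weight-norm growth analyzed in the AdamP paper.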
# Adam 78 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 79 | 80 | state['step'] += 1 81 | bias_correction1 = 1 - beta1 ** state['step'] 82 | bias_correction2 = 1 - beta2 ** state['step'] 83 | 84 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 85 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 86 | 87 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 88 | step_size = group['lr'] / bias_correction1 89 | 90 | if nesterov: 91 | perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom 92 | else: 93 | perturb = exp_avg / denom 94 | 95 | # Projection 96 | wd_ratio = 1 97 | if len(p.shape) > 1: 98 | perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], group['eps']) 99 | 100 | # Weight decay 101 | if group['weight_decay'] > 0: 102 | p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio) 103 | 104 | # Step 105 | p.data.add_(-step_size, perturb) 106 | 107 | return loss 108 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/cosine_lr.py: -------------------------------------------------------------------------------- 1 | """ Cosine Scheduler 2 | 3 | Cosine LR schedule with warmup, cycle/restarts, noise. 4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import logging 8 | import math 9 | import numpy as np 10 | import torch 11 | 12 | from .scheduler import Scheduler 13 | 14 | from pdb import set_trace as breakpoint 15 | 16 | _logger = logging.getLogger(__name__) 17 | 18 | 19 | class CosineLRScheduler(Scheduler): 20 | """ 21 | Cosine decay with restarts. 22 | This is described in the paper https://arxiv.org/abs/1608.03983. 23 | 24 | Inspiration from 25 | https://github.com/allenai/allennlp/blob/master/allennlp/training/learning_rate_schedulers/cosine.py 26 | """ 27 | 28 | def __init__(self, 29 | optimizer: torch.optim.Optimizer, 30 | t_initial: int, 31 | t_mul: float = 1., 32 | lr_min: float = 0., 33 | decay_rate: float = 1., 34 | warmup_t=0, 35 | warmup_lr_init=0, 36 | warmup_prefix=True, 37 | cycle_limit=0, 38 | t_in_epochs=True, 39 | noise_range_t=None, 40 | noise_pct=0.67, 41 | noise_std=1.0, 42 | noise_seed=42, 43 | initialize=True) -> None: 44 | super().__init__( 45 | optimizer, param_group_field="lr", 46 | noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 47 | initialize=initialize) 48 | 49 | assert t_initial > 0 50 | assert lr_min >= 0 51 | if t_initial == 1 and t_mul == 1 and decay_rate == 1: 52 | _logger.warning("Cosine annealing scheduler will have no effect on the learning " 53 | "rate since t_initial = t_mul = eta_mul = 1.") 54 | self.t_initial = t_initial 55 | self.t_mul = t_mul 56 | self.lr_min = lr_min 57 | self.decay_rate = decay_rate 58 | self.cycle_limit = cycle_limit 59 | self.warmup_t = warmup_t 60 | self.warmup_lr_init = warmup_lr_init 61 | self.warmup_prefix = warmup_prefix 62 | self.t_in_epochs = t_in_epochs 63 | if self.warmup_t: 64 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 65 | super().update_groups(self.warmup_lr_init) 66 | else: 67 | self.warmup_steps = [1 for _ in self.base_values] 68 | 69 | def _get_lr(self, t): 70 | if t < self.warmup_t: 71 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 72 | else: 73 | if self.warmup_prefix: 74 | t = t - self.warmup_t 75 | 76 | if self.t_mul != 1: 77 | i = math.floor(math.log(1 - t / self.t_initial * (1 - self.t_mul), self.t_mul)) 78 | t_i = 
self.t_mul ** i * self.t_initial 79 | t_curr = t - (1 - self.t_mul ** i) / (1 - self.t_mul) * self.t_initial 80 | else: 81 | i = t // self.t_initial 82 | t_i = self.t_initial 83 | t_curr = t - (self.t_initial * i) 84 | 85 | gamma = self.decay_rate ** i 86 | lr_min = self.lr_min * gamma 87 | lr_max_values = [v * gamma for v in self.base_values] 88 | 89 | if self.cycle_limit == 0 or (self.cycle_limit > 0 and i < self.cycle_limit): 90 | lrs = [ 91 | lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * t_curr / t_i)) for lr_max in lr_max_values 92 | ] 93 | else: 94 | lrs = [self.lr_min for _ in self.base_values] 95 | 96 | return lrs 97 | 98 | def get_epoch_values(self, epoch: int): 99 | if self.t_in_epochs: 100 | return self._get_lr(epoch) 101 | else: 102 | return None 103 | 104 | def get_update_values(self, num_updates: int): 105 | if not self.t_in_epochs: 106 | return self._get_lr(num_updates) 107 | else: 108 | return None 109 | 110 | def get_cycle_length(self, cycles=0): 111 | if not cycles: 112 | cycles = self.cycle_limit 113 | cycles = max(1, cycles) 114 | if self.t_mul == 1.0: 115 | return self.t_initial * cycles 116 | else: 117 | return int(math.floor(-self.t_initial * (self.t_mul ** cycles - 1) / (1 - self.t_mul))) 118 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/tanh_lr.py: -------------------------------------------------------------------------------- 1 | """ TanH Scheduler 2 | 3 | TanH schedule with warmup, cycle/restarts, noise. 4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import logging 8 | import math 9 | import numpy as np 10 | import torch 11 | 12 | from .scheduler import Scheduler 13 | 14 | 15 | _logger = logging.getLogger(__name__) 16 | 17 | 18 | class TanhLRScheduler(Scheduler): 19 | """ 20 | Hyperbolic-Tangent decay with restarts.
21 | This is described in the paper https://arxiv.org/abs/1806.01593 22 | """ 23 | 24 | def __init__(self, 25 | optimizer: torch.optim.Optimizer, 26 | t_initial: int, 27 | lb: float = -6., 28 | ub: float = 4., 29 | t_mul: float = 1., 30 | lr_min: float = 0., 31 | decay_rate: float = 1., 32 | warmup_t=0, 33 | warmup_lr_init=0, 34 | warmup_prefix=False, 35 | cycle_limit=0, 36 | t_in_epochs=True, 37 | noise_range_t=None, 38 | noise_pct=0.67, 39 | noise_std=1.0, 40 | noise_seed=42, 41 | initialize=True) -> None: 42 | super().__init__( 43 | optimizer, param_group_field="lr", 44 | noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 45 | initialize=initialize) 46 | 47 | assert t_initial > 0 48 | assert lr_min >= 0 49 | assert lb < ub 50 | assert cycle_limit >= 0 51 | assert warmup_t >= 0 52 | assert warmup_lr_init >= 0 53 | self.lb = lb 54 | self.ub = ub 55 | self.t_initial = t_initial 56 | self.t_mul = t_mul 57 | self.lr_min = lr_min 58 | self.decay_rate = decay_rate 59 | self.cycle_limit = cycle_limit 60 | self.warmup_t = warmup_t 61 | self.warmup_lr_init = warmup_lr_init 62 | self.warmup_prefix = warmup_prefix 63 | self.t_in_epochs = t_in_epochs 64 | if self.warmup_t: 65 | t_v = self.base_values if self.warmup_prefix else self._get_lr(self.warmup_t) 66 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in t_v] 67 | super().update_groups(self.warmup_lr_init) 68 | else: 69 | self.warmup_steps = [1 for _ in self.base_values] 70 | 71 | def _get_lr(self, t): 72 | if t < self.warmup_t: 73 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 74 | else: 75 | if self.warmup_prefix: 76 | t = t - self.warmup_t 77 | 78 | if self.t_mul != 1: 79 | i = math.floor(math.log(1 - t / self.t_initial * (1 - self.t_mul), self.t_mul)) 80 | t_i = self.t_mul ** i * self.t_initial 81 | t_curr = t - (1 - self.t_mul ** i) / (1 - self.t_mul) * self.t_initial 82 | else: 83 | i = t // self.t_initial 84 | t_i = self.t_initial 85 | t_curr = t - (self.t_initial * i) 86 | 87 | if self.cycle_limit == 0 or (self.cycle_limit > 0 and i < self.cycle_limit): 88 | gamma = self.decay_rate ** i 89 | lr_min = self.lr_min * gamma 90 | lr_max_values = [v * gamma for v in self.base_values] 91 | 92 | tr = t_curr / t_i 93 | lrs = [ 94 | lr_min + 0.5 * (lr_max - lr_min) * (1 - math.tanh(self.lb * (1. - tr) + self.ub * tr)) 95 | for lr_max in lr_max_values 96 | ] 97 | else: 98 | lrs = [self.lr_min * (self.decay_rate ** self.cycle_limit) for _ in self.base_values] 99 | return lrs 100 | 101 | def get_epoch_values(self, epoch: int): 102 | if self.t_in_epochs: 103 | return self._get_lr(epoch) 104 | else: 105 | return None 106 | 107 | def get_update_values(self, num_updates: int): 108 | if not self.t_in_epochs: 109 | return self._get_lr(num_updates) 110 | else: 111 | return None 112 | 113 | def get_cycle_length(self, cycles=0): 114 | if not cycles: 115 | cycles = self.cycle_limit 116 | cycles = max(1, cycles) 117 | if self.t_mul == 1.0: 118 | return self.t_initial * cycles 119 | else: 120 | return int(math.floor(-self.t_initial * (self.t_mul ** cycles - 1) / (1 - self.t_mul))) 121 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/plateau_lr.py: -------------------------------------------------------------------------------- 1 | """ Plateau Scheduler 2 | 3 | Adapts PyTorch plateau scheduler and allows application of noise, warmup. 
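    [annotation, not part of the original source: the wrapper below delegates the
    plateau logic to torch.optim.lr_scheduler.ReduceLROnPlateau and layers linear
    warmup and the package's shared LR-noise mechanism on top of it.]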
4 | 5 | Hacked together by / Copyright 2020 Ross Wightman 6 | """ 7 | import torch 8 | 9 | from .scheduler import Scheduler 10 | 11 | 12 | class PlateauLRScheduler(Scheduler): 13 | """Decay the LR by a factor every time the validation loss plateaus.""" 14 | 15 | def __init__(self, 16 | optimizer, 17 | decay_rate=0.1, 18 | patience_t=10, 19 | verbose=True, 20 | threshold=1e-4, 21 | cooldown_t=0, 22 | warmup_t=0, 23 | warmup_lr_init=0, 24 | lr_min=0, 25 | mode='max', 26 | noise_range_t=None, 27 | noise_type='normal', 28 | noise_pct=0.67, 29 | noise_std=1.0, 30 | noise_seed=None, 31 | initialize=True, 32 | ): 33 | super().__init__(optimizer, 'lr', initialize=initialize) 34 | 35 | self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 36 | self.optimizer, 37 | patience=patience_t, 38 | factor=decay_rate, 39 | verbose=verbose, 40 | threshold=threshold, 41 | cooldown=cooldown_t, 42 | mode=mode, 43 | min_lr=lr_min 44 | ) 45 | 46 | self.noise_range = noise_range_t 47 | self.noise_pct = noise_pct 48 | self.noise_type = noise_type 49 | self.noise_std = noise_std 50 | self.noise_seed = noise_seed if noise_seed is not None else 42 51 | self.warmup_t = warmup_t 52 | self.warmup_lr_init = warmup_lr_init 53 | if self.warmup_t: 54 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 55 | super().update_groups(self.warmup_lr_init) 56 | else: 57 | self.warmup_steps = [1 for _ in self.base_values] 58 | self.restore_lr = None 59 | 60 | def state_dict(self): 61 | return { 62 | 'best': self.lr_scheduler.best, 63 | 'last_epoch': self.lr_scheduler.last_epoch, 64 | } 65 | 66 | def load_state_dict(self, state_dict): 67 | self.lr_scheduler.best = state_dict['best'] 68 | if 'last_epoch' in state_dict: 69 | self.lr_scheduler.last_epoch = state_dict['last_epoch'] 70 | 71 | # override the base class step fn completely 72 | def step(self, epoch, metric=None): 73 | if epoch <= self.warmup_t: 74 | lrs = [self.warmup_lr_init + epoch * s for s in self.warmup_steps] 75 | super().update_groups(lrs) 76 | else: 77 | if self.restore_lr is not None: 78 | # restore actual LR from before our last noise perturbation before stepping base 79 | for i, param_group in enumerate(self.optimizer.param_groups): 80 | param_group['lr'] = self.restore_lr[i] 81 | self.restore_lr = None 82 | 83 | self.lr_scheduler.step(metric, epoch) # step the base scheduler 84 | 85 | if self.noise_range is not None: 86 | if isinstance(self.noise_range, (list, tuple)): 87 | apply_noise = self.noise_range[0] <= epoch < self.noise_range[1] 88 | else: 89 | apply_noise = epoch >= self.noise_range 90 | if apply_noise: 91 | self._apply_noise(epoch) 92 | 93 | def _apply_noise(self, epoch): 94 | g = torch.Generator() 95 | g.manual_seed(self.noise_seed + epoch) 96 | if self.noise_type == 'normal': 97 | while True: 98 | # resample if noise out of percent limit, brute force but shouldn't spin much 99 | noise = torch.randn(1, generator=g).item() 100 | if abs(noise) < self.noise_pct: 101 | break 102 | else: 103 | noise = 2 * (torch.rand(1, generator=g).item() - 0.5) * self.noise_pct 104 | 105 | # apply the noise on top of previous LR, cache the old value so we can restore for normal 106 | # stepping of base scheduler 107 | restore_lr = [] 108 | for i, param_group in enumerate(self.optimizer.param_groups): 109 | old_lr = float(param_group['lr']) 110 | restore_lr.append(old_lr) 111 | new_lr = old_lr + old_lr * noise 112 | param_group['lr'] = new_lr 113 | self.restore_lr = restore_lr 114 | 
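A minimal usage sketch for the schedulers in this directory (an annotation, not part of the repository). It assumes the repo root is on PYTHONPATH so that `scheduler.cosine_lr` is importable; the tiny model and the loop are invented for illustration:

import torch
from scheduler.cosine_lr import CosineLRScheduler

model = torch.nn.Linear(8, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)

# 5 warmup epochs ramping 1e-6 -> 0.1, then a 25-epoch cosine decay to 1e-5
sched = CosineLRScheduler(opt, t_initial=25, lr_min=1e-5,
                          warmup_t=5, warmup_lr_init=1e-6)

for epoch in range(30):
    # ... one epoch of training with `opt` goes here ...
    sched.step(epoch + 1)  # end-of-epoch call computes the next epoch's LR
    print(epoch, opt.param_groups[0]["lr"])

# PlateauLRScheduler is driven by a monitored metric instead of the epoch index:
#   sched = PlateauLRScheduler(opt, decay_rate=0.5, patience_t=3, mode="min")
#   sched.step(epoch, metric=val_loss)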
-------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/scheduler/scheduler.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | import torch 4 | 5 | 6 | class Scheduler: 7 | """ Parameter Scheduler Base Class 8 | A scheduler base class that can be used to schedule any optimizer parameter groups. 9 | 10 | Unlike the builtin PyTorch schedulers, this is intended to be consistently called 11 | * At the END of each epoch, before incrementing the epoch count, to calculate next epoch's value 12 | * At the END of each optimizer update, after incrementing the update count, to calculate next update's value 13 | 14 | The schedulers built on this should try to remain as stateless as possible (for simplicity). 15 | 16 | This family of schedulers is attempting to avoid the confusion of the meaning of 'last_epoch' 17 | and -1 values for special behaviour. All epoch and update counts must be tracked in the training 18 | code and explicitly passed in to the schedulers on the corresponding step or step_update call. 19 | 20 | Based on ideas from: 21 | * https://github.com/pytorch/fairseq/tree/master/fairseq/optim/lr_scheduler 22 | * https://github.com/allenai/allennlp/tree/master/allennlp/training/learning_rate_schedulers 23 | """ 24 | 25 | def __init__(self, 26 | optimizer: torch.optim.Optimizer, 27 | param_group_field: str, 28 | noise_range_t=None, 29 | noise_type='normal', 30 | noise_pct=0.67, 31 | noise_std=1.0, 32 | noise_seed=None, 33 | initialize: bool = True) -> None: 34 | self.optimizer = optimizer 35 | self.param_group_field = param_group_field 36 | self._initial_param_group_field = f"initial_{param_group_field}" 37 | if initialize: 38 | for i, group in enumerate(self.optimizer.param_groups): 39 | if param_group_field not in group: 40 | raise KeyError(f"{param_group_field} missing from param_groups[{i}]") 41 | group.setdefault(self._initial_param_group_field, group[param_group_field]) 42 | else: 43 | for i, group in enumerate(self.optimizer.param_groups): 44 | if self._initial_param_group_field not in group: 45 | raise KeyError(f"{self._initial_param_group_field} missing from param_groups[{i}]") 46 | self.base_values = [group[self._initial_param_group_field] for group in self.optimizer.param_groups] 47 | self.metric = None # any point to having this for all? 
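        # [annotation, not part of the original source] The fields below configure
        # the optional LR-noise perturbation applied by _add_noise() further down:
        # noise_range_t is either a scalar threshold or a [start, end) pair of
        # epochs/updates during which noise is applied, noise_pct bounds the noise
        # magnitude, and noise_seed makes the perturbation reproducible.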
48 | self.noise_range_t = noise_range_t 49 | self.noise_pct = noise_pct 50 | self.noise_type = noise_type 51 | self.noise_std = noise_std 52 | self.noise_seed = noise_seed if noise_seed is not None else 42 53 | self.update_groups(self.base_values) 54 | 55 | def state_dict(self) -> Dict[str, Any]: 56 | return {key: value for key, value in self.__dict__.items() if key != 'optimizer'} 57 | 58 | def load_state_dict(self, state_dict: Dict[str, Any]) -> None: 59 | self.__dict__.update(state_dict) 60 | 61 | def get_epoch_values(self, epoch: int): 62 | return None 63 | 64 | def get_update_values(self, num_updates: int): 65 | return None 66 | 67 | def step(self, epoch: int, metric: float = None) -> None: 68 | self.metric = metric 69 | values = self.get_epoch_values(epoch) 70 | if values is not None: 71 | values = self._add_noise(values, epoch) 72 | self.update_groups(values) 73 | 74 | def step_update(self, num_updates: int, metric: float = None): 75 | self.metric = metric 76 | values = self.get_update_values(num_updates) 77 | if values is not None: 78 | values = self._add_noise(values, num_updates) 79 | self.update_groups(values) 80 | 81 | def update_groups(self, values): 82 | if not isinstance(values, (list, tuple)): 83 | values = [values] * len(self.optimizer.param_groups) 84 | for param_group, value in zip(self.optimizer.param_groups, values): 85 | param_group[self.param_group_field] = value 86 | 87 | def _add_noise(self, lrs, t): 88 | if self.noise_range_t is not None: 89 | if isinstance(self.noise_range_t, (list, tuple)): 90 | apply_noise = self.noise_range_t[0] <= t < self.noise_range_t[1] 91 | else: 92 | apply_noise = t >= self.noise_range_t 93 | if apply_noise: 94 | g = torch.Generator() 95 | g.manual_seed(self.noise_seed + t) 96 | if self.noise_type == 'normal': 97 | while True: 98 | # resample if noise out of percent limit, brute force but shouldn't spin much 99 | noise = torch.randn(1, generator=g).item() 100 | if abs(noise) < self.noise_pct: 101 | break 102 | else: 103 | noise = 2 * (torch.rand(1, generator=g).item() - 0.5) * self.noise_pct 104 | lrs = [v + v * noise for v in lrs] 105 | return lrs 106 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/nvnovograd.py: -------------------------------------------------------------------------------- 1 | """ Nvidia NovoGrad Optimizer. 2 | Original impl by Nvidia from Jasper example: 3 | - https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper 4 | Paper: `Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks` 5 | - https://arxiv.org/abs/1905.11286 6 | """ 7 | 8 | import torch 9 | from torch.optim.optimizer import Optimizer 10 | import math 11 | 12 | 13 | class NvNovoGrad(Optimizer): 14 | """ 15 | Implements Novograd algorithm. 
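    [annotation, not part of the original source: unlike Adam, NovoGrad keeps a
    single scalar second moment per parameter tensor (an EMA of the squared
    gradient norm), normalizes the gradient by it before applying momentum, and
    adds weight decay to the normalized gradient; this is the layer-wise
    adaptivity that the paper title refers to.]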
16 | 17 | Args: 18 | params (iterable): iterable of parameters to optimize or dicts defining 19 | parameter groups 20 | lr (float, optional): learning rate (default: 1e-3) 21 | betas (Tuple[float, float], optional): coefficients used for computing 22 | running averages of gradient and its square (default: (0.95, 0.98)) 23 | eps (float, optional): term added to the denominator to improve 24 | numerical stability (default: 1e-8) 25 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 26 | grad_averaging: gradient averaging 27 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 28 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 29 | (default: False) 30 | """ 31 | 32 | def __init__(self, params, lr=1e-3, betas=(0.95, 0.98), eps=1e-8, 33 | weight_decay=0, grad_averaging=False, amsgrad=False): 34 | if not 0.0 <= lr: 35 | raise ValueError("Invalid learning rate: {}".format(lr)) 36 | if not 0.0 <= eps: 37 | raise ValueError("Invalid epsilon value: {}".format(eps)) 38 | if not 0.0 <= betas[0] < 1.0: 39 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 40 | if not 0.0 <= betas[1] < 1.0: 41 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 42 | defaults = dict(lr=lr, betas=betas, eps=eps, 43 | weight_decay=weight_decay, 44 | grad_averaging=grad_averaging, 45 | amsgrad=amsgrad) 46 | 47 | super(NvNovoGrad, self).__init__(params, defaults) 48 | 49 | def __setstate__(self, state): 50 | super(NvNovoGrad, self).__setstate__(state) 51 | for group in self.param_groups: 52 | group.setdefault('amsgrad', False) 53 | 54 | def step(self, closure=None): 55 | """Performs a single optimization step. 56 | 57 | Arguments: 58 | closure (callable, optional): A closure that reevaluates the model 59 | and returns the loss. 60 | """ 61 | loss = None 62 | if closure is not None: 63 | loss = closure() 64 | 65 | for group in self.param_groups: 66 | for p in group['params']: 67 | if p.grad is None: 68 | continue 69 | grad = p.grad.data 70 | if grad.is_sparse: 71 | raise RuntimeError('Sparse gradients are not supported.') 72 | amsgrad = group['amsgrad'] 73 | 74 | state = self.state[p] 75 | 76 | # State initialization 77 | if len(state) == 0: 78 | state['step'] = 0 79 | # Exponential moving average of gradient values 80 | state['exp_avg'] = torch.zeros_like(p.data) 81 | # Exponential moving average of squared gradient values 82 | state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 83 | if amsgrad: 84 | # Maintains max of all exp. moving avg. of sq. grad. values 85 | state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) 86 | 87 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 88 | if amsgrad: 89 | max_exp_avg_sq = state['max_exp_avg_sq'] 90 | beta1, beta2 = group['betas'] 91 | 92 | state['step'] += 1 93 | 94 | norm = torch.sum(torch.pow(grad, 2)) 95 | 96 | if exp_avg_sq == 0: 97 | exp_avg_sq.copy_(norm) 98 | else: 99 | exp_avg_sq.mul_(beta2).add_(1 - beta2, norm) 100 | 101 | if amsgrad: 102 | # Maintains the maximum of all 2nd moment running avg. till now 103 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 104 | # Use the max. for normalizing running avg. 
of gradient 105 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 106 | else: 107 | denom = exp_avg_sq.sqrt().add_(group['eps']) 108 | 109 | grad.div_(denom) 110 | if group['weight_decay'] != 0: 111 | grad.add_(group['weight_decay'], p.data) 112 | if group['grad_averaging']: 113 | grad.mul_(1 - beta1) 114 | exp_avg.mul_(beta1).add_(grad) 115 | 116 | p.data.add_(-group['lr'], exp_avg) 117 | 118 | return loss 119 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adamw.py: -------------------------------------------------------------------------------- 1 | """ AdamW Optimizer 2 | Impl copied from PyTorch master 3 | """ 4 | import math 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | 8 | 9 | class AdamW(Optimizer): 10 | r"""Implements AdamW algorithm. 11 | 12 | The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. 13 | The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. 14 | 15 | Arguments: 16 | params (iterable): iterable of parameters to optimize or dicts defining 17 | parameter groups 18 | lr (float, optional): learning rate (default: 1e-3) 19 | betas (Tuple[float, float], optional): coefficients used for computing 20 | running averages of gradient and its square (default: (0.9, 0.999)) 21 | eps (float, optional): term added to the denominator to improve 22 | numerical stability (default: 1e-8) 23 | weight_decay (float, optional): weight decay coefficient (default: 1e-2) 24 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 25 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 26 | (default: False) 27 | 28 | .. _Adam\: A Method for Stochastic Optimization: 29 | https://arxiv.org/abs/1412.6980 30 | .. _Decoupled Weight Decay Regularization: 31 | https://arxiv.org/abs/1711.05101 32 | .. _On the Convergence of Adam and Beyond: 33 | https://openreview.net/forum?id=ryQu7f-RZ 34 | """ 35 | 36 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 37 | weight_decay=1e-2, amsgrad=False): 38 | if not 0.0 <= lr: 39 | raise ValueError("Invalid learning rate: {}".format(lr)) 40 | if not 0.0 <= eps: 41 | raise ValueError("Invalid epsilon value: {}".format(eps)) 42 | if not 0.0 <= betas[0] < 1.0: 43 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 44 | if not 0.0 <= betas[1] < 1.0: 45 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 46 | defaults = dict(lr=lr, betas=betas, eps=eps, 47 | weight_decay=weight_decay, amsgrad=amsgrad) 48 | super(AdamW, self).__init__(params, defaults) 49 | 50 | def __setstate__(self, state): 51 | super(AdamW, self).__setstate__(state) 52 | for group in self.param_groups: 53 | group.setdefault('amsgrad', False) 54 | 55 | def step(self, closure=None): 56 | """Performs a single optimization step. 57 | 58 | Arguments: 59 | closure (callable, optional): A closure that reevaluates the model 60 | and returns the loss. 
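        [annotation, not part of the original source] Minimal usage, with
        ``model``, ``inputs`` and ``targets`` assumed to exist::

            optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
            optimizer.zero_grad()
            torch.nn.functional.cross_entropy(model(inputs), targets).backward()
            optimizer.step()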
61 | """ 62 | loss = None 63 | if closure is not None: 64 | loss = closure() 65 | 66 | for group in self.param_groups: 67 | for p in group['params']: 68 | if p.grad is None: 69 | continue 70 | 71 | # Perform stepweight decay 72 | p.data.mul_(1 - group['lr'] * group['weight_decay']) 73 | 74 | # Perform optimization step 75 | grad = p.grad.data 76 | if grad.is_sparse: 77 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 78 | amsgrad = group['amsgrad'] 79 | 80 | state = self.state[p] 81 | 82 | # State initialization 83 | if len(state) == 0: 84 | state['step'] = 0 85 | # Exponential moving average of gradient values 86 | state['exp_avg'] = torch.zeros_like(p.data) 87 | # Exponential moving average of squared gradient values 88 | state['exp_avg_sq'] = torch.zeros_like(p.data) 89 | if amsgrad: 90 | # Maintains max of all exp. moving avg. of sq. grad. values 91 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 92 | 93 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 94 | if amsgrad: 95 | max_exp_avg_sq = state['max_exp_avg_sq'] 96 | beta1, beta2 = group['betas'] 97 | 98 | state['step'] += 1 99 | bias_correction1 = 1 - beta1 ** state['step'] 100 | bias_correction2 = 1 - beta2 ** state['step'] 101 | 102 | # Decay the first and second moment running average coefficient 103 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 104 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 105 | if amsgrad: 106 | # Maintains the maximum of all 2nd moment running avg. till now 107 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 108 | # Use the max. for normalizing running avg. of gradient 109 | denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 110 | else: 111 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 112 | 113 | step_size = group['lr'] / bias_correction1 114 | 115 | p.data.addcdiv_(-step_size, exp_avg, denom) 116 | 117 | return loss 118 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/radam.py: -------------------------------------------------------------------------------- 1 | """RAdam Optimizer. 
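[annotation, not part of the original source: RAdam "rectifies" Adam's adaptive step during the first iterations, when the second-moment estimate still has intractable variance, by falling back to plain bias-corrected momentum SGD until the approximated SMA length N_sma reaches 5, then switching to a variance-corrected Adam step.]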
2 | Implementation lifted from: https://github.com/LiyuanLucasLiu/RAdam 3 | Paper: `On the Variance of the Adaptive Learning Rate and Beyond` - https://arxiv.org/abs/1908.03265 4 | """ 5 | import math 6 | import torch 7 | from torch.optim.optimizer import Optimizer, required 8 | 9 | 10 | class RAdam(Optimizer): 11 | 12 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 13 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 14 | self.buffer = [[None, None, None] for ind in range(10)] 15 | super(RAdam, self).__init__(params, defaults) 16 | 17 | def __setstate__(self, state): 18 | super(RAdam, self).__setstate__(state) 19 | 20 | def step(self, closure=None): 21 | 22 | loss = None 23 | if closure is not None: 24 | loss = closure() 25 | 26 | for group in self.param_groups: 27 | 28 | for p in group['params']: 29 | if p.grad is None: 30 | continue 31 | grad = p.grad.data.float() 32 | if grad.is_sparse: 33 | raise RuntimeError('RAdam does not support sparse gradients') 34 | 35 | p_data_fp32 = p.data.float() 36 | 37 | state = self.state[p] 38 | 39 | if len(state) == 0: 40 | state['step'] = 0 41 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 42 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 43 | else: 44 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 45 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 46 | 47 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 48 | beta1, beta2 = group['betas'] 49 | 50 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 51 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 52 | 53 | state['step'] += 1 54 | buffered = self.buffer[int(state['step'] % 10)] 55 | if state['step'] == buffered[0]: 56 | N_sma, step_size = buffered[1], buffered[2] 57 | else: 58 | buffered[0] = state['step'] 59 | beta2_t = beta2 ** state['step'] 60 | N_sma_max = 2 / (1 - beta2) - 1 61 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 62 | buffered[1] = N_sma 63 | 64 | # more conservative since it's an approximated value 65 | if N_sma >= 5: 66 | step_size = group['lr'] * math.sqrt( 67 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( 68 | N_sma_max - 2)) / (1 - beta1 ** state['step']) 69 | else: 70 | step_size = group['lr'] / (1 - beta1 ** state['step']) 71 | buffered[2] = step_size 72 | 73 | if group['weight_decay'] != 0: 74 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 75 | 76 | # more conservative since it's an approximated value 77 | if N_sma >= 5: 78 | denom = exp_avg_sq.sqrt().add_(group['eps']) 79 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 80 | else: 81 | p_data_fp32.add_(-step_size, exp_avg) 82 | 83 | p.data.copy_(p_data_fp32) 84 | 85 | return loss 86 | 87 | 88 | class PlainRAdam(Optimizer): 89 | 90 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 91 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 92 | 93 | super(PlainRAdam, self).__init__(params, defaults) 94 | 95 | def __setstate__(self, state): 96 | super(PlainRAdam, self).__setstate__(state) 97 | 98 | def step(self, closure=None): 99 | 100 | loss = None 101 | if closure is not None: 102 | loss = closure() 103 | 104 | for group in self.param_groups: 105 | 106 | for p in group['params']: 107 | if p.grad is None: 108 | continue 109 | grad = p.grad.data.float() 110 | if grad.is_sparse: 111 | raise RuntimeError('RAdam does not support sparse gradients') 112 | 113 | p_data_fp32 = 
p.data.float() 114 | 115 | state = self.state[p] 116 | 117 | if len(state) == 0: 118 | state['step'] = 0 119 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 120 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 121 | else: 122 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 123 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 124 | 125 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 126 | beta1, beta2 = group['betas'] 127 | 128 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 129 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 130 | 131 | state['step'] += 1 132 | beta2_t = beta2 ** state['step'] 133 | N_sma_max = 2 / (1 - beta2) - 1 134 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 135 | 136 | if group['weight_decay'] != 0: 137 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 138 | 139 | # more conservative since it's an approximated value 140 | if N_sma >= 5: 141 | step_size = group['lr'] * math.sqrt( 142 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( 143 | N_sma_max - 2)) / (1 - beta1 ** state['step']) 144 | denom = exp_avg_sq.sqrt().add_(group['eps']) 145 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 146 | else: 147 | step_size = group['lr'] / (1 - beta1 ** state['step']) 148 | p_data_fp32.add_(-step_size, exp_avg) 149 | 150 | p.data.copy_(p_data_fp32) 151 | 152 | return loss 153 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/rmsprop_tf.py: -------------------------------------------------------------------------------- 1 | """ RMSProp modified to behave like Tensorflow impl 2 | 3 | Originally cut & paste from PyTorch RMSProp 4 | https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py 5 | Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE 6 | 7 | Modifications Copyright 2020 Ross Wightman 8 | """ 9 | 10 | import torch 11 | from torch.optim import Optimizer 12 | 13 | 14 | class RMSpropTF(Optimizer): 15 | """Implements RMSprop algorithm (TensorFlow style epsilon) 16 | 17 | NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt 18 | and a few other modifications to closer match Tensorflow for matching hyper-params. 19 | 20 | Noteworthy changes include: 21 | 1. Epsilon applied inside square-root 22 | 2. square_avg initialized to ones 23 | 3. LR scaling of update accumulated in momentum buffer 24 | 25 | Proposed by G. Hinton in his 26 | `course `_. 27 | 28 | The centered version first appears in `Generating Sequences 29 | With Recurrent Neural Networks `_. 
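    [annotation, not part of the original source: these differences matter chiefly
    when reusing hyper-parameters tuned on TensorFlow models; in particular, eps
    inside the square root damps the update much more strongly early in training
    than PyTorch's outside-the-sqrt eps, so comparatively large TF-style eps
    values keep their intended effect here.]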
30 | 31 | Arguments: 32 | params (iterable): iterable of parameters to optimize or dicts defining 33 | parameter groups 34 | lr (float, optional): learning rate (default: 1e-2) 35 | momentum (float, optional): momentum factor (default: 0) 36 | alpha (float, optional): smoothing (decay) constant (default: 0.9) 37 | eps (float, optional): term added to the denominator to improve 38 | numerical stability (default: 1e-10) 39 | centered (bool, optional) : if ``True``, compute the centered RMSProp, 40 | the gradient is normalized by an estimation of its variance 41 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 42 | decoupled_decay (bool, optional): decoupled weight decay as per https://arxiv.org/abs/1711.05101 43 | lr_in_momentum (bool, optional): learning rate scaling is included in the momentum buffer 44 | update as per defaults in Tensorflow 45 | 46 | """ 47 | 48 | def __init__(self, params, lr=1e-2, alpha=0.9, eps=1e-10, weight_decay=0, momentum=0., centered=False, 49 | decoupled_decay=False, lr_in_momentum=True): 50 | if not 0.0 <= lr: 51 | raise ValueError("Invalid learning rate: {}".format(lr)) 52 | if not 0.0 <= eps: 53 | raise ValueError("Invalid epsilon value: {}".format(eps)) 54 | if not 0.0 <= momentum: 55 | raise ValueError("Invalid momentum value: {}".format(momentum)) 56 | if not 0.0 <= weight_decay: 57 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 58 | if not 0.0 <= alpha: 59 | raise ValueError("Invalid alpha value: {}".format(alpha)) 60 | 61 | defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay, 62 | decoupled_decay=decoupled_decay, lr_in_momentum=lr_in_momentum) 63 | super(RMSpropTF, self).__init__(params, defaults) 64 | 65 | def __setstate__(self, state): 66 | super(RMSpropTF, self).__setstate__(state) 67 | for group in self.param_groups: 68 | group.setdefault('momentum', 0) 69 | group.setdefault('centered', False) 70 | 71 | def step(self, closure=None): 72 | """Performs a single optimization step. 73 | 74 | Arguments: 75 | closure (callable, optional): A closure that reevaluates the model 76 | and returns the loss. 77 | """ 78 | loss = None 79 | if closure is not None: 80 | loss = closure() 81 | 82 | for group in self.param_groups: 83 | for p in group['params']: 84 | if p.grad is None: 85 | continue 86 | grad = p.grad.data 87 | if grad.is_sparse: 88 | raise RuntimeError('RMSprop does not support sparse gradients') 89 | state = self.state[p] 90 | 91 | # State initialization 92 | if len(state) == 0: 93 | state['step'] = 0 94 | state['square_avg'] = torch.ones_like(p.data) # PyTorch inits to zero 95 | if group['momentum'] > 0: 96 | state['momentum_buffer'] = torch.zeros_like(p.data) 97 | if group['centered']: 98 | state['grad_avg'] = torch.zeros_like(p.data) 99 | 100 | square_avg = state['square_avg'] 101 | one_minus_alpha = 1. 
- group['alpha'] 102 | 103 | state['step'] += 1 104 | 105 | if group['weight_decay'] != 0: 106 | if 'decoupled_decay' in group and group['decoupled_decay']: 107 | p.data.add_(-group['weight_decay'], p.data) 108 | else: 109 | grad = grad.add(group['weight_decay'], p.data) 110 | 111 | # Tensorflow order of ops for updating squared avg 112 | square_avg.add_(one_minus_alpha, grad.pow(2) - square_avg) 113 | # square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad) # PyTorch original 114 | 115 | if group['centered']: 116 | grad_avg = state['grad_avg'] 117 | grad_avg.add_(one_minus_alpha, grad - grad_avg) 118 | # grad_avg.mul_(alpha).add_(1 - alpha, grad) # PyTorch original 119 | avg = square_avg.addcmul(-1, grad_avg, grad_avg).add(group['eps']).sqrt_() # eps moved in sqrt 120 | else: 121 | avg = square_avg.add(group['eps']).sqrt_() # eps moved in sqrt 122 | 123 | if group['momentum'] > 0: 124 | buf = state['momentum_buffer'] 125 | # Tensorflow accumulates the LR scaling in the momentum buffer 126 | if 'lr_in_momentum' in group and group['lr_in_momentum']: 127 | buf.mul_(group['momentum']).addcdiv_(group['lr'], grad, avg) 128 | p.data.add_(-buf) 129 | else: 130 | # PyTorch scales the param update by LR 131 | buf.mul_(group['momentum']).addcdiv_(grad, avg) 132 | p.data.add_(-group['lr'], buf) 133 | else: 134 | p.data.addcdiv_(-group['lr'], grad, avg) 135 | 136 | return loss 137 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adahessian.py: -------------------------------------------------------------------------------- 1 | """ AdaHessian Optimizer 2 | 3 | Lifted from https://github.com/davda54/ada-hessian/blob/master/ada_hessian.py 4 | Originally licensed MIT, Copyright 2020, David Samuel 5 | """ 6 | import torch 7 | 8 | 9 | class Adahessian(torch.optim.Optimizer): 10 | """ 11 | Implements the AdaHessian algorithm from "ADAHESSIAN: An Adaptive Second OrderOptimizer for Machine Learning" 12 | 13 | Arguments: 14 | params (iterable): iterable of parameters to optimize or dicts defining parameter groups 15 | lr (float, optional): learning rate (default: 0.1) 16 | betas ((float, float), optional): coefficients used for computing running averages of gradient and the 17 | squared hessian trace (default: (0.9, 0.999)) 18 | eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) 19 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0.0) 20 | hessian_power (float, optional): exponent of the hessian trace (default: 1.0) 21 | update_each (int, optional): compute the hessian trace approximation only after *this* number of steps 22 | (to save time) (default: 1) 23 | n_samples (int, optional): how many times to sample `z` for the approximation of the hessian trace (default: 1) 24 | """ 25 | 26 | def __init__(self, params, lr=0.1, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, 27 | hessian_power=1.0, update_each=1, n_samples=1, avg_conv_kernel=False): 28 | if not 0.0 <= lr: 29 | raise ValueError(f"Invalid learning rate: {lr}") 30 | if not 0.0 <= eps: 31 | raise ValueError(f"Invalid epsilon value: {eps}") 32 | if not 0.0 <= betas[0] < 1.0: 33 | raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") 34 | if not 0.0 <= betas[1] < 1.0: 35 | raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") 36 | if not 0.0 <= hessian_power <= 1.0: 37 | raise ValueError(f"Invalid Hessian power value: {hessian_power}") 38 | 39 | self.n_samples = n_samples 40 | 
self.update_each = update_each 41 | self.avg_conv_kernel = avg_conv_kernel 42 | 43 | # use a separate generator that deterministically generates the same `z`s across all GPUs in case of distributed training 44 | self.seed = 2147483647 45 | self.generator = torch.Generator().manual_seed(self.seed) 46 | 47 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, hessian_power=hessian_power) 48 | super(Adahessian, self).__init__(params, defaults) 49 | 50 | for p in self.get_params(): 51 | p.hess = 0.0 52 | self.state[p]["hessian step"] = 0 53 | 54 | @property 55 | def is_second_order(self): 56 | return True 57 | 58 | def get_params(self): 59 | """ 60 | Gets all parameters in all param_groups with gradients 61 | """ 62 | 63 | return (p for group in self.param_groups for p in group['params'] if p.requires_grad) 64 | 65 | def zero_hessian(self): 66 | """ 67 | Zeros out the accumalated hessian traces. 68 | """ 69 | 70 | for p in self.get_params(): 71 | if not isinstance(p.hess, float) and self.state[p]["hessian step"] % self.update_each == 0: 72 | p.hess.zero_() 73 | 74 | @torch.no_grad() 75 | def set_hessian(self): 76 | """ 77 | Computes the Hutchinson approximation of the hessian trace and accumulates it for each trainable parameter. 78 | """ 79 | 80 | params = [] 81 | for p in filter(lambda p: p.grad is not None, self.get_params()): 82 | if self.state[p]["hessian step"] % self.update_each == 0: # compute the trace only each `update_each` step 83 | params.append(p) 84 | self.state[p]["hessian step"] += 1 85 | 86 | if len(params) == 0: 87 | return 88 | 89 | if self.generator.device != params[0].device: # hackish way of casting the generator to the right device 90 | self.generator = torch.Generator(params[0].device).manual_seed(self.seed) 91 | 92 | grads = [p.grad for p in params] 93 | 94 | for i in range(self.n_samples): 95 | # Rademacher distribution {-1.0, 1.0} 96 | zs = [torch.randint(0, 2, p.size(), generator=self.generator, device=p.device) * 2.0 - 1.0 for p in params] 97 | h_zs = torch.autograd.grad( 98 | grads, params, grad_outputs=zs, only_inputs=True, retain_graph=i < self.n_samples - 1) 99 | for h_z, z, p in zip(h_zs, zs, params): 100 | p.hess += h_z * z / self.n_samples # approximate the expected values of z*(H@z) 101 | 102 | @torch.no_grad() 103 | def step(self, closure=None): 104 | """ 105 | Performs a single optimization step. 
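        [annotation, not part of the original source: each call first refreshes the
        Hutchinson estimate of the Hessian diagonal via zero_hessian()/set_hessian()
        (Rademacher probes differentiated through the existing gradient graph), then
        applies an AdamW-style update preconditioned by
        exp_hessian_diag_sq ** hessian_power; because of the grad-of-grad, the
        preceding loss.backward() must be run with create_graph=True.]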
106 | Arguments: 107 | closure (callable, optional) -- a closure that reevaluates the model and returns the loss (default: None) 108 | """ 109 | 110 | loss = None 111 | if closure is not None: 112 | loss = closure() 113 | 114 | self.zero_hessian() 115 | self.set_hessian() 116 | 117 | for group in self.param_groups: 118 | for p in group['params']: 119 | if p.grad is None or p.hess is None: 120 | continue 121 | 122 | if self.avg_conv_kernel and p.dim() == 4: 123 | p.hess = torch.abs(p.hess).mean(dim=[2, 3], keepdim=True).expand_as(p.hess).clone() 124 | 125 | # Perform correct stepweight decay as in AdamW 126 | p.mul_(1 - group['lr'] * group['weight_decay']) 127 | 128 | state = self.state[p] 129 | 130 | # State initialization 131 | if len(state) == 1: 132 | state['step'] = 0 133 | # Exponential moving average of gradient values 134 | state['exp_avg'] = torch.zeros_like(p) 135 | # Exponential moving average of Hessian diagonal square values 136 | state['exp_hessian_diag_sq'] = torch.zeros_like(p) 137 | 138 | exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq'] 139 | beta1, beta2 = group['betas'] 140 | state['step'] += 1 141 | 142 | # Decay the first and second moment running average coefficient 143 | exp_avg.mul_(beta1).add_(p.grad, alpha=1 - beta1) 144 | exp_hessian_diag_sq.mul_(beta2).addcmul_(p.hess, p.hess, value=1 - beta2) 145 | 146 | bias_correction1 = 1 - beta1 ** state['step'] 147 | bias_correction2 = 1 - beta2 ** state['step'] 148 | 149 | k = group['hessian_power'] 150 | denom = (exp_hessian_diag_sq / bias_correction2).pow_(k / 2).add_(group['eps']) 151 | 152 | # make update 153 | step_size = group['lr'] / bias_correction1 154 | p.addcdiv_(exp_avg, denom, value=-step_size) 155 | 156 | return loss 157 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/optim_factory.py: -------------------------------------------------------------------------------- 1 | """ Optimizer Factory w/ Custom Weight Decay 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | import torch 5 | from torch import optim as optim 6 | 7 | from .adafactor import Adafactor 8 | from .adahessian import Adahessian 9 | from .adamp import AdamP 10 | from .lookahead import Lookahead 11 | from .nadam import Nadam 12 | from .novograd import NovoGrad 13 | from .nvnovograd import NvNovoGrad 14 | from .radam import RAdam 15 | from .rmsprop_tf import RMSpropTF 16 | from .sgdp import SGDP 17 | 18 | try: 19 | from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD 20 | has_apex = True 21 | except ImportError: 22 | has_apex = False 23 | 24 | 25 | def add_weight_decay(model, weight_decay=1e-5, skip_list=()): 26 | decay = [] 27 | no_decay = [] 28 | for name, param in model.named_parameters(): 29 | if not param.requires_grad: 30 | continue # frozen weights 31 | if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: 32 | no_decay.append(param) 33 | else: 34 | decay.append(param) 35 | return [ 36 | {'params': no_decay, 'weight_decay': 0.}, 37 | {'params': decay, 'weight_decay': weight_decay}] 38 | 39 | def add_weight_lr(args, model, weight_decay=1e-5, skip_list=()): 40 | decay = [] 41 | no_decay = [] 42 | 43 | lr_vis = [] 44 | lr_text = [] 45 | for name, param in model.named_parameters(): 46 | if not param.requires_grad: 47 | continue # frozen weights 48 | # if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: 49 | # no_decay.append(param) 50 | # else: 51 | # decay.append(param) 
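    # [annotation, not part of the original source] In place of the usual
    # decay/no-decay split (left commented out above), this helper builds two LR
    # groups: parameters whose name contains "text_encoder" get args.lr_text, and
    # everything else gets args.lr. add_weight_lr_img below performs the analogous
    # split for the visual branch via args.lr_img.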
52 | 53 | if "text_encoder" in name: 54 | lr_text.append(param) 55 | else: 56 | lr_vis.append(param) 57 | 58 | return [{'params': lr_vis, 'lr': args.lr}, 59 | {'params': lr_text, 'lr': args.lr_text}] 60 | 61 | def add_weight_lr_img(args, model, weight_decay=1e-5, skip_list=()): 62 | decay = [] 63 | no_decay = [] 64 | 65 | lr_vis = [] 66 | lr_text = [] 67 | for name, param in model.named_parameters(): 68 | if not param.requires_grad: 69 | continue # frozen weights 70 | # if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: 71 | # no_decay.append(param) 72 | # else: 73 | # decay.append(param) 74 | 75 | if ("visual_encoder" in name) or ("bbox_head" in name) or ("aggregator" in name) or ("cls_token_local" in name) or ("it_cross_attn" in name) or ("norm_layer_aggr" in name) or ("norm_layer_it_cross_atten" in name): 76 | lr_vis.append(param) 77 | else: 78 | lr_text.append(param) 79 | 80 | return [{'params': lr_vis, 'lr': args.lr_img}, 81 | {'params': lr_text, 'lr': args.lr}] 82 | 83 | 84 | def create_optimizer(args, model, filter_bias_and_bn=True): 85 | opt_lower = args.opt.lower() 86 | weight_decay = args.weight_decay 87 | 88 | if weight_decay and filter_bias_and_bn: 89 | skip = {} 90 | if hasattr(model, 'no_weight_decay'): 91 | skip = model.no_weight_decay() 92 | if 'lr_text' in args: 93 | parameters = add_weight_lr(args, model, weight_decay, skip) 94 | if 'lr_img' in args: 95 | parameters = add_weight_lr_img(args, model, weight_decay, skip) 96 | else: 97 | parameters = add_weight_decay(model, weight_decay, skip) 98 | weight_decay = 0. 99 | else: 100 | parameters = model.parameters() 101 | 102 | if 'fused' in opt_lower: 103 | assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' 104 | 105 | opt_args = dict(lr=args.lr, weight_decay=weight_decay) 106 | if hasattr(args, 'opt_eps') and args.opt_eps is not None: 107 | opt_args['eps'] = args.opt_eps 108 | if hasattr(args, 'opt_betas') and args.opt_betas is not None: 109 | opt_args['betas'] = args.opt_betas 110 | if hasattr(args, 'opt_args') and args.opt_args is not None: 111 | opt_args.update(args.opt_args) 112 | 113 | opt_split = opt_lower.split('_') 114 | opt_lower = opt_split[-1] 115 | if opt_lower == 'sgd' or opt_lower == 'nesterov': 116 | opt_args.pop('eps', None) 117 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) 118 | elif opt_lower == 'momentum': 119 | opt_args.pop('eps', None) 120 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) 121 | elif opt_lower == 'adam': 122 | optimizer = optim.Adam(parameters, **opt_args) 123 | elif opt_lower == 'adamw': 124 | optimizer = optim.AdamW(parameters, **opt_args) 125 | elif opt_lower == 'nadam': 126 | optimizer = Nadam(parameters, **opt_args) 127 | elif opt_lower == 'radam': 128 | optimizer = RAdam(parameters, **opt_args) 129 | elif opt_lower == 'adamp': 130 | optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) 131 | elif opt_lower == 'sgdp': 132 | optimizer = SGDP(parameters, momentum=args.momentum, nesterov=True, **opt_args) 133 | elif opt_lower == 'adadelta': 134 | optimizer = optim.Adadelta(parameters, **opt_args) 135 | elif opt_lower == 'adafactor': 136 | if not args.lr: 137 | opt_args['lr'] = None 138 | optimizer = Adafactor(parameters, **opt_args) 139 | elif opt_lower == 'adahessian': 140 | optimizer = Adahessian(parameters, **opt_args) 141 | elif opt_lower == 'rmsprop': 142 | optimizer = optim.RMSprop(parameters, alpha=0.9, 
momentum=args.momentum, **opt_args) 143 | elif opt_lower == 'rmsproptf': 144 | optimizer = RMSpropTF(parameters, alpha=0.9, momentum=args.momentum, **opt_args) 145 | elif opt_lower == 'novograd': 146 | optimizer = NovoGrad(parameters, **opt_args) 147 | elif opt_lower == 'nvnovograd': 148 | optimizer = NvNovoGrad(parameters, **opt_args) 149 | elif opt_lower == 'fusedsgd': 150 | opt_args.pop('eps', None) 151 | optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) 152 | elif opt_lower == 'fusedmomentum': 153 | opt_args.pop('eps', None) 154 | optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) 155 | elif opt_lower == 'fusedadam': 156 | optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) 157 | elif opt_lower == 'fusedadamw': 158 | optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) 159 | elif opt_lower == 'fusedlamb': 160 | optimizer = FusedLAMB(parameters, **opt_args) 161 | elif opt_lower == 'fusednovograd': 162 | opt_args.setdefault('betas', (0.95, 0.98)) 163 | optimizer = FusedNovoGrad(parameters, **opt_args) 164 | else: 165 | assert False and "Invalid optimizer" 166 | raise ValueError 167 | 168 | if len(opt_split) > 1: 169 | if opt_split[0] == 'lookahead': 170 | optimizer = Lookahead(optimizer) 171 | 172 | return optimizer 173 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | # from apex.optimizers import FusedAdam, FusedSGD 3 | # from timm.optim import AdamW 4 | import torch 5 | from torch import optim 6 | from torch.optim import lr_scheduler 7 | from torch.optim.rmsprop import RMSprop 8 | # from torch.optim.adamw import AdamW 9 | from torch.optim.lr_scheduler import MultiStepLR, CyclicLR, StepLR 10 | 11 | from tools.schedulers import ExponentialLRScheduler, PolyLR, LRStepScheduler 12 | 13 | cv2.ocl.setUseOpenCL(False) 14 | cv2.setNumThreads(0) 15 | 16 | import numpy as np 17 | from typing import Optional, List 18 | from torch import Tensor 19 | import json 20 | import os 21 | import torch.distributed as dist 22 | 23 | class AverageMeter(object): 24 | """Computes and stores the average and current value""" 25 | 26 | def __init__(self): 27 | self.reset() 28 | 29 | def reset(self): 30 | self.val = 0 31 | self.avg = 0 32 | self.sum = 0 33 | self.count = 0 34 | 35 | def update(self, val, n=1): 36 | self.val = val 37 | self.sum += val * n 38 | self.count += n 39 | self.avg = self.sum / self.count 40 | 41 | def create_optimizer(optimizer_config, model, sum_steps, master_params=None): 42 | """Creates optimizer and schedule from configuration 43 | 44 | Parameters 45 | ---------- 46 | optimizer_config : dict 47 | Dictionary containing the configuration options for the optimizer. 48 | model : Model 49 | The network model. 50 | 51 | Returns 52 | ------- 53 | optimizer : Optimizer 54 | The optimizer. 55 | scheduler : LRScheduler 56 | The learning rate scheduler. 
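    [annotation, not part of the original source: recognized optimizer_config["type"]
    values in the body below are "SGD", "Adam" and "RmsProp"; schedule types are
    "step", "clr", "multistep", "exponential", "poly", "constant" and "linear", and
    an optional "classifier_lr" entry places non-encoder parameters into their own
    learning-rate group.]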
57 | """ 58 | if optimizer_config.get("classifier_lr", -1) != -1: 59 | # Separate classifier parameters from all others 60 | net_params = [] 61 | classifier_params = [] 62 | for k, v in model.named_parameters(): 63 | if not v.requires_grad: 64 | continue 65 | if k.find("encoder") != -1: 66 | net_params.append(v) 67 | else: 68 | classifier_params.append(v) 69 | params = [ 70 | {"params": net_params}, 71 | {"params": classifier_params, "lr": optimizer_config["classifier_lr"]}, 72 | ] 73 | else: 74 | if master_params: 75 | params = master_params 76 | else: 77 | params = model.parameters() 78 | 79 | if optimizer_config["type"] == "SGD": 80 | optimizer = optim.SGD(params, 81 | lr=optimizer_config["learning_rate"], 82 | momentum=optimizer_config["momentum"], 83 | weight_decay=optimizer_config["weight_decay"], 84 | nesterov=optimizer_config["nesterov"]) 85 | elif optimizer_config["type"] == "Adam": 86 | optimizer = optim.Adam(params, 87 | lr=optimizer_config["learning_rate"], 88 | weight_decay=optimizer_config["weight_decay"]) 89 | # elif optimizer_config["type"] == "FusedAdam": 90 | # optimizer = FusedAdam(params, 91 | # lr=optimizer_config["learning_rate"], 92 | # weight_decay=optimizer_config["weight_decay"]) 93 | # elif optimizer_config["type"] == "AdamW": 94 | # optimizer = AdamW(params, 95 | # lr=optimizer_config["learning_rate"], 96 | # weight_decay=optimizer_config["weight_decay"]) 97 | elif optimizer_config["type"] == "RmsProp": 98 | optimizer = RMSprop(params, 99 | lr=optimizer_config["learning_rate"], 100 | weight_decay=optimizer_config["weight_decay"]) 101 | else: 102 | raise KeyError("unrecognized optimizer {}".format(optimizer_config["type"])) 103 | 104 | if optimizer_config["schedule"]["type"] == "step": 105 | # scheduler = LRStepScheduler(optimizer, **optimizer_config["schedule"]["params"]) 106 | scheduler = StepLR(optimizer, **optimizer_config["schedule"]["params"]) 107 | elif optimizer_config["schedule"]["type"] == "clr": 108 | scheduler = CyclicLR(optimizer, **optimizer_config["schedule"]["params"]) 109 | elif optimizer_config["schedule"]["type"] == "multistep": 110 | scheduler = MultiStepLR(optimizer, **optimizer_config["schedule"]["params"]) 111 | elif optimizer_config["schedule"]["type"] == "exponential": 112 | scheduler = ExponentialLRScheduler(optimizer, **optimizer_config["schedule"]["params"]) 113 | elif optimizer_config["schedule"]["type"] == "poly": 114 | scheduler = PolyLR(optimizer, max_iter = sum_steps) 115 | # scheduler = PolyLR(optimizer, **optimizer_config["schedule"]["params"]) 116 | elif optimizer_config["schedule"]["type"] == "constant": 117 | scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0) 118 | elif optimizer_config["schedule"]["type"] == "linear": 119 | def linear_lr(it): 120 | return it * optimizer_config["schedule"]["params"]["alpha"] + optimizer_config["schedule"]["params"]["beta"] 121 | 122 | scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr) 123 | 124 | return optimizer, scheduler 125 | 126 | 127 | 128 | 129 | def read_json(file_name): 130 | with open(file_name) as handle: 131 | out = json.load(handle) 132 | return out 133 | 134 | def nested_tensor_from_tensor_list(imgsize, tensor_list: List[Tensor]): 135 | # TODO make this more general 136 | if tensor_list[0].ndim == 3: 137 | # TODO make it support different-sized images 138 | max_size = [3, imgsize, imgsize] 139 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 140 | batch_shape = [len(tensor_list)] + max_size 141 | b, c, h, w = batch_shape 142 | 
dtype = tensor_list[0].dtype 143 | device = tensor_list[0].device 144 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 145 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 146 | for img, pad_img, m in zip(tensor_list, tensor, mask): 147 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 148 | m[: img.shape[1], :img.shape[2]] = False 149 | else: 150 | raise ValueError('not supported') 151 | return NestedTensor(tensor, mask) 152 | 153 | 154 | class NestedTensor(object): 155 | def __init__(self, tensors, mask: Optional[Tensor]): 156 | self.tensors = tensors 157 | self.mask = mask 158 | 159 | def to(self, device): 160 | cast_tensor = self.tensors.cuda(device) 161 | mask = self.mask 162 | if mask is not None: 163 | assert mask is not None 164 | cast_mask = mask.cuda(device) 165 | else: 166 | cast_mask = None 167 | return NestedTensor(cast_tensor, cast_mask) 168 | 169 | def decompose(self): 170 | return self.tensors, self.mask 171 | 172 | def __repr__(self): 173 | return str(self.tensors) 174 | 175 | 176 | def is_dist_avail_and_initialized(): 177 | if not dist.is_available(): 178 | return False 179 | if not dist.is_initialized(): 180 | return False 181 | return True 182 | 183 | 184 | def get_rank(): 185 | if not is_dist_avail_and_initialized(): 186 | return 0 187 | return dist.get_rank() 188 | 189 | 190 | def is_main_process(): 191 | return get_rank() == 0 192 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def pre_question(question,max_ques_words): 4 | question = re.sub( 5 | r"([,.'!?\"()*#:;~])", 6 | '', 7 | question.lower(), 8 | ).replace('-', ' ').replace('/', ' ') 9 | question = question.rstrip(' ') 10 | 11 | #truncate question 12 | question_words = question.split(' ') 13 | if len(question_words)>max_ques_words: 14 | question = ' '.join(question_words[:max_ques_words]) 15 | 16 | return question 17 | 18 | 19 | def pre_caption(caption,max_words): 20 | caption = re.sub( 21 | r"([,.'!?\"()*#:;~])", 22 | '', 23 | caption.lower(), 24 | ).replace('-', ' ').replace('/', ' ').replace('', 'person') 25 | 26 | caption = re.sub( 27 | r"\s{2,}", 28 | ' ', 29 | caption, 30 | ) 31 | caption = caption.rstrip('\n') 32 | caption = caption.strip(' ') 33 | 34 | #truncate caption 35 | caption_words = caption.split(' ') 36 | if len(caption_words)>max_words: 37 | caption = ' '.join(caption_words[:max_words]) 38 | 39 | return caption 40 | 41 | 42 | # from vqaTools.vqaEval import VQAEval 43 | # from refTools.evaluation.refEvaluation import RefEvaluation 44 | 45 | import json 46 | import os 47 | import numpy as np 48 | import torch 49 | import torch.distributed as dist 50 | import torch.nn.functional as F 51 | 52 | import utils 53 | from tqdm import tqdm 54 | 55 | 56 | def vqa_eval(vqa, result_file, test_ques_path): 57 | vqaRes = vqa.loadRes(result_file, test_ques_path) 58 | # create vqaEval object by taking vqa and vqaRes 59 | vqaEval = VQAEval(vqa, vqaRes, n=2) # n is precision of accuracy (number of places after decimal), default is 2 60 | # evaluate results 61 | vqaEval.evaluate() 62 | 63 | # print accuracies 64 | print("\n") 65 | print("Overall Accuracy is: %.02f\n" % (vqaEval.accuracy['overall'])) 66 | print("Per Answer Type Accuracy is the following:") 67 | for ansType in vqaEval.accuracy['perAnswerType']: 68 | print("%s : %.02f" % (ansType, 
vqaEval.accuracy['perAnswerType'][ansType])) 69 | print("\n") 70 | 71 | return vqaEval 72 | 73 | 74 | 75 | def collect_result(result, result_dir, filename, is_json=True, is_list=True): 76 | if is_json: 77 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,utils.get_rank())) 78 | final_result_file = os.path.join(result_dir, '%s.json'%filename) 79 | json.dump(result,open(result_file,'w')) 80 | else: 81 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,utils.get_rank())) 82 | final_result_file = os.path.join(result_dir, '%s.pth'%filename) 83 | torch.save(result,result_file) 84 | 85 | dist.barrier() 86 | 87 | result = None 88 | if utils.is_main_process(): 89 | # combine results from all processes 90 | if is_list: 91 | result = [] 92 | else: 93 | result = {} 94 | for rank in range(utils.get_world_size()): 95 | if is_json: 96 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,rank)) 97 | res = json.load(open(result_file,'r')) 98 | else: 99 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,rank)) 100 | res = torch.load(result_file) 101 | if is_list: 102 | result += res 103 | else: 104 | result.update(res) 105 | 106 | return result 107 | 108 | 109 | def save_result(result, result_dir, filename, is_json=True, is_list=True): 110 | if is_json: 111 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,utils.get_rank())) 112 | final_result_file = os.path.join(result_dir, '%s.json'%filename) 113 | json.dump(result,open(result_file,'w')) 114 | else: 115 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,utils.get_rank())) 116 | final_result_file = os.path.join(result_dir, '%s.pth'%filename) 117 | torch.save(result,result_file) 118 | 119 | dist.barrier() 120 | 121 | if utils.is_main_process(): 122 | # combine results from all processes 123 | if is_list: 124 | result = [] 125 | else: 126 | result = {} 127 | for rank in range(utils.get_world_size()): 128 | if is_json: 129 | result_file = os.path.join(result_dir, '%s_rank%d.json'%(filename,rank)) 130 | res = json.load(open(result_file,'r')) 131 | else: 132 | result_file = os.path.join(result_dir, '%s_rank%d.pth'%(filename,rank)) 133 | res = torch.load(result_file) 134 | if is_list: 135 | result += res 136 | else: 137 | result.update(res) 138 | if is_json: 139 | json.dump(result,open(final_result_file,'w')) 140 | else: 141 | torch.save(result,final_result_file) 142 | 143 | print('result file saved to %s'%final_result_file) 144 | dist.barrier() 145 | return final_result_file 146 | 147 | 148 | 149 | def grounding_eval(results,dets,cocos,refer,alpha,mask_size=24): 150 | 151 | correct_A_d, correct_B_d, correct_val_d = 0, 0, 0 152 | correct_A, correct_B, correct_val = 0, 0, 0 153 | num_A,num_B,num_val = 0,0,0 154 | 155 | for res in tqdm(results): 156 | 157 | ref_id = res['ref_id'] 158 | ref = refer.Refs[ref_id] 159 | ref_box = refer.refToAnn[ref_id]['bbox'] 160 | image = refer.Imgs[ref['image_id']] 161 | 162 | mask = res['pred'].cuda().view(1,1,mask_size,mask_size) 163 | mask = F.interpolate(mask,size = (image['height'],image['width']), mode='bicubic').squeeze() 164 | 165 | # rank detection boxes 166 | max_score = 0 167 | for det in dets[str(ref['image_id'])]: 168 | score = mask[int(det[1]):int(det[1]+det[3]),int(det[0]):int(det[0]+det[2])] 169 | area = det[2]*det[3] 170 | score = score.sum() / area**alpha 171 | if score>max_score: 172 | pred_box = det[:4] 173 | max_score = score 174 | 175 | IoU_det = computeIoU(ref_box, pred_box) 176 | 177 | if ref['split']=='testA': 
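            # Accuracy is tallied separately per split (testA / testB / val):
            # the highest-scoring detection box counts as correct when its IoU
            # with the ground-truth box is at least 0.5 (see eval_result below).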
178 | num_A += 1 179 | if IoU_det >= 0.5: 180 | correct_A_d += 1 181 | elif ref['split']=='testB': 182 | num_B += 1 183 | if IoU_det >= 0.5: 184 | correct_B_d += 1 185 | elif ref['split']=='val': 186 | num_val += 1 187 | if IoU_det >= 0.5: 188 | correct_val_d += 1 189 | 190 | eval_result = {'val_d':correct_val_d/num_val,'testA_d':correct_A_d/num_A,'testB_d':correct_B_d/num_B} 191 | 192 | for metric, acc in eval_result.items(): 193 | print(f'{metric}: {acc:.3f}') 194 | 195 | return eval_result 196 | 197 | 198 | 199 | # IoU function 200 | def computeIoU(box1, box2): 201 | # each box is of [x1, y1, w, h] 202 | inter_x1 = max(box1[0], box2[0]) 203 | inter_y1 = max(box1[1], box2[1]) 204 | inter_x2 = min(box1[0]+box1[2]-1, box2[0]+box2[2]-1) 205 | inter_y2 = min(box1[1]+box1[3]-1, box2[1]+box2[3]-1) 206 | 207 | if inter_x1 < inter_x2 and inter_y1 < inter_y2: 208 | inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1) 209 | else: 210 | inter = 0 211 | union = box1[2]*box1[3] + box2[2]*box2[3] - inter 212 | return float(inter)/union 213 | 214 | 215 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/optim/adafactor.py: -------------------------------------------------------------------------------- 1 | """ Adafactor Optimizer 2 | 3 | Lifted from https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py 4 | 5 | Original header/copyright below. 6 | 7 | """ 8 | # Copyright (c) Facebook, Inc. and its affiliates. 9 | # 10 | # This source code is licensed under the MIT license found in the 11 | # LICENSE file in the root directory of this source tree. 12 | import torch 13 | import math 14 | 15 | 16 | class Adafactor(torch.optim.Optimizer): 17 | """Implements Adafactor algorithm. 18 | This implementation is based on: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` 19 | (see https://arxiv.org/abs/1804.04235) 20 | 21 | Note that this optimizer internally adjusts the learning rate depending on the 22 | *scale_parameter*, *relative_step* and *warmup_init* options. 23 | 24 | To use a manual (external) learning rate schedule you should set `scale_parameter=False` and 25 | `relative_step=False`. 
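    In this implementation `relative_step` is not a constructor argument: it is
    inferred as ``lr is None``, so passing an explicit `lr` is what disables the
    relative-step schedule. A minimal sketch (illustrative only; `model` stands
    for any torch.nn.Module):

        optimizer = Adafactor(model.parameters(), lr=1e-3, scale_parameter=False)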
26 | 27 | Arguments: 28 | params (iterable): iterable of parameters to optimize or dicts defining parameter groups 29 | lr (float, optional): external learning rate (default: None) 30 | eps (tuple[float, float]): regularization constants for square gradient 31 | and parameter scale respectively (default: (1e-30, 1e-3)) 32 | clip_threshold (float): threshold of root mean square of final gradient update (default: 1.0) 33 | decay_rate (float): coefficient used to compute running averages of square gradient (default: -0.8) 34 | beta1 (float): coefficient used for computing running averages of gradient (default: None) 35 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 36 | scale_parameter (bool): if True, learning rate is scaled by root mean square of parameter (default: True) 37 | relative_step (bool): if True, time-dependent learning rate is computed 38 | instead of external learning rate (default: True) 39 | warmup_init (bool): time-dependent learning rate computation depends on 40 | whether warm-up initialization is being used (default: False) 41 | """ 42 | 43 | def __init__(self, params, lr=None, eps=1e-30, eps_scale=1e-3, clip_threshold=1.0, 44 | decay_rate=-0.8, betas=None, weight_decay=0.0, scale_parameter=True, warmup_init=False): 45 | relative_step = lr is None 46 | if warmup_init and not relative_step: 47 | raise ValueError('warmup_init requires relative_step=True') 48 | 49 | beta1 = None if betas is None else betas[0] # make it compat with standard betas arg 50 | defaults = dict(lr=lr, eps=eps, eps_scale=eps_scale, clip_threshold=clip_threshold, decay_rate=decay_rate, 51 | beta1=beta1, weight_decay=weight_decay, scale_parameter=scale_parameter, 52 | relative_step=relative_step, warmup_init=warmup_init) 53 | super(Adafactor, self).__init__(params, defaults) 54 | 55 | @staticmethod 56 | def _get_lr(param_group, param_state): 57 | if param_group['relative_step']: 58 | min_step = 1e-6 * param_state['step'] if param_group['warmup_init'] else 1e-2 59 | lr_t = min(min_step, 1.0 / math.sqrt(param_state['step'])) 60 | param_scale = 1.0 61 | if param_group['scale_parameter']: 62 | param_scale = max(param_group['eps_scale'], param_state['RMS']) 63 | param_group['lr'] = lr_t * param_scale 64 | return param_group['lr'] 65 | 66 | @staticmethod 67 | def _get_options(param_group, param_shape): 68 | factored = len(param_shape) >= 2 69 | use_first_moment = param_group['beta1'] is not None 70 | return factored, use_first_moment 71 | 72 | @staticmethod 73 | def _rms(tensor): 74 | return tensor.norm(2) / (tensor.numel() ** 0.5) 75 | 76 | def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col): 77 | r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1) 78 | c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt() 79 | return torch.mul(r_factor, c_factor) 80 | 81 | def step(self, closure=None): 82 | """Performs a single optimization step. 83 | Arguments: 84 | closure (callable, optional): A closure that reevaluates the model and returns the loss. 
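        Example (illustrative sketch; `model`, `inputs`, `targets` and
        `loss_fn` are placeholders, not names defined by this module):

            optimizer.zero_grad()
            loss = loss_fn(model(inputs), targets)
            loss.backward()
            optimizer.step()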
85 | """ 86 | loss = None 87 | if closure is not None: 88 | loss = closure() 89 | 90 | for group in self.param_groups: 91 | for p in group['params']: 92 | if p.grad is None: 93 | continue 94 | grad = p.grad.data 95 | if grad.dtype in {torch.float16, torch.bfloat16}: 96 | grad = grad.float() 97 | if grad.is_sparse: 98 | raise RuntimeError('Adafactor does not support sparse gradients.') 99 | 100 | state = self.state[p] 101 | grad_shape = grad.shape 102 | 103 | factored, use_first_moment = self._get_options(group, grad_shape) 104 | # State Initialization 105 | if len(state) == 0: 106 | state['step'] = 0 107 | 108 | if use_first_moment: 109 | # Exponential moving average of gradient values 110 | state['exp_avg'] = torch.zeros_like(grad) 111 | if factored: 112 | state['exp_avg_sq_row'] = torch.zeros(grad_shape[:-1]).to(grad) 113 | state['exp_avg_sq_col'] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad) 114 | else: 115 | state['exp_avg_sq'] = torch.zeros_like(grad) 116 | 117 | state['RMS'] = 0 118 | else: 119 | if use_first_moment: 120 | state['exp_avg'] = state['exp_avg'].to(grad) 121 | if factored: 122 | state['exp_avg_sq_row'] = state['exp_avg_sq_row'].to(grad) 123 | state['exp_avg_sq_col'] = state['exp_avg_sq_col'].to(grad) 124 | else: 125 | state['exp_avg_sq'] = state['exp_avg_sq'].to(grad) 126 | 127 | p_data_fp32 = p.data 128 | if p.data.dtype in {torch.float16, torch.bfloat16}: 129 | p_data_fp32 = p_data_fp32.float() 130 | 131 | state['step'] += 1 132 | state['RMS'] = self._rms(p_data_fp32) 133 | lr_t = self._get_lr(group, state) 134 | 135 | beta2t = 1.0 - math.pow(state['step'], group['decay_rate']) 136 | update = grad ** 2 + group['eps'] 137 | if factored: 138 | exp_avg_sq_row = state['exp_avg_sq_row'] 139 | exp_avg_sq_col = state['exp_avg_sq_col'] 140 | 141 | exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1)) 142 | exp_avg_sq_col.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-2)) 143 | #exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=1.0 - beta2t) # pytorch 1.6+ 144 | #exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=1.0 - beta2t) 145 | 146 | # Approximation of exponential moving average of square of gradient 147 | update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col) 148 | update.mul_(grad) 149 | else: 150 | exp_avg_sq = state['exp_avg_sq'] 151 | 152 | exp_avg_sq.mul_(beta2t).add_(1.0 - beta2t, update) 153 | #exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t) # pytorch 1.6+ 154 | update = exp_avg_sq.rsqrt().mul_(grad) 155 | 156 | update.div_((self._rms(update) / group['clip_threshold']).clamp_(min=1.0)) 157 | update.mul_(lr_t) 158 | 159 | if use_first_moment: 160 | exp_avg = state['exp_avg'] 161 | exp_avg.mul_(group["beta1"]).add_(1 - group["beta1"], update) 162 | #exp_avg.mul_(group['beta1']).add_(update, alpha=1 - group['beta1']) # pytorch 1.6+ 163 | update = exp_avg 164 | 165 | if group['weight_decay'] != 0: 166 | p_data_fp32.add_(-group["weight_decay"] * lr_t, p_data_fp32) 167 | #p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * lr_t) # pytorch 1.6+ 168 | 169 | p_data_fp32.add_(-update) 170 | 171 | if p.data.dtype in {torch.float16, torch.bfloat16}: 172 | p.data.copy_(p_data_fp32) 173 | 174 | return loss -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/dist_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | """ 3 | This file contains primitives for multi-gpu communication. 4 | This is useful when doing distributed training. 5 | """ 6 | 7 | import functools 8 | import logging 9 | import numpy as np 10 | import pickle 11 | import torch 12 | import torch.distributed as dist 13 | 14 | import torch 15 | 16 | _LOCAL_PROCESS_GROUP = None 17 | """ 18 | A torch process group which only includes processes that on the same machine as the current process. 19 | This variable is set when processes are spawned by `launch()` in "engine/launch.py". 20 | """ 21 | 22 | 23 | def get_world_size() -> int: 24 | if not dist.is_available(): 25 | return 1 26 | if not dist.is_initialized(): 27 | return 1 28 | return dist.get_world_size() 29 | 30 | 31 | def get_rank() -> int: 32 | if not dist.is_available(): 33 | return 0 34 | if not dist.is_initialized(): 35 | return 0 36 | return dist.get_rank() 37 | 38 | 39 | def get_local_rank() -> int: 40 | """ 41 | Returns: 42 | The rank of the current process within the local (per-machine) process group. 43 | """ 44 | if not dist.is_available(): 45 | return 0 46 | if not dist.is_initialized(): 47 | return 0 48 | assert _LOCAL_PROCESS_GROUP is not None 49 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 50 | 51 | 52 | def get_local_size() -> int: 53 | """ 54 | Returns: 55 | The size of the per-machine process group, 56 | i.e. the number of processes per machine. 57 | """ 58 | if not dist.is_available(): 59 | return 1 60 | if not dist.is_initialized(): 61 | return 1 62 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 63 | 64 | 65 | def is_main_process() -> bool: 66 | return get_rank() == 0 67 | 68 | 69 | def synchronize(): 70 | """ 71 | Helper function to synchronize (barrier) among all processes when 72 | using distributed training 73 | """ 74 | if not dist.is_available(): 75 | return 76 | if not dist.is_initialized(): 77 | return 78 | world_size = dist.get_world_size() 79 | if world_size == 1: 80 | return 81 | dist.barrier() 82 | 83 | 84 | @functools.lru_cache() 85 | def _get_global_gloo_group(): 86 | """ 87 | Return a process group based on gloo backend, containing all the ranks 88 | The result is cached. 89 | """ 90 | if dist.get_backend() == "nccl": 91 | return dist.new_group(backend="gloo") 92 | else: 93 | return dist.group.WORLD 94 | 95 | 96 | def _serialize_to_tensor(data, group): 97 | backend = dist.get_backend(group) 98 | assert backend in ["gloo", "nccl"] 99 | device = torch.device("cpu" if backend == "gloo" else "cuda") 100 | 101 | buffer = pickle.dumps(data) 102 | if len(buffer) > 1024 ** 3: 103 | logger = logging.getLogger(__name__) 104 | logger.warning( 105 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( 106 | get_rank(), len(buffer) / (1024 ** 3), device 107 | ) 108 | ) 109 | storage = torch.ByteStorage.from_buffer(buffer) 110 | tensor = torch.ByteTensor(storage).to(device=device) 111 | return tensor 112 | 113 | 114 | def _pad_to_largest_tensor(tensor, group): 115 | """ 116 | Returns: 117 | list[int]: size of the tensor, on each rank 118 | Tensor: padded tensor that has the max size 119 | """ 120 | world_size = dist.get_world_size(group=group) 121 | assert ( 122 | world_size >= 1 123 | ), "comm.gather/all_gather must be called from ranks within the given group!" 
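    # all_gather can only exchange same-shaped tensors, so each rank first
    # shares its byte count and then pads its payload up to the global maximum.
    # Illustrative example: if two ranks hold 10- and 7-byte buffers,
    # size_list becomes [10, 7] and the 7-byte tensor is padded to 10 bytes;
    # callers recover the original payload later via tobytes()[:size].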
124 | local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) 125 | size_list = [ 126 | torch.zeros([1], dtype=torch.int64, device=tensor.device) 127 | for _ in range(world_size) 128 | ] 129 | dist.all_gather(size_list, local_size, group=group) 130 | size_list = [int(size.item()) for size in size_list] 131 | 132 | max_size = max(size_list) 133 | 134 | # we pad the tensor because torch all_gather does not support 135 | # gathering tensors of different shapes 136 | if local_size != max_size: 137 | padding = torch.zeros( 138 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device 139 | ) 140 | tensor = torch.cat((tensor, padding), dim=0) 141 | return size_list, tensor 142 | 143 | 144 | def all_gather(data, group=None): 145 | """ 146 | Run all_gather on arbitrary picklable data (not necessarily tensors). 147 | 148 | Args: 149 | data: any picklable object 150 | group: a torch process group. By default, will use a group which 151 | contains all ranks on gloo backend. 152 | 153 | Returns: 154 | list[data]: list of data gathered from each rank 155 | """ 156 | if get_world_size() == 1: 157 | return [data] 158 | if group is None: 159 | group = _get_global_gloo_group() 160 | if dist.get_world_size(group) == 1: 161 | return [data] 162 | 163 | tensor = _serialize_to_tensor(data, group) 164 | 165 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 166 | max_size = max(size_list) 167 | 168 | # receiving Tensor from all ranks 169 | tensor_list = [ 170 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 171 | for _ in size_list 172 | ] 173 | dist.all_gather(tensor_list, tensor, group=group) 174 | 175 | data_list = [] 176 | for size, tensor in zip(size_list, tensor_list): 177 | buffer = tensor.cpu().numpy().tobytes()[:size] 178 | data_list.append(pickle.loads(buffer)) 179 | 180 | return data_list 181 | 182 | 183 | def gather(data, dst=0, group=None): 184 | """ 185 | Run gather on arbitrary picklable data (not necessarily tensors). 186 | 187 | Args: 188 | data: any picklable object 189 | dst (int): destination rank 190 | group: a torch process group. By default, will use a group which 191 | contains all ranks on gloo backend. 192 | 193 | Returns: 194 | list[data]: on dst, a list of data gathered from each rank. Otherwise, 195 | an empty list. 196 | """ 197 | if get_world_size() == 1: 198 | return [data] 199 | if group is None: 200 | group = _get_global_gloo_group() 201 | if dist.get_world_size(group=group) == 1: 202 | return [data] 203 | rank = dist.get_rank(group=group) 204 | 205 | tensor = _serialize_to_tensor(data, group) 206 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 207 | 208 | # receiving Tensor from all ranks 209 | if rank == dst: 210 | max_size = max(size_list) 211 | tensor_list = [ 212 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 213 | for _ in size_list 214 | ] 215 | dist.gather(tensor, tensor_list, dst=dst, group=group) 216 | 217 | data_list = [] 218 | for size, tensor in zip(size_list, tensor_list): 219 | buffer = tensor.cpu().numpy().tobytes()[:size] 220 | data_list.append(pickle.loads(buffer)) 221 | return data_list 222 | else: 223 | dist.gather(tensor, [], dst=dst, group=group) 224 | return [] 225 | 226 | 227 | def shared_random_seed(): 228 | """ 229 | Returns: 230 | int: a random number that is the same across all workers. 231 | If workers need a shared RNG, they can use this shared seed to 232 | create one. 233 | 234 | All workers must call this function, otherwise it will deadlock. 
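    Illustrative use (hypothetical caller; `np` is already imported above):

        seed = shared_random_seed()
        rng = np.random.RandomState(seed)  # identical stream on every worker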
235 | """ 236 | ints = np.random.randint(2 ** 31) 237 | all_ints = all_gather(ints) 238 | return all_ints[0] 239 | 240 | 241 | def reduce_dict(input_dict, average=True): 242 | """ 243 | Reduce the values in the dictionary from all processes so that process with rank 244 | 0 has the reduced results. 245 | 246 | Args: 247 | input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor. 248 | average (bool): whether to do average or sum 249 | 250 | Returns: 251 | a dict with the same keys as input_dict, after reduction. 252 | """ 253 | world_size = get_world_size() 254 | if world_size < 2: 255 | return input_dict 256 | with torch.no_grad(): 257 | names = [] 258 | values = [] 259 | # sort the keys so that they are consistent across processes 260 | for k in sorted(input_dict.keys()): 261 | names.append(k) 262 | values.append(input_dict[k]) 263 | values = torch.stack(values, dim=0) 264 | dist.reduce(values, dst=0) 265 | if dist.get_rank() == 0 and average: 266 | # only main process gets accumulated, so only divide by 267 | # world_size in this case 268 | values /= world_size 269 | reduced_dict = {k: v for k, v in zip(names, values)} 270 | return reduced_dict 271 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/dataset/randaugment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | 5 | ## aug functions 6 | def identity_func(img): 7 | return img 8 | 9 | 10 | def autocontrast_func(img, cutoff=0): 11 | ''' 12 | same output as PIL.ImageOps.autocontrast 13 | ''' 14 | n_bins = 256 15 | 16 | def tune_channel(ch): 17 | n = ch.size 18 | cut = cutoff * n // 100 19 | if cut == 0: 20 | high, low = ch.max(), ch.min() 21 | else: 22 | hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) 23 | low = np.argwhere(np.cumsum(hist) > cut) 24 | low = 0 if low.shape[0] == 0 else low[0] 25 | high = np.argwhere(np.cumsum(hist[::-1]) > cut) 26 | high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - high[0] 27 | if high <= low: 28 | table = np.arange(n_bins) 29 | else: 30 | scale = (n_bins - 1) / (high - low) 31 | offset = -low * scale 32 | table = np.arange(n_bins) * scale + offset 33 | table[table < 0] = 0 34 | table[table > n_bins - 1] = n_bins - 1 35 | table = table.clip(0, 255).astype(np.uint8) 36 | return table[ch] 37 | 38 | channels = [tune_channel(ch) for ch in cv2.split(img)] 39 | out = cv2.merge(channels) 40 | return out 41 | 42 | 43 | def equalize_func(img): 44 | ''' 45 | same output as PIL.ImageOps.equalize 46 | PIL's implementation is different from cv2.equalize 47 | ''' 48 | n_bins = 256 49 | 50 | def tune_channel(ch): 51 | hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins]) 52 | non_zero_hist = hist[hist != 0].reshape(-1) 53 | step = np.sum(non_zero_hist[:-1]) // (n_bins - 1) 54 | if step == 0: return ch 55 | n = np.empty_like(hist) 56 | n[0] = step // 2 57 | n[1:] = hist[:-1] 58 | table = (np.cumsum(n) // step).clip(0, 255).astype(np.uint8) 59 | return table[ch] 60 | 61 | channels = [tune_channel(ch) for ch in cv2.split(img)] 62 | out = cv2.merge(channels) 63 | return out 64 | 65 | 66 | def rotate_func(img, degree, fill=(0, 0, 0)): 67 | ''' 68 | like PIL, rotate by degree, not radians 69 | ''' 70 | H, W = img.shape[0], img.shape[1] 71 | center = W / 2, H / 2 72 | M = cv2.getRotationMatrix2D(center, degree, 1) 73 | out = cv2.warpAffine(img, M, (W, H), borderValue=fill) 74 | return out 75 | 76 | 77 | def solarize_func(img, 
thresh=128):
78 |     '''
79 |         same output as PIL.ImageOps.solarize
80 |     '''
81 |     table = np.array([el if el < thresh else 255 - el for el in range(256)])
82 |     table = table.clip(0, 255).astype(np.uint8)
83 |     out = table[img]
84 |     return out
85 | 
86 | 
87 | def color_func(img, factor):
88 |     '''
89 |         same output as PIL.ImageEnhance.Color
90 |     '''
91 |     ## implementation according to PIL definition, quite slow
92 |     # degenerate = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[:, :, np.newaxis]
93 |     # out = blend(degenerate, img, factor)
94 |     # M = (
95 |     #     np.eye(3) * factor
96 |     #     + np.float32([0.114, 0.587, 0.299]).reshape(3, 1) * (1. - factor)
97 |     # )[np.newaxis, np.newaxis, :]
98 |     M = (
99 |         np.float32([
100 |             [0.886, -0.114, -0.114],
101 |             [-0.587, 0.413, -0.587],
102 |             [-0.299, -0.299, 0.701]]) * factor
103 |         + np.float32([[0.114], [0.587], [0.299]])
104 |     )
105 |     out = np.matmul(img, M).clip(0, 255).astype(np.uint8)
106 |     return out
107 | 
108 | 
109 | def contrast_func(img, factor):
110 |     """
111 |         same output as PIL.ImageEnhance.Contrast
112 |     """
113 |     mean = np.sum(np.mean(img, axis=(0, 1)) * np.array([0.114, 0.587, 0.299]))
114 |     table = np.array([(
115 |         el - mean) * factor + mean
116 |         for el in range(256)
117 |     ]).clip(0, 255).astype(np.uint8)
118 |     out = table[img]
119 |     return out
120 | 
121 | 
122 | def brightness_func(img, factor):
123 |     '''
124 |         same output as PIL.ImageEnhance.Brightness
125 |     '''
126 |     table = (np.arange(256, dtype=np.float32) * factor).clip(0, 255).astype(np.uint8)
127 |     out = table[img]
128 |     return out
129 | 
130 | 
131 | def sharpness_func(img, factor):
132 |     '''
133 |     The differences between this result and PIL's are all on the 4 boundaries;
134 |     the center areas are the same
135 |     '''
136 |     kernel = np.ones((3, 3), dtype=np.float32)
137 |     kernel[1][1] = 5
138 |     kernel /= 13
139 |     degenerate = cv2.filter2D(img, -1, kernel)
140 |     if factor == 0.0:
141 |         out = degenerate
142 |     elif factor == 1.0:
143 |         out = img
144 |     else:
145 |         out = img.astype(np.float32)
146 |         degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :]
147 |         out[1:-1, 1:-1, :] = degenerate + factor * (out[1:-1, 1:-1, :] - degenerate)
148 |         out = out.astype(np.uint8)
149 |     return out
150 | 
151 | 
152 | def shear_x_func(img, factor, fill=(0, 0, 0)):
153 |     H, W = img.shape[0], img.shape[1]
154 |     M = np.float32([[1, factor, 0], [0, 1, 0]])
155 |     out = cv2.warpAffine(img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8)
156 |     return out
157 | 
158 | 
159 | def translate_x_func(img, offset, fill=(0, 0, 0)):
160 |     '''
161 |         same output as PIL.Image.transform
162 |     '''
163 |     H, W = img.shape[0], img.shape[1]
164 |     M = np.float32([[1, 0, -offset], [0, 1, 0]])
165 |     out = cv2.warpAffine(img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8)
166 |     return out
167 | 
168 | 
169 | def translate_y_func(img, offset, fill=(0, 0, 0)):
170 |     '''
171 |         same output as PIL.Image.transform
172 |     '''
173 |     H, W = img.shape[0], img.shape[1]
174 |     M = np.float32([[1, 0, 0], [0, 1, -offset]])
175 |     out = cv2.warpAffine(img, M, (W, H), borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8)
176 |     return out
177 | 
178 | 
179 | def posterize_func(img, bits):
180 |     '''
181 |         same output as PIL.ImageOps.posterize
182 |     '''
183 |     out = np.bitwise_and(img, np.uint8(255 << (8 - bits)))
184 |     return out
185 | 
186 | 
187 | def shear_y_func(img, factor, fill=(0, 0, 0)):
188 |     H, W = img.shape[0], img.shape[1]
189 |     M = np.float32([[1, 0, 0], [factor, 1, 0]])
190 |     out = cv2.warpAffine(img, M, (W, H),
borderValue=fill, flags=cv2.INTER_LINEAR).astype(np.uint8) 191 | return out 192 | 193 | 194 | def cutout_func(img, pad_size, replace=(0, 0, 0)): 195 | replace = np.array(replace, dtype=np.uint8) 196 | H, W = img.shape[0], img.shape[1] 197 | rh, rw = np.random.random(2) 198 | pad_size = pad_size // 2 199 | ch, cw = int(rh * H), int(rw * W) 200 | x1, x2 = max(ch - pad_size, 0), min(ch + pad_size, H) 201 | y1, y2 = max(cw - pad_size, 0), min(cw + pad_size, W) 202 | out = img.copy() 203 | out[x1:x2, y1:y2, :] = replace 204 | return out 205 | 206 | 207 | ### level to args 208 | def enhance_level_to_args(MAX_LEVEL): 209 | def level_to_args(level): 210 | return ((level / MAX_LEVEL) * 1.8 + 0.1,) 211 | return level_to_args 212 | 213 | 214 | def shear_level_to_args(MAX_LEVEL, replace_value): 215 | def level_to_args(level): 216 | level = (level / MAX_LEVEL) * 0.3 217 | if np.random.random() > 0.5: level = -level 218 | return (level, replace_value) 219 | 220 | return level_to_args 221 | 222 | 223 | def translate_level_to_args(translate_const, MAX_LEVEL, replace_value): 224 | def level_to_args(level): 225 | level = (level / MAX_LEVEL) * float(translate_const) 226 | if np.random.random() > 0.5: level = -level 227 | return (level, replace_value) 228 | 229 | return level_to_args 230 | 231 | 232 | def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value): 233 | def level_to_args(level): 234 | level = int((level / MAX_LEVEL) * cutout_const) 235 | return (level, replace_value) 236 | 237 | return level_to_args 238 | 239 | 240 | def solarize_level_to_args(MAX_LEVEL): 241 | def level_to_args(level): 242 | level = int((level / MAX_LEVEL) * 256) 243 | return (level, ) 244 | return level_to_args 245 | 246 | 247 | def none_level_to_args(level): 248 | return () 249 | 250 | 251 | def posterize_level_to_args(MAX_LEVEL): 252 | def level_to_args(level): 253 | level = int((level / MAX_LEVEL) * 4) 254 | return (level, ) 255 | return level_to_args 256 | 257 | 258 | def rotate_level_to_args(MAX_LEVEL, replace_value): 259 | def level_to_args(level): 260 | level = (level / MAX_LEVEL) * 30 261 | if np.random.random() < 0.5: 262 | level = -level 263 | return (level, replace_value) 264 | 265 | return level_to_args 266 | 267 | 268 | func_dict = { 269 | 'Identity': identity_func, 270 | 'AutoContrast': autocontrast_func, 271 | 'Equalize': equalize_func, 272 | 'Rotate': rotate_func, 273 | 'Solarize': solarize_func, 274 | 'Color': color_func, 275 | 'Contrast': contrast_func, 276 | 'Brightness': brightness_func, 277 | 'Sharpness': sharpness_func, 278 | 'ShearX': shear_x_func, 279 | 'TranslateX': translate_x_func, 280 | 'TranslateY': translate_y_func, 281 | 'Posterize': posterize_func, 282 | 'ShearY': shear_y_func, 283 | } 284 | 285 | translate_const = 10 286 | MAX_LEVEL = 10 287 | replace_value = (128, 128, 128) 288 | arg_dict = { 289 | 'Identity': none_level_to_args, 290 | 'AutoContrast': none_level_to_args, 291 | 'Equalize': none_level_to_args, 292 | 'Rotate': rotate_level_to_args(MAX_LEVEL, replace_value), 293 | 'Solarize': solarize_level_to_args(MAX_LEVEL), 294 | 'Color': enhance_level_to_args(MAX_LEVEL), 295 | 'Contrast': enhance_level_to_args(MAX_LEVEL), 296 | 'Brightness': enhance_level_to_args(MAX_LEVEL), 297 | 'Sharpness': enhance_level_to_args(MAX_LEVEL), 298 | 'ShearX': shear_level_to_args(MAX_LEVEL, replace_value), 299 | 'TranslateX': translate_level_to_args( 300 | translate_const, MAX_LEVEL, replace_value 301 | ), 302 | 'TranslateY': translate_level_to_args( 303 | translate_const, MAX_LEVEL, replace_value 304 | ), 
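    # Each entry maps an op name to a function that turns an integer level in
    # [0, MAX_LEVEL] into that op's positional args, e.g. at level 10 Rotate
    # draws +/-30 degrees and ShearX/ShearY draw a factor of +/-0.3.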
305 | 'Posterize': posterize_level_to_args(MAX_LEVEL), 306 | 'ShearY': shear_level_to_args(MAX_LEVEL, replace_value), 307 | } 308 | 309 | 310 | class RandomAugment(object): 311 | 312 | def __init__(self, N=2, M=10, isPIL=False, augs=[]): 313 | self.N = N 314 | self.M = M 315 | self.isPIL = isPIL 316 | if augs: 317 | self.augs = augs 318 | else: 319 | self.augs = list(arg_dict.keys()) 320 | 321 | def get_random_ops(self): 322 | sampled_ops = np.random.choice(self.augs, self.N) 323 | return [(op, 0.5, self.M) for op in sampled_ops] 324 | 325 | def __call__(self, img): 326 | if self.isPIL: 327 | img = np.array(img) 328 | ops = self.get_random_ops() 329 | for name, prob, level in ops: 330 | if np.random.random() > prob: 331 | continue 332 | args = arg_dict[name](level) 333 | img = func_dict[name](img, *args) 334 | return img 335 | 336 | 337 | if __name__ == '__main__': 338 | a = RandomAugment() 339 | img = np.random.randn(32, 32, 3) 340 | a(img) -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/models/METER/clip_model.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import Tuple, Union 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn 8 | 9 | 10 | class LayerNorm(nn.LayerNorm): 11 | """Subclass torch's LayerNorm to handle fp16.""" 12 | 13 | def forward(self, x: torch.Tensor): 14 | orig_type = x.dtype 15 | ret = super().forward(x.type(torch.float32)) 16 | return ret.type(orig_type) 17 | 18 | 19 | class QuickGELU(nn.Module): 20 | def forward(self, x: torch.Tensor): 21 | return x * torch.sigmoid(1.702 * x) 22 | 23 | 24 | class ResidualAttentionBlock(nn.Module): 25 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): 26 | super().__init__() 27 | 28 | self.attn = nn.MultiheadAttention(d_model, n_head) 29 | self.ln_1 = LayerNorm(d_model) 30 | self.mlp = nn.Sequential(OrderedDict([ 31 | ("c_fc", nn.Linear(d_model, d_model * 4)), 32 | ("gelu", QuickGELU()), 33 | ("c_proj", nn.Linear(d_model * 4, d_model)) 34 | ])) 35 | self.ln_2 = LayerNorm(d_model) 36 | self.attn_mask = attn_mask 37 | 38 | def attention(self, x: torch.Tensor, x_mask:torch.Tensor): 39 | if x_mask is not None: 40 | x_mask = x_mask.to(dtype=torch.bool, device=x.device) 41 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 42 | return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask, key_padding_mask=x_mask)[0] 43 | 44 | def forward(self, x: torch.Tensor, x_mask:torch.Tensor=None): 45 | x = x + self.attention(self.ln_1(x), x_mask) 46 | x = x + self.mlp(self.ln_2(x)) 47 | return x 48 | 49 | 50 | class Transformer(nn.Module): 51 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None): 52 | super().__init__() 53 | self.width = width 54 | self.layers = layers 55 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers-1)]) 56 | 57 | def forward(self, x: torch.Tensor, x_mask: torch.Tensor=None): 58 | for block in self.resblocks: 59 | x = block(x, x_mask) 60 | return x 61 | 62 | 63 | class VisualTransformer(nn.Module): 64 | def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int, resolution_after: int): 65 | super().__init__() 66 | self.input_resolution = input_resolution 67 | self.output_dim = 
output_dim 68 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) 69 | 70 | scale = width ** -0.5 71 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 72 | self.positional_embedding = nn.Parameter(scale * torch.randn((resolution_after // patch_size) ** 2 + 1, width)) 73 | self.ln_pre = LayerNorm(width) 74 | 75 | self.transformer = Transformer(width, layers, heads) 76 | self.ln_post = LayerNorm(width) 77 | 78 | def forward(self, x: torch.Tensor, x_mask): 79 | x = self.conv1(x) # shape = [*, width, grid, grid] 80 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 81 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 82 | t=self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) 83 | x = torch.cat([t, x], dim=1) # shape = [*, grid ** 2 + 1, width] 84 | x = x + self.positional_embedding.to(x.dtype) 85 | x = self.ln_pre(x) 86 | 87 | x = x.permute(1, 0, 2) # NLD -> LND 88 | x = self.transformer(x, x_mask) 89 | x = x.permute(1, 0, 2) # LND -> NLD 90 | 91 | x = self.ln_post(x) 92 | 93 | return x 94 | 95 | 96 | class CLIP(nn.Module): 97 | def __init__(self, 98 | embed_dim: int, 99 | # vision 100 | image_resolution: int, 101 | vision_layers: Union[Tuple[int, int, int, int], int], 102 | vision_width: int, 103 | vision_patch_size: int, 104 | # text 105 | context_length: int, 106 | vocab_size: int, 107 | transformer_width: int, 108 | transformer_heads: int, 109 | transformer_layers: int, 110 | resolution_after=224, 111 | ): 112 | super().__init__() 113 | 114 | self.context_length = context_length 115 | 116 | vision_heads = vision_width // 64 117 | self.visual = VisualTransformer( 118 | input_resolution=image_resolution, 119 | patch_size=vision_patch_size, 120 | width=vision_width, 121 | layers=vision_layers, 122 | heads=vision_heads, 123 | output_dim=embed_dim, 124 | resolution_after=resolution_after, 125 | ) 126 | 127 | self.vocab_size = vocab_size 128 | self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) 129 | self.ln_final = LayerNorm(transformer_width) 130 | 131 | self.initialize_parameters() 132 | 133 | def initialize_parameters(self): 134 | nn.init.normal_(self.positional_embedding, std=0.01) 135 | 136 | proj_std = (self.visual.transformer.width ** -0.5) * ((2 * self.visual.transformer.layers) ** -0.5) 137 | attn_std = self.visual.transformer.width ** -0.5 138 | fc_std = (2 * self.visual.transformer.width) ** -0.5 139 | for block in self.visual.transformer.resblocks: 140 | nn.init.normal_(block.attn.in_proj_weight, std=attn_std) 141 | nn.init.normal_(block.attn.out_proj.weight, std=proj_std) 142 | nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) 143 | nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) 144 | 145 | @property 146 | def dtype(self): 147 | return self.visual.conv1.weight.dtype 148 | 149 | def forward(self, image, image_mask=None): 150 | return self.visual(image.type(self.dtype), image_mask) 151 | 152 | 153 | _MODELS = { 154 | "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", 155 | "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", 156 | } 157 | import os 158 | import hashlib 159 | import urllib 160 | from tqdm import tqdm 161 | import warnings 162 | def _download(url: str, root: str = 
os.path.expanduser("~/.cache/clip")):
163 |     os.makedirs(root, exist_ok=True)
164 |     filename = os.path.basename(url)
165 | 
166 |     expected_sha256 = url.split("/")[-2]
167 |     download_target = os.path.join(root, filename)
168 | 
169 |     if os.path.exists(download_target) and not os.path.isfile(download_target):
170 |         raise RuntimeError(f"{download_target} exists and is not a regular file")
171 | 
172 |     if os.path.isfile(download_target):
173 |         if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
174 |             return download_target
175 |         else:
176 |             warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
177 | 
178 |     with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
179 |         with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
180 |             while True:
181 |                 buffer = source.read(8192)
182 |                 if not buffer:
183 |                     break
184 | 
185 |                 output.write(buffer)
186 |                 loop.update(len(buffer))
187 | 
188 |     if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
189 |         raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not match")
190 | 
191 |     return download_target
192 | 
193 | def adapt_position_encoding(model, patch_size=32, after=384,
194 |                             suffix='visual.positional_embedding'):
195 |     keys = [k for k in model if k.endswith(suffix)]
196 |     assert len(keys) == 1
197 |     key = keys[0]
198 |     origin_pos_embed = model[key]
199 |     origin_dim2 = False
200 |     if len(origin_pos_embed.shape) == 2:
201 |         origin_dim2 = True
202 |         origin_pos_embed = origin_pos_embed.unsqueeze(0)
203 |     grid_before = int(np.sqrt(origin_pos_embed.shape[1] - 1))
204 |     before = int(grid_before*patch_size)
205 |     assert (before % patch_size) == 0
206 |     grid_after = after // patch_size
207 |     assert (after % patch_size) == 0
208 |     embed_dim = origin_pos_embed.shape[-1]
209 | 
210 |     pos_embed = origin_pos_embed[0, 1:, :].reshape((grid_before, grid_before, embed_dim))
211 |     new_size = (grid_after, grid_after)
212 |     pos_embed = torch.nn.functional.interpolate(pos_embed.permute((2, 0, 1)).unsqueeze(0), size=new_size, mode='bicubic')
213 |     pos_embed = pos_embed.squeeze(0).permute((1, 2, 0)).reshape((-1, embed_dim))
214 |     pos_embed = torch.cat((origin_pos_embed[0, 0:1, :], pos_embed), dim=0).unsqueeze(0)
215 |     assert pos_embed.shape == (1, grid_after * grid_after + 1, embed_dim)
216 |     if origin_dim2:
217 |         assert pos_embed.shape[0] == 1
218 |         pos_embed = pos_embed.squeeze(0)
219 |     model[key] = pos_embed
220 |     return model
221 | 
222 | 
223 | def build_model(name, resolution_after=224):
224 | 
225 |     # if name in _MODELS:
226 |     #     model_path = _download(_MODELS[name])
227 |     # elif os.path.isfile(name):
228 |     #     model_path = name
229 |     # else:
230 |     #     raise RuntimeError(f"Model {name} not found; available models = {available_models()}"
231 |     #     )
232 |     model_path = 'ViT-B-16.pt'
233 |     try:
234 |         model = torch.jit.load(model_path, map_location="cpu")
235 |         state_dict = None
236 |     except RuntimeError:
237 |         # not a TorchScript archive; fall back to loading a plain state dict
238 |         warnings.warn(f"File {model_path} is not a JIT archive.
Loading as a state dict instead") 239 | jit = False 240 | state_dict = torch.load(model_path, map_location="cpu") 241 | state_dict = state_dict or model.state_dict() 242 | vit = "visual.proj" in state_dict 243 | 244 | vision_width = state_dict["visual.conv1.weight"].shape[0] 245 | vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]) 246 | vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] 247 | grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) 248 | image_resolution = vision_patch_size * grid_size 249 | 250 | embed_dim = state_dict["text_projection"].shape[1] 251 | context_length = state_dict["positional_embedding"].shape[0] 252 | vocab_size = state_dict["token_embedding.weight"].shape[0] 253 | transformer_width = state_dict["ln_final.weight"].shape[0] 254 | transformer_heads = transformer_width // 64 255 | transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks"))) 256 | 257 | model = CLIP( 258 | embed_dim, 259 | image_resolution, vision_layers, vision_width, vision_patch_size, 260 | context_length, vocab_size, transformer_width, transformer_heads, transformer_layers, 261 | resolution_after, 262 | ) 263 | 264 | for key in ["input_resolution", "context_length", "vocab_size"]: 265 | if key in state_dict: 266 | del state_dict[key] 267 | 268 | model_dict = model.state_dict() 269 | pretrained_dict = state_dict 270 | if resolution_after != image_resolution: 271 | pretrained_dict = adapt_position_encoding(pretrained_dict, after=resolution_after, patch_size=vision_patch_size) 272 | # 1. filter out unnecessary keys 273 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 274 | # 2. overwrite entries in the existing state dict 275 | model_dict.update(pretrained_dict) 276 | # 3. 
load the new state dict 277 | model.load_state_dict(model_dict) 278 | return model 279 | -------------------------------------------------------------------------------- /code/MultiModal-DeepFake-main/tools/multilabel_metrics.py: -------------------------------------------------------------------------------- 1 | import math 2 | from urllib.request import urlretrieve 3 | import torch 4 | import numpy as np 5 | import pdb 6 | 7 | def get_multi_label(label, image): 8 | multi_label = torch.zeros([len(label), 4], dtype=torch.long).to(image.device) 9 | # origin cls = [0, 0, 0, 0] 10 | real_label_pos = np.where(np.array(label) == 'orig')[0].tolist() 11 | multi_label[real_label_pos,:] = torch.tensor([0, 0, 0, 0]).to(image.device) 12 | # face_swap cls = [1, 0, 0, 0] 13 | pos = np.where(np.array(label) == 'face_swap')[0].tolist() 14 | multi_label[pos,:] = torch.tensor([1, 0, 0, 0]).to(image.device) 15 | # face_attribute cls = [0, 1, 0, 0] 16 | pos = np.where(np.array(label) == 'face_attribute')[0].tolist() 17 | multi_label[pos,:] = torch.tensor([0, 1, 0, 0]).to(image.device) 18 | # text_swap cls = [0, 0, 1, 0] 19 | pos = np.where(np.array(label) == 'text_swap')[0].tolist() 20 | multi_label[pos,:] = torch.tensor([0, 0, 1, 0]).to(image.device) 21 | # text_attribute cls = [0, 0, 0, 1] 22 | pos = np.where(np.array(label) == 'text_attribute')[0].tolist() 23 | multi_label[pos,:] = torch.tensor([0, 0, 0, 1]).to(image.device) 24 | # face_swap&text_swap cls = [1, 0, 1, 0] 25 | pos = np.where(np.array(label) == 'face_swap&text_swap')[0].tolist() 26 | multi_label[pos,:] = torch.tensor([1, 0, 1, 0]).to(image.device) 27 | # face_swap&text_attribute cls = [1, 0, 0, 1] 28 | pos = np.where(np.array(label) == 'face_swap&text_attribute')[0].tolist() 29 | multi_label[pos,:] = torch.tensor([1, 0, 0, 1]).to(image.device) 30 | # face_attribute&text_swap cls = [0, 1, 1, 0] 31 | pos = np.where(np.array(label) == 'face_attribute&text_swap')[0].tolist() 32 | multi_label[pos,:] = torch.tensor([0, 1, 1, 0]).to(image.device) 33 | # face_attribute&text_attribute cls = [0, 1, 0, 1] 34 | pos = np.where(np.array(label) == 'face_attribute&text_attribute')[0].tolist() 35 | multi_label[pos,:] = torch.tensor([0, 1, 0, 1]).to(image.device) 36 | 37 | return multi_label, real_label_pos 38 | 39 | 40 | def get_multi_label_TS(label, image): 41 | TS_pos = [] 42 | 43 | multi_label = torch.zeros([len(label), 4], dtype=torch.long).to(image.device) 44 | # origin cls = [0, 0, 0, 0] 45 | real_label_pos = np.where(np.array(label) == 'orig')[0].tolist() 46 | multi_label[real_label_pos,:] = torch.tensor([0, 0, 0, 0]).to(image.device) 47 | # face_swap cls = [1, 0, 0, 0] 48 | pos = np.where(np.array(label) == 'face_swap')[0].tolist() 49 | multi_label[pos,:] = torch.tensor([1, 0, 0, 0]).to(image.device) 50 | # face_attribute cls = [0, 1, 0, 0] 51 | pos = np.where(np.array(label) == 'face_attribute')[0].tolist() 52 | multi_label[pos,:] = torch.tensor([0, 1, 0, 0]).to(image.device) 53 | # text_swap cls = [0, 0, 1, 0] 54 | pos = np.where(np.array(label) == 'text_swap')[0].tolist() 55 | multi_label[pos,:] = torch.tensor([0, 0, 1, 0]).to(image.device) 56 | TS_pos.extend(pos) 57 | # text_attribute cls = [0, 0, 0, 1] 58 | pos = np.where(np.array(label) == 'text_attribute')[0].tolist() 59 | multi_label[pos,:] = torch.tensor([0, 0, 0, 1]).to(image.device) 60 | # face_swap&text_swap cls = [1, 0, 1, 0] 61 | pos = np.where(np.array(label) == 'face_swap&text_swap')[0].tolist() 62 | multi_label[pos,:] = torch.tensor([1, 0, 1, 0]).to(image.device) 63 | 
TS_pos.extend(pos) 64 | # face_swap&text_attribute cls = [1, 0, 0, 1] 65 | pos = np.where(np.array(label) == 'face_swap&text_attribute')[0].tolist() 66 | multi_label[pos,:] = torch.tensor([1, 0, 0, 1]).to(image.device) 67 | # face_attribute&text_swap cls = [0, 1, 1, 0] 68 | pos = np.where(np.array(label) == 'face_attribute&text_swap')[0].tolist() 69 | multi_label[pos,:] = torch.tensor([0, 1, 1, 0]).to(image.device) 70 | TS_pos.extend(pos) 71 | # face_attribute&text_attribute cls = [0, 1, 0, 1] 72 | pos = np.where(np.array(label) == 'face_attribute&text_attribute')[0].tolist() 73 | multi_label[pos,:] = torch.tensor([0, 1, 0, 1]).to(image.device) 74 | 75 | return multi_label, real_label_pos, TS_pos 76 | 77 | # def get_multi_label(label, image): 78 | # multi_label = torch.zeros([len(label), 3], dtype=torch.long).to(image.device) 79 | # # origin cls = [0, 0, 0] 80 | # real_label_pos = np.where(np.array(label) == 'orig')[0].tolist() 81 | # multi_label[real_label_pos,:] = torch.tensor([0, 0, 0]).to(image.device) 82 | # # face_swap cls = [1, 0, 0] 83 | # pos = np.where(np.array(label) == 'face_swap')[0].tolist() 84 | # multi_label[pos,:] = torch.tensor([1, 0, 0]).to(image.device) 85 | # # face_attribute cls = [0, 1, 0] 86 | # pos = np.where(np.array(label) == 'face_attribute')[0].tolist() 87 | # multi_label[pos,:] = torch.tensor([0, 1, 0]).to(image.device) 88 | # # text_attribute cls = [0, 0, 1] 89 | # pos = np.where(np.array(label) == 'text_attribute')[0].tolist() 90 | # multi_label[pos,:] = torch.tensor([0, 0, 1]).to(image.device) 91 | # # face_swap&text_attribute cls = [1, 0, 1] 92 | # pos = np.where(np.array(label) == 'face_swap&text_attribute')[0].tolist() 93 | # multi_label[pos,:] = torch.tensor([1, 0, 1]).to(image.device) 94 | # # face_attribute&text_attribute cls = [0, 1, 1] 95 | # pos = np.where(np.array(label) == 'face_attribute&text_attribute')[0].tolist() 96 | # multi_label[pos,:] = torch.tensor([0, 1, 1]).to(image.device) 97 | 98 | # return multi_label, real_label_pos 99 | 100 | 101 | 102 | class AveragePrecisionMeter(object): 103 | """ 104 | The APMeter measures the average precision per class. 105 | The APMeter is designed to operate on `NxK` Tensors `output` and 106 | `target`, and optionally a `Nx1` Tensor weight where (1) the `output` 107 | contains model output scores for `N` examples and `K` classes that ought to 108 | be higher when the model is more convinced that the example should be 109 | positively labeled, and smaller when the model believes the example should 110 | be negatively labeled (for instance, the output of a sigmoid function); (2) 111 | the `target` contains only values 0 (for negative examples) and 1 112 | (for positive examples); and (3) the `weight` ( > 0) represents weight for 113 | each sample. 114 | """ 115 | 116 | def __init__(self, difficult_examples=False): 117 | super(AveragePrecisionMeter, self).__init__() 118 | self.reset() 119 | self.difficult_examples = difficult_examples 120 | 121 | def reset(self): 122 | """Resets the meter with empty member variables""" 123 | self.scores = torch.FloatTensor(torch.FloatStorage()) 124 | self.targets = torch.LongTensor(torch.LongStorage()) 125 | 126 | def add(self, output, target): 127 | """ 128 | Args: 129 | output (Tensor): NxK tensor that for each of the N examples 130 | indicates the probability of the example belonging to each of 131 | the K classes, according to the model. 
The probabilities should 132 | sum to one over all classes 133 | target (Tensor): binary NxK tensort that encodes which of the K 134 | classes are associated with the N-th input 135 | (eg: a row [0, 1, 0, 1] indicates that the example is 136 | associated with classes 2 and 4) 137 | weight (optional, Tensor): Nx1 tensor representing the weight for 138 | each example (each weight > 0) 139 | """ 140 | if not torch.is_tensor(output): 141 | output = torch.from_numpy(output) 142 | if not torch.is_tensor(target): 143 | target = torch.from_numpy(target) 144 | 145 | if output.dim() == 1: 146 | output = output.view(-1, 1) 147 | else: 148 | assert output.dim() == 2, \ 149 | 'wrong output size (should be 1D or 2D with one column \ 150 | per class)' 151 | if target.dim() == 1: 152 | target = target.view(-1, 1) 153 | else: 154 | assert target.dim() == 2, \ 155 | 'wrong target size (should be 1D or 2D with one column \ 156 | per class)' 157 | if self.scores.numel() > 0: 158 | assert target.size(1) == self.targets.size(1), \ 159 | 'dimensions for output should match previously added examples.' 160 | 161 | # make sure storage is of sufficient size 162 | if self.scores.storage().size() < self.scores.numel() + output.numel(): 163 | new_size = math.ceil(self.scores.storage().size() * 1.5) 164 | self.scores.storage().resize_(int(new_size + output.numel())) 165 | self.targets.storage().resize_(int(new_size + output.numel())) 166 | 167 | # store scores and targets 168 | offset = self.scores.size(0) if self.scores.dim() > 0 else 0 169 | self.scores.resize_(offset + output.size(0), output.size(1)) 170 | self.targets.resize_(offset + target.size(0), target.size(1)) 171 | self.scores.narrow(0, offset, output.size(0)).copy_(output) 172 | self.targets.narrow(0, offset, target.size(0)).copy_(target) 173 | 174 | def value(self): 175 | """Returns the model's average precision for each class 176 | Return: 177 | ap (FloatTensor): 1xK tensor, with avg precision for each class k 178 | """ 179 | 180 | if self.scores.numel() == 0: 181 | return 0 182 | ap = torch.zeros(self.scores.size(1)) 183 | rg = torch.arange(1, self.scores.size(0)).float() 184 | # compute average precision for each class 185 | for k in range(self.scores.size(1)): 186 | # sort scores 187 | scores = self.scores[:, k] 188 | targets = self.targets[:, k] 189 | # compute average precision 190 | ap[k] = AveragePrecisionMeter.average_precision(scores, targets, self.difficult_examples) 191 | return ap 192 | 193 | @staticmethod 194 | def average_precision(output, target, difficult_examples=True): 195 | 196 | # sort examples 197 | sorted, indices = torch.sort(output, dim=0, descending=True) 198 | 199 | # Computes prec@i 200 | pos_count = 0. 201 | total_count = 0. 202 | precision_at_i = 0. 
203 |         for i in indices:
204 |             label = target[i]
205 |             if difficult_examples and label == 0:
206 |                 continue
207 |             if label == 1:
208 |                 pos_count += 1
209 |             total_count += 1
210 |             if label == 1:
211 |                 precision_at_i += pos_count / total_count
212 |         precision_at_i /= pos_count
213 |         return precision_at_i
214 | 
215 |     def overall(self):
216 |         if self.scores.numel() == 0:
217 |             return 0
218 |         scores = self.scores.cpu().numpy()
219 |         targets = self.targets.cpu().numpy()
220 |         targets[targets == -1] = 0
221 |         return self.evaluation(scores, targets)
222 | 
223 |     def overall_topk(self, k):
224 |         targets = self.targets.cpu().numpy()
225 |         targets[targets == -1] = 0
226 |         n, c = self.scores.size()
227 |         scores = np.zeros((n, c)) - 1
228 |         index = self.scores.topk(k, 1, True, True)[1].cpu().numpy()
229 |         tmp = self.scores.cpu().numpy()
230 |         for i in range(n):
231 |             for ind in index[i]:
232 |                 scores[i, ind] = 1 if tmp[i, ind] >= 0 else -1
233 |         return self.evaluation(scores, targets)
234 | 
235 | 
236 |     def evaluation(self, scores_, targets_):
237 |         n, n_class = scores_.shape
238 |         Nc, Np, Ng = np.zeros(n_class), np.zeros(n_class), np.zeros(n_class)
239 |         for k in range(n_class):
240 |             scores = scores_[:, k]
241 |             targets = targets_[:, k]
242 |             targets[targets == -1] = 0
243 |             Ng[k] = np.sum(targets == 1)
244 |             Np[k] = np.sum(scores >= 0)
245 |             Nc[k] = np.sum(targets * (scores >= 0))
246 |         Np[Np == 0] = 1
247 |         OP = np.sum(Nc) / np.sum(Np)
248 |         OR = np.sum(Nc) / np.sum(Ng)
249 |         OF1 = (2 * OP * OR) / (OP + OR)
250 | 
251 |         CP = np.sum(Nc / Np) / n_class
252 |         CR = np.sum(Nc / Ng) / n_class
253 |         CF1 = (2 * CP * CR) / (CP + CR)
254 |         return OP, OR, OF1, CP, CR, CF1
255 | 
256 | 
257 | 
--------------------------------------------------------------------------------
/code/MultiModal-DeepFake-main/models/METER/meter_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import random
3 | 
4 | from transformers.optimization import AdamW
5 | from transformers import (
6 |     get_polynomial_decay_schedule_with_warmup,
7 |     get_cosine_schedule_with_warmup,
8 | )
9 | from .dist_utils import all_gather
10 | from .objectives import compute_irtr_recall
11 | from ..gadgets.my_metrics import Accuracy, VQAScore, Scalar
12 | 
13 | 
14 | def set_metrics(pl_module):
15 |     for split in ["train", "val"]:
16 |         for k, v in pl_module.hparams.config["loss_names"].items():
17 |             if v <= 0:
18 |                 continue
19 |             if k == "vqa":
20 |                 setattr(pl_module, f"{split}_vqa_score", VQAScore())
21 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
22 |             elif k == "nlvr2":
23 |                 if split == "train":
24 |                     setattr(pl_module, f"train_{k}_accuracy", Accuracy())
25 |                     setattr(pl_module, f"train_{k}_loss", Scalar())
26 |                 else:
27 |                     setattr(pl_module, f"dev_{k}_accuracy", Accuracy())
28 |                     setattr(pl_module, f"dev_{k}_loss", Scalar())
29 |                     setattr(pl_module, f"test_{k}_accuracy", Accuracy())
30 |                     setattr(pl_module, f"test_{k}_loss", Scalar())
31 |             elif k == "snli":
32 |                 if split == "train":
33 |                     setattr(pl_module, f"train_{k}_accuracy", Accuracy())
34 |                     setattr(pl_module, f"train_{k}_loss", Scalar())
35 |                 else:
36 |                     setattr(pl_module, f"dev_{k}_accuracy", Accuracy())
37 |                     setattr(pl_module, f"dev_{k}_loss", Scalar())
38 |                     setattr(pl_module, f"test_{k}_accuracy", Accuracy())
39 |                     setattr(pl_module, f"test_{k}_loss", Scalar())
40 |             elif k == "irtr":
41 |                 setattr(pl_module, f"{split}_irtr_loss", Scalar())
42 |             elif k == "mppd" or k == "mpfr":
43 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
44 |             elif k == "itm":
45 |                 setattr(pl_module, f"{split}_{k}_accuracy", Accuracy())
46 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
47 |             else:
48 |                 setattr(pl_module, f"{split}_{k}_accuracy", Accuracy())
49 |                 setattr(pl_module, f"{split}_{k}_loss", Scalar())
50 | 
51 | 
52 | def epoch_wrapup(pl_module):
53 |     phase = "train" if pl_module.training else "val"
54 |     the_metric = 0
55 | 
56 |     if pl_module.hparams.config["get_recall_metric"] and not pl_module.training:
57 |         (ir_r1, ir_r5, ir_r10, tr_r1, tr_r5, tr_r10) = compute_irtr_recall(pl_module)
58 |         print((ir_r1, ir_r5, ir_r10, tr_r1, tr_r5, tr_r10), pl_module.global_step)
59 |         pl_module.logger.experiment.add_scalar(
60 |             "recalls/ir_r1", ir_r1, pl_module.global_step
61 |         )
62 |         pl_module.logger.experiment.add_scalar(
63 |             "recalls/ir_r5", ir_r5, pl_module.global_step
64 |         )
65 |         pl_module.logger.experiment.add_scalar(
66 |             "recalls/ir_r10", ir_r10, pl_module.global_step
67 |         )
68 |         pl_module.logger.experiment.add_scalar(
69 |             "recalls/tr_r1", tr_r1, pl_module.global_step
70 |         )
71 |         pl_module.logger.experiment.add_scalar(
72 |             "recalls/tr_r5", tr_r5, pl_module.global_step
73 |         )
74 |         pl_module.logger.experiment.add_scalar(
75 |             "recalls/tr_r10", tr_r10, pl_module.global_step
76 |         )
77 |         the_metric += ir_r1.item() + tr_r1.item()
78 | 
79 |     for loss_name, v in pl_module.hparams.config["loss_names"].items():
80 |         if v <= 0:
81 |             continue
82 | 
83 |         value = 0
84 | 
85 |         if loss_name == "vqa":
86 |             value = getattr(pl_module, f"{phase}_{loss_name}_score").compute()
87 |             pl_module.log(f"{loss_name}/{phase}/score_epoch", value)
88 |             getattr(pl_module, f"{phase}_{loss_name}_score").reset()
89 |             pl_module.log(
90 |                 f"{loss_name}/{phase}/loss_epoch",
91 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
92 |             )
93 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
94 |         elif loss_name == "nlvr2" or loss_name == 'snli':
95 |             if phase == "train":
96 |                 value = getattr(pl_module, f"train_{loss_name}_accuracy").compute()
97 |                 pl_module.log(f"{loss_name}/train/accuracy_epoch", value)
98 |                 getattr(pl_module, f"train_{loss_name}_accuracy").reset()
99 |                 pl_module.log(
100 |                     f"{loss_name}/train/loss_epoch",
101 |                     getattr(pl_module, f"train_{loss_name}_loss").compute(),
102 |                 )
103 |                 getattr(pl_module, f"train_{loss_name}_loss").reset()
104 |             else:
105 |                 value = getattr(pl_module, f"test_{loss_name}_accuracy").compute()
106 |                 pl_module.log(f"{loss_name}/test/accuracy_epoch", value)
107 |                 getattr(pl_module, f"test_{loss_name}_accuracy").reset()
108 |                 pl_module.log(
109 |                     f"{loss_name}/test/loss_epoch",
110 |                     getattr(pl_module, f"test_{loss_name}_loss").compute(),
111 |                 )
112 |                 getattr(pl_module, f"test_{loss_name}_loss").reset()
113 | 
114 |                 value = getattr(pl_module, f"dev_{loss_name}_accuracy").compute()
115 |                 pl_module.log(f"{loss_name}/dev/accuracy_epoch", value)
116 |                 getattr(pl_module, f"dev_{loss_name}_accuracy").reset()
117 |                 pl_module.log(
118 |                     f"{loss_name}/dev/loss_epoch",
119 |                     getattr(pl_module, f"dev_{loss_name}_loss").compute(),
120 |                 )
121 |                 getattr(pl_module, f"dev_{loss_name}_loss").reset()
122 |         elif loss_name == "irtr":
123 |             pl_module.log(
124 |                 f"{loss_name}/{phase}/irtr_loss_epoch",
125 |                 getattr(pl_module, f"{phase}_irtr_loss").compute(),
126 |             )
127 |             getattr(pl_module, f"{phase}_irtr_loss").reset()
128 |         elif loss_name == "mppd" or loss_name == "mpfr":
129 |             pl_module.log(
130 |                 f"{loss_name}/{phase}/loss_epoch",
131 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
132 |             )
133 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
134 |         elif loss_name == "itm":
135 |             value = getattr(pl_module, f"{phase}_{loss_name}_accuracy").compute()
136 |             pl_module.log(f"{loss_name}/{phase}/accuracy_epoch", value)
137 |             getattr(pl_module, f"{phase}_{loss_name}_accuracy").reset()
138 |             pl_module.log(
139 |                 f"{loss_name}/{phase}/loss_epoch",
140 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
141 |             )
142 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
143 |         else:
144 |             value = getattr(pl_module, f"{phase}_{loss_name}_accuracy").compute()
145 |             pl_module.log(f"{loss_name}/{phase}/accuracy_epoch", value)
146 |             getattr(pl_module, f"{phase}_{loss_name}_accuracy").reset()
147 |             pl_module.log(
148 |                 f"{loss_name}/{phase}/loss_epoch",
149 |                 getattr(pl_module, f"{phase}_{loss_name}_loss").compute(),
150 |             )
151 |             getattr(pl_module, f"{phase}_{loss_name}_loss").reset()
152 | 
153 |         the_metric += value
154 | 
155 |     pl_module.log(f"{phase}/the_metric", the_metric)
156 | 
157 | 
158 | def check_non_acc_grad(pl_module):
159 |     if pl_module.token_type_embeddings.weight.grad is None:
160 |         return True
161 |     else:
162 |         grad = pl_module.token_type_embeddings.weight.grad
163 |         return (grad.sum() == 0).item()
164 | 
165 | 
166 | def set_task(pl_module):
167 |     pl_module.current_tasks = [
168 |         k for k, v in pl_module.hparams.config["loss_names"].items() if v > 0
169 |     ]
170 |     return
171 | 
172 | def set_schedule(pl_module):
173 |     lr = pl_module.hparams.config["learning_rate"]
174 |     wd = pl_module.hparams.config["weight_decay"]
175 | 
176 |     no_decay = [
177 |         "bias",
178 |         "LayerNorm.bias",
179 |         "LayerNorm.weight",
180 |         "norm.bias",
181 |         "norm.weight",
182 |         "norm1.bias",
183 |         "norm1.weight",
184 |         "norm2.bias",
185 |         "norm2.weight",
186 |     ]
187 |     head_names = ["vqa_classifier", "nlvr2_classifier", "mlm_score", "itm_score", "snli_classifier"]
188 |     cross_modal_names = ['cross_modal']
189 |     lr_mult_head = pl_module.hparams.config["lr_mult_head"]
190 |     lr_mult_cross_modal = pl_module.hparams.config["lr_mult_cross_modal"]
191 |     end_lr = pl_module.hparams.config["end_lr"]
192 |     decay_power = pl_module.hparams.config["decay_power"]
193 |     optim_type = pl_module.hparams.config["optim_type"]
194 |     optimizer_grouped_parameters = [
195 |         {
196 |             "params": [
197 |                 p
198 |                 for n, p in pl_module.named_parameters()
199 |                 if not any(nd in n for nd in no_decay)
200 |                 and not any(bb in n for bb in head_names)
201 |                 and not any(ht in n for ht in cross_modal_names)
202 |             ],
203 |             "weight_decay": wd,
204 |             "lr": lr,
205 |         },
206 |         {
207 |             "params": [
208 |                 p
209 |                 for n, p in pl_module.named_parameters()
210 |                 if any(nd in n for nd in no_decay)
211 |                 and not any(bb in n for bb in head_names)
212 |                 and not any(ht in n for ht in cross_modal_names)
213 |             ],
214 |             "weight_decay": 0.0,
215 |             "lr": lr,
216 |         },
217 |         {
218 |             "params": [
219 |                 p
220 |                 for n, p in pl_module.named_parameters()
221 |                 if not any(nd in n for nd in no_decay)
222 |                 and any(bb in n for bb in head_names)
223 |                 and not any(ht in n for ht in cross_modal_names)
224 |             ],
225 |             "weight_decay": wd,
226 |             "lr": lr * lr_mult_head,
227 |         },
228 |         {
229 |             "params": [
230 |                 p
231 |                 for n, p in pl_module.named_parameters()
232 |                 if any(nd in n for nd in no_decay) and any(bb in n for bb in head_names)
233 |                 and not any(ht in n for ht in cross_modal_names)
234 |             ],
235 |             "weight_decay": 0.0,
236 |             "lr": lr * lr_mult_head,
237 |         },
238 |         {
239 |             "params": [
240 |                 p
241 |                 for n, p in pl_module.named_parameters()
242 |                 if not any(nd in n for nd in no_decay)
243 |                 and not any(bb in n for bb in head_names)
244 |                 and any(ht in n for ht in cross_modal_names)
245 |             ],
246 |             "weight_decay": wd,
247 |             "lr": lr * lr_mult_cross_modal,
248 |         },
249 |         {
250 |             "params": [
251 |                 p
252 |                 for n, p in pl_module.named_parameters()
253 |                 if any(nd in n for nd in no_decay)
254 |                 and not any(bb in n for bb in head_names)
255 |                 and any(ht in n for ht in cross_modal_names)
256 |             ],
257 |             "weight_decay": 0.0,
258 |             "lr": lr * lr_mult_cross_modal,
259 |         },
260 |     ]
261 | 
262 |     if optim_type == "adamw":
263 |         optimizer = AdamW(
264 |             optimizer_grouped_parameters, lr=lr, eps=1e-8, betas=(0.9, 0.98)
265 |         )
266 |     elif optim_type == "adam":
267 |         optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=lr)
268 |     elif optim_type == "sgd":
269 |         optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=lr, momentum=0.9)
270 | 
271 |     if pl_module.trainer.max_steps is None:
272 |         max_steps = (
273 |             len(pl_module.trainer.datamodule.train_dataloader())
274 |             * pl_module.trainer.max_epochs
275 |             // pl_module.trainer.accumulate_grad_batches
276 |         )
277 |     else:
278 |         max_steps = pl_module.trainer.max_steps
279 | 
280 |     warmup_steps = pl_module.hparams.config["warmup_steps"]
281 |     if isinstance(pl_module.hparams.config["warmup_steps"], float):
282 |         warmup_steps = int(max_steps * warmup_steps)
283 | 
284 |     if decay_power == "cosine":
285 |         scheduler = get_cosine_schedule_with_warmup(
286 |             optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps,
287 |         )
288 |     else:
289 |         scheduler = get_polynomial_decay_schedule_with_warmup(
290 |             optimizer,
291 |             num_warmup_steps=warmup_steps,
292 |             num_training_steps=max_steps,
293 |             lr_end=end_lr,
294 |             power=decay_power,
295 |         )
296 | 
297 |     sched = {"scheduler": scheduler, "interval": "step"}
298 | 
299 |     return (
300 |         [optimizer],
301 |         [sched],
302 |     )
303 | 
--------------------------------------------------------------------------------
/code/MultiModal-DeepFake-main/models/METER/meter_module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | # import pytorch_lightning as pl
4 | 
5 | import numpy as np
6 | 
7 | from transformers.models.bert.modeling_bert import BertConfig, BertEmbeddings, BertModel, BertEncoder, BertLayer
8 | from .bert_model import BertCrossLayer, BertAttention
9 | from . import swin_transformer as swin
10 | from . import heads, objectives
11 | # from . import meter_utils
12 | from .clip_model import build_model, adapt_position_encoding
13 | from .swin_helpers import swin_adapt_position_encoding
14 | from transformers import RobertaConfig, RobertaModel
15 | import pdb
16 | class METERTransformerSS(nn.Module):
17 |     def __init__(self, config):
18 |         super().__init__()
19 |         # self.save_hyperparameters()
20 |         self.current_tasks = list()
21 |         self.is_clip = ('swin' not in config['vit'])
22 | 
23 |         if 'roberta' in config['tokenizer']:
24 |             bert_config = RobertaConfig(
25 |                 vocab_size=config["vocab_size"],
26 |                 hidden_size=config["hidden_size"],
27 |                 num_hidden_layers=config["num_layers"],
28 |                 num_attention_heads=config["num_heads"],
29 |                 intermediate_size=config["hidden_size"] * config["mlp_ratio"],
30 |                 max_position_embeddings=config["max_text_len"],
31 |                 hidden_dropout_prob=config["drop_rate"],
32 |                 attention_probs_dropout_prob=config["drop_rate"],
33 |             )
34 |         else:
35 |             bert_config = BertConfig(
36 |                 vocab_size=config["vocab_size"],
37 |                 hidden_size=config["hidden_size"],
38 |                 num_hidden_layers=config["num_layers"],
39 |                 num_attention_heads=config["num_heads"],
40 |                 intermediate_size=config["hidden_size"] * config["mlp_ratio"],
41 |                 max_position_embeddings=config["max_text_len"],
42 |                 hidden_dropout_prob=config["drop_rate"],
43 |                 attention_probs_dropout_prob=config["drop_rate"],
44 |             )
45 | 
46 |         resolution_after = config['image_size']
47 | 
48 |         self.cross_modal_text_transform = nn.Linear(config['input_text_embed_size'], config['hidden_size'])
49 |         self.cross_modal_text_transform.apply(objectives.init_weights)
50 |         self.cross_modal_image_transform = nn.Linear(config['input_image_embed_size'], config['hidden_size'])
51 |         self.cross_modal_image_transform.apply(objectives.init_weights)
52 | 
53 |         self.token_type_embeddings = nn.Embedding(2, config["hidden_size"])
54 |         self.token_type_embeddings.apply(objectives.init_weights)
55 | 
56 |         if torch.distributed.is_initialized():
57 |             if torch.distributed.get_rank() == 0:
58 |                 if self.is_clip:
59 |                     build_model(config['vit'], resolution_after=resolution_after)
60 |                 else:
61 |                     getattr(swin, config["vit"])(  # plain nn.Module: use the local config, not self.hparams
62 |                         pretrained=True, config=config,
63 |                     )
64 | 
65 |                 if 'roberta' in config['tokenizer']:
66 |                     RobertaModel.from_pretrained(config['tokenizer'])
67 |                 else:
68 |                     BertModel.from_pretrained(config['tokenizer'])
69 | 
70 |             torch.distributed.barrier()
71 | 
72 |         if self.is_clip:
73 |             self.vit_model = build_model(config['vit'], resolution_after=resolution_after)
74 |         else:
75 |             self.vit_model = getattr(swin, config["vit"])(
76 |                 pretrained=True, config=config,
77 |             )
78 |             self.avgpool = nn.AdaptiveAvgPool1d(1)
79 | 
80 |         if 'roberta' in config['tokenizer']:
81 |             self.text_transformer = RobertaModel.from_pretrained(config['tokenizer'])
82 |         else:
83 |             self.text_transformer = BertModel.from_pretrained(config['tokenizer'])
84 | 
85 |         self.cross_modal_image_layers = nn.ModuleList([BertCrossLayer(bert_config) for _ in range(config['num_top_layer'])])
86 |         self.cross_modal_image_layers.apply(objectives.init_weights)
87 |         self.cross_modal_text_layers = nn.ModuleList([BertCrossLayer(bert_config) for _ in range(config['num_top_layer'])])
88 |         self.cross_modal_text_layers.apply(objectives.init_weights)
89 | 
90 |         self.cross_modal_image_pooler = heads.Pooler(config["hidden_size"])
91 |         self.cross_modal_image_pooler.apply(objectives.init_weights)
92 |         self.cross_modal_text_pooler = heads.Pooler(config["hidden_size"])
93 |         self.cross_modal_text_pooler.apply(objectives.init_weights)
94 | 
95 |         ckpt = torch.load(config["load_path"], map_location="cpu")
96 |         state_dict = ckpt["state_dict"]
97 | 
98 |         del state_dict['vit_model.visual.positional_embedding']
99 |         msg = self.load_state_dict(state_dict, strict=False)
100 |         print(msg)
101 |         # if config["loss_names"]["mlm"] > 0:
102 |         #     self.mlm_score = heads.MLMHead(bert_config)
103 |         #     self.mlm_score.apply(objectives.init_weights)
104 | 
105 |         # if config["loss_names"]["itm"] > 0:
106 |         #     self.itm_score = heads.ITMHead(config["hidden_size"]*2)
107 |         #     self.itm_score.apply(objectives.init_weights)
108 | 
109 |         # hs = self.hparams.config["hidden_size"]
110 | 
111 |         # if self.hparams.config["loss_names"]["vqa"] > 0:
112 |         #     vs = self.hparams.config["vqav2_label_size"]
113 |         #     self.vqa_classifier = nn.Sequential(
114 |         #         nn.Linear(hs * 2, hs * 2),
115 |         #         nn.LayerNorm(hs * 2),
116 |         #         nn.GELU(),
117 |         #         nn.Linear(hs * 2, vs),
118 |         #     )
119 |         #     self.vqa_classifier.apply(objectives.init_weights)
120 | 
121 |         # ===================== Downstream ===================== #
122 |         # if (
123 |         #     self.hparams.config["load_path"] != ""
124 |         #     and not self.hparams.config["test_only"]
125 |         # ):
126 |         #     ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
127 |         #     state_dict = ckpt["state_dict"]
128 |         #     if self.is_clip:
129 |         #         state_dict = adapt_position_encoding(state_dict, after=resolution_after, patch_size=self.hparams.config['patch_size'])
130 |         #     else:
131 |         #         state_dict = swin_adapt_position_encoding(state_dict, after=resolution_after, before=config['resolution_before'])
132 |         #     self.load_state_dict(state_dict, strict=False)
133 | 
134 | 
135 |         # if self.hparams.config["loss_names"]["nlvr2"] > 0:
136 |         #     self.nlvr2_classifier = nn.Sequential(
137 |         #         nn.Linear(hs * 4, hs * 2),
138 |         #         nn.LayerNorm(hs * 2),
139 |         #         nn.GELU(),
140 |         #         nn.Linear(hs * 2, 2),
141 |         #     )
142 |         #     self.nlvr2_classifier.apply(objectives.init_weights)
143 |         #     emb_data = self.token_type_embeddings.weight.data
144 |         #     self.token_type_embeddings = nn.Embedding(3, hs)
145 |         #     self.token_type_embeddings.apply(objectives.init_weights)
146 |         #     self.token_type_embeddings.weight.data[0, :] = emb_data[0, :]
147 |         #     self.token_type_embeddings.weight.data[1, :] = emb_data[1, :]
148 |         #     self.token_type_embeddings.weight.data[2, :] = emb_data[1, :]
149 | 
150 |         # if self.hparams.config["loss_names"]["snli"] > 0:
151 |         #     self.snli_classifier = nn.Sequential(
152 |         #         nn.Linear(hs * 2, hs * 2),
153 |         #         nn.LayerNorm(hs * 2),
154 |         #         nn.GELU(),
155 |         #         nn.Linear(hs * 2, 3),
156 |         #     )
157 |         #     self.snli_classifier.apply(objectives.init_weights)
158 | 
159 |         # if self.hparams.config["loss_names"]["irtr"] > 0:
160 |         #     self.rank_output = nn.Linear(hs, 1)
161 |         #     self.rank_output.weight.data = self.itm_score.fc.weight.data[1:, :]
162 |         #     self.rank_output.bias.data = self.itm_score.fc.bias.data[1:]
163 |         #     self.margin = 0.2
164 |         #     for p in self.itm_score.parameters():
165 |         #         p.requires_grad = False
166 | 
167 |         # meter_utils.set_metrics(self)
168 |         # self.current_tasks = list()
169 | 
170 |         # # ===================== load downstream (test_only) ======================
171 | 
172 |         # if self.hparams.config["load_path"] != "" and self.hparams.config["test_only"]:
173 |         #     ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
174 |         #     state_dict = ckpt["state_dict"]
175 |         #     if self.is_clip:
176 |         #         state_dict = adapt_position_encoding(state_dict, after=resolution_after, patch_size=self.hparams.config['patch_size'])
177 |         #     else:
178 |         #         state_dict = swin_adapt_position_encoding(state_dict, after=resolution_after, before=config['resolution_before'])
179 |         #     self.load_state_dict(state_dict, strict=False)
180 | 
181 |     def infer(
182 |         self,
183 |         batch,
184 |         mask_text=False,
185 |         mask_image=False,
186 |         image_token_type_idx=1,
187 |         img=None,
188 |     ):
189 |         if img is None:
190 |             if f"image_{image_token_type_idx - 1}" in batch:
191 |                 imgkey = f"image_{image_token_type_idx - 1}"
192 |             else:
193 |                 imgkey = "image"
194 |             img = batch[imgkey][0]
195 | 
196 |         do_mlm = "_mlm" if mask_text else ""
197 |         text_ids = batch[f"text_ids{do_mlm}"]
198 |         text_masks = batch["text_masks"]
199 | 
200 |         #### text encoder ####
201 |         text_embeds = self.text_transformer.embeddings(input_ids=text_ids)
202 |         device = text_embeds.device
203 |         input_shape = text_masks.size()
204 |         extend_text_masks = self.text_transformer.get_extended_attention_mask(text_masks, input_shape, device)
205 |         for layer in self.text_transformer.encoder.layer:
206 |             text_embeds = layer(text_embeds, extend_text_masks)[0]
207 |         text_embeds = self.cross_modal_text_transform(text_embeds)
208 | 
209 |         #### img encoder ####
210 |         image_embeds = self.vit_model(img)
211 |         image_embeds = self.cross_modal_image_transform(image_embeds)
212 |         image_masks = torch.ones((image_embeds.size(0), image_embeds.size(1)), dtype=torch.long, device=device)
213 |         extend_image_masks = self.text_transformer.get_extended_attention_mask(image_masks, image_masks.size(), device)
214 | 
215 |         text_embeds, image_embeds = (
216 |             text_embeds + self.token_type_embeddings(torch.zeros_like(text_masks)),
217 |             image_embeds
218 |             + self.token_type_embeddings(
219 |                 torch.full_like(image_masks, image_token_type_idx)
220 |             ),
221 |         )
222 | 
223 |         #### interaction ####
224 |         x, y = text_embeds, image_embeds
225 |         for text_layer, image_layer in zip(self.cross_modal_text_layers, self.cross_modal_image_layers):
226 |             x1 = text_layer(x, y, extend_text_masks, extend_image_masks)
227 |             y1 = image_layer(y, x, extend_image_masks, extend_text_masks)
228 |             x, y = x1[0], y1[0]
229 | 
230 |         text_feats, image_feats = x, y
231 |         cls_feats_text = self.cross_modal_text_pooler(x)
232 |         if self.is_clip:
233 |             cls_feats_image = self.cross_modal_image_pooler(y)
234 |         else:
235 |             avg_image_feats = self.avgpool(image_feats.transpose(1, 2)).view(image_feats.size(0), 1, -1)
236 |             cls_feats_image = self.cross_modal_image_pooler(avg_image_feats)
237 |         cls_feats = torch.cat([cls_feats_text, cls_feats_image], dim=-1)
238 | 
239 |         ret = {
240 |             "text_feats": text_feats,
241 |             "image_feats": image_feats,
242 |             "cls_feats": cls_feats,
243 |             "text_ids": text_ids,
244 |             "text_masks": text_masks,
245 |         }
246 | 
247 | 
248 |         return ret
249 | 
250 |     def forward(self, batch):
251 |         ret = dict()
252 |         if len(self.current_tasks) == 0:
253 |             ret.update(self.infer(batch))
254 |             return ret
255 | 
256 |         # Masked Language Modeling
257 |         if "mlm" in self.current_tasks:
258 |             ret.update(objectives.compute_mlm(self, batch))
259 | 
260 |         # Image Text Matching
261 |         if "itm" in self.current_tasks:
262 |             ret.update(objectives.compute_itm(self, batch))
263 | 
264 |         # Visual Question Answering
265 |         if "vqa" in self.current_tasks:
266 |             ret.update(objectives.compute_vqa(self, batch))
267 | 
268 |         # Natural Language for Visual Reasoning 2
269 |         if "nlvr2" in self.current_tasks:
270 |             ret.update(objectives.compute_nlvr2(self, batch))
271 | 
272 |         # SNLI Visual Entailment
273 |         if "snli" in self.current_tasks:
274 |             ret.update(objectives.compute_snli(self, batch))
275 | 
276 |         # Image Retrieval and Text Retrieval
277 |         if "irtr" in self.current_tasks:
278 |             ret.update(objectives.compute_irtr(self, batch))
279 | 
280 |         return ret
281 | 
--------------------------------------------------------------------------------
/code/MultiModal-DeepFake-main/models/consist_modeling.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import pdb
4 | from .interaction import Self_Interaction
5 | from timm.models.layers import trunc_normal_
6 | 
7 | def calculate_patch_labels(images, boxes, fake_text_pos, num_patches=(16, 16)):
8 |     # get the image dimensions
9 |     _, height, width = images.shape[1:4]
10 | 
11 |     # compute the size of each patch
12 |     patch_height = height // num_patches[0]
13 |     patch_width = width // num_patches[1]
14 | 
15 |     # convert boxes to a tensor
16 |     # boxes = torch.tensor(boxes)  # shape: [N, 4]
17 | 
18 |     # compute the box coordinates (boxes are normalized (cx, cy, w, h))
19 |     box_x1 = (boxes[:, 0] * width).int()
20 |     box_y1 = (boxes[:, 1] * height).int()
21 |     box_w = (boxes[:, 2] * width).int()
22 |     box_h = (boxes[:, 3] * height).int()
23 | 
24 |     # box_x2 = box_x1 + box_w
25 |     # box_y2 = box_y1 + box_h
26 | 
27 |     box_x2 = box_x1 + 0.5 * box_w
28 |     box_y2 = box_y1 + 0.5 * box_h
29 | 
30 |     box_x1 = box_x1 - 0.5 * box_w
31 |     box_y1 = box_y1 - 0.5 * box_h
32 | 
33 |     # compute the patch coordinates
34 |     patch_x1 = torch.arange(0, width, patch_width).view(1, -1).expand(boxes.size(0), -1).to(boxes.device)
35 |     patch_y1 = torch.arange(0, height, patch_height).view(1, -1).expand(boxes.size(0), -1).to(boxes.device)
36 |     patch_x2 = patch_x1 + patch_width
37 |     patch_y2 = patch_y1 + patch_height
38 | 
39 |     # compute the area of each patch
40 |     patch_area = patch_width * patch_height
41 | 
42 |     # compute the intersection region
43 |     inter_x1 = torch.max(patch_x1, box_x1.view(-1, 1))
44 |     inter_y1 = torch.max(patch_y1, box_y1.view(-1, 1))
45 |     inter_x2 = torch.min(patch_x2, box_x2.view(-1, 1))
46 |     inter_y2 = torch.min(patch_y2, box_y2.view(-1, 1))
47 | 
48 |     # compute the area of the intersection region
49 | 
50 |     inter_area = torch.max(torch.tensor(0), inter_x2 - inter_x1).unsqueeze(1) * torch.max(torch.tensor(0), inter_y2 - inter_y1).unsqueeze(2)
51 | 
52 |     # criterion: does the intersection cover more than half of the patch area
53 |     labels = (inter_area > (patch_area / 2)).int()
54 | 
55 |     labels_extented = labels.view(images.shape[0], -1, 1)
56 | 
57 |     consistency_matrix = (labels_extented == labels_extented.transpose(2, 1)).int()
58 | 
59 |     labels_extented_it = labels.view(images.shape[0], 1, -1)
60 |     fake_text_pos_extented = fake_text_pos.view(images.shape[0], -1, 1)
61 | 
62 |     consistency_matrix_it = ((labels_extented_it + fake_text_pos_extented) < 1).int()
63 | 
64 |     return consistency_matrix, consistency_matrix_it, labels.view(images.shape[0], -1)
65 | 
66 | def get_sscore_label(img, fake_img_box, fake_text_pos, len_edge=16):
67 |     consistency_matrix, consistency_matrix_it, labels = calculate_patch_labels(img, fake_img_box, fake_text_pos, (len_edge, len_edge))
68 | 
69 |     patch_score = consistency_matrix.sum(dim=-1) / (len_edge * len_edge)
70 |     img_score = patch_score.sum(dim=-1) / (len_edge * len_edge)
71 | 
72 |     return consistency_matrix, labels, patch_score, img_score, consistency_matrix_it
73 | 
74 | def get_sscore_label_text(fake_text_pos):
75 | 
76 |     fake_text_pos_extend = fake_text_pos.unsqueeze(-1)
77 |     sim_matrix = ((fake_text_pos_extend == fake_text_pos_extend.transpose(2, 1))).int()
78 |     matrix_mask = ((fake_text_pos_extend + fake_text_pos_extend.transpose(2, 1)) >= 0)
79 |     for i in range(fake_text_pos.shape[0]):
80 |         sim_matrix[i].fill_diagonal_(1)
81 |     return sim_matrix, matrix_mask
82 | 
83 | class Intra_Modal_Modeling(nn.Module):
84 | 
85 |     def __init__(self, num_head, hidden_dim, input_dim, output_dim, tok_num):
86 |         super().__init__()
87 | 
88 |         self.correlation_model = Self_Interaction(num_head, hidden_dim, input_dim, output_dim, layers=3)
89 |         self.consist_encoder = nn.Sequential(nn.Linear(output_dim, 256),
90 |                                              nn.LayerNorm(256),
91 |                                              nn.GELU(),
92 |                                              nn.Linear(256, 128),
93 |                                              nn.LayerNorm(128),
94 |                                              nn.GELU(),
95 |                                              nn.Linear(128, 64))
96 |         self.token_number = tok_num
97 |         self.aggregator = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
98 |         self.aggregator_mlp = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
99 |         self.aggregator_2 = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
100 |         self.aggregator_mlp_2 = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
101 |         self.num_head = 4
102 | 
103 |     def build_mlp(self, input_dim, output_dim):
104 |         return nn.Sequential(
105 |             nn.Linear(input_dim, input_dim * 2),
106 |             nn.LayerNorm(input_dim * 2),
107 |             nn.GELU(),
108 |             nn.Linear(input_dim * 2, input_dim * 2),
109 |             nn.LayerNorm(input_dim * 2),
110 |             nn.GELU(),
111 |             nn.Linear(input_dim * 2, output_dim)
112 |         )
113 | 
114 |     def forward(self, feats, mask, pos_emb, matrix_mask=None):
115 | 
116 |         B, N, C = feats.shape
117 |         feats = self.correlation_model(feats, mask, pos_emb)
118 |         consist_feats = self.consist_encoder(feats)
119 | 
120 |         norms = torch.norm(consist_feats, p=2, dim=2, keepdim=True)
121 |         normalized_vectors = consist_feats / norms
122 |         similarity_matrix = torch.bmm(normalized_vectors, normalized_vectors.transpose(1, 2))
123 |         similarity_matrix = torch.clamp((similarity_matrix + 1) / 2, 0, 1)
124 | 
125 |         if mask.sum() > 0:  # for text inputs
126 |             similarity_matrix_unsim = similarity_matrix.clone()
127 |             similarity_matrix_unsim[~matrix_mask] = 2
128 | 
129 |             similarity_matrix_sim = similarity_matrix.clone()
130 |             similarity_matrix_sim[~matrix_mask] = -1
131 |             diagonal_mask = torch.eye(N, device=feats.device).unsqueeze(0).expand(B, N, N)
132 |             similarity_matrix_sim = similarity_matrix_sim - diagonal_mask
133 | 
134 |         else:  # for image inputs
135 |             similarity_matrix_unsim = similarity_matrix.clone()
136 |             similarity_matrix_sim = similarity_matrix.clone()
137 |             diagonal_mask = torch.eye(N, device=feats.device).unsqueeze(0).expand(B, N, N)
138 |             similarity_matrix_sim = similarity_matrix_sim - diagonal_mask  # ignore self-similarity
139 | 
140 |         unsim_feats_index = torch.topk(similarity_matrix_unsim, self.token_number, dim=-1, largest=False)[1]
141 |         unsim_attn_mask = torch.ones([B, N, N], dtype=torch.bool).to(unsim_feats_index.device)
142 |         batch_indices = torch.arange(B).view(B, 1, 1)  # shape (B, 1, 1), broadcasts to (B, N, m)
143 |         row_indices = torch.arange(N).view(1, N, 1)  # shape (1, N, 1), broadcasts to (B, N, m)
144 |         unsim_attn_mask[batch_indices, row_indices, unsim_feats_index] = False
145 |         unsim_attn_mask = unsim_attn_mask.repeat(self.num_head, 1, 1)
146 | 
147 |         sim_feats_index = torch.topk(similarity_matrix_sim, self.token_number, dim=-1, largest=True)[1]
148 |         sim_attn_mask = torch.ones([B, N, N], dtype=torch.bool).to(sim_feats_index.device)
149 |         batch_indices = torch.arange(B).view(B, 1, 1)  # shape (B, 1, 1), broadcasts to (B, N, m)
150 |         row_indices = torch.arange(N).view(1, N, 1)  # shape (1, N, 1), broadcasts to (B, N, m)
151 |         sim_attn_mask[batch_indices, row_indices, sim_feats_index] = False
152 |         sim_attn_mask = sim_attn_mask.repeat(self.num_head, 1, 1)
153 | 
154 |         feats = feats + self.aggregator_mlp(self.aggregator(query=feats,
155 |                                                             key=feats,
156 |                                                             value=feats,
157 |                                                             attn_mask=sim_attn_mask)[0])
158 | 
159 |         feats = feats + self.aggregator_mlp_2(self.aggregator_2(query=feats,
160 |                                                                  key=feats,
161 |                                                                  value=feats,
162 |                                                                  attn_mask=unsim_attn_mask)[0])
163 | 
164 |         return feats, similarity_matrix, consist_feats
165 | 
166 | 
167 | class Extra_Modal_Modeling(nn.Module):
168 | 
169 |     def __init__(self, num_head, output_dim, tok_num):
170 |         super().__init__()
171 | 
172 |         self.feat_encoder = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
173 |         self.cross_encoder = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
174 |         self.token_number = tok_num
175 | 
176 |         self.consist_encoder_feat = nn.Sequential(nn.Linear(output_dim, 256),
177 |                                                   nn.LayerNorm(256),
178 |                                                   nn.GELU(),
179 |                                                   nn.Linear(256, 128),
180 |                                                   nn.LayerNorm(128),
181 |                                                   nn.GELU(),
182 |                                                   nn.Linear(128, 64))
183 | 
184 |         self.consist_encoder_cross = nn.Sequential(nn.Linear(output_dim, 256),
185 |                                                    nn.LayerNorm(256),
186 |                                                    nn.GELU(),
187 |                                                    nn.Linear(256, 128),
188 |                                                    nn.LayerNorm(128),
189 |                                                    nn.GELU(),
190 |                                                    nn.Linear(128, 64))
191 | 
192 |         self.cls_token_cross = nn.Parameter(torch.zeros(1, 1, output_dim))
193 |         self.aggregator_cross = nn.MultiheadAttention(output_dim, num_head, dropout=0.0, batch_first=True)
194 |         self.norm_layer_cross = nn.LayerNorm(output_dim)
195 | 
196 |         self.cls_token_feat = nn.Parameter(torch.zeros(1, 1, output_dim))
197 |         self.aggregator_feat = nn.MultiheadAttention(output_dim, num_head, dropout=0.0, batch_first=True)
198 |         self.norm_layer_feat = nn.LayerNorm(output_dim)
199 | 
200 |         self.aggregator = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
201 |         self.aggregator_mlp = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
202 |         self.aggregator_2 = nn.MultiheadAttention(output_dim, 4, dropout=0.0, batch_first=True)
203 |         self.aggregator_mlp_2 = self.build_mlp(input_dim=output_dim, output_dim=output_dim)
204 | 
205 |         trunc_normal_(self.cls_token_cross, std=.02)
206 |         trunc_normal_(self.cls_token_feat, std=.02)
207 | 
208 |     def build_mlp(self, input_dim, output_dim):
209 |         return nn.Sequential(
210 |             nn.Linear(input_dim, input_dim * 2),
211 |             nn.LayerNorm(input_dim * 2),
212 |             nn.GELU(),
213 |             nn.Linear(input_dim * 2, input_dim * 2),
214 |             nn.LayerNorm(input_dim * 2),
215 |             nn.GELU(),
216 |             nn.Linear(input_dim * 2, output_dim)
217 |         )
218 | 
219 |     def forward(self, feats, global_feature, cross_feat, feats_mask, cross_mask):
220 | 
221 |         bs, _, _ = feats.shape
222 | 
223 |         feats = self.feat_encoder(feats)
224 |         cross_feat = self.cross_encoder(cross_feat)
225 | 
226 |         cls_token_cross = self.cls_token_cross.expand(bs, -1, -1)
227 |         feat_aggr_cross = self.aggregator_cross(query=self.norm_layer_cross(cls_token_cross),
228 |                                                 key=self.norm_layer_cross(cross_feat),
229 |                                                 value=self.norm_layer_cross(cross_feat),
230 |                                                 key_padding_mask=cross_mask)[0]
231 | 
232 |         feats_consist = self.consist_encoder_feat(feats)
233 |         cross_feats_consist = self.consist_encoder_feat(feat_aggr_cross)
234 | 
235 |         norms_feat = torch.norm(feats_consist, p=2, dim=2, keepdim=True)
236 |         norms_cross = torch.norm(cross_feats_consist, p=2, dim=2, keepdim=True)
237 |         sim_matrix = torch.bmm(feats_consist / norms_feat, (cross_feats_consist / norms_cross).transpose(1, 2))
238 |         sim_matrix = torch.clamp((sim_matrix + 1) / 2, 0, 1).squeeze()
239 | 
240 |         cls_token = self.cls_token_feat.expand(bs, -1, -1)
241 |         global_feats_mask = torch.zeros(feats_mask.shape[0], 1).bool().to(feats_mask.device)
242 |         feat_aggr = self.aggregator_feat(query=self.norm_layer_feat(cls_token),
243 |                                          key=self.norm_layer_feat(torch.cat([global_feature, feats], dim=1)),
244 |                                          value=self.norm_layer_feat(torch.cat([global_feature, feats], dim=1)),
245 |                                          key_padding_mask=torch.cat([global_feats_mask, feats_mask], dim=1))[0]
246 | 
247 |         if feats_mask.sum() > 0:  # for text inputs
248 |             sim_score = sim_matrix.clone()
249 |             sim_score[feats_mask] = -1
250 | 
251 |             unsim_score = sim_matrix.clone()
252 |             unsim_score[feats_mask] = 2
253 | 
254 |         else:  # for image inputs
255 |             sim_score = sim_matrix.clone()
256 |             unsim_score = sim_matrix.clone()
257 | 
258 |         unsim_index = torch.topk(unsim_score, self.token_number, dim=-1, largest=False)[1]
259 |         unsim_patch = feats[torch.arange(feats.shape[0]).unsqueeze(1), unsim_index]
260 | 
261 |         sim_index = torch.topk(sim_score, self.token_number, dim=-1, largest=True)[1]
262 |         sim_patch = feats[torch.arange(feats.shape[0]).unsqueeze(1), sim_index]
263 | 
264 |         feat_aggr = feat_aggr + self.aggregator_mlp(self.aggregator(query=feat_aggr,
265 |                                                                     key=sim_patch,
266 |                                                                     value=sim_patch)[0])
267 | 
268 |         feat_aggr = feat_aggr + self.aggregator_mlp_2(self.aggregator_2(query=feat_aggr,
269 |                                                                         key=unsim_patch,
270 |                                                                         value=unsim_patch)[0])
271 | 
272 |         return feat_aggr, sim_matrix, feats_consist
273 | 
--------------------------------------------------------------------------------
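A quick way to sanity-check the OP/OR/OF1 (overall, micro-averaged) and CP/CR/CF1 (per-class, macro-averaged) definitions in tools/multilabel_metrics.py is to replay the evaluation() logic on a toy score matrix. The snippet below is an illustrative, self-contained replica rather than part of the repository; like the original, it treats a score >= 0 as a positive prediction and assumes every class has at least one ground-truth positive.

import numpy as np

def toy_evaluation(scores, targets):
    Nc = ((scores >= 0) & (targets == 1)).sum(axis=0).astype(float)  # true positives per class
    Np = (scores >= 0).sum(axis=0).astype(float)                     # predicted positives per class
    Ng = (targets == 1).sum(axis=0).astype(float)                    # ground-truth positives per class
    Np[Np == 0] = 1                                                  # same guard as evaluation()
    OP, OR = Nc.sum() / Np.sum(), Nc.sum() / Ng.sum()                # overall (micro-averaged)
    CP, CR = (Nc / Np).mean(), (Nc / Ng).mean()                      # per-class (macro-averaged)
    return OP, OR, 2 * OP * OR / (OP + OR), CP, CR, 2 * CP * CR / (CP + CR)

scores = np.array([[0.3, -0.2], [-0.1, 0.4]])   # score >= 0 means "predicted positive"
targets = np.array([[1, 0], [0, 1]])
print(toy_evaluation(scores, targets))          # perfect predictions -> six 1.0 values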
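The six optimizer_grouped_parameters entries built by set_schedule() are the Cartesian product of {weight decay, no weight decay} and {backbone, task head, cross-modal layer}, with the head and cross-modal groups getting lr scaled by lr_mult_head and lr_mult_cross_modal. A minimal sketch of that routing follows; the pattern lists are abbreviated from the real ones and the parameter names are made up for illustration.

no_decay = ["bias", "LayerNorm.weight"]        # abbreviated from set_schedule()'s real list
head_names = ["itm_score", "snli_classifier"]  # abbreviated
cross_modal_names = ["cross_modal"]

def group_of(name):
    nd = any(k in name for k in no_decay)
    if any(k in name for k in head_names):
        kind = "head"            # lr * lr_mult_head
    elif any(k in name for k in cross_modal_names):
        kind = "cross_modal"     # lr * lr_mult_cross_modal
    else:
        kind = "backbone"        # base lr
    return kind, ("no_decay" if nd else "decay")

for n in ["text_transformer.encoder.layer.0.attention.self.query.weight",
          "cross_modal_text_layers.0.output.LayerNorm.weight",
          "itm_score.fc.bias"]:
    print(n, "->", group_of(n))
# backbone/decay, cross_modal/no_decay, head/no_decay: every name lands in
# exactly one of the six groups (the real code enforces exclusivity with
# explicit `not any(...)` guards in each list comprehension).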
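The interaction loop in METERTransformerSS.infer() advances the text and image streams in lockstep: at every depth, text tokens cross-attend over the image tokens while image tokens cross-attend over the text tokens, and both outputs replace their inputs together. A shape-level sketch of that loop, with plain nn.MultiheadAttention standing in for BertCrossLayer and made-up sizes (2 samples, 40 text tokens, 197 image tokens):

import torch
import torch.nn as nn

hidden, heads, depth = 768, 12, 6   # made-up sizes
text_xattn = nn.ModuleList([nn.MultiheadAttention(hidden, heads, batch_first=True) for _ in range(depth)])
img_xattn = nn.ModuleList([nn.MultiheadAttention(hidden, heads, batch_first=True) for _ in range(depth)])

x = torch.randn(2, 40, hidden)      # text tokens
y = torch.randn(2, 197, hidden)     # image tokens (CLS + 14 x 14 patches)
for t_layer, i_layer in zip(text_xattn, img_xattn):
    x1, _ = t_layer(x, y, y)        # text queries attend over image keys/values
    y1, _ = i_layer(y, x, x)        # image queries attend over text keys/values
    x, y = x1, y1                   # both streams update from the same pre-layer state
# x plays the role of text_feats and y of image_feats; infer() then pools each
# stream and concatenates the two pooled vectors into cls_feats.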
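In calculate_patch_labels(), fake_img_box is a normalized (cx, cy, w, h) box and a patch is labelled fake exactly when the box covers more than half of that patch's area. Here is a worked example of the same rule, written with an explicit loop instead of the broadcasting used in models/consist_modeling.py (illustrative only; a 224 x 224 input and 16 x 16 grid are assumed):

import torch

H = W = 224
P = 16                                      # 16 x 16 patch grid -> 14 x 14 pixel patches
box = torch.tensor([0.5, 0.5, 0.25, 0.25])  # normalized (cx, cy, w, h), centered box

cx, cy, bw, bh = (box * torch.tensor([W, H, W, H])).tolist()
x1, y1, x2, y2 = cx - bw / 2, cy - bh / 2, cx + bw / 2, cy + bh / 2  # corners: 84 .. 140

ph, pw = H // P, W // P
labels = torch.zeros(P, P, dtype=torch.int)
for r in range(P):
    for c in range(P):
        px1, py1 = c * pw, r * ph
        ix = max(0.0, min(px1 + pw, x2) - max(px1, x1))   # horizontal overlap
        iy = max(0.0, min(py1 + ph, y2) - max(py1, y1))   # vertical overlap
        labels[r, c] = int(ix * iy > (pw * ph) / 2)       # > half the patch covered

print(int(labels.sum()))  # 16: the 4 x 4 block of patches fully under the box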
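Intra_Modal_Modeling restricts each of its two aggregation passes to the k most similar (or k least similar) tokens per query by giving nn.MultiheadAttention a boolean attn_mask in which True means "blocked". A minimal sketch of that mask construction with made-up sizes; the module additionally depresses the diagonal and handles text padding before taking the top-k, and the mask repeat mirrors the module's own `.repeat(num_head, 1, 1)`:

import torch

B, N, k, num_head = 2, 6, 2, 4
sim = torch.rand(B, N, N)                    # stand-in for the learned similarity matrix
sim = sim - torch.eye(N).expand(B, N, N)     # depress the diagonal so self is never picked

topk_idx = sim.topk(k, dim=-1, largest=True).indices   # (B, N, k) most similar tokens per query
mask = torch.ones(B, N, N, dtype=torch.bool)           # start fully blocked
b = torch.arange(B).view(B, 1, 1)                      # broadcasts against (B, N, k)
r = torch.arange(N).view(1, N, 1)
mask[b, r, topk_idx] = False                           # unblock only the top-k keys per query
mask = mask.repeat(num_head, 1, 1)                     # (B * num_head, N, N) for MultiheadAttention

assert (~mask).sum().item() == B * num_head * N * k    # exactly k visible keys per query row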
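Extra_Modal_Modeling, by contrast, gathers the selected patches themselves rather than masking attention: each sample contributes its own top-k rows via advanced indexing. A small sketch of that indexing pattern, with made-up shapes:

import torch

B, N, C, k = 2, 5, 8, 3                      # made-up shapes
feats = torch.randn(B, N, C)                 # per-patch features
sim_score = torch.rand(B, N)                 # per-patch similarity to the cross-modal query

idx = sim_score.topk(k, dim=-1, largest=True).indices   # (B, k)
picked = feats[torch.arange(B).unsqueeze(1), idx]       # (B, k, C) via advanced indexing
assert picked.shape == (B, k, C)
# Row b of `picked` holds sample b's k highest-scoring patches; the module feeds
# these (and, separately, the k lowest-scoring ones) as keys/values to its
# cls-token attention blocks.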