├── src ├── nets │ ├── scorers │ │ ├── __init__.py │ │ ├── length_bonus.py │ │ └── ctc.py │ ├── backend │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ ├── conv1d_extractor.py │ │ │ ├── conv3d_extractor.py │ │ │ ├── modules │ │ │ │ ├── resnet.py │ │ │ │ └── shufflenetv2.py │ │ │ └── resnet.py │ │ ├── transformer │ │ │ ├── __init__.py │ │ │ ├── repeat.py │ │ │ ├── layer_norm.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── add_sos_eos.py │ │ │ ├── mask.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── convolution.py │ │ │ ├── decoder_layer.py │ │ │ └── encoder_layer.py │ │ ├── e2e_asr_conformer.py │ │ └── e2e_asr_conformer_av.py │ └── scorer_interface.py ├── ibug │ ├── face_alignment │ │ ├── fan │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── utils.py │ └── face_detection │ │ ├── s3fd │ │ ├── __init__.py │ │ ├── s3fd_predictor.py │ │ └── s3fd_net.py │ │ ├── retina_face │ │ ├── __init__.py │ │ ├── config.py │ │ ├── py_cpu_nms.py │ │ ├── prior_box.py │ │ ├── retina_face.py │ │ ├── retina_face_net.py │ │ └── retina_face_predictor.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── data │ │ │ └── bfm_lms.npy │ │ ├── head_pose_estimator.py │ │ └── simple_face_tracker.py │ │ └── __init__.py ├── retinaface │ ├── 20words_mean_face.npy │ ├── detector.py │ └── utils.py ├── tokenizer │ ├── spm │ │ ├── unigram │ │ │ └── unigram5000.model │ │ ├── spm_train.py │ │ ├── train.sh │ │ └── spm_encode.py │ └── spm_tokenizer.py ├── dataset │ └── transforms.py ├── talking_detector │ ├── Classifier.py │ ├── loss.py │ ├── Model.py │ ├── segmentation.py │ └── ASD.py ├── avhubert_muavic │ ├── avhubert2text.py │ ├── av_transformer_decoder.py │ ├── av2text_config.py │ └── resnet.py ├── auto_asr │ ├── configuration_asr.py │ └── asr_model.py ├── auto_vsr │ ├── configuration_vsr.py │ └── vsr_model.py ├── auto_avsr │ ├── avsr_model.py │ └── configuration_avsr.py ├── avhubert_avsr │ └── avhubert_avsr_model.py ├── cluster │ └── eval.py └── custom_trainer.py ├── docs ├── images │ ├── align.png │ ├── setting.png │ ├── origin_360.png │ ├── padding_360.png │ ├── central_view.png │ ├── face_linking.png │ ├── anno_transcript.png │ ├── mcorec-baseline.png │ └── mcorec_overview.png ├── organizers.md ├── data_preparation.md ├── submission.md ├── index.md └── baseline.md ├── requirements.txt ├── script ├── lip_crop.py ├── asd.py └── evaluate.py └── .gitignore /src/nets/scorers/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /src/ibug/face_alignment/fan/__init__.py: -------------------------------------------------------------------------------- 1 | from .fan_predictor import FANPredictor 2 | -------------------------------------------------------------------------------- /src/ibug/face_detection/s3fd/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd_predictor import S3FDPredictor 2 | 
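Editor's note: the package `__init__.py` files above expose `RetinaFacePredictor`, `S3FDPredictor`, and `FANPredictor`, which the lip-cropping front end consumes. As a rough orientation, the minimal sketch below shows how they are typically wired together, following `src/retinaface/detector.py` and `script/lip_crop.py` further down in this listing. The clip name is a placeholder, and `VideoProcess` comes from `src/retinaface/video_process.py`, which is imported by `script/lip_crop.py` but not reproduced in this dump.

```python
# Minimal lip-cropping sketch (assumptions: "speaker1.mp4" is a placeholder clip,
# and VideoProcess is provided by src/retinaface/video_process.py).
import torch
import torchvision

from src.retinaface.detector import LandmarksDetector
from src.retinaface.video_process import VideoProcess

landmarks_detector = LandmarksDetector(device="cuda:0")   # RetinaFace detection + FAN landmarks
video_process = VideoProcess(convert_gray=False)          # warps/crops the mouth region per frame

video = torchvision.io.read_video("speaker1.mp4")[0].numpy()   # T x H x W x C frames
landmarks = landmarks_detector(video)                          # one landmark set per frame (or None)
lip_video = torch.tensor(video_process(video, landmarks))      # lip-cropped clip fed to the VSR/AVSR models
```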
-------------------------------------------------------------------------------- /docs/images/align.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/align.png -------------------------------------------------------------------------------- /docs/images/setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/setting.png -------------------------------------------------------------------------------- /src/ibug/face_alignment/__init__.py: -------------------------------------------------------------------------------- 1 | from .fan import FANPredictor 2 | 3 | 4 | __version__ = '0.1.0' 5 | -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/__init__.py: -------------------------------------------------------------------------------- 1 | from .retina_face_predictor import RetinaFacePredictor 2 | -------------------------------------------------------------------------------- /docs/images/origin_360.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/origin_360.png -------------------------------------------------------------------------------- /docs/images/padding_360.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/padding_360.png -------------------------------------------------------------------------------- /docs/images/central_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/central_view.png -------------------------------------------------------------------------------- /docs/images/face_linking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/face_linking.png -------------------------------------------------------------------------------- /docs/images/anno_transcript.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/anno_transcript.png -------------------------------------------------------------------------------- /docs/images/mcorec-baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/mcorec-baseline.png -------------------------------------------------------------------------------- /docs/images/mcorec_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/mcorec_overview.png -------------------------------------------------------------------------------- /src/retinaface/20words_mean_face.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/src/retinaface/20words_mean_face.npy -------------------------------------------------------------------------------- /src/tokenizer/spm/unigram/unigram5000.model: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/src/tokenizer/spm/unigram/unigram5000.model -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .head_pose_estimator import HeadPoseEstimator 2 | from .simple_face_tracker import SimpleFaceTracker 3 | -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/data/bfm_lms.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/src/ibug/face_detection/utils/data/bfm_lms.npy -------------------------------------------------------------------------------- /src/ibug/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd import S3FDPredictor 2 | from .retina_face import RetinaFacePredictor 3 | 4 | 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /docs/organizers.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Organizers 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 7 6 | --- 7 | 8 | * **Alexander Waibel**, CMU, USA & KIT, DE 9 | * **Christian Fuegen**, Meta, UK 10 | * **Shinji Watanabe**, CMU, USA 11 | * **Katerina Zmolikova**, Meta, UK 12 | * **Thai-Binh Nguyen**, KIT, DE 13 | * **Pingchuan Ma**, Meta, USA 14 | -------------------------------------------------------------------------------- /src/tokenizer/spm/spm_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 
4 | # 5 | # This source code is licensed under the license found in the 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE 7 | import sys 8 | 9 | import sentencepiece as spm 10 | 11 | 12 | if __name__ == "__main__": 13 | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.7.1 2 | torchvision==0.22.1 3 | torchaudio==2.7.1 4 | torchcodec==0.4.0 5 | sentencepiece==0.2.0 6 | transformers==4.52.4 7 | accelerate==1.7.0 8 | datasets==3.6.0 9 | six==1.17.0 10 | torch-summary==1.4.5 11 | av==12.0.0 12 | jiwer==3.0.5 13 | wandb==0.19.1 14 | python_speech_features==0.6 15 | opencv-python==4.10.0.84 16 | scikit-image==0.25.2 17 | ffmpeg-python==0.2.0 18 | scikit-learn==1.6.1 19 | webvtt-py -------------------------------------------------------------------------------- /src/dataset/transforms.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | import torchvision 3 | import torch 4 | 5 | def load_video(path): 6 | """ 7 | rtype: torch, T x C x H x W 8 | """ 9 | vid = torchvision.io.read_video(path, pts_unit="sec", output_format="THWC")[0] 10 | vid = vid.permute((0, 3, 1, 2)) 11 | return vid 12 | 13 | 14 | def load_audio(path): 15 | """ 16 | rtype: torch, T x 1 17 | """ 18 | waveform, sample_rate = torchaudio.load(path[:-4] + ".wav", normalize=True) 19 | return waveform.transpose(1, 0) 20 | -------------------------------------------------------------------------------- /src/tokenizer/spm/train.sh: -------------------------------------------------------------------------------- 1 | nbpe=5000 2 | bpemode=unigram 3 | mkdir -p ${bpemode} 4 | dict=${bpemode}/${bpemode}${nbpe}_units.txt 5 | bpemodel=${bpemode}/${bpemode}${nbpe} 6 | echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC 7 | python spm_train.py --input=input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 8 | python spm_encode.py --model=${bpemodel}.model --output_format=piece < input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} 9 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/repeat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Repeat the same layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class MultiSequential(torch.nn.Sequential): 13 | """Multi-input multi-output torch.nn.Sequential.""" 14 | 15 | def forward(self, *args): 16 | """Repeat.""" 17 | for m in self: 18 | args = m(*args) 19 | return args 20 | 21 | 22 | def repeat(N, fn): 23 | """Repeat module N times. 
24 | 25 | :param int N: repeat time 26 | :param function fn: function to generate module 27 | :return: repeated modules 28 | :rtype: MultiSequential 29 | """ 30 | return MultiSequential(*[fn() for _ in range(N)]) 31 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/layer_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Layer normalization module.""" 8 | 9 | import torch 10 | 11 | 12 | class LayerNorm(torch.nn.LayerNorm): 13 | """Layer normalization module. 14 | 15 | :param int nout: output dim size 16 | :param int dim: dimension to be normalized 17 | """ 18 | 19 | def __init__(self, nout, dim=-1): 20 | """Construct an LayerNorm object.""" 21 | super(LayerNorm, self).__init__(nout, eps=1e-12) 22 | self.dim = dim 23 | 24 | def forward(self, x): 25 | """Apply layer normalization. 26 | 27 | :param torch.Tensor x: input tensor 28 | :return: layer normalized tensor 29 | :rtype torch.Tensor 30 | """ 31 | if self.dim == -1: 32 | return super(LayerNorm, self).forward(x) 33 | return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) 34 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Positionwise feed forward layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class PositionwiseFeedForward(torch.nn.Module): 13 | """Positionwise feed forward layer. 
14 | 15 | :param int idim: input dimenstion 16 | :param int hidden_units: number of hidden units 17 | :param float dropout_rate: dropout rate 18 | 19 | """ 20 | 21 | def __init__(self, idim, hidden_units, dropout_rate): 22 | """Construct an PositionwiseFeedForward object.""" 23 | super(PositionwiseFeedForward, self).__init__() 24 | self.w_1 = torch.nn.Linear(idim, hidden_units) 25 | self.w_2 = torch.nn.Linear(hidden_units, idim) 26 | self.dropout = torch.nn.Dropout(dropout_rate) 27 | 28 | def forward(self, x): 29 | """Forward funciton.""" 30 | return self.w_2(self.dropout(torch.relu(self.w_1(x)))) 31 | -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | cfg_mnet = { 4 | 'name': 'mobilenet0.25', 5 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 6 | 'steps': [8, 16, 32], 7 | 'variance': [0.1, 0.2], 8 | 'clip': False, 9 | 'loc_weight': 2.0, 10 | 'gpu_train': True, 11 | 'batch_size': 32, 12 | 'ngpu': 1, 13 | 'epoch': 250, 14 | 'decay1': 190, 15 | 'decay2': 220, 16 | 'image_size': 640, 17 | 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, 18 | 'in_channel': 32, 19 | 'out_channel': 64 20 | } 21 | 22 | cfg_re50 = { 23 | 'name': 'Resnet50', 24 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 25 | 'steps': [8, 16, 32], 26 | 'variance': [0.1, 0.2], 27 | 'clip': False, 28 | 'loc_weight': 2.0, 29 | 'gpu_train': True, 30 | 'batch_size': 24, 31 | 'ngpu': 4, 32 | 'epoch': 100, 33 | 'decay1': 70, 34 | 'decay2': 90, 35 | 'image_size': 840, 36 | 'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3}, 37 | 'in_channel': 256, 38 | 'out_channel': 256 39 | } 40 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/add_sos_eos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Unility funcitons for Transformer.""" 8 | 9 | import torch 10 | 11 | 12 | def add_sos_eos(ys_pad, sos, eos, ignore_id): 13 | """Add and labels. 14 | 15 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 16 | :param int sos: index of 17 | :param int eos: index of 18 | :param int ignore_id: index of padding 19 | :return: padded tensor (B, Lmax) 20 | :rtype: torch.Tensor 21 | :return: padded tensor (B, Lmax) 22 | :rtype: torch.Tensor 23 | """ 24 | from src.nets.backend.nets_utils import pad_list 25 | 26 | _sos = ys_pad.new([sos]) 27 | _eos = ys_pad.new([eos]) 28 | ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys 29 | ys_in = [torch.cat([_sos, y], dim=0) for y in ys] 30 | ys_out = [torch.cat([y, _eos], dim=0) for y in ys] 31 | return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) 32 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/conv1d_extractor.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | import torch 7 | from src.nets.backend.backbones.modules.resnet1d import ( 8 | BasicBlock1D, 9 | ResNet1D, 10 | ) 11 | 12 | 13 | class Conv1dResNet(torch.nn.Module): 14 | def __init__(self, relu_type="swish", a_upsample_ratio=1): 15 | super().__init__() 16 | self.a_upsample_ratio = a_upsample_ratio 17 | self.trunk = ResNet1D( 18 | BasicBlock1D, 19 | [2, 2, 2, 2], 20 | relu_type=relu_type, 21 | a_upsample_ratio=a_upsample_ratio, 22 | ) 23 | 24 | def forward(self, xs_pad): 25 | """forward. 26 | 27 | :param xs_pad: torch.Tensor, batch of padded input sequences (B, Tmax, idim) 28 | """ 29 | B, T, C = xs_pad.size() 30 | xs_pad = xs_pad[:, : T // 640 * 640, :] 31 | xs_pad = xs_pad.transpose(1, 2) 32 | xs_pad = self.trunk(xs_pad) 33 | return xs_pad.transpose(1, 2) 34 | -------------------------------------------------------------------------------- /src/talking_detector/Classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class BGRU(nn.Module): 6 | def __init__(self, channel): 7 | super(BGRU, self).__init__() 8 | 9 | self.gru_forward = nn.GRU(input_size = channel, hidden_size = channel, num_layers = 1, bidirectional = False, bias = True, batch_first = True) 10 | self.gru_backward = nn.GRU(input_size = channel, hidden_size = channel, num_layers = 1, bidirectional = False, bias = True, batch_first = True) 11 | 12 | self.gelu = nn.GELU() 13 | self.__init_weight() 14 | 15 | def forward(self, x): 16 | x, _ = self.gru_forward(x) 17 | x = self.gelu(x) 18 | x = torch.flip(x, dims=[1]) 19 | x, _ = self.gru_backward(x) 20 | x = torch.flip(x, dims=[1]) 21 | x = self.gelu(x) 22 | 23 | return x 24 | 25 | def __init_weight(self): 26 | for m in self.modules(): 27 | if isinstance(m, nn.GRU): 28 | torch.nn.init.kaiming_normal_(m.weight_ih_l0) 29 | torch.nn.init.kaiming_normal_(m.weight_hh_l0) 30 | m.bias_ih_l0.data.zero_() 31 | m.bias_hh_l0.data.zero_() -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | 11 | def py_cpu_nms(dets, thresh, top_k): 12 | """Pure Python NMS baseline.""" 13 | x1 = dets[:, 0] 14 | y1 = dets[:, 1] 15 | x2 = dets[:, 2] 16 | y2 = dets[:, 3] 17 | scores = dets[:, 4] 18 | 19 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 20 | order = scores.argsort()[: -top_k - 1: -1] 21 | 22 | keep = [] 23 | while order.size > 0: 24 | i = order[0] 25 | keep.append(i) 26 | xx1 = np.maximum(x1[i], x1[order[1:]]) 27 | yy1 = np.maximum(y1[i], y1[order[1:]]) 28 | xx2 = np.minimum(x2[i], x2[order[1:]]) 29 | yy2 = np.minimum(y2[i], y2[order[1:]]) 30 | 31 | w = np.maximum(0.0, xx2 - xx1 + 1) 32 | h = np.maximum(0.0, yy2 - yy1 + 1) 33 | inter = w * h 34 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 35 | 36 | inds = np.where(ovr <= thresh)[0] 37 | order = order[inds + 1] 38 | 39 | return keep 40 | -------------------------------------------------------------------------------- 
/src/talking_detector/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class lossAV(nn.Module): 6 | def __init__(self): 7 | super(lossAV, self).__init__() 8 | self.criterion = nn.BCELoss() 9 | self.FC = nn.Linear(128, 2) 10 | 11 | def forward(self, x, labels = None, r = 1): 12 | x = x.squeeze(1) 13 | x = self.FC(x) 14 | if labels == None: 15 | predScore = x[:,1] 16 | predScore = predScore.t() 17 | predScore = predScore.view(-1).detach().cpu().numpy() 18 | return predScore 19 | else: 20 | x1 = x / r 21 | x1 = F.softmax(x1, dim = -1)[:,1] 22 | nloss = self.criterion(x1, labels.float()) 23 | predScore = F.softmax(x, dim = -1) 24 | predLabel = torch.round(F.softmax(x, dim = -1))[:,1] 25 | correctNum = (predLabel == labels).sum().float() 26 | return nloss, predScore, predLabel, correctNum 27 | 28 | 29 | class lossV(nn.Module): 30 | def __init__(self): 31 | super(lossV, self).__init__() 32 | self.criterion = nn.BCELoss() 33 | self.FC = nn.Linear(128, 2) 34 | 35 | def forward(self, x, labels, r = 1): 36 | x = x.squeeze(1) 37 | x = self.FC(x) 38 | 39 | x = x / r 40 | x = F.softmax(x, dim = -1) 41 | 42 | nloss = self.criterion(x[:,1], labels.float()) 43 | return nloss -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from itertools import product as product 3 | from math import ceil 4 | 5 | 6 | class PriorBox(object): 7 | def __init__(self, cfg, image_size=None): 8 | super(PriorBox, self).__init__() 9 | self.min_sizes = cfg['min_sizes'] 10 | self.steps = cfg['steps'] 11 | self.clip = cfg['clip'] 12 | self.image_size = image_size 13 | self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] 14 | self.name = "s" 15 | 16 | def forward(self): 17 | anchors = [] 18 | for k, f in enumerate(self.feature_maps): 19 | min_sizes = self.min_sizes[k] 20 | for i, j in product(range(f[0]), range(f[1])): 21 | for min_size in min_sizes: 22 | s_kx = min_size / self.image_size[1] 23 | s_ky = min_size / self.image_size[0] 24 | dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] 25 | dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] 26 | for cy, cx in product(dense_cy, dense_cx): 27 | anchors += [cx, cy, s_kx, s_ky] 28 | 29 | # back to torch land 30 | output = torch.Tensor(anchors).view(-1, 4) 31 | if self.clip: 32 | output.clamp_(max=1, min=0) 33 | return output 34 | -------------------------------------------------------------------------------- /src/talking_detector/Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .Classifier import BGRU 5 | from .Encoder import visual_encoder, audio_encoder 6 | 7 | class ASD_Model(nn.Module): 8 | def __init__(self): 9 | super(ASD_Model, self).__init__() 10 | 11 | self.visualEncoder = visual_encoder() 12 | self.audioEncoder = audio_encoder() 13 | self.GRU = BGRU(128) 14 | 15 | def forward_visual_frontend(self, x): 16 | B, T, W, H = x.shape 17 | x = x.view(B, 1, T, W, H) 18 | x = (x / 255 - 0.4161) / 0.1688 19 | x = self.visualEncoder(x) 20 | return x 21 | 22 | def forward_audio_frontend(self, x): 23 | x = x.unsqueeze(1).transpose(2, 3) 24 | x = self.audioEncoder(x) 25 | return x 26 | 27 | def 
forward_audio_visual_backend(self, x1, x2): 28 | x = x1 + x2 29 | x = self.GRU(x) 30 | x = torch.reshape(x, (-1, 128)) 31 | return x 32 | 33 | def forward_visual_backend(self,x): 34 | x = torch.reshape(x, (-1, 128)) 35 | return x 36 | 37 | def forward(self, audioFeature, visualFeature): 38 | audioEmbed = self.forward_audio_frontend(audioFeature) 39 | visualEmbed = self.forward_visual_frontend(visualFeature) 40 | outsAV = self.forward_audio_visual_backend(audioEmbed, visualEmbed) 41 | outsV = self.forward_visual_backend(visualEmbed) 42 | 43 | return outsAV, outsV -------------------------------------------------------------------------------- /src/retinaface/detector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | import warnings 8 | 9 | from src.ibug.face_alignment import FANPredictor 10 | from src.ibug.face_detection import RetinaFacePredictor 11 | from tqdm import tqdm 12 | 13 | warnings.filterwarnings("ignore") 14 | 15 | 16 | class LandmarksDetector: 17 | def __init__(self, device="cuda:0", model_name="resnet50"): 18 | self.face_detector = RetinaFacePredictor( 19 | device=device, 20 | threshold=0.8, 21 | model=RetinaFacePredictor.get_model(model_name), 22 | ) 23 | self.landmark_detector = FANPredictor(device=device, model=None) 24 | 25 | def __call__(self, video_frames): 26 | landmarks = [] 27 | for frame in tqdm(video_frames, desc="Detecting landmarks"): 28 | detected_faces = self.face_detector(frame, rgb=False) 29 | face_points, _ = self.landmark_detector(frame, detected_faces, rgb=True) 30 | if len(detected_faces) == 0: 31 | landmarks.append(None) 32 | else: 33 | max_id, max_size = 0, 0 34 | for idx, bbox in enumerate(detected_faces): 35 | bbox_size = (bbox[2] - bbox[0]) + (bbox[3] - bbox[1]) 36 | if bbox_size > max_size: 37 | max_id, max_size = idx, bbox_size 38 | landmarks.append(face_points[max_id]) 39 | return landmarks 40 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/conv3d_extractor.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | import torch 8 | import torch.nn as nn 9 | from src.nets.backend.backbones.modules.resnet import BasicBlock, ResNet 10 | from src.nets.backend.transformer.convolution import Swish 11 | 12 | 13 | def threeD_to_2D_tensor(x): 14 | n_batch, n_channels, s_time, sx, sy = x.shape 15 | x = x.transpose(1, 2) 16 | return x.reshape(n_batch * s_time, n_channels, sx, sy) 17 | 18 | 19 | class Conv3dResNet(torch.nn.Module): 20 | """Conv3dResNet module""" 21 | 22 | def __init__(self, backbone_type="resnet", relu_type="swish"): 23 | """__init__. 24 | 25 | :param backbone_type: str, the type of a visual front-end. 26 | :param relu_type: str, activation function used in an audio front-end. 
27 | """ 28 | super(Conv3dResNet, self).__init__() 29 | self.frontend_nout = 64 30 | self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type) 31 | self.frontend3D = nn.Sequential( 32 | nn.Conv3d( 33 | 1, self.frontend_nout, (5, 7, 7), (1, 2, 2), (2, 3, 3), bias=False 34 | ), 35 | nn.BatchNorm3d(self.frontend_nout), 36 | Swish(), 37 | nn.MaxPool3d((1, 3, 3), (1, 2, 2), (0, 1, 1)), 38 | ) 39 | 40 | def forward(self, xs_pad): 41 | xs_pad = xs_pad.transpose(1, 2) # [B, T, C, H, W] -> [B, C, T, H, W] 42 | 43 | B, C, T, H, W = xs_pad.size() 44 | xs_pad = self.frontend3D(xs_pad) 45 | Tnew = xs_pad.shape[2] 46 | xs_pad = threeD_to_2D_tensor(xs_pad) 47 | xs_pad = self.trunk(xs_pad) 48 | return xs_pad.view(B, Tnew, xs_pad.size(1)) 49 | -------------------------------------------------------------------------------- /src/tokenizer/spm_tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import sentencepiece 4 | 5 | SP_MODEL_PATH = os.path.join( 6 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 7 | "tokenizer", 8 | "spm", 9 | "unigram", 10 | "unigram5000.model", 11 | ) 12 | 13 | DICT_PATH = os.path.join( 14 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 15 | "tokenizer", 16 | "spm", 17 | "unigram", 18 | "unigram5000_units.txt", 19 | ) 20 | 21 | 22 | class TextTransform: 23 | """Mapping Dictionary Class for SentencePiece tokenization.""" 24 | 25 | def __init__( 26 | self, 27 | sp_model_path=SP_MODEL_PATH, 28 | dict_path=DICT_PATH, 29 | ): 30 | 31 | # Load SentencePiece model 32 | self.spm = sentencepiece.SentencePieceProcessor(model_file=sp_model_path) 33 | 34 | # Load units and create dictionary 35 | units = open(dict_path, encoding='utf8').read().splitlines() 36 | self.hashmap = {unit.split()[0]: unit.split()[-1] for unit in units} 37 | # 0 will be used for "blank" in CTC 38 | self.token_list = [""] + list(self.hashmap.keys()) + [""] 39 | self.ignore_id = -1 40 | 41 | def tokenize(self, text): 42 | tokens = self.spm.EncodeAsPieces(text) 43 | token_ids = [self.hashmap.get(token, self.hashmap[""]) for token in tokens] 44 | return torch.tensor(list(map(int, token_ids))) 45 | 46 | def post_process(self, token_ids): 47 | token_ids = token_ids[token_ids != -1] 48 | text = self._ids_to_str(token_ids, self.token_list) 49 | text = text.replace("\u2581", " ").strip() 50 | return text 51 | 52 | def _ids_to_str(self, token_ids, char_list): 53 | token_as_list = [char_list[idx] for idx in token_ids] 54 | return "".join(token_as_list).replace("", " ") 55 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/mask.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Shigeki Karita 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | """Mask module.""" 7 | 8 | from distutils.version import LooseVersion 9 | 10 | import torch 11 | 12 | is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2.0") 13 | # LooseVersion('1.2.0') == LooseVersion(torch.__version__) can't include e.g. 1.2.0+aaa 14 | is_torch_1_2 = ( 15 | LooseVersion("1.3") > LooseVersion(torch.__version__) >= LooseVersion("1.2") 16 | ) 17 | datatype = torch.bool if is_torch_1_2_plus else torch.uint8 18 | 19 | 20 | def subsequent_mask(size, device="cpu", dtype=datatype): 21 | """Create mask for subsequent steps (1, size, size). 
22 | 23 | :param int size: size of mask 24 | :param str device: "cpu" or "cuda" or torch.Tensor.device 25 | :param torch.dtype dtype: result dtype 26 | :rtype: torch.Tensor 27 | >>> subsequent_mask(3) 28 | [[1, 0, 0], 29 | [1, 1, 0], 30 | [1, 1, 1]] 31 | """ 32 | if is_torch_1_2 and dtype == torch.bool: 33 | # torch=1.2 doesn't support tril for bool tensor 34 | ret = torch.ones(size, size, device=device, dtype=torch.uint8) 35 | return torch.tril(ret, out=ret).type(dtype) 36 | else: 37 | ret = torch.ones(size, size, device=device, dtype=dtype) 38 | return torch.tril(ret, out=ret) 39 | 40 | 41 | def target_mask(ys_in_pad, ignore_id): 42 | """Create mask for decoder self-attention. 43 | 44 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 45 | :param int ignore_id: index of padding 46 | :param torch.dtype dtype: result dtype 47 | :rtype: torch.Tensor 48 | """ 49 | ys_mask = ys_in_pad != ignore_id 50 | m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0) 51 | return ys_mask.unsqueeze(-2) & m 52 | -------------------------------------------------------------------------------- /src/avhubert_muavic/avhubert2text.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import Speech2TextModel, Speech2TextForConditionalGeneration 3 | from .avhubert import AVHubertModel 4 | from .av_transformer_decoder import AVTransformerDecoder 5 | from .av2text_config import AV2TextConfig 6 | import torch 7 | from typing import Optional 8 | from transformers.generation.utils import Cache 9 | 10 | 11 | class AV2TextModel(Speech2TextModel): 12 | def __init__(self, config): 13 | super().__init__(config) 14 | self.encoder = AVHubertModel(config) 15 | self.decoder = AVTransformerDecoder(config) 16 | 17 | self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False) 18 | self.lm_head.weight = self.decoder.embed_tokens.weight 19 | 20 | class AV2TextForConditionalGeneration(Speech2TextForConditionalGeneration): 21 | config_class = AV2TextConfig 22 | def __init__(self, config): 23 | super().__init__(config) 24 | self.model = AV2TextModel(config) 25 | self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False) 26 | self.lm_head.weight = self.model.decoder.embed_tokens.weight 27 | 28 | def prepare_inputs_for_generation( 29 | self, 30 | input_ids: torch.LongTensor, 31 | past_key_values: Optional[Cache] = None, 32 | attention_mask: Optional[torch.LongTensor] = None, 33 | inputs_embeds: Optional[torch.FloatTensor] = None, 34 | cache_position: Optional[torch.LongTensor] = None, 35 | **kwargs, 36 | ): 37 | model_inputs = super().prepare_inputs_for_generation( 38 | input_ids=input_ids, 39 | past_key_values=past_key_values, 40 | attention_mask=attention_mask, 41 | inputs_embeds=inputs_embeds, 42 | cache_position=cache_position, 43 | **kwargs 44 | ) 45 | del model_inputs["video"] 46 | return model_inputs -------------------------------------------------------------------------------- /src/nets/scorers/length_bonus.py: -------------------------------------------------------------------------------- 1 | """Length bonus module.""" 2 | from typing import Any, List, Tuple 3 | 4 | import torch 5 | 6 | from src.nets.scorer_interface import BatchScorerInterface 7 | 8 | 9 | class LengthBonus(BatchScorerInterface): 10 | """Length bonus in beam search.""" 11 | 12 | def __init__(self, n_vocab: int): 13 | """Initialize class. 
14 | 15 | Args: 16 | n_vocab (int): The number of tokens in vocabulary for beam search 17 | 18 | """ 19 | self.n = n_vocab 20 | 21 | def score(self, y, state, x): 22 | """Score new token. 23 | 24 | Args: 25 | y (torch.Tensor): 1D torch.int64 prefix tokens. 26 | state: Scorer state for prefix tokens 27 | x (torch.Tensor): 2D encoder feature that generates ys. 28 | 29 | Returns: 30 | tuple[torch.Tensor, Any]: Tuple of 31 | torch.float32 scores for next token (n_vocab) 32 | and None 33 | 34 | """ 35 | return torch.tensor([1.0], device=x.device, dtype=x.dtype).expand(self.n), None 36 | 37 | def batch_score( 38 | self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor 39 | ) -> Tuple[torch.Tensor, List[Any]]: 40 | """Score new token batch. 41 | 42 | Args: 43 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 44 | states (List[Any]): Scorer states for prefix tokens. 45 | xs (torch.Tensor): 46 | The encoder feature that generates ys (n_batch, xlen, n_feat). 47 | 48 | Returns: 49 | tuple[torch.Tensor, List[Any]]: Tuple of 50 | batchfied scores for next token with shape of `(n_batch, n_vocab)` 51 | and next state list for ys. 52 | 53 | """ 54 | return ( 55 | torch.tensor([1.0], device=xs.device, dtype=xs.dtype).expand( 56 | ys.shape[0], self.n 57 | ), 58 | None, 59 | ) 60 | -------------------------------------------------------------------------------- /src/auto_asr/configuration_asr.py: -------------------------------------------------------------------------------- 1 | from transformers.configuration_utils import PretrainedConfig 2 | 3 | class AutoASRConfig(PretrainedConfig): 4 | model_type = "auto_asr" 5 | 6 | def __init__( 7 | self, 8 | odim=5049, 9 | adim=768, 10 | aheads=12, 11 | eunits=3072, 12 | elayers=12, 13 | transformer_input_layer="conv1d", 14 | dropout_rate=0.1, 15 | transformer_attn_dropout_rate=0.1, 16 | transformer_encoder_attn_layer_type="rel_mha", 17 | macaron_style=True, 18 | use_cnn_module=True, 19 | cnn_module_kernel=31, 20 | zero_triu=False, 21 | a_upsample_ratio=1, 22 | relu_type="swish", 23 | ddim=768, 24 | dheads=12, 25 | dunits=3072, 26 | dlayers=6, 27 | lsm_weight=0.1, 28 | transformer_length_normalized_loss=False, 29 | mtlalpha=0.1, 30 | ctc_type="builtin", 31 | rel_pos_type="latest", 32 | **kwargs, 33 | ): 34 | super().__init__(**kwargs) 35 | self.odim = odim 36 | self.adim = adim 37 | self.aheads = aheads 38 | self.eunits = eunits 39 | self.elayers = elayers 40 | self.transformer_input_layer = transformer_input_layer 41 | self.dropout_rate = dropout_rate 42 | self.transformer_attn_dropout_rate = transformer_attn_dropout_rate 43 | self.transformer_encoder_attn_layer_type = transformer_encoder_attn_layer_type 44 | self.macaron_style = macaron_style 45 | self.use_cnn_module = use_cnn_module 46 | self.cnn_module_kernel = cnn_module_kernel 47 | self.zero_triu = zero_triu 48 | self.a_upsample_ratio = a_upsample_ratio 49 | self.relu_type = relu_type 50 | self.ddim = ddim 51 | self.dheads = dheads 52 | self.dunits = dunits 53 | self.dlayers = dlayers 54 | self.lsm_weight = lsm_weight 55 | self.transformer_length_normalized_loss = transformer_length_normalized_loss 56 | self.mtlalpha = mtlalpha 57 | self.ctc_type = ctc_type 58 | self.rel_pos_type = rel_pos_type 59 | 60 | -------------------------------------------------------------------------------- /src/auto_vsr/configuration_vsr.py: -------------------------------------------------------------------------------- 1 | from transformers.configuration_utils import PretrainedConfig 2 | 3 | class 
AutoVSRConfig(PretrainedConfig): 4 | model_type = "auto_vsr" 5 | 6 | def __init__( 7 | self, 8 | odim=5049, 9 | adim=768, 10 | aheads=12, 11 | eunits=3072, 12 | elayers=12, 13 | transformer_input_layer="conv3d", 14 | dropout_rate=0.1, 15 | transformer_attn_dropout_rate=0.1, 16 | transformer_encoder_attn_layer_type="rel_mha", 17 | macaron_style=True, 18 | use_cnn_module=True, 19 | cnn_module_kernel=31, 20 | zero_triu=False, 21 | a_upsample_ratio=1, 22 | relu_type="swish", 23 | ddim=768, 24 | dheads=12, 25 | dunits=3072, 26 | dlayers=6, 27 | lsm_weight=0.1, 28 | transformer_length_normalized_loss=False, 29 | mtlalpha=0.1, 30 | ctc_type="builtin", 31 | rel_pos_type="latest", 32 | **kwargs, 33 | ): 34 | super().__init__(**kwargs) 35 | self.odim = odim 36 | self.adim = adim 37 | self.aheads = aheads 38 | self.eunits = eunits 39 | self.elayers = elayers 40 | self.transformer_input_layer = transformer_input_layer 41 | self.dropout_rate = dropout_rate 42 | self.transformer_attn_dropout_rate = transformer_attn_dropout_rate 43 | self.transformer_encoder_attn_layer_type = transformer_encoder_attn_layer_type 44 | self.macaron_style = macaron_style 45 | self.use_cnn_module = use_cnn_module 46 | self.cnn_module_kernel = cnn_module_kernel 47 | self.zero_triu = zero_triu 48 | self.a_upsample_ratio = a_upsample_ratio 49 | self.relu_type = relu_type 50 | self.ddim = ddim 51 | self.dheads = dheads 52 | self.dunits = dunits 53 | self.dlayers = dlayers 54 | self.lsm_weight = lsm_weight 55 | self.transformer_length_normalized_loss = transformer_length_normalized_loss 56 | self.mtlalpha = mtlalpha 57 | self.ctc_type = ctc_type 58 | self.rel_pos_type = rel_pos_type 59 | 60 | -------------------------------------------------------------------------------- /src/auto_asr/asr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.auto_asr.configuration_asr import AutoASRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | 9 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=10): 10 | scorers = { 11 | "decoder": model.decoder, 12 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 13 | "length_bonus": LengthBonus(len(token_list)), 14 | "lm": None 15 | } 16 | 17 | weights = { 18 | "decoder": 1.0 - ctc_weight, 19 | "ctc": ctc_weight, 20 | "lm": 0.0, 21 | "length_bonus": 0.0, 22 | } 23 | 24 | return BatchBeamSearch( 25 | beam_size=beam_size, 26 | vocab_size=len(token_list), 27 | weights=weights, 28 | scorers=scorers, 29 | sos=model.sos, 30 | eos=model.eos, 31 | token_list=token_list, 32 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 33 | ) 34 | 35 | 36 | class AutoASR(PreTrainedModel): 37 | config_class = AutoASRConfig 38 | 39 | def __init__(self, config: AutoASRConfig): 40 | super().__init__(config) 41 | self.asr = E2E(config) 42 | 43 | def forward(self, 44 | video, audio, video_lengths, audio_lengths, label 45 | ): 46 | return self.asr(video, audio, video_lengths, audio_lengths, label) 47 | 48 | def inference(self, sample, text_transform): 49 | 50 | self.beam_search = get_beam_search_decoder(self.asr, text_transform.token_list) 51 | enc_feat, _ = self.asr.encoder(sample.unsqueeze(0).to(self.device), None) 52 | enc_feat = enc_feat.squeeze(0) 53 | 54 | nbest_hyps = 
self.beam_search(enc_feat) 55 | nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 56 | predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 57 | predicted = text_transform.post_process(predicted_token_id).replace("", "") 58 | return predicted -------------------------------------------------------------------------------- /src/nets/backend/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Label smoothing module.""" 8 | 9 | import torch 10 | from torch import nn 11 | 12 | 13 | class LabelSmoothingLoss(nn.Module): 14 | """Label-smoothing loss. 15 | 16 | :param int size: the number of class 17 | :param int padding_idx: ignored class id 18 | :param float smoothing: smoothing rate (0.0 means the conventional CE) 19 | :param bool normalize_length: normalize loss by sequence length if True 20 | :param torch.nn.Module criterion: loss function to be smoothed 21 | """ 22 | 23 | def __init__( 24 | self, 25 | size, 26 | padding_idx, 27 | smoothing, 28 | normalize_length=False, 29 | criterion=nn.KLDivLoss(reduction="none"), 30 | ): 31 | """Construct an LabelSmoothingLoss object.""" 32 | super(LabelSmoothingLoss, self).__init__() 33 | self.criterion = criterion 34 | self.padding_idx = padding_idx 35 | self.confidence = 1.0 - smoothing 36 | self.smoothing = smoothing 37 | self.size = size 38 | self.true_dist = None 39 | self.normalize_length = normalize_length 40 | 41 | def forward(self, x, target): 42 | """Compute loss between x and target. 43 | 44 | :param torch.Tensor x: prediction (batch, seqlen, class) 45 | :param torch.Tensor target: 46 | target signal masked with self.padding_id (batch, seqlen) 47 | :return: scalar float value 48 | :rtype torch.Tensor 49 | """ 50 | assert x.size(2) == self.size 51 | batch_size = x.size(0) 52 | x = x.view(-1, self.size) 53 | target = target.view(-1) 54 | with torch.no_grad(): 55 | true_dist = x.clone() 56 | true_dist.fill_(self.smoothing / (self.size - 1)) 57 | ignore = target == self.padding_idx # (B,) 58 | total = len(target) - ignore.sum().item() 59 | target = target.masked_fill(ignore, 0) # avoid -1 index 60 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 61 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 62 | denom = total if self.normalize_length else batch_size 63 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 64 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | 8 | """ConvolutionModule definition.""" 9 | 10 | import torch 11 | from torch import nn 12 | 13 | 14 | class ConvolutionModule(nn.Module): 15 | """ConvolutionModule in Conformer model. 
16 | 17 | :param int channels: channels of cnn 18 | :param int kernel_size: kernerl size of cnn 19 | 20 | """ 21 | 22 | def __init__(self, channels, kernel_size, bias=True): 23 | """Construct an ConvolutionModule object.""" 24 | super(ConvolutionModule, self).__init__() 25 | # kernerl_size should be a odd number for 'SAME' padding 26 | assert (kernel_size - 1) % 2 == 0 27 | 28 | self.pointwise_cov1 = nn.Conv1d( 29 | channels, 30 | 2 * channels, 31 | kernel_size=1, 32 | stride=1, 33 | padding=0, 34 | bias=bias, 35 | ) 36 | self.depthwise_conv = nn.Conv1d( 37 | channels, 38 | channels, 39 | kernel_size, 40 | stride=1, 41 | padding=(kernel_size - 1) // 2, 42 | groups=channels, 43 | bias=bias, 44 | ) 45 | self.norm = nn.BatchNorm1d(channels) 46 | self.pointwise_cov2 = nn.Conv1d( 47 | channels, 48 | channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0, 52 | bias=bias, 53 | ) 54 | self.activation = Swish() 55 | 56 | def forward(self, x): 57 | """Compute covolution module. 58 | 59 | :param torch.Tensor x: (batch, time, size) 60 | :return torch.Tensor: convoluted `value` (batch, time, d_model) 61 | """ 62 | # exchange the temporal dimension and the feature dimension 63 | x = x.transpose(1, 2) 64 | 65 | # GLU mechanism 66 | x = self.pointwise_cov1(x) # (batch, 2*channel, dim) 67 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 68 | 69 | # 1D Depthwise Conv 70 | x = self.depthwise_conv(x) 71 | x = self.activation(self.norm(x)) 72 | 73 | x = self.pointwise_cov2(x) 74 | 75 | return x.transpose(1, 2) 76 | 77 | 78 | class Swish(nn.Module): 79 | """Construct an Swish object.""" 80 | 81 | def forward(self, x): 82 | """Return Swich activation function.""" 83 | return x * torch.sigmoid(x) 84 | -------------------------------------------------------------------------------- /src/auto_vsr/vsr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.auto_vsr.configuration_vsr import AutoVSRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | from transformers.utils import ModelOutput 9 | from typing import List, Optional, Union 10 | from dataclasses import dataclass 11 | 12 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=3): 13 | scorers = { 14 | "decoder": model.decoder, 15 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 16 | "length_bonus": LengthBonus(len(token_list)), 17 | "lm": None 18 | } 19 | 20 | weights = { 21 | "decoder": 1.0 - ctc_weight, 22 | "ctc": ctc_weight, 23 | "lm": 0.0, 24 | "length_bonus": 0.0, 25 | } 26 | 27 | return BatchBeamSearch( 28 | beam_size=beam_size, 29 | vocab_size=len(token_list), 30 | weights=weights, 31 | scorers=scorers, 32 | sos=model.sos, 33 | eos=model.eos, 34 | token_list=token_list, 35 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 36 | ) 37 | 38 | @dataclass 39 | class AutoVSROutput(ModelOutput): 40 | loss: Optional[torch.FloatTensor] = None 41 | loss_ctc: Optional[torch.FloatTensor] = None 42 | loss_att: Optional[torch.FloatTensor] = None 43 | acc: Optional[torch.FloatTensor] = None 44 | 45 | class AutoVSR(PreTrainedModel): 46 | config_class = AutoVSRConfig 47 | 48 | def __init__(self, config: AutoVSRConfig): 49 | super().__init__(config) 50 | self.vsr = E2E(config) 51 | 52 | def forward(self, 53 | videos, 54 
| audios, 55 | labels, 56 | video_lengths, 57 | audio_lengths, 58 | label_lengths 59 | ): 60 | loss, loss_ctc, loss_att, acc = self.vsr(videos, video_lengths, labels) 61 | return AutoVSROutput( 62 | loss=loss, 63 | loss_ctc=loss_ctc, 64 | loss_att=loss_att, 65 | acc=acc 66 | ) 67 | 68 | def inference(self, sample, text_transform): 69 | 70 | self.beam_search = get_beam_search_decoder(self.vsr, text_transform.token_list) 71 | enc_feat, _ = self.vsr.encoder(sample.unsqueeze(0).to(self.device), None) 72 | enc_feat = enc_feat.squeeze(0) 73 | 74 | nbest_hyps = self.beam_search(enc_feat) 75 | nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 76 | predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 77 | predicted = text_transform.post_process(predicted_token_id).replace("", "") 78 | return predicted -------------------------------------------------------------------------------- /script/lip_crop.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | # os.environ['CUDA_VISIBLE_DEVICES'] = '0' 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from src.retinaface.detector import LandmarksDetector 5 | from src.retinaface.video_process import VideoProcess 6 | from src.retinaface.utils import save_vid_aud_txt 7 | from tqdm import tqdm 8 | import traceback 9 | import math 10 | 11 | import torch 12 | import torchaudio 13 | import torchvision 14 | import json 15 | import ffmpeg 16 | import glob 17 | 18 | # ==================== LOAD MODEL ==================== 19 | 20 | landmarks_detector = LandmarksDetector(device="cuda:0") 21 | video_process = VideoProcess(convert_gray=False) 22 | 23 | def process_video(video_path, output_dir=None): 24 | try: 25 | # Load and process audio and video 26 | audio, sample_rate = torchaudio.load(video_path, normalize=True) 27 | 28 | video = torchvision.io.read_video(video_path)[0].numpy() 29 | landmarks = landmarks_detector(video) 30 | video = video_process(video, landmarks) 31 | video = torch.tensor(video) 32 | 33 | segment_name = video_path.split("/")[-1].replace(".mp4", "_lip") 34 | if output_dir is None: 35 | output_dir = os.path.dirname(video_path) 36 | os.makedirs(output_dir, exist_ok=True) 37 | 38 | dst_vid_filename = os.path.join(output_dir, f"{segment_name}.mp4") 39 | dst_aud_filename = os.path.join(output_dir, f"{segment_name}.wav") 40 | text_filename = os.path.join(output_dir, f"{segment_name}.json") 41 | save_vid_aud_txt( 42 | dst_vid_filename, 43 | dst_aud_filename, 44 | text_filename, 45 | video, 46 | audio, 47 | json.dumps({ 48 | "path": video_path 49 | }, indent=4), 50 | video_fps=25, 51 | audio_sample_rate=16000, 52 | ) 53 | 54 | # Combine audio and video 55 | in1 = ffmpeg.input(dst_vid_filename) 56 | in2 = ffmpeg.input(dst_aud_filename) 57 | out = ffmpeg.output( 58 | in1["v"], 59 | in2["a"], 60 | dst_vid_filename[:-4] + ".av.mp4", 61 | vcodec="copy", 62 | acodec="aac", 63 | strict="experimental", 64 | loglevel="panic", 65 | ) 66 | out.run(overwrite_output=True) 67 | except Exception as e: 68 | traceback.print_exc() 69 | print(f"Error processing {video_path} segment {segment_frame[0]}-{segment_frame[-1]}") 70 | 71 | def main(): 72 | import argparse 73 | parser = argparse.ArgumentParser(description="Active Speaker Detection") 74 | parser.add_argument('--video', type=str, required=True, help='Path to input video file') 75 | opt = parser.parse_args() 76 | 77 | process_video(opt.video) 78 | 79 | 80 | if __name__ == "__main__": 81 | 
main() -------------------------------------------------------------------------------- /docs/data_preparation.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Data Collection 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 2 6 | --- 7 | 8 | ## Data collection 9 | 10 | ### Topics and Speakers 11 | 12 | - Conversation Topics: Everyday life & hobbies, work & school, hypotheticals, entertainment, news, and personal stories. 13 | 14 | - Participants per Session: 2 to 8 speakers, divided into groups of 2–4 participants. 15 | 16 | ### Recording Devices 17 | 18 | - GoPro Max 360 (4K resolution) 19 | - Smartphones (720p resolution) 20 | - Lapel Microphones (connected via adapter to smartphones) 21 | 22 | ### Layout and Setup 23 | 24 | ![](images/setting.png) 25 | 26 | - Recording Environments: Data was collected across approximately 10 different rooms of varying sizes and types, including living rooms, meeting rooms, lecture halls, and other indoor spaces. 27 | 28 | - Seating Arrangement: All speakers sit around a table; distances vary by table size (around 2 to 5 meters). 29 | 30 | - Smartphone Placement: Each speaker has a smartphone in front (selfie mode) with a lapel mic clipped near the mouth. 31 | 32 | - 360° Capture: A GoPro Max mounted at the center of the table captures all participants. 33 | 34 | - Session Markers: Moderator signals start and end with a distinctive whistle. 35 | 36 | 37 | ## Annotation 38 | 39 | ### Signal Alignment 40 | 41 | To synchronize recordings from multiple devices, we use the moderator’s whistle cue in a two-step process: 42 | 43 | - Manual Annotation: Listen to each audio track and mark the start/end regions containing the whistle. 44 | 45 | ![](images/align.png) 46 | 47 | - Automatic Detection: Compute the spectral-flux onset strength envelope with librosa.onset.onset_strength, then identify the timestamp of the highest peak to pinpoint the exact whistle moment. 48 | 49 | ### Transcription Workflow 50 | 51 | High-quality audio from smartphones and lapel mics is used for transcription 52 | 53 | - Automatic Transcript: Run each clip through the Whisper-large-v2 model. 54 | 55 | - Post-Editing: Annotators use Label Studio to: 56 | - Adjust segment boundaries to isolate the target speaker’s speech. 57 | - Correct transcript text for accuracy. 58 | 59 | ![](images/anno_transcript.png) 60 | 61 | ### 360° Video Processing 62 | 63 | Frame Padding for Horizon Artifacts 64 | 65 | - When a speaker straddles the image boundary, their face can split across the frame 66 | 67 | ![](images/origin_360.png) 68 | 69 | - We resolve this by padding each frame: append 20% of the left edge to the right side, creating a continuous panorama. 70 | 71 | ![](images/padding_360.png) 72 | 73 | 74 | - Face Recognition and Linking: 75 | - Face Detection: Run the padded 360° videos through a face-recognition pipeline to extract face crops. 76 | 77 | - Manual Association: Link each face crop from the 360° feed to the corresponding smartphone video of the same speaker. 
78 | 79 | ![](images/face_linking.png) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/auto_avsr/avsr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer_av import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.auto_avsr.configuration_avsr import AutoAVSRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | from transformers.utils import ModelOutput 9 | from typing import List, Optional, Union 10 | from dataclasses import dataclass 11 | 12 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=3): 13 | scorers = { 14 | "decoder": model.decoder, 15 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 16 | "length_bonus": LengthBonus(len(token_list)), 17 | "lm": None 18 | } 19 | 20 | weights = { 21 | "decoder": 1.0 - ctc_weight, 22 | "ctc": ctc_weight, 23 | "lm": 0.0, 24 | "length_bonus": 0.0, 25 | } 26 | 27 | return BatchBeamSearch( 28 | beam_size=beam_size, 29 | vocab_size=len(token_list), 30 | weights=weights, 31 | scorers=scorers, 32 | sos=model.sos, 33 | eos=model.eos, 34 | token_list=token_list, 35 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 36 | ) 37 | 38 | @dataclass 39 | class AutoAVSROutput(ModelOutput): 40 | loss: Optional[torch.FloatTensor] = None 41 | loss_ctc: Optional[torch.FloatTensor] = None 42 | loss_att: Optional[torch.FloatTensor] = None 43 | acc: Optional[torch.FloatTensor] = None 44 | 45 | class AutoAVSR(PreTrainedModel): 46 | config_class = AutoAVSRConfig 47 | 48 | def __init__(self, config: AutoAVSRConfig): 49 | super().__init__(config) 50 | self.avsr = E2E(config) 51 | 52 | def forward(self, 53 | videos, 54 | audios, 55 | labels, 56 | video_lengths, 57 | audio_lengths, 58 | label_lengths 59 | ): 60 | loss, loss_ctc, loss_att, acc = self.avsr(videos, audios, video_lengths, audio_lengths, labels) 61 | return AutoAVSROutput( 62 | loss=loss, 63 | loss_ctc=loss_ctc, 64 | loss_att=loss_att, 65 | acc=acc 66 | ) 67 | # return self.avsr(videos, audios, video_lengths, audio_lengths, labels) 68 | 69 | 70 | 71 | def inference(self, video, audio, text_transform): 72 | self.beam_search = get_beam_search_decoder(self.avsr, text_transform.token_list) 73 | video_feat, _ = self.avsr.encoder(video.unsqueeze(0).to(self.device), None) 74 | audio_feat, _ = self.avsr.aux_encoder(audio.unsqueeze(0).to(self.device), None) 75 | audiovisual_feat = self.avsr.fusion(torch.cat((video_feat, audio_feat), dim=-1)) 76 | 77 | audiovisual_feat = audiovisual_feat.squeeze(0) 78 | 79 | nbest_hyps = self.beam_search(audiovisual_feat) 80 | nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 81 | predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 82 | predicted = text_transform.post_process(predicted_token_id).replace("", "") 83 | return predicted -------------------------------------------------------------------------------- /src/retinaface/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torchaudio 4 | import torchvision 5 | 6 | 7 | def split_file(filename, max_frames=600, fps=25.0): 8 | 9 | lines = open(filename).read().splitlines() 10 | 11 | flag = 0 12 | stack = [] 13 | res = [] 14 | 15 | tmp = 0 16 | start_timestamp = 0.0 17 | 18 | threshold = 
max_frames / fps 19 | 20 | for line in lines: 21 | if "WORD START END ASDSCORE" in line: 22 | flag = 1 23 | continue 24 | if flag: 25 | word, start, end, score = line.split(" ") 26 | start, end, score = float(start), float(end), float(score) 27 | if end < tmp + threshold: 28 | stack.append(word) 29 | last_timestamp = end 30 | else: 31 | res.append( 32 | [ 33 | " ".join(stack), 34 | start_timestamp, 35 | last_timestamp, 36 | last_timestamp - start_timestamp, 37 | ] 38 | ) 39 | tmp = start 40 | start_timestamp = start 41 | stack = [word] 42 | if stack: 43 | res.append([" ".join(stack), start_timestamp, end, end - start_timestamp]) 44 | return res 45 | 46 | 47 | def save_vid_txt( 48 | dst_vid_filename, dst_txt_filename, trim_video_data, content, video_fps=25 49 | ): 50 | # -- save video 51 | save2vid(dst_vid_filename, trim_video_data, video_fps) 52 | # -- save text 53 | os.makedirs(os.path.dirname(dst_txt_filename), exist_ok=True) 54 | f = open(dst_txt_filename, "w") 55 | f.write(f"{content}") 56 | f.close() 57 | 58 | 59 | def save_vid_aud( 60 | dst_vid_filename, 61 | dst_aud_filename, 62 | trim_vid_data, 63 | trim_aud_data, 64 | video_fps=25, 65 | audio_sample_rate=16000, 66 | ): 67 | # -- save video 68 | save2vid(dst_vid_filename, trim_vid_data, video_fps) 69 | # -- save audio 70 | save2aud(dst_aud_filename, trim_aud_data, audio_sample_rate) 71 | 72 | 73 | def save_vid_aud_txt( 74 | dst_vid_filename, 75 | dst_aud_filename, 76 | dst_txt_filename, 77 | trim_vid_data, 78 | trim_aud_data, 79 | content, 80 | video_fps=25, 81 | audio_sample_rate=16000, 82 | ): 83 | # -- save video 84 | if dst_vid_filename is not None: 85 | save2vid(dst_vid_filename, trim_vid_data, video_fps) 86 | # -- save audio 87 | if dst_aud_filename is not None: 88 | save2aud(dst_aud_filename, trim_aud_data, audio_sample_rate) 89 | # -- save text 90 | os.makedirs(os.path.dirname(dst_txt_filename), exist_ok=True) 91 | f = open(dst_txt_filename, "w") 92 | f.write(f"{content}") 93 | f.close() 94 | 95 | 96 | def save2vid(filename, vid, frames_per_second): 97 | os.makedirs(os.path.dirname(filename), exist_ok=True) 98 | torchvision.io.write_video(filename, vid, frames_per_second) 99 | 100 | 101 | def save2aud(filename, aud, sample_rate): 102 | os.makedirs(os.path.dirname(filename), exist_ok=True) 103 | torchaudio.save(filename, aud, sample_rate) 104 | -------------------------------------------------------------------------------- /src/avhubert_avsr/avhubert_avsr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_avhubert import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.avhubert_avsr.configuration_avhubert_avsr import AVHubertAVSRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | from transformers.utils import ModelOutput 9 | from typing import List, Optional, Union 10 | from dataclasses import dataclass 11 | 12 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=3): 13 | scorers = { 14 | "decoder": model.decoder, 15 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 16 | "length_bonus": LengthBonus(len(token_list)), 17 | "lm": None 18 | } 19 | 20 | weights = { 21 | "decoder": 1.0 - ctc_weight, 22 | "ctc": ctc_weight, 23 | "lm": 0.0, 24 | "length_bonus": 0.0, 25 | } 26 | 27 | return BatchBeamSearch( 28 | beam_size=beam_size, 29 | vocab_size=len(token_list), 30 
| weights=weights, 31 | scorers=scorers, 32 | sos=model.sos, 33 | eos=model.eos, 34 | token_list=token_list, 35 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 36 | ) 37 | 38 | @dataclass 39 | class AVHubertAVSROutput(ModelOutput): 40 | loss: Optional[torch.FloatTensor] = None 41 | loss_ctc: Optional[torch.FloatTensor] = None 42 | loss_att: Optional[torch.FloatTensor] = None 43 | acc: Optional[torch.FloatTensor] = None 44 | 45 | class AVHubertAVSR(PreTrainedModel): 46 | config_class = AVHubertAVSRConfig 47 | 48 | def __init__(self, config: AVHubertAVSRConfig): 49 | super().__init__(config) 50 | self.avsr = E2E(config) 51 | 52 | def forward(self, 53 | videos, 54 | audios, 55 | labels, 56 | video_lengths, 57 | audio_lengths, 58 | label_lengths 59 | ): 60 | loss, loss_ctc, loss_att, acc = self.avsr(videos, audios, video_lengths, audio_lengths, labels) 61 | return AVHubertAVSROutput( 62 | loss=loss, 63 | loss_ctc=loss_ctc, 64 | loss_att=loss_att, 65 | acc=acc 66 | ) 67 | # return self.avsr(videos, audios, video_lengths, audio_lengths, labels) 68 | 69 | 70 | 71 | # def inference(self, video, audio, text_transform): 72 | # self.beam_search = get_beam_search_decoder(self.avsr, text_transform.token_list) 73 | # video_feat, _ = self.avsr.encoder(video.unsqueeze(0).to(self.device), None) 74 | # audio_feat, _ = self.avsr.aux_encoder(audio.unsqueeze(0).to(self.device), None) 75 | # audiovisual_feat = self.avsr.fusion(torch.cat((video_feat, audio_feat), dim=-1)) 76 | 77 | # audiovisual_feat = audiovisual_feat.squeeze(0) 78 | 79 | # nbest_hyps = self.beam_search(audiovisual_feat) 80 | # nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 81 | # predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 82 | # predicted = text_transform.post_process(predicted_token_id).replace("", "") 83 | # return predicted -------------------------------------------------------------------------------- /src/ibug/face_detection/s3fd/s3fd_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from types import SimpleNamespace 5 | from typing import Union, Optional 6 | from .s3fd_net import S3FDNet 7 | 8 | 9 | __all__ = ['S3FDPredictor'] 10 | 11 | 12 | class S3FDPredictor(object): 13 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 14 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 15 | self.threshold = threshold 16 | self.device = device 17 | if model is None: 18 | model = S3FDPredictor.get_model() 19 | if config is None: 20 | config = S3FDPredictor.create_config() 21 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 22 | self.net = S3FDNet(config=self.config, device=self.device).to(self.device) 23 | self.net.load_state_dict(torch.load(model.weights, map_location=self.device)) 24 | self.net.eval() 25 | 26 | @staticmethod 27 | def get_model(name: str = 's3fd') -> SimpleNamespace: 28 | name = name.lower().strip() 29 | if name == 's3fd': 30 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 31 | '..','..','..','..','model-bin','face_detection','s3fd','weights', 's3fd_weights.pth')), 32 | config=SimpleNamespace(num_classes=2, variance=(0.1, 0.2), 33 | prior_min_sizes=(16, 32, 64, 128, 256, 512), 34 | prior_steps=(4, 8, 16, 32, 64, 128), prior_clip=False)) 35 | else: 36 | raise ValueError('name must be set to s3fd') 37 | 38 | 
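    # A minimal usage sketch (the image source is illustrative, not part of this file):
    #   detector = S3FDPredictor(threshold=0.8, device='cuda:0')
    #   frame = cv2.imread('frame.jpg')        # BGR image, so pass rgb=False below
    #   faces = detector(frame, rgb=False)     # (N, 5) array of x_min, y_min, x_max, y_max, score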
@staticmethod 39 | def create_config(top_k: int = 750, conf_thresh: float = 0.05,nms_thresh: float = 0.3, 40 | nms_top_k: int = 5000, use_nms_np: bool = True) -> SimpleNamespace: 41 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, 42 | nms_top_k=nms_top_k, use_nms_np=use_nms_np) 43 | 44 | @torch.no_grad() 45 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 46 | w, h = image.shape[1], image.shape[0] 47 | if not rgb: 48 | image = image[..., ::-1] 49 | image = image.astype(int) - np.array([123, 117, 104]) 50 | image = image.transpose(2, 0, 1) 51 | image = image.reshape((1,) + image.shape) 52 | image = torch.from_numpy(image).float().to(self.device) 53 | 54 | bboxes = [] 55 | detections = self.net(image) 56 | scale = torch.Tensor([w, h, w, h]).to(detections.device) 57 | for i in range(detections.size(1)): 58 | j = 0 59 | while detections[0, i, j, 0] >= self.threshold: 60 | score = detections[0, i, j, 0] 61 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 62 | bbox = (pt[0], pt[1], pt[2], pt[3], score) 63 | bboxes.append(bbox) 64 | j += 1 65 | if len(bboxes) > 0: 66 | return np.array(bboxes) 67 | else: 68 | return np.empty(shape=(0, 5), dtype=np.float32) 69 | -------------------------------------------------------------------------------- /src/ibug/face_alignment/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from typing import Optional, Sequence, Tuple 4 | 5 | 6 | __all__ = ['get_landmark_connectivity', 'plot_landmarks'] 7 | 8 | 9 | def get_landmark_connectivity(num_landmarks: int) -> Optional[Sequence[Tuple[int, int]]]: 10 | if num_landmarks == 68: 11 | return ((0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), 12 | (12, 13), (13, 14), (14, 15), (15, 16), (17, 18), (18, 19), (19, 20), (20, 21), (22, 23), (23, 24), 13 | (24, 25), (25, 26), (27, 28), (28, 29), (29, 30), (30, 33), (31, 32), (32, 33), (33, 34), (34, 35), 14 | (36, 37), (37, 38), (38, 39), (40, 41), (41, 36), (42, 43), (43, 44), (44, 45), (45, 46), (46, 47), 15 | (47, 42), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), 16 | (57, 58), (58, 59), (59, 48), (60, 61), (61, 62), (62, 63), (63, 64), (64, 65), (65, 66), (66, 67), 17 | (67, 60), (39, 40)) 18 | elif num_landmarks == 100: 19 | return ((0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), 20 | (12, 13), (13, 14), (14, 15), (15, 16), (17, 18), (18, 19), (19, 20), (20, 21), (22, 23), (23, 24), 21 | (24, 25), (25, 26), (68, 69), (69, 70), (70, 71), (72, 73), (73, 74), (74, 75), (36, 76), (76, 37), 22 | (37, 77), (77, 38), (38, 78), (78, 39), (39, 40), (40, 79), (79, 41), (41, 36), (42, 80), (80, 43), 23 | (43, 81), (81, 44), (44, 82), (82, 45), (45, 46), (46, 83), (83, 47), (47, 42), (27, 28), (28, 29), 24 | (29, 30), (30, 33), (31, 32), (32, 33), (33, 34), (34, 35), (84, 85), (86, 87), (48, 49), (49, 88), 25 | (88, 50), (50, 51), (51, 52), (52, 89), (89, 53), (53, 54), (54, 55), (55, 90), (90, 56), (56, 57), 26 | (57, 58), (58, 91), (91, 59), (59, 48), (60, 92), (92, 93), (93, 61), (61, 62), (62, 63), (63, 94), 27 | (94, 95), (95, 64), (64, 96), (96, 97), (97, 65), (65, 66), (66, 67), (67, 98), (98, 99), (99, 60), 28 | (17, 68), (21, 71), (22, 72), (26, 75)) 29 | else: 30 | return None 31 | 32 | 33 | def plot_landmarks(image: np.ndarray, landmarks: np.ndarray, landmark_scores: 
Optional[Sequence[float]] = None, 34 | threshold: float = 0.2, line_colour: Tuple[int, int, int] = (0, 255, 0), 35 | pts_colour: Tuple[int, int, int] = (0, 0, 255), line_thickness: int = 1, pts_radius: int = 1, 36 | landmark_connectivity: Optional[Sequence[Tuple[int, int]]] = None) -> None: 37 | num_landmarks = len(landmarks) 38 | if landmark_scores is None: 39 | landmark_scores = np.full((num_landmarks,), threshold + 1.0, dtype=float) 40 | if landmark_connectivity is None: 41 | landmark_connectivity = get_landmark_connectivity(len(landmarks)) 42 | if landmark_connectivity is not None: 43 | for (idx1, idx2) in landmark_connectivity: 44 | if (idx1 < num_landmarks and idx2 < num_landmarks and 45 | landmark_scores[idx1] >= threshold and landmark_scores[idx2] >= threshold): 46 | cv2.line(image, tuple(landmarks[idx1].astype(int).tolist()), 47 | tuple(landmarks[idx2].astype(int).tolist()), 48 | color=line_colour, thickness=line_thickness, lineType=cv2.LINE_AA) 49 | for landmark, score in zip(landmarks, landmark_scores): 50 | if score >= threshold: 51 | cv2.circle(image, tuple(landmark.astype(int).tolist()), pts_radius, pts_colour, -1) 52 | -------------------------------------------------------------------------------- /src/avhubert_muavic/av_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | from transformers.models.speech_to_text.modeling_speech_to_text import ( 2 | Speech2TextAttention, 3 | Speech2TextDecoder, 4 | Speech2TextDecoderLayer, 5 | Speech2TextConfig, 6 | ACT2FN, 7 | SPEECH_TO_TEXT_ATTENTION_CLASSES, 8 | Speech2TextDecoderLayer, 9 | ) 10 | from typing import Optional, List 11 | import torch.nn as nn 12 | from .av2text_config import AV2TextConfig 13 | 14 | class AVTransformerAttention(Speech2TextAttention): 15 | def __init__( 16 | self, 17 | input_dim: int, 18 | embed_dim: int, 19 | num_heads: int, 20 | dropout: float = 0.0, 21 | is_decoder: bool = False, 22 | bias: bool = True, 23 | is_causal: bool = False, 24 | config: Optional[Speech2TextConfig] = None, 25 | ): 26 | super().__init__( 27 | embed_dim, 28 | num_heads, 29 | dropout, 30 | is_decoder, 31 | bias, 32 | is_causal, 33 | config 34 | ) 35 | self.embed_dim = embed_dim 36 | self.num_heads = num_heads 37 | self.dropout = dropout 38 | self.head_dim = embed_dim // num_heads 39 | self.config = config 40 | 41 | if (self.head_dim * num_heads) != self.embed_dim: 42 | raise ValueError( 43 | f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" 44 | f" and `num_heads`: {num_heads})." 
45 | ) 46 | self.scaling = self.head_dim**-0.5 47 | self.is_decoder = is_decoder 48 | self.is_causal = is_causal 49 | 50 | self.k_proj = nn.Linear(input_dim, embed_dim, bias=bias) 51 | self.v_proj = nn.Linear(input_dim, embed_dim, bias=bias) 52 | self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 53 | self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 54 | 55 | 56 | 57 | class AVTransformerDecoderLayer(Speech2TextDecoderLayer): 58 | def __init__(self, config: Speech2TextConfig): 59 | super().__init__(config) 60 | self.embed_dim = config.decoder_hidden_size 61 | 62 | self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation]( 63 | input_dim=self.embed_dim, 64 | embed_dim=self.embed_dim, 65 | num_heads=config.decoder_attention_heads, 66 | dropout=config.attention_dropout, 67 | is_decoder=True, 68 | is_causal=True, 69 | config=config, 70 | ) 71 | self.dropout = config.dropout 72 | self.activation_fn = ACT2FN[config.activation_function] 73 | self.activation_dropout = config.activation_dropout 74 | 75 | self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) 76 | self.encoder_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation]( 77 | config.encoder_hidden_size, 78 | self.embed_dim, 79 | config.decoder_attention_heads, 80 | dropout=config.attention_dropout, 81 | is_decoder=True, 82 | config=config, 83 | ) 84 | self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) 85 | self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) 86 | self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) 87 | self.final_layer_norm = nn.LayerNorm(self.embed_dim) 88 | 89 | SPEECH_TO_TEXT_ATTENTION_CLASSES = {"eager": AVTransformerAttention} 90 | 91 | class AVTransformerDecoder(Speech2TextDecoder): 92 | def __init__(self, config: AV2TextConfig): 93 | super().__init__(config) 94 | self.layers = nn.ModuleList([AVTransformerDecoderLayer(config) for _ in range(config.decoder_layers)]) -------------------------------------------------------------------------------- /docs/submission.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Submission 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 6 6 | --- 7 | 8 | The submission of the systems is open until **TBU** and should be done through [Google Form - TBU](#). We allow each team to submit up to **three systems** for the challenge. For the submission, make sure to have the following ready: 9 | 10 | - Technical description paper 11 | - System outputs for development and evaluation subset 12 | 13 | ## Technical description paper 14 | 15 | For the technical description, follow the instructions for CHiME-9 challenge papers at the [workshop page](TBU). The papers should be 4 pages long with one additional page for references. Please describe all of your submitted systems and the results on development subset. Please submit your abstract before your results and note the CMT paper ID assigned to your abstract as you will need to include it in your Google form submission. 16 | 17 | ## System outputs 18 | 19 | Participants should submit a zip file containing the output files for each submitted system. The zip file should contain the following directory structure: 20 | 21 | ├── name_of_system_1 22 | │ ├── dev 23 | │ │ ├── session_id_1 24 | │ │ │ ├── speaker_to_cluster.json 25 | │ │ │ ├── spk_0.vtt 26 | │ │ │ ├── spk_1.vtt 27 | │ │ │ └── ... 
28 | │ │ ├── session_id_2 29 | │ │ │ ├── speaker_to_cluster.json 30 | │ │ │ ├── spk_0.vtt 31 | │ │ │ ├── spk_1.vtt 32 | │ │ │ └── ... 33 | │ │ └── ... 34 | │ └── eval 35 | │ ├── session_id_1 36 | │ │ ├── speaker_to_cluster.json 37 | │ │ ├── spk_0.vtt 38 | │ │ ├── spk_1.vtt 39 | │ │ └── ... 40 | │ ├── session_id_2 41 | │ └── ... 42 | ... 43 | └── name_of_system_N 44 | ├── dev 45 | └── eval 46 | 47 | - Feel free to choose any naming of the systems, but please make sure that they are consistent between all submitted archives. 48 | 49 | Each session directory contains: 50 | 51 | - `speaker_to_cluster.json`: Contains the conversation clustering assignments for all speakers in that session, following the same format as in the dataset labels 52 | - `spk_0.vtt`, `spk_1.vtt`, etc.: WebVTT files containing time-aligned transcriptions for each target speaker, following the same format as the dataset labels 53 | 54 | The file formats for system outputs should follow the same structure and format as described in the [Detailed description of data structure and formats](./data.md#detailed-desciption-of-data-structure-and-formats) section of the data documentation. 55 | 56 | ## Important Notes 57 | 58 | - **Evaluation Metrics**: The primary ranking metric is the **Joint ASR-Clustering Error Rate**, which equally weights transcription accuracy (WER) and clustering accuracy (per-speaker clustering F1) 59 | - **Clustering Requirements**: Each speaker must be assigned to exactly one conversation cluster per session. Cluster IDs can be any integer values but must be consistent within each session 60 | - **Text Normalization**: The evaluation script will automatically normalize text and remove disfluencies before computing WER 61 | - **Data Usage Compliance**: Systems must comply with the [challenge rules](./rules.md). Only approved external datasets and pre-trained models may be used 62 | - **Processing Independence**: Each evaluation recording must be processed independently. The development set cannot be used for training or parameter updates 63 | 64 | If you are unsure about how to provide any of the above files, please contact us at the [Slack](https://join.slack.com/t/chimechallenge/shared_invite/zt-37h0cfpeb-qg5jwCgqRWCKc_3mLWVsYA) channel or at [mcorecchallenge@gmail.com](mailto:mcorecchallenge@gmail.com). 65 | -------------------------------------------------------------------------------- /src/tokenizer/spm/spm_encode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 
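# A minimal invocation sketch (model and file paths are illustrative):
#   python spm_encode.py --model unigram5000.model --output_format piece \
#       --inputs train.txt --outputs train.tok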
4 | # 5 | # This source code is licensed under the license found in 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE 7 | 8 | 9 | import argparse 10 | import contextlib 11 | import sys 12 | 13 | import sentencepiece as spm 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "--model", required=True, help="sentencepiece model to use for encoding" 20 | ) 21 | parser.add_argument( 22 | "--inputs", nargs="+", default=["-"], help="input files to filter/encode" 23 | ) 24 | parser.add_argument( 25 | "--outputs", nargs="+", default=["-"], help="path to save encoded outputs" 26 | ) 27 | parser.add_argument("--output_format", choices=["piece", "id"], default="piece") 28 | parser.add_argument( 29 | "--min-len", 30 | type=int, 31 | metavar="N", 32 | help="filter sentence pairs with fewer than N tokens", 33 | ) 34 | parser.add_argument( 35 | "--max-len", 36 | type=int, 37 | metavar="N", 38 | help="filter sentence pairs with more than N tokens", 39 | ) 40 | args = parser.parse_args() 41 | 42 | assert len(args.inputs) == len( 43 | args.outputs 44 | ), "number of input and output paths should match" 45 | 46 | sp = spm.SentencePieceProcessor() 47 | sp.Load(args.model) 48 | 49 | if args.output_format == "piece": 50 | 51 | def encode(l): 52 | return sp.EncodeAsPieces(l) 53 | 54 | elif args.output_format == "id": 55 | 56 | def encode(l): 57 | return list(map(str, sp.EncodeAsIds(l))) 58 | 59 | else: 60 | raise NotImplementedError 61 | 62 | if args.min_len is not None or args.max_len is not None: 63 | 64 | def valid(line): 65 | return (args.min_len is None or len(line) >= args.min_len) and ( 66 | args.max_len is None or len(line) <= args.max_len 67 | ) 68 | 69 | else: 70 | 71 | def valid(lines): 72 | return True 73 | 74 | with contextlib.ExitStack() as stack: 75 | inputs = [ 76 | stack.enter_context(open(input, "r", encoding="utf-8")) 77 | if input != "-" 78 | else sys.stdin 79 | for input in args.inputs 80 | ] 81 | outputs = [ 82 | stack.enter_context(open(output, "w", encoding="utf-8")) 83 | if output != "-" 84 | else sys.stdout 85 | for output in args.outputs 86 | ] 87 | 88 | stats = { 89 | "num_empty": 0, 90 | "num_filtered": 0, 91 | } 92 | 93 | def encode_line(line): 94 | line = line.strip() 95 | if len(line) > 0: 96 | line = encode(line) 97 | if valid(line): 98 | return line 99 | else: 100 | stats["num_filtered"] += 1 101 | else: 102 | stats["num_empty"] += 1 103 | return None 104 | 105 | for i, lines in enumerate(zip(*inputs), start=1): 106 | enc_lines = list(map(encode_line, lines)) 107 | if not any(enc_line is None for enc_line in enc_lines): 108 | for enc_line, output_h in zip(enc_lines, outputs): 109 | print(" ".join(enc_line), file=output_h) 110 | if i % 10000 == 0: 111 | print("processed {} lines".format(i), file=sys.stderr) 112 | 113 | print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) 114 | print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /src/cluster/eval.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, Tuple, Dict 3 | from sklearn.metrics import adjusted_rand_score 4 | 5 | def pairwise_f1_score(true_labels: List[int], pred_labels: List[int]) -> float: 6 | """ 7 | Compute the pairwise F1 score for clustering evaluation. 
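    Every unordered pair of speakers is counted as a true positive when both the
    ground truth and the prediction place the two speakers in the same cluster, a
    false positive when only the prediction does, and a false negative when only the
    ground truth does; F1 is then computed over these pairwise decisions. For example,
    true_labels=[0, 0, 1, 1] and pred_labels=[0, 0, 2, 2] score 1.0, because the same
    pairs are grouped together even though the cluster IDs differ.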
8 | 9 | Args: 10 | true_labels (List[int]): Ground truth cluster labels. 11 | pred_labels (List[int]): Predicted cluster labels. 12 | 13 | Returns: 14 | float: Pairwise F1 score. 15 | """ 16 | # Generate all unique unordered pairs of indices 17 | pairs = list(itertools.combinations(range(len(true_labels)), 2)) 18 | 19 | # Initialize counts 20 | tp = fp = fn = 0 21 | 22 | for i, j in pairs: 23 | # True same-cluster? 24 | true_same = (true_labels[i] == true_labels[j]) 25 | # Predicted same-cluster? 26 | pred_same = (pred_labels[i] == pred_labels[j]) 27 | 28 | if pred_same and true_same: 29 | tp += 1 30 | elif pred_same and not true_same: 31 | fp += 1 32 | elif not pred_same and true_same: 33 | fn += 1 34 | # True negatives (not same in both) are not used in F1 35 | # print(tp, fp, fn) 36 | # Handle edge cases 37 | if tp == 0: 38 | return 0.0 39 | 40 | precision = tp / (tp + fp) 41 | recall = tp / (tp + fn) 42 | f1 = 2 * precision * recall / (precision + recall) 43 | 44 | return f1 45 | 46 | def pairwise_f1_score_per_speaker(true_labels: List[int], pred_labels: List[int]) -> Dict[int, float]: 47 | """ 48 | Compute the pairwise F1 score for each speaker (one-vs-rest style) in clustering evaluation. 49 | 50 | Args: 51 | true_labels (List[int]): Ground truth cluster labels. 52 | pred_labels (List[int]): Predicted cluster labels. 53 | 54 | Returns: 55 | Dict[int, float]: Mapping from speaker index to their pairwise F1 score. 56 | """ 57 | n = len(true_labels) 58 | scores = {} 59 | 60 | for i in range(n): 61 | tp = fp = fn = 0 62 | for j in range(n): 63 | if i == j: 64 | continue 65 | 66 | # True and predicted same-cluster relationships between i and j 67 | true_same = (true_labels[i] == true_labels[j]) 68 | pred_same = (pred_labels[i] == pred_labels[j]) 69 | 70 | if pred_same and true_same: 71 | tp += 1 72 | elif pred_same and not true_same: 73 | fp += 1 74 | elif not pred_same and true_same: 75 | fn += 1 76 | 77 | # Compute F1 for this speaker 78 | if tp == 0: 79 | f1 = 0.0 80 | else: 81 | precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 82 | recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 83 | f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 84 | 85 | scores[i] = f1 86 | 87 | return scores 88 | 89 | if __name__ == "__main__": 90 | # Example usage 91 | examples: List[Tuple[List[int], List[int]]] = [ 92 | ([0, 0, 1, 1], [0, 0, 2, 2]), 93 | ([0, 0, 1, 1], [1, 1, 0, 0]), 94 | ([0, 0, 1, 2], [0, 0, 1, 1]), 95 | ([0, 0, 0, 0], [0, 1, 2, 3]), 96 | ([0, 0, 1, 1], [0, 1, 0, 1]), 97 | ([1, 1, 0, 0], [0, 0, 0, 0]), 98 | ([0, 0, 0, 0], [1, 1, 0, 0]), 99 | ([0, 0, 0, 0, 1, 2], [1, 1, 0, 0, 2, 2]), 100 | ([0, 0, 1, 1, 2, 2], [0, 0, 0, 1, 1, 1]) 101 | ] 102 | 103 | # Compute and display results 104 | results = [(true, pred, pairwise_f1_score(true, pred), adjusted_rand_score(true, pred)) for true, pred in examples] 105 | for true, pred, f1, ari in results: 106 | print(f"True: {true}, Pred: {pred}, F1: {f1}, ARI: {ari}") 107 | 108 | # Compute per-speaker F1 scores 109 | for true, pred in examples: 110 | per_speaker_f1 = pairwise_f1_score_per_speaker(true, pred) 111 | print(f"True: {true}, Pred: {pred}, Per-Speaker F1: {per_speaker_f1}") -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/head_pose_estimator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import math 4 | import numpy as np 5 | from typing import Optional, Tuple 6 | 7 | 8 | 
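# A minimal usage sketch (the landmark source is illustrative): given 68-point facial
# landmarks and the frame size, the estimator returns (pitch, yaw, roll) in degrees.
#   estimator = HeadPoseEstimator()
#   pitch, yaw, roll = estimator(landmarks, image_width=frame_w, image_height=frame_h)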
__all__ = ['HeadPoseEstimator'] 9 | 10 | 11 | class HeadPoseEstimator(object): 12 | def __init__(self, mean_shape_path: str = os.path.join(os.path.dirname(__file__), 13 | 'data', 'bfm_lms.npy')) -> None: 14 | # Load the 68-point mean shape derived from BFM 15 | mean_shape = np.load(mean_shape_path) 16 | 17 | # Calculate the 5-points mean shape 18 | left_eye = mean_shape[[37, 38, 40, 41]].mean(axis=0) 19 | right_eye = mean_shape[[43, 44, 46, 47]].mean(axis=0) 20 | self._mean_shape_5pts = np.vstack((left_eye, right_eye, mean_shape[[30, 48, 54]])) 21 | 22 | # Flip the y coordinates of the mean shape to match that of the image coordinate system 23 | self._mean_shape_5pts[:, 1] = -self._mean_shape_5pts[:, 1] 24 | 25 | def __call__(self, landmarks: np.ndarray, image_width: int = 0, image_height: int = 0, 26 | camera_matrix: Optional[np.ndarray] = None, dist_coeffs: Optional[np.ndarray] = None, 27 | output_preference: int = 0) -> Tuple[float, float, float]: 28 | # Form the camera matrix 29 | if camera_matrix is None: 30 | if image_width <= 0 or image_height <= 0: 31 | raise ValueError( 32 | 'image_width and image_height must be specified when camera_matrix is not given directly') 33 | else: 34 | camera_matrix = np.array([[image_width + image_height, 0, image_width / 2.0], 35 | [0, image_width + image_height, image_height / 2.0], 36 | [0, 0, 1]], dtype=float) 37 | 38 | # Prepare the landmarks 39 | if landmarks.shape[0] == 68: 40 | landmarks = landmarks[17:] 41 | if landmarks.shape[0] in [49, 51]: 42 | left_eye = landmarks[[20, 21, 23, 24]].mean(axis=0) 43 | right_eye = landmarks[[26, 27, 29, 30]].mean(axis=0) 44 | landmarks = np.vstack((left_eye, right_eye, landmarks[[13, 31, 37]])) 45 | 46 | # Use EPnP to estimate pitch, yaw, and roll 47 | _, rvec, _ = cv2.solvePnP(self._mean_shape_5pts, np.expand_dims(landmarks, axis=1), 48 | camera_matrix, dist_coeffs, flags=cv2.SOLVEPNP_EPNP) 49 | rot_mat, _ = cv2.Rodrigues(rvec) 50 | if 1.0 + rot_mat[2, 0] < 1e-9: 51 | pitch = 0.0 52 | yaw = 90.0 53 | roll = -math.atan2(rot_mat[0, 1], rot_mat[0, 2]) / math.pi * 180.0 54 | elif 1.0 - rot_mat[2, 0] < 1e-9: 55 | pitch = 0.0 56 | yaw = -90.0 57 | roll = math.atan2(-rot_mat[0, 1], -rot_mat[0, 2]) / math.pi * 180.0 58 | else: 59 | pitch = math.atan2(rot_mat[2, 1], rot_mat[2, 2]) / math.pi * 180.0 60 | yaw = -math.asin(rot_mat[2, 0]) / math.pi * 180.0 61 | roll = math.atan2(rot_mat[1, 0], rot_mat[0, 0]) / math.pi * 180.0 62 | 63 | # Respond to output_preference: 64 | # output_preference == 1: limit pitch to the range of -90.0 ~ 90.0 65 | # output_preference == 2: limit yaw to the range of -90.0 ~ 90.0 (already satisfied) 66 | # output_preference == 3: limit roll to the range of -90.0 ~ 90.0 67 | # otherwise: minimise total rotation, min(abs(pitch) + abs(yaw) + abs(roll)) 68 | if output_preference != 2: 69 | alt_pitch = pitch - 180.0 if pitch > 0.0 else pitch + 180.0 70 | alt_yaw = -180.0 - yaw if yaw < 0.0 else 180.0 - yaw 71 | alt_roll = roll - 180.0 if roll > 0.0 else roll + 180.0 72 | if (output_preference == 1 and -90.0 < alt_pitch < 90.0 or 73 | output_preference == 3 and -90.0 < alt_roll < 90.0 or 74 | output_preference not in (1, 2, 3) and 75 | abs(alt_pitch) + abs(alt_yaw) + abs(alt_roll) < abs(pitch) + abs(yaw) + abs(roll)): 76 | pitch, yaw, roll = alt_pitch, alt_yaw, alt_roll 77 | 78 | return -pitch, yaw, roll 79 | -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/simple_face_tracker.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional 3 | from scipy.optimize import linear_sum_assignment 4 | 5 | 6 | __all__ = ['SimpleFaceTracker'] 7 | 8 | 9 | class SimpleFaceTracker(object): 10 | def __init__(self, iou_threshold: float = 0.4, minimum_face_size: float = 0.0) -> None: 11 | self._iou_threshold = iou_threshold 12 | self._minimum_face_size = minimum_face_size 13 | self._tracklets = [] 14 | self._tracklet_counter = 0 15 | 16 | @property 17 | def iou_threshold(self) -> float: 18 | return self._iou_threshold 19 | 20 | @iou_threshold.setter 21 | def iou_threshold(self, threshold: float) -> None: 22 | self._iou_threshold = threshold 23 | 24 | @property 25 | def minimum_face_size(self) -> float: 26 | return self._minimum_face_size 27 | 28 | @minimum_face_size.setter 29 | def minimum_face_size(self, face_size: float) -> None: 30 | self._minimum_face_size = face_size 31 | 32 | def __call__(self, face_boxes: np.ndarray) -> List[Optional[int]]: 33 | if face_boxes.size <= 0: 34 | self._tracklets = [] 35 | return [] 36 | 37 | # Calculate area of the faces 38 | face_areas = np.abs((face_boxes[:, 2] - face_boxes[:, 0]) * (face_boxes[:, 3] - face_boxes[:, 1])) 39 | 40 | # Prepare tracklets 41 | for tracklet in self._tracklets: 42 | tracklet['tracked'] = False 43 | 44 | # Calculate the distance matrix based on IOU 45 | iou_distance_threshold = np.clip(1.0 - self._iou_threshold, 0.0, 1.0) 46 | min_face_area = max(self._minimum_face_size ** 2, np.finfo(float).eps) 47 | distances = np.full(shape=(face_boxes.shape[0], len(self._tracklets)), 48 | fill_value=2.0 * min(face_boxes.shape[0], len(self._tracklets)), dtype=float) 49 | for row, face_box in enumerate(face_boxes): 50 | if face_areas[row] >= min_face_area: 51 | for col, tracklet in enumerate(self._tracklets): 52 | x_left = max(min(face_box[0], face_box[2]), min(tracklet['bbox'][0], tracklet['bbox'][2])) 53 | y_top = max(min(face_box[1], face_box[3]), min(tracklet['bbox'][1], tracklet['bbox'][3])) 54 | x_right = min(max(face_box[2], face_box[0]), max(tracklet['bbox'][2], tracklet['bbox'][0])) 55 | y_bottom = min(max(face_box[3], face_box[1]), max(tracklet['bbox'][3], tracklet['bbox'][1])) 56 | if x_right <= x_left or y_bottom <= y_top: 57 | distance = 1.0 58 | else: 59 | intersection_area = (x_right - x_left) * (y_bottom - y_top) 60 | distance = 1.0 - intersection_area / float(face_areas[row] + tracklet['area'] - 61 | intersection_area) 62 | if distance <= iou_distance_threshold: 63 | distances[row, col] = distance 64 | 65 | # ID assignment 66 | tracked_ids = [None] * face_boxes.shape[0] 67 | for row, col in zip(*linear_sum_assignment(distances)): 68 | if distances[row, col] <= iou_distance_threshold: 69 | tracked_ids[row] = self._tracklets[col]['id'] 70 | self._tracklets[col]['bbox'] = face_boxes[row, :4].copy() 71 | self._tracklets[col]['area'] = face_areas[row] 72 | self._tracklets[col]['tracked'] = True 73 | 74 | # Remove expired tracklets 75 | self._tracklets = [x for x in self._tracklets if x['tracked']] 76 | 77 | # Register new faces 78 | for idx, face_box in enumerate(face_boxes): 79 | if face_areas[idx] >= min_face_area and tracked_ids[idx] is None: 80 | self._tracklet_counter += 1 81 | self._tracklets.append({'bbox': face_box[:4].copy(), 'area': face_areas[idx], 82 | 'id': self._tracklet_counter, 'tracked': True}) 83 | tracked_ids[idx] = self._tracklets[-1]['id'] 84 | 85 | return tracked_ids 86 | 87 | def reset(self, reset_tracklet_counter: 
bool = True) -> None: 88 | self._tracklets = [] 89 | if reset_tracklet_counter: 90 | self._tracklet_counter = 0 91 | -------------------------------------------------------------------------------- /src/auto_avsr/configuration_avsr.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer_av import E2E 2 | from transformers.configuration_utils import PretrainedConfig 3 | 4 | class AutoAVSRConfig(PretrainedConfig): 5 | model_type = "auto_avsr" 6 | 7 | def __init__( 8 | self, 9 | odim=5049, 10 | adim=768, 11 | aheads=12, 12 | eunits=3072, 13 | elayers=12, 14 | transformer_input_layer="conv3d", 15 | dropout_rate=0.1, 16 | transformer_attn_dropout_rate=0.1, 17 | transformer_encoder_attn_layer_type="rel_mha", 18 | macaron_style=True, 19 | use_cnn_module=True, 20 | cnn_module_kernel=31, 21 | zero_triu=False, 22 | a_upsample_ratio=1, 23 | relu_type="swish", 24 | ddim=768, 25 | dheads=12, 26 | dunits=3072, 27 | dlayers=6, 28 | lsm_weight=0.1, 29 | transformer_length_normalized_loss=False, 30 | mtlalpha=0.1, 31 | ctc_type="builtin", 32 | rel_pos_type="latest", 33 | aux_adim=768, 34 | aux_aheads=12, 35 | aux_eunits=3072, 36 | aux_elayers=12, 37 | aux_transformer_input_layer="conv1d", 38 | aux_dropout_rate=0.1, 39 | aux_transformer_attn_dropout_rate=0.1, 40 | aux_transformer_encoder_attn_layer_type="rel_mha", 41 | aux_macaron_style=True, 42 | aux_use_cnn_module=True, 43 | aux_cnn_module_kernel=31, 44 | aux_zero_triu=False, 45 | aux_a_upsample_ratio=1, 46 | aux_relu_type="swish", 47 | aux_dunits=3072, 48 | aux_dlayers=6, 49 | aux_lsm_weight=0.1, 50 | aux_transformer_length_normalized_loss=False, 51 | aux_mtlalpha=0.1, 52 | aux_ctc_type="builtin", 53 | aux_rel_pos_type="latest", 54 | fusion_hdim=8192, 55 | fusion_norm="batchnorm", 56 | **kwargs, 57 | ): 58 | super().__init__(**kwargs) 59 | self.odim = odim 60 | self.adim = adim 61 | self.aheads = aheads 62 | self.eunits = eunits 63 | self.elayers = elayers 64 | self.transformer_input_layer = transformer_input_layer 65 | self.dropout_rate = dropout_rate 66 | self.transformer_attn_dropout_rate = transformer_attn_dropout_rate 67 | self.transformer_encoder_attn_layer_type = transformer_encoder_attn_layer_type 68 | self.macaron_style = macaron_style 69 | self.use_cnn_module = use_cnn_module 70 | self.cnn_module_kernel = cnn_module_kernel 71 | self.zero_triu = zero_triu 72 | self.a_upsample_ratio = a_upsample_ratio 73 | self.relu_type = relu_type 74 | self.ddim = ddim 75 | self.dheads = dheads 76 | self.dunits = dunits 77 | self.dlayers = dlayers 78 | self.lsm_weight = lsm_weight 79 | self.transformer_length_normalized_loss = transformer_length_normalized_loss 80 | self.mtlalpha = mtlalpha 81 | self.ctc_type = ctc_type 82 | self.rel_pos_type = rel_pos_type 83 | self.aux_adim = aux_adim 84 | self.aux_aheads = aux_aheads 85 | self.aux_eunits = aux_eunits 86 | self.aux_elayers = aux_elayers 87 | self.aux_transformer_input_layer = aux_transformer_input_layer 88 | self.aux_dropout_rate = aux_dropout_rate 89 | self.aux_transformer_attn_dropout_rate = aux_transformer_attn_dropout_rate 90 | self.aux_transformer_encoder_attn_layer_type = aux_transformer_encoder_attn_layer_type 91 | self.aux_macaron_style = aux_macaron_style 92 | self.aux_use_cnn_module = aux_use_cnn_module 93 | self.aux_cnn_module_kernel = aux_cnn_module_kernel 94 | self.aux_zero_triu = aux_zero_triu 95 | self.aux_a_upsample_ratio = aux_a_upsample_ratio 96 | self.aux_relu_type = aux_relu_type 97 | self.aux_dunits = aux_dunits 98 | 
self.aux_dlayers = aux_dlayers 99 | self.aux_lsm_weight = aux_lsm_weight 100 | self.aux_transformer_length_normalized_loss = aux_transformer_length_normalized_loss 101 | self.aux_mtlalpha = aux_mtlalpha 102 | self.aux_ctc_type = aux_ctc_type 103 | self.aux_rel_pos_type = aux_rel_pos_type 104 | self.fusion_hdim = fusion_hdim 105 | self.fusion_norm = fusion_norm 106 | 107 | -------------------------------------------------------------------------------- /src/nets/backend/e2e_asr_conformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Transformer speech recognition model (pytorch).""" 5 | 6 | import logging 7 | import numpy 8 | import torch 9 | 10 | from src.nets.backend.ctc import CTC 11 | from src.nets.backend.nets_utils import ( 12 | make_non_pad_mask, 13 | th_accuracy, 14 | ) 15 | from src.nets.backend.transformer.add_sos_eos import add_sos_eos 16 | from src.nets.backend.transformer.decoder import Decoder 17 | from src.nets.backend.transformer.encoder import Encoder 18 | from src.nets.backend.transformer.label_smoothing_loss import LabelSmoothingLoss 19 | from src.nets.backend.transformer.mask import target_mask 20 | 21 | 22 | class E2E(torch.nn.Module): 23 | def __init__(self, args, ignore_id=-1): 24 | torch.nn.Module.__init__(self) 25 | 26 | self.encoder = Encoder( 27 | attention_dim=args.adim, 28 | attention_heads=args.aheads, 29 | linear_units=args.eunits, 30 | num_blocks=args.elayers, 31 | input_layer=args.transformer_input_layer, 32 | dropout_rate=args.dropout_rate, 33 | positional_dropout_rate=args.dropout_rate, 34 | attention_dropout_rate=args.transformer_attn_dropout_rate, 35 | encoder_attn_layer_type=args.transformer_encoder_attn_layer_type, 36 | macaron_style=args.macaron_style, 37 | use_cnn_module=args.use_cnn_module, 38 | cnn_module_kernel=args.cnn_module_kernel, 39 | zero_triu=getattr(args, "zero_triu", False), 40 | a_upsample_ratio=args.a_upsample_ratio, 41 | relu_type=getattr(args, "relu_type", "swish"), 42 | ) 43 | 44 | self.transformer_input_layer = args.transformer_input_layer 45 | self.a_upsample_ratio = args.a_upsample_ratio 46 | 47 | self.proj_decoder = None 48 | if args.adim != args.ddim: 49 | self.proj_decoder = torch.nn.Linear(args.adim, args.ddim) 50 | 51 | if args.mtlalpha < 1: 52 | self.decoder = Decoder( 53 | odim=args.odim, 54 | attention_dim=args.ddim, 55 | attention_heads=args.dheads, 56 | linear_units=args.dunits, 57 | num_blocks=args.dlayers, 58 | dropout_rate=args.dropout_rate, 59 | positional_dropout_rate=args.dropout_rate, 60 | self_attention_dropout_rate=args.transformer_attn_dropout_rate, 61 | src_attention_dropout_rate=args.transformer_attn_dropout_rate, 62 | ) 63 | else: 64 | self.decoder = None 65 | self.blank = 0 66 | self.sos = args.odim - 1 67 | self.eos = args.odim - 1 68 | self.odim = args.odim 69 | self.ignore_id = ignore_id 70 | 71 | # self.lsm_weight = a 72 | self.criterion = LabelSmoothingLoss( 73 | self.odim, 74 | self.ignore_id, 75 | args.lsm_weight, 76 | args.transformer_length_normalized_loss, 77 | ) 78 | 79 | self.adim = args.adim 80 | self.mtlalpha = args.mtlalpha 81 | if args.mtlalpha > 0.0: 82 | self.ctc = CTC( 83 | args.odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True 84 | ) 85 | else: 86 | self.ctc = None 87 | 88 | def forward(self, x, lengths, label): 89 | if self.transformer_input_layer == "conv1d": 90 | lengths = torch.div(lengths, 640, 
rounding_mode="trunc") 91 | padding_mask = make_non_pad_mask(lengths).to(x.device).unsqueeze(-2) 92 | 93 | x, _ = self.encoder(x, padding_mask) 94 | 95 | # ctc loss 96 | loss_ctc, ys_hat = self.ctc(x, lengths, label) 97 | 98 | if self.proj_decoder: 99 | x = self.proj_decoder(x) 100 | 101 | # decoder loss 102 | ys_in_pad, ys_out_pad = add_sos_eos(label, self.sos, self.eos, self.ignore_id) 103 | ys_mask = target_mask(ys_in_pad, self.ignore_id) 104 | pred_pad, _ = self.decoder(ys_in_pad, ys_mask, x, padding_mask) 105 | loss_att = self.criterion(pred_pad, ys_out_pad) 106 | loss = self.mtlalpha * loss_ctc + (1 - self.mtlalpha) * loss_att 107 | 108 | acc = th_accuracy( 109 | pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id 110 | ) 111 | 112 | return loss, loss_ctc, loss_att, acc 113 | -------------------------------------------------------------------------------- /src/talking_detector/segmentation.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | CENTRAL_ASD_CHUNKING_PARAMETERS = { 5 | "onset": 1.0, # start threshold 6 | "offset": 0.8, # end threshold 7 | "min_duration_on": 1.0, # drop 8 | "min_duration_off": 0.5, # fill 9 | "max_chunk_size": 10, 10 | "min_chunk_size": 1 11 | } 12 | 13 | EGO_ASD_CHUNKING_PARAMETERS = { 14 | "onset": 2.4, # start threshold 15 | "offset": 1.6, # end threshold 16 | "min_duration_on": 1.0, # drop 17 | "min_duration_off": 0.5, # fill 18 | "max_chunk_size": 10, 19 | "min_chunk_size": 1 20 | } 21 | 22 | 23 | def segment_by_asd(asd, parameters={}): 24 | onset_threshold = parameters.get("onset", CENTRAL_ASD_CHUNKING_PARAMETERS["onset"]) 25 | offset_threshold = parameters.get("offset", CENTRAL_ASD_CHUNKING_PARAMETERS["offset"]) 26 | 27 | # Convert frame numbers to integers and sort them 28 | frames = sorted([int(f) for f in asd.keys()]) 29 | if not frames: 30 | return [] 31 | 32 | # Find the minimum frame number to normalize frame indices 33 | min_frame = min(frames) 34 | 35 | # Convert duration parameters from seconds to frames (assuming 25 fps) 36 | min_duration_on_frames = int(parameters.get("min_duration_on", CENTRAL_ASD_CHUNKING_PARAMETERS["min_duration_on"]) * 25) 37 | min_duration_off_frames = int(parameters.get("min_duration_off", CENTRAL_ASD_CHUNKING_PARAMETERS["min_duration_off"]) * 25) 38 | max_chunk_frames = int(parameters.get("max_chunk_size", CENTRAL_ASD_CHUNKING_PARAMETERS["max_chunk_size"]) * 25) 39 | min_chunk_frames = int(parameters.get("min_chunk_size", CENTRAL_ASD_CHUNKING_PARAMETERS["min_chunk_size"]) * 25) 40 | 41 | # First pass: Find speech regions using hysteresis thresholding 42 | speech_regions = [] 43 | current_region = None 44 | is_active = False 45 | 46 | for frame in frames: 47 | score = asd.get(str(frame), -1) 48 | normalized_frame = frame - min_frame 49 | 50 | if not is_active: 51 | # Currently inactive, check for onset 52 | if score > onset_threshold: 53 | is_active = True 54 | current_region = [normalized_frame] 55 | else: 56 | # Currently active, check for offset 57 | if score < offset_threshold: 58 | is_active = False 59 | if current_region is not None: 60 | speech_regions.append(current_region) 61 | current_region = None 62 | else: 63 | current_region.append(normalized_frame) 64 | 65 | # Handle case where speech continues until the end 66 | if current_region is not None: 67 | speech_regions.append(current_region) 68 | 69 | # Second pass: Merge regions separated by short non-speech gaps 70 | merged_regions = [] 71 | if speech_regions: 72 | current_region 
= speech_regions[0] 73 | 74 | for next_region in speech_regions[1:]: 75 | gap = next_region[0] - current_region[-1] - 1 76 | if gap <= min_duration_off_frames: 77 | # Merge regions 78 | current_region.extend(next_region) 79 | else: 80 | merged_regions.append(current_region) 81 | current_region = next_region 82 | merged_regions.append(current_region) 83 | 84 | # Third pass: Remove short speech regions and split long ones 85 | final_segments = [] 86 | for region in merged_regions: 87 | region_length = len(region) 88 | 89 | # Skip regions shorter than minimum duration 90 | if region_length < min_duration_on_frames: 91 | continue 92 | 93 | # Split long regions 94 | if region_length > max_chunk_frames: 95 | num_chunks = math.ceil(region_length / max_chunk_frames) 96 | chunk_size = math.ceil(region_length / num_chunks) 97 | 98 | for i in range(0, region_length, chunk_size): 99 | sub_segment = region[i:i + chunk_size] 100 | if len(sub_segment) >= min_chunk_frames: 101 | final_segments.append(sub_segment) 102 | else: 103 | final_segments.append(region) 104 | 105 | # Convert frame indices back to original frame indices 106 | final_segments = [ 107 | [frame + min_frame for frame in segment] 108 | for segment in final_segments 109 | ] 110 | 111 | return final_segments 112 | -------------------------------------------------------------------------------- /src/avhubert_muavic/av2text_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
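# A minimal configuration sketch (values are illustrative): AV2TextConfig extends the
# Speech2Text-style settings with separate encoder/decoder hidden sizes and is the
# config type expected by AVTransformerDecoder in av_transformer_decoder.py.
#   config = AV2TextConfig(vocab_size=1000, encoder_layers=12, decoder_layers=6,
#                          encoder_hidden_size=768, decoder_hidden_size=768)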
15 | """Speech2Text model configuration""" 16 | 17 | from transformers.configuration_utils import PretrainedConfig 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | class AV2TextConfig(PretrainedConfig): 25 | model_type = "speech_to_text" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} 28 | 29 | def __init__( 30 | self, 31 | vocab_size=10000, 32 | encoder_layers=12, 33 | encoder_ffn_dim=2048, 34 | encoder_attention_heads=4, 35 | decoder_layers=6, 36 | decoder_ffn_dim=2048, 37 | decoder_attention_heads=4, 38 | encoder_layerdrop=0.0, 39 | decoder_layerdrop=0.0, 40 | use_cache=True, 41 | is_encoder_decoder=True, 42 | activation_function="relu", 43 | d_model=256, 44 | encoder_hidden_size=256, 45 | decoder_hidden_size=256, 46 | dropout=0.1, 47 | attention_dropout=0.0, 48 | activation_dropout=0.0, 49 | init_std=0.02, 50 | decoder_start_token_id=2, 51 | scale_embedding=True, 52 | pad_token_id=1, 53 | bos_token_id=0, 54 | eos_token_id=2, 55 | max_source_positions=6000, 56 | max_target_positions=1024, 57 | num_conv_layers=2, 58 | conv_kernel_sizes=(5, 5), 59 | conv_channels=1024, 60 | input_feat_per_channel=80, 61 | input_channels=1, 62 | attn_implementation="eager", 63 | **kwargs, 64 | ): 65 | self.vocab_size = vocab_size 66 | self.d_model = d_model 67 | self.encoder_ffn_dim = encoder_ffn_dim 68 | self.encoder_layers = encoder_layers 69 | self.encoder_hidden_size = encoder_hidden_size 70 | self.encoder_attention_heads = encoder_attention_heads 71 | self.decoder_ffn_dim = decoder_ffn_dim 72 | self.decoder_hidden_size = decoder_hidden_size 73 | self.decoder_layers = decoder_layers 74 | self.decoder_attention_heads = decoder_attention_heads 75 | self.dropout = dropout 76 | self.attention_dropout = attention_dropout 77 | self.activation_dropout = activation_dropout 78 | self.activation_function = activation_function 79 | self.init_std = init_std 80 | self.encoder_layerdrop = encoder_layerdrop 81 | self.decoder_layerdrop = decoder_layerdrop 82 | self.use_cache = use_cache 83 | self.num_hidden_layers = encoder_layers 84 | self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True 85 | self.max_source_positions = max_source_positions 86 | self.max_target_positions = max_target_positions 87 | self.num_conv_layers = num_conv_layers 88 | self.conv_kernel_sizes = list(conv_kernel_sizes) 89 | self.conv_channels = conv_channels 90 | self.input_feat_per_channel = input_feat_per_channel 91 | self.input_channels = input_channels 92 | self.attn_implementation = attn_implementation 93 | 94 | if len(self.conv_kernel_sizes) != self.num_conv_layers: 95 | raise ValueError( 96 | "Configuration for convolutional module is incorrect. " 97 | "It is required that `len(config.conv_kernel_sizes)` == `config.num_conv_layers` " 98 | f"but is `len(config.conv_kernel_sizes) = {len(self.conv_kernel_sizes)}`, " 99 | f"`config.num_conv_layers = {self.num_conv_layers}`." 
100 | ) 101 | 102 | super().__init__( 103 | pad_token_id=pad_token_id, 104 | bos_token_id=bos_token_id, 105 | eos_token_id=eos_token_id, 106 | is_encoder_decoder=is_encoder_decoder, 107 | decoder_start_token_id=decoder_start_token_id, 108 | **kwargs, 109 | ) 110 | 111 | self._attn_implementation = "eager" -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/retina_face.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | import torchvision.models._utils as _utils 6 | from .retina_face_net import MobileNetV1, FPN, SSH 7 | 8 | 9 | class ClassHead(nn.Module): 10 | def __init__(self, inchannels=512, num_anchors=3): 11 | super(ClassHead, self).__init__() 12 | self.num_anchors = num_anchors 13 | self.conv1x1 = nn.Conv2d(inchannels, self.num_anchors*2, kernel_size=(1, 1), stride=1, padding=0) 14 | 15 | def forward(self, x): 16 | out = self.conv1x1(x) 17 | out = out.permute(0, 2, 3, 1).contiguous() 18 | 19 | return out.view(out.shape[0], -1, 2) 20 | 21 | 22 | class BboxHead(nn.Module): 23 | def __init__(self, inchannels=512, num_anchors=3): 24 | super(BboxHead, self).__init__() 25 | self.conv1x1 = nn.Conv2d(inchannels, num_anchors*4, kernel_size=(1, 1), stride=1,padding=0) 26 | 27 | def forward(self, x): 28 | out = self.conv1x1(x) 29 | out = out.permute(0, 2, 3, 1).contiguous() 30 | 31 | return out.view(out.shape[0], -1, 4) 32 | 33 | 34 | class LandmarkHead(nn.Module): 35 | def __init__(self, inchannels=512, num_anchors=3): 36 | super(LandmarkHead, self).__init__() 37 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10, kernel_size=(1, 1), stride=1, padding=0) 38 | 39 | def forward(self, x): 40 | out = self.conv1x1(x) 41 | out = out.permute(0, 2, 3, 1).contiguous() 42 | 43 | return out.view(out.shape[0], -1, 10) 44 | 45 | 46 | class RetinaFace(nn.Module): 47 | def __init__(self, cfg=None, phase='train'): 48 | """ 49 | :param cfg: Network related settings. 50 | :param phase: train or test. 
51 | """ 52 | super(RetinaFace, self).__init__() 53 | self.phase = phase 54 | backbone = None 55 | if cfg['name'] == 'mobilenet0.25': 56 | backbone = MobileNetV1() 57 | elif cfg['name'] == 'Resnet50': 58 | backbone = models.resnet50() 59 | 60 | self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) 61 | in_channels_stage2 = cfg['in_channel'] 62 | in_channels_list = [ 63 | in_channels_stage2 * 2, 64 | in_channels_stage2 * 4, 65 | in_channels_stage2 * 8, 66 | ] 67 | out_channels = cfg['out_channel'] 68 | self.fpn = FPN(in_channels_list,out_channels) 69 | self.ssh1 = SSH(out_channels, out_channels) 70 | self.ssh2 = SSH(out_channels, out_channels) 71 | self.ssh3 = SSH(out_channels, out_channels) 72 | 73 | self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) 74 | self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) 75 | self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) 76 | 77 | def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2): 78 | classhead = nn.ModuleList() 79 | for i in range(fpn_num): 80 | classhead.append(ClassHead(inchannels, anchor_num)) 81 | return classhead 82 | 83 | def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2): 84 | bboxhead = nn.ModuleList() 85 | for i in range(fpn_num): 86 | bboxhead.append(BboxHead(inchannels, anchor_num)) 87 | return bboxhead 88 | 89 | def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2): 90 | landmarkhead = nn.ModuleList() 91 | for i in range(fpn_num): 92 | landmarkhead.append(LandmarkHead(inchannels, anchor_num)) 93 | return landmarkhead 94 | 95 | def forward(self, inputs): 96 | out = self.body(inputs) 97 | 98 | # FPN 99 | fpn = self.fpn(out) 100 | 101 | # SSH 102 | feature1 = self.ssh1(fpn[0]) 103 | feature2 = self.ssh2(fpn[1]) 104 | feature3 = self.ssh3(fpn[2]) 105 | features = [feature1, feature2, feature3] 106 | 107 | bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) 108 | classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) 109 | ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) 110 | 111 | if self.phase == 'train': 112 | output = (bbox_regressions, classifications, ldm_regressions) 113 | else: 114 | output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) 115 | return output 116 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Decoder self-attention layer definition.""" 8 | 9 | import torch 10 | 11 | from src.nets.backend.transformer.layer_norm import LayerNorm 12 | from torch import nn 13 | 14 | 15 | class DecoderLayer(nn.Module): 16 | """Single decoder layer module. 17 | :param int size: input dim 18 | :param src.nets.backend.transformer.attention.MultiHeadedAttention 19 | self_attn: self attention module 20 | :param src.nets.backend.transformer.attention.MultiHeadedAttention 21 | src_attn: source attention module 22 | :param src.nets.backend.transformer.positionwise_feed_forward. 
23 | PositionwiseFeedForward feed_forward: feed forward layer module 24 | :param float dropout_rate: dropout rate 25 | :param bool normalize_before: whether to use layer_norm before the first block 26 | :param bool concat_after: whether to concat attention layer's input and output 27 | if True, additional linear will be applied. 28 | i.e. x -> x + linear(concat(x, att(x))) 29 | if False, no additional linear will be applied. i.e. x -> x + att(x) 30 | """ 31 | 32 | def __init__( 33 | self, 34 | size, 35 | self_attn, 36 | src_attn, 37 | feed_forward, 38 | dropout_rate, 39 | normalize_before=True, 40 | concat_after=False, 41 | ): 42 | """Construct an DecoderLayer object.""" 43 | super(DecoderLayer, self).__init__() 44 | self.size = size 45 | self.self_attn = self_attn 46 | self.src_attn = src_attn 47 | self.feed_forward = feed_forward 48 | self.norm1 = LayerNorm(size) 49 | self.norm2 = LayerNorm(size) 50 | self.norm3 = LayerNorm(size) 51 | self.dropout = nn.Dropout(dropout_rate) 52 | self.normalize_before = normalize_before 53 | self.concat_after = concat_after 54 | if self.concat_after: 55 | self.concat_linear1 = nn.Linear(size + size, size) 56 | self.concat_linear2 = nn.Linear(size + size, size) 57 | 58 | def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None): 59 | """Compute decoded features. 60 | Args: 61 | tgt (torch.Tensor): 62 | decoded previous target features (batch, max_time_out, size) 63 | tgt_mask (torch.Tensor): mask for x (batch, max_time_out) 64 | memory (torch.Tensor): encoded source features (batch, max_time_in, size) 65 | memory_mask (torch.Tensor): mask for memory (batch, max_time_in) 66 | cache (torch.Tensor): cached output (batch, max_time_out-1, size) 67 | """ 68 | residual = tgt 69 | if self.normalize_before: 70 | tgt = self.norm1(tgt) 71 | 72 | if cache is None: 73 | tgt_q = tgt 74 | tgt_q_mask = tgt_mask 75 | else: 76 | # compute only the last frame query keeping dim: max_time_out -> 1 77 | assert cache.shape == ( 78 | tgt.shape[0], 79 | tgt.shape[1] - 1, 80 | self.size, 81 | ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 82 | tgt_q = tgt[:, -1:, :] 83 | residual = residual[:, -1:, :] 84 | tgt_q_mask = None 85 | if tgt_mask is not None: 86 | tgt_q_mask = tgt_mask[:, -1:, :] 87 | 88 | if self.concat_after: 89 | tgt_concat = torch.cat( 90 | (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1 91 | ) 92 | x = residual + self.concat_linear1(tgt_concat) 93 | else: 94 | x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) 95 | if not self.normalize_before: 96 | x = self.norm1(x) 97 | 98 | residual = x 99 | if self.normalize_before: 100 | x = self.norm2(x) 101 | if self.concat_after: 102 | x_concat = torch.cat( 103 | (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1 104 | ) 105 | x = residual + self.concat_linear2(x_concat) 106 | else: 107 | x = residual + self.dropout(self.src_attn(x, memory, memory, memory_mask)) 108 | if not self.normalize_before: 109 | x = self.norm2(x) 110 | 111 | residual = x 112 | if self.normalize_before: 113 | x = self.norm3(x) 114 | x = residual + self.dropout(self.feed_forward(x)) 115 | if not self.normalize_before: 116 | x = self.norm3(x) 117 | 118 | if cache is not None: 119 | x = torch.cat([cache, x], dim=1) 120 | 121 | return x, tgt_mask, memory, memory_mask 122 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by 
https://www.toptal.com/developers/gitignore/api/python,visualstudiocode 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | # LSP config files 174 | pyrightconfig.json 175 | 176 | ### VisualStudioCode ### 177 | .vscode/* 178 | !.vscode/settings.json 179 | !.vscode/tasks.json 180 | !.vscode/launch.json 181 | !.vscode/extensions.json 182 | !.vscode/*.code-snippets 183 | 184 | # Local History for Visual Studio Code 185 | .history/ 186 | 187 | # Built Visual Studio Code Extensions 188 | *.vsix 189 | 190 | ### VisualStudioCode Patch ### 191 | # Ignore all local history of files 192 | .history 193 | .ionide 194 | 195 | # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode 196 | 197 | .cache/* 198 | model-bin 199 | data-bin 200 | wandb 201 | src/ibug/**/weights/* 202 | src/ibug/**/weights/* 203 | 204 | script/_*.py -------------------------------------------------------------------------------- /script/asd.py: -------------------------------------------------------------------------------- 1 | import os, cv2, math, sys 2 | # os.environ['CUDA_VISIBLE_DEVICES'] = '0' 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | import numpy as np 5 | import torch 6 | import torchaudio 7 | import python_speech_features 8 | import json 9 | from src.talking_detector.ASD import ASD 10 | 11 | # ==================== LOAD MODEL ==================== 12 | ASD_MODEL = ASD() 13 | ASD_MODEL.loadParameters("model-bin/finetuning_TalkSet.model") 14 | ASD_MODEL = ASD_MODEL.cuda().eval() 15 | print("Model loaded successfully.") 16 | 17 | def process_video(video_path, output_dir=None, frame_offset=0): 18 | """ 19 | Process a single video file to detect active speakers and output ASD results. 20 | 21 | Args: 22 | video_path (str): Path to the input video file 23 | output_dir (str, optional): Directory to save the output JSON. If None, saves in same directory as video. 
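        frame_offset (int, optional): Offset added to every frame index in the output JSON. `main()` fills it from the "frame_start" field of the video's sidecar JSON when that file exists. Defaults to 0.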
24 | 25 | Returns: 26 | str: Path to the output JSON file 27 | """ 28 | if not os.path.exists(video_path): 29 | raise FileNotFoundError(f"Video file not found: {video_path}") 30 | 31 | # Create output directory if specified 32 | if output_dir is None: 33 | output_dir = os.path.dirname(video_path) 34 | os.makedirs(output_dir, exist_ok=True) 35 | 36 | # Get video name without extension 37 | video_name = os.path.splitext(os.path.basename(video_path))[0] 38 | 39 | # Load audio directly using torchaudio 40 | audio, sample_rate = torchaudio.load(video_path, normalize=False) 41 | assert sample_rate == 16000 42 | 43 | # Convert to numpy for MFCC computation 44 | audio_np = audio[0].numpy() 45 | 46 | # Compute MFCC features 47 | audioFeature = python_speech_features.mfcc(audio_np, 16000, numcep=13, winlen=0.025, winstep=0.010) 48 | 49 | # Load video frames 50 | video = cv2.VideoCapture(video_path) 51 | videoFeature = [] 52 | while video.isOpened(): 53 | ret, frames = video.read() 54 | if ret: 55 | face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) 56 | face = cv2.resize(face, (224,224)) 57 | face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))] 58 | videoFeature.append(face) 59 | else: 60 | break 61 | video.release() 62 | 63 | videoFeature = np.array(videoFeature) 64 | length = min((audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, videoFeature.shape[0] / 25) 65 | audioFeature = audioFeature[:int(round(length * 100)),:] 66 | videoFeature = videoFeature[:int(round(length * 25)),:,:] 67 | 68 | # Evaluate using model 69 | durationSet = {1,1,1,2,2,2,3,3,4,5,6} 70 | allScore = [] 71 | 72 | for duration in durationSet: 73 | batchSize = int(math.ceil(length / duration)) 74 | scores = [] 75 | with torch.no_grad(): 76 | for i in range(batchSize): 77 | inputA = torch.FloatTensor(audioFeature[i * duration * 100:(i+1) * duration * 100,:]).unsqueeze(0).cuda() 78 | inputV = torch.FloatTensor(videoFeature[i * duration * 25: (i+1) * duration * 25,:,:]).unsqueeze(0).cuda() 79 | embedA = ASD_MODEL.model.forward_audio_frontend(inputA) 80 | embedV = ASD_MODEL.model.forward_visual_frontend(inputV) 81 | out = ASD_MODEL.model.forward_audio_visual_backend(embedA, embedV) 82 | score = ASD_MODEL.lossAV.forward(out, labels=None) 83 | scores.extend(score) 84 | allScore.append(scores) 85 | 86 | # Calculate final scores 87 | final_scores = np.round((np.mean(np.array(allScore), axis=0)), 1).astype(float) 88 | 89 | # Create frame-wise scores dictionary 90 | frame_scores = {frame_idx + frame_offset: round(float(score), 2) for frame_idx, score in enumerate(final_scores)} 91 | 92 | # Save results 93 | output_json = os.path.join(output_dir, f"{video_name}_asd.json") 94 | with open(output_json, 'w') as f: 95 | json.dump(frame_scores, f, indent=4) 96 | 97 | return output_json 98 | 99 | def main(): 100 | import argparse 101 | parser = argparse.ArgumentParser(description="Active Speaker Detection") 102 | parser.add_argument('--video', type=str, required=True, help='Path to input video file') 103 | parser.add_argument('--output_dir', type=str, default=None, help='Directory to save output JSON (optional)') 104 | opt = parser.parse_args() 105 | 106 | video_info = opt.video.replace(".mp4", ".json") 107 | frame_offset = 0 108 | if os.path.exists(video_info): 109 | with open(video_info, 'r') as f: 110 | video_data = json.load(f) 111 | frame_offset = video_data.get("frame_start", 0) 112 | output_path = process_video(opt.video, opt.output_dir, frame_offset) 113 | print(f"ASD results saved to: {output_path}") 114 
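# Illustrative usage (the paths are placeholders, not files shipped with this repo):
#   python script/asd.py --video <face_track>.mp4 --output_dir <output_dir>
# main() also looks for a sidecar <face_track>.json next to the video; if it contains a
# "frame_start" field, that value is passed as frame_offset so the keys of the
# per-frame scores written to <face_track>_asd.json are shifted accordingly.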
| 115 | if __name__ == "__main__": 116 | 117 | main() -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/retina_face_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def conv_bn(inp, oup, stride = 1, leaky = 0): 7 | return nn.Sequential( 8 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 9 | nn.BatchNorm2d(oup), 10 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 11 | ) 12 | 13 | 14 | def conv_bn_no_relu(inp, oup, stride): 15 | return nn.Sequential( 16 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 17 | nn.BatchNorm2d(oup), 18 | ) 19 | 20 | 21 | def conv_bn1X1(inp, oup, stride, leaky=0): 22 | return nn.Sequential( 23 | nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), 24 | nn.BatchNorm2d(oup), 25 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 26 | ) 27 | 28 | 29 | def conv_dw(inp, oup, stride, leaky=0.1): 30 | return nn.Sequential( 31 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 32 | nn.BatchNorm2d(inp), 33 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 34 | 35 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 36 | nn.BatchNorm2d(oup), 37 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 38 | ) 39 | 40 | 41 | class SSH(nn.Module): 42 | def __init__(self, in_channel, out_channel): 43 | super(SSH, self).__init__() 44 | assert out_channel % 4 == 0 45 | leaky = 0 46 | if out_channel <= 64: 47 | leaky = 0.1 48 | self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) 49 | 50 | self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky) 51 | self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 52 | 53 | self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky) 54 | self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 55 | 56 | def forward(self, input): 57 | conv3X3 = self.conv3X3(input) 58 | 59 | conv5X5_1 = self.conv5X5_1(input) 60 | conv5X5 = self.conv5X5_2(conv5X5_1) 61 | 62 | conv7X7_2 = self.conv7X7_2(conv5X5_1) 63 | conv7X7 = self.conv7x7_3(conv7X7_2) 64 | 65 | out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) 66 | out = F.relu(out) 67 | return out 68 | 69 | 70 | class FPN(nn.Module): 71 | def __init__(self,in_channels_list,out_channels): 72 | super(FPN,self).__init__() 73 | leaky = 0 74 | if out_channels <= 64: 75 | leaky = 0.1 76 | self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride=1, leaky=leaky) 77 | self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride=1, leaky=leaky) 78 | self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride=1, leaky=leaky) 79 | 80 | self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) 81 | self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) 82 | 83 | def forward(self, input): 84 | # names = list(input.keys()) 85 | input = list(input.values()) 86 | 87 | output1 = self.output1(input[0]) 88 | output2 = self.output2(input[1]) 89 | output3 = self.output3(input[2]) 90 | 91 | up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") 92 | output2 = output2 + up3 93 | output2 = self.merge2(output2) 94 | 95 | up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") 96 | output1 = output1 + up2 97 | output1 = self.merge1(output1) 98 | 99 | out = [output1, output2, output3] 100 | return out 101 | 102 | 103 | class 
MobileNetV1(nn.Module): 104 | def __init__(self): 105 | super(MobileNetV1, self).__init__() 106 | self.stage1 = nn.Sequential( 107 | conv_bn(3, 8, 2, leaky=0.1), # 3 108 | conv_dw(8, 16, 1), # 7 109 | conv_dw(16, 32, 2), # 11 110 | conv_dw(32, 32, 1), # 19 111 | conv_dw(32, 64, 2), # 27 112 | conv_dw(64, 64, 1), # 43 113 | ) 114 | self.stage2 = nn.Sequential( 115 | conv_dw(64, 128, 2), # 43 + 16 = 59 116 | conv_dw(128, 128, 1), # 59 + 32 = 91 117 | conv_dw(128, 128, 1), # 91 + 32 = 123 118 | conv_dw(128, 128, 1), # 123 + 32 = 155 119 | conv_dw(128, 128, 1), # 155 + 32 = 187 120 | conv_dw(128, 128, 1), # 187 + 32 = 219 121 | ) 122 | self.stage3 = nn.Sequential( 123 | conv_dw(128, 256, 2), # 219 +3 2 = 241 124 | conv_dw(256, 256, 1), # 241 + 64 = 301 125 | ) 126 | self.avg = nn.AdaptiveAvgPool2d((1,1)) 127 | self.fc = nn.Linear(256, 1000) 128 | 129 | def forward(self, x): 130 | x = self.stage1(x) 131 | x = self.stage2(x) 132 | x = self.stage3(x) 133 | x = self.avg(x) 134 | # x = self.model(x) 135 | x = x.view(-1, 256) 136 | x = self.fc(x) 137 | return x 138 | -------------------------------------------------------------------------------- /src/talking_detector/ASD.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import sys, time, numpy, os, subprocess, pandas, tqdm 6 | from subprocess import PIPE 7 | 8 | from .loss import lossAV, lossV 9 | from .Model import ASD_Model 10 | 11 | class ASD(nn.Module): 12 | def __init__(self, lr = 0.001, lrDecay = 0.95, **kwargs): 13 | super(ASD, self).__init__() 14 | self.model = ASD_Model().cuda() 15 | self.lossAV = lossAV().cuda() 16 | self.lossV = lossV().cuda() 17 | self.optim = torch.optim.Adam(self.parameters(), lr = lr) 18 | self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = 1, gamma=lrDecay) 19 | print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.model.parameters()) / 1000 / 1000)) 20 | 21 | def train_network(self, loader, epoch, **kwargs): 22 | self.train() 23 | self.scheduler.step(epoch - 1) # StepLR 24 | index, top1, lossV, lossAV, loss = 0, 0, 0, 0, 0 25 | lr = self.optim.param_groups[0]['lr'] 26 | r = 1.3 - 0.02 * (epoch - 1) 27 | for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1): 28 | self.zero_grad() 29 | 30 | audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) 31 | visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) 32 | 33 | outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) 34 | outsV = self.model.forward_visual_backend(visualEmbed) 35 | 36 | labels = labels[0].reshape((-1)).cuda() # Loss 37 | nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, r) 38 | nlossV = self.lossV.forward(outsV, labels, r) 39 | nloss = nlossAV + 0.5 * nlossV 40 | 41 | lossV += nlossV.detach().cpu().numpy() 42 | lossAV += nlossAV.detach().cpu().numpy() 43 | loss += nloss.detach().cpu().numpy() 44 | top1 += prec 45 | nloss.backward() 46 | self.optim.step() 47 | index += len(labels) 48 | sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \ 49 | " [%2d] r: %2f, Lr: %5f, Training: %.2f%%, " %(epoch, r, lr, 100 * (num / loader.__len__())) + \ 50 | " LossV: %.5f, LossAV: %.5f, Loss: %.5f, ACC: %2.2f%% \r" %(lossV/(num), lossAV/(num), loss/(num), 100 * (top1/index))) 51 | sys.stderr.flush() 52 | 53 | sys.stdout.write("\n") 54 | 55 | return loss/num, lr 56 | 57 | def 
evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs): 58 | self.eval() 59 | predScores = [] 60 | for audioFeature, visualFeature, labels in tqdm.tqdm(loader): 61 | with torch.no_grad(): 62 | audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) 63 | visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) 64 | outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) 65 | labels = labels[0].reshape((-1)).cuda() 66 | _, predScore, _, _ = self.lossAV.forward(outsAV, labels) 67 | predScore = predScore[:,1].detach().cpu().numpy() 68 | predScores.extend(predScore) 69 | # break 70 | evalLines = open(evalOrig).read().splitlines()[1:] 71 | labels = [] 72 | labels = pandas.Series( ['SPEAKING_AUDIBLE' for line in evalLines]) 73 | scores = pandas.Series(predScores) 74 | evalRes = pandas.read_csv(evalOrig) 75 | evalRes['score'] = scores 76 | evalRes['label'] = labels 77 | evalRes.drop(['label_id'], axis=1,inplace=True) 78 | evalRes.drop(['instance_id'], axis=1,inplace=True) 79 | evalRes.to_csv(evalCsvSave, index=False) 80 | cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s "%(evalOrig, evalCsvSave) 81 | mAP = float(str(subprocess.run(cmd, shell=True, stdout=PIPE, stderr=PIPE).stdout).split(' ')[2][:5]) 82 | return mAP 83 | 84 | def saveParameters(self, path): 85 | torch.save(self.state_dict(), path) 86 | 87 | def loadParameters(self, path): 88 | selfState = self.state_dict() 89 | loadedState = torch.load(path, weights_only=True) 90 | for name, param in loadedState.items(): 91 | origName = name 92 | if name not in selfState: 93 | name = name.replace("module.", "") 94 | if name not in selfState: 95 | print("%s is not in the model."%origName) 96 | continue 97 | if selfState[name].size() != loadedState[origName].size(): 98 | sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s"%(origName, selfState[name].size(), loadedState[origName].size())) 99 | continue 100 | selfState[name].copy_(param) -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: CHiME-9 Task 1 - MCoRec 4 | has_children: true 5 | parent: CHiME-9 6 | nav_order: 1 7 | --- 8 | 9 | # CHiME-9 Task 1: Multi-Modal Context-aware Recognition (MCoRec) 10 | 11 | ## High-Level Summary 12 | 13 | CHiME-9 Task 1 targets the problem of **Multi-Modal Context-aware Recognition (MCoRec)** in a single-room environment. The goal is to process a single 360° video and audio recording of a room where multiple, separate conversations are happening simultaneously, and to both transcribe each speaker's speech and identify which speakers belong to the same conversation. This task addresses the challenging scenario of understanding overlapping conversations in natural social environments, where multiple groups of people engage in distinct discussions within the same physical space. 
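Concretely, a submitted system produces, for every session, one time-aligned `.vtt` transcript per target speaker plus a single `speaker_to_cluster.json` file mapping each speaker to a conversation label (the layout read by `script/evaluate.py` in the baseline repository). The snippet below is only a minimal illustration of that mapping; the speaker and cluster names are invented, and the particular label values should not matter as long as speakers in the same conversation share one.

```python
import json

# Hypothetical session with four target speakers split into two conversations.
speaker_to_cluster = {
    "speaker_01": "conversation_0",
    "speaker_02": "conversation_0",
    "speaker_03": "conversation_1",
    "speaker_04": "conversation_1",
}

with open("speaker_to_cluster.json", "w") as f:
    json.dump(speaker_to_cluster, f, indent=4)
```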
14 | 15 | ![MCoRec Challenge Overview](images/mcorec_overview.png) 16 | 17 | ## Key Challenge Features 18 | 19 | * **Multiple concurrent conversations** occurring simultaneously in the same room 20 | * **Single 360° camera and microphone** capturing all participants from a central viewpoint 21 | * **High speech overlap ratios** reaching up to 100% due to simultaneous conversations 22 | * **Real, unscripted conversations** covering everyday topics like hobbies, work, entertainment, and personal stories 23 | * **Natural acoustic environments** with realistic background noise from other ongoing conversations 24 | * **Up to 8 active speakers** divided into up to 4 simultaneous conversations 25 | * **Combined transcription and clustering challenge** requiring both accurate speech recognition and conversation grouping 26 | 27 | ## The Scenario 28 | 29 | The MCoRec dataset captures natural conversational scenarios where 2-8 participants are seated around a table and divided into groups of 2-4 speakers each. Participants engage in unscripted conversations on topics including everyday life, work, school, hypotheticals, entertainment, news, and personal stories. Sessions typically last around 6 minutes, during which multiple separate conversations occur simultaneously, creating a challenging acoustic environment with significant speech overlap. A moderator signals the start and end of each session with a distinctive whistle to facilitate synchronization. 30 | 31 | ## The Recording Setup 32 | 33 | * **360° Camera**: GoPro Max positioned at the center of the table, capturing 4K resolution video at 25fps with single-channel audio at 16kHz 34 | * **Individual Smartphones**: Each speaker has a smartphone placed in front of them (selfie mode) recording at 720p resolution 35 | * **Lapel Microphones**: Close-talking microphones connected to smartphones via adapters, positioned near each speaker's mouth for enhanced audio clarity 36 | * **Seating Arrangement**: All speakers sit around a table with varying distances depending on table size 37 | * **Session Synchronization**: Moderator whistle cues enable precise alignment of recordings from multiple devices 38 | * **Recording Duration**: Each session typically lasts approximately 6 minutes. 39 | 40 | ***Note***: Individual smartphone recordings and lapel microphone audio are **only available for the training set** to facilitate system development. The development and evaluation sets contain **only the central 360° video and audio**, as the core challenge focuses on processing the difficult multi-speaker, multi-conversation scenario captured by the central camera setup with high speech overlap and acoustic complexity. 41 | 42 | ## Task Description 43 | 44 | The challenge consists of a single comprehensive track requiring participants' systems to: 45 | 46 | 1. **Individual Speaker Transcription**: Generate time-aligned transcripts (`.vtt` files) for each target speaker, accurately capturing their speech content within the specified evaluation time intervals. 47 | 2. **Conversation Clustering**: Group participants into their respective conversations by generating a speaker-to-cluster mapping. 48 | 49 | **Input**: Single 360° video and its corresponding audio track, along with bounding boxes to identify the list of target participants. 50 | 51 | **Output**: Per-speaker transcriptions and conversation cluster assignments 52 | 53 | 54 | ## Evaluation and Ranking 55 | 56 | The evaluation uses **three complementary metrics**: 57 | 58 | 1. 
**Individual Speaker's WER**: Word Error Rate computed for each speaker's transcription 59 | 2. **Conversation Clustering Performance**: Pairwise F1 score measuring clustering accuracy 60 | 3. **Joint ASR-Clustering Error Rate** (*Primary Metric*): Combined metric that weights transcription performance and clustering performance 61 | 62 | ## Important Dates 63 | 64 | * **Data Release**: July 1st, 2025 (train and dev sets) 65 | * **Evaluation Data Release**: TBU 66 | * **Final Submission Deadline**: 7 Feb 2026 67 | * **Results Announcement**: 3 May 2026 68 | * **Workshop**: TBU 69 | 70 | ## Organizers 71 | 72 | - Alexander Waibel (CMU, USA & KIT, DE) 73 | - Christian Fuegen (Meta, UK) 74 | - Shinji Watanabe (CMU, USA) 75 | - Katerina Zmolikova (Meta, UK) 76 | - Thai-Binh Nguyen (KIT, DE) 77 | - Pingchuan Ma (Meta, USA) 78 | 79 | For any questions about the challenge, please contact us: 80 | - Email: [mcorecchallenge@gmail.com](mailto:mcorecchallenge@gmail.com) 81 | - Slack: [CHiME Challenge Community](https://join.slack.com/t/chimechallenge/shared_invite/zt-37h0cfpeb-qg5jwCgqRWCKc_3mLWVsYA) 82 | -------------------------------------------------------------------------------- /src/custom_trainer.py: -------------------------------------------------------------------------------- 1 | from transformers.trainer import * 2 | from typing import Callable, Dict, List, Optional, Tuple, Union, Type 3 | 4 | class AVSRTrainer(Trainer): 5 | 6 | def __init__( 7 | self, 8 | model: Union[PreTrainedModel, nn.Module] = None, 9 | args: TrainingArguments = None, 10 | data_collator: any = None, 11 | valid_data_collator: any = None, 12 | train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, 13 | eval_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, 14 | processing_class: Optional[ 15 | Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] 16 | ] = None, 17 | model_init: Optional[Callable[[], PreTrainedModel]] = None, 18 | compute_loss_func: Optional[Callable] = None, 19 | compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, 20 | callbacks: Optional[List[TrainerCallback]] = None, 21 | optimizers: Tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), 22 | optimizer_cls_and_kwargs: Optional[Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]] = None, 23 | preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, 24 | 25 | ): 26 | super().__init__( 27 | model=model, 28 | args=args, 29 | data_collator=data_collator, 30 | train_dataset=train_dataset, 31 | eval_dataset=eval_dataset, 32 | tokenizer=processing_class, 33 | model_init=model_init, 34 | compute_loss_func=compute_loss_func, 35 | compute_metrics=compute_metrics, 36 | callbacks=callbacks, 37 | optimizers=optimizers, 38 | optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, 39 | preprocess_logits_for_metrics=preprocess_logits_for_metrics, 40 | ) 41 | self.valid_data_collator = valid_data_collator 42 | 43 | def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None) -> DataLoader: 44 | """ 45 | Returns the evaluation [`~torch.utils.data.DataLoader`]. 46 | 47 | Subclass and override this method if you want to inject some custom behavior. 48 | 49 | Args: 50 | eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*): 51 | If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. 
If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. 52 | """ 53 | if eval_dataset is None and self.eval_dataset is None: 54 | raise ValueError("Trainer: evaluation requires an eval_dataset.") 55 | 56 | # If we have persistent workers, don't do a fork bomb especially as eval datasets 57 | # don't change during training 58 | dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval" 59 | if ( 60 | hasattr(self, "_eval_dataloaders") 61 | and dataloader_key in self._eval_dataloaders 62 | and self.args.dataloader_persistent_workers 63 | ): 64 | return self.accelerator.prepare(self._eval_dataloaders[dataloader_key]) 65 | 66 | eval_dataset = ( 67 | self.eval_dataset[eval_dataset] 68 | if isinstance(eval_dataset, str) 69 | else eval_dataset 70 | if eval_dataset is not None 71 | else self.eval_dataset 72 | ) 73 | data_collator = self.valid_data_collator 74 | 75 | if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): 76 | eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") 77 | else: 78 | data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation") 79 | 80 | dataloader_params = { 81 | "batch_size": self.args.eval_batch_size, 82 | "collate_fn": data_collator, 83 | "num_workers": self.args.dataloader_num_workers, 84 | "pin_memory": self.args.dataloader_pin_memory, 85 | "persistent_workers": self.args.dataloader_persistent_workers, 86 | } 87 | 88 | if not isinstance(eval_dataset, torch.utils.data.IterableDataset): 89 | dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset) 90 | dataloader_params["drop_last"] = self.args.dataloader_drop_last 91 | dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor 92 | 93 | # accelerator.free_memory() will destroy the references, so 94 | # we need to store the non-prepared version 95 | eval_dataloader = DataLoader(eval_dataset, **dataloader_params) 96 | if self.args.dataloader_persistent_workers: 97 | if hasattr(self, "_eval_dataloaders"): 98 | self._eval_dataloaders[dataloader_key] = eval_dataloader 99 | else: 100 | self._eval_dataloaders = {dataloader_key: eval_dataloader} 101 | 102 | return self.accelerator.prepare(eval_dataloader) -------------------------------------------------------------------------------- /src/nets/scorers/ctc.py: -------------------------------------------------------------------------------- 1 | """ScorerInterface implementation for CTC.""" 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from src.nets.ctc_prefix_score import CTCPrefixScore, CTCPrefixScoreTH 7 | from src.nets.scorer_interface import BatchPartialScorerInterface 8 | 9 | 10 | class CTCPrefixScorer(BatchPartialScorerInterface): 11 | """Decoder interface wrapper for CTCPrefixScore.""" 12 | 13 | def __init__(self, ctc: torch.nn.Module, eos: int): 14 | """Initialize class. 15 | 16 | Args: 17 | ctc (torch.nn.Module): The CTC implementation. 18 | For example, :class:`src.nets.backend.ctc.CTC` 19 | eos (int): The end-of-sequence id. 20 | 21 | """ 22 | self.ctc = ctc 23 | self.eos = eos 24 | self.impl = None 25 | 26 | def init_state(self, x: torch.Tensor): 27 | """Get an initial state for decoding. 
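        Example (illustrative sketch; assumes `ctc` is a CTC module exposing
        `log_softmax`, `x` is the encoder output of a single utterance, and
        `eos_id` is the end-of-sequence token id):

            scorer = CTCPrefixScorer(ctc, eos=eos_id)
            state = scorer.init_state(x)  # -> (0, initial CTC prefix state)
            # `state` is then advanced through `score_partial` during beam search.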
28 | 29 | Args: 30 | x (torch.Tensor): The encoded feature tensor 31 | 32 | Returns: initial state 33 | 34 | """ 35 | logp = self.ctc.log_softmax(x.unsqueeze(0)).detach().squeeze(0).cpu().numpy() 36 | # TODO(karita): use CTCPrefixScoreTH 37 | self.impl = CTCPrefixScore(logp, 0, self.eos, np) 38 | return 0, self.impl.initial_state() 39 | 40 | def select_state(self, state, i, new_id=None): 41 | """Select state with relative ids in the main beam search. 42 | 43 | Args: 44 | state: Decoder state for prefix tokens 45 | i (int): Index to select a state in the main beam search 46 | new_id (int): New label id to select a state if necessary 47 | 48 | Returns: 49 | state: pruned state 50 | 51 | """ 52 | if type(state) == tuple: 53 | if len(state) == 2: # for CTCPrefixScore 54 | sc, st = state 55 | return sc[i], st[i] 56 | else: # for CTCPrefixScoreTH (need new_id > 0) 57 | r, log_psi, f_min, f_max, scoring_idmap = state 58 | s = log_psi[i, new_id].expand(log_psi.size(1)) 59 | if scoring_idmap is not None: 60 | return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max 61 | else: 62 | return r[:, :, i, new_id], s, f_min, f_max 63 | return None if state is None else state[i] 64 | 65 | def score_partial(self, y, ids, state, x): 66 | """Score new token. 67 | 68 | Args: 69 | y (torch.Tensor): 1D prefix token 70 | next_tokens (torch.Tensor): torch.int64 next token to score 71 | state: decoder state for prefix tokens 72 | x (torch.Tensor): 2D encoder feature that generates ys 73 | 74 | Returns: 75 | tuple[torch.Tensor, Any]: 76 | Tuple of a score tensor for y that has a shape `(len(next_tokens),)` 77 | and next state for ys 78 | 79 | """ 80 | prev_score, state = state 81 | presub_score, new_st = self.impl(y.cpu(), ids.cpu(), state) 82 | tscore = torch.as_tensor( 83 | presub_score - prev_score, device=x.device, dtype=x.dtype 84 | ) 85 | return tscore, (presub_score, new_st) 86 | 87 | def batch_init_state(self, x: torch.Tensor): 88 | """Get an initial state for decoding. 89 | 90 | Args: 91 | x (torch.Tensor): The encoded feature tensor 92 | 93 | Returns: initial state 94 | 95 | """ 96 | logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1 97 | xlen = torch.tensor([logp.size(1)]) 98 | self.impl = CTCPrefixScoreTH(logp, xlen, 0, self.eos) 99 | return None 100 | 101 | def batch_score_partial(self, y, ids, state, x): 102 | """Score new token. 103 | 104 | Args: 105 | y (torch.Tensor): 1D prefix token 106 | ids (torch.Tensor): torch.int64 next token to score 107 | state: decoder state for prefix tokens 108 | x (torch.Tensor): 2D encoder feature that generates ys 109 | 110 | Returns: 111 | tuple[torch.Tensor, Any]: 112 | Tuple of a score tensor for y that has a shape `(len(next_tokens),)` 113 | and next state for ys 114 | 115 | """ 116 | batch_state = ( 117 | ( 118 | torch.stack([s[0] for s in state], dim=2), 119 | torch.stack([s[1] for s in state]), 120 | state[0][2], 121 | state[0][3], 122 | ) 123 | if state[0] is not None 124 | else None 125 | ) 126 | return self.impl(y, batch_state, ids) 127 | 128 | def extend_prob(self, x: torch.Tensor): 129 | """Extend probs for decoding. 130 | 131 | This extension is for streaming decoding 132 | as in Eq (14) in https://arxiv.org/abs/2006.14941 133 | 134 | Args: 135 | x (torch.Tensor): The encoded feature tensor 136 | 137 | """ 138 | logp = self.ctc.log_softmax(x.unsqueeze(0)) 139 | self.impl.extend_prob(logp) 140 | 141 | def extend_state(self, state): 142 | """Extend state for decoding. 
143 | 144 | This extension is for streaming decoding 145 | as in Eq (14) in https://arxiv.org/abs/2006.14941 146 | 147 | Args: 148 | state: The states of hyps 149 | 150 | Returns: exteded state 151 | 152 | """ 153 | new_state = [] 154 | for s in state: 155 | new_state.append(self.impl.extend_state(s)) 156 | 157 | return new_state 158 | -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/retina_face_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from copy import deepcopy 5 | from types import SimpleNamespace 6 | from typing import Union, Optional 7 | from .prior_box import PriorBox 8 | from .py_cpu_nms import py_cpu_nms 9 | from .retina_face import RetinaFace 10 | from .config import cfg_mnet, cfg_re50 11 | from .box_utils import decode, decode_landm 12 | 13 | 14 | __all__ = ['RetinaFacePredictor'] 15 | 16 | 17 | class RetinaFacePredictor(object): 18 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 19 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 20 | self.threshold = threshold 21 | self.device = device 22 | if model is None: 23 | model = RetinaFacePredictor.get_model() 24 | if config is None: 25 | config = RetinaFacePredictor.create_config() 26 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 27 | self.net = RetinaFace(cfg=self.config.__dict__, phase='test').to(self.device) 28 | pretrained_dict = torch.load(model.weights, map_location=self.device) 29 | if 'state_dict' in pretrained_dict.keys(): 30 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 31 | for key, value in pretrained_dict['state_dict'].items()} 32 | else: 33 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 34 | for key, value in pretrained_dict.items()} 35 | self.net.load_state_dict(pretrained_dict, strict=False) 36 | self.net.eval() 37 | self.priors = None 38 | self.previous_size = None 39 | 40 | @staticmethod 41 | def get_model(name: str = 'resnet50') -> SimpleNamespace: 42 | name = name.lower().strip() 43 | if name == 'resnet50': 44 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 45 | '..','..','..','..','model-bin','face_detection','retina_face','weights', 'Resnet50_Final.pth')), 46 | config=SimpleNamespace(**deepcopy(cfg_re50))) 47 | elif name == 'mobilenet0.25': 48 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 49 | '..','..','..','..','model-bin','face_detection','retina_face','weights', 'mobilenet0.25_Final.pth')), 50 | config=SimpleNamespace(**deepcopy(cfg_mnet))) 51 | else: 52 | raise ValueError('name must be set to either resnet50 or mobilenet0.25') 53 | 54 | @staticmethod 55 | def create_config(top_k: int = 750, conf_thresh: float = 0.02, 56 | nms_thresh: float = 0.4, nms_top_k: int = 5000) -> SimpleNamespace: 57 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, nms_top_k=nms_top_k) 58 | 59 | @torch.no_grad() 60 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 61 | im_height, im_width, _ = image.shape 62 | if rgb: 63 | image = image[..., ::-1] 64 | image = image.astype(int) - np.array([104, 117, 123]) 65 | image = image.transpose(2, 0, 1) 66 | image = 
torch.from_numpy(image).unsqueeze(0).float().to(self.device) 67 | scale = torch.Tensor([im_width, im_height, im_width, im_height]).to(self.device) 68 | loc, conf, landms = self.net(image) 69 | image_size = (im_height, im_width) 70 | if self.priors is None or self.previous_size != image_size: 71 | self.priors = PriorBox(self.config.__dict__, image_size=image_size).forward().to(self.device) 72 | self.previous_size = image_size 73 | prior_data = self.priors.data 74 | boxes = decode(loc.data.squeeze(0), prior_data, self.config.variance) 75 | boxes = boxes * scale 76 | boxes = boxes.cpu().numpy() 77 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 78 | landms = decode_landm(landms.data.squeeze(0), prior_data, self.config.variance) 79 | scale1 = torch.Tensor([image.shape[3], image.shape[2], image.shape[3], image.shape[2], 80 | image.shape[3], image.shape[2], image.shape[3], image.shape[2], 81 | image.shape[3], image.shape[2]]).to(self.device) 82 | landms = landms * scale1 83 | landms = landms.cpu().numpy() 84 | 85 | # ignore low scores 86 | inds = np.where(scores > self.config.conf_thresh)[0] 87 | if len(inds) == 0: 88 | return np.empty(shape=(0, 15), dtype=np.float32) 89 | boxes = boxes[inds] 90 | landms = landms[inds] 91 | scores = scores[inds] 92 | 93 | # do NMS 94 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 95 | keep = py_cpu_nms(dets, self.config.nms_thresh, self.config.nms_top_k) 96 | dets = dets[keep, :] 97 | landms = landms[keep] 98 | 99 | # keep top-K 100 | dets = dets[:self.config.top_k, :] 101 | landms = landms[:self.config.top_k, :] 102 | dets = np.concatenate((dets, landms), axis=1) 103 | 104 | # further filter by confidence 105 | inds = np.where(dets[:, 4] >= self.threshold)[0] 106 | if len(inds) == 0: 107 | return np.empty(shape=(0, 15), dtype=np.float32) 108 | else: 109 | return dets[inds] 110 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/modules/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pdb 3 | 4 | import torch.nn as nn 5 | 6 | from src.nets.backend.transformer.convolution import Swish 7 | 8 | 9 | def conv3x3(in_planes, out_planes, stride=1): 10 | """conv3x3. 11 | 12 | :param in_planes: int, number of channels in the input sequence. 13 | :param out_planes: int, number of channels produced by the convolution. 14 | :param stride: int, size of the convolving kernel. 15 | """ 16 | return nn.Conv2d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False, 23 | ) 24 | 25 | 26 | def downsample_basic_block(inplanes, outplanes, stride): 27 | """downsample_basic_block. 28 | 29 | :param inplanes: int, number of channels in the input sequence. 30 | :param outplanes: int, number of channels produced by the convolution. 31 | :param stride: int, size of the convolving kernel. 32 | """ 33 | return nn.Sequential( 34 | nn.Conv2d( 35 | inplanes, 36 | outplanes, 37 | kernel_size=1, 38 | stride=stride, 39 | bias=False, 40 | ), 41 | nn.BatchNorm2d(outplanes), 42 | ) 43 | 44 | 45 | class BasicBlock(nn.Module): 46 | expansion = 1 47 | 48 | def __init__( 49 | self, 50 | inplanes, 51 | planes, 52 | stride=1, 53 | downsample=None, 54 | relu_type="swish", 55 | ): 56 | """__init__. 57 | 58 | :param inplanes: int, number of channels in the input sequence. 59 | :param planes: int, number of channels produced by the convolution. 
60 | :param stride: int, size of the convolving kernel. 61 | :param downsample: boolean, if True, the temporal resolution is downsampled. 62 | :param relu_type: str, type of activation function. 63 | """ 64 | super(BasicBlock, self).__init__() 65 | 66 | assert relu_type in ["relu", "prelu", "swish"] 67 | 68 | self.conv1 = conv3x3(inplanes, planes, stride) 69 | self.bn1 = nn.BatchNorm2d(planes) 70 | 71 | if relu_type == "relu": 72 | self.relu1 = nn.ReLU(inplace=True) 73 | self.relu2 = nn.ReLU(inplace=True) 74 | elif relu_type == "prelu": 75 | self.relu1 = nn.PReLU(num_parameters=planes) 76 | self.relu2 = nn.PReLU(num_parameters=planes) 77 | elif relu_type == "swish": 78 | self.relu1 = Swish() 79 | self.relu2 = Swish() 80 | else: 81 | raise NotImplementedError 82 | # -------- 83 | 84 | self.conv2 = conv3x3(planes, planes) 85 | self.bn2 = nn.BatchNorm2d(planes) 86 | 87 | self.downsample = downsample 88 | self.stride = stride 89 | 90 | def forward(self, x): 91 | """forward. 92 | 93 | :param x: torch.Tensor, input tensor with input size (B, C, T, H, W). 94 | """ 95 | residual = x 96 | out = self.conv1(x) 97 | out = self.bn1(out) 98 | out = self.relu1(out) 99 | out = self.conv2(out) 100 | out = self.bn2(out) 101 | if self.downsample is not None: 102 | residual = self.downsample(x) 103 | 104 | out += residual 105 | out = self.relu2(out) 106 | 107 | return out 108 | 109 | 110 | class ResNet(nn.Module): 111 | def __init__( 112 | self, 113 | block, 114 | layers, 115 | relu_type="swish", 116 | ): 117 | super(ResNet, self).__init__() 118 | self.inplanes = 64 119 | self.relu_type = relu_type 120 | self.downsample_block = downsample_basic_block 121 | 122 | self.layer1 = self._make_layer(block, 64, layers[0]) 123 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 124 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 125 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 126 | self.avgpool = nn.AdaptiveAvgPool2d(1) 127 | 128 | def _make_layer(self, block, planes, blocks, stride=1): 129 | """_make_layer. 130 | 131 | :param block: torch.nn.Module, class of blocks. 132 | :param planes: int, number of channels produced by the convolution. 133 | :param blocks: int, number of layers in a block. 134 | :param stride: int, size of the convolving kernel. 135 | """ 136 | downsample = None 137 | if stride != 1 or self.inplanes != planes * block.expansion: 138 | downsample = self.downsample_block( 139 | inplanes=self.inplanes, 140 | outplanes=planes * block.expansion, 141 | stride=stride, 142 | ) 143 | 144 | layers = [] 145 | layers.append( 146 | block( 147 | self.inplanes, 148 | planes, 149 | stride, 150 | downsample, 151 | relu_type=self.relu_type, 152 | ) 153 | ) 154 | self.inplanes = planes * block.expansion 155 | for i in range(1, blocks): 156 | layers.append( 157 | block( 158 | self.inplanes, 159 | planes, 160 | relu_type=self.relu_type, 161 | ) 162 | ) 163 | 164 | return nn.Sequential(*layers) 165 | 166 | def forward(self, x): 167 | """forward. 168 | 169 | :param x: torch.Tensor, input tensor with input size (B, C, T, H, W). 
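        :returns: torch.Tensor, features pooled to 1x1 and flattened to shape
            (batch, 512 * block.expansion), i.e. (batch, 512) with the BasicBlock
            configuration defined in this module.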
170 | """ 171 | x = self.layer1(x) 172 | x = self.layer2(x) 173 | x = self.layer3(x) 174 | x = self.layer4(x) 175 | x = self.avgpool(x) 176 | x = x.view(x.size(0), -1) 177 | return x 178 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/encoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Encoder self-attention layer definition.""" 8 | 9 | import copy 10 | 11 | import torch 12 | 13 | from src.nets.backend.transformer.layer_norm import LayerNorm 14 | 15 | from torch import nn 16 | 17 | 18 | class EncoderLayer(nn.Module): 19 | """Encoder layer module. 20 | 21 | :param int size: input dim 22 | :param src.nets.backend.transformer.attention. 23 | MultiHeadedAttention self_attn: self attention module 24 | RelPositionMultiHeadedAttention self_attn: self attention module 25 | :param src.nets.backend.transformer.positionwise_feed_forward. 26 | PositionwiseFeedForward feed_forward: 27 | feed forward module 28 | :param src.nets.backend.transformer.convolution. 29 | ConvolutionModule feed_foreard: 30 | feed forward module 31 | :param float dropout_rate: dropout rate 32 | :param bool normalize_before: whether to use layer_norm before the first block 33 | :param bool concat_after: whether to concat attention layer's input and output 34 | if True, additional linear will be applied. 35 | i.e. x -> x + linear(concat(x, att(x))) 36 | if False, no additional linear will be applied. i.e. x -> x + att(x) 37 | :param bool macaron_style: whether to use macaron style for PositionwiseFeedForward 38 | 39 | """ 40 | 41 | def __init__( 42 | self, 43 | size, 44 | self_attn, 45 | feed_forward, 46 | conv_module, 47 | dropout_rate, 48 | normalize_before=True, 49 | concat_after=False, 50 | macaron_style=False, 51 | ): 52 | """Construct an EncoderLayer object.""" 53 | super(EncoderLayer, self).__init__() 54 | self.self_attn = self_attn 55 | self.feed_forward = feed_forward 56 | self.ff_scale = 1.0 57 | self.conv_module = conv_module 58 | self.macaron_style = macaron_style 59 | self.norm_ff = LayerNorm(size) # for the FNN module 60 | self.norm_mha = LayerNorm(size) # for the MHA module 61 | if self.macaron_style: 62 | self.feed_forward_macaron = copy.deepcopy(feed_forward) 63 | self.ff_scale = 0.5 64 | # for another FNN module in macaron style 65 | self.norm_ff_macaron = LayerNorm(size) 66 | if self.conv_module is not None: 67 | self.norm_conv = LayerNorm(size) # for the CNN module 68 | self.norm_final = LayerNorm(size) # for the final output of the block 69 | self.dropout = nn.Dropout(dropout_rate) 70 | self.size = size 71 | self.normalize_before = normalize_before 72 | self.concat_after = concat_after 73 | if self.concat_after: 74 | self.concat_linear = nn.Linear(size + size, size) 75 | 76 | def forward(self, x_input, mask, cache=None): 77 | """Compute encoded features. 
78 | 79 | :param torch.Tensor x_input: encoded source features (batch, max_time_in, size) 80 | :param torch.Tensor mask: mask for x (batch, max_time_in) 81 | :param torch.Tensor cache: cache for x (batch, max_time_in - 1, size) 82 | :rtype: Tuple[torch.Tensor, torch.Tensor] 83 | """ 84 | if isinstance(x_input, tuple): 85 | x, pos_emb = x_input[0], x_input[1] 86 | else: 87 | x, pos_emb = x_input, None 88 | 89 | # whether to use macaron style 90 | if self.macaron_style: 91 | residual = x 92 | if self.normalize_before: 93 | x = self.norm_ff_macaron(x) 94 | x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x)) 95 | if not self.normalize_before: 96 | x = self.norm_ff_macaron(x) 97 | 98 | # multi-headed self-attention module 99 | residual = x 100 | if self.normalize_before: 101 | x = self.norm_mha(x) 102 | 103 | if cache is None: 104 | x_q = x 105 | else: 106 | assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) 107 | x_q = x[:, -1:, :] 108 | residual = residual[:, -1:, :] 109 | mask = None if mask is None else mask[:, -1:, :] 110 | 111 | if pos_emb is not None: 112 | x_att = self.self_attn(x_q, x, x, pos_emb, mask) 113 | else: 114 | x_att = self.self_attn(x_q, x, x, mask) 115 | 116 | if self.concat_after: 117 | x_concat = torch.cat((x, x_att), dim=-1) 118 | x = residual + self.concat_linear(x_concat) 119 | else: 120 | x = residual + self.dropout(x_att) 121 | if not self.normalize_before: 122 | x = self.norm_mha(x) 123 | 124 | # convolution module 125 | if self.conv_module is not None: 126 | residual = x 127 | if self.normalize_before: 128 | x = self.norm_conv(x) 129 | x = residual + self.dropout(self.conv_module(x)) 130 | if not self.normalize_before: 131 | x = self.norm_conv(x) 132 | 133 | # feed forward module 134 | residual = x 135 | if self.normalize_before: 136 | x = self.norm_ff(x) 137 | x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) 138 | if not self.normalize_before: 139 | x = self.norm_ff(x) 140 | 141 | if self.conv_module is not None: 142 | x = self.norm_final(x) 143 | 144 | if cache is not None: 145 | x = torch.cat([cache, x], dim=1) 146 | 147 | if pos_emb is not None: 148 | return (x, pos_emb), mask 149 | else: 150 | return x, mask 151 | -------------------------------------------------------------------------------- /src/nets/backend/e2e_asr_conformer_av.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Transformer speech recognition model (pytorch).""" 5 | 6 | import logging 7 | import numpy 8 | import torch 9 | 10 | from src.nets.backend.ctc import CTC 11 | from src.nets.backend.nets_utils import ( 12 | make_non_pad_mask, 13 | th_accuracy, 14 | ) 15 | from src.nets.backend.transformer.add_sos_eos import add_sos_eos 16 | from src.nets.backend.transformer.decoder import Decoder 17 | from src.nets.backend.transformer.encoder import Encoder 18 | from src.nets.backend.transformer.label_smoothing_loss import LabelSmoothingLoss 19 | from src.nets.backend.transformer.mask import target_mask 20 | from src.nets.backend.nets_utils import MLPHead 21 | 22 | 23 | class E2E(torch.nn.Module): 24 | def __init__(self, args, ignore_id=-1): 25 | torch.nn.Module.__init__(self) 26 | 27 | self.encoder = Encoder( 28 | attention_dim=args.adim, 29 | attention_heads=args.aheads, 30 | linear_units=args.eunits, 31 | num_blocks=args.elayers, 32 | input_layer=args.transformer_input_layer, 33 | 
dropout_rate=args.dropout_rate, 34 | positional_dropout_rate=args.dropout_rate, 35 | attention_dropout_rate=args.transformer_attn_dropout_rate, 36 | encoder_attn_layer_type=args.transformer_encoder_attn_layer_type, 37 | macaron_style=args.macaron_style, 38 | use_cnn_module=args.use_cnn_module, 39 | cnn_module_kernel=args.cnn_module_kernel, 40 | zero_triu=getattr(args, "zero_triu", False), 41 | a_upsample_ratio=args.a_upsample_ratio, 42 | relu_type=getattr(args, "relu_type", "swish"), 43 | ) 44 | 45 | self.aux_encoder = Encoder( 46 | attention_dim=args.aux_adim, 47 | attention_heads=args.aux_aheads, 48 | linear_units=args.aux_eunits, 49 | num_blocks=args.aux_elayers, 50 | input_layer=args.aux_transformer_input_layer, 51 | dropout_rate=args.aux_dropout_rate, 52 | positional_dropout_rate=args.aux_dropout_rate, 53 | attention_dropout_rate=args.aux_transformer_attn_dropout_rate, 54 | encoder_attn_layer_type=args.aux_transformer_encoder_attn_layer_type, 55 | macaron_style=args.aux_macaron_style, 56 | use_cnn_module=args.aux_use_cnn_module, 57 | cnn_module_kernel=args.aux_cnn_module_kernel, 58 | zero_triu=getattr(args, "aux_zero_triu", False), 59 | a_upsample_ratio=args.aux_a_upsample_ratio, 60 | relu_type=getattr(args, "aux_relu_type", "swish"), 61 | ) 62 | 63 | self.transformer_input_layer = args.transformer_input_layer 64 | self.a_upsample_ratio = args.a_upsample_ratio 65 | 66 | self.fusion = MLPHead( 67 | idim=args.adim + args.aux_adim, 68 | hdim=args.fusion_hdim, 69 | odim=args.adim, 70 | norm=args.fusion_norm, 71 | ) 72 | 73 | self.proj_decoder = None 74 | if args.adim != args.ddim: 75 | self.proj_decoder = torch.nn.Linear(args.adim, args.ddim) 76 | 77 | if args.mtlalpha < 1: 78 | self.decoder = Decoder( 79 | odim=args.odim, 80 | attention_dim=args.ddim, 81 | attention_heads=args.dheads, 82 | linear_units=args.dunits, 83 | num_blocks=args.dlayers, 84 | dropout_rate=args.dropout_rate, 85 | positional_dropout_rate=args.dropout_rate, 86 | self_attention_dropout_rate=args.transformer_attn_dropout_rate, 87 | src_attention_dropout_rate=args.transformer_attn_dropout_rate, 88 | ) 89 | else: 90 | self.decoder = None 91 | self.blank = 0 92 | self.sos = args.odim - 1 93 | self.eos = args.odim - 1 94 | self.odim = args.odim 95 | self.ignore_id = ignore_id 96 | 97 | # self.lsm_weight = a 98 | self.criterion = LabelSmoothingLoss( 99 | self.odim, 100 | self.ignore_id, 101 | args.lsm_weight, 102 | args.transformer_length_normalized_loss, 103 | ) 104 | 105 | self.adim = args.adim 106 | self.mtlalpha = args.mtlalpha 107 | if args.mtlalpha > 0.0: 108 | self.ctc = CTC( 109 | args.odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True 110 | ) 111 | else: 112 | self.ctc = None 113 | 114 | def forward(self, video, audio, video_lengths, audio_lengths, label): 115 | video_padding_mask = make_non_pad_mask(video_lengths).to(video.device).unsqueeze(-2) 116 | video_feat, _ = self.encoder(video, video_padding_mask) 117 | 118 | audio_lengths = torch.div(audio_lengths, 640, rounding_mode="trunc") 119 | audio_padding_mask = make_non_pad_mask(audio_lengths).to(video.device).unsqueeze(-2) 120 | 121 | audio_feat, _ = self.aux_encoder(audio, audio_padding_mask) 122 | 123 | x = self.fusion(torch.cat((video_feat, audio_feat), dim=-1)) 124 | 125 | # ctc loss 126 | loss_ctc, ys_hat = self.ctc(x, video_lengths, label) 127 | 128 | if self.proj_decoder: 129 | x = self.proj_decoder(x) 130 | 131 | # decoder loss 132 | ys_in_pad, ys_out_pad = add_sos_eos(label, self.sos, self.eos, self.ignore_id) 133 | ys_mask = 
target_mask(ys_in_pad, self.ignore_id) 134 | pred_pad, _ = self.decoder(ys_in_pad, ys_mask, x, video_padding_mask) 135 | loss_att = self.criterion(pred_pad, ys_out_pad) 136 | loss = self.mtlalpha * loss_ctc + (1 - self.mtlalpha) * loss_att 137 | 138 | acc = th_accuracy( 139 | pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id 140 | ) 141 | 142 | return loss, loss_ctc, loss_att, acc 143 | -------------------------------------------------------------------------------- /script/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)))) 3 | import jiwer 4 | import webvtt 5 | import json 6 | from src.cluster.conv_spks import ( 7 | get_clustering_f1_score, 8 | get_speaker_clustering_f1_score 9 | ) 10 | from src.tokenizer.norm_text import remove_disfluencies 11 | import glob 12 | from transformers.models.whisper.english_normalizer import EnglishTextNormalizer 13 | text_normalizer = EnglishTextNormalizer({}) 14 | 15 | def evaluate_conversation_clustering(label_path, output_path): 16 | with open(os.path.join(label_path, "speaker_to_cluster.json"), "r") as f: 17 | label_data = json.load(f) 18 | with open(os.path.join(output_path, "speaker_to_cluster.json"), "r") as f: 19 | output_data = json.load(f) 20 | return get_clustering_f1_score(label_data, output_data) 21 | 22 | def evaluate_speaker_clustering(label_path, output_path): 23 | with open(os.path.join(label_path, "speaker_to_cluster.json"), "r") as f: 24 | label_data = json.load(f) 25 | with open(os.path.join(output_path, "speaker_to_cluster.json"), "r") as f: 26 | output_data = json.load(f) 27 | return get_speaker_clustering_f1_score(label_data, output_data) 28 | 29 | 30 | def benchmark_vtt_wer(ref_vtt, hypo_vtt, ref_uem_start, ref_uem_end, hypo_uem_start, hypo_uem_end, show_diff=False): 31 | ref_strings = [] 32 | hypo_strings = [] 33 | for caption in webvtt.read(ref_vtt): 34 | if caption.start_in_seconds + caption.start_time.milliseconds/1000 < ref_uem_start: 35 | continue 36 | if caption.end_in_seconds + caption.end_time.milliseconds/1000 > ref_uem_end: 37 | continue 38 | ref_strings.append(remove_disfluencies(text_normalizer(caption.text))) 39 | for caption in webvtt.read(hypo_vtt): 40 | if caption.start_in_seconds + caption.start_time.milliseconds/1000 < hypo_uem_start: 41 | continue 42 | if caption.end_in_seconds + caption.end_time.milliseconds/1000 > hypo_uem_end: 43 | continue 44 | hypo_strings.append(remove_disfluencies(text_normalizer(caption.text))) 45 | 46 | if show_diff: 47 | # Show the WER error type (insertion, deletion, substitution) using wer library 48 | out = jiwer.process_words( 49 | [" ".join(ref_strings)], 50 | [" ".join(hypo_strings)], 51 | ) 52 | print(jiwer.visualize_alignment(out)) 53 | 54 | return jiwer.wer(" ".join(ref_strings), " ".join(hypo_strings)) 55 | 56 | def evaluate_speaker_transcripts(label_path, output_path, speaker_list, speaker_uem_start, speaker_uem_end): 57 | speaker_to_wer = {} 58 | for speaker, uem_start, uem_end in zip(speaker_list, speaker_uem_start, speaker_uem_end): 59 | ref_vtt = os.path.join(label_path, f"{speaker}.vtt") 60 | hypo_vtt = os.path.join(output_path, f"{speaker}.vtt") 61 | wer_score = benchmark_vtt_wer(ref_vtt, hypo_vtt, uem_start, uem_end, uem_start, uem_end) 62 | speaker_to_wer[speaker] = round(wer_score, 4) 63 | return speaker_to_wer 64 | 65 | def main(): 66 | import argparse 67 | parser = argparse.ArgumentParser(description="Evaluate 
speaker clustering and transcripts from video") 68 | parser.add_argument('--session_dir', type=str, required=True, help='Path to folder containing session data') 69 | parser.add_argument('--output_dir_name', type=str, default='output', help='Name of the output directory within each session (default: output)') 70 | parser.add_argument('--label_dir_name', type=str, default='labels', help='Name of the label directory within each session (default: labels)') 71 | opt = parser.parse_args() 72 | 73 | if opt.session_dir.strip().endswith("*"): 74 | all_session_dirs = glob.glob(opt.session_dir) 75 | else: 76 | all_session_dirs = [opt.session_dir] 77 | print(f"Evaluating {len(all_session_dirs)} sessions") 78 | 79 | all_conversation_clustering_f1_score = [] 80 | all_speaker_wer = [] 81 | all_cluster_speaker_wer = [] 82 | 83 | for session_dir in all_session_dirs: 84 | print(f"Evaluating session {session_dir.split('/')[-1]}") 85 | label_path = os.path.join(session_dir, opt.label_dir_name) 86 | output_path = os.path.join(session_dir, opt.output_dir_name) 87 | assert os.path.exists(label_path), f"Label path {label_path} does not exist" 88 | assert os.path.exists(output_path), f"Output path {output_path} does not exist" 89 | 90 | with open(os.path.join(session_dir, "metadata.json"), "r") as f: 91 | metadata = json.load(f) 92 | speaker_list = list(metadata.keys()) 93 | speaker_uem_start = [metadata[spk]['central']["uem"]["start"] for spk in speaker_list] 94 | speaker_uem_end = [metadata[spk]['central']["uem"]["end"] for spk in speaker_list] 95 | 96 | conversation_clustering_f1_score = evaluate_conversation_clustering(label_path, output_path) 97 | print(f"Conversation clustering F1 score: {conversation_clustering_f1_score}") 98 | all_conversation_clustering_f1_score.append(conversation_clustering_f1_score) 99 | 100 | 101 | speaker_to_wer = evaluate_speaker_transcripts(label_path, output_path, speaker_list, speaker_uem_start, speaker_uem_end) 102 | print(f"Speaker to WER: {speaker_to_wer}") 103 | all_speaker_wer.extend(list(speaker_to_wer.values())) 104 | 105 | speaker_clustering_f1_score = evaluate_speaker_clustering(label_path, output_path) 106 | print(f"Speaker clustering F1 score: {speaker_clustering_f1_score}") 107 | 108 | cluster_speaker_to_wer = {} 109 | for speaker, wer in speaker_to_wer.items(): 110 | cluster_speaker_wer = 0.5 * wer + 0.5 * (1 - speaker_clustering_f1_score[speaker]) 111 | cluster_speaker_to_wer[speaker] = cluster_speaker_wer 112 | print(f"Joint ASR-Clustering Error Rate: {cluster_speaker_to_wer}") 113 | all_cluster_speaker_wer.extend(list(cluster_speaker_to_wer.values())) 114 | 115 | print(f"Average Conversation Clustering F1 score: {sum(all_conversation_clustering_f1_score) / len(all_conversation_clustering_f1_score)}") 116 | print(f"Average Speaker WER: {sum(all_speaker_wer) / len(all_speaker_wer)}") 117 | print(f"Average Joint ASR-Clustering Error Rate: {sum(all_cluster_speaker_wer) / len(all_cluster_speaker_wer)}") 118 | 119 | if __name__ == "__main__": 120 | main() -------------------------------------------------------------------------------- /docs/baseline.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Baseline System 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 4 6 | --- 7 | 8 | # Baseline System 9 | 10 | ## Overview 11 | 12 | The baseline system for CHiME-9 Task 1 addresses the challenging problem of **Multi-Modal Context-aware Recognition (MCoRec)** in single-room environments with multiple 
concurrent conversations. The system processes 360° video and audio recordings to both transcribe each speaker's speech and identify which speakers belong to the same conversation. 13 | 14 | ## Task Requirements 15 | 16 | The baseline system provides an initial framework for addressing two interconnected challenges: 17 | 18 | 1. **Individual Speaker Transcription**: Generate time-aligned transcripts (`.vtt` files) for each target speaker within specified evaluation intervals 19 | 2. **Conversation Clustering**: Group participants into their respective conversations by generating speaker-to-cluster mappings 20 | 21 | **Input**: Single 360° video with audio, plus bounding boxes identifying target participants 22 | **Output**: Per-speaker transcriptions and conversation cluster assignments 23 | 24 | ## Baseline System Architecture 25 | 26 | The baseline system is provided at [Github](https://github.com/MCoRec/mcorec_baseline). Please refer to the README therein for information about how to install and run the system. 27 | 28 | ![](images/mcorec-baseline.png) 29 | 30 | ### Core Components 31 | 32 | #### 1. Active Speaker Detection 33 | - **Purpose**: Determines which speaker is actively speaking at any given moment 34 | - **Baseline Model**: [A Light Weight Model for Active Speaker Detection](https://github.com/Junhua-Liao/Light-ASD) 35 | - **Input**: Face crop video and audio extracted from the 360° video 36 | - **Output**: Active speaker detection scores for each frame of the corresponding track video. These scores are used to determine when a speaker is talking, allowing the audio-visual speech recognition system to run only during the speaking segments. 37 | 38 | #### 2. Face Landmark Detection and Mouth Cropping 39 | - **Purpose**: Extracts mouth region from face crop videos. 40 | - **Models**: 41 | - Face detector based on [RetinaFace](https://arxiv.org/pdf/1905.00641) 42 | - 2D facial landmark detector based on [FAN (Face Alignment Network)](https://openaccess.thecvf.com/content_ICCV_2017/papers/Bulat_How_Far_Are_ICCV_2017_paper.pdf) 43 | - **Input**: Face crop video and audio extracted from the 360° video 44 | - **Output**: Mouth crop video with precise mouth region extraction 45 | - **Processing Pipeline**: Referenced from [Auto-AVSR/preparation](https://github.com/mpc001/auto_avsr/tree/main/preparation) repository 46 | 47 | #### 3. Video Segmentation and Chunking 48 | - **Purpose**: Splits long mouth crop videos into smaller segments (≤15 seconds) based on active speaker detection scores for efficient processing 49 | - **Algorithm**: 50 | - **Hysteresis Thresholding**: Uses onset and offset thresholds to identify speech regions from ASD scores 51 | - **Duration Filtering**: Removes short speech segments and fills short gaps between speech regions 52 | - **Segment Splitting**: Divides long continuous speech regions into manageable chunks 53 | - **Input**: Mouth crop video and active speaker detection scores 54 | - **Output**: List of video segments with start/end timestamps 55 | 56 | #### 4. 
Audio-Visual Speech Recognition 57 | - **Purpose**: Combines audio and visual cues to transcribe speech into text for each video segment 58 | - **Input**: Segmented mouth crop videos with corresponding audio streams and timestamps 59 | - **Output**: Time-aligned transcriptions in WebVTT format with start/end timestamps for each segment 60 | - **Baseline Models**: 61 | - **AV-HuBERT CTC/Attention**: [Cocktail-Party Audio-Visual Speech Recognition](https://arxiv.org/abs/2506.02178) 62 | - **Auto-AVSR**: [Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels](https://arxiv.org/abs/2303.14307) 63 | - **Muavic-EN**: [MuAViC: A Multilingual Audio-Visual Corpus for Robust Speech Recognition and Robust Speech-to-Text Translation](https://arxiv.org/abs/2303.00628) 64 | 65 | 66 | #### 5. Time-based Conversation Clustering 67 | - **Purpose**: Groups speakers into their respective conversations based on temporal speaking patterns and overlap analysis 68 | - **Input**: Active speaker detection scores 69 | - **Processing Workflow**: 70 | - **Speaker Activity Extraction**: Uses ASD scores from the Active Speaker Detection component to identify time segments where each speaker is actively talking 71 | - **Conversation Score Calculation**: For each pair of speakers, computes interaction scores based on temporal overlap patterns: 72 | - Simultaneous speech (overlap) → indicates different conversations 73 | - Sequential speech (non-overlap) → indicates same conversation 74 | - Score formula: `1 - (overlap_duration / total_duration)` 75 | - **Distance Matrix Construction**: Converts conversation scores to distances for clustering: 76 | - Higher scores (less overlap) = smaller distances = higher probability of same conversation 77 | - Lower scores (more overlap) = larger distances = lower probability of same conversation 78 | - **Agglomerative Clustering**: Hierarchically groups speakers using the distance matrix (see the illustrative sketches below) 79 | - **Output**: Speaker-to-cluster mapping in JSON format (`speaker_to_cluster.json`) 80 | 81 | This baseline establishes a reference implementation that participants can build upon with more sophisticated approaches to better handle the challenging multi-conversation scenarios, which feature high speech overlap and complex acoustic environments. 82 | 83 | ## Results 84 | 85 | The results for the baseline systems on the dev subset are as follows: 86 | 87 | | System | AVSR Model | MCoRec finetuned | Conversation Clustering | Conversation Clustering F1 Score | Speaker WER | Joint ASR-Clustering Error Rate | 88 | |--------|------------|------------------|------------------------|-----------------------------------|-------------|----------------------------------| 89 | | BL1 | [AV-HuBERT CTC/Attention](https://arxiv.org/abs/2506.02178) | No | Time-based | 0.8153 | 0.5536 | 0.3821 | 90 | | BL2 | [Muavic-EN](https://arxiv.org/abs/2303.00628) | No | Time-based | 0.8153 | 0.7180 | 0.4643 | 91 | | BL3 | [Auto-AVSR](https://arxiv.org/abs/2303.14307) | No | Time-based | 0.8153 | 0.8315 | 0.5211 | 92 | | BL4 | [AV-HuBERT CTC/Attention](https://arxiv.org/abs/2506.02178) | Yes | Time-based | 0.8153 | 0.4990 | 0.3548 | 93 | 94 | For detailed implementation and inference instructions, please refer to the baseline repository on [GitHub](https://github.com/MCoRec/mcorec_baseline).
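
## Illustrative Sketches

The two sketches below are editorial illustrations of the segmentation step (component 3) and the time-based clustering step (component 5) described above. They are written against the descriptions on this page and are not the baseline implementation itself; the actual code in the repository (e.g., `src/talking_detector/segmentation.py` and the clustering utilities under `src/cluster/`) may differ, and all function names, thresholds, and frame-rate values here are illustrative assumptions.

### From ASD scores to speech segments

A minimal sketch of the hysteresis thresholding, gap filling, duration filtering, and chunk splitting, assuming one ASD score per video frame and a fixed frame rate:

```python
import numpy as np


def asd_scores_to_segments(
    scores,            # per-frame ASD scores for one speaker
    fps=25.0,          # assumed video frame rate
    onset=0.5,         # hysteresis: enter a speech region when the score rises above this
    offset=0.3,        # hysteresis: leave the region when the score falls below this
    min_speech=0.5,    # drop speech regions shorter than this (seconds)
    min_gap=0.4,       # fill gaps between regions shorter than this (seconds)
    max_chunk=15.0,    # split long regions into chunks of at most this length (seconds)
):
    """Turn frame-level ASD scores into (start, end) segments in seconds."""
    scores = np.asarray(scores, dtype=float)

    # 1) Hysteresis thresholding: two thresholds avoid flickering on/off around one value.
    regions, active, start = [], False, 0
    for i, s in enumerate(scores):
        if not active and s >= onset:
            active, start = True, i
        elif active and s < offset:
            regions.append((start / fps, i / fps))
            active = False
    if active:
        regions.append((start / fps, len(scores) / fps))

    # 2) Fill short gaps between consecutive speech regions.
    merged = []
    for seg in regions:
        if merged and seg[0] - merged[-1][1] < min_gap:
            merged[-1] = (merged[-1][0], seg[1])
        else:
            merged.append(seg)

    # 3) Drop very short regions, then split long ones into chunks of <= max_chunk seconds.
    chunks = []
    for start_t, end_t in merged:
        if end_t - start_t < min_speech:
            continue
        t = start_t
        while t < end_t:
            chunks.append((t, min(t + max_chunk, end_t)))
            t += max_chunk
    return chunks
```

The returned `(start, end)` pairs are the per-speaker speaking segments: they select which portions of the mouth-crop video are passed to the AVSR model, and they are also the activity intervals reused by the clustering sketch below.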
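### From speaking segments to conversation clusters

A minimal sketch of the pairwise-score, distance-matrix, and agglomerative-clustering workflow. It follows the `1 - (overlap_duration / total_duration)` score above, with `total_duration` taken here as the combined speaking time of the pair; that normalization and the `distance_threshold` value are assumptions rather than the baseline's settings. The sketch uses scikit-learn's `AgglomerativeClustering` (the `metric=` keyword requires scikit-learn ≥ 1.2; older releases call it `affinity=`).

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering


def conversation_score(segs_a, segs_b):
    """Score for one speaker pair: 1 - overlap_duration / total_duration.

    segs_a / segs_b are lists of (start, end) speaking segments in seconds.
    Little simultaneous speech -> high score -> likely the same conversation.
    """
    overlap = sum(
        max(0.0, min(ea, eb) - max(sa, sb))
        for sa, ea in segs_a
        for sb, eb in segs_b
    )
    total = sum(e - s for s, e in segs_a) + sum(e - s for s, e in segs_b)
    return 1.0 - overlap / total if total > 0 else 0.0


def cluster_speakers(speaker_segments, distance_threshold=0.5):
    """Group speakers into conversations from their speaking segments.

    speaker_segments maps speaker id -> list of (start, end) segments.
    Returns a speaker -> cluster id mapping, in the spirit of speaker_to_cluster.json.
    """
    speakers = list(speaker_segments)
    n = len(speakers)

    # Distance = 1 - score: more overlap -> larger distance -> different conversations.
    dist = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            d = 1.0 - conversation_score(
                speaker_segments[speakers[i]], speaker_segments[speakers[j]]
            )
            dist[i, j] = dist[j, i] = d

    labels = AgglomerativeClustering(
        n_clusters=None,              # let the threshold decide the number of conversations
        metric="precomputed",         # we pass the distance matrix directly
        linkage="average",
        distance_threshold=distance_threshold,
    ).fit_predict(dist)
    return {spk: int(label) for spk, label in zip(speakers, labels)}
```

With `distance_threshold=0.5`, two speakers end up in the same conversation only when less than roughly half of their combined speaking time overlaps; raising the threshold merges clusters more aggressively, lowering it splits them. The resulting mapping has the same shape as the `speaker_to_cluster.json` file scored by `script/evaluate.py`.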
-------------------------------------------------------------------------------- /src/nets/backend/backbones/modules/shufflenetv2.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import pdb 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Variable 10 | from torch.nn import init 11 | 12 | 13 | def conv_bn(inp, oup, stride): 14 | return nn.Sequential( 15 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 16 | nn.BatchNorm2d(oup), 17 | nn.ReLU(inplace=True), 18 | ) 19 | 20 | 21 | def conv_1x1_bn(inp, oup): 22 | return nn.Sequential( 23 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 24 | nn.BatchNorm2d(oup), 25 | nn.ReLU(inplace=True), 26 | ) 27 | 28 | 29 | def channel_shuffle(x, groups): 30 | batchsize, num_channels, height, width = x.data.size() 31 | 32 | channels_per_group = num_channels // groups 33 | 34 | # reshape 35 | x = x.view(batchsize, groups, channels_per_group, height, width) 36 | 37 | x = torch.transpose(x, 1, 2).contiguous() 38 | 39 | # flatten 40 | x = x.view(batchsize, -1, height, width) 41 | 42 | return x 43 | 44 | 45 | class InvertedResidual(nn.Module): 46 | def __init__(self, inp, oup, stride, benchmodel): 47 | super(InvertedResidual, self).__init__() 48 | self.benchmodel = benchmodel 49 | self.stride = stride 50 | assert stride in [1, 2] 51 | 52 | oup_inc = oup // 2 53 | 54 | if self.benchmodel == 1: 55 | # assert inp == oup_inc 56 | self.banch2 = nn.Sequential( 57 | # pw 58 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 59 | nn.BatchNorm2d(oup_inc), 60 | nn.ReLU(inplace=True), 61 | # dw 62 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), 63 | nn.BatchNorm2d(oup_inc), 64 | # pw-linear 65 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 66 | nn.BatchNorm2d(oup_inc), 67 | nn.ReLU(inplace=True), 68 | ) 69 | else: 70 | self.banch1 = nn.Sequential( 71 | # dw 72 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 73 | nn.BatchNorm2d(inp), 74 | # pw-linear 75 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 76 | nn.BatchNorm2d(oup_inc), 77 | nn.ReLU(inplace=True), 78 | ) 79 | 80 | self.banch2 = nn.Sequential( 81 | # pw 82 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 83 | nn.BatchNorm2d(oup_inc), 84 | nn.ReLU(inplace=True), 85 | # dw 86 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), 87 | nn.BatchNorm2d(oup_inc), 88 | # pw-linear 89 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 90 | nn.BatchNorm2d(oup_inc), 91 | nn.ReLU(inplace=True), 92 | ) 93 | 94 | @staticmethod 95 | def _concat(x, out): 96 | # concatenate along channel axis 97 | return torch.cat((x, out), 1) 98 | 99 | def forward(self, x): 100 | if 1 == self.benchmodel: 101 | x1 = x[:, : (x.shape[1] // 2), :, :] 102 | x2 = x[:, (x.shape[1] // 2) :, :, :] 103 | out = self._concat(x1, self.banch2(x2)) 104 | elif 2 == self.benchmodel: 105 | out = self._concat(self.banch1(x), self.banch2(x)) 106 | 107 | return channel_shuffle(out, 2) 108 | 109 | 110 | class ShuffleNetV2(nn.Module): 111 | def __init__(self, n_class=1000, input_size=224, width_mult=2.0): 112 | super(ShuffleNetV2, self).__init__() 113 | 114 | assert input_size % 32 == 0, "Input size needs to be divisible by 32" 115 | 116 | self.stage_repeats = [4, 8, 4] 117 | # index 0 is invalid and should never be called. 118 | # only used for indexing convenience. 
119 | if width_mult == 0.5: 120 | self.stage_out_channels = [-1, 24, 48, 96, 192, 1024] 121 | elif width_mult == 1.0: 122 | self.stage_out_channels = [-1, 24, 116, 232, 464, 1024] 123 | elif width_mult == 1.5: 124 | self.stage_out_channels = [-1, 24, 176, 352, 704, 1024] 125 | elif width_mult == 2.0: 126 | self.stage_out_channels = [-1, 24, 244, 488, 976, 2048] 127 | else: 128 | raise ValueError( 129 | """Width multiplier should be in [0.5, 1.0, 1.5, 2.0]. Current value: {}""".format( 130 | width_mult 131 | ) 132 | ) 133 | 134 | # building first layer 135 | input_channel = self.stage_out_channels[1] 136 | self.conv1 = conv_bn(3, input_channel, 2) 137 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 138 | 139 | self.features = [] 140 | # building inverted residual blocks 141 | for idxstage in range(len(self.stage_repeats)): 142 | numrepeat = self.stage_repeats[idxstage] 143 | output_channel = self.stage_out_channels[idxstage + 2] 144 | for i in range(numrepeat): 145 | if i == 0: 146 | # inp, oup, stride, benchmodel): 147 | self.features.append( 148 | InvertedResidual(input_channel, output_channel, 2, 2) 149 | ) 150 | else: 151 | self.features.append( 152 | InvertedResidual(input_channel, output_channel, 1, 1) 153 | ) 154 | input_channel = output_channel 155 | 156 | # make it nn.Sequential 157 | self.features = nn.Sequential(*self.features) 158 | 159 | # building last several layers 160 | self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1]) 161 | self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size / 32))) 162 | 163 | # building classifier 164 | self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class)) 165 | 166 | def forward(self, x): 167 | x = self.conv1(x) 168 | x = self.maxpool(x) 169 | x = self.features(x) 170 | x = self.conv_last(x) 171 | x = self.globalpool(x) 172 | x = x.view(-1, self.stage_out_channels[-1]) 173 | x = self.classifier(x) 174 | return x 175 | -------------------------------------------------------------------------------- /src/ibug/face_detection/s3fd/s3fd_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | from .utils import Detect, PriorBox 6 | 7 | 8 | class L2Norm(nn.Module): 9 | 10 | def __init__(self, n_channels, scale): 11 | super(L2Norm, self).__init__() 12 | self.n_channels = n_channels 13 | self.gamma = scale or None 14 | self.eps = 1e-10 15 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | init.constant_(self.weight, self.gamma) 20 | 21 | def forward(self, x): 22 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 23 | x = torch.div(x, norm) 24 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 25 | return out 26 | 27 | 28 | class S3FDNet(nn.Module): 29 | 30 | def __init__(self, config, device='cuda'): 31 | super(S3FDNet, self).__init__() 32 | self.config = config 33 | self.device = device 34 | 35 | self.vgg = nn.ModuleList([ 36 | nn.Conv2d(3, 64, 3, 1, padding=1), 37 | nn.ReLU(inplace=True), 38 | nn.Conv2d(64, 64, 3, 1, padding=1), 39 | nn.ReLU(inplace=True), 40 | nn.MaxPool2d(2, 2), 41 | 42 | nn.Conv2d(64, 128, 3, 1, padding=1), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(128, 128, 3, 1, padding=1), 45 | nn.ReLU(inplace=True), 46 | nn.MaxPool2d(2, 2), 47 | 48 | nn.Conv2d(128, 256, 3, 1, padding=1), 49 | nn.ReLU(inplace=True), 50 
| nn.Conv2d(256, 256, 3, 1, padding=1), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(256, 256, 3, 1, padding=1), 53 | nn.ReLU(inplace=True), 54 | nn.MaxPool2d(2, 2, ceil_mode=True), 55 | 56 | nn.Conv2d(256, 512, 3, 1, padding=1), 57 | nn.ReLU(inplace=True), 58 | nn.Conv2d(512, 512, 3, 1, padding=1), 59 | nn.ReLU(inplace=True), 60 | nn.Conv2d(512, 512, 3, 1, padding=1), 61 | nn.ReLU(inplace=True), 62 | nn.MaxPool2d(2, 2), 63 | 64 | nn.Conv2d(512, 512, 3, 1, padding=1), 65 | nn.ReLU(inplace=True), 66 | nn.Conv2d(512, 512, 3, 1, padding=1), 67 | nn.ReLU(inplace=True), 68 | nn.Conv2d(512, 512, 3, 1, padding=1), 69 | nn.ReLU(inplace=True), 70 | nn.MaxPool2d(2, 2), 71 | 72 | nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), 73 | nn.ReLU(inplace=True), 74 | nn.Conv2d(1024, 1024, 1, 1), 75 | nn.ReLU(inplace=True), 76 | ]) 77 | 78 | self.L2Norm3_3 = L2Norm(256, 10) 79 | self.L2Norm4_3 = L2Norm(512, 8) 80 | self.L2Norm5_3 = L2Norm(512, 5) 81 | 82 | self.extras = nn.ModuleList([ 83 | nn.Conv2d(1024, 256, 1, 1), 84 | nn.Conv2d(256, 512, 3, 2, padding=1), 85 | nn.Conv2d(512, 128, 1, 1), 86 | nn.Conv2d(128, 256, 3, 2, padding=1), 87 | ]) 88 | 89 | self.loc = nn.ModuleList([ 90 | nn.Conv2d(256, 4, 3, 1, padding=1), 91 | nn.Conv2d(512, 4, 3, 1, padding=1), 92 | nn.Conv2d(512, 4, 3, 1, padding=1), 93 | nn.Conv2d(1024, 4, 3, 1, padding=1), 94 | nn.Conv2d(512, 4, 3, 1, padding=1), 95 | nn.Conv2d(256, 4, 3, 1, padding=1), 96 | ]) 97 | 98 | self.conf = nn.ModuleList([ 99 | nn.Conv2d(256, 4, 3, 1, padding=1), 100 | nn.Conv2d(512, 2, 3, 1, padding=1), 101 | nn.Conv2d(512, 2, 3, 1, padding=1), 102 | nn.Conv2d(1024, 2, 3, 1, padding=1), 103 | nn.Conv2d(512, 2, 3, 1, padding=1), 104 | nn.Conv2d(256, 2, 3, 1, padding=1), 105 | ]) 106 | 107 | self.priors = None 108 | self.previous_size = None 109 | 110 | self.softmax = nn.Softmax(dim=-1) 111 | self.detect = Detect(self.config) 112 | 113 | def forward(self, x): 114 | size = x.size()[2:] 115 | sources = list() 116 | loc = list() 117 | conf = list() 118 | 119 | for k in range(16): 120 | x = self.vgg[k](x) 121 | s = self.L2Norm3_3(x) 122 | sources.append(s) 123 | 124 | for k in range(16, 23): 125 | x = self.vgg[k](x) 126 | s = self.L2Norm4_3(x) 127 | sources.append(s) 128 | 129 | for k in range(23, 30): 130 | x = self.vgg[k](x) 131 | s = self.L2Norm5_3(x) 132 | sources.append(s) 133 | 134 | for k in range(30, len(self.vgg)): 135 | x = self.vgg[k](x) 136 | sources.append(x) 137 | 138 | # apply extra layers and cache source layer outputs 139 | for k, v in enumerate(self.extras): 140 | x = F.relu(v(x), inplace=True) 141 | if k % 2 == 1: 142 | sources.append(x) 143 | 144 | # apply multibox head to source layers 145 | loc_x = self.loc[0](sources[0]) 146 | conf_x = self.conf[0](sources[0]) 147 | 148 | max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) 149 | conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) 150 | 151 | loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) 152 | conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) 153 | 154 | for i in range(1, len(sources)): 155 | x = sources[i] 156 | conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) 157 | loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) 158 | 159 | if self.priors is None or self.previous_size != size: 160 | with torch.no_grad(): 161 | features_maps = [] 162 | for i in range(len(loc)): 163 | feat = [] 164 | feat += [loc[i].size(1), loc[i].size(2)] 165 | features_maps += [feat] 166 | self.priors = PriorBox(size, features_maps, self.config).forward().to(self.device) 167 
| self.previous_size = size 168 | 169 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 170 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 171 | conf = self.softmax(conf.view(conf.size(0), -1, 2)) 172 | 173 | output = self.detect(loc.view(loc.size(0), -1, 4), conf, self.priors) 174 | 175 | return output 176 | -------------------------------------------------------------------------------- /src/nets/scorer_interface.py: -------------------------------------------------------------------------------- 1 | """Scorer interface module.""" 2 | 3 | import warnings 4 | from typing import Any, List, Tuple 5 | 6 | import torch 7 | 8 | 9 | class ScorerInterface: 10 | """Scorer interface for beam search. 11 | 12 | The scorer performs scoring of the all tokens in vocabulary. 13 | 14 | Examples: 15 | * Search heuristics 16 | * :class:`src.nets.scorers.length_bonus.LengthBonus` 17 | * Decoder networks of the sequence-to-sequence models 18 | * :class:`src.nets.backend.nets.transformer.decoder.Decoder` 19 | * :class:`src.nets.backend.nets.rnn.decoders.Decoder` 20 | * Neural language models 21 | * :class:`src.nets.backend.lm.transformer.TransformerLM` 22 | * :class:`src.nets.backend.lm.default.DefaultRNNLM` 23 | * :class:`src.nets.backend.lm.seq_rnn.SequentialRNNLM` 24 | 25 | """ 26 | 27 | def init_state(self, x: torch.Tensor) -> Any: 28 | """Get an initial state for decoding (optional). 29 | 30 | Args: 31 | x (torch.Tensor): The encoded feature tensor 32 | 33 | Returns: initial state 34 | 35 | """ 36 | return None 37 | 38 | def select_state(self, state: Any, i: int, new_id: int = None) -> Any: 39 | """Select state with relative ids in the main beam search. 40 | 41 | Args: 42 | state: Decoder state for prefix tokens 43 | i (int): Index to select a state in the main beam search 44 | new_id (int): New label index to select a state if necessary 45 | 46 | Returns: 47 | state: pruned state 48 | 49 | """ 50 | return None if state is None else state[i] 51 | 52 | def score( 53 | self, y: torch.Tensor, state: Any, x: torch.Tensor 54 | ) -> Tuple[torch.Tensor, Any]: 55 | """Score new token (required). 56 | 57 | Args: 58 | y (torch.Tensor): 1D torch.int64 prefix tokens. 59 | state: Scorer state for prefix tokens 60 | x (torch.Tensor): The encoder feature that generates ys. 61 | 62 | Returns: 63 | tuple[torch.Tensor, Any]: Tuple of 64 | scores for next token that has a shape of `(n_vocab)` 65 | and next state for ys 66 | 67 | """ 68 | raise NotImplementedError 69 | 70 | def final_score(self, state: Any) -> float: 71 | """Score eos (optional). 72 | 73 | Args: 74 | state: Scorer state for prefix tokens 75 | 76 | Returns: 77 | float: final score 78 | 79 | """ 80 | return 0.0 81 | 82 | 83 | class BatchScorerInterface(ScorerInterface): 84 | """Batch scorer interface.""" 85 | 86 | def batch_init_state(self, x: torch.Tensor) -> Any: 87 | """Get an initial state for decoding (optional). 88 | 89 | Args: 90 | x (torch.Tensor): The encoded feature tensor 91 | 92 | Returns: initial state 93 | 94 | """ 95 | return self.init_state(x) 96 | 97 | def batch_score( 98 | self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor 99 | ) -> Tuple[torch.Tensor, List[Any]]: 100 | """Score new token batch (required). 101 | 102 | Args: 103 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 104 | states (List[Any]): Scorer states for prefix tokens. 105 | xs (torch.Tensor): 106 | The encoder feature that generates ys (n_batch, xlen, n_feat). 
107 | 108 | Returns: 109 | tuple[torch.Tensor, List[Any]]: Tuple of 110 | batchfied scores for next token with shape of `(n_batch, n_vocab)` 111 | and next state list for ys. 112 | 113 | """ 114 | warnings.warn( 115 | "{} batch score is implemented through for loop not parallelized".format( 116 | self.__class__.__name__ 117 | ) 118 | ) 119 | scores = list() 120 | outstates = list() 121 | for i, (y, state, x) in enumerate(zip(ys, states, xs)): 122 | score, outstate = self.score(y, state, x) 123 | outstates.append(outstate) 124 | scores.append(score) 125 | scores = torch.cat(scores, 0).view(ys.shape[0], -1) 126 | return scores, outstates 127 | 128 | 129 | class PartialScorerInterface(ScorerInterface): 130 | """Partial scorer interface for beam search. 131 | 132 | The partial scorer performs scoring when non-partial scorer finished scoring, 133 | and receives pre-pruned next tokens to score because it is too heavy to score 134 | all the tokens. 135 | 136 | Examples: 137 | * Prefix search for connectionist-temporal-classification models 138 | * :class:`src.nets.scorers.ctc.CTCPrefixScorer` 139 | 140 | """ 141 | 142 | def score_partial( 143 | self, y: torch.Tensor, next_tokens: torch.Tensor, state: Any, x: torch.Tensor 144 | ) -> Tuple[torch.Tensor, Any]: 145 | """Score new token (required). 146 | 147 | Args: 148 | y (torch.Tensor): 1D prefix token 149 | next_tokens (torch.Tensor): torch.int64 next token to score 150 | state: decoder state for prefix tokens 151 | x (torch.Tensor): The encoder feature that generates ys 152 | 153 | Returns: 154 | tuple[torch.Tensor, Any]: 155 | Tuple of a score tensor for y that has a shape `(len(next_tokens),)` 156 | and next state for ys 157 | 158 | """ 159 | raise NotImplementedError 160 | 161 | 162 | class BatchPartialScorerInterface(BatchScorerInterface, PartialScorerInterface): 163 | """Batch partial scorer interface for beam search.""" 164 | 165 | def batch_score_partial( 166 | self, 167 | ys: torch.Tensor, 168 | next_tokens: torch.Tensor, 169 | states: List[Any], 170 | xs: torch.Tensor, 171 | ) -> Tuple[torch.Tensor, Any]: 172 | """Score new token (required). 173 | 174 | Args: 175 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 176 | next_tokens (torch.Tensor): torch.int64 tokens to score (n_batch, n_token). 177 | states (List[Any]): Scorer states for prefix tokens. 178 | xs (torch.Tensor): 179 | The encoder feature that generates ys (n_batch, xlen, n_feat). 
180 | 181 | Returns: 182 | tuple[torch.Tensor, Any]: 183 | Tuple of a score tensor for ys that has a shape `(n_batch, n_vocab)` 184 | and next states for ys 185 | """ 186 | raise NotImplementedError 187 | -------------------------------------------------------------------------------- /src/avhubert_muavic/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | import math 4 | import torch.nn as nn 5 | from collections import OrderedDict 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def conv3x3(in_planes, out_planes, stride=1): 11 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 12 | padding=1, bias=False) 13 | 14 | 15 | def downsample_basic_block( inplanes, outplanes, stride ): 16 | return nn.Sequential( 17 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False), 18 | nn.BatchNorm2d(outplanes), 19 | ) 20 | 21 | def downsample_basic_block_v2( inplanes, outplanes, stride ): 22 | return nn.Sequential( 23 | nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), 24 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False), 25 | nn.BatchNorm2d(outplanes), 26 | ) 27 | 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type = 'relu' ): 34 | super(BasicBlock, self).__init__() 35 | 36 | assert relu_type in ['relu','prelu'] 37 | 38 | self.conv1 = conv3x3(inplanes, planes, stride) 39 | self.bn1 = nn.BatchNorm2d(planes) 40 | 41 | if relu_type == 'relu': 42 | self.relu1 = nn.ReLU(inplace=True) 43 | self.relu2 = nn.ReLU(inplace=True) 44 | elif relu_type == 'prelu': 45 | self.relu1 = nn.PReLU(num_parameters=planes) 46 | self.relu2 = nn.PReLU(num_parameters=planes) 47 | else: 48 | raise Exception('relu type not implemented') 49 | 50 | self.conv2 = conv3x3(planes, planes) 51 | self.bn2 = nn.BatchNorm2d(planes) 52 | 53 | self.downsample = downsample 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | residual = x 58 | out = self.conv1(x) 59 | out = self.bn1(out) 60 | out = self.relu1(out) 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | if self.downsample is not None: 64 | residual = self.downsample(x) 65 | 66 | out += residual 67 | out = self.relu2(out) 68 | 69 | return out 70 | 71 | 72 | class ResNet(nn.Module): 73 | 74 | def __init__(self, block, layers, num_classes=1000, relu_type = 'relu', gamma_zero = False, avg_pool_downsample = False): 75 | self.inplanes = 64 76 | self.relu_type = relu_type 77 | self.gamma_zero = gamma_zero 78 | self.downsample_block = downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block 79 | 80 | super(ResNet, self).__init__() 81 | self.layer1 = self._make_layer(block, 64, layers[0]) 82 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 83 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 84 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 85 | self.avgpool = nn.AdaptiveAvgPool2d(1) 86 | 87 | for m in self.modules(): 88 | if isinstance(m, nn.Conv2d): 89 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 90 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 91 | elif isinstance(m, nn.BatchNorm2d): 92 | m.weight.data.fill_(1) 93 | m.bias.data.zero_() 94 | 95 | if self.gamma_zero: 96 | for m in self.modules(): 97 | if isinstance(m, BasicBlock ): 98 | m.bn2.weight.data.zero_() 99 | 100 | def _make_layer(self, block, planes, blocks, stride=1): 101 | 102 | 103 | downsample = None 104 | if stride != 1 or self.inplanes != planes * block.expansion: 105 | downsample = self.downsample_block( inplanes = self.inplanes, 106 | outplanes = planes * block.expansion, 107 | stride = stride ) 108 | 109 | layers = [] 110 | layers.append(block(self.inplanes, planes, stride, downsample, relu_type = self.relu_type)) 111 | self.inplanes = planes * block.expansion 112 | for i in range(1, blocks): 113 | layers.append(block(self.inplanes, planes, relu_type = self.relu_type)) 114 | 115 | return nn.Sequential(*layers) 116 | 117 | def forward(self, x): 118 | x = self.layer1(x) 119 | x = self.layer2(x) 120 | x = self.layer3(x) 121 | x = self.layer4(x) 122 | x = self.avgpool(x) 123 | x = x.view(x.size(0), -1) 124 | return x 125 | 126 | class ResEncoder(nn.Module): 127 | def __init__(self, relu_type, weights): 128 | super(ResEncoder, self).__init__() 129 | self.frontend_nout = 64 130 | self.backend_out = 512 131 | frontend_relu = nn.PReLU(num_parameters=self.frontend_nout) if relu_type == 'prelu' else nn.ReLU() 132 | self.frontend3D = nn.Sequential( 133 | nn.Conv3d(1, self.frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False), 134 | nn.BatchNorm3d(self.frontend_nout), 135 | frontend_relu, 136 | nn.MaxPool3d( kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))) 137 | self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type) 138 | if weights is not None: 139 | logger.info(f"Load {weights} for resnet") 140 | std = torch.load(weights, map_location=torch.device('cpu'))['model_state_dict'] 141 | frontend_std, trunk_std = OrderedDict(), OrderedDict() 142 | for key, val in std.items(): 143 | new_key = '.'.join(key.split('.')[1:]) 144 | if 'frontend3D' in key: 145 | frontend_std[new_key] = val 146 | if 'trunk' in key: 147 | trunk_std[new_key] = val 148 | self.frontend3D.load_state_dict(frontend_std) 149 | self.trunk.load_state_dict(trunk_std) 150 | 151 | def forward(self, x): 152 | B, C, T, H, W = x.size() 153 | x = self.frontend3D(x) 154 | Tnew = x.shape[2] 155 | x = self.threeD_to_2D_tensor(x) 156 | x = self.trunk(x) 157 | x = x.view(B, Tnew, x.size(1)) 158 | x = x.transpose(1, 2).contiguous() 159 | return x 160 | 161 | def threeD_to_2D_tensor(self, x): 162 | n_batch, n_channels, s_time, sx, sy = x.shape 163 | x = x.transpose(1, 2).contiguous() 164 | return x.reshape(n_batch*s_time, n_channels, sx, sy) 165 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | import math 4 | import torch.nn as nn 5 | from collections import OrderedDict 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def conv3x3(in_planes, out_planes, stride=1): 11 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 12 | padding=1, bias=False) 13 | 14 | 15 | def downsample_basic_block( inplanes, outplanes, stride ): 16 | return nn.Sequential( 17 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False), 18 | nn.BatchNorm2d(outplanes), 19 | ) 20 | 21 | def downsample_basic_block_v2( inplanes, outplanes, stride ): 22 | return 
nn.Sequential( 23 | nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), 24 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False), 25 | nn.BatchNorm2d(outplanes), 26 | ) 27 | 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type = 'relu' ): 34 | super(BasicBlock, self).__init__() 35 | 36 | assert relu_type in ['relu','prelu'] 37 | 38 | self.conv1 = conv3x3(inplanes, planes, stride) 39 | self.bn1 = nn.BatchNorm2d(planes) 40 | 41 | if relu_type == 'relu': 42 | self.relu1 = nn.ReLU(inplace=True) 43 | self.relu2 = nn.ReLU(inplace=True) 44 | elif relu_type == 'prelu': 45 | self.relu1 = nn.PReLU(num_parameters=planes) 46 | self.relu2 = nn.PReLU(num_parameters=planes) 47 | else: 48 | raise Exception('relu type not implemented') 49 | 50 | self.conv2 = conv3x3(planes, planes) 51 | self.bn2 = nn.BatchNorm2d(planes) 52 | 53 | self.downsample = downsample 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | residual = x 58 | out = self.conv1(x) 59 | out = self.bn1(out) 60 | out = self.relu1(out) 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | if self.downsample is not None: 64 | residual = self.downsample(x) 65 | 66 | out += residual 67 | out = self.relu2(out) 68 | 69 | return out 70 | 71 | 72 | class ResNet(nn.Module): 73 | 74 | def __init__(self, block, layers, num_classes=1000, relu_type = 'relu', gamma_zero = False, avg_pool_downsample = False): 75 | self.inplanes = 64 76 | self.relu_type = relu_type 77 | self.gamma_zero = gamma_zero 78 | self.downsample_block = downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block 79 | 80 | super(ResNet, self).__init__() 81 | self.layer1 = self._make_layer(block, 64, layers[0]) 82 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 83 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 84 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 85 | self.avgpool = nn.AdaptiveAvgPool2d(1) 86 | 87 | for m in self.modules(): 88 | if isinstance(m, nn.Conv2d): 89 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 90 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 91 | elif isinstance(m, nn.BatchNorm2d): 92 | m.weight.data.fill_(1) 93 | m.bias.data.zero_() 94 | 95 | if self.gamma_zero: 96 | for m in self.modules(): 97 | if isinstance(m, BasicBlock ): 98 | m.bn2.weight.data.zero_() 99 | 100 | def _make_layer(self, block, planes, blocks, stride=1): 101 | 102 | 103 | downsample = None 104 | if stride != 1 or self.inplanes != planes * block.expansion: 105 | downsample = self.downsample_block( inplanes = self.inplanes, 106 | outplanes = planes * block.expansion, 107 | stride = stride ) 108 | 109 | layers = [] 110 | layers.append(block(self.inplanes, planes, stride, downsample, relu_type = self.relu_type)) 111 | self.inplanes = planes * block.expansion 112 | for i in range(1, blocks): 113 | layers.append(block(self.inplanes, planes, relu_type = self.relu_type)) 114 | 115 | return nn.Sequential(*layers) 116 | 117 | def forward(self, x): 118 | x = self.layer1(x) 119 | x = self.layer2(x) 120 | x = self.layer3(x) 121 | x = self.layer4(x) 122 | x = self.avgpool(x) 123 | x = x.view(x.size(0), -1) 124 | return x 125 | 126 | class ResEncoder(nn.Module): 127 | def __init__(self, relu_type, weights): 128 | super(ResEncoder, self).__init__() 129 | self.frontend_nout = 64 130 | self.backend_out = 512 131 | frontend_relu = nn.PReLU(num_parameters=self.frontend_nout) if relu_type == 'prelu' else nn.ReLU() 132 | self.frontend3D = nn.Sequential( 133 | nn.Conv3d(1, self.frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False), 134 | nn.BatchNorm3d(self.frontend_nout), 135 | frontend_relu, 136 | nn.MaxPool3d( kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))) 137 | self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type) 138 | if weights is not None: 139 | logger.info(f"Load {weights} for resnet") 140 | std = torch.load(weights, map_location=torch.device('cpu'))['model_state_dict'] 141 | frontend_std, trunk_std = OrderedDict(), OrderedDict() 142 | for key, val in std.items(): 143 | new_key = '.'.join(key.split('.')[1:]) 144 | if 'frontend3D' in key: 145 | frontend_std[new_key] = val 146 | if 'trunk' in key: 147 | trunk_std[new_key] = val 148 | self.frontend3D.load_state_dict(frontend_std) 149 | self.trunk.load_state_dict(trunk_std) 150 | 151 | def forward(self, x): 152 | B, C, T, H, W = x.size() 153 | x = self.frontend3D(x) 154 | Tnew = x.shape[2] 155 | x = self.threeD_to_2D_tensor(x) 156 | x = self.trunk(x) 157 | x = x.view(B, Tnew, x.size(1)) 158 | x = x.transpose(1, 2).contiguous() 159 | return x 160 | 161 | def threeD_to_2D_tensor(self, x): 162 | n_batch, n_channels, s_time, sx, sy = x.shape 163 | x = x.transpose(1, 2).contiguous() 164 | return x.reshape(n_batch*s_time, n_channels, sx, sy) 165 | --------------------------------------------------------------------------------