├── src ├── nets │ ├── scorers │ │ ├── __init__.py │ │ ├── length_bonus.py │ │ └── ctc.py │ ├── backend │ │ ├── backbones │ │ │ ├── __init__.py │ │ │ ├── conv1d_extractor.py │ │ │ ├── conv3d_extractor.py │ │ │ ├── modules │ │ │ │ ├── resnet.py │ │ │ │ └── shufflenetv2.py │ │ │ └── resnet.py │ │ ├── transformer │ │ │ ├── __init__.py │ │ │ ├── repeat.py │ │ │ ├── layer_norm.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── add_sos_eos.py │ │ │ ├── mask.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── convolution.py │ │ │ ├── decoder_layer.py │ │ │ └── encoder_layer.py │ │ ├── e2e_asr_conformer.py │ │ └── e2e_asr_conformer_av.py │ └── scorer_interface.py ├── ibug │ ├── face_alignment │ │ ├── fan │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── utils.py │ └── face_detection │ │ ├── s3fd │ │ ├── __init__.py │ │ ├── s3fd_predictor.py │ │ └── s3fd_net.py │ │ ├── retina_face │ │ ├── __init__.py │ │ ├── config.py │ │ ├── py_cpu_nms.py │ │ ├── prior_box.py │ │ ├── retina_face.py │ │ ├── retina_face_net.py │ │ └── retina_face_predictor.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── data │ │ │ └── bfm_lms.npy │ │ ├── head_pose_estimator.py │ │ └── simple_face_tracker.py │ │ └── __init__.py ├── retinaface │ ├── 20words_mean_face.npy │ ├── detector.py │ └── utils.py ├── tokenizer │ ├── spm │ │ ├── unigram │ │ │ └── unigram5000.model │ │ ├── spm_train.py │ │ ├── train.sh │ │ └── spm_encode.py │ └── spm_tokenizer.py ├── dataset │ └── transforms.py ├── talking_detector │ ├── Classifier.py │ ├── loss.py │ ├── Model.py │ ├── segmentation.py │ └── ASD.py ├── avhubert_muavic │ ├── avhubert2text.py │ ├── av_transformer_decoder.py │ ├── av2text_config.py │ └── resnet.py ├── auto_asr │ ├── configuration_asr.py │ └── asr_model.py ├── auto_vsr │ ├── configuration_vsr.py │ └── vsr_model.py ├── auto_avsr │ ├── avsr_model.py │ └── configuration_avsr.py ├── avhubert_avsr │ └── avhubert_avsr_model.py ├── cluster │ └── eval.py └── custom_trainer.py ├── docs ├── images │ ├── align.png │ ├── setting.png │ ├── origin_360.png │ ├── padding_360.png │ ├── central_view.png │ ├── face_linking.png │ ├── anno_transcript.png │ ├── mcorec-baseline.png │ └── mcorec_overview.png ├── organizers.md ├── data_preparation.md ├── submission.md ├── index.md └── baseline.md ├── requirements.txt ├── script ├── lip_crop.py ├── asd.py └── evaluate.py └── .gitignore /src/nets/scorers/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize sub package.""" 2 | -------------------------------------------------------------------------------- /src/ibug/face_alignment/fan/__init__.py: -------------------------------------------------------------------------------- 1 | from .fan_predictor import FANPredictor 2 | -------------------------------------------------------------------------------- /src/ibug/face_detection/s3fd/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd_predictor import S3FDPredictor 2 | 
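Editor's note: the package `__init__.py` files above expose `RetinaFacePredictor`, `S3FDPredictor`, and `FANPredictor`, which the lip-cropping front end consumes. As a rough orientation, the minimal sketch below shows how they are typically wired together, following `src/retinaface/detector.py` and `script/lip_crop.py` further down in this listing. The clip name is a placeholder, and `VideoProcess` comes from `src/retinaface/video_process.py`, which is imported by `script/lip_crop.py` but not reproduced in this dump.

```python
# Minimal lip-cropping sketch (assumptions: "speaker1.mp4" is a placeholder clip,
# and VideoProcess is provided by src/retinaface/video_process.py).
import torch
import torchvision

from src.retinaface.detector import LandmarksDetector
from src.retinaface.video_process import VideoProcess

landmarks_detector = LandmarksDetector(device="cuda:0")   # RetinaFace detection + FAN landmarks
video_process = VideoProcess(convert_gray=False)          # warps/crops the mouth region per frame

video = torchvision.io.read_video("speaker1.mp4")[0].numpy()   # T x H x W x C frames
landmarks = landmarks_detector(video)                          # one landmark set per frame (or None)
lip_video = torch.tensor(video_process(video, landmarks))      # lip-cropped clip fed to the VSR/AVSR models
```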
-------------------------------------------------------------------------------- /docs/images/align.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/align.png -------------------------------------------------------------------------------- /docs/images/setting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/setting.png -------------------------------------------------------------------------------- /src/ibug/face_alignment/__init__.py: -------------------------------------------------------------------------------- 1 | from .fan import FANPredictor 2 | 3 | 4 | __version__ = '0.1.0' 5 | -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/__init__.py: -------------------------------------------------------------------------------- 1 | from .retina_face_predictor import RetinaFacePredictor 2 | -------------------------------------------------------------------------------- /docs/images/origin_360.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/origin_360.png -------------------------------------------------------------------------------- /docs/images/padding_360.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/padding_360.png -------------------------------------------------------------------------------- /docs/images/central_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/central_view.png -------------------------------------------------------------------------------- /docs/images/face_linking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/face_linking.png -------------------------------------------------------------------------------- /docs/images/anno_transcript.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/anno_transcript.png -------------------------------------------------------------------------------- /docs/images/mcorec-baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/mcorec-baseline.png -------------------------------------------------------------------------------- /docs/images/mcorec_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/docs/images/mcorec_overview.png -------------------------------------------------------------------------------- /src/retinaface/20words_mean_face.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/src/retinaface/20words_mean_face.npy -------------------------------------------------------------------------------- /src/tokenizer/spm/unigram/unigram5000.model: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/src/tokenizer/spm/unigram/unigram5000.model -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .head_pose_estimator import HeadPoseEstimator 2 | from .simple_face_tracker import SimpleFaceTracker 3 | -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/data/bfm_lms.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MCoRec/mcorec_baseline/HEAD/src/ibug/face_detection/utils/data/bfm_lms.npy -------------------------------------------------------------------------------- /src/ibug/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd import S3FDPredictor 2 | from .retina_face import RetinaFacePredictor 3 | 4 | 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /docs/organizers.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Organizers 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 7 6 | --- 7 | 8 | * **Alexander Waibel**, CMU, USA & KIT, DE 9 | * **Christian Fuegen**, Meta, UK 10 | * **Shinji Watanabe**, CMU, USA 11 | * **Katerina Zmolikova**, Meta, UK 12 | * **Thai-Binh Nguyen**, KIT, DE 13 | * **Pingchuan Ma**, Meta, USA 14 | -------------------------------------------------------------------------------- /src/tokenizer/spm/spm_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 
4 | # 5 | # This source code is licensed under the license found in the 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE 7 | import sys 8 | 9 | import sentencepiece as spm 10 | 11 | 12 | if __name__ == "__main__": 13 | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.7.1 2 | torchvision==0.22.1 3 | torchaudio==2.7.1 4 | torchcodec==0.4.0 5 | sentencepiece==0.2.0 6 | transformers==4.52.4 7 | accelerate==1.7.0 8 | datasets==3.6.0 9 | six==1.17.0 10 | torch-summary==1.4.5 11 | av==12.0.0 12 | jiwer==3.0.5 13 | wandb==0.19.1 14 | python_speech_features==0.6 15 | opencv-python==4.10.0.84 16 | scikit-image==0.25.2 17 | ffmpeg-python==0.2.0 18 | scikit-learn==1.6.1 19 | webvtt-py -------------------------------------------------------------------------------- /src/dataset/transforms.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | import torchvision 3 | import torch 4 | 5 | def load_video(path): 6 | """ 7 | rtype: torch, T x C x H x W 8 | """ 9 | vid = torchvision.io.read_video(path, pts_unit="sec", output_format="THWC")[0] 10 | vid = vid.permute((0, 3, 1, 2)) 11 | return vid 12 | 13 | 14 | def load_audio(path): 15 | """ 16 | rtype: torch, T x 1 17 | """ 18 | waveform, sample_rate = torchaudio.load(path[:-4] + ".wav", normalize=True) 19 | return waveform.transpose(1, 0) 20 | -------------------------------------------------------------------------------- /src/tokenizer/spm/train.sh: -------------------------------------------------------------------------------- 1 | nbpe=5000 2 | bpemode=unigram 3 | mkdir -p ${bpemode} 4 | dict=${bpemode}/${bpemode}${nbpe}_units.txt 5 | bpemodel=${bpemode}/${bpemode}${nbpe} 6 | echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC 7 | python spm_train.py --input=input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 8 | python spm_encode.py --model=${bpemodel}.model --output_format=piece < input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} 9 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/repeat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Repeat the same layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class MultiSequential(torch.nn.Sequential): 13 | """Multi-input multi-output torch.nn.Sequential.""" 14 | 15 | def forward(self, *args): 16 | """Repeat.""" 17 | for m in self: 18 | args = m(*args) 19 | return args 20 | 21 | 22 | def repeat(N, fn): 23 | """Repeat module N times. 
24 | 25 | :param int N: repeat time 26 | :param function fn: function to generate module 27 | :return: repeated modules 28 | :rtype: MultiSequential 29 | """ 30 | return MultiSequential(*[fn() for _ in range(N)]) 31 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/layer_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Layer normalization module.""" 8 | 9 | import torch 10 | 11 | 12 | class LayerNorm(torch.nn.LayerNorm): 13 | """Layer normalization module. 14 | 15 | :param int nout: output dim size 16 | :param int dim: dimension to be normalized 17 | """ 18 | 19 | def __init__(self, nout, dim=-1): 20 | """Construct an LayerNorm object.""" 21 | super(LayerNorm, self).__init__(nout, eps=1e-12) 22 | self.dim = dim 23 | 24 | def forward(self, x): 25 | """Apply layer normalization. 26 | 27 | :param torch.Tensor x: input tensor 28 | :return: layer normalized tensor 29 | :rtype torch.Tensor 30 | """ 31 | if self.dim == -1: 32 | return super(LayerNorm, self).forward(x) 33 | return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) 34 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Positionwise feed forward layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class PositionwiseFeedForward(torch.nn.Module): 13 | """Positionwise feed forward layer. 
14 | 15 | :param int idim: input dimenstion 16 | :param int hidden_units: number of hidden units 17 | :param float dropout_rate: dropout rate 18 | 19 | """ 20 | 21 | def __init__(self, idim, hidden_units, dropout_rate): 22 | """Construct an PositionwiseFeedForward object.""" 23 | super(PositionwiseFeedForward, self).__init__() 24 | self.w_1 = torch.nn.Linear(idim, hidden_units) 25 | self.w_2 = torch.nn.Linear(hidden_units, idim) 26 | self.dropout = torch.nn.Dropout(dropout_rate) 27 | 28 | def forward(self, x): 29 | """Forward funciton.""" 30 | return self.w_2(self.dropout(torch.relu(self.w_1(x)))) 31 | -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | cfg_mnet = { 4 | 'name': 'mobilenet0.25', 5 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 6 | 'steps': [8, 16, 32], 7 | 'variance': [0.1, 0.2], 8 | 'clip': False, 9 | 'loc_weight': 2.0, 10 | 'gpu_train': True, 11 | 'batch_size': 32, 12 | 'ngpu': 1, 13 | 'epoch': 250, 14 | 'decay1': 190, 15 | 'decay2': 220, 16 | 'image_size': 640, 17 | 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, 18 | 'in_channel': 32, 19 | 'out_channel': 64 20 | } 21 | 22 | cfg_re50 = { 23 | 'name': 'Resnet50', 24 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 25 | 'steps': [8, 16, 32], 26 | 'variance': [0.1, 0.2], 27 | 'clip': False, 28 | 'loc_weight': 2.0, 29 | 'gpu_train': True, 30 | 'batch_size': 24, 31 | 'ngpu': 4, 32 | 'epoch': 100, 33 | 'decay1': 70, 34 | 'decay2': 90, 35 | 'image_size': 840, 36 | 'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3}, 37 | 'in_channel': 256, 38 | 'out_channel': 256 39 | } 40 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/add_sos_eos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Unility funcitons for Transformer.""" 8 | 9 | import torch 10 | 11 | 12 | def add_sos_eos(ys_pad, sos, eos, ignore_id): 13 | """Add and labels. 14 | 15 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 16 | :param int sos: index of 17 | :param int eos: index of 18 | :param int ignore_id: index of padding 19 | :return: padded tensor (B, Lmax) 20 | :rtype: torch.Tensor 21 | :return: padded tensor (B, Lmax) 22 | :rtype: torch.Tensor 23 | """ 24 | from src.nets.backend.nets_utils import pad_list 25 | 26 | _sos = ys_pad.new([sos]) 27 | _eos = ys_pad.new([eos]) 28 | ys = [y[y != ignore_id] for y in ys_pad] # parse padded ys 29 | ys_in = [torch.cat([_sos, y], dim=0) for y in ys] 30 | ys_out = [torch.cat([y, _eos], dim=0) for y in ys] 31 | return pad_list(ys_in, eos), pad_list(ys_out, ignore_id) 32 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/conv1d_extractor.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | import torch 7 | from src.nets.backend.backbones.modules.resnet1d import ( 8 | BasicBlock1D, 9 | ResNet1D, 10 | ) 11 | 12 | 13 | class Conv1dResNet(torch.nn.Module): 14 | def __init__(self, relu_type="swish", a_upsample_ratio=1): 15 | super().__init__() 16 | self.a_upsample_ratio = a_upsample_ratio 17 | self.trunk = ResNet1D( 18 | BasicBlock1D, 19 | [2, 2, 2, 2], 20 | relu_type=relu_type, 21 | a_upsample_ratio=a_upsample_ratio, 22 | ) 23 | 24 | def forward(self, xs_pad): 25 | """forward. 26 | 27 | :param xs_pad: torch.Tensor, batch of padded input sequences (B, Tmax, idim) 28 | """ 29 | B, T, C = xs_pad.size() 30 | xs_pad = xs_pad[:, : T // 640 * 640, :] 31 | xs_pad = xs_pad.transpose(1, 2) 32 | xs_pad = self.trunk(xs_pad) 33 | return xs_pad.transpose(1, 2) 34 | -------------------------------------------------------------------------------- /src/talking_detector/Classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class BGRU(nn.Module): 6 | def __init__(self, channel): 7 | super(BGRU, self).__init__() 8 | 9 | self.gru_forward = nn.GRU(input_size = channel, hidden_size = channel, num_layers = 1, bidirectional = False, bias = True, batch_first = True) 10 | self.gru_backward = nn.GRU(input_size = channel, hidden_size = channel, num_layers = 1, bidirectional = False, bias = True, batch_first = True) 11 | 12 | self.gelu = nn.GELU() 13 | self.__init_weight() 14 | 15 | def forward(self, x): 16 | x, _ = self.gru_forward(x) 17 | x = self.gelu(x) 18 | x = torch.flip(x, dims=[1]) 19 | x, _ = self.gru_backward(x) 20 | x = torch.flip(x, dims=[1]) 21 | x = self.gelu(x) 22 | 23 | return x 24 | 25 | def __init_weight(self): 26 | for m in self.modules(): 27 | if isinstance(m, nn.GRU): 28 | torch.nn.init.kaiming_normal_(m.weight_ih_l0) 29 | torch.nn.init.kaiming_normal_(m.weight_hh_l0) 30 | m.bias_ih_l0.data.zero_() 31 | m.bias_hh_l0.data.zero_() -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | 11 | def py_cpu_nms(dets, thresh, top_k): 12 | """Pure Python NMS baseline.""" 13 | x1 = dets[:, 0] 14 | y1 = dets[:, 1] 15 | x2 = dets[:, 2] 16 | y2 = dets[:, 3] 17 | scores = dets[:, 4] 18 | 19 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 20 | order = scores.argsort()[: -top_k - 1: -1] 21 | 22 | keep = [] 23 | while order.size > 0: 24 | i = order[0] 25 | keep.append(i) 26 | xx1 = np.maximum(x1[i], x1[order[1:]]) 27 | yy1 = np.maximum(y1[i], y1[order[1:]]) 28 | xx2 = np.minimum(x2[i], x2[order[1:]]) 29 | yy2 = np.minimum(y2[i], y2[order[1:]]) 30 | 31 | w = np.maximum(0.0, xx2 - xx1 + 1) 32 | h = np.maximum(0.0, yy2 - yy1 + 1) 33 | inter = w * h 34 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 35 | 36 | inds = np.where(ovr <= thresh)[0] 37 | order = order[inds + 1] 38 | 39 | return keep 40 | -------------------------------------------------------------------------------- 
/src/talking_detector/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class lossAV(nn.Module): 6 | def __init__(self): 7 | super(lossAV, self).__init__() 8 | self.criterion = nn.BCELoss() 9 | self.FC = nn.Linear(128, 2) 10 | 11 | def forward(self, x, labels = None, r = 1): 12 | x = x.squeeze(1) 13 | x = self.FC(x) 14 | if labels == None: 15 | predScore = x[:,1] 16 | predScore = predScore.t() 17 | predScore = predScore.view(-1).detach().cpu().numpy() 18 | return predScore 19 | else: 20 | x1 = x / r 21 | x1 = F.softmax(x1, dim = -1)[:,1] 22 | nloss = self.criterion(x1, labels.float()) 23 | predScore = F.softmax(x, dim = -1) 24 | predLabel = torch.round(F.softmax(x, dim = -1))[:,1] 25 | correctNum = (predLabel == labels).sum().float() 26 | return nloss, predScore, predLabel, correctNum 27 | 28 | 29 | class lossV(nn.Module): 30 | def __init__(self): 31 | super(lossV, self).__init__() 32 | self.criterion = nn.BCELoss() 33 | self.FC = nn.Linear(128, 2) 34 | 35 | def forward(self, x, labels, r = 1): 36 | x = x.squeeze(1) 37 | x = self.FC(x) 38 | 39 | x = x / r 40 | x = F.softmax(x, dim = -1) 41 | 42 | nloss = self.criterion(x[:,1], labels.float()) 43 | return nloss -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from itertools import product as product 3 | from math import ceil 4 | 5 | 6 | class PriorBox(object): 7 | def __init__(self, cfg, image_size=None): 8 | super(PriorBox, self).__init__() 9 | self.min_sizes = cfg['min_sizes'] 10 | self.steps = cfg['steps'] 11 | self.clip = cfg['clip'] 12 | self.image_size = image_size 13 | self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] 14 | self.name = "s" 15 | 16 | def forward(self): 17 | anchors = [] 18 | for k, f in enumerate(self.feature_maps): 19 | min_sizes = self.min_sizes[k] 20 | for i, j in product(range(f[0]), range(f[1])): 21 | for min_size in min_sizes: 22 | s_kx = min_size / self.image_size[1] 23 | s_ky = min_size / self.image_size[0] 24 | dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] 25 | dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] 26 | for cy, cx in product(dense_cy, dense_cx): 27 | anchors += [cx, cy, s_kx, s_ky] 28 | 29 | # back to torch land 30 | output = torch.Tensor(anchors).view(-1, 4) 31 | if self.clip: 32 | output.clamp_(max=1, min=0) 33 | return output 34 | -------------------------------------------------------------------------------- /src/talking_detector/Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .Classifier import BGRU 5 | from .Encoder import visual_encoder, audio_encoder 6 | 7 | class ASD_Model(nn.Module): 8 | def __init__(self): 9 | super(ASD_Model, self).__init__() 10 | 11 | self.visualEncoder = visual_encoder() 12 | self.audioEncoder = audio_encoder() 13 | self.GRU = BGRU(128) 14 | 15 | def forward_visual_frontend(self, x): 16 | B, T, W, H = x.shape 17 | x = x.view(B, 1, T, W, H) 18 | x = (x / 255 - 0.4161) / 0.1688 19 | x = self.visualEncoder(x) 20 | return x 21 | 22 | def forward_audio_frontend(self, x): 23 | x = x.unsqueeze(1).transpose(2, 3) 24 | x = self.audioEncoder(x) 25 | return x 26 | 27 | def 
forward_audio_visual_backend(self, x1, x2): 28 | x = x1 + x2 29 | x = self.GRU(x) 30 | x = torch.reshape(x, (-1, 128)) 31 | return x 32 | 33 | def forward_visual_backend(self,x): 34 | x = torch.reshape(x, (-1, 128)) 35 | return x 36 | 37 | def forward(self, audioFeature, visualFeature): 38 | audioEmbed = self.forward_audio_frontend(audioFeature) 39 | visualEmbed = self.forward_visual_frontend(visualFeature) 40 | outsAV = self.forward_audio_visual_backend(audioEmbed, visualEmbed) 41 | outsV = self.forward_visual_backend(visualEmbed) 42 | 43 | return outsAV, outsV -------------------------------------------------------------------------------- /src/retinaface/detector.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | import warnings 8 | 9 | from src.ibug.face_alignment import FANPredictor 10 | from src.ibug.face_detection import RetinaFacePredictor 11 | from tqdm import tqdm 12 | 13 | warnings.filterwarnings("ignore") 14 | 15 | 16 | class LandmarksDetector: 17 | def __init__(self, device="cuda:0", model_name="resnet50"): 18 | self.face_detector = RetinaFacePredictor( 19 | device=device, 20 | threshold=0.8, 21 | model=RetinaFacePredictor.get_model(model_name), 22 | ) 23 | self.landmark_detector = FANPredictor(device=device, model=None) 24 | 25 | def __call__(self, video_frames): 26 | landmarks = [] 27 | for frame in tqdm(video_frames, desc="Detecting landmarks"): 28 | detected_faces = self.face_detector(frame, rgb=False) 29 | face_points, _ = self.landmark_detector(frame, detected_faces, rgb=True) 30 | if len(detected_faces) == 0: 31 | landmarks.append(None) 32 | else: 33 | max_id, max_size = 0, 0 34 | for idx, bbox in enumerate(detected_faces): 35 | bbox_size = (bbox[2] - bbox[0]) + (bbox[3] - bbox[1]) 36 | if bbox_size > max_size: 37 | max_id, max_size = idx, bbox_size 38 | landmarks.append(face_points[max_id]) 39 | return landmarks 40 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/conv3d_extractor.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Imperial College London (Pingchuan Ma) 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | import torch 8 | import torch.nn as nn 9 | from src.nets.backend.backbones.modules.resnet import BasicBlock, ResNet 10 | from src.nets.backend.transformer.convolution import Swish 11 | 12 | 13 | def threeD_to_2D_tensor(x): 14 | n_batch, n_channels, s_time, sx, sy = x.shape 15 | x = x.transpose(1, 2) 16 | return x.reshape(n_batch * s_time, n_channels, sx, sy) 17 | 18 | 19 | class Conv3dResNet(torch.nn.Module): 20 | """Conv3dResNet module""" 21 | 22 | def __init__(self, backbone_type="resnet", relu_type="swish"): 23 | """__init__. 24 | 25 | :param backbone_type: str, the type of a visual front-end. 26 | :param relu_type: str, activation function used in an audio front-end. 
27 | """ 28 | super(Conv3dResNet, self).__init__() 29 | self.frontend_nout = 64 30 | self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type) 31 | self.frontend3D = nn.Sequential( 32 | nn.Conv3d( 33 | 1, self.frontend_nout, (5, 7, 7), (1, 2, 2), (2, 3, 3), bias=False 34 | ), 35 | nn.BatchNorm3d(self.frontend_nout), 36 | Swish(), 37 | nn.MaxPool3d((1, 3, 3), (1, 2, 2), (0, 1, 1)), 38 | ) 39 | 40 | def forward(self, xs_pad): 41 | xs_pad = xs_pad.transpose(1, 2) # [B, T, C, H, W] -> [B, C, T, H, W] 42 | 43 | B, C, T, H, W = xs_pad.size() 44 | xs_pad = self.frontend3D(xs_pad) 45 | Tnew = xs_pad.shape[2] 46 | xs_pad = threeD_to_2D_tensor(xs_pad) 47 | xs_pad = self.trunk(xs_pad) 48 | return xs_pad.view(B, Tnew, xs_pad.size(1)) 49 | -------------------------------------------------------------------------------- /src/tokenizer/spm_tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import sentencepiece 4 | 5 | SP_MODEL_PATH = os.path.join( 6 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 7 | "tokenizer", 8 | "spm", 9 | "unigram", 10 | "unigram5000.model", 11 | ) 12 | 13 | DICT_PATH = os.path.join( 14 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 15 | "tokenizer", 16 | "spm", 17 | "unigram", 18 | "unigram5000_units.txt", 19 | ) 20 | 21 | 22 | class TextTransform: 23 | """Mapping Dictionary Class for SentencePiece tokenization.""" 24 | 25 | def __init__( 26 | self, 27 | sp_model_path=SP_MODEL_PATH, 28 | dict_path=DICT_PATH, 29 | ): 30 | 31 | # Load SentencePiece model 32 | self.spm = sentencepiece.SentencePieceProcessor(model_file=sp_model_path) 33 | 34 | # Load units and create dictionary 35 | units = open(dict_path, encoding='utf8').read().splitlines() 36 | self.hashmap = {unit.split()[0]: unit.split()[-1] for unit in units} 37 | # 0 will be used for "blank" in CTC 38 | self.token_list = [""] + list(self.hashmap.keys()) + [""] 39 | self.ignore_id = -1 40 | 41 | def tokenize(self, text): 42 | tokens = self.spm.EncodeAsPieces(text) 43 | token_ids = [self.hashmap.get(token, self.hashmap[""]) for token in tokens] 44 | return torch.tensor(list(map(int, token_ids))) 45 | 46 | def post_process(self, token_ids): 47 | token_ids = token_ids[token_ids != -1] 48 | text = self._ids_to_str(token_ids, self.token_list) 49 | text = text.replace("\u2581", " ").strip() 50 | return text 51 | 52 | def _ids_to_str(self, token_ids, char_list): 53 | token_as_list = [char_list[idx] for idx in token_ids] 54 | return "".join(token_as_list).replace("", " ") 55 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/mask.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Shigeki Karita 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | """Mask module.""" 7 | 8 | from distutils.version import LooseVersion 9 | 10 | import torch 11 | 12 | is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion("1.2.0") 13 | # LooseVersion('1.2.0') == LooseVersion(torch.__version__) can't include e.g. 1.2.0+aaa 14 | is_torch_1_2 = ( 15 | LooseVersion("1.3") > LooseVersion(torch.__version__) >= LooseVersion("1.2") 16 | ) 17 | datatype = torch.bool if is_torch_1_2_plus else torch.uint8 18 | 19 | 20 | def subsequent_mask(size, device="cpu", dtype=datatype): 21 | """Create mask for subsequent steps (1, size, size). 
22 | 23 | :param int size: size of mask 24 | :param str device: "cpu" or "cuda" or torch.Tensor.device 25 | :param torch.dtype dtype: result dtype 26 | :rtype: torch.Tensor 27 | >>> subsequent_mask(3) 28 | [[1, 0, 0], 29 | [1, 1, 0], 30 | [1, 1, 1]] 31 | """ 32 | if is_torch_1_2 and dtype == torch.bool: 33 | # torch=1.2 doesn't support tril for bool tensor 34 | ret = torch.ones(size, size, device=device, dtype=torch.uint8) 35 | return torch.tril(ret, out=ret).type(dtype) 36 | else: 37 | ret = torch.ones(size, size, device=device, dtype=dtype) 38 | return torch.tril(ret, out=ret) 39 | 40 | 41 | def target_mask(ys_in_pad, ignore_id): 42 | """Create mask for decoder self-attention. 43 | 44 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 45 | :param int ignore_id: index of padding 46 | :param torch.dtype dtype: result dtype 47 | :rtype: torch.Tensor 48 | """ 49 | ys_mask = ys_in_pad != ignore_id 50 | m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0) 51 | return ys_mask.unsqueeze(-2) & m 52 | -------------------------------------------------------------------------------- /src/avhubert_muavic/avhubert2text.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import Speech2TextModel, Speech2TextForConditionalGeneration 3 | from .avhubert import AVHubertModel 4 | from .av_transformer_decoder import AVTransformerDecoder 5 | from .av2text_config import AV2TextConfig 6 | import torch 7 | from typing import Optional 8 | from transformers.generation.utils import Cache 9 | 10 | 11 | class AV2TextModel(Speech2TextModel): 12 | def __init__(self, config): 13 | super().__init__(config) 14 | self.encoder = AVHubertModel(config) 15 | self.decoder = AVTransformerDecoder(config) 16 | 17 | self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False) 18 | self.lm_head.weight = self.decoder.embed_tokens.weight 19 | 20 | class AV2TextForConditionalGeneration(Speech2TextForConditionalGeneration): 21 | config_class = AV2TextConfig 22 | def __init__(self, config): 23 | super().__init__(config) 24 | self.model = AV2TextModel(config) 25 | self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False) 26 | self.lm_head.weight = self.model.decoder.embed_tokens.weight 27 | 28 | def prepare_inputs_for_generation( 29 | self, 30 | input_ids: torch.LongTensor, 31 | past_key_values: Optional[Cache] = None, 32 | attention_mask: Optional[torch.LongTensor] = None, 33 | inputs_embeds: Optional[torch.FloatTensor] = None, 34 | cache_position: Optional[torch.LongTensor] = None, 35 | **kwargs, 36 | ): 37 | model_inputs = super().prepare_inputs_for_generation( 38 | input_ids=input_ids, 39 | past_key_values=past_key_values, 40 | attention_mask=attention_mask, 41 | inputs_embeds=inputs_embeds, 42 | cache_position=cache_position, 43 | **kwargs 44 | ) 45 | del model_inputs["video"] 46 | return model_inputs -------------------------------------------------------------------------------- /src/nets/scorers/length_bonus.py: -------------------------------------------------------------------------------- 1 | """Length bonus module.""" 2 | from typing import Any, List, Tuple 3 | 4 | import torch 5 | 6 | from src.nets.scorer_interface import BatchScorerInterface 7 | 8 | 9 | class LengthBonus(BatchScorerInterface): 10 | """Length bonus in beam search.""" 11 | 12 | def __init__(self, n_vocab: int): 13 | """Initialize class. 
14 | 15 | Args: 16 | n_vocab (int): The number of tokens in vocabulary for beam search 17 | 18 | """ 19 | self.n = n_vocab 20 | 21 | def score(self, y, state, x): 22 | """Score new token. 23 | 24 | Args: 25 | y (torch.Tensor): 1D torch.int64 prefix tokens. 26 | state: Scorer state for prefix tokens 27 | x (torch.Tensor): 2D encoder feature that generates ys. 28 | 29 | Returns: 30 | tuple[torch.Tensor, Any]: Tuple of 31 | torch.float32 scores for next token (n_vocab) 32 | and None 33 | 34 | """ 35 | return torch.tensor([1.0], device=x.device, dtype=x.dtype).expand(self.n), None 36 | 37 | def batch_score( 38 | self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor 39 | ) -> Tuple[torch.Tensor, List[Any]]: 40 | """Score new token batch. 41 | 42 | Args: 43 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 44 | states (List[Any]): Scorer states for prefix tokens. 45 | xs (torch.Tensor): 46 | The encoder feature that generates ys (n_batch, xlen, n_feat). 47 | 48 | Returns: 49 | tuple[torch.Tensor, List[Any]]: Tuple of 50 | batchfied scores for next token with shape of `(n_batch, n_vocab)` 51 | and next state list for ys. 52 | 53 | """ 54 | return ( 55 | torch.tensor([1.0], device=xs.device, dtype=xs.dtype).expand( 56 | ys.shape[0], self.n 57 | ), 58 | None, 59 | ) 60 | -------------------------------------------------------------------------------- /src/auto_asr/configuration_asr.py: -------------------------------------------------------------------------------- 1 | from transformers.configuration_utils import PretrainedConfig 2 | 3 | class AutoASRConfig(PretrainedConfig): 4 | model_type = "auto_asr" 5 | 6 | def __init__( 7 | self, 8 | odim=5049, 9 | adim=768, 10 | aheads=12, 11 | eunits=3072, 12 | elayers=12, 13 | transformer_input_layer="conv1d", 14 | dropout_rate=0.1, 15 | transformer_attn_dropout_rate=0.1, 16 | transformer_encoder_attn_layer_type="rel_mha", 17 | macaron_style=True, 18 | use_cnn_module=True, 19 | cnn_module_kernel=31, 20 | zero_triu=False, 21 | a_upsample_ratio=1, 22 | relu_type="swish", 23 | ddim=768, 24 | dheads=12, 25 | dunits=3072, 26 | dlayers=6, 27 | lsm_weight=0.1, 28 | transformer_length_normalized_loss=False, 29 | mtlalpha=0.1, 30 | ctc_type="builtin", 31 | rel_pos_type="latest", 32 | **kwargs, 33 | ): 34 | super().__init__(**kwargs) 35 | self.odim = odim 36 | self.adim = adim 37 | self.aheads = aheads 38 | self.eunits = eunits 39 | self.elayers = elayers 40 | self.transformer_input_layer = transformer_input_layer 41 | self.dropout_rate = dropout_rate 42 | self.transformer_attn_dropout_rate = transformer_attn_dropout_rate 43 | self.transformer_encoder_attn_layer_type = transformer_encoder_attn_layer_type 44 | self.macaron_style = macaron_style 45 | self.use_cnn_module = use_cnn_module 46 | self.cnn_module_kernel = cnn_module_kernel 47 | self.zero_triu = zero_triu 48 | self.a_upsample_ratio = a_upsample_ratio 49 | self.relu_type = relu_type 50 | self.ddim = ddim 51 | self.dheads = dheads 52 | self.dunits = dunits 53 | self.dlayers = dlayers 54 | self.lsm_weight = lsm_weight 55 | self.transformer_length_normalized_loss = transformer_length_normalized_loss 56 | self.mtlalpha = mtlalpha 57 | self.ctc_type = ctc_type 58 | self.rel_pos_type = rel_pos_type 59 | 60 | -------------------------------------------------------------------------------- /src/auto_vsr/configuration_vsr.py: -------------------------------------------------------------------------------- 1 | from transformers.configuration_utils import PretrainedConfig 2 | 3 | class 
AutoVSRConfig(PretrainedConfig): 4 | model_type = "auto_vsr" 5 | 6 | def __init__( 7 | self, 8 | odim=5049, 9 | adim=768, 10 | aheads=12, 11 | eunits=3072, 12 | elayers=12, 13 | transformer_input_layer="conv3d", 14 | dropout_rate=0.1, 15 | transformer_attn_dropout_rate=0.1, 16 | transformer_encoder_attn_layer_type="rel_mha", 17 | macaron_style=True, 18 | use_cnn_module=True, 19 | cnn_module_kernel=31, 20 | zero_triu=False, 21 | a_upsample_ratio=1, 22 | relu_type="swish", 23 | ddim=768, 24 | dheads=12, 25 | dunits=3072, 26 | dlayers=6, 27 | lsm_weight=0.1, 28 | transformer_length_normalized_loss=False, 29 | mtlalpha=0.1, 30 | ctc_type="builtin", 31 | rel_pos_type="latest", 32 | **kwargs, 33 | ): 34 | super().__init__(**kwargs) 35 | self.odim = odim 36 | self.adim = adim 37 | self.aheads = aheads 38 | self.eunits = eunits 39 | self.elayers = elayers 40 | self.transformer_input_layer = transformer_input_layer 41 | self.dropout_rate = dropout_rate 42 | self.transformer_attn_dropout_rate = transformer_attn_dropout_rate 43 | self.transformer_encoder_attn_layer_type = transformer_encoder_attn_layer_type 44 | self.macaron_style = macaron_style 45 | self.use_cnn_module = use_cnn_module 46 | self.cnn_module_kernel = cnn_module_kernel 47 | self.zero_triu = zero_triu 48 | self.a_upsample_ratio = a_upsample_ratio 49 | self.relu_type = relu_type 50 | self.ddim = ddim 51 | self.dheads = dheads 52 | self.dunits = dunits 53 | self.dlayers = dlayers 54 | self.lsm_weight = lsm_weight 55 | self.transformer_length_normalized_loss = transformer_length_normalized_loss 56 | self.mtlalpha = mtlalpha 57 | self.ctc_type = ctc_type 58 | self.rel_pos_type = rel_pos_type 59 | 60 | -------------------------------------------------------------------------------- /src/auto_asr/asr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.auto_asr.configuration_asr import AutoASRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | 9 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=10): 10 | scorers = { 11 | "decoder": model.decoder, 12 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 13 | "length_bonus": LengthBonus(len(token_list)), 14 | "lm": None 15 | } 16 | 17 | weights = { 18 | "decoder": 1.0 - ctc_weight, 19 | "ctc": ctc_weight, 20 | "lm": 0.0, 21 | "length_bonus": 0.0, 22 | } 23 | 24 | return BatchBeamSearch( 25 | beam_size=beam_size, 26 | vocab_size=len(token_list), 27 | weights=weights, 28 | scorers=scorers, 29 | sos=model.sos, 30 | eos=model.eos, 31 | token_list=token_list, 32 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 33 | ) 34 | 35 | 36 | class AutoASR(PreTrainedModel): 37 | config_class = AutoASRConfig 38 | 39 | def __init__(self, config: AutoASRConfig): 40 | super().__init__(config) 41 | self.asr = E2E(config) 42 | 43 | def forward(self, 44 | video, audio, video_lengths, audio_lengths, label 45 | ): 46 | return self.asr(video, audio, video_lengths, audio_lengths, label) 47 | 48 | def inference(self, sample, text_transform): 49 | 50 | self.beam_search = get_beam_search_decoder(self.asr, text_transform.token_list) 51 | enc_feat, _ = self.asr.encoder(sample.unsqueeze(0).to(self.device), None) 52 | enc_feat = enc_feat.squeeze(0) 53 | 54 | nbest_hyps = 
self.beam_search(enc_feat) 55 | nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 56 | predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 57 | predicted = text_transform.post_process(predicted_token_id).replace("", "") 58 | return predicted -------------------------------------------------------------------------------- /src/nets/backend/transformer/label_smoothing_loss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Label smoothing module.""" 8 | 9 | import torch 10 | from torch import nn 11 | 12 | 13 | class LabelSmoothingLoss(nn.Module): 14 | """Label-smoothing loss. 15 | 16 | :param int size: the number of class 17 | :param int padding_idx: ignored class id 18 | :param float smoothing: smoothing rate (0.0 means the conventional CE) 19 | :param bool normalize_length: normalize loss by sequence length if True 20 | :param torch.nn.Module criterion: loss function to be smoothed 21 | """ 22 | 23 | def __init__( 24 | self, 25 | size, 26 | padding_idx, 27 | smoothing, 28 | normalize_length=False, 29 | criterion=nn.KLDivLoss(reduction="none"), 30 | ): 31 | """Construct an LabelSmoothingLoss object.""" 32 | super(LabelSmoothingLoss, self).__init__() 33 | self.criterion = criterion 34 | self.padding_idx = padding_idx 35 | self.confidence = 1.0 - smoothing 36 | self.smoothing = smoothing 37 | self.size = size 38 | self.true_dist = None 39 | self.normalize_length = normalize_length 40 | 41 | def forward(self, x, target): 42 | """Compute loss between x and target. 43 | 44 | :param torch.Tensor x: prediction (batch, seqlen, class) 45 | :param torch.Tensor target: 46 | target signal masked with self.padding_id (batch, seqlen) 47 | :return: scalar float value 48 | :rtype torch.Tensor 49 | """ 50 | assert x.size(2) == self.size 51 | batch_size = x.size(0) 52 | x = x.view(-1, self.size) 53 | target = target.view(-1) 54 | with torch.no_grad(): 55 | true_dist = x.clone() 56 | true_dist.fill_(self.smoothing / (self.size - 1)) 57 | ignore = target == self.padding_idx # (B,) 58 | total = len(target) - ignore.sum().item() 59 | target = target.masked_fill(ignore, 0) # avoid -1 index 60 | true_dist.scatter_(1, target.unsqueeze(1), self.confidence) 61 | kl = self.criterion(torch.log_softmax(x, dim=1), true_dist) 62 | denom = total if self.normalize_length else batch_size 63 | return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom 64 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | 8 | """ConvolutionModule definition.""" 9 | 10 | import torch 11 | from torch import nn 12 | 13 | 14 | class ConvolutionModule(nn.Module): 15 | """ConvolutionModule in Conformer model. 
16 | 17 | :param int channels: channels of cnn 18 | :param int kernel_size: kernerl size of cnn 19 | 20 | """ 21 | 22 | def __init__(self, channels, kernel_size, bias=True): 23 | """Construct an ConvolutionModule object.""" 24 | super(ConvolutionModule, self).__init__() 25 | # kernerl_size should be a odd number for 'SAME' padding 26 | assert (kernel_size - 1) % 2 == 0 27 | 28 | self.pointwise_cov1 = nn.Conv1d( 29 | channels, 30 | 2 * channels, 31 | kernel_size=1, 32 | stride=1, 33 | padding=0, 34 | bias=bias, 35 | ) 36 | self.depthwise_conv = nn.Conv1d( 37 | channels, 38 | channels, 39 | kernel_size, 40 | stride=1, 41 | padding=(kernel_size - 1) // 2, 42 | groups=channels, 43 | bias=bias, 44 | ) 45 | self.norm = nn.BatchNorm1d(channels) 46 | self.pointwise_cov2 = nn.Conv1d( 47 | channels, 48 | channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0, 52 | bias=bias, 53 | ) 54 | self.activation = Swish() 55 | 56 | def forward(self, x): 57 | """Compute covolution module. 58 | 59 | :param torch.Tensor x: (batch, time, size) 60 | :return torch.Tensor: convoluted `value` (batch, time, d_model) 61 | """ 62 | # exchange the temporal dimension and the feature dimension 63 | x = x.transpose(1, 2) 64 | 65 | # GLU mechanism 66 | x = self.pointwise_cov1(x) # (batch, 2*channel, dim) 67 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 68 | 69 | # 1D Depthwise Conv 70 | x = self.depthwise_conv(x) 71 | x = self.activation(self.norm(x)) 72 | 73 | x = self.pointwise_cov2(x) 74 | 75 | return x.transpose(1, 2) 76 | 77 | 78 | class Swish(nn.Module): 79 | """Construct an Swish object.""" 80 | 81 | def forward(self, x): 82 | """Return Swich activation function.""" 83 | return x * torch.sigmoid(x) 84 | -------------------------------------------------------------------------------- /src/auto_vsr/vsr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.auto_vsr.configuration_vsr import AutoVSRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | from transformers.utils import ModelOutput 9 | from typing import List, Optional, Union 10 | from dataclasses import dataclass 11 | 12 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=3): 13 | scorers = { 14 | "decoder": model.decoder, 15 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 16 | "length_bonus": LengthBonus(len(token_list)), 17 | "lm": None 18 | } 19 | 20 | weights = { 21 | "decoder": 1.0 - ctc_weight, 22 | "ctc": ctc_weight, 23 | "lm": 0.0, 24 | "length_bonus": 0.0, 25 | } 26 | 27 | return BatchBeamSearch( 28 | beam_size=beam_size, 29 | vocab_size=len(token_list), 30 | weights=weights, 31 | scorers=scorers, 32 | sos=model.sos, 33 | eos=model.eos, 34 | token_list=token_list, 35 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 36 | ) 37 | 38 | @dataclass 39 | class AutoVSROutput(ModelOutput): 40 | loss: Optional[torch.FloatTensor] = None 41 | loss_ctc: Optional[torch.FloatTensor] = None 42 | loss_att: Optional[torch.FloatTensor] = None 43 | acc: Optional[torch.FloatTensor] = None 44 | 45 | class AutoVSR(PreTrainedModel): 46 | config_class = AutoVSRConfig 47 | 48 | def __init__(self, config: AutoVSRConfig): 49 | super().__init__(config) 50 | self.vsr = E2E(config) 51 | 52 | def forward(self, 53 | videos, 54 
| audios, 55 | labels, 56 | video_lengths, 57 | audio_lengths, 58 | label_lengths 59 | ): 60 | loss, loss_ctc, loss_att, acc = self.vsr(videos, video_lengths, labels) 61 | return AutoVSROutput( 62 | loss=loss, 63 | loss_ctc=loss_ctc, 64 | loss_att=loss_att, 65 | acc=acc 66 | ) 67 | 68 | def inference(self, sample, text_transform): 69 | 70 | self.beam_search = get_beam_search_decoder(self.vsr, text_transform.token_list) 71 | enc_feat, _ = self.vsr.encoder(sample.unsqueeze(0).to(self.device), None) 72 | enc_feat = enc_feat.squeeze(0) 73 | 74 | nbest_hyps = self.beam_search(enc_feat) 75 | nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 76 | predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 77 | predicted = text_transform.post_process(predicted_token_id).replace("", "") 78 | return predicted -------------------------------------------------------------------------------- /script/lip_crop.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | # os.environ['CUDA_VISIBLE_DEVICES'] = '0' 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | from src.retinaface.detector import LandmarksDetector 5 | from src.retinaface.video_process import VideoProcess 6 | from src.retinaface.utils import save_vid_aud_txt 7 | from tqdm import tqdm 8 | import traceback 9 | import math 10 | 11 | import torch 12 | import torchaudio 13 | import torchvision 14 | import json 15 | import ffmpeg 16 | import glob 17 | 18 | # ==================== LOAD MODEL ==================== 19 | 20 | landmarks_detector = LandmarksDetector(device="cuda:0") 21 | video_process = VideoProcess(convert_gray=False) 22 | 23 | def process_video(video_path, output_dir=None): 24 | try: 25 | # Load and process audio and video 26 | audio, sample_rate = torchaudio.load(video_path, normalize=True) 27 | 28 | video = torchvision.io.read_video(video_path)[0].numpy() 29 | landmarks = landmarks_detector(video) 30 | video = video_process(video, landmarks) 31 | video = torch.tensor(video) 32 | 33 | segment_name = video_path.split("/")[-1].replace(".mp4", "_lip") 34 | if output_dir is None: 35 | output_dir = os.path.dirname(video_path) 36 | os.makedirs(output_dir, exist_ok=True) 37 | 38 | dst_vid_filename = os.path.join(output_dir, f"{segment_name}.mp4") 39 | dst_aud_filename = os.path.join(output_dir, f"{segment_name}.wav") 40 | text_filename = os.path.join(output_dir, f"{segment_name}.json") 41 | save_vid_aud_txt( 42 | dst_vid_filename, 43 | dst_aud_filename, 44 | text_filename, 45 | video, 46 | audio, 47 | json.dumps({ 48 | "path": video_path 49 | }, indent=4), 50 | video_fps=25, 51 | audio_sample_rate=16000, 52 | ) 53 | 54 | # Combine audio and video 55 | in1 = ffmpeg.input(dst_vid_filename) 56 | in2 = ffmpeg.input(dst_aud_filename) 57 | out = ffmpeg.output( 58 | in1["v"], 59 | in2["a"], 60 | dst_vid_filename[:-4] + ".av.mp4", 61 | vcodec="copy", 62 | acodec="aac", 63 | strict="experimental", 64 | loglevel="panic", 65 | ) 66 | out.run(overwrite_output=True) 67 | except Exception as e: 68 | traceback.print_exc() 69 | print(f"Error processing {video_path} segment {segment_frame[0]}-{segment_frame[-1]}") 70 | 71 | def main(): 72 | import argparse 73 | parser = argparse.ArgumentParser(description="Active Speaker Detection") 74 | parser.add_argument('--video', type=str, required=True, help='Path to input video file') 75 | opt = parser.parse_args() 76 | 77 | process_video(opt.video) 78 | 79 | 80 | if __name__ == "__main__": 81 | 
main() -------------------------------------------------------------------------------- /docs/data_preparation.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Data Collection 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 2 6 | --- 7 | 8 | ## Data collection 9 | 10 | ### Topics and Speakers 11 | 12 | - Conversation Topics: Everyday life & hobbies, work & school, hypotheticals, entertainment, news, and personal stories. 13 | 14 | - Participants per Session: 2 to 8 speakers, divided into groups of 2–4 participants. 15 | 16 | ### Recording Devices 17 | 18 | - GoPro Max 360 (4K resolution) 19 | - Smartphones (720p resolution) 20 | - Lapel Microphones (connected via adapter to smartphones) 21 | 22 | ### Layout and Setup 23 | 24 | ![](images/setting.png) 25 | 26 | - Recording Environments: Data was collected across approximately 10 different rooms of varying sizes and types, including living rooms, meeting rooms, lecture halls, and other indoor spaces. 27 | 28 | - Seating Arrangement: All speakers sit around a table; distances vary by table size (around 2 to 5 meters). 29 | 30 | - Smartphone Placement: Each speaker has a smartphone in front (selfie mode) with a lapel mic clipped near the mouth. 31 | 32 | - 360° Capture: A GoPro Max mounted at the center of the table captures all participants. 33 | 34 | - Session Markers: Moderator signals start and end with a distinctive whistle. 35 | 36 | 37 | ## Annotation 38 | 39 | ### Signal Alignment 40 | 41 | To synchronize recordings from multiple devices, we use the moderator’s whistle cue in a two-step process: 42 | 43 | - Manual Annotation: Listen to each audio track and mark the start/end regions containing the whistle. 44 | 45 | ![](images/align.png) 46 | 47 | - Automatic Detection: Compute the spectral-flux onset strength envelope with librosa.onset.onset_strength, then identify the timestamp of the highest peak to pinpoint the exact whistle moment. 48 | 49 | ### Transcription Workflow 50 | 51 | High-quality audio from smartphones and lapel mics is used for transcription 52 | 53 | - Automatic Transcript: Run each clip through the Whisper-large-v2 model. 54 | 55 | - Post-Editing: Annotators use Label Studio to: 56 | - Adjust segment boundaries to isolate the target speaker’s speech. 57 | - Correct transcript text for accuracy. 58 | 59 | ![](images/anno_transcript.png) 60 | 61 | ### 360° Video Processing 62 | 63 | Frame Padding for Horizon Artifacts 64 | 65 | - When a speaker straddles the image boundary, their face can split across the frame 66 | 67 | ![](images/origin_360.png) 68 | 69 | - We resolve this by padding each frame: append 20% of the left edge to the right side, creating a continuous panorama. 70 | 71 | ![](images/padding_360.png) 72 | 73 | 74 | - Face Recognition and Linking: 75 | - Face Detection: Run the padded 360° videos through a face-recognition pipeline to extract face crops. 76 | 77 | - Manual Association: Link each face crop from the 360° feed to the corresponding smartphone video of the same speaker. 
78 | 79 | ![](images/face_linking.png) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/auto_avsr/avsr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer_av import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.auto_avsr.configuration_avsr import AutoAVSRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | from transformers.utils import ModelOutput 9 | from typing import List, Optional, Union 10 | from dataclasses import dataclass 11 | 12 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=3): 13 | scorers = { 14 | "decoder": model.decoder, 15 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 16 | "length_bonus": LengthBonus(len(token_list)), 17 | "lm": None 18 | } 19 | 20 | weights = { 21 | "decoder": 1.0 - ctc_weight, 22 | "ctc": ctc_weight, 23 | "lm": 0.0, 24 | "length_bonus": 0.0, 25 | } 26 | 27 | return BatchBeamSearch( 28 | beam_size=beam_size, 29 | vocab_size=len(token_list), 30 | weights=weights, 31 | scorers=scorers, 32 | sos=model.sos, 33 | eos=model.eos, 34 | token_list=token_list, 35 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 36 | ) 37 | 38 | @dataclass 39 | class AutoAVSROutput(ModelOutput): 40 | loss: Optional[torch.FloatTensor] = None 41 | loss_ctc: Optional[torch.FloatTensor] = None 42 | loss_att: Optional[torch.FloatTensor] = None 43 | acc: Optional[torch.FloatTensor] = None 44 | 45 | class AutoAVSR(PreTrainedModel): 46 | config_class = AutoAVSRConfig 47 | 48 | def __init__(self, config: AutoAVSRConfig): 49 | super().__init__(config) 50 | self.avsr = E2E(config) 51 | 52 | def forward(self, 53 | videos, 54 | audios, 55 | labels, 56 | video_lengths, 57 | audio_lengths, 58 | label_lengths 59 | ): 60 | loss, loss_ctc, loss_att, acc = self.avsr(videos, audios, video_lengths, audio_lengths, labels) 61 | return AutoAVSROutput( 62 | loss=loss, 63 | loss_ctc=loss_ctc, 64 | loss_att=loss_att, 65 | acc=acc 66 | ) 67 | # return self.avsr(videos, audios, video_lengths, audio_lengths, labels) 68 | 69 | 70 | 71 | def inference(self, video, audio, text_transform): 72 | self.beam_search = get_beam_search_decoder(self.avsr, text_transform.token_list) 73 | video_feat, _ = self.avsr.encoder(video.unsqueeze(0).to(self.device), None) 74 | audio_feat, _ = self.avsr.aux_encoder(audio.unsqueeze(0).to(self.device), None) 75 | audiovisual_feat = self.avsr.fusion(torch.cat((video_feat, audio_feat), dim=-1)) 76 | 77 | audiovisual_feat = audiovisual_feat.squeeze(0) 78 | 79 | nbest_hyps = self.beam_search(audiovisual_feat) 80 | nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 81 | predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 82 | predicted = text_transform.post_process(predicted_token_id).replace("", "") 83 | return predicted -------------------------------------------------------------------------------- /src/retinaface/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torchaudio 4 | import torchvision 5 | 6 | 7 | def split_file(filename, max_frames=600, fps=25.0): 8 | 9 | lines = open(filename).read().splitlines() 10 | 11 | flag = 0 12 | stack = [] 13 | res = [] 14 | 15 | tmp = 0 16 | start_timestamp = 0.0 17 | 18 | threshold = 
max_frames / fps 19 | 20 | for line in lines: 21 | if "WORD START END ASDSCORE" in line: 22 | flag = 1 23 | continue 24 | if flag: 25 | word, start, end, score = line.split(" ") 26 | start, end, score = float(start), float(end), float(score) 27 | if end < tmp + threshold: 28 | stack.append(word) 29 | last_timestamp = end 30 | else: 31 | res.append( 32 | [ 33 | " ".join(stack), 34 | start_timestamp, 35 | last_timestamp, 36 | last_timestamp - start_timestamp, 37 | ] 38 | ) 39 | tmp = start 40 | start_timestamp = start 41 | stack = [word] 42 | if stack: 43 | res.append([" ".join(stack), start_timestamp, end, end - start_timestamp]) 44 | return res 45 | 46 | 47 | def save_vid_txt( 48 | dst_vid_filename, dst_txt_filename, trim_video_data, content, video_fps=25 49 | ): 50 | # -- save video 51 | save2vid(dst_vid_filename, trim_video_data, video_fps) 52 | # -- save text 53 | os.makedirs(os.path.dirname(dst_txt_filename), exist_ok=True) 54 | f = open(dst_txt_filename, "w") 55 | f.write(f"{content}") 56 | f.close() 57 | 58 | 59 | def save_vid_aud( 60 | dst_vid_filename, 61 | dst_aud_filename, 62 | trim_vid_data, 63 | trim_aud_data, 64 | video_fps=25, 65 | audio_sample_rate=16000, 66 | ): 67 | # -- save video 68 | save2vid(dst_vid_filename, trim_vid_data, video_fps) 69 | # -- save audio 70 | save2aud(dst_aud_filename, trim_aud_data, audio_sample_rate) 71 | 72 | 73 | def save_vid_aud_txt( 74 | dst_vid_filename, 75 | dst_aud_filename, 76 | dst_txt_filename, 77 | trim_vid_data, 78 | trim_aud_data, 79 | content, 80 | video_fps=25, 81 | audio_sample_rate=16000, 82 | ): 83 | # -- save video 84 | if dst_vid_filename is not None: 85 | save2vid(dst_vid_filename, trim_vid_data, video_fps) 86 | # -- save audio 87 | if dst_aud_filename is not None: 88 | save2aud(dst_aud_filename, trim_aud_data, audio_sample_rate) 89 | # -- save text 90 | os.makedirs(os.path.dirname(dst_txt_filename), exist_ok=True) 91 | f = open(dst_txt_filename, "w") 92 | f.write(f"{content}") 93 | f.close() 94 | 95 | 96 | def save2vid(filename, vid, frames_per_second): 97 | os.makedirs(os.path.dirname(filename), exist_ok=True) 98 | torchvision.io.write_video(filename, vid, frames_per_second) 99 | 100 | 101 | def save2aud(filename, aud, sample_rate): 102 | os.makedirs(os.path.dirname(filename), exist_ok=True) 103 | torchaudio.save(filename, aud, sample_rate) 104 | -------------------------------------------------------------------------------- /src/avhubert_avsr/avhubert_avsr_model.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_avhubert import E2E 2 | from transformers.modeling_utils import PreTrainedModel 3 | from src.avhubert_avsr.configuration_avhubert_avsr import AVHubertAVSRConfig 4 | from src.nets.batch_beam_search import BatchBeamSearch 5 | from src.nets.scorers.length_bonus import LengthBonus 6 | from src.nets.scorers.ctc import CTCPrefixScorer 7 | import torch 8 | from transformers.utils import ModelOutput 9 | from typing import List, Optional, Union 10 | from dataclasses import dataclass 11 | 12 | def get_beam_search_decoder(model, token_list, ctc_weight=0.1, beam_size=3): 13 | scorers = { 14 | "decoder": model.decoder, 15 | "ctc": CTCPrefixScorer(model.ctc, model.eos), 16 | "length_bonus": LengthBonus(len(token_list)), 17 | "lm": None 18 | } 19 | 20 | weights = { 21 | "decoder": 1.0 - ctc_weight, 22 | "ctc": ctc_weight, 23 | "lm": 0.0, 24 | "length_bonus": 0.0, 25 | } 26 | 27 | return BatchBeamSearch( 28 | beam_size=beam_size, 29 | vocab_size=len(token_list), 30 
| weights=weights, 31 | scorers=scorers, 32 | sos=model.sos, 33 | eos=model.eos, 34 | token_list=token_list, 35 | pre_beam_score_key=None if ctc_weight == 1.0 else "decoder", 36 | ) 37 | 38 | @dataclass 39 | class AVHubertAVSROutput(ModelOutput): 40 | loss: Optional[torch.FloatTensor] = None 41 | loss_ctc: Optional[torch.FloatTensor] = None 42 | loss_att: Optional[torch.FloatTensor] = None 43 | acc: Optional[torch.FloatTensor] = None 44 | 45 | class AVHubertAVSR(PreTrainedModel): 46 | config_class = AVHubertAVSRConfig 47 | 48 | def __init__(self, config: AVHubertAVSRConfig): 49 | super().__init__(config) 50 | self.avsr = E2E(config) 51 | 52 | def forward(self, 53 | videos, 54 | audios, 55 | labels, 56 | video_lengths, 57 | audio_lengths, 58 | label_lengths 59 | ): 60 | loss, loss_ctc, loss_att, acc = self.avsr(videos, audios, video_lengths, audio_lengths, labels) 61 | return AVHubertAVSROutput( 62 | loss=loss, 63 | loss_ctc=loss_ctc, 64 | loss_att=loss_att, 65 | acc=acc 66 | ) 67 | # return self.avsr(videos, audios, video_lengths, audio_lengths, labels) 68 | 69 | 70 | 71 | # def inference(self, video, audio, text_transform): 72 | # self.beam_search = get_beam_search_decoder(self.avsr, text_transform.token_list) 73 | # video_feat, _ = self.avsr.encoder(video.unsqueeze(0).to(self.device), None) 74 | # audio_feat, _ = self.avsr.aux_encoder(audio.unsqueeze(0).to(self.device), None) 75 | # audiovisual_feat = self.avsr.fusion(torch.cat((video_feat, audio_feat), dim=-1)) 76 | 77 | # audiovisual_feat = audiovisual_feat.squeeze(0) 78 | 79 | # nbest_hyps = self.beam_search(audiovisual_feat) 80 | # nbest_hyps = [h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), 1)]] 81 | # predicted_token_id = torch.tensor(list(map(int, nbest_hyps[0]["yseq"][1:]))) 82 | # predicted = text_transform.post_process(predicted_token_id).replace("", "") 83 | # return predicted -------------------------------------------------------------------------------- /src/ibug/face_detection/s3fd/s3fd_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from types import SimpleNamespace 5 | from typing import Union, Optional 6 | from .s3fd_net import S3FDNet 7 | 8 | 9 | __all__ = ['S3FDPredictor'] 10 | 11 | 12 | class S3FDPredictor(object): 13 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 14 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 15 | self.threshold = threshold 16 | self.device = device 17 | if model is None: 18 | model = S3FDPredictor.get_model() 19 | if config is None: 20 | config = S3FDPredictor.create_config() 21 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 22 | self.net = S3FDNet(config=self.config, device=self.device).to(self.device) 23 | self.net.load_state_dict(torch.load(model.weights, map_location=self.device)) 24 | self.net.eval() 25 | 26 | @staticmethod 27 | def get_model(name: str = 's3fd') -> SimpleNamespace: 28 | name = name.lower().strip() 29 | if name == 's3fd': 30 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 31 | '..','..','..','..','model-bin','face_detection','s3fd','weights', 's3fd_weights.pth')), 32 | config=SimpleNamespace(num_classes=2, variance=(0.1, 0.2), 33 | prior_min_sizes=(16, 32, 64, 128, 256, 512), 34 | prior_steps=(4, 8, 16, 32, 64, 128), prior_clip=False)) 35 | else: 36 | raise ValueError('name must be set to s3fd') 37 | 38 | 
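    # A minimal usage sketch (the image source is illustrative, not part of this file):
    #   detector = S3FDPredictor(threshold=0.8, device='cuda:0')
    #   frame = cv2.imread('frame.jpg')        # BGR image, so pass rgb=False below
    #   faces = detector(frame, rgb=False)     # (N, 5) array of x_min, y_min, x_max, y_max, score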
@staticmethod 39 | def create_config(top_k: int = 750, conf_thresh: float = 0.05,nms_thresh: float = 0.3, 40 | nms_top_k: int = 5000, use_nms_np: bool = True) -> SimpleNamespace: 41 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, 42 | nms_top_k=nms_top_k, use_nms_np=use_nms_np) 43 | 44 | @torch.no_grad() 45 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 46 | w, h = image.shape[1], image.shape[0] 47 | if not rgb: 48 | image = image[..., ::-1] 49 | image = image.astype(int) - np.array([123, 117, 104]) 50 | image = image.transpose(2, 0, 1) 51 | image = image.reshape((1,) + image.shape) 52 | image = torch.from_numpy(image).float().to(self.device) 53 | 54 | bboxes = [] 55 | detections = self.net(image) 56 | scale = torch.Tensor([w, h, w, h]).to(detections.device) 57 | for i in range(detections.size(1)): 58 | j = 0 59 | while detections[0, i, j, 0] >= self.threshold: 60 | score = detections[0, i, j, 0] 61 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 62 | bbox = (pt[0], pt[1], pt[2], pt[3], score) 63 | bboxes.append(bbox) 64 | j += 1 65 | if len(bboxes) > 0: 66 | return np.array(bboxes) 67 | else: 68 | return np.empty(shape=(0, 5), dtype=np.float32) 69 | -------------------------------------------------------------------------------- /src/ibug/face_alignment/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from typing import Optional, Sequence, Tuple 4 | 5 | 6 | __all__ = ['get_landmark_connectivity', 'plot_landmarks'] 7 | 8 | 9 | def get_landmark_connectivity(num_landmarks: int) -> Optional[Sequence[Tuple[int, int]]]: 10 | if num_landmarks == 68: 11 | return ((0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), 12 | (12, 13), (13, 14), (14, 15), (15, 16), (17, 18), (18, 19), (19, 20), (20, 21), (22, 23), (23, 24), 13 | (24, 25), (25, 26), (27, 28), (28, 29), (29, 30), (30, 33), (31, 32), (32, 33), (33, 34), (34, 35), 14 | (36, 37), (37, 38), (38, 39), (40, 41), (41, 36), (42, 43), (43, 44), (44, 45), (45, 46), (46, 47), 15 | (47, 42), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), 16 | (57, 58), (58, 59), (59, 48), (60, 61), (61, 62), (62, 63), (63, 64), (64, 65), (65, 66), (66, 67), 17 | (67, 60), (39, 40)) 18 | elif num_landmarks == 100: 19 | return ((0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), 20 | (12, 13), (13, 14), (14, 15), (15, 16), (17, 18), (18, 19), (19, 20), (20, 21), (22, 23), (23, 24), 21 | (24, 25), (25, 26), (68, 69), (69, 70), (70, 71), (72, 73), (73, 74), (74, 75), (36, 76), (76, 37), 22 | (37, 77), (77, 38), (38, 78), (78, 39), (39, 40), (40, 79), (79, 41), (41, 36), (42, 80), (80, 43), 23 | (43, 81), (81, 44), (44, 82), (82, 45), (45, 46), (46, 83), (83, 47), (47, 42), (27, 28), (28, 29), 24 | (29, 30), (30, 33), (31, 32), (32, 33), (33, 34), (34, 35), (84, 85), (86, 87), (48, 49), (49, 88), 25 | (88, 50), (50, 51), (51, 52), (52, 89), (89, 53), (53, 54), (54, 55), (55, 90), (90, 56), (56, 57), 26 | (57, 58), (58, 91), (91, 59), (59, 48), (60, 92), (92, 93), (93, 61), (61, 62), (62, 63), (63, 94), 27 | (94, 95), (95, 64), (64, 96), (96, 97), (97, 65), (65, 66), (66, 67), (67, 98), (98, 99), (99, 60), 28 | (17, 68), (21, 71), (22, 72), (26, 75)) 29 | else: 30 | return None 31 | 32 | 33 | def plot_landmarks(image: np.ndarray, landmarks: np.ndarray, landmark_scores: 
Optional[Sequence[float]] = None, 34 | threshold: float = 0.2, line_colour: Tuple[int, int, int] = (0, 255, 0), 35 | pts_colour: Tuple[int, int, int] = (0, 0, 255), line_thickness: int = 1, pts_radius: int = 1, 36 | landmark_connectivity: Optional[Sequence[Tuple[int, int]]] = None) -> None: 37 | num_landmarks = len(landmarks) 38 | if landmark_scores is None: 39 | landmark_scores = np.full((num_landmarks,), threshold + 1.0, dtype=float) 40 | if landmark_connectivity is None: 41 | landmark_connectivity = get_landmark_connectivity(len(landmarks)) 42 | if landmark_connectivity is not None: 43 | for (idx1, idx2) in landmark_connectivity: 44 | if (idx1 < num_landmarks and idx2 < num_landmarks and 45 | landmark_scores[idx1] >= threshold and landmark_scores[idx2] >= threshold): 46 | cv2.line(image, tuple(landmarks[idx1].astype(int).tolist()), 47 | tuple(landmarks[idx2].astype(int).tolist()), 48 | color=line_colour, thickness=line_thickness, lineType=cv2.LINE_AA) 49 | for landmark, score in zip(landmarks, landmark_scores): 50 | if score >= threshold: 51 | cv2.circle(image, tuple(landmark.astype(int).tolist()), pts_radius, pts_colour, -1) 52 | -------------------------------------------------------------------------------- /src/avhubert_muavic/av_transformer_decoder.py: -------------------------------------------------------------------------------- 1 | from transformers.models.speech_to_text.modeling_speech_to_text import ( 2 | Speech2TextAttention, 3 | Speech2TextDecoder, 4 | Speech2TextDecoderLayer, 5 | Speech2TextConfig, 6 | ACT2FN, 7 | SPEECH_TO_TEXT_ATTENTION_CLASSES, 8 | Speech2TextDecoderLayer, 9 | ) 10 | from typing import Optional, List 11 | import torch.nn as nn 12 | from .av2text_config import AV2TextConfig 13 | 14 | class AVTransformerAttention(Speech2TextAttention): 15 | def __init__( 16 | self, 17 | input_dim: int, 18 | embed_dim: int, 19 | num_heads: int, 20 | dropout: float = 0.0, 21 | is_decoder: bool = False, 22 | bias: bool = True, 23 | is_causal: bool = False, 24 | config: Optional[Speech2TextConfig] = None, 25 | ): 26 | super().__init__( 27 | embed_dim, 28 | num_heads, 29 | dropout, 30 | is_decoder, 31 | bias, 32 | is_causal, 33 | config 34 | ) 35 | self.embed_dim = embed_dim 36 | self.num_heads = num_heads 37 | self.dropout = dropout 38 | self.head_dim = embed_dim // num_heads 39 | self.config = config 40 | 41 | if (self.head_dim * num_heads) != self.embed_dim: 42 | raise ValueError( 43 | f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" 44 | f" and `num_heads`: {num_heads})." 
45 | ) 46 | self.scaling = self.head_dim**-0.5 47 | self.is_decoder = is_decoder 48 | self.is_causal = is_causal 49 | 50 | self.k_proj = nn.Linear(input_dim, embed_dim, bias=bias) 51 | self.v_proj = nn.Linear(input_dim, embed_dim, bias=bias) 52 | self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 53 | self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) 54 | 55 | 56 | 57 | class AVTransformerDecoderLayer(Speech2TextDecoderLayer): 58 | def __init__(self, config: Speech2TextConfig): 59 | super().__init__(config) 60 | self.embed_dim = config.decoder_hidden_size 61 | 62 | self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation]( 63 | input_dim=self.embed_dim, 64 | embed_dim=self.embed_dim, 65 | num_heads=config.decoder_attention_heads, 66 | dropout=config.attention_dropout, 67 | is_decoder=True, 68 | is_causal=True, 69 | config=config, 70 | ) 71 | self.dropout = config.dropout 72 | self.activation_fn = ACT2FN[config.activation_function] 73 | self.activation_dropout = config.activation_dropout 74 | 75 | self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) 76 | self.encoder_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation]( 77 | config.encoder_hidden_size, 78 | self.embed_dim, 79 | config.decoder_attention_heads, 80 | dropout=config.attention_dropout, 81 | is_decoder=True, 82 | config=config, 83 | ) 84 | self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) 85 | self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) 86 | self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) 87 | self.final_layer_norm = nn.LayerNorm(self.embed_dim) 88 | 89 | SPEECH_TO_TEXT_ATTENTION_CLASSES = {"eager": AVTransformerAttention} 90 | 91 | class AVTransformerDecoder(Speech2TextDecoder): 92 | def __init__(self, config: AV2TextConfig): 93 | super().__init__(config) 94 | self.layers = nn.ModuleList([AVTransformerDecoderLayer(config) for _ in range(config.decoder_layers)]) -------------------------------------------------------------------------------- /docs/submission.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Submission 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 6 6 | --- 7 | 8 | The submission of the systems is open until **TBU** and should be done through [Google Form - TBU](#). We allow each team to submit up to **three systems** for the challenge. For the submission, make sure to have the following ready: 9 | 10 | - Technical description paper 11 | - System outputs for development and evaluation subset 12 | 13 | ## Technical description paper 14 | 15 | For the technical description, follow the instructions for CHiME-9 challenge papers at the [workshop page](TBU). The papers should be 4 pages long with one additional page for references. Please describe all of your submitted systems and the results on development subset. Please submit your abstract before your results and note the CMT paper ID assigned to your abstract as you will need to include it in your Google form submission. 16 | 17 | ## System outputs 18 | 19 | Participants should submit a zip file containing the output files for each submitted system. The zip file should contain the following directory structure: 20 | 21 | ├── name_of_system_1 22 | │ ├── dev 23 | │ │ ├── session_id_1 24 | │ │ │ ├── speaker_to_cluster.json 25 | │ │ │ ├── spk_0.vtt 26 | │ │ │ ├── spk_1.vtt 27 | │ │ │ └── ... 
28 | │ │ ├── session_id_2 29 | │ │ │ ├── speaker_to_cluster.json 30 | │ │ │ ├── spk_0.vtt 31 | │ │ │ ├── spk_1.vtt 32 | │ │ │ └── ... 33 | │ │ └── ... 34 | │ └── eval 35 | │ ├── session_id_1 36 | │ │ ├── speaker_to_cluster.json 37 | │ │ ├── spk_0.vtt 38 | │ │ ├── spk_1.vtt 39 | │ │ └── ... 40 | │ ├── session_id_2 41 | │ └── ... 42 | ... 43 | └── name_of_system_N 44 | ├── dev 45 | └── eval 46 | 47 | - Feel free to choose any naming of the systems, but please make sure that they are consistent between all submitted archives. 48 | 49 | Each session directory contains: 50 | 51 | - `speaker_to_cluster.json`: Contains the conversation clustering assignments for all speakers in that session, following the same format as in the dataset labels 52 | - `spk_0.vtt`, `spk_1.vtt`, etc.: WebVTT files containing time-aligned transcriptions for each target speaker, following the same format as the dataset labels 53 | 54 | The file formats for system outputs should follow the same structure and format as described in the [Detailed description of data structure and formats](./data.md#detailed-desciption-of-data-structure-and-formats) section of the data documentation. 55 | 56 | ## Important Notes 57 | 58 | - **Evaluation Metrics**: The primary ranking metric is the **Joint ASR-Clustering Error Rate**, which equally weights transcription accuracy (WER) and clustering accuracy (per-speaker clustering F1) 59 | - **Clustering Requirements**: Each speaker must be assigned to exactly one conversation cluster per session. Cluster IDs can be any integer values but must be consistent within each session 60 | - **Text Normalization**: The evaluation script will automatically normalize text and remove disfluencies before computing WER 61 | - **Data Usage Compliance**: Systems must comply with the [challenge rules](./rules.md). Only approved external datasets and pre-trained models may be used 62 | - **Processing Independence**: Each evaluation recording must be processed independently. The development set cannot be used for training or parameter updates 63 | 64 | If you are unsure about how to provide any of the above files, please contact us at the [Slack](https://join.slack.com/t/chimechallenge/shared_invite/zt-37h0cfpeb-qg5jwCgqRWCKc_3mLWVsYA) channel or at [mcorecchallenge@gmail.com](mailto:mcorecchallenge@gmail.com). 65 | -------------------------------------------------------------------------------- /src/tokenizer/spm/spm_encode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 
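# A minimal invocation sketch (model and file paths are illustrative):
#   python spm_encode.py --model unigram5000.model --output_format piece \
#       --inputs train.txt --outputs train.tok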
4 | # 5 | # This source code is licensed under the license found in 6 | # https://github.com/pytorch/fairseq/blob/master/LICENSE 7 | 8 | 9 | import argparse 10 | import contextlib 11 | import sys 12 | 13 | import sentencepiece as spm 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "--model", required=True, help="sentencepiece model to use for encoding" 20 | ) 21 | parser.add_argument( 22 | "--inputs", nargs="+", default=["-"], help="input files to filter/encode" 23 | ) 24 | parser.add_argument( 25 | "--outputs", nargs="+", default=["-"], help="path to save encoded outputs" 26 | ) 27 | parser.add_argument("--output_format", choices=["piece", "id"], default="piece") 28 | parser.add_argument( 29 | "--min-len", 30 | type=int, 31 | metavar="N", 32 | help="filter sentence pairs with fewer than N tokens", 33 | ) 34 | parser.add_argument( 35 | "--max-len", 36 | type=int, 37 | metavar="N", 38 | help="filter sentence pairs with more than N tokens", 39 | ) 40 | args = parser.parse_args() 41 | 42 | assert len(args.inputs) == len( 43 | args.outputs 44 | ), "number of input and output paths should match" 45 | 46 | sp = spm.SentencePieceProcessor() 47 | sp.Load(args.model) 48 | 49 | if args.output_format == "piece": 50 | 51 | def encode(l): 52 | return sp.EncodeAsPieces(l) 53 | 54 | elif args.output_format == "id": 55 | 56 | def encode(l): 57 | return list(map(str, sp.EncodeAsIds(l))) 58 | 59 | else: 60 | raise NotImplementedError 61 | 62 | if args.min_len is not None or args.max_len is not None: 63 | 64 | def valid(line): 65 | return (args.min_len is None or len(line) >= args.min_len) and ( 66 | args.max_len is None or len(line) <= args.max_len 67 | ) 68 | 69 | else: 70 | 71 | def valid(lines): 72 | return True 73 | 74 | with contextlib.ExitStack() as stack: 75 | inputs = [ 76 | stack.enter_context(open(input, "r", encoding="utf-8")) 77 | if input != "-" 78 | else sys.stdin 79 | for input in args.inputs 80 | ] 81 | outputs = [ 82 | stack.enter_context(open(output, "w", encoding="utf-8")) 83 | if output != "-" 84 | else sys.stdout 85 | for output in args.outputs 86 | ] 87 | 88 | stats = { 89 | "num_empty": 0, 90 | "num_filtered": 0, 91 | } 92 | 93 | def encode_line(line): 94 | line = line.strip() 95 | if len(line) > 0: 96 | line = encode(line) 97 | if valid(line): 98 | return line 99 | else: 100 | stats["num_filtered"] += 1 101 | else: 102 | stats["num_empty"] += 1 103 | return None 104 | 105 | for i, lines in enumerate(zip(*inputs), start=1): 106 | enc_lines = list(map(encode_line, lines)) 107 | if not any(enc_line is None for enc_line in enc_lines): 108 | for enc_line, output_h in zip(enc_lines, outputs): 109 | print(" ".join(enc_line), file=output_h) 110 | if i % 10000 == 0: 111 | print("processed {} lines".format(i), file=sys.stderr) 112 | 113 | print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) 114 | print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /src/cluster/eval.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import List, Tuple, Dict 3 | from sklearn.metrics import adjusted_rand_score 4 | 5 | def pairwise_f1_score(true_labels: List[int], pred_labels: List[int]) -> float: 6 | """ 7 | Compute the pairwise F1 score for clustering evaluation. 
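    Every unordered pair of speakers is counted as a true positive when both the
    ground truth and the prediction place the two speakers in the same cluster, a
    false positive when only the prediction does, and a false negative when only the
    ground truth does; F1 is then computed over these pairwise decisions. For example,
    true_labels=[0, 0, 1, 1] and pred_labels=[0, 0, 2, 2] score 1.0, because the same
    pairs are grouped together even though the cluster IDs differ.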
8 | 9 | Args: 10 | true_labels (List[int]): Ground truth cluster labels. 11 | pred_labels (List[int]): Predicted cluster labels. 12 | 13 | Returns: 14 | float: Pairwise F1 score. 15 | """ 16 | # Generate all unique unordered pairs of indices 17 | pairs = list(itertools.combinations(range(len(true_labels)), 2)) 18 | 19 | # Initialize counts 20 | tp = fp = fn = 0 21 | 22 | for i, j in pairs: 23 | # True same-cluster? 24 | true_same = (true_labels[i] == true_labels[j]) 25 | # Predicted same-cluster? 26 | pred_same = (pred_labels[i] == pred_labels[j]) 27 | 28 | if pred_same and true_same: 29 | tp += 1 30 | elif pred_same and not true_same: 31 | fp += 1 32 | elif not pred_same and true_same: 33 | fn += 1 34 | # True negatives (not same in both) are not used in F1 35 | # print(tp, fp, fn) 36 | # Handle edge cases 37 | if tp == 0: 38 | return 0.0 39 | 40 | precision = tp / (tp + fp) 41 | recall = tp / (tp + fn) 42 | f1 = 2 * precision * recall / (precision + recall) 43 | 44 | return f1 45 | 46 | def pairwise_f1_score_per_speaker(true_labels: List[int], pred_labels: List[int]) -> Dict[int, float]: 47 | """ 48 | Compute the pairwise F1 score for each speaker (one-vs-rest style) in clustering evaluation. 49 | 50 | Args: 51 | true_labels (List[int]): Ground truth cluster labels. 52 | pred_labels (List[int]): Predicted cluster labels. 53 | 54 | Returns: 55 | Dict[int, float]: Mapping from speaker index to their pairwise F1 score. 56 | """ 57 | n = len(true_labels) 58 | scores = {} 59 | 60 | for i in range(n): 61 | tp = fp = fn = 0 62 | for j in range(n): 63 | if i == j: 64 | continue 65 | 66 | # True and predicted same-cluster relationships between i and j 67 | true_same = (true_labels[i] == true_labels[j]) 68 | pred_same = (pred_labels[i] == pred_labels[j]) 69 | 70 | if pred_same and true_same: 71 | tp += 1 72 | elif pred_same and not true_same: 73 | fp += 1 74 | elif not pred_same and true_same: 75 | fn += 1 76 | 77 | # Compute F1 for this speaker 78 | if tp == 0: 79 | f1 = 0.0 80 | else: 81 | precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0 82 | recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0 83 | f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 84 | 85 | scores[i] = f1 86 | 87 | return scores 88 | 89 | if __name__ == "__main__": 90 | # Example usage 91 | examples: List[Tuple[List[int], List[int]]] = [ 92 | ([0, 0, 1, 1], [0, 0, 2, 2]), 93 | ([0, 0, 1, 1], [1, 1, 0, 0]), 94 | ([0, 0, 1, 2], [0, 0, 1, 1]), 95 | ([0, 0, 0, 0], [0, 1, 2, 3]), 96 | ([0, 0, 1, 1], [0, 1, 0, 1]), 97 | ([1, 1, 0, 0], [0, 0, 0, 0]), 98 | ([0, 0, 0, 0], [1, 1, 0, 0]), 99 | ([0, 0, 0, 0, 1, 2], [1, 1, 0, 0, 2, 2]), 100 | ([0, 0, 1, 1, 2, 2], [0, 0, 0, 1, 1, 1]) 101 | ] 102 | 103 | # Compute and display results 104 | results = [(true, pred, pairwise_f1_score(true, pred), adjusted_rand_score(true, pred)) for true, pred in examples] 105 | for true, pred, f1, ari in results: 106 | print(f"True: {true}, Pred: {pred}, F1: {f1}, ARI: {ari}") 107 | 108 | # Compute per-speaker F1 scores 109 | for true, pred in examples: 110 | per_speaker_f1 = pairwise_f1_score_per_speaker(true, pred) 111 | print(f"True: {true}, Pred: {pred}, Per-Speaker F1: {per_speaker_f1}") -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/head_pose_estimator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import math 4 | import numpy as np 5 | from typing import Optional, Tuple 6 | 7 | 8 | 
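# A minimal usage sketch (the landmark source is illustrative): given 68-point facial
# landmarks and the frame size, the estimator returns (pitch, yaw, roll) in degrees.
#   estimator = HeadPoseEstimator()
#   pitch, yaw, roll = estimator(landmarks, image_width=frame_w, image_height=frame_h)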
__all__ = ['HeadPoseEstimator'] 9 | 10 | 11 | class HeadPoseEstimator(object): 12 | def __init__(self, mean_shape_path: str = os.path.join(os.path.dirname(__file__), 13 | 'data', 'bfm_lms.npy')) -> None: 14 | # Load the 68-point mean shape derived from BFM 15 | mean_shape = np.load(mean_shape_path) 16 | 17 | # Calculate the 5-points mean shape 18 | left_eye = mean_shape[[37, 38, 40, 41]].mean(axis=0) 19 | right_eye = mean_shape[[43, 44, 46, 47]].mean(axis=0) 20 | self._mean_shape_5pts = np.vstack((left_eye, right_eye, mean_shape[[30, 48, 54]])) 21 | 22 | # Flip the y coordinates of the mean shape to match that of the image coordinate system 23 | self._mean_shape_5pts[:, 1] = -self._mean_shape_5pts[:, 1] 24 | 25 | def __call__(self, landmarks: np.ndarray, image_width: int = 0, image_height: int = 0, 26 | camera_matrix: Optional[np.ndarray] = None, dist_coeffs: Optional[np.ndarray] = None, 27 | output_preference: int = 0) -> Tuple[float, float, float]: 28 | # Form the camera matrix 29 | if camera_matrix is None: 30 | if image_width <= 0 or image_height <= 0: 31 | raise ValueError( 32 | 'image_width and image_height must be specified when camera_matrix is not given directly') 33 | else: 34 | camera_matrix = np.array([[image_width + image_height, 0, image_width / 2.0], 35 | [0, image_width + image_height, image_height / 2.0], 36 | [0, 0, 1]], dtype=float) 37 | 38 | # Prepare the landmarks 39 | if landmarks.shape[0] == 68: 40 | landmarks = landmarks[17:] 41 | if landmarks.shape[0] in [49, 51]: 42 | left_eye = landmarks[[20, 21, 23, 24]].mean(axis=0) 43 | right_eye = landmarks[[26, 27, 29, 30]].mean(axis=0) 44 | landmarks = np.vstack((left_eye, right_eye, landmarks[[13, 31, 37]])) 45 | 46 | # Use EPnP to estimate pitch, yaw, and roll 47 | _, rvec, _ = cv2.solvePnP(self._mean_shape_5pts, np.expand_dims(landmarks, axis=1), 48 | camera_matrix, dist_coeffs, flags=cv2.SOLVEPNP_EPNP) 49 | rot_mat, _ = cv2.Rodrigues(rvec) 50 | if 1.0 + rot_mat[2, 0] < 1e-9: 51 | pitch = 0.0 52 | yaw = 90.0 53 | roll = -math.atan2(rot_mat[0, 1], rot_mat[0, 2]) / math.pi * 180.0 54 | elif 1.0 - rot_mat[2, 0] < 1e-9: 55 | pitch = 0.0 56 | yaw = -90.0 57 | roll = math.atan2(-rot_mat[0, 1], -rot_mat[0, 2]) / math.pi * 180.0 58 | else: 59 | pitch = math.atan2(rot_mat[2, 1], rot_mat[2, 2]) / math.pi * 180.0 60 | yaw = -math.asin(rot_mat[2, 0]) / math.pi * 180.0 61 | roll = math.atan2(rot_mat[1, 0], rot_mat[0, 0]) / math.pi * 180.0 62 | 63 | # Respond to output_preference: 64 | # output_preference == 1: limit pitch to the range of -90.0 ~ 90.0 65 | # output_preference == 2: limit yaw to the range of -90.0 ~ 90.0 (already satisfied) 66 | # output_preference == 3: limit roll to the range of -90.0 ~ 90.0 67 | # otherwise: minimise total rotation, min(abs(pitch) + abs(yaw) + abs(roll)) 68 | if output_preference != 2: 69 | alt_pitch = pitch - 180.0 if pitch > 0.0 else pitch + 180.0 70 | alt_yaw = -180.0 - yaw if yaw < 0.0 else 180.0 - yaw 71 | alt_roll = roll - 180.0 if roll > 0.0 else roll + 180.0 72 | if (output_preference == 1 and -90.0 < alt_pitch < 90.0 or 73 | output_preference == 3 and -90.0 < alt_roll < 90.0 or 74 | output_preference not in (1, 2, 3) and 75 | abs(alt_pitch) + abs(alt_yaw) + abs(alt_roll) < abs(pitch) + abs(yaw) + abs(roll)): 76 | pitch, yaw, roll = alt_pitch, alt_yaw, alt_roll 77 | 78 | return -pitch, yaw, roll 79 | -------------------------------------------------------------------------------- /src/ibug/face_detection/utils/simple_face_tracker.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional 3 | from scipy.optimize import linear_sum_assignment 4 | 5 | 6 | __all__ = ['SimpleFaceTracker'] 7 | 8 | 9 | class SimpleFaceTracker(object): 10 | def __init__(self, iou_threshold: float = 0.4, minimum_face_size: float = 0.0) -> None: 11 | self._iou_threshold = iou_threshold 12 | self._minimum_face_size = minimum_face_size 13 | self._tracklets = [] 14 | self._tracklet_counter = 0 15 | 16 | @property 17 | def iou_threshold(self) -> float: 18 | return self._iou_threshold 19 | 20 | @iou_threshold.setter 21 | def iou_threshold(self, threshold: float) -> None: 22 | self._iou_threshold = threshold 23 | 24 | @property 25 | def minimum_face_size(self) -> float: 26 | return self._minimum_face_size 27 | 28 | @minimum_face_size.setter 29 | def minimum_face_size(self, face_size: float) -> None: 30 | self._minimum_face_size = face_size 31 | 32 | def __call__(self, face_boxes: np.ndarray) -> List[Optional[int]]: 33 | if face_boxes.size <= 0: 34 | self._tracklets = [] 35 | return [] 36 | 37 | # Calculate area of the faces 38 | face_areas = np.abs((face_boxes[:, 2] - face_boxes[:, 0]) * (face_boxes[:, 3] - face_boxes[:, 1])) 39 | 40 | # Prepare tracklets 41 | for tracklet in self._tracklets: 42 | tracklet['tracked'] = False 43 | 44 | # Calculate the distance matrix based on IOU 45 | iou_distance_threshold = np.clip(1.0 - self._iou_threshold, 0.0, 1.0) 46 | min_face_area = max(self._minimum_face_size ** 2, np.finfo(float).eps) 47 | distances = np.full(shape=(face_boxes.shape[0], len(self._tracklets)), 48 | fill_value=2.0 * min(face_boxes.shape[0], len(self._tracklets)), dtype=float) 49 | for row, face_box in enumerate(face_boxes): 50 | if face_areas[row] >= min_face_area: 51 | for col, tracklet in enumerate(self._tracklets): 52 | x_left = max(min(face_box[0], face_box[2]), min(tracklet['bbox'][0], tracklet['bbox'][2])) 53 | y_top = max(min(face_box[1], face_box[3]), min(tracklet['bbox'][1], tracklet['bbox'][3])) 54 | x_right = min(max(face_box[2], face_box[0]), max(tracklet['bbox'][2], tracklet['bbox'][0])) 55 | y_bottom = min(max(face_box[3], face_box[1]), max(tracklet['bbox'][3], tracklet['bbox'][1])) 56 | if x_right <= x_left or y_bottom <= y_top: 57 | distance = 1.0 58 | else: 59 | intersection_area = (x_right - x_left) * (y_bottom - y_top) 60 | distance = 1.0 - intersection_area / float(face_areas[row] + tracklet['area'] - 61 | intersection_area) 62 | if distance <= iou_distance_threshold: 63 | distances[row, col] = distance 64 | 65 | # ID assignment 66 | tracked_ids = [None] * face_boxes.shape[0] 67 | for row, col in zip(*linear_sum_assignment(distances)): 68 | if distances[row, col] <= iou_distance_threshold: 69 | tracked_ids[row] = self._tracklets[col]['id'] 70 | self._tracklets[col]['bbox'] = face_boxes[row, :4].copy() 71 | self._tracklets[col]['area'] = face_areas[row] 72 | self._tracklets[col]['tracked'] = True 73 | 74 | # Remove expired tracklets 75 | self._tracklets = [x for x in self._tracklets if x['tracked']] 76 | 77 | # Register new faces 78 | for idx, face_box in enumerate(face_boxes): 79 | if face_areas[idx] >= min_face_area and tracked_ids[idx] is None: 80 | self._tracklet_counter += 1 81 | self._tracklets.append({'bbox': face_box[:4].copy(), 'area': face_areas[idx], 82 | 'id': self._tracklet_counter, 'tracked': True}) 83 | tracked_ids[idx] = self._tracklets[-1]['id'] 84 | 85 | return tracked_ids 86 | 87 | def reset(self, reset_tracklet_counter: 
bool = True) -> None: 88 | self._tracklets = [] 89 | if reset_tracklet_counter: 90 | self._tracklet_counter = 0 91 | -------------------------------------------------------------------------------- /src/auto_avsr/configuration_avsr.py: -------------------------------------------------------------------------------- 1 | from src.nets.backend.e2e_asr_conformer_av import E2E 2 | from transformers.configuration_utils import PretrainedConfig 3 | 4 | class AutoAVSRConfig(PretrainedConfig): 5 | model_type = "auto_avsr" 6 | 7 | def __init__( 8 | self, 9 | odim=5049, 10 | adim=768, 11 | aheads=12, 12 | eunits=3072, 13 | elayers=12, 14 | transformer_input_layer="conv3d", 15 | dropout_rate=0.1, 16 | transformer_attn_dropout_rate=0.1, 17 | transformer_encoder_attn_layer_type="rel_mha", 18 | macaron_style=True, 19 | use_cnn_module=True, 20 | cnn_module_kernel=31, 21 | zero_triu=False, 22 | a_upsample_ratio=1, 23 | relu_type="swish", 24 | ddim=768, 25 | dheads=12, 26 | dunits=3072, 27 | dlayers=6, 28 | lsm_weight=0.1, 29 | transformer_length_normalized_loss=False, 30 | mtlalpha=0.1, 31 | ctc_type="builtin", 32 | rel_pos_type="latest", 33 | aux_adim=768, 34 | aux_aheads=12, 35 | aux_eunits=3072, 36 | aux_elayers=12, 37 | aux_transformer_input_layer="conv1d", 38 | aux_dropout_rate=0.1, 39 | aux_transformer_attn_dropout_rate=0.1, 40 | aux_transformer_encoder_attn_layer_type="rel_mha", 41 | aux_macaron_style=True, 42 | aux_use_cnn_module=True, 43 | aux_cnn_module_kernel=31, 44 | aux_zero_triu=False, 45 | aux_a_upsample_ratio=1, 46 | aux_relu_type="swish", 47 | aux_dunits=3072, 48 | aux_dlayers=6, 49 | aux_lsm_weight=0.1, 50 | aux_transformer_length_normalized_loss=False, 51 | aux_mtlalpha=0.1, 52 | aux_ctc_type="builtin", 53 | aux_rel_pos_type="latest", 54 | fusion_hdim=8192, 55 | fusion_norm="batchnorm", 56 | **kwargs, 57 | ): 58 | super().__init__(**kwargs) 59 | self.odim = odim 60 | self.adim = adim 61 | self.aheads = aheads 62 | self.eunits = eunits 63 | self.elayers = elayers 64 | self.transformer_input_layer = transformer_input_layer 65 | self.dropout_rate = dropout_rate 66 | self.transformer_attn_dropout_rate = transformer_attn_dropout_rate 67 | self.transformer_encoder_attn_layer_type = transformer_encoder_attn_layer_type 68 | self.macaron_style = macaron_style 69 | self.use_cnn_module = use_cnn_module 70 | self.cnn_module_kernel = cnn_module_kernel 71 | self.zero_triu = zero_triu 72 | self.a_upsample_ratio = a_upsample_ratio 73 | self.relu_type = relu_type 74 | self.ddim = ddim 75 | self.dheads = dheads 76 | self.dunits = dunits 77 | self.dlayers = dlayers 78 | self.lsm_weight = lsm_weight 79 | self.transformer_length_normalized_loss = transformer_length_normalized_loss 80 | self.mtlalpha = mtlalpha 81 | self.ctc_type = ctc_type 82 | self.rel_pos_type = rel_pos_type 83 | self.aux_adim = aux_adim 84 | self.aux_aheads = aux_aheads 85 | self.aux_eunits = aux_eunits 86 | self.aux_elayers = aux_elayers 87 | self.aux_transformer_input_layer = aux_transformer_input_layer 88 | self.aux_dropout_rate = aux_dropout_rate 89 | self.aux_transformer_attn_dropout_rate = aux_transformer_attn_dropout_rate 90 | self.aux_transformer_encoder_attn_layer_type = aux_transformer_encoder_attn_layer_type 91 | self.aux_macaron_style = aux_macaron_style 92 | self.aux_use_cnn_module = aux_use_cnn_module 93 | self.aux_cnn_module_kernel = aux_cnn_module_kernel 94 | self.aux_zero_triu = aux_zero_triu 95 | self.aux_a_upsample_ratio = aux_a_upsample_ratio 96 | self.aux_relu_type = aux_relu_type 97 | self.aux_dunits = aux_dunits 98 | 
self.aux_dlayers = aux_dlayers 99 | self.aux_lsm_weight = aux_lsm_weight 100 | self.aux_transformer_length_normalized_loss = aux_transformer_length_normalized_loss 101 | self.aux_mtlalpha = aux_mtlalpha 102 | self.aux_ctc_type = aux_ctc_type 103 | self.aux_rel_pos_type = aux_rel_pos_type 104 | self.fusion_hdim = fusion_hdim 105 | self.fusion_norm = fusion_norm 106 | 107 | -------------------------------------------------------------------------------- /src/nets/backend/e2e_asr_conformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Transformer speech recognition model (pytorch).""" 5 | 6 | import logging 7 | import numpy 8 | import torch 9 | 10 | from src.nets.backend.ctc import CTC 11 | from src.nets.backend.nets_utils import ( 12 | make_non_pad_mask, 13 | th_accuracy, 14 | ) 15 | from src.nets.backend.transformer.add_sos_eos import add_sos_eos 16 | from src.nets.backend.transformer.decoder import Decoder 17 | from src.nets.backend.transformer.encoder import Encoder 18 | from src.nets.backend.transformer.label_smoothing_loss import LabelSmoothingLoss 19 | from src.nets.backend.transformer.mask import target_mask 20 | 21 | 22 | class E2E(torch.nn.Module): 23 | def __init__(self, args, ignore_id=-1): 24 | torch.nn.Module.__init__(self) 25 | 26 | self.encoder = Encoder( 27 | attention_dim=args.adim, 28 | attention_heads=args.aheads, 29 | linear_units=args.eunits, 30 | num_blocks=args.elayers, 31 | input_layer=args.transformer_input_layer, 32 | dropout_rate=args.dropout_rate, 33 | positional_dropout_rate=args.dropout_rate, 34 | attention_dropout_rate=args.transformer_attn_dropout_rate, 35 | encoder_attn_layer_type=args.transformer_encoder_attn_layer_type, 36 | macaron_style=args.macaron_style, 37 | use_cnn_module=args.use_cnn_module, 38 | cnn_module_kernel=args.cnn_module_kernel, 39 | zero_triu=getattr(args, "zero_triu", False), 40 | a_upsample_ratio=args.a_upsample_ratio, 41 | relu_type=getattr(args, "relu_type", "swish"), 42 | ) 43 | 44 | self.transformer_input_layer = args.transformer_input_layer 45 | self.a_upsample_ratio = args.a_upsample_ratio 46 | 47 | self.proj_decoder = None 48 | if args.adim != args.ddim: 49 | self.proj_decoder = torch.nn.Linear(args.adim, args.ddim) 50 | 51 | if args.mtlalpha < 1: 52 | self.decoder = Decoder( 53 | odim=args.odim, 54 | attention_dim=args.ddim, 55 | attention_heads=args.dheads, 56 | linear_units=args.dunits, 57 | num_blocks=args.dlayers, 58 | dropout_rate=args.dropout_rate, 59 | positional_dropout_rate=args.dropout_rate, 60 | self_attention_dropout_rate=args.transformer_attn_dropout_rate, 61 | src_attention_dropout_rate=args.transformer_attn_dropout_rate, 62 | ) 63 | else: 64 | self.decoder = None 65 | self.blank = 0 66 | self.sos = args.odim - 1 67 | self.eos = args.odim - 1 68 | self.odim = args.odim 69 | self.ignore_id = ignore_id 70 | 71 | # self.lsm_weight = a 72 | self.criterion = LabelSmoothingLoss( 73 | self.odim, 74 | self.ignore_id, 75 | args.lsm_weight, 76 | args.transformer_length_normalized_loss, 77 | ) 78 | 79 | self.adim = args.adim 80 | self.mtlalpha = args.mtlalpha 81 | if args.mtlalpha > 0.0: 82 | self.ctc = CTC( 83 | args.odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True 84 | ) 85 | else: 86 | self.ctc = None 87 | 88 | def forward(self, x, lengths, label): 89 | if self.transformer_input_layer == "conv1d": 90 | lengths = torch.div(lengths, 640, 
rounding_mode="trunc") 91 | padding_mask = make_non_pad_mask(lengths).to(x.device).unsqueeze(-2) 92 | 93 | x, _ = self.encoder(x, padding_mask) 94 | 95 | # ctc loss 96 | loss_ctc, ys_hat = self.ctc(x, lengths, label) 97 | 98 | if self.proj_decoder: 99 | x = self.proj_decoder(x) 100 | 101 | # decoder loss 102 | ys_in_pad, ys_out_pad = add_sos_eos(label, self.sos, self.eos, self.ignore_id) 103 | ys_mask = target_mask(ys_in_pad, self.ignore_id) 104 | pred_pad, _ = self.decoder(ys_in_pad, ys_mask, x, padding_mask) 105 | loss_att = self.criterion(pred_pad, ys_out_pad) 106 | loss = self.mtlalpha * loss_ctc + (1 - self.mtlalpha) * loss_att 107 | 108 | acc = th_accuracy( 109 | pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id 110 | ) 111 | 112 | return loss, loss_ctc, loss_att, acc 113 | -------------------------------------------------------------------------------- /src/talking_detector/segmentation.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | CENTRAL_ASD_CHUNKING_PARAMETERS = { 5 | "onset": 1.0, # start threshold 6 | "offset": 0.8, # end threshold 7 | "min_duration_on": 1.0, # drop 8 | "min_duration_off": 0.5, # fill 9 | "max_chunk_size": 10, 10 | "min_chunk_size": 1 11 | } 12 | 13 | EGO_ASD_CHUNKING_PARAMETERS = { 14 | "onset": 2.4, # start threshold 15 | "offset": 1.6, # end threshold 16 | "min_duration_on": 1.0, # drop 17 | "min_duration_off": 0.5, # fill 18 | "max_chunk_size": 10, 19 | "min_chunk_size": 1 20 | } 21 | 22 | 23 | def segment_by_asd(asd, parameters={}): 24 | onset_threshold = parameters.get("onset", CENTRAL_ASD_CHUNKING_PARAMETERS["onset"]) 25 | offset_threshold = parameters.get("offset", CENTRAL_ASD_CHUNKING_PARAMETERS["offset"]) 26 | 27 | # Convert frame numbers to integers and sort them 28 | frames = sorted([int(f) for f in asd.keys()]) 29 | if not frames: 30 | return [] 31 | 32 | # Find the minimum frame number to normalize frame indices 33 | min_frame = min(frames) 34 | 35 | # Convert duration parameters from seconds to frames (assuming 25 fps) 36 | min_duration_on_frames = int(parameters.get("min_duration_on", CENTRAL_ASD_CHUNKING_PARAMETERS["min_duration_on"]) * 25) 37 | min_duration_off_frames = int(parameters.get("min_duration_off", CENTRAL_ASD_CHUNKING_PARAMETERS["min_duration_off"]) * 25) 38 | max_chunk_frames = int(parameters.get("max_chunk_size", CENTRAL_ASD_CHUNKING_PARAMETERS["max_chunk_size"]) * 25) 39 | min_chunk_frames = int(parameters.get("min_chunk_size", CENTRAL_ASD_CHUNKING_PARAMETERS["min_chunk_size"]) * 25) 40 | 41 | # First pass: Find speech regions using hysteresis thresholding 42 | speech_regions = [] 43 | current_region = None 44 | is_active = False 45 | 46 | for frame in frames: 47 | score = asd.get(str(frame), -1) 48 | normalized_frame = frame - min_frame 49 | 50 | if not is_active: 51 | # Currently inactive, check for onset 52 | if score > onset_threshold: 53 | is_active = True 54 | current_region = [normalized_frame] 55 | else: 56 | # Currently active, check for offset 57 | if score < offset_threshold: 58 | is_active = False 59 | if current_region is not None: 60 | speech_regions.append(current_region) 61 | current_region = None 62 | else: 63 | current_region.append(normalized_frame) 64 | 65 | # Handle case where speech continues until the end 66 | if current_region is not None: 67 | speech_regions.append(current_region) 68 | 69 | # Second pass: Merge regions separated by short non-speech gaps 70 | merged_regions = [] 71 | if speech_regions: 72 | current_region 
= speech_regions[0] 73 | 74 | for next_region in speech_regions[1:]: 75 | gap = next_region[0] - current_region[-1] - 1 76 | if gap <= min_duration_off_frames: 77 | # Merge regions 78 | current_region.extend(next_region) 79 | else: 80 | merged_regions.append(current_region) 81 | current_region = next_region 82 | merged_regions.append(current_region) 83 | 84 | # Third pass: Remove short speech regions and split long ones 85 | final_segments = [] 86 | for region in merged_regions: 87 | region_length = len(region) 88 | 89 | # Skip regions shorter than minimum duration 90 | if region_length < min_duration_on_frames: 91 | continue 92 | 93 | # Split long regions 94 | if region_length > max_chunk_frames: 95 | num_chunks = math.ceil(region_length / max_chunk_frames) 96 | chunk_size = math.ceil(region_length / num_chunks) 97 | 98 | for i in range(0, region_length, chunk_size): 99 | sub_segment = region[i:i + chunk_size] 100 | if len(sub_segment) >= min_chunk_frames: 101 | final_segments.append(sub_segment) 102 | else: 103 | final_segments.append(region) 104 | 105 | # Convert frame indices back to original frame indices 106 | final_segments = [ 107 | [frame + min_frame for frame in segment] 108 | for segment in final_segments 109 | ] 110 | 111 | return final_segments 112 | -------------------------------------------------------------------------------- /src/avhubert_muavic/av2text_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
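# A minimal configuration sketch (values are illustrative): AV2TextConfig extends the
# Speech2Text-style settings with separate encoder/decoder hidden sizes and is the
# config type expected by AVTransformerDecoder in av_transformer_decoder.py.
#   config = AV2TextConfig(vocab_size=1000, encoder_layers=12, decoder_layers=6,
#                          encoder_hidden_size=768, decoder_hidden_size=768)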
15 | """Speech2Text model configuration""" 16 | 17 | from transformers.configuration_utils import PretrainedConfig 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | class AV2TextConfig(PretrainedConfig): 25 | model_type = "speech_to_text" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} 28 | 29 | def __init__( 30 | self, 31 | vocab_size=10000, 32 | encoder_layers=12, 33 | encoder_ffn_dim=2048, 34 | encoder_attention_heads=4, 35 | decoder_layers=6, 36 | decoder_ffn_dim=2048, 37 | decoder_attention_heads=4, 38 | encoder_layerdrop=0.0, 39 | decoder_layerdrop=0.0, 40 | use_cache=True, 41 | is_encoder_decoder=True, 42 | activation_function="relu", 43 | d_model=256, 44 | encoder_hidden_size=256, 45 | decoder_hidden_size=256, 46 | dropout=0.1, 47 | attention_dropout=0.0, 48 | activation_dropout=0.0, 49 | init_std=0.02, 50 | decoder_start_token_id=2, 51 | scale_embedding=True, 52 | pad_token_id=1, 53 | bos_token_id=0, 54 | eos_token_id=2, 55 | max_source_positions=6000, 56 | max_target_positions=1024, 57 | num_conv_layers=2, 58 | conv_kernel_sizes=(5, 5), 59 | conv_channels=1024, 60 | input_feat_per_channel=80, 61 | input_channels=1, 62 | attn_implementation="eager", 63 | **kwargs, 64 | ): 65 | self.vocab_size = vocab_size 66 | self.d_model = d_model 67 | self.encoder_ffn_dim = encoder_ffn_dim 68 | self.encoder_layers = encoder_layers 69 | self.encoder_hidden_size = encoder_hidden_size 70 | self.encoder_attention_heads = encoder_attention_heads 71 | self.decoder_ffn_dim = decoder_ffn_dim 72 | self.decoder_hidden_size = decoder_hidden_size 73 | self.decoder_layers = decoder_layers 74 | self.decoder_attention_heads = decoder_attention_heads 75 | self.dropout = dropout 76 | self.attention_dropout = attention_dropout 77 | self.activation_dropout = activation_dropout 78 | self.activation_function = activation_function 79 | self.init_std = init_std 80 | self.encoder_layerdrop = encoder_layerdrop 81 | self.decoder_layerdrop = decoder_layerdrop 82 | self.use_cache = use_cache 83 | self.num_hidden_layers = encoder_layers 84 | self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True 85 | self.max_source_positions = max_source_positions 86 | self.max_target_positions = max_target_positions 87 | self.num_conv_layers = num_conv_layers 88 | self.conv_kernel_sizes = list(conv_kernel_sizes) 89 | self.conv_channels = conv_channels 90 | self.input_feat_per_channel = input_feat_per_channel 91 | self.input_channels = input_channels 92 | self.attn_implementation = attn_implementation 93 | 94 | if len(self.conv_kernel_sizes) != self.num_conv_layers: 95 | raise ValueError( 96 | "Configuration for convolutional module is incorrect. " 97 | "It is required that `len(config.conv_kernel_sizes)` == `config.num_conv_layers` " 98 | f"but is `len(config.conv_kernel_sizes) = {len(self.conv_kernel_sizes)}`, " 99 | f"`config.num_conv_layers = {self.num_conv_layers}`." 
100 | ) 101 | 102 | super().__init__( 103 | pad_token_id=pad_token_id, 104 | bos_token_id=bos_token_id, 105 | eos_token_id=eos_token_id, 106 | is_encoder_decoder=is_encoder_decoder, 107 | decoder_start_token_id=decoder_start_token_id, 108 | **kwargs, 109 | ) 110 | 111 | self._attn_implementation = "eager" -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/retina_face.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | import torchvision.models._utils as _utils 6 | from .retina_face_net import MobileNetV1, FPN, SSH 7 | 8 | 9 | class ClassHead(nn.Module): 10 | def __init__(self, inchannels=512, num_anchors=3): 11 | super(ClassHead, self).__init__() 12 | self.num_anchors = num_anchors 13 | self.conv1x1 = nn.Conv2d(inchannels, self.num_anchors*2, kernel_size=(1, 1), stride=1, padding=0) 14 | 15 | def forward(self, x): 16 | out = self.conv1x1(x) 17 | out = out.permute(0, 2, 3, 1).contiguous() 18 | 19 | return out.view(out.shape[0], -1, 2) 20 | 21 | 22 | class BboxHead(nn.Module): 23 | def __init__(self, inchannels=512, num_anchors=3): 24 | super(BboxHead, self).__init__() 25 | self.conv1x1 = nn.Conv2d(inchannels, num_anchors*4, kernel_size=(1, 1), stride=1,padding=0) 26 | 27 | def forward(self, x): 28 | out = self.conv1x1(x) 29 | out = out.permute(0, 2, 3, 1).contiguous() 30 | 31 | return out.view(out.shape[0], -1, 4) 32 | 33 | 34 | class LandmarkHead(nn.Module): 35 | def __init__(self, inchannels=512, num_anchors=3): 36 | super(LandmarkHead, self).__init__() 37 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10, kernel_size=(1, 1), stride=1, padding=0) 38 | 39 | def forward(self, x): 40 | out = self.conv1x1(x) 41 | out = out.permute(0, 2, 3, 1).contiguous() 42 | 43 | return out.view(out.shape[0], -1, 10) 44 | 45 | 46 | class RetinaFace(nn.Module): 47 | def __init__(self, cfg=None, phase='train'): 48 | """ 49 | :param cfg: Network related settings. 50 | :param phase: train or test. 
51 | """ 52 | super(RetinaFace, self).__init__() 53 | self.phase = phase 54 | backbone = None 55 | if cfg['name'] == 'mobilenet0.25': 56 | backbone = MobileNetV1() 57 | elif cfg['name'] == 'Resnet50': 58 | backbone = models.resnet50() 59 | 60 | self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) 61 | in_channels_stage2 = cfg['in_channel'] 62 | in_channels_list = [ 63 | in_channels_stage2 * 2, 64 | in_channels_stage2 * 4, 65 | in_channels_stage2 * 8, 66 | ] 67 | out_channels = cfg['out_channel'] 68 | self.fpn = FPN(in_channels_list,out_channels) 69 | self.ssh1 = SSH(out_channels, out_channels) 70 | self.ssh2 = SSH(out_channels, out_channels) 71 | self.ssh3 = SSH(out_channels, out_channels) 72 | 73 | self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) 74 | self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) 75 | self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) 76 | 77 | def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2): 78 | classhead = nn.ModuleList() 79 | for i in range(fpn_num): 80 | classhead.append(ClassHead(inchannels, anchor_num)) 81 | return classhead 82 | 83 | def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2): 84 | bboxhead = nn.ModuleList() 85 | for i in range(fpn_num): 86 | bboxhead.append(BboxHead(inchannels, anchor_num)) 87 | return bboxhead 88 | 89 | def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2): 90 | landmarkhead = nn.ModuleList() 91 | for i in range(fpn_num): 92 | landmarkhead.append(LandmarkHead(inchannels, anchor_num)) 93 | return landmarkhead 94 | 95 | def forward(self, inputs): 96 | out = self.body(inputs) 97 | 98 | # FPN 99 | fpn = self.fpn(out) 100 | 101 | # SSH 102 | feature1 = self.ssh1(fpn[0]) 103 | feature2 = self.ssh2(fpn[1]) 104 | feature3 = self.ssh3(fpn[2]) 105 | features = [feature1, feature2, feature3] 106 | 107 | bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) 108 | classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) 109 | ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) 110 | 111 | if self.phase == 'train': 112 | output = (bbox_regressions, classifications, ldm_regressions) 113 | else: 114 | output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) 115 | return output 116 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/decoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Decoder self-attention layer definition.""" 8 | 9 | import torch 10 | 11 | from src.nets.backend.transformer.layer_norm import LayerNorm 12 | from torch import nn 13 | 14 | 15 | class DecoderLayer(nn.Module): 16 | """Single decoder layer module. 17 | :param int size: input dim 18 | :param src.nets.backend.transformer.attention.MultiHeadedAttention 19 | self_attn: self attention module 20 | :param src.nets.backend.transformer.attention.MultiHeadedAttention 21 | src_attn: source attention module 22 | :param src.nets.backend.transformer.positionwise_feed_forward. 
23 | PositionwiseFeedForward feed_forward: feed forward layer module 24 | :param float dropout_rate: dropout rate 25 | :param bool normalize_before: whether to use layer_norm before the first block 26 | :param bool concat_after: whether to concat attention layer's input and output 27 | if True, additional linear will be applied. 28 | i.e. x -> x + linear(concat(x, att(x))) 29 | if False, no additional linear will be applied. i.e. x -> x + att(x) 30 | """ 31 | 32 | def __init__( 33 | self, 34 | size, 35 | self_attn, 36 | src_attn, 37 | feed_forward, 38 | dropout_rate, 39 | normalize_before=True, 40 | concat_after=False, 41 | ): 42 | """Construct an DecoderLayer object.""" 43 | super(DecoderLayer, self).__init__() 44 | self.size = size 45 | self.self_attn = self_attn 46 | self.src_attn = src_attn 47 | self.feed_forward = feed_forward 48 | self.norm1 = LayerNorm(size) 49 | self.norm2 = LayerNorm(size) 50 | self.norm3 = LayerNorm(size) 51 | self.dropout = nn.Dropout(dropout_rate) 52 | self.normalize_before = normalize_before 53 | self.concat_after = concat_after 54 | if self.concat_after: 55 | self.concat_linear1 = nn.Linear(size + size, size) 56 | self.concat_linear2 = nn.Linear(size + size, size) 57 | 58 | def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None): 59 | """Compute decoded features. 60 | Args: 61 | tgt (torch.Tensor): 62 | decoded previous target features (batch, max_time_out, size) 63 | tgt_mask (torch.Tensor): mask for x (batch, max_time_out) 64 | memory (torch.Tensor): encoded source features (batch, max_time_in, size) 65 | memory_mask (torch.Tensor): mask for memory (batch, max_time_in) 66 | cache (torch.Tensor): cached output (batch, max_time_out-1, size) 67 | """ 68 | residual = tgt 69 | if self.normalize_before: 70 | tgt = self.norm1(tgt) 71 | 72 | if cache is None: 73 | tgt_q = tgt 74 | tgt_q_mask = tgt_mask 75 | else: 76 | # compute only the last frame query keeping dim: max_time_out -> 1 77 | assert cache.shape == ( 78 | tgt.shape[0], 79 | tgt.shape[1] - 1, 80 | self.size, 81 | ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}" 82 | tgt_q = tgt[:, -1:, :] 83 | residual = residual[:, -1:, :] 84 | tgt_q_mask = None 85 | if tgt_mask is not None: 86 | tgt_q_mask = tgt_mask[:, -1:, :] 87 | 88 | if self.concat_after: 89 | tgt_concat = torch.cat( 90 | (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1 91 | ) 92 | x = residual + self.concat_linear1(tgt_concat) 93 | else: 94 | x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) 95 | if not self.normalize_before: 96 | x = self.norm1(x) 97 | 98 | residual = x 99 | if self.normalize_before: 100 | x = self.norm2(x) 101 | if self.concat_after: 102 | x_concat = torch.cat( 103 | (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1 104 | ) 105 | x = residual + self.concat_linear2(x_concat) 106 | else: 107 | x = residual + self.dropout(self.src_attn(x, memory, memory, memory_mask)) 108 | if not self.normalize_before: 109 | x = self.norm2(x) 110 | 111 | residual = x 112 | if self.normalize_before: 113 | x = self.norm3(x) 114 | x = residual + self.dropout(self.feed_forward(x)) 115 | if not self.normalize_before: 116 | x = self.norm3(x) 117 | 118 | if cache is not None: 119 | x = torch.cat([cache, x], dim=1) 120 | 121 | return x, tgt_mask, memory, memory_mask 122 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by 
https://www.toptal.com/developers/gitignore/api/python,visualstudiocode 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | # LSP config files 174 | pyrightconfig.json 175 | 176 | ### VisualStudioCode ### 177 | .vscode/* 178 | !.vscode/settings.json 179 | !.vscode/tasks.json 180 | !.vscode/launch.json 181 | !.vscode/extensions.json 182 | !.vscode/*.code-snippets 183 | 184 | # Local History for Visual Studio Code 185 | .history/ 186 | 187 | # Built Visual Studio Code Extensions 188 | *.vsix 189 | 190 | ### VisualStudioCode Patch ### 191 | # Ignore all local history of files 192 | .history 193 | .ionide 194 | 195 | # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode 196 | 197 | .cache/* 198 | model-bin 199 | data-bin 200 | wandb 201 | src/ibug/**/weights/* 202 | src/ibug/**/weights/* 203 | 204 | script/_*.py -------------------------------------------------------------------------------- /script/asd.py: -------------------------------------------------------------------------------- 1 | import os, cv2, math, sys 2 | # os.environ['CUDA_VISIBLE_DEVICES'] = '0' 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 4 | import numpy as np 5 | import torch 6 | import torchaudio 7 | import python_speech_features 8 | import json 9 | from src.talking_detector.ASD import ASD 10 | 11 | # ==================== LOAD MODEL ==================== 12 | ASD_MODEL = ASD() 13 | ASD_MODEL.loadParameters("model-bin/finetuning_TalkSet.model") 14 | ASD_MODEL = ASD_MODEL.cuda().eval() 15 | print("Model loaded successfully.") 16 | 17 | def process_video(video_path, output_dir=None, frame_offset=0): 18 | """ 19 | Process a single video file to detect active speakers and output ASD results. 20 | 21 | Args: 22 | video_path (str): Path to the input video file 23 | output_dir (str, optional): Directory to save the output JSON. If None, saves in same directory as video. 
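        frame_offset (int, optional): Offset added to every frame index in the output JSON. `main()` fills it from the "frame_start" field of the video's sidecar JSON when that file exists. Defaults to 0.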
24 | 25 | Returns: 26 | str: Path to the output JSON file 27 | """ 28 | if not os.path.exists(video_path): 29 | raise FileNotFoundError(f"Video file not found: {video_path}") 30 | 31 | # Create output directory if specified 32 | if output_dir is None: 33 | output_dir = os.path.dirname(video_path) 34 | os.makedirs(output_dir, exist_ok=True) 35 | 36 | # Get video name without extension 37 | video_name = os.path.splitext(os.path.basename(video_path))[0] 38 | 39 | # Load audio directly using torchaudio 40 | audio, sample_rate = torchaudio.load(video_path, normalize=False) 41 | assert sample_rate == 16000 42 | 43 | # Convert to numpy for MFCC computation 44 | audio_np = audio[0].numpy() 45 | 46 | # Compute MFCC features 47 | audioFeature = python_speech_features.mfcc(audio_np, 16000, numcep=13, winlen=0.025, winstep=0.010) 48 | 49 | # Load video frames 50 | video = cv2.VideoCapture(video_path) 51 | videoFeature = [] 52 | while video.isOpened(): 53 | ret, frames = video.read() 54 | if ret: 55 | face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) 56 | face = cv2.resize(face, (224,224)) 57 | face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))] 58 | videoFeature.append(face) 59 | else: 60 | break 61 | video.release() 62 | 63 | videoFeature = np.array(videoFeature) 64 | length = min((audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100, videoFeature.shape[0] / 25) 65 | audioFeature = audioFeature[:int(round(length * 100)),:] 66 | videoFeature = videoFeature[:int(round(length * 25)),:,:] 67 | 68 | # Evaluate using model 69 | durationSet = {1,1,1,2,2,2,3,3,4,5,6} 70 | allScore = [] 71 | 72 | for duration in durationSet: 73 | batchSize = int(math.ceil(length / duration)) 74 | scores = [] 75 | with torch.no_grad(): 76 | for i in range(batchSize): 77 | inputA = torch.FloatTensor(audioFeature[i * duration * 100:(i+1) * duration * 100,:]).unsqueeze(0).cuda() 78 | inputV = torch.FloatTensor(videoFeature[i * duration * 25: (i+1) * duration * 25,:,:]).unsqueeze(0).cuda() 79 | embedA = ASD_MODEL.model.forward_audio_frontend(inputA) 80 | embedV = ASD_MODEL.model.forward_visual_frontend(inputV) 81 | out = ASD_MODEL.model.forward_audio_visual_backend(embedA, embedV) 82 | score = ASD_MODEL.lossAV.forward(out, labels=None) 83 | scores.extend(score) 84 | allScore.append(scores) 85 | 86 | # Calculate final scores 87 | final_scores = np.round((np.mean(np.array(allScore), axis=0)), 1).astype(float) 88 | 89 | # Create frame-wise scores dictionary 90 | frame_scores = {frame_idx + frame_offset: round(float(score), 2) for frame_idx, score in enumerate(final_scores)} 91 | 92 | # Save results 93 | output_json = os.path.join(output_dir, f"{video_name}_asd.json") 94 | with open(output_json, 'w') as f: 95 | json.dump(frame_scores, f, indent=4) 96 | 97 | return output_json 98 | 99 | def main(): 100 | import argparse 101 | parser = argparse.ArgumentParser(description="Active Speaker Detection") 102 | parser.add_argument('--video', type=str, required=True, help='Path to input video file') 103 | parser.add_argument('--output_dir', type=str, default=None, help='Directory to save output JSON (optional)') 104 | opt = parser.parse_args() 105 | 106 | video_info = opt.video.replace(".mp4", ".json") 107 | frame_offset = 0 108 | if os.path.exists(video_info): 109 | with open(video_info, 'r') as f: 110 | video_data = json.load(f) 111 | frame_offset = video_data.get("frame_start", 0) 112 | output_path = process_video(opt.video, opt.output_dir, frame_offset) 113 | print(f"ASD results saved to: {output_path}") 114 
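# Illustrative usage (the paths are placeholders, not files shipped with this repo):
#   python script/asd.py --video <face_track>.mp4 --output_dir <output_dir>
# main() also looks for a sidecar <face_track>.json next to the video; if it contains a
# "frame_start" field, that value is passed as frame_offset so the keys of the
# per-frame scores written to <face_track>_asd.json are shifted accordingly.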
| 115 | if __name__ == "__main__": 116 | 117 | main() -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/retina_face_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def conv_bn(inp, oup, stride = 1, leaky = 0): 7 | return nn.Sequential( 8 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 9 | nn.BatchNorm2d(oup), 10 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 11 | ) 12 | 13 | 14 | def conv_bn_no_relu(inp, oup, stride): 15 | return nn.Sequential( 16 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 17 | nn.BatchNorm2d(oup), 18 | ) 19 | 20 | 21 | def conv_bn1X1(inp, oup, stride, leaky=0): 22 | return nn.Sequential( 23 | nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), 24 | nn.BatchNorm2d(oup), 25 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 26 | ) 27 | 28 | 29 | def conv_dw(inp, oup, stride, leaky=0.1): 30 | return nn.Sequential( 31 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 32 | nn.BatchNorm2d(inp), 33 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 34 | 35 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 36 | nn.BatchNorm2d(oup), 37 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 38 | ) 39 | 40 | 41 | class SSH(nn.Module): 42 | def __init__(self, in_channel, out_channel): 43 | super(SSH, self).__init__() 44 | assert out_channel % 4 == 0 45 | leaky = 0 46 | if out_channel <= 64: 47 | leaky = 0.1 48 | self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) 49 | 50 | self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky) 51 | self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 52 | 53 | self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky) 54 | self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 55 | 56 | def forward(self, input): 57 | conv3X3 = self.conv3X3(input) 58 | 59 | conv5X5_1 = self.conv5X5_1(input) 60 | conv5X5 = self.conv5X5_2(conv5X5_1) 61 | 62 | conv7X7_2 = self.conv7X7_2(conv5X5_1) 63 | conv7X7 = self.conv7x7_3(conv7X7_2) 64 | 65 | out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) 66 | out = F.relu(out) 67 | return out 68 | 69 | 70 | class FPN(nn.Module): 71 | def __init__(self,in_channels_list,out_channels): 72 | super(FPN,self).__init__() 73 | leaky = 0 74 | if out_channels <= 64: 75 | leaky = 0.1 76 | self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride=1, leaky=leaky) 77 | self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride=1, leaky=leaky) 78 | self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride=1, leaky=leaky) 79 | 80 | self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) 81 | self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) 82 | 83 | def forward(self, input): 84 | # names = list(input.keys()) 85 | input = list(input.values()) 86 | 87 | output1 = self.output1(input[0]) 88 | output2 = self.output2(input[1]) 89 | output3 = self.output3(input[2]) 90 | 91 | up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") 92 | output2 = output2 + up3 93 | output2 = self.merge2(output2) 94 | 95 | up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") 96 | output1 = output1 + up2 97 | output1 = self.merge1(output1) 98 | 99 | out = [output1, output2, output3] 100 | return out 101 | 102 | 103 | class 
MobileNetV1(nn.Module): 104 | def __init__(self): 105 | super(MobileNetV1, self).__init__() 106 | self.stage1 = nn.Sequential( 107 | conv_bn(3, 8, 2, leaky=0.1), # 3 108 | conv_dw(8, 16, 1), # 7 109 | conv_dw(16, 32, 2), # 11 110 | conv_dw(32, 32, 1), # 19 111 | conv_dw(32, 64, 2), # 27 112 | conv_dw(64, 64, 1), # 43 113 | ) 114 | self.stage2 = nn.Sequential( 115 | conv_dw(64, 128, 2), # 43 + 16 = 59 116 | conv_dw(128, 128, 1), # 59 + 32 = 91 117 | conv_dw(128, 128, 1), # 91 + 32 = 123 118 | conv_dw(128, 128, 1), # 123 + 32 = 155 119 | conv_dw(128, 128, 1), # 155 + 32 = 187 120 | conv_dw(128, 128, 1), # 187 + 32 = 219 121 | ) 122 | self.stage3 = nn.Sequential( 123 | conv_dw(128, 256, 2), # 219 +3 2 = 241 124 | conv_dw(256, 256, 1), # 241 + 64 = 301 125 | ) 126 | self.avg = nn.AdaptiveAvgPool2d((1,1)) 127 | self.fc = nn.Linear(256, 1000) 128 | 129 | def forward(self, x): 130 | x = self.stage1(x) 131 | x = self.stage2(x) 132 | x = self.stage3(x) 133 | x = self.avg(x) 134 | # x = self.model(x) 135 | x = x.view(-1, 256) 136 | x = self.fc(x) 137 | return x 138 | -------------------------------------------------------------------------------- /src/talking_detector/ASD.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import sys, time, numpy, os, subprocess, pandas, tqdm 6 | from subprocess import PIPE 7 | 8 | from .loss import lossAV, lossV 9 | from .Model import ASD_Model 10 | 11 | class ASD(nn.Module): 12 | def __init__(self, lr = 0.001, lrDecay = 0.95, **kwargs): 13 | super(ASD, self).__init__() 14 | self.model = ASD_Model().cuda() 15 | self.lossAV = lossAV().cuda() 16 | self.lossV = lossV().cuda() 17 | self.optim = torch.optim.Adam(self.parameters(), lr = lr) 18 | self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = 1, gamma=lrDecay) 19 | print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.model.parameters()) / 1000 / 1000)) 20 | 21 | def train_network(self, loader, epoch, **kwargs): 22 | self.train() 23 | self.scheduler.step(epoch - 1) # StepLR 24 | index, top1, lossV, lossAV, loss = 0, 0, 0, 0, 0 25 | lr = self.optim.param_groups[0]['lr'] 26 | r = 1.3 - 0.02 * (epoch - 1) 27 | for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1): 28 | self.zero_grad() 29 | 30 | audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) 31 | visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) 32 | 33 | outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) 34 | outsV = self.model.forward_visual_backend(visualEmbed) 35 | 36 | labels = labels[0].reshape((-1)).cuda() # Loss 37 | nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, r) 38 | nlossV = self.lossV.forward(outsV, labels, r) 39 | nloss = nlossAV + 0.5 * nlossV 40 | 41 | lossV += nlossV.detach().cpu().numpy() 42 | lossAV += nlossAV.detach().cpu().numpy() 43 | loss += nloss.detach().cpu().numpy() 44 | top1 += prec 45 | nloss.backward() 46 | self.optim.step() 47 | index += len(labels) 48 | sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \ 49 | " [%2d] r: %2f, Lr: %5f, Training: %.2f%%, " %(epoch, r, lr, 100 * (num / loader.__len__())) + \ 50 | " LossV: %.5f, LossAV: %.5f, Loss: %.5f, ACC: %2.2f%% \r" %(lossV/(num), lossAV/(num), loss/(num), 100 * (top1/index))) 51 | sys.stderr.flush() 52 | 53 | sys.stdout.write("\n") 54 | 55 | return loss/num, lr 56 | 57 | def 
evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs): 58 | self.eval() 59 | predScores = [] 60 | for audioFeature, visualFeature, labels in tqdm.tqdm(loader): 61 | with torch.no_grad(): 62 | audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) 63 | visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) 64 | outsAV= self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) 65 | labels = labels[0].reshape((-1)).cuda() 66 | _, predScore, _, _ = self.lossAV.forward(outsAV, labels) 67 | predScore = predScore[:,1].detach().cpu().numpy() 68 | predScores.extend(predScore) 69 | # break 70 | evalLines = open(evalOrig).read().splitlines()[1:] 71 | labels = [] 72 | labels = pandas.Series( ['SPEAKING_AUDIBLE' for line in evalLines]) 73 | scores = pandas.Series(predScores) 74 | evalRes = pandas.read_csv(evalOrig) 75 | evalRes['score'] = scores 76 | evalRes['label'] = labels 77 | evalRes.drop(['label_id'], axis=1,inplace=True) 78 | evalRes.drop(['instance_id'], axis=1,inplace=True) 79 | evalRes.to_csv(evalCsvSave, index=False) 80 | cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s "%(evalOrig, evalCsvSave) 81 | mAP = float(str(subprocess.run(cmd, shell=True, stdout=PIPE, stderr=PIPE).stdout).split(' ')[2][:5]) 82 | return mAP 83 | 84 | def saveParameters(self, path): 85 | torch.save(self.state_dict(), path) 86 | 87 | def loadParameters(self, path): 88 | selfState = self.state_dict() 89 | loadedState = torch.load(path, weights_only=True) 90 | for name, param in loadedState.items(): 91 | origName = name 92 | if name not in selfState: 93 | name = name.replace("module.", "") 94 | if name not in selfState: 95 | print("%s is not in the model."%origName) 96 | continue 97 | if selfState[name].size() != loadedState[origName].size(): 98 | sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s"%(origName, selfState[name].size(), loadedState[origName].size())) 99 | continue 100 | selfState[name].copy_(param) -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: CHiME-9 Task 1 - MCoRec 4 | has_children: true 5 | parent: CHiME-9 6 | nav_order: 1 7 | --- 8 | 9 | # CHiME-9 Task 1: Multi-Modal Context-aware Recognition (MCoRec) 10 | 11 | ## High-Level Summary 12 | 13 | CHiME-9 Task 1 targets the problem of **Multi-Modal Context-aware Recognition (MCoRec)** in a single-room environment. The goal is to process a single 360° video and audio recording of a room where multiple, separate conversations are happening simultaneously, and to both transcribe each speaker's speech and identify which speakers belong to the same conversation. This task addresses the challenging scenario of understanding overlapping conversations in natural social environments, where multiple groups of people engage in distinct discussions within the same physical space. 
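Concretely, a submitted system produces, for every session, one time-aligned `.vtt` transcript per target speaker plus a single `speaker_to_cluster.json` file mapping each speaker to a conversation label (the layout read by `script/evaluate.py` in the baseline repository). The snippet below is only a minimal illustration of that mapping; the speaker and cluster names are invented, and the particular label values should not matter as long as speakers in the same conversation share one.

```python
import json

# Hypothetical session with four target speakers split into two conversations.
speaker_to_cluster = {
    "speaker_01": "conversation_0",
    "speaker_02": "conversation_0",
    "speaker_03": "conversation_1",
    "speaker_04": "conversation_1",
}

with open("speaker_to_cluster.json", "w") as f:
    json.dump(speaker_to_cluster, f, indent=4)
```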
14 | 15 | ![MCoRec Challenge Overview](images/mcorec_overview.png) 16 | 17 | ## Key Challenge Features 18 | 19 | * **Multiple concurrent conversations** occurring simultaneously in the same room 20 | * **Single 360° camera and microphone** capturing all participants from a central viewpoint 21 | * **High speech overlap ratios** reaching up to 100% due to simultaneous conversations 22 | * **Real, unscripted conversations** covering everyday topics like hobbies, work, entertainment, and personal stories 23 | * **Natural acoustic environments** with realistic background noise from other ongoing conversations 24 | * **Up to 8 active speakers** divided into up to 4 simultaneous conversations 25 | * **Combined transcription and clustering challenge** requiring both accurate speech recognition and conversation grouping 26 | 27 | ## The Scenario 28 | 29 | The MCoRec dataset captures natural conversational scenarios where 2-8 participants are seated around a table and divided into groups of 2-4 speakers each. Participants engage in unscripted conversations on topics including everyday life, work, school, hypotheticals, entertainment, news, and personal stories. Sessions typically last around 6 minutes, during which multiple separate conversations occur simultaneously, creating a challenging acoustic environment with significant speech overlap. A moderator signals the start and end of each session with a distinctive whistle to facilitate synchronization. 30 | 31 | ## The Recording Setup 32 | 33 | * **360° Camera**: GoPro Max positioned at the center of the table, capturing 4K resolution video at 25fps with single-channel audio at 16kHz 34 | * **Individual Smartphones**: Each speaker has a smartphone placed in front of them (selfie mode) recording at 720p resolution 35 | * **Lapel Microphones**: Close-talking microphones connected to smartphones via adapters, positioned near each speaker's mouth for enhanced audio clarity 36 | * **Seating Arrangement**: All speakers sit around a table with varying distances depending on table size 37 | * **Session Synchronization**: Moderator whistle cues enable precise alignment of recordings from multiple devices 38 | * **Recording Duration**: Each session typically lasts approximately 6 minutes. 39 | 40 | ***Note***: Individual smartphone recordings and lapel microphone audio are **only available for the training set** to facilitate system development. The development and evaluation sets contain **only the central 360° video and audio**, as the core challenge focuses on processing the difficult multi-speaker, multi-conversation scenario captured by the central camera setup with high speech overlap and acoustic complexity. 41 | 42 | ## Task Description 43 | 44 | The challenge consists of a single comprehensive track requiring participants' systems to: 45 | 46 | 1. **Individual Speaker Transcription**: Generate time-aligned transcripts (`.vtt` files) for each target speaker, accurately capturing their speech content within the specified evaluation time intervals. 47 | 2. **Conversation Clustering**: Group participants into their respective conversations by generating a speaker-to-cluster mapping. 48 | 49 | **Input**: Single 360° video and its corresponding audio track, along with bounding boxes to identify the list of target participants. 50 | 51 | **Output**: Per-speaker transcriptions and conversation cluster assignments 52 | 53 | 54 | ## Evaluation and Ranking 55 | 56 | The evaluation uses **three complementary metrics**: 57 | 58 | 1. 
**Individual Speaker's WER**: Word Error Rate computed for each speaker's transcription 59 | 2. **Conversation Clustering Performance**: Pairwise F1 score measuring clustering accuracy 60 | 3. **Joint ASR-Clustering Error Rate** (*Primary Metric*): Combined metric that weights transcription performance and clustering performance 61 | 62 | ## Important Dates 63 | 64 | * **Data Release**: July 1st, 2025 (train and dev sets) 65 | * **Evaluation Data Release**: TBU 66 | * **Final Submission Deadline**: 7 Feb 2026 67 | * **Results Announcement**: 3 May 2026 68 | * **Workshop**: TBU 69 | 70 | ## Organizers 71 | 72 | - Alexander Waibel (CMU, USA & KIT, DE) 73 | - Christian Fuegen (Meta, UK) 74 | - Shinji Watanabe (CMU, USA) 75 | - Katerina Zmolikova (Meta, UK) 76 | - Thai-Binh Nguyen (KIT, DE) 77 | - Pingchuan Ma (Meta, USA) 78 | 79 | For any questions about the challenge, please contact us: 80 | - Email: [mcorecchallenge@gmail.com](mailto:mcorecchallenge@gmail.com) 81 | - Slack: [CHiME Challenge Community](https://join.slack.com/t/chimechallenge/shared_invite/zt-37h0cfpeb-qg5jwCgqRWCKc_3mLWVsYA) 82 | -------------------------------------------------------------------------------- /src/custom_trainer.py: -------------------------------------------------------------------------------- 1 | from transformers.trainer import * 2 | from typing import Callable, Dict, List, Optional, Tuple, Union, Type 3 | 4 | class AVSRTrainer(Trainer): 5 | 6 | def __init__( 7 | self, 8 | model: Union[PreTrainedModel, nn.Module] = None, 9 | args: TrainingArguments = None, 10 | data_collator: any = None, 11 | valid_data_collator: any = None, 12 | train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, 13 | eval_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, 14 | processing_class: Optional[ 15 | Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] 16 | ] = None, 17 | model_init: Optional[Callable[[], PreTrainedModel]] = None, 18 | compute_loss_func: Optional[Callable] = None, 19 | compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, 20 | callbacks: Optional[List[TrainerCallback]] = None, 21 | optimizers: Tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), 22 | optimizer_cls_and_kwargs: Optional[Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]] = None, 23 | preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, 24 | 25 | ): 26 | super().__init__( 27 | model=model, 28 | args=args, 29 | data_collator=data_collator, 30 | train_dataset=train_dataset, 31 | eval_dataset=eval_dataset, 32 | tokenizer=processing_class, 33 | model_init=model_init, 34 | compute_loss_func=compute_loss_func, 35 | compute_metrics=compute_metrics, 36 | callbacks=callbacks, 37 | optimizers=optimizers, 38 | optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, 39 | preprocess_logits_for_metrics=preprocess_logits_for_metrics, 40 | ) 41 | self.valid_data_collator = valid_data_collator 42 | 43 | def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None) -> DataLoader: 44 | """ 45 | Returns the evaluation [`~torch.utils.data.DataLoader`]. 46 | 47 | Subclass and override this method if you want to inject some custom behavior. 48 | 49 | Args: 50 | eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*): 51 | If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. 
If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. 52 | """ 53 | if eval_dataset is None and self.eval_dataset is None: 54 | raise ValueError("Trainer: evaluation requires an eval_dataset.") 55 | 56 | # If we have persistent workers, don't do a fork bomb especially as eval datasets 57 | # don't change during training 58 | dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval" 59 | if ( 60 | hasattr(self, "_eval_dataloaders") 61 | and dataloader_key in self._eval_dataloaders 62 | and self.args.dataloader_persistent_workers 63 | ): 64 | return self.accelerator.prepare(self._eval_dataloaders[dataloader_key]) 65 | 66 | eval_dataset = ( 67 | self.eval_dataset[eval_dataset] 68 | if isinstance(eval_dataset, str) 69 | else eval_dataset 70 | if eval_dataset is not None 71 | else self.eval_dataset 72 | ) 73 | data_collator = self.valid_data_collator 74 | 75 | if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): 76 | eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") 77 | else: 78 | data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation") 79 | 80 | dataloader_params = { 81 | "batch_size": self.args.eval_batch_size, 82 | "collate_fn": data_collator, 83 | "num_workers": self.args.dataloader_num_workers, 84 | "pin_memory": self.args.dataloader_pin_memory, 85 | "persistent_workers": self.args.dataloader_persistent_workers, 86 | } 87 | 88 | if not isinstance(eval_dataset, torch.utils.data.IterableDataset): 89 | dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset) 90 | dataloader_params["drop_last"] = self.args.dataloader_drop_last 91 | dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor 92 | 93 | # accelerator.free_memory() will destroy the references, so 94 | # we need to store the non-prepared version 95 | eval_dataloader = DataLoader(eval_dataset, **dataloader_params) 96 | if self.args.dataloader_persistent_workers: 97 | if hasattr(self, "_eval_dataloaders"): 98 | self._eval_dataloaders[dataloader_key] = eval_dataloader 99 | else: 100 | self._eval_dataloaders = {dataloader_key: eval_dataloader} 101 | 102 | return self.accelerator.prepare(eval_dataloader) -------------------------------------------------------------------------------- /src/nets/scorers/ctc.py: -------------------------------------------------------------------------------- 1 | """ScorerInterface implementation for CTC.""" 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from src.nets.ctc_prefix_score import CTCPrefixScore, CTCPrefixScoreTH 7 | from src.nets.scorer_interface import BatchPartialScorerInterface 8 | 9 | 10 | class CTCPrefixScorer(BatchPartialScorerInterface): 11 | """Decoder interface wrapper for CTCPrefixScore.""" 12 | 13 | def __init__(self, ctc: torch.nn.Module, eos: int): 14 | """Initialize class. 15 | 16 | Args: 17 | ctc (torch.nn.Module): The CTC implementation. 18 | For example, :class:`src.nets.backend.ctc.CTC` 19 | eos (int): The end-of-sequence id. 20 | 21 | """ 22 | self.ctc = ctc 23 | self.eos = eos 24 | self.impl = None 25 | 26 | def init_state(self, x: torch.Tensor): 27 | """Get an initial state for decoding. 
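        Example (illustrative sketch; assumes `ctc` is a CTC module exposing
        `log_softmax`, `x` is the encoder output of a single utterance, and
        `eos_id` is the end-of-sequence token id):

            scorer = CTCPrefixScorer(ctc, eos=eos_id)
            state = scorer.init_state(x)  # -> (0, initial CTC prefix state)
            # `state` is then advanced through `score_partial` during beam search.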
28 | 29 | Args: 30 | x (torch.Tensor): The encoded feature tensor 31 | 32 | Returns: initial state 33 | 34 | """ 35 | logp = self.ctc.log_softmax(x.unsqueeze(0)).detach().squeeze(0).cpu().numpy() 36 | # TODO(karita): use CTCPrefixScoreTH 37 | self.impl = CTCPrefixScore(logp, 0, self.eos, np) 38 | return 0, self.impl.initial_state() 39 | 40 | def select_state(self, state, i, new_id=None): 41 | """Select state with relative ids in the main beam search. 42 | 43 | Args: 44 | state: Decoder state for prefix tokens 45 | i (int): Index to select a state in the main beam search 46 | new_id (int): New label id to select a state if necessary 47 | 48 | Returns: 49 | state: pruned state 50 | 51 | """ 52 | if type(state) == tuple: 53 | if len(state) == 2: # for CTCPrefixScore 54 | sc, st = state 55 | return sc[i], st[i] 56 | else: # for CTCPrefixScoreTH (need new_id > 0) 57 | r, log_psi, f_min, f_max, scoring_idmap = state 58 | s = log_psi[i, new_id].expand(log_psi.size(1)) 59 | if scoring_idmap is not None: 60 | return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max 61 | else: 62 | return r[:, :, i, new_id], s, f_min, f_max 63 | return None if state is None else state[i] 64 | 65 | def score_partial(self, y, ids, state, x): 66 | """Score new token. 67 | 68 | Args: 69 | y (torch.Tensor): 1D prefix token 70 | next_tokens (torch.Tensor): torch.int64 next token to score 71 | state: decoder state for prefix tokens 72 | x (torch.Tensor): 2D encoder feature that generates ys 73 | 74 | Returns: 75 | tuple[torch.Tensor, Any]: 76 | Tuple of a score tensor for y that has a shape `(len(next_tokens),)` 77 | and next state for ys 78 | 79 | """ 80 | prev_score, state = state 81 | presub_score, new_st = self.impl(y.cpu(), ids.cpu(), state) 82 | tscore = torch.as_tensor( 83 | presub_score - prev_score, device=x.device, dtype=x.dtype 84 | ) 85 | return tscore, (presub_score, new_st) 86 | 87 | def batch_init_state(self, x: torch.Tensor): 88 | """Get an initial state for decoding. 89 | 90 | Args: 91 | x (torch.Tensor): The encoded feature tensor 92 | 93 | Returns: initial state 94 | 95 | """ 96 | logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1 97 | xlen = torch.tensor([logp.size(1)]) 98 | self.impl = CTCPrefixScoreTH(logp, xlen, 0, self.eos) 99 | return None 100 | 101 | def batch_score_partial(self, y, ids, state, x): 102 | """Score new token. 103 | 104 | Args: 105 | y (torch.Tensor): 1D prefix token 106 | ids (torch.Tensor): torch.int64 next token to score 107 | state: decoder state for prefix tokens 108 | x (torch.Tensor): 2D encoder feature that generates ys 109 | 110 | Returns: 111 | tuple[torch.Tensor, Any]: 112 | Tuple of a score tensor for y that has a shape `(len(next_tokens),)` 113 | and next state for ys 114 | 115 | """ 116 | batch_state = ( 117 | ( 118 | torch.stack([s[0] for s in state], dim=2), 119 | torch.stack([s[1] for s in state]), 120 | state[0][2], 121 | state[0][3], 122 | ) 123 | if state[0] is not None 124 | else None 125 | ) 126 | return self.impl(y, batch_state, ids) 127 | 128 | def extend_prob(self, x: torch.Tensor): 129 | """Extend probs for decoding. 130 | 131 | This extension is for streaming decoding 132 | as in Eq (14) in https://arxiv.org/abs/2006.14941 133 | 134 | Args: 135 | x (torch.Tensor): The encoded feature tensor 136 | 137 | """ 138 | logp = self.ctc.log_softmax(x.unsqueeze(0)) 139 | self.impl.extend_prob(logp) 140 | 141 | def extend_state(self, state): 142 | """Extend state for decoding. 
143 | 144 | This extension is for streaming decoding 145 | as in Eq (14) in https://arxiv.org/abs/2006.14941 146 | 147 | Args: 148 | state: The states of hyps 149 | 150 | Returns: exteded state 151 | 152 | """ 153 | new_state = [] 154 | for s in state: 155 | new_state.append(self.impl.extend_state(s)) 156 | 157 | return new_state 158 | -------------------------------------------------------------------------------- /src/ibug/face_detection/retina_face/retina_face_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from copy import deepcopy 5 | from types import SimpleNamespace 6 | from typing import Union, Optional 7 | from .prior_box import PriorBox 8 | from .py_cpu_nms import py_cpu_nms 9 | from .retina_face import RetinaFace 10 | from .config import cfg_mnet, cfg_re50 11 | from .box_utils import decode, decode_landm 12 | 13 | 14 | __all__ = ['RetinaFacePredictor'] 15 | 16 | 17 | class RetinaFacePredictor(object): 18 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 19 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 20 | self.threshold = threshold 21 | self.device = device 22 | if model is None: 23 | model = RetinaFacePredictor.get_model() 24 | if config is None: 25 | config = RetinaFacePredictor.create_config() 26 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 27 | self.net = RetinaFace(cfg=self.config.__dict__, phase='test').to(self.device) 28 | pretrained_dict = torch.load(model.weights, map_location=self.device) 29 | if 'state_dict' in pretrained_dict.keys(): 30 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 31 | for key, value in pretrained_dict['state_dict'].items()} 32 | else: 33 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 34 | for key, value in pretrained_dict.items()} 35 | self.net.load_state_dict(pretrained_dict, strict=False) 36 | self.net.eval() 37 | self.priors = None 38 | self.previous_size = None 39 | 40 | @staticmethod 41 | def get_model(name: str = 'resnet50') -> SimpleNamespace: 42 | name = name.lower().strip() 43 | if name == 'resnet50': 44 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 45 | '..','..','..','..','model-bin','face_detection','retina_face','weights', 'Resnet50_Final.pth')), 46 | config=SimpleNamespace(**deepcopy(cfg_re50))) 47 | elif name == 'mobilenet0.25': 48 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 49 | '..','..','..','..','model-bin','face_detection','retina_face','weights', 'mobilenet0.25_Final.pth')), 50 | config=SimpleNamespace(**deepcopy(cfg_mnet))) 51 | else: 52 | raise ValueError('name must be set to either resnet50 or mobilenet0.25') 53 | 54 | @staticmethod 55 | def create_config(top_k: int = 750, conf_thresh: float = 0.02, 56 | nms_thresh: float = 0.4, nms_top_k: int = 5000) -> SimpleNamespace: 57 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, nms_top_k=nms_top_k) 58 | 59 | @torch.no_grad() 60 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 61 | im_height, im_width, _ = image.shape 62 | if rgb: 63 | image = image[..., ::-1] 64 | image = image.astype(int) - np.array([104, 117, 123]) 65 | image = image.transpose(2, 0, 1) 66 | image = 
torch.from_numpy(image).unsqueeze(0).float().to(self.device) 67 | scale = torch.Tensor([im_width, im_height, im_width, im_height]).to(self.device) 68 | loc, conf, landms = self.net(image) 69 | image_size = (im_height, im_width) 70 | if self.priors is None or self.previous_size != image_size: 71 | self.priors = PriorBox(self.config.__dict__, image_size=image_size).forward().to(self.device) 72 | self.previous_size = image_size 73 | prior_data = self.priors.data 74 | boxes = decode(loc.data.squeeze(0), prior_data, self.config.variance) 75 | boxes = boxes * scale 76 | boxes = boxes.cpu().numpy() 77 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 78 | landms = decode_landm(landms.data.squeeze(0), prior_data, self.config.variance) 79 | scale1 = torch.Tensor([image.shape[3], image.shape[2], image.shape[3], image.shape[2], 80 | image.shape[3], image.shape[2], image.shape[3], image.shape[2], 81 | image.shape[3], image.shape[2]]).to(self.device) 82 | landms = landms * scale1 83 | landms = landms.cpu().numpy() 84 | 85 | # ignore low scores 86 | inds = np.where(scores > self.config.conf_thresh)[0] 87 | if len(inds) == 0: 88 | return np.empty(shape=(0, 15), dtype=np.float32) 89 | boxes = boxes[inds] 90 | landms = landms[inds] 91 | scores = scores[inds] 92 | 93 | # do NMS 94 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 95 | keep = py_cpu_nms(dets, self.config.nms_thresh, self.config.nms_top_k) 96 | dets = dets[keep, :] 97 | landms = landms[keep] 98 | 99 | # keep top-K 100 | dets = dets[:self.config.top_k, :] 101 | landms = landms[:self.config.top_k, :] 102 | dets = np.concatenate((dets, landms), axis=1) 103 | 104 | # further filter by confidence 105 | inds = np.where(dets[:, 4] >= self.threshold)[0] 106 | if len(inds) == 0: 107 | return np.empty(shape=(0, 15), dtype=np.float32) 108 | else: 109 | return dets[inds] 110 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/modules/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pdb 3 | 4 | import torch.nn as nn 5 | 6 | from src.nets.backend.transformer.convolution import Swish 7 | 8 | 9 | def conv3x3(in_planes, out_planes, stride=1): 10 | """conv3x3. 11 | 12 | :param in_planes: int, number of channels in the input sequence. 13 | :param out_planes: int, number of channels produced by the convolution. 14 | :param stride: int, size of the convolving kernel. 15 | """ 16 | return nn.Conv2d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False, 23 | ) 24 | 25 | 26 | def downsample_basic_block(inplanes, outplanes, stride): 27 | """downsample_basic_block. 28 | 29 | :param inplanes: int, number of channels in the input sequence. 30 | :param outplanes: int, number of channels produced by the convolution. 31 | :param stride: int, size of the convolving kernel. 32 | """ 33 | return nn.Sequential( 34 | nn.Conv2d( 35 | inplanes, 36 | outplanes, 37 | kernel_size=1, 38 | stride=stride, 39 | bias=False, 40 | ), 41 | nn.BatchNorm2d(outplanes), 42 | ) 43 | 44 | 45 | class BasicBlock(nn.Module): 46 | expansion = 1 47 | 48 | def __init__( 49 | self, 50 | inplanes, 51 | planes, 52 | stride=1, 53 | downsample=None, 54 | relu_type="swish", 55 | ): 56 | """__init__. 57 | 58 | :param inplanes: int, number of channels in the input sequence. 59 | :param planes: int, number of channels produced by the convolution. 
60 | :param stride: int, size of the convolving kernel. 61 | :param downsample: boolean, if True, the temporal resolution is downsampled. 62 | :param relu_type: str, type of activation function. 63 | """ 64 | super(BasicBlock, self).__init__() 65 | 66 | assert relu_type in ["relu", "prelu", "swish"] 67 | 68 | self.conv1 = conv3x3(inplanes, planes, stride) 69 | self.bn1 = nn.BatchNorm2d(planes) 70 | 71 | if relu_type == "relu": 72 | self.relu1 = nn.ReLU(inplace=True) 73 | self.relu2 = nn.ReLU(inplace=True) 74 | elif relu_type == "prelu": 75 | self.relu1 = nn.PReLU(num_parameters=planes) 76 | self.relu2 = nn.PReLU(num_parameters=planes) 77 | elif relu_type == "swish": 78 | self.relu1 = Swish() 79 | self.relu2 = Swish() 80 | else: 81 | raise NotImplementedError 82 | # -------- 83 | 84 | self.conv2 = conv3x3(planes, planes) 85 | self.bn2 = nn.BatchNorm2d(planes) 86 | 87 | self.downsample = downsample 88 | self.stride = stride 89 | 90 | def forward(self, x): 91 | """forward. 92 | 93 | :param x: torch.Tensor, input tensor with input size (B, C, T, H, W). 94 | """ 95 | residual = x 96 | out = self.conv1(x) 97 | out = self.bn1(out) 98 | out = self.relu1(out) 99 | out = self.conv2(out) 100 | out = self.bn2(out) 101 | if self.downsample is not None: 102 | residual = self.downsample(x) 103 | 104 | out += residual 105 | out = self.relu2(out) 106 | 107 | return out 108 | 109 | 110 | class ResNet(nn.Module): 111 | def __init__( 112 | self, 113 | block, 114 | layers, 115 | relu_type="swish", 116 | ): 117 | super(ResNet, self).__init__() 118 | self.inplanes = 64 119 | self.relu_type = relu_type 120 | self.downsample_block = downsample_basic_block 121 | 122 | self.layer1 = self._make_layer(block, 64, layers[0]) 123 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 124 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 125 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 126 | self.avgpool = nn.AdaptiveAvgPool2d(1) 127 | 128 | def _make_layer(self, block, planes, blocks, stride=1): 129 | """_make_layer. 130 | 131 | :param block: torch.nn.Module, class of blocks. 132 | :param planes: int, number of channels produced by the convolution. 133 | :param blocks: int, number of layers in a block. 134 | :param stride: int, size of the convolving kernel. 135 | """ 136 | downsample = None 137 | if stride != 1 or self.inplanes != planes * block.expansion: 138 | downsample = self.downsample_block( 139 | inplanes=self.inplanes, 140 | outplanes=planes * block.expansion, 141 | stride=stride, 142 | ) 143 | 144 | layers = [] 145 | layers.append( 146 | block( 147 | self.inplanes, 148 | planes, 149 | stride, 150 | downsample, 151 | relu_type=self.relu_type, 152 | ) 153 | ) 154 | self.inplanes = planes * block.expansion 155 | for i in range(1, blocks): 156 | layers.append( 157 | block( 158 | self.inplanes, 159 | planes, 160 | relu_type=self.relu_type, 161 | ) 162 | ) 163 | 164 | return nn.Sequential(*layers) 165 | 166 | def forward(self, x): 167 | """forward. 168 | 169 | :param x: torch.Tensor, input tensor with input size (B, C, T, H, W). 
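        :returns: torch.Tensor, features pooled to 1x1 and flattened to shape
            (batch, 512 * block.expansion), i.e. (batch, 512) with the BasicBlock
            configuration defined in this module.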
170 | """ 171 | x = self.layer1(x) 172 | x = self.layer2(x) 173 | x = self.layer3(x) 174 | x = self.layer4(x) 175 | x = self.avgpool(x) 176 | x = x.view(x.size(0), -1) 177 | return x 178 | -------------------------------------------------------------------------------- /src/nets/backend/transformer/encoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Encoder self-attention layer definition.""" 8 | 9 | import copy 10 | 11 | import torch 12 | 13 | from src.nets.backend.transformer.layer_norm import LayerNorm 14 | 15 | from torch import nn 16 | 17 | 18 | class EncoderLayer(nn.Module): 19 | """Encoder layer module. 20 | 21 | :param int size: input dim 22 | :param src.nets.backend.transformer.attention. 23 | MultiHeadedAttention self_attn: self attention module 24 | RelPositionMultiHeadedAttention self_attn: self attention module 25 | :param src.nets.backend.transformer.positionwise_feed_forward. 26 | PositionwiseFeedForward feed_forward: 27 | feed forward module 28 | :param src.nets.backend.transformer.convolution. 29 | ConvolutionModule feed_foreard: 30 | feed forward module 31 | :param float dropout_rate: dropout rate 32 | :param bool normalize_before: whether to use layer_norm before the first block 33 | :param bool concat_after: whether to concat attention layer's input and output 34 | if True, additional linear will be applied. 35 | i.e. x -> x + linear(concat(x, att(x))) 36 | if False, no additional linear will be applied. i.e. x -> x + att(x) 37 | :param bool macaron_style: whether to use macaron style for PositionwiseFeedForward 38 | 39 | """ 40 | 41 | def __init__( 42 | self, 43 | size, 44 | self_attn, 45 | feed_forward, 46 | conv_module, 47 | dropout_rate, 48 | normalize_before=True, 49 | concat_after=False, 50 | macaron_style=False, 51 | ): 52 | """Construct an EncoderLayer object.""" 53 | super(EncoderLayer, self).__init__() 54 | self.self_attn = self_attn 55 | self.feed_forward = feed_forward 56 | self.ff_scale = 1.0 57 | self.conv_module = conv_module 58 | self.macaron_style = macaron_style 59 | self.norm_ff = LayerNorm(size) # for the FNN module 60 | self.norm_mha = LayerNorm(size) # for the MHA module 61 | if self.macaron_style: 62 | self.feed_forward_macaron = copy.deepcopy(feed_forward) 63 | self.ff_scale = 0.5 64 | # for another FNN module in macaron style 65 | self.norm_ff_macaron = LayerNorm(size) 66 | if self.conv_module is not None: 67 | self.norm_conv = LayerNorm(size) # for the CNN module 68 | self.norm_final = LayerNorm(size) # for the final output of the block 69 | self.dropout = nn.Dropout(dropout_rate) 70 | self.size = size 71 | self.normalize_before = normalize_before 72 | self.concat_after = concat_after 73 | if self.concat_after: 74 | self.concat_linear = nn.Linear(size + size, size) 75 | 76 | def forward(self, x_input, mask, cache=None): 77 | """Compute encoded features. 
78 | 79 | :param torch.Tensor x_input: encoded source features (batch, max_time_in, size) 80 | :param torch.Tensor mask: mask for x (batch, max_time_in) 81 | :param torch.Tensor cache: cache for x (batch, max_time_in - 1, size) 82 | :rtype: Tuple[torch.Tensor, torch.Tensor] 83 | """ 84 | if isinstance(x_input, tuple): 85 | x, pos_emb = x_input[0], x_input[1] 86 | else: 87 | x, pos_emb = x_input, None 88 | 89 | # whether to use macaron style 90 | if self.macaron_style: 91 | residual = x 92 | if self.normalize_before: 93 | x = self.norm_ff_macaron(x) 94 | x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x)) 95 | if not self.normalize_before: 96 | x = self.norm_ff_macaron(x) 97 | 98 | # multi-headed self-attention module 99 | residual = x 100 | if self.normalize_before: 101 | x = self.norm_mha(x) 102 | 103 | if cache is None: 104 | x_q = x 105 | else: 106 | assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) 107 | x_q = x[:, -1:, :] 108 | residual = residual[:, -1:, :] 109 | mask = None if mask is None else mask[:, -1:, :] 110 | 111 | if pos_emb is not None: 112 | x_att = self.self_attn(x_q, x, x, pos_emb, mask) 113 | else: 114 | x_att = self.self_attn(x_q, x, x, mask) 115 | 116 | if self.concat_after: 117 | x_concat = torch.cat((x, x_att), dim=-1) 118 | x = residual + self.concat_linear(x_concat) 119 | else: 120 | x = residual + self.dropout(x_att) 121 | if not self.normalize_before: 122 | x = self.norm_mha(x) 123 | 124 | # convolution module 125 | if self.conv_module is not None: 126 | residual = x 127 | if self.normalize_before: 128 | x = self.norm_conv(x) 129 | x = residual + self.dropout(self.conv_module(x)) 130 | if not self.normalize_before: 131 | x = self.norm_conv(x) 132 | 133 | # feed forward module 134 | residual = x 135 | if self.normalize_before: 136 | x = self.norm_ff(x) 137 | x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) 138 | if not self.normalize_before: 139 | x = self.norm_ff(x) 140 | 141 | if self.conv_module is not None: 142 | x = self.norm_final(x) 143 | 144 | if cache is not None: 145 | x = torch.cat([cache, x], dim=1) 146 | 147 | if pos_emb is not None: 148 | return (x, pos_emb), mask 149 | else: 150 | return x, mask 151 | -------------------------------------------------------------------------------- /src/nets/backend/e2e_asr_conformer_av.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Transformer speech recognition model (pytorch).""" 5 | 6 | import logging 7 | import numpy 8 | import torch 9 | 10 | from src.nets.backend.ctc import CTC 11 | from src.nets.backend.nets_utils import ( 12 | make_non_pad_mask, 13 | th_accuracy, 14 | ) 15 | from src.nets.backend.transformer.add_sos_eos import add_sos_eos 16 | from src.nets.backend.transformer.decoder import Decoder 17 | from src.nets.backend.transformer.encoder import Encoder 18 | from src.nets.backend.transformer.label_smoothing_loss import LabelSmoothingLoss 19 | from src.nets.backend.transformer.mask import target_mask 20 | from src.nets.backend.nets_utils import MLPHead 21 | 22 | 23 | class E2E(torch.nn.Module): 24 | def __init__(self, args, ignore_id=-1): 25 | torch.nn.Module.__init__(self) 26 | 27 | self.encoder = Encoder( 28 | attention_dim=args.adim, 29 | attention_heads=args.aheads, 30 | linear_units=args.eunits, 31 | num_blocks=args.elayers, 32 | input_layer=args.transformer_input_layer, 33 | 
dropout_rate=args.dropout_rate, 34 | positional_dropout_rate=args.dropout_rate, 35 | attention_dropout_rate=args.transformer_attn_dropout_rate, 36 | encoder_attn_layer_type=args.transformer_encoder_attn_layer_type, 37 | macaron_style=args.macaron_style, 38 | use_cnn_module=args.use_cnn_module, 39 | cnn_module_kernel=args.cnn_module_kernel, 40 | zero_triu=getattr(args, "zero_triu", False), 41 | a_upsample_ratio=args.a_upsample_ratio, 42 | relu_type=getattr(args, "relu_type", "swish"), 43 | ) 44 | 45 | self.aux_encoder = Encoder( 46 | attention_dim=args.aux_adim, 47 | attention_heads=args.aux_aheads, 48 | linear_units=args.aux_eunits, 49 | num_blocks=args.aux_elayers, 50 | input_layer=args.aux_transformer_input_layer, 51 | dropout_rate=args.aux_dropout_rate, 52 | positional_dropout_rate=args.aux_dropout_rate, 53 | attention_dropout_rate=args.aux_transformer_attn_dropout_rate, 54 | encoder_attn_layer_type=args.aux_transformer_encoder_attn_layer_type, 55 | macaron_style=args.aux_macaron_style, 56 | use_cnn_module=args.aux_use_cnn_module, 57 | cnn_module_kernel=args.aux_cnn_module_kernel, 58 | zero_triu=getattr(args, "aux_zero_triu", False), 59 | a_upsample_ratio=args.aux_a_upsample_ratio, 60 | relu_type=getattr(args, "aux_relu_type", "swish"), 61 | ) 62 | 63 | self.transformer_input_layer = args.transformer_input_layer 64 | self.a_upsample_ratio = args.a_upsample_ratio 65 | 66 | self.fusion = MLPHead( 67 | idim=args.adim + args.aux_adim, 68 | hdim=args.fusion_hdim, 69 | odim=args.adim, 70 | norm=args.fusion_norm, 71 | ) 72 | 73 | self.proj_decoder = None 74 | if args.adim != args.ddim: 75 | self.proj_decoder = torch.nn.Linear(args.adim, args.ddim) 76 | 77 | if args.mtlalpha < 1: 78 | self.decoder = Decoder( 79 | odim=args.odim, 80 | attention_dim=args.ddim, 81 | attention_heads=args.dheads, 82 | linear_units=args.dunits, 83 | num_blocks=args.dlayers, 84 | dropout_rate=args.dropout_rate, 85 | positional_dropout_rate=args.dropout_rate, 86 | self_attention_dropout_rate=args.transformer_attn_dropout_rate, 87 | src_attention_dropout_rate=args.transformer_attn_dropout_rate, 88 | ) 89 | else: 90 | self.decoder = None 91 | self.blank = 0 92 | self.sos = args.odim - 1 93 | self.eos = args.odim - 1 94 | self.odim = args.odim 95 | self.ignore_id = ignore_id 96 | 97 | # self.lsm_weight = a 98 | self.criterion = LabelSmoothingLoss( 99 | self.odim, 100 | self.ignore_id, 101 | args.lsm_weight, 102 | args.transformer_length_normalized_loss, 103 | ) 104 | 105 | self.adim = args.adim 106 | self.mtlalpha = args.mtlalpha 107 | if args.mtlalpha > 0.0: 108 | self.ctc = CTC( 109 | args.odim, args.adim, args.dropout_rate, ctc_type=args.ctc_type, reduce=True 110 | ) 111 | else: 112 | self.ctc = None 113 | 114 | def forward(self, video, audio, video_lengths, audio_lengths, label): 115 | video_padding_mask = make_non_pad_mask(video_lengths).to(video.device).unsqueeze(-2) 116 | video_feat, _ = self.encoder(video, video_padding_mask) 117 | 118 | audio_lengths = torch.div(audio_lengths, 640, rounding_mode="trunc") 119 | audio_padding_mask = make_non_pad_mask(audio_lengths).to(video.device).unsqueeze(-2) 120 | 121 | audio_feat, _ = self.aux_encoder(audio, audio_padding_mask) 122 | 123 | x = self.fusion(torch.cat((video_feat, audio_feat), dim=-1)) 124 | 125 | # ctc loss 126 | loss_ctc, ys_hat = self.ctc(x, video_lengths, label) 127 | 128 | if self.proj_decoder: 129 | x = self.proj_decoder(x) 130 | 131 | # decoder loss 132 | ys_in_pad, ys_out_pad = add_sos_eos(label, self.sos, self.eos, self.ignore_id) 133 | ys_mask = 
target_mask(ys_in_pad, self.ignore_id) 134 | pred_pad, _ = self.decoder(ys_in_pad, ys_mask, x, video_padding_mask) 135 | loss_att = self.criterion(pred_pad, ys_out_pad) 136 | loss = self.mtlalpha * loss_ctc + (1 - self.mtlalpha) * loss_att 137 | 138 | acc = th_accuracy( 139 | pred_pad.view(-1, self.odim), ys_out_pad, ignore_label=self.ignore_id 140 | ) 141 | 142 | return loss, loss_ctc, loss_att, acc 143 | -------------------------------------------------------------------------------- /script/evaluate.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)))) 3 | import jiwer 4 | import webvtt 5 | import json 6 | from src.cluster.conv_spks import ( 7 | get_clustering_f1_score, 8 | get_speaker_clustering_f1_score 9 | ) 10 | from src.tokenizer.norm_text import remove_disfluencies 11 | import glob 12 | from transformers.models.whisper.english_normalizer import EnglishTextNormalizer 13 | text_normalizer = EnglishTextNormalizer({}) 14 | 15 | def evaluate_conversation_clustering(label_path, output_path): 16 | with open(os.path.join(label_path, "speaker_to_cluster.json"), "r") as f: 17 | label_data = json.load(f) 18 | with open(os.path.join(output_path, "speaker_to_cluster.json"), "r") as f: 19 | output_data = json.load(f) 20 | return get_clustering_f1_score(label_data, output_data) 21 | 22 | def evaluate_speaker_clustering(label_path, output_path): 23 | with open(os.path.join(label_path, "speaker_to_cluster.json"), "r") as f: 24 | label_data = json.load(f) 25 | with open(os.path.join(output_path, "speaker_to_cluster.json"), "r") as f: 26 | output_data = json.load(f) 27 | return get_speaker_clustering_f1_score(label_data, output_data) 28 | 29 | 30 | def benchmark_vtt_wer(ref_vtt, hypo_vtt, ref_uem_start, ref_uem_end, hypo_uem_start, hypo_uem_end, show_diff=False): 31 | ref_strings = [] 32 | hypo_strings = [] 33 | for caption in webvtt.read(ref_vtt): 34 | if caption.start_in_seconds + caption.start_time.milliseconds/1000 < ref_uem_start: 35 | continue 36 | if caption.end_in_seconds + caption.end_time.milliseconds/1000 > ref_uem_end: 37 | continue 38 | ref_strings.append(remove_disfluencies(text_normalizer(caption.text))) 39 | for caption in webvtt.read(hypo_vtt): 40 | if caption.start_in_seconds + caption.start_time.milliseconds/1000 < hypo_uem_start: 41 | continue 42 | if caption.end_in_seconds + caption.end_time.milliseconds/1000 > hypo_uem_end: 43 | continue 44 | hypo_strings.append(remove_disfluencies(text_normalizer(caption.text))) 45 | 46 | if show_diff: 47 | # Show the WER error type (insertion, deletion, substitution) using wer library 48 | out = jiwer.process_words( 49 | [" ".join(ref_strings)], 50 | [" ".join(hypo_strings)], 51 | ) 52 | print(jiwer.visualize_alignment(out)) 53 | 54 | return jiwer.wer(" ".join(ref_strings), " ".join(hypo_strings)) 55 | 56 | def evaluate_speaker_transcripts(label_path, output_path, speaker_list, speaker_uem_start, speaker_uem_end): 57 | speaker_to_wer = {} 58 | for speaker, uem_start, uem_end in zip(speaker_list, speaker_uem_start, speaker_uem_end): 59 | ref_vtt = os.path.join(label_path, f"{speaker}.vtt") 60 | hypo_vtt = os.path.join(output_path, f"{speaker}.vtt") 61 | wer_score = benchmark_vtt_wer(ref_vtt, hypo_vtt, uem_start, uem_end, uem_start, uem_end) 62 | speaker_to_wer[speaker] = round(wer_score, 4) 63 | return speaker_to_wer 64 | 65 | def main(): 66 | import argparse 67 | parser = argparse.ArgumentParser(description="Evaluate 
speaker clustering and transcripts from video") 68 | parser.add_argument('--session_dir', type=str, required=True, help='Path to folder containing session data') 69 | parser.add_argument('--output_dir_name', type=str, default='output', help='Name of the output directory within each session (default: output)') 70 | parser.add_argument('--label_dir_name', type=str, default='labels', help='Name of the label directory within each session (default: labels)') 71 | opt = parser.parse_args() 72 | 73 | if opt.session_dir.strip().endswith("*"): 74 | all_session_dirs = glob.glob(opt.session_dir) 75 | else: 76 | all_session_dirs = [opt.session_dir] 77 | print(f"Evaluating {len(all_session_dirs)} sessions") 78 | 79 | all_conversation_clustering_f1_score = [] 80 | all_speaker_wer = [] 81 | all_cluster_speaker_wer = [] 82 | 83 | for session_dir in all_session_dirs: 84 | print(f"Evaluating session {session_dir.split('/')[-1]}") 85 | label_path = os.path.join(session_dir, opt.label_dir_name) 86 | output_path = os.path.join(session_dir, opt.output_dir_name) 87 | assert os.path.exists(label_path), f"Label path {label_path} does not exist" 88 | assert os.path.exists(output_path), f"Output path {output_path} does not exist" 89 | 90 | with open(os.path.join(session_dir, "metadata.json"), "r") as f: 91 | metadata = json.load(f) 92 | speaker_list = list(metadata.keys()) 93 | speaker_uem_start = [metadata[spk]['central']["uem"]["start"] for spk in speaker_list] 94 | speaker_uem_end = [metadata[spk]['central']["uem"]["end"] for spk in speaker_list] 95 | 96 | conversation_clustering_f1_score = evaluate_conversation_clustering(label_path, output_path) 97 | print(f"Conversation clustering F1 score: {conversation_clustering_f1_score}") 98 | all_conversation_clustering_f1_score.append(conversation_clustering_f1_score) 99 | 100 | 101 | speaker_to_wer = evaluate_speaker_transcripts(label_path, output_path, speaker_list, speaker_uem_start, speaker_uem_end) 102 | print(f"Speaker to WER: {speaker_to_wer}") 103 | all_speaker_wer.extend(list(speaker_to_wer.values())) 104 | 105 | speaker_clustering_f1_score = evaluate_speaker_clustering(label_path, output_path) 106 | print(f"Speaker clustering F1 score: {speaker_clustering_f1_score}") 107 | 108 | cluster_speaker_to_wer = {} 109 | for speaker, wer in speaker_to_wer.items(): 110 | cluster_speaker_wer = 0.5 * wer + 0.5 * (1 - speaker_clustering_f1_score[speaker]) 111 | cluster_speaker_to_wer[speaker] = cluster_speaker_wer 112 | print(f"Joint ASR-Clustering Error Rate: {cluster_speaker_to_wer}") 113 | all_cluster_speaker_wer.extend(list(cluster_speaker_to_wer.values())) 114 | 115 | print(f"Average Conversation Clustering F1 score: {sum(all_conversation_clustering_f1_score) / len(all_conversation_clustering_f1_score)}") 116 | print(f"Average Speaker WER: {sum(all_speaker_wer) / len(all_speaker_wer)}") 117 | print(f"Average Joint ASR-Clustering Error Rate: {sum(all_cluster_speaker_wer) / len(all_cluster_speaker_wer)}") 118 | 119 | if __name__ == "__main__": 120 | main() -------------------------------------------------------------------------------- /docs/baseline.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Baseline System 4 | parent: CHiME-9 Task 1 - MCoRec 5 | nav_order: 4 6 | --- 7 | 8 | # Baseline System 9 | 10 | ## Overview 11 | 12 | The baseline system for CHiME-9 Task 1 addresses the challenging problem of **Multi-Modal Context-aware Recognition (MCoRec)** in single-room environments with multiple 
concurrent conversations. The system processes 360° video and audio recordings to both transcribe each speaker's speech and identify which speakers belong to the same conversation. 13 | 14 | ## Task Requirements 15 | 16 | The baseline system provides an initial framework for addressing two interconnected challenges: 17 | 18 | 1. **Individual Speaker Transcription**: Generate time-aligned transcripts (`.vtt` files) for each target speaker within specified evaluation intervals 19 | 2. **Conversation Clustering**: Group participants into their respective conversations by generating speaker-to-cluster mappings 20 | 21 | **Input**: Single 360° video with audio, plus bounding boxes identifying target participants 22 | **Output**: Per-speaker transcriptions and conversation cluster assignments 23 | 24 | ## Baseline System Architecture 25 | 26 | The baseline system is provided at [Github](https://github.com/MCoRec/mcorec_baseline). Please refer to the README therein for information about how to install and run the system. 27 | 28 | ![](images/mcorec-baseline.png) 29 | 30 | ### Core Components 31 | 32 | #### 1. Active Speaker Detection 33 | - **Purpose**: Determines which speaker is actively speaking at any given moment 34 | - **Baseline Model**: [A Light Weight Model for Active Speaker Detection](https://github.com/Junhua-Liao/Light-ASD) 35 | - **Input**: Face crop video and audio extracted from the 360° video 36 | - **Output**: Active speaker detection scores for each frame of the corresponding track video. These scores are used to determine when a speaker is talking, allowing the audio-visual speech recognition system to run only during the speaking segments. 37 | 38 | #### 2. Face Landmark Detection and Mouth Cropping 39 | - **Purpose**: Extracts mouth region from face crop videos. 40 | - **Models**: 41 | - Face detector based on [RetinaFace](https://arxiv.org/pdf/1905.00641) 42 | - 2D facial landmark detector based on [FAN (Face Alignment Network)](https://openaccess.thecvf.com/content_ICCV_2017/papers/Bulat_How_Far_Are_ICCV_2017_paper.pdf) 43 | - **Input**: Face crop video and audio extracted from the 360° video 44 | - **Output**: Mouth crop video with precise mouth region extraction 45 | - **Processing Pipeline**: Referenced from [Auto-AVSR/preparation](https://github.com/mpc001/auto_avsr/tree/main/preparation) repository 46 | 47 | #### 3. Video Segmentation and Chunking 48 | - **Purpose**: Splits long mouth crop videos into smaller segments (≤15 seconds) based on active speaker detection scores for efficient processing 49 | - **Algorithm**: 50 | - **Hysteresis Thresholding**: Uses onset and offset thresholds to identify speech regions from ASD scores 51 | - **Duration Filtering**: Removes short speech segments and fills short gaps between speech regions 52 | - **Segment Splitting**: Divides long continuous speech regions into manageable chunks 53 | - **Input**: Mouth crop video and active speaker detection scores 54 | - **Output**: List of video segments with start/end timestamps 55 | 56 | #### 4. 
Audio-Visual Speech Recognition 57 | - **Purpose**: Combines audio and visual cues to transcribe speech into text for each video segment 58 | - **Input**: Segmented mouth crop videos with corresponding audio streams and timestamps 59 | - **Output**: Time-aligned transcriptions in WebVTT format with start/end timestamps for each segment 60 | - **Baseline Models**: 61 | - **AV-HuBERT CTC/Attention**: [Cocktail-Party Audio-Visual Speech Recognition](https://arxiv.org/abs/2506.02178) 62 | - **Auto-AVSR**: [Auto-AVSR: Audio-Visual Speech Recognition with Automatic Labels](https://arxiv.org/abs/2303.14307) 63 | - **Muavic-EN**: [MuAViC: A Multilingual Audio-Visual Corpus for Robust Speech Recognition and Robust Speech-to-Text Translation](https://arxiv.org/abs/2303.00628) 64 | 65 | 66 | #### 5. Time-based Conversation Clustering 67 | - **Purpose**: Groups speakers into their respective conversations based on temporal speaking patterns and overlap analysis 68 | - **Input**: Active speaker detection scores 69 | - **Processing Workflow**: 70 | - **Speaker Activity Extraction**: Uses ASD scores from the Active Speaker Detection component to identify time segments where each speaker is actively talking 71 | - **Conversation Score Calculation**: For each pair of speakers, computes interaction scores based on temporal overlap patterns: 72 | - Simultaneous speech (overlap) → indicates different conversations 73 | - Sequential speech (non-overlap) → indicates same conversation 74 | - Score formula: `1 - (overlap_duration / total_duration)` 75 | - **Distance Matrix Construction**: Converts conversation scores to distances for clustering: 76 | - Higher scores (less overlap) = smaller distances = higher probability of same conversation 77 | - Lower scores (more overlap) = larger distances = lower probability of same conversation 78 | - **Agglomerative Clustering**: Hierarchically groups speakers using the distance matrix (see the illustrative sketches below) 79 | - **Output**: Speaker-to-cluster mapping in JSON format (`speaker_to_cluster.json`) 80 | 81 | This baseline establishes a reference implementation that participants can build upon with more sophisticated approaches to better handle the challenging multi-conversation scenarios, which feature high speech overlap and complex acoustic environments. 82 | 83 | ## Results 84 | 85 | The results for the baseline systems on the dev subset are as follows: 86 | 87 | | System | AVSR Model | MCoRec finetuned | Conversation Clustering | Conversation Clustering F1 Score | Speaker WER | Joint ASR-Clustering Error Rate | 88 | |--------|------------|------------------|------------------------|-----------------------------------|-------------|----------------------------------| 89 | | BL1 | [AV-HuBERT CTC/Attention](https://arxiv.org/abs/2506.02178) | No | Time-based | 0.8153 | 0.5536 | 0.3821 | 90 | | BL2 | [Muavic-EN](https://arxiv.org/abs/2303.00628) | No | Time-based | 0.8153 | 0.7180 | 0.4643 | 91 | | BL3 | [Auto-AVSR](https://arxiv.org/abs/2303.14307) | No | Time-based | 0.8153 | 0.8315 | 0.5211 | 92 | | BL4 | [AV-HuBERT CTC/Attention](https://arxiv.org/abs/2506.02178) | Yes | Time-based | 0.8153 | 0.4990 | 0.3548 | 93 | 94 | For detailed implementation and inference instructions, please refer to the baseline repository on [GitHub](https://github.com/MCoRec/mcorec_baseline).
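
## Illustrative Sketches

The two sketches below are editorial illustrations of the segmentation step (component 3) and the time-based clustering step (component 5) described above. They are written against the descriptions on this page and are not the baseline implementation itself; the actual code in the repository (e.g., `src/talking_detector/segmentation.py` and the clustering utilities under `src/cluster/`) may differ, and all function names, thresholds, and frame-rate values here are illustrative assumptions.

### From ASD scores to speech segments

A minimal sketch of the hysteresis thresholding, gap filling, duration filtering, and chunk splitting, assuming one ASD score per video frame and a fixed frame rate:

```python
import numpy as np


def asd_scores_to_segments(
    scores,            # per-frame ASD scores for one speaker
    fps=25.0,          # assumed video frame rate
    onset=0.5,         # hysteresis: enter a speech region when the score rises above this
    offset=0.3,        # hysteresis: leave the region when the score falls below this
    min_speech=0.5,    # drop speech regions shorter than this (seconds)
    min_gap=0.4,       # fill gaps between regions shorter than this (seconds)
    max_chunk=15.0,    # split long regions into chunks of at most this length (seconds)
):
    """Turn frame-level ASD scores into (start, end) segments in seconds."""
    scores = np.asarray(scores, dtype=float)

    # 1) Hysteresis thresholding: two thresholds avoid flickering on/off around one value.
    regions, active, start = [], False, 0
    for i, s in enumerate(scores):
        if not active and s >= onset:
            active, start = True, i
        elif active and s < offset:
            regions.append((start / fps, i / fps))
            active = False
    if active:
        regions.append((start / fps, len(scores) / fps))

    # 2) Fill short gaps between consecutive speech regions.
    merged = []
    for seg in regions:
        if merged and seg[0] - merged[-1][1] < min_gap:
            merged[-1] = (merged[-1][0], seg[1])
        else:
            merged.append(seg)

    # 3) Drop very short regions, then split long ones into chunks of <= max_chunk seconds.
    chunks = []
    for start_t, end_t in merged:
        if end_t - start_t < min_speech:
            continue
        t = start_t
        while t < end_t:
            chunks.append((t, min(t + max_chunk, end_t)))
            t += max_chunk
    return chunks
```

The returned `(start, end)` pairs are the per-speaker speaking segments: they select which portions of the mouth-crop video are passed to the AVSR model, and they are also the activity intervals reused by the clustering sketch below.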
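### From speaking segments to conversation clusters

A minimal sketch of the pairwise-score, distance-matrix, and agglomerative-clustering workflow. It follows the `1 - (overlap_duration / total_duration)` score above, with `total_duration` taken here as the combined speaking time of the pair; that normalization and the `distance_threshold` value are assumptions rather than the baseline's settings. The sketch uses scikit-learn's `AgglomerativeClustering` (the `metric=` keyword requires scikit-learn ≥ 1.2; older releases call it `affinity=`).

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering


def conversation_score(segs_a, segs_b):
    """Score for one speaker pair: 1 - overlap_duration / total_duration.

    segs_a / segs_b are lists of (start, end) speaking segments in seconds.
    Little simultaneous speech -> high score -> likely the same conversation.
    """
    overlap = sum(
        max(0.0, min(ea, eb) - max(sa, sb))
        for sa, ea in segs_a
        for sb, eb in segs_b
    )
    total = sum(e - s for s, e in segs_a) + sum(e - s for s, e in segs_b)
    return 1.0 - overlap / total if total > 0 else 0.0


def cluster_speakers(speaker_segments, distance_threshold=0.5):
    """Group speakers into conversations from their speaking segments.

    speaker_segments maps speaker id -> list of (start, end) segments.
    Returns a speaker -> cluster id mapping, in the spirit of speaker_to_cluster.json.
    """
    speakers = list(speaker_segments)
    n = len(speakers)

    # Distance = 1 - score: more overlap -> larger distance -> different conversations.
    dist = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            d = 1.0 - conversation_score(
                speaker_segments[speakers[i]], speaker_segments[speakers[j]]
            )
            dist[i, j] = dist[j, i] = d

    labels = AgglomerativeClustering(
        n_clusters=None,              # let the threshold decide the number of conversations
        metric="precomputed",         # we pass the distance matrix directly
        linkage="average",
        distance_threshold=distance_threshold,
    ).fit_predict(dist)
    return {spk: int(label) for spk, label in zip(speakers, labels)}
```

With `distance_threshold=0.5`, two speakers end up in the same conversation only when less than roughly half of their combined speaking time overlaps; raising the threshold merges clusters more aggressively, lowering it splits them. The resulting mapping has the same shape as the `speaker_to_cluster.json` file scored by `script/evaluate.py`.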
-------------------------------------------------------------------------------- /src/nets/backend/backbones/modules/shufflenetv2.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import pdb 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Variable 10 | from torch.nn import init 11 | 12 | 13 | def conv_bn(inp, oup, stride): 14 | return nn.Sequential( 15 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 16 | nn.BatchNorm2d(oup), 17 | nn.ReLU(inplace=True), 18 | ) 19 | 20 | 21 | def conv_1x1_bn(inp, oup): 22 | return nn.Sequential( 23 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 24 | nn.BatchNorm2d(oup), 25 | nn.ReLU(inplace=True), 26 | ) 27 | 28 | 29 | def channel_shuffle(x, groups): 30 | batchsize, num_channels, height, width = x.data.size() 31 | 32 | channels_per_group = num_channels // groups 33 | 34 | # reshape 35 | x = x.view(batchsize, groups, channels_per_group, height, width) 36 | 37 | x = torch.transpose(x, 1, 2).contiguous() 38 | 39 | # flatten 40 | x = x.view(batchsize, -1, height, width) 41 | 42 | return x 43 | 44 | 45 | class InvertedResidual(nn.Module): 46 | def __init__(self, inp, oup, stride, benchmodel): 47 | super(InvertedResidual, self).__init__() 48 | self.benchmodel = benchmodel 49 | self.stride = stride 50 | assert stride in [1, 2] 51 | 52 | oup_inc = oup // 2 53 | 54 | if self.benchmodel == 1: 55 | # assert inp == oup_inc 56 | self.banch2 = nn.Sequential( 57 | # pw 58 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 59 | nn.BatchNorm2d(oup_inc), 60 | nn.ReLU(inplace=True), 61 | # dw 62 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), 63 | nn.BatchNorm2d(oup_inc), 64 | # pw-linear 65 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 66 | nn.BatchNorm2d(oup_inc), 67 | nn.ReLU(inplace=True), 68 | ) 69 | else: 70 | self.banch1 = nn.Sequential( 71 | # dw 72 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 73 | nn.BatchNorm2d(inp), 74 | # pw-linear 75 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 76 | nn.BatchNorm2d(oup_inc), 77 | nn.ReLU(inplace=True), 78 | ) 79 | 80 | self.banch2 = nn.Sequential( 81 | # pw 82 | nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False), 83 | nn.BatchNorm2d(oup_inc), 84 | nn.ReLU(inplace=True), 85 | # dw 86 | nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), 87 | nn.BatchNorm2d(oup_inc), 88 | # pw-linear 89 | nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False), 90 | nn.BatchNorm2d(oup_inc), 91 | nn.ReLU(inplace=True), 92 | ) 93 | 94 | @staticmethod 95 | def _concat(x, out): 96 | # concatenate along channel axis 97 | return torch.cat((x, out), 1) 98 | 99 | def forward(self, x): 100 | if 1 == self.benchmodel: 101 | x1 = x[:, : (x.shape[1] // 2), :, :] 102 | x2 = x[:, (x.shape[1] // 2) :, :, :] 103 | out = self._concat(x1, self.banch2(x2)) 104 | elif 2 == self.benchmodel: 105 | out = self._concat(self.banch1(x), self.banch2(x)) 106 | 107 | return channel_shuffle(out, 2) 108 | 109 | 110 | class ShuffleNetV2(nn.Module): 111 | def __init__(self, n_class=1000, input_size=224, width_mult=2.0): 112 | super(ShuffleNetV2, self).__init__() 113 | 114 | assert input_size % 32 == 0, "Input size needs to be divisible by 32" 115 | 116 | self.stage_repeats = [4, 8, 4] 117 | # index 0 is invalid and should never be called. 118 | # only used for indexing convenience. 
119 | if width_mult == 0.5: 120 | self.stage_out_channels = [-1, 24, 48, 96, 192, 1024] 121 | elif width_mult == 1.0: 122 | self.stage_out_channels = [-1, 24, 116, 232, 464, 1024] 123 | elif width_mult == 1.5: 124 | self.stage_out_channels = [-1, 24, 176, 352, 704, 1024] 125 | elif width_mult == 2.0: 126 | self.stage_out_channels = [-1, 24, 244, 488, 976, 2048] 127 | else: 128 | raise ValueError( 129 | """Width multiplier should be in [0.5, 1.0, 1.5, 2.0]. Current value: {}""".format( 130 | width_mult 131 | ) 132 | ) 133 | 134 | # building first layer 135 | input_channel = self.stage_out_channels[1] 136 | self.conv1 = conv_bn(3, input_channel, 2) 137 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 138 | 139 | self.features = [] 140 | # building inverted residual blocks 141 | for idxstage in range(len(self.stage_repeats)): 142 | numrepeat = self.stage_repeats[idxstage] 143 | output_channel = self.stage_out_channels[idxstage + 2] 144 | for i in range(numrepeat): 145 | if i == 0: 146 | # inp, oup, stride, benchmodel): 147 | self.features.append( 148 | InvertedResidual(input_channel, output_channel, 2, 2) 149 | ) 150 | else: 151 | self.features.append( 152 | InvertedResidual(input_channel, output_channel, 1, 1) 153 | ) 154 | input_channel = output_channel 155 | 156 | # make it nn.Sequential 157 | self.features = nn.Sequential(*self.features) 158 | 159 | # building last several layers 160 | self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1]) 161 | self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size / 32))) 162 | 163 | # building classifier 164 | self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class)) 165 | 166 | def forward(self, x): 167 | x = self.conv1(x) 168 | x = self.maxpool(x) 169 | x = self.features(x) 170 | x = self.conv_last(x) 171 | x = self.globalpool(x) 172 | x = x.view(-1, self.stage_out_channels[-1]) 173 | x = self.classifier(x) 174 | return x 175 | -------------------------------------------------------------------------------- /src/ibug/face_detection/s3fd/s3fd_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | from .utils import Detect, PriorBox 6 | 7 | 8 | class L2Norm(nn.Module): 9 | 10 | def __init__(self, n_channels, scale): 11 | super(L2Norm, self).__init__() 12 | self.n_channels = n_channels 13 | self.gamma = scale or None 14 | self.eps = 1e-10 15 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | init.constant_(self.weight, self.gamma) 20 | 21 | def forward(self, x): 22 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 23 | x = torch.div(x, norm) 24 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 25 | return out 26 | 27 | 28 | class S3FDNet(nn.Module): 29 | 30 | def __init__(self, config, device='cuda'): 31 | super(S3FDNet, self).__init__() 32 | self.config = config 33 | self.device = device 34 | 35 | self.vgg = nn.ModuleList([ 36 | nn.Conv2d(3, 64, 3, 1, padding=1), 37 | nn.ReLU(inplace=True), 38 | nn.Conv2d(64, 64, 3, 1, padding=1), 39 | nn.ReLU(inplace=True), 40 | nn.MaxPool2d(2, 2), 41 | 42 | nn.Conv2d(64, 128, 3, 1, padding=1), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(128, 128, 3, 1, padding=1), 45 | nn.ReLU(inplace=True), 46 | nn.MaxPool2d(2, 2), 47 | 48 | nn.Conv2d(128, 256, 3, 1, padding=1), 49 | nn.ReLU(inplace=True), 50 
| nn.Conv2d(256, 256, 3, 1, padding=1), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(256, 256, 3, 1, padding=1), 53 | nn.ReLU(inplace=True), 54 | nn.MaxPool2d(2, 2, ceil_mode=True), 55 | 56 | nn.Conv2d(256, 512, 3, 1, padding=1), 57 | nn.ReLU(inplace=True), 58 | nn.Conv2d(512, 512, 3, 1, padding=1), 59 | nn.ReLU(inplace=True), 60 | nn.Conv2d(512, 512, 3, 1, padding=1), 61 | nn.ReLU(inplace=True), 62 | nn.MaxPool2d(2, 2), 63 | 64 | nn.Conv2d(512, 512, 3, 1, padding=1), 65 | nn.ReLU(inplace=True), 66 | nn.Conv2d(512, 512, 3, 1, padding=1), 67 | nn.ReLU(inplace=True), 68 | nn.Conv2d(512, 512, 3, 1, padding=1), 69 | nn.ReLU(inplace=True), 70 | nn.MaxPool2d(2, 2), 71 | 72 | nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), 73 | nn.ReLU(inplace=True), 74 | nn.Conv2d(1024, 1024, 1, 1), 75 | nn.ReLU(inplace=True), 76 | ]) 77 | 78 | self.L2Norm3_3 = L2Norm(256, 10) 79 | self.L2Norm4_3 = L2Norm(512, 8) 80 | self.L2Norm5_3 = L2Norm(512, 5) 81 | 82 | self.extras = nn.ModuleList([ 83 | nn.Conv2d(1024, 256, 1, 1), 84 | nn.Conv2d(256, 512, 3, 2, padding=1), 85 | nn.Conv2d(512, 128, 1, 1), 86 | nn.Conv2d(128, 256, 3, 2, padding=1), 87 | ]) 88 | 89 | self.loc = nn.ModuleList([ 90 | nn.Conv2d(256, 4, 3, 1, padding=1), 91 | nn.Conv2d(512, 4, 3, 1, padding=1), 92 | nn.Conv2d(512, 4, 3, 1, padding=1), 93 | nn.Conv2d(1024, 4, 3, 1, padding=1), 94 | nn.Conv2d(512, 4, 3, 1, padding=1), 95 | nn.Conv2d(256, 4, 3, 1, padding=1), 96 | ]) 97 | 98 | self.conf = nn.ModuleList([ 99 | nn.Conv2d(256, 4, 3, 1, padding=1), 100 | nn.Conv2d(512, 2, 3, 1, padding=1), 101 | nn.Conv2d(512, 2, 3, 1, padding=1), 102 | nn.Conv2d(1024, 2, 3, 1, padding=1), 103 | nn.Conv2d(512, 2, 3, 1, padding=1), 104 | nn.Conv2d(256, 2, 3, 1, padding=1), 105 | ]) 106 | 107 | self.priors = None 108 | self.previous_size = None 109 | 110 | self.softmax = nn.Softmax(dim=-1) 111 | self.detect = Detect(self.config) 112 | 113 | def forward(self, x): 114 | size = x.size()[2:] 115 | sources = list() 116 | loc = list() 117 | conf = list() 118 | 119 | for k in range(16): 120 | x = self.vgg[k](x) 121 | s = self.L2Norm3_3(x) 122 | sources.append(s) 123 | 124 | for k in range(16, 23): 125 | x = self.vgg[k](x) 126 | s = self.L2Norm4_3(x) 127 | sources.append(s) 128 | 129 | for k in range(23, 30): 130 | x = self.vgg[k](x) 131 | s = self.L2Norm5_3(x) 132 | sources.append(s) 133 | 134 | for k in range(30, len(self.vgg)): 135 | x = self.vgg[k](x) 136 | sources.append(x) 137 | 138 | # apply extra layers and cache source layer outputs 139 | for k, v in enumerate(self.extras): 140 | x = F.relu(v(x), inplace=True) 141 | if k % 2 == 1: 142 | sources.append(x) 143 | 144 | # apply multibox head to source layers 145 | loc_x = self.loc[0](sources[0]) 146 | conf_x = self.conf[0](sources[0]) 147 | 148 | max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) 149 | conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) 150 | 151 | loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) 152 | conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) 153 | 154 | for i in range(1, len(sources)): 155 | x = sources[i] 156 | conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) 157 | loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) 158 | 159 | if self.priors is None or self.previous_size != size: 160 | with torch.no_grad(): 161 | features_maps = [] 162 | for i in range(len(loc)): 163 | feat = [] 164 | feat += [loc[i].size(1), loc[i].size(2)] 165 | features_maps += [feat] 166 | self.priors = PriorBox(size, features_maps, self.config).forward().to(self.device) 167 
| self.previous_size = size 168 | 169 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 170 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 171 | conf = self.softmax(conf.view(conf.size(0), -1, 2)) 172 | 173 | output = self.detect(loc.view(loc.size(0), -1, 4), conf, self.priors) 174 | 175 | return output 176 | -------------------------------------------------------------------------------- /src/nets/scorer_interface.py: -------------------------------------------------------------------------------- 1 | """Scorer interface module.""" 2 | 3 | import warnings 4 | from typing import Any, List, Tuple 5 | 6 | import torch 7 | 8 | 9 | class ScorerInterface: 10 | """Scorer interface for beam search. 11 | 12 | The scorer performs scoring of the all tokens in vocabulary. 13 | 14 | Examples: 15 | * Search heuristics 16 | * :class:`src.nets.scorers.length_bonus.LengthBonus` 17 | * Decoder networks of the sequence-to-sequence models 18 | * :class:`src.nets.backend.nets.transformer.decoder.Decoder` 19 | * :class:`src.nets.backend.nets.rnn.decoders.Decoder` 20 | * Neural language models 21 | * :class:`src.nets.backend.lm.transformer.TransformerLM` 22 | * :class:`src.nets.backend.lm.default.DefaultRNNLM` 23 | * :class:`src.nets.backend.lm.seq_rnn.SequentialRNNLM` 24 | 25 | """ 26 | 27 | def init_state(self, x: torch.Tensor) -> Any: 28 | """Get an initial state for decoding (optional). 29 | 30 | Args: 31 | x (torch.Tensor): The encoded feature tensor 32 | 33 | Returns: initial state 34 | 35 | """ 36 | return None 37 | 38 | def select_state(self, state: Any, i: int, new_id: int = None) -> Any: 39 | """Select state with relative ids in the main beam search. 40 | 41 | Args: 42 | state: Decoder state for prefix tokens 43 | i (int): Index to select a state in the main beam search 44 | new_id (int): New label index to select a state if necessary 45 | 46 | Returns: 47 | state: pruned state 48 | 49 | """ 50 | return None if state is None else state[i] 51 | 52 | def score( 53 | self, y: torch.Tensor, state: Any, x: torch.Tensor 54 | ) -> Tuple[torch.Tensor, Any]: 55 | """Score new token (required). 56 | 57 | Args: 58 | y (torch.Tensor): 1D torch.int64 prefix tokens. 59 | state: Scorer state for prefix tokens 60 | x (torch.Tensor): The encoder feature that generates ys. 61 | 62 | Returns: 63 | tuple[torch.Tensor, Any]: Tuple of 64 | scores for next token that has a shape of `(n_vocab)` 65 | and next state for ys 66 | 67 | """ 68 | raise NotImplementedError 69 | 70 | def final_score(self, state: Any) -> float: 71 | """Score eos (optional). 72 | 73 | Args: 74 | state: Scorer state for prefix tokens 75 | 76 | Returns: 77 | float: final score 78 | 79 | """ 80 | return 0.0 81 | 82 | 83 | class BatchScorerInterface(ScorerInterface): 84 | """Batch scorer interface.""" 85 | 86 | def batch_init_state(self, x: torch.Tensor) -> Any: 87 | """Get an initial state for decoding (optional). 88 | 89 | Args: 90 | x (torch.Tensor): The encoded feature tensor 91 | 92 | Returns: initial state 93 | 94 | """ 95 | return self.init_state(x) 96 | 97 | def batch_score( 98 | self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor 99 | ) -> Tuple[torch.Tensor, List[Any]]: 100 | """Score new token batch (required). 101 | 102 | Args: 103 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 104 | states (List[Any]): Scorer states for prefix tokens. 105 | xs (torch.Tensor): 106 | The encoder feature that generates ys (n_batch, xlen, n_feat). 
107 | 108 | Returns: 109 | tuple[torch.Tensor, List[Any]]: Tuple of 110 | batchfied scores for next token with shape of `(n_batch, n_vocab)` 111 | and next state list for ys. 112 | 113 | """ 114 | warnings.warn( 115 | "{} batch score is implemented through for loop not parallelized".format( 116 | self.__class__.__name__ 117 | ) 118 | ) 119 | scores = list() 120 | outstates = list() 121 | for i, (y, state, x) in enumerate(zip(ys, states, xs)): 122 | score, outstate = self.score(y, state, x) 123 | outstates.append(outstate) 124 | scores.append(score) 125 | scores = torch.cat(scores, 0).view(ys.shape[0], -1) 126 | return scores, outstates 127 | 128 | 129 | class PartialScorerInterface(ScorerInterface): 130 | """Partial scorer interface for beam search. 131 | 132 | The partial scorer performs scoring when non-partial scorer finished scoring, 133 | and receives pre-pruned next tokens to score because it is too heavy to score 134 | all the tokens. 135 | 136 | Examples: 137 | * Prefix search for connectionist-temporal-classification models 138 | * :class:`src.nets.scorers.ctc.CTCPrefixScorer` 139 | 140 | """ 141 | 142 | def score_partial( 143 | self, y: torch.Tensor, next_tokens: torch.Tensor, state: Any, x: torch.Tensor 144 | ) -> Tuple[torch.Tensor, Any]: 145 | """Score new token (required). 146 | 147 | Args: 148 | y (torch.Tensor): 1D prefix token 149 | next_tokens (torch.Tensor): torch.int64 next token to score 150 | state: decoder state for prefix tokens 151 | x (torch.Tensor): The encoder feature that generates ys 152 | 153 | Returns: 154 | tuple[torch.Tensor, Any]: 155 | Tuple of a score tensor for y that has a shape `(len(next_tokens),)` 156 | and next state for ys 157 | 158 | """ 159 | raise NotImplementedError 160 | 161 | 162 | class BatchPartialScorerInterface(BatchScorerInterface, PartialScorerInterface): 163 | """Batch partial scorer interface for beam search.""" 164 | 165 | def batch_score_partial( 166 | self, 167 | ys: torch.Tensor, 168 | next_tokens: torch.Tensor, 169 | states: List[Any], 170 | xs: torch.Tensor, 171 | ) -> Tuple[torch.Tensor, Any]: 172 | """Score new token (required). 173 | 174 | Args: 175 | ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen). 176 | next_tokens (torch.Tensor): torch.int64 tokens to score (n_batch, n_token). 177 | states (List[Any]): Scorer states for prefix tokens. 178 | xs (torch.Tensor): 179 | The encoder feature that generates ys (n_batch, xlen, n_feat). 
180 | 181 | Returns: 182 | tuple[torch.Tensor, Any]: 183 | Tuple of a score tensor for ys that has a shape `(n_batch, n_vocab)` 184 | and next states for ys 185 | """ 186 | raise NotImplementedError 187 | -------------------------------------------------------------------------------- /src/avhubert_muavic/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | import math 4 | import torch.nn as nn 5 | from collections import OrderedDict 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def conv3x3(in_planes, out_planes, stride=1): 11 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 12 | padding=1, bias=False) 13 | 14 | 15 | def downsample_basic_block( inplanes, outplanes, stride ): 16 | return nn.Sequential( 17 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False), 18 | nn.BatchNorm2d(outplanes), 19 | ) 20 | 21 | def downsample_basic_block_v2( inplanes, outplanes, stride ): 22 | return nn.Sequential( 23 | nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), 24 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False), 25 | nn.BatchNorm2d(outplanes), 26 | ) 27 | 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type = 'relu' ): 34 | super(BasicBlock, self).__init__() 35 | 36 | assert relu_type in ['relu','prelu'] 37 | 38 | self.conv1 = conv3x3(inplanes, planes, stride) 39 | self.bn1 = nn.BatchNorm2d(planes) 40 | 41 | if relu_type == 'relu': 42 | self.relu1 = nn.ReLU(inplace=True) 43 | self.relu2 = nn.ReLU(inplace=True) 44 | elif relu_type == 'prelu': 45 | self.relu1 = nn.PReLU(num_parameters=planes) 46 | self.relu2 = nn.PReLU(num_parameters=planes) 47 | else: 48 | raise Exception('relu type not implemented') 49 | 50 | self.conv2 = conv3x3(planes, planes) 51 | self.bn2 = nn.BatchNorm2d(planes) 52 | 53 | self.downsample = downsample 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | residual = x 58 | out = self.conv1(x) 59 | out = self.bn1(out) 60 | out = self.relu1(out) 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | if self.downsample is not None: 64 | residual = self.downsample(x) 65 | 66 | out += residual 67 | out = self.relu2(out) 68 | 69 | return out 70 | 71 | 72 | class ResNet(nn.Module): 73 | 74 | def __init__(self, block, layers, num_classes=1000, relu_type = 'relu', gamma_zero = False, avg_pool_downsample = False): 75 | self.inplanes = 64 76 | self.relu_type = relu_type 77 | self.gamma_zero = gamma_zero 78 | self.downsample_block = downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block 79 | 80 | super(ResNet, self).__init__() 81 | self.layer1 = self._make_layer(block, 64, layers[0]) 82 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 83 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 84 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 85 | self.avgpool = nn.AdaptiveAvgPool2d(1) 86 | 87 | for m in self.modules(): 88 | if isinstance(m, nn.Conv2d): 89 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 90 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 91 | elif isinstance(m, nn.BatchNorm2d): 92 | m.weight.data.fill_(1) 93 | m.bias.data.zero_() 94 | 95 | if self.gamma_zero: 96 | for m in self.modules(): 97 | if isinstance(m, BasicBlock ): 98 | m.bn2.weight.data.zero_() 99 | 100 | def _make_layer(self, block, planes, blocks, stride=1): 101 | 102 | 103 | downsample = None 104 | if stride != 1 or self.inplanes != planes * block.expansion: 105 | downsample = self.downsample_block( inplanes = self.inplanes, 106 | outplanes = planes * block.expansion, 107 | stride = stride ) 108 | 109 | layers = [] 110 | layers.append(block(self.inplanes, planes, stride, downsample, relu_type = self.relu_type)) 111 | self.inplanes = planes * block.expansion 112 | for i in range(1, blocks): 113 | layers.append(block(self.inplanes, planes, relu_type = self.relu_type)) 114 | 115 | return nn.Sequential(*layers) 116 | 117 | def forward(self, x): 118 | x = self.layer1(x) 119 | x = self.layer2(x) 120 | x = self.layer3(x) 121 | x = self.layer4(x) 122 | x = self.avgpool(x) 123 | x = x.view(x.size(0), -1) 124 | return x 125 | 126 | class ResEncoder(nn.Module): 127 | def __init__(self, relu_type, weights): 128 | super(ResEncoder, self).__init__() 129 | self.frontend_nout = 64 130 | self.backend_out = 512 131 | frontend_relu = nn.PReLU(num_parameters=self.frontend_nout) if relu_type == 'prelu' else nn.ReLU() 132 | self.frontend3D = nn.Sequential( 133 | nn.Conv3d(1, self.frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False), 134 | nn.BatchNorm3d(self.frontend_nout), 135 | frontend_relu, 136 | nn.MaxPool3d( kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))) 137 | self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type) 138 | if weights is not None: 139 | logger.info(f"Load {weights} for resnet") 140 | std = torch.load(weights, map_location=torch.device('cpu'))['model_state_dict'] 141 | frontend_std, trunk_std = OrderedDict(), OrderedDict() 142 | for key, val in std.items(): 143 | new_key = '.'.join(key.split('.')[1:]) 144 | if 'frontend3D' in key: 145 | frontend_std[new_key] = val 146 | if 'trunk' in key: 147 | trunk_std[new_key] = val 148 | self.frontend3D.load_state_dict(frontend_std) 149 | self.trunk.load_state_dict(trunk_std) 150 | 151 | def forward(self, x): 152 | B, C, T, H, W = x.size() 153 | x = self.frontend3D(x) 154 | Tnew = x.shape[2] 155 | x = self.threeD_to_2D_tensor(x) 156 | x = self.trunk(x) 157 | x = x.view(B, Tnew, x.size(1)) 158 | x = x.transpose(1, 2).contiguous() 159 | return x 160 | 161 | def threeD_to_2D_tensor(self, x): 162 | n_batch, n_channels, s_time, sx, sy = x.shape 163 | x = x.transpose(1, 2).contiguous() 164 | return x.reshape(n_batch*s_time, n_channels, sx, sy) 165 | -------------------------------------------------------------------------------- /src/nets/backend/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | import math 4 | import torch.nn as nn 5 | from collections import OrderedDict 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def conv3x3(in_planes, out_planes, stride=1): 11 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 12 | padding=1, bias=False) 13 | 14 | 15 | def downsample_basic_block( inplanes, outplanes, stride ): 16 | return nn.Sequential( 17 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False), 18 | nn.BatchNorm2d(outplanes), 19 | ) 20 | 21 | def downsample_basic_block_v2( inplanes, outplanes, stride ): 22 | return 
nn.Sequential( 23 | nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), 24 | nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False), 25 | nn.BatchNorm2d(outplanes), 26 | ) 27 | 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type = 'relu' ): 34 | super(BasicBlock, self).__init__() 35 | 36 | assert relu_type in ['relu','prelu'] 37 | 38 | self.conv1 = conv3x3(inplanes, planes, stride) 39 | self.bn1 = nn.BatchNorm2d(planes) 40 | 41 | if relu_type == 'relu': 42 | self.relu1 = nn.ReLU(inplace=True) 43 | self.relu2 = nn.ReLU(inplace=True) 44 | elif relu_type == 'prelu': 45 | self.relu1 = nn.PReLU(num_parameters=planes) 46 | self.relu2 = nn.PReLU(num_parameters=planes) 47 | else: 48 | raise Exception('relu type not implemented') 49 | 50 | self.conv2 = conv3x3(planes, planes) 51 | self.bn2 = nn.BatchNorm2d(planes) 52 | 53 | self.downsample = downsample 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | residual = x 58 | out = self.conv1(x) 59 | out = self.bn1(out) 60 | out = self.relu1(out) 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | if self.downsample is not None: 64 | residual = self.downsample(x) 65 | 66 | out += residual 67 | out = self.relu2(out) 68 | 69 | return out 70 | 71 | 72 | class ResNet(nn.Module): 73 | 74 | def __init__(self, block, layers, num_classes=1000, relu_type = 'relu', gamma_zero = False, avg_pool_downsample = False): 75 | self.inplanes = 64 76 | self.relu_type = relu_type 77 | self.gamma_zero = gamma_zero 78 | self.downsample_block = downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block 79 | 80 | super(ResNet, self).__init__() 81 | self.layer1 = self._make_layer(block, 64, layers[0]) 82 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 83 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 84 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 85 | self.avgpool = nn.AdaptiveAvgPool2d(1) 86 | 87 | for m in self.modules(): 88 | if isinstance(m, nn.Conv2d): 89 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 90 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 91 | elif isinstance(m, nn.BatchNorm2d): 92 | m.weight.data.fill_(1) 93 | m.bias.data.zero_() 94 | 95 | if self.gamma_zero: 96 | for m in self.modules(): 97 | if isinstance(m, BasicBlock ): 98 | m.bn2.weight.data.zero_() 99 | 100 | def _make_layer(self, block, planes, blocks, stride=1): 101 | 102 | 103 | downsample = None 104 | if stride != 1 or self.inplanes != planes * block.expansion: 105 | downsample = self.downsample_block( inplanes = self.inplanes, 106 | outplanes = planes * block.expansion, 107 | stride = stride ) 108 | 109 | layers = [] 110 | layers.append(block(self.inplanes, planes, stride, downsample, relu_type = self.relu_type)) 111 | self.inplanes = planes * block.expansion 112 | for i in range(1, blocks): 113 | layers.append(block(self.inplanes, planes, relu_type = self.relu_type)) 114 | 115 | return nn.Sequential(*layers) 116 | 117 | def forward(self, x): 118 | x = self.layer1(x) 119 | x = self.layer2(x) 120 | x = self.layer3(x) 121 | x = self.layer4(x) 122 | x = self.avgpool(x) 123 | x = x.view(x.size(0), -1) 124 | return x 125 | 126 | class ResEncoder(nn.Module): 127 | def __init__(self, relu_type, weights): 128 | super(ResEncoder, self).__init__() 129 | self.frontend_nout = 64 130 | self.backend_out = 512 131 | frontend_relu = nn.PReLU(num_parameters=self.frontend_nout) if relu_type == 'prelu' else nn.ReLU() 132 | self.frontend3D = nn.Sequential( 133 | nn.Conv3d(1, self.frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False), 134 | nn.BatchNorm3d(self.frontend_nout), 135 | frontend_relu, 136 | nn.MaxPool3d( kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))) 137 | self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type) 138 | if weights is not None: 139 | logger.info(f"Load {weights} for resnet") 140 | std = torch.load(weights, map_location=torch.device('cpu'))['model_state_dict'] 141 | frontend_std, trunk_std = OrderedDict(), OrderedDict() 142 | for key, val in std.items(): 143 | new_key = '.'.join(key.split('.')[1:]) 144 | if 'frontend3D' in key: 145 | frontend_std[new_key] = val 146 | if 'trunk' in key: 147 | trunk_std[new_key] = val 148 | self.frontend3D.load_state_dict(frontend_std) 149 | self.trunk.load_state_dict(trunk_std) 150 | 151 | def forward(self, x): 152 | B, C, T, H, W = x.size() 153 | x = self.frontend3D(x) 154 | Tnew = x.shape[2] 155 | x = self.threeD_to_2D_tensor(x) 156 | x = self.trunk(x) 157 | x = x.view(B, Tnew, x.size(1)) 158 | x = x.transpose(1, 2).contiguous() 159 | return x 160 | 161 | def threeD_to_2D_tensor(self, x): 162 | n_batch, n_channels, s_time, sx, sy = x.shape 163 | x = x.transpose(1, 2).contiguous() 164 | return x.reshape(n_batch*s_time, n_channels, sx, sy) 165 | --------------------------------------------------------------------------------