├── modules
├── __init__.py
├── shap_e
│ ├── shap_e
│ │ ├── __init__.py
│ │ ├── util
│ │ │ ├── __init__.py
│ │ │ ├── io.py
│ │ │ ├── notebooks.py
│ │ │ └── collections.py
│ │ ├── diffusion
│ │ │ ├── __init__.py
│ │ │ └── sample.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── nerf
│ │ │ │ └── __init__.py
│ │ │ ├── stf
│ │ │ │ ├── __init__.py
│ │ │ │ └── base.py
│ │ │ ├── generation
│ │ │ │ ├── __init__.py
│ │ │ │ ├── util.py
│ │ │ │ ├── latent_diffusion.py
│ │ │ │ └── pooled_mlp.py
│ │ │ ├── transmitter
│ │ │ │ ├── __init__.py
│ │ │ │ └── bottleneck.py
│ │ │ ├── nn
│ │ │ │ ├── __init__.py
│ │ │ │ ├── utils.py
│ │ │ │ └── checkpoint.py
│ │ │ └── query.py
│ │ └── rendering
│ │ │ ├── __init__.py
│ │ │ ├── raycast
│ │ │ ├── __init__.py
│ │ │ ├── _utils.py
│ │ │ ├── render.py
│ │ │ ├── types.py
│ │ │ └── cast.py
│ │ │ ├── blender
│ │ │ ├── constants.py
│ │ │ ├── __init__.py
│ │ │ ├── view_data.py
│ │ │ └── render.py
│ │ │ ├── torch_mesh.py
│ │ │ ├── ply_util.py
│ │ │ └── mesh.py
│ └── __init__.py
├── annotator
│ ├── midas
│ │ ├── midas
│ │ │ ├── __init__.py
│ │ │ ├── base_model.py
│ │ │ ├── midas_net.py
│ │ │ └── dpt_depth.py
│ │ ├── __init__.py
│ │ └── utils.py
│ ├── util.py
│ ├── openpose
│ │ ├── __init__.py
│ │ └── hand.py
│ └── __init__.py
├── mplug
│ ├── models
│ │ ├── clip
│ │ │ ├── __init__.py
│ │ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ │ └── simple_tokenizer.py
│ │ └── visual_transformers.py
│ ├── ckpts
│ │ └── bert-base-uncased
│ │ │ └── config.json
│ ├── __init__.py
│ └── get_video_caption.py
├── video_moviepy
│ └── font
│ │ └── cn.ttf
├── sadtalker
│ ├── src
│ │ ├── config
│ │ │ ├── similarity_Lm3D_all.mat
│ │ │ ├── facerender.yaml
│ │ │ ├── facerender_still.yaml
│ │ │ ├── auido2pose.yaml
│ │ │ └── auido2exp.yaml
│ │ ├── face3d
│ │ │ ├── util
│ │ │ │ ├── __init__.py
│ │ │ │ ├── preprocess.py
│ │ │ │ └── load_mats.py
│ │ │ └── models
│ │ │ │ ├── arcface_torch
│ │ │ │ └── backbones
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── mobilefacenet.py
│ │ │ │ └── __init__.py
│ │ ├── utils
│ │ │ ├── safetensor_helper.py
│ │ │ ├── text2speech.py
│ │ │ ├── videoio.py
│ │ │ ├── paste_pic.py
│ │ │ ├── init_path.py
│ │ │ ├── face_enhancer.py
│ │ │ └── audio.py
│ │ ├── facerender
│ │ │ ├── sync_batchnorm
│ │ │ │ ├── __init__.py
│ │ │ │ ├── unittest.py
│ │ │ │ ├── replicate.py
│ │ │ │ └── comm.py
│ │ │ └── modules
│ │ │ │ ├── mapping.py
│ │ │ │ └── discriminator.py
│ │ ├── audio2exp_models
│ │ │ ├── audio2exp.py
│ │ │ └── networks.py
│ │ ├── audio2pose_models
│ │ │ ├── res_unet.py
│ │ │ ├── discriminator.py
│ │ │ ├── audio_encoder.py
│ │ │ ├── audio2pose.py
│ │ │ └── networks.py
│ │ └── generate_batch.py
│ ├── __init__.py
│ └── inference.py
├── stable_diffusion
│ └── __init__.py
├── bark
│ └── __init__.py
├── bark_voice_clone
│ └── __init__.py
├── blip
│ └── __init__.py
├── modelscope_t2v
│ └── __init__.py
└── text2video_zero
│ ├── __init__.py
│ └── utils.py
├── .gitignore
├── test.py
├── model_zoo
└── mplug
│ ├── videocap_vatex_mplug_large.yaml
│ └── config_bert.json
├── requirements.txt
├── LICENSE
├── utils.py
└── video_utils.py
/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/util/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/annotator/midas/midas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nerf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/stf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/transmitter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/mplug/models/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .meta import * 2 | from .ops import * 3 | -------------------------------------------------------------------------------- /modules/video_moviepy/font/cn.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaleido-lab/dolphin/HEAD/modules/video_moviepy/font/cn.ttf -------------------------------------------------------------------------------- /modules/mplug/models/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaleido-lab/dolphin/HEAD/modules/mplug/models/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /modules/sadtalker/src/config/similarity_Lm3D_all.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaleido-lab/dolphin/HEAD/modules/sadtalker/src/config/similarity_Lm3D_all.mat -------------------------------------------------------------------------------- 
/modules/sadtalker/src/face3d/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes a miscellaneous collection of useful helper functions.""" 2 | from .util import * 3 | 4 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/constants.py: -------------------------------------------------------------------------------- 1 | UNIFORM_LIGHT_DIRECTION = [0.09387503, -0.63953443, -0.7630093] 2 | BASIC_AMBIENT_COLOR = 0.3 3 | BASIC_DIFFUSE_COLOR = 0.7 4 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/__init__.py: -------------------------------------------------------------------------------- 1 | from .render import render_mesh, render_model 2 | from .view_data import BlenderViewData 3 | 4 | __all__ = ["BlenderViewData", "render_model"] 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | *.iml 4 | *.xml 5 | 6 | *.pth 7 | *.mp4 8 | *.tar 9 | *.pt 10 | *.bin 11 | *.ckpt 12 | *.safetensors 13 | image 14 | video 15 | logs/ 16 | modules/bark_voice_clone/pretrain_work_dir 17 | *.wav -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from modules.sadtalker import Sadtalker 2 | import torch 3 | 4 | if torch.cuda.is_available(): 5 | device = "cuda" 6 | else: 7 | device = "cpu" 8 | 9 | sd = Sadtalker(device) 10 | sd.inference("audio/ac9fc7da.wav,image/test.png") 11 | 12 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/safetensor_helper.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def load_x_from_safetensor(checkpoint, key): 4 | x_generator = {} 5 | for k,v in checkpoint.items(): 6 | if key in k: 7 | x_generator[k.replace(key+'.', '')] = v 8 | return x_generator -------------------------------------------------------------------------------- /model_zoo/mplug/videocap_vatex_mplug_large.yaml: -------------------------------------------------------------------------------- 1 | bert_config: './model_zoo/mplug/config_bert.json' 2 | image_res: 224 3 | vision_width: 1024 4 | distill: True 5 | clip_name: "ViT-L-14" 6 | k_test: 128 7 | eos: '[SEP]' 8 | bos: '[CLS]' 9 | prompt: 'a video of' 10 | use_checkpoint: true 11 | num_frm_test: 8 12 | min_length: 10 13 | max_length: 20 14 | beam_size: 3 15 | text_encoder: 'bert-base-uncased' 16 | text_decoder: 'bert-base-uncased' -------------------------------------------------------------------------------- /modules/annotator/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 
7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/sync_batchnorm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : __init__.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d 12 | from .replicate import DataParallelWithCallback, patch_replication_callback 13 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def normalize(v: torch.Tensor) -> torch.Tensor: 5 | return v / torch.linalg.norm(v, dim=-1, keepdim=True) 6 | 7 | 8 | def cross_product(v1: torch.Tensor, v2: torch.Tensor) -> torch.Tensor: 9 | return torch.stack( 10 | [ 11 | v1[..., 1] * v2[..., 2] - v2[..., 1] * v1[..., 2], 12 | -(v1[..., 0] * v2[..., 2] - v2[..., 0] * v1[..., 2]), 13 | v1[..., 0] * v2[..., 1] - v2[..., 0] * v1[..., 1], 14 | ], 15 | dim=-1, 16 | ) 17 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/text2speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from TTS.api import TTS 4 | 5 | 6 | class TTSTalker(): 7 | def __init__(self) -> None: 8 | model_name = TTS.list_models()[0] 9 | self.tts = TTS(model_name) 10 | 11 | def test(self, text, language='en'): 12 | 13 | tempf = tempfile.NamedTemporaryFile( 14 | delete = False, 15 | suffix = ('.'+'wav'), 16 | ) 17 | 18 | self.tts.tts_to_file(text, speaker=self.tts.speakers[0], language=language, file_path=tempf.name) 19 | 20 | return tempf.name -------------------------------------------------------------------------------- /modules/stable_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline, PNDMScheduler 2 | from utils import generate_image_name 3 | 4 | class Text2Image: 5 | def __init__(self): 6 | self.pndm = PNDMScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler") 7 | self.pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=self.pndm) 8 | 9 | def image_generation(self, text): 10 | image = self.pipeline(text).images[0] 11 | image_url = generate_image_name() 12 | image.save(image_url) 13 | return image_url 14 | 15 | -------------------------------------------------------------------------------- /modules/mplug/ckpts/bert-base-uncased/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "gradient_checkpointing": false, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | 
"intermediate_size": 3072, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 12, 16 | "num_hidden_layers": 12, 17 | "pad_token_id": 0, 18 | "position_embedding_type": "absolute", 19 | "transformers_version": "4.6.0.dev0", 20 | "type_vocab_size": 2, 21 | "use_cache": true, 22 | "vocab_size": 30522 23 | } 24 | -------------------------------------------------------------------------------- /model_zoo/mplug/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": false, 21 | "use_cache":false, 22 | "gradient_checkpointing": true, 23 | "text_encoder_layers": 6, 24 | "fusion_layers": 6, 25 | "text_decode_layers": 12, 26 | "stride_layer": 6 27 | } 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu113 2 | accelerate==0.17.1 3 | basicsr==1.4.2 4 | decord==0.6.0 5 | diffusers==0.16.1 6 | einops==0.3.0 7 | ftfy 8 | gradio==3.20.1 9 | imageio==2.19.3 10 | kornia==0.6.8 11 | langchain==0.0.101 12 | modelscope==1.4.2 13 | moviepy==1.0.3 14 | omegaconf==2.3.0 15 | openai 16 | opencv-python 17 | open_clip_torch 18 | oss2 19 | pandas==2.0.0 20 | Pillow==9.5.0 21 | PyYAML==6.0 22 | ruamel_yaml 23 | timm==0.4.9 24 | tomesd 25 | torch==1.12.1+cu113 26 | torchaudio==0.12.1+cu113 27 | torchvision==0.13.1+cu113 28 | tqdm==4.65.0 29 | transformers==4.28.0 30 | git+https://github.com/suno-ai/bark.git 31 | 32 | # SadTalker 33 | ffmpeg 34 | av 35 | numpy==1.23.4 36 | face_alignment==1.3.5 37 | imageio-ffmpeg==0.4.7 38 | librosa==0.9.2 39 | numba 40 | resampy==0.3.1 41 | pydub==0.25.1 42 | scipy==1.10.1 43 | yacs==0.1.8 44 | pyyaml 45 | joblib==1.1.0 46 | scikit-image==0.19.3 47 | facexlib==0.3.0 48 | gfpgan 49 | safetensors 50 | -------------------------------------------------------------------------------- /modules/mplug/__init__.py: -------------------------------------------------------------------------------- 1 | from .get_video_caption import prepare_model, pipeline 2 | 3 | 4 | mplug_model_zoo = "model_zoo/mplug" 5 | 6 | 7 | class VideoCaptioning: 8 | def __init__(self, device): 9 | print("Initializing mPLUG for VideoCaptioning") 10 | self.download_models() 11 | self.device = device 12 | self.model, self.tokenizer = prepare_model(device) 13 | self.pipe = pipeline 14 | 15 | def inference(self, inputs): 16 | return pipeline(inputs, self.model, self.tokenizer, self.device) 17 | 18 | def download_models(self): 19 | model_list = [ 20 | "https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/ViT-L-14.tar", 21 | "https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/mplug_large.pth", 22 | ] 23 | for url in model_list: 24 | from basicsr.utils.download_util import load_file_from_url 25 | 26 | load_file_from_url(url, model_dir=mplug_model_zoo) 27 | 
-------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/models/arcface_torch/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200 2 | from .mobilefacenet import get_mbf 3 | 4 | 5 | 6 | def get_model(name, **kwargs): 7 | # resnet 8 | if name == "r18": 9 | return iresnet18(False, **kwargs) 10 | elif name == "r34": 11 | return iresnet34(False, **kwargs) 12 | elif name == "r50": 13 | return iresnet50(False, **kwargs) 14 | elif name == "r100": 15 | return iresnet100(False, **kwargs) 16 | elif name == "r200": 17 | return iresnet200(False, **kwargs) 18 | elif name == "r2060": 19 | from .iresnet2060 import iresnet2060 20 | return iresnet2060(False, **kwargs) 21 | elif name == "mbf": 22 | fp16 = kwargs.get("fp16", False) 23 | num_features = kwargs.get("num_features", 512) 24 | return get_mbf(fp16=fp16, num_features=num_features) 25 | else: 26 | raise ValueError() -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/util.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def timestep_embedding(timesteps, dim, max_period=10000): 7 | """ 8 | Create sinusoidal timestep embeddings. 9 | :param timesteps: a 1-D Tensor of N indices, one per batch element. 10 | These may be fractional. 11 | :param dim: the dimension of the output. 12 | :param max_period: controls the minimum frequency of the embeddings. 13 | :return: an [N x dim] Tensor of positional embeddings. 14 | """ 15 | half = dim // 2 16 | freqs = torch.exp( 17 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 18 | ).to(device=timesteps.device) 19 | args = timesteps[:, None].to(timesteps.dtype) * freqs[None] 20 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 21 | if dim % 2: 22 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 23 | return embedding 24 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/sync_batchnorm/unittest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : unittest.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 
10 | 11 | import unittest 12 | 13 | import numpy as np 14 | from torch.autograd import Variable 15 | 16 | 17 | def as_numpy(v): 18 | if isinstance(v, Variable): 19 | v = v.data 20 | return v.cpu().numpy() 21 | 22 | 23 | class TorchTestCase(unittest.TestCase): 24 | def assertTensorClose(self, a, b, atol=1e-3, rtol=1e-3): 25 | npa, npb = as_numpy(a), as_numpy(b) 26 | self.assertTrue( 27 | np.allclose(npa, npb, atol=atol), 28 | 'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max()) 29 | ) 30 | -------------------------------------------------------------------------------- /modules/bark/__init__.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write as write_wav 2 | from bark import SAMPLE_RATE, generate_audio, preload_models 3 | 4 | from utils import generate_audio_name 5 | 6 | 7 | class Text2Audio: 8 | def __init__(self, **kwargs): 9 | print("Initializing Bark for Text2Audio") 10 | # download and load all models 11 | print("Loading bark models for text2audio...") 12 | preload_models() 13 | 14 | def text2audio(self, inputs): 15 | # generate audio from text 16 | text = inputs 17 | audio_array = generate_audio(text) 18 | audio_path = generate_audio_name() 19 | write_wav(audio_path, SAMPLE_RATE, audio_array) 20 | return audio_path 21 | 22 | def text2music(self, inputs): 23 | # generate music from text 24 | text = "♪ " + inputs + " ♪" 25 | audio_array = generate_audio(text) 26 | audio_path = generate_audio_name() 27 | write_wav(audio_path, SAMPLE_RATE, audio_array) 28 | return audio_path 29 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/query.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Callable, Optional 3 | 4 | import torch 5 | 6 | 7 | @dataclass 8 | class Query: 9 | # Both of these are of shape [batch_size x ... 
x 3] 10 | position: torch.Tensor 11 | direction: Optional[torch.Tensor] = None 12 | 13 | t_min: Optional[torch.Tensor] = None 14 | t_max: Optional[torch.Tensor] = None 15 | 16 | def copy(self) -> "Query": 17 | return Query( 18 | position=self.position, 19 | direction=self.direction, 20 | t_min=self.t_min, 21 | t_max=self.t_max, 22 | ) 23 | 24 | def map_tensors(self, f: Callable[[torch.Tensor], torch.Tensor]) -> "Query": 25 | return Query( 26 | position=f(self.position), 27 | direction=f(self.direction) if self.direction is not None else None, 28 | t_min=f(self.t_min) if self.t_min is not None else None, 29 | t_max=f(self.t_max) if self.t_max is not None else None, 30 | ) 31 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/util/io.py: -------------------------------------------------------------------------------- 1 | import io 2 | from contextlib import contextmanager 3 | from typing import Any, BinaryIO, Iterator, Union 4 | 5 | import blobfile as bf 6 | import yaml 7 | 8 | from ..util.collections import AttrDict 9 | 10 | 11 | def read_config(path_or_file: Union[str, io.IOBase]) -> Any: 12 | if isinstance(path_or_file, io.IOBase): 13 | obj = yaml.load(path_or_file, Loader=yaml.SafeLoader) 14 | else: 15 | with bf.BlobFile(path_or_file, "rb") as f: 16 | try: 17 | obj = yaml.load(f, Loader=yaml.SafeLoader) 18 | except Exception as exc: 19 | with bf.BlobFile(path_or_file, "rb") as f: 20 | print(f.read()) 21 | raise exc 22 | if isinstance(obj, dict): 23 | return AttrDict(obj) 24 | return obj 25 | 26 | 27 | @contextmanager 28 | def buffered_writer(raw_f: BinaryIO) -> Iterator[io.BufferedIOBase]: 29 | if isinstance(raw_f, io.BufferedIOBase): 30 | yield raw_f 31 | else: 32 | f = io.BufferedWriter(raw_f) 33 | yield f 34 | f.flush() 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BUAA-PrismGroup 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
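Stepping back to shap_e/util/io.py above, a short sketch of how read_config is typically called; the YAML path here is hypothetical, and dict-valued YAML comes back wrapped in the project's AttrDict:

from modules.shap_e.shap_e.util.io import read_config

cfg = read_config("model_zoo/shap_e/example_config.yaml")  # hypothetical path
print(list(cfg.keys()))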
-------------------------------------------------------------------------------- /modules/annotator/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import os 4 | 5 | 6 | annotator_ckpts_path = "model_zoo/annotator" 7 | 8 | 9 | def HWC3(x): 10 | assert x.dtype == np.uint8 11 | if x.ndim == 2: 12 | x = x[:, :, None] 13 | assert x.ndim == 3 14 | H, W, C = x.shape 15 | assert C == 1 or C == 3 or C == 4 16 | if C == 3: 17 | return x 18 | if C == 1: 19 | return np.concatenate([x, x, x], axis=2) 20 | if C == 4: 21 | color = x[:, :, 0:3].astype(np.float32) 22 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 23 | y = color * alpha + 255.0 * (1.0 - alpha) 24 | y = y.clip(0, 255).astype(np.uint8) 25 | return y 26 | 27 | 28 | def resize_image(input_image, resolution): 29 | H, W, C = input_image.shape 30 | H = float(H) 31 | W = float(W) 32 | k = float(resolution) / min(H, W) 33 | H *= k 34 | W *= k 35 | H = int(np.round(H / 64.0)) * 64 36 | W = int(np.round(W / 64.0)) * 64 37 | img = cv2.resize( 38 | input_image, 39 | (W, H), 40 | interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, 41 | ) 42 | return img 43 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/latent_diffusion.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class SplitVectorDiffusion(nn.Module): 8 | def __init__(self, *, device: torch.device, wrapped: nn.Module, n_ctx: int, d_latent: int): 9 | super().__init__() 10 | self.device = device 11 | self.n_ctx = n_ctx 12 | self.d_latent = d_latent 13 | self.wrapped = wrapped 14 | 15 | if hasattr(self.wrapped, "cached_model_kwargs"): 16 | self.cached_model_kwargs = self.wrapped.cached_model_kwargs 17 | 18 | def forward(self, x: torch.Tensor, t: torch.Tensor, **kwargs): 19 | h = x.reshape(x.shape[0], self.n_ctx, -1).permute(0, 2, 1) 20 | pre_channels = h.shape[1] 21 | h = self.wrapped(h, t, **kwargs) 22 | assert ( 23 | h.shape[1] == pre_channels * 2 24 | ), "expected twice as many outputs for variance prediction" 25 | eps, var = torch.chunk(h, 2, dim=1) 26 | return torch.cat( 27 | [ 28 | eps.permute(0, 2, 1).flatten(1), 29 | var.permute(0, 2, 1).flatten(1), 30 | ], 31 | dim=1, 32 | ) 33 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nn/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | ArrayType = Union[np.ndarray, Iterable[int], torch.Tensor] 7 | 8 | 9 | def to_torch(arr: ArrayType, dtype=torch.float): 10 | if isinstance(arr, torch.Tensor): 11 | return arr 12 | return torch.from_numpy(np.array(arr)).to(dtype) 13 | 14 | 15 | def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: 16 | """ 17 | Sample from the given discrete probability distribution with replacement. 18 | 19 | The i-th bin is assumed to have mass pmf[i]. 
20 | 21 | :param pmf: [batch_size, *shape, n_samples, 1] where (pmf.sum(dim=-2) == 1).all() 22 | :param n_samples: number of samples 23 | 24 | :return: indices sampled with replacement 25 | """ 26 | 27 | *shape, support_size, last_dim = pmf.shape 28 | assert last_dim == 1 29 | 30 | cdf = torch.cumsum(pmf.view(-1, support_size), dim=1) 31 | inds = torch.searchsorted(cdf, torch.rand(cdf.shape[0], n_samples, device=cdf.device)) 32 | 33 | return inds.view(*shape, n_samples, 1).clamp(0, support_size - 1) 34 | 35 | 36 | def safe_divide(a, b, epsilon=1e-6): 37 | return a / torch.where(b < 0, b - epsilon, b + epsilon) 38 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/facerender.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | common_params: 3 | num_kp: 15 4 | image_channel: 3 5 | feature_channel: 32 6 | estimate_jacobian: False # True 7 | kp_detector_params: 8 | temperature: 0.1 9 | block_expansion: 32 10 | max_features: 1024 11 | scale_factor: 0.25 # 0.25 12 | num_blocks: 5 13 | reshape_channel: 16384 # 16384 = 1024 * 16 14 | reshape_depth: 16 15 | he_estimator_params: 16 | block_expansion: 64 17 | max_features: 2048 18 | num_bins: 66 19 | generator_params: 20 | block_expansion: 64 21 | max_features: 512 22 | num_down_blocks: 2 23 | reshape_channel: 32 24 | reshape_depth: 16 # 512 = 32 * 16 25 | num_resblocks: 6 26 | estimate_occlusion_map: True 27 | dense_motion_params: 28 | block_expansion: 32 29 | max_features: 1024 30 | num_blocks: 5 31 | reshape_depth: 16 32 | compress: 4 33 | discriminator_params: 34 | scales: [1] 35 | block_expansion: 32 36 | max_features: 512 37 | num_blocks: 4 38 | sn: True 39 | mapping_params: 40 | coeff_nc: 70 41 | descriptor_nc: 1024 42 | layer: 3 43 | num_kp: 15 44 | num_bins: 66 45 | 46 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/facerender_still.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | common_params: 3 | num_kp: 15 4 | image_channel: 3 5 | feature_channel: 32 6 | estimate_jacobian: False # True 7 | kp_detector_params: 8 | temperature: 0.1 9 | block_expansion: 32 10 | max_features: 1024 11 | scale_factor: 0.25 # 0.25 12 | num_blocks: 5 13 | reshape_channel: 16384 # 16384 = 1024 * 16 14 | reshape_depth: 16 15 | he_estimator_params: 16 | block_expansion: 64 17 | max_features: 2048 18 | num_bins: 66 19 | generator_params: 20 | block_expansion: 64 21 | max_features: 512 22 | num_down_blocks: 2 23 | reshape_channel: 32 24 | reshape_depth: 16 # 512 = 32 * 16 25 | num_resblocks: 6 26 | estimate_occlusion_map: True 27 | dense_motion_params: 28 | block_expansion: 32 29 | max_features: 1024 30 | num_blocks: 5 31 | reshape_depth: 16 32 | compress: 4 33 | discriminator_params: 34 | scales: [1] 35 | block_expansion: 32 36 | max_features: 512 37 | num_blocks: 4 38 | sn: True 39 | mapping_params: 40 | coeff_nc: 73 41 | descriptor_nc: 1024 42 | layer: 3 43 | num_kp: 15 44 | num_bins: 66 45 | 46 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/auido2pose.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/train_33.txt 3 | EVAL_FILE_LIST: 
/apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/val.txt 4 | TRAIN_BATCH_SIZE: 64 5 | EVAL_BATCH_SIZE: 1 6 | EXP: True 7 | EXP_DIM: 64 8 | FRAME_LEN: 32 9 | COEFF_LEN: 73 10 | NUM_CLASSES: 46 11 | AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav 12 | COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb 13 | DEBUG: True 14 | 15 | 16 | MODEL: 17 | AUDIOENCODER: 18 | LEAKY_RELU: True 19 | NORM: 'IN' 20 | DISCRIMINATOR: 21 | LEAKY_RELU: False 22 | INPUT_CHANNELS: 6 23 | CVAE: 24 | AUDIO_EMB_IN_SIZE: 512 25 | AUDIO_EMB_OUT_SIZE: 6 26 | SEQ_LEN: 32 27 | LATENT_SIZE: 64 28 | ENCODER_LAYER_SIZES: [192, 128] 29 | DECODER_LAYER_SIZES: [128, 192] 30 | 31 | 32 | TRAIN: 33 | MAX_EPOCH: 150 34 | GENERATOR: 35 | LR: 1.0e-4 36 | DISCRIMINATOR: 37 | LR: 1.0e-4 38 | LOSS: 39 | LAMBDA_REG: 1 40 | LAMBDA_LANDMARKS: 0 41 | LAMBDA_VERTICES: 0 42 | LAMBDA_GAN_MOTION: 0.7 43 | LAMBDA_GAN_COEFF: 0 44 | LAMBDA_KL: 1 45 | 46 | TAG: 47 | NAME: cvae_UNET_useAudio_usewav2lipAudioEncoder 48 | 49 | 50 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/torch_mesh.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Dict, Optional 3 | 4 | import torch 5 | 6 | from .mesh import TriMesh 7 | 8 | 9 | @dataclass 10 | class TorchMesh: 11 | """ 12 | A 3D triangle mesh with optional data at the vertices and faces. 13 | """ 14 | 15 | # [N x 3] array of vertex coordinates. 16 | verts: torch.Tensor 17 | 18 | # [M x 3] array of triangles, pointing to indices in verts. 19 | faces: torch.Tensor 20 | 21 | # Extra data per vertex and face. 22 | vertex_channels: Optional[Dict[str, torch.Tensor]] = field(default_factory=dict) 23 | face_channels: Optional[Dict[str, torch.Tensor]] = field(default_factory=dict) 24 | 25 | def tri_mesh(self) -> TriMesh: 26 | """ 27 | Create a CPU version of the mesh. 
28 | """ 29 | return TriMesh( 30 | verts=self.verts.detach().cpu().numpy(), 31 | faces=self.faces.cpu().numpy(), 32 | vertex_channels=( 33 | {k: v.detach().cpu().numpy() for k, v in self.vertex_channels.items()} 34 | if self.vertex_channels is not None 35 | else None 36 | ), 37 | face_channels=( 38 | {k: v.detach().cpu().numpy() for k, v in self.face_channels.items()} 39 | if self.face_channels is not None 40 | else None 41 | ), 42 | ) 43 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2exp_models/audio2exp.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import torch 3 | from torch import nn 4 | 5 | 6 | class Audio2Exp(nn.Module): 7 | def __init__(self, netG, cfg, device, prepare_training_loss=False): 8 | super(Audio2Exp, self).__init__() 9 | self.cfg = cfg 10 | self.device = device 11 | self.netG = netG.to(device) 12 | 13 | def test(self, batch): 14 | 15 | mel_input = batch['indiv_mels'] # bs T 1 80 16 16 | bs = mel_input.shape[0] 17 | T = mel_input.shape[1] 18 | 19 | exp_coeff_pred = [] 20 | 21 | for i in tqdm(range(0, T, 10),'audio2exp:'): # every 10 frames 22 | 23 | current_mel_input = mel_input[:,i:i+10] 24 | 25 | #ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64 26 | ref = batch['ref'][:, :, :64][:, i:i+10] 27 | ratio = batch['ratio_gt'][:, i:i+10] #bs T 28 | 29 | audiox = current_mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16 30 | 31 | curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64 32 | 33 | exp_coeff_pred += [curr_exp_coeff_pred] 34 | 35 | # BS x T x 64 36 | results_dict = { 37 | 'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1) 38 | } 39 | return results_dict 40 | 41 | 42 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/auido2exp.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/train.txt 3 | EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/val.txt 4 | TRAIN_BATCH_SIZE: 32 5 | EVAL_BATCH_SIZE: 32 6 | EXP: True 7 | EXP_DIM: 64 8 | FRAME_LEN: 32 9 | COEFF_LEN: 73 10 | NUM_CLASSES: 46 11 | AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav 12 | COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav2lip_3dmm 13 | LMDB_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb 14 | DEBUG: True 15 | NUM_REPEATS: 2 16 | T: 40 17 | 18 | 19 | MODEL: 20 | FRAMEWORK: V2 21 | AUDIOENCODER: 22 | LEAKY_RELU: True 23 | NORM: 'IN' 24 | DISCRIMINATOR: 25 | LEAKY_RELU: False 26 | INPUT_CHANNELS: 6 27 | CVAE: 28 | AUDIO_EMB_IN_SIZE: 512 29 | AUDIO_EMB_OUT_SIZE: 128 30 | SEQ_LEN: 32 31 | LATENT_SIZE: 256 32 | ENCODER_LAYER_SIZES: [192, 1024] 33 | DECODER_LAYER_SIZES: [1024, 192] 34 | 35 | 36 | TRAIN: 37 | MAX_EPOCH: 300 38 | GENERATOR: 39 | LR: 2.0e-5 40 | DISCRIMINATOR: 41 | LR: 1.0e-5 42 | LOSS: 43 | W_FEAT: 0 44 | W_COEFF_EXP: 2 45 | W_LM: 1.0e-2 46 | W_LM_MOUTH: 0 47 | W_REG: 0 48 | W_SYNC: 0 49 | W_COLOR: 0 50 | W_EXPRESSION: 0 51 | W_LIPREADING: 0.01 52 | W_LIPREADING_VV: 0 53 | W_EYE_BLINK: 4 54 | 55 | TAG: 56 | NAME: small_dataset 57 | 58 | 59 | -------------------------------------------------------------------------------- /modules/shap_e/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
os 3 | from .shape_gen import ShapE 4 | from tqdm import tqdm 5 | 6 | class Shap_E: 7 | def __init__(self, device): 8 | self.cond_type = "t2m" 9 | self.cache_dir = "modules/shap_e/cache_dir" 10 | self.output_dir = "video" 11 | self.device = device 12 | 13 | def inference(self, prompt): 14 | self.cond = prompt 15 | shapE = ShapE(device=self.device, cache_dir=self.cache_dir, type=self.cond_type) 16 | 17 | if os.path.exists(self.cond): 18 | if self.cond_type == "t2m": 19 | prompts_path = self.cond 20 | with open(prompts_path, "r") as f: 21 | prompts = f.readlines() 22 | prompts = [prompt.strip() for prompt in prompts] 23 | 24 | for prompt in tqdm(prompts): 25 | results_dir = shapE.inference(prompt, self.output_dir) 26 | 27 | elif self.cond_type == "i2m": 28 | base_dir = self.cond 29 | images_path = [ 30 | os.path.join(base_dir, f) 31 | for f in os.listdir(base_dir) 32 | if f.endswith(".png") or f.endswith(".jpg") 33 | ] 34 | 35 | for image_path in tqdm(images_path): 36 | results_dir = shapE.inference(image_path, self.output_dir) 37 | 38 | else: 39 | results_dir = shapE.inference(self.cond, self.output_dir) 40 | print(f"Output saved to {results_dir}") 41 | 42 | -------------------------------------------------------------------------------- /modules/sadtalker/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import main 2 | import torch 3 | 4 | class Sadtalker: 5 | def __init__(self, device): 6 | self.driven_audio = './image/bus_chinese.wav' 7 | self.source_image = './image/art_0.png' 8 | self.ref_eyeblink = None 9 | self.ref_pose = None 10 | self.checkpoint_dir = './modules/sadtalker/checkpoints' 11 | self.result_dir = './video/sadtalker' 12 | self.pose_style = 0 13 | self.batch_size = 2 14 | self.size = 256 15 | self.expression_scale = 1.0 16 | self.input_yaw = None 17 | self.input_pitch = None 18 | self.input_roll = None 19 | self.enhancer = None 20 | self.background_enhancer = None 21 | self.preprocess = 'crop' 22 | self.cpu = False 23 | self.old_version = False 24 | self.still = False 25 | 26 | self.net_recon = 'resnet50' 27 | self.init_path = None 28 | self.use_last_fc = False 29 | self.bfm_folder = '../modules/sadtalker/checkpoints/BFM_Fitting/' 30 | self.bfm_model = 'BFM_model_front.mat' 31 | self.face3dvis = False 32 | self.verbose = False 33 | 34 | self.focal = 1015.0 35 | self.center = 112.0 36 | self.camera_d = 10.0 37 | self.z_near = 5.0 38 | self.z_far = 15.0 39 | 40 | self.device = device 41 | 42 | def inference(self, inputs): 43 | splits = inputs.split(",") 44 | self.driven_audio = splits[0] 45 | self.source_image = splits[1] 46 | main(self) -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/videoio.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import uuid 3 | 4 | import os 5 | 6 | import cv2 7 | 8 | def load_video_to_cv2(input_path): 9 | video_stream = cv2.VideoCapture(input_path) 10 | fps = video_stream.get(cv2.CAP_PROP_FPS) 11 | full_frames = [] 12 | while 1: 13 | still_reading, frame = video_stream.read() 14 | if not still_reading: 15 | video_stream.release() 16 | break 17 | full_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 18 | return full_frames 19 | 20 | def save_video_with_watermark(video, audio, save_path, watermark=False): 21 | temp_file = str(uuid.uuid4())+'.mp4' 22 | cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -vcodec copy "%s"' % (video, audio, temp_file) 
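# The ffmpeg invocation built above: -y overwrites the temporary output file, -hide_banner and
# -loglevel error keep the console quiet, the two -i flags take the rendered (silent) video and
# the driving audio, and -vcodec copy muxes them together without re-encoding the video stream.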
23 | os.system(cmd) 24 | 25 | if watermark is False: 26 | shutil.move(temp_file, save_path) 27 | else: 28 | # watermark 29 | try: 30 | ##### check if stable-diffusion-webui 31 | import webui 32 | from modules import paths 33 | watarmark_path = paths.script_path+"/extensions/SadTalker/docs/sadtalker_logo.png" 34 | except: 35 | # get the root path of sadtalker. 36 | dir_path = os.path.dirname(os.path.realpath(__file__)) 37 | watarmark_path = dir_path+"/../../docs/sadtalker_logo.png" 38 | 39 | cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -filter_complex "[1]scale=100:-1[wm];[0][wm]overlay=(main_w-overlay_w)-10:10" "%s"' % (temp_file, watarmark_path, save_path) 40 | os.system(cmd) 41 | os.remove(temp_file) -------------------------------------------------------------------------------- /modules/annotator/midas/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | 5 | from einops import rearrange 6 | from .api import MiDaSInference 7 | 8 | 9 | class MidasDetector: 10 | def __init__(self, device=None): 11 | self.device = device or torch.device( 12 | "cuda" if torch.cuda.is_available() else "cpu" 13 | ) 14 | self.model = MiDaSInference(model_type="dpt_hybrid").to(self.device) 15 | 16 | def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1): 17 | assert input_image.ndim == 3 18 | image_depth = input_image 19 | with torch.no_grad(): 20 | image_depth = torch.from_numpy(image_depth).float().to(self.device) 21 | image_depth = image_depth / 127.5 - 1.0 22 | image_depth = rearrange(image_depth, "h w c -> 1 c h w") 23 | depth = self.model(image_depth)[0] 24 | 25 | depth_pt = depth.clone() 26 | depth_pt -= torch.min(depth_pt) 27 | depth_pt /= torch.max(depth_pt) 28 | depth_pt = depth_pt.cpu().numpy() 29 | depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8) 30 | 31 | depth_np = depth.cpu().numpy() 32 | x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3) 33 | y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3) 34 | z = np.ones_like(x) * a 35 | x[depth_pt < bg_th] = 0 36 | y[depth_pt < bg_th] = 0 37 | normal = np.stack([x, y, z], axis=2) 38 | normal /= np.sum(normal**2.0, axis=2, keepdims=True) ** 0.5 39 | normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8) 40 | 41 | return depth_image, normal_image 42 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/modules/mapping.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class MappingNet(nn.Module): 9 | def __init__(self, coeff_nc, descriptor_nc, layer, num_kp, num_bins): 10 | super( MappingNet, self).__init__() 11 | 12 | self.layer = layer 13 | nonlinearity = nn.LeakyReLU(0.1) 14 | 15 | self.first = nn.Sequential( 16 | torch.nn.Conv1d(coeff_nc, descriptor_nc, kernel_size=7, padding=0, bias=True)) 17 | 18 | for i in range(layer): 19 | net = nn.Sequential(nonlinearity, 20 | torch.nn.Conv1d(descriptor_nc, descriptor_nc, kernel_size=3, padding=0, dilation=3)) 21 | setattr(self, 'encoder' + str(i), net) 22 | 23 | self.pooling = nn.AdaptiveAvgPool1d(1) 24 | self.output_nc = descriptor_nc 25 | 26 | self.fc_roll = nn.Linear(descriptor_nc, num_bins) 27 | self.fc_pitch = nn.Linear(descriptor_nc, num_bins) 28 | self.fc_yaw = nn.Linear(descriptor_nc, num_bins) 29 | self.fc_t = nn.Linear(descriptor_nc, 3) 30 | 
self.fc_exp = nn.Linear(descriptor_nc, 3*num_kp) 31 | 32 | def forward(self, input_3dmm): 33 | out = self.first(input_3dmm) 34 | for i in range(self.layer): 35 | model = getattr(self, 'encoder' + str(i)) 36 | out = model(out) + out[:,:,3:-3] 37 | out = self.pooling(out) 38 | out = out.view(out.shape[0], -1) 39 | #print('out:', out.shape) 40 | 41 | yaw = self.fc_yaw(out) 42 | pitch = self.fc_pitch(out) 43 | roll = self.fc_roll(out) 44 | t = self.fc_t(out) 45 | exp = self.fc_exp(out) 46 | 47 | return {'yaw': yaw, 'pitch': pitch, 'roll': roll, 't': t, 'exp': exp} -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/stf/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Optional 3 | 4 | import torch 5 | 6 | from ...models.query import Query 7 | from ...models.renderer import append_tensor 8 | from ...util.collections import AttrDict 9 | 10 | 11 | class Model(ABC): 12 | @abstractmethod 13 | def forward( 14 | self, 15 | query: Query, 16 | params: Optional[Dict[str, torch.Tensor]] = None, 17 | options: Optional[Dict[str, Any]] = None, 18 | ) -> AttrDict[str, Any]: 19 | """ 20 | Predict an attribute given position 21 | """ 22 | 23 | def forward_batched( 24 | self, 25 | query: Query, 26 | query_batch_size: int = 4096, 27 | params: Optional[Dict[str, torch.Tensor]] = None, 28 | options: Optional[Dict[str, Any]] = None, 29 | ) -> AttrDict[str, Any]: 30 | if not query.position.numel(): 31 | # Avoid torch.cat() of zero tensors. 32 | return self(query, params=params, options=options) 33 | 34 | if options.cache is None: 35 | created_cache = True 36 | options.cache = AttrDict() 37 | else: 38 | created_cache = False 39 | 40 | results_list = AttrDict() 41 | for i in range(0, query.position.shape[1], query_batch_size): 42 | out = self( 43 | query=query.map_tensors(lambda x, i=i: x[:, i : i + query_batch_size]), 44 | params=params, 45 | options=options, 46 | ) 47 | results_list = results_list.combine(out, append_tensor) 48 | 49 | if created_cache: 50 | del options["cache"] 51 | 52 | return results_list.map(lambda key, tensor_list: torch.cat(tensor_list, dim=1)) 53 | -------------------------------------------------------------------------------- /modules/bark_voice_clone/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wave 3 | from utils import generate_audio_name 4 | from modelscope.models.audio.tts import SambertHifigan 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | 8 | 9 | class BarkVoiceClone: 10 | 11 | def __init__(self): 12 | self.model_dir = os.path.abspath("./modules/bark_voice_clone/pretrain_work_dir") 13 | self.output_file = generate_audio_name() 14 | self.num_channels = 1 15 | self.sample_width = 2 16 | self.frame_rate = 18050 17 | 18 | def inference(self, prompt): 19 | custom_infer_abs = { 20 | 'voice_name': 21 | 'F7', 22 | 'am_ckpt': os.path.join(self.model_dir, 'tmp_am', 'ckpt'), 23 | 'am_config': os.path.join(self.model_dir, 'tmp_am', 'config.yaml'), 24 | 'voc_ckpt': os.path.join(self.model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'), 25 | 'voc_config': os.path.join(self.model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'), 26 | 'audio_config': os.path.join(self.model_dir, 'data', 'audio_config.yaml'), 27 | 'se_file': os.path.join(self.model_dir, 'data', 'se', 'se.npy') 28 | } 29 | 
kwargs = {'custom_ckpt': custom_infer_abs} 30 | 31 | model_id = SambertHifigan(os.path.join(self.model_dir, "orig_model"), **kwargs) 32 | 33 | inference = pipeline(task=Tasks.text_to_speech, model=model_id) 34 | output = inference(input=prompt) 35 | 36 | with wave.open(self.output_file, 'wb') as wav_file: 37 | wav_file.setnchannels(self.num_channels) 38 | wav_file.setsampwidth(self.sample_width) 39 | wav_file.setframerate(self.frame_rate) 40 | wav_file.writeframesraw(output["output_wav"]) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os, sys, uuid 2 | import importlib 3 | 4 | import numpy as np 5 | import torch 6 | import random 7 | 8 | 9 | def instantiate_from_config(config, **kwargs): 10 | if not "target" in config: 11 | raise KeyError("Expected key `target` to instantiate.") 12 | return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs) 13 | 14 | 15 | def get_obj_from_str(string, reload=False): 16 | module, cls = string.rsplit(".", 1) 17 | if reload: 18 | module_imp = importlib.import_module(module) 19 | importlib.reload(module_imp) 20 | return getattr(importlib.import_module(module, package=None), cls) 21 | 22 | 23 | def seed_everything(seed): 24 | random.seed(seed) 25 | np.random.seed(seed) 26 | torch.manual_seed(seed) 27 | torch.cuda.manual_seed_all(seed) 28 | return seed 29 | 30 | 31 | def get_new_video_name(org_vid_name, func_name="update"): 32 | head_tail = os.path.split(org_vid_name) 33 | head = head_tail[0] 34 | tail = head_tail[1] 35 | name_split = tail.split(".")[0].split("_") 36 | this_new_uuid = str(uuid.uuid4())[:4] 37 | if len(name_split) == 1: 38 | most_org_file_name = name_split[0] 39 | else: 40 | assert len(name_split) == 4 41 | most_org_file_name = name_split[3] 42 | recent_prev_file_name = name_split[0] 43 | new_file_name = ( 44 | f"{this_new_uuid}_{func_name}_{recent_prev_file_name}_{most_org_file_name}.mp4" 45 | ) 46 | return os.path.join(head, new_file_name) 47 | 48 | 49 | def generate_video_name_mp4(): 50 | return os.path.join("video", str(uuid.uuid4())[:8] + ".mp4") 51 | 52 | def generate_audio_name(): 53 | return os.path.join("audio", str(uuid.uuid4())[:8] + ".wav") 54 | 55 | def generate_image_name(): 56 | return os.path.join("image", str(uuid.uuid4())[:8] + ".png") 57 | 58 | def get_new_uuid(): 59 | return str(uuid.uuid4())[:8] 60 | -------------------------------------------------------------------------------- /modules/blip/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from transformers import AutoProcessor, Blip2ForConditionalGeneration 4 | from PIL import Image 5 | 6 | from video_utils import prepare_video 7 | 8 | 9 | class ImageCaptioning: 10 | def __init__(self, device): 11 | print("Initializing BLIP2 for ImageCaptioning") 12 | self.device = device 13 | self.processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") 14 | self.model = Blip2ForConditionalGeneration.from_pretrained( 15 | "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16 16 | ).to(device) 17 | 18 | def image_captioning(self, image: Image, prompt=None, is_vqa=False): 19 | if prompt and is_vqa: 20 | prompt = f"Question: {prompt} Answer:" 21 | inputs = self.processor(image, text=prompt, return_tensors="pt").to( 22 | self.device, torch.float16 23 | ) 24 | generated_ids = self.model.generate(**inputs, max_new_tokens=40) 25 | 
generated_text = self.processor.batch_decode( 26 | generated_ids, skip_special_tokens=True 27 | )[0].strip() 28 | return generated_text 29 | 30 | def frames_captioning(self, video_path): 31 | video, fps = prepare_video(video_path, 512, "cpu", normalize=False) 32 | # pick each frame for each second 33 | video = video[::fps] 34 | video_nd = np.transpose(video.numpy(), (0, 2, 3, 1)).astype(np.uint8) 35 | pil_images = [Image.fromarray(frame) for frame in video_nd] 36 | 37 | caption_results = [] 38 | for i, image in enumerate(pil_images): 39 | # image.save(f"temp/{str(i).zfill(5)}.png") 40 | caption = self.image_captioning( 41 | image, prompt="This is a video frame describing that" 42 | ) 43 | caption_results.append(f"Second {i}: {caption}.") 44 | return " ".join(caption_results) 45 | 46 | def inference(self, inputs): 47 | return self.frames_captioning(inputs) 48 | -------------------------------------------------------------------------------- /modules/modelscope_t2v/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | import tempfile 5 | import imageio 6 | import numpy as np 7 | import torch 8 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler 9 | 10 | 11 | from utils import generate_video_name_mp4 12 | 13 | 14 | def to_video(frames: list[np.ndarray], fps: int, out_file=None) -> str: 15 | if out_file is None: 16 | out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name 17 | writer = imageio.get_writer(out_file, format="FFMPEG", fps=fps) 18 | for frame in frames: 19 | writer.append_data(frame) 20 | writer.close() 21 | return out_file 22 | 23 | 24 | class ModelscopeT2V: 25 | def __init__(self, device): 26 | pipe = DiffusionPipeline.from_pretrained( 27 | "damo-vilab/text-to-video-ms-1.7b", 28 | torch_dtype=torch.float16, 29 | variant="fp16", 30 | ).to(device) 31 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 32 | pipe.enable_model_cpu_offload() 33 | pipe.enable_vae_slicing() 34 | 35 | self.pipe = pipe 36 | 37 | def generate_video( 38 | self, 39 | prompt: str, 40 | seed: int, 41 | num_frames: int, 42 | num_inference_steps: int, 43 | out_file: str = None, 44 | ) -> str: 45 | if seed == -1: 46 | seed = random.randint(0, 1000000) 47 | generator = torch.Generator().manual_seed(seed) 48 | frames = self.pipe( 49 | prompt, 50 | num_inference_steps=num_inference_steps, 51 | num_frames=num_frames, 52 | generator=generator, 53 | ).frames 54 | return to_video(frames, 8, out_file=out_file) 55 | 56 | def inference(self, inputs): 57 | video_path = generate_video_name_mp4() 58 | self.generate_video( 59 | prompt=inputs, 60 | seed=-1, 61 | num_frames=16, 62 | num_inference_steps=25, 63 | out_file=video_path, 64 | ) 65 | return video_path 66 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/ply_util.py: -------------------------------------------------------------------------------- 1 | import struct 2 | from typing import BinaryIO, Optional 3 | 4 | import numpy as np 5 | 6 | from ..util.io import buffered_writer 7 | 8 | 9 | def write_ply( 10 | raw_f: BinaryIO, 11 | coords: np.ndarray, 12 | rgb: Optional[np.ndarray] = None, 13 | faces: Optional[np.ndarray] = None, 14 | ): 15 | """ 16 | Write a PLY file for a mesh or a point cloud. 17 | 18 | :param coords: an [N x 3] array of floating point coordinates. 
19 | :param rgb: an [N x 3] array of vertex colors, in the range [0.0, 1.0]. 20 | :param faces: an [N x 3] array of triangles encoded as integer indices. 21 | """ 22 | with buffered_writer(raw_f) as f: 23 | f.write(b"ply\n") 24 | f.write(b"format binary_little_endian 1.0\n") 25 | f.write(bytes(f"element vertex {len(coords)}\n", "ascii")) 26 | f.write(b"property float x\n") 27 | f.write(b"property float y\n") 28 | f.write(b"property float z\n") 29 | if rgb is not None: 30 | f.write(b"property uchar red\n") 31 | f.write(b"property uchar green\n") 32 | f.write(b"property uchar blue\n") 33 | if faces is not None: 34 | f.write(bytes(f"element face {len(faces)}\n", "ascii")) 35 | f.write(b"property list uchar int vertex_index\n") 36 | f.write(b"end_header\n") 37 | 38 | if rgb is not None: 39 | rgb = (rgb * 255.499).round().astype(int) 40 | vertices = [ 41 | (*coord, *rgb) 42 | for coord, rgb in zip( 43 | coords.tolist(), 44 | rgb.tolist(), 45 | ) 46 | ] 47 | format = struct.Struct("<3f3B") 48 | for item in vertices: 49 | f.write(format.pack(*item)) 50 | else: 51 | format = struct.Struct("<3f") 52 | for vertex in coords.tolist(): 53 | f.write(format.pack(*vertex)) 54 | 55 | if faces is not None: 56 | format = struct.Struct("<B3I") 57 | for tri in faces.tolist(): 58 | f.write(format.pack(len(tri), *tri)) -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/render.py: -------------------------------------------------------------------------------- [lines 1-23, the imports and the opening of the render function signature, are missing from this export] 24 | ) -> torch.Tensor: 25 | """ 26 | Return an [H x W x 4] RGBA tensor of the rendered image. 27 | The pixels are floating points, with alpha in the range [0, 1] and the 28 | other colors matching the scale used by the mesh's vertex colors. 29 | """ 30 | light_direction = torch.tensor( 31 | light_direction, device=mesh.vertices.device, dtype=mesh.vertices.dtype 32 | ) 33 | 34 | all_collisions = RayCollisions.collect( 35 | cast_camera( 36 | camera=camera, 37 | mesh=mesh, 38 | ray_batch_size=ray_batch_size, 39 | checkpoint=checkpoint, 40 | ) 41 | ) 42 | num_rays = len(all_collisions.normals) 43 | if mesh.vertex_colors is None: 44 | vertex_colors = torch.tensor([[0.8, 0.8, 0.8]]).to(mesh.vertices).repeat(num_rays, 1) 45 | else: 46 | vertex_colors = mesh.vertex_colors 47 | 48 | light_coeffs = ambient + ( 49 | diffuse * torch.sum(all_collisions.normals * light_direction, dim=-1).abs() 50 | ) 51 | vertex_colors = mesh.vertex_colors[mesh.faces[all_collisions.tri_indices]] 52 | bary_products = torch.sum(vertex_colors * all_collisions.barycentric[..., None], axis=-2) 53 | out_colors = bary_products * light_coeffs[..., None] 54 | res = torch.where(all_collisions.collides[:, None], out_colors, torch.zeros_like(out_colors)) 55 | return torch.cat([res, all_collisions.collides[:, None].float()], dim=-1).view( 56 | camera.height, camera.width, 4 57 | ) 58 | -------------------------------------------------------------------------------- /modules/annotator/openpose/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 4 | 5 | import torch 6 | import numpy as np 7 | from . 
import util 8 | from .body import Body 9 | from .hand import Hand 10 | from ..util import annotator_ckpts_path 11 | 12 | 13 | body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth" 14 | hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth" 15 | 16 | 17 | class OpenposeDetector: 18 | def __init__(self, device=None): 19 | body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth") 20 | hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth") 21 | 22 | if not os.path.exists(hand_modelpath): 23 | from basicsr.utils.download_util import load_file_from_url 24 | 25 | load_file_from_url(body_model_path, model_dir=annotator_ckpts_path) 26 | load_file_from_url(hand_model_path, model_dir=annotator_ckpts_path) 27 | 28 | device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | self.body_estimation = Body(body_modelpath, device) 30 | self.hand_estimation = Hand(hand_modelpath, device) 31 | 32 | def __call__(self, oriImg, hand=False): 33 | oriImg = oriImg[:, :, ::-1].copy() 34 | with torch.no_grad(): 35 | candidate, subset = self.body_estimation(oriImg) 36 | canvas = np.zeros_like(oriImg) 37 | canvas = util.draw_bodypose(canvas, candidate, subset) 38 | if hand: 39 | hands_list = util.handDetect(candidate, subset, oriImg) 40 | all_hand_peaks = [] 41 | for x, y, w, is_left in hands_list: 42 | peaks = self.hand_estimation(oriImg[y : y + w, x : x + w, :]) 43 | peaks[:, 0] = np.where( 44 | peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x 45 | ) 46 | peaks[:, 1] = np.where( 47 | peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y 48 | ) 49 | all_hand_peaks.append(peaks) 50 | canvas = util.draw_handpose(canvas, all_hand_peaks) 51 | return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist()) 52 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/res_unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .networks import ResidualConv, Upsample 4 | 5 | 6 | class ResUnet(nn.Module): 7 | def __init__(self, channel=1, filters=[32, 64, 128, 256]): 8 | super(ResUnet, self).__init__() 9 | 10 | self.input_layer = nn.Sequential( 11 | nn.Conv2d(channel, filters[0], kernel_size=3, padding=1), 12 | nn.BatchNorm2d(filters[0]), 13 | nn.ReLU(), 14 | nn.Conv2d(filters[0], filters[0], kernel_size=3, padding=1), 15 | ) 16 | self.input_skip = nn.Sequential( 17 | nn.Conv2d(channel, filters[0], kernel_size=3, padding=1) 18 | ) 19 | 20 | self.residual_conv_1 = ResidualConv(filters[0], filters[1], stride=(2,1), padding=1) 21 | self.residual_conv_2 = ResidualConv(filters[1], filters[2], stride=(2,1), padding=1) 22 | 23 | self.bridge = ResidualConv(filters[2], filters[3], stride=(2,1), padding=1) 24 | 25 | self.upsample_1 = Upsample(filters[3], filters[3], kernel=(2,1), stride=(2,1)) 26 | self.up_residual_conv1 = ResidualConv(filters[3] + filters[2], filters[2], stride=1, padding=1) 27 | 28 | self.upsample_2 = Upsample(filters[2], filters[2], kernel=(2,1), stride=(2,1)) 29 | self.up_residual_conv2 = ResidualConv(filters[2] + filters[1], filters[1], stride=1, padding=1) 30 | 31 | self.upsample_3 = Upsample(filters[1], filters[1], kernel=(2,1), stride=(2,1)) 32 | self.up_residual_conv3 = ResidualConv(filters[1] + filters[0], filters[0], stride=1, padding=1) 33 | 34 | self.output_layer = 
nn.Sequential( 35 | nn.Conv2d(filters[0], 1, 1, 1), 36 | nn.Sigmoid(), 37 | ) 38 | 39 | def forward(self, x): 40 | # Encode 41 | x1 = self.input_layer(x) + self.input_skip(x) 42 | x2 = self.residual_conv_1(x1) 43 | x3 = self.residual_conv_2(x2) 44 | # Bridge 45 | x4 = self.bridge(x3) 46 | 47 | # Decode 48 | x4 = self.upsample_1(x4) 49 | x5 = torch.cat([x4, x3], dim=1) 50 | 51 | x6 = self.up_residual_conv1(x5) 52 | 53 | x6 = self.upsample_2(x6) 54 | x7 = torch.cat([x6, x2], dim=1) 55 | 56 | x8 = self.up_residual_conv2(x7) 57 | 58 | x8 = self.upsample_3(x8) 59 | x9 = torch.cat([x8, x1], dim=1) 60 | 61 | x10 = self.up_residual_conv3(x9) 62 | 63 | output = self.output_layer(x10) 64 | 65 | return output -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/paste_pic.py: -------------------------------------------------------------------------------- 1 | import cv2, os 2 | import numpy as np 3 | from tqdm import tqdm 4 | import uuid 5 | 6 | from .videoio import save_video_with_watermark 7 | 8 | def paste_pic(video_path, pic_path, crop_info, new_audio_path, full_video_path, extended_crop=False): 9 | 10 | if not os.path.isfile(pic_path): 11 | raise ValueError('pic_path must be a valid path to video/image file') 12 | elif pic_path.split('.')[-1] in ['jpg', 'png', 'jpeg']: 13 | # loader for first frame 14 | full_img = cv2.imread(pic_path) 15 | else: 16 | # loader for videos 17 | video_stream = cv2.VideoCapture(pic_path) 18 | fps = video_stream.get(cv2.CAP_PROP_FPS) 19 | full_frames = [] 20 | while 1: 21 | still_reading, frame = video_stream.read() 22 | if not still_reading: 23 | video_stream.release() 24 | break 25 | break 26 | full_img = frame 27 | frame_h = full_img.shape[0] 28 | frame_w = full_img.shape[1] 29 | 30 | video_stream = cv2.VideoCapture(video_path) 31 | fps = video_stream.get(cv2.CAP_PROP_FPS) 32 | crop_frames = [] 33 | while 1: 34 | still_reading, frame = video_stream.read() 35 | if not still_reading: 36 | video_stream.release() 37 | break 38 | crop_frames.append(frame) 39 | 40 | if len(crop_info) != 3: 41 | print("you didn't crop the image") 42 | return 43 | else: 44 | r_w, r_h = crop_info[0] 45 | clx, cly, crx, cry = crop_info[1] 46 | lx, ly, rx, ry = crop_info[2] 47 | lx, ly, rx, ry = int(lx), int(ly), int(rx), int(ry) 48 | # oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx 49 | # oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx 50 | 51 | if extended_crop: 52 | oy1, oy2, ox1, ox2 = cly, cry, clx, crx 53 | else: 54 | oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx 55 | 56 | tmp_path = str(uuid.uuid4())+'.mp4' 57 | out_tmp = cv2.VideoWriter(tmp_path, cv2.VideoWriter_fourcc(*'MP4V'), fps, (frame_w, frame_h)) 58 | for crop_frame in tqdm(crop_frames, 'seamlessClone:'): 59 | p = cv2.resize(crop_frame.astype(np.uint8), (ox2-ox1, oy2 - oy1)) 60 | 61 | mask = 255*np.ones(p.shape, p.dtype) 62 | location = ((ox1+ox2) // 2, (oy1+oy2) // 2) 63 | gen_img = cv2.seamlessClone(p, full_img, mask, location, cv2.NORMAL_CLONE) 64 | out_tmp.write(gen_img) 65 | 66 | out_tmp.release() 67 | 68 | save_video_with_watermark(tmp_path, new_audio_path, full_video_path, watermark=False) 69 | os.remove(tmp_path) 70 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/pooled_mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .util import timestep_embedding 5 | 6 | 7 | class 
PooledMLP(nn.Module): 8 | def __init__( 9 | self, 10 | device: torch.device, 11 | *, 12 | input_channels: int = 3, 13 | output_channels: int = 6, 14 | hidden_size: int = 256, 15 | resblocks: int = 4, 16 | pool_op: str = "max", 17 | ): 18 | super().__init__() 19 | self.input_embed = nn.Conv1d(input_channels, hidden_size, kernel_size=1, device=device) 20 | self.time_embed = nn.Linear(hidden_size, hidden_size, device=device) 21 | 22 | blocks = [] 23 | for _ in range(resblocks): 24 | blocks.append(ResBlock(hidden_size, pool_op, device=device)) 25 | self.sequence = nn.Sequential(*blocks) 26 | 27 | self.out = nn.Conv1d(hidden_size, output_channels, kernel_size=1, device=device) 28 | with torch.no_grad(): 29 | self.out.bias.zero_() 30 | self.out.weight.zero_() 31 | 32 | def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor: 33 | in_embed = self.input_embed(x) 34 | t_embed = self.time_embed(timestep_embedding(t, in_embed.shape[1])) 35 | h = in_embed + t_embed[..., None] 36 | h = self.sequence(h) 37 | h = self.out(h) 38 | return h 39 | 40 | 41 | class ResBlock(nn.Module): 42 | def __init__(self, hidden_size: int, pool_op: str, device: torch.device): 43 | super().__init__() 44 | assert pool_op in ["mean", "max"] 45 | self.pool_op = pool_op 46 | self.body = nn.Sequential( 47 | nn.SiLU(), 48 | nn.LayerNorm((hidden_size,), device=device), 49 | nn.Linear(hidden_size, hidden_size, device=device), 50 | nn.SiLU(), 51 | nn.LayerNorm((hidden_size,), device=device), 52 | nn.Linear(hidden_size, hidden_size, device=device), 53 | ) 54 | self.gate = nn.Sequential( 55 | nn.Linear(hidden_size, hidden_size, device=device), 56 | nn.Tanh(), 57 | ) 58 | 59 | def forward(self, x: torch.Tensor): 60 | N, C, T = x.shape 61 | out = self.body(x.permute(0, 2, 1).reshape(N * T, C)).reshape([N, T, C]).permute(0, 2, 1) 62 | pooled = pool(self.pool_op, x) 63 | gate = self.gate(pooled) 64 | return x + out * gate[..., None] 65 | 66 | 67 | def pool(op_name: str, x: torch.Tensor) -> torch.Tensor: 68 | if op_name == "max": 69 | pooled, _ = torch.max(x, dim=-1) 70 | elif op_name == "mean": 71 | pooled = torch.mean(x, dim=-1) 72 | else: 73 | raise ValueError(f"unknown pool op: {op_name}") 74 | return pooled 75 | -------------------------------------------------------------------------------- /video_utils.py: -------------------------------------------------------------------------------- 1 | import imageio 2 | import torch 3 | import numpy as np 4 | import decord 5 | import torchvision 6 | from einops import rearrange 7 | from torchvision.transforms import Resize, InterpolationMode 8 | 9 | from utils import get_new_video_name 10 | 11 | 12 | def prepare_video( 13 | video_path: str, 14 | resolution: int, 15 | device, 16 | dtype=torch.float16, 17 | normalize=True, 18 | start_t: float = 0, 19 | end_t: float = -1, 20 | output_fps: int = -1, 21 | ): 22 | vr = decord.VideoReader(video_path) 23 | initial_fps = vr.get_avg_fps() 24 | if output_fps == -1: 25 | output_fps = int(initial_fps) 26 | if end_t == -1: 27 | end_t = len(vr) / initial_fps 28 | else: 29 | end_t = min(len(vr) / initial_fps, end_t) 30 | assert 0 <= start_t < end_t 31 | assert output_fps > 0 32 | start_f_ind = int(start_t * initial_fps) 33 | end_f_ind = int(end_t * initial_fps) 34 | num_f = int((end_t - start_t) * output_fps) 35 | sample_idx = np.linspace(start_f_ind, end_f_ind, num_f, endpoint=False).astype(int) 36 | video = vr.get_batch(sample_idx) 37 | if torch.is_tensor(video): 38 | video = video.detach().cpu().numpy() 39 | else: 40 | video = video.asnumpy()
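        # At this point `video` is an [F, H, W, C] array of the sampled frames.
        # The code below rearranges it to [F, C, H, W], moves it to `device`/`dtype`,
        # resizes it so the longer side matches `resolution` (with both sides rounded
        # to a multiple of 64), and, when `normalize=True`, maps pixel values to [-1, 1].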
41 | _, h, w, _ = video.shape 42 | video = rearrange(video, "f h w c -> f c h w") 43 | video = torch.Tensor(video).to(device).to(dtype) 44 | 45 | # Use max if you want the larger side to be equal to resolution (e.g. 512) 46 | # k = float(resolution) / min(h, w) 47 | k = float(resolution) / max(h, w) 48 | h *= k 49 | w *= k 50 | h = int(np.round(h / 64.0)) * 64 51 | w = int(np.round(w / 64.0)) * 64 52 | 53 | video = Resize((h, w), interpolation=InterpolationMode.BILINEAR, antialias=True)( 54 | video 55 | ) 56 | if normalize: 57 | video = video / 127.5 - 1.0 58 | return video, output_fps # video: f c h w 59 | 60 | 61 | def create_video(frames, fps, path, rescale=False): 62 | # frames: f h w c 63 | outputs = [] 64 | for _, x in enumerate(frames): 65 | x = torchvision.utils.make_grid(torch.Tensor(x), nrow=4) 66 | if rescale: 67 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 68 | x = (x * 255).numpy().astype(np.uint8) 69 | outputs.append(x) 70 | 71 | imageio.mimsave(path, outputs, fps=fps) 72 | return path 73 | 74 | 75 | def preprocess_video(video_path, out_path=None): 76 | if out_path is None: 77 | out_path = get_new_video_name(video_path, func_name="preprocessed") 78 | 79 | video, fps = prepare_video(video_path, resolution=512, device="cpu") 80 | video = rearrange(video, "f c h w -> f h w c") 81 | create_video(video, fps, out_path, rescale=True) 82 | print(f"Preprocessed video saved to {out_path}") 83 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | class ConvNormRelu(nn.Module): 6 | def __init__(self, conv_type='1d', in_channels=3, out_channels=64, downsample=False, 7 | kernel_size=None, stride=None, padding=None, norm='BN', leaky=False): 8 | super().__init__() 9 | if kernel_size is None: 10 | if downsample: 11 | kernel_size, stride, padding = 4, 2, 1 12 | else: 13 | kernel_size, stride, padding = 3, 1, 1 14 | 15 | if conv_type == '2d': 16 | self.conv = nn.Conv2d( 17 | in_channels, 18 | out_channels, 19 | kernel_size, 20 | stride, 21 | padding, 22 | bias=False, 23 | ) 24 | if norm == 'BN': 25 | self.norm = nn.BatchNorm2d(out_channels) 26 | elif norm == 'IN': 27 | self.norm = nn.InstanceNorm2d(out_channels) 28 | else: 29 | raise NotImplementedError 30 | elif conv_type == '1d': 31 | self.conv = nn.Conv1d( 32 | in_channels, 33 | out_channels, 34 | kernel_size, 35 | stride, 36 | padding, 37 | bias=False, 38 | ) 39 | if norm == 'BN': 40 | self.norm = nn.BatchNorm1d(out_channels) 41 | elif norm == 'IN': 42 | self.norm = nn.InstanceNorm1d(out_channels) 43 | else: 44 | raise NotImplementedError 45 | nn.init.kaiming_normal_(self.conv.weight) 46 | 47 | self.act = nn.LeakyReLU(negative_slope=0.2, inplace=False) if leaky else nn.ReLU(inplace=True) 48 | 49 | def forward(self, x): 50 | x = self.conv(x) 51 | if isinstance(self.norm, nn.InstanceNorm1d): 52 | x = self.norm(x.permute((0, 2, 1))).permute((0, 2, 1)) # normalize on [C] 53 | else: 54 | x = self.norm(x) 55 | x = self.act(x) 56 | return x 57 | 58 | 59 | class PoseSequenceDiscriminator(nn.Module): 60 | def __init__(self, cfg): 61 | super().__init__() 62 | self.cfg = cfg 63 | leaky = self.cfg.MODEL.DISCRIMINATOR.LEAKY_RELU 64 | 65 | self.seq = nn.Sequential( 66 | ConvNormRelu('1d', cfg.MODEL.DISCRIMINATOR.INPUT_CHANNELS, 256, downsample=True, leaky=leaky), # B, 256, 64 67 | ConvNormRelu('1d', 256, 512, 
downsample=True, leaky=leaky), # B, 512, 32 68 | ConvNormRelu('1d', 512, 1024, kernel_size=3, stride=1, padding=1, leaky=leaky), # B, 1024, 16 69 | nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=1, bias=True) # B, 1, 16 70 | ) 71 | 72 | def forward(self, x): 73 | x = x.reshape(x.size(0), x.size(1), -1).transpose(1, 2) 74 | x = self.seq(x) 75 | x = x.squeeze(1) 76 | return x -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/audio_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | class Conv2d(nn.Module): 6 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.conv_block = nn.Sequential( 9 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 10 | nn.BatchNorm2d(cout) 11 | ) 12 | self.act = nn.ReLU() 13 | self.residual = residual 14 | 15 | def forward(self, x): 16 | out = self.conv_block(x) 17 | if self.residual: 18 | out += x 19 | return self.act(out) 20 | 21 | class AudioEncoder(nn.Module): 22 | def __init__(self, wav2lip_checkpoint, device): 23 | super(AudioEncoder, self).__init__() 24 | 25 | self.audio_encoder = nn.Sequential( 26 | Conv2d(1, 32, kernel_size=3, stride=1, padding=1), 27 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 28 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 29 | 30 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 31 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 32 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 33 | 34 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 35 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 36 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 37 | 38 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 39 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 40 | 41 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 42 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 43 | 44 | #### load the pre-trained audio_encoder, we do not need to load wav2lip model here. 45 | # wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=torch.device(device))['state_dict'] 46 | # state_dict = self.audio_encoder.state_dict() 47 | 48 | # for k,v in wav2lip_state_dict.items(): 49 | # if 'audio_encoder' in k: 50 | # state_dict[k.replace('module.audio_encoder.', '')] = v 51 | # self.audio_encoder.load_state_dict(state_dict) 52 | 53 | 54 | def forward(self, audio_sequences): 55 | # audio_sequences = (B, T, 1, 80, 16) 56 | B = audio_sequences.size(0) 57 | 58 | audio_sequences = torch.cat([audio_sequences[:, i] for i in range(audio_sequences.size(1))], dim=0) 59 | 60 | audio_embedding = self.audio_encoder(audio_sequences) # B, 512, 1, 1 61 | dim = audio_embedding.shape[1] 62 | audio_embedding = audio_embedding.reshape((B, -1, dim, 1, 1)) 63 | 64 | return audio_embedding.squeeze(-1).squeeze(-1) #B seq_len+1 512 65 | -------------------------------------------------------------------------------- /modules/annotator/midas/midas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 
2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/util/notebooks.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | from typing import Union 4 | 5 | import ipywidgets as widgets 6 | import numpy as np 7 | import torch 8 | from PIL import Image 9 | 10 | from ..models.nn.camera import DifferentiableCameraBatch, DifferentiableProjectiveCamera 11 | from ..models.transmitter.base import Transmitter, VectorDecoder 12 | from ..rendering.torch_mesh import TorchMesh 13 | from ..util.collections import AttrDict 14 | 15 | 16 | def create_pan_cameras(size: int, device: torch.device) -> DifferentiableCameraBatch: 17 | origins = [] 18 | xs = [] 19 | ys = [] 20 | zs = [] 21 | for theta in np.linspace(0, 2 * np.pi, num=20): 22 | z = np.array([np.sin(theta), np.cos(theta), -0.5]) 23 | z /= np.sqrt(np.sum(z**2)) 24 | origin = -z * 4 25 | x = np.array([np.cos(theta), 
-np.sin(theta), 0.0]) 26 | y = np.cross(z, x) 27 | origins.append(origin) 28 | xs.append(x) 29 | ys.append(y) 30 | zs.append(z) 31 | return DifferentiableCameraBatch( 32 | shape=(1, len(xs)), 33 | flat_camera=DifferentiableProjectiveCamera( 34 | origin=torch.from_numpy(np.stack(origins, axis=0)).float().to(device), 35 | x=torch.from_numpy(np.stack(xs, axis=0)).float().to(device), 36 | y=torch.from_numpy(np.stack(ys, axis=0)).float().to(device), 37 | z=torch.from_numpy(np.stack(zs, axis=0)).float().to(device), 38 | width=size, 39 | height=size, 40 | x_fov=0.7, 41 | y_fov=0.7, 42 | ), 43 | ) 44 | 45 | 46 | @torch.no_grad() 47 | def decode_latent_images( 48 | xm: Union[Transmitter, VectorDecoder], 49 | latent: torch.Tensor, 50 | cameras: DifferentiableCameraBatch, 51 | rendering_mode: str = "stf", 52 | ): 53 | decoded = xm.renderer.render_views( 54 | AttrDict(cameras=cameras), 55 | params=(xm.encoder if isinstance(xm, Transmitter) else xm).bottleneck_to_params( 56 | latent[None] 57 | ), 58 | options=AttrDict(rendering_mode=rendering_mode, render_with_direction=False), 59 | ) 60 | arr = decoded.channels.clamp(0, 255).to(torch.uint8)[0].cpu().numpy() 61 | return [Image.fromarray(x) for x in arr] 62 | 63 | 64 | @torch.no_grad() 65 | def decode_latent_mesh( 66 | xm: Union[Transmitter, VectorDecoder], 67 | latent: torch.Tensor, 68 | ) -> TorchMesh: 69 | decoded = xm.renderer.render_views( 70 | AttrDict(cameras=create_pan_cameras(2, latent.device)), # lowest resolution possible 71 | params=(xm.encoder if isinstance(xm, Transmitter) else xm).bottleneck_to_params( 72 | latent[None] 73 | ), 74 | options=AttrDict(rendering_mode="stf", render_with_direction=False), 75 | ) 76 | return decoded.raw_meshes[0] 77 | 78 | 79 | def gif_widget(images): 80 | writer = io.BytesIO() 81 | images[0].save( 82 | writer, format="GIF", save_all=True, append_images=images[1:], duration=100, loop=0 83 | ) 84 | writer.seek(0) 85 | data = base64.b64encode(writer.read()).decode("ascii") 86 | return widgets.HTML(f'') 87 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Iterable, Optional 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from ...rendering.mesh import * 8 | 9 | from ._utils import cross_product, normalize 10 | 11 | 12 | @dataclass 13 | class Rays: 14 | """ 15 | A ray in ray casting. 16 | """ 17 | 18 | origins: torch.Tensor # [N x 3] float tensor 19 | directions: torch.Tensor # [N x 3] float tensor 20 | 21 | def normalized_directions(self) -> torch.Tensor: 22 | return normalize(self.directions) 23 | 24 | 25 | @dataclass 26 | class RayCollisions: 27 | """ 28 | The result of casting N rays onto a mesh. 
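    Every field is indexed per ray: `collides` marks whether a ray hit the mesh,
    `ray_dists` is the distance along the ray, `tri_indices` and `barycentric`
    locate the hit point on a triangle, and `normals` is the normal of that triangle.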
29 | """ 30 | 31 | collides: torch.Tensor # [N] boolean tensor 32 | ray_dists: torch.Tensor # [N] float tensor 33 | tri_indices: torch.Tensor # [N] long tensor 34 | barycentric: torch.Tensor # [N x 3] float tensor 35 | normals: torch.Tensor # [N x 3] float tensor 36 | 37 | @classmethod 38 | def collect(cls, it: Iterable["RayCollisions"]) -> "RayCollisions": 39 | res = None 40 | for x in it: 41 | if res is None: 42 | res = x 43 | else: 44 | res = cls( 45 | collides=torch.cat([res.collides, x.collides]), 46 | ray_dists=torch.cat([res.ray_dists, x.ray_dists]), 47 | tri_indices=torch.cat([res.tri_indices, x.tri_indices]), 48 | barycentric=torch.cat([res.barycentric, x.barycentric]), 49 | normals=torch.cat([res.normals, x.normals]), 50 | ) 51 | if res is None: 52 | raise ValueError("cannot collect an empty iterable of RayCollisions") 53 | return res 54 | 55 | 56 | @dataclass 57 | class TriMesh: 58 | faces: torch.Tensor # [N x 3] long tensor 59 | vertices: torch.Tensor # [N x 3] float tensor 60 | 61 | vertex_colors: Optional[torch.Tensor] = None 62 | 63 | def normals(self) -> torch.Tensor: 64 | """ 65 | Returns an [N x 3] batch of normal vectors per triangle assuming the 66 | right-hand rule. 67 | """ 68 | tris = self.vertices[self.faces] 69 | v1 = tris[:, 1] - tris[:, 0] 70 | v2 = tris[:, 2] - tris[:, 0] 71 | return normalize(cross_product(v1, v2)) 72 | 73 | @classmethod 74 | def from_numpy(cls, x: TriMesh) -> "TriMesh": 75 | vertex_colors = None 76 | if all(ch in x.vertex_channels for ch in "RGB"): 77 | vertex_colors = torch.from_numpy( 78 | np.stack([x.vertex_channels[ch] for ch in "RGB"], axis=-1) 79 | ) 80 | return cls( 81 | faces=torch.from_numpy(x.faces), 82 | vertices=torch.from_numpy(x.verts), 83 | vertex_colors=vertex_colors, 84 | ) 85 | 86 | def to(self, *args, **kwargs) -> "TriMesh": 87 | return TriMesh( 88 | faces=self.faces.to(*args, **kwargs), 89 | vertices=self.vertices.to(*args, **kwargs), 90 | vertex_colors=None 91 | if self.vertex_colors is None 92 | else self.vertex_colors.to(*args, **kwargs), 93 | ) 94 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/diffusion/sample.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Optional 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from .gaussian_diffusion import GaussianDiffusion 7 | from .k_diffusion import karras_sample 8 | 9 | DEFAULT_KARRAS_STEPS = 64 10 | DEFAULT_KARRAS_SIGMA_MIN = 1e-3 11 | DEFAULT_KARRAS_SIGMA_MAX = 160 12 | DEFAULT_KARRAS_S_CHURN = 0.0 13 | 14 | 15 | def uncond_guide_model( 16 | model: Callable[..., torch.Tensor], scale: float 17 | ) -> Callable[..., torch.Tensor]: 18 | def model_fn(x_t, ts, **kwargs): 19 | half = x_t[: len(x_t) // 2] 20 | combined = torch.cat([half, half], dim=0) 21 | model_out = model(combined, ts, **kwargs) 22 | eps, rest = model_out[:, :3], model_out[:, 3:] 23 | cond_eps, uncond_eps = torch.chunk(eps, 2, dim=0) 24 | half_eps = uncond_eps + scale * (cond_eps - uncond_eps) 25 | eps = torch.cat([half_eps, half_eps], dim=0) 26 | return torch.cat([eps, rest], dim=1) 27 | 28 | return model_fn 29 | 30 | 31 | def sample_latents( 32 | *, 33 | batch_size: int, 34 | model: nn.Module, 35 | diffusion: GaussianDiffusion, 36 | model_kwargs: Dict[str, Any], 37 | guidance_scale: float, 38 | clip_denoised: bool, 39 | use_fp16: bool, 40 | use_karras: bool, 41 | karras_steps: int, 42 | sigma_min: float, 43 | sigma_max: float, 44 | s_churn: float, 45 | device: 
Optional[torch.device] = None, 46 | progress: bool = False, 47 | ) -> torch.Tensor: 48 | sample_shape = (batch_size, model.d_latent) 49 | 50 | if device is None: 51 | device = next(model.parameters()).device 52 | 53 | if hasattr(model, "cached_model_kwargs"): 54 | model_kwargs = model.cached_model_kwargs(batch_size, model_kwargs) 55 | if guidance_scale != 1.0 and guidance_scale != 0.0: 56 | for k, v in model_kwargs.copy().items(): 57 | model_kwargs[k] = torch.cat([v, torch.zeros_like(v)], dim=0) 58 | 59 | sample_shape = (batch_size, model.d_latent) 60 | with torch.autocast(device_type=device.type, enabled=use_fp16): 61 | if use_karras: 62 | samples = karras_sample( 63 | diffusion=diffusion, 64 | model=model, 65 | shape=sample_shape, 66 | steps=karras_steps, 67 | clip_denoised=clip_denoised, 68 | model_kwargs=model_kwargs, 69 | device=device, 70 | sigma_min=sigma_min, 71 | sigma_max=sigma_max, 72 | s_churn=s_churn, 73 | guidance_scale=guidance_scale, 74 | progress=progress, 75 | ) 76 | else: 77 | internal_batch_size = batch_size 78 | if guidance_scale != 1.0: 79 | model = uncond_guide_model(model, guidance_scale) 80 | internal_batch_size *= 2 81 | samples = diffusion.p_sample_loop( 82 | model, 83 | shape=(internal_batch_size, *sample_shape[1:]), 84 | model_kwargs=model_kwargs, 85 | device=device, 86 | clip_denoised=clip_denoised, 87 | progress=progress, 88 | ) 89 | 90 | return samples 91 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/modules/discriminator.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | from facerender.modules.util import kp2gaussian 4 | import torch 5 | 6 | 7 | class DownBlock2d(nn.Module): 8 | """ 9 | Simple block for processing video (encoder). 
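    Applies a 4x4 convolution (optionally spectral-normalized), optional instance
    normalization, LeakyReLU(0.2), and an optional 2x2 average pooling step.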
10 | """ 11 | 12 | def __init__(self, in_features, out_features, norm=False, kernel_size=4, pool=False, sn=False): 13 | super(DownBlock2d, self).__init__() 14 | self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size) 15 | 16 | if sn: 17 | self.conv = nn.utils.spectral_norm(self.conv) 18 | 19 | if norm: 20 | self.norm = nn.InstanceNorm2d(out_features, affine=True) 21 | else: 22 | self.norm = None 23 | self.pool = pool 24 | 25 | def forward(self, x): 26 | out = x 27 | out = self.conv(out) 28 | if self.norm: 29 | out = self.norm(out) 30 | out = F.leaky_relu(out, 0.2) 31 | if self.pool: 32 | out = F.avg_pool2d(out, (2, 2)) 33 | return out 34 | 35 | 36 | class Discriminator(nn.Module): 37 | """ 38 | Discriminator similar to Pix2Pix 39 | """ 40 | 41 | def __init__(self, num_channels=3, block_expansion=64, num_blocks=4, max_features=512, 42 | sn=False, **kwargs): 43 | super(Discriminator, self).__init__() 44 | 45 | down_blocks = [] 46 | for i in range(num_blocks): 47 | down_blocks.append( 48 | DownBlock2d(num_channels if i == 0 else min(max_features, block_expansion * (2 ** i)), 49 | min(max_features, block_expansion * (2 ** (i + 1))), 50 | norm=(i != 0), kernel_size=4, pool=(i != num_blocks - 1), sn=sn)) 51 | 52 | self.down_blocks = nn.ModuleList(down_blocks) 53 | self.conv = nn.Conv2d(self.down_blocks[-1].conv.out_channels, out_channels=1, kernel_size=1) 54 | if sn: 55 | self.conv = nn.utils.spectral_norm(self.conv) 56 | 57 | def forward(self, x): 58 | feature_maps = [] 59 | out = x 60 | 61 | for down_block in self.down_blocks: 62 | feature_maps.append(down_block(out)) 63 | out = feature_maps[-1] 64 | prediction_map = self.conv(out) 65 | 66 | return feature_maps, prediction_map 67 | 68 | 69 | class MultiScaleDiscriminator(nn.Module): 70 | """ 71 | Multi-scale (scale) discriminator 72 | """ 73 | 74 | def __init__(self, scales=(), **kwargs): 75 | super(MultiScaleDiscriminator, self).__init__() 76 | self.scales = scales 77 | discs = {} 78 | for scale in scales: 79 | discs[str(scale).replace('.', '-')] = Discriminator(**kwargs) 80 | self.discs = nn.ModuleDict(discs) 81 | 82 | def forward(self, x): 83 | out_dict = {} 84 | for scale, disc in self.discs.items(): 85 | scale = str(scale).replace('-', '.') 86 | key = 'prediction_' + scale 87 | feature_maps, prediction_map = disc(x[key]) 88 | out_dict['feature_maps_' + scale] = feature_maps 89 | out_dict['prediction_map_' + scale] = prediction_map 90 | return out_dict 91 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2exp_models/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | class Conv2d(nn.Module): 6 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, use_act = True, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.conv_block = nn.Sequential( 9 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 10 | nn.BatchNorm2d(cout) 11 | ) 12 | self.act = nn.ReLU() 13 | self.residual = residual 14 | self.use_act = use_act 15 | 16 | def forward(self, x): 17 | out = self.conv_block(x) 18 | if self.residual: 19 | out += x 20 | 21 | if self.use_act: 22 | return self.act(out) 23 | else: 24 | return out 25 | 26 | class SimpleWrapperV2(nn.Module): 27 | def __init__(self) -> None: 28 | super().__init__() 29 | self.audio_encoder = nn.Sequential( 30 | Conv2d(1, 32, kernel_size=3, stride=1, 
padding=1), 31 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 32 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 33 | 34 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 35 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 36 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 37 | 38 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 39 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 40 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 41 | 42 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 43 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 44 | 45 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 46 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0), 47 | ) 48 | 49 | #### load the pre-trained audio_encoder 50 | #self.audio_encoder = self.audio_encoder.to(device) 51 | ''' 52 | wav2lip_state_dict = torch.load('/apdcephfs_cq2/share_1290939/wenxuazhang/checkpoints/wav2lip.pth')['state_dict'] 53 | state_dict = self.audio_encoder.state_dict() 54 | 55 | for k,v in wav2lip_state_dict.items(): 56 | if 'audio_encoder' in k: 57 | print('init:', k) 58 | state_dict[k.replace('module.audio_encoder.', '')] = v 59 | self.audio_encoder.load_state_dict(state_dict) 60 | ''' 61 | 62 | self.mapping1 = nn.Linear(512+64+1, 64) 63 | #self.mapping2 = nn.Linear(30, 64) 64 | #nn.init.constant_(self.mapping1.weight, 0.) 65 | nn.init.constant_(self.mapping1.bias, 0.) 66 | 67 | def forward(self, x, ref, ratio): 68 | x = self.audio_encoder(x).view(x.size(0), -1) 69 | ref_reshape = ref.reshape(x.size(0), -1) 70 | ratio = ratio.reshape(x.size(0), -1) 71 | 72 | y = self.mapping1(torch.cat([x, ref_reshape, ratio], dim=1)) 73 | out = y.reshape(ref.shape[0], ref.shape[1], -1) #+ ref # resudial 74 | return out 75 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/models/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules related to objective functions, optimizations, and network architectures. 2 | 3 | To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. 4 | You need to implement the following five functions: 5 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). 6 | -- : unpack data from dataset and apply preprocessing. 7 | -- : produce intermediate results. 8 | -- : calculate loss, gradients, and update network weights. 9 | -- : (optionally) add model-specific options and set default options. 10 | 11 | In the function <__init__>, you need to define four lists: 12 | -- self.loss_names (str list): specify the training losses that you want to plot and save. 13 | -- self.model_names (str list): define networks used in our training. 14 | -- self.visual_names (str list): specify the images that you want to display and save. 15 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage. 16 | 17 | Now you can use the model class by specifying flag '--model dummy'. 18 | See our template model class 'template_model.py' for more details. 
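
For illustration only, a minimal skeleton following the conventions above might look
like the following (the class body and the exact signature of modify_commandline_options
are illustrative assumptions, not part of this package):

    class DummyModel(BaseModel):
        @staticmethod
        def modify_commandline_options(parser, is_train=True):
            # add model-specific options here (optional)
            return parser

        def __init__(self, opt):
            BaseModel.__init__(self, opt)
            # the four required lists
            self.loss_names = []
            self.model_names = []
            self.visual_names = []
            self.optimizers = []

        def set_input(self, input):
            # unpack data from the dataset and apply preprocessing
            self.input = input

        def forward(self):
            # produce intermediate results
            pass

        def optimize_parameters(self):
            # calculate losses, gradients, and update network weights
            pass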
19 | """ 20 | 21 | import importlib 22 | from .base_model import BaseModel 23 | 24 | 25 | def find_model_using_name(model_name): 26 | """Import the module "models/[model_name]_model.py". 27 | 28 | In the file, the class called DatasetNameModel() will 29 | be instantiated. It has to be a subclass of BaseModel, 30 | and it is case-insensitive. 31 | """ 32 | model_filename = "face3d.models." + model_name + "_model" 33 | modellib = importlib.import_module(model_filename) 34 | model = None 35 | target_model_name = model_name.replace('_', '') + 'model' 36 | for name, cls in modellib.__dict__.items(): 37 | if name.lower() == target_model_name.lower() \ 38 | and issubclass(cls, BaseModel): 39 | model = cls 40 | 41 | if model is None: 42 | print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) 43 | exit(0) 44 | 45 | return model 46 | 47 | 48 | def get_option_setter(model_name): 49 | """Return the static method of the model class.""" 50 | model_class = find_model_using_name(model_name) 51 | return model_class.modify_commandline_options 52 | 53 | 54 | def create_model(opt): 55 | """Create a model given the option. 56 | 57 | This function warps the class CustomDatasetDataLoader. 58 | This is the main interface between this package and 'train.py'/'test.py' 59 | 60 | Example: 61 | >>> from models import create_model 62 | >>> model = create_model(opt) 63 | """ 64 | model = find_model_using_name(opt.model) 65 | instance = model(opt) 66 | print("model [%s] was created" % type(instance).__name__) 67 | return instance 68 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/view_data.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import zipfile 4 | from typing import BinaryIO, List, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | 9 | from ...rendering.view_data import Camera, ProjectiveCamera, ViewData 10 | 11 | 12 | class BlenderViewData(ViewData): 13 | """ 14 | Interact with a dataset zipfile exported by view_data.py. 15 | """ 16 | 17 | def __init__(self, f_obj: BinaryIO): 18 | self.zipfile = zipfile.ZipFile(f_obj, mode="r") 19 | self.infos = [] 20 | with self.zipfile.open("info.json", "r") as f: 21 | self.info = json.load(f) 22 | self.channels = list(self.info.get("channels", "RGBAD")) 23 | assert set("RGBA").issubset( 24 | set(self.channels) 25 | ), "The blender output should at least have RGBA images." 26 | names = set(x.filename for x in self.zipfile.infolist()) 27 | for i in itertools.count(): 28 | name = f"{i:05}.json" 29 | if name not in names: 30 | break 31 | with self.zipfile.open(name, "r") as f: 32 | self.infos.append(json.load(f)) 33 | 34 | @property 35 | def num_views(self) -> int: 36 | return len(self.infos) 37 | 38 | @property 39 | def channel_names(self) -> List[str]: 40 | return list(self.channels) 41 | 42 | def load_view(self, index: int, channels: List[str]) -> Tuple[Camera, np.ndarray]: 43 | for ch in channels: 44 | if ch not in self.channel_names: 45 | raise ValueError(f"unsupported channel: {ch}") 46 | 47 | # Gather (a superset of) the requested channels. 
48 | channel_map = {} 49 | if any(x in channels for x in "RGBA"): 50 | with self.zipfile.open(f"{index:05}.png", "r") as f: 51 | rgba = np.array(Image.open(f)).astype(np.float32) / 255.0 52 | channel_map.update(zip("RGBA", rgba.transpose([2, 0, 1]))) 53 | if "D" in channels: 54 | with self.zipfile.open(f"{index:05}_depth.png", "r") as f: 55 | # Decode a 16-bit fixed-point number. 56 | fp = np.array(Image.open(f)) 57 | inf_dist = fp == 0xFFFF 58 | channel_map["D"] = np.where( 59 | inf_dist, 60 | np.inf, 61 | self.infos[index]["max_depth"] * (fp.astype(np.float32) / 65536), 62 | ) 63 | if "MatAlpha" in channels: 64 | with self.zipfile.open(f"{index:05}_MatAlpha.png", "r") as f: 65 | channel_map["MatAlpha"] = np.array(Image.open(f)).astype(np.float32) / 65536 66 | 67 | # The order of channels is user-specified. 68 | combined = np.stack([channel_map[k] for k in channels], axis=-1) 69 | 70 | h, w, _ = combined.shape 71 | return self.camera(index, w, h), combined 72 | 73 | def camera(self, index: int, width: int, height: int) -> ProjectiveCamera: 74 | info = self.infos[index] 75 | return ProjectiveCamera( 76 | origin=np.array(info["origin"], dtype=np.float32), 77 | x=np.array(info["x"], dtype=np.float32), 78 | y=np.array(info["y"], dtype=np.float32), 79 | z=np.array(info["z"], dtype=np.float32), 80 | width=width, 81 | height=height, 82 | x_fov=info["x_fov"], 83 | y_fov=info["y_fov"], 84 | ) 85 | -------------------------------------------------------------------------------- /modules/text2video_zero/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .model import ( 4 | CannyText2VideoModel, 5 | PoseText2VideoModel, 6 | DepthText2VideoModel, 7 | VideoPix2PixModel, 8 | Text2VideoModel, 9 | ) 10 | 11 | from utils import generate_video_name_mp4, get_new_video_name 12 | 13 | 14 | class CannyText2Video: 15 | def __init__(self, device): 16 | self.device = device 17 | self.model = CannyText2VideoModel(device, dtype=torch.float16) 18 | 19 | def inference(self, inputs: str, resolution=512): 20 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 21 | out_path = get_new_video_name(vid_path, func_name="canny2video") 22 | self.model.process_controlnet_canny( 23 | vid_path, 24 | prompt, 25 | save_path=out_path, 26 | resolution=resolution, 27 | ) 28 | return out_path 29 | 30 | 31 | class PoseText2Video: 32 | def __init__(self, device): 33 | self.device = device 34 | self.model = PoseText2VideoModel(device, dtype=torch.float16) 35 | 36 | def inference(self, inputs: str, resolution=512): 37 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 38 | out_path = get_new_video_name(vid_path, func_name="pose2video") 39 | self.model.process_controlnet_pose( 40 | vid_path, 41 | prompt, 42 | save_path=out_path, 43 | resolution=resolution, 44 | ) 45 | return out_path 46 | 47 | 48 | class DepthText2Video: 49 | def __init__(self, device): 50 | self.device = device 51 | self.model = DepthText2VideoModel(device, dtype=torch.float16) 52 | 53 | def inference(self, inputs: str, resolution=512): 54 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 55 | out_path = get_new_video_name(vid_path, func_name="depth2video") 56 | self.model.process_controlnet_depth( 57 | vid_path, 58 | prompt, 59 | save_path=out_path, 60 | resolution=resolution, 61 | ) 62 | return out_path 63 | 64 | 65 | class VideoPix2Pix: 66 | def __init__(self, device): 67 | self.device = device 68 | self.model = 
VideoPix2PixModel(device, dtype=torch.float16) 69 | 70 | def inference(self, inputs: str): 71 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 72 | out_path = get_new_video_name(vid_path, func_name="pix2pix") 73 | self.model.process_pix2pix( 74 | vid_path, 75 | prompt, 76 | save_path=out_path, 77 | ) 78 | return out_path 79 | 80 | 81 | class Text2Video: 82 | def __init__(self, device): 83 | self.device = device 84 | self.model = Text2VideoModel(device, dtype=torch.float16) 85 | 86 | def inference(self, inputs: str, resolution=512): 87 | prompt = inputs 88 | params = { 89 | "t0": 44, 90 | "t1": 47, 91 | "motion_field_strength_x": 12, 92 | "motion_field_strength_y": 12, 93 | "video_length": 16, 94 | } 95 | out_path, fps = generate_video_name_mp4(), 8 96 | self.model.process_text2video( 97 | prompt, 98 | fps=fps, 99 | path=out_path, 100 | resolution=resolution, 101 | **params, 102 | ) 103 | return out_path 104 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/sync_batchnorm/replicate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : replicate.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import functools 12 | 13 | from torch.nn.parallel.data_parallel import DataParallel 14 | 15 | __all__ = [ 16 | 'CallbackContext', 17 | 'execute_replication_callbacks', 18 | 'DataParallelWithCallback', 19 | 'patch_replication_callback' 20 | ] 21 | 22 | 23 | class CallbackContext(object): 24 | pass 25 | 26 | 27 | def execute_replication_callbacks(modules): 28 | """ 29 | Execute an replication callback `__data_parallel_replicate__` on each module created by original replication. 30 | 31 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 32 | 33 | Note that, as all modules are isomorphism, we assign each sub-module with a context 34 | (shared among multiple copies of this module on different devices). 35 | Through this context, different copies can share some information. 36 | 37 | We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback 38 | of any slave copies. 39 | """ 40 | master_copy = modules[0] 41 | nr_modules = len(list(master_copy.modules())) 42 | ctxs = [CallbackContext() for _ in range(nr_modules)] 43 | 44 | for i, module in enumerate(modules): 45 | for j, m in enumerate(module.modules()): 46 | if hasattr(m, '__data_parallel_replicate__'): 47 | m.__data_parallel_replicate__(ctxs[j], i) 48 | 49 | 50 | class DataParallelWithCallback(DataParallel): 51 | """ 52 | Data Parallel with a replication callback. 53 | 54 | An replication callback `__data_parallel_replicate__` of each module will be invoked after being created by 55 | original `replicate` function. 56 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 57 | 58 | Examples: 59 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 60 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 61 | # sync_bn.__data_parallel_replicate__ will be invoked. 
62 | """ 63 | 64 | def replicate(self, module, device_ids): 65 | modules = super(DataParallelWithCallback, self).replicate(module, device_ids) 66 | execute_replication_callbacks(modules) 67 | return modules 68 | 69 | 70 | def patch_replication_callback(data_parallel): 71 | """ 72 | Monkey-patch an existing `DataParallel` object. Add the replication callback. 73 | Useful when you have customized `DataParallel` implementation. 74 | 75 | Examples: 76 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 77 | > sync_bn = DataParallel(sync_bn, device_ids=[0, 1]) 78 | > patch_replication_callback(sync_bn) 79 | # this is equivalent to 80 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 81 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 82 | """ 83 | 84 | assert isinstance(data_parallel, DataParallel) 85 | 86 | old_replicate = data_parallel.replicate 87 | 88 | @functools.wraps(old_replicate) 89 | def new_replicate(module, device_ids): 90 | modules = old_replicate(module, device_ids) 91 | execute_replication_callbacks(modules) 92 | return modules 93 | 94 | data_parallel.replicate = new_replicate 95 | -------------------------------------------------------------------------------- /modules/annotator/midas/midas/dpt_depth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .base_model import BaseModel 6 | from .blocks import ( 7 | FeatureFusionBlock, 8 | FeatureFusionBlock_custom, 9 | Interpolate, 10 | _make_encoder, 11 | forward_vit, 12 | ) 13 | 14 | 15 | def _make_fusion_block(features, use_bn): 16 | return FeatureFusionBlock_custom( 17 | features, 18 | nn.ReLU(False), 19 | deconv=False, 20 | bn=use_bn, 21 | expand=False, 22 | align_corners=True, 23 | ) 24 | 25 | 26 | class DPT(BaseModel): 27 | def __init__( 28 | self, 29 | head, 30 | features=256, 31 | backbone="vitb_rn50_384", 32 | readout="project", 33 | channels_last=False, 34 | use_bn=False, 35 | ): 36 | 37 | super(DPT, self).__init__() 38 | 39 | self.channels_last = channels_last 40 | 41 | hooks = { 42 | "vitb_rn50_384": [0, 1, 8, 11], 43 | "vitb16_384": [2, 5, 8, 11], 44 | "vitl16_384": [5, 11, 17, 23], 45 | } 46 | 47 | # Instantiate backbone and reassemble blocks 48 | self.pretrained, self.scratch = _make_encoder( 49 | backbone, 50 | features, 51 | False, # Set to true of you want to train from scratch, uses ImageNet weights 52 | groups=1, 53 | expand=False, 54 | exportable=False, 55 | hooks=hooks[backbone], 56 | use_readout=readout, 57 | ) 58 | 59 | self.scratch.refinenet1 = _make_fusion_block(features, use_bn) 60 | self.scratch.refinenet2 = _make_fusion_block(features, use_bn) 61 | self.scratch.refinenet3 = _make_fusion_block(features, use_bn) 62 | self.scratch.refinenet4 = _make_fusion_block(features, use_bn) 63 | 64 | self.scratch.output_conv = head 65 | 66 | 67 | def forward(self, x): 68 | if self.channels_last == True: 69 | x.contiguous(memory_format=torch.channels_last) 70 | 71 | layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) 72 | 73 | layer_1_rn = self.scratch.layer1_rn(layer_1) 74 | layer_2_rn = self.scratch.layer2_rn(layer_2) 75 | layer_3_rn = self.scratch.layer3_rn(layer_3) 76 | layer_4_rn = self.scratch.layer4_rn(layer_4) 77 | 78 | path_4 = self.scratch.refinenet4(layer_4_rn) 79 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 80 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 81 | path_1 = self.scratch.refinenet1(path_2, 
layer_1_rn) 82 | 83 | out = self.scratch.output_conv(path_1) 84 | 85 | return out 86 | 87 | 88 | class DPTDepthModel(DPT): 89 | def __init__(self, path=None, non_negative=True, **kwargs): 90 | features = kwargs["features"] if "features" in kwargs else 256 91 | 92 | head = nn.Sequential( 93 | nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), 94 | Interpolate(scale_factor=2, mode="bilinear", align_corners=True), 95 | nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), 96 | nn.ReLU(True), 97 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 98 | nn.ReLU(True) if non_negative else nn.Identity(), 99 | nn.Identity(), 100 | ) 101 | 102 | super().__init__(head, **kwargs) 103 | 104 | if path is not None: 105 | self.load(path) 106 | 107 | def forward(self, x): 108 | return super().forward(x).squeeze(dim=1) 109 | 110 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/util/preprocess.py: -------------------------------------------------------------------------------- 1 | """This script contains the image preprocessing code for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import numpy as np 5 | from scipy.io import loadmat 6 | from PIL import Image 7 | import cv2 8 | import os 9 | from skimage import transform as trans 10 | import torch 11 | import warnings 12 | warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 13 | warnings.filterwarnings("ignore", category=FutureWarning) 14 | 15 | 16 | 17 | # calculating least square problem for image alignment 18 | def POS(xp, x): 19 | npts = xp.shape[1] 20 | 21 | A = np.zeros([2*npts, 8]) 22 | 23 | A[0:2*npts-1:2, 0:3] = x.transpose() 24 | A[0:2*npts-1:2, 3] = 1 25 | 26 | A[1:2*npts:2, 4:7] = x.transpose() 27 | A[1:2*npts:2, 7] = 1 28 | 29 | b = np.reshape(xp.transpose(), [2*npts, 1]) 30 | 31 | k, _, _, _ = np.linalg.lstsq(A, b) 32 | 33 | R1 = k[0:3] 34 | R2 = k[4:7] 35 | sTx = k[3] 36 | sTy = k[7] 37 | s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2 38 | t = np.stack([sTx, sTy], axis=0) 39 | 40 | return t, s 41 | 42 | # resize and crop images for face reconstruction 43 | def resize_n_crop_img(img, lm, t, s, target_size=224., mask=None): 44 | w0, h0 = img.size 45 | w = (w0*s).astype(np.int32) 46 | h = (h0*s).astype(np.int32) 47 | left = (w/2 - target_size/2 + float((t[0] - w0/2)*s)).astype(np.int32) 48 | right = left + target_size 49 | up = (h/2 - target_size/2 + float((h0/2 - t[1])*s)).astype(np.int32) 50 | below = up + target_size 51 | 52 | img = img.resize((w, h), resample=Image.BICUBIC) 53 | img = img.crop((left, up, right, below)) 54 | 55 | if mask is not None: 56 | mask = mask.resize((w, h), resample=Image.BICUBIC) 57 | mask = mask.crop((left, up, right, below)) 58 | 59 | lm = np.stack([lm[:, 0] - t[0] + w0/2, lm[:, 1] - 60 | t[1] + h0/2], axis=1)*s 61 | lm = lm - np.reshape( 62 | np.array([(w/2 - target_size/2), (h/2-target_size/2)]), [1, 2]) 63 | 64 | return img, lm, mask 65 | 66 | # utils for face reconstruction 67 | def extract_5p(lm): 68 | lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 69 | lm5p = np.stack([lm[lm_idx[0], :], np.mean(lm[lm_idx[[1, 2]], :], 0), np.mean( 70 | lm[lm_idx[[3, 4]], :], 0), lm[lm_idx[5], :], lm[lm_idx[6], :]], axis=0) 71 | lm5p = lm5p[[1, 2, 0, 3, 4], :] 72 | return lm5p 73 | 74 | # utils for face reconstruction 75 | def align_img(img, lm, lm3D, mask=None, target_size=224., rescale_factor=102.): 76 | """ 77 | Return: 78 | transparams --numpy.array (raw_W, raw_H, scale, tx, ty) 79 | img_new --PIL.Image 
(target_size, target_size, 3) 80 | lm_new --numpy.array (68, 2), y direction is opposite to v direction 81 | mask_new --PIL.Image (target_size, target_size) 82 | 83 | Parameters: 84 | img --PIL.Image (raw_H, raw_W, 3) 85 | lm --numpy.array (68, 2), y direction is opposite to v direction 86 | lm3D --numpy.array (5, 3) 87 | mask --PIL.Image (raw_H, raw_W, 3) 88 | """ 89 | 90 | w0, h0 = img.size 91 | if lm.shape[0] != 5: 92 | lm5p = extract_5p(lm) 93 | else: 94 | lm5p = lm 95 | 96 | # calculate translation and scale factors using 5 facial landmarks and standard landmarks of a 3D face 97 | t, s = POS(lm5p.transpose(), lm3D.transpose()) 98 | s = rescale_factor/s 99 | 100 | # processing the image 101 | img_new, lm_new, mask_new = resize_n_crop_img(img, lm, t, s, target_size=target_size, mask=mask) 102 | trans_params = np.array([w0, h0, s, t[0], t[1]]) 103 | 104 | return trans_params, img_new, lm_new, mask_new 105 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/init_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | from torch.hub import download_url_to_file 5 | 6 | 7 | def init_path(checkpoint_dir, config_dir, size=512, old_version=False, preprocess='crop'): 8 | 9 | print("checkpoint_dir is:", checkpoint_dir) 10 | 11 | if old_version: 12 | #### load all the checkpoint of `pth` 13 | sadtalker_paths = { 14 | 'wav2lip_checkpoint' : os.path.join(checkpoint_dir, 'wav2lip.pth'), 15 | 'audio2pose_checkpoint' : os.path.join(checkpoint_dir, 'auido2pose_00140-model.pth'), 16 | 'audio2exp_checkpoint' : os.path.join(checkpoint_dir, 'auido2exp_00300-model.pth'), 17 | 'free_view_checkpoint' : os.path.join(checkpoint_dir, 'facevid2vid_00189-model.pth.tar'), 18 | 'path_of_net_recon_model' : os.path.join(checkpoint_dir, 'epoch_20.pth') 19 | } 20 | 21 | use_safetensor = False 22 | 23 | elif len(glob.glob(os.path.join(checkpoint_dir, '*.safetensors'))): 24 | print('using safetensor as default') 25 | sadtalker_paths = { 26 | "checkpoint":os.path.join(checkpoint_dir, 'SadTalker_V0.0.2_'+str(size)+'.safetensors'), 27 | } 28 | use_safetensor = True 29 | 30 | else: 31 | print(f"Begin to download models to {checkpoint_dir}...") 32 | os.makedirs(checkpoint_dir, exist_ok=True) 33 | 34 | checkpoint_urls = ["https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/mapping_00109-model.pth.tar", 35 | "https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/mapping_00229-model.pth.tar", 36 | "https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/SadTalker_V0.0.2_256.safetensors", 37 | "https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/SadTalker_V0.0.2_512.safetensors"] 38 | 39 | for checkpoint_url in checkpoint_urls: 40 | download_url_to_file(checkpoint_url, checkpoint_dir, hash_prefix=None, progress=True) 41 | 42 | print('using safetensor as default') 43 | sadtalker_paths = { 44 | "checkpoint":os.path.join(checkpoint_dir, 'SadTalker_V0.0.2_'+str(size)+'.safetensors'), 45 | } 46 | 47 | use_safetensor = True 48 | # print("WARNING: The new version of the model will be updated by safetensor, you may need to download it mannully. 
We run the old version of the checkpoint this time!") 49 | # use_safetensor = False 50 | 51 | # sadtalker_paths = { 52 | # 'wav2lip_checkpoint' : os.path.join(checkpoint_dir, 'wav2lip.pth'), 53 | # 'audio2pose_checkpoint' : os.path.join(checkpoint_dir, 'auido2pose_00140-model.pth'), 54 | # 'audio2exp_checkpoint' : os.path.join(checkpoint_dir, 'auido2exp_00300-model.pth'), 55 | # 'free_view_checkpoint' : os.path.join(checkpoint_dir, 'facevid2vid_00189-model.pth.tar'), 56 | # 'path_of_net_recon_model' : os.path.join(checkpoint_dir, 'epoch_20.pth') 57 | # } 58 | 59 | sadtalker_paths['dir_of_BFM_fitting'] = os.path.join(config_dir) # , 'BFM_Fitting' 60 | sadtalker_paths['audio2pose_yaml_path'] = os.path.join(config_dir, 'auido2pose.yaml') 61 | sadtalker_paths['audio2exp_yaml_path'] = os.path.join(config_dir, 'auido2exp.yaml') 62 | sadtalker_paths['use_safetensor'] = use_safetensor # os.path.join(config_dir, 'auido2exp.yaml') 63 | 64 | if 'full' in preprocess: 65 | sadtalker_paths['mappingnet_checkpoint'] = os.path.join(checkpoint_dir, 'mapping_00109-model.pth.tar') 66 | sadtalker_paths['facerender_yaml'] = os.path.join(config_dir, 'facerender_still.yaml') 67 | else: 68 | sadtalker_paths['mappingnet_checkpoint'] = os.path.join(checkpoint_dir, 'mapping_00229-model.pth.tar') 69 | sadtalker_paths['facerender_yaml'] = os.path.join(config_dir, 'facerender.yaml') 70 | 71 | return sadtalker_paths -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/mesh.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import BinaryIO, Dict, Optional, Union 3 | 4 | import blobfile as bf 5 | import numpy as np 6 | 7 | from .ply_util import write_ply 8 | 9 | 10 | @dataclass 11 | class TriMesh: 12 | """ 13 | A 3D triangle mesh with optional data at the vertices and faces. 14 | """ 15 | 16 | # [N x 3] array of vertex coordinates. 17 | verts: np.ndarray 18 | 19 | # [M x 3] array of triangles, pointing to indices in verts. 20 | faces: np.ndarray 21 | 22 | # [P x 3] array of normal vectors per face. 23 | normals: Optional[np.ndarray] = None 24 | 25 | # Extra data per vertex and face. 26 | vertex_channels: Optional[Dict[str, np.ndarray]] = field(default_factory=dict) 27 | face_channels: Optional[Dict[str, np.ndarray]] = field(default_factory=dict) 28 | 29 | @classmethod 30 | def load(cls, f: Union[str, BinaryIO]) -> "TriMesh": 31 | """ 32 | Load the mesh from a .npz file. 33 | """ 34 | if isinstance(f, str): 35 | with bf.BlobFile(f, "rb") as reader: 36 | return cls.load(reader) 37 | else: 38 | obj = np.load(f) 39 | keys = list(obj.keys()) 40 | verts = obj["verts"] 41 | faces = obj["faces"] 42 | normals = obj["normals"] if "normals" in keys else None 43 | vertex_channels = {} 44 | face_channels = {} 45 | for key in keys: 46 | if key.startswith("v_"): 47 | vertex_channels[key[2:]] = obj[key] 48 | elif key.startswith("f_"): 49 | face_channels[key[2:]] = obj[key] 50 | return cls( 51 | verts=verts, 52 | faces=faces, 53 | normals=normals, 54 | vertex_channels=vertex_channels, 55 | face_channels=face_channels, 56 | ) 57 | 58 | def save(self, f: Union[str, BinaryIO]): 59 | """ 60 | Save the mesh to a .npz file. 
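        Channels are written with the same key convention that `load` reads back:
        vertex channels under "v_<name>" and face channels under "f_<name>".

        A minimal round-trip sketch (`mesh` is any TriMesh; the path is a placeholder):

            mesh.save("mesh.npz")
            restored = TriMesh.load("mesh.npz")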
61 | """ 62 | if isinstance(f, str): 63 | with bf.BlobFile(f, "wb") as writer: 64 | self.save(writer) 65 | else: 66 | obj_dict = dict(verts=self.verts, faces=self.faces) 67 | if self.normals is not None: 68 | obj_dict["normals"] = self.normals 69 | for k, v in self.vertex_channels.items(): 70 | obj_dict[f"v_{k}"] = v 71 | for k, v in self.face_channels.items(): 72 | obj_dict[f"f_{k}"] = v 73 | np.savez(f, **obj_dict) 74 | 75 | def has_vertex_colors(self) -> bool: 76 | return self.vertex_channels is not None and all(x in self.vertex_channels for x in "RGB") 77 | 78 | def write_ply(self, raw_f: BinaryIO): 79 | write_ply( 80 | raw_f, 81 | coords=self.verts, 82 | rgb=( 83 | np.stack([self.vertex_channels[x] for x in "RGB"], axis=1) 84 | if self.has_vertex_colors() 85 | else None 86 | ), 87 | faces=self.faces, 88 | ) 89 | 90 | def write_obj(self, raw_f: BinaryIO): 91 | if self.has_vertex_colors(): 92 | vertex_colors = np.stack([self.vertex_channels[x] for x in "RGB"], axis=1) 93 | vertices = [ 94 | "{} {} {} {} {} {}".format(*coord, *color) 95 | for coord, color in zip(self.verts.tolist(), vertex_colors.tolist()) 96 | ] 97 | else: 98 | vertices = ["{} {} {}".format(*coord) for coord in self.verts.tolist()] 99 | 100 | faces = [ 101 | "f {} {} {}".format(str(tri[0] + 1), str(tri[1] + 1), str(tri[2] + 1)) 102 | for tri in self.faces.tolist() 103 | ] 104 | 105 | combined_data = ["v " + vertex for vertex in vertices] + faces 106 | 107 | raw_f.writelines("\n".join(combined_data)) 108 | -------------------------------------------------------------------------------- /modules/annotator/openpose/hand.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import json 3 | import numpy as np 4 | import math 5 | import time 6 | from scipy.ndimage.filters import gaussian_filter 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | import torch 10 | from skimage.measure import label 11 | 12 | from .model import handpose_model 13 | from . 
import util 14 | 15 | 16 | class Hand(object): 17 | def __init__(self, model_path, device=None): 18 | self.device = device or torch.device( 19 | "cuda" if torch.cuda.is_available() else "cpu" 20 | ) 21 | self.model = handpose_model().to(self.device) 22 | model_dict = util.transfer(self.model, torch.load(model_path)) 23 | self.model.load_state_dict(model_dict) 24 | self.model.eval() 25 | 26 | def __call__(self, oriImg): 27 | scale_search = [0.5, 1.0, 1.5, 2.0] 28 | # scale_search = [0.5] 29 | boxsize = 368 30 | stride = 8 31 | padValue = 128 32 | thre = 0.05 33 | multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] 34 | heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22)) 35 | # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) 36 | 37 | for m in range(len(multiplier)): 38 | scale = multiplier[m] 39 | imageToTest = cv2.resize( 40 | oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC 41 | ) 42 | imageToTest_padded, pad = util.padRightDownCorner( 43 | imageToTest, stride, padValue 44 | ) 45 | im = ( 46 | np.transpose( 47 | np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1) 48 | ) 49 | / 256 50 | - 0.5 51 | ) 52 | im = np.ascontiguousarray(im) 53 | 54 | data = torch.from_numpy(im).float().to(self.device) 55 | # data = data.permute([2, 0, 1]).unsqueeze(0).float() 56 | with torch.no_grad(): 57 | output = self.model(data).cpu().numpy() 58 | # output = self.model(data).numpy()q 59 | 60 | # extract outputs, resize, and remove padding 61 | heatmap = np.transpose( 62 | np.squeeze(output), (1, 2, 0) 63 | ) # output 1 is heatmaps 64 | heatmap = cv2.resize( 65 | heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC 66 | ) 67 | heatmap = heatmap[ 68 | : imageToTest_padded.shape[0] - pad[2], 69 | : imageToTest_padded.shape[1] - pad[3], 70 | :, 71 | ] 72 | heatmap = cv2.resize( 73 | heatmap, 74 | (oriImg.shape[1], oriImg.shape[0]), 75 | interpolation=cv2.INTER_CUBIC, 76 | ) 77 | 78 | heatmap_avg += heatmap / len(multiplier) 79 | 80 | all_peaks = [] 81 | for part in range(21): 82 | map_ori = heatmap_avg[:, :, part] 83 | one_heatmap = gaussian_filter(map_ori, sigma=3) 84 | binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8) 85 | # 全部小于阈值 86 | if np.sum(binary) == 0: 87 | all_peaks.append([0, 0]) 88 | continue 89 | label_img, label_numbers = label( 90 | binary, return_num=True, connectivity=binary.ndim 91 | ) 92 | max_index = ( 93 | np.argmax( 94 | [ 95 | np.sum(map_ori[label_img == i]) 96 | for i in range(1, label_numbers + 1) 97 | ] 98 | ) 99 | + 1 100 | ) 101 | label_img[label_img != max_index] = 0 102 | map_ori[label_img == 0] = 0 103 | 104 | y, x = util.npmax(map_ori) 105 | all_peaks.append([x, y]) 106 | return np.array(all_peaks) 107 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/audio2pose.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from .cvae import CVAE 4 | from .discriminator import PoseSequenceDiscriminator 5 | from .audio_encoder import AudioEncoder 6 | 7 | class Audio2Pose(nn.Module): 8 | def __init__(self, cfg, wav2lip_checkpoint, device='cuda'): 9 | super().__init__() 10 | self.cfg = cfg 11 | self.seq_len = cfg.MODEL.CVAE.SEQ_LEN 12 | self.latent_dim = cfg.MODEL.CVAE.LATENT_SIZE 13 | self.device = device 14 | 15 | self.audio_encoder = AudioEncoder(wav2lip_checkpoint, device) 16 | self.audio_encoder.eval() 17 | for param in 
self.audio_encoder.parameters(): 18 | param.requires_grad = False 19 | 20 | self.netG = CVAE(cfg) 21 | self.netD_motion = PoseSequenceDiscriminator(cfg) 22 | 23 | 24 | def forward(self, x): 25 | 26 | batch = {} 27 | coeff_gt = x['gt'].cuda().squeeze(0) #bs frame_len+1 73 28 | batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - coeff_gt[:, :1, 64:70] #bs frame_len 6 29 | batch['ref'] = coeff_gt[:, 0, 64:70] #bs 6 30 | batch['class'] = x['class'].squeeze(0).cuda() # bs 31 | indiv_mels= x['indiv_mels'].cuda().squeeze(0) # bs seq_len+1 80 16 32 | 33 | # forward 34 | audio_emb_list = [] 35 | audio_emb = self.audio_encoder(indiv_mels[:, 1:, :, :].unsqueeze(2)) #bs seq_len 512 36 | batch['audio_emb'] = audio_emb 37 | batch = self.netG(batch) 38 | 39 | pose_motion_pred = batch['pose_motion_pred'] # bs frame_len 6 40 | pose_gt = coeff_gt[:, 1:, 64:70].clone() # bs frame_len 6 41 | pose_pred = coeff_gt[:, :1, 64:70] + pose_motion_pred # bs frame_len 6 42 | 43 | batch['pose_pred'] = pose_pred 44 | batch['pose_gt'] = pose_gt 45 | 46 | return batch 47 | 48 | def test(self, x): 49 | 50 | batch = {} 51 | ref = x['ref'] #bs 1 70 52 | batch['ref'] = x['ref'][:,0,-6:] 53 | batch['class'] = x['class'] 54 | bs = ref.shape[0] 55 | 56 | indiv_mels= x['indiv_mels'] # bs T 1 80 16 57 | indiv_mels_use = indiv_mels[:, 1:] # we regard the ref as the first frame 58 | num_frames = x['num_frames'] 59 | num_frames = int(num_frames) - 1 60 | 61 | # 62 | div = num_frames//self.seq_len 63 | re = num_frames%self.seq_len 64 | audio_emb_list = [] 65 | pose_motion_pred_list = [torch.zeros(batch['ref'].unsqueeze(1).shape, dtype=batch['ref'].dtype, 66 | device=batch['ref'].device)] 67 | 68 | for i in range(div): 69 | z = torch.randn(bs, self.latent_dim).to(ref.device) 70 | batch['z'] = z 71 | audio_emb = self.audio_encoder(indiv_mels_use[:, i*self.seq_len:(i+1)*self.seq_len,:,:,:]) #bs seq_len 512 72 | batch['audio_emb'] = audio_emb 73 | batch = self.netG.test(batch) 74 | pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6 75 | 76 | if re != 0: 77 | z = torch.randn(bs, self.latent_dim).to(ref.device) 78 | batch['z'] = z 79 | audio_emb = self.audio_encoder(indiv_mels_use[:, -1*self.seq_len:,:,:,:]) #bs seq_len 512 80 | if audio_emb.shape[1] != self.seq_len: 81 | pad_dim = self.seq_len-audio_emb.shape[1] 82 | pad_audio_emb = audio_emb[:, :1].repeat(1, pad_dim, 1) 83 | audio_emb = torch.cat([pad_audio_emb, audio_emb], 1) 84 | batch['audio_emb'] = audio_emb 85 | batch = self.netG.test(batch) 86 | pose_motion_pred_list.append(batch['pose_motion_pred'][:,-1*re:,:]) 87 | 88 | pose_motion_pred = torch.cat(pose_motion_pred_list, dim = 1) 89 | batch['pose_motion_pred'] = pose_motion_pred 90 | 91 | pose_pred = ref[:, :1, -6:] + pose_motion_pred # bs T 6 92 | 93 | batch['pose_pred'] = pose_pred 94 | return batch 95 | -------------------------------------------------------------------------------- /modules/mplug/get_video_caption.py: -------------------------------------------------------------------------------- 1 | import ruamel.yaml as yaml 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from .models.model_caption_mplug_vatex import MPLUG 6 | from .models.vit import interpolate_pos_embed, resize_pos_embed 7 | from .models.tokenization_bert import BertTokenizer 8 | from decord import VideoReader 9 | import decord 10 | import os 11 | 12 | 13 | config_path = os.path.join("model_zoo", "mplug", "videocap_vatex_mplug_large.yaml") 14 | mplug_pth_path = os.path.join("model_zoo", "mplug", 
"mplug_large.pth") 15 | 16 | config = yaml.load(open(config_path, "r"), Loader=yaml.Loader) 17 | 18 | 19 | def prepare_model(device): 20 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 21 | model = MPLUG(config=config, tokenizer=tokenizer) 22 | model = model.to(device) 23 | 24 | assert os.path.exists( 25 | mplug_pth_path 26 | ), "Please download mplug_large.pth checkpoint from https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/mplug_large.pth and put it in ./model_zoo/mplug/" 27 | checkpoint = torch.load(mplug_pth_path, map_location=device) 28 | 29 | try: 30 | state_dict = checkpoint["model"] 31 | except: 32 | state_dict = checkpoint["module"] 33 | if config["clip_name"] == "ViT-B-16": 34 | num_patches = int(config["image_res"] * config["image_res"] / (16 * 16)) 35 | elif config["clip_name"] == "ViT-L-14": 36 | num_patches = int(config["image_res"] * config["image_res"] / (14 * 14)) 37 | 38 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 39 | pos_embed = resize_pos_embed( 40 | state_dict["visual_encoder.visual.positional_embedding"].unsqueeze(0), 41 | pos_embed.unsqueeze(0), 42 | ) 43 | state_dict["visual_encoder.visual.positional_embedding"] = pos_embed 44 | 45 | for key in list(state_dict.keys()): 46 | if ("fusion" in key or "bert" in key) and "decode" not in key: 47 | encoder_key = key.replace("fusion.", "").replace("bert.", "") 48 | state_dict[encoder_key] = state_dict[key] 49 | del state_dict[key] 50 | 51 | model.load_state_dict(state_dict, strict=False) 52 | model.eval() 53 | 54 | return model, tokenizer 55 | 56 | 57 | def pipeline(video_path, model, tokenizer, device): 58 | video = load_video_from_path_decord( 59 | video_path, config["image_res"], config["image_res"], config["num_frm_test"] 60 | ).to(device) 61 | if config["prompt"] != "": 62 | caption = [config["prompt"] + config["eos"]] * video.size(0) 63 | caption = tokenizer( 64 | caption, 65 | padding="longest", 66 | truncation=True, 67 | max_length=25, 68 | return_tensors="pt", 69 | ).to(device) 70 | else: 71 | caption = None 72 | 73 | topk_ids, topk_probs = model(video, caption, None, train=False) 74 | 75 | for topk_id, topk_prob in zip(topk_ids, topk_probs): 76 | ans = ( 77 | tokenizer.decode(topk_id[0]) 78 | .replace("[SEP]", "") 79 | .replace("[CLS]", "") 80 | .replace("[PAD]", "") 81 | .strip() 82 | ) 83 | ans += " ." 84 | return ans 85 | 86 | 87 | def load_video_from_path_decord( 88 | video_path, 89 | height=None, 90 | width=None, 91 | num_frame=12, 92 | start_time=None, 93 | end_time=None, 94 | fps=-1, 95 | ): 96 | decord.bridge.set_bridge("torch") 97 | 98 | if not height or not width: 99 | vr = VideoReader(video_path) 100 | else: 101 | vr = VideoReader(video_path, width=width, height=height) 102 | vlen = len(vr) 103 | if start_time or end_time: 104 | assert fps > 0, "must provide video fps if specifying start and end time." 
105 | start_idx = min(int(start_time * fps), vlen) 106 | end_idx = min(int(end_time * fps), vlen) 107 | else: 108 | start_idx, end_idx = 0, vlen 109 | 110 | frame_index = np.arange(start_idx, end_idx, vlen / num_frame, dtype=int) 111 | raw_sample_frms = vr.get_batch(frame_index) 112 | raw_sample_frms = raw_sample_frms.permute(0, 3, 1, 2).float().unsqueeze(0) 113 | 114 | return raw_sample_frms 115 | -------------------------------------------------------------------------------- /modules/mplug/models/visual_transformers.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import shutil 7 | import tarfile 8 | import tempfile 9 | import sys 10 | from io import open 11 | import torch.nn.functional as F 12 | 13 | import torch 14 | from torch import nn 15 | from torch.nn import CrossEntropyLoss, SmoothL1Loss 16 | import numpy as np 17 | from .clip import clip 18 | 19 | 20 | def resize_pos_embed(posemb, posemb_new): 21 | # Rescale the grid of position embeddings when loading from state_dict. Adapted from 22 | # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 23 | ntok_new = posemb_new.shape[1] 24 | if True: 25 | posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:] 26 | ntok_new -= 1 27 | else: 28 | posemb_tok, posemb_grid = posemb[:, :0], posemb[0] 29 | gs_old = int(math.sqrt(len(posemb_grid))) 30 | gs_new = int(math.sqrt(ntok_new)) 31 | # _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) 32 | posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) 33 | orig = posemb_grid.dtype 34 | posemb_grid = F.interpolate( 35 | posemb_grid.float(), size=(gs_new, gs_new), mode="bilinear" 36 | ) 37 | posemb_grid = posemb_grid.to(orig) 38 | posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) 39 | posemb = torch.cat([posemb_tok, posemb_grid], dim=1) 40 | return posemb 41 | 42 | 43 | def initialize_clip(config, num_patches=240): 44 | if config["clip_name"] == "ViT-B-16": 45 | clip_model, preprocess = clip.load("ViT-B-16.tar", jit=False) 46 | num_patches = int(config["image_res"] * config["image_res"] / (16 * 16)) 47 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 48 | elif config["clip_name"] == "ViT-L-14": 49 | clip_model, preprocess = clip.load( 50 | os.path.join("model_zoo", "mplug", "ViT-L-14.tar"), 51 | jit=False, 52 | ) 53 | num_patches = int(config["image_res"] * config["image_res"] / (14 * 14)) 54 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 1024).float()) 55 | pos_embed.weight = resize_pos_embed( 56 | clip_model.visual.positional_embedding.unsqueeze(0), pos_embed.unsqueeze(0) 57 | ) 58 | clip_model.visual.positional_embedding = pos_embed 59 | return clip_model, preprocess 60 | 61 | 62 | # def initialize_vit(VISUAL_CONFIG, model_type="ViT-B_32", pretrained_dir="data/ViT-B_32.npz", img_size=(384, 640), 63 | # num_patches=240): 64 | # from vit.models.modeling import VisionTransformer, CONFIGS 65 | # config = CONFIGS[model_type] 66 | # model = VisionTransformer(config, img_size=224, zero_head=True, num_classes=1) 67 | # model.load_from(np.load(pretrained_dir)) 68 | 69 | # pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 70 | # pos_embed.weight = resize_pos_embed(model.transformer.embeddings.position_embeddings, pos_embed.unsqueeze(0)) 71 | # model.transformer.embeddings.position_embeddings = 
pos_embed 72 | # if VISUAL_CONFIG.freeze_clip: 73 | # for parameter in model.parameters(): 74 | # parameter.requires_grad = False 75 | # return model 76 | 77 | 78 | def initialize_optimizer(visual_model, lr, momentum, weight_decay): 79 | optimizer = torch.optim.SGD( 80 | visual_model.parameters(), lr, momentum=momentum, weight_decay=weight_decay 81 | ) 82 | return optimizer 83 | 84 | 85 | def adjust_learning_rate(optimizer, epoch, args): 86 | """Decay the learning rate based on schedule""" 87 | lr = args.sgd_lr 88 | 89 | for milestone in args.schedule.split(","): 90 | lr *= 0.1 if epoch >= float(milestone) else 1.0 91 | for param_group in optimizer.param_groups: 92 | param_group["lr"] = lr 93 | 94 | 95 | from torch.optim import Optimizer 96 | 97 | 98 | class FusedOptimizer(Optimizer): 99 | def __init__(self, optimizers): 100 | self.optimizers = optimizers 101 | param_groups = [] 102 | for optimizer in self.optimizers: 103 | param_groups += optimizer.param_groups 104 | # super(FusedOptimizer, self).__init__([], {}) 105 | self.param_groups = param_groups 106 | 107 | def step(self): 108 | for optimizer in self.optimizers: 109 | optimizer.step() 110 | -------------------------------------------------------------------------------- /modules/text2video_zero/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import torch 5 | import torchvision 6 | from torchvision.transforms import Resize, InterpolationMode 7 | import imageio 8 | from einops import rearrange 9 | from PIL import Image 10 | import decord 11 | 12 | 13 | def create_gif(frames, fps, rescale=False, path=None): 14 | if path is None: 15 | dir = "temporal" 16 | os.makedirs(dir, exist_ok=True) 17 | path = os.path.join(dir, "canny_db.gif") 18 | 19 | outputs = [] 20 | for i, x in enumerate(frames): 21 | x = torchvision.utils.make_grid(torch.Tensor(x), nrow=4) 22 | if rescale: 23 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 24 | x = (x * 255).numpy().astype(np.uint8) 25 | outputs.append(x) 26 | # imageio.imsave(os.path.join(dir, os.path.splitext(name)[0] + f'_{i}.jpg'), x) 27 | 28 | imageio.mimsave(path, outputs, fps=fps) 29 | return path 30 | 31 | 32 | def post_process_gif(list_of_results, image_resolution): 33 | output_file = "/tmp/ddxk.gif" 34 | imageio.mimsave(output_file, list_of_results, fps=4) 35 | return output_file 36 | 37 | 38 | def HWC3(x): 39 | assert x.dtype == np.uint8 40 | if x.ndim == 2: 41 | x = x[:, :, None] 42 | assert x.ndim == 3 43 | H, W, C = x.shape 44 | assert C == 1 or C == 3 or C == 4 45 | if C == 3: 46 | return x 47 | if C == 1: 48 | return np.concatenate([x, x, x], axis=2) 49 | if C == 4: 50 | color = x[:, :, 0:3].astype(np.float32) 51 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 52 | y = color * alpha + 255.0 * (1.0 - alpha) 53 | y = y.clip(0, 255).astype(np.uint8) 54 | return y 55 | 56 | 57 | def pre_process(input_video): 58 | control_imgs = [] 59 | for frame in input_video: 60 | img = rearrange(frame, "c h w -> h w c").cpu().numpy().astype(np.uint8) 61 | img = HWC3(img) 62 | H, W, C = img.shape 63 | img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST) 64 | control_imgs.append(img[None]) 65 | control_imgs = np.concatenate(control_imgs) 66 | control = torch.from_numpy(control_imgs.copy()).float() / 255.0 67 | return rearrange(control, "f h w c -> f c h w") 68 | 69 | 70 | class CrossFrameAttnProcessor: 71 | def __init__(self, unet_chunk_size=2): 72 | self.unet_chunk_size = unet_chunk_size 73 | 74 | def __call__( 75 
| self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None 76 | ): 77 | batch_size, sequence_length, _ = hidden_states.shape 78 | attention_mask = attn.prepare_attention_mask( 79 | attention_mask, sequence_length, batch_size 80 | ) 81 | query = attn.to_q(hidden_states) 82 | 83 | is_cross_attention = encoder_hidden_states is not None 84 | if encoder_hidden_states is None: 85 | encoder_hidden_states = hidden_states 86 | elif attn.cross_attention_norm: 87 | encoder_hidden_states = attn.norm_cross(encoder_hidden_states) 88 | key = attn.to_k(encoder_hidden_states) 89 | value = attn.to_v(encoder_hidden_states) 90 | # Sparse Attention 91 | if not is_cross_attention: 92 | video_length = key.size()[0] // self.unet_chunk_size 93 | # former_frame_index = torch.arange(video_length) - 1 94 | # former_frame_index[0] = 0 95 | former_frame_index = [0] * video_length 96 | key = rearrange(key, "(b f) d c -> b f d c", f=video_length) 97 | key = key[:, former_frame_index] 98 | key = rearrange(key, "b f d c -> (b f) d c") 99 | value = rearrange(value, "(b f) d c -> b f d c", f=video_length) 100 | value = value[:, former_frame_index] 101 | value = rearrange(value, "b f d c -> (b f) d c") 102 | 103 | query = attn.head_to_batch_dim(query) 104 | key = attn.head_to_batch_dim(key) 105 | value = attn.head_to_batch_dim(value) 106 | 107 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 108 | hidden_states = torch.bmm(attention_probs, value) 109 | hidden_states = attn.batch_to_head_dim(hidden_states) 110 | 111 | # linear proj 112 | hidden_states = attn.to_out[0](hidden_states) 113 | # dropout 114 | hidden_states = attn.to_out[1](hidden_states) 115 | 116 | return hidden_states 117 | -------------------------------------------------------------------------------- /modules/sadtalker/inference.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import shutil 3 | import torch 4 | from time import strftime 5 | import os, sys, time 6 | from argparse import ArgumentParser 7 | 8 | import uuid 9 | 10 | from .src.utils.preprocess import CropAndExtract 11 | from .src.test_audio2coeff import Audio2Coeff 12 | from .src.facerender.animate import AnimateFromCoeff 13 | from .src.generate_batch import get_data 14 | from .src.generate_facerender_batch import get_facerender_data 15 | from .src.utils.init_path import init_path 16 | 17 | def main(args): 18 | #torch.backends.cudnn.enabled = False 19 | 20 | pic_path = args.source_image 21 | audio_path = args.driven_audio 22 | save_dir = os.path.join(args.result_dir) 23 | os.makedirs(save_dir, exist_ok=True) 24 | pose_style = args.pose_style 25 | device = args.device 26 | batch_size = args.batch_size 27 | input_yaw_list = args.input_yaw 28 | input_pitch_list = args.input_pitch 29 | input_roll_list = args.input_roll 30 | ref_eyeblink = args.ref_eyeblink 31 | ref_pose = args.ref_pose 32 | 33 | current_root_path = r"./modules/sadtalker" # os.path.split(sys.argv[0])[0] 34 | print("current_root_path is:", current_root_path) 35 | 36 | sadtalker_paths = init_path( 37 | args.checkpoint_dir, 38 | os.path.join(current_root_path, 'src/config'), 39 | args.size, 40 | args.old_version, 41 | args.preprocess) 42 | 43 | #init model 44 | print("init preprocess_model") 45 | preprocess_model = CropAndExtract(sadtalker_paths, device) 46 | 47 | print("audio_to_coeff") 48 | audio_to_coeff = Audio2Coeff(sadtalker_paths, device) 49 | 50 | print("animate_from_coeff") 51 | animate_from_coeff = 
AnimateFromCoeff(sadtalker_paths, device) 52 | 53 | #crop image and extract 3dmm from image 54 | first_frame_dir = os.path.join(save_dir, 'first_frame_dir') 55 | os.makedirs(first_frame_dir, exist_ok=True) 56 | print('3DMM Extraction for source image') 57 | first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\ 58 | source_image_flag=True, pic_size=args.size) 59 | if first_coeff_path is None: 60 | print("Can't get the coeffs of the input") 61 | return 62 | 63 | if ref_eyeblink is not None: 64 | ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0] 65 | ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname) 66 | os.makedirs(ref_eyeblink_frame_dir, exist_ok=True) 67 | print('3DMM Extraction for the reference video providing eye blinking') 68 | ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False) 69 | else: 70 | ref_eyeblink_coeff_path=None 71 | 72 | if ref_pose is not None: 73 | if ref_pose == ref_eyeblink: 74 | ref_pose_coeff_path = ref_eyeblink_coeff_path 75 | else: 76 | ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0] 77 | ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname) 78 | os.makedirs(ref_pose_frame_dir, exist_ok=True) 79 | print('3DMM Extraction for the reference video providing pose') 80 | ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False) 81 | else: 82 | ref_pose_coeff_path=None 83 | 84 | #audio2ceoff 85 | batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still) 86 | coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path) 87 | 88 | #coeff2video 89 | data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, 90 | batch_size, input_yaw_list, input_pitch_list, input_roll_list, 91 | expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size) 92 | 93 | result = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \ 94 | enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size) 95 | 96 | audio_name = str(uuid.uuid4())[:8] 97 | shutil.move(result, './video/'+ audio_name +'.mp4') 98 | print('The generated video is named:', audio_name + '.mp4') 99 | 100 | if not args.verbose: 101 | shutil.rmtree("./video/sadtalker") 102 | 103 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nn/checkpoint.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, Sequence, Union 2 | 3 | import torch 4 | from torch.cuda.amp import custom_bwd, custom_fwd 5 | 6 | 7 | def checkpoint( 8 | func: Callable[..., Union[torch.Tensor, Sequence[torch.Tensor]]], 9 | inputs: Sequence[torch.Tensor], 10 | params: Iterable[torch.Tensor], 11 | flag: bool, 12 | ): 13 | """ 14 | Evaluate a function without caching intermediate activations, allowing for 15 | reduced memory at the expense of extra compute in the backward pass. 16 | :param func: the function to evaluate. 17 | :param inputs: the argument sequence to pass to `func`. 18 | :param params: a sequence of parameters `func` depends on but does not 19 | explicitly take as arguments. 20 | :param flag: if False, disable gradient checkpointing. 
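    A minimal usage sketch (`block` and `x` are placeholder names, not part of
    this module):

        out = checkpoint(block, (x,), block.parameters(), flag=True)

    With flag=True, activations inside `block` are recomputed during the
    backward pass instead of being cached; with flag=False the call simply
    returns `block(x)`.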
21 | """ 22 | if flag: 23 | args = tuple(inputs) + tuple(params) 24 | return CheckpointFunction.apply(func, len(inputs), *args) 25 | else: 26 | return func(*inputs) 27 | 28 | 29 | class CheckpointFunction(torch.autograd.Function): 30 | @staticmethod 31 | @custom_fwd 32 | def forward(ctx, run_function, length, *args): 33 | ctx.run_function = run_function 34 | ctx.length = length 35 | input_tensors = list(args[:length]) 36 | input_params = list(args[length:]) 37 | ctx.save_for_backward(*input_tensors, *input_params) 38 | with torch.no_grad(): 39 | output_tensors = ctx.run_function(*input_tensors) 40 | return output_tensors 41 | 42 | @staticmethod 43 | @custom_bwd 44 | def backward(ctx, *output_grads): 45 | inputs = ctx.saved_tensors 46 | input_tensors = inputs[: ctx.length] 47 | input_params = inputs[ctx.length :] 48 | res = CheckpointFunctionGradFunction.apply( 49 | ctx.run_function, 50 | len(input_tensors), 51 | len(input_params), 52 | *input_tensors, 53 | *input_params, 54 | *output_grads 55 | ) 56 | return (None, None) + res 57 | 58 | 59 | class CheckpointFunctionGradFunction(torch.autograd.Function): 60 | @staticmethod 61 | @custom_fwd 62 | def forward(ctx, run_function, length_1, length_2, *args): 63 | ctx.run_function = run_function 64 | ctx.length_1 = length_1 65 | ctx.length_2 = length_2 66 | input_tensors = [x.detach().requires_grad_(True) for x in args[:length_1]] 67 | input_params = list(args[length_1 : length_1 + length_2]) 68 | output_grads = list(args[length_1 + length_2 :]) 69 | ctx.save_for_backward(*input_tensors, *input_params, *output_grads) 70 | 71 | with torch.enable_grad(): 72 | # Fixes a bug where the first op in run_function modifies the 73 | # Tensor storage in place, which is not allowed for detach()'d 74 | # Tensors. 75 | shallow_copies = [x.view_as(x) for x in input_tensors] 76 | output_tensors = ctx.run_function(*shallow_copies) 77 | input_grads = torch.autograd.grad( 78 | output_tensors, 79 | input_tensors + input_params, 80 | output_grads, 81 | allow_unused=True, 82 | ) 83 | return input_grads 84 | 85 | @staticmethod 86 | @custom_bwd 87 | def backward(ctx, *all_output_grads): 88 | args = ctx.saved_tensors 89 | input_tensors = [x.detach().requires_grad_(True) for x in args[: ctx.length_1]] 90 | input_params = list(args[ctx.length_1 : ctx.length_1 + ctx.length_2]) 91 | output_grads = [ 92 | x.detach().requires_grad_(True) for x in args[ctx.length_1 + ctx.length_2 :] 93 | ] 94 | 95 | with torch.enable_grad(): 96 | # Fixes a bug where the first op in run_function modifies the 97 | # Tensor storage in place, which is not allowed for detach()'d 98 | # Tensors. 
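            # This backward pass enables double backprop through the checkpointed
            # function: the forward is replayed under enable_grad, the first-order
            # input gradients are re-derived with create_graph=True, and those
            # gradients are then differentiated w.r.t. the inputs, params and the
            # saved output_grads to produce the second-order terms returned below.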
99 | shallow_copies = [x.view_as(x) for x in input_tensors] 100 | output_tensors = ctx.run_function(*shallow_copies) 101 | input_grads = torch.autograd.grad( 102 | output_tensors, 103 | input_tensors + input_params, 104 | output_grads, 105 | allow_unused=True, 106 | create_graph=True, 107 | retain_graph=True, 108 | ) 109 | input_grads_grads = torch.autograd.grad( 110 | input_grads, 111 | input_tensors + input_params + output_grads, 112 | all_output_grads, 113 | allow_unused=True, 114 | ) 115 | del input_grads 116 | return (None, None, None) + input_grads_grads 117 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/util/load_mats.py: -------------------------------------------------------------------------------- 1 | """This script is to load 3D face model for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import numpy as np 5 | from PIL import Image 6 | from scipy.io import loadmat, savemat 7 | from array import array 8 | import os.path as osp 9 | 10 | 11 | # load expression basis 12 | def LoadExpBasis(bfm_folder='BFM'): 13 | n_vertex = 53215 14 | Expbin = open(osp.join(bfm_folder, 'Exp_Pca.bin'), 'rb') 15 | exp_dim = array('i') 16 | exp_dim.fromfile(Expbin, 1) 17 | expMU = array('f') 18 | expPC = array('f') 19 | expMU.fromfile(Expbin, 3*n_vertex) 20 | expPC.fromfile(Expbin, 3*exp_dim[0]*n_vertex) 21 | Expbin.close() 22 | 23 | expPC = np.array(expPC) 24 | expPC = np.reshape(expPC, [exp_dim[0], -1]) 25 | expPC = np.transpose(expPC) 26 | 27 | expEV = np.loadtxt(osp.join(bfm_folder, 'std_exp.txt')) 28 | 29 | return expPC, expEV 30 | 31 | 32 | # transfer original BFM09 to our face model 33 | def transferBFM09(bfm_folder='BFM'): 34 | print('Transfer BFM09 to BFM_model_front......') 35 | original_BFM = loadmat(osp.join(bfm_folder, '01_MorphableModel.mat')) 36 | shapePC = original_BFM['shapePC'] # shape basis 37 | shapeEV = original_BFM['shapeEV'] # corresponding eigen value 38 | shapeMU = original_BFM['shapeMU'] # mean face 39 | texPC = original_BFM['texPC'] # texture basis 40 | texEV = original_BFM['texEV'] # eigen value 41 | texMU = original_BFM['texMU'] # mean texture 42 | 43 | expPC, expEV = LoadExpBasis(bfm_folder) 44 | 45 | # transfer BFM09 to our face model 46 | 47 | idBase = shapePC*np.reshape(shapeEV, [-1, 199]) 48 | idBase = idBase/1e5 # unify the scale to decimeter 49 | idBase = idBase[:, :80] # use only first 80 basis 50 | 51 | exBase = expPC*np.reshape(expEV, [-1, 79]) 52 | exBase = exBase/1e5 # unify the scale to decimeter 53 | exBase = exBase[:, :64] # use only first 64 basis 54 | 55 | texBase = texPC*np.reshape(texEV, [-1, 199]) 56 | texBase = texBase[:, :80] # use only first 80 basis 57 | 58 | # our face model is cropped along face landmarks and contains only 35709 vertex. 59 | # original BFM09 contains 53490 vertex, and expression basis provided by Guo et al. contains 53215 vertex. 60 | # thus we select corresponding vertex to get our face model. 
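    # Concretely: BFM_front_idx.mat indexes the 35709 cropped-face vertices within
    # the 53215-vertex expression-basis topology, and BFM_exp_idx.mat maps that
    # topology into the original 53490-vertex BFM09 mesh; composing them as
    # index_shape[index_exp] lets the shape/texture bases and the expression basis
    # all be gathered onto the same front-face vertex set below.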
61 | 62 | index_exp = loadmat(osp.join(bfm_folder, 'BFM_front_idx.mat')) 63 | index_exp = index_exp['idx'].astype(np.int32) - 1 # starts from 0 (to 53215) 64 | 65 | index_shape = loadmat(osp.join(bfm_folder, 'BFM_exp_idx.mat')) 66 | index_shape = index_shape['trimIndex'].astype( 67 | np.int32) - 1 # starts from 0 (to 53490) 68 | index_shape = index_shape[index_exp] 69 | 70 | idBase = np.reshape(idBase, [-1, 3, 80]) 71 | idBase = idBase[index_shape, :, :] 72 | idBase = np.reshape(idBase, [-1, 80]) 73 | 74 | texBase = np.reshape(texBase, [-1, 3, 80]) 75 | texBase = texBase[index_shape, :, :] 76 | texBase = np.reshape(texBase, [-1, 80]) 77 | 78 | exBase = np.reshape(exBase, [-1, 3, 64]) 79 | exBase = exBase[index_exp, :, :] 80 | exBase = np.reshape(exBase, [-1, 64]) 81 | 82 | meanshape = np.reshape(shapeMU, [-1, 3])/1e5 83 | meanshape = meanshape[index_shape, :] 84 | meanshape = np.reshape(meanshape, [1, -1]) 85 | 86 | meantex = np.reshape(texMU, [-1, 3]) 87 | meantex = meantex[index_shape, :] 88 | meantex = np.reshape(meantex, [1, -1]) 89 | 90 | other_info = loadmat(osp.join(bfm_folder, 'facemodel_info.mat')) 91 | frontmask2_idx = other_info['frontmask2_idx'] 92 | skinmask = other_info['skinmask'] 93 | keypoints = other_info['keypoints'] 94 | point_buf = other_info['point_buf'] 95 | tri = other_info['tri'] 96 | tri_mask2 = other_info['tri_mask2'] 97 | 98 | # save our face model 99 | savemat(osp.join(bfm_folder, 'BFM_model_front.mat'), {'meanshape': meanshape, 'meantex': meantex, 'idBase': idBase, 'exBase': exBase, 'texBase': texBase, 100 | 'tri': tri, 'point_buf': point_buf, 'tri_mask2': tri_mask2, 'keypoints': keypoints, 'frontmask2_idx': frontmask2_idx, 'skinmask': skinmask}) 101 | 102 | 103 | # load landmarks for standard face, which is used for image preprocessing 104 | def load_lm3d(bfm_folder): 105 | 106 | Lm3D = loadmat(osp.join(bfm_folder, 'similarity_Lm3D_all.mat')) 107 | Lm3D = Lm3D['lm'] 108 | 109 | # calculate 5 facial landmarks using 68 landmarks 110 | lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 111 | Lm3D = np.stack([Lm3D[lm_idx[0], :], np.mean(Lm3D[lm_idx[[1, 2]], :], 0), np.mean( 112 | Lm3D[lm_idx[[3, 4]], :], 0), Lm3D[lm_idx[5], :], Lm3D[lm_idx[6], :]], axis=0) 113 | Lm3D = Lm3D[[1, 2, 0, 3, 4], :] 114 | 115 | return Lm3D 116 | 117 | 118 | if __name__ == '__main__': 119 | transferBFM09() -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/transmitter/bottleneck.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Optional 3 | 4 | import numpy as np 5 | import torch.nn as nn 6 | from torch import torch 7 | 8 | from ...diffusion.gaussian_diffusion import diffusion_from_config 9 | from ...util.collections import AttrDict 10 | 11 | 12 | class LatentBottleneck(nn.Module, ABC): 13 | def __init__(self, *, device: torch.device, d_latent: int): 14 | super().__init__() 15 | self.device = device 16 | self.d_latent = d_latent 17 | 18 | @abstractmethod 19 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 20 | pass 21 | 22 | 23 | class LatentWarp(nn.Module, ABC): 24 | def __init__(self, *, device: torch.device): 25 | super().__init__() 26 | self.device = device 27 | 28 | @abstractmethod 29 | def warp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 30 | pass 31 | 32 | @abstractmethod 33 | def unwarp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> 
AttrDict: 34 | pass 35 | 36 | 37 | class IdentityLatentWarp(LatentWarp): 38 | def warp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 39 | _ = options 40 | return x 41 | 42 | def unwarp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 43 | _ = options 44 | return x 45 | 46 | 47 | class Tan2LatentWarp(LatentWarp): 48 | def __init__(self, *, coeff1: float = 1.0, device: torch.device): 49 | super().__init__(device=device) 50 | self.coeff1 = coeff1 51 | self.scale = np.tan(np.tan(1.0) * coeff1) 52 | 53 | def warp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 54 | _ = options 55 | return ((x.float().tan() * self.coeff1).tan() / self.scale).to(x.dtype) 56 | 57 | def unwarp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 58 | _ = options 59 | return ((x.float() * self.scale).arctan() / self.coeff1).arctan().to(x.dtype) 60 | 61 | 62 | class IdentityLatentBottleneck(LatentBottleneck): 63 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 64 | _ = options 65 | return x 66 | 67 | 68 | class ClampNoiseBottleneck(LatentBottleneck): 69 | def __init__(self, *, device: torch.device, d_latent: int, noise_scale: float): 70 | super().__init__(device=device, d_latent=d_latent) 71 | self.noise_scale = noise_scale 72 | 73 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 74 | _ = options 75 | x = x.tanh() 76 | if not self.training: 77 | return x 78 | return x + torch.randn_like(x) * self.noise_scale 79 | 80 | 81 | class ClampDiffusionNoiseBottleneck(LatentBottleneck): 82 | def __init__( 83 | self, 84 | *, 85 | device: torch.device, 86 | d_latent: int, 87 | diffusion: Dict[str, Any], 88 | diffusion_prob: float = 1.0, 89 | ): 90 | super().__init__(device=device, d_latent=d_latent) 91 | self.diffusion = diffusion_from_config(diffusion) 92 | self.diffusion_prob = diffusion_prob 93 | 94 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 95 | _ = options 96 | x = x.tanh() 97 | if not self.training: 98 | return x 99 | t = torch.randint(low=0, high=self.diffusion.num_timesteps, size=(len(x),), device=x.device) 100 | t = torch.where( 101 | torch.rand(len(x), device=x.device) < self.diffusion_prob, t, torch.zeros_like(t) 102 | ) 103 | return self.diffusion.q_sample(x, t) 104 | 105 | 106 | def latent_bottleneck_from_config(config: Dict[str, Any], device: torch.device, d_latent: int): 107 | name = config.pop("name") 108 | if name == "clamp_noise": 109 | return ClampNoiseBottleneck(**config, device=device, d_latent=d_latent) 110 | elif name == "identity": 111 | return IdentityLatentBottleneck(**config, device=device, d_latent=d_latent) 112 | elif name == "clamp_diffusion_noise": 113 | return ClampDiffusionNoiseBottleneck(**config, device=device, d_latent=d_latent) 114 | else: 115 | raise ValueError(f"unknown latent bottleneck: {name}") 116 | 117 | 118 | def latent_warp_from_config(config: Dict[str, Any], device: torch.device): 119 | name = config.pop("name") 120 | if name == "identity": 121 | return IdentityLatentWarp(**config, device=device) 122 | elif name == "tan2": 123 | return Tan2LatentWarp(**config, device=device) 124 | else: 125 | raise ValueError(f"unknown latent warping function: {name}") 126 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/networks.py: -------------------------------------------------------------------------------- 1 | 
import torch.nn as nn 2 | import torch 3 | 4 | 5 | class ResidualConv(nn.Module): 6 | def __init__(self, input_dim, output_dim, stride, padding): 7 | super(ResidualConv, self).__init__() 8 | 9 | self.conv_block = nn.Sequential( 10 | nn.BatchNorm2d(input_dim), 11 | nn.ReLU(), 12 | nn.Conv2d( 13 | input_dim, output_dim, kernel_size=3, stride=stride, padding=padding 14 | ), 15 | nn.BatchNorm2d(output_dim), 16 | nn.ReLU(), 17 | nn.Conv2d(output_dim, output_dim, kernel_size=3, padding=1), 18 | ) 19 | self.conv_skip = nn.Sequential( 20 | nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=stride, padding=1), 21 | nn.BatchNorm2d(output_dim), 22 | ) 23 | 24 | def forward(self, x): 25 | 26 | return self.conv_block(x) + self.conv_skip(x) 27 | 28 | 29 | class Upsample(nn.Module): 30 | def __init__(self, input_dim, output_dim, kernel, stride): 31 | super(Upsample, self).__init__() 32 | 33 | self.upsample = nn.ConvTranspose2d( 34 | input_dim, output_dim, kernel_size=kernel, stride=stride 35 | ) 36 | 37 | def forward(self, x): 38 | return self.upsample(x) 39 | 40 | 41 | class Squeeze_Excite_Block(nn.Module): 42 | def __init__(self, channel, reduction=16): 43 | super(Squeeze_Excite_Block, self).__init__() 44 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 45 | self.fc = nn.Sequential( 46 | nn.Linear(channel, channel // reduction, bias=False), 47 | nn.ReLU(inplace=True), 48 | nn.Linear(channel // reduction, channel, bias=False), 49 | nn.Sigmoid(), 50 | ) 51 | 52 | def forward(self, x): 53 | b, c, _, _ = x.size() 54 | y = self.avg_pool(x).view(b, c) 55 | y = self.fc(y).view(b, c, 1, 1) 56 | return x * y.expand_as(x) 57 | 58 | 59 | class ASPP(nn.Module): 60 | def __init__(self, in_dims, out_dims, rate=[6, 12, 18]): 61 | super(ASPP, self).__init__() 62 | 63 | self.aspp_block1 = nn.Sequential( 64 | nn.Conv2d( 65 | in_dims, out_dims, 3, stride=1, padding=rate[0], dilation=rate[0] 66 | ), 67 | nn.ReLU(inplace=True), 68 | nn.BatchNorm2d(out_dims), 69 | ) 70 | self.aspp_block2 = nn.Sequential( 71 | nn.Conv2d( 72 | in_dims, out_dims, 3, stride=1, padding=rate[1], dilation=rate[1] 73 | ), 74 | nn.ReLU(inplace=True), 75 | nn.BatchNorm2d(out_dims), 76 | ) 77 | self.aspp_block3 = nn.Sequential( 78 | nn.Conv2d( 79 | in_dims, out_dims, 3, stride=1, padding=rate[2], dilation=rate[2] 80 | ), 81 | nn.ReLU(inplace=True), 82 | nn.BatchNorm2d(out_dims), 83 | ) 84 | 85 | self.output = nn.Conv2d(len(rate) * out_dims, out_dims, 1) 86 | self._init_weights() 87 | 88 | def forward(self, x): 89 | x1 = self.aspp_block1(x) 90 | x2 = self.aspp_block2(x) 91 | x3 = self.aspp_block3(x) 92 | out = torch.cat([x1, x2, x3], dim=1) 93 | return self.output(out) 94 | 95 | def _init_weights(self): 96 | for m in self.modules(): 97 | if isinstance(m, nn.Conv2d): 98 | nn.init.kaiming_normal_(m.weight) 99 | elif isinstance(m, nn.BatchNorm2d): 100 | m.weight.data.fill_(1) 101 | m.bias.data.zero_() 102 | 103 | 104 | class Upsample_(nn.Module): 105 | def __init__(self, scale=2): 106 | super(Upsample_, self).__init__() 107 | 108 | self.upsample = nn.Upsample(mode="bilinear", scale_factor=scale) 109 | 110 | def forward(self, x): 111 | return self.upsample(x) 112 | 113 | 114 | class AttentionBlock(nn.Module): 115 | def __init__(self, input_encoder, input_decoder, output_dim): 116 | super(AttentionBlock, self).__init__() 117 | 118 | self.conv_encoder = nn.Sequential( 119 | nn.BatchNorm2d(input_encoder), 120 | nn.ReLU(), 121 | nn.Conv2d(input_encoder, output_dim, 3, padding=1), 122 | nn.MaxPool2d(2, 2), 123 | ) 124 | 125 | self.conv_decoder = nn.Sequential( 126 
| nn.BatchNorm2d(input_decoder), 127 | nn.ReLU(), 128 | nn.Conv2d(input_decoder, output_dim, 3, padding=1), 129 | ) 130 | 131 | self.conv_attn = nn.Sequential( 132 | nn.BatchNorm2d(output_dim), 133 | nn.ReLU(), 134 | nn.Conv2d(output_dim, 1, 1), 135 | ) 136 | 137 | def forward(self, x1, x2): 138 | out = self.conv_encoder(x1) + self.conv_decoder(x2) 139 | out = self.conv_attn(out) 140 | return out * x2 -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/face_enhancer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from gfpgan import GFPGANer 5 | 6 | from tqdm import tqdm 7 | 8 | from .videoio import load_video_to_cv2 9 | 10 | import cv2 11 | 12 | 13 | class GeneratorWithLen(object): 14 | """ From https://stackoverflow.com/a/7460929 """ 15 | 16 | def __init__(self, gen, length): 17 | self.gen = gen 18 | self.length = length 19 | 20 | def __len__(self): 21 | return self.length 22 | 23 | def __iter__(self): 24 | return self.gen 25 | 26 | def enhancer_list(images, method='gfpgan', bg_upsampler='realesrgan'): 27 | gen = enhancer_generator_no_len(images, method=method, bg_upsampler=bg_upsampler) 28 | return list(gen) 29 | 30 | def enhancer_generator_with_len(images, method='gfpgan', bg_upsampler='realesrgan'): 31 | """ Provide a generator with a __len__ method so that it can passed to functions that 32 | call len()""" 33 | 34 | if os.path.isfile(images): # handle video to images 35 | # TODO: Create a generator version of load_video_to_cv2 36 | images = load_video_to_cv2(images) 37 | 38 | gen = enhancer_generator_no_len(images, method=method, bg_upsampler=bg_upsampler) 39 | gen_with_len = GeneratorWithLen(gen, len(images)) 40 | return gen_with_len 41 | 42 | def enhancer_generator_no_len(images, method='gfpgan', bg_upsampler='realesrgan'): 43 | """ Provide a generator function so that all of the enhanced images don't need 44 | to be stored in memory at the same time. This can save tons of RAM compared to 45 | the enhancer function. """ 46 | 47 | print('face enhancer....') 48 | if not isinstance(images, list) and os.path.isfile(images): # handle video to images 49 | images = load_video_to_cv2(images) 50 | 51 | # ------------------------ set up GFPGAN restorer ------------------------ 52 | if method == 'gfpgan': 53 | arch = 'clean' 54 | channel_multiplier = 2 55 | model_name = 'GFPGANv1.4' 56 | url = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth' 57 | elif method == 'RestoreFormer': 58 | arch = 'RestoreFormer' 59 | channel_multiplier = 2 60 | model_name = 'RestoreFormer' 61 | url = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/RestoreFormer.pth' 62 | elif method == 'codeformer': # TODO: 63 | arch = 'CodeFormer' 64 | channel_multiplier = 2 65 | model_name = 'CodeFormer' 66 | url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth' 67 | else: 68 | raise ValueError(f'Wrong model version {method}.') 69 | 70 | 71 | # ------------------------ set up background upsampler ------------------------ 72 | if bg_upsampler == 'realesrgan': 73 | if not torch.cuda.is_available(): # CPU 74 | import warnings 75 | warnings.warn('The unoptimized RealESRGAN is slow on CPU. We do not use it. 
' 76 | 'If you really want to use it, please modify the corresponding codes.') 77 | bg_upsampler = None 78 | else: 79 | from basicsr.archs.rrdbnet_arch import RRDBNet 80 | from realesrgan import RealESRGANer 81 | model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2) 82 | bg_upsampler = RealESRGANer( 83 | scale=2, 84 | model_path='https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth', 85 | model=model, 86 | tile=400, 87 | tile_pad=10, 88 | pre_pad=0, 89 | half=True) # need to set False in CPU mode 90 | else: 91 | bg_upsampler = None 92 | 93 | # determine model paths 94 | model_path = os.path.join('gfpgan/weights', model_name + '.pth') 95 | 96 | if not os.path.isfile(model_path): 97 | model_path = os.path.join('checkpoints', model_name + '.pth') 98 | 99 | if not os.path.isfile(model_path): 100 | # download pre-trained models from url 101 | model_path = url 102 | 103 | restorer = GFPGANer( 104 | model_path=model_path, 105 | upscale=2, 106 | arch=arch, 107 | channel_multiplier=channel_multiplier, 108 | bg_upsampler=bg_upsampler) 109 | 110 | # ------------------------ restore ------------------------ 111 | for idx in tqdm(range(len(images)), 'Face Enhancer:'): 112 | 113 | img = cv2.cvtColor(images[idx], cv2.COLOR_RGB2BGR) 114 | 115 | # restore faces and background if necessary 116 | cropped_faces, restored_faces, r_img = restorer.enhance( 117 | img, 118 | has_aligned=False, 119 | only_center_face=False, 120 | paste_back=True) 121 | 122 | r_img = cv2.cvtColor(r_img, cv2.COLOR_BGR2RGB) 123 | yield r_img 124 | -------------------------------------------------------------------------------- /modules/sadtalker/src/generate_batch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tqdm import tqdm 4 | import torch 5 | import numpy as np 6 | import random 7 | import scipy.io as scio 8 | 9 | from .utils import audio as audio 10 | 11 | def crop_pad_audio(wav, audio_length): 12 | if len(wav) > audio_length: 13 | wav = wav[:audio_length] 14 | elif len(wav) < audio_length: 15 | wav = np.pad(wav, [0, audio_length - len(wav)], mode='constant', constant_values=0) 16 | return wav 17 | 18 | def parse_audio_length(audio_length, sr, fps): 19 | bit_per_frames = sr / fps 20 | 21 | num_frames = int(audio_length / bit_per_frames) 22 | audio_length = int(num_frames * bit_per_frames) 23 | 24 | return audio_length, num_frames 25 | 26 | def generate_blink_seq(num_frames): 27 | ratio = np.zeros((num_frames,1)) 28 | frame_id = 0 29 | while frame_id in range(num_frames): 30 | start = 80 31 | if frame_id+start+9<=num_frames - 1: 32 | ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5] 33 | frame_id = frame_id+start+9 34 | else: 35 | break 36 | return ratio 37 | 38 | def generate_blink_seq_randomly(num_frames): 39 | ratio = np.zeros((num_frames,1)) 40 | if num_frames<=20: 41 | return ratio 42 | frame_id = 0 43 | while frame_id in range(num_frames): 44 | start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70))) 45 | if frame_id+start+5<=num_frames - 1: 46 | ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5] 47 | frame_id = frame_id+start+5 48 | else: 49 | break 50 | return ratio 51 | 52 | def get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=False, idlemode=False, length_of_audio=False, use_blink=True): 53 | 54 | syncnet_mel_step_size = 16 55 | fps = 25 56 | 57 | pic_name = 
os.path.splitext(os.path.split(first_coeff_path)[-1])[0] 58 | audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0] 59 | 60 | 61 | if idlemode: 62 | num_frames = int(length_of_audio * 25) 63 | indiv_mels = np.zeros((num_frames, 80, 16)) 64 | else: 65 | wav = audio.load_wav(audio_path, 16000) 66 | wav_length, num_frames = parse_audio_length(len(wav), 16000, 25) 67 | wav = crop_pad_audio(wav, wav_length) 68 | orig_mel = audio.melspectrogram(wav).T 69 | spec = orig_mel.copy() # nframes 80 70 | indiv_mels = [] 71 | 72 | for i in tqdm(range(num_frames), 'mel:'): 73 | start_frame_num = i-2 74 | start_idx = int(80. * (start_frame_num / float(fps))) 75 | end_idx = start_idx + syncnet_mel_step_size 76 | seq = list(range(start_idx, end_idx)) 77 | seq = [ min(max(item, 0), orig_mel.shape[0]-1) for item in seq ] 78 | m = spec[seq, :] 79 | indiv_mels.append(m.T) 80 | indiv_mels = np.asarray(indiv_mels) # T 80 16 81 | 82 | ratio = generate_blink_seq_randomly(num_frames) # T 83 | source_semantics_path = first_coeff_path 84 | source_semantics_dict = scio.loadmat(source_semantics_path) 85 | ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70 86 | ref_coeff = np.repeat(ref_coeff, num_frames, axis=0) 87 | 88 | if ref_eyeblink_coeff_path is not None: 89 | ratio[:num_frames] = 0 90 | refeyeblink_coeff_dict = scio.loadmat(ref_eyeblink_coeff_path) 91 | refeyeblink_coeff = refeyeblink_coeff_dict['coeff_3dmm'][:,:64] 92 | refeyeblink_num_frames = refeyeblink_coeff.shape[0] 93 | if refeyeblink_num_frames h w c").cpu().numpy().astype(np.uint8) 23 | detected_map = cv2.Canny(img, low_threshold, high_threshold) 24 | detected_map = HWC3(detected_map) 25 | detected_maps.append(detected_map[None]) 26 | detected_maps = np.concatenate(detected_maps) 27 | control = torch.from_numpy(detected_maps.copy()).float() / 255.0 28 | return rearrange(control, "f h w c -> f c h w") 29 | 30 | def inference(self, inputs): 31 | vid_path = inputs 32 | video, fps = prepare_video(vid_path, resolution=512, device="cpu") 33 | vid_canny = self.pre_process_canny(video) 34 | canny_to_save = list( 35 | rearrange(vid_canny, "f c w h -> f w h c").cpu().detach().numpy() 36 | ) 37 | out_path = get_new_video_name(vid_path, "edge") 38 | return create_video(canny_to_save, fps, out_path) 39 | 40 | 41 | class Video2Pose: 42 | def __init__(self, device, dtype=torch.float16): 43 | print("Initializing Video2Pose") 44 | self.device = device 45 | self.dtype = dtype 46 | self.detector = OpenposeDetector(device=device) 47 | 48 | def pre_process_pose(self, input_video, apply_pose_detect: bool = True): 49 | detected_maps = [] 50 | for frame in input_video: 51 | img = rearrange(frame, "c h w -> h w c").cpu().numpy().astype(np.uint8) 52 | img = HWC3(img) 53 | if apply_pose_detect: 54 | detected_map, _ = self.detector(img) 55 | else: 56 | detected_map = img 57 | detected_map = HWC3(detected_map) 58 | H, W, C = img.shape 59 | detected_map = cv2.resize( 60 | detected_map, (W, H), interpolation=cv2.INTER_NEAREST 61 | ) 62 | detected_maps.append(detected_map[None]) 63 | detected_maps = np.concatenate(detected_maps) 64 | control = torch.from_numpy(detected_maps.copy()).float() / 255.0 65 | return rearrange(control, "f h w c -> f c h w") 66 | 67 | def inference(self, inputs, resolution=512): 68 | vid_path = inputs 69 | video, fps = prepare_video( 70 | vid_path, resolution=resolution, device=self.device, normalize=False 71 | ) 72 | vid_pose = self.pre_process_pose(video) 73 | canny_to_save = list( 74 | rearrange(vid_pose, "f c w h -> f w h 
c").cpu().detach().numpy() 75 | ) 76 | out_path = get_new_video_name(vid_path, "pose") 77 | return create_video(canny_to_save, fps, out_path) 78 | 79 | 80 | class Video2Depth: 81 | def __init__(self, device, dtype=torch.float16): 82 | print("Initializing Video2Depth") 83 | self.device = device 84 | self.dtype = dtype 85 | self.depth_estimator = MidasDetector(device) 86 | 87 | def pre_process_depth(self, input_video, apply_depth_detect: bool = True): 88 | detected_maps = [] 89 | for frame in input_video: 90 | img = rearrange(frame, "c h w -> h w c").cpu().numpy().astype(np.uint8) 91 | img = HWC3(img) 92 | if apply_depth_detect: 93 | detected_map, _ = self.depth_estimator(img) 94 | else: 95 | detected_map = img 96 | detected_map = HWC3(detected_map) 97 | H, W, C = img.shape 98 | detected_map = cv2.resize( 99 | detected_map, (W, H), interpolation=cv2.INTER_NEAREST 100 | ) 101 | detected_maps.append(detected_map[None]) 102 | detected_maps = np.concatenate(detected_maps) 103 | control = torch.from_numpy(detected_maps.copy()).float() / 255.0 104 | return rearrange(control, "f h w c -> f c h w") 105 | 106 | def inference(self, inputs, resolution=512): 107 | vid_path = inputs 108 | video, fps = prepare_video( 109 | vid_path, 110 | resolution=resolution, 111 | device=self.device, 112 | dtype=self.dtype, 113 | normalize=False, 114 | ) 115 | control = self.pre_process_depth(video).to(self.device).to(self.dtype) 116 | 117 | depth_map_to_save = list( 118 | rearrange(control, "f c w h -> f w h c").cpu().detach().numpy() 119 | ) 120 | out_path = get_new_video_name(vid_path, "depth") 121 | return create_video(depth_map_to_save, fps, out_path) 122 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | # import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | from .hparams import hparams as hp 8 | 9 | 10 | def load_wav(path, sr): 11 | return librosa.core.load(path, sr=sr)[0] 12 | 13 | def save_wav(wav, path, sr): 14 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 15 | #proposed by @dsmiller 16 | wavfile.write(path, sr, wav.astype(np.int16)) 17 | 18 | def save_wavenet_wav(wav, path, sr): 19 | librosa.output.write_wav(path, wav, sr=sr) 20 | 21 | def preemphasis(wav, k, preemphasize=True): 22 | if preemphasize: 23 | return signal.lfilter([1, -k], [1], wav) 24 | return wav 25 | 26 | def inv_preemphasis(wav, k, inv_preemphasize=True): 27 | if inv_preemphasize: 28 | return signal.lfilter([1], [1, -k], wav) 29 | return wav 30 | 31 | def get_hop_size(): 32 | hop_size = hp.hop_size 33 | if hop_size is None: 34 | assert hp.frame_shift_ms is not None 35 | hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate) 36 | return hop_size 37 | 38 | def linearspectrogram(wav): 39 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 40 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 41 | 42 | if hp.signal_normalization: 43 | return _normalize(S) 44 | return S 45 | 46 | def melspectrogram(wav): 47 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 48 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db 49 | 50 | if hp.signal_normalization: 51 | return _normalize(S) 52 | return S 53 | 54 | def _lws_processor(): 55 | import lws 56 | return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech") 57 | 58 | def _stft(y): 59 | if 
hp.use_lws: 60 | return _lws_processor(hp).stft(y).T 61 | else: 62 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size) 63 | 64 | ########################################################## 65 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 66 | def num_frames(length, fsize, fshift): 67 | """Compute number of time frames of spectrogram 68 | """ 69 | pad = (fsize - fshift) 70 | if length % fshift == 0: 71 | M = (length + pad * 2 - fsize) // fshift + 1 72 | else: 73 | M = (length + pad * 2 - fsize) // fshift + 2 74 | return M 75 | 76 | 77 | def pad_lr(x, fsize, fshift): 78 | """Compute left and right padding 79 | """ 80 | M = num_frames(len(x), fsize, fshift) 81 | pad = (fsize - fshift) 82 | T = len(x) + 2 * pad 83 | r = (M - 1) * fshift + fsize - T 84 | return pad, pad + r 85 | ########################################################## 86 | #Librosa correct padding 87 | def librosa_pad_lr(x, fsize, fshift): 88 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 89 | 90 | # Conversions 91 | _mel_basis = None 92 | 93 | def _linear_to_mel(spectogram): 94 | global _mel_basis 95 | if _mel_basis is None: 96 | _mel_basis = _build_mel_basis() 97 | return np.dot(_mel_basis, spectogram) 98 | 99 | def _build_mel_basis(): 100 | assert hp.fmax <= hp.sample_rate // 2 101 | return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, 102 | fmin=hp.fmin, fmax=hp.fmax) 103 | 104 | def _amp_to_db(x): 105 | min_level = np.exp(hp.min_level_db / 20 * np.log(10)) 106 | return 20 * np.log10(np.maximum(min_level, x)) 107 | 108 | def _db_to_amp(x): 109 | return np.power(10.0, (x) * 0.05) 110 | 111 | def _normalize(S): 112 | if hp.allow_clipping_in_normalization: 113 | if hp.symmetric_mels: 114 | return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value, 115 | -hp.max_abs_value, hp.max_abs_value) 116 | else: 117 | return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value) 118 | 119 | assert S.max() <= 0 and S.min() - hp.min_level_db >= 0 120 | if hp.symmetric_mels: 121 | return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value 122 | else: 123 | return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)) 124 | 125 | def _denormalize(D): 126 | if hp.allow_clipping_in_normalization: 127 | if hp.symmetric_mels: 128 | return (((np.clip(D, -hp.max_abs_value, 129 | hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) 130 | + hp.min_level_db) 131 | else: 132 | return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 133 | 134 | if hp.symmetric_mels: 135 | return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db) 136 | else: 137 | return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 138 | -------------------------------------------------------------------------------- /modules/mplug/models/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns 
list of utf-8 byte and a corresponding list of unicode strings.
19 |     The reversible bpe codes work on unicode strings.
20 |     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21 |     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22 |     This is a significant percentage of your normal, say, 32K bpe vocab.
23 |     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24 |     And avoids mapping to whitespace/control characters the bpe code barfs on.
25 |     """
26 |     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27 |     cs = bs[:]
28 |     n = 0
29 |     for b in range(2**8):
30 |         if b not in bs:
31 |             bs.append(b)
32 |             cs.append(2**8+n)
33 |             n += 1
34 |     cs = [chr(n) for n in cs]
35 |     return dict(zip(bs, cs))
36 | 
37 | 
38 | def get_pairs(word):
39 |     """Return set of symbol pairs in a word.
40 |     Word is represented as tuple of symbols (symbols being variable-length strings).
41 |     """
42 |     pairs = set()
43 |     prev_char = word[0]
44 |     for char in word[1:]:
45 |         pairs.add((prev_char, char))
46 |         prev_char = char
47 |     return pairs
48 | 
49 | 
50 | def basic_clean(text):
51 |     text = ftfy.fix_text(text)
52 |     text = html.unescape(html.unescape(text))
53 |     return text.strip()
54 | 
55 | 
56 | def whitespace_clean(text):
57 |     text = re.sub(r'\s+', ' ', text)
58 |     text = text.strip()
59 |     return text
60 | 
61 | 
62 | class SimpleTokenizer(object):
63 |     def __init__(self, bpe_path: str = default_bpe()):
64 |         self.byte_encoder = bytes_to_unicode()
65 |         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66 |         merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67 |         merges = merges[1:49152-256-2+1]
68 |         merges = [tuple(merge.split()) for merge in merges]
69 |         vocab = list(bytes_to_unicode().values())
70 |         vocab = vocab + [v+'</w>' for v in vocab]
71 |         for merge in merges:
72 |             vocab.append(''.join(merge))
73 |         vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74 |         self.encoder = dict(zip(vocab, range(len(vocab))))
75 |         self.decoder = {v: k for k, v in self.encoder.items()}
76 |         self.bpe_ranks = dict(zip(merges, range(len(merges))))
77 |         self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78 |         self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79 | 
80 |     def bpe(self, token):
81 |         if token in self.cache:
82 |             return self.cache[token]
83 |         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
84 |         pairs = get_pairs(word)
85 | 
86 |         if not pairs:
87 |             return token+'</w>'
88 | 
89 |         while True:
90 |             bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
91 |             if bigram not in self.bpe_ranks:
92 |                 break
93 |             first, second = bigram
94 |             new_word = []
95 |             i = 0
96 |             while i < len(word):
97 |                 try:
98 |                     j = word.index(first, i)
99 |                     new_word.extend(word[i:j])
100 |                     i = j
101 |                 except:
102 |                     new_word.extend(word[i:])
103 |                     break
104 | 
105 |                 if word[i] == first and i < len(word)-1 and word[i+1] == second:
106 |                     new_word.append(first+second)
107 |                     i += 2
108 |                 else:
109 |                     new_word.append(word[i])
110 |                     i += 1
111 |             new_word = tuple(new_word)
112 |             word = new_word
113 |             if len(word) == 1:
114 |                 break
115 |             else:
116 |                 pairs = get_pairs(word)
117 |         word = ' '.join(word)
118 |         self.cache[token] = word
119 |         return word
120 | 
121 |     def encode(self, text):
122 |         bpe_tokens = []
123 |         text = whitespace_clean(basic_clean(text)).lower()
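        # Each regex-matched token is byte-encoded into the reversible unicode
        # alphabet from bytes_to_unicode(), split into BPE sub-words (whose final
        # symbol carries the "</w>" end-of-word marker), and the sub-words are
        # then looked up in self.encoder below.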
124 |         for token in re.findall(self.pat, text):
125 |             token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126 |             bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127 |         return bpe_tokens
128 | 
129 |     def decode(self, tokens):
130 |         text = ''.join([self.decoder[token] for token in tokens])
131 |         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132 |         return text
133 | 
--------------------------------------------------------------------------------
/modules/shap_e/shap_e/util/collections.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from typing import Any, Callable, Dict, List, Optional
3 | from typing import OrderedDict, Generic, TypeVar
4 | 
5 | K = TypeVar('K')
6 | V = TypeVar('V')
7 | 
8 | class AttrDict(OrderedDict[K, V], Generic[K, V]):
9 |     """
10 |     An attribute dictionary that automatically handles nested keys joined by "/".
11 | 
12 |     Originally copied from: https://stackoverflow.com/questions/3031219/recursively-access-dict-via-attributes-as-well-as-index-access
13 |     """
14 | 
15 |     MARKER = object()
16 | 
17 |     # pylint: disable=super-init-not-called
18 |     def __init__(self, *args, **kwargs):
19 |         if len(args) == 0:
20 |             for key, value in kwargs.items():
21 |                 self.__setitem__(key, value)
22 |         else:
23 |             assert len(args) == 1
24 |             assert isinstance(args[0], (dict, AttrDict))
25 |             for key, value in args[0].items():
26 |                 self.__setitem__(key, value)
27 | 
28 |     def __contains__(self, key):
29 |         if "/" in key:
30 |             keys = key.split("/")
31 |             key, next_key = keys[0], "/".join(keys[1:])
32 |             return key in self and next_key in self[key]
33 |         return super(AttrDict, self).__contains__(key)
34 | 
35 |     def __setitem__(self, key, value):
36 |         if "/" in key:
37 |             keys = key.split("/")
38 |             key, next_key = keys[0], "/".join(keys[1:])
39 |             if key not in self:
40 |                 self[key] = AttrDict()
41 |             self[key].__setitem__(next_key, value)
42 |             return
43 | 
44 |         if isinstance(value, dict) and not isinstance(value, AttrDict):
45 |             value = AttrDict(**value)
46 |         if isinstance(value, list):
47 |             value = [AttrDict(val) if isinstance(val, dict) else val for val in value]
48 |         super(AttrDict, self).__setitem__(key, value)
49 | 
50 |     def __getitem__(self, key):
51 |         if "/" in key:
52 |             keys = key.split("/")
53 |             key, next_key = keys[0], "/".join(keys[1:])
54 |             val = self[key]
55 |             if not isinstance(val, AttrDict):
56 |                 raise ValueError
57 |             return val.__getitem__(next_key)
58 | 
59 |         return self.get(key, None)
60 | 
61 |     def all_keys(
62 |         self,
63 |         leaves_only: bool = False,
64 |         parent: Optional[str] = None,
65 |     ) -> List[str]:
66 |         keys = []
67 |         for key in self.keys():
68 |             cur = key if parent is None else f"{parent}/{key}"
69 |             if not leaves_only or not isinstance(self[key], dict):
70 |                 keys.append(cur)
71 |             if isinstance(self[key], dict):
72 |                 keys.extend(self[key].all_keys(leaves_only=leaves_only, parent=cur))
73 |         return keys
74 | 
75 |     def dumpable(self, strip=True):
76 |         """
77 |         Casts into OrderedDict and removes internal attributes
78 |         """
79 | 
80 |         def _dump(val):
81 |             if isinstance(val, AttrDict):
82 |                 return val.dumpable()
83 |             elif isinstance(val, list):
84 |                 return [_dump(v) for v in val]
85 |             return val
86 | 
87 |         if strip:
88 |             return {k: _dump(v) for k, v in self.items() if not k.startswith("_")}
89 |         return {k: _dump(v if not k.startswith("_") else repr(v)) for k, v in self.items()}
90 | 
91 |     def map(
92 |         self,
93 | 
map_fn: Callable[[Any, Any], Any], 94 | should_map: Optional[Callable[[Any, Any], bool]] = None, 95 | ) -> "AttrDict": 96 | """ 97 | Creates a copy of self where some or all values are transformed by 98 | map_fn. 99 | 100 | :param should_map: If provided, only those values that evaluate to true 101 | are converted; otherwise, all values are mapped. 102 | """ 103 | 104 | def _apply(key, val): 105 | if isinstance(val, AttrDict): 106 | return val.map(map_fn, should_map) 107 | elif should_map is None or should_map(key, val): 108 | return map_fn(key, val) 109 | return val 110 | 111 | return AttrDict({k: _apply(k, v) for k, v in self.items()}) 112 | 113 | def __eq__(self, other): 114 | return self.keys() == other.keys() and all(self[k] == other[k] for k in self.keys()) 115 | 116 | def combine( 117 | self, 118 | other: Dict[str, Any], 119 | combine_fn: Callable[[Optional[Any], Optional[Any]], Any], 120 | ) -> "AttrDict": 121 | """ 122 | Some values may be missing, but the dictionary structures must be the 123 | same. 124 | 125 | :param combine_fn: a (possibly non-commutative) function to combine the 126 | values 127 | """ 128 | 129 | def _apply(val, other_val): 130 | if val is not None and isinstance(val, AttrDict): 131 | assert isinstance(other_val, AttrDict) 132 | return val.combine(other_val, combine_fn) 133 | return combine_fn(val, other_val) 134 | 135 | # TODO nit: this changes the ordering.. 136 | keys = self.keys() | other.keys() 137 | return AttrDict({k: _apply(self[k], other[k]) for k in keys}) 138 | 139 | __setattr__, __getattr__ = __setitem__, __getitem__ 140 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/render.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | import tempfile 5 | import zipfile 6 | 7 | import blobfile as bf 8 | import numpy as np 9 | from PIL import Image 10 | 11 | from ...rendering.mesh import TriMesh 12 | 13 | from .constants import BASIC_AMBIENT_COLOR, BASIC_DIFFUSE_COLOR, UNIFORM_LIGHT_DIRECTION 14 | 15 | SCRIPT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "blender_script.py") 16 | 17 | 18 | def render_model( 19 | model_path: str, 20 | output_path: str, 21 | num_images: int, 22 | backend: str = "BLENDER_EEVEE", 23 | light_mode: str = "random", 24 | camera_pose: str = "random", 25 | camera_dist_min: float = 2.0, 26 | camera_dist_max: float = 2.0, 27 | fast_mode: bool = False, 28 | extract_material: bool = False, 29 | delete_material: bool = False, 30 | verbose: bool = False, 31 | timeout: float = 15 * 60, 32 | ): 33 | with tempfile.TemporaryDirectory() as tmp_dir: 34 | tmp_in = model_path 35 | tmp_out = os.path.join(tmp_dir, "out") 36 | zip_out = tmp_out + ".zip" 37 | os.mkdir(tmp_out) 38 | args = [] 39 | if platform.system() == "Linux": 40 | # Needed to enable Eevee backend on headless linux. 
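            # xvfb-run starts a virtual framebuffer X server so Blender can create
            # an OpenGL context without a physical display; "-a" picks a free
            # display number automatically.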
41 | args = ["xvfb-run", "-a"] 42 | args.extend( 43 | [ 44 | _blender_binary_path(), 45 | "-b", 46 | "-P", 47 | SCRIPT_PATH, 48 | "--", 49 | "--input_path", 50 | tmp_in, 51 | "--output_path", 52 | tmp_out, 53 | "--num_images", 54 | str(num_images), 55 | "--backend", 56 | backend, 57 | "--light_mode", 58 | light_mode, 59 | "--camera_pose", 60 | camera_pose, 61 | "--camera_dist_min", 62 | str(camera_dist_min), 63 | "--camera_dist_max", 64 | str(camera_dist_max), 65 | "--uniform_light_direction", 66 | *[str(x) for x in UNIFORM_LIGHT_DIRECTION], 67 | "--basic_ambient", 68 | str(BASIC_AMBIENT_COLOR), 69 | "--basic_diffuse", 70 | str(BASIC_DIFFUSE_COLOR), 71 | ] 72 | ) 73 | if fast_mode: 74 | args.append("--fast_mode") 75 | if extract_material: 76 | args.append("--extract_material") 77 | if delete_material: 78 | args.append("--delete_material") 79 | if verbose: 80 | subprocess.check_call(args) 81 | else: 82 | try: 83 | output = subprocess.check_output(args, stderr=subprocess.STDOUT, timeout=timeout) 84 | except subprocess.CalledProcessError as exc: 85 | raise RuntimeError(f"{exc}: {exc.output}") from exc 86 | if not os.path.exists(os.path.join(tmp_out, "info.json")): 87 | if verbose: 88 | # There is no output available, since it was 89 | # logged directly to stdout/stderr. 90 | raise RuntimeError(f"render failed: output file missing") 91 | else: 92 | raise RuntimeError(f"render failed: output file missing. Output: {output}") 93 | _combine_rgba(tmp_out) 94 | with zipfile.ZipFile(zip_out, mode="w") as zf: 95 | for name in os.listdir(tmp_out): 96 | zf.write(os.path.join(tmp_out, name), name) 97 | bf.copy(zip_out, output_path, overwrite=True) 98 | 99 | 100 | def render_mesh( 101 | mesh: TriMesh, 102 | output_path: str, 103 | num_images: int, 104 | backend: str = "BLENDER_EEVEE", 105 | **kwargs, 106 | ): 107 | if mesh.has_vertex_colors() and backend not in ["BLENDER_EEVEE", "CYCLES"]: 108 | raise ValueError(f"backend does not support vertex colors: {backend}") 109 | 110 | with tempfile.TemporaryDirectory() as tmp_dir: 111 | ply_path = os.path.join(tmp_dir, "out.ply") 112 | with open(ply_path, "wb") as f: 113 | mesh.write_ply(f) 114 | render_model( 115 | ply_path, output_path=output_path, num_images=num_images, backend=backend, **kwargs 116 | ) 117 | 118 | 119 | def _combine_rgba(out_dir: str): 120 | i = 0 121 | while True: 122 | paths = [os.path.join(out_dir, f"{i:05}_{ch}.png") for ch in "rgba"] 123 | if not os.path.exists(paths[0]): 124 | break 125 | joined = np.stack( 126 | [(np.array(Image.open(path)) >> 8).astype(np.uint8) for path in paths], axis=-1 127 | ) 128 | Image.fromarray(joined).save(os.path.join(out_dir, f"{i:05}.png")) 129 | for path in paths: 130 | os.remove(path) 131 | i += 1 132 | 133 | 134 | def _blender_binary_path() -> str: 135 | path = os.getenv("BLENDER_PATH", None) 136 | if path is not None: 137 | return path 138 | 139 | if os.path.exists("/Applications/Blender.app/Contents/MacOS/Blender"): 140 | return "/Applications/Blender.app/Contents/MacOS/Blender" 141 | 142 | raise EnvironmentError( 143 | "To render 3D models, install Blender version 3.3.1 or higher and " 144 | "set the environment variable `BLENDER_PATH` to the path of the Blender executable." 
145 | ) 146 | -------------------------------------------------------------------------------- /modules/annotator/midas/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for monoDepth.""" 2 | import sys 3 | import re 4 | import numpy as np 5 | import cv2 6 | import torch 7 | 8 | 9 | def read_pfm(path): 10 | """Read pfm file. 11 | 12 | Args: 13 | path (str): path to file 14 | 15 | Returns: 16 | tuple: (data, scale) 17 | """ 18 | with open(path, "rb") as file: 19 | 20 | color = None 21 | width = None 22 | height = None 23 | scale = None 24 | endian = None 25 | 26 | header = file.readline().rstrip() 27 | if header.decode("ascii") == "PF": 28 | color = True 29 | elif header.decode("ascii") == "Pf": 30 | color = False 31 | else: 32 | raise Exception("Not a PFM file: " + path) 33 | 34 | dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) 35 | if dim_match: 36 | width, height = list(map(int, dim_match.groups())) 37 | else: 38 | raise Exception("Malformed PFM header.") 39 | 40 | scale = float(file.readline().decode("ascii").rstrip()) 41 | if scale < 0: 42 | # little-endian 43 | endian = "<" 44 | scale = -scale 45 | else: 46 | # big-endian 47 | endian = ">" 48 | 49 | data = np.fromfile(file, endian + "f") 50 | shape = (height, width, 3) if color else (height, width) 51 | 52 | data = np.reshape(data, shape) 53 | data = np.flipud(data) 54 | 55 | return data, scale 56 | 57 | 58 | def write_pfm(path, image, scale=1): 59 | """Write pfm file. 60 | 61 | Args: 62 | path (str): pathto file 63 | image (array): data 64 | scale (int, optional): Scale. Defaults to 1. 65 | """ 66 | 67 | with open(path, "wb") as file: 68 | color = None 69 | 70 | if image.dtype.name != "float32": 71 | raise Exception("Image dtype must be float32.") 72 | 73 | image = np.flipud(image) 74 | 75 | if len(image.shape) == 3 and image.shape[2] == 3: # color image 76 | color = True 77 | elif ( 78 | len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 79 | ): # greyscale 80 | color = False 81 | else: 82 | raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") 83 | 84 | file.write("PF\n" if color else "Pf\n".encode()) 85 | file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) 86 | 87 | endian = image.dtype.byteorder 88 | 89 | if endian == "<" or endian == "=" and sys.byteorder == "little": 90 | scale = -scale 91 | 92 | file.write("%f\n".encode() % scale) 93 | 94 | image.tofile(file) 95 | 96 | 97 | def read_image(path): 98 | """Read image and output RGB image (0-1). 99 | 100 | Args: 101 | path (str): path to file 102 | 103 | Returns: 104 | array: RGB image (0-1) 105 | """ 106 | img = cv2.imread(path) 107 | 108 | if img.ndim == 2: 109 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) 110 | 111 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 112 | 113 | return img 114 | 115 | 116 | def resize_image(img): 117 | """Resize image and make it fit for network. 
118 | 119 | Args: 120 | img (array): image 121 | 122 | Returns: 123 | tensor: data ready for network 124 | """ 125 | height_orig = img.shape[0] 126 | width_orig = img.shape[1] 127 | 128 | if width_orig > height_orig: 129 | scale = width_orig / 384 130 | else: 131 | scale = height_orig / 384 132 | 133 | height = (np.ceil(height_orig / scale / 32) * 32).astype(int) 134 | width = (np.ceil(width_orig / scale / 32) * 32).astype(int) 135 | 136 | img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) 137 | 138 | img_resized = ( 139 | torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() 140 | ) 141 | img_resized = img_resized.unsqueeze(0) 142 | 143 | return img_resized 144 | 145 | 146 | def resize_depth(depth, width, height): 147 | """Resize depth map and bring to CPU (numpy). 148 | 149 | Args: 150 | depth (tensor): depth 151 | width (int): image width 152 | height (int): image height 153 | 154 | Returns: 155 | array: processed depth 156 | """ 157 | depth = torch.squeeze(depth[0, :, :, :]).to("cpu") 158 | 159 | depth_resized = cv2.resize( 160 | depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC 161 | ) 162 | 163 | return depth_resized 164 | 165 | def write_depth(path, depth, bits=1): 166 | """Write depth map to pfm and png file. 167 | 168 | Args: 169 | path (str): filepath without extension 170 | depth (array): depth 171 | """ 172 | write_pfm(path + ".pfm", depth.astype(np.float32)) 173 | 174 | depth_min = depth.min() 175 | depth_max = depth.max() 176 | 177 | max_val = (2**(8*bits))-1 178 | 179 | if depth_max - depth_min > np.finfo("float").eps: 180 | out = max_val * (depth - depth_min) / (depth_max - depth_min) 181 | else: 182 | out = np.zeros(depth.shape, dtype=depth.type) 183 | 184 | if bits == 1: 185 | cv2.imwrite(path + ".png", out.astype("uint8")) 186 | elif bits == 2: 187 | cv2.imwrite(path + ".png", out.astype("uint16")) 188 | 189 | return 190 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/models/arcface_torch/backbones/mobilefacenet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Adapted from https://github.com/cavalleria/cavaface.pytorch/blob/master/backbone/mobilefacenet.py 3 | Original author cavalleria 4 | ''' 5 | 6 | import torch.nn as nn 7 | from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Sequential, Module 8 | import torch 9 | 10 | 11 | 12 | class Flatten(Module): 13 | def forward(self, x): 14 | return x.view(x.size(0), -1) 15 | 16 | 17 | class ConvBlock(Module): 18 | def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): 19 | super(ConvBlock, self).__init__() 20 | self.layers = nn.Sequential( 21 | Conv2d(in_c, out_c, kernel, groups=groups, stride=stride, padding=padding, bias=False), 22 | BatchNorm2d(num_features=out_c), 23 | PReLU(num_parameters=out_c) 24 | ) 25 | 26 | def forward(self, x): 27 | return self.layers(x) 28 | 29 | 30 | class LinearBlock(Module): 31 | def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): 32 | super(LinearBlock, self).__init__() 33 | self.layers = nn.Sequential( 34 | Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False), 35 | BatchNorm2d(num_features=out_c) 36 | ) 37 | 38 | def forward(self, x): 39 | return self.layers(x) 40 | 41 | 42 | class DepthWise(Module): 43 | def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1): 44 | 
super(DepthWise, self).__init__() 45 | self.residual = residual 46 | self.layers = nn.Sequential( 47 | ConvBlock(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)), 48 | ConvBlock(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride), 49 | LinearBlock(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) 50 | ) 51 | 52 | def forward(self, x): 53 | short_cut = None 54 | if self.residual: 55 | short_cut = x 56 | x = self.layers(x) 57 | if self.residual: 58 | output = short_cut + x 59 | else: 60 | output = x 61 | return output 62 | 63 | 64 | class Residual(Module): 65 | def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)): 66 | super(Residual, self).__init__() 67 | modules = [] 68 | for _ in range(num_block): 69 | modules.append(DepthWise(c, c, True, kernel, stride, padding, groups)) 70 | self.layers = Sequential(*modules) 71 | 72 | def forward(self, x): 73 | return self.layers(x) 74 | 75 | 76 | class GDC(Module): 77 | def __init__(self, embedding_size): 78 | super(GDC, self).__init__() 79 | self.layers = nn.Sequential( 80 | LinearBlock(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)), 81 | Flatten(), 82 | Linear(512, embedding_size, bias=False), 83 | BatchNorm1d(embedding_size)) 84 | 85 | def forward(self, x): 86 | return self.layers(x) 87 | 88 | 89 | class MobileFaceNet(Module): 90 | def __init__(self, fp16=False, num_features=512): 91 | super(MobileFaceNet, self).__init__() 92 | scale = 2 93 | self.fp16 = fp16 94 | self.layers = nn.Sequential( 95 | ConvBlock(3, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1)), 96 | ConvBlock(64 * scale, 64 * scale, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64), 97 | DepthWise(64 * scale, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128), 98 | Residual(64 * scale, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), 99 | DepthWise(64 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256), 100 | Residual(128 * scale, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), 101 | DepthWise(128 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512), 102 | Residual(128 * scale, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), 103 | ) 104 | self.conv_sep = ConvBlock(128 * scale, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0)) 105 | self.features = GDC(num_features) 106 | self._initialize_weights() 107 | 108 | def _initialize_weights(self): 109 | for m in self.modules(): 110 | if isinstance(m, nn.Conv2d): 111 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 112 | if m.bias is not None: 113 | m.bias.data.zero_() 114 | elif isinstance(m, nn.BatchNorm2d): 115 | m.weight.data.fill_(1) 116 | m.bias.data.zero_() 117 | elif isinstance(m, nn.Linear): 118 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 119 | if m.bias is not None: 120 | m.bias.data.zero_() 121 | 122 | def forward(self, x): 123 | with torch.cuda.amp.autocast(self.fp16): 124 | x = self.layers(x) 125 | x = self.conv_sep(x.float() if self.fp16 else x) 126 | x = self.features(x) 127 | return x 128 | 129 | 130 | def get_mbf(fp16, num_features): 131 | return MobileFaceNet(fp16, num_features) -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/cast.py: -------------------------------------------------------------------------------- 1 | from typing 
import Iterator, Optional, Tuple 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from ...rendering.view_data import ProjectiveCamera 7 | 8 | from ._utils import cross_product 9 | from .types import RayCollisions, Rays, TriMesh 10 | 11 | 12 | def cast_camera( 13 | camera: ProjectiveCamera, 14 | mesh: TriMesh, 15 | ray_batch_size: Optional[int] = None, 16 | checkpoint: Optional[bool] = None, 17 | ) -> Iterator[RayCollisions]: 18 | pixel_indices = np.arange(camera.width * camera.height) 19 | image_coords = np.stack([pixel_indices % camera.width, pixel_indices // camera.width], axis=1) 20 | rays = camera.camera_rays(image_coords) 21 | batch_size = ray_batch_size or len(rays) 22 | checkpoint = checkpoint if checkpoint is not None else batch_size < len(rays) 23 | for i in range(0, len(rays), batch_size): 24 | sub_rays = rays[i : i + batch_size] 25 | origins = torch.from_numpy(sub_rays[:, 0]).to(mesh.vertices) 26 | directions = torch.from_numpy(sub_rays[:, 1]).to(mesh.vertices) 27 | yield cast_rays(Rays(origins=origins, directions=directions), mesh, checkpoint=checkpoint) 28 | 29 | 30 | def cast_rays(rays: Rays, mesh: TriMesh, checkpoint: bool = False) -> RayCollisions: 31 | """ 32 | Cast a batch of rays onto a mesh. 33 | """ 34 | if checkpoint: 35 | collides, ray_dists, tri_indices, barycentric, normals = RayCollisionFunction.apply( 36 | rays.origins, rays.directions, mesh.faces, mesh.vertices 37 | ) 38 | return RayCollisions( 39 | collides=collides, 40 | ray_dists=ray_dists, 41 | tri_indices=tri_indices, 42 | barycentric=barycentric, 43 | normals=normals, 44 | ) 45 | 46 | # https://github.com/unixpickle/vae-textures/blob/2968549ddd4a3487f9437d4db00793324453cd59/vae_textures/render.py#L98 47 | normals = mesh.normals() # [N x 3] 48 | directions = rays.directions # [M x 3] 49 | collides = (directions @ normals.T).abs() > 1e-8 # [N x M] 50 | 51 | tris = mesh.vertices[mesh.faces] # [N x 3 x 3] 52 | v1 = tris[:, 1] - tris[:, 0] 53 | v2 = tris[:, 2] - tris[:, 0] 54 | 55 | cross1 = cross_product(directions[:, None], v2[None]) # [N x M x 3] 56 | det = torch.sum(cross1 * v1[None], dim=-1) # [N x M] 57 | collides = torch.logical_and(collides, det.abs() > 1e-8) 58 | 59 | invDet = 1 / det # [N x M] 60 | o = rays.origins[:, None] - tris[None, :, 0] # [N x M x 3] 61 | bary1 = invDet * torch.sum(o * cross1, dim=-1) # [N x M] 62 | collides = torch.logical_and(collides, torch.logical_and(bary1 >= 0, bary1 <= 1)) 63 | 64 | cross2 = cross_product(o, v1[None]) # [N x M x 3] 65 | bary2 = invDet * torch.sum(directions[:, None] * cross2, dim=-1) # [N x M] 66 | collides = torch.logical_and(collides, torch.logical_and(bary2 >= 0, bary2 <= 1)) 67 | 68 | bary0 = 1 - (bary1 + bary2) 69 | 70 | # Make sure this is in the positive part of the ray. 
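    # "scale" is the ray parameter t of this Moller-Trumbore style test: the hit
    # point is origin + t * direction, so t must be positive for the triangle to
    # lie in front of the ray origin rather than behind it.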
71 |     scale = invDet * torch.sum(v2 * cross2, dim=-1)
72 |     collides = torch.logical_and(collides, scale > 0)
73 | 
74 |     # Select the nearest collision
75 |     ray_dists, tri_indices = torch.min(
76 |         torch.where(collides, scale, torch.tensor(torch.inf).to(scale)), dim=-1
77 |     ) # [N]
78 |     nearest_bary = torch.stack(
79 |         [
80 |             bary0[range(len(tri_indices)), tri_indices],
81 |             bary1[range(len(tri_indices)), tri_indices],
82 |             bary2[range(len(tri_indices)), tri_indices],
83 |         ],
84 |         dim=-1,
85 |     )
86 | 
87 |     return RayCollisions(
88 |         collides=torch.any(collides, dim=-1),
89 |         ray_dists=ray_dists,
90 |         tri_indices=tri_indices,
91 |         barycentric=nearest_bary,
92 |         normals=normals[tri_indices],
93 |     )
94 | 
95 | 
96 | class RayCollisionFunction(torch.autograd.Function):
97 |     @staticmethod
98 |     def forward(
99 |         ctx, origins, directions, faces, vertices
100 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
101 |         ctx.save_for_backward(origins, directions, faces, vertices)
102 |         with torch.no_grad():
103 |             res = cast_rays(
104 |                 Rays(origins=origins, directions=directions),
105 |                 TriMesh(faces=faces, vertices=vertices),
106 |                 checkpoint=False,
107 |             )
108 |         return (res.collides, res.ray_dists, res.tri_indices, res.barycentric, res.normals)
109 | 
110 |     @staticmethod
111 |     def backward(
112 |         ctx, _collides_grad, ray_dists_grad, _tri_indices_grad, barycentric_grad, normals_grad
113 |     ):
114 |         origins, directions, faces, vertices = ctx.saved_tensors  # stored via ctx.save_for_backward() in forward()
115 | 
116 |         origins = origins.detach().requires_grad_(True)
117 |         directions = directions.detach().requires_grad_(True)
118 |         vertices = vertices.detach().requires_grad_(True)
119 | 
120 |         with torch.enable_grad():
121 |             outputs = cast_rays(
122 |                 Rays(origins=origins, directions=directions),
123 |                 TriMesh(faces=faces, vertices=vertices),
124 |                 checkpoint=False,
125 |             )
126 | 
127 |         origins_grad, directions_grad, vertices_grad = torch.autograd.grad(
128 |             (outputs.ray_dists, outputs.barycentric, outputs.normals),
129 |             (origins, directions, vertices),
130 |             (ray_dists_grad, barycentric_grad, normals_grad),
131 |         )
132 |         return (origins_grad, directions_grad, None, vertices_grad)
133 | 
--------------------------------------------------------------------------------
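A minimal usage sketch for the ray-casting helpers in cast.py above. The import paths and the toy triangle are illustrative assumptions rather than code from the repository; only Rays, TriMesh, cast_rays, and the RayCollisions fields used here appear in the source.

import torch

# Assumed import paths; adjust to however this package is installed.
from modules.shap_e.shap_e.rendering.raycast.types import Rays, TriMesh
from modules.shap_e.shap_e.rendering.raycast.cast import cast_rays

# One triangle lying in the z = 1 plane.
vertices = torch.tensor([[0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 1.0, 1.0]])
faces = torch.tensor([[0, 1, 2]], dtype=torch.long)
mesh = TriMesh(faces=faces, vertices=vertices)

# A single ray aimed at the triangle's interior, travelling along +z.
rays = Rays(
    origins=torch.tensor([[0.25, 0.25, 0.0]]),
    directions=torch.tensor([[0.0, 0.0, 1.0]]),
)

hits = cast_rays(rays, mesh)
# Expect one hit at distance ~1.0 with barycentric weights summing to 1.
print(hits.collides, hits.ray_dists, hits.barycentric)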