├── modules
├── __init__.py
├── shap_e
│ ├── shap_e
│ │ ├── __init__.py
│ │ ├── util
│ │ │ ├── __init__.py
│ │ │ ├── io.py
│ │ │ ├── notebooks.py
│ │ │ └── collections.py
│ │ ├── diffusion
│ │ │ ├── __init__.py
│ │ │ └── sample.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── nerf
│ │ │ │ └── __init__.py
│ │ │ ├── stf
│ │ │ │ ├── __init__.py
│ │ │ │ └── base.py
│ │ │ ├── generation
│ │ │ │ ├── __init__.py
│ │ │ │ ├── util.py
│ │ │ │ ├── latent_diffusion.py
│ │ │ │ └── pooled_mlp.py
│ │ │ ├── transmitter
│ │ │ │ ├── __init__.py
│ │ │ │ └── bottleneck.py
│ │ │ ├── nn
│ │ │ │ ├── __init__.py
│ │ │ │ ├── utils.py
│ │ │ │ └── checkpoint.py
│ │ │ └── query.py
│ │ └── rendering
│ │ │ ├── __init__.py
│ │ │ ├── raycast
│ │ │ ├── __init__.py
│ │ │ ├── _utils.py
│ │ │ ├── render.py
│ │ │ ├── types.py
│ │ │ └── cast.py
│ │ │ ├── blender
│ │ │ ├── constants.py
│ │ │ ├── __init__.py
│ │ │ ├── view_data.py
│ │ │ └── render.py
│ │ │ ├── torch_mesh.py
│ │ │ ├── ply_util.py
│ │ │ └── mesh.py
│ └── __init__.py
├── annotator
│ ├── midas
│ │ ├── midas
│ │ │ ├── __init__.py
│ │ │ ├── base_model.py
│ │ │ ├── midas_net.py
│ │ │ └── dpt_depth.py
│ │ ├── __init__.py
│ │ └── utils.py
│ ├── util.py
│ ├── openpose
│ │ ├── __init__.py
│ │ └── hand.py
│ └── __init__.py
├── mplug
│ ├── models
│ │ ├── clip
│ │ │ ├── __init__.py
│ │ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ │ └── simple_tokenizer.py
│ │ └── visual_transformers.py
│ ├── ckpts
│ │ └── bert-base-uncased
│ │ │ └── config.json
│ ├── __init__.py
│ └── get_video_caption.py
├── video_moviepy
│ └── font
│ │ └── cn.ttf
├── sadtalker
│ ├── src
│ │ ├── config
│ │ │ ├── similarity_Lm3D_all.mat
│ │ │ ├── facerender.yaml
│ │ │ ├── facerender_still.yaml
│ │ │ ├── auido2pose.yaml
│ │ │ └── auido2exp.yaml
│ │ ├── face3d
│ │ │ ├── util
│ │ │ │ ├── __init__.py
│ │ │ │ ├── preprocess.py
│ │ │ │ └── load_mats.py
│ │ │ └── models
│ │ │ │ ├── arcface_torch
│ │ │ │ └── backbones
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── mobilefacenet.py
│ │ │ │ └── __init__.py
│ │ ├── utils
│ │ │ ├── safetensor_helper.py
│ │ │ ├── text2speech.py
│ │ │ ├── videoio.py
│ │ │ ├── paste_pic.py
│ │ │ ├── init_path.py
│ │ │ ├── face_enhancer.py
│ │ │ └── audio.py
│ │ ├── facerender
│ │ │ ├── sync_batchnorm
│ │ │ │ ├── __init__.py
│ │ │ │ ├── unittest.py
│ │ │ │ ├── replicate.py
│ │ │ │ └── comm.py
│ │ │ └── modules
│ │ │ │ ├── mapping.py
│ │ │ │ └── discriminator.py
│ │ ├── audio2exp_models
│ │ │ ├── audio2exp.py
│ │ │ └── networks.py
│ │ ├── audio2pose_models
│ │ │ ├── res_unet.py
│ │ │ ├── discriminator.py
│ │ │ ├── audio_encoder.py
│ │ │ ├── audio2pose.py
│ │ │ └── networks.py
│ │ └── generate_batch.py
│ ├── __init__.py
│ └── inference.py
├── stable_diffusion
│ └── __init__.py
├── bark
│ └── __init__.py
├── bark_voice_clone
│ └── __init__.py
├── blip
│ └── __init__.py
├── modelscope_t2v
│ └── __init__.py
└── text2video_zero
│ ├── __init__.py
│ └── utils.py
├── .gitignore
├── test.py
├── model_zoo
└── mplug
│ ├── videocap_vatex_mplug_large.yaml
│ └── config_bert.json
├── requirements.txt
├── LICENSE
├── utils.py
└── video_utils.py
/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/util/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/annotator/midas/midas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nerf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/stf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/transmitter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/mplug/models/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .meta import * 2 | from .ops import * 3 | -------------------------------------------------------------------------------- /modules/video_moviepy/font/cn.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaleido-lab/dolphin/HEAD/modules/video_moviepy/font/cn.ttf -------------------------------------------------------------------------------- /modules/mplug/models/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaleido-lab/dolphin/HEAD/modules/mplug/models/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /modules/sadtalker/src/config/similarity_Lm3D_all.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaleido-lab/dolphin/HEAD/modules/sadtalker/src/config/similarity_Lm3D_all.mat -------------------------------------------------------------------------------- 
/modules/sadtalker/src/face3d/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes a miscellaneous collection of useful helper functions.""" 2 | from .util import * 3 | 4 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/constants.py: -------------------------------------------------------------------------------- 1 | UNIFORM_LIGHT_DIRECTION = [0.09387503, -0.63953443, -0.7630093] 2 | BASIC_AMBIENT_COLOR = 0.3 3 | BASIC_DIFFUSE_COLOR = 0.7 4 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/__init__.py: -------------------------------------------------------------------------------- 1 | from .render import render_mesh, render_model 2 | from .view_data import BlenderViewData 3 | 4 | __all__ = ["BlenderViewData", "render_model"] 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store 3 | *.iml 4 | *.xml 5 | 6 | *.pth 7 | *.mp4 8 | *.tar 9 | *.pt 10 | *.bin 11 | *.ckpt 12 | *.safetensors 13 | image 14 | video 15 | logs/ 16 | modules/bark_voice_clone/pretrain_work_dir 17 | *.wav -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from modules.sadtalker import Sadtalker 2 | import torch 3 | 4 | if torch.cuda.is_available(): 5 | device = "cuda" 6 | else: 7 | device = "cpu" 8 | 9 | sd = Sadtalker(device) 10 | sd.inference("audio/ac9fc7da.wav,image/test.png") 11 | 12 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/safetensor_helper.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def load_x_from_safetensor(checkpoint, key): 4 | x_generator = {} 5 | for k,v in checkpoint.items(): 6 | if key in k: 7 | x_generator[k.replace(key+'.', '')] = v 8 | return x_generator -------------------------------------------------------------------------------- /model_zoo/mplug/videocap_vatex_mplug_large.yaml: -------------------------------------------------------------------------------- 1 | bert_config: './model_zoo/mplug/config_bert.json' 2 | image_res: 224 3 | vision_width: 1024 4 | distill: True 5 | clip_name: "ViT-L-14" 6 | k_test: 128 7 | eos: '[SEP]' 8 | bos: '[CLS]' 9 | prompt: 'a video of' 10 | use_checkpoint: true 11 | num_frm_test: 8 12 | min_length: 10 13 | max_length: 20 14 | beam_size: 3 15 | text_encoder: 'bert-base-uncased' 16 | text_decoder: 'bert-base-uncased' -------------------------------------------------------------------------------- /modules/annotator/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 
7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/sync_batchnorm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : __init__.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d 12 | from .replicate import DataParallelWithCallback, patch_replication_callback 13 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def normalize(v: torch.Tensor) -> torch.Tensor: 5 | return v / torch.linalg.norm(v, dim=-1, keepdim=True) 6 | 7 | 8 | def cross_product(v1: torch.Tensor, v2: torch.Tensor) -> torch.Tensor: 9 | return torch.stack( 10 | [ 11 | v1[..., 1] * v2[..., 2] - v2[..., 1] * v1[..., 2], 12 | -(v1[..., 0] * v2[..., 2] - v2[..., 0] * v1[..., 2]), 13 | v1[..., 0] * v2[..., 1] - v2[..., 0] * v1[..., 1], 14 | ], 15 | dim=-1, 16 | ) 17 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/text2speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from TTS.api import TTS 4 | 5 | 6 | class TTSTalker(): 7 | def __init__(self) -> None: 8 | model_name = TTS.list_models()[0] 9 | self.tts = TTS(model_name) 10 | 11 | def test(self, text, language='en'): 12 | 13 | tempf = tempfile.NamedTemporaryFile( 14 | delete = False, 15 | suffix = ('.'+'wav'), 16 | ) 17 | 18 | self.tts.tts_to_file(text, speaker=self.tts.speakers[0], language=language, file_path=tempf.name) 19 | 20 | return tempf.name -------------------------------------------------------------------------------- /modules/stable_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline, PNDMScheduler 2 | from utils import generate_image_name 3 | 4 | class Text2Image: 5 | def __init__(self): 6 | self.pndm = PNDMScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler") 7 | self.pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=self.pndm) 8 | 9 | def image_generation(self, text): 10 | image = self.pipeline(text).images[0] 11 | image_url = generate_image_name() 12 | image.save(image_url) 13 | return image_url 14 | 15 | -------------------------------------------------------------------------------- /modules/mplug/ckpts/bert-base-uncased/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "gradient_checkpointing": false, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | 
"intermediate_size": 3072, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 12, 16 | "num_hidden_layers": 12, 17 | "pad_token_id": 0, 18 | "position_embedding_type": "absolute", 19 | "transformers_version": "4.6.0.dev0", 20 | "type_vocab_size": 2, 21 | "use_cache": true, 22 | "vocab_size": 30522 23 | } 24 | -------------------------------------------------------------------------------- /model_zoo/mplug/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": false, 21 | "use_cache":false, 22 | "gradient_checkpointing": true, 23 | "text_encoder_layers": 6, 24 | "fusion_layers": 6, 25 | "text_decode_layers": 12, 26 | "stride_layer": 6 27 | } 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu113 2 | accelerate==0.17.1 3 | basicsr==1.4.2 4 | decord==0.6.0 5 | diffusers==0.16.1 6 | einops==0.3.0 7 | ftfy 8 | gradio==3.20.1 9 | imageio==2.19.3 10 | kornia==0.6.8 11 | langchain==0.0.101 12 | modelscope==1.4.2 13 | moviepy==1.0.3 14 | omegaconf==2.3.0 15 | openai 16 | opencv-python 17 | open_clip_torch 18 | oss2 19 | pandas==2.0.0 20 | Pillow==9.5.0 21 | PyYAML==6.0 22 | ruamel_yaml 23 | timm==0.4.9 24 | tomesd 25 | torch==1.12.1+cu113 26 | torchaudio==0.12.1+cu113 27 | torchvision==0.13.1+cu113 28 | tqdm==4.65.0 29 | transformers==4.28.0 30 | git+https://github.com/suno-ai/bark.git 31 | 32 | # SadTalker 33 | ffmpeg 34 | av 35 | numpy==1.23.4 36 | face_alignment==1.3.5 37 | imageio-ffmpeg==0.4.7 38 | librosa==0.9.2 39 | numba 40 | resampy==0.3.1 41 | pydub==0.25.1 42 | scipy==1.10.1 43 | yacs==0.1.8 44 | pyyaml 45 | joblib==1.1.0 46 | scikit-image==0.19.3 47 | facexlib==0.3.0 48 | gfpgan 49 | safetensors 50 | -------------------------------------------------------------------------------- /modules/mplug/__init__.py: -------------------------------------------------------------------------------- 1 | from .get_video_caption import prepare_model, pipeline 2 | 3 | 4 | mplug_model_zoo = "model_zoo/mplug" 5 | 6 | 7 | class VideoCaptioning: 8 | def __init__(self, device): 9 | print("Initializing mPLUG for VideoCaptioning") 10 | self.download_models() 11 | self.device = device 12 | self.model, self.tokenizer = prepare_model(device) 13 | self.pipe = pipeline 14 | 15 | def inference(self, inputs): 16 | return pipeline(inputs, self.model, self.tokenizer, self.device) 17 | 18 | def download_models(self): 19 | model_list = [ 20 | "https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/ViT-L-14.tar", 21 | "https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/mplug_large.pth", 22 | ] 23 | for url in model_list: 24 | from basicsr.utils.download_util import load_file_from_url 25 | 26 | load_file_from_url(url, model_dir=mplug_model_zoo) 27 | 
-------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/models/arcface_torch/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .iresnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200 2 | from .mobilefacenet import get_mbf 3 | 4 | 5 | 6 | def get_model(name, **kwargs): 7 | # resnet 8 | if name == "r18": 9 | return iresnet18(False, **kwargs) 10 | elif name == "r34": 11 | return iresnet34(False, **kwargs) 12 | elif name == "r50": 13 | return iresnet50(False, **kwargs) 14 | elif name == "r100": 15 | return iresnet100(False, **kwargs) 16 | elif name == "r200": 17 | return iresnet200(False, **kwargs) 18 | elif name == "r2060": 19 | from .iresnet2060 import iresnet2060 20 | return iresnet2060(False, **kwargs) 21 | elif name == "mbf": 22 | fp16 = kwargs.get("fp16", False) 23 | num_features = kwargs.get("num_features", 512) 24 | return get_mbf(fp16=fp16, num_features=num_features) 25 | else: 26 | raise ValueError() -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/util.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def timestep_embedding(timesteps, dim, max_period=10000): 7 | """ 8 | Create sinusoidal timestep embeddings. 9 | :param timesteps: a 1-D Tensor of N indices, one per batch element. 10 | These may be fractional. 11 | :param dim: the dimension of the output. 12 | :param max_period: controls the minimum frequency of the embeddings. 13 | :return: an [N x dim] Tensor of positional embeddings. 14 | """ 15 | half = dim // 2 16 | freqs = torch.exp( 17 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 18 | ).to(device=timesteps.device) 19 | args = timesteps[:, None].to(timesteps.dtype) * freqs[None] 20 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 21 | if dim % 2: 22 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 23 | return embedding 24 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/sync_batchnorm/unittest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : unittest.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 
10 | 11 | import unittest 12 | 13 | import numpy as np 14 | from torch.autograd import Variable 15 | 16 | 17 | def as_numpy(v): 18 | if isinstance(v, Variable): 19 | v = v.data 20 | return v.cpu().numpy() 21 | 22 | 23 | class TorchTestCase(unittest.TestCase): 24 | def assertTensorClose(self, a, b, atol=1e-3, rtol=1e-3): 25 | npa, npb = as_numpy(a), as_numpy(b) 26 | self.assertTrue( 27 | np.allclose(npa, npb, atol=atol), 28 | 'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max()) 29 | ) 30 | -------------------------------------------------------------------------------- /modules/bark/__init__.py: -------------------------------------------------------------------------------- 1 | from scipy.io.wavfile import write as write_wav 2 | from bark import SAMPLE_RATE, generate_audio, preload_models 3 | 4 | from utils import generate_audio_name 5 | 6 | 7 | class Text2Audio: 8 | def __init__(self, **kwargs): 9 | print("Initializing Bark for Text2Audio") 10 | # download and load all models 11 | print("Loading bark models for text2audio...") 12 | preload_models() 13 | 14 | def text2audio(self, inputs): 15 | # generate audio from text 16 | text = inputs 17 | audio_array = generate_audio(text) 18 | audio_path = generate_audio_name() 19 | write_wav(audio_path, SAMPLE_RATE, audio_array) 20 | return audio_path 21 | 22 | def text2music(self, inputs): 23 | # generate music from text 24 | text = "♪ " + inputs + " ♪" 25 | audio_array = generate_audio(text) 26 | audio_path = generate_audio_name() 27 | write_wav(audio_path, SAMPLE_RATE, audio_array) 28 | return audio_path 29 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/query.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Callable, Optional 3 | 4 | import torch 5 | 6 | 7 | @dataclass 8 | class Query: 9 | # Both of these are of shape [batch_size x ... 
x 3] 10 | position: torch.Tensor 11 | direction: Optional[torch.Tensor] = None 12 | 13 | t_min: Optional[torch.Tensor] = None 14 | t_max: Optional[torch.Tensor] = None 15 | 16 | def copy(self) -> "Query": 17 | return Query( 18 | position=self.position, 19 | direction=self.direction, 20 | t_min=self.t_min, 21 | t_max=self.t_max, 22 | ) 23 | 24 | def map_tensors(self, f: Callable[[torch.Tensor], torch.Tensor]) -> "Query": 25 | return Query( 26 | position=f(self.position), 27 | direction=f(self.direction) if self.direction is not None else None, 28 | t_min=f(self.t_min) if self.t_min is not None else None, 29 | t_max=f(self.t_max) if self.t_max is not None else None, 30 | ) 31 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/util/io.py: -------------------------------------------------------------------------------- 1 | import io 2 | from contextlib import contextmanager 3 | from typing import Any, BinaryIO, Iterator, Union 4 | 5 | import blobfile as bf 6 | import yaml 7 | 8 | from ..util.collections import AttrDict 9 | 10 | 11 | def read_config(path_or_file: Union[str, io.IOBase]) -> Any: 12 | if isinstance(path_or_file, io.IOBase): 13 | obj = yaml.load(path_or_file, Loader=yaml.SafeLoader) 14 | else: 15 | with bf.BlobFile(path_or_file, "rb") as f: 16 | try: 17 | obj = yaml.load(f, Loader=yaml.SafeLoader) 18 | except Exception as exc: 19 | with bf.BlobFile(path_or_file, "rb") as f: 20 | print(f.read()) 21 | raise exc 22 | if isinstance(obj, dict): 23 | return AttrDict(obj) 24 | return obj 25 | 26 | 27 | @contextmanager 28 | def buffered_writer(raw_f: BinaryIO) -> Iterator[io.BufferedIOBase]: 29 | if isinstance(raw_f, io.BufferedIOBase): 30 | yield raw_f 31 | else: 32 | f = io.BufferedWriter(raw_f) 33 | yield f 34 | f.flush() 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BUAA-PrismGroup 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
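Stepping back to shap_e/util/io.py above, a short sketch of how read_config is typically called; the YAML path here is hypothetical, and dict-valued YAML comes back wrapped in the project's AttrDict:

from modules.shap_e.shap_e.util.io import read_config

cfg = read_config("model_zoo/shap_e/example_config.yaml")  # hypothetical path
print(list(cfg.keys()))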
-------------------------------------------------------------------------------- /modules/annotator/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import os 4 | 5 | 6 | annotator_ckpts_path = "model_zoo/annotator" 7 | 8 | 9 | def HWC3(x): 10 | assert x.dtype == np.uint8 11 | if x.ndim == 2: 12 | x = x[:, :, None] 13 | assert x.ndim == 3 14 | H, W, C = x.shape 15 | assert C == 1 or C == 3 or C == 4 16 | if C == 3: 17 | return x 18 | if C == 1: 19 | return np.concatenate([x, x, x], axis=2) 20 | if C == 4: 21 | color = x[:, :, 0:3].astype(np.float32) 22 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 23 | y = color * alpha + 255.0 * (1.0 - alpha) 24 | y = y.clip(0, 255).astype(np.uint8) 25 | return y 26 | 27 | 28 | def resize_image(input_image, resolution): 29 | H, W, C = input_image.shape 30 | H = float(H) 31 | W = float(W) 32 | k = float(resolution) / min(H, W) 33 | H *= k 34 | W *= k 35 | H = int(np.round(H / 64.0)) * 64 36 | W = int(np.round(W / 64.0)) * 64 37 | img = cv2.resize( 38 | input_image, 39 | (W, H), 40 | interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, 41 | ) 42 | return img 43 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/latent_diffusion.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class SplitVectorDiffusion(nn.Module): 8 | def __init__(self, *, device: torch.device, wrapped: nn.Module, n_ctx: int, d_latent: int): 9 | super().__init__() 10 | self.device = device 11 | self.n_ctx = n_ctx 12 | self.d_latent = d_latent 13 | self.wrapped = wrapped 14 | 15 | if hasattr(self.wrapped, "cached_model_kwargs"): 16 | self.cached_model_kwargs = self.wrapped.cached_model_kwargs 17 | 18 | def forward(self, x: torch.Tensor, t: torch.Tensor, **kwargs): 19 | h = x.reshape(x.shape[0], self.n_ctx, -1).permute(0, 2, 1) 20 | pre_channels = h.shape[1] 21 | h = self.wrapped(h, t, **kwargs) 22 | assert ( 23 | h.shape[1] == pre_channels * 2 24 | ), "expected twice as many outputs for variance prediction" 25 | eps, var = torch.chunk(h, 2, dim=1) 26 | return torch.cat( 27 | [ 28 | eps.permute(0, 2, 1).flatten(1), 29 | var.permute(0, 2, 1).flatten(1), 30 | ], 31 | dim=1, 32 | ) 33 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nn/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | ArrayType = Union[np.ndarray, Iterable[int], torch.Tensor] 7 | 8 | 9 | def to_torch(arr: ArrayType, dtype=torch.float): 10 | if isinstance(arr, torch.Tensor): 11 | return arr 12 | return torch.from_numpy(np.array(arr)).to(dtype) 13 | 14 | 15 | def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: 16 | """ 17 | Sample from the given discrete probability distribution with replacement. 18 | 19 | The i-th bin is assumed to have mass pmf[i]. 
20 | 21 | :param pmf: [batch_size, *shape, n_samples, 1] where (pmf.sum(dim=-2) == 1).all() 22 | :param n_samples: number of samples 23 | 24 | :return: indices sampled with replacement 25 | """ 26 | 27 | *shape, support_size, last_dim = pmf.shape 28 | assert last_dim == 1 29 | 30 | cdf = torch.cumsum(pmf.view(-1, support_size), dim=1) 31 | inds = torch.searchsorted(cdf, torch.rand(cdf.shape[0], n_samples, device=cdf.device)) 32 | 33 | return inds.view(*shape, n_samples, 1).clamp(0, support_size - 1) 34 | 35 | 36 | def safe_divide(a, b, epsilon=1e-6): 37 | return a / torch.where(b < 0, b - epsilon, b + epsilon) 38 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/facerender.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | common_params: 3 | num_kp: 15 4 | image_channel: 3 5 | feature_channel: 32 6 | estimate_jacobian: False # True 7 | kp_detector_params: 8 | temperature: 0.1 9 | block_expansion: 32 10 | max_features: 1024 11 | scale_factor: 0.25 # 0.25 12 | num_blocks: 5 13 | reshape_channel: 16384 # 16384 = 1024 * 16 14 | reshape_depth: 16 15 | he_estimator_params: 16 | block_expansion: 64 17 | max_features: 2048 18 | num_bins: 66 19 | generator_params: 20 | block_expansion: 64 21 | max_features: 512 22 | num_down_blocks: 2 23 | reshape_channel: 32 24 | reshape_depth: 16 # 512 = 32 * 16 25 | num_resblocks: 6 26 | estimate_occlusion_map: True 27 | dense_motion_params: 28 | block_expansion: 32 29 | max_features: 1024 30 | num_blocks: 5 31 | reshape_depth: 16 32 | compress: 4 33 | discriminator_params: 34 | scales: [1] 35 | block_expansion: 32 36 | max_features: 512 37 | num_blocks: 4 38 | sn: True 39 | mapping_params: 40 | coeff_nc: 70 41 | descriptor_nc: 1024 42 | layer: 3 43 | num_kp: 15 44 | num_bins: 66 45 | 46 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/facerender_still.yaml: -------------------------------------------------------------------------------- 1 | model_params: 2 | common_params: 3 | num_kp: 15 4 | image_channel: 3 5 | feature_channel: 32 6 | estimate_jacobian: False # True 7 | kp_detector_params: 8 | temperature: 0.1 9 | block_expansion: 32 10 | max_features: 1024 11 | scale_factor: 0.25 # 0.25 12 | num_blocks: 5 13 | reshape_channel: 16384 # 16384 = 1024 * 16 14 | reshape_depth: 16 15 | he_estimator_params: 16 | block_expansion: 64 17 | max_features: 2048 18 | num_bins: 66 19 | generator_params: 20 | block_expansion: 64 21 | max_features: 512 22 | num_down_blocks: 2 23 | reshape_channel: 32 24 | reshape_depth: 16 # 512 = 32 * 16 25 | num_resblocks: 6 26 | estimate_occlusion_map: True 27 | dense_motion_params: 28 | block_expansion: 32 29 | max_features: 1024 30 | num_blocks: 5 31 | reshape_depth: 16 32 | compress: 4 33 | discriminator_params: 34 | scales: [1] 35 | block_expansion: 32 36 | max_features: 512 37 | num_blocks: 4 38 | sn: True 39 | mapping_params: 40 | coeff_nc: 73 41 | descriptor_nc: 1024 42 | layer: 3 43 | num_kp: 15 44 | num_bins: 66 45 | 46 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/auido2pose.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/train_33.txt 3 | EVAL_FILE_LIST: 
/apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/val.txt 4 | TRAIN_BATCH_SIZE: 64 5 | EVAL_BATCH_SIZE: 1 6 | EXP: True 7 | EXP_DIM: 64 8 | FRAME_LEN: 32 9 | COEFF_LEN: 73 10 | NUM_CLASSES: 46 11 | AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav 12 | COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb 13 | DEBUG: True 14 | 15 | 16 | MODEL: 17 | AUDIOENCODER: 18 | LEAKY_RELU: True 19 | NORM: 'IN' 20 | DISCRIMINATOR: 21 | LEAKY_RELU: False 22 | INPUT_CHANNELS: 6 23 | CVAE: 24 | AUDIO_EMB_IN_SIZE: 512 25 | AUDIO_EMB_OUT_SIZE: 6 26 | SEQ_LEN: 32 27 | LATENT_SIZE: 64 28 | ENCODER_LAYER_SIZES: [192, 128] 29 | DECODER_LAYER_SIZES: [128, 192] 30 | 31 | 32 | TRAIN: 33 | MAX_EPOCH: 150 34 | GENERATOR: 35 | LR: 1.0e-4 36 | DISCRIMINATOR: 37 | LR: 1.0e-4 38 | LOSS: 39 | LAMBDA_REG: 1 40 | LAMBDA_LANDMARKS: 0 41 | LAMBDA_VERTICES: 0 42 | LAMBDA_GAN_MOTION: 0.7 43 | LAMBDA_GAN_COEFF: 0 44 | LAMBDA_KL: 1 45 | 46 | TAG: 47 | NAME: cvae_UNET_useAudio_usewav2lipAudioEncoder 48 | 49 | 50 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/torch_mesh.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Dict, Optional 3 | 4 | import torch 5 | 6 | from .mesh import TriMesh 7 | 8 | 9 | @dataclass 10 | class TorchMesh: 11 | """ 12 | A 3D triangle mesh with optional data at the vertices and faces. 13 | """ 14 | 15 | # [N x 3] array of vertex coordinates. 16 | verts: torch.Tensor 17 | 18 | # [M x 3] array of triangles, pointing to indices in verts. 19 | faces: torch.Tensor 20 | 21 | # Extra data per vertex and face. 22 | vertex_channels: Optional[Dict[str, torch.Tensor]] = field(default_factory=dict) 23 | face_channels: Optional[Dict[str, torch.Tensor]] = field(default_factory=dict) 24 | 25 | def tri_mesh(self) -> TriMesh: 26 | """ 27 | Create a CPU version of the mesh. 
28 | """ 29 | return TriMesh( 30 | verts=self.verts.detach().cpu().numpy(), 31 | faces=self.faces.cpu().numpy(), 32 | vertex_channels=( 33 | {k: v.detach().cpu().numpy() for k, v in self.vertex_channels.items()} 34 | if self.vertex_channels is not None 35 | else None 36 | ), 37 | face_channels=( 38 | {k: v.detach().cpu().numpy() for k, v in self.face_channels.items()} 39 | if self.face_channels is not None 40 | else None 41 | ), 42 | ) 43 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2exp_models/audio2exp.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import torch 3 | from torch import nn 4 | 5 | 6 | class Audio2Exp(nn.Module): 7 | def __init__(self, netG, cfg, device, prepare_training_loss=False): 8 | super(Audio2Exp, self).__init__() 9 | self.cfg = cfg 10 | self.device = device 11 | self.netG = netG.to(device) 12 | 13 | def test(self, batch): 14 | 15 | mel_input = batch['indiv_mels'] # bs T 1 80 16 16 | bs = mel_input.shape[0] 17 | T = mel_input.shape[1] 18 | 19 | exp_coeff_pred = [] 20 | 21 | for i in tqdm(range(0, T, 10),'audio2exp:'): # every 10 frames 22 | 23 | current_mel_input = mel_input[:,i:i+10] 24 | 25 | #ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64 26 | ref = batch['ref'][:, :, :64][:, i:i+10] 27 | ratio = batch['ratio_gt'][:, i:i+10] #bs T 28 | 29 | audiox = current_mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16 30 | 31 | curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64 32 | 33 | exp_coeff_pred += [curr_exp_coeff_pred] 34 | 35 | # BS x T x 64 36 | results_dict = { 37 | 'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1) 38 | } 39 | return results_dict 40 | 41 | 42 | -------------------------------------------------------------------------------- /modules/sadtalker/src/config/auido2exp.yaml: -------------------------------------------------------------------------------- 1 | DATASET: 2 | TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/train.txt 3 | EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/val.txt 4 | TRAIN_BATCH_SIZE: 32 5 | EVAL_BATCH_SIZE: 32 6 | EXP: True 7 | EXP_DIM: 64 8 | FRAME_LEN: 32 9 | COEFF_LEN: 73 10 | NUM_CLASSES: 46 11 | AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav 12 | COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav2lip_3dmm 13 | LMDB_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb 14 | DEBUG: True 15 | NUM_REPEATS: 2 16 | T: 40 17 | 18 | 19 | MODEL: 20 | FRAMEWORK: V2 21 | AUDIOENCODER: 22 | LEAKY_RELU: True 23 | NORM: 'IN' 24 | DISCRIMINATOR: 25 | LEAKY_RELU: False 26 | INPUT_CHANNELS: 6 27 | CVAE: 28 | AUDIO_EMB_IN_SIZE: 512 29 | AUDIO_EMB_OUT_SIZE: 128 30 | SEQ_LEN: 32 31 | LATENT_SIZE: 256 32 | ENCODER_LAYER_SIZES: [192, 1024] 33 | DECODER_LAYER_SIZES: [1024, 192] 34 | 35 | 36 | TRAIN: 37 | MAX_EPOCH: 300 38 | GENERATOR: 39 | LR: 2.0e-5 40 | DISCRIMINATOR: 41 | LR: 1.0e-5 42 | LOSS: 43 | W_FEAT: 0 44 | W_COEFF_EXP: 2 45 | W_LM: 1.0e-2 46 | W_LM_MOUTH: 0 47 | W_REG: 0 48 | W_SYNC: 0 49 | W_COLOR: 0 50 | W_EXPRESSION: 0 51 | W_LIPREADING: 0.01 52 | W_LIPREADING_VV: 0 53 | W_EYE_BLINK: 4 54 | 55 | TAG: 56 | NAME: small_dataset 57 | 58 | 59 | -------------------------------------------------------------------------------- /modules/shap_e/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import 
os 3 | from .shape_gen import ShapE 4 | from tqdm import tqdm 5 | 6 | class Shap_E: 7 | def __init__(self, device): 8 | self.cond_type = "t2m" 9 | self.cache_dir = "modules/shap_e/cache_dir" 10 | self.output_dir = "video" 11 | self.device = device 12 | 13 | def inference(self, prompt): 14 | self.cond = prompt 15 | shapE = ShapE(device=self.device, cache_dir=self.cache_dir, type=self.cond_type) 16 | 17 | if os.path.exists(self.cond): 18 | if self.cond_type == "t2m": 19 | prompts_path = self.cond 20 | with open(prompts_path, "r") as f: 21 | prompts = f.readlines() 22 | prompts = [prompt.strip() for prompt in prompts] 23 | 24 | for prompt in tqdm(prompts): 25 | results_dir = shapE.inference(prompt, self.output_dir) 26 | 27 | elif self.cond_type == "i2m": 28 | base_dir = self.cond 29 | images_path = [ 30 | os.path.join(base_dir, f) 31 | for f in os.listdir(base_dir) 32 | if f.endswith(".png") or f.endswith(".jpg") 33 | ] 34 | 35 | for image_path in tqdm(images_path): 36 | results_dir = shapE.inference(image_path, self.output_dir) 37 | 38 | else: 39 | results_dir = shapE.inference(self.cond, self.output_dir) 40 | print(f"Output saved to {results_dir}") 41 | 42 | -------------------------------------------------------------------------------- /modules/sadtalker/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import main 2 | import torch 3 | 4 | class Sadtalker: 5 | def __init__(self, device): 6 | self.driven_audio = './image/bus_chinese.wav' 7 | self.source_image = './image/art_0.png' 8 | self.ref_eyeblink = None 9 | self.ref_pose = None 10 | self.checkpoint_dir = './modules/sadtalker/checkpoints' 11 | self.result_dir = './video/sadtalker' 12 | self.pose_style = 0 13 | self.batch_size = 2 14 | self.size = 256 15 | self.expression_scale = 1.0 16 | self.input_yaw = None 17 | self.input_pitch = None 18 | self.input_roll = None 19 | self.enhancer = None 20 | self.background_enhancer = None 21 | self.preprocess = 'crop' 22 | self.cpu = False 23 | self.old_version = False 24 | self.still = False 25 | 26 | self.net_recon = 'resnet50' 27 | self.init_path = None 28 | self.use_last_fc = False 29 | self.bfm_folder = '../modules/sadtalker/checkpoints/BFM_Fitting/' 30 | self.bfm_model = 'BFM_model_front.mat' 31 | self.face3dvis = False 32 | self.verbose = False 33 | 34 | self.focal = 1015.0 35 | self.center = 112.0 36 | self.camera_d = 10.0 37 | self.z_near = 5.0 38 | self.z_far = 15.0 39 | 40 | self.device = device 41 | 42 | def inference(self, inputs): 43 | splits = inputs.split(",") 44 | self.driven_audio = splits[0] 45 | self.source_image = splits[1] 46 | main(self) -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/videoio.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import uuid 3 | 4 | import os 5 | 6 | import cv2 7 | 8 | def load_video_to_cv2(input_path): 9 | video_stream = cv2.VideoCapture(input_path) 10 | fps = video_stream.get(cv2.CAP_PROP_FPS) 11 | full_frames = [] 12 | while 1: 13 | still_reading, frame = video_stream.read() 14 | if not still_reading: 15 | video_stream.release() 16 | break 17 | full_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 18 | return full_frames 19 | 20 | def save_video_with_watermark(video, audio, save_path, watermark=False): 21 | temp_file = str(uuid.uuid4())+'.mp4' 22 | cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -vcodec copy "%s"' % (video, audio, temp_file) 
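# The ffmpeg invocation built above: -y overwrites the temporary output file, -hide_banner and
# -loglevel error keep the console quiet, the two -i flags take the rendered (silent) video and
# the driving audio, and -vcodec copy muxes them together without re-encoding the video stream.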
23 | os.system(cmd) 24 | 25 | if watermark is False: 26 | shutil.move(temp_file, save_path) 27 | else: 28 | # watermark 29 | try: 30 | ##### check if stable-diffusion-webui 31 | import webui 32 | from modules import paths 33 | watarmark_path = paths.script_path+"/extensions/SadTalker/docs/sadtalker_logo.png" 34 | except: 35 | # get the root path of sadtalker. 36 | dir_path = os.path.dirname(os.path.realpath(__file__)) 37 | watarmark_path = dir_path+"/../../docs/sadtalker_logo.png" 38 | 39 | cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -filter_complex "[1]scale=100:-1[wm];[0][wm]overlay=(main_w-overlay_w)-10:10" "%s"' % (temp_file, watarmark_path, save_path) 40 | os.system(cmd) 41 | os.remove(temp_file) -------------------------------------------------------------------------------- /modules/annotator/midas/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | 5 | from einops import rearrange 6 | from .api import MiDaSInference 7 | 8 | 9 | class MidasDetector: 10 | def __init__(self, device=None): 11 | self.device = device or torch.device( 12 | "cuda" if torch.cuda.is_available() else "cpu" 13 | ) 14 | self.model = MiDaSInference(model_type="dpt_hybrid").to(self.device) 15 | 16 | def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1): 17 | assert input_image.ndim == 3 18 | image_depth = input_image 19 | with torch.no_grad(): 20 | image_depth = torch.from_numpy(image_depth).float().to(self.device) 21 | image_depth = image_depth / 127.5 - 1.0 22 | image_depth = rearrange(image_depth, "h w c -> 1 c h w") 23 | depth = self.model(image_depth)[0] 24 | 25 | depth_pt = depth.clone() 26 | depth_pt -= torch.min(depth_pt) 27 | depth_pt /= torch.max(depth_pt) 28 | depth_pt = depth_pt.cpu().numpy() 29 | depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8) 30 | 31 | depth_np = depth.cpu().numpy() 32 | x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3) 33 | y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3) 34 | z = np.ones_like(x) * a 35 | x[depth_pt < bg_th] = 0 36 | y[depth_pt < bg_th] = 0 37 | normal = np.stack([x, y, z], axis=2) 38 | normal /= np.sum(normal**2.0, axis=2, keepdims=True) ** 0.5 39 | normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8) 40 | 41 | return depth_image, normal_image 42 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/modules/mapping.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class MappingNet(nn.Module): 9 | def __init__(self, coeff_nc, descriptor_nc, layer, num_kp, num_bins): 10 | super( MappingNet, self).__init__() 11 | 12 | self.layer = layer 13 | nonlinearity = nn.LeakyReLU(0.1) 14 | 15 | self.first = nn.Sequential( 16 | torch.nn.Conv1d(coeff_nc, descriptor_nc, kernel_size=7, padding=0, bias=True)) 17 | 18 | for i in range(layer): 19 | net = nn.Sequential(nonlinearity, 20 | torch.nn.Conv1d(descriptor_nc, descriptor_nc, kernel_size=3, padding=0, dilation=3)) 21 | setattr(self, 'encoder' + str(i), net) 22 | 23 | self.pooling = nn.AdaptiveAvgPool1d(1) 24 | self.output_nc = descriptor_nc 25 | 26 | self.fc_roll = nn.Linear(descriptor_nc, num_bins) 27 | self.fc_pitch = nn.Linear(descriptor_nc, num_bins) 28 | self.fc_yaw = nn.Linear(descriptor_nc, num_bins) 29 | self.fc_t = nn.Linear(descriptor_nc, 3) 30 | 
self.fc_exp = nn.Linear(descriptor_nc, 3*num_kp) 31 | 32 | def forward(self, input_3dmm): 33 | out = self.first(input_3dmm) 34 | for i in range(self.layer): 35 | model = getattr(self, 'encoder' + str(i)) 36 | out = model(out) + out[:,:,3:-3] 37 | out = self.pooling(out) 38 | out = out.view(out.shape[0], -1) 39 | #print('out:', out.shape) 40 | 41 | yaw = self.fc_yaw(out) 42 | pitch = self.fc_pitch(out) 43 | roll = self.fc_roll(out) 44 | t = self.fc_t(out) 45 | exp = self.fc_exp(out) 46 | 47 | return {'yaw': yaw, 'pitch': pitch, 'roll': roll, 't': t, 'exp': exp} -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/stf/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Optional 3 | 4 | import torch 5 | 6 | from ...models.query import Query 7 | from ...models.renderer import append_tensor 8 | from ...util.collections import AttrDict 9 | 10 | 11 | class Model(ABC): 12 | @abstractmethod 13 | def forward( 14 | self, 15 | query: Query, 16 | params: Optional[Dict[str, torch.Tensor]] = None, 17 | options: Optional[Dict[str, Any]] = None, 18 | ) -> AttrDict[str, Any]: 19 | """ 20 | Predict an attribute given position 21 | """ 22 | 23 | def forward_batched( 24 | self, 25 | query: Query, 26 | query_batch_size: int = 4096, 27 | params: Optional[Dict[str, torch.Tensor]] = None, 28 | options: Optional[Dict[str, Any]] = None, 29 | ) -> AttrDict[str, Any]: 30 | if not query.position.numel(): 31 | # Avoid torch.cat() of zero tensors. 32 | return self(query, params=params, options=options) 33 | 34 | if options.cache is None: 35 | created_cache = True 36 | options.cache = AttrDict() 37 | else: 38 | created_cache = False 39 | 40 | results_list = AttrDict() 41 | for i in range(0, query.position.shape[1], query_batch_size): 42 | out = self( 43 | query=query.map_tensors(lambda x, i=i: x[:, i : i + query_batch_size]), 44 | params=params, 45 | options=options, 46 | ) 47 | results_list = results_list.combine(out, append_tensor) 48 | 49 | if created_cache: 50 | del options["cache"] 51 | 52 | return results_list.map(lambda key, tensor_list: torch.cat(tensor_list, dim=1)) 53 | -------------------------------------------------------------------------------- /modules/bark_voice_clone/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wave 3 | from utils import generate_audio_name 4 | from modelscope.models.audio.tts import SambertHifigan 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | 8 | 9 | class BarkVoiceClone: 10 | 11 | def __init__(self): 12 | self.model_dir = os.path.abspath("./modules/bark_voice_clone/pretrain_work_dir") 13 | self.output_file = generate_audio_name() 14 | self.num_channels = 1 15 | self.sample_width = 2 16 | self.frame_rate = 18050 17 | 18 | def inference(self, prompt): 19 | custom_infer_abs = { 20 | 'voice_name': 21 | 'F7', 22 | 'am_ckpt': os.path.join(self.model_dir, 'tmp_am', 'ckpt'), 23 | 'am_config': os.path.join(self.model_dir, 'tmp_am', 'config.yaml'), 24 | 'voc_ckpt': os.path.join(self.model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'), 25 | 'voc_config': os.path.join(self.model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'), 26 | 'audio_config': os.path.join(self.model_dir, 'data', 'audio_config.yaml'), 27 | 'se_file': os.path.join(self.model_dir, 'data', 'se', 'se.npy') 28 | } 29 | 
kwargs = {'custom_ckpt': custom_infer_abs} 30 | 31 | model_id = SambertHifigan(os.path.join(self.model_dir, "orig_model"), **kwargs) 32 | 33 | inference = pipeline(task=Tasks.text_to_speech, model=model_id) 34 | output = inference(input=prompt) 35 | 36 | with wave.open(self.output_file, 'wb') as wav_file: 37 | wav_file.setnchannels(self.num_channels) 38 | wav_file.setsampwidth(self.sample_width) 39 | wav_file.setframerate(self.frame_rate) 40 | wav_file.writeframesraw(output["output_wav"]) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os, sys, uuid 2 | import importlib 3 | 4 | import numpy as np 5 | import torch 6 | import random 7 | 8 | 9 | def instantiate_from_config(config, **kwargs): 10 | if not "target" in config: 11 | raise KeyError("Expected key `target` to instantiate.") 12 | return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs) 13 | 14 | 15 | def get_obj_from_str(string, reload=False): 16 | module, cls = string.rsplit(".", 1) 17 | if reload: 18 | module_imp = importlib.import_module(module) 19 | importlib.reload(module_imp) 20 | return getattr(importlib.import_module(module, package=None), cls) 21 | 22 | 23 | def seed_everything(seed): 24 | random.seed(seed) 25 | np.random.seed(seed) 26 | torch.manual_seed(seed) 27 | torch.cuda.manual_seed_all(seed) 28 | return seed 29 | 30 | 31 | def get_new_video_name(org_vid_name, func_name="update"): 32 | head_tail = os.path.split(org_vid_name) 33 | head = head_tail[0] 34 | tail = head_tail[1] 35 | name_split = tail.split(".")[0].split("_") 36 | this_new_uuid = str(uuid.uuid4())[:4] 37 | if len(name_split) == 1: 38 | most_org_file_name = name_split[0] 39 | else: 40 | assert len(name_split) == 4 41 | most_org_file_name = name_split[3] 42 | recent_prev_file_name = name_split[0] 43 | new_file_name = ( 44 | f"{this_new_uuid}_{func_name}_{recent_prev_file_name}_{most_org_file_name}.mp4" 45 | ) 46 | return os.path.join(head, new_file_name) 47 | 48 | 49 | def generate_video_name_mp4(): 50 | return os.path.join("video", str(uuid.uuid4())[:8] + ".mp4") 51 | 52 | def generate_audio_name(): 53 | return os.path.join("audio", str(uuid.uuid4())[:8] + ".wav") 54 | 55 | def generate_image_name(): 56 | return os.path.join("image", str(uuid.uuid4())[:8] + ".png") 57 | 58 | def get_new_uuid(): 59 | return str(uuid.uuid4())[:8] 60 | -------------------------------------------------------------------------------- /modules/blip/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from transformers import AutoProcessor, Blip2ForConditionalGeneration 4 | from PIL import Image 5 | 6 | from video_utils import prepare_video 7 | 8 | 9 | class ImageCaptioning: 10 | def __init__(self, device): 11 | print("Initializing BLIP2 for ImageCaptioning") 12 | self.device = device 13 | self.processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") 14 | self.model = Blip2ForConditionalGeneration.from_pretrained( 15 | "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16 16 | ).to(device) 17 | 18 | def image_captioning(self, image: Image, prompt=None, is_vqa=False): 19 | if prompt and is_vqa: 20 | prompt = f"Question: {prompt} Answer:" 21 | inputs = self.processor(image, text=prompt, return_tensors="pt").to( 22 | self.device, torch.float16 23 | ) 24 | generated_ids = self.model.generate(**inputs, max_new_tokens=40) 25 | 
generated_text = self.processor.batch_decode( 26 | generated_ids, skip_special_tokens=True 27 | )[0].strip() 28 | return generated_text 29 | 30 | def frames_captioning(self, video_path): 31 | video, fps = prepare_video(video_path, 512, "cpu", normalize=False) 32 | # pick each frame for each second 33 | video = video[::fps] 34 | video_nd = np.transpose(video.numpy(), (0, 2, 3, 1)).astype(np.uint8) 35 | pil_images = [Image.fromarray(frame) for frame in video_nd] 36 | 37 | caption_results = [] 38 | for i, image in enumerate(pil_images): 39 | # image.save(f"temp/{str(i).zfill(5)}.png") 40 | caption = self.image_captioning( 41 | image, prompt="This is a video frame describing that" 42 | ) 43 | caption_results.append(f"Second {i}: {caption}.") 44 | return " ".join(caption_results) 45 | 46 | def inference(self, inputs): 47 | return self.frames_captioning(inputs) 48 | -------------------------------------------------------------------------------- /modules/modelscope_t2v/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | import tempfile 5 | import imageio 6 | import numpy as np 7 | import torch 8 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler 9 | 10 | 11 | from utils import generate_video_name_mp4 12 | 13 | 14 | def to_video(frames: list[np.ndarray], fps: int, out_file=None) -> str: 15 | if out_file is None: 16 | out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name 17 | writer = imageio.get_writer(out_file, format="FFMPEG", fps=fps) 18 | for frame in frames: 19 | writer.append_data(frame) 20 | writer.close() 21 | return out_file 22 | 23 | 24 | class ModelscopeT2V: 25 | def __init__(self, device): 26 | pipe = DiffusionPipeline.from_pretrained( 27 | "damo-vilab/text-to-video-ms-1.7b", 28 | torch_dtype=torch.float16, 29 | variant="fp16", 30 | ).to(device) 31 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 32 | pipe.enable_model_cpu_offload() 33 | pipe.enable_vae_slicing() 34 | 35 | self.pipe = pipe 36 | 37 | def generate_video( 38 | self, 39 | prompt: str, 40 | seed: int, 41 | num_frames: int, 42 | num_inference_steps: int, 43 | out_file: str = None, 44 | ) -> str: 45 | if seed == -1: 46 | seed = random.randint(0, 1000000) 47 | generator = torch.Generator().manual_seed(seed) 48 | frames = self.pipe( 49 | prompt, 50 | num_inference_steps=num_inference_steps, 51 | num_frames=num_frames, 52 | generator=generator, 53 | ).frames 54 | return to_video(frames, 8, out_file=out_file) 55 | 56 | def inference(self, inputs): 57 | video_path = generate_video_name_mp4() 58 | self.generate_video( 59 | prompt=inputs, 60 | seed=-1, 61 | num_frames=16, 62 | num_inference_steps=25, 63 | out_file=video_path, 64 | ) 65 | return video_path 66 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/ply_util.py: -------------------------------------------------------------------------------- 1 | import struct 2 | from typing import BinaryIO, Optional 3 | 4 | import numpy as np 5 | 6 | from ..util.io import buffered_writer 7 | 8 | 9 | def write_ply( 10 | raw_f: BinaryIO, 11 | coords: np.ndarray, 12 | rgb: Optional[np.ndarray] = None, 13 | faces: Optional[np.ndarray] = None, 14 | ): 15 | """ 16 | Write a PLY file for a mesh or a point cloud. 17 | 18 | :param coords: an [N x 3] array of floating point coordinates. 
19 | :param rgb: an [N x 3] array of vertex colors, in the range [0.0, 1.0]. 20 | :param faces: an [N x 3] array of triangles encoded as integer indices. 21 | """ 22 | with buffered_writer(raw_f) as f: 23 | f.write(b"ply\n") 24 | f.write(b"format binary_little_endian 1.0\n") 25 | f.write(bytes(f"element vertex {len(coords)}\n", "ascii")) 26 | f.write(b"property float x\n") 27 | f.write(b"property float y\n") 28 | f.write(b"property float z\n") 29 | if rgb is not None: 30 | f.write(b"property uchar red\n") 31 | f.write(b"property uchar green\n") 32 | f.write(b"property uchar blue\n") 33 | if faces is not None: 34 | f.write(bytes(f"element face {len(faces)}\n", "ascii")) 35 | f.write(b"property list uchar int vertex_index\n") 36 | f.write(b"end_header\n") 37 | 38 | if rgb is not None: 39 | rgb = (rgb * 255.499).round().astype(int) 40 | vertices = [ 41 | (*coord, *rgb) 42 | for coord, rgb in zip( 43 | coords.tolist(), 44 | rgb.tolist(), 45 | ) 46 | ] 47 | format = struct.Struct("<3f3B") 48 | for item in vertices: 49 | f.write(format.pack(*item)) 50 | else: 51 | format = struct.Struct("<3f") 52 | for vertex in coords.tolist(): 53 | f.write(format.pack(*vertex)) 54 | 55 | if faces is not None: 56 | format = struct.Struct("<B3I") 57 | for tri in faces.tolist(): 58 | f.write(format.pack(len(tri), *tri)) -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/render.py: -------------------------------------------------------------------------------- [lines 1-23, the imports and the opening of the render function signature, are missing from this export] 24 | ) -> torch.Tensor: 25 | """ 26 | Return an [H x W x 4] RGBA tensor of the rendered image. 27 | The pixels are floating points, with alpha in the range [0, 1] and the 28 | other colors matching the scale used by the mesh's vertex colors. 29 | """ 30 | light_direction = torch.tensor( 31 | light_direction, device=mesh.vertices.device, dtype=mesh.vertices.dtype 32 | ) 33 | 34 | all_collisions = RayCollisions.collect( 35 | cast_camera( 36 | camera=camera, 37 | mesh=mesh, 38 | ray_batch_size=ray_batch_size, 39 | checkpoint=checkpoint, 40 | ) 41 | ) 42 | num_rays = len(all_collisions.normals) 43 | if mesh.vertex_colors is None: 44 | vertex_colors = torch.tensor([[0.8, 0.8, 0.8]]).to(mesh.vertices).repeat(num_rays, 1) 45 | else: 46 | vertex_colors = mesh.vertex_colors 47 | 48 | light_coeffs = ambient + ( 49 | diffuse * torch.sum(all_collisions.normals * light_direction, dim=-1).abs() 50 | ) 51 | vertex_colors = mesh.vertex_colors[mesh.faces[all_collisions.tri_indices]] 52 | bary_products = torch.sum(vertex_colors * all_collisions.barycentric[..., None], axis=-2) 53 | out_colors = bary_products * light_coeffs[..., None] 54 | res = torch.where(all_collisions.collides[:, None], out_colors, torch.zeros_like(out_colors)) 55 | return torch.cat([res, all_collisions.collides[:, None].float()], dim=-1).view( 56 | camera.height, camera.width, 4 57 | ) 58 | -------------------------------------------------------------------------------- /modules/annotator/openpose/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 4 | 5 | import torch 6 | import numpy as np 7 | from . 
import util 8 | from .body import Body 9 | from .hand import Hand 10 | from ..util import annotator_ckpts_path 11 | 12 | 13 | body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth" 14 | hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth" 15 | 16 | 17 | class OpenposeDetector: 18 | def __init__(self, device=None): 19 | body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth") 20 | hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth") 21 | 22 | if not os.path.exists(hand_modelpath): 23 | from basicsr.utils.download_util import load_file_from_url 24 | 25 | load_file_from_url(body_model_path, model_dir=annotator_ckpts_path) 26 | load_file_from_url(hand_model_path, model_dir=annotator_ckpts_path) 27 | 28 | device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | self.body_estimation = Body(body_modelpath, device) 30 | self.hand_estimation = Hand(hand_modelpath, device) 31 | 32 | def __call__(self, oriImg, hand=False): 33 | oriImg = oriImg[:, :, ::-1].copy() 34 | with torch.no_grad(): 35 | candidate, subset = self.body_estimation(oriImg) 36 | canvas = np.zeros_like(oriImg) 37 | canvas = util.draw_bodypose(canvas, candidate, subset) 38 | if hand: 39 | hands_list = util.handDetect(candidate, subset, oriImg) 40 | all_hand_peaks = [] 41 | for x, y, w, is_left in hands_list: 42 | peaks = self.hand_estimation(oriImg[y : y + w, x : x + w, :]) 43 | peaks[:, 0] = np.where( 44 | peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x 45 | ) 46 | peaks[:, 1] = np.where( 47 | peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y 48 | ) 49 | all_hand_peaks.append(peaks) 50 | canvas = util.draw_handpose(canvas, all_hand_peaks) 51 | return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist()) 52 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/res_unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .networks import ResidualConv, Upsample 4 | 5 | 6 | class ResUnet(nn.Module): 7 | def __init__(self, channel=1, filters=[32, 64, 128, 256]): 8 | super(ResUnet, self).__init__() 9 | 10 | self.input_layer = nn.Sequential( 11 | nn.Conv2d(channel, filters[0], kernel_size=3, padding=1), 12 | nn.BatchNorm2d(filters[0]), 13 | nn.ReLU(), 14 | nn.Conv2d(filters[0], filters[0], kernel_size=3, padding=1), 15 | ) 16 | self.input_skip = nn.Sequential( 17 | nn.Conv2d(channel, filters[0], kernel_size=3, padding=1) 18 | ) 19 | 20 | self.residual_conv_1 = ResidualConv(filters[0], filters[1], stride=(2,1), padding=1) 21 | self.residual_conv_2 = ResidualConv(filters[1], filters[2], stride=(2,1), padding=1) 22 | 23 | self.bridge = ResidualConv(filters[2], filters[3], stride=(2,1), padding=1) 24 | 25 | self.upsample_1 = Upsample(filters[3], filters[3], kernel=(2,1), stride=(2,1)) 26 | self.up_residual_conv1 = ResidualConv(filters[3] + filters[2], filters[2], stride=1, padding=1) 27 | 28 | self.upsample_2 = Upsample(filters[2], filters[2], kernel=(2,1), stride=(2,1)) 29 | self.up_residual_conv2 = ResidualConv(filters[2] + filters[1], filters[1], stride=1, padding=1) 30 | 31 | self.upsample_3 = Upsample(filters[1], filters[1], kernel=(2,1), stride=(2,1)) 32 | self.up_residual_conv3 = ResidualConv(filters[1] + filters[0], filters[0], stride=1, padding=1) 33 | 34 | self.output_layer = 
nn.Sequential( 35 | nn.Conv2d(filters[0], 1, 1, 1), 36 | nn.Sigmoid(), 37 | ) 38 | 39 | def forward(self, x): 40 | # Encode 41 | x1 = self.input_layer(x) + self.input_skip(x) 42 | x2 = self.residual_conv_1(x1) 43 | x3 = self.residual_conv_2(x2) 44 | # Bridge 45 | x4 = self.bridge(x3) 46 | 47 | # Decode 48 | x4 = self.upsample_1(x4) 49 | x5 = torch.cat([x4, x3], dim=1) 50 | 51 | x6 = self.up_residual_conv1(x5) 52 | 53 | x6 = self.upsample_2(x6) 54 | x7 = torch.cat([x6, x2], dim=1) 55 | 56 | x8 = self.up_residual_conv2(x7) 57 | 58 | x8 = self.upsample_3(x8) 59 | x9 = torch.cat([x8, x1], dim=1) 60 | 61 | x10 = self.up_residual_conv3(x9) 62 | 63 | output = self.output_layer(x10) 64 | 65 | return output -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/paste_pic.py: -------------------------------------------------------------------------------- 1 | import cv2, os 2 | import numpy as np 3 | from tqdm import tqdm 4 | import uuid 5 | 6 | from .videoio import save_video_with_watermark 7 | 8 | def paste_pic(video_path, pic_path, crop_info, new_audio_path, full_video_path, extended_crop=False): 9 | 10 | if not os.path.isfile(pic_path): 11 | raise ValueError('pic_path must be a valid path to video/image file') 12 | elif pic_path.split('.')[-1] in ['jpg', 'png', 'jpeg']: 13 | # loader for first frame 14 | full_img = cv2.imread(pic_path) 15 | else: 16 | # loader for videos 17 | video_stream = cv2.VideoCapture(pic_path) 18 | fps = video_stream.get(cv2.CAP_PROP_FPS) 19 | full_frames = [] 20 | while 1: 21 | still_reading, frame = video_stream.read() 22 | if not still_reading: 23 | video_stream.release() 24 | break 25 | break 26 | full_img = frame 27 | frame_h = full_img.shape[0] 28 | frame_w = full_img.shape[1] 29 | 30 | video_stream = cv2.VideoCapture(video_path) 31 | fps = video_stream.get(cv2.CAP_PROP_FPS) 32 | crop_frames = [] 33 | while 1: 34 | still_reading, frame = video_stream.read() 35 | if not still_reading: 36 | video_stream.release() 37 | break 38 | crop_frames.append(frame) 39 | 40 | if len(crop_info) != 3: 41 | print("you didn't crop the image") 42 | return 43 | else: 44 | r_w, r_h = crop_info[0] 45 | clx, cly, crx, cry = crop_info[1] 46 | lx, ly, rx, ry = crop_info[2] 47 | lx, ly, rx, ry = int(lx), int(ly), int(rx), int(ry) 48 | # oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx 49 | # oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx 50 | 51 | if extended_crop: 52 | oy1, oy2, ox1, ox2 = cly, cry, clx, crx 53 | else: 54 | oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx 55 | 56 | tmp_path = str(uuid.uuid4())+'.mp4' 57 | out_tmp = cv2.VideoWriter(tmp_path, cv2.VideoWriter_fourcc(*'MP4V'), fps, (frame_w, frame_h)) 58 | for crop_frame in tqdm(crop_frames, 'seamlessClone:'): 59 | p = cv2.resize(crop_frame.astype(np.uint8), (ox2-ox1, oy2 - oy1)) 60 | 61 | mask = 255*np.ones(p.shape, p.dtype) 62 | location = ((ox1+ox2) // 2, (oy1+oy2) // 2) 63 | gen_img = cv2.seamlessClone(p, full_img, mask, location, cv2.NORMAL_CLONE) 64 | out_tmp.write(gen_img) 65 | 66 | out_tmp.release() 67 | 68 | save_video_with_watermark(tmp_path, new_audio_path, full_video_path, watermark=False) 69 | os.remove(tmp_path) 70 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/generation/pooled_mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .util import timestep_embedding 5 | 6 | 7 | class 
PooledMLP(nn.Module): 8 | def __init__( 9 | self, 10 | device: torch.device, 11 | *, 12 | input_channels: int = 3, 13 | output_channels: int = 6, 14 | hidden_size: int = 256, 15 | resblocks: int = 4, 16 | pool_op: str = "max", 17 | ): 18 | super().__init__() 19 | self.input_embed = nn.Conv1d(input_channels, hidden_size, kernel_size=1, device=device) 20 | self.time_embed = nn.Linear(hidden_size, hidden_size, device=device) 21 | 22 | blocks = [] 23 | for _ in range(resblocks): 24 | blocks.append(ResBlock(hidden_size, pool_op, device=device)) 25 | self.sequence = nn.Sequential(*blocks) 26 | 27 | self.out = nn.Conv1d(hidden_size, output_channels, kernel_size=1, device=device) 28 | with torch.no_grad(): 29 | self.out.bias.zero_() 30 | self.out.weight.zero_() 31 | 32 | def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor: 33 | in_embed = self.input_embed(x) 34 | t_embed = self.time_embed(timestep_embedding(t, in_embed.shape[1])) 35 | h = in_embed + t_embed[..., None] 36 | h = self.sequence(h) 37 | h = self.out(h) 38 | return h 39 | 40 | 41 | class ResBlock(nn.Module): 42 | def __init__(self, hidden_size: int, pool_op: str, device: torch.device): 43 | super().__init__() 44 | assert pool_op in ["mean", "max"] 45 | self.pool_op = pool_op 46 | self.body = nn.Sequential( 47 | nn.SiLU(), 48 | nn.LayerNorm((hidden_size,), device=device), 49 | nn.Linear(hidden_size, hidden_size, device=device), 50 | nn.SiLU(), 51 | nn.LayerNorm((hidden_size,), device=device), 52 | nn.Linear(hidden_size, hidden_size, device=device), 53 | ) 54 | self.gate = nn.Sequential( 55 | nn.Linear(hidden_size, hidden_size, device=device), 56 | nn.Tanh(), 57 | ) 58 | 59 | def forward(self, x: torch.Tensor): 60 | N, C, T = x.shape 61 | out = self.body(x.permute(0, 2, 1).reshape(N * T, C)).reshape([N, T, C]).permute(0, 2, 1) 62 | pooled = pool(self.pool_op, x) 63 | gate = self.gate(pooled) 64 | return x + out * gate[..., None] 65 | 66 | 67 | def pool(op_name: str, x: torch.Tensor) -> torch.Tensor: 68 | if op_name == "max": 69 | pooled, _ = torch.max(x, dim=-1) 70 | elif op_name == "mean": 71 | pooled = torch.mean(x, dim=-1) 72 | else: 73 | raise ValueError(f"unknown pool op: {op_name}") 74 | return pooled 75 | -------------------------------------------------------------------------------- /video_utils.py: -------------------------------------------------------------------------------- 1 | import imageio 2 | import torch 3 | import numpy as np 4 | import decord 5 | import torchvision 6 | from einops import rearrange 7 | from torchvision.transforms import Resize, InterpolationMode 8 | 9 | from utils import get_new_video_name 10 | 11 | 12 | def prepare_video( 13 | video_path: str, 14 | resolution: int, 15 | device, 16 | dtype=torch.float16, 17 | normalize=True, 18 | start_t: float = 0, 19 | end_t: float = -1, 20 | output_fps: int = -1, 21 | ): 22 | vr = decord.VideoReader(video_path) 23 | initial_fps = vr.get_avg_fps() 24 | if output_fps == -1: 25 | output_fps = int(initial_fps) 26 | if end_t == -1: 27 | end_t = len(vr) / initial_fps 28 | else: 29 | end_t = min(len(vr) / initial_fps, end_t) 30 | assert 0 <= start_t < end_t 31 | assert output_fps > 0 32 | start_f_ind = int(start_t * initial_fps) 33 | end_f_ind = int(end_t * initial_fps) 34 | num_f = int((end_t - start_t) * output_fps) 35 | sample_idx = np.linspace(start_f_ind, end_f_ind, num_f, endpoint=False).astype(int) 36 | video = vr.get_batch(sample_idx) 37 | if torch.is_tensor(video): 38 | video = video.detach().cpu().numpy() 39 | else: 40 | video = video.asnumpy()
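        # At this point `video` is an [F, H, W, C] array of the sampled frames.
        # The code below rearranges it to [F, C, H, W], moves it to `device`/`dtype`,
        # resizes it so the longer side matches `resolution` (with both sides rounded
        # to a multiple of 64), and, when `normalize=True`, maps pixel values to [-1, 1].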
41 | _, h, w, _ = video.shape 42 | video = rearrange(video, "f h w c -> f c h w") 43 | video = torch.Tensor(video).to(device).to(dtype) 44 | 45 | # Use max if you want the larger side to be equal to resolution (e.g. 512) 46 | # k = float(resolution) / min(h, w) 47 | k = float(resolution) / max(h, w) 48 | h *= k 49 | w *= k 50 | h = int(np.round(h / 64.0)) * 64 51 | w = int(np.round(w / 64.0)) * 64 52 | 53 | video = Resize((h, w), interpolation=InterpolationMode.BILINEAR, antialias=True)( 54 | video 55 | ) 56 | if normalize: 57 | video = video / 127.5 - 1.0 58 | return video, output_fps # video: f c h w 59 | 60 | 61 | def create_video(frames, fps, path, rescale=False): 62 | # frames: f h w c 63 | outputs = [] 64 | for _, x in enumerate(frames): 65 | x = torchvision.utils.make_grid(torch.Tensor(x), nrow=4) 66 | if rescale: 67 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 68 | x = (x * 255).numpy().astype(np.uint8) 69 | outputs.append(x) 70 | 71 | imageio.mimsave(path, outputs, fps=fps) 72 | return path 73 | 74 | 75 | def preprocess_video(video_path, out_path=None): 76 | if out_path is None: 77 | out_path = get_new_video_name(video_path, func_name="preprocessed") 78 | 79 | video, fps = prepare_video(video_path, resolution=512, device="cpu") 80 | video = rearrange(video, "f c h w -> f h w c") 81 | create_video(video, fps, out_path, rescale=True) 82 | print(f"Preprocessed video saved to {out_path}") 83 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | class ConvNormRelu(nn.Module): 6 | def __init__(self, conv_type='1d', in_channels=3, out_channels=64, downsample=False, 7 | kernel_size=None, stride=None, padding=None, norm='BN', leaky=False): 8 | super().__init__() 9 | if kernel_size is None: 10 | if downsample: 11 | kernel_size, stride, padding = 4, 2, 1 12 | else: 13 | kernel_size, stride, padding = 3, 1, 1 14 | 15 | if conv_type == '2d': 16 | self.conv = nn.Conv2d( 17 | in_channels, 18 | out_channels, 19 | kernel_size, 20 | stride, 21 | padding, 22 | bias=False, 23 | ) 24 | if norm == 'BN': 25 | self.norm = nn.BatchNorm2d(out_channels) 26 | elif norm == 'IN': 27 | self.norm = nn.InstanceNorm2d(out_channels) 28 | else: 29 | raise NotImplementedError 30 | elif conv_type == '1d': 31 | self.conv = nn.Conv1d( 32 | in_channels, 33 | out_channels, 34 | kernel_size, 35 | stride, 36 | padding, 37 | bias=False, 38 | ) 39 | if norm == 'BN': 40 | self.norm = nn.BatchNorm1d(out_channels) 41 | elif norm == 'IN': 42 | self.norm = nn.InstanceNorm1d(out_channels) 43 | else: 44 | raise NotImplementedError 45 | nn.init.kaiming_normal_(self.conv.weight) 46 | 47 | self.act = nn.LeakyReLU(negative_slope=0.2, inplace=False) if leaky else nn.ReLU(inplace=True) 48 | 49 | def forward(self, x): 50 | x = self.conv(x) 51 | if isinstance(self.norm, nn.InstanceNorm1d): 52 | x = self.norm(x.permute((0, 2, 1))).permute((0, 2, 1)) # normalize on [C] 53 | else: 54 | x = self.norm(x) 55 | x = self.act(x) 56 | return x 57 | 58 | 59 | class PoseSequenceDiscriminator(nn.Module): 60 | def __init__(self, cfg): 61 | super().__init__() 62 | self.cfg = cfg 63 | leaky = self.cfg.MODEL.DISCRIMINATOR.LEAKY_RELU 64 | 65 | self.seq = nn.Sequential( 66 | ConvNormRelu('1d', cfg.MODEL.DISCRIMINATOR.INPUT_CHANNELS, 256, downsample=True, leaky=leaky), # B, 256, 64 67 | ConvNormRelu('1d', 256, 512, 
downsample=True, leaky=leaky), # B, 512, 32 68 | ConvNormRelu('1d', 512, 1024, kernel_size=3, stride=1, padding=1, leaky=leaky), # B, 1024, 16 69 | nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=1, bias=True) # B, 1, 16 70 | ) 71 | 72 | def forward(self, x): 73 | x = x.reshape(x.size(0), x.size(1), -1).transpose(1, 2) 74 | x = self.seq(x) 75 | x = x.squeeze(1) 76 | return x -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/audio_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | class Conv2d(nn.Module): 6 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.conv_block = nn.Sequential( 9 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 10 | nn.BatchNorm2d(cout) 11 | ) 12 | self.act = nn.ReLU() 13 | self.residual = residual 14 | 15 | def forward(self, x): 16 | out = self.conv_block(x) 17 | if self.residual: 18 | out += x 19 | return self.act(out) 20 | 21 | class AudioEncoder(nn.Module): 22 | def __init__(self, wav2lip_checkpoint, device): 23 | super(AudioEncoder, self).__init__() 24 | 25 | self.audio_encoder = nn.Sequential( 26 | Conv2d(1, 32, kernel_size=3, stride=1, padding=1), 27 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 28 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 29 | 30 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 31 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 32 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 33 | 34 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 35 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 36 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 37 | 38 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 39 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 40 | 41 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 42 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0),) 43 | 44 | #### load the pre-trained audio_encoder, we do not need to load wav2lip model here. 45 | # wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=torch.device(device))['state_dict'] 46 | # state_dict = self.audio_encoder.state_dict() 47 | 48 | # for k,v in wav2lip_state_dict.items(): 49 | # if 'audio_encoder' in k: 50 | # state_dict[k.replace('module.audio_encoder.', '')] = v 51 | # self.audio_encoder.load_state_dict(state_dict) 52 | 53 | 54 | def forward(self, audio_sequences): 55 | # audio_sequences = (B, T, 1, 80, 16) 56 | B = audio_sequences.size(0) 57 | 58 | audio_sequences = torch.cat([audio_sequences[:, i] for i in range(audio_sequences.size(1))], dim=0) 59 | 60 | audio_embedding = self.audio_encoder(audio_sequences) # B, 512, 1, 1 61 | dim = audio_embedding.shape[1] 62 | audio_embedding = audio_embedding.reshape((B, -1, dim, 1, 1)) 63 | 64 | return audio_embedding.squeeze(-1).squeeze(-1) #B seq_len+1 512 65 | -------------------------------------------------------------------------------- /modules/annotator/midas/midas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 
2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/util/notebooks.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | from typing import Union 4 | 5 | import ipywidgets as widgets 6 | import numpy as np 7 | import torch 8 | from PIL import Image 9 | 10 | from ..models.nn.camera import DifferentiableCameraBatch, DifferentiableProjectiveCamera 11 | from ..models.transmitter.base import Transmitter, VectorDecoder 12 | from ..rendering.torch_mesh import TorchMesh 13 | from ..util.collections import AttrDict 14 | 15 | 16 | def create_pan_cameras(size: int, device: torch.device) -> DifferentiableCameraBatch: 17 | origins = [] 18 | xs = [] 19 | ys = [] 20 | zs = [] 21 | for theta in np.linspace(0, 2 * np.pi, num=20): 22 | z = np.array([np.sin(theta), np.cos(theta), -0.5]) 23 | z /= np.sqrt(np.sum(z**2)) 24 | origin = -z * 4 25 | x = np.array([np.cos(theta), 
-np.sin(theta), 0.0]) 26 | y = np.cross(z, x) 27 | origins.append(origin) 28 | xs.append(x) 29 | ys.append(y) 30 | zs.append(z) 31 | return DifferentiableCameraBatch( 32 | shape=(1, len(xs)), 33 | flat_camera=DifferentiableProjectiveCamera( 34 | origin=torch.from_numpy(np.stack(origins, axis=0)).float().to(device), 35 | x=torch.from_numpy(np.stack(xs, axis=0)).float().to(device), 36 | y=torch.from_numpy(np.stack(ys, axis=0)).float().to(device), 37 | z=torch.from_numpy(np.stack(zs, axis=0)).float().to(device), 38 | width=size, 39 | height=size, 40 | x_fov=0.7, 41 | y_fov=0.7, 42 | ), 43 | ) 44 | 45 | 46 | @torch.no_grad() 47 | def decode_latent_images( 48 | xm: Union[Transmitter, VectorDecoder], 49 | latent: torch.Tensor, 50 | cameras: DifferentiableCameraBatch, 51 | rendering_mode: str = "stf", 52 | ): 53 | decoded = xm.renderer.render_views( 54 | AttrDict(cameras=cameras), 55 | params=(xm.encoder if isinstance(xm, Transmitter) else xm).bottleneck_to_params( 56 | latent[None] 57 | ), 58 | options=AttrDict(rendering_mode=rendering_mode, render_with_direction=False), 59 | ) 60 | arr = decoded.channels.clamp(0, 255).to(torch.uint8)[0].cpu().numpy() 61 | return [Image.fromarray(x) for x in arr] 62 | 63 | 64 | @torch.no_grad() 65 | def decode_latent_mesh( 66 | xm: Union[Transmitter, VectorDecoder], 67 | latent: torch.Tensor, 68 | ) -> TorchMesh: 69 | decoded = xm.renderer.render_views( 70 | AttrDict(cameras=create_pan_cameras(2, latent.device)), # lowest resolution possible 71 | params=(xm.encoder if isinstance(xm, Transmitter) else xm).bottleneck_to_params( 72 | latent[None] 73 | ), 74 | options=AttrDict(rendering_mode="stf", render_with_direction=False), 75 | ) 76 | return decoded.raw_meshes[0] 77 | 78 | 79 | def gif_widget(images): 80 | writer = io.BytesIO() 81 | images[0].save( 82 | writer, format="GIF", save_all=True, append_images=images[1:], duration=100, loop=0 83 | ) 84 | writer.seek(0) 85 | data = base64.b64encode(writer.read()).decode("ascii") 86 | return widgets.HTML(f'') 87 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Iterable, Optional 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from ...rendering.mesh import * 8 | 9 | from ._utils import cross_product, normalize 10 | 11 | 12 | @dataclass 13 | class Rays: 14 | """ 15 | A ray in ray casting. 16 | """ 17 | 18 | origins: torch.Tensor # [N x 3] float tensor 19 | directions: torch.Tensor # [N x 3] float tensor 20 | 21 | def normalized_directions(self) -> torch.Tensor: 22 | return normalize(self.directions) 23 | 24 | 25 | @dataclass 26 | class RayCollisions: 27 | """ 28 | The result of casting N rays onto a mesh. 
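    Every field is indexed per ray: `collides` marks whether a ray hit the mesh,
    `ray_dists` is the distance along the ray, `tri_indices` and `barycentric`
    locate the hit point on a triangle, and `normals` is the normal of that triangle.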
29 | """ 30 | 31 | collides: torch.Tensor # [N] boolean tensor 32 | ray_dists: torch.Tensor # [N] float tensor 33 | tri_indices: torch.Tensor # [N] long tensor 34 | barycentric: torch.Tensor # [N x 3] float tensor 35 | normals: torch.Tensor # [N x 3] float tensor 36 | 37 | @classmethod 38 | def collect(cls, it: Iterable["RayCollisions"]) -> "RayCollisions": 39 | res = None 40 | for x in it: 41 | if res is None: 42 | res = x 43 | else: 44 | res = cls( 45 | collides=torch.cat([res.collides, x.collides]), 46 | ray_dists=torch.cat([res.ray_dists, x.ray_dists]), 47 | tri_indices=torch.cat([res.tri_indices, x.tri_indices]), 48 | barycentric=torch.cat([res.barycentric, x.barycentric]), 49 | normals=torch.cat([res.normals, x.normals]), 50 | ) 51 | if res is None: 52 | raise ValueError("cannot collect an empty iterable of RayCollisions") 53 | return res 54 | 55 | 56 | @dataclass 57 | class TriMesh: 58 | faces: torch.Tensor # [N x 3] long tensor 59 | vertices: torch.Tensor # [N x 3] float tensor 60 | 61 | vertex_colors: Optional[torch.Tensor] = None 62 | 63 | def normals(self) -> torch.Tensor: 64 | """ 65 | Returns an [N x 3] batch of normal vectors per triangle assuming the 66 | right-hand rule. 67 | """ 68 | tris = self.vertices[self.faces] 69 | v1 = tris[:, 1] - tris[:, 0] 70 | v2 = tris[:, 2] - tris[:, 0] 71 | return normalize(cross_product(v1, v2)) 72 | 73 | @classmethod 74 | def from_numpy(cls, x: TriMesh) -> "TriMesh": 75 | vertex_colors = None 76 | if all(ch in x.vertex_channels for ch in "RGB"): 77 | vertex_colors = torch.from_numpy( 78 | np.stack([x.vertex_channels[ch] for ch in "RGB"], axis=-1) 79 | ) 80 | return cls( 81 | faces=torch.from_numpy(x.faces), 82 | vertices=torch.from_numpy(x.verts), 83 | vertex_colors=vertex_colors, 84 | ) 85 | 86 | def to(self, *args, **kwargs) -> "TriMesh": 87 | return TriMesh( 88 | faces=self.faces.to(*args, **kwargs), 89 | vertices=self.vertices.to(*args, **kwargs), 90 | vertex_colors=None 91 | if self.vertex_colors is None 92 | else self.vertex_colors.to(*args, **kwargs), 93 | ) 94 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/diffusion/sample.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Optional 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from .gaussian_diffusion import GaussianDiffusion 7 | from .k_diffusion import karras_sample 8 | 9 | DEFAULT_KARRAS_STEPS = 64 10 | DEFAULT_KARRAS_SIGMA_MIN = 1e-3 11 | DEFAULT_KARRAS_SIGMA_MAX = 160 12 | DEFAULT_KARRAS_S_CHURN = 0.0 13 | 14 | 15 | def uncond_guide_model( 16 | model: Callable[..., torch.Tensor], scale: float 17 | ) -> Callable[..., torch.Tensor]: 18 | def model_fn(x_t, ts, **kwargs): 19 | half = x_t[: len(x_t) // 2] 20 | combined = torch.cat([half, half], dim=0) 21 | model_out = model(combined, ts, **kwargs) 22 | eps, rest = model_out[:, :3], model_out[:, 3:] 23 | cond_eps, uncond_eps = torch.chunk(eps, 2, dim=0) 24 | half_eps = uncond_eps + scale * (cond_eps - uncond_eps) 25 | eps = torch.cat([half_eps, half_eps], dim=0) 26 | return torch.cat([eps, rest], dim=1) 27 | 28 | return model_fn 29 | 30 | 31 | def sample_latents( 32 | *, 33 | batch_size: int, 34 | model: nn.Module, 35 | diffusion: GaussianDiffusion, 36 | model_kwargs: Dict[str, Any], 37 | guidance_scale: float, 38 | clip_denoised: bool, 39 | use_fp16: bool, 40 | use_karras: bool, 41 | karras_steps: int, 42 | sigma_min: float, 43 | sigma_max: float, 44 | s_churn: float, 45 | device: 
Optional[torch.device] = None, 46 | progress: bool = False, 47 | ) -> torch.Tensor: 48 | sample_shape = (batch_size, model.d_latent) 49 | 50 | if device is None: 51 | device = next(model.parameters()).device 52 | 53 | if hasattr(model, "cached_model_kwargs"): 54 | model_kwargs = model.cached_model_kwargs(batch_size, model_kwargs) 55 | if guidance_scale != 1.0 and guidance_scale != 0.0: 56 | for k, v in model_kwargs.copy().items(): 57 | model_kwargs[k] = torch.cat([v, torch.zeros_like(v)], dim=0) 58 | 59 | sample_shape = (batch_size, model.d_latent) 60 | with torch.autocast(device_type=device.type, enabled=use_fp16): 61 | if use_karras: 62 | samples = karras_sample( 63 | diffusion=diffusion, 64 | model=model, 65 | shape=sample_shape, 66 | steps=karras_steps, 67 | clip_denoised=clip_denoised, 68 | model_kwargs=model_kwargs, 69 | device=device, 70 | sigma_min=sigma_min, 71 | sigma_max=sigma_max, 72 | s_churn=s_churn, 73 | guidance_scale=guidance_scale, 74 | progress=progress, 75 | ) 76 | else: 77 | internal_batch_size = batch_size 78 | if guidance_scale != 1.0: 79 | model = uncond_guide_model(model, guidance_scale) 80 | internal_batch_size *= 2 81 | samples = diffusion.p_sample_loop( 82 | model, 83 | shape=(internal_batch_size, *sample_shape[1:]), 84 | model_kwargs=model_kwargs, 85 | device=device, 86 | clip_denoised=clip_denoised, 87 | progress=progress, 88 | ) 89 | 90 | return samples 91 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/modules/discriminator.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | from facerender.modules.util import kp2gaussian 4 | import torch 5 | 6 | 7 | class DownBlock2d(nn.Module): 8 | """ 9 | Simple block for processing video (encoder). 
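    Applies a 4x4 convolution (optionally spectral-normalized), optional instance
    normalization, LeakyReLU(0.2), and an optional 2x2 average pooling step.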
10 | """ 11 | 12 | def __init__(self, in_features, out_features, norm=False, kernel_size=4, pool=False, sn=False): 13 | super(DownBlock2d, self).__init__() 14 | self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size) 15 | 16 | if sn: 17 | self.conv = nn.utils.spectral_norm(self.conv) 18 | 19 | if norm: 20 | self.norm = nn.InstanceNorm2d(out_features, affine=True) 21 | else: 22 | self.norm = None 23 | self.pool = pool 24 | 25 | def forward(self, x): 26 | out = x 27 | out = self.conv(out) 28 | if self.norm: 29 | out = self.norm(out) 30 | out = F.leaky_relu(out, 0.2) 31 | if self.pool: 32 | out = F.avg_pool2d(out, (2, 2)) 33 | return out 34 | 35 | 36 | class Discriminator(nn.Module): 37 | """ 38 | Discriminator similar to Pix2Pix 39 | """ 40 | 41 | def __init__(self, num_channels=3, block_expansion=64, num_blocks=4, max_features=512, 42 | sn=False, **kwargs): 43 | super(Discriminator, self).__init__() 44 | 45 | down_blocks = [] 46 | for i in range(num_blocks): 47 | down_blocks.append( 48 | DownBlock2d(num_channels if i == 0 else min(max_features, block_expansion * (2 ** i)), 49 | min(max_features, block_expansion * (2 ** (i + 1))), 50 | norm=(i != 0), kernel_size=4, pool=(i != num_blocks - 1), sn=sn)) 51 | 52 | self.down_blocks = nn.ModuleList(down_blocks) 53 | self.conv = nn.Conv2d(self.down_blocks[-1].conv.out_channels, out_channels=1, kernel_size=1) 54 | if sn: 55 | self.conv = nn.utils.spectral_norm(self.conv) 56 | 57 | def forward(self, x): 58 | feature_maps = [] 59 | out = x 60 | 61 | for down_block in self.down_blocks: 62 | feature_maps.append(down_block(out)) 63 | out = feature_maps[-1] 64 | prediction_map = self.conv(out) 65 | 66 | return feature_maps, prediction_map 67 | 68 | 69 | class MultiScaleDiscriminator(nn.Module): 70 | """ 71 | Multi-scale (scale) discriminator 72 | """ 73 | 74 | def __init__(self, scales=(), **kwargs): 75 | super(MultiScaleDiscriminator, self).__init__() 76 | self.scales = scales 77 | discs = {} 78 | for scale in scales: 79 | discs[str(scale).replace('.', '-')] = Discriminator(**kwargs) 80 | self.discs = nn.ModuleDict(discs) 81 | 82 | def forward(self, x): 83 | out_dict = {} 84 | for scale, disc in self.discs.items(): 85 | scale = str(scale).replace('-', '.') 86 | key = 'prediction_' + scale 87 | feature_maps, prediction_map = disc(x[key]) 88 | out_dict['feature_maps_' + scale] = feature_maps 89 | out_dict['prediction_map_' + scale] = prediction_map 90 | return out_dict 91 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2exp_models/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | class Conv2d(nn.Module): 6 | def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, use_act = True, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | self.conv_block = nn.Sequential( 9 | nn.Conv2d(cin, cout, kernel_size, stride, padding), 10 | nn.BatchNorm2d(cout) 11 | ) 12 | self.act = nn.ReLU() 13 | self.residual = residual 14 | self.use_act = use_act 15 | 16 | def forward(self, x): 17 | out = self.conv_block(x) 18 | if self.residual: 19 | out += x 20 | 21 | if self.use_act: 22 | return self.act(out) 23 | else: 24 | return out 25 | 26 | class SimpleWrapperV2(nn.Module): 27 | def __init__(self) -> None: 28 | super().__init__() 29 | self.audio_encoder = nn.Sequential( 30 | Conv2d(1, 32, kernel_size=3, stride=1, 
padding=1), 31 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 32 | Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True), 33 | 34 | Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1), 35 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 36 | Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True), 37 | 38 | Conv2d(64, 128, kernel_size=3, stride=3, padding=1), 39 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 40 | Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True), 41 | 42 | Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1), 43 | Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True), 44 | 45 | Conv2d(256, 512, kernel_size=3, stride=1, padding=0), 46 | Conv2d(512, 512, kernel_size=1, stride=1, padding=0), 47 | ) 48 | 49 | #### load the pre-trained audio_encoder 50 | #self.audio_encoder = self.audio_encoder.to(device) 51 | ''' 52 | wav2lip_state_dict = torch.load('/apdcephfs_cq2/share_1290939/wenxuazhang/checkpoints/wav2lip.pth')['state_dict'] 53 | state_dict = self.audio_encoder.state_dict() 54 | 55 | for k,v in wav2lip_state_dict.items(): 56 | if 'audio_encoder' in k: 57 | print('init:', k) 58 | state_dict[k.replace('module.audio_encoder.', '')] = v 59 | self.audio_encoder.load_state_dict(state_dict) 60 | ''' 61 | 62 | self.mapping1 = nn.Linear(512+64+1, 64) 63 | #self.mapping2 = nn.Linear(30, 64) 64 | #nn.init.constant_(self.mapping1.weight, 0.) 65 | nn.init.constant_(self.mapping1.bias, 0.) 66 | 67 | def forward(self, x, ref, ratio): 68 | x = self.audio_encoder(x).view(x.size(0), -1) 69 | ref_reshape = ref.reshape(x.size(0), -1) 70 | ratio = ratio.reshape(x.size(0), -1) 71 | 72 | y = self.mapping1(torch.cat([x, ref_reshape, ratio], dim=1)) 73 | out = y.reshape(ref.shape[0], ref.shape[1], -1) #+ ref # resudial 74 | return out 75 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/models/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules related to objective functions, optimizations, and network architectures. 2 | 3 | To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. 4 | You need to implement the following five functions: 5 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). 6 | -- : unpack data from dataset and apply preprocessing. 7 | -- : produce intermediate results. 8 | -- : calculate loss, gradients, and update network weights. 9 | -- : (optionally) add model-specific options and set default options. 10 | 11 | In the function <__init__>, you need to define four lists: 12 | -- self.loss_names (str list): specify the training losses that you want to plot and save. 13 | -- self.model_names (str list): define networks used in our training. 14 | -- self.visual_names (str list): specify the images that you want to display and save. 15 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage. 16 | 17 | Now you can use the model class by specifying flag '--model dummy'. 18 | See our template model class 'template_model.py' for more details. 
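
For illustration only, a minimal skeleton following the conventions above might look
like the following (the class body and the exact signature of modify_commandline_options
are illustrative assumptions, not part of this package):

    class DummyModel(BaseModel):
        @staticmethod
        def modify_commandline_options(parser, is_train=True):
            # add model-specific options here (optional)
            return parser

        def __init__(self, opt):
            BaseModel.__init__(self, opt)
            # the four required lists
            self.loss_names = []
            self.model_names = []
            self.visual_names = []
            self.optimizers = []

        def set_input(self, input):
            # unpack data from the dataset and apply preprocessing
            self.input = input

        def forward(self):
            # produce intermediate results
            pass

        def optimize_parameters(self):
            # calculate losses, gradients, and update network weights
            pass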
19 | """ 20 | 21 | import importlib 22 | from .base_model import BaseModel 23 | 24 | 25 | def find_model_using_name(model_name): 26 | """Import the module "models/[model_name]_model.py". 27 | 28 | In the file, the class called DatasetNameModel() will 29 | be instantiated. It has to be a subclass of BaseModel, 30 | and it is case-insensitive. 31 | """ 32 | model_filename = "face3d.models." + model_name + "_model" 33 | modellib = importlib.import_module(model_filename) 34 | model = None 35 | target_model_name = model_name.replace('_', '') + 'model' 36 | for name, cls in modellib.__dict__.items(): 37 | if name.lower() == target_model_name.lower() \ 38 | and issubclass(cls, BaseModel): 39 | model = cls 40 | 41 | if model is None: 42 | print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) 43 | exit(0) 44 | 45 | return model 46 | 47 | 48 | def get_option_setter(model_name): 49 | """Return the static method of the model class.""" 50 | model_class = find_model_using_name(model_name) 51 | return model_class.modify_commandline_options 52 | 53 | 54 | def create_model(opt): 55 | """Create a model given the option. 56 | 57 | This function warps the class CustomDatasetDataLoader. 58 | This is the main interface between this package and 'train.py'/'test.py' 59 | 60 | Example: 61 | >>> from models import create_model 62 | >>> model = create_model(opt) 63 | """ 64 | model = find_model_using_name(opt.model) 65 | instance = model(opt) 66 | print("model [%s] was created" % type(instance).__name__) 67 | return instance 68 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/view_data.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import zipfile 4 | from typing import BinaryIO, List, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | 9 | from ...rendering.view_data import Camera, ProjectiveCamera, ViewData 10 | 11 | 12 | class BlenderViewData(ViewData): 13 | """ 14 | Interact with a dataset zipfile exported by view_data.py. 15 | """ 16 | 17 | def __init__(self, f_obj: BinaryIO): 18 | self.zipfile = zipfile.ZipFile(f_obj, mode="r") 19 | self.infos = [] 20 | with self.zipfile.open("info.json", "r") as f: 21 | self.info = json.load(f) 22 | self.channels = list(self.info.get("channels", "RGBAD")) 23 | assert set("RGBA").issubset( 24 | set(self.channels) 25 | ), "The blender output should at least have RGBA images." 26 | names = set(x.filename for x in self.zipfile.infolist()) 27 | for i in itertools.count(): 28 | name = f"{i:05}.json" 29 | if name not in names: 30 | break 31 | with self.zipfile.open(name, "r") as f: 32 | self.infos.append(json.load(f)) 33 | 34 | @property 35 | def num_views(self) -> int: 36 | return len(self.infos) 37 | 38 | @property 39 | def channel_names(self) -> List[str]: 40 | return list(self.channels) 41 | 42 | def load_view(self, index: int, channels: List[str]) -> Tuple[Camera, np.ndarray]: 43 | for ch in channels: 44 | if ch not in self.channel_names: 45 | raise ValueError(f"unsupported channel: {ch}") 46 | 47 | # Gather (a superset of) the requested channels. 
48 | channel_map = {} 49 | if any(x in channels for x in "RGBA"): 50 | with self.zipfile.open(f"{index:05}.png", "r") as f: 51 | rgba = np.array(Image.open(f)).astype(np.float32) / 255.0 52 | channel_map.update(zip("RGBA", rgba.transpose([2, 0, 1]))) 53 | if "D" in channels: 54 | with self.zipfile.open(f"{index:05}_depth.png", "r") as f: 55 | # Decode a 16-bit fixed-point number. 56 | fp = np.array(Image.open(f)) 57 | inf_dist = fp == 0xFFFF 58 | channel_map["D"] = np.where( 59 | inf_dist, 60 | np.inf, 61 | self.infos[index]["max_depth"] * (fp.astype(np.float32) / 65536), 62 | ) 63 | if "MatAlpha" in channels: 64 | with self.zipfile.open(f"{index:05}_MatAlpha.png", "r") as f: 65 | channel_map["MatAlpha"] = np.array(Image.open(f)).astype(np.float32) / 65536 66 | 67 | # The order of channels is user-specified. 68 | combined = np.stack([channel_map[k] for k in channels], axis=-1) 69 | 70 | h, w, _ = combined.shape 71 | return self.camera(index, w, h), combined 72 | 73 | def camera(self, index: int, width: int, height: int) -> ProjectiveCamera: 74 | info = self.infos[index] 75 | return ProjectiveCamera( 76 | origin=np.array(info["origin"], dtype=np.float32), 77 | x=np.array(info["x"], dtype=np.float32), 78 | y=np.array(info["y"], dtype=np.float32), 79 | z=np.array(info["z"], dtype=np.float32), 80 | width=width, 81 | height=height, 82 | x_fov=info["x_fov"], 83 | y_fov=info["y_fov"], 84 | ) 85 | -------------------------------------------------------------------------------- /modules/text2video_zero/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .model import ( 4 | CannyText2VideoModel, 5 | PoseText2VideoModel, 6 | DepthText2VideoModel, 7 | VideoPix2PixModel, 8 | Text2VideoModel, 9 | ) 10 | 11 | from utils import generate_video_name_mp4, get_new_video_name 12 | 13 | 14 | class CannyText2Video: 15 | def __init__(self, device): 16 | self.device = device 17 | self.model = CannyText2VideoModel(device, dtype=torch.float16) 18 | 19 | def inference(self, inputs: str, resolution=512): 20 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 21 | out_path = get_new_video_name(vid_path, func_name="canny2video") 22 | self.model.process_controlnet_canny( 23 | vid_path, 24 | prompt, 25 | save_path=out_path, 26 | resolution=resolution, 27 | ) 28 | return out_path 29 | 30 | 31 | class PoseText2Video: 32 | def __init__(self, device): 33 | self.device = device 34 | self.model = PoseText2VideoModel(device, dtype=torch.float16) 35 | 36 | def inference(self, inputs: str, resolution=512): 37 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 38 | out_path = get_new_video_name(vid_path, func_name="pose2video") 39 | self.model.process_controlnet_pose( 40 | vid_path, 41 | prompt, 42 | save_path=out_path, 43 | resolution=resolution, 44 | ) 45 | return out_path 46 | 47 | 48 | class DepthText2Video: 49 | def __init__(self, device): 50 | self.device = device 51 | self.model = DepthText2VideoModel(device, dtype=torch.float16) 52 | 53 | def inference(self, inputs: str, resolution=512): 54 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 55 | out_path = get_new_video_name(vid_path, func_name="depth2video") 56 | self.model.process_controlnet_depth( 57 | vid_path, 58 | prompt, 59 | save_path=out_path, 60 | resolution=resolution, 61 | ) 62 | return out_path 63 | 64 | 65 | class VideoPix2Pix: 66 | def __init__(self, device): 67 | self.device = device 68 | self.model = 
VideoPix2PixModel(device, dtype=torch.float16) 69 | 70 | def inference(self, inputs: str): 71 | vid_path, prompt = inputs.split(",")[0], ",".join(inputs.split(",")[1:]) 72 | out_path = get_new_video_name(vid_path, func_name="pix2pix") 73 | self.model.process_pix2pix( 74 | vid_path, 75 | prompt, 76 | save_path=out_path, 77 | ) 78 | return out_path 79 | 80 | 81 | class Text2Video: 82 | def __init__(self, device): 83 | self.device = device 84 | self.model = Text2VideoModel(device, dtype=torch.float16) 85 | 86 | def inference(self, inputs: str, resolution=512): 87 | prompt = inputs 88 | params = { 89 | "t0": 44, 90 | "t1": 47, 91 | "motion_field_strength_x": 12, 92 | "motion_field_strength_y": 12, 93 | "video_length": 16, 94 | } 95 | out_path, fps = generate_video_name_mp4(), 8 96 | self.model.process_text2video( 97 | prompt, 98 | fps=fps, 99 | path=out_path, 100 | resolution=resolution, 101 | **params, 102 | ) 103 | return out_path 104 | -------------------------------------------------------------------------------- /modules/sadtalker/src/facerender/sync_batchnorm/replicate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File : replicate.py 3 | # Author : Jiayuan Mao 4 | # Email : maojiayuan@gmail.com 5 | # Date : 27/01/2018 6 | # 7 | # This file is part of Synchronized-BatchNorm-PyTorch. 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch 9 | # Distributed under MIT License. 10 | 11 | import functools 12 | 13 | from torch.nn.parallel.data_parallel import DataParallel 14 | 15 | __all__ = [ 16 | 'CallbackContext', 17 | 'execute_replication_callbacks', 18 | 'DataParallelWithCallback', 19 | 'patch_replication_callback' 20 | ] 21 | 22 | 23 | class CallbackContext(object): 24 | pass 25 | 26 | 27 | def execute_replication_callbacks(modules): 28 | """ 29 | Execute an replication callback `__data_parallel_replicate__` on each module created by original replication. 30 | 31 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 32 | 33 | Note that, as all modules are isomorphism, we assign each sub-module with a context 34 | (shared among multiple copies of this module on different devices). 35 | Through this context, different copies can share some information. 36 | 37 | We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback 38 | of any slave copies. 39 | """ 40 | master_copy = modules[0] 41 | nr_modules = len(list(master_copy.modules())) 42 | ctxs = [CallbackContext() for _ in range(nr_modules)] 43 | 44 | for i, module in enumerate(modules): 45 | for j, m in enumerate(module.modules()): 46 | if hasattr(m, '__data_parallel_replicate__'): 47 | m.__data_parallel_replicate__(ctxs[j], i) 48 | 49 | 50 | class DataParallelWithCallback(DataParallel): 51 | """ 52 | Data Parallel with a replication callback. 53 | 54 | An replication callback `__data_parallel_replicate__` of each module will be invoked after being created by 55 | original `replicate` function. 56 | The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` 57 | 58 | Examples: 59 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 60 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 61 | # sync_bn.__data_parallel_replicate__ will be invoked. 
62 | """ 63 | 64 | def replicate(self, module, device_ids): 65 | modules = super(DataParallelWithCallback, self).replicate(module, device_ids) 66 | execute_replication_callbacks(modules) 67 | return modules 68 | 69 | 70 | def patch_replication_callback(data_parallel): 71 | """ 72 | Monkey-patch an existing `DataParallel` object. Add the replication callback. 73 | Useful when you have customized `DataParallel` implementation. 74 | 75 | Examples: 76 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 77 | > sync_bn = DataParallel(sync_bn, device_ids=[0, 1]) 78 | > patch_replication_callback(sync_bn) 79 | # this is equivalent to 80 | > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) 81 | > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) 82 | """ 83 | 84 | assert isinstance(data_parallel, DataParallel) 85 | 86 | old_replicate = data_parallel.replicate 87 | 88 | @functools.wraps(old_replicate) 89 | def new_replicate(module, device_ids): 90 | modules = old_replicate(module, device_ids) 91 | execute_replication_callbacks(modules) 92 | return modules 93 | 94 | data_parallel.replicate = new_replicate 95 | -------------------------------------------------------------------------------- /modules/annotator/midas/midas/dpt_depth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .base_model import BaseModel 6 | from .blocks import ( 7 | FeatureFusionBlock, 8 | FeatureFusionBlock_custom, 9 | Interpolate, 10 | _make_encoder, 11 | forward_vit, 12 | ) 13 | 14 | 15 | def _make_fusion_block(features, use_bn): 16 | return FeatureFusionBlock_custom( 17 | features, 18 | nn.ReLU(False), 19 | deconv=False, 20 | bn=use_bn, 21 | expand=False, 22 | align_corners=True, 23 | ) 24 | 25 | 26 | class DPT(BaseModel): 27 | def __init__( 28 | self, 29 | head, 30 | features=256, 31 | backbone="vitb_rn50_384", 32 | readout="project", 33 | channels_last=False, 34 | use_bn=False, 35 | ): 36 | 37 | super(DPT, self).__init__() 38 | 39 | self.channels_last = channels_last 40 | 41 | hooks = { 42 | "vitb_rn50_384": [0, 1, 8, 11], 43 | "vitb16_384": [2, 5, 8, 11], 44 | "vitl16_384": [5, 11, 17, 23], 45 | } 46 | 47 | # Instantiate backbone and reassemble blocks 48 | self.pretrained, self.scratch = _make_encoder( 49 | backbone, 50 | features, 51 | False, # Set to true of you want to train from scratch, uses ImageNet weights 52 | groups=1, 53 | expand=False, 54 | exportable=False, 55 | hooks=hooks[backbone], 56 | use_readout=readout, 57 | ) 58 | 59 | self.scratch.refinenet1 = _make_fusion_block(features, use_bn) 60 | self.scratch.refinenet2 = _make_fusion_block(features, use_bn) 61 | self.scratch.refinenet3 = _make_fusion_block(features, use_bn) 62 | self.scratch.refinenet4 = _make_fusion_block(features, use_bn) 63 | 64 | self.scratch.output_conv = head 65 | 66 | 67 | def forward(self, x): 68 | if self.channels_last == True: 69 | x.contiguous(memory_format=torch.channels_last) 70 | 71 | layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) 72 | 73 | layer_1_rn = self.scratch.layer1_rn(layer_1) 74 | layer_2_rn = self.scratch.layer2_rn(layer_2) 75 | layer_3_rn = self.scratch.layer3_rn(layer_3) 76 | layer_4_rn = self.scratch.layer4_rn(layer_4) 77 | 78 | path_4 = self.scratch.refinenet4(layer_4_rn) 79 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 80 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 81 | path_1 = self.scratch.refinenet1(path_2, 
layer_1_rn) 82 | 83 | out = self.scratch.output_conv(path_1) 84 | 85 | return out 86 | 87 | 88 | class DPTDepthModel(DPT): 89 | def __init__(self, path=None, non_negative=True, **kwargs): 90 | features = kwargs["features"] if "features" in kwargs else 256 91 | 92 | head = nn.Sequential( 93 | nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), 94 | Interpolate(scale_factor=2, mode="bilinear", align_corners=True), 95 | nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), 96 | nn.ReLU(True), 97 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 98 | nn.ReLU(True) if non_negative else nn.Identity(), 99 | nn.Identity(), 100 | ) 101 | 102 | super().__init__(head, **kwargs) 103 | 104 | if path is not None: 105 | self.load(path) 106 | 107 | def forward(self, x): 108 | return super().forward(x).squeeze(dim=1) 109 | 110 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/util/preprocess.py: -------------------------------------------------------------------------------- 1 | """This script contains the image preprocessing code for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import numpy as np 5 | from scipy.io import loadmat 6 | from PIL import Image 7 | import cv2 8 | import os 9 | from skimage import transform as trans 10 | import torch 11 | import warnings 12 | warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 13 | warnings.filterwarnings("ignore", category=FutureWarning) 14 | 15 | 16 | 17 | # calculating least square problem for image alignment 18 | def POS(xp, x): 19 | npts = xp.shape[1] 20 | 21 | A = np.zeros([2*npts, 8]) 22 | 23 | A[0:2*npts-1:2, 0:3] = x.transpose() 24 | A[0:2*npts-1:2, 3] = 1 25 | 26 | A[1:2*npts:2, 4:7] = x.transpose() 27 | A[1:2*npts:2, 7] = 1 28 | 29 | b = np.reshape(xp.transpose(), [2*npts, 1]) 30 | 31 | k, _, _, _ = np.linalg.lstsq(A, b) 32 | 33 | R1 = k[0:3] 34 | R2 = k[4:7] 35 | sTx = k[3] 36 | sTy = k[7] 37 | s = (np.linalg.norm(R1) + np.linalg.norm(R2))/2 38 | t = np.stack([sTx, sTy], axis=0) 39 | 40 | return t, s 41 | 42 | # resize and crop images for face reconstruction 43 | def resize_n_crop_img(img, lm, t, s, target_size=224., mask=None): 44 | w0, h0 = img.size 45 | w = (w0*s).astype(np.int32) 46 | h = (h0*s).astype(np.int32) 47 | left = (w/2 - target_size/2 + float((t[0] - w0/2)*s)).astype(np.int32) 48 | right = left + target_size 49 | up = (h/2 - target_size/2 + float((h0/2 - t[1])*s)).astype(np.int32) 50 | below = up + target_size 51 | 52 | img = img.resize((w, h), resample=Image.BICUBIC) 53 | img = img.crop((left, up, right, below)) 54 | 55 | if mask is not None: 56 | mask = mask.resize((w, h), resample=Image.BICUBIC) 57 | mask = mask.crop((left, up, right, below)) 58 | 59 | lm = np.stack([lm[:, 0] - t[0] + w0/2, lm[:, 1] - 60 | t[1] + h0/2], axis=1)*s 61 | lm = lm - np.reshape( 62 | np.array([(w/2 - target_size/2), (h/2-target_size/2)]), [1, 2]) 63 | 64 | return img, lm, mask 65 | 66 | # utils for face reconstruction 67 | def extract_5p(lm): 68 | lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 69 | lm5p = np.stack([lm[lm_idx[0], :], np.mean(lm[lm_idx[[1, 2]], :], 0), np.mean( 70 | lm[lm_idx[[3, 4]], :], 0), lm[lm_idx[5], :], lm[lm_idx[6], :]], axis=0) 71 | lm5p = lm5p[[1, 2, 0, 3, 4], :] 72 | return lm5p 73 | 74 | # utils for face reconstruction 75 | def align_img(img, lm, lm3D, mask=None, target_size=224., rescale_factor=102.): 76 | """ 77 | Return: 78 | transparams --numpy.array (raw_W, raw_H, scale, tx, ty) 79 | img_new --PIL.Image 
(target_size, target_size, 3) 80 | lm_new --numpy.array (68, 2), y direction is opposite to v direction 81 | mask_new --PIL.Image (target_size, target_size) 82 | 83 | Parameters: 84 | img --PIL.Image (raw_H, raw_W, 3) 85 | lm --numpy.array (68, 2), y direction is opposite to v direction 86 | lm3D --numpy.array (5, 3) 87 | mask --PIL.Image (raw_H, raw_W, 3) 88 | """ 89 | 90 | w0, h0 = img.size 91 | if lm.shape[0] != 5: 92 | lm5p = extract_5p(lm) 93 | else: 94 | lm5p = lm 95 | 96 | # calculate translation and scale factors using 5 facial landmarks and standard landmarks of a 3D face 97 | t, s = POS(lm5p.transpose(), lm3D.transpose()) 98 | s = rescale_factor/s 99 | 100 | # processing the image 101 | img_new, lm_new, mask_new = resize_n_crop_img(img, lm, t, s, target_size=target_size, mask=mask) 102 | trans_params = np.array([w0, h0, s, t[0], t[1]]) 103 | 104 | return trans_params, img_new, lm_new, mask_new 105 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/init_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | from torch.hub import download_url_to_file 5 | 6 | 7 | def init_path(checkpoint_dir, config_dir, size=512, old_version=False, preprocess='crop'): 8 | 9 | print("checkpoint_dir is:", checkpoint_dir) 10 | 11 | if old_version: 12 | #### load all the checkpoint of `pth` 13 | sadtalker_paths = { 14 | 'wav2lip_checkpoint' : os.path.join(checkpoint_dir, 'wav2lip.pth'), 15 | 'audio2pose_checkpoint' : os.path.join(checkpoint_dir, 'auido2pose_00140-model.pth'), 16 | 'audio2exp_checkpoint' : os.path.join(checkpoint_dir, 'auido2exp_00300-model.pth'), 17 | 'free_view_checkpoint' : os.path.join(checkpoint_dir, 'facevid2vid_00189-model.pth.tar'), 18 | 'path_of_net_recon_model' : os.path.join(checkpoint_dir, 'epoch_20.pth') 19 | } 20 | 21 | use_safetensor = False 22 | 23 | elif len(glob.glob(os.path.join(checkpoint_dir, '*.safetensors'))): 24 | print('using safetensor as default') 25 | sadtalker_paths = { 26 | "checkpoint":os.path.join(checkpoint_dir, 'SadTalker_V0.0.2_'+str(size)+'.safetensors'), 27 | } 28 | use_safetensor = True 29 | 30 | else: 31 | print(f"Begin to download models to {checkpoint_dir}...") 32 | os.makedirs(checkpoint_dir, exist_ok=True) 33 | 34 | checkpoint_urls = ["https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/mapping_00109-model.pth.tar", 35 | "https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/mapping_00229-model.pth.tar", 36 | "https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/SadTalker_V0.0.2_256.safetensors", 37 | "https://github.com/OpenTalker/SadTalker/releases/download/v0.0.2-rc/SadTalker_V0.0.2_512.safetensors"] 38 | 39 | for checkpoint_url in checkpoint_urls: 40 | download_url_to_file(checkpoint_url, checkpoint_dir, hash_prefix=None, progress=True) 41 | 42 | print('using safetensor as default') 43 | sadtalker_paths = { 44 | "checkpoint":os.path.join(checkpoint_dir, 'SadTalker_V0.0.2_'+str(size)+'.safetensors'), 45 | } 46 | 47 | use_safetensor = True 48 | # print("WARNING: The new version of the model will be updated by safetensor, you may need to download it mannully. 
We run the old version of the checkpoint this time!") 49 | # use_safetensor = False 50 | 51 | # sadtalker_paths = { 52 | # 'wav2lip_checkpoint' : os.path.join(checkpoint_dir, 'wav2lip.pth'), 53 | # 'audio2pose_checkpoint' : os.path.join(checkpoint_dir, 'auido2pose_00140-model.pth'), 54 | # 'audio2exp_checkpoint' : os.path.join(checkpoint_dir, 'auido2exp_00300-model.pth'), 55 | # 'free_view_checkpoint' : os.path.join(checkpoint_dir, 'facevid2vid_00189-model.pth.tar'), 56 | # 'path_of_net_recon_model' : os.path.join(checkpoint_dir, 'epoch_20.pth') 57 | # } 58 | 59 | sadtalker_paths['dir_of_BFM_fitting'] = os.path.join(config_dir) # , 'BFM_Fitting' 60 | sadtalker_paths['audio2pose_yaml_path'] = os.path.join(config_dir, 'auido2pose.yaml') 61 | sadtalker_paths['audio2exp_yaml_path'] = os.path.join(config_dir, 'auido2exp.yaml') 62 | sadtalker_paths['use_safetensor'] = use_safetensor # os.path.join(config_dir, 'auido2exp.yaml') 63 | 64 | if 'full' in preprocess: 65 | sadtalker_paths['mappingnet_checkpoint'] = os.path.join(checkpoint_dir, 'mapping_00109-model.pth.tar') 66 | sadtalker_paths['facerender_yaml'] = os.path.join(config_dir, 'facerender_still.yaml') 67 | else: 68 | sadtalker_paths['mappingnet_checkpoint'] = os.path.join(checkpoint_dir, 'mapping_00229-model.pth.tar') 69 | sadtalker_paths['facerender_yaml'] = os.path.join(config_dir, 'facerender.yaml') 70 | 71 | return sadtalker_paths -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/mesh.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import BinaryIO, Dict, Optional, Union 3 | 4 | import blobfile as bf 5 | import numpy as np 6 | 7 | from .ply_util import write_ply 8 | 9 | 10 | @dataclass 11 | class TriMesh: 12 | """ 13 | A 3D triangle mesh with optional data at the vertices and faces. 14 | """ 15 | 16 | # [N x 3] array of vertex coordinates. 17 | verts: np.ndarray 18 | 19 | # [M x 3] array of triangles, pointing to indices in verts. 20 | faces: np.ndarray 21 | 22 | # [P x 3] array of normal vectors per face. 23 | normals: Optional[np.ndarray] = None 24 | 25 | # Extra data per vertex and face. 26 | vertex_channels: Optional[Dict[str, np.ndarray]] = field(default_factory=dict) 27 | face_channels: Optional[Dict[str, np.ndarray]] = field(default_factory=dict) 28 | 29 | @classmethod 30 | def load(cls, f: Union[str, BinaryIO]) -> "TriMesh": 31 | """ 32 | Load the mesh from a .npz file. 33 | """ 34 | if isinstance(f, str): 35 | with bf.BlobFile(f, "rb") as reader: 36 | return cls.load(reader) 37 | else: 38 | obj = np.load(f) 39 | keys = list(obj.keys()) 40 | verts = obj["verts"] 41 | faces = obj["faces"] 42 | normals = obj["normals"] if "normals" in keys else None 43 | vertex_channels = {} 44 | face_channels = {} 45 | for key in keys: 46 | if key.startswith("v_"): 47 | vertex_channels[key[2:]] = obj[key] 48 | elif key.startswith("f_"): 49 | face_channels[key[2:]] = obj[key] 50 | return cls( 51 | verts=verts, 52 | faces=faces, 53 | normals=normals, 54 | vertex_channels=vertex_channels, 55 | face_channels=face_channels, 56 | ) 57 | 58 | def save(self, f: Union[str, BinaryIO]): 59 | """ 60 | Save the mesh to a .npz file. 
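        Channels are written with the same key convention that `load` reads back:
        vertex channels under "v_<name>" and face channels under "f_<name>".

        A minimal round-trip sketch (`mesh` is any TriMesh; the path is a placeholder):

            mesh.save("mesh.npz")
            restored = TriMesh.load("mesh.npz")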
61 | """ 62 | if isinstance(f, str): 63 | with bf.BlobFile(f, "wb") as writer: 64 | self.save(writer) 65 | else: 66 | obj_dict = dict(verts=self.verts, faces=self.faces) 67 | if self.normals is not None: 68 | obj_dict["normals"] = self.normals 69 | for k, v in self.vertex_channels.items(): 70 | obj_dict[f"v_{k}"] = v 71 | for k, v in self.face_channels.items(): 72 | obj_dict[f"f_{k}"] = v 73 | np.savez(f, **obj_dict) 74 | 75 | def has_vertex_colors(self) -> bool: 76 | return self.vertex_channels is not None and all(x in self.vertex_channels for x in "RGB") 77 | 78 | def write_ply(self, raw_f: BinaryIO): 79 | write_ply( 80 | raw_f, 81 | coords=self.verts, 82 | rgb=( 83 | np.stack([self.vertex_channels[x] for x in "RGB"], axis=1) 84 | if self.has_vertex_colors() 85 | else None 86 | ), 87 | faces=self.faces, 88 | ) 89 | 90 | def write_obj(self, raw_f: BinaryIO): 91 | if self.has_vertex_colors(): 92 | vertex_colors = np.stack([self.vertex_channels[x] for x in "RGB"], axis=1) 93 | vertices = [ 94 | "{} {} {} {} {} {}".format(*coord, *color) 95 | for coord, color in zip(self.verts.tolist(), vertex_colors.tolist()) 96 | ] 97 | else: 98 | vertices = ["{} {} {}".format(*coord) for coord in self.verts.tolist()] 99 | 100 | faces = [ 101 | "f {} {} {}".format(str(tri[0] + 1), str(tri[1] + 1), str(tri[2] + 1)) 102 | for tri in self.faces.tolist() 103 | ] 104 | 105 | combined_data = ["v " + vertex for vertex in vertices] + faces 106 | 107 | raw_f.writelines("\n".join(combined_data)) 108 | -------------------------------------------------------------------------------- /modules/annotator/openpose/hand.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import json 3 | import numpy as np 4 | import math 5 | import time 6 | from scipy.ndimage.filters import gaussian_filter 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | import torch 10 | from skimage.measure import label 11 | 12 | from .model import handpose_model 13 | from . 
import util 14 | 15 | 16 | class Hand(object): 17 | def __init__(self, model_path, device=None): 18 | self.device = device or torch.device( 19 | "cuda" if torch.cuda.is_available() else "cpu" 20 | ) 21 | self.model = handpose_model().to(self.device) 22 | model_dict = util.transfer(self.model, torch.load(model_path)) 23 | self.model.load_state_dict(model_dict) 24 | self.model.eval() 25 | 26 | def __call__(self, oriImg): 27 | scale_search = [0.5, 1.0, 1.5, 2.0] 28 | # scale_search = [0.5] 29 | boxsize = 368 30 | stride = 8 31 | padValue = 128 32 | thre = 0.05 33 | multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] 34 | heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22)) 35 | # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) 36 | 37 | for m in range(len(multiplier)): 38 | scale = multiplier[m] 39 | imageToTest = cv2.resize( 40 | oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC 41 | ) 42 | imageToTest_padded, pad = util.padRightDownCorner( 43 | imageToTest, stride, padValue 44 | ) 45 | im = ( 46 | np.transpose( 47 | np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1) 48 | ) 49 | / 256 50 | - 0.5 51 | ) 52 | im = np.ascontiguousarray(im) 53 | 54 | data = torch.from_numpy(im).float().to(self.device) 55 | # data = data.permute([2, 0, 1]).unsqueeze(0).float() 56 | with torch.no_grad(): 57 | output = self.model(data).cpu().numpy() 58 | # output = self.model(data).numpy()q 59 | 60 | # extract outputs, resize, and remove padding 61 | heatmap = np.transpose( 62 | np.squeeze(output), (1, 2, 0) 63 | ) # output 1 is heatmaps 64 | heatmap = cv2.resize( 65 | heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC 66 | ) 67 | heatmap = heatmap[ 68 | : imageToTest_padded.shape[0] - pad[2], 69 | : imageToTest_padded.shape[1] - pad[3], 70 | :, 71 | ] 72 | heatmap = cv2.resize( 73 | heatmap, 74 | (oriImg.shape[1], oriImg.shape[0]), 75 | interpolation=cv2.INTER_CUBIC, 76 | ) 77 | 78 | heatmap_avg += heatmap / len(multiplier) 79 | 80 | all_peaks = [] 81 | for part in range(21): 82 | map_ori = heatmap_avg[:, :, part] 83 | one_heatmap = gaussian_filter(map_ori, sigma=3) 84 | binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8) 85 | # 全部小于阈值 86 | if np.sum(binary) == 0: 87 | all_peaks.append([0, 0]) 88 | continue 89 | label_img, label_numbers = label( 90 | binary, return_num=True, connectivity=binary.ndim 91 | ) 92 | max_index = ( 93 | np.argmax( 94 | [ 95 | np.sum(map_ori[label_img == i]) 96 | for i in range(1, label_numbers + 1) 97 | ] 98 | ) 99 | + 1 100 | ) 101 | label_img[label_img != max_index] = 0 102 | map_ori[label_img == 0] = 0 103 | 104 | y, x = util.npmax(map_ori) 105 | all_peaks.append([x, y]) 106 | return np.array(all_peaks) 107 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/audio2pose.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from .cvae import CVAE 4 | from .discriminator import PoseSequenceDiscriminator 5 | from .audio_encoder import AudioEncoder 6 | 7 | class Audio2Pose(nn.Module): 8 | def __init__(self, cfg, wav2lip_checkpoint, device='cuda'): 9 | super().__init__() 10 | self.cfg = cfg 11 | self.seq_len = cfg.MODEL.CVAE.SEQ_LEN 12 | self.latent_dim = cfg.MODEL.CVAE.LATENT_SIZE 13 | self.device = device 14 | 15 | self.audio_encoder = AudioEncoder(wav2lip_checkpoint, device) 16 | self.audio_encoder.eval() 17 | for param in 
self.audio_encoder.parameters(): 18 | param.requires_grad = False 19 | 20 | self.netG = CVAE(cfg) 21 | self.netD_motion = PoseSequenceDiscriminator(cfg) 22 | 23 | 24 | def forward(self, x): 25 | 26 | batch = {} 27 | coeff_gt = x['gt'].cuda().squeeze(0) #bs frame_len+1 73 28 | batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - coeff_gt[:, :1, 64:70] #bs frame_len 6 29 | batch['ref'] = coeff_gt[:, 0, 64:70] #bs 6 30 | batch['class'] = x['class'].squeeze(0).cuda() # bs 31 | indiv_mels= x['indiv_mels'].cuda().squeeze(0) # bs seq_len+1 80 16 32 | 33 | # forward 34 | audio_emb_list = [] 35 | audio_emb = self.audio_encoder(indiv_mels[:, 1:, :, :].unsqueeze(2)) #bs seq_len 512 36 | batch['audio_emb'] = audio_emb 37 | batch = self.netG(batch) 38 | 39 | pose_motion_pred = batch['pose_motion_pred'] # bs frame_len 6 40 | pose_gt = coeff_gt[:, 1:, 64:70].clone() # bs frame_len 6 41 | pose_pred = coeff_gt[:, :1, 64:70] + pose_motion_pred # bs frame_len 6 42 | 43 | batch['pose_pred'] = pose_pred 44 | batch['pose_gt'] = pose_gt 45 | 46 | return batch 47 | 48 | def test(self, x): 49 | 50 | batch = {} 51 | ref = x['ref'] #bs 1 70 52 | batch['ref'] = x['ref'][:,0,-6:] 53 | batch['class'] = x['class'] 54 | bs = ref.shape[0] 55 | 56 | indiv_mels= x['indiv_mels'] # bs T 1 80 16 57 | indiv_mels_use = indiv_mels[:, 1:] # we regard the ref as the first frame 58 | num_frames = x['num_frames'] 59 | num_frames = int(num_frames) - 1 60 | 61 | # 62 | div = num_frames//self.seq_len 63 | re = num_frames%self.seq_len 64 | audio_emb_list = [] 65 | pose_motion_pred_list = [torch.zeros(batch['ref'].unsqueeze(1).shape, dtype=batch['ref'].dtype, 66 | device=batch['ref'].device)] 67 | 68 | for i in range(div): 69 | z = torch.randn(bs, self.latent_dim).to(ref.device) 70 | batch['z'] = z 71 | audio_emb = self.audio_encoder(indiv_mels_use[:, i*self.seq_len:(i+1)*self.seq_len,:,:,:]) #bs seq_len 512 72 | batch['audio_emb'] = audio_emb 73 | batch = self.netG.test(batch) 74 | pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6 75 | 76 | if re != 0: 77 | z = torch.randn(bs, self.latent_dim).to(ref.device) 78 | batch['z'] = z 79 | audio_emb = self.audio_encoder(indiv_mels_use[:, -1*self.seq_len:,:,:,:]) #bs seq_len 512 80 | if audio_emb.shape[1] != self.seq_len: 81 | pad_dim = self.seq_len-audio_emb.shape[1] 82 | pad_audio_emb = audio_emb[:, :1].repeat(1, pad_dim, 1) 83 | audio_emb = torch.cat([pad_audio_emb, audio_emb], 1) 84 | batch['audio_emb'] = audio_emb 85 | batch = self.netG.test(batch) 86 | pose_motion_pred_list.append(batch['pose_motion_pred'][:,-1*re:,:]) 87 | 88 | pose_motion_pred = torch.cat(pose_motion_pred_list, dim = 1) 89 | batch['pose_motion_pred'] = pose_motion_pred 90 | 91 | pose_pred = ref[:, :1, -6:] + pose_motion_pred # bs T 6 92 | 93 | batch['pose_pred'] = pose_pred 94 | return batch 95 | -------------------------------------------------------------------------------- /modules/mplug/get_video_caption.py: -------------------------------------------------------------------------------- 1 | import ruamel.yaml as yaml 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from .models.model_caption_mplug_vatex import MPLUG 6 | from .models.vit import interpolate_pos_embed, resize_pos_embed 7 | from .models.tokenization_bert import BertTokenizer 8 | from decord import VideoReader 9 | import decord 10 | import os 11 | 12 | 13 | config_path = os.path.join("model_zoo", "mplug", "videocap_vatex_mplug_large.yaml") 14 | mplug_pth_path = os.path.join("model_zoo", "mplug", 
"mplug_large.pth") 15 | 16 | config = yaml.load(open(config_path, "r"), Loader=yaml.Loader) 17 | 18 | 19 | def prepare_model(device): 20 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 21 | model = MPLUG(config=config, tokenizer=tokenizer) 22 | model = model.to(device) 23 | 24 | assert os.path.exists( 25 | mplug_pth_path 26 | ), "Please download mplug_large.pth checkpoint from https://alice-open.oss-cn-zhangjiakou.aliyuncs.com/mPLUG/mplug_large.pth and put it in ./model_zoo/mplug/" 27 | checkpoint = torch.load(mplug_pth_path, map_location=device) 28 | 29 | try: 30 | state_dict = checkpoint["model"] 31 | except: 32 | state_dict = checkpoint["module"] 33 | if config["clip_name"] == "ViT-B-16": 34 | num_patches = int(config["image_res"] * config["image_res"] / (16 * 16)) 35 | elif config["clip_name"] == "ViT-L-14": 36 | num_patches = int(config["image_res"] * config["image_res"] / (14 * 14)) 37 | 38 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 39 | pos_embed = resize_pos_embed( 40 | state_dict["visual_encoder.visual.positional_embedding"].unsqueeze(0), 41 | pos_embed.unsqueeze(0), 42 | ) 43 | state_dict["visual_encoder.visual.positional_embedding"] = pos_embed 44 | 45 | for key in list(state_dict.keys()): 46 | if ("fusion" in key or "bert" in key) and "decode" not in key: 47 | encoder_key = key.replace("fusion.", "").replace("bert.", "") 48 | state_dict[encoder_key] = state_dict[key] 49 | del state_dict[key] 50 | 51 | model.load_state_dict(state_dict, strict=False) 52 | model.eval() 53 | 54 | return model, tokenizer 55 | 56 | 57 | def pipeline(video_path, model, tokenizer, device): 58 | video = load_video_from_path_decord( 59 | video_path, config["image_res"], config["image_res"], config["num_frm_test"] 60 | ).to(device) 61 | if config["prompt"] != "": 62 | caption = [config["prompt"] + config["eos"]] * video.size(0) 63 | caption = tokenizer( 64 | caption, 65 | padding="longest", 66 | truncation=True, 67 | max_length=25, 68 | return_tensors="pt", 69 | ).to(device) 70 | else: 71 | caption = None 72 | 73 | topk_ids, topk_probs = model(video, caption, None, train=False) 74 | 75 | for topk_id, topk_prob in zip(topk_ids, topk_probs): 76 | ans = ( 77 | tokenizer.decode(topk_id[0]) 78 | .replace("[SEP]", "") 79 | .replace("[CLS]", "") 80 | .replace("[PAD]", "") 81 | .strip() 82 | ) 83 | ans += " ." 84 | return ans 85 | 86 | 87 | def load_video_from_path_decord( 88 | video_path, 89 | height=None, 90 | width=None, 91 | num_frame=12, 92 | start_time=None, 93 | end_time=None, 94 | fps=-1, 95 | ): 96 | decord.bridge.set_bridge("torch") 97 | 98 | if not height or not width: 99 | vr = VideoReader(video_path) 100 | else: 101 | vr = VideoReader(video_path, width=width, height=height) 102 | vlen = len(vr) 103 | if start_time or end_time: 104 | assert fps > 0, "must provide video fps if specifying start and end time." 
105 | start_idx = min(int(start_time * fps), vlen) 106 | end_idx = min(int(end_time * fps), vlen) 107 | else: 108 | start_idx, end_idx = 0, vlen 109 | 110 | frame_index = np.arange(start_idx, end_idx, vlen / num_frame, dtype=int) 111 | raw_sample_frms = vr.get_batch(frame_index) 112 | raw_sample_frms = raw_sample_frms.permute(0, 3, 1, 2).float().unsqueeze(0) 113 | 114 | return raw_sample_frms 115 | -------------------------------------------------------------------------------- /modules/mplug/models/visual_transformers.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import shutil 7 | import tarfile 8 | import tempfile 9 | import sys 10 | from io import open 11 | import torch.nn.functional as F 12 | 13 | import torch 14 | from torch import nn 15 | from torch.nn import CrossEntropyLoss, SmoothL1Loss 16 | import numpy as np 17 | from .clip import clip 18 | 19 | 20 | def resize_pos_embed(posemb, posemb_new): 21 | # Rescale the grid of position embeddings when loading from state_dict. Adapted from 22 | # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 23 | ntok_new = posemb_new.shape[1] 24 | if True: 25 | posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:] 26 | ntok_new -= 1 27 | else: 28 | posemb_tok, posemb_grid = posemb[:, :0], posemb[0] 29 | gs_old = int(math.sqrt(len(posemb_grid))) 30 | gs_new = int(math.sqrt(ntok_new)) 31 | # _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) 32 | posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) 33 | orig = posemb_grid.dtype 34 | posemb_grid = F.interpolate( 35 | posemb_grid.float(), size=(gs_new, gs_new), mode="bilinear" 36 | ) 37 | posemb_grid = posemb_grid.to(orig) 38 | posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) 39 | posemb = torch.cat([posemb_tok, posemb_grid], dim=1) 40 | return posemb 41 | 42 | 43 | def initialize_clip(config, num_patches=240): 44 | if config["clip_name"] == "ViT-B-16": 45 | clip_model, preprocess = clip.load("ViT-B-16.tar", jit=False) 46 | num_patches = int(config["image_res"] * config["image_res"] / (16 * 16)) 47 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 48 | elif config["clip_name"] == "ViT-L-14": 49 | clip_model, preprocess = clip.load( 50 | os.path.join("model_zoo", "mplug", "ViT-L-14.tar"), 51 | jit=False, 52 | ) 53 | num_patches = int(config["image_res"] * config["image_res"] / (14 * 14)) 54 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 1024).float()) 55 | pos_embed.weight = resize_pos_embed( 56 | clip_model.visual.positional_embedding.unsqueeze(0), pos_embed.unsqueeze(0) 57 | ) 58 | clip_model.visual.positional_embedding = pos_embed 59 | return clip_model, preprocess 60 | 61 | 62 | # def initialize_vit(VISUAL_CONFIG, model_type="ViT-B_32", pretrained_dir="data/ViT-B_32.npz", img_size=(384, 640), 63 | # num_patches=240): 64 | # from vit.models.modeling import VisionTransformer, CONFIGS 65 | # config = CONFIGS[model_type] 66 | # model = VisionTransformer(config, img_size=224, zero_head=True, num_classes=1) 67 | # model.load_from(np.load(pretrained_dir)) 68 | 69 | # pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 70 | # pos_embed.weight = resize_pos_embed(model.transformer.embeddings.position_embeddings, pos_embed.unsqueeze(0)) 71 | # model.transformer.embeddings.position_embeddings = 
pos_embed 72 | # if VISUAL_CONFIG.freeze_clip: 73 | # for parameter in model.parameters(): 74 | # parameter.requires_grad = False 75 | # return model 76 | 77 | 78 | def initialize_optimizer(visual_model, lr, momentum, weight_decay): 79 | optimizer = torch.optim.SGD( 80 | visual_model.parameters(), lr, momentum=momentum, weight_decay=weight_decay 81 | ) 82 | return optimizer 83 | 84 | 85 | def adjust_learning_rate(optimizer, epoch, args): 86 | """Decay the learning rate based on schedule""" 87 | lr = args.sgd_lr 88 | 89 | for milestone in args.schedule.split(","): 90 | lr *= 0.1 if epoch >= float(milestone) else 1.0 91 | for param_group in optimizer.param_groups: 92 | param_group["lr"] = lr 93 | 94 | 95 | from torch.optim import Optimizer 96 | 97 | 98 | class FusedOptimizer(Optimizer): 99 | def __init__(self, optimizers): 100 | self.optimizers = optimizers 101 | param_groups = [] 102 | for optimizer in self.optimizers: 103 | param_groups += optimizer.param_groups 104 | # super(FusedOptimizer, self).__init__([], {}) 105 | self.param_groups = param_groups 106 | 107 | def step(self): 108 | for optimizer in self.optimizers: 109 | optimizer.step() 110 | -------------------------------------------------------------------------------- /modules/text2video_zero/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import torch 5 | import torchvision 6 | from torchvision.transforms import Resize, InterpolationMode 7 | import imageio 8 | from einops import rearrange 9 | from PIL import Image 10 | import decord 11 | 12 | 13 | def create_gif(frames, fps, rescale=False, path=None): 14 | if path is None: 15 | dir = "temporal" 16 | os.makedirs(dir, exist_ok=True) 17 | path = os.path.join(dir, "canny_db.gif") 18 | 19 | outputs = [] 20 | for i, x in enumerate(frames): 21 | x = torchvision.utils.make_grid(torch.Tensor(x), nrow=4) 22 | if rescale: 23 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 24 | x = (x * 255).numpy().astype(np.uint8) 25 | outputs.append(x) 26 | # imageio.imsave(os.path.join(dir, os.path.splitext(name)[0] + f'_{i}.jpg'), x) 27 | 28 | imageio.mimsave(path, outputs, fps=fps) 29 | return path 30 | 31 | 32 | def post_process_gif(list_of_results, image_resolution): 33 | output_file = "/tmp/ddxk.gif" 34 | imageio.mimsave(output_file, list_of_results, fps=4) 35 | return output_file 36 | 37 | 38 | def HWC3(x): 39 | assert x.dtype == np.uint8 40 | if x.ndim == 2: 41 | x = x[:, :, None] 42 | assert x.ndim == 3 43 | H, W, C = x.shape 44 | assert C == 1 or C == 3 or C == 4 45 | if C == 3: 46 | return x 47 | if C == 1: 48 | return np.concatenate([x, x, x], axis=2) 49 | if C == 4: 50 | color = x[:, :, 0:3].astype(np.float32) 51 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 52 | y = color * alpha + 255.0 * (1.0 - alpha) 53 | y = y.clip(0, 255).astype(np.uint8) 54 | return y 55 | 56 | 57 | def pre_process(input_video): 58 | control_imgs = [] 59 | for frame in input_video: 60 | img = rearrange(frame, "c h w -> h w c").cpu().numpy().astype(np.uint8) 61 | img = HWC3(img) 62 | H, W, C = img.shape 63 | img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST) 64 | control_imgs.append(img[None]) 65 | control_imgs = np.concatenate(control_imgs) 66 | control = torch.from_numpy(control_imgs.copy()).float() / 255.0 67 | return rearrange(control, "f h w c -> f c h w") 68 | 69 | 70 | class CrossFrameAttnProcessor: 71 | def __init__(self, unet_chunk_size=2): 72 | self.unet_chunk_size = unet_chunk_size 73 | 74 | def __call__( 75 
| self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None 76 | ): 77 | batch_size, sequence_length, _ = hidden_states.shape 78 | attention_mask = attn.prepare_attention_mask( 79 | attention_mask, sequence_length, batch_size 80 | ) 81 | query = attn.to_q(hidden_states) 82 | 83 | is_cross_attention = encoder_hidden_states is not None 84 | if encoder_hidden_states is None: 85 | encoder_hidden_states = hidden_states 86 | elif attn.cross_attention_norm: 87 | encoder_hidden_states = attn.norm_cross(encoder_hidden_states) 88 | key = attn.to_k(encoder_hidden_states) 89 | value = attn.to_v(encoder_hidden_states) 90 | # Sparse Attention 91 | if not is_cross_attention: 92 | video_length = key.size()[0] // self.unet_chunk_size 93 | # former_frame_index = torch.arange(video_length) - 1 94 | # former_frame_index[0] = 0 95 | former_frame_index = [0] * video_length 96 | key = rearrange(key, "(b f) d c -> b f d c", f=video_length) 97 | key = key[:, former_frame_index] 98 | key = rearrange(key, "b f d c -> (b f) d c") 99 | value = rearrange(value, "(b f) d c -> b f d c", f=video_length) 100 | value = value[:, former_frame_index] 101 | value = rearrange(value, "b f d c -> (b f) d c") 102 | 103 | query = attn.head_to_batch_dim(query) 104 | key = attn.head_to_batch_dim(key) 105 | value = attn.head_to_batch_dim(value) 106 | 107 | attention_probs = attn.get_attention_scores(query, key, attention_mask) 108 | hidden_states = torch.bmm(attention_probs, value) 109 | hidden_states = attn.batch_to_head_dim(hidden_states) 110 | 111 | # linear proj 112 | hidden_states = attn.to_out[0](hidden_states) 113 | # dropout 114 | hidden_states = attn.to_out[1](hidden_states) 115 | 116 | return hidden_states 117 | -------------------------------------------------------------------------------- /modules/sadtalker/inference.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import shutil 3 | import torch 4 | from time import strftime 5 | import os, sys, time 6 | from argparse import ArgumentParser 7 | 8 | import uuid 9 | 10 | from .src.utils.preprocess import CropAndExtract 11 | from .src.test_audio2coeff import Audio2Coeff 12 | from .src.facerender.animate import AnimateFromCoeff 13 | from .src.generate_batch import get_data 14 | from .src.generate_facerender_batch import get_facerender_data 15 | from .src.utils.init_path import init_path 16 | 17 | def main(args): 18 | #torch.backends.cudnn.enabled = False 19 | 20 | pic_path = args.source_image 21 | audio_path = args.driven_audio 22 | save_dir = os.path.join(args.result_dir) 23 | os.makedirs(save_dir, exist_ok=True) 24 | pose_style = args.pose_style 25 | device = args.device 26 | batch_size = args.batch_size 27 | input_yaw_list = args.input_yaw 28 | input_pitch_list = args.input_pitch 29 | input_roll_list = args.input_roll 30 | ref_eyeblink = args.ref_eyeblink 31 | ref_pose = args.ref_pose 32 | 33 | current_root_path = r"./modules/sadtalker" # os.path.split(sys.argv[0])[0] 34 | print("current_root_path is:", current_root_path) 35 | 36 | sadtalker_paths = init_path( 37 | args.checkpoint_dir, 38 | os.path.join(current_root_path, 'src/config'), 39 | args.size, 40 | args.old_version, 41 | args.preprocess) 42 | 43 | #init model 44 | print("init preprocess_model") 45 | preprocess_model = CropAndExtract(sadtalker_paths, device) 46 | 47 | print("audio_to_coeff") 48 | audio_to_coeff = Audio2Coeff(sadtalker_paths, device) 49 | 50 | print("animate_from_coeff") 51 | animate_from_coeff = 
AnimateFromCoeff(sadtalker_paths, device) 52 | 53 | #crop image and extract 3dmm from image 54 | first_frame_dir = os.path.join(save_dir, 'first_frame_dir') 55 | os.makedirs(first_frame_dir, exist_ok=True) 56 | print('3DMM Extraction for source image') 57 | first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\ 58 | source_image_flag=True, pic_size=args.size) 59 | if first_coeff_path is None: 60 | print("Can't get the coeffs of the input") 61 | return 62 | 63 | if ref_eyeblink is not None: 64 | ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0] 65 | ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname) 66 | os.makedirs(ref_eyeblink_frame_dir, exist_ok=True) 67 | print('3DMM Extraction for the reference video providing eye blinking') 68 | ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False) 69 | else: 70 | ref_eyeblink_coeff_path=None 71 | 72 | if ref_pose is not None: 73 | if ref_pose == ref_eyeblink: 74 | ref_pose_coeff_path = ref_eyeblink_coeff_path 75 | else: 76 | ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0] 77 | ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname) 78 | os.makedirs(ref_pose_frame_dir, exist_ok=True) 79 | print('3DMM Extraction for the reference video providing pose') 80 | ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False) 81 | else: 82 | ref_pose_coeff_path=None 83 | 84 | #audio2ceoff 85 | batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still) 86 | coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path) 87 | 88 | #coeff2video 89 | data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, 90 | batch_size, input_yaw_list, input_pitch_list, input_roll_list, 91 | expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size) 92 | 93 | result = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \ 94 | enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size) 95 | 96 | audio_name = str(uuid.uuid4())[:8] 97 | shutil.move(result, './video/'+ audio_name +'.mp4') 98 | print('The generated video is named:', audio_name + '.mp4') 99 | 100 | if not args.verbose: 101 | shutil.rmtree("./video/sadtalker") 102 | 103 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/nn/checkpoint.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, Sequence, Union 2 | 3 | import torch 4 | from torch.cuda.amp import custom_bwd, custom_fwd 5 | 6 | 7 | def checkpoint( 8 | func: Callable[..., Union[torch.Tensor, Sequence[torch.Tensor]]], 9 | inputs: Sequence[torch.Tensor], 10 | params: Iterable[torch.Tensor], 11 | flag: bool, 12 | ): 13 | """ 14 | Evaluate a function without caching intermediate activations, allowing for 15 | reduced memory at the expense of extra compute in the backward pass. 16 | :param func: the function to evaluate. 17 | :param inputs: the argument sequence to pass to `func`. 18 | :param params: a sequence of parameters `func` depends on but does not 19 | explicitly take as arguments. 20 | :param flag: if False, disable gradient checkpointing. 
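    A minimal usage sketch (`block` and `x` are placeholder names, not part of
    this module):

        out = checkpoint(block, (x,), block.parameters(), flag=True)

    With flag=True, activations inside `block` are recomputed during the
    backward pass instead of being cached; with flag=False the call simply
    returns `block(x)`.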
21 | """ 22 | if flag: 23 | args = tuple(inputs) + tuple(params) 24 | return CheckpointFunction.apply(func, len(inputs), *args) 25 | else: 26 | return func(*inputs) 27 | 28 | 29 | class CheckpointFunction(torch.autograd.Function): 30 | @staticmethod 31 | @custom_fwd 32 | def forward(ctx, run_function, length, *args): 33 | ctx.run_function = run_function 34 | ctx.length = length 35 | input_tensors = list(args[:length]) 36 | input_params = list(args[length:]) 37 | ctx.save_for_backward(*input_tensors, *input_params) 38 | with torch.no_grad(): 39 | output_tensors = ctx.run_function(*input_tensors) 40 | return output_tensors 41 | 42 | @staticmethod 43 | @custom_bwd 44 | def backward(ctx, *output_grads): 45 | inputs = ctx.saved_tensors 46 | input_tensors = inputs[: ctx.length] 47 | input_params = inputs[ctx.length :] 48 | res = CheckpointFunctionGradFunction.apply( 49 | ctx.run_function, 50 | len(input_tensors), 51 | len(input_params), 52 | *input_tensors, 53 | *input_params, 54 | *output_grads 55 | ) 56 | return (None, None) + res 57 | 58 | 59 | class CheckpointFunctionGradFunction(torch.autograd.Function): 60 | @staticmethod 61 | @custom_fwd 62 | def forward(ctx, run_function, length_1, length_2, *args): 63 | ctx.run_function = run_function 64 | ctx.length_1 = length_1 65 | ctx.length_2 = length_2 66 | input_tensors = [x.detach().requires_grad_(True) for x in args[:length_1]] 67 | input_params = list(args[length_1 : length_1 + length_2]) 68 | output_grads = list(args[length_1 + length_2 :]) 69 | ctx.save_for_backward(*input_tensors, *input_params, *output_grads) 70 | 71 | with torch.enable_grad(): 72 | # Fixes a bug where the first op in run_function modifies the 73 | # Tensor storage in place, which is not allowed for detach()'d 74 | # Tensors. 75 | shallow_copies = [x.view_as(x) for x in input_tensors] 76 | output_tensors = ctx.run_function(*shallow_copies) 77 | input_grads = torch.autograd.grad( 78 | output_tensors, 79 | input_tensors + input_params, 80 | output_grads, 81 | allow_unused=True, 82 | ) 83 | return input_grads 84 | 85 | @staticmethod 86 | @custom_bwd 87 | def backward(ctx, *all_output_grads): 88 | args = ctx.saved_tensors 89 | input_tensors = [x.detach().requires_grad_(True) for x in args[: ctx.length_1]] 90 | input_params = list(args[ctx.length_1 : ctx.length_1 + ctx.length_2]) 91 | output_grads = [ 92 | x.detach().requires_grad_(True) for x in args[ctx.length_1 + ctx.length_2 :] 93 | ] 94 | 95 | with torch.enable_grad(): 96 | # Fixes a bug where the first op in run_function modifies the 97 | # Tensor storage in place, which is not allowed for detach()'d 98 | # Tensors. 
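            # This backward pass enables double backprop through the checkpointed
            # function: the forward is replayed under enable_grad, the first-order
            # input gradients are re-derived with create_graph=True, and those
            # gradients are then differentiated w.r.t. the inputs, params and the
            # saved output_grads to produce the second-order terms returned below.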
99 | shallow_copies = [x.view_as(x) for x in input_tensors] 100 | output_tensors = ctx.run_function(*shallow_copies) 101 | input_grads = torch.autograd.grad( 102 | output_tensors, 103 | input_tensors + input_params, 104 | output_grads, 105 | allow_unused=True, 106 | create_graph=True, 107 | retain_graph=True, 108 | ) 109 | input_grads_grads = torch.autograd.grad( 110 | input_grads, 111 | input_tensors + input_params + output_grads, 112 | all_output_grads, 113 | allow_unused=True, 114 | ) 115 | del input_grads 116 | return (None, None, None) + input_grads_grads 117 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/util/load_mats.py: -------------------------------------------------------------------------------- 1 | """This script is to load 3D face model for Deep3DFaceRecon_pytorch 2 | """ 3 | 4 | import numpy as np 5 | from PIL import Image 6 | from scipy.io import loadmat, savemat 7 | from array import array 8 | import os.path as osp 9 | 10 | 11 | # load expression basis 12 | def LoadExpBasis(bfm_folder='BFM'): 13 | n_vertex = 53215 14 | Expbin = open(osp.join(bfm_folder, 'Exp_Pca.bin'), 'rb') 15 | exp_dim = array('i') 16 | exp_dim.fromfile(Expbin, 1) 17 | expMU = array('f') 18 | expPC = array('f') 19 | expMU.fromfile(Expbin, 3*n_vertex) 20 | expPC.fromfile(Expbin, 3*exp_dim[0]*n_vertex) 21 | Expbin.close() 22 | 23 | expPC = np.array(expPC) 24 | expPC = np.reshape(expPC, [exp_dim[0], -1]) 25 | expPC = np.transpose(expPC) 26 | 27 | expEV = np.loadtxt(osp.join(bfm_folder, 'std_exp.txt')) 28 | 29 | return expPC, expEV 30 | 31 | 32 | # transfer original BFM09 to our face model 33 | def transferBFM09(bfm_folder='BFM'): 34 | print('Transfer BFM09 to BFM_model_front......') 35 | original_BFM = loadmat(osp.join(bfm_folder, '01_MorphableModel.mat')) 36 | shapePC = original_BFM['shapePC'] # shape basis 37 | shapeEV = original_BFM['shapeEV'] # corresponding eigen value 38 | shapeMU = original_BFM['shapeMU'] # mean face 39 | texPC = original_BFM['texPC'] # texture basis 40 | texEV = original_BFM['texEV'] # eigen value 41 | texMU = original_BFM['texMU'] # mean texture 42 | 43 | expPC, expEV = LoadExpBasis(bfm_folder) 44 | 45 | # transfer BFM09 to our face model 46 | 47 | idBase = shapePC*np.reshape(shapeEV, [-1, 199]) 48 | idBase = idBase/1e5 # unify the scale to decimeter 49 | idBase = idBase[:, :80] # use only first 80 basis 50 | 51 | exBase = expPC*np.reshape(expEV, [-1, 79]) 52 | exBase = exBase/1e5 # unify the scale to decimeter 53 | exBase = exBase[:, :64] # use only first 64 basis 54 | 55 | texBase = texPC*np.reshape(texEV, [-1, 199]) 56 | texBase = texBase[:, :80] # use only first 80 basis 57 | 58 | # our face model is cropped along face landmarks and contains only 35709 vertex. 59 | # original BFM09 contains 53490 vertex, and expression basis provided by Guo et al. contains 53215 vertex. 60 | # thus we select corresponding vertex to get our face model. 
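    # Concretely: BFM_front_idx.mat indexes the 35709 cropped-face vertices within
    # the 53215-vertex expression-basis topology, and BFM_exp_idx.mat maps that
    # topology into the original 53490-vertex BFM09 mesh; composing them as
    # index_shape[index_exp] lets the shape/texture bases and the expression basis
    # all be gathered onto the same front-face vertex set below.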
61 | 62 | index_exp = loadmat(osp.join(bfm_folder, 'BFM_front_idx.mat')) 63 | index_exp = index_exp['idx'].astype(np.int32) - 1 # starts from 0 (to 53215) 64 | 65 | index_shape = loadmat(osp.join(bfm_folder, 'BFM_exp_idx.mat')) 66 | index_shape = index_shape['trimIndex'].astype( 67 | np.int32) - 1 # starts from 0 (to 53490) 68 | index_shape = index_shape[index_exp] 69 | 70 | idBase = np.reshape(idBase, [-1, 3, 80]) 71 | idBase = idBase[index_shape, :, :] 72 | idBase = np.reshape(idBase, [-1, 80]) 73 | 74 | texBase = np.reshape(texBase, [-1, 3, 80]) 75 | texBase = texBase[index_shape, :, :] 76 | texBase = np.reshape(texBase, [-1, 80]) 77 | 78 | exBase = np.reshape(exBase, [-1, 3, 64]) 79 | exBase = exBase[index_exp, :, :] 80 | exBase = np.reshape(exBase, [-1, 64]) 81 | 82 | meanshape = np.reshape(shapeMU, [-1, 3])/1e5 83 | meanshape = meanshape[index_shape, :] 84 | meanshape = np.reshape(meanshape, [1, -1]) 85 | 86 | meantex = np.reshape(texMU, [-1, 3]) 87 | meantex = meantex[index_shape, :] 88 | meantex = np.reshape(meantex, [1, -1]) 89 | 90 | other_info = loadmat(osp.join(bfm_folder, 'facemodel_info.mat')) 91 | frontmask2_idx = other_info['frontmask2_idx'] 92 | skinmask = other_info['skinmask'] 93 | keypoints = other_info['keypoints'] 94 | point_buf = other_info['point_buf'] 95 | tri = other_info['tri'] 96 | tri_mask2 = other_info['tri_mask2'] 97 | 98 | # save our face model 99 | savemat(osp.join(bfm_folder, 'BFM_model_front.mat'), {'meanshape': meanshape, 'meantex': meantex, 'idBase': idBase, 'exBase': exBase, 'texBase': texBase, 100 | 'tri': tri, 'point_buf': point_buf, 'tri_mask2': tri_mask2, 'keypoints': keypoints, 'frontmask2_idx': frontmask2_idx, 'skinmask': skinmask}) 101 | 102 | 103 | # load landmarks for standard face, which is used for image preprocessing 104 | def load_lm3d(bfm_folder): 105 | 106 | Lm3D = loadmat(osp.join(bfm_folder, 'similarity_Lm3D_all.mat')) 107 | Lm3D = Lm3D['lm'] 108 | 109 | # calculate 5 facial landmarks using 68 landmarks 110 | lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 111 | Lm3D = np.stack([Lm3D[lm_idx[0], :], np.mean(Lm3D[lm_idx[[1, 2]], :], 0), np.mean( 112 | Lm3D[lm_idx[[3, 4]], :], 0), Lm3D[lm_idx[5], :], Lm3D[lm_idx[6], :]], axis=0) 113 | Lm3D = Lm3D[[1, 2, 0, 3, 4], :] 114 | 115 | return Lm3D 116 | 117 | 118 | if __name__ == '__main__': 119 | transferBFM09() -------------------------------------------------------------------------------- /modules/shap_e/shap_e/models/transmitter/bottleneck.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Optional 3 | 4 | import numpy as np 5 | import torch.nn as nn 6 | from torch import torch 7 | 8 | from ...diffusion.gaussian_diffusion import diffusion_from_config 9 | from ...util.collections import AttrDict 10 | 11 | 12 | class LatentBottleneck(nn.Module, ABC): 13 | def __init__(self, *, device: torch.device, d_latent: int): 14 | super().__init__() 15 | self.device = device 16 | self.d_latent = d_latent 17 | 18 | @abstractmethod 19 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 20 | pass 21 | 22 | 23 | class LatentWarp(nn.Module, ABC): 24 | def __init__(self, *, device: torch.device): 25 | super().__init__() 26 | self.device = device 27 | 28 | @abstractmethod 29 | def warp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 30 | pass 31 | 32 | @abstractmethod 33 | def unwarp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> 
AttrDict: 34 | pass 35 | 36 | 37 | class IdentityLatentWarp(LatentWarp): 38 | def warp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 39 | _ = options 40 | return x 41 | 42 | def unwarp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 43 | _ = options 44 | return x 45 | 46 | 47 | class Tan2LatentWarp(LatentWarp): 48 | def __init__(self, *, coeff1: float = 1.0, device: torch.device): 49 | super().__init__(device=device) 50 | self.coeff1 = coeff1 51 | self.scale = np.tan(np.tan(1.0) * coeff1) 52 | 53 | def warp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 54 | _ = options 55 | return ((x.float().tan() * self.coeff1).tan() / self.scale).to(x.dtype) 56 | 57 | def unwarp(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 58 | _ = options 59 | return ((x.float() * self.scale).arctan() / self.coeff1).arctan().to(x.dtype) 60 | 61 | 62 | class IdentityLatentBottleneck(LatentBottleneck): 63 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 64 | _ = options 65 | return x 66 | 67 | 68 | class ClampNoiseBottleneck(LatentBottleneck): 69 | def __init__(self, *, device: torch.device, d_latent: int, noise_scale: float): 70 | super().__init__(device=device, d_latent=d_latent) 71 | self.noise_scale = noise_scale 72 | 73 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 74 | _ = options 75 | x = x.tanh() 76 | if not self.training: 77 | return x 78 | return x + torch.randn_like(x) * self.noise_scale 79 | 80 | 81 | class ClampDiffusionNoiseBottleneck(LatentBottleneck): 82 | def __init__( 83 | self, 84 | *, 85 | device: torch.device, 86 | d_latent: int, 87 | diffusion: Dict[str, Any], 88 | diffusion_prob: float = 1.0, 89 | ): 90 | super().__init__(device=device, d_latent=d_latent) 91 | self.diffusion = diffusion_from_config(diffusion) 92 | self.diffusion_prob = diffusion_prob 93 | 94 | def forward(self, x: torch.Tensor, options: Optional[AttrDict] = None) -> AttrDict: 95 | _ = options 96 | x = x.tanh() 97 | if not self.training: 98 | return x 99 | t = torch.randint(low=0, high=self.diffusion.num_timesteps, size=(len(x),), device=x.device) 100 | t = torch.where( 101 | torch.rand(len(x), device=x.device) < self.diffusion_prob, t, torch.zeros_like(t) 102 | ) 103 | return self.diffusion.q_sample(x, t) 104 | 105 | 106 | def latent_bottleneck_from_config(config: Dict[str, Any], device: torch.device, d_latent: int): 107 | name = config.pop("name") 108 | if name == "clamp_noise": 109 | return ClampNoiseBottleneck(**config, device=device, d_latent=d_latent) 110 | elif name == "identity": 111 | return IdentityLatentBottleneck(**config, device=device, d_latent=d_latent) 112 | elif name == "clamp_diffusion_noise": 113 | return ClampDiffusionNoiseBottleneck(**config, device=device, d_latent=d_latent) 114 | else: 115 | raise ValueError(f"unknown latent bottleneck: {name}") 116 | 117 | 118 | def latent_warp_from_config(config: Dict[str, Any], device: torch.device): 119 | name = config.pop("name") 120 | if name == "identity": 121 | return IdentityLatentWarp(**config, device=device) 122 | elif name == "tan2": 123 | return Tan2LatentWarp(**config, device=device) 124 | else: 125 | raise ValueError(f"unknown latent warping function: {name}") 126 | -------------------------------------------------------------------------------- /modules/sadtalker/src/audio2pose_models/networks.py: -------------------------------------------------------------------------------- 1 | 
import torch.nn as nn 2 | import torch 3 | 4 | 5 | class ResidualConv(nn.Module): 6 | def __init__(self, input_dim, output_dim, stride, padding): 7 | super(ResidualConv, self).__init__() 8 | 9 | self.conv_block = nn.Sequential( 10 | nn.BatchNorm2d(input_dim), 11 | nn.ReLU(), 12 | nn.Conv2d( 13 | input_dim, output_dim, kernel_size=3, stride=stride, padding=padding 14 | ), 15 | nn.BatchNorm2d(output_dim), 16 | nn.ReLU(), 17 | nn.Conv2d(output_dim, output_dim, kernel_size=3, padding=1), 18 | ) 19 | self.conv_skip = nn.Sequential( 20 | nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=stride, padding=1), 21 | nn.BatchNorm2d(output_dim), 22 | ) 23 | 24 | def forward(self, x): 25 | 26 | return self.conv_block(x) + self.conv_skip(x) 27 | 28 | 29 | class Upsample(nn.Module): 30 | def __init__(self, input_dim, output_dim, kernel, stride): 31 | super(Upsample, self).__init__() 32 | 33 | self.upsample = nn.ConvTranspose2d( 34 | input_dim, output_dim, kernel_size=kernel, stride=stride 35 | ) 36 | 37 | def forward(self, x): 38 | return self.upsample(x) 39 | 40 | 41 | class Squeeze_Excite_Block(nn.Module): 42 | def __init__(self, channel, reduction=16): 43 | super(Squeeze_Excite_Block, self).__init__() 44 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 45 | self.fc = nn.Sequential( 46 | nn.Linear(channel, channel // reduction, bias=False), 47 | nn.ReLU(inplace=True), 48 | nn.Linear(channel // reduction, channel, bias=False), 49 | nn.Sigmoid(), 50 | ) 51 | 52 | def forward(self, x): 53 | b, c, _, _ = x.size() 54 | y = self.avg_pool(x).view(b, c) 55 | y = self.fc(y).view(b, c, 1, 1) 56 | return x * y.expand_as(x) 57 | 58 | 59 | class ASPP(nn.Module): 60 | def __init__(self, in_dims, out_dims, rate=[6, 12, 18]): 61 | super(ASPP, self).__init__() 62 | 63 | self.aspp_block1 = nn.Sequential( 64 | nn.Conv2d( 65 | in_dims, out_dims, 3, stride=1, padding=rate[0], dilation=rate[0] 66 | ), 67 | nn.ReLU(inplace=True), 68 | nn.BatchNorm2d(out_dims), 69 | ) 70 | self.aspp_block2 = nn.Sequential( 71 | nn.Conv2d( 72 | in_dims, out_dims, 3, stride=1, padding=rate[1], dilation=rate[1] 73 | ), 74 | nn.ReLU(inplace=True), 75 | nn.BatchNorm2d(out_dims), 76 | ) 77 | self.aspp_block3 = nn.Sequential( 78 | nn.Conv2d( 79 | in_dims, out_dims, 3, stride=1, padding=rate[2], dilation=rate[2] 80 | ), 81 | nn.ReLU(inplace=True), 82 | nn.BatchNorm2d(out_dims), 83 | ) 84 | 85 | self.output = nn.Conv2d(len(rate) * out_dims, out_dims, 1) 86 | self._init_weights() 87 | 88 | def forward(self, x): 89 | x1 = self.aspp_block1(x) 90 | x2 = self.aspp_block2(x) 91 | x3 = self.aspp_block3(x) 92 | out = torch.cat([x1, x2, x3], dim=1) 93 | return self.output(out) 94 | 95 | def _init_weights(self): 96 | for m in self.modules(): 97 | if isinstance(m, nn.Conv2d): 98 | nn.init.kaiming_normal_(m.weight) 99 | elif isinstance(m, nn.BatchNorm2d): 100 | m.weight.data.fill_(1) 101 | m.bias.data.zero_() 102 | 103 | 104 | class Upsample_(nn.Module): 105 | def __init__(self, scale=2): 106 | super(Upsample_, self).__init__() 107 | 108 | self.upsample = nn.Upsample(mode="bilinear", scale_factor=scale) 109 | 110 | def forward(self, x): 111 | return self.upsample(x) 112 | 113 | 114 | class AttentionBlock(nn.Module): 115 | def __init__(self, input_encoder, input_decoder, output_dim): 116 | super(AttentionBlock, self).__init__() 117 | 118 | self.conv_encoder = nn.Sequential( 119 | nn.BatchNorm2d(input_encoder), 120 | nn.ReLU(), 121 | nn.Conv2d(input_encoder, output_dim, 3, padding=1), 122 | nn.MaxPool2d(2, 2), 123 | ) 124 | 125 | self.conv_decoder = nn.Sequential( 126 
| nn.BatchNorm2d(input_decoder), 127 | nn.ReLU(), 128 | nn.Conv2d(input_decoder, output_dim, 3, padding=1), 129 | ) 130 | 131 | self.conv_attn = nn.Sequential( 132 | nn.BatchNorm2d(output_dim), 133 | nn.ReLU(), 134 | nn.Conv2d(output_dim, 1, 1), 135 | ) 136 | 137 | def forward(self, x1, x2): 138 | out = self.conv_encoder(x1) + self.conv_decoder(x2) 139 | out = self.conv_attn(out) 140 | return out * x2 -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/face_enhancer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from gfpgan import GFPGANer 5 | 6 | from tqdm import tqdm 7 | 8 | from .videoio import load_video_to_cv2 9 | 10 | import cv2 11 | 12 | 13 | class GeneratorWithLen(object): 14 | """ From https://stackoverflow.com/a/7460929 """ 15 | 16 | def __init__(self, gen, length): 17 | self.gen = gen 18 | self.length = length 19 | 20 | def __len__(self): 21 | return self.length 22 | 23 | def __iter__(self): 24 | return self.gen 25 | 26 | def enhancer_list(images, method='gfpgan', bg_upsampler='realesrgan'): 27 | gen = enhancer_generator_no_len(images, method=method, bg_upsampler=bg_upsampler) 28 | return list(gen) 29 | 30 | def enhancer_generator_with_len(images, method='gfpgan', bg_upsampler='realesrgan'): 31 | """ Provide a generator with a __len__ method so that it can passed to functions that 32 | call len()""" 33 | 34 | if os.path.isfile(images): # handle video to images 35 | # TODO: Create a generator version of load_video_to_cv2 36 | images = load_video_to_cv2(images) 37 | 38 | gen = enhancer_generator_no_len(images, method=method, bg_upsampler=bg_upsampler) 39 | gen_with_len = GeneratorWithLen(gen, len(images)) 40 | return gen_with_len 41 | 42 | def enhancer_generator_no_len(images, method='gfpgan', bg_upsampler='realesrgan'): 43 | """ Provide a generator function so that all of the enhanced images don't need 44 | to be stored in memory at the same time. This can save tons of RAM compared to 45 | the enhancer function. """ 46 | 47 | print('face enhancer....') 48 | if not isinstance(images, list) and os.path.isfile(images): # handle video to images 49 | images = load_video_to_cv2(images) 50 | 51 | # ------------------------ set up GFPGAN restorer ------------------------ 52 | if method == 'gfpgan': 53 | arch = 'clean' 54 | channel_multiplier = 2 55 | model_name = 'GFPGANv1.4' 56 | url = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth' 57 | elif method == 'RestoreFormer': 58 | arch = 'RestoreFormer' 59 | channel_multiplier = 2 60 | model_name = 'RestoreFormer' 61 | url = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/RestoreFormer.pth' 62 | elif method == 'codeformer': # TODO: 63 | arch = 'CodeFormer' 64 | channel_multiplier = 2 65 | model_name = 'CodeFormer' 66 | url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth' 67 | else: 68 | raise ValueError(f'Wrong model version {method}.') 69 | 70 | 71 | # ------------------------ set up background upsampler ------------------------ 72 | if bg_upsampler == 'realesrgan': 73 | if not torch.cuda.is_available(): # CPU 74 | import warnings 75 | warnings.warn('The unoptimized RealESRGAN is slow on CPU. We do not use it. 
' 76 | 'If you really want to use it, please modify the corresponding codes.') 77 | bg_upsampler = None 78 | else: 79 | from basicsr.archs.rrdbnet_arch import RRDBNet 80 | from realesrgan import RealESRGANer 81 | model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2) 82 | bg_upsampler = RealESRGANer( 83 | scale=2, 84 | model_path='https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth', 85 | model=model, 86 | tile=400, 87 | tile_pad=10, 88 | pre_pad=0, 89 | half=True) # need to set False in CPU mode 90 | else: 91 | bg_upsampler = None 92 | 93 | # determine model paths 94 | model_path = os.path.join('gfpgan/weights', model_name + '.pth') 95 | 96 | if not os.path.isfile(model_path): 97 | model_path = os.path.join('checkpoints', model_name + '.pth') 98 | 99 | if not os.path.isfile(model_path): 100 | # download pre-trained models from url 101 | model_path = url 102 | 103 | restorer = GFPGANer( 104 | model_path=model_path, 105 | upscale=2, 106 | arch=arch, 107 | channel_multiplier=channel_multiplier, 108 | bg_upsampler=bg_upsampler) 109 | 110 | # ------------------------ restore ------------------------ 111 | for idx in tqdm(range(len(images)), 'Face Enhancer:'): 112 | 113 | img = cv2.cvtColor(images[idx], cv2.COLOR_RGB2BGR) 114 | 115 | # restore faces and background if necessary 116 | cropped_faces, restored_faces, r_img = restorer.enhance( 117 | img, 118 | has_aligned=False, 119 | only_center_face=False, 120 | paste_back=True) 121 | 122 | r_img = cv2.cvtColor(r_img, cv2.COLOR_BGR2RGB) 123 | yield r_img 124 | -------------------------------------------------------------------------------- /modules/sadtalker/src/generate_batch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tqdm import tqdm 4 | import torch 5 | import numpy as np 6 | import random 7 | import scipy.io as scio 8 | 9 | from .utils import audio as audio 10 | 11 | def crop_pad_audio(wav, audio_length): 12 | if len(wav) > audio_length: 13 | wav = wav[:audio_length] 14 | elif len(wav) < audio_length: 15 | wav = np.pad(wav, [0, audio_length - len(wav)], mode='constant', constant_values=0) 16 | return wav 17 | 18 | def parse_audio_length(audio_length, sr, fps): 19 | bit_per_frames = sr / fps 20 | 21 | num_frames = int(audio_length / bit_per_frames) 22 | audio_length = int(num_frames * bit_per_frames) 23 | 24 | return audio_length, num_frames 25 | 26 | def generate_blink_seq(num_frames): 27 | ratio = np.zeros((num_frames,1)) 28 | frame_id = 0 29 | while frame_id in range(num_frames): 30 | start = 80 31 | if frame_id+start+9<=num_frames - 1: 32 | ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5] 33 | frame_id = frame_id+start+9 34 | else: 35 | break 36 | return ratio 37 | 38 | def generate_blink_seq_randomly(num_frames): 39 | ratio = np.zeros((num_frames,1)) 40 | if num_frames<=20: 41 | return ratio 42 | frame_id = 0 43 | while frame_id in range(num_frames): 44 | start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70))) 45 | if frame_id+start+5<=num_frames - 1: 46 | ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5] 47 | frame_id = frame_id+start+5 48 | else: 49 | break 50 | return ratio 51 | 52 | def get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=False, idlemode=False, length_of_audio=False, use_blink=True): 53 | 54 | syncnet_mel_step_size = 16 55 | fps = 25 56 | 57 | pic_name = 
os.path.splitext(os.path.split(first_coeff_path)[-1])[0] 58 | audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0] 59 | 60 | 61 | if idlemode: 62 | num_frames = int(length_of_audio * 25) 63 | indiv_mels = np.zeros((num_frames, 80, 16)) 64 | else: 65 | wav = audio.load_wav(audio_path, 16000) 66 | wav_length, num_frames = parse_audio_length(len(wav), 16000, 25) 67 | wav = crop_pad_audio(wav, wav_length) 68 | orig_mel = audio.melspectrogram(wav).T 69 | spec = orig_mel.copy() # nframes 80 70 | indiv_mels = [] 71 | 72 | for i in tqdm(range(num_frames), 'mel:'): 73 | start_frame_num = i-2 74 | start_idx = int(80. * (start_frame_num / float(fps))) 75 | end_idx = start_idx + syncnet_mel_step_size 76 | seq = list(range(start_idx, end_idx)) 77 | seq = [ min(max(item, 0), orig_mel.shape[0]-1) for item in seq ] 78 | m = spec[seq, :] 79 | indiv_mels.append(m.T) 80 | indiv_mels = np.asarray(indiv_mels) # T 80 16 81 | 82 | ratio = generate_blink_seq_randomly(num_frames) # T 83 | source_semantics_path = first_coeff_path 84 | source_semantics_dict = scio.loadmat(source_semantics_path) 85 | ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70 86 | ref_coeff = np.repeat(ref_coeff, num_frames, axis=0) 87 | 88 | if ref_eyeblink_coeff_path is not None: 89 | ratio[:num_frames] = 0 90 | refeyeblink_coeff_dict = scio.loadmat(ref_eyeblink_coeff_path) 91 | refeyeblink_coeff = refeyeblink_coeff_dict['coeff_3dmm'][:,:64] 92 | refeyeblink_num_frames = refeyeblink_coeff.shape[0] 93 | if refeyeblink_num_frames h w c").cpu().numpy().astype(np.uint8) 23 | detected_map = cv2.Canny(img, low_threshold, high_threshold) 24 | detected_map = HWC3(detected_map) 25 | detected_maps.append(detected_map[None]) 26 | detected_maps = np.concatenate(detected_maps) 27 | control = torch.from_numpy(detected_maps.copy()).float() / 255.0 28 | return rearrange(control, "f h w c -> f c h w") 29 | 30 | def inference(self, inputs): 31 | vid_path = inputs 32 | video, fps = prepare_video(vid_path, resolution=512, device="cpu") 33 | vid_canny = self.pre_process_canny(video) 34 | canny_to_save = list( 35 | rearrange(vid_canny, "f c w h -> f w h c").cpu().detach().numpy() 36 | ) 37 | out_path = get_new_video_name(vid_path, "edge") 38 | return create_video(canny_to_save, fps, out_path) 39 | 40 | 41 | class Video2Pose: 42 | def __init__(self, device, dtype=torch.float16): 43 | print("Initializing Video2Pose") 44 | self.device = device 45 | self.dtype = dtype 46 | self.detector = OpenposeDetector(device=device) 47 | 48 | def pre_process_pose(self, input_video, apply_pose_detect: bool = True): 49 | detected_maps = [] 50 | for frame in input_video: 51 | img = rearrange(frame, "c h w -> h w c").cpu().numpy().astype(np.uint8) 52 | img = HWC3(img) 53 | if apply_pose_detect: 54 | detected_map, _ = self.detector(img) 55 | else: 56 | detected_map = img 57 | detected_map = HWC3(detected_map) 58 | H, W, C = img.shape 59 | detected_map = cv2.resize( 60 | detected_map, (W, H), interpolation=cv2.INTER_NEAREST 61 | ) 62 | detected_maps.append(detected_map[None]) 63 | detected_maps = np.concatenate(detected_maps) 64 | control = torch.from_numpy(detected_maps.copy()).float() / 255.0 65 | return rearrange(control, "f h w c -> f c h w") 66 | 67 | def inference(self, inputs, resolution=512): 68 | vid_path = inputs 69 | video, fps = prepare_video( 70 | vid_path, resolution=resolution, device=self.device, normalize=False 71 | ) 72 | vid_pose = self.pre_process_pose(video) 73 | canny_to_save = list( 74 | rearrange(vid_pose, "f c w h -> f w h 
c").cpu().detach().numpy() 75 | ) 76 | out_path = get_new_video_name(vid_path, "pose") 77 | return create_video(canny_to_save, fps, out_path) 78 | 79 | 80 | class Video2Depth: 81 | def __init__(self, device, dtype=torch.float16): 82 | print("Initializing Video2Depth") 83 | self.device = device 84 | self.dtype = dtype 85 | self.depth_estimator = MidasDetector(device) 86 | 87 | def pre_process_depth(self, input_video, apply_depth_detect: bool = True): 88 | detected_maps = [] 89 | for frame in input_video: 90 | img = rearrange(frame, "c h w -> h w c").cpu().numpy().astype(np.uint8) 91 | img = HWC3(img) 92 | if apply_depth_detect: 93 | detected_map, _ = self.depth_estimator(img) 94 | else: 95 | detected_map = img 96 | detected_map = HWC3(detected_map) 97 | H, W, C = img.shape 98 | detected_map = cv2.resize( 99 | detected_map, (W, H), interpolation=cv2.INTER_NEAREST 100 | ) 101 | detected_maps.append(detected_map[None]) 102 | detected_maps = np.concatenate(detected_maps) 103 | control = torch.from_numpy(detected_maps.copy()).float() / 255.0 104 | return rearrange(control, "f h w c -> f c h w") 105 | 106 | def inference(self, inputs, resolution=512): 107 | vid_path = inputs 108 | video, fps = prepare_video( 109 | vid_path, 110 | resolution=resolution, 111 | device=self.device, 112 | dtype=self.dtype, 113 | normalize=False, 114 | ) 115 | control = self.pre_process_depth(video).to(self.device).to(self.dtype) 116 | 117 | depth_map_to_save = list( 118 | rearrange(control, "f c w h -> f w h c").cpu().detach().numpy() 119 | ) 120 | out_path = get_new_video_name(vid_path, "depth") 121 | return create_video(depth_map_to_save, fps, out_path) 122 | -------------------------------------------------------------------------------- /modules/sadtalker/src/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | # import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | from .hparams import hparams as hp 8 | 9 | 10 | def load_wav(path, sr): 11 | return librosa.core.load(path, sr=sr)[0] 12 | 13 | def save_wav(wav, path, sr): 14 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 15 | #proposed by @dsmiller 16 | wavfile.write(path, sr, wav.astype(np.int16)) 17 | 18 | def save_wavenet_wav(wav, path, sr): 19 | librosa.output.write_wav(path, wav, sr=sr) 20 | 21 | def preemphasis(wav, k, preemphasize=True): 22 | if preemphasize: 23 | return signal.lfilter([1, -k], [1], wav) 24 | return wav 25 | 26 | def inv_preemphasis(wav, k, inv_preemphasize=True): 27 | if inv_preemphasize: 28 | return signal.lfilter([1], [1, -k], wav) 29 | return wav 30 | 31 | def get_hop_size(): 32 | hop_size = hp.hop_size 33 | if hop_size is None: 34 | assert hp.frame_shift_ms is not None 35 | hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate) 36 | return hop_size 37 | 38 | def linearspectrogram(wav): 39 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 40 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 41 | 42 | if hp.signal_normalization: 43 | return _normalize(S) 44 | return S 45 | 46 | def melspectrogram(wav): 47 | D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize)) 48 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db 49 | 50 | if hp.signal_normalization: 51 | return _normalize(S) 52 | return S 53 | 54 | def _lws_processor(): 55 | import lws 56 | return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech") 57 | 58 | def _stft(y): 59 | if 
hp.use_lws: 60 | return _lws_processor(hp).stft(y).T 61 | else: 62 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size) 63 | 64 | ########################################################## 65 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 66 | def num_frames(length, fsize, fshift): 67 | """Compute number of time frames of spectrogram 68 | """ 69 | pad = (fsize - fshift) 70 | if length % fshift == 0: 71 | M = (length + pad * 2 - fsize) // fshift + 1 72 | else: 73 | M = (length + pad * 2 - fsize) // fshift + 2 74 | return M 75 | 76 | 77 | def pad_lr(x, fsize, fshift): 78 | """Compute left and right padding 79 | """ 80 | M = num_frames(len(x), fsize, fshift) 81 | pad = (fsize - fshift) 82 | T = len(x) + 2 * pad 83 | r = (M - 1) * fshift + fsize - T 84 | return pad, pad + r 85 | ########################################################## 86 | #Librosa correct padding 87 | def librosa_pad_lr(x, fsize, fshift): 88 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 89 | 90 | # Conversions 91 | _mel_basis = None 92 | 93 | def _linear_to_mel(spectogram): 94 | global _mel_basis 95 | if _mel_basis is None: 96 | _mel_basis = _build_mel_basis() 97 | return np.dot(_mel_basis, spectogram) 98 | 99 | def _build_mel_basis(): 100 | assert hp.fmax <= hp.sample_rate // 2 101 | return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, 102 | fmin=hp.fmin, fmax=hp.fmax) 103 | 104 | def _amp_to_db(x): 105 | min_level = np.exp(hp.min_level_db / 20 * np.log(10)) 106 | return 20 * np.log10(np.maximum(min_level, x)) 107 | 108 | def _db_to_amp(x): 109 | return np.power(10.0, (x) * 0.05) 110 | 111 | def _normalize(S): 112 | if hp.allow_clipping_in_normalization: 113 | if hp.symmetric_mels: 114 | return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value, 115 | -hp.max_abs_value, hp.max_abs_value) 116 | else: 117 | return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value) 118 | 119 | assert S.max() <= 0 and S.min() - hp.min_level_db >= 0 120 | if hp.symmetric_mels: 121 | return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value 122 | else: 123 | return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)) 124 | 125 | def _denormalize(D): 126 | if hp.allow_clipping_in_normalization: 127 | if hp.symmetric_mels: 128 | return (((np.clip(D, -hp.max_abs_value, 129 | hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) 130 | + hp.min_level_db) 131 | else: 132 | return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 133 | 134 | if hp.symmetric_mels: 135 | return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db) 136 | else: 137 | return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db) 138 | -------------------------------------------------------------------------------- /modules/mplug/models/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns 
list of utf-8 byte and a corresponding list of unicode strings.
19 |     The reversible bpe codes work on unicode strings.
20 |     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21 |     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22 |     This is a significant percentage of your normal, say, 32K bpe vocab.
23 |     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24 |     And avoids mapping to whitespace/control characters the bpe code barfs on.
25 |     """
26 |     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27 |     cs = bs[:]
28 |     n = 0
29 |     for b in range(2**8):
30 |         if b not in bs:
31 |             bs.append(b)
32 |             cs.append(2**8+n)
33 |             n += 1
34 |     cs = [chr(n) for n in cs]
35 |     return dict(zip(bs, cs))
36 | 
37 | 
38 | def get_pairs(word):
39 |     """Return set of symbol pairs in a word.
40 |     Word is represented as tuple of symbols (symbols being variable-length strings).
41 |     """
42 |     pairs = set()
43 |     prev_char = word[0]
44 |     for char in word[1:]:
45 |         pairs.add((prev_char, char))
46 |         prev_char = char
47 |     return pairs
48 | 
49 | 
50 | def basic_clean(text):
51 |     text = ftfy.fix_text(text)
52 |     text = html.unescape(html.unescape(text))
53 |     return text.strip()
54 | 
55 | 
56 | def whitespace_clean(text):
57 |     text = re.sub(r'\s+', ' ', text)
58 |     text = text.strip()
59 |     return text
60 | 
61 | 
62 | class SimpleTokenizer(object):
63 |     def __init__(self, bpe_path: str = default_bpe()):
64 |         self.byte_encoder = bytes_to_unicode()
65 |         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66 |         merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67 |         merges = merges[1:49152-256-2+1]
68 |         merges = [tuple(merge.split()) for merge in merges]
69 |         vocab = list(bytes_to_unicode().values())
70 |         vocab = vocab + [v+'</w>' for v in vocab]
71 |         for merge in merges:
72 |             vocab.append(''.join(merge))
73 |         vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74 |         self.encoder = dict(zip(vocab, range(len(vocab))))
75 |         self.decoder = {v: k for k, v in self.encoder.items()}
76 |         self.bpe_ranks = dict(zip(merges, range(len(merges))))
77 |         self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78 |         self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79 | 
80 |     def bpe(self, token):
81 |         if token in self.cache:
82 |             return self.cache[token]
83 |         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
84 |         pairs = get_pairs(word)
85 | 
86 |         if not pairs:
87 |             return token+'</w>'
88 | 
89 |         while True:
90 |             bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
91 |             if bigram not in self.bpe_ranks:
92 |                 break
93 |             first, second = bigram
94 |             new_word = []
95 |             i = 0
96 |             while i < len(word):
97 |                 try:
98 |                     j = word.index(first, i)
99 |                     new_word.extend(word[i:j])
100 |                     i = j
101 |                 except:
102 |                     new_word.extend(word[i:])
103 |                     break
104 | 
105 |                 if word[i] == first and i < len(word)-1 and word[i+1] == second:
106 |                     new_word.append(first+second)
107 |                     i += 2
108 |                 else:
109 |                     new_word.append(word[i])
110 |                     i += 1
111 |             new_word = tuple(new_word)
112 |             word = new_word
113 |             if len(word) == 1:
114 |                 break
115 |             else:
116 |                 pairs = get_pairs(word)
117 |         word = ' '.join(word)
118 |         self.cache[token] = word
119 |         return word
120 | 
121 |     def encode(self, text):
122 |         bpe_tokens = []
123 |         text = whitespace_clean(basic_clean(text)).lower()
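        # Each regex-matched token is byte-encoded into the reversible unicode
        # alphabet from bytes_to_unicode(), split into BPE sub-words (whose final
        # symbol carries the "</w>" end-of-word marker), and the sub-words are
        # then looked up in self.encoder below.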
124 |         for token in re.findall(self.pat, text):
125 |             token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126 |             bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127 |         return bpe_tokens
128 | 
129 |     def decode(self, tokens):
130 |         text = ''.join([self.decoder[token] for token in tokens])
131 |         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132 |         return text
133 | 
--------------------------------------------------------------------------------
/modules/shap_e/shap_e/util/collections.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from typing import Any, Callable, Dict, List, Optional
3 | from typing import OrderedDict, Generic, TypeVar
4 | 
5 | K = TypeVar('K')
6 | V = TypeVar('V')
7 | 
8 | class AttrDict(OrderedDict[K, V], Generic[K, V]):
9 |     """
10 |     An attribute dictionary that automatically handles nested keys joined by "/".
11 | 
12 |     Originally copied from: https://stackoverflow.com/questions/3031219/recursively-access-dict-via-attributes-as-well-as-index-access
13 |     """
14 | 
15 |     MARKER = object()
16 | 
17 |     # pylint: disable=super-init-not-called
18 |     def __init__(self, *args, **kwargs):
19 |         if len(args) == 0:
20 |             for key, value in kwargs.items():
21 |                 self.__setitem__(key, value)
22 |         else:
23 |             assert len(args) == 1
24 |             assert isinstance(args[0], (dict, AttrDict))
25 |             for key, value in args[0].items():
26 |                 self.__setitem__(key, value)
27 | 
28 |     def __contains__(self, key):
29 |         if "/" in key:
30 |             keys = key.split("/")
31 |             key, next_key = keys[0], "/".join(keys[1:])
32 |             return key in self and next_key in self[key]
33 |         return super(AttrDict, self).__contains__(key)
34 | 
35 |     def __setitem__(self, key, value):
36 |         if "/" in key:
37 |             keys = key.split("/")
38 |             key, next_key = keys[0], "/".join(keys[1:])
39 |             if key not in self:
40 |                 self[key] = AttrDict()
41 |             self[key].__setitem__(next_key, value)
42 |             return
43 | 
44 |         if isinstance(value, dict) and not isinstance(value, AttrDict):
45 |             value = AttrDict(**value)
46 |         if isinstance(value, list):
47 |             value = [AttrDict(val) if isinstance(val, dict) else val for val in value]
48 |         super(AttrDict, self).__setitem__(key, value)
49 | 
50 |     def __getitem__(self, key):
51 |         if "/" in key:
52 |             keys = key.split("/")
53 |             key, next_key = keys[0], "/".join(keys[1:])
54 |             val = self[key]
55 |             if not isinstance(val, AttrDict):
56 |                 raise ValueError
57 |             return val.__getitem__(next_key)
58 | 
59 |         return self.get(key, None)
60 | 
61 |     def all_keys(
62 |         self,
63 |         leaves_only: bool = False,
64 |         parent: Optional[str] = None,
65 |     ) -> List[str]:
66 |         keys = []
67 |         for key in self.keys():
68 |             cur = key if parent is None else f"{parent}/{key}"
69 |             if not leaves_only or not isinstance(self[key], dict):
70 |                 keys.append(cur)
71 |             if isinstance(self[key], dict):
72 |                 keys.extend(self[key].all_keys(leaves_only=leaves_only, parent=cur))
73 |         return keys
74 | 
75 |     def dumpable(self, strip=True):
76 |         """
77 |         Casts into OrderedDict and removes internal attributes
78 |         """
79 | 
80 |         def _dump(val):
81 |             if isinstance(val, AttrDict):
82 |                 return val.dumpable()
83 |             elif isinstance(val, list):
84 |                 return [_dump(v) for v in val]
85 |             return val
86 | 
87 |         if strip:
88 |             return {k: _dump(v) for k, v in self.items() if not k.startswith("_")}
89 |         return {k: _dump(v if not k.startswith("_") else repr(v)) for k, v in self.items()}
90 | 
91 |     def map(
92 |         self,
93 | 
map_fn: Callable[[Any, Any], Any], 94 | should_map: Optional[Callable[[Any, Any], bool]] = None, 95 | ) -> "AttrDict": 96 | """ 97 | Creates a copy of self where some or all values are transformed by 98 | map_fn. 99 | 100 | :param should_map: If provided, only those values that evaluate to true 101 | are converted; otherwise, all values are mapped. 102 | """ 103 | 104 | def _apply(key, val): 105 | if isinstance(val, AttrDict): 106 | return val.map(map_fn, should_map) 107 | elif should_map is None or should_map(key, val): 108 | return map_fn(key, val) 109 | return val 110 | 111 | return AttrDict({k: _apply(k, v) for k, v in self.items()}) 112 | 113 | def __eq__(self, other): 114 | return self.keys() == other.keys() and all(self[k] == other[k] for k in self.keys()) 115 | 116 | def combine( 117 | self, 118 | other: Dict[str, Any], 119 | combine_fn: Callable[[Optional[Any], Optional[Any]], Any], 120 | ) -> "AttrDict": 121 | """ 122 | Some values may be missing, but the dictionary structures must be the 123 | same. 124 | 125 | :param combine_fn: a (possibly non-commutative) function to combine the 126 | values 127 | """ 128 | 129 | def _apply(val, other_val): 130 | if val is not None and isinstance(val, AttrDict): 131 | assert isinstance(other_val, AttrDict) 132 | return val.combine(other_val, combine_fn) 133 | return combine_fn(val, other_val) 134 | 135 | # TODO nit: this changes the ordering.. 136 | keys = self.keys() | other.keys() 137 | return AttrDict({k: _apply(self[k], other[k]) for k in keys}) 138 | 139 | __setattr__, __getattr__ = __setitem__, __getitem__ 140 | -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/blender/render.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | import tempfile 5 | import zipfile 6 | 7 | import blobfile as bf 8 | import numpy as np 9 | from PIL import Image 10 | 11 | from ...rendering.mesh import TriMesh 12 | 13 | from .constants import BASIC_AMBIENT_COLOR, BASIC_DIFFUSE_COLOR, UNIFORM_LIGHT_DIRECTION 14 | 15 | SCRIPT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "blender_script.py") 16 | 17 | 18 | def render_model( 19 | model_path: str, 20 | output_path: str, 21 | num_images: int, 22 | backend: str = "BLENDER_EEVEE", 23 | light_mode: str = "random", 24 | camera_pose: str = "random", 25 | camera_dist_min: float = 2.0, 26 | camera_dist_max: float = 2.0, 27 | fast_mode: bool = False, 28 | extract_material: bool = False, 29 | delete_material: bool = False, 30 | verbose: bool = False, 31 | timeout: float = 15 * 60, 32 | ): 33 | with tempfile.TemporaryDirectory() as tmp_dir: 34 | tmp_in = model_path 35 | tmp_out = os.path.join(tmp_dir, "out") 36 | zip_out = tmp_out + ".zip" 37 | os.mkdir(tmp_out) 38 | args = [] 39 | if platform.system() == "Linux": 40 | # Needed to enable Eevee backend on headless linux. 
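            # xvfb-run starts a virtual framebuffer X server so Blender can create
            # an OpenGL context without a physical display; "-a" picks a free
            # display number automatically.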
41 | args = ["xvfb-run", "-a"] 42 | args.extend( 43 | [ 44 | _blender_binary_path(), 45 | "-b", 46 | "-P", 47 | SCRIPT_PATH, 48 | "--", 49 | "--input_path", 50 | tmp_in, 51 | "--output_path", 52 | tmp_out, 53 | "--num_images", 54 | str(num_images), 55 | "--backend", 56 | backend, 57 | "--light_mode", 58 | light_mode, 59 | "--camera_pose", 60 | camera_pose, 61 | "--camera_dist_min", 62 | str(camera_dist_min), 63 | "--camera_dist_max", 64 | str(camera_dist_max), 65 | "--uniform_light_direction", 66 | *[str(x) for x in UNIFORM_LIGHT_DIRECTION], 67 | "--basic_ambient", 68 | str(BASIC_AMBIENT_COLOR), 69 | "--basic_diffuse", 70 | str(BASIC_DIFFUSE_COLOR), 71 | ] 72 | ) 73 | if fast_mode: 74 | args.append("--fast_mode") 75 | if extract_material: 76 | args.append("--extract_material") 77 | if delete_material: 78 | args.append("--delete_material") 79 | if verbose: 80 | subprocess.check_call(args) 81 | else: 82 | try: 83 | output = subprocess.check_output(args, stderr=subprocess.STDOUT, timeout=timeout) 84 | except subprocess.CalledProcessError as exc: 85 | raise RuntimeError(f"{exc}: {exc.output}") from exc 86 | if not os.path.exists(os.path.join(tmp_out, "info.json")): 87 | if verbose: 88 | # There is no output available, since it was 89 | # logged directly to stdout/stderr. 90 | raise RuntimeError(f"render failed: output file missing") 91 | else: 92 | raise RuntimeError(f"render failed: output file missing. Output: {output}") 93 | _combine_rgba(tmp_out) 94 | with zipfile.ZipFile(zip_out, mode="w") as zf: 95 | for name in os.listdir(tmp_out): 96 | zf.write(os.path.join(tmp_out, name), name) 97 | bf.copy(zip_out, output_path, overwrite=True) 98 | 99 | 100 | def render_mesh( 101 | mesh: TriMesh, 102 | output_path: str, 103 | num_images: int, 104 | backend: str = "BLENDER_EEVEE", 105 | **kwargs, 106 | ): 107 | if mesh.has_vertex_colors() and backend not in ["BLENDER_EEVEE", "CYCLES"]: 108 | raise ValueError(f"backend does not support vertex colors: {backend}") 109 | 110 | with tempfile.TemporaryDirectory() as tmp_dir: 111 | ply_path = os.path.join(tmp_dir, "out.ply") 112 | with open(ply_path, "wb") as f: 113 | mesh.write_ply(f) 114 | render_model( 115 | ply_path, output_path=output_path, num_images=num_images, backend=backend, **kwargs 116 | ) 117 | 118 | 119 | def _combine_rgba(out_dir: str): 120 | i = 0 121 | while True: 122 | paths = [os.path.join(out_dir, f"{i:05}_{ch}.png") for ch in "rgba"] 123 | if not os.path.exists(paths[0]): 124 | break 125 | joined = np.stack( 126 | [(np.array(Image.open(path)) >> 8).astype(np.uint8) for path in paths], axis=-1 127 | ) 128 | Image.fromarray(joined).save(os.path.join(out_dir, f"{i:05}.png")) 129 | for path in paths: 130 | os.remove(path) 131 | i += 1 132 | 133 | 134 | def _blender_binary_path() -> str: 135 | path = os.getenv("BLENDER_PATH", None) 136 | if path is not None: 137 | return path 138 | 139 | if os.path.exists("/Applications/Blender.app/Contents/MacOS/Blender"): 140 | return "/Applications/Blender.app/Contents/MacOS/Blender" 141 | 142 | raise EnvironmentError( 143 | "To render 3D models, install Blender version 3.3.1 or higher and " 144 | "set the environment variable `BLENDER_PATH` to the path of the Blender executable." 
145 | ) 146 | -------------------------------------------------------------------------------- /modules/annotator/midas/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for monoDepth.""" 2 | import sys 3 | import re 4 | import numpy as np 5 | import cv2 6 | import torch 7 | 8 | 9 | def read_pfm(path): 10 | """Read pfm file. 11 | 12 | Args: 13 | path (str): path to file 14 | 15 | Returns: 16 | tuple: (data, scale) 17 | """ 18 | with open(path, "rb") as file: 19 | 20 | color = None 21 | width = None 22 | height = None 23 | scale = None 24 | endian = None 25 | 26 | header = file.readline().rstrip() 27 | if header.decode("ascii") == "PF": 28 | color = True 29 | elif header.decode("ascii") == "Pf": 30 | color = False 31 | else: 32 | raise Exception("Not a PFM file: " + path) 33 | 34 | dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) 35 | if dim_match: 36 | width, height = list(map(int, dim_match.groups())) 37 | else: 38 | raise Exception("Malformed PFM header.") 39 | 40 | scale = float(file.readline().decode("ascii").rstrip()) 41 | if scale < 0: 42 | # little-endian 43 | endian = "<" 44 | scale = -scale 45 | else: 46 | # big-endian 47 | endian = ">" 48 | 49 | data = np.fromfile(file, endian + "f") 50 | shape = (height, width, 3) if color else (height, width) 51 | 52 | data = np.reshape(data, shape) 53 | data = np.flipud(data) 54 | 55 | return data, scale 56 | 57 | 58 | def write_pfm(path, image, scale=1): 59 | """Write pfm file. 60 | 61 | Args: 62 | path (str): pathto file 63 | image (array): data 64 | scale (int, optional): Scale. Defaults to 1. 65 | """ 66 | 67 | with open(path, "wb") as file: 68 | color = None 69 | 70 | if image.dtype.name != "float32": 71 | raise Exception("Image dtype must be float32.") 72 | 73 | image = np.flipud(image) 74 | 75 | if len(image.shape) == 3 and image.shape[2] == 3: # color image 76 | color = True 77 | elif ( 78 | len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 79 | ): # greyscale 80 | color = False 81 | else: 82 | raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") 83 | 84 | file.write("PF\n" if color else "Pf\n".encode()) 85 | file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) 86 | 87 | endian = image.dtype.byteorder 88 | 89 | if endian == "<" or endian == "=" and sys.byteorder == "little": 90 | scale = -scale 91 | 92 | file.write("%f\n".encode() % scale) 93 | 94 | image.tofile(file) 95 | 96 | 97 | def read_image(path): 98 | """Read image and output RGB image (0-1). 99 | 100 | Args: 101 | path (str): path to file 102 | 103 | Returns: 104 | array: RGB image (0-1) 105 | """ 106 | img = cv2.imread(path) 107 | 108 | if img.ndim == 2: 109 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) 110 | 111 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 112 | 113 | return img 114 | 115 | 116 | def resize_image(img): 117 | """Resize image and make it fit for network. 
118 | 119 | Args: 120 | img (array): image 121 | 122 | Returns: 123 | tensor: data ready for network 124 | """ 125 | height_orig = img.shape[0] 126 | width_orig = img.shape[1] 127 | 128 | if width_orig > height_orig: 129 | scale = width_orig / 384 130 | else: 131 | scale = height_orig / 384 132 | 133 | height = (np.ceil(height_orig / scale / 32) * 32).astype(int) 134 | width = (np.ceil(width_orig / scale / 32) * 32).astype(int) 135 | 136 | img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) 137 | 138 | img_resized = ( 139 | torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() 140 | ) 141 | img_resized = img_resized.unsqueeze(0) 142 | 143 | return img_resized 144 | 145 | 146 | def resize_depth(depth, width, height): 147 | """Resize depth map and bring to CPU (numpy). 148 | 149 | Args: 150 | depth (tensor): depth 151 | width (int): image width 152 | height (int): image height 153 | 154 | Returns: 155 | array: processed depth 156 | """ 157 | depth = torch.squeeze(depth[0, :, :, :]).to("cpu") 158 | 159 | depth_resized = cv2.resize( 160 | depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC 161 | ) 162 | 163 | return depth_resized 164 | 165 | def write_depth(path, depth, bits=1): 166 | """Write depth map to pfm and png file. 167 | 168 | Args: 169 | path (str): filepath without extension 170 | depth (array): depth 171 | """ 172 | write_pfm(path + ".pfm", depth.astype(np.float32)) 173 | 174 | depth_min = depth.min() 175 | depth_max = depth.max() 176 | 177 | max_val = (2**(8*bits))-1 178 | 179 | if depth_max - depth_min > np.finfo("float").eps: 180 | out = max_val * (depth - depth_min) / (depth_max - depth_min) 181 | else: 182 | out = np.zeros(depth.shape, dtype=depth.type) 183 | 184 | if bits == 1: 185 | cv2.imwrite(path + ".png", out.astype("uint8")) 186 | elif bits == 2: 187 | cv2.imwrite(path + ".png", out.astype("uint16")) 188 | 189 | return 190 | -------------------------------------------------------------------------------- /modules/sadtalker/src/face3d/models/arcface_torch/backbones/mobilefacenet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Adapted from https://github.com/cavalleria/cavaface.pytorch/blob/master/backbone/mobilefacenet.py 3 | Original author cavalleria 4 | ''' 5 | 6 | import torch.nn as nn 7 | from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Sequential, Module 8 | import torch 9 | 10 | 11 | 12 | class Flatten(Module): 13 | def forward(self, x): 14 | return x.view(x.size(0), -1) 15 | 16 | 17 | class ConvBlock(Module): 18 | def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): 19 | super(ConvBlock, self).__init__() 20 | self.layers = nn.Sequential( 21 | Conv2d(in_c, out_c, kernel, groups=groups, stride=stride, padding=padding, bias=False), 22 | BatchNorm2d(num_features=out_c), 23 | PReLU(num_parameters=out_c) 24 | ) 25 | 26 | def forward(self, x): 27 | return self.layers(x) 28 | 29 | 30 | class LinearBlock(Module): 31 | def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): 32 | super(LinearBlock, self).__init__() 33 | self.layers = nn.Sequential( 34 | Conv2d(in_c, out_c, kernel, stride, padding, groups=groups, bias=False), 35 | BatchNorm2d(num_features=out_c) 36 | ) 37 | 38 | def forward(self, x): 39 | return self.layers(x) 40 | 41 | 42 | class DepthWise(Module): 43 | def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1): 44 | 
super(DepthWise, self).__init__() 45 | self.residual = residual 46 | self.layers = nn.Sequential( 47 | ConvBlock(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)), 48 | ConvBlock(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride), 49 | LinearBlock(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) 50 | ) 51 | 52 | def forward(self, x): 53 | short_cut = None 54 | if self.residual: 55 | short_cut = x 56 | x = self.layers(x) 57 | if self.residual: 58 | output = short_cut + x 59 | else: 60 | output = x 61 | return output 62 | 63 | 64 | class Residual(Module): 65 | def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)): 66 | super(Residual, self).__init__() 67 | modules = [] 68 | for _ in range(num_block): 69 | modules.append(DepthWise(c, c, True, kernel, stride, padding, groups)) 70 | self.layers = Sequential(*modules) 71 | 72 | def forward(self, x): 73 | return self.layers(x) 74 | 75 | 76 | class GDC(Module): 77 | def __init__(self, embedding_size): 78 | super(GDC, self).__init__() 79 | self.layers = nn.Sequential( 80 | LinearBlock(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)), 81 | Flatten(), 82 | Linear(512, embedding_size, bias=False), 83 | BatchNorm1d(embedding_size)) 84 | 85 | def forward(self, x): 86 | return self.layers(x) 87 | 88 | 89 | class MobileFaceNet(Module): 90 | def __init__(self, fp16=False, num_features=512): 91 | super(MobileFaceNet, self).__init__() 92 | scale = 2 93 | self.fp16 = fp16 94 | self.layers = nn.Sequential( 95 | ConvBlock(3, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1)), 96 | ConvBlock(64 * scale, 64 * scale, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64), 97 | DepthWise(64 * scale, 64 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128), 98 | Residual(64 * scale, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), 99 | DepthWise(64 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256), 100 | Residual(128 * scale, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), 101 | DepthWise(128 * scale, 128 * scale, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512), 102 | Residual(128 * scale, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)), 103 | ) 104 | self.conv_sep = ConvBlock(128 * scale, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0)) 105 | self.features = GDC(num_features) 106 | self._initialize_weights() 107 | 108 | def _initialize_weights(self): 109 | for m in self.modules(): 110 | if isinstance(m, nn.Conv2d): 111 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 112 | if m.bias is not None: 113 | m.bias.data.zero_() 114 | elif isinstance(m, nn.BatchNorm2d): 115 | m.weight.data.fill_(1) 116 | m.bias.data.zero_() 117 | elif isinstance(m, nn.Linear): 118 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 119 | if m.bias is not None: 120 | m.bias.data.zero_() 121 | 122 | def forward(self, x): 123 | with torch.cuda.amp.autocast(self.fp16): 124 | x = self.layers(x) 125 | x = self.conv_sep(x.float() if self.fp16 else x) 126 | x = self.features(x) 127 | return x 128 | 129 | 130 | def get_mbf(fp16, num_features): 131 | return MobileFaceNet(fp16, num_features) -------------------------------------------------------------------------------- /modules/shap_e/shap_e/rendering/raycast/cast.py: -------------------------------------------------------------------------------- 1 | from typing 
import Iterator, Optional, Tuple 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from ...rendering.view_data import ProjectiveCamera 7 | 8 | from ._utils import cross_product 9 | from .types import RayCollisions, Rays, TriMesh 10 | 11 | 12 | def cast_camera( 13 | camera: ProjectiveCamera, 14 | mesh: TriMesh, 15 | ray_batch_size: Optional[int] = None, 16 | checkpoint: Optional[bool] = None, 17 | ) -> Iterator[RayCollisions]: 18 | pixel_indices = np.arange(camera.width * camera.height) 19 | image_coords = np.stack([pixel_indices % camera.width, pixel_indices // camera.width], axis=1) 20 | rays = camera.camera_rays(image_coords) 21 | batch_size = ray_batch_size or len(rays) 22 | checkpoint = checkpoint if checkpoint is not None else batch_size < len(rays) 23 | for i in range(0, len(rays), batch_size): 24 | sub_rays = rays[i : i + batch_size] 25 | origins = torch.from_numpy(sub_rays[:, 0]).to(mesh.vertices) 26 | directions = torch.from_numpy(sub_rays[:, 1]).to(mesh.vertices) 27 | yield cast_rays(Rays(origins=origins, directions=directions), mesh, checkpoint=checkpoint) 28 | 29 | 30 | def cast_rays(rays: Rays, mesh: TriMesh, checkpoint: bool = False) -> RayCollisions: 31 | """ 32 | Cast a batch of rays onto a mesh. 33 | """ 34 | if checkpoint: 35 | collides, ray_dists, tri_indices, barycentric, normals = RayCollisionFunction.apply( 36 | rays.origins, rays.directions, mesh.faces, mesh.vertices 37 | ) 38 | return RayCollisions( 39 | collides=collides, 40 | ray_dists=ray_dists, 41 | tri_indices=tri_indices, 42 | barycentric=barycentric, 43 | normals=normals, 44 | ) 45 | 46 | # https://github.com/unixpickle/vae-textures/blob/2968549ddd4a3487f9437d4db00793324453cd59/vae_textures/render.py#L98 47 | normals = mesh.normals() # [N x 3] 48 | directions = rays.directions # [M x 3] 49 | collides = (directions @ normals.T).abs() > 1e-8 # [N x M] 50 | 51 | tris = mesh.vertices[mesh.faces] # [N x 3 x 3] 52 | v1 = tris[:, 1] - tris[:, 0] 53 | v2 = tris[:, 2] - tris[:, 0] 54 | 55 | cross1 = cross_product(directions[:, None], v2[None]) # [N x M x 3] 56 | det = torch.sum(cross1 * v1[None], dim=-1) # [N x M] 57 | collides = torch.logical_and(collides, det.abs() > 1e-8) 58 | 59 | invDet = 1 / det # [N x M] 60 | o = rays.origins[:, None] - tris[None, :, 0] # [N x M x 3] 61 | bary1 = invDet * torch.sum(o * cross1, dim=-1) # [N x M] 62 | collides = torch.logical_and(collides, torch.logical_and(bary1 >= 0, bary1 <= 1)) 63 | 64 | cross2 = cross_product(o, v1[None]) # [N x M x 3] 65 | bary2 = invDet * torch.sum(directions[:, None] * cross2, dim=-1) # [N x M] 66 | collides = torch.logical_and(collides, torch.logical_and(bary2 >= 0, bary2 <= 1)) 67 | 68 | bary0 = 1 - (bary1 + bary2) 69 | 70 | # Make sure this is in the positive part of the ray. 
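    # "scale" is the ray parameter t of this Moller-Trumbore style test: the hit
    # point is origin + t * direction, so t must be positive for the triangle to
    # lie in front of the ray origin rather than behind it.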
71 |     scale = invDet * torch.sum(v2 * cross2, dim=-1)
72 |     collides = torch.logical_and(collides, scale > 0)
73 | 
74 |     # Select the nearest collision
75 |     ray_dists, tri_indices = torch.min(
76 |         torch.where(collides, scale, torch.tensor(torch.inf).to(scale)), dim=-1
77 |     ) # [N]
78 |     nearest_bary = torch.stack(
79 |         [
80 |             bary0[range(len(tri_indices)), tri_indices],
81 |             bary1[range(len(tri_indices)), tri_indices],
82 |             bary2[range(len(tri_indices)), tri_indices],
83 |         ],
84 |         dim=-1,
85 |     )
86 | 
87 |     return RayCollisions(
88 |         collides=torch.any(collides, dim=-1),
89 |         ray_dists=ray_dists,
90 |         tri_indices=tri_indices,
91 |         barycentric=nearest_bary,
92 |         normals=normals[tri_indices],
93 |     )
94 | 
95 | 
96 | class RayCollisionFunction(torch.autograd.Function):
97 |     @staticmethod
98 |     def forward(
99 |         ctx, origins, directions, faces, vertices
100 |     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
101 |         ctx.save_for_backward(origins, directions, faces, vertices)
102 |         with torch.no_grad():
103 |             res = cast_rays(
104 |                 Rays(origins=origins, directions=directions),
105 |                 TriMesh(faces=faces, vertices=vertices),
106 |                 checkpoint=False,
107 |             )
108 |         return (res.collides, res.ray_dists, res.tri_indices, res.barycentric, res.normals)
109 | 
110 |     @staticmethod
111 |     def backward(
112 |         ctx, _collides_grad, ray_dists_grad, _tri_indices_grad, barycentric_grad, normals_grad
113 |     ):
114 |         origins, directions, faces, vertices = ctx.saved_tensors  # stored via ctx.save_for_backward() in forward()
115 | 
116 |         origins = origins.detach().requires_grad_(True)
117 |         directions = directions.detach().requires_grad_(True)
118 |         vertices = vertices.detach().requires_grad_(True)
119 | 
120 |         with torch.enable_grad():
121 |             outputs = cast_rays(
122 |                 Rays(origins=origins, directions=directions),
123 |                 TriMesh(faces=faces, vertices=vertices),
124 |                 checkpoint=False,
125 |             )
126 | 
127 |         origins_grad, directions_grad, vertices_grad = torch.autograd.grad(
128 |             (outputs.ray_dists, outputs.barycentric, outputs.normals),
129 |             (origins, directions, vertices),
130 |             (ray_dists_grad, barycentric_grad, normals_grad),
131 |         )
132 |         return (origins_grad, directions_grad, None, vertices_grad)
133 | 
--------------------------------------------------------------------------------
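A minimal usage sketch for the ray-casting helpers in cast.py above. The import paths and the toy triangle are illustrative assumptions rather than code from the repository; only Rays, TriMesh, cast_rays, and the RayCollisions fields used here appear in the source.

import torch

# Assumed import paths; adjust to however this package is installed.
from modules.shap_e.shap_e.rendering.raycast.types import Rays, TriMesh
from modules.shap_e.shap_e.rendering.raycast.cast import cast_rays

# One triangle lying in the z = 1 plane.
vertices = torch.tensor([[0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 1.0, 1.0]])
faces = torch.tensor([[0, 1, 2]], dtype=torch.long)
mesh = TriMesh(faces=faces, vertices=vertices)

# A single ray aimed at the triangle's interior, travelling along +z.
rays = Rays(
    origins=torch.tensor([[0.25, 0.25, 0.0]]),
    directions=torch.tensor([[0.0, 0.0, 1.0]]),
)

hits = cast_rays(rays, mesh)
# Expect one hit at distance ~1.0 with barycentric weights summing to 1.
print(hits.collides, hits.ray_dists, hits.barycentric)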