├── .gitignore
├── DiT
    ├── __init__.py
    ├── model.py
    ├── utils.py
    └── vae.py
├── LICENSE.txt
├── README.md
├── __assets__
    ├── demos
    │   ├── demo_1
    │   │   ├── first_frame.jpg
    │   │   ├── layer_0.jpg
    │   │   ├── layer_1.jpg
    │   │   ├── layer_2.jpg
    │   │   ├── sketch.mp4
    │   │   ├── trajectory.json
    │   │   └── trajectory.npz
    │   ├── demo_2
    │   │   ├── first_frame.jpg
    │   │   ├── layer_0.jpg
    │   │   ├── layer_1.jpg
    │   │   ├── layer_2.jpg
    │   │   ├── sketch.mp4
    │   │   ├── trajectory.json
    │   │   └── trajectory.npz
    │   ├── demo_3
    │   │   ├── first_frame.jpg
    │   │   ├── last_frame.jpg
    │   │   ├── layer_0.jpg
    │   │   ├── layer_0_last.jpg
    │   │   ├── layer_1.jpg
    │   │   ├── layer_1_last.jpg
    │   │   ├── layer_2.jpg
    │   │   ├── layer_2_last.jpg
    │   │   ├── layer_3.jpg
    │   │   ├── layer_3_last.jpg
    │   │   ├── sketch.mp4
    │   │   ├── trajectory.json
    │   │   └── trajectory.npz
    │   ├── demo_4
    │   │   ├── first_frame.jpg
    │   │   ├── layer_0.jpg
    │   │   ├── layer_1.jpg
    │   │   ├── layer_2.jpg
    │   │   ├── sketch.mp4
    │   │   ├── trajectory.json
    │   │   └── trajectory.npz
    │   ├── demo_5
    │   │   ├── first_frame.jpg
    │   │   ├── layer_0.jpg
    │   │   ├── layer_1.jpg
    │   │   ├── sketch.mp4
    │   │   ├── trajectory.json
    │   │   └── trajectory.npz
    │   └── realworld
    │   │   ├── config.yaml
    │   │   ├── first_frame.jpg
    │   │   ├── layer_0.jpg
    │   │   ├── layer_1.jpg
    │   │   ├── layer_2.jpg
    │   │   ├── sketch.mp4
    │   │   ├── trajectory_bg.json
    │   │   └── trajectory_dog.json
    └── figs
    │   └── demos.gif
├── lineart
    ├── LICENSE
    └── __init__.py
├── lvdm
    ├── basics.py
    ├── common.py
    ├── data
    │   └── dataset.py
    ├── models
    │   ├── autoencoder.py
    │   ├── condition.py
    │   ├── controlnet.py
    │   ├── layer_controlnet.py
    │   └── unet.py
    ├── modules
    │   ├── ae_dualref_modules.py
    │   ├── ae_modules.py
    │   ├── attention.py
    │   └── attention_svd.py
    ├── pipelines
    │   └── pipeline_animation.py
    └── utils.py
├── requirements.txt
└── scripts
    ├── animate_Layer.py
    ├── app.py
    ├── demo1.yaml
    ├── demo2.yaml
    ├── demo3.yaml
    ├── demo4.yaml
    ├── demo5.yaml
    └── infer_DiT.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | wandb/
 2 | *debug*
 3 | debugs/
 4 | outputs/
 5 | samples/
 6 | __pycache__/
 7 | 
 8 | *.ipynb
 9 | *.safetensors
10 | *.ckpt
11 | *.pth
12 | /data
13 | /vis
14 | /checkpoints
15 | 


--------------------------------------------------------------------------------
/DiT/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/DiT/__init__.py


--------------------------------------------------------------------------------
/DiT/model.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright (c) Alibaba, Inc. and its affiliates.
  3 | import torch
  4 | import torch.nn as nn
  5 | from typing import Any, Dict
  6 | from diffusers import __version__
  7 | from diffusers.configuration_utils import register_to_config
  8 | from diffusers.utils import (
  9 |     SAFETENSORS_WEIGHTS_NAME,
 10 |     WEIGHTS_NAME,
 11 |     logging,
 12 |     is_torch_version,
 13 |     _get_model_file,
 14 |     _add_variant
 15 | )
 16 | from diffusers.models.model_loading_utils import load_state_dict
 17 | from wan.modules.model import WanModel, WanAttentionBlock, sinusoidal_embedding_1d
 18 | from omegaconf import ListConfig, DictConfig, OmegaConf
 19 | 
 20 | 
# Module-level logger obtained from the diffusers logging utilities; used by
# VaceWanModel.from_pretrained to report weight-fetching problems.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 22 | 
 23 | 
 24 | class VaceWanAttentionBlock(WanAttentionBlock):
 25 |     def __init__(
 26 |             self,
 27 |             cross_attn_type,
 28 |             dim,
 29 |             ffn_dim,
 30 |             num_heads,
 31 |             window_size=(-1, -1),
 32 |             qk_norm=True,
 33 |             cross_attn_norm=False,
 34 |             eps=1e-6,
 35 |             block_id=0
 36 |     ):
 37 |         super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps)
 38 |         self.block_id = block_id
 39 |         if block_id == 0:
 40 |             self.before_proj = nn.Linear(self.dim, self.dim)
 41 |             nn.init.zeros_(self.before_proj.weight)
 42 |             nn.init.zeros_(self.before_proj.bias)
 43 |         self.after_proj = nn.Linear(self.dim, self.dim)
 44 |         nn.init.zeros_(self.after_proj.weight)
 45 |         nn.init.zeros_(self.after_proj.bias)
 46 | 
 47 |     def forward(self, c, x, **kwargs):
 48 |         if self.block_id == 0:
 49 |             c = self.before_proj(c) + x
 50 |             all_c = []
 51 |         else:
 52 |             all_c = list(torch.unbind(c))
 53 |             c = all_c.pop(-1)
 54 |         c = super().forward(c, **kwargs)
 55 |         c_skip = self.after_proj(c)
 56 |         all_c += [c_skip, c]
 57 |         c = torch.stack(all_c)
 58 |         return c
 59 | 
 60 | 
 61 | class BaseWanAttentionBlock(WanAttentionBlock):
 62 |     def __init__(
 63 |         self,
 64 |         cross_attn_type,
 65 |         dim,
 66 |         ffn_dim,
 67 |         num_heads,
 68 |         window_size=(-1, -1),
 69 |         qk_norm=True,
 70 |         cross_attn_norm=False,
 71 |         eps=1e-6,
 72 |         block_id=None
 73 |     ):
 74 |         super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps)
 75 |         self.block_id = block_id
 76 | 
 77 |     def forward(self, x, hints, context_scale=1.0, **kwargs):
 78 |         x = super().forward(x, **kwargs)
 79 |         if self.block_id is not None:
 80 |             x = x + hints[self.block_id] * context_scale
 81 |         return x
 82 | 
 83 | 
 84 | class VaceWanModel(WanModel):
 85 |     _supports_gradient_checkpointing = True
 86 | 
 87 |     @register_to_config
 88 |     def __init__(self,
 89 |                  vace_layers=None,
 90 |                  vace_in_dim=None,
 91 |                  model_type='t2v',
 92 |                  patch_size=(1, 2, 2),
 93 |                  text_len=512,
 94 |                  in_dim=16,
 95 |                  dim=2048,
 96 |                  ffn_dim=8192,
 97 |                  freq_dim=256,
 98 |                  text_dim=4096,
 99 |                  out_dim=16,
100 |                  num_heads=16,
101 |                  num_layers=32,
102 |                  window_size=(-1, -1),
103 |                  qk_norm=True,
104 |                  cross_attn_norm=True,
105 |                  eps=1e-6):
106 |         super().__init__(model_type, patch_size, text_len, in_dim, dim, ffn_dim, freq_dim, text_dim, out_dim,
107 |                          num_heads, num_layers, window_size, qk_norm, cross_attn_norm, eps)
108 | 
109 |         self.vace_layers = [i for i in range(0, self.num_layers, 2)] if vace_layers is None else vace_layers
110 |         self.vace_in_dim = self.in_dim if vace_in_dim is None else vace_in_dim
111 | 
112 |         assert 0 in self.vace_layers
113 |         self.vace_layers_mapping = {i: n for n, i in enumerate(self.vace_layers)}
114 | 
115 |         # blocks
116 |         self.blocks = nn.ModuleList([
117 |             BaseWanAttentionBlock('t2v_cross_attn', self.dim, self.ffn_dim, self.num_heads, self.window_size, self.qk_norm,
118 |                                   self.cross_attn_norm, self.eps,
119 |                                   block_id=self.vace_layers_mapping[i] if i in self.vace_layers else None)
120 |             for i in range(self.num_layers)
121 |         ])
122 | 
123 |         # vace blocks
124 |         self.vace_blocks = nn.ModuleList([
125 |             VaceWanAttentionBlock('t2v_cross_attn', self.dim, self.ffn_dim, self.num_heads, self.window_size, self.qk_norm,
126 |                                      self.cross_attn_norm, self.eps, block_id=i)
127 |             for i in self.vace_layers
128 |         ])
129 | 
130 |         # vace patch embeddings
131 |         self.vace_patch_embedding = nn.Conv3d(
132 |             self.vace_in_dim, self.dim, kernel_size=self.patch_size, stride=self.patch_size
133 |         )
134 | 
135 |         self.gradient_checkpointing = False
136 | 
137 |     def _set_gradient_checkpointing(self, module, value=False):
138 |         self.gradient_checkpointing = value
139 | 
140 |     def forward_vace(
141 |         self,
142 |         x,
143 |         vace_context,
144 |         seq_len,
145 |         kwargs
146 |     ):
147 |         # embeddings
148 |         c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
149 |         c = [u.flatten(2).transpose(1, 2) for u in c]
150 |         c = torch.cat([
151 |             torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
152 |                       dim=1) for u in c
153 |         ])
154 | 
155 |         # arguments
156 |         new_kwargs = dict(x=x)
157 |         new_kwargs.update(kwargs)
158 | 
159 |         for block in self.vace_blocks:
160 |             if self.training and self.gradient_checkpointing:
161 | 
162 |                 def create_custom_forward(module):
163 |                     def custom_forward(*inputs, **kwargs):
164 |                         return module(*inputs, **kwargs)
165 | 
166 |                     return custom_forward
167 |                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
168 |                 new_kwargs.update(ckpt_kwargs)
169 |                 c = torch.utils.checkpoint.checkpoint(create_custom_forward(block), c, **new_kwargs)
170 |             else:
171 |                 c = block(c, **new_kwargs)
172 |         hints = torch.unbind(c)[:-1]
173 |         return hints
174 | 
175 |     def forward(
176 |         self,
177 |         x,
178 |         t,
179 |         vace_context,
180 |         context,
181 |         seq_len,
182 |         vace_context_scale=1.0,
183 |         clip_fea=None,
184 |         y=None,
185 |     ):
186 |         r"""
187 |         Forward pass through the diffusion model
188 | 
189 |         Args:
190 |             x (List[Tensor]):
191 |                 List of input video tensors, each with shape [C_in, F, H, W]
192 |             t (Tensor):
193 |                 Diffusion timesteps tensor of shape [B]
194 |             context (List[Tensor]):
195 |                 List of text embeddings each with shape [L, C]
196 |             seq_len (`int`):
197 |                 Maximum sequence length for positional encoding
198 |             clip_fea (Tensor, *optional*):
199 |                 CLIP image features for image-to-video mode
200 |             y (List[Tensor], *optional*):
201 |                 Conditional video inputs for image-to-video mode, same shape as x
202 | 
203 |         Returns:
204 |             List[Tensor]:
205 |                 List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
206 |         """
207 |         # if self.model_type == 'i2v':
208 |         #     assert clip_fea is not None and y is not None
209 |         # params
210 |         device = self.patch_embedding.weight.device
211 |         if self.freqs.device != device:
212 |             self.freqs = self.freqs.to(device)
213 | 
214 |         # if y is not None:
215 |         #     x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
216 | 
217 |         # embeddings
218 |         x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
219 |         grid_sizes = torch.stack(
220 |             [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
221 |         x = [u.flatten(2).transpose(1, 2) for u in x]
222 |         seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
223 |         assert seq_lens.max() <= seq_len
224 |         x = torch.cat([
225 |             torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
226 |                       dim=1) for u in x
227 |         ])
228 | 
229 |         # time embeddings
230 |         with torch.amp.autocast('cuda', dtype=torch.float32):
231 |             e = self.time_embedding(
232 |                 sinusoidal_embedding_1d(self.freq_dim, t).float())
233 |             e0 = self.time_projection(e).unflatten(1, (6, self.dim))
234 |             assert e.dtype == torch.float32 and e0.dtype == torch.float32
235 | 
236 |         # context
237 |         context_lens = None
238 |         context = self.text_embedding(
239 |             torch.stack([
240 |                 torch.cat(
241 |                     [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
242 |                 for u in context
243 |             ]))
244 | 
245 |         # if clip_fea is not None:
246 |         #     context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
247 |         #     context = torch.concat([context_clip, context], dim=1)
248 | 
249 |         # arguments
250 |         kwargs = dict(
251 |             e=e0,
252 |             seq_lens=seq_lens,
253 |             grid_sizes=grid_sizes,
254 |             freqs=self.freqs,
255 |             context=context,
256 |             context_lens=context_lens)
257 | 
258 |         hints = self.forward_vace(x, vace_context, seq_len, kwargs)
259 |         kwargs['hints'] = hints
260 |         kwargs['context_scale'] = vace_context_scale
261 | 
262 |         for block in self.blocks:
263 |             if self.training and self.gradient_checkpointing:
264 | 
265 |                 def create_custom_forward(module):
266 |                     def custom_forward(*inputs, **kwargs):
267 |                         return module(*inputs, **kwargs)
268 | 
269 |                     return custom_forward
270 |                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
271 |                 kwargs.update(ckpt_kwargs)
272 |                 x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, **kwargs)
273 |             else:
274 |                 x = block(x, **kwargs)
275 | 
276 |         # head
277 |         x = self.head(x, e)
278 | 
279 |         # unpatchify
280 |         x = self.unpatchify(x, grid_sizes)
281 |         x = torch.stack(x)
282 |         return x
283 | 
284 |     @classmethod
285 |     def from_pretrained(cls, pretrained_model_name_or_path, model_additional_kwargs={}, **kwargs):
286 |         cache_dir = kwargs.pop("cache_dir", None)
287 |         force_download = kwargs.pop("force_download", False)
288 |         proxies = kwargs.pop("proxies", None)
289 |         local_files_only = kwargs.pop("local_files_only", None)
290 |         token = kwargs.pop("token", None)
291 |         revision = kwargs.pop("revision", None)
292 |         subfolder = kwargs.pop("subfolder", None)
293 |         variant = kwargs.pop("variant", None)
294 |         use_safetensors = kwargs.pop("use_safetensors", None)
295 | 
296 |         allow_pickle = False
297 |         if use_safetensors is None:
298 |             use_safetensors = True
299 |             allow_pickle = True
300 | 
301 |         # Load config if we don't provide a configuration
302 |         config_path = pretrained_model_name_or_path
303 | 
304 |         user_agent = {
305 |             "diffusers": __version__,
306 |             "file_type": "model",
307 |             "framework": "pytorch",
308 |         }
309 | 
310 |         # load config
311 |         config, unused_kwargs, commit_hash = cls.load_config(
312 |             config_path,
313 |             cache_dir=cache_dir,
314 |             return_unused_kwargs=True,
315 |             return_commit_hash=True,
316 |             force_download=force_download,
317 |             proxies=proxies,
318 |             local_files_only=local_files_only,
319 |             token=token,
320 |             revision=revision,
321 |             subfolder=subfolder,
322 |             user_agent=user_agent,
323 |             **kwargs,
324 |         )
325 | 
326 |         for key, value in model_additional_kwargs.items():
327 |             if isinstance(value, (ListConfig, DictConfig)):
328 |                 config[key] = OmegaConf.to_container(value, resolve=True)
329 |             else:
330 |                 config[key] = value
331 | 
332 |         # load model
333 |         model_file = None
334 |         if use_safetensors:
335 |             try:
336 |                 model_file = _get_model_file(
337 |                     pretrained_model_name_or_path,
338 |                     weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
339 |                     cache_dir=cache_dir,
340 |                     force_download=force_download,
341 |                     proxies=proxies,
342 |                     local_files_only=local_files_only,
343 |                     token=token,
344 |                     revision=revision,
345 |                     subfolder=subfolder,
346 |                     user_agent=user_agent,
347 |                     commit_hash=commit_hash,
348 |                 )
349 | 
350 |             except IOError as e:
351 |                 logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
352 |                 if not allow_pickle:
353 |                     raise
354 |                 logger.warning(
355 |                     "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
356 |                 )
357 | 
358 |         if model_file is None:
359 |             model_file = _get_model_file(
360 |                 pretrained_model_name_or_path,
361 |                 weights_name=_add_variant(WEIGHTS_NAME, variant),
362 |                 cache_dir=cache_dir,
363 |                 force_download=force_download,
364 |                 proxies=proxies,
365 |                 local_files_only=local_files_only,
366 |                 token=token,
367 |                 revision=revision,
368 |                 subfolder=subfolder,
369 |                 user_agent=user_agent,
370 |                 commit_hash=commit_hash,
371 |             )
372 | 
373 |         model = cls.from_config(config, **unused_kwargs)
374 |         state_dict = load_state_dict(model_file, variant)
375 | 
376 |         if state_dict['vace_patch_embedding.weight'].shape[1] != model.vace_patch_embedding.weight.shape[1]:
377 |             state_dict.pop('vace_patch_embedding.weight')
378 | 
379 |         missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
380 |         print(f"VaceWanModel loaded from {model_file} with {len(missing_keys)} missing keys and {len(unexpected_keys)} unexpected keys.")
381 |         return model


--------------------------------------------------------------------------------
/DiT/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import cv2
  3 | import torch
  4 | import os
  5 | from einops import rearrange
  6 | import imageio
  7 | import torchvision
  8 | 
  9 | 
 10 | def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
 11 |     videos = rearrange(videos, "b c t h w -> t b c h w")
 12 |     outputs = []
 13 |     for x in videos:
 14 |         x = torchvision.utils.make_grid(x, nrow=n_rows)
 15 |         x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
 16 |         if rescale:
 17 |             x = (x + 1.0) / 2.0  # -1,1 -> 0,1
 18 |         x = (x * 255).numpy().astype(np.uint8)
 19 |         outputs.append(x)
 20 | 
 21 |     os.makedirs(os.path.dirname(path), exist_ok=True)
 22 |     imageio.mimsave(path, outputs, fps=fps)
 23 | 
 24 | 
 25 | def save_videos_with_traj(videos: torch.Tensor, trajectory: torch.Tensor, path: str, rescale=False, fps=8, line_width=7, circle_radius=10):
 26 |     # videos: [C, F, H, W]
 27 |     # trajectory: [F, N, 2]
 28 |     os.makedirs(os.path.dirname(path), exist_ok=True)
 29 |     videos = rearrange(videos, "c f h w -> f h w c")
 30 |     if rescale:
 31 |         videos = (videos + 1) / 2
 32 |     videos = (videos * 255).numpy().astype(np.uint8)
 33 |     outputs = []
 34 |     for frame_idx, img in enumerate(videos):
 35 |         # img: [H, W, C], traj: [N, 2]
 36 |         # draw trajectory use cv2.line
 37 |         img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
 38 |         for traj_idx in range(trajectory.shape[1]):
 39 |             for history_idx in range(frame_idx):
 40 |                 cv2.line(img, tuple(trajectory[history_idx, traj_idx].int().tolist()), tuple(trajectory[history_idx+1, traj_idx].int().tolist()), (0, 0, 255), line_width)
 41 |             cv2.circle(img, tuple(trajectory[frame_idx, traj_idx].int().tolist()), circle_radius, (100, 230, 160), -1)
 42 |         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 43 |         outputs.append(img)
 44 |     imageio.mimsave(path, outputs, fps=fps)
 45 | 
 46 | 
 47 | def generate_gaussian_template(imgSize=200):
 48 |     """ Adapted from DragAnything: https://github.com/showlab/DragAnything/blob/79355363218a7eb9b3437a31b8604b6d436d9337/dataset/dataset.py#L110"""
 49 |     circle_img = np.zeros((imgSize, imgSize), np.float32)
 50 |     circle_mask = cv2.circle(circle_img, (imgSize//2, imgSize//2), imgSize//2, 1, -1)
 51 | 
 52 |     isotropicGrayscaleImage = np.zeros((imgSize, imgSize), np.float32)
 53 | 
 54 |     # Guass Map
 55 |     for i in range(imgSize):
 56 |         for j in range(imgSize):
 57 |             isotropicGrayscaleImage[i, j] = 1 / 2 / np.pi / (40 ** 2) * np.exp(
 58 |                 -1 / 2 * ((i - imgSize / 2) ** 2 / (40 ** 2) + (j - imgSize / 2) ** 2 / (40 ** 2)))
 59 | 
 60 |     isotropicGrayscaleImage = isotropicGrayscaleImage * circle_mask
 61 |     isotropicGrayscaleImage = (isotropicGrayscaleImage / np.max(isotropicGrayscaleImage)).astype(np.float32)
 62 |     isotropicGrayscaleImage = (isotropicGrayscaleImage / np.max(isotropicGrayscaleImage)*255).astype(np.uint8)
 63 | 
 64 |     # isotropicGrayscaleImage = cv2.resize(isotropicGrayscaleImage, (40, 40))
 65 |     return isotropicGrayscaleImage
 66 | 
 67 | 
def generate_gaussian_heatmap(tracks, width, height, layer_index, layer_capacity, side=20, offset=True):
    """Rasterise point tracks into per-frame, per-layer Gaussian heatmaps.

    Args:
        tracks: Array or tensor indexed as ``tracks[frame, point] -> (x, y)``;
            a torch tensor is moved to CPU and converted to NumPy.
        width: Width of the output maps in pixels.
        height: Height of the output maps in pixels.
        layer_index: Indexable by point index; gives the layer each point is
            drawn into.
        layer_capacity: Number of layer slots allocated per frame.
        side: Half-size (in pixels) of the square patch stamped around each
            point, and half-size of the offset smoothing kernel.
        offset: If true, each map has 3 channels (heatmap, dx to next frame,
            dy to next frame); otherwise the single heatmap channel is
            expanded to 3 channels with ``cv2.COLOR_GRAY2RGB``.

    Returns:
        Float tensor laid out as [F, N_layer, C, H, W].
    """
    # uint8 template with values in 0..255 (see generate_gaussian_template)
    heatmap_template = generate_gaussian_template()
    num_frames, num_points = tracks.shape[:2]
    if isinstance(tracks, torch.Tensor):
        tracks = tracks.cpu().numpy()
    if offset:
        # Smoothing kernel for the offset channels, finally normalised so its
        # centre weight is exactly 1.
        offset_kernel = cv2.resize(heatmap_template / 255, (2 * side + 1, 2 * side + 1))
        # NOTE(review): this sum-normalisation is superseded by the
        # centre-normalisation on the next line.
        offset_kernel /= np.sum(offset_kernel)
        offset_kernel /= offset_kernel[side, side]
    heatmaps = []
    for frame_idx in range(num_frames):
        if offset:
            layer_imgs = np.zeros((layer_capacity, height, width, 3), dtype=np.float32)
        else:
            layer_imgs = np.zeros((layer_capacity, height, width, 1), dtype=np.float32)
        layer_heatmaps = []
        for point_idx in range(num_points):
            x, y = tracks[frame_idx, point_idx]
            layer_id = layer_index[point_idx]
            # Points outside the canvas are skipped for this frame.
            if x < 0 or y < 0 or x >= width or y >= height:
                continue
            # Patch bounds, clamped to the canvas.
            x1 = int(max(x - side, 0))
            x2 = int(min(x + side, width - 1))
            y1 = int(max(y - side, 0))
            y2 = int(min(y + side, height - 1))
            if (x2 - x1) < 1 or (y2 - y1) < 1:
                continue
            # Resize the template to the (possibly clipped) patch and merge it
            # with what is already there by element-wise maximum.
            temp_map = cv2.resize(heatmap_template, (x2-x1, y2-y1))
            layer_imgs[layer_id, y1:y2,x1:x2, 0] = np.maximum(layer_imgs[layer_id, y1:y2,x1:x2, 0], temp_map)
            if offset:
                # Displacement to the same point in the next frame; zero for
                # the last frame.
                if frame_idx < (num_frames - 1):
                    next_x, next_y = tracks[frame_idx + 1, point_idx]
                else:
                    next_x, next_y = x, y
                layer_imgs[layer_id, int(y), int(x), 1] = next_x - x
                layer_imgs[layer_id, int(y), int(x), 2] = next_y - y
        for img in layer_imgs:
            if offset:
                # Spread each single-pixel displacement over its neighbourhood
                # (written back in place into layer_imgs through the view).
                img[:, :, 1:] = cv2.filter2D(img[:, :, 1:], -1, offset_kernel)
            else:
                img = cv2.cvtColor(img[:, :, 0].astype(np.uint8), cv2.COLOR_GRAY2RGB)
            layer_heatmaps.append(img)
        heatmaps.append(np.stack(layer_heatmaps, axis=0))
    heatmaps = np.stack(heatmaps, axis=0)
    return torch.from_numpy(heatmaps).permute(0, 1, 4, 2, 3).contiguous().float()   # [F, N_layer, C, H, W]
113 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # LayerAnimate: Layer-level Control for Animation
  2 | 
  3 | [Yuxue Yang](https://yuxueyang1204.github.io/)<sup>1,2</sup>, [Lue Fan](https://lue.fan/)<sup>2</sup>, [Zuzeng Lin](https://www.researchgate.net/scientific-contributions/Zuzeng-Lin-2192777418)<sup>3</sup>, [Feng Wang](https://happynear.wang/)<sup>4</sup>, [Zhaoxiang Zhang](https://zhaoxiangzhang.net)<sup>1,2†</sup>
  4 | 
  5 | <sup>1</sup>UCAS&emsp; <sup>2</sup>CASIA&emsp; <sup>3</sup>TJU&emsp; <sup>4</sup>CreateAI&emsp; <sup>†</sup>Corresponding author
  6 | 
  7 | <a href='https://arxiv.org/abs/2501.08295'><img src='https://img.shields.io/badge/arXiv-2501.08295-b31b1b.svg'></a> &nbsp;
  8 | <a href='https://layeranimate.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp;
  9 | <a href='https://www.bilibili.com/video/BV1EycqeaEqF/'><img src='https://img.shields.io/badge/BiliBili-Video-479fd1.svg'></a> &nbsp;
 10 | <a href='https://youtu.be/b_bvVKigky4'><img src='https://img.shields.io/badge/Youtube-Video-b31b1b.svg'></a> &nbsp;
 11 | <a href='https://huggingface.co/spaces/IamCreateAI/LayerAnimate'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face%20-Demo-blue'></a><br>
 12 | 
 13 | Official implementation of **LayerAnimate: Layer-level Control for Animation**, ICCV 2025
 14 | 
 15 | <div align="center"> <img src='__assets__/figs/demos.gif'></img></div>
 16 | 
 17 | **Videos on the [project website](https://layeranimate.github.io) vividly introduce our work and present qualitative results for an enhanced viewing experience.**
 18 | 
 19 | ## Updates
 20 | 
 21 | - [25-08-22] Release the [Layer Curation Pipeline](https://github.com/YuxueYang1204/Layer-Curation-Pipeline), including the demo and comprehensive usage guidance.
 22 | - [25-06-26] Our work is accepted by ICCV 2025! 🎉
 23 | - [25-05-29] We have extended LayerAnimate to the DiT ([Wan2.1 1.3B](https://github.com/Wan-Video/Wan2.1)) variant, enabling the generation of 81 frames at 480 × 832 resolution. It performs surprisingly well in the [Real-World Domain](https://layeranimate.github.io/#real_world), as shown on the project website.
 24 | - [25-03-31] Release the online demo on [Hugging Face](https://huggingface.co/spaces/IamCreateAI/LayerAnimate).
 25 | - [25-03-30] Release a gradio script [app.py](scripts/app.py) to run the demo locally. Please raise an issue if you encounter any problems.
 26 | - [25-03-22] Release the checkpoint and the inference script. **We update the layer curation pipeline and support trajectory control for a flexible composition of various layer-level controls.**
 27 | - [25-01-15] Release the project page and the arXiv preprint.
 28 | 
 29 | ## Layer curation pipeline
 30 | 
 31 | We have released [a comprehensive pipeline](https://github.com/YuxueYang1204/Layer-Curation-Pipeline) for extracting motion-based layers from video sequences. The layer curation pipeline automatically decomposes videos into different layers based on motion patterns, where you can control the number of extracted layers by adjusting the layer capacity parameter to obtain varying levels of motion granularity.
 32 | 
 33 | More details can be found in the [repo](https://github.com/YuxueYang1204/Layer-Curation-Pipeline).
 34 | 
 35 | | Input Videos | Layer Results |
 36 | |:--:|:--:|
 37 | | <video src="https://github.com/user-attachments/assets/d8e28bc4-541f-4b9f-bb95-379ebc83aa89"> | <video src="https://github.com/user-attachments/assets/464c8b53-71d6-4a68-80b4-e4b9b81bdf8a"> |
 38 | | <video src="https://github.com/user-attachments/assets/89360155-622c-40cd-8619-07c14935e3d7"> | <video src="https://github.com/user-attachments/assets/39304b8d-a794-43d0-a271-2682958b9e82"> |
 39 | | <video src="https://github.com/user-attachments/assets/da338346-5e6f-4cc8-8822-b76e715146e7"> | <video src="https://github.com/user-attachments/assets/b951f04c-3ce9-4eab-8aa9-a3eac625371e"> |
 40 | | <video src="https://github.com/user-attachments/assets/4507d1fa-136f-4298-a58f-90082ebcaf4c"> | <video src="https://github.com/user-attachments/assets/575d7c2e-92d3-440d-93ee-fbcaba55aac9"> |
 41 | 
 42 | ## Installation
 43 | 
 44 | ```bash
 45 | git clone git@github.com:IamCreateAI/LayerAnimate.git
 46 | conda create -n layeranimate python=3.10 -y
 47 | conda activate layeranimate
 48 | pip install -r requirements.txt
 49 | pip install wan@git+https://github.com/Wan-Video/Wan2.1  # If you want to use DiT variant.
 50 | ```
 51 | 
 52 | ## Models
 53 | 
 54 | | Models                   | Download Link                                                                                                                                           | Video Size        |
 55 | |--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|
 56 | | UNet variant | [Huggingface](https://huggingface.co/Yuppie1204/LayerAnimate-Mix) 🤗  | 16 x 320 x 512  |
 57 | | DiT variant | [Huggingface](https://huggingface.co/Yuppie1204/LayerAnimate-DiT) 🤗  | 81 x 480 x 832  |
 58 | 
 59 | Download the pretrained weights and put them in `checkpoints/` directory as follows:
 60 | 
 61 | ```bash
 62 | checkpoints/
 63 | ├─ LayerAnimate-Mix (UNet variant)
 64 | └─ LayerAnimate-DiT
 65 | ```
 66 | 
 67 | ## Inference script
 68 | 
 69 | ### UNet variant (Paper version)
 70 | 
 71 | Run the following command to generate a video from input images:
 72 | 
 73 | ```bash
 74 | python scripts/animate_Layer.py --config scripts/demo1.yaml --savedir outputs/sample1
 75 | 
 76 | python scripts/animate_Layer.py --config scripts/demo2.yaml --savedir outputs/sample2
 77 | 
 78 | python scripts/animate_Layer.py --config scripts/demo3.yaml --savedir outputs/sample3
 79 | 
 80 | python scripts/animate_Layer.py --config scripts/demo4.yaml --savedir outputs/sample4
 81 | 
 82 | python scripts/animate_Layer.py --config scripts/demo5.yaml --savedir outputs/sample5
 83 | ```
 84 | 
 85 | Note that the layer-level controls are prepared in `__assets__/demos`.
 86 | 
 87 | #### Run demo locally
 88 | 
 89 | You can run the demo locally by executing the following command:
 90 | 
 91 | ```bash
 92 | python scripts/app.py --savedir outputs/gradio
 93 | ```
 94 | 
 95 | Then, open the link in your browser to access the demo interface. The output video and the video with trajectory will be saved in the `outputs/gradio` directory.
 96 | 
 97 | ### DiT variant (Wan2.1 1.3B)
 98 | 
 99 | Run the following command to generate a video from input images:
100 | 
101 | ```bash
102 | python scripts/infer_DiT.py --config __assets__/demos/realworld/config.yaml --savedir outputs/realworld
103 | ```
104 | 
105 | We take the `config.yaml` in `__assets__/demos/realworld/` as an example. You can also modify the config file to suit your needs.
106 | 
107 | ## Todo
108 | 
109 | - [x] Release the code and checkpoint of LayerAnimate.
110 | - [x] Upload a gradio script to run the demo locally.
111 | - [x] Create an online demo on Hugging Face Spaces.
112 | - [x] DiT-based LayerAnimate.
113 | - [x] Release layer curation pipeline.
114 | - [ ] Training script for LayerAnimate.
115 | 
116 | ## Acknowledgements
117 | 
118 | We sincerely thank the authors of [ToonCrafter](https://doubiiu.github.io/projects/ToonCrafter/), [LVCD](https://luckyhzt.github.io/lvcd), [AniDoc](https://yihao-meng.github.io/AniDoc_demo/), and [Wan-Video](https://github.com/Wan-Video/Wan2.1) for their inspiring work and contributions to the AIGC community.
119 | 
120 | ## Citation
121 | 
122 | Please consider citing our work as follows if it is helpful.
123 | ```bib
124 | @article{yang2025layeranimate,
125 |   author    = {Yang, Yuxue and Fan, Lue and Lin, Zuzeng and Wang, Feng and Zhang, Zhaoxiang},
126 |   title     = {LayerAnimate: Layer-level Control for Animation},
127 |   journal   = {arXiv preprint arXiv:2501.08295},
128 |   year      = {2025},
129 | }
130 | ```
131 | 


--------------------------------------------------------------------------------
/__assets__/demos/demo_1/first_frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_1/first_frame.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_1/layer_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_1/layer_0.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_1/layer_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_1/layer_1.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_1/layer_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_1/layer_2.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_1/sketch.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_1/sketch.mp4


--------------------------------------------------------------------------------
/__assets__/demos/demo_1/trajectory.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     [
  4 |       111.87965393066406,
  5 |       204.28741455078125
  6 |     ],
  7 |     [
  8 |       83.42483520507812,
  9 |       204.21835327148438
 10 |     ],
 11 |     [
 12 |       52.417137145996094,
 13 |       205.34869384765625
 14 |     ],
 15 |     [
 16 |       -10.01504135131836,
 17 |       205.83694458007812
 18 |     ],
 19 |     [
 20 |       -33.109561920166016,
 21 |       206.53018188476562
 22 |     ],
 23 |     [
 24 |       -86.02885437011719,
 25 |       205.10772705078125
 26 |     ],
 27 |     [
 28 |       -119.59435272216797,
 29 |       204.4576873779297
 30 |     ],
 31 |     [
 32 |       -168.70248413085938,
 33 |       210.6188201904297
 34 |     ],
 35 |     [
 36 |       -185.9542999267578,
 37 |       211.16294860839844
 38 |     ],
 39 |     [
 40 |       -206.82852172851562,
 41 |       207.50912475585938
 42 |     ],
 43 |     [
 44 |       -232.2637939453125,
 45 |       208.35643005371094
 46 |     ],
 47 |     [
 48 |       -177.6964111328125,
 49 |       205.50949096679688
 50 |     ],
 51 |     [
 52 |       -231.19761657714844,
 53 |       203.8624267578125
 54 |     ],
 55 |     [
 56 |       -276.06622314453125,
 57 |       208.6024169921875
 58 |     ],
 59 |     [
 60 |       -285.68218994140625,
 61 |       210.30313110351562
 62 |     ],
 63 |     [
 64 |       -235.0211639404297,
 65 |       207.910400390625
 66 |     ]
 67 |   ],
 68 |   [
 69 |     [
 70 |       130.59063720703125,
 71 |       131.48106384277344
 72 |     ],
 73 |     [
 74 |       101.31892395019531,
 75 |       131.62567138671875
 76 |     ],
 77 |     [
 78 |       69.3387451171875,
 79 |       132.40696716308594
 80 |     ],
 81 |     [
 82 |       6.821704864501953,
 83 |       133.10546875
 84 |     ],
 85 |     [
 86 |       -21.6120548248291,
 87 |       132.92977905273438
 88 |     ],
 89 |     [
 90 |       -83.36480712890625,
 91 |       132.2947998046875
 92 |     ],
 93 |     [
 94 |       -111.29481506347656,
 95 |       131.91827392578125
 96 |     ],
 97 |     [
 98 |       -168.74850463867188,
 99 |       138.11587524414062
100 |     ],
101 |     [
102 |       -198.75299072265625,
103 |       139.32774353027344
104 |     ],
105 |     [
106 |       -253.08055114746094,
107 |       136.65480041503906
108 |     ],
109 |     [
110 |       -278.3507080078125,
111 |       136.42958068847656
112 |     ],
113 |     [
114 |       -312.9150390625,
115 |       134.22898864746094
116 |     ],
117 |     [
118 |       -332.20989990234375,
119 |       133.93161010742188
120 |     ],
121 |     [
122 |       -357.1211853027344,
123 |       139.33224487304688
124 |     ],
125 |     [
126 |       -361.4031677246094,
127 |       139.66172790527344
128 |     ],
129 |     [
130 |       -338.45501708984375,
131 |       141.38809204101562
132 |     ]
133 |   ],
134 |   [
135 |     [
136 |       308.344970703125,
137 |       6.6701483726501465
138 |     ],
139 |     [
140 |       278.66864013671875,
141 |       7.116205215454102
142 |     ],
143 |     [
144 |       247.65390014648438,
145 |       7.756659507751465
146 |     ],
147 |     [
148 |       184.76953125,
149 |       8.749884605407715
150 |     ],
151 |     [
152 |       154.9658203125,
153 |       8.66163444519043
154 |     ],
155 |     [
156 |       92.775146484375,
157 |       7.572597503662109
158 |     ],
159 |     [
160 |       63.20433044433594,
161 |       7.524573802947998
162 |     ],
163 |     [
164 |       1.4797935485839844,
165 |       13.07353401184082
166 |     ],
167 |     [
168 |       -26.288057327270508,
169 |       13.74260139465332
170 |     ],
171 |     [
172 |       -83.00379943847656,
173 |       11.522849082946777
174 |     ],
175 |     [
176 |       -109.52509307861328,
177 |       10.739717483520508
178 |     ],
179 |     [
180 |       -140.5462646484375,
181 |       8.596296310424805
182 |     ],
183 |     [
184 |       -155.35394287109375,
185 |       8.009984970092773
186 |     ],
187 |     [
188 |       -180.55775451660156,
189 |       13.584362030029297
190 |     ],
191 |     [
192 |       -185.0371856689453,
193 |       14.09956169128418
194 |     ],
195 |     [
196 |       -203.57778930664062,
197 |       18.082473754882812
198 |     ]
199 |   ]
200 | ]


--------------------------------------------------------------------------------
/__assets__/demos/demo_1/trajectory.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_1/trajectory.npz


--------------------------------------------------------------------------------
/__assets__/demos/demo_2/first_frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_2/first_frame.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_2/layer_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_2/layer_0.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_2/layer_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_2/layer_1.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_2/layer_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_2/layer_2.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_2/sketch.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_2/sketch.mp4


--------------------------------------------------------------------------------
/__assets__/demos/demo_2/trajectory.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     [
  4 |       158.21946716308594,
  5 |       245.89105224609375
  6 |     ],
  7 |     [
  8 |       148.94857788085938,
  9 |       246.4789276123047
 10 |     ],
 11 |     [
 12 |       137.88522338867188,
 13 |       247.1299285888672
 14 |     ],
 15 |     [
 16 |       128.4403839111328,
 17 |       247.8033905029297
 18 |     ],
 19 |     [
 20 |       127.84039306640625,
 21 |       246.24864196777344
 22 |     ],
 23 |     [
 24 |       127.06155395507812,
 25 |       244.60606384277344
 26 |     ],
 27 |     [
 28 |       126.77435302734375,
 29 |       243.17208862304688
 30 |     ],
 31 |     [
 32 |       126.42509460449219,
 33 |       243.04747009277344
 34 |     ],
 35 |     [
 36 |       125.61285400390625,
 37 |       242.14913940429688
 38 |     ],
 39 |     [
 40 |       125.40904235839844,
 41 |       242.65948486328125
 42 |     ],
 43 |     [
 44 |       125.03759765625,
 45 |       242.90908813476562
 46 |     ],
 47 |     [
 48 |       124.67877197265625,
 49 |       242.95994567871094
 50 |     ],
 51 |     [
 52 |       125.00759887695312,
 53 |       242.61265563964844
 54 |     ],
 55 |     [
 56 |       125.37916564941406,
 57 |       242.13555908203125
 58 |     ],
 59 |     [
 60 |       125.7420654296875,
 61 |       242.410888671875
 62 |     ],
 63 |     [
 64 |       125.54336547851562,
 65 |       242.98825073242188
 66 |     ]
 67 |   ],
 68 |   [
 69 |     [
 70 |       223.55435180664062,
 71 |       204.28741455078125
 72 |     ],
 73 |     [
 74 |       207.83377075195312,
 75 |       202.7445068359375
 76 |     ],
 77 |     [
 78 |       193.4696044921875,
 79 |       200.418701171875
 80 |     ],
 81 |     [
 82 |       178.7669677734375,
 83 |       199.83621215820312
 84 |     ],
 85 |     [
 86 |       178.14218139648438,
 87 |       200.34848022460938
 88 |     ],
 89 |     [
 90 |       176.58251953125,
 91 |       200.19627380371094
 92 |     ],
 93 |     [
 94 |       175.0523681640625,
 95 |       200.24407958984375
 96 |     ],
 97 |     [
 98 |       174.57379150390625,
 99 |       199.90940856933594
100 |     ],
101 |     [
102 |       173.37542724609375,
103 |       200.4640350341797
104 |     ],
105 |     [
106 |       173.5262451171875,
107 |       200.5198974609375
108 |     ],
109 |     [
110 |       173.60935974121094,
111 |       200.36471557617188
112 |     ],
113 |     [
114 |       173.8643035888672,
115 |       200.39389038085938
116 |     ],
117 |     [
118 |       173.903076171875,
119 |       200.2958984375
120 |     ],
121 |     [
122 |       173.96859741210938,
123 |       200.00491333007812
124 |     ],
125 |     [
126 |       174.22422790527344,
127 |       200.09921264648438
128 |     ],
129 |     [
130 |       174.16683959960938,
131 |       200.00193786621094
132 |     ]
133 |   ],
134 |   [
135 |     [
136 |       232.88790893554688,
137 |       261.492431640625
138 |     ],
139 |     [
140 |       224.37376403808594,
141 |       258.9049072265625
142 |     ],
143 |     [
144 |       214.7504119873047,
145 |       255.82171630859375
146 |     ],
147 |     [
148 |       205.59695434570312,
149 |       252.74368286132812
150 |     ],
151 |     [
152 |       203.56024169921875,
153 |       254.83567810058594
154 |     ],
155 |     [
156 |       200.3128662109375,
157 |       256.933349609375
158 |     ],
159 |     [
160 |       197.56045532226562,
161 |       258.17236328125
162 |     ],
163 |     [
164 |       196.72007751464844,
165 |       258.3282470703125
166 |     ],
167 |     [
168 |       194.2041473388672,
169 |       259.42486572265625
170 |     ],
171 |     [
172 |       194.23858642578125,
173 |       259.9649353027344
174 |     ],
175 |     [
176 |       194.01547241210938,
177 |       260.14569091796875
178 |     ],
179 |     [
180 |       193.87156677246094,
181 |       259.9699401855469
182 |     ],
183 |     [
184 |       193.9617919921875,
185 |       259.7339172363281
186 |     ],
187 |     [
188 |       193.89659118652344,
189 |       259.5014343261719
190 |     ],
191 |     [
192 |       193.8680419921875,
193 |       259.7557373046875
194 |     ],
195 |     [
196 |       193.91842651367188,
197 |       260.28717041015625
198 |     ]
199 |   ]
200 | ]


--------------------------------------------------------------------------------
/__assets__/demos/demo_2/trajectory.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_2/trajectory.npz


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/first_frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/first_frame.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/last_frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/last_frame.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_0.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_0_last.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_0_last.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_1.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_1_last.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_1_last.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_2.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_2_last.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_2_last.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_3.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/layer_3_last.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/layer_3_last.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/sketch.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/sketch.mp4


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/trajectory.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     [
  4 |       49.66927719116211,
  5 |       126.28060150146484
  6 |     ],
  7 |     [
  8 |       53.070796966552734,
  9 |       140.00479125976562
 10 |     ],
 11 |     [
 12 |       58.86982345581055,
 13 |       157.8321533203125
 14 |     ],
 15 |     [
 16 |       69.01676177978516,
 17 |       175.84800720214844
 18 |     ],
 19 |     [
 20 |       76.01651000976562,
 21 |       197.62847900390625
 22 |     ],
 23 |     [
 24 |       93.34223937988281,
 25 |       232.17538452148438
 26 |     ],
 27 |     [
 28 |       96.88280487060547,
 29 |       246.68162536621094
 30 |     ],
 31 |     [
 32 |       105.09373474121094,
 33 |       265.91741943359375
 34 |     ],
 35 |     [
 36 |       122.41947174072266,
 37 |       300.46429443359375
 38 |     ],
 39 |     [
 40 |       139.74520874023438,
 41 |       335.0111999511719
 42 |     ],
 43 |     [
 44 |       157.07093811035156,
 45 |       369.55810546875
 46 |     ],
 47 |     [
 48 |       174.39666748046875,
 49 |       404.10498046875
 50 |     ],
 51 |     [
 52 |       191.722412109375,
 53 |       438.65185546875
 54 |     ],
 55 |     [
 56 |       209.0481414794922,
 57 |       473.19873046875
 58 |     ],
 59 |     [
 60 |       226.37387084960938,
 61 |       507.74560546875
 62 |     ],
 63 |     [
 64 |       243.6995849609375,
 65 |       542.29248046875
 66 |     ]
 67 |   ],
 68 |   [
 69 |     [
 70 |       56.677669525146484,
 71 |       69.07560729980469
 72 |     ],
 73 |     [
 74 |       66.92218780517578,
 75 |       90.37911224365234
 76 |     ],
 77 |     [
 78 |       79.62323760986328,
 79 |       116.14250183105469
 80 |     ],
 81 |     [
 82 |       91.2628173828125,
 83 |       141.8087921142578
 84 |     ],
 85 |     [
 86 |       103.7956771850586,
 87 |       167.58724975585938
 88 |     ],
 89 |     [
 90 |       117.59683227539062,
 91 |       195.22598266601562
 92 |     ],
 93 |     [
 94 |       127.79037475585938,
 95 |       221.12567138671875
 96 |     ],
 97 |     [
 98 |       140.4638671875,
 99 |       248.97164916992188
100 |     ],
101 |     [
102 |       138.9651641845703,
103 |       256.9488830566406
104 |     ],
105 |     [
106 |       165.24566650390625,
107 |       296.32525634765625
108 |     ],
109 |     [
110 |       191.52615356445312,
111 |       335.70166015625
112 |     ],
113 |     [
114 |       217.806640625,
115 |       375.07806396484375
116 |     ],
117 |     [
118 |       244.08714294433594,
119 |       414.4544372558594
120 |     ],
121 |     [
122 |       270.3676452636719,
123 |       453.830810546875
124 |     ],
125 |     [
126 |       296.64813232421875,
127 |       493.20721435546875
128 |     ],
129 |     [
130 |       322.92864990234375,
131 |       532.5836181640625
132 |     ]
133 |   ]
134 | ]


--------------------------------------------------------------------------------
/__assets__/demos/demo_3/trajectory.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_3/trajectory.npz


--------------------------------------------------------------------------------
/__assets__/demos/demo_4/first_frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_4/first_frame.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_4/layer_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_4/layer_0.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_4/layer_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_4/layer_1.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_4/layer_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_4/layer_2.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_4/sketch.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_4/sketch.mp4


--------------------------------------------------------------------------------
/__assets__/demos/demo_4/trajectory.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     [
  4 |       186.72357177734375,
  5 |       225.0892333984375
  6 |     ],
  7 |     [
  8 |       186.59104919433594,
  9 |       220.61599731445312
 10 |     ],
 11 |     [
 12 |       190.39842224121094,
 13 |       216.0291748046875
 14 |     ],
 15 |     [
 16 |       199.52769470214844,
 17 |       213.26031494140625
 18 |     ],
 19 |     [
 20 |       204.145263671875,
 21 |       214.56866455078125
 22 |     ],
 23 |     [
 24 |       209.41751098632812,
 25 |       214.23330688476562
 26 |     ],
 27 |     [
 28 |       211.30255126953125,
 29 |       216.12774658203125
 30 |     ],
 31 |     [
 32 |       215.53131103515625,
 33 |       215.55880737304688
 34 |     ],
 35 |     [
 36 |       211.28453063964844,
 37 |       215.3497314453125
 38 |     ],
 39 |     [
 40 |       205.66819763183594,
 41 |       210.34344482421875
 42 |     ],
 43 |     [
 44 |       208.09231567382812,
 45 |       197.720458984375
 46 |     ],
 47 |     [
 48 |       201.51205444335938,
 49 |       215.72598266601562
 50 |     ],
 51 |     [
 52 |       191.19480895996094,
 53 |       223.12850952148438
 54 |     ],
 55 |     [
 56 |       194.90512084960938,
 57 |       222.38108825683594
 58 |     ],
 59 |     [
 60 |       200.74607849121094,
 61 |       217.3187713623047
 62 |     ],
 63 |     [
 64 |       207.563720703125,
 65 |       235.63250732421875
 66 |     ]
 67 |   ],
 68 |   [
 69 |     [
 70 |       289.63397216796875,
 71 |       230.28970336914062
 72 |     ],
 73 |     [
 74 |       289.8543701171875,
 75 |       227.20205688476562
 76 |     ],
 77 |     [
 78 |       292.2384033203125,
 79 |       223.03854370117188
 80 |     ],
 81 |     [
 82 |       301.47711181640625,
 83 |       219.50289916992188
 84 |     ],
 85 |     [
 86 |       308.8260803222656,
 87 |       220.3004608154297
 88 |     ],
 89 |     [
 90 |       315.6751403808594,
 91 |       219.62095642089844
 92 |     ],
 93 |     [
 94 |       317.8089599609375,
 95 |       221.09295654296875
 96 |     ],
 97 |     [
 98 |       320.73956298828125,
 99 |       221.21011352539062
100 |     ],
101 |     [
102 |       317.1898193359375,
103 |       221.21250915527344
104 |     ],
105 |     [
106 |       319.5433349609375,
107 |       217.74606323242188
108 |     ],
109 |     [
110 |       317.6147155761719,
111 |       207.62603759765625
112 |     ],
113 |     [
114 |       308.29156494140625,
115 |       224.09878540039062
116 |     ],
117 |     [
118 |       294.7052917480469,
119 |       230.4814910888672
120 |     ],
121 |     [
122 |       298.7985534667969,
123 |       230.0016326904297
124 |     ],
125 |     [
126 |       304.0728454589844,
127 |       226.04998779296875
128 |     ],
129 |     [
130 |       314.6731872558594,
131 |       242.630126953125
132 |     ]
133 |   ],
134 |   [
135 |     [
136 |       214.7900390625,
137 |       230.28970336914062
138 |     ],
139 |     [
140 |       214.2034912109375,
141 |       226.12539672851562
142 |     ],
143 |     [
144 |       216.921630859375,
145 |       221.91062927246094
146 |     ],
147 |     [
148 |       226.7117156982422,
149 |       219.55148315429688
150 |     ],
151 |     [
152 |       232.1102294921875,
153 |       220.2542724609375
154 |     ],
155 |     [
156 |       237.49270629882812,
157 |       219.5577850341797
158 |     ],
159 |     [
160 |       240.1033935546875,
161 |       220.77169799804688
162 |     ],
163 |     [
164 |       243.27154541015625,
165 |       220.56069946289062
166 |     ],
167 |     [
168 |       240.3792724609375,
169 |       221.12344360351562
170 |     ],
171 |     [
172 |       235.10897827148438,
173 |       216.4136962890625
174 |     ],
175 |     [
176 |       234.0819091796875,
177 |       202.91900634765625
178 |     ],
179 |     [
180 |       224.08642578125,
181 |       220.4688720703125
182 |     ],
183 |     [
184 |       212.40911865234375,
185 |       227.7927703857422
186 |     ],
187 |     [
188 |       218.22300720214844,
189 |       226.47549438476562
190 |     ],
191 |     [
192 |       225.32315063476562,
193 |       221.8306884765625
194 |     ],
195 |     [
196 |       234.59808349609375,
197 |       239.94235229492188
198 |     ]
199 |   ]
200 | ]


--------------------------------------------------------------------------------
/__assets__/demos/demo_4/trajectory.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_4/trajectory.npz


--------------------------------------------------------------------------------
/__assets__/demos/demo_5/first_frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_5/first_frame.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_5/layer_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_5/layer_0.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_5/layer_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_5/layer_1.jpg


--------------------------------------------------------------------------------
/__assets__/demos/demo_5/sketch.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_5/sketch.mp4


--------------------------------------------------------------------------------
/__assets__/demos/demo_5/trajectory.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   [
  3 |     [
  4 |       494.2274169921875,
  5 |       22.271512985229492
  6 |     ],
  7 |     [
  8 |       499.44189453125,
  9 |       21.746015548706055
 10 |     ],
 11 |     [
 12 |       504.0919189453125,
 13 |       21.225364685058594
 14 |     ],
 15 |     [
 16 |       514.5880737304688,
 17 |       20.82619285583496
 18 |     ],
 19 |     [
 20 |       520.4939575195312,
 21 |       20.672199249267578
 22 |     ],
 23 |     [
 24 |       526.637451171875,
 25 |       20.305557250976562
 26 |     ],
 27 |     [
 28 |       534.9617919921875,
 29 |       20.358591079711914
 30 |     ],
 31 |     [
 32 |       539.2017211914062,
 33 |       20.12591552734375
 34 |     ],
 35 |     [
 36 |       543.9376220703125,
 37 |       20.107173919677734
 38 |     ],
 39 |     [
 40 |       549.5306396484375,
 41 |       19.739456176757812
 42 |     ],
 43 |     [
 44 |       553.4171142578125,
 45 |       20.842308044433594
 46 |     ],
 47 |     [
 48 |       554.49462890625,
 49 |       20.15322494506836
 50 |     ],
 51 |     [
 52 |       559.0555419921875,
 53 |       21.292396545410156
 54 |     ],
 55 |     [
 56 |       558.5130004882812,
 57 |       21.357444763183594
 58 |     ],
 59 |     [
 60 |       561.72607421875,
 61 |       20.114139556884766
 62 |     ],
 63 |     [
 64 |       560.4268798828125,
 65 |       21.73964500427246
 66 |     ]
 67 |   ],
 68 |   [
 69 |     [
 70 |       494.2274169921875,
 71 |       48.27378463745117
 72 |     ],
 73 |     [
 74 |       494.85711669921875,
 75 |       48.05669403076172
 76 |     ],
 77 |     [
 78 |       494.21563720703125,
 79 |       48.0822868347168
 80 |     ],
 81 |     [
 82 |       492.88446044921875,
 83 |       48.20854187011719
 84 |     ],
 85 |     [
 86 |       491.5914306640625,
 87 |       48.36796569824219
 88 |     ],
 89 |     [
 90 |       490.6370849609375,
 91 |       48.649070739746094
 92 |     ],
 93 |     [
 94 |       488.6202392578125,
 95 |       48.874202728271484
 96 |     ],
 97 |     [
 98 |       487.603271484375,
 99 |       49.16374969482422
100 |     ],
101 |     [
102 |       486.469970703125,
103 |       49.414939880371094
104 |     ],
105 |     [
106 |       484.92120361328125,
107 |       49.98759460449219
108 |     ],
109 |     [
110 |       483.7000427246094,
111 |       50.26809310913086
112 |     ],
113 |     [
114 |       482.22125244140625,
115 |       50.42219161987305
116 |     ],
117 |     [
118 |       480.54931640625,
119 |       50.766448974609375
120 |     ],
121 |     [
122 |       479.24481201171875,
123 |       51.03229522705078
124 |     ],
125 |     [
126 |       478.1097106933594,
127 |       51.489837646484375
128 |     ],
129 |     [
130 |       476.470947265625,
131 |       52.048194885253906
132 |     ]
133 |   ],
134 |   [
135 |     [
136 |       64.8839111328125,
137 |       287.4947204589844
138 |     ],
139 |     [
140 |       81.71736145019531,
141 |       288.09869384765625
142 |     ],
143 |     [
144 |       100.02552795410156,
145 |       288.89111328125
146 |     ],
147 |     [
148 |       128.72686767578125,
149 |       289.8943176269531
150 |     ],
151 |     [
152 |       149.62322998046875,
153 |       290.7263488769531
154 |     ],
155 |     [
156 |       170.50192260742188,
157 |       291.29925537109375
158 |     ],
159 |     [
160 |       203.6192626953125,
161 |       292.2691345214844
162 |     ],
163 |     [
164 |       227.08547973632812,
165 |       292.68035888671875
166 |     ],
167 |     [
168 |       250.68621826171875,
169 |       293.3591613769531
170 |     ],
171 |     [
172 |       286.62176513671875,
173 |       294.1515197753906
174 |     ],
175 |     [
176 |       311.21240234375,
177 |       294.3829650878906
178 |     ],
179 |     [
180 |       335.68389892578125,
181 |       294.7114562988281
182 |     ],
183 |     [
184 |       373.18115234375,
185 |       295.2404479980469
186 |     ],
187 |     [
188 |       397.2961120605469,
189 |       295.111572265625
190 |     ],
191 |     [
192 |       422.346923828125,
193 |       295.5068054199219
194 |     ],
195 |     [
196 |       457.2431335449219,
197 |       295.49383544921875
198 |     ]
199 |   ],
200 |   [
201 |     [
202 |       64.8839111328125,
203 |       235.4901580810547
204 |     ],
205 |     [
206 |       61.33024597167969,
207 |       235.5504150390625
208 |     ],
209 |     [
210 |       57.36271667480469,
211 |       235.6099090576172
212 |     ],
213 |     [
214 |       50.592864990234375,
215 |       235.9037322998047
216 |     ],
217 |     [
218 |       46.184783935546875,
219 |       235.94981384277344
220 |     ],
221 |     [
222 |       42.2303466796875,
223 |       235.8488006591797
224 |     ],
225 |     [
226 |       35.333221435546875,
227 |       235.73272705078125
228 |     ],
229 |     [
230 |       29.864356994628906,
231 |       236.13253784179688
232 |     ],
233 |     [
234 |       24.596290588378906,
235 |       236.366943359375
236 |     ],
237 |     [
238 |       17.585124969482422,
239 |       236.61953735351562
240 |     ],
241 |     [
242 |       12.934989929199219,
243 |       236.7737274169922
244 |     ],
245 |     [
246 |       8.478790283203125,
247 |       236.75421142578125
248 |     ],
249 |     [
250 |       2.206012725830078,
251 |       236.9993896484375
252 |     ],
253 |     [
254 |       -2.862123489379883,
255 |       237.2617645263672
256 |     ],
257 |     [
258 |       -7.3507843017578125,
259 |       237.2784423828125
260 |     ],
261 |     [
262 |       -12.782325744628906,
263 |       237.2703094482422
264 |     ]
265 |   ],
266 |   [
267 |     [
268 |       92.88457489013672,
269 |       225.0892333984375
270 |     ],
271 |     [
272 |       88.737548828125,
273 |       225.09442138671875
274 |     ],
275 |     [
276 |       84.08223724365234,
277 |       225.36553955078125
278 |     ],
279 |     [
280 |       76.90846252441406,
281 |       225.7208251953125
282 |     ],
283 |     [
284 |       72.26066589355469,
285 |       225.9451141357422
286 |     ],
287 |     [
288 |       67.7042465209961,
289 |       226.13169860839844
290 |     ],
291 |     [
292 |       60.917144775390625,
293 |       226.32199096679688
294 |     ],
295 |     [
296 |       55.98236083984375,
297 |       226.5792236328125
298 |     ],
299 |     [
300 |       51.30162811279297,
301 |       226.9581298828125
302 |     ],
303 |     [
304 |       44.654823303222656,
305 |       227.06956481933594
306 |     ],
307 |     [
308 |       40.06951904296875,
309 |       227.15420532226562
310 |     ],
311 |     [
312 |       35.59206771850586,
313 |       227.13719177246094
314 |     ],
315 |     [
316 |       29.056011199951172,
317 |       227.17002868652344
318 |     ],
319 |     [
320 |       24.805736541748047,
321 |       227.24826049804688
322 |     ],
323 |     [
324 |       20.537612915039062,
325 |       227.34564208984375
326 |     ],
327 |     [
328 |       14.309333801269531,
329 |       227.30154418945312
330 |     ]
331 |   ]
332 | ]


--------------------------------------------------------------------------------
/__assets__/demos/demo_5/trajectory.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/demo_5/trajectory.npz


--------------------------------------------------------------------------------
/__assets__/demos/realworld/config.yaml:
--------------------------------------------------------------------------------
 1 | prompt: A man skateboarding with a running dog.
 2 | num_inference_steps: 25
 3 | guidance_scale: 6.0
 4 | seed: 289
 5 | first_frame_path: __assets__/demos/realworld/first_frame.jpg
 6 | layer:
 7 |   - mask_path: __assets__/demos/realworld/layer_0.jpg
 8 |     control_type: sketch
 9 |     sketch_path: __assets__/demos/realworld/sketch.mp4
10 |   - mask_path: __assets__/demos/realworld/layer_1.jpg
11 |     control_type: trajectory
12 |     trajectory_path: __assets__/demos/realworld/trajectory_dog.json
13 |   - mask_path: __assets__/demos/realworld/layer_2.jpg
14 |     control_type: trajectory
15 |     trajectory_path: __assets__/demos/realworld/trajectory_bg.json
16 | 


--------------------------------------------------------------------------------
/__assets__/demos/realworld/first_frame.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/realworld/first_frame.jpg


--------------------------------------------------------------------------------
/__assets__/demos/realworld/layer_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/realworld/layer_0.jpg


--------------------------------------------------------------------------------
/__assets__/demos/realworld/layer_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/realworld/layer_1.jpg


--------------------------------------------------------------------------------
/__assets__/demos/realworld/layer_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/realworld/layer_2.jpg


--------------------------------------------------------------------------------
/__assets__/demos/realworld/sketch.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/demos/realworld/sketch.mp4


--------------------------------------------------------------------------------
/__assets__/demos/realworld/trajectory_bg.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     [
  3 |         [
  4 |             100.0,
  5 |             70.0
  6 |         ],
  7 |         [
  8 |             108.19310760498047,
  9 |             70.0
 10 |         ],
 11 |         [
 12 |             116.39154052734375,
 13 |             70.0
 14 |         ],
 15 |         [
 16 |             124.59541320800781,
 17 |             70.0
 18 |         ],
 19 |         [
 20 |             132.80484008789062,
 21 |             70.0
 22 |         ],
 23 |         [
 24 |             141.01991271972656,
 25 |             70.0
 26 |         ],
 27 |         [
 28 |             149.2407684326172,
 29 |             70.0
 30 |         ],
 31 |         [
 32 |             157.46749877929688,
 33 |             70.0
 34 |         ],
 35 |         [
 36 |             165.70022583007812,
 37 |             70.0
 38 |         ],
 39 |         [
 40 |             173.9390411376953,
 41 |             70.0
 42 |         ],
 43 |         [
 44 |             182.18406677246094,
 45 |             70.0
 46 |         ],
 47 |         [
 48 |             190.43540954589844,
 49 |             70.0
 50 |         ],
 51 |         [
 52 |             198.6931915283203,
 53 |             70.0
 54 |         ],
 55 |         [
 56 |             206.95750427246094,
 57 |             70.0
 58 |         ],
 59 |         [
 60 |             215.2284698486328,
 61 |             70.0
 62 |         ],
 63 |         [
 64 |             223.5061798095703,
 65 |             70.0
 66 |         ],
 67 |         [
 68 |             231.790771484375,
 69 |             70.0
 70 |         ],
 71 |         [
 72 |             240.08233642578125,
 73 |             70.0
 74 |         ],
 75 |         [
 76 |             248.38099670410156,
 77 |             70.0
 78 |         ],
 79 |         [
 80 |             256.68682861328125,
 81 |             70.0
 82 |         ],
 83 |         [
 84 |             265.0,
 85 |             70.0
 86 |         ],
 87 |         [
 88 |             273.3267517089844,
 89 |             70.0
 90 |         ],
 91 |         [
 92 |             281.67230224609375,
 93 |             70.0
 94 |         ],
 95 |         [
 96 |             290.0350036621094,
 97 |             70.0
 98 |         ],
 99 |         [
100 |             298.41326904296875,
101 |             70.0
102 |         ],
103 |         [
104 |             306.80548095703125,
105 |             70.0
106 |         ],
107 |         [
108 |             315.2100830078125,
109 |             70.0
110 |         ],
111 |         [
112 |             323.6254577636719,
113 |             70.0
114 |         ],
115 |         [
116 |             332.0499572753906,
117 |             70.0
118 |         ],
119 |         [
120 |             340.4820556640625,
121 |             70.0
122 |         ],
123 |         [
124 |             348.92010498046875,
125 |             70.0
126 |         ],
127 |         [
128 |             357.3625183105469,
129 |             70.0
130 |         ],
131 |         [
132 |             365.80767822265625,
133 |             70.0
134 |         ],
135 |         [
136 |             374.2540283203125,
137 |             70.0
138 |         ],
139 |         [
140 |             382.6999206542969,
141 |             70.0
142 |         ],
143 |         [
144 |             391.1437683105469,
145 |             70.0
146 |         ],
147 |         [
148 |             399.583984375,
149 |             70.0
150 |         ],
151 |         [
152 |             408.0189514160156,
153 |             70.0
154 |         ],
155 |         [
156 |             416.44708251953125,
157 |             70.0
158 |         ],
159 |         [
160 |             424.86676025390625,
161 |             70.0
162 |         ],
163 |         [
164 |             433.2763977050781,
165 |             70.0
166 |         ],
167 |         [
168 |             441.6744079589844,
169 |             70.0
170 |         ],
171 |         [
172 |             450.0591735839844,
173 |             70.0
174 |         ],
175 |         [
176 |             458.4290771484375,
177 |             70.0
178 |         ],
179 |         [
180 |             466.7825622558594,
181 |             70.0
182 |         ],
183 |         [
184 |             475.11798095703125,
185 |             70.0
186 |         ],
187 |         [
188 |             483.43377685546875,
189 |             70.0
190 |         ],
191 |         [
192 |             491.7283020019531,
193 |             70.0
194 |         ],
195 |         [
196 |             500.0,
197 |             70.0
198 |         ],
199 |         [
200 |             508.2557678222656,
201 |             70.00708770751953
202 |         ],
203 |         [
204 |             516.5036010742188,
205 |             70.028076171875
206 |         ],
207 |         [
208 |             524.7433471679688,
209 |             70.06253051757812
210 |         ],
211 |         [
212 |             532.97509765625,
213 |             70.11002349853516
214 |         ],
215 |         [
216 |             541.19873046875,
217 |             70.17013549804688
218 |         ],
219 |         [
220 |             549.4142456054688,
221 |             70.242431640625
222 |         ],
223 |         [
224 |             557.6215209960938,
225 |             70.32648468017578
226 |         ],
227 |         [
228 |             565.820556640625,
229 |             70.421875
230 |         ],
231 |         [
232 |             574.0113525390625,
233 |             70.52816772460938
234 |         ],
235 |         [
236 |             582.1937255859375,
237 |             70.64493560791016
238 |         ],
239 |         [
240 |             590.3677368164062,
241 |             70.77175903320312
242 |         ],
243 |         [
244 |             598.5333251953125,
245 |             70.908203125
246 |         ],
247 |         [
248 |             606.6904296875,
249 |             71.05384063720703
250 |         ],
251 |         [
252 |             614.8389892578125,
253 |             71.208251953125
254 |         ],
255 |         [
256 |             622.97900390625,
257 |             71.37100219726562
258 |         ],
259 |         [
260 |             631.1103515625,
261 |             71.54166412353516
262 |         ],
263 |         [
264 |             639.2330322265625,
265 |             71.71981811523438
266 |         ],
267 |         [
268 |             647.3469848632812,
269 |             71.905029296875
270 |         ],
271 |         [
272 |             655.4522094726562,
273 |             72.09687042236328
274 |         ],
275 |         [
276 |             663.548583984375,
277 |             72.294921875
278 |         ],
279 |         [
280 |             671.6361083984375,
281 |             72.49874877929688
282 |         ],
283 |         [
284 |             679.7146606445312,
285 |             72.70792388916016
286 |         ],
287 |         [
288 |             687.7843017578125,
289 |             72.92202758789062
290 |         ],
291 |         [
292 |             695.844970703125,
293 |             73.140625
294 |         ],
295 |         [
296 |             703.8965454101562,
297 |             73.36328887939453
298 |         ],
299 |         [
300 |             711.9390258789062,
301 |             73.589599609375
302 |         ],
303 |         [
304 |             719.9723510742188,
305 |             73.81912231445312
306 |         ],
307 |         [
308 |             727.9964599609375,
309 |             74.05142974853516
310 |         ],
311 |         [
312 |             736.0113525390625,
313 |             74.28610229492188
314 |         ],
315 |         [
316 |             744.0169067382812,
317 |             74.522705078125
318 |         ],
319 |         [
320 |             752.0131225585938,
321 |             74.76081085205078
322 |         ],
323 |         [
324 |             760.0,
325 |             75.0
326 |         ]
327 |     ],
328 |     [
329 |         [
330 |             115.0,
331 |             190.0
332 |         ],
333 |         [
334 |             123.19310760498047,
335 |             190.0
336 |         ],
337 |         [
338 |             131.39154052734375,
339 |             190.0
340 |         ],
341 |         [
342 |             139.5954132080078,
343 |             190.0
344 |         ],
345 |         [
346 |             147.80484008789062,
347 |             190.0
348 |         ],
349 |         [
350 |             156.01991271972656,
351 |             190.0
352 |         ],
353 |         [
354 |             164.2407684326172,
355 |             190.0
356 |         ],
357 |         [
358 |             172.46749877929688,
359 |             190.0
360 |         ],
361 |         [
362 |             180.70022583007812,
363 |             190.0
364 |         ],
365 |         [
366 |             188.9390411376953,
367 |             190.0
368 |         ],
369 |         [
370 |             197.18406677246094,
371 |             190.0
372 |         ],
373 |         [
374 |             205.43540954589844,
375 |             190.0
376 |         ],
377 |         [
378 |             213.6931915283203,
379 |             190.0
380 |         ],
381 |         [
382 |             221.95750427246094,
383 |             190.0
384 |         ],
385 |         [
386 |             230.2284698486328,
387 |             190.0
388 |         ],
389 |         [
390 |             238.5061798095703,
391 |             190.0
392 |         ],
393 |         [
394 |             246.790771484375,
395 |             190.0
396 |         ],
397 |         [
398 |             255.08233642578125,
399 |             190.0
400 |         ],
401 |         [
402 |             263.3809814453125,
403 |             190.0
404 |         ],
405 |         [
406 |             271.68682861328125,
407 |             190.0
408 |         ],
409 |         [
410 |             280.0,
411 |             190.0
412 |         ],
413 |         [
414 |             288.3267517089844,
415 |             190.0
416 |         ],
417 |         [
418 |             296.67230224609375,
419 |             190.0
420 |         ],
421 |         [
422 |             305.0350036621094,
423 |             190.0
424 |         ],
425 |         [
426 |             313.41326904296875,
427 |             190.0
428 |         ],
429 |         [
430 |             321.80548095703125,
431 |             190.0
432 |         ],
433 |         [
434 |             330.2100830078125,
435 |             190.0
436 |         ],
437 |         [
438 |             338.6254577636719,
439 |             190.0
440 |         ],
441 |         [
442 |             347.0499572753906,
443 |             190.0
444 |         ],
445 |         [
446 |             355.4820556640625,
447 |             190.0
448 |         ],
449 |         [
450 |             363.92010498046875,
451 |             190.0
452 |         ],
453 |         [
454 |             372.3625183105469,
455 |             190.0
456 |         ],
457 |         [
458 |             380.80767822265625,
459 |             190.0
460 |         ],
461 |         [
462 |             389.2540283203125,
463 |             190.0
464 |         ],
465 |         [
466 |             397.6999206542969,
467 |             190.0
468 |         ],
469 |         [
470 |             406.1437683105469,
471 |             190.0
472 |         ],
473 |         [
474 |             414.583984375,
475 |             190.0
476 |         ],
477 |         [
478 |             423.0189514160156,
479 |             190.0
480 |         ],
481 |         [
482 |             431.44708251953125,
483 |             190.0
484 |         ],
485 |         [
486 |             439.86676025390625,
487 |             190.0
488 |         ],
489 |         [
490 |             448.2763977050781,
491 |             190.0
492 |         ],
493 |         [
494 |             456.6744079589844,
495 |             190.0
496 |         ],
497 |         [
498 |             465.0591735839844,
499 |             190.0
500 |         ],
501 |         [
502 |             473.4290771484375,
503 |             190.0
504 |         ],
505 |         [
506 |             481.7825622558594,
507 |             190.0
508 |         ],
509 |         [
510 |             490.11798095703125,
511 |             190.0
512 |         ],
513 |         [
514 |             498.43377685546875,
515 |             190.0
516 |         ],
517 |         [
518 |             506.7283020019531,
519 |             190.0
520 |         ],
521 |         [
522 |             515.0,
523 |             190.0
524 |         ],
525 |         [
526 |             523.2557983398438,
527 |             190.00709533691406
528 |         ],
529 |         [
530 |             531.5036010742188,
531 |             190.028076171875
532 |         ],
533 |         [
534 |             539.7433471679688,
535 |             190.06253051757812
536 |         ],
537 |         [
538 |             547.97509765625,
539 |             190.1100311279297
540 |         ],
541 |         [
542 |             556.19873046875,
543 |             190.17013549804688
544 |         ],
545 |         [
546 |             564.4142456054688,
547 |             190.242431640625
548 |         ],
549 |         [
550 |             572.6215209960938,
551 |             190.3264923095703
552 |         ],
553 |         [
554 |             580.820556640625,
555 |             190.421875
556 |         ],
557 |         [
558 |             589.0113525390625,
559 |             190.52816772460938
560 |         ],
561 |         [
562 |             597.1937255859375,
563 |             190.6449432373047
564 |         ],
565 |         [
566 |             605.3677368164062,
567 |             190.77175903320312
568 |         ],
569 |         [
570 |             613.5333251953125,
571 |             190.908203125
572 |         ],
573 |         [
574 |             621.6904296875,
575 |             191.05384826660156
576 |         ],
577 |         [
578 |             629.8389892578125,
579 |             191.208251953125
580 |         ],
581 |         [
582 |             637.97900390625,
583 |             191.37100219726562
584 |         ],
585 |         [
586 |             646.1103515625,
587 |             191.5416717529297
588 |         ],
589 |         [
590 |             654.2330322265625,
591 |             191.71981811523438
592 |         ],
593 |         [
594 |             662.3469848632812,
595 |             191.905029296875
596 |         ],
597 |         [
598 |             670.4522094726562,
599 |             192.0968780517578
600 |         ],
601 |         [
602 |             678.548583984375,
603 |             192.294921875
604 |         ],
605 |         [
606 |             686.6361083984375,
607 |             192.49874877929688
608 |         ],
609 |         [
610 |             694.7146606445312,
611 |             192.7079315185547
612 |         ],
613 |         [
614 |             702.7843017578125,
615 |             192.92202758789062
616 |         ],
617 |         [
618 |             710.844970703125,
619 |             193.140625
620 |         ],
621 |         [
622 |             718.8965454101562,
623 |             193.36329650878906
624 |         ],
625 |         [
626 |             726.9390258789062,
627 |             193.589599609375
628 |         ],
629 |         [
630 |             734.9723510742188,
631 |             193.81912231445312
632 |         ],
633 |         [
634 |             742.9964599609375,
635 |             194.0514373779297
636 |         ],
637 |         [
638 |             751.0113525390625,
639 |             194.28610229492188
640 |         ],
641 |         [
642 |             759.0169067382812,
643 |             194.522705078125
644 |         ],
645 |         [
646 |             767.0131225585938,
647 |             194.7608184814453
648 |         ],
649 |         [
650 |             775.0,
651 |             195.0
652 |         ]
653 |     ]
654 | ]


--------------------------------------------------------------------------------
/__assets__/demos/realworld/trajectory_dog.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     [
  3 |         [
  4 |             700.0,
  5 |             350.0
  6 |         ],
  7 |         [
  8 |             682.2483520507812,
  9 |             348.18017578125
 10 |         ],
 11 |         [
 12 |             664.6619262695312,
 13 |             346.3773498535156
 14 |         ],
 15 |         [
 16 |             647.2615356445312,
 17 |             344.5942687988281
 18 |         ],
 19 |         [
 20 |             630.06787109375,
 21 |             342.8337097167969
 22 |         ],
 23 |         [
 24 |             613.1016845703125,
 25 |             341.0984802246094
 26 |         ],
 27 |         [
 28 |             596.3836669921875,
 29 |             339.3913879394531
 30 |         ],
 31 |         [
 32 |             579.9345703125,
 33 |             337.71514892578125
 34 |         ],
 35 |         [
 36 |             563.775146484375,
 37 |             336.07257080078125
 38 |         ],
 39 |         [
 40 |             547.926025390625,
 41 |             334.4664611816406
 42 |         ],
 43 |         [
 44 |             532.4080810546875,
 45 |             332.8995666503906
 46 |         ],
 47 |         [
 48 |             517.241943359375,
 49 |             331.3746643066406
 50 |         ],
 51 |         [
 52 |             502.4483947753906,
 53 |             329.8945617675781
 54 |         ],
 55 |         [
 56 |             488.04815673828125,
 57 |             328.4620361328125
 58 |         ],
 59 |         [
 60 |             474.0619201660156,
 61 |             327.0798645019531
 62 |         ],
 63 |         [
 64 |             460.51043701171875,
 65 |             325.7508239746094
 66 |         ],
 67 |         [
 68 |             447.4144592285156,
 69 |             324.47772216796875
 70 |         ],
 71 |         [
 72 |             434.794677734375,
 73 |             323.2632751464844
 74 |         ],
 75 |         [
 76 |             422.671875,
 77 |             322.1103515625
 78 |         ],
 79 |         [
 80 |             411.0667419433594,
 81 |             321.0216369628906
 82 |         ],
 83 |         [
 84 |             400.0,
 85 |             320.0
 86 |         ],
 87 |         [
 88 |             389.2277526855469,
 89 |             319.04083251953125
 90 |         ],
 91 |         [
 92 |             378.5032043457031,
 93 |             318.135009765625
 94 |         ],
 95 |         [
 96 |             367.8426818847656,
 97 |             317.27850341796875
 98 |         ],
 99 |         [
100 |             357.2625732421875,
101 |             316.46722412109375
102 |         ],
103 |         [
104 |             346.7792053222656,
105 |             315.69708251953125
106 |         ],
107 |         [
108 |             336.4089660644531,
109 |             314.9640808105469
110 |         ],
111 |         [
112 |             326.1681823730469,
113 |             314.26409912109375
114 |         ],
115 |         [
116 |             316.0732727050781,
117 |             313.59307861328125
118 |         ],
119 |         [
120 |             306.1405944824219,
121 |             312.9469909667969
122 |         ],
123 |         [
124 |             296.3864440917969,
125 |             312.3217468261719
126 |         ],
127 |         [
128 |             286.8272399902344,
129 |             311.7132873535156
130 |         ],
131 |         [
132 |             277.4793395996094,
133 |             311.1175537109375
134 |         ],
135 |         [
136 |             268.35906982421875,
137 |             310.5304870605469
138 |         ],
139 |         [
140 |             259.48284912109375,
141 |             309.947998046875
142 |         ],
143 |         [
144 |             250.8669891357422,
145 |             309.36602783203125
146 |         ],
147 |         [
148 |             242.52786254882812,
149 |             308.7805480957031
150 |         ],
151 |         [
152 |             234.4818572998047,
153 |             308.1874694824219
154 |         ],
155 |         [
156 |             226.74530029296875,
157 |             307.5827331542969
158 |         ],
159 |         [
160 |             219.33457946777344,
161 |             306.9622497558594
162 |         ],
163 |         [
164 |             212.26605224609375,
165 |             306.3219909667969
166 |         ],
167 |         [
168 |             205.55606079101562,
169 |             305.65789794921875
170 |         ],
171 |         [
172 |             199.2209930419922,
173 |             304.96588134765625
174 |         ],
175 |         [
176 |             193.27720642089844,
177 |             304.24188232421875
178 |         ],
179 |         [
180 |             187.7410430908203,
181 |             303.4818420410156
182 |         ],
183 |         [
184 |             182.62887573242188,
185 |             302.68170166015625
186 |         ],
187 |         [
188 |             177.9570770263672,
189 |             301.8373718261719
190 |         ],
191 |         [
192 |             173.7419891357422,
193 |             300.9448547363281
194 |         ],
195 |         [
196 |             170.0,
197 |             300.0
198 |         ],
199 |         [
200 |             166.5145263671875,
201 |             299.0050354003906
202 |         ],
203 |         [
204 |             163.0615234375,
205 |             297.96539306640625
206 |         ],
207 |         [
208 |             159.6461181640625,
209 |             296.8818664550781
210 |         ],
211 |         [
212 |             156.2734375,
213 |             295.7552795410156
214 |         ],
215 |         [
216 |             152.9486083984375,
217 |             294.5863952636719
218 |         ],
219 |         [
220 |             149.6767578125,
221 |             293.3760681152344
222 |         ],
223 |         [
224 |             146.4630126953125,
225 |             292.1250915527344
226 |         ],
227 |         [
228 |             143.3125,
229 |             290.83428955078125
230 |         ],
231 |         [
232 |             140.2303466796875,
233 |             289.5044250488281
234 |         ],
235 |         [
236 |             137.2216796875,
237 |             288.1363220214844
238 |         ],
239 |         [
240 |             134.2916259765625,
241 |             286.7308044433594
242 |         ],
243 |         [
244 |             131.4453125,
245 |             285.2886962890625
246 |         ],
247 |         [
248 |             128.6878662109375,
249 |             283.8107604980469
250 |         ],
251 |         [
252 |             126.0244140625,
253 |             282.2978210449219
254 |         ],
255 |         [
256 |             123.4600830078125,
257 |             280.75067138671875
258 |         ],
259 |         [
260 |             121.0,
261 |             279.170166015625
262 |         ],
263 |         [
264 |             118.6492919921875,
265 |             277.55706787109375
266 |         ],
267 |         [
268 |             116.4130859375,
269 |             275.91217041015625
270 |         ],
271 |         [
272 |             114.2965087890625,
273 |             274.2363586425781
274 |         ],
275 |         [
276 |             112.3046875,
277 |             272.5303649902344
278 |         ],
279 |         [
280 |             110.4427490234375,
281 |             270.7950134277344
282 |         ],
283 |         [
284 |             108.7158203125,
285 |             269.0311279296875
286 |         ],
287 |         [
288 |             107.1290283203125,
289 |             267.239501953125
290 |         ],
291 |         [
292 |             105.6875,
293 |             265.42095947265625
294 |         ],
295 |         [
296 |             104.3963623046875,
297 |             263.5762939453125
298 |         ],
299 |         [
300 |             103.2607421875,
301 |             261.706298828125
302 |         ],
303 |         [
304 |             102.2857666015625,
305 |             259.8117980957031
306 |         ],
307 |         [
308 |             101.4765625,
309 |             257.89361572265625
310 |         ],
311 |         [
312 |             100.8382568359375,
313 |             255.95252990722656
314 |         ],
315 |         [
316 |             100.3759765625,
317 |             253.98936462402344
318 |         ],
319 |         [
320 |             100.0948486328125,
321 |             252.00491333007812
322 |         ],
323 |         [
324 |             100.0,
325 |             250.0
326 |         ]
327 |     ]
328 | ]


--------------------------------------------------------------------------------
/__assets__/figs/demos.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IamCreateAI/LayerAnimate/bb8417c519a4a130ad70b49d2990a89b4b6eed72/__assets__/figs/demos.gif


--------------------------------------------------------------------------------
/lineart/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Caroline Chan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/lineart/__init__.py:
--------------------------------------------------------------------------------
  1 | # From https://github.com/carolineec/informative-drawings
  2 | # MIT License
  3 | 
  4 | import os
  5 | import cv2
  6 | import torch
  7 | import numpy as np
  8 | 
  9 | import torch.nn as nn
 10 | from einops import rearrange
 11 | from huggingface_hub import hf_hub_download
 12 | 
 13 | annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
 14 | 
 15 | 
 16 | norm_layer = nn.InstanceNorm2d
 17 | 
 18 | 
 19 | class ResidualBlock(nn.Module):
 20 |     def __init__(self, in_features):
 21 |         super(ResidualBlock, self).__init__()
 22 | 
 23 |         conv_block = [  nn.ReflectionPad2d(1),
 24 |                         nn.Conv2d(in_features, in_features, 3),
 25 |                         norm_layer(in_features),
 26 |                         nn.ReLU(inplace=True),
 27 |                         nn.ReflectionPad2d(1),
 28 |                         nn.Conv2d(in_features, in_features, 3),
 29 |                         norm_layer(in_features)
 30 |                         ]
 31 | 
 32 |         self.conv_block = nn.Sequential(*conv_block)
 33 | 
 34 |     def forward(self, x):
 35 |         return x + self.conv_block(x)
 36 | 
 37 | 
 38 | class Generator(nn.Module):
 39 |     def __init__(self, input_nc, output_nc, n_residual_blocks=9, sigmoid=True):
 40 |         super(Generator, self).__init__()
 41 | 
 42 |         # Initial convolution block
 43 |         model0 = [   nn.ReflectionPad2d(3),
 44 |                     nn.Conv2d(input_nc, 64, 7),
 45 |                     norm_layer(64),
 46 |                     nn.ReLU(inplace=True) ]
 47 |         self.model0 = nn.Sequential(*model0)
 48 | 
 49 |         # Downsampling
 50 |         model1 = []
 51 |         in_features = 64
 52 |         out_features = in_features*2
 53 |         for _ in range(2):
 54 |             model1 += [  nn.Conv2d(in_features, out_features, 3, stride=2, padding=1),
 55 |                         norm_layer(out_features),
 56 |                         nn.ReLU(inplace=True) ]
 57 |             in_features = out_features
 58 |             out_features = in_features*2
 59 |         self.model1 = nn.Sequential(*model1)
 60 | 
 61 |         model2 = []
 62 |         # Residual blocks
 63 |         for _ in range(n_residual_blocks):
 64 |             model2 += [ResidualBlock(in_features)]
 65 |         self.model2 = nn.Sequential(*model2)
 66 | 
 67 |         # Upsampling
 68 |         model3 = []
 69 |         out_features = in_features//2
 70 |         for _ in range(2):
 71 |             model3 += [  nn.ConvTranspose2d(in_features, out_features, 3, stride=2, padding=1, output_padding=1),
 72 |                         norm_layer(out_features),
 73 |                         nn.ReLU(inplace=True) ]
 74 |             in_features = out_features
 75 |             out_features = in_features//2
 76 |         self.model3 = nn.Sequential(*model3)
 77 | 
 78 |         # Output layer
 79 |         model4 = [  nn.ReflectionPad2d(3),
 80 |                         nn.Conv2d(64, output_nc, 7)]
 81 |         if sigmoid:
 82 |             model4 += [nn.Sigmoid()]
 83 | 
 84 |         self.model4 = nn.Sequential(*model4)
 85 | 
 86 |     def forward(self, x, cond=None):
 87 |         out = self.model0(x)
 88 |         out = self.model1(out)
 89 |         out = self.model2(out)
 90 |         out = self.model3(out)
 91 |         out = self.model4(out)
 92 | 
 93 |         return out
 94 | 
 95 | 
 96 | class LineartDetector:
 97 |     def __init__(self, device):
 98 |         self.device = device
 99 |         self.model = self.load_model('sk_model.pth')
100 |         self.model_coarse = self.load_model('sk_model2.pth')
101 | 
102 |     def load_model(self, name):
103 |         modelpath = os.path.join(annotator_ckpts_path, name)
104 |         if not os.path.exists(modelpath):
105 |             hf_hub_download(repo_id="lllyasviel/Annotators", filename=name, local_dir=annotator_ckpts_path)
106 |         model = Generator(3, 1, 3)
107 |         model.load_state_dict(torch.load(modelpath, map_location=torch.device('cpu')))
108 |         model.eval()
109 |         model = model.to(self.device)
110 |         return model
111 | 
112 |     def __call__(self, input_image, coarse, rescale=False):
113 |         model = self.model_coarse if coarse else self.model
114 |         image = input_image
115 |         with torch.no_grad():
116 |             image = image.float()
117 |             if rescale:
118 |                 # [-1, 1] -> [0, 1]
119 |                 image = image / 2 + 0.5
120 |             line = model(image)
121 |         return line
122 | 


--------------------------------------------------------------------------------
/lvdm/basics.py:
--------------------------------------------------------------------------------
  1 | # adopted from
  2 | # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
  3 | # and
  4 | # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
  5 | # and
  6 | # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
  7 | #
  8 | # thanks!
  9 | 
 10 | import torch.nn as nn
 11 | from .utils import instantiate_from_config
 12 | 
 13 | 
 14 | def disabled_train(self, mode=True):
 15 |     """Overwrite model.train with this function to make sure train/eval mode
 16 |     does not change anymore."""
 17 |     return self
 18 | 
 19 | def zero_module(module):
 20 |     """
 21 |     Zero out the parameters of a module and return it.
 22 |     """
 23 |     for p in module.parameters():
 24 |         p.detach().zero_()
 25 |     return module
 26 | 
 27 | def scale_module(module, scale):
 28 |     """
 29 |     Scale the parameters of a module and return it.
 30 |     """
 31 |     for p in module.parameters():
 32 |         p.detach().mul_(scale)
 33 |     return module
 34 | 
 35 | 
 36 | def conv_nd(dims, *args, **kwargs):
 37 |     """
 38 |     Create a 1D, 2D, or 3D convolution module.
 39 |     """
 40 |     if dims == 1:
 41 |         return nn.Conv1d(*args, **kwargs)
 42 |     elif dims == 2:
 43 |         return nn.Conv2d(*args, **kwargs)
 44 |     elif dims == 3:
 45 |         return nn.Conv3d(*args, **kwargs)
 46 |     raise ValueError(f"unsupported dimensions: {dims}")
 47 | 
 48 | 
 49 | def linear(*args, **kwargs):
 50 |     """
 51 |     Create a linear module.
 52 |     """
 53 |     return nn.Linear(*args, **kwargs)
 54 | 
 55 | 
 56 | def avg_pool_nd(dims, *args, **kwargs):
 57 |     """
 58 |     Create a 1D, 2D, or 3D average pooling module.
 59 |     """
 60 |     if dims == 1:
 61 |         return nn.AvgPool1d(*args, **kwargs)
 62 |     elif dims == 2:
 63 |         return nn.AvgPool2d(*args, **kwargs)
 64 |     elif dims == 3:
 65 |         return nn.AvgPool3d(*args, **kwargs)
 66 |     raise ValueError(f"unsupported dimensions: {dims}")
 67 | 
 68 | 
 69 | def nonlinearity(type='silu'):
 70 |     if type == 'silu':
 71 |         return nn.SiLU()
 72 |     elif type == 'leaky_relu':
 73 |         return nn.LeakyReLU()
 74 | 
 75 | 
 76 | class GroupNormSpecific(nn.GroupNorm):
 77 |     def forward(self, x):
 78 |         return super().forward(x.float()).type(x.dtype)
 79 | 
 80 | 
 81 | def normalization(channels, num_groups=32):
 82 |     """
 83 |     Make a standard normalization layer.
 84 |     :param channels: number of input channels.
 85 |     :return: an nn.Module for normalization.
 86 |     """
 87 |     return GroupNormSpecific(num_groups, channels)
 88 | 
 89 | 
 90 | class HybridConditioner(nn.Module):
 91 | 
 92 |     def __init__(self, c_concat_config, c_crossattn_config):
 93 |         super().__init__()
 94 |         self.concat_conditioner = instantiate_from_config(c_concat_config)
 95 |         self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
 96 | 
 97 |     def forward(self, c_concat, c_crossattn):
 98 |         c_concat = self.concat_conditioner(c_concat)
 99 |         c_crossattn = self.crossattn_conditioner(c_crossattn)
100 |         return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}


--------------------------------------------------------------------------------
/lvdm/common.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from inspect import isfunction
 3 | import torch
 4 | from torch import nn
 5 | import torch.distributed as dist
 6 | 
 7 | 
 8 | def gather_data(data, return_np=True):
 9 |     ''' gather data from multiple processes to one list '''
10 |     data_list = [torch.zeros_like(data) for _ in range(dist.get_world_size())]
11 |     dist.all_gather(data_list, data)  # gather not supported with NCCL
12 |     if return_np:
13 |         data_list = [data.cpu().numpy() for data in data_list]
14 |     return data_list
15 | 
def autocast(f):
    """Decorator that runs ``f`` inside an enabled CUDA autocast context.

    The context uses the dtype reported by ``torch.get_autocast_gpu_dtype()``
    and the cache setting reported by ``torch.is_autocast_cache_enabled()``,
    both read at call time.

    :param f: the callable to wrap.
    :return: a wrapper with the same call signature and return value as ``f``.
    """
    import functools  # local import: keeps this block self-contained

    # Preserve f's __name__, __doc__ and other metadata on the wrapper.
    @functools.wraps(f)
    def do_autocast(*args, **kwargs):
        with torch.cuda.amp.autocast(enabled=True,
                                     dtype=torch.get_autocast_gpu_dtype(),
                                     cache_enabled=torch.is_autocast_cache_enabled()):
            return f(*args, **kwargs)
    return do_autocast
23 | 
24 | 
def extract_into_tensor(a, t, x_shape):
    """Pick the entries of ``a`` indexed by ``t`` and shape them for broadcasting.

    :param a: tensor gathered from along its last dimension.
    :param t: index tensor; its first dimension is taken as the batch size.
    :param x_shape: shape of the target tensor; only its length is used.
    :return: tensor of shape ``(batch, 1, ..., 1)`` with ``len(x_shape)`` dimensions.
    """
    batch, *_ = t.shape
    gathered = a.gather(-1, t)
    singleton_dims = (1,) * (len(x_shape) - 1)
    return gathered.reshape(batch, *singleton_dims)
29 | 
30 | 
def noise_like(shape, device, repeat=False):
    """Draw standard-normal noise of the given ``shape`` on ``device``.

    :param shape: full shape of the returned tensor.
    :param device: device on which the noise is created.
    :param repeat: if True, a single sample of shape ``(1, *shape[1:])`` is
        drawn and tiled along the first dimension, so every batch entry is
        identical; otherwise independent noise is drawn for the whole shape.
    """
    if not repeat:
        return torch.randn(shape, device=device)
    single = torch.randn((1, *shape[1:]), device=device)
    tiling = (shape[0], *((1,) * (len(shape) - 1)))
    return single.repeat(*tiling)
35 | 
36 | 
def default(val, d):
    """Return ``val`` unless it is missing, in which case fall back to ``d``.

    When the fallback is needed and ``d`` is a plain Python function (as
    judged by ``inspect.isfunction``), it is called and its result returned;
    any other ``d`` is returned as-is.
    """
    if not exists(val):
        return d() if isfunction(d) else d
    return val
41 | 
def exists(val):
    """Return True for anything other than ``None`` (falsy values included)."""
    if val is None:
        return False
    return True
44 | 
def identity(*args, **kwargs):
    """Return a new ``nn.Identity`` module; all arguments are ignored."""
    del args, kwargs  # accepted only so this can stand in for other factories
    return nn.Identity()
47 | 
def uniq(arr):
    """Return the distinct elements of ``arr`` as a dict-keys view.

    Elements must be hashable; the view iterates in first-seen order.
    """
    seen = dict.fromkeys(arr, True)
    return seen.keys()
50 | 
def mean_flat(tensor):
    """
    Average over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, tensor.ndim))
    return tensor.mean(dim=non_batch_dims)
56 | 
def ismap(x):
    """Return True if ``x`` is a 4-D tensor whose second dimension is larger than 3."""
    return isinstance(x, torch.Tensor) and len(x.shape) == 4 and x.shape[1] > 3
61 | 
def isimage(x):
    """Return True if ``x`` is a 4-D tensor whose second dimension is 3 or 1."""
    return isinstance(x, torch.Tensor) and len(x.shape) == 4 and x.shape[1] in (3, 1)
66 | 
def max_neg_value(t):
    """Return the negated largest finite value of ``t``'s floating-point dtype."""
    largest = torch.finfo(t.dtype).max
    return -largest
69 | 
def shape_to_str(x):
    """Format ``x.shape`` as its sizes joined by ``"x"``, e.g. ``"2x3x4"``."""
    return "x".join(map(str, x.shape))
73 | 
def init_(tensor):
    """Fill ``tensor`` in place with uniform values in ``[-1/sqrt(d), 1/sqrt(d)]``.

    ``d`` is the size of the last dimension. The same tensor object is returned.
    """
    bound = 1 / math.sqrt(tensor.shape[-1])
    tensor.uniform_(-bound, bound)
    return tensor
79 | 
ckpt = torch.utils.checkpoint.checkpoint
def checkpoint(func, inputs, params, flag):
    """
    Call ``func(*inputs)``, optionally through gradient checkpointing.

    With checkpointing enabled, intermediate activations are not cached and
    are recomputed in the backward pass, trading compute for memory.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments. Not used by this implementation.
    :param flag: if False, disable gradient checkpointing.
    """
    if not flag:
        return func(*inputs)
    return ckpt(func, *inputs, use_reentrant=False)


--------------------------------------------------------------------------------
/lvdm/models/autoencoder.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from functools import partial
  3 | from dataclasses import dataclass
  4 | 
  5 | import torch
  6 | import numpy as np
  7 | from einops import rearrange
  8 | import torch.nn.functional as F
  9 | from torch.utils.checkpoint import checkpoint
 10 | from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
 11 | from diffusers.configuration_utils import ConfigMixin, register_to_config
 12 | from diffusers.models import ModelMixin
 13 | from diffusers.utils import BaseOutput
 14 | 
 15 | from ..modules.ae_modules import Encoder, Decoder
 16 | from ..modules.ae_dualref_modules import VideoDecoder
 17 | from ..utils import instantiate_from_config
 18 | 
 19 | 
@dataclass
class DecoderOutput(BaseOutput):
    """
    Output of decoding method.

    A ``BaseOutput`` dataclass with a single field.

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Decoded output sample of the model. Output of the last layer of the model.
    """

    # Decoded tensor; the only field carried by this container.
    sample: torch.FloatTensor
 31 | 
 32 | 
@dataclass
class AutoencoderKLOutput(BaseOutput):
    """
    Output of AutoencoderKL encoding method.

    A ``BaseOutput`` dataclass with a single field.

    Args:
        latent_dist (`DiagonalGaussianDistribution`):
            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
    """

    # Posterior over the latents; the annotation is written as a string.
    latent_dist: "DiagonalGaussianDistribution"
 45 | 
 46 | 
 47 | class AutoencoderKL(ModelMixin, ConfigMixin):
 48 |     @register_to_config
 49 |     def __init__(self,
 50 |                  ddconfig,
 51 |                  embed_dim,
 52 |                  image_key="image",
 53 |                  input_dim=4,
 54 |                  use_checkpoint=False,
 55 |                  ):
 56 |         super().__init__()
 57 |         self.image_key = image_key
 58 |         self.encoder = Encoder(**ddconfig)
 59 |         self.decoder = Decoder(**ddconfig)
 60 |         assert ddconfig["double_z"]
 61 |         self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
 62 |         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
 63 |         self.embed_dim = embed_dim
 64 |         self.input_dim = input_dim
 65 |         self.use_checkpoint = use_checkpoint
 66 | 
 67 |     def encode(self, x, return_hidden_states=False, **kwargs):
 68 |         if return_hidden_states:
 69 |             h, hidden = self.encoder(x, return_hidden_states)
 70 |             moments = self.quant_conv(h)
 71 |             posterior = DiagonalGaussianDistribution(moments)
 72 |             return AutoencoderKLOutput(latent_dist=posterior), hidden
 73 |         else:
 74 |             h = self.encoder(x)
 75 |             moments = self.quant_conv(h)
 76 |             posterior = DiagonalGaussianDistribution(moments)
 77 |             return AutoencoderKLOutput(latent_dist=posterior)
 78 | 
 79 |     def decode(self, z, **kwargs):
 80 |         if len(kwargs) == 0: ## use the original decoder in AutoencoderKL
 81 |             z = self.post_quant_conv(z)
 82 |         dec = self.decoder(z, **kwargs)  ##change for SVD decoder by adding **kwargs
 83 |         return dec
 84 | 
 85 |     def forward(self, input, sample_posterior=True, **additional_decode_kwargs):
 86 |         input_tuple = (input, )
 87 |         forward_temp = partial(self._forward, sample_posterior=sample_posterior, **additional_decode_kwargs)
 88 |         return checkpoint(forward_temp, input_tuple, self.parameters(), self.use_checkpoint)
 89 | 
 90 | 
 91 |     def _forward(self, input, sample_posterior=True, **additional_decode_kwargs):
 92 |         posterior = self.encode(input)[0]
 93 |         if sample_posterior:
 94 |             z = posterior.sample()
 95 |         else:
 96 |             z = posterior.mode()
 97 |         dec = self.decode(z, **additional_decode_kwargs)
 98 |         ## print(input.shape, dec.shape) torch.Size([16, 3, 256, 256]) torch.Size([16, 3, 256, 256])
 99 |         return dec, posterior
100 | 
101 |     def get_input(self, batch, k):
102 |         x = batch[k]
103 |         if x.dim() == 5 and self.input_dim == 4:
104 |             b,c,t,h,w = x.shape
105 |             self.b = b
106 |             self.t = t
107 |             x = rearrange(x, 'b c t h w -> (b t) c h w')
108 | 
109 |         return x
110 | 
111 |     def get_last_layer(self):
112 |         return self.decoder.conv_out.weight
113 | 
114 | 
class AutoencoderKL_Dualref(AutoencoderKL):
    """``AutoencoderKL`` variant that decodes with a ``VideoDecoder``.

    The decoder receives the encoder hidden states of the first and last
    frame of each clip as ``ref_context``.
    """

    @register_to_config
    def __init__(self,
                 ddconfig,
                 embed_dim,
                 image_key="image",
                 input_dim=4,
                 use_checkpoint=False,
                 ):
        """Build the base autoencoder, then replace its decoder with a ``VideoDecoder``."""
        super().__init__(ddconfig, embed_dim, image_key, input_dim, use_checkpoint)
        self.decoder = VideoDecoder(**ddconfig)

    def _forward(self, input, batch_size, sample_posterior=True, **additional_decode_kwargs):
        """Encode ``input``, pick a latent, and decode it with first/last-frame context.

        :param input: frames with batch and time merged on the first axis.
        :param batch_size: number of clips, used to split the merged ``(b t)`` axis.
        :param sample_posterior: sample from the posterior if True, else use its mode.
        :return: ``(dec, posterior)`` where ``posterior`` is the encoder output container.
        """
        posterior, hidden_states = self.encode(input, return_hidden_states=True)

        def first_and_last(hid):
            # Restore the time axis, then keep only frame 0 and the final frame.
            unpacked = rearrange(hid, '(b t) c h w -> b c t h w', b=batch_size)
            return torch.cat([unpacked[:, :, 0:1], unpacked[:, :, -1:]], dim=2)

        ref_context = [first_and_last(hid) for hid in hidden_states]

        latent_dist = posterior[0]
        z = latent_dist.sample() if sample_posterior else latent_dist.mode()
        dec = self.decode(z, ref_context=ref_context, **additional_decode_kwargs)
        ## print(input.shape, dec.shape) torch.Size([16, 3, 256, 256]) torch.Size([16, 3, 256, 256])
        return dec, posterior


--------------------------------------------------------------------------------
/lvdm/models/condition.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | import torch.nn as nn
  4 | from torchvision.transforms import functional as F
  5 | import open_clip
  6 | from torch.utils.checkpoint import checkpoint
  7 | from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel
  8 | from diffusers.configuration_utils import ConfigMixin, register_to_config
  9 | from diffusers.models import ModelMixin
 10 | from ..common import autocast
 11 | from ..utils import count_params
 12 | 
 13 | 
 14 | class AbstractEncoder(nn.Module):
 15 |     def __init__(self):
 16 |         super().__init__()
 17 | 
 18 |     def encode(self, *args, **kwargs):
 19 |         raise NotImplementedError
 20 | 
 21 |     @property
 22 |     def device(self):
 23 |         return next(self.parameters()).device
 24 | 
 25 |     @property
 26 |     def dtype(self):
 27 |         return next(self.parameters()).dtype
 28 | 
 29 | class IdentityEncoder(AbstractEncoder):
 30 |     def encode(self, x):
 31 |         return x
 32 | 
 33 | 
 34 | class ClassEmbedder(nn.Module):
 35 |     def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1):
 36 |         super().__init__()
 37 |         self.key = key
 38 |         self.embedding = nn.Embedding(n_classes, embed_dim)
 39 |         self.n_classes = n_classes
 40 |         self.ucg_rate = ucg_rate
 41 | 
 42 |     def forward(self, batch, key=None, disable_dropout=False):
 43 |         if key is None:
 44 |             key = self.key
 45 |         # this is for use in crossattn
 46 |         c = batch[key][:, None]
 47 |         if self.ucg_rate > 0. and not disable_dropout:
 48 |             mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate)
 49 |             c = mask * c + (1 - mask) * torch.ones_like(c) * (self.n_classes - 1)
 50 |             c = c.long()
 51 |         c = self.embedding(c)
 52 |         return c
 53 | 
 54 |     def get_unconditional_conditioning(self, bs, device="cuda"):
 55 |         uc_class = self.n_classes - 1  # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000)
 56 |         uc = torch.ones((bs,), device=device) * uc_class
 57 |         uc = {self.key: uc}
 58 |         return uc
 59 | 
 60 | 
 61 | def disabled_train(self, mode=True):
 62 |     """Overwrite model.train with this function to make sure train/eval mode
 63 |     does not change anymore."""
 64 |     return self
 65 | 
 66 | 
 67 | class FrozenT5Embedder(AbstractEncoder):
 68 |     """Uses the T5 transformer encoder for text"""
 69 | 
 70 |     def __init__(self, version="google/t5-v1_1-large", max_length=77,
 71 |                  freeze=True):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
 72 |         super().__init__()
 73 |         self.tokenizer = T5Tokenizer.from_pretrained(version)
 74 |         self.transformer = T5EncoderModel.from_pretrained(version)
 75 |         self.max_length = max_length  # TODO: typical value?
 76 |         if freeze:
 77 |             self.freeze()
 78 | 
 79 |     def freeze(self):
 80 |         self.transformer = self.transformer.eval()
 81 |         # self.train = disabled_train
 82 |         for param in self.parameters():
 83 |             param.requires_grad = False
 84 | 
 85 |     def forward(self, text):
 86 |         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
 87 |                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
 88 |         tokens = batch_encoding["input_ids"].to(self.device)
 89 |         outputs = self.transformer(input_ids=tokens)
 90 | 
 91 |         z = outputs.last_hidden_state
 92 |         return z
 93 | 
 94 |     def encode(self, text):
 95 |         return self(text)
 96 | 
 97 | 
 98 | class FrozenCLIPEmbedder(AbstractEncoder):
 99 |     """Uses the CLIP transformer encoder for text (from huggingface)"""
100 |     LAYERS = [
101 |         "last",
102 |         "pooled",
103 |         "hidden"
104 |     ]
105 | 
106 |     def __init__(self, version="openai/clip-vit-large-patch14", max_length=77,
107 |                  freeze=True, layer="last", layer_idx=None):  # clip-vit-base-patch32
108 |         super().__init__()
109 |         assert layer in self.LAYERS
110 |         self.tokenizer = CLIPTokenizer.from_pretrained(version)
111 |         self.transformer = CLIPTextModel.from_pretrained(version)
112 |         self.max_length = max_length
113 |         if freeze:
114 |             self.freeze()
115 |         self.layer = layer
116 |         self.layer_idx = layer_idx
117 |         if layer == "hidden":
118 |             assert layer_idx is not None
119 |             assert 0 <= abs(layer_idx) <= 12
120 | 
121 |     def freeze(self):
122 |         self.transformer = self.transformer.eval()
123 |         # self.train = disabled_train
124 |         for param in self.parameters():
125 |             param.requires_grad = False
126 | 
127 |     def forward(self, text):
128 |         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
129 |                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
130 |         tokens = batch_encoding["input_ids"].to(self.device)
131 |         outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer == "hidden")
132 |         if self.layer == "last":
133 |             z = outputs.last_hidden_state
134 |         elif self.layer == "pooled":
135 |             z = outputs.pooler_output[:, None, :]
136 |         else:
137 |             z = outputs.hidden_states[self.layer_idx]
138 |         return z
139 | 
140 |     def encode(self, text):
141 |         return self(text)
142 | 
143 | 
class FrozenOpenCLIPEmbedder(AbstractEncoder):
    """
    Text encoder built on the text tower of an OpenCLIP model.

    The visual tower is deleted after loading. ``layer`` selects how many of
    the final transformer blocks are skipped: none for ``"last"``, one for
    ``"penultimate"``.
    """
    LAYERS = [
        # "pooled",
        "last",
        "penultimate"
    ]

    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", max_length=77,
                 freeze=True, layer="penultimate"):
        super().__init__()
        assert layer in self.LAYERS
        model, _, _ = open_clip.create_model_and_transforms(
            arch, device=torch.device('cpu'), pretrained=version)
        # Only the text side is used by this class.
        del model.visual
        self.model = model

        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        # layer_idx = number of trailing residual blocks left out.
        if self.layer == "last":
            self.layer_idx = 0
        elif self.layer == "penultimate":
            self.layer_idx = 1
        else:
            raise NotImplementedError()

    def freeze(self):
        """Put the model in eval mode and turn off all gradients."""
        self.model = self.model.eval()
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, text):
        """Tokenize ``text`` with ``open_clip.tokenize`` and encode it."""
        tokens = open_clip.tokenize(text)  ## all clip models use 77 as context length
        return self.encode_with_transformer(tokens.to(self.device))

    def encode_with_transformer(self, text):
        """Embed token ids, run the truncated transformer and apply ``ln_final``."""
        clip = self.model
        x = clip.token_embedding(text)  # [batch_size, n_ctx, d_model]
        x = x + clip.positional_embedding
        # The residual blocks are fed sequence-first tensors.
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.text_transformer_forward(x, attn_mask=clip.attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD
        return clip.ln_final(x)

    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
        """Apply the residual blocks, stopping ``self.layer_idx`` blocks early."""
        transformer = self.model.transformer
        stop_at = len(transformer.resblocks) - self.layer_idx
        for index, block in enumerate(transformer.resblocks):
            if index == stop_at:
                break
            if transformer.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(block, x, attn_mask)
            else:
                x = block(x, attn_mask=attn_mask)
        return x

    def encode(self, text):
        """Alias for calling the module on ``text``."""
        return self(text)
204 | 
205 | 
class FrozenOpenCLIPImageEmbedder(AbstractEncoder):
    """
    Image encoder built on the visual tower of an OpenCLIP model.

    The text transformer is deleted after loading. Images are resized to
    224x224, mapped with ``(x + 1) / 2`` and normalised with the CLIP
    mean / std buffers before being passed to ``model.visual``.
    """

    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", max_length=77,
                 freeze=True, layer="pooled", antialias=True, ucg_rate=0.):
        super().__init__()
        model, _, preprocess_val = open_clip.create_model_and_transforms(
            arch, device=torch.device('cpu'), pretrained=version)
        # Only the vision side is used by this class.
        del model.transformer
        self.model = model
        self.preprocess_val = preprocess_val
        # self.mapper = torch.nn.Linear(1280, 1024)
        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        if self.layer == "penultimate":
            raise NotImplementedError()

        self.antialias = antialias

        # Non-persistent buffers: they follow the module across devices but
        # are not written to the state dict.
        clip_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073])
        clip_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711])
        self.register_buffer('mean', clip_mean, persistent=False)
        self.register_buffer('std', clip_std, persistent=False)
        self.ucg_rate = ucg_rate

    def preprocess(self, x):
        """Resize to 224x224, apply ``(x + 1) / 2`` and CLIP-normalise.

        NOTE(review): the ``(x + 1) / 2`` step suggests inputs in [-1, 1] --
        confirm against the callers.
        """
        resized = F.resize(x, (224, 224), interpolation=F.InterpolationMode.BICUBIC,
                           antialias=self.antialias)
        rescaled = (resized + 1.) / 2.
        # renormalize according to clip
        return F.normalize(rescaled, mean=self.mean, std=self.std)

    def freeze(self):
        """Put the model in eval mode and turn off gradients of its weights."""
        self.model = self.model.eval()
        for weight in self.model.parameters():
            weight.requires_grad = False

    @autocast
    def forward(self, image, no_dropout=False):
        """Encode ``image``; optionally zero whole samples at random.

        When ``self.ucg_rate > 0`` and ``no_dropout`` is falsy, each sample's
        embedding is multiplied by a Bernoulli draw that is 1 with probability
        ``1 - ucg_rate``.
        """
        z = self.encode_with_vision_transformer(image)
        if self.ucg_rate > 0. and not no_dropout:
            keep_prob = (1. - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
            keep = torch.bernoulli(keep_prob)
            z = keep[:, None] * z
        return z

    def encode_with_vision_transformer(self, img):
        """Preprocess ``img`` and run it through ``model.visual``."""
        return self.model.visual(self.preprocess(img))

    def encode(self, text):
        """Alias for calling the module; the argument is an image batch
        despite the parameter name."""
        return self(text)
261 | 
class FrozenOpenCLIPImageEmbedderV2(AbstractEncoder):
    """
    Image encoder built on the visual tower of an OpenCLIP model that returns
    the full token sequence produced by the vision transformer.

    Unlike calling ``model.visual`` directly, the forward pass here stops
    right after the transformer blocks and returns every token.
    """

    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k",
                 freeze=True, layer="pooled", antialias=True):
        super().__init__()
        model, _, preprocess_val = open_clip.create_model_and_transforms(
            arch, device=torch.device('cpu'), pretrained=version)
        # Only the vision side is used by this class.
        del model.transformer
        self.model = model
        self.preprocess_val = preprocess_val

        if freeze:
            self.freeze()
        self.layer = layer
        if self.layer == "penultimate":
            raise NotImplementedError()

        self.antialias = antialias

        # Non-persistent buffers: they follow the module across devices but
        # are not written to the state dict.
        clip_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073])
        clip_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711])
        self.register_buffer('mean', clip_mean, persistent=False)
        self.register_buffer('std', clip_std, persistent=False)

    def preprocess(self, x):
        """Resize to 224x224, apply ``(x + 1) / 2`` and CLIP-normalise.

        NOTE(review): the ``(x + 1) / 2`` step suggests inputs in [-1, 1] --
        confirm against the callers.
        """
        resized = F.resize(x, (224, 224), interpolation=F.InterpolationMode.BICUBIC,
                           antialias=self.antialias)
        rescaled = (resized + 1.) / 2.
        # renormalize according to clip
        return F.normalize(rescaled, mean=self.mean, std=self.std)

    def freeze(self):
        """Put the model in eval mode and turn off gradients of its weights."""
        self.model = self.model.eval()
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, image, no_dropout=False):
        """Encode ``image`` (``b c h w``); ``no_dropout`` is accepted but unused."""
        return self.encode_with_vision_transformer(image)

    def encode_with_vision_transformer(self, x):
        """Run the visual tower up to and including its transformer blocks.

        Returns:
            The token sequence in batch-first layout, with the class token
            prepended along axis 1.
        """
        visual = self.model.visual
        x = self.preprocess(x)

        # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1
        if visual.input_patchnorm:
            # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)')
            grid_h, grid_w = visual.grid_size[0], visual.grid_size[1]
            patch_h, patch_w = visual.patch_size[0], visual.patch_size[1]
            x = x.reshape(x.shape[0], x.shape[1], grid_h, patch_h, grid_w, patch_w)
            x = x.permute(0, 2, 4, 1, 3, 5)
            x = x.reshape(x.shape[0], grid_h * grid_w, -1)
            x = visual.patchnorm_pre_ln(x)
            x = visual.conv1(x)
        else:
            x = visual.conv1(x)  # shape = [*, width, grid, grid]
            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # class embeddings and positional embeddings
        # Adding the class embedding to a zero tensor broadcasts it to one
        # token per sample.
        cls_tokens = visual.class_embedding.to(x.dtype) + torch.zeros(
            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([cls_tokens, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + visual.positional_embedding.to(x.dtype)

        # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
        x = visual.patch_dropout(x)
        x = visual.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = visual.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        return x
338 | 
class FrozenCLIPT5Encoder(AbstractEncoder):
    """Pair of text encoders: a ``FrozenCLIPEmbedder`` and a ``FrozenT5Embedder``.

    Both encoders are built at construction time and their parameter counts
    are printed once.
    """

    def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl",
                 clip_max_length=77, t5_max_length=77):
        super().__init__()
        self.clip_encoder = FrozenCLIPEmbedder(clip_version, max_length=clip_max_length)
        self.t5_encoder = FrozenT5Embedder(t5_version, max_length=t5_max_length)

        clip_name = self.clip_encoder.__class__.__name__
        clip_millions = count_params(self.clip_encoder) * 1.e-6
        t5_name = self.t5_encoder.__class__.__name__
        t5_millions = count_params(self.t5_encoder) * 1.e-6
        print(f"{clip_name} has {clip_millions:.2f} M parameters, "
              f"{t5_name} comes with {t5_millions:.2f} M params.")

    def encode(self, text):
        """Alias for calling the module on ``text``."""
        return self(text)

    def forward(self, text):
        """Encode ``text`` with both encoders.

        Returns:
            ``[clip_z, t5_z]`` -- the CLIP output first, the T5 output second.
        """
        clip_z = self.clip_encoder.encode(text)
        t5_z = self.t5_encoder.encode(text)
        return [clip_z, t5_z]
355 | 
356 | 
357 | # FFN
def FeedForward(dim, mult=4):
    """Build a pre-norm MLP: LayerNorm -> Linear -> GELU -> Linear.

    Args:
        dim: Input and output feature size.
        mult: Expansion factor; the hidden size is ``int(dim * mult)``.

    Returns:
        An ``nn.Sequential`` whose two linear layers have no bias.
    """
    hidden_dim = int(dim * mult)
    stages = [
        nn.LayerNorm(dim),
        nn.Linear(dim, hidden_dim, bias=False),
        nn.GELU(),
        nn.Linear(hidden_dim, dim, bias=False),
    ]
    return nn.Sequential(*stages)
366 | 
367 | 
368 | def reshape_tensor(x, heads):
369 |     bs, length, width = x.shape
370 |     #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
371 |     x = x.view(bs, length, heads, -1)
372 |     # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
373 |     x = x.transpose(1, 2)
374 |     # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
375 |     x = x.reshape(bs, heads, length, -1)
376 |     return x
377 | 
378 | 
class PerceiverAttention(nn.Module):
    """Cross-attention in which latent queries attend to features and latents.

    Keys and values are computed from the concatenation of the (normalised)
    input features and the (normalised) latents; queries come from the latents
    only.

    Args:
        dim: Feature size of both inputs and of the output.
        dim_head: Size of each attention head.
        heads: Number of attention heads.
    """

    def __init__(self, *, dim, dim_head=64, heads=8):
        super().__init__()
        # ``scale`` is stored but forward() derives its own scaling factor.
        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.heads = heads
        inner_dim = dim_head * heads

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

    def forward(self, x, latents):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latents (torch.Tensor): latent features
                shape (b, n2, D)

        Returns:
            torch.Tensor: updated latents of shape (b, n2, D)
        """
        features = self.norm1(x)
        latents = self.norm2(latents)
        batch, num_latents, _ = latents.shape

        queries = self.to_q(latents)
        # Latents are part of the key/value set as well as being the queries.
        keys, values = self.to_kv(torch.cat((features, latents), dim=-2)).chunk(2, dim=-1)

        queries = reshape_tensor(queries, self.heads)
        keys = reshape_tensor(keys, self.heads)
        values = reshape_tensor(values, self.heads)

        # attention
        # Scaling q and k separately by dim_head ** -0.25 is more stable with
        # f16 than dividing the product afterwards.
        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
        scores = (queries * scale) @ (keys * scale).transpose(-2, -1)
        # Softmax runs in float32 and is cast back to the score dtype.
        probs = torch.softmax(scores.float(), dim=-1).type(scores.dtype)
        attended = probs @ values

        # (b, heads, n2, dim_head) -> (b, n2, heads * dim_head)
        merged = attended.permute(0, 2, 1, 3).reshape(batch, num_latents, -1)
        return self.to_out(merged)
425 | 
426 | 
class Resampler(ModelMixin, ConfigMixin):
    """Perceiver-style resampler mapping input features to a fixed set of
    learned query tokens.

    Args:
        dim: Width of the latent queries and of the attention blocks.
        depth: Number of (attention, feed-forward) layer pairs.
        dim_head: Size of each attention head.
        heads: Number of attention heads.
        num_queries: Number of learned queries per image / frame.
        embedding_dim: Feature size of the input; projected to ``dim``.
        output_dim: Feature size of the returned tokens.
        ff_mult: Expansion factor of the feed-forward blocks.
        video_length: When not ``None``, ``num_queries * video_length``
            queries are allocated (one set per frame).
    """

    @register_to_config
    def __init__(
        self,
        dim=1024,
        depth=8,
        dim_head=64,
        heads=16,
        num_queries=8,
        embedding_dim=768,
        output_dim=1024,
        ff_mult=4,
        video_length=None, # using frame-wise version or not
    ):
        super().__init__()
        ## queries for a single frame / image
        self.num_queries = num_queries
        self.video_length = video_length

        ## <num_queries> queries for each frame
        total_queries = num_queries if video_length is None else num_queries * video_length

        self.latents = nn.Parameter(torch.randn(1, total_queries, dim) / dim**0.5)
        self.proj_in = nn.Linear(embedding_dim, dim)
        self.proj_out = nn.Linear(dim, output_dim)
        self.norm_out = nn.LayerNorm(output_dim)

        self.layers = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
                for _ in range(depth)
            ]
        )

    def forward(self, x):
        """Resample ``x`` into the learned query tokens.

        Args:
            x: Input features; the first axis is the batch and the last axis
                must have size ``embedding_dim``.

        Returns:
            Tensor of shape ``(x.size(0), total_queries, output_dim)``.
        """
        # One copy of the learned queries per sample: B (T L) C
        tokens = self.latents.repeat(x.size(0), 1, 1)
        features = self.proj_in(x)

        for attention, feed_forward in self.layers:
            # Both sub-blocks are residual.
            tokens = attention(features, tokens) + tokens
            tokens = feed_forward(tokens) + tokens

        tokens = self.proj_out(tokens)
        return self.norm_out(tokens)  # B L C or B (T L) C


--------------------------------------------------------------------------------
/lvdm/models/controlnet.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, List, Optional, Tuple, Union
  2 | from einops import rearrange, repeat
  3 | import numpy as np
  4 | from functools import partial
  5 | import torch
  6 | from torch import nn
  7 | from torch.nn import functional as F
  8 | from .unet import TimestepEmbedSequential, ResBlock, Downsample, Upsample, TemporalConvBlock
  9 | from ..basics import zero_module, conv_nd
 10 | from ..modules.attention import SpatialTransformer, TemporalTransformer
 11 | from ..common import checkpoint
 12 | 
 13 | from diffusers import __version__
 14 | from diffusers.configuration_utils import ConfigMixin, register_to_config
 15 | from diffusers.models.modeling_utils import ModelMixin
 16 | from diffusers.models.embeddings import TimestepEmbedding, Timesteps
 17 | from diffusers.models.model_loading_utils import load_state_dict
 18 | from diffusers.utils import (
 19 |     SAFETENSORS_WEIGHTS_NAME,
 20 |     WEIGHTS_NAME,
 21 |     logging,
 22 |     _get_model_file,
 23 |     _add_variant
 24 | )
 25 | from omegaconf import ListConfig, DictConfig, OmegaConf
 26 | 
 27 | 
 28 | logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 29 | 
 30 | 
 31 | class ResBlock_v2(nn.Module):
 32 |     def __init__(
 33 |         self,
 34 |         channels,
 35 |         emb_channels,
 36 |         dropout,
 37 |         out_channels=None,
 38 |         dims=2,
 39 |         use_checkpoint=False,
 40 |         use_conv=False,
 41 |         up=False,
 42 |         down=False,
 43 |         use_temporal_conv=False,
 44 |         tempspatial_aware=False
 45 |     ):
 46 |         super().__init__()
 47 |         self.channels = channels
 48 |         self.emb_channels = emb_channels
 49 |         self.dropout = dropout
 50 |         self.out_channels = out_channels or channels
 51 |         self.use_conv = use_conv
 52 |         self.use_checkpoint = use_checkpoint
 53 |         self.use_temporal_conv = use_temporal_conv
 54 | 
 55 |         self.in_layers = nn.Sequential(
 56 |             nn.GroupNorm(32, channels),
 57 |             nn.SiLU(),
 58 |             zero_module(conv_nd(dims, channels, self.out_channels, 3, padding=1)),
 59 |         )
 60 | 
 61 |         self.updown = up or down
 62 | 
 63 |         if up:
 64 |             self.h_upd = Upsample(channels, False, dims)
 65 |             self.x_upd = Upsample(channels, False, dims)
 66 |         elif down:
 67 |             self.h_upd = Downsample(channels, False, dims)
 68 |             self.x_upd = Downsample(channels, False, dims)
 69 |         else:
 70 |             self.h_upd = self.x_upd = nn.Identity()
 71 | 
 72 |         if self.out_channels == channels:
 73 |             self.skip_connection = nn.Identity()
 74 |         elif use_conv:
 75 |             self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
 76 |         else:
 77 |             self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
 78 | 
 79 |         if self.use_temporal_conv:
 80 |             self.temopral_conv = TemporalConvBlock(
 81 |                 self.out_channels,
 82 |                 self.out_channels,
 83 |                 dropout=0.1,
 84 |                 spatial_aware=tempspatial_aware
 85 |             )
 86 | 
 87 |     def forward(self, x, batch_size=None):
 88 |         """
 89 |         Apply the block to a Tensor, conditioned on a timestep embedding.
 90 |         :param x: an [N x C x ...] Tensor of features.
 91 |         :return: an [N x C x ...] Tensor of outputs.
 92 |         """
 93 |         input_tuple = (x, )
 94 |         if batch_size:
 95 |             forward_batchsize = partial(self._forward, batch_size=batch_size)
 96 |             return checkpoint(forward_batchsize, input_tuple, self.parameters(), self.use_checkpoint)
 97 |         return checkpoint(self._forward, input_tuple, self.parameters(), self.use_checkpoint)
 98 | 
 99 |     def _forward(self, x, batch_size=None):
100 |         if self.updown:
101 |             in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
102 |             h = in_rest(x)
103 |             h = self.h_upd(h)
104 |             x = self.x_upd(x)
105 |             h = in_conv(h)
106 |         else:
107 |             h = self.in_layers(x)
108 |         h = self.skip_connection(x) + h
109 | 
110 |         if self.use_temporal_conv and batch_size:
111 |             h = rearrange(h, '(b t) c h w -> b c t h w', b=batch_size)
112 |             h = self.temopral_conv(h)
113 |             h = rearrange(h, 'b c t h w -> (b t) c h w')
114 |         return h
115 | 
116 | 
class TrajectoryEncoder(nn.Module):
    """Stack of ``ResBlock_v2`` blocks between an input and an output conv.

    For every entry of ``channels`` there are ``nums_rb`` blocks; the first
    block of every level except level 0 downsamples and changes the channel
    count. One extra downsampling block is appended at the end.

    Args:
        cin: Number of input channels of ``conv_in``.
        time_embed_dim: Passed to every ``ResBlock_v2`` as ``emb_channels``.
        channels: Channel count per level.
        nums_rb: Number of residual blocks per level.
        dropout: Passed to every ``ResBlock_v2``.
        use_checkpoint: Passed to every ``ResBlock_v2``.
        tempspatial_aware: Passed to every ``ResBlock_v2``.
        temporal_conv: Passed to every ``ResBlock_v2`` as ``use_temporal_conv``.
    """

    def __init__(self, cin, time_embed_dim, channels=[320, 640, 1280, 1280], nums_rb=3,
                 dropout=0.0, use_checkpoint=False, tempspatial_aware=False, temporal_conv=False):
        super(TrajectoryEncoder, self).__init__()
        # self.unshuffle = nn.PixelUnshuffle(8)
        self.channels = channels
        self.nums_rb = nums_rb

        def make_block(in_ch, out_ch, down):
            # All blocks share every setting except channels and downsampling.
            return ResBlock_v2(in_ch, time_embed_dim, dropout,
                out_channels=out_ch, dims=2, use_checkpoint=use_checkpoint,
                tempspatial_aware=tempspatial_aware,
                use_temporal_conv=temporal_conv,
                down=down
            )

        blocks = []
        for level, width in enumerate(channels):
            for position in range(nums_rb):
                starts_new_level = (level != 0) and (position == 0)
                in_ch = channels[level - 1] if starts_new_level else width
                blocks.append(make_block(in_ch, width, starts_new_level))
        # Trailing block: one more downsampling step at the final width.
        blocks.append(make_block(channels[-1], channels[-1], True))

        self.body = nn.ModuleList(blocks)
        self.conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1)
        self.conv_out = zero_module(conv_nd(2, channels[-1], channels[-1], 3, 1, 1))

    def forward(self, x, batch_size=None):
        """Run ``conv_in``, every block in order, then ``conv_out``.

        Args:
            x: Input tensor for ``conv_in``.
            batch_size: Forwarded to every residual block.

        Returns:
            Output of ``conv_out``.
        """
        x = self.conv_in(x)
        num_level_blocks = len(self.channels) * self.nums_rb
        for idx in range(num_level_blocks):
            x = self.body[idx](x, batch_size)
        # The extra trailing block added in __init__.
        x = self.body[-1](x, batch_size)
        return self.conv_out(x)
171 | 
172 | 
173 | class ControlNet(ModelMixin, ConfigMixin):
174 |     _supports_gradient_checkpointing = True
175 | 
176 |     @register_to_config
177 |     def __init__(
178 |         self,
179 |         in_channels,
180 |         model_channels,
181 |         out_channels,
182 |         num_res_blocks,
183 |         attention_resolutions,
184 |         dropout=0.0,
185 |         channel_mult=(1, 2, 4, 8),
186 |         conv_resample=True,
187 |         dims=2,
188 |         context_dim=None,
189 |         use_scale_shift_norm=False,
190 |         resblock_updown=False,
191 |         num_heads=-1,
192 |         num_head_channels=-1,
193 |         transformer_depth=1,
194 |         use_linear=False,
195 |         use_checkpoint=False,
196 |         temporal_conv=False,
197 |         tempspatial_aware=False,
198 |         temporal_attention=True,
199 |         use_relative_position=True,
200 |         use_causal_attention=False,
201 |         temporal_length=None,
202 |         addition_attention=False,
203 |         temporal_selfatt_only=True,
204 |         image_cross_attention=False,
205 |         image_cross_attention_scale_learnable=False,
206 |         default_fps=4,
207 |         fps_condition=False,
208 |         ignore_noisy_latents=True,
209 |         conditioning_channels=4,
210 |     ):
211 |         super().__init__()
212 |         if num_heads == -1:
213 |             assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
214 |         if num_head_channels == -1:
215 |             assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
216 | 
217 |         self.in_channels = in_channels
218 |         self.model_channels = model_channels
219 |         self.out_channels = out_channels
220 |         self.num_res_blocks = num_res_blocks
221 |         self.attention_resolutions = attention_resolutions
222 |         self.dropout = dropout
223 |         self.channel_mult = channel_mult
224 |         self.conv_resample = conv_resample
225 |         self.temporal_attention = temporal_attention
226 |         time_embed_dim = model_channels * 4
227 |         self.use_checkpoint = use_checkpoint
228 |         temporal_self_att_only = True
229 |         self.addition_attention = addition_attention
230 |         self.temporal_length = temporal_length
231 |         self.image_cross_attention = image_cross_attention
232 |         self.image_cross_attention_scale_learnable = image_cross_attention_scale_learnable
233 |         self.default_fps = default_fps
234 |         self.fps_condition = fps_condition
235 |         self.ignore_noisy_latents = ignore_noisy_latents
236 | 
237 |         ## Time embedding blocks
238 |         self.time_proj = Timesteps(model_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
239 |         self.time_embed = TimestepEmbedding(model_channels, time_embed_dim)
240 | 
241 |         if fps_condition:
242 |             self.fps_embedding = TimestepEmbedding(model_channels, time_embed_dim)
243 |             nn.init.zeros_(self.fps_embedding.linear_2.weight)
244 |             nn.init.zeros_(self.fps_embedding.linear_2.bias)
245 | 
246 |         # self.cond_embedding = TrajectoryEncoder(
247 |         #         cin=conditioning_channels, time_embed_dim=time_embed_dim, channels=trajectory_channels, nums_rb=3,
248 |         #         dropout=dropout, use_checkpoint=use_checkpoint, tempspatial_aware=tempspatial_aware, temporal_conv=False
249 |         #     )
250 |         self.cond_embedding = zero_module(conv_nd(dims, conditioning_channels, model_channels, 3, padding=1))
251 |         self.input_blocks = nn.ModuleList(
252 |             [
253 |                 TimestepEmbedSequential(conv_nd(dims, in_channels, model_channels, 3, padding=1))
254 |             ]
255 |         )
256 | 
257 |         ## Output Block
258 |         self.downsample_output = nn.ModuleList(
259 |             [
260 |                 nn.Sequential(
261 |                     nn.GroupNorm(32, model_channels),
262 |                     nn.SiLU(),
263 |                     zero_module(conv_nd(dims, model_channels, model_channels, 3, padding=1))
264 |                 )
265 |             ]
266 |         )
267 | 
268 |         if self.addition_attention:
269 |             self.init_attn = TimestepEmbedSequential(
270 |                 TemporalTransformer(
271 |                     model_channels,
272 |                     n_heads=8,
273 |                     d_head=num_head_channels,
274 |                     depth=transformer_depth,
275 |                     context_dim=context_dim,
276 |                     use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only,
277 |                     causal_attention=False, relative_position=use_relative_position,
278 |                     temporal_length=temporal_length
279 |                 )
280 |             )
281 | 
282 |         ch = model_channels
283 |         ds = 1
284 |         for level, mult in enumerate(channel_mult):
285 |             for _ in range(num_res_blocks):
286 |                 layers = [
287 |                     ResBlock(ch, time_embed_dim, dropout,
288 |                         out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
289 |                         use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
290 |                         use_temporal_conv=temporal_conv
291 |                     )
292 |                 ]
293 |                 ch = mult * model_channels
294 |                 if ds in attention_resolutions:
295 |                     if num_head_channels == -1:
296 |                         dim_head = ch // num_heads
297 |                     else:
298 |                         num_heads = ch // num_head_channels
299 |                         dim_head = num_head_channels
300 |                     layers.append(
301 |                         SpatialTransformer(ch, num_heads, dim_head,
302 |                             depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
303 |                             use_checkpoint=use_checkpoint, disable_self_attn=False,
304 |                             video_length=temporal_length, image_cross_attention=self.image_cross_attention,
305 |                             image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable,
306 |                         )
307 |                     )
308 |                     if self.temporal_attention:
309 |                         layers.append(
310 |                             TemporalTransformer(ch, num_heads, dim_head,
311 |                                 depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
312 |                                 use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
313 |                                 causal_attention=use_causal_attention, relative_position=use_relative_position,
314 |                                 temporal_length=temporal_length
315 |                             )
316 |                         )
317 |                 self.input_blocks.append(TimestepEmbedSequential(*layers))
318 |                 self.downsample_output.append(
319 |                     nn.Sequential(
320 |                         nn.GroupNorm(32, ch),
321 |                         nn.SiLU(),
322 |                         zero_module(conv_nd(dims, ch, ch, 3, padding=1))
323 |                     )
324 |                 )
325 |             if level < len(channel_mult) - 1:
326 |                 out_ch = ch
327 |                 self.input_blocks.append(
328 |                     TimestepEmbedSequential(
329 |                         ResBlock(ch, time_embed_dim, dropout,
330 |                             out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
331 |                             use_scale_shift_norm=use_scale_shift_norm,
332 |                             down=True
333 |                         )
334 |                         if resblock_updown
335 |                         else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
336 |                     )
337 |                 )
338 |                 self.downsample_output.append(
339 |                     nn.Sequential(
340 |                         nn.GroupNorm(32, out_ch),
341 |                         nn.SiLU(),
342 |                         zero_module(conv_nd(dims, out_ch, out_ch, 3, padding=1))
343 |                     )
344 |                 )
345 |                 ch = out_ch
346 |                 ds *= 2
347 | 
348 |     def forward(
349 |         self,
350 |         noisy_latents,
351 |         timesteps,
352 |         context_text,
353 |         context_img=None,
354 |         fps=None,
355 |         condition=None,            # [b, t, c, h, w]
356 |     ):
357 |         if self.ignore_noisy_latents:
358 |             noisy_latents = torch.zeros_like(noisy_latents)
359 | 
360 |         b, _, t, height, width = noisy_latents.shape
361 |         t_emb = self.time_proj(timesteps).type(noisy_latents.dtype)
362 |         emb = self.time_embed(t_emb)
363 | 
364 |         ## repeat t times for context [(b t) 77 768] & time embedding
365 |         ## check if we use per-frame image conditioning
366 |         if context_img is not None: ## decompose context into text and image
367 |             context_text = context_text.repeat_interleave(repeats=t, dim=0)
368 |             context_img = rearrange(context_img, 'b (t l) c -> (b t) l c', t=t)
369 |             context = torch.cat([context_text, context_img], dim=1)
370 |         else:
371 |             context = context_text.repeat_interleave(repeats=t, dim=0)
372 |         emb = emb.repeat_interleave(repeats=t, dim=0)
373 | 
374 |         ## always in shape (b n t) c h w, except for temporal layer
375 |         noisy_latents = rearrange(noisy_latents, 'b c t h w -> (b t) c h w')
376 |         condition = rearrange(condition, 'b t c h w -> (b t) c h w')
377 | 
378 |         ## combine emb
379 |         if self.fps_condition:
380 |             if fps is None:
381 |                 fps = torch.tensor(
382 |                     [self.default_fs] * b, dtype=torch.long, device=noisy_latents.device)
383 |             fps_emb = self.time_proj(fps).type(noisy_latents.dtype)
384 | 
385 |             fps_embed = self.fps_embedding(fps_emb)
386 |             fps_embed = fps_embed.repeat_interleave(repeats=t, dim=0)
387 |             emb = emb + fps_embed
388 | 
389 |         h = noisy_latents.type(self.dtype)
390 |         hs = []
391 |         for id, module in enumerate(self.input_blocks):
392 |             h = module(h, emb, context=context, batch_size=b)
393 |             if id == 0:
394 |                 h = h + self.cond_embedding(condition)
395 |                 if self.addition_attention:
396 |                     h = self.init_attn(h, emb, context=context, batch_size=b)
397 |             hs.append(h)
398 | 
399 |         guidance_feature_list = []
400 |         for hidden, module in zip(hs, self.downsample_output):
401 |             h = module(hidden)
402 |             guidance_feature_list.append(h)
403 | 
404 |         return guidance_feature_list
405 | 
406 |     @classmethod
407 |     def from_pretrained(cls, pretrained_model_name_or_path, layer_encoder_additional_kwargs={}, **kwargs):
408 |         cache_dir = kwargs.pop("cache_dir", None)
409 |         force_download = kwargs.pop("force_download", False)
410 |         proxies = kwargs.pop("proxies", None)
411 |         local_files_only = kwargs.pop("local_files_only", None)
412 |         token = kwargs.pop("token", None)
413 |         revision = kwargs.pop("revision", None)
414 |         subfolder = kwargs.pop("subfolder", None)
415 |         variant = kwargs.pop("variant", None)
416 |         use_safetensors = kwargs.pop("use_safetensors", None)
417 | 
418 |         allow_pickle = False
419 |         if use_safetensors is None:
420 |             use_safetensors = True
421 |             allow_pickle = True
422 | 
423 |         # Load config if we don't provide a configuration
424 |         config_path = pretrained_model_name_or_path
425 | 
426 |         user_agent = {
427 |             "diffusers": __version__,
428 |             "file_type": "model",
429 |             "framework": "pytorch",
430 |         }
431 | 
432 |         # load config
433 |         config, unused_kwargs, commit_hash = cls.load_config(
434 |             config_path,
435 |             cache_dir=cache_dir,
436 |             return_unused_kwargs=True,
437 |             return_commit_hash=True,
438 |             force_download=force_download,
439 |             proxies=proxies,
440 |             local_files_only=local_files_only,
441 |             token=token,
442 |             revision=revision,
443 |             subfolder=subfolder,
444 |             user_agent=user_agent,
445 |             **kwargs,
446 |         )
447 | 
448 |         for key, value in layer_encoder_additional_kwargs.items():
449 |             if isinstance(value, (ListConfig, DictConfig)):
450 |                 config[key] = OmegaConf.to_container(value, resolve=True)
451 |             else:
452 |                 config[key] = value
453 | 
454 |         # load model
455 |         model_file = None
456 |         if use_safetensors:
457 |             try:
458 |                 model_file = _get_model_file(
459 |                     pretrained_model_name_or_path,
460 |                     weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
461 |                     cache_dir=cache_dir,
462 |                     force_download=force_download,
463 |                     proxies=proxies,
464 |                     local_files_only=local_files_only,
465 |                     token=token,
466 |                     revision=revision,
467 |                     subfolder=subfolder,
468 |                     user_agent=user_agent,
469 |                     commit_hash=commit_hash,
470 |                 )
471 | 
472 |             except IOError as e:
473 |                 logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
474 |                 if not allow_pickle:
475 |                     raise
476 |                 logger.warning(
477 |                     "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
478 |                 )
479 | 
480 |         if model_file is None:
481 |             model_file = _get_model_file(
482 |                 pretrained_model_name_or_path,
483 |                 weights_name=_add_variant(WEIGHTS_NAME, variant),
484 |                 cache_dir=cache_dir,
485 |                 force_download=force_download,
486 |                 proxies=proxies,
487 |                 local_files_only=local_files_only,
488 |                 token=token,
489 |                 revision=revision,
490 |                 subfolder=subfolder,
491 |                 user_agent=user_agent,
492 |                 commit_hash=commit_hash,
493 |             )
494 | 
495 |         model = cls.from_config(config, **unused_kwargs)
496 |         state_dict = load_state_dict(model_file, variant)
497 | 
498 |         missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
499 |         print(f"Controlnet loaded from {model_file} with {len(missing_keys)} missing keys and {len(unexpected_keys)} unexpected keys.")
500 |         return model


--------------------------------------------------------------------------------
/lvdm/models/layer_controlnet.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, List, Optional, Tuple, Union
  2 | from einops import rearrange, repeat
  3 | import numpy as np
  4 | from functools import partial
  5 | import torch
  6 | from torch import nn
  7 | from torch.nn import functional as F
  8 | from .unet import TimestepEmbedSequential, ResBlock, Downsample, Upsample, TemporalConvBlock
  9 | from ..basics import zero_module, conv_nd
 10 | from ..modules.attention import SpatialTransformer, TemporalTransformer
 11 | from ..common import checkpoint
 12 | 
 13 | from diffusers import __version__
 14 | from diffusers.configuration_utils import ConfigMixin, register_to_config
 15 | from diffusers.models.modeling_utils import ModelMixin
 16 | from diffusers.models.embeddings import TimestepEmbedding, Timesteps
 17 | from diffusers.models.model_loading_utils import load_state_dict
 18 | from diffusers.utils import (
 19 |     SAFETENSORS_WEIGHTS_NAME,
 20 |     WEIGHTS_NAME,
 21 |     logging,
 22 |     _get_model_file,
 23 |     _add_variant
 24 | )
 25 | from omegaconf import ListConfig, DictConfig, OmegaConf
 26 | 
 27 | 
# Module-level logger, created through the `logging` helper imported from `diffusers.utils`.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 29 | 
 30 | 
 31 | class ControlNetConditioningEmbedding(nn.Module):
 32 |     """
 33 |     Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
 34 |     [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
 35 |     training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
 36 |     convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
 37 |     (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
 38 |     model) to encode image-space conditions ... into feature maps ..."
 39 |     """
 40 | 
 41 |     def __init__(
 42 |         self,
 43 |         conditioning_embedding_channels: int,
 44 |         conditioning_channels: int = 3,
 45 |         block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
 46 |     ):
 47 |         super().__init__()
 48 | 
 49 |         self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
 50 | 
 51 |         self.blocks = nn.ModuleList([])
 52 | 
 53 |         for i in range(len(block_out_channels) - 1):
 54 |             channel_in = block_out_channels[i]
 55 |             channel_out = block_out_channels[i + 1]
 56 |             self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
 57 |             self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
 58 | 
 59 |         self.conv_out = zero_module(
 60 |             nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
 61 |         )
 62 | 
 63 |     def forward(self, conditioning):
 64 |         embedding = self.conv_in(conditioning)
 65 |         embedding = F.silu(embedding)
 66 | 
 67 |         for block in self.blocks:
 68 |             embedding = block(embedding)
 69 |             embedding = F.silu(embedding)
 70 | 
 71 |         embedding = self.conv_out(embedding)
 72 | 
 73 |         return embedding
 74 | 
 75 | 
 76 | class LayerControlNet(ModelMixin, ConfigMixin):
 77 |     _supports_gradient_checkpointing = True
 78 | 
    @register_to_config
    def __init__(
        self,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0.0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        context_dim=None,
        use_scale_shift_norm=False,
        resblock_updown=False,
        num_heads=-1,
        num_head_channels=-1,
        transformer_depth=1,
        use_linear=False,
        use_checkpoint=False,
        temporal_conv=False,
        tempspatial_aware=False,
        temporal_attention=True,
        use_relative_position=True,
        use_causal_attention=False,
        temporal_length=None,
        addition_attention=False,
        temporal_selfatt_only=True,
        image_cross_attention=False,
        image_cross_attention_scale_learnable=False,
        default_fps=4,
        fps_condition=False,
        ignore_noisy_latents=True,
        condition_channels={},
        control_injection_mode='add',
        use_vae_for_trajectory=False,
    ):
        """Build the encoder: time/fps embeddings, condition embeddings and input blocks.

        Args:
            in_channels: Input channels of the stem convolution (the first entry
                of ``self.input_blocks``).
            model_channels: Base width; the time embedding width is
                ``model_channels * 4`` and level widths are
                ``mult * model_channels``.
            out_channels: Stored on the instance; not otherwise used here.
            num_res_blocks: Number of ResBlock groups per level.
            attention_resolutions: Downsample factors at which a
                SpatialTransformer (and optionally a TemporalTransformer) is
                appended after the ResBlock.
            channel_mult: Per-level width multipliers.
            num_heads / num_head_channels: At least one must differ from -1.
                When ``num_head_channels`` is set, the head count is derived
                per level as ``ch // num_head_channels``.
            temporal_selfatt_only: Only forwarded to ``init_attn``; the
                TemporalTransformers inside the input blocks use a hard-coded
                ``True`` instead.
            addition_attention: When true, an extra TemporalTransformer
                (``init_attn``) is created.
            fps_condition: When true, a zero-initialised fps embedding is
                created.
            condition_channels: Non-empty mapping from condition name to its
                input channel count. Recognised keys are ``"motion_score"``,
                ``"sketch"`` and ``"trajectory"``.
            control_injection_mode: ``'add'`` (embeddings are projected to
                ``model_channels``) or ``'concat'`` (embeddings keep their own
                channel count).
            use_vae_for_trajectory: Selects a single zero-initialised
                convolution instead of ``ControlNetConditioningEmbedding`` for
                the trajectory condition.

            The remaining arguments are stored on the instance and/or forwarded
            unchanged to ``ResBlock``, ``Downsample``, ``SpatialTransformer``
            and ``TemporalTransformer``.

        Raises:
            AssertionError: If both ``num_heads`` and ``num_head_channels`` are
                -1, or if ``condition_channels`` is empty.
            ValueError: If a recognised condition is present and
                ``control_injection_mode`` is neither ``'add'`` nor ``'concat'``.
        """
        super().__init__()
        # At least one of the two head specifications must be provided.
        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.temporal_attention = temporal_attention
        time_embed_dim = model_channels * 4
        self.use_checkpoint = use_checkpoint
        # NOTE(review): hard-coded local, distinct from the `temporal_selfatt_only`
        # argument; it is the value used by the per-level TemporalTransformers below.
        temporal_self_att_only = True
        self.addition_attention = addition_attention
        self.temporal_length = temporal_length
        self.image_cross_attention = image_cross_attention
        self.image_cross_attention_scale_learnable = image_cross_attention_scale_learnable
        self.default_fps = default_fps
        self.fps_condition = fps_condition
        self.ignore_noisy_latents = ignore_noisy_latents
        # The `{}` default therefore always fails: callers must pass at least one condition.
        assert len(condition_channels) > 0, 'Condition types must be specified'
        self.condition_channels = condition_channels
        self.control_injection_mode = control_injection_mode
        self.use_vae_for_trajectory = use_vae_for_trajectory

        ## Time embedding blocks
        self.time_proj = Timesteps(model_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.time_embed = TimestepEmbedding(model_channels, time_embed_dim)

        if fps_condition:
            self.fps_embedding = TimestepEmbedding(model_channels, time_embed_dim)
            # Zero the last linear layer so the fps embedding starts out as all zeros.
            nn.init.zeros_(self.fps_embedding.linear_2.weight)
            nn.init.zeros_(self.fps_embedding.linear_2.bias)

        ## Condition embeddings
        # 'add' mode projects each condition to `model_channels` so it can be summed
        # with the stem output in forward(); 'concat' mode keeps the condition's own
        # channel count because the result is concatenated to the stem input instead.
        if "motion_score" in condition_channels:
            if control_injection_mode == 'add':
                self.motion_embedding = zero_module(conv_nd(dims, condition_channels["motion_score"], model_channels, 3, padding=1))
            elif control_injection_mode == 'concat':
                self.motion_embedding = zero_module(conv_nd(dims, condition_channels["motion_score"], condition_channels["motion_score"], 3, padding=1))
            else:
                raise ValueError(f"control_injection_mode {control_injection_mode} is not supported, use 'add' or 'concat'")
        if "sketch" in condition_channels:
            if control_injection_mode == 'add':
                self.sketch_embedding = zero_module(conv_nd(dims, condition_channels["sketch"], model_channels, 3, padding=1))
            elif control_injection_mode == 'concat':
                self.sketch_embedding = zero_module(conv_nd(dims, condition_channels["sketch"], condition_channels["sketch"], 3, padding=1))
            else:
                raise ValueError(f"control_injection_mode {control_injection_mode} is not supported, use 'add' or 'concat'")
        if "trajectory" in condition_channels:
            # Without the VAE path the trajectory goes through the strided
            # ControlNetConditioningEmbedding (presumably because it arrives at a
            # higher spatial resolution than the latents -- confirm with caller).
            if control_injection_mode == 'add':
                if use_vae_for_trajectory:
                    self.trajectory_embedding = zero_module(conv_nd(dims, condition_channels["trajectory"], model_channels, 3, padding=1))
                else:
                    self.trajectory_embedding = ControlNetConditioningEmbedding(model_channels, condition_channels["trajectory"])
            elif control_injection_mode == 'concat':
                if use_vae_for_trajectory:
                    self.trajectory_embedding = zero_module(conv_nd(dims, condition_channels["trajectory"], condition_channels["trajectory"], 3, padding=1))
                else:
                    self.trajectory_embedding = ControlNetConditioningEmbedding(condition_channels["trajectory"], condition_channels["trajectory"])
            else:
                raise ValueError(f"control_injection_mode {control_injection_mode} is not supported, use 'add' or 'concat'")

        ## Stem convolution: first entry of the input blocks
        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(conv_nd(dims, in_channels, model_channels, 3, padding=1))
            ]
        )

        ## Optional temporal attention applied right after the stem in forward()
        if self.addition_attention:
            self.init_attn = TimestepEmbedSequential(
                TemporalTransformer(
                    model_channels,
                    n_heads=8,
                    d_head=num_head_channels,
                    depth=transformer_depth,
                    context_dim=context_dim,
                    use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only,
                    causal_attention=False, relative_position=use_relative_position,
                    temporal_length=temporal_length
                )
            )

        ## Encoder levels
        # `ch` tracks the current channel width, `ds` the accumulated downsample factor.
        ch = model_channels
        ds = 1
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(ch, time_embed_dim, dropout,
                        out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
                        use_temporal_conv=temporal_conv
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    # Derive whichever of head count / head width was not given.
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    layers.append(
                        SpatialTransformer(ch, num_heads, dim_head,
                            depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                            use_checkpoint=use_checkpoint, disable_self_attn=False,
                            video_length=temporal_length, image_cross_attention=self.image_cross_attention,
                            image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable,
                        )
                    )
                    if self.temporal_attention:
                        layers.append(
                            TemporalTransformer(ch, num_heads, dim_head,
                                depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                                use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
                                causal_attention=use_causal_attention, relative_position=use_relative_position,
                                temporal_length=temporal_length
                            )
                        )
                self.input_blocks.append(TimestepEmbedSequential(*layers))

            # Every level except the last ends with a downsampling block.
            if level < len(channel_mult) - 1:
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(ch, time_embed_dim, dropout,
                            out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True
                        )
                        if resblock_updown
                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                    )
                )
                ch = out_ch
                ds *= 2
255 | 
256 |     def forward(
257 |         self,
258 |         noisy_latents,
259 |         timesteps,
260 |         context_text,
261 |         context_img=None,
262 |         fps=None,
263 |         layer_latents=None,     # [b, n_layer, t, c, h, w]
264 |         layer_latent_mask=None, # [b, n_layer, t, 1, h, w]
265 |         motion_scores=None,     # [b, n_layer]
266 |         sketch=None,            # [b, n_layer, t, c, h, w]
267 |         trajectory=None,        # [b, n_layer, t, c, h, w]
268 |     ):
269 |         if self.ignore_noisy_latents:
270 |             noisy_latents_shape = list(noisy_latents.shape)
271 |             noisy_latents_shape[1] = 0
272 |             noisy_latents = torch.zeros(noisy_latents_shape, device=noisy_latents.device, dtype=noisy_latents.dtype)
273 | 
274 |         b, _, t, height, width = noisy_latents.shape
275 |         n_layer = layer_latents.shape[1]
276 |         t_emb = self.time_proj(timesteps).type(noisy_latents.dtype)
277 |         emb = self.time_embed(t_emb)
278 | 
279 |         ## repeat t times for context [(b t) 77 768] & time embedding
280 |         ## check if we use per-frame image conditioning
281 |         if context_img is not None: ## decompose context into text and image
282 |             context_text = repeat(context_text, 'b l c -> (b n t) l c', n=n_layer, t=t)
283 |             context_img = repeat(context_img, 'b tl c -> b n tl c', n=n_layer)
284 |             context_img = rearrange(context_img, 'b n (t l) c -> (b n t) l c', t=t)
285 |             context = torch.cat([context_text, context_img], dim=1)
286 |         else:
287 |             context = repeat(context_text, 'b l c -> (b n t) l c', n=n_layer, t=t)
288 |         emb = repeat(emb, 'b c -> (b n t) c', n=n_layer, t=t)
289 | 
290 |         ## always in shape (b n t) c h w, except for temporal layer
291 |         noisy_latents = repeat(noisy_latents, 'b c t h w -> (b n t) c h w', n=n_layer)
292 | 
293 |         ## combine emb
294 |         if self.fps_condition:
295 |             if fps is None:
296 |                 fps = torch.tensor(
297 |                     [self.default_fs] * b, dtype=torch.long, device=noisy_latents.device)
298 |             fps_emb = self.time_proj(fps).type(noisy_latents.dtype)
299 | 
300 |             fps_embed = self.fps_embedding(fps_emb)
301 |             fps_embed = repeat(fps_embed, 'b c -> (b n t) c', n=n_layer, t=t)
302 |             emb = emb + fps_embed
303 | 
304 |         ## process conditions
305 |         layer_condition = torch.cat([layer_latents, layer_latent_mask], dim=3)
306 |         layer_condition = rearrange(layer_condition, 'b n t c h w -> (b n t) c h w')
307 |         h = torch.cat([noisy_latents, layer_condition], dim=1)
308 | 
309 |         if "motion_score" in self.condition_channels:
310 |             motion_condition = repeat(motion_scores, 'b n -> b n t 1 h w', t=t, h=height, w=width)
311 |             motion_condition = torch.cat([motion_condition, layer_latent_mask], dim=3)
312 |             motion_condition = rearrange(motion_condition, 'b n t c h w -> (b n t) c h w')
313 |             motion_condition = self.motion_embedding(motion_condition)
314 |             if self.control_injection_mode == 'concat':
315 |                 h = torch.cat([h, motion_condition], dim=1)
316 | 
317 |         if "sketch" in self.condition_channels:
318 |             sketch_condition = rearrange(sketch, 'b n t c h w -> (b n t) c h w')
319 |             sketch_condition = self.sketch_embedding(sketch_condition)
320 |             if self.control_injection_mode == 'concat':
321 |                 h = torch.cat([h, sketch_condition], dim=1)
322 | 
323 |         if "trajectory" in self.condition_channels:
324 |             traj_condition = rearrange(trajectory, 'b n t c h w -> (b n t) c h w')
325 |             traj_condition = self.trajectory_embedding(traj_condition)
326 |             if self.control_injection_mode == 'concat':
327 |                 h = torch.cat([h, traj_condition], dim=1)
328 | 
329 |         layer_features = []
330 |         for id, module in enumerate(self.input_blocks):
331 |             h = module(h, emb, context=context, batch_size=b*n_layer)
332 |             if id == 0:
333 |                 if self.control_injection_mode == 'add':
334 |                     if "motion_score" in self.condition_channels:
335 |                         h = h + motion_condition
336 |                     if "sketch" in self.condition_channels:
337 |                         h = h + sketch_condition
338 |                     if "trajectory" in self.condition_channels:
339 |                         h = h + traj_condition
340 |                 if self.addition_attention:
341 |                     h = self.init_attn(h, emb, context=context, batch_size=b*n_layer)
342 |             if SpatialTransformer in [type(m) for m in module]:
343 |                 layer_features.append(rearrange(h, '(b n t) c h w -> b n t c h w', b=b, n=n_layer))
344 | 
345 |         return layer_features
346 | 
347 |     @classmethod
348 |     def from_pretrained(cls, pretrained_model_name_or_path, layer_controlnet_additional_kwargs={}, **kwargs):
349 |         cache_dir = kwargs.pop("cache_dir", None)
350 |         force_download = kwargs.pop("force_download", False)
351 |         proxies = kwargs.pop("proxies", None)
352 |         local_files_only = kwargs.pop("local_files_only", None)
353 |         token = kwargs.pop("token", None)
354 |         revision = kwargs.pop("revision", None)
355 |         subfolder = kwargs.pop("subfolder", None)
356 |         variant = kwargs.pop("variant", None)
357 |         use_safetensors = kwargs.pop("use_safetensors", None)
358 | 
359 |         allow_pickle = False
360 |         if use_safetensors is None:
361 |             use_safetensors = True
362 |             allow_pickle = True
363 | 
364 |         # Load config if we don't provide a configuration
365 |         config_path = pretrained_model_name_or_path
366 | 
367 |         user_agent = {
368 |             "diffusers": __version__,
369 |             "file_type": "model",
370 |             "framework": "pytorch",
371 |         }
372 | 
373 |         # load config
374 |         config, unused_kwargs, commit_hash = cls.load_config(
375 |             config_path,
376 |             cache_dir=cache_dir,
377 |             return_unused_kwargs=True,
378 |             return_commit_hash=True,
379 |             force_download=force_download,
380 |             proxies=proxies,
381 |             local_files_only=local_files_only,
382 |             token=token,
383 |             revision=revision,
384 |             subfolder=subfolder,
385 |             user_agent=user_agent,
386 |             **kwargs,
387 |         )
388 | 
389 |         for key, value in layer_controlnet_additional_kwargs.items():
390 |             if isinstance(value, (ListConfig, DictConfig)):
391 |                 config[key] = OmegaConf.to_container(value, resolve=True)
392 |             else:
393 |                 config[key] = value
394 | 
395 |         # load model
396 |         model_file = None
397 |         if use_safetensors:
398 |             try:
399 |                 model_file = _get_model_file(
400 |                     pretrained_model_name_or_path,
401 |                     weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
402 |                     cache_dir=cache_dir,
403 |                     force_download=force_download,
404 |                     proxies=proxies,
405 |                     local_files_only=local_files_only,
406 |                     token=token,
407 |                     revision=revision,
408 |                     subfolder=subfolder,
409 |                     user_agent=user_agent,
410 |                     commit_hash=commit_hash,
411 |                 )
412 | 
413 |             except IOError as e:
414 |                 logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
415 |                 if not allow_pickle:
416 |                     raise
417 |                 logger.warning(
418 |                     "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
419 |                 )
420 | 
421 |         if model_file is None:
422 |             model_file = _get_model_file(
423 |                 pretrained_model_name_or_path,
424 |                 weights_name=_add_variant(WEIGHTS_NAME, variant),
425 |                 cache_dir=cache_dir,
426 |                 force_download=force_download,
427 |                 proxies=proxies,
428 |                 local_files_only=local_files_only,
429 |                 token=token,
430 |                 revision=revision,
431 |                 subfolder=subfolder,
432 |                 user_agent=user_agent,
433 |                 commit_hash=commit_hash,
434 |             )
435 | 
436 |         model = cls.from_config(config, **unused_kwargs)
437 |         state_dict = load_state_dict(model_file, variant)
438 | 
439 |         if state_dict['input_blocks.0.0.weight'].shape[1] != model.input_blocks[0][0].weight.shape[1]:
440 |             state_dict.pop('input_blocks.0.0.weight')
441 | 
442 |         missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
443 |         print(f"LayerControlNet loaded from {model_file} with {len(missing_keys)} missing keys and {len(unexpected_keys)} unexpected keys.")
444 |         return model


--------------------------------------------------------------------------------
/lvdm/utils.py:
--------------------------------------------------------------------------------
  1 | import importlib
  2 | import numpy as np
  3 | import cv2
  4 | import torch
  5 | import torch.distributed as dist
  6 | import os
  7 | from einops import rearrange
  8 | import imageio
  9 | import torchvision
 10 | from PIL import Image
 11 | import io
 12 | from matplotlib import pyplot as plt
 13 | 
 14 | 
 15 | RY = 15
 16 | YG = 6
 17 | GC = 4
 18 | CB = 11
 19 | BM = 13
 20 | MR = 6
 21 | 
 22 | COLORWHEEL = torch.zeros((RY + YG + GC + CB + BM + MR, 3))
 23 | col = 0
 24 | 
 25 | # RY
 26 | COLORWHEEL[0:RY, 0] = 255
 27 | COLORWHEEL[0:RY, 1] = torch.floor(255 * torch.arange(0, RY) / RY)
 28 | col = col + RY
 29 | # YG
 30 | COLORWHEEL[col:col + YG, 0] = 255 - torch.floor(255 * torch.arange(0, YG) / YG)
 31 | COLORWHEEL[col:col + YG, 1] = 255
 32 | col = col + YG
 33 | # GC
 34 | COLORWHEEL[col:col + GC, 1] = 255
 35 | COLORWHEEL[col:col + GC, 2] = torch.floor(255 * torch.arange(0, GC) / GC)
 36 | col = col + GC
 37 | # CB
 38 | COLORWHEEL[col:col + CB, 1] = 255 - torch.floor(255 * torch.arange(CB) / CB)
 39 | COLORWHEEL[col:col + CB, 2] = 255
 40 | col = col + CB
 41 | # BM
 42 | COLORWHEEL[col:col + BM, 2] = 255
 43 | COLORWHEEL[col:col + BM, 0] = torch.floor(255 * torch.arange(0, BM) / BM)
 44 | col = col + BM
 45 | # MR
 46 | COLORWHEEL[col:col + MR, 2] = 255 - torch.floor(255 * torch.arange(MR) / MR)
 47 | COLORWHEEL[col:col + MR, 0] = 255
 48 | 
 49 | 
 50 | def count_params(model, verbose=False):
 51 |     total_params = sum(p.numel() for p in model.parameters())
 52 |     if verbose:
 53 |         print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
 54 |     return total_params
 55 | 
 56 | 
 57 | def check_istarget(name, para_list):
 58 |     """
 59 |     name: full name of source para
 60 |     para_list: partial name of target para
 61 |     """
 62 |     istarget=False
 63 |     for para in para_list:
 64 |         if para in name:
 65 |             return True
 66 |     return istarget
 67 | 
 68 | 
 69 | def instantiate_from_config(config):
 70 |     if not "target" in config:
 71 |         if config == '__is_first_stage__':
 72 |             return None
 73 |         elif config == "__is_unconditional__":
 74 |             return None
 75 |         raise KeyError("Expected key `target` to instantiate.")
 76 |     return get_obj_from_str(config["target"])(**config.get("params", dict()))
 77 | 
 78 | 
 79 | def get_obj_from_str(string, reload=False):
 80 |     module, cls = string.rsplit(".", 1)
 81 |     if reload:
 82 |         module_imp = importlib.import_module(module)
 83 |         importlib.reload(module_imp)
 84 |     return getattr(importlib.import_module(module, package=None), cls)
 85 | 
 86 | 
 87 | def load_npz_from_dir(data_dir):
 88 |     data = [np.load(os.path.join(data_dir, data_name))['arr_0'] for data_name in os.listdir(data_dir)]
 89 |     data = np.concatenate(data, axis=0)
 90 |     return data
 91 | 
 92 | 
 93 | def load_npz_from_paths(data_paths):
 94 |     data = [np.load(data_path)['arr_0'] for data_path in data_paths]
 95 |     data = np.concatenate(data, axis=0)
 96 |     return data
 97 | 
 98 | 
 99 | def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
100 |     h, w = image.shape[:2]
101 |     if resize_short_edge is not None:
102 |         k = resize_short_edge / min(h, w)
103 |     else:
104 |         k = max_resolution / (h * w)
105 |         k = k**0.5
106 |     h = int(np.round(h * k / 64)) * 64
107 |     w = int(np.round(w * k / 64)) * 64
108 |     image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
109 |     return image
110 | 
111 | 
def setup_dist(args):
    """Initialise the NCCL process group once, binding this process to ``args.local_rank``.

    Does nothing when a process group is already initialised.
    """
    if not dist.is_initialized():
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group('nccl', init_method='env://')
120 | 
121 | 
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
    """Tile a batch of videos into one grid image per time step and write them as a video.

    Args:
        videos: Tensor laid out as ``b c t h w``.
        path: Output file path; its parent directory is created when it has one.
        rescale: If True, map values from [-1, 1] to [0, 1] before the 8-bit conversion.
        n_rows: Forwarded to ``torchvision.utils.make_grid`` as ``nrow``.
        fps: Frame rate forwarded to ``imageio.mimsave``.
    """
    videos = rearrange(videos, "b c t h w -> t b c h w")
    outputs = []
    for x in videos:
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        if rescale:
            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
        # .cpu() is a no-op on CPU tensors and lets device tensors reach numpy.
        x = (x * 255).cpu().numpy().astype(np.uint8)
        outputs.append(x)

    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    imageio.mimsave(path, outputs, fps=fps)
135 | 
def save_images_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6):
    """Write one PNG grid per time step of a ``b c t h w`` video batch into directory ``path``.

    Files are named ``0000.png``, ``0001.png``, ... by time index. With
    ``rescale=True`` values are mapped from [-1, 1] to [0, 1] before the 8-bit
    conversion.
    """
    frames = rearrange(videos, "b c t h w -> t b c h w")
    os.makedirs(path, exist_ok=True)
    for time_idx, frame_batch in enumerate(frames):
        grid = torchvision.utils.make_grid(frame_batch, nrow=n_rows)
        # channel-first -> channel-last for PIL
        grid = grid.permute(1, 2, 0).squeeze(-1)
        if rescale:
            grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1
        pixels = (grid * 255).numpy().astype(np.uint8)
        Image.fromarray(pixels).save(os.path.join(path, f"{time_idx:04d}.png"))
147 | 
def save_image_with_mask(image: torch.Tensor, masks: torch.Tensor, path: str, rescale=False, alpha=0.6):
    """Overlay each mask on the image in its own colour and save the composite.

    Args:
        image: Tensor laid out as ``[C, H, W]``.
        masks: Tensor laid out as ``[N, H, W]``; each mask multiplies its overlay
            colour and opacity per pixel.
        path: Output file path; its parent directory is created when it has one.
        rescale: If True, map the image from [-1, 1] to [0, 1] first.
        alpha: Opacity factor of the overlay colours.
    """
    # image: [C, H, W], mask: [N, H, W]
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    image = rearrange(image, "c h w -> h w c")
    if rescale:
        image = (image + 1.0) / 2.0 # -1,1 -> 0,1
    # .cpu() mirrors the handling of `masks` below; it is a no-op on CPU tensors.
    image = (image * 255).cpu().numpy().astype(np.uint8)
    final_image = Image.fromarray(image).convert("RGBA")
    cmap = plt.get_cmap("tab20c")
    masks = masks.cpu().numpy().astype(np.float32)
    for i, img in enumerate(masks):
        # Every mask takes colormap entry i * 4 + 2 with `alpha` as its opacity.
        mask_color = np.array([*cmap(i * 4 + 2)[:3], alpha])
        mask = img[:,:,None] * mask_color[None,None,:] * 255
        mask = mask.astype(np.uint8)
        mask = Image.fromarray(mask).convert("RGBA")
        final_image = Image.alpha_composite(final_image, mask)
    final_image.save(path)
165 | 
def save_videos_with_heatmap(videos: torch.Tensor, trajectory: torch.Tensor, path: str, n_rows=6, fps=8):
    """Alpha-composite a trajectory heatmap over a video grid and write it as a video.

    Args:
        videos: Tensor laid out as ``b c t h w``.
        trajectory: Tensor laid out as ``b c t h w``; the channel mean of its grid
            is used as the alpha channel of the overlay.
        path: Output file path; its parent directory is created when it has one.
        n_rows: Forwarded to ``torchvision.utils.make_grid`` as ``nrow`` for both grids.
        fps: Frame rate forwarded to ``imageio.mimsave``.
    """
    # use Image RGBA and alpha_composite to combine video and trajectory
    # use imageio to save video
    videos = rearrange(videos, "b c t h w -> t b c h w")
    trajectory = rearrange(trajectory, "b c t h w -> t b c h w")
    outputs = []
    for x, y in zip(videos, trajectory):
        # Honour `n_rows` (previously hard-coded to its default of 6).
        x = torchvision.utils.make_grid(x, nrow=n_rows)
        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
        x = (x * 255).cpu().numpy().astype(np.uint8)
        y = torchvision.utils.make_grid(y, nrow=n_rows)
        y = y.transpose(0, 1).transpose(1, 2).squeeze(-1)
        # Append the channel mean as a fourth (alpha) channel.
        y = torch.cat([y, torch.mean(y, dim=-1, keepdim=True)], dim=-1)
        y = (y * 255).cpu().numpy().astype(np.uint8)
        x = Image.fromarray(x).convert("RGBA")
        y = Image.fromarray(y)
        x = Image.alpha_composite(x, y)
        outputs.append(x)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    imageio.mimsave(path, outputs, fps=fps)
186 | 
def save_videos_with_traj(videos: torch.Tensor, trajectory: torch.Tensor, path: str, rescale=False, fps=8, line_width=3, circle_radius=5):
    """Draw point trajectories on every frame of a video and write the result.

    For each frame, the path travelled so far by every point is drawn as line
    segments and the point's current position as a filled circle.

    Args:
        videos: Tensor laid out as ``[C, F, H, W]``.
        trajectory: Tensor laid out as ``[F, N, 2]``; the two values are passed to
            OpenCV as the point coordinates after truncation to int.
        path: Output file path; its parent directory is created when it has one.
        rescale: If True, map values from [-1, 1] to [0, 1] first.
        fps: Frame rate forwarded to ``imageio.mimsave``.
        line_width: Thickness of the history segments.
        circle_radius: Radius of the current-position marker.
    """
    # videos: [C, F, H, W]
    # trajectory: [F, N, 2]
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    videos = rearrange(videos, "c f h w -> f h w c")
    if rescale:
        videos = (videos + 1) / 2
    # .cpu() is a no-op on CPU tensors and lets device tensors reach numpy.
    videos = (videos * 255).cpu().numpy().astype(np.uint8)
    outputs = []
    for frame_idx, img in enumerate(videos):
        # img: [H, W, C], traj: [N, 2]
        # draw trajectory use cv2.line
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        for traj_idx in range(trajectory.shape[1]):
            # Segments between consecutive positions up to the current frame.
            for history_idx in range(frame_idx):
                cv2.line(img, tuple(trajectory[history_idx, traj_idx].int().tolist()), tuple(trajectory[history_idx+1, traj_idx].int().tolist()), (0, 0, 255), line_width)
            cv2.circle(img, tuple(trajectory[frame_idx, traj_idx].int().tolist()), circle_radius, (100, 230, 160), -1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        outputs.append(img)
    imageio.mimsave(path, outputs, fps=fps)
207 | 
def save_layer_prompts_video(videos, layer_masks, motion_scores, flow_maps, path, alpha=0.6, fps=8, flow_step=10, flow_scale=1.0):
    """Render a 2x2 debug video: frame, frame with layer masks, masks only, flow arrows.

    Each output frame is a ``(2w, 2h)`` RGBA canvas:
      * top-left: the input frame,
      * top-right: the frame with the coloured layer masks composited on top,
      * bottom-left: the coloured layer masks alone,
      * bottom-right: a matplotlib quiver plot of the flow sampled every ``flow_step`` pixels.

    Masks are only drawn on keyframes; other frames get a fully transparent overlay.

    Args:
        videos: Tensor ``[F, C, H, W]``; converted with ``(x + 1) / 2 * 255``.
        layer_masks: Tensor ``[N, K, H, W]`` where ``K`` is the number of keyframes.
        motion_scores: Unused by this function.
        flow_maps: Tensor ``[F, 2, H, W]``; channel 0 / 1 are plotted as x / y components.
        path: Output file path; its parent directory is created when it has one.
        alpha: Opacity factor of the mask colours.
        fps: Frame rate forwarded to ``imageio.mimsave``.
        flow_step: Sampling stride of the quiver grid.
        flow_scale: Multiplier applied to the flow before plotting.
    """
    # videos: [F, C, H, W]
    # layer_masks: [N, F, H, W]
    # motion_scores: [N, ]
    # flow_maps: [F, 2, H, W]
    frame_length = videos.shape[0]
    h, w = videos.shape[-2:]
    n_keyframes = layer_masks.shape[1]
    # One keyframe -> first frame only; two -> first and last frame;
    # otherwise mask index k is drawn on frame k.
    if n_keyframes == 1:
        keyframe_indices = [0]
    elif n_keyframes == 2:
        keyframe_indices = [0, frame_length - 1]
    else:
        keyframe_indices = list(range(n_keyframes))
    videos = rearrange(videos, "t c h w -> t h w c")
    # .cpu() is a no-op on CPU tensors and lets device tensors reach numpy.
    videos = ((videos + 1) / 2 * 255).clamp(0, 255).cpu().numpy().astype(np.uint8)
    layer_masks = layer_masks.cpu().numpy()
    flow_maps = flow_maps.float().cpu().numpy()
    frame_list = []
    cmap = plt.get_cmap("tab10")
    for frame_idx in range(frame_length):
        output_frame = Image.new("RGBA", (w * 2, h * 2))
        frame = Image.fromarray(videos[frame_idx]).convert("RGBA")
        frame_mask = None
        output_frame.paste(frame, (0, 0))
        for layer_idx, layer_mask in enumerate(layer_masks):
            if frame_idx in keyframe_indices:
                layer_color = (np.array([*cmap(layer_idx)[:3], alpha]) * 255).astype(np.uint8)
                # The last video frame always uses the last stored mask, which is
                # what makes the two-keyframe case (indices 0 and F-1) work.
                if frame_idx == frame_length - 1:
                    mask_with_color = Image.fromarray(layer_mask[-1, :, :, np.newaxis] * layer_color[np.newaxis, np.newaxis, :])
                else:
                    mask_with_color = Image.fromarray(layer_mask[frame_idx, :, :, np.newaxis] * layer_color[np.newaxis, np.newaxis, :])
            else:
                mask_with_color = Image.fromarray(np.zeros((h, w, 4), dtype=np.uint8))
            frame = Image.alpha_composite(frame, mask_with_color)
            frame_mask = Image.alpha_composite(frame_mask, mask_with_color) if frame_mask is not None else mask_with_color
        output_frame.paste(frame, (w, 0))
        output_frame.paste(frame_mask, (0, h))
        flow_x = flow_maps[frame_idx, 0] * flow_scale
        flow_y = flow_maps[frame_idx, 1] * flow_scale
        x, y = np.arange(0, w, step=flow_step), np.arange(0, h, step=flow_step)
        X, Y = np.meshgrid(x, y)
        U, V = flow_x[::flow_step, ::flow_step], flow_y[::flow_step, ::flow_step]
        plt.figure()
        plt.gca().set_facecolor('white')
        plt.quiver(X, Y, U, V, color='black', angles='xy', scale_units='xy', scale=1)
        plt.xlim(0, w)
        # Inverted y-limits put the origin at the top, like image coordinates.
        plt.ylim(h, 0)
        plt.gca().set_xticks([])
        plt.gca().set_yticks([])
        # Render the figure into memory and read it back as a PIL image.
        buf = io.BytesIO()
        plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
        buf.seek(0)
        flow = Image.open(buf).convert("RGBA")
        output_frame.paste(flow, (w, h))
        plt.close()
        frame_list.append(output_frame)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    imageio.mimsave(path, frame_list, fps=fps)
267 | 
def flow_uv_to_colors(u, v, rad, convert_to_bgr=False):
    """
    Colour flow components u and v with the flow colour wheel.

    The flow direction selects a position on the wheel (linearly interpolated
    between the two neighbouring entries); ``rad`` controls how far the colour
    is pulled away from white.

    According to the C++ source code of Daniel Scharstein
    According to the Matlab source code of Deqing Sun

    Args:
        u (torch.tensor): Input horizontal flow of shape [N,H,W]
        v (torch.tensor): Input vertical flow of shape [N,H,W]
        rad (torch.tensor): Per-pixel magnitude, indexed with the same masks as the
            per-pixel colours. Where ``rad <= 1`` the colour is blended towards
            white by ``1 - rad``; elsewhere it is dimmed by a factor 0.75.
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        torch.tensor: Flow visualization image of shape [N,3,H,W]
    """
    flow_image = torch.zeros((u.shape[0], 3, u.shape[1], u.shape[2]), dtype=torch.uint8, device=u.device)
    wheel = COLORWHEEL.to(u.device)
    n_colors = wheel.shape[0]

    # Direction in [-1, 1], then mapped to a fractional wheel position.
    angle = torch.arctan2(-v, -u) / np.pi
    position = (angle + 1) / 2 * (n_colors - 1)
    lower = torch.floor(position).int()
    upper = lower + 1
    upper[upper == n_colors] = 0  # wrap around the wheel
    frac = position - lower

    # The saturation masks do not depend on the channel, so build them once.
    in_range = rad <= 1
    out_of_range = ~in_range

    for channel in range(wheel.shape[1]):
        channel_values = wheel[:, channel]
        shade = (1 - frac) * (channel_values[lower] / 255.0) + frac * (channel_values[upper] / 255.0)
        shade[in_range] = 1 - rad[in_range] * (1 - shade[in_range])
        shade[out_of_range] = shade[out_of_range] * 0.75  # out of range
        # Note the 2-i => BGR instead of RGB
        target_channel = 2 - channel if convert_to_bgr else channel
        flow_image[:, target_channel, :, :] = torch.floor(255 * shade)
    return flow_image
304 | 
def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Adapted from Tora: https://github.com/alibaba/Tora/blob/14db1b0a074284a6c265564eef07f5320911dc00/sat/utils/flow_utils.py#L120
    Turn a batch of two-channel flow fields into colour images.

    Args:
        flow_uv (torch.Tensor): Flow UV image of shape [N,2,H,W]
        clip_flow (float, optional): Clip maximum of flow values. Defaults to None.
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        torch.Tensor: Flow visualization image of shape [N,3,H,W]
    """
    if clip_flow is not None:
        # NOTE(review): the lower bound 0 also removes negative flow components - confirm this is intended.
        flow_uv = torch.clamp(flow_uv, 0, clip_flow)
    u, v = flow_uv[:, 0], flow_uv[:, 1]
    # NOTE(review): `rad` is the magnitude *before* u and v are divided by the
    # maximum, so it is passed on un-normalised - confirm this is intended.
    rad = torch.sqrt(u**2 + v**2)
    # Small epsilon keeps the division finite for an all-zero flow.
    denom = torch.max(rad) + 1e-5
    return flow_uv_to_colors(u / denom, v / denom, rad, convert_to_bgr)
329 | 
def generate_gaussian_template(imgSize=200):
    """ Adapted from DragAnything: https://github.com/showlab/DragAnything/blob/79355363218a7eb9b3437a31b8604b6d436d9337/dataset/dataset.py#L110

    Build an ``imgSize x imgSize`` uint8 image of an isotropic Gaussian (sigma 40)
    centred at ``imgSize / 2``, zeroed outside the inscribed filled circle and
    scaled so that its maximum is 255.
    """
    sigma = 40
    center = imgSize // 2
    circle_img = np.zeros((imgSize, imgSize), np.float32)
    # Filled disc of ones; cv2.circle draws in place and returns the same array.
    circle_mask = cv2.circle(circle_img, (center, center), center, 1, -1)

    # Gauss map, evaluated for all pixels at once by broadcasting the per-axis
    # squared distances (same formula as the former per-pixel double loop).
    sq_dist = (np.arange(imgSize) - imgSize / 2) ** 2 / (sigma ** 2)
    gauss = 1 / 2 / np.pi / (sigma ** 2) * np.exp(-1 / 2 * (sq_dist[:, None] + sq_dist[None, :]))

    isotropicGrayscaleImage = gauss.astype(np.float32) * circle_mask
    # Normalise to a maximum of 1, then scale to the 8-bit range.
    isotropicGrayscaleImage = (isotropicGrayscaleImage / np.max(isotropicGrayscaleImage)).astype(np.float32)
    isotropicGrayscaleImage = (isotropicGrayscaleImage * 255).astype(np.uint8)

    # isotropicGrayscaleImage = cv2.resize(isotropicGrayscaleImage, (40, 40))
    return isotropicGrayscaleImage
349 | 
def generate_gaussian_heatmap(tracks, width, height, layer_index, layer_capacity, side=20, offset=True):
    """Rasterise point tracks into per-layer Gaussian heatmaps, optionally with motion offsets.

    For every frame and every in-bounds point, the Gaussian template is resized
    to the (border-clipped) window of half-size ``side`` around the point and
    merged into channel 0 of the point's layer with an element-wise maximum.
    With ``offset=True``, channels 1 and 2 receive the displacement to the
    point's position in the next frame (zero on the last frame), written at the
    point's pixel and then spread by filtering with a resized copy of the template.
    With ``offset=False``, channel 0 is converted to a 3-channel uint8 image.

    Args:
        tracks: Array or tensor indexed as ``[frame, point]``, each entry unpacking to ``(x, y)``.
        width, height: Size of the output maps.
        layer_index: Per-point layer id, indexed by point.
        layer_capacity: Number of layers in the output.
        side: Half-size of the window drawn around each point.
        offset: Whether to add the two displacement channels.

    Returns:
        Float tensor of shape ``[F, N_layer, C, H, W]``.
    """
    template = generate_gaussian_template()
    num_frames, num_points = tracks.shape[:2]
    if isinstance(tracks, torch.Tensor):
        tracks = tracks.cpu().numpy()
    if offset:
        kernel_size = 2 * side + 1
        offset_kernel = cv2.resize(template / 255, (kernel_size, kernel_size))
        offset_kernel /= np.sum(offset_kernel)
        # Rescale so that the kernel's centre weight is exactly 1.
        offset_kernel /= offset_kernel[side, side]

    n_channels = 3 if offset else 1
    final_frame = num_frames - 1
    per_frame = []
    for frame_idx in range(num_frames):
        canvas = np.zeros((layer_capacity, height, width, n_channels), dtype=np.float32)
        for point_idx in range(num_points):
            px, py = tracks[frame_idx, point_idx]
            layer_id = layer_index[point_idx]
            # Points outside the image are not drawn.
            if px < 0 or py < 0 or px >= width or py >= height:
                continue
            left = int(max(px - side, 0))
            right = int(min(px + side, width - 1))
            top = int(max(py - side, 0))
            bottom = int(min(py + side, height - 1))
            if (right - left) < 1 or (bottom - top) < 1:
                continue
            patch = cv2.resize(template, (right - left, bottom - top))
            current = canvas[layer_id, top:bottom, left:right, 0]
            canvas[layer_id, top:bottom, left:right, 0] = np.maximum(current, patch)
            if offset:
                if frame_idx < final_frame:
                    next_x, next_y = tracks[frame_idx + 1, point_idx]
                else:
                    next_x, next_y = px, py
                row, column = int(py), int(px)
                canvas[layer_id, row, column, 1] = next_x - px
                canvas[layer_id, row, column, 2] = next_y - py

        if offset:
            # Spread the single-pixel offsets, layer by layer, in place.
            for layer_img in canvas:
                layer_img[:, :, 1:] = cv2.filter2D(layer_img[:, :, 1:], -1, offset_kernel)
            frame_layers = list(canvas)
        else:
            frame_layers = [
                cv2.cvtColor(layer_img[:, :, 0].astype(np.uint8), cv2.COLOR_GRAY2RGB)
                for layer_img in canvas
            ]
        per_frame.append(np.stack(frame_layers, axis=0))

    stacked = np.stack(per_frame, axis=0)
    return torch.from_numpy(stacked).permute(0, 1, 4, 2, 3).contiguous().float()   # [F, N_layer, C, H, W]
395 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.31.0
 2 | torch==2.3.1
 3 | torchvision==0.18.1
 4 | diffusers==0.30.2
 5 | transformers==4.25.1
 6 | xformers==0.0.27
 7 | imageio==2.27.0
 8 | imageio-ffmpeg==0.4.9
 9 | decord==0.6.0
10 | omegaconf==2.3.0
11 | gradio==5.23.0
12 | spaces==0.32.0
13 | open_clip_torch==2.22.0
14 | deepspeed
15 | opencv-python
16 | pycocotools
17 | safetensors
18 | einops
19 | wandb
20 | scipy


--------------------------------------------------------------------------------
/scripts/animate_Layer.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sys
  3 | import datetime
  4 | import os
  5 | from omegaconf import OmegaConf
  6 | 
  7 | import torch
  8 | import torchvision.transforms as transforms
  9 | from torchvision.transforms import functional as F
 10 | 
 11 | import diffusers
 12 | from diffusers import DDIMScheduler
 13 | from diffusers.utils.import_utils import is_xformers_available
 14 | 
 15 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 16 | from lvdm.models.unet import UNetModel
 17 | from lvdm.models.autoencoder import AutoencoderKL, AutoencoderKL_Dualref
 18 | from lvdm.models.condition import FrozenOpenCLIPEmbedder, FrozenOpenCLIPImageEmbedderV2, Resampler
 19 | from lvdm.models.layer_controlnet import LayerControlNet
 20 | from lvdm.pipelines.pipeline_animation import AnimationPipeline
 21 | from lvdm.utils import generate_gaussian_heatmap, save_videos_grid, save_videos_with_traj
 22 | 
 23 | from einops import rearrange
 24 | import decord
 25 | from pathlib import Path
 26 | from PIL import Image
 27 | import numpy as np
 28 | 
 29 | # import debugpy
 30 | # debugpy.listen(5678)
 31 | # print("Waiting for debugger attach")
 32 | # debugpy.wait_for_client()
 33 | 
@torch.no_grad()
def main(args):
    """Run one layer-controlled animation demo described by a YAML config and save the videos.

    Steps: load the config, build the ``AnimationPipeline`` from the pretrained
    components, read the demo assets in ``config.demo_dir`` (first/last frame,
    per-layer masks, optional sketch video and trajectory file), run the
    pipeline and write every sample to ``savedir``.

    Args:
        args: Parsed command-line namespace providing ``config``, ``savedir``,
            ``device`` and the fallback sizes ``L``, ``W`` and ``H``.
    """
    # Default output directory: samples/<config stem>-<timestamp>.
    if args.savedir is None:
        time_str = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
        savedir = f"samples/{Path(args.config).stem}-{time_str}"
    else:
        savedir = args.savedir
    os.makedirs(savedir, exist_ok=True)

    config  = OmegaConf.load(args.config)
    weight_dtype = torch.bfloat16 if config["mixed_precision"] == "bf16" else torch.float32
    # "interpolate" uses a first and a last frame; any other mode uses the first frame only.
    mode = config.get("mode", "interpolate")
    # create validation pipeline
    scheduler         = DDIMScheduler.from_pretrained(config.pretrained_model_path, subfolder="scheduler")
    text_encoder      = FrozenOpenCLIPEmbedder().eval()
    image_encoder     = FrozenOpenCLIPImageEmbedderV2().eval()
    image_projector   = Resampler.from_pretrained(config.pretrained_model_path, subfolder="image_projector").eval()
    # Only one of the two VAEs is loaded, depending on the mode.
    vae, vae_dualref = None, None
    if mode == "interpolate":
        vae_dualref   = AutoencoderKL_Dualref.from_pretrained(config.pretrained_model_path, subfolder="vae_dualref").eval()
    else:
        vae           = AutoencoderKL.from_pretrained(config.pretrained_model_path, subfolder="vae").eval()
    unet              = UNetModel.from_pretrained(config.pretrained_model_path, subfolder="unet").eval()
    layer_controlnet  = LayerControlNet.from_pretrained(config.pretrained_model_path, subfolder="layer_controlnet").eval()

    pipeline = AnimationPipeline(
        vae=vae, vae_dualref=vae_dualref, text_encoder=text_encoder, image_encoder=image_encoder, image_projector=image_projector,
        unet=unet, layer_controlnet=layer_controlnet,
        scheduler=scheduler,
    ).to(device=args.device, dtype=weight_dtype)
    # The dual-reference VAE decoder is put back to float32 after the pipeline-wide cast.
    if mode == "interpolate":
        pipeline.vae_dualref.decoder.to(dtype=torch.float32)
    if config.enable_xformers_memory_efficient_attention:
        if is_xformers_available():
            pipeline.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError("xformers is not available. Make sure it is installed correctly")

    # Seed numpy, torch and a dedicated generator only when a seed is configured.
    if config.seed is None:
        generator = None
    else:
        np.random.seed(config.seed)
        torch.manual_seed(config.seed)
        generator = torch.Generator(args.device).manual_seed(config.seed)

    # Values from the config take precedence over the command-line defaults.
    config.W = config.get("W", args.W)
    config.H = config.get("H", args.H)
    config.L = config.get("L", args.L)

    # Images: resize the short edge to min(H, W), centre-crop to (H, W), scale to [-1, 1].
    image_transforms = transforms.Compose([
        transforms.Resize(min(config.H, config.W)),
        transforms.CenterCrop((config.H, config.W)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    # Masks (and the heatmap below) get the same geometry without value normalisation.
    mask_transforms = transforms.Compose([
        transforms.Resize(min(config.H, config.W)),
        transforms.CenterCrop((config.H, config.W)),
    ])

    # Expected asset layout inside demo_dir.
    demo_dir = config.get("demo_dir")
    first_frame = os.path.join(demo_dir, "first_frame.jpg")
    if mode == "interpolate":
        last_frame = os.path.join(demo_dir, "last_frame.jpg")
    else:
        last_frame = None
    sketch_path = os.path.join(demo_dir, "sketch.mp4")
    trajectory_path = os.path.join(demo_dir, "trajectory.npz")

    if last_frame is None:
        # Image to Video
        image = image_transforms(Image.open(first_frame).convert("RGB"))
        frame_tensor = image[None].to(args.device)  # [F, C, H, W]
    else:
        # Interpolate
        image1 = image_transforms(Image.open(first_frame).convert("RGB"))
        image2 = image_transforms(Image.open(last_frame).convert("RGB"))
        frame_tensor1 = image1[None]
        frame_tensor2 = image2[None]
        frame_tensor = torch.cat([frame_tensor1, frame_tensor2], dim=0).to(args.device)
    # Add the batch dimension in front.
    frame_tensor = frame_tensor[None]

    # Layer masks: (1, layer_capacity, n_keyframes, 1, H, W), two keyframes when interpolating.
    if mode == "interpolate":
        layer_masks = torch.zeros((1, config.layer_capacity, 2, 1, config.H, config.W), dtype=torch.bool)
    else:
        layer_masks = torch.zeros((1, config.layer_capacity, 1, 1, config.H, config.W), dtype=torch.bool)
    # Layers without a mask file keep an all-False mask.
    for layer_idx in range(config.layer_capacity):
        mask_path = os.path.join(demo_dir, f"layer_{layer_idx}.jpg")
        if os.path.exists(mask_path):
            mask = mask_transforms(Image.open(mask_path).convert("L"))
            # Binarise at 0.5.
            mask = F.to_tensor(mask) > 0.5
            layer_masks[0, layer_idx, 0] = mask
        last_mask_path = os.path.join(demo_dir, f"layer_{layer_idx}_last.jpg")
        if os.path.exists(last_mask_path) and mode == "interpolate":
            mask = mask_transforms(Image.open(last_mask_path).convert("L"))
            mask = F.to_tensor(mask) > 0.5
            layer_masks[0, layer_idx, 1] = mask
    layer_masks = layer_masks.to(args.device)
    # Keyframe pixels restricted to each layer's mask (broadcast over the layer dimension).
    layer_regions = layer_masks * frame_tensor[:, None]
    # Per-layer settings come straight from the config lists, wrapped in a batch dimension.
    layer_validity = torch.tensor([config.layer_validity], dtype=torch.bool, device=args.device)
    motion_scores = torch.tensor([config.motion_scores], dtype=weight_dtype, device=args.device)
    layer_static = torch.tensor([config.layer_static], dtype=torch.bool, device=args.device)

    # Sketch condition: all ones unless a sketch video exists, which then fills
    # the layer selected by config.sketch_layer_index.
    sketch = torch.ones((1, config.layer_capacity, config.L, 3, config.H, config.W), dtype=weight_dtype)
    if os.path.exists(sketch_path):
        video_reader = decord.VideoReader(sketch_path)
        assert len(video_reader) == config.L, f"Input the length of sketch sequence should match the video length."
        video_frames = video_reader.get_batch(range(config.L)).asnumpy()
        sketch_values = [image_transforms(Image.fromarray(frame)) for frame in video_frames]
        sketch_values = torch.stack(sketch_values, dim=0)
        sketch[0, config.sketch_layer_index] = sketch_values
    sketch = sketch.to(args.device)

    # Trajectory condition default: channel 0 is -1, the other two channels are 0.
    heatmap = torch.zeros((1, config.layer_capacity, config.L, 3, config.H, config.W), dtype=weight_dtype)
    heatmap[:, :, :, 0] -= 1
    if os.path.exists(trajectory_path):
        traj_file = np.load(trajectory_path)
        traj_width = traj_file["width"]
        traj_height = traj_file["height"]
        trajectory = traj_file["trajectory"]
        # Apply the same geometry as the image transforms to the coordinates:
        # scale the short edge of the trajectory canvas to min(H, W) ...
        if traj_width < traj_height:
            scale = min(config.H, config.W) / traj_width
            new_h = int(traj_height * scale)
            new_w = min(config.H, config.W)
        else:
            scale = min(config.H, config.W) / traj_height
            new_w = int(traj_width * scale)
            new_h = min(config.H, config.W)
        trajectory[..., :2] *= scale
        # ... then shift by the centre-crop offset.
        crop_x = int(round((new_w - config.W) / 2.0))
        crop_y = int(round((new_h - config.H) / 2.0))
        trajectory[..., 0] -= crop_x
        trajectory[..., 1] -= crop_y
        # Every track point is assigned to the single layer config.traj_layer_index.
        traj_layer_index = torch.zeros(trajectory.shape[1], dtype=torch.long) + config.traj_layer_index

        heatmap = generate_gaussian_heatmap(trajectory, config.W, config.H, traj_layer_index, config.layer_capacity, offset=True)
        heatmap = rearrange(heatmap, "f n c h w -> (f n) c h w")
        # Channel 0 is the Gaussian map (0..255); channels 1-2 are the offsets.
        graymap, offset = heatmap[:, :1], heatmap[:, 1:]
        graymap = graymap / 255.
        # Normalise the offsets by the largest offset magnitude.
        rad = torch.sqrt(offset[:, 0:1]**2 + offset[:, 1:2]**2)
        rad_max = torch.max(rad)
        epsilon = 1e-5
        offset = offset / (rad_max + epsilon)
        # Gaussian map to [-1, 1], matching the default built above.
        graymap = graymap * 2 - 1
        heatmap = torch.cat([graymap, offset], dim=1)
        heatmap = mask_transforms(heatmap)  # no need for normalization
        heatmap = rearrange(heatmap, '(f n) c h w -> n f c h w', n=config.layer_capacity)
        heatmap = heatmap[None]
    heatmap = heatmap.to(args.device)

    sample = pipeline(
        config.prompt,
        config.L,
        config.H,
        config.W,
        frame_tensor,
        layer_masks             = layer_masks,
        layer_regions           = layer_regions,
        layer_static            = layer_static,
        motion_scores           = motion_scores,
        sketch                  = sketch,
        trajectory              = heatmap,
        layer_validity          = layer_validity,
        num_inference_steps     = config.num_inference_steps,
        guidance_scale          = config.guidance_scale,
        guidance_rescale        = config.guidance_rescale,
        negative_prompt         = config.n_prompt,
        num_videos_per_prompt   = config.num_videos_per_prompt,
        eta                     = config.eta,
        generator               = generator,
        fps                     = config.fps,
        mode                    = mode,
        weight_dtype            = weight_dtype,
        output_type             = "tensor",
    ).videos

    # Save every sample; when a trajectory file exists, also save a copy with
    # the (rescaled and cropped) trajectory drawn on top.
    for idx, video in enumerate(sample):
        save_videos_grid(video[None], os.path.join(savedir, f"video_{idx}.mp4"), fps=8)
        print(f"Saved {os.path.join(savedir, f'video_{idx}.mp4')}")
        if os.path.exists(trajectory_path):
            save_videos_with_traj(video, torch.from_numpy(trajectory), os.path.join(savedir, f"video_{idx}_with_traj.mp4"), fps=8, line_width=7, circle_radius=10)
215 | 
if __name__ == "__main__":
    # Command-line entry point: collect the options and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("--config", type=str, required=True)
    cli.add_argument("--savedir", type=str, default=None)

    # Integer options, registered in the same order as before so the
    # generated --help output is unchanged.
    for flag, default_value in (("--L", 16), ("--W", 512), ("--H", 320)):
        cli.add_argument(flag, type=int, default=default_value)
    cli.add_argument("--device", type=str, default="cuda:0")

    args = cli.parse_args()
    main(args)


--------------------------------------------------------------------------------
/scripts/demo1.yaml:
--------------------------------------------------------------------------------
 1 | pretrained_model_path: "checkpoints/LayerAnimate-Mix"
 2 | mode: "i2v"
 3 | demo_dir: "__assets__/demos/demo_1"
 4 | layer_capacity: 4
 5 | motion_scores: [-1, 0.6, -1, -1]
 6 | layer_static: [false, false, false, false]
 7 | layer_validity: [true, true, true, false]
 8 | sketch_layer_index: 0
 9 | traj_layer_index: 2
10 | prompt: "an anime scene."
11 | n_prompt: ""
12 | fps: 24
13 | enable_xformers_memory_efficient_attention: True
14 | mixed_precision: "bf16"
15 | num_inference_steps: 50
16 | guidance_scale: 7.5
17 | guidance_rescale: 0.7
18 | eta: 1.0
19 | seed: 57
20 | num_videos_per_prompt: 1


--------------------------------------------------------------------------------
/scripts/demo2.yaml:
--------------------------------------------------------------------------------
 1 | pretrained_model_path: "checkpoints/LayerAnimate-Mix"
 2 | mode: "i2v"
 3 | demo_dir: "__assets__/demos/demo_2"
 4 | layer_capacity: 4
 5 | motion_scores: [-1, 0.0, -1, -1]
 6 | layer_static: [false, true, false, false]
 7 | layer_validity: [true, true, true, false]
 8 | sketch_layer_index: 0
 9 | traj_layer_index: 2
10 | prompt: "an anime scene."
11 | n_prompt: ""
12 | fps: 24
13 | enable_xformers_memory_efficient_attention: True
14 | mixed_precision: "bf16"
15 | num_inference_steps: 50
16 | guidance_scale: 7.5
17 | guidance_rescale: 0.7
18 | eta: 1.0
19 | seed: 46
20 | num_videos_per_prompt: 1


--------------------------------------------------------------------------------
/scripts/demo3.yaml:
--------------------------------------------------------------------------------
 1 | pretrained_model_path: "checkpoints/LayerAnimate-Mix"
 2 | mode: "interpolate"
 3 | demo_dir: "__assets__/demos/demo_3"
 4 | layer_capacity: 4
 5 | motion_scores: [0.4, 0.2, -1, -1]
 6 | layer_static: [false, false, false, false]
 7 | layer_validity: [true, true, true, true]
 8 | sketch_layer_index: 3
 9 | traj_layer_index: 2
10 | prompt: "an anime scene."
11 | n_prompt: ""
12 | fps: 24
13 | enable_xformers_memory_efficient_attention: True
14 | mixed_precision: "bf16"
15 | num_inference_steps: 50
16 | guidance_scale: 7.5
17 | guidance_rescale: 0.7
18 | eta: 1.0
19 | seed: 52
20 | num_videos_per_prompt: 1


--------------------------------------------------------------------------------
/scripts/demo4.yaml:
--------------------------------------------------------------------------------
 1 | pretrained_model_path: "checkpoints/LayerAnimate-Mix"
 2 | mode: "i2v"
 3 | demo_dir: "__assets__/demos/demo_4"
 4 | layer_capacity: 4
 5 | motion_scores: [0.0, -1, -1, -1]
 6 | layer_static: [true, false, false, false]
 7 | layer_validity: [true, true, true, false]
 8 | sketch_layer_index: 2
 9 | traj_layer_index: 1
10 | prompt: "an anime scene."
11 | n_prompt: ""
12 | fps: 24
13 | enable_xformers_memory_efficient_attention: True
14 | mixed_precision: "bf16"
15 | num_inference_steps: 50
16 | guidance_scale: 7.5
17 | guidance_rescale: 0.7
18 | eta: 1.0
19 | seed: 42
20 | num_videos_per_prompt: 1


--------------------------------------------------------------------------------
/scripts/demo5.yaml:
--------------------------------------------------------------------------------
 1 | pretrained_model_path: "checkpoints/LayerAnimate-Mix"
 2 | mode: "i2v"
 3 | demo_dir: "__assets__/demos/demo_5"
 4 | layer_capacity: 4
 5 | motion_scores: [-1, -1, -1, -1]
 6 | layer_static: [false, false, false, false]
 7 | layer_validity: [true, true, false, false]
 8 | sketch_layer_index: 0
 9 | traj_layer_index: 1
10 | prompt: "an anime scene."
11 | n_prompt: ""
12 | fps: 24
13 | enable_xformers_memory_efficient_attention: True
14 | mixed_precision: "bf16"
15 | num_inference_steps: 50
16 | guidance_scale: 7.5
17 | guidance_rescale: 0.7
18 | eta: 1.0
19 | seed: 47
20 | num_videos_per_prompt: 1


--------------------------------------------------------------------------------
/scripts/infer_DiT.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sys
  3 | import math
  4 | import json
  5 | import torch
  6 | import decord
  7 | import os
  8 | from omegaconf import OmegaConf
  9 | import numpy as np
 10 | from PIL import Image
 11 | from einops import rearrange
 12 | from tqdm.auto import tqdm
 13 | 
 14 | from torch.nn.functional import interpolate
 15 | import torchvision.transforms as transforms
 16 | from torchvision.transforms import functional as F
 17 | from scipy.interpolate import PchipInterpolator
 18 | 
 19 | sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 20 | from DiT.vae import WanVAE
 21 | from DiT.model import VaceWanModel
 22 | from DiT.utils import save_videos_grid, save_videos_with_traj, generate_gaussian_heatmap
 23 | from wan.text2video import T5EncoderModel, FlowUniPCMultistepScheduler
 24 | 
 25 | 
# Frame size in pixels and number of frames; run() allocates every
# conditioning tensor with these dimensions.
HEIGHT = 480
WIDTH = 832
LENGTH = 81
# Maximum number of layers a config may define; main() pads shorter configs
# up to this count.
LAYER_CAPACITY = 4
# Device and dtype used for the models and the conditioning tensors.
DEVICE = "cuda"
WEIGHT_DTYPE = torch.bfloat16
# Strides used by run() to size the latent-space frame mask
# (index 0 for length, 1 for height, 2 for width).
VAE_STRIDE = [4, 8, 8]
# Patch size used by run() to compute the sequence length passed to the model.
PATCH_SIZE = [1, 2, 2]
# Resize with the target size min(HEIGHT, WIDTH), then center-crop to
# (HEIGHT, WIDTH). Applied to the input frames, layer masks and sketch frames.
# NOTE(review): an input whose aspect ratio is narrower than WIDTH/HEIGHT is
# narrower than WIDTH after the resize; torchvision's CenterCrop presumably
# zero-pads in that case -- confirm this is the intended behaviour.
IMG_TRANSFORM = transforms.Compose([
    transforms.Resize(min(HEIGHT, WIDTH)),
    transforms.CenterCrop((HEIGHT, WIDTH)),
])
 38 | 
 39 | def main():
 40 |     parser = argparse.ArgumentParser()
 41 |     parser.add_argument(
 42 |         "--config",
 43 |         type=str,
 44 |         required=True,
 45 |         help="Path to the JSON configuration file."
 46 |     )
 47 |     parser.add_argument(
 48 |         "--output_dir",
 49 |         type=str,
 50 |         default="outputs",
 51 |         help="Directory to save the output video."
 52 |     )
 53 |     parser.add_argument(
 54 |         "--pretrained_model",
 55 |         type=str,
 56 |         default="checkpoints/LayerAnimate-DiT",
 57 |         help="Path to the pretrained model directory."
 58 |     )
 59 |     args = parser.parse_args()
 60 | 
 61 |     text_encoder = T5EncoderModel(
 62 |         text_len=WIDTH,
 63 |         dtype=WEIGHT_DTYPE,
 64 |         device=DEVICE,
 65 |         checkpoint_path=os.path.join(args.pretrained_model, "models_t5_umt5-xxl-enc-bf16.pth"),
 66 |         tokenizer_path=os.path.join(args.pretrained_model, "google/umt5-xxl"))
 67 |     vae = WanVAE(vae_pth=os.path.join(args.pretrained_model, "Wan2.1_VAE.pth"), dtype=WEIGHT_DTYPE, device=DEVICE)
 68 |     video_model = VaceWanModel.from_pretrained(args.pretrained_model, subfolder="transformer").to(device=DEVICE, dtype=WEIGHT_DTYPE)
 69 |     video_model.eval().requires_grad_(False)
 70 | 
 71 | 
 72 |     config  = OmegaConf.load(args.config)
 73 | 
 74 |     prompt = config.get("prompt", "an anime scene.")
 75 |     n_prompt = config.get("n_prompt", "")
 76 |     seed = config.get("seed", 42)
 77 |     num_inference_steps = config.get("num_inference_steps", 25)
 78 |     guidance_scale = config.get("guidance_scale", 6.0)
 79 | 
 80 |     if not os.path.exists(args.output_dir):
 81 |         os.makedirs(args.output_dir)
 82 | 
 83 |     # 加载初始帧和结束帧
 84 |     try:
 85 |         input_image = Image.open(config["first_frame_path"]).convert("RGB")
 86 |         input_image = IMG_TRANSFORM(input_image)
 87 | 
 88 |         input_image_end = None
 89 |         if config.get("last_frame_path"):
 90 |             input_image_end = Image.open(config["last_frame_path"]).convert("RGB")
 91 |             input_image_end = IMG_TRANSFORM(input_image_end)
 92 |     except FileNotFoundError as e:
 93 |         print(f"Error: Image file not found - {e}")
 94 |         return
 95 | 
 96 |     # Prepare layer parameters
 97 |     layer_masks = []
 98 |     layer_scores = []
 99 |     layer_sketches = []
100 |     layer_trajectories = []
101 |     assert len(config["layer"]) <= LAYER_CAPACITY, f"The number of layers in the config exceeds the maximum capacity of {LAYER_CAPACITY}."
102 | 
103 |     print("Parsing layer configurations...")
104 |     for i in range(LAYER_CAPACITY):
105 |         if i >= len(config["layer"]):
106 |             layer_masks.append(None)
107 |             layer_scores.append(-1)
108 |             layer_sketches.append(None)
109 |             layer_trajectories.append([[]])
110 |             continue
111 | 
112 |         layer_config = config["layer"][i]
113 |         mask_path = layer_config["mask_path"]
114 |         layer_masks.append(IMG_TRANSFORM(Image.open(mask_path).convert("L")) if mask_path else None)
115 |         control_type = layer_config.get("control_type", None)
116 |         if control_type == "sketch":
117 |             score = -1
118 |             sketch = layer_config["sketch_path"]
119 |             trajectory = [[]]
120 |         elif control_type == "trajectory":
121 |             score = -1
122 |             sketch = None
123 |             traj_path = layer_config["trajectory_path"]
124 |             with open(traj_path, 'r') as f:
125 |                 trajectory = json.load(f)
126 |         elif control_type == "score":
127 |             score = layer_config["score"]
128 |             sketch = None
129 |             trajectory = [[]]
130 |         else:
131 |             raise ValueError(f"Unsupported control type: {control_type}")
132 |         layer_scores.append(score)
133 |         layer_sketches.append(sketch)
134 |         layer_trajectories.append(trajectory)
135 | 
136 |     print("Starting inference...")
137 |     run(
138 |         text_encoder,
139 |         vae,
140 |         video_model,
141 |         input_image=input_image,
142 |         input_image_end=input_image_end,
143 |         seed=seed,
144 |         prompt=prompt,
145 |         n_prompt=n_prompt,
146 |         num_inference_steps=num_inference_steps,
147 |         guidance_scale=guidance_scale,
148 |         savedir=args.output_dir,
149 |         input_layer_masks=layer_masks,
150 |         input_layer_scores=layer_scores,
151 |         input_layer_sketches=layer_sketches,
152 |         input_layer_trajectories=layer_trajectories,
153 |     )
154 | 
155 | 
@torch.no_grad()
def run(text_encoder, vae, video_model,
        input_image, input_image_end, seed, prompt, n_prompt,
        num_inference_steps, guidance_scale, savedir,
        input_layer_masks, input_layer_scores, input_layer_sketches, input_layer_trajectories):
    """Generate one video from the first (and optional last) frame plus layer controls.

    Args:
        text_encoder: callable invoked as ``text_encoder([prompt], DEVICE)``.
        vae: object providing ``encode`` and ``decode`` for video tensors.
        video_model: denoising model called once per scheduler timestep.
        input_image: first frame, convertible by torchvision ``to_tensor`` and
            already sized ``HEIGHT`` x ``WIDTH``.
        input_image_end: optional last frame in the same format, or ``None``.
        seed: seed for NumPy, torch and the sampling generator.
        prompt: positive text prompt.
        n_prompt: negative prompt; only used when ``guidance_scale > 1``.
        num_inference_steps: number of scheduler steps.
        guidance_scale: classifier-free guidance scale; guidance is applied
            only when it is greater than 1.
        savedir: directory the videos are written to (created if missing).
        input_layer_masks: ``LAYER_CAPACITY`` masks (or ``None``), one per layer.
        input_layer_scores: ``LAYER_CAPACITY`` motion scores.
        input_layer_sketches: ``LAYER_CAPACITY`` sketch video paths (or ``None``).
        input_layer_trajectories: ``LAYER_CAPACITY`` lists of tracks; each track
            is a list of ``[x, y]`` points. Empty tracks are ignored.

    Returns:
        ``(output_video_path, output_video_traj_path)``. The second element is
        ``None`` when no layer supplied a trajectory, because no overlay video
        is written in that case.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    generator = torch.Generator(DEVICE).manual_seed(seed)
    do_classifier_free_guidance = guidance_scale > 1.0

    # prepare first/last frame condition: known frames are written into an
    # otherwise zero video, and the mask marks which frames are known.
    masked_videos = torch.zeros((1, LENGTH, 3, HEIGHT, WIDTH), dtype=WEIGHT_DTYPE, device=DEVICE)
    context_frame_masks = torch.zeros_like(masked_videos)
    image1 = F.to_tensor(input_image) * 2 - 1
    masked_videos[0, 0] = image1.to(DEVICE)
    context_frame_masks[0, 0] = 1.0
    if input_image_end is not None:
        image2 = F.to_tensor(input_image_end) * 2 - 1
        masked_videos[0, -1] = image2.to(DEVICE)
        context_frame_masks[0, -1] = 1.0

    layer_masks = torch.zeros((1, LAYER_CAPACITY, 1, 1, HEIGHT, WIDTH), dtype=torch.bool)
    for layer_idx in range(LAYER_CAPACITY):
        if input_layer_masks[layer_idx] is not None:
            mask = F.to_tensor(input_layer_masks[layer_idx]) > 0.5
            layer_masks[0, layer_idx, 0] = mask
    layer_masks = layer_masks.to(DEVICE)
    motion_scores = torch.tensor([input_layer_scores], dtype=WEIGHT_DTYPE, device=DEVICE)

    # prepare motion scores condition: each pixel takes the score of the layer
    # covering it and -1 outside every mask (max over the layer axis).
    motion_scores = motion_scores[:, :, None, None, None, None]
    motion_score_mask = layer_masks.to(dtype=motion_scores.dtype)
    context_score = motion_scores * motion_score_mask - torch.ones_like(motion_scores) * (1 - motion_score_mask)
    context_score = torch.max(context_score, dim=1).values  # reduce to [b, f, c, h, w]
    context_score = context_score.repeat(1, LENGTH, 3, 1, 1)

    # prepare sketch condition: layers without a sketch stay at 1, so the
    # per-pixel minimum over layers keeps the sketch values where present.
    sketch = torch.ones((1, LAYER_CAPACITY, LENGTH, 3, HEIGHT, WIDTH), dtype=WEIGHT_DTYPE)
    for layer_idx in range(LAYER_CAPACITY):
        sketch_path = input_layer_sketches[layer_idx]
        if sketch_path is not None:
            video_reader = decord.VideoReader(sketch_path)
            assert len(video_reader) == LENGTH, "Input the length of sketch sequence should match the video length."
            video_frames = video_reader.get_batch(range(LENGTH)).asnumpy()
            sketch_values = [F.to_tensor(IMG_TRANSFORM(Image.fromarray(frame))) for frame in video_frames]
            sketch_values = torch.stack(sketch_values) * 2 - 1
            sketch[0, layer_idx] = sketch_values
    sketch = sketch.to(DEVICE)
    context_sketch = torch.min(sketch, dim=1).values  # reduce to [b, f, c, h, w]

    # prepare trajectory condition: resample every track to LENGTH points.
    tracks = []
    traj_layer_index = []
    t_new = np.linspace(0, 1, LENGTH)
    for layer_idx in range(LAYER_CAPACITY):
        for temp_track in input_layer_trajectories[layer_idx]:
            if len(temp_track) > 1:
                # Shape-preserving interpolation through the control points.
                x = [point[0] for point in temp_track]
                y = [point[1] for point in temp_track]
                t = np.linspace(0, 1, len(temp_track))
                fx = PchipInterpolator(t, x)
                fy = PchipInterpolator(t, y)
                temp_traj = np.stack([fx(t_new), fy(t_new)], axis=-1).astype(np.float32)
                tracks.append(temp_traj)
                traj_layer_index.append(layer_idx)
            elif len(temp_track) == 1:
                # A single point is a static track: repeat it for every frame.
                tracks.append(np.array(temp_track * LENGTH))
                traj_layer_index.append(layer_idx)

    if tracks:
        trajectory = np.stack(tracks)
        trajectory = np.transpose(trajectory, (1, 0, 2))
        traj_layer_index = np.array(traj_layer_index)
        heatmap = generate_gaussian_heatmap(trajectory, WIDTH, HEIGHT, traj_layer_index, LAYER_CAPACITY, offset=True)
        heatmap = rearrange(heatmap, "f n c h w -> (f n) c h w")
        graymap, offset = heatmap[:, :1], heatmap[:, 1:]
        graymap = graymap / 255.
        # Normalise the offsets by the largest offset magnitude.
        rad = torch.sqrt(offset[:, 0:1]**2 + offset[:, 1:2]**2)
        rad_max = torch.max(rad)
        epsilon = 1e-5
        offset = offset / (rad_max + epsilon)
        graymap = graymap * 2 - 1
        heatmap = torch.cat([graymap, offset], dim=1)
        heatmap = rearrange(heatmap, '(f n) c h w -> n f c h w', n=LAYER_CAPACITY)
        heatmap = heatmap[None]
        # Per pixel, keep the layer whose gray channel is strongest.
        indices = torch.max(heatmap[:, :, :, 0:1], dim=1, keepdim=True).indices  # [b, 1, f, 1, h, w]
        expanded_indices = indices.expand(-1, -1, -1, 3, -1, -1)  # [b, 1, f, 3, h, w]
        context_trajectory = torch.gather(heatmap, 1, expanded_indices)  # [b, 1, f, c, h, w]
        context_trajectory = context_trajectory.squeeze(1)  # reduce to [b, f, c, h, w]
    else:
        # No layer supplied a trajectory (np.stack would raise on the empty
        # list). Use the empty condition instead: gray channel -1 (a zero
        # graymap after the "* 2 - 1" mapping) and zero offsets.
        trajectory = None
        context_trajectory = torch.zeros((1, LENGTH, 3, HEIGHT, WIDTH))
        context_trajectory[:, :, 0] = -1
    context_trajectory = context_trajectory.to(device=DEVICE, dtype=WEIGHT_DTYPE)

    context_frame_latents = vae.encode(rearrange(masked_videos, "b f c h w -> b c f h w"))[0].sample()
    context_frame_masks = rearrange(context_frame_masks, "b f c h w -> b c f h w")
    context_frame_masks = 1 - context_frame_masks   # we follow vace to indicate the masked area as 1, and the unmasked area as 0
    # Fold each spatial stride block of the mask into channels, then resample
    # along the length axis to new_length.
    length, height, width = context_frame_masks.shape[2:]
    new_length = int((length + 3) // VAE_STRIDE[0])
    height = 2 * (int(height) // (VAE_STRIDE[1] * 2))
    width = 2 * (int(width) // (VAE_STRIDE[2] * 2))
    context_frame_masks = context_frame_masks[:, 0]
    context_frame_masks = context_frame_masks.view(
        -1, length, height, VAE_STRIDE[1], width, VAE_STRIDE[2]
    )
    context_frame_masks = context_frame_masks.permute(0, 3, 5, 1, 2, 4)
    context_frame_masks = context_frame_masks.reshape(
        -1, VAE_STRIDE[1] * VAE_STRIDE[2], length, height, width
    )
    context_frame_masks = interpolate(
        context_frame_masks, size=(new_length, height, width), mode='nearest-exact'
    )

    context_score_latents = vae.encode(rearrange(context_score, "b f c h w -> b c f h w"))[0].sample()
    context_sketch_latents = vae.encode(rearrange(context_sketch, "b f c h w -> b c f h w"))[0].sample()
    context_trajectory_latents = vae.encode(rearrange(context_trajectory, "b f c h w -> b c f h w"))[0].sample()

    # All conditions are concatenated along dim 1.
    context_control = torch.cat([context_frame_latents, context_frame_masks, context_score_latents, context_sketch_latents, context_trajectory_latents], dim=1)

    # Get the text embedding for conditioning
    text_embeddings = text_encoder([prompt], DEVICE)

    if do_classifier_free_guidance:
        # Unconditional entries go first so that chunk(2) below yields
        # (uncond, cond) in that order.
        un_text_embeddings = text_encoder([n_prompt], DEVICE)
        text_embeddings = un_text_embeddings + text_embeddings
        context_control = torch.cat([context_control] * 2, dim=0)

    latent_shape = context_frame_latents.shape
    latents = torch.randn(latent_shape, generator=generator, device=DEVICE, dtype=WEIGHT_DTYPE)
    seq_len = math.ceil((latent_shape[3] * latent_shape[4]) /
                        (PATCH_SIZE[1] * PATCH_SIZE[2]) * latent_shape[2])

    noise_scheduler = FlowUniPCMultistepScheduler(
        num_train_timesteps=1000,
        shift=1,
        use_dynamic_shifting=False
    )
    noise_scheduler.set_timesteps(
        num_inference_steps, device=DEVICE, shift=8.0
    )
    timesteps = noise_scheduler.timesteps
    for t in tqdm(timesteps):
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
        timestep = torch.tensor([t], device=DEVICE, dtype=torch.long)
        timestep = timestep.expand(latent_model_input.shape[0])

        with torch.amp.autocast('cuda', dtype=WEIGHT_DTYPE):
            noise_pred = video_model(
                latent_model_input,
                context=text_embeddings,
                t=timestep,
                seq_len=seq_len,
                vace_context=context_control,
                vace_context_scale=1.0,
            )

        # perform guidance
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        latents = noise_scheduler.step(
            noise_pred,
            t,
            latents,
            return_dict=False,
            generator=generator,
        )[0]

    video = vae.decode(latents).sample
    video = (video / 2 + 0.5).clamp(0, 1)
    video = video.cpu().float()

    # run() may be called without main(), so make sure the target exists.
    os.makedirs(savedir, exist_ok=True)
    output_video_path = os.path.join(savedir, "video.mp4")
    save_videos_grid(video, output_video_path, fps=24)

    # The trajectory overlay only makes sense when trajectories were given.
    output_video_traj_path = None
    if trajectory is not None:
        output_video_traj_path = os.path.join(savedir, "video_with_traj.mp4")
        save_videos_with_traj(video[0], torch.from_numpy(trajectory), output_video_traj_path, fps=24, line_width=7, circle_radius=10)
    return output_video_path, output_video_traj_path
333 | 
334 | 
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------