├── .gitignore ├── LICENSE ├── README.md ├── TrailBlazer ├── Baseline │ ├── README.org │ └── __init__.py ├── CrossAttn │ ├── BaseProc.py │ ├── InjecterProc.py │ ├── Utils.py │ └── __init__.py ├── Metrics │ ├── ClipScore │ │ ├── ClipScore.py │ │ └── __init__.py │ ├── Metrics.py │ ├── OwlVit │ │ └── main.py │ ├── README.md │ └── __init__.py ├── Misc │ ├── BBox.py │ ├── ConfigIO.py │ ├── Const.py │ ├── Logger.py │ ├── Painter.py │ └── __init__.py ├── Pipeline │ ├── TextToVideoSDMultiPipelineCall.py │ ├── TextToVideoSDPipelineCall.py │ ├── UNet3DConditionModelCall.py │ ├── Utils.py │ └── __init__.py ├── README.md ├── Setting │ ├── Config.py │ ├── Const.py │ ├── Keyframe.py │ └── __init__.py └── __init__.py ├── assets ├── gradio │ ├── Cat2Dog.mp4 │ ├── cat-LRLR.mp4 │ ├── fish-RL.mp4 │ ├── fish-TL2BR.mp4 │ ├── gradio-bbox.jpg │ ├── gradio.jpg │ └── tiger-TL2BR.mp4 ├── teaser.gif ├── v1-Peekaboo-Repro │ ├── 2_of_50_2_peekaboo.gif │ ├── Peekaboo-Reproduce.0000-by-Peekaboo.gif │ ├── Peekaboo-Reproduce.0000-by-TrailBlazer.gif │ └── mask.png ├── v1-Peekaboo │ ├── 2ndKeyFast.0000.gif │ ├── ChangingFish.0000.gif │ ├── CrazyHorse.0000.gif │ ├── FastDog.0000.gif │ ├── PerspBR2TL-Tiger.0000.gif │ ├── PerspTL2BR-Tiger.0000.gif │ ├── RigidMoving-Astronaut.0000.gif │ ├── RigidMoving-Bee.0006.gif │ ├── RigidMoving-Cat.0010.gif │ ├── RigidMoving-Clownfish.0001.gif │ ├── Speed2ndKey-Astronaut.0000.gif │ ├── SpeedKeys-Cat.0000.gif │ └── TinyFish.0000.gif ├── v1-T2VZero │ ├── PerspBR2TL-Tiger.0000.gif │ ├── PerspTL2BR-Tiger.0000.gif │ ├── RigidMoving-Astronaut.0000.gif │ ├── RigidMoving-Bee.0000.gif │ ├── RigidMoving-Cat.0000.gif │ ├── RigidMoving-Clownfish.0000.gif │ └── Speed2ndKey-Astronaut.0000.gif └── v1-TrailBlazer │ ├── 2ndKeyFast.0003.gif │ ├── Cat2Dog.0000.gif │ ├── Cat2Fish.0000.gif │ ├── ChangingFish.0009.gif │ ├── CrazyHorse.0007.gif │ ├── CrazyHorse.0008.gif │ ├── FastDog.0003.gif │ ├── MultiSubject-Cat.0000.gif │ ├── MultiSubject-Dog.0000.gif │ ├── MultiSubjects.0000.gif │ ├── Parrot2Penguin.0000.gif │ ├── Peekaboo-Reproduce.0000.gif │ ├── PerspBR2TL-Tiger.0000.gif │ ├── PerspTL2BR-Tiger.0000.gif │ ├── RigidMoving-Astronaut.0000.gif │ ├── RigidMoving-Bee.0000.gif │ ├── RigidMoving-Cat.0000.gif │ ├── RigidMoving-Clownfish.0000.gif │ ├── SpeedKeys-Cat.0000.gif │ ├── Tiger2Elephant.0000.gif │ └── TinyFish.0008.gif ├── bin ├── CmdGradio.py ├── CmdMetric.py ├── CmdPeekaboo.py ├── CmdText2VidZero.py ├── CmdTrailBlazer.py ├── CmdTrailBlazerMulti.py ├── TestDyn.py └── TestMakeCache.py ├── config ├── Archive │ ├── 2ndKey-astronaut.0001.yaml │ ├── 2ndKey-astronaut.0002.yaml │ ├── 2ndKey-astronaut.0003.yaml │ ├── 2ndKey-astronaut.0004.yaml │ ├── BR2TL-fish.yaml │ ├── BR2TL-tiger.yaml │ ├── L2R-fish.yaml │ ├── L2R-horse.yaml │ ├── Omg-CatDog.yaml │ ├── Omg-IrrPath.yaml │ ├── Omg-Speed-sloth.0004.yaml │ ├── Omg-Speed-snail.0004.yaml │ ├── Omg-Speed-tortoise.0004.yaml │ ├── Peekapoo-default.yaml │ ├── Perspective-fish.0001.yaml │ ├── Perspective.0002.yaml │ ├── R2L-fish.yaml │ ├── R2L-horse.yaml │ ├── README.md │ ├── Speed-cat.0001.yaml │ ├── Speed-cat.0002.yaml │ ├── Speed-cat.0003.yaml │ ├── Speed-cat.0004.yaml │ ├── Speed-cheetah.0004.yaml │ ├── Speed-dog.0004.yaml │ ├── Speed-horse.0004.yaml │ ├── Speed-reindeer.0004.yaml │ ├── Speed-tiger.0004.yaml │ ├── TL2BR-fish.yaml │ └── TL2BR-tiger.yaml ├── Main │ ├── PerspBR2TL-Tiger.yaml │ ├── PerspTL2BR-Tiger.yaml │ ├── README.md │ ├── RigidMoving-Astronaut.yaml │ ├── RigidMoving-Bee.yaml │ ├── RigidMoving-Cat.yaml │ ├── RigidMoving-Clownfish.yaml │ └── 
SpeedKeys-Cat.yaml ├── Metric │ ├── AirBalloon.yaml │ ├── AirBalloon2.yaml │ ├── Bear.yaml │ ├── Bird.yaml │ ├── Bus.yaml │ ├── Camel.yaml │ ├── Deer.yaml │ ├── Dolphin.yaml │ ├── Duck.yaml │ ├── Fox.yaml │ ├── Frog.yaml │ ├── Helicopter.yaml │ ├── House.yaml │ ├── Jet.yaml │ ├── Kangaroo.yaml │ ├── Kangaroo2.yaml │ ├── Leaf.yaml │ ├── Lion.yaml │ ├── Owl.yaml │ ├── Owl2.yaml │ ├── Panda.yaml │ ├── Paper.yaml │ ├── Parrot.yaml │ ├── Penguin.yaml │ ├── Rabbit.yaml │ ├── Rocket.yaml │ ├── Roller.yaml │ ├── RollerCoaster.yaml │ ├── Satellite.yaml │ ├── Skateboarder.yaml │ ├── Squirrel.yaml │ ├── Squirrel2.yaml │ ├── StreetCar.yaml │ ├── Swan.yaml │ └── Woodpecker.yaml ├── Morphin │ ├── Cat2Dog.yaml │ ├── Cat2Fish.yaml │ ├── Parrot2Penguin.yaml │ └── Tiger2Elephant.yaml ├── Multi │ ├── MultiSubject-Cat.yaml │ ├── MultiSubject-Dog.yaml │ └── MultiSubjects.yaml ├── Peekaboo │ ├── 2ndKeyFast.yaml │ ├── ChangingFish.yaml │ ├── CrazyHorse.yaml │ ├── FastDog.yaml │ ├── Peekaboo-Reproduce.yaml │ ├── README.md │ └── TinyFish.yaml └── README.md └── doc ├── Command.md ├── Config.md ├── Gradio.md ├── Peekaboo.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | assets 2 | __pycache__ 3 | *.pyc 4 | *.png 5 | *undo* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Hōhonu VicML 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # ___***TrailBlazer***___ 3 | 4 | [![Paper](https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red)](https://arxiv.org/abs/2401.00896) 5 | [![Project Page](https://img.shields.io/badge/TrailBlazer-Website-green?logo=googlechrome&logoColor=green)](https://hohonu-vicml.github.io/Trailblazer.Page/) 6 | [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue)](https://huggingface.co/spaces/hohonu-vicml/Trailblazer) 7 | [![Video](https://img.shields.io/badge/YouTube-Project-c4302b?logo=youtube&logoColor=red)](https://www.youtube.com/watch?v=kEN-32wN-xQ) 8 | [![Video](https://img.shields.io/badge/YouTube-Result-c4302b?logo=youtube&logoColor=red)](https://www.youtube.com/watch?v=P-PSkS7sNco) 9 | [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fhohonu-vicml%2FTrailblazer&count_bg=%238B00FB&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com) 10 | 11 | 12 | This repository contains the implementation of the following paper: > 13 | **TrailBlazer: Trajectory Control for Diffusion-Based Video Generation**
> 14 | [Wan-Duo Kurt Ma](https://www.linkedin.com/in/kurt-ma/)1, [J. P. 15 | Lewis](http://www.scribblethink.org/)2, [W. Bastiaan 16 | Kleijn](https://people.wgtn.ac.nz/bastiaan.kleijn)1,
Victoria 17 | University of Wellington1, NVIDIA Research2 18 | 19 | ## :fire: Overview 20 | ![teaser](./assets/teaser.gif) 21 | 22 | Large text-to-video (T2V) models such as Sora have the potential to revolutionize visual effects and the creation of some types of movies. Current T2V models, however, require tedious trial-and-error experimentation to achieve desired results. This motivates the search for methods to directly control desired attributes. In this work, we take a step toward this goal, introducing a method for high-level, temporally coherent control over the basic trajectories and appearance of objects. Our algorithm, **TrailBlazer**, allows the general positions and (optionally) appearance of objects to be controlled simply by keyframing approximate bounding boxes and (optionally) their corresponding prompts. 23 | 24 | Importantly, our method does not require a pre-existing control video signal that already contains an accurate outline of the desired motion, yet the synthesized motion is surprisingly natural, with emergent effects including perspective and movement toward the virtual camera as the box size increases. The method is efficient, making use of a pre-trained T2V model and requiring no training or fine-tuning, with negligible additional computation. Specifically, the bounding box controls are used as soft masks to guide manipulation of the self-attention and cross-attention modules in the video model. While our visual results are limited by those of the underlying model, the algorithm may generalize to future models that use standard self- and cross-attention components. 25 | 26 | ## :fire: Requirements 27 | 28 | The codebase is tested on an **NVIDIA GeForce RTX 3090** with the Python libraries 29 | **pytorch-2.1.2+cu121** and **diffusers-0.21.4**. We strongly recommend pinning this 30 | specific version of Diffusers, as the library is continuously evolving. For PyTorch, other 31 | versions under 2.x should also work. With the RTX 3090, we followed this 32 | [post](https://discuss.pytorch.org/t/geforce-rtx-3090-with-cuda-capability-sm-86-is-not-compatible-with-the-current-pytorch-installation/123499) 33 | to avoid the sm_86 compatibility issue. 34 | 35 | ## :fire: Timeline 36 | 37 | - [2024/04/08]: Our new v2 preprint has appeared on arXiv (See [link](https://arxiv.org/abs/2401.00896)) 38 | 39 | - [2024/03/23]: A new arXiv update will be made. 40 | 41 | - [2024/03/22]: We released multiple-object synthesis (See 42 | [link](doc/Command.md#multiple-objects)) and the 43 | [Peekaboo](https://github.com/microsoft/Peekaboo) integration (See 44 | [link](doc/Peekaboo.md)) 45 | 46 | - [2024/02/07]: The Gradio app is updated with a better keyframe interface (See 47 | [link](assets/gradio/gradio.jpg)) 48 | 49 | - [2024/02/06]: We now have a Gradio web app at Hugging Face Spaces! 50 | 51 | - [2024/02/01]: The official codebase is released 52 | 53 | - [2024/01/03]: Paper released 54 | 55 | - [2023/12/31]: Paper submitted to arXiv 56 | 57 | ## :fire: Usage 58 | 59 | #### [Prepare] 60 | 61 | First, download the pre-trained ZeroScope model 62 | ([link](https://huggingface.co/cerspense/zeroscope_v2_576w)). You need to 63 | register on Hugging Face and create an access token ([link](https://huggingface.co/)). 64 | 65 | ```bash 66 | git clone https://huggingface.co/cerspense/zeroscope_v2_576w ${MODEL_ROOT}/cerspense/zeroscope_v2_576w 67 | ``` 68 | 69 | where MODEL_ROOT is a directory of your choice for storing models.
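For example, you might set it once in your shell (the path below is purely illustrative; any writable directory works): ```bash export MODEL_ROOT=/path/to/your/models ```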
Then, clone this repo and cd into it: 70 | ```bash 71 | git clone https://github.com/hohonu-vicml/Trailblazer && cd Trailblazer 72 | ``` 73 | 74 | #### [Run it] 75 | 76 | Our executable scripts are located in the "bin" folder, and the core modules are 77 | implemented in the "TrailBlazer" folder under the project root. Therefore, no 78 | additional dependencies need to be added to PYTHONPATH; you can simply run the 79 | commands below :smirk: : 80 | 81 | ```bash 82 | python bin/CmdTrailBlazer.py -mr ${MODEL_ROOT} --config config/XXXX.yaml ## single experiment 83 | python bin/CmdTrailBlazer.py -mr ${MODEL_ROOT} --config config/ ## run all yamls in a folder 84 | ``` 85 | 86 | :cupid:**UPDATE**:cupid:: TrailBlazer has just released a Gradio app as an 87 | alternative interface. Please check out our documentation 88 | ([Gradio.md](doc/Gradio.md)) for more information. To run the app, simply run: 89 | 90 | ```bash 91 | python bin/CmdGradio.py ${MODEL_ROOT} # no -mr here 92 | ``` 93 | 94 | When the shell environment variable ZEROSCOPE_MODEL_ROOT is set, you 95 | can omit the -mr (--model-root) argument above. 96 | 97 | ```bash 98 | export ZEROSCOPE_MODEL_ROOT=/path/to/your/diffusion/root 99 | # then you can omit the -mr argument to simplify the command 100 | python bin/CmdTrailBlazer.py --config config/XXXX.yaml 101 | ``` 102 | 103 | Please see [here](doc/Command.md) for more information about the command set 104 | used in TrailBlazer. 105 | 106 | #### [Config] 107 | 108 | A list of example config files is stored in the `config` folder. Feel free to 109 | run each of them; the results will be written to the `/tmp` folder. For more 110 | information on how to design a config file and to see the visual result of each config, 111 | please visit [here](doc/Config.md) and [there](config/README.md) for more 112 | details about the config structure and the visual results, respectively. 113 | 114 | ## :fire: Contribution 115 | 116 | This project is still a work in progress, and there are numerous directions in 117 | which it can be improved. Please don't hesitate to contact us if you are 118 | interested, or feel free to make a pull request to strengthen the ideas. 119 | 120 | ## :fire: TODO 121 | 122 | This repository is not yet fully released 123 | to the public. Nevertheless, the majority of the core modules are 124 | available (e.g., single- and multiple-object synthesis, and the Peekaboo comparison). 125 | Our next release will include useful tools for measuring metrics. 126 | 127 | ## :fire: Fun 128 | 129 | 130 | 131 | 132 | Poor cat: Someone, Stop me! 133 | 134 | 135 | 136 | Am I a cat, or a dog... 137 | 138 | Please inform us if you have generated any interesting videos! 139 | 140 | ## :fire: Citation 141 | 142 | **TrailBlazer** is built on top of its mother project [**DirectedDiffusion**](https://hohonu-vicml.github.io/DirectedDiffusion.Page/), which was recently published at AAAI 2024. If you find our work useful for your research, please consider citing our paper. 143 | 144 | ```bibtex 145 | @article{ma2023trailblazer, 146 | title={TrailBlazer: Trajectory Control for Diffusion-Based Video Generation}, 147 | author={Wan-Duo Kurt Ma and J. P. Lewis and W. Bastiaan Kleijn}, 148 | year={2023}, 149 | eprint={2401.00896}, 150 | archivePrefix={arXiv}, 151 | primaryClass={cs.CV} 152 | } 153 | 154 | @article{ma2023directed, 155 | title={Directed Diffusion: Direct Control of Object Placement through Attention Guidance}, 156 | author={Wan-Duo Kurt Ma and J. P.
Lewis and Avisek Lahiri and Thomas Leung and W. Bastiaan Kleijn}, 157 | year={2023}, 158 | eprint={2302.13153}, 159 | archivePrefix={arXiv}, 160 | primaryClass={cs.CV} 161 | } 162 | ``` 163 | -------------------------------------------------------------------------------- /TrailBlazer/Baseline/README.org: -------------------------------------------------------------------------------- 1 | Try to clone Peekaboo here: 2 | 3 | #+begin_src bash 4 | git clone https://github.com/microsoft/Peekaboo.git 5 | #+end_src 6 | -------------------------------------------------------------------------------- /TrailBlazer/Baseline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/TrailBlazer/Baseline/__init__.py -------------------------------------------------------------------------------- /TrailBlazer/CrossAttn/InjecterProc.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, TypedDict 2 | import numpy as np 3 | import torch 4 | import math 5 | 6 | from ..Misc import Logger as log 7 | 8 | from .BaseProc import CrossAttnProcessorBase 9 | from .BaseProc import BundleType 10 | from ..Misc.BBox import BoundingBox 11 | 12 | 13 | class InjecterProcessor(CrossAttnProcessorBase): 14 | def __init__( 15 | self, 16 | bundle: BundleType, 17 | bbox_per_frame: List[BoundingBox], 18 | name: str, 19 | strengthen_scale: float = 0.0, 20 | weaken_scale: float = 1.0, 21 | is_text2vidzero: bool = False, 22 | ): 23 | super().__init__(bundle, is_text2vidzero=is_text2vidzero) 24 | self.strengthen_scale = strengthen_scale 25 | self.weaken_scale = weaken_scale 26 | self.bundle = bundle 27 | self.num_frames = len(bbox_per_frame) 28 | self.bbox_per_frame = bbox_per_frame 29 | self.use_weaken = True 30 | self.name = name 31 | 32 | def dd_core(self, attention_probs: torch.Tensor, dim_x, dim_y): 33 | """ """ 34 | 35 | frame_size = attention_probs.shape[0] // self.num_frames 36 | num_affected_frames = self.num_frames 37 | attention_probs_copied = attention_probs.detach().clone() 38 | 39 | token_inds = self.bundle.get("token_inds") 40 | trailing_length = self.bundle["trailblazer"]["trailing_length"] 41 | trailing_inds = list( 42 | range(self.len_prompt + 1, self.len_prompt + trailing_length + 1) 43 | ) 44 | # NOTE: Spatial cross attention editing 45 | if len(attention_probs.size()) == 4: 46 | all_tokens_inds = list(set(token_inds).union(set(trailing_inds))) 47 | strengthen_map = self.localized_weight_map( 48 | attention_probs_copied, 49 | token_inds=all_tokens_inds, 50 | bbox_per_frame=self.bbox_per_frame, 51 | dim_x = dim_x, 52 | dim_y = dim_y 53 | ) 54 | 55 | weaken_map = torch.ones_like(strengthen_map) 56 | zero_indices = torch.where(strengthen_map == 0) 57 | weaken_map[zero_indices] = self.weaken_scale 58 | 59 | # weakening 60 | attention_probs_copied[..., all_tokens_inds] *= weaken_map[ 61 | ..., all_tokens_inds 62 | ] 63 | # strengthen 64 | attention_probs_copied[..., all_tokens_inds] += ( 65 | self.strengthen_scale * strengthen_map[..., all_tokens_inds] 66 | ) 67 | # NOTE: Temporal cross attention editing 68 | elif len(attention_probs.size()) == 5: 69 | strengthen_map = self.localized_temporal_weight_map( 70 | attention_probs_copied, 71 | bbox_per_frame=self.bbox_per_frame, 72 | dim_x = dim_x, 73 | dim_y = dim_y 74 | ) 75 | weaken_map = torch.ones_like(strengthen_map) 76 | zero_indices = torch.where(strengthen_map == 0) 
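# NOTE: zeros in the strengthen map mark positions outside the keyframed boxes; they are scaled by weaken_scale below, while in-box positions keep weight 1 and receive the additive strengthen term.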
77 | weaken_map[zero_indices] = self.weaken_scale 78 | # weakening 79 | attention_probs_copied *= weaken_map 80 | # strengthen 81 | attention_probs_copied += self.strengthen_scale * strengthen_map 82 | 83 | return attention_probs_copied 84 | -------------------------------------------------------------------------------- /TrailBlazer/CrossAttn/Utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import torch 3 | import torchvision 4 | import numpy as np 5 | 6 | from ..Misc import Logger as log 7 | from ..Setting import Config 8 | 9 | import matplotlib.pyplot as plt 10 | import matplotlib 11 | 12 | # To avoid plt.imshow crash 13 | matplotlib.use("Agg") 14 | 15 | 16 | class CAttnProcChoice(enum.Enum): 17 | INVALID = -1 18 | BASIC = 0 19 | 20 | 21 | def plot_activations(cross_attn, prompt, plot_with_trailings=False): 22 | num_frames = cross_attn.shape[0] 23 | cross_attn = cross_attn.cpu() 24 | for i in range(num_frames): 25 | filename = "/tmp/out.{:04d}.jpg".format(i) 26 | plot_activation(cross_attn[i], prompt, filename, plot_with_trailings) 27 | 28 | 29 | def plot_activation(cross_attn, prompt, filepath="", plot_with_trailings=False): 30 | 31 | splitted_prompt = prompt.split(" ") 32 | n = len(splitted_prompt) 33 | start = 0 34 | arrs = [] 35 | if plot_with_trailings: 36 | for j in range(5): 37 | arr = [] 38 | for i in range(start, start + n): 39 | cross_attn_sliced = cross_attn[..., i + 1] 40 | arr.append(cross_attn_sliced.T) 41 | start += n 42 | arr = np.hstack(arr) 43 | arrs.append(arr) 44 | arrs = np.vstack(arrs).T 45 | else: 46 | arr = [] 47 | for i in range(start, start + n): 48 | print(i) 49 | cross_attn_sliced = cross_attn[..., i + 1] 50 | arr.append(cross_attn_sliced) 51 | arrs = np.hstack(arr).astype(np.float32) 52 | plt.clf() 53 | 54 | v_min = arrs.min() 55 | v_max = arrs.max() 56 | n_min = 0.0 57 | n_max = 1 58 | 59 | arrs = (arrs - v_min) / (v_max - v_min) 60 | arrs = (arrs * (n_max - n_min)) + n_min 61 | 62 | plt.imshow(arrs, cmap="jet") 63 | plt.title(prompt) 64 | plt.colorbar(orientation="horizontal", pad=0.2) 65 | if filepath: 66 | plt.savefig(filepath) 67 | log.info(f"Saved [{filepath}]") 68 | else: 69 | plt.show() 70 | 71 | 72 | def get_cross_attn( 73 | unet, 74 | resolution=32, 75 | target_size=64, 76 | ): 77 | """To get the cross attention map softmax(QK^T) from Unet. 78 | Args: 79 | unet (UNet2DConditionModel): unet 80 | resolution (int): the cross attention map with specific resolution. It only supports 64, 32, 16, and 8 81 | target_size (int): the target resolution for resizing the cross attention map 82 | Returns: 83 | (torch.tensor): a tensor with shape (target_size, target_size, 77) 84 | """ 85 | attns = [] 86 | check = [8, 16, 32, 64] 87 | if resolution not in check: 88 | raise ValueError( 89 | "The cross attention resolution only support 8x8, 16x16, 32x32, and 64x64. " 90 | "The given resolution {}x{} is not in the list. Abort.".format( 91 | resolution, resolution 92 | ) 93 | ) 94 | for name, module in unet.named_modules(): 95 | module_name = type(module).__name__ 96 | # NOTE: attn2 is for cross-attention while attn1 is self-attention 97 | dim = resolution * resolution 98 | if not hasattr(module, "processor"): 99 | continue 100 | if hasattr(module.processor, "cross_attention_map"): 101 | attn = module.processor.cross_attention_map[None, ...] 
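# NOTE: each qualifying processor caches its softmax(QK^T) map during the forward pass; the maps gathered here are concatenated and summed over modules below.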
102 | attns.append(attn) 103 | 104 | if not attns: 105 | print("Err: Queried attns size [{}]".format(len(attns))) 106 | return 107 | attns = torch.cat(attns, dim=0) 108 | attns = torch.sum(attns, dim=0) 109 | # resized = torch.zeros([target_size, target_size, 77]) 110 | # f = torchvision.transforms.Resize(size=(64, 64)) 111 | # dim = attns.shape[1] 112 | # print(attns.shape) 113 | # for i in range(77): 114 | # attn_slice = attns[..., i].view(1, dim, dim) 115 | # resized[..., i] = f(attn_slice)[0] 116 | return attns 117 | 118 | 119 | def get_avg_cross_attn(unet, resolutions, resize): 120 | """To get the average cross attention map across its resolutions. 121 | Args: 122 | unet (UNet2DConditionModel): unet 123 | resolutions (list): a list of specific resolutions. It only supports 64, 32, 16, and 8 124 | resize (int): the target resolution for resizing the cross attention map 125 | Returns: 126 | (torch.tensor): a tensor with shape (target_size, target_size, 77) 127 | """ 128 | cross_attns = [] 129 | for resolution in resolutions: 130 | try: 131 | cross_attns.append(get_cross_attn(unet, resolution, resize)) 132 | except Exception: 133 | log.warn(f"No cross-attention map with resolution [{resolution}]") 134 | if cross_attns: 135 | cross_attns = torch.stack(cross_attns).mean(0) 136 | return cross_attns 137 | 138 | 139 | def save_cross_attn(unet): 140 | """TODO: to save cross attn""" 141 | for name, module in unet.named_modules(): 142 | module_name = type(module).__name__ 143 | if module_name == "CrossAttention" and "attn2" in name: 144 | folder = "/tmp" 145 | filepath = folder + "/" + name + ".pt" 146 | torch.save(module.attn, filepath) 147 | print(filepath) 148 | 149 | 150 | def use_dd(unet, use=True): 151 | for name, module in unet.named_modules(): 152 | module_name = type(module).__name__ 153 | if module_name == "CrossAttention" and "attn2" in name: 154 | module.processor.use_dd = use 155 | 156 | 157 | def use_dd_temporal(unet, use=True): 158 | for name, module in unet.named_modules(): 159 | module_name = type(module).__name__ 160 | if module_name == "CrossAttention" and "attn2" in name: 161 | module.processor.use_dd_temporal = use 162 | 163 | 164 | def get_loss(unet): 165 | loss = 0 166 | total = 0 167 | for name, module in unet.named_modules(): 168 | module_name = type(module).__name__ 169 | if module_name == "CrossAttention" and "attn2" in name: 170 | loss += module.processor.loss 171 | total += 1 172 | return loss / total 173 | 174 | 175 | def get_params(unet): 176 | parameters = [] 177 | for name, module in unet.named_modules(): 178 | module_name = type(module).__name__ 179 | if module_name == "CrossAttention" and "attn2" in name: 180 | parameters.append(module.processor.parameters) 181 | return parameters 182 | -------------------------------------------------------------------------------- /TrailBlazer/CrossAttn/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /TrailBlazer/Metrics/ClipScore/ClipScore.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import torch 3 | from PIL import Image 4 | from io import BytesIO 5 | import os 6 | import glob 7 | import imageio 8 | import tqdm 9 | import numpy as np 10 | from diffusers import StableDiffusionImg2ImgPipeline 11 | from torchmetrics.functional.multimodal import clip_score 12 | from functools import partial 13 | 14 | from TrailBlazer.Misc import
Logger as log 15 | 16 | os.environ["TOKENIZERS_PARALLELISM"] = "0" 17 | 18 | 19 | def calculate_clip_score(path, frame_skips=2): 20 | 21 | model_root = "/home/kma/Workspace/Project/Models" 22 | model_id = "openai/clip-vit-base-patch16" 23 | model_path = os.path.join(model_root, model_id) 24 | clip_score_fn = partial(clip_score, model_name_or_path=model_path) 25 | 26 | video_paths = sorted(glob.glob(os.path.join(path, "*.mp4"))) 27 | mean_clipscore = 0 28 | total_number = 0 29 | for f, video_path in enumerate(video_paths): 30 | vid = imageio.get_reader(video_path) 31 | metadata_path = os.path.splitext(video_path)[0] + ".pt" 32 | metadata = torch.load(metadata_path) 33 | if metadata["bundle"].get("prompt"): 34 | prompt = metadata["bundle"].get("prompt") 35 | else: 36 | prompt = metadata["bundle"]["keyframe"][0]["prompt"] 37 | clipscore_per_video = [] 38 | for i, frame in tqdm.tqdm( 39 | enumerate(vid), 40 | total=vid.count_frames(), 41 | leave=False, 42 | bar_format="{l_bar}{bar:15}{r_bar}{bar:-15b}", 43 | ): 44 | if i % frame_skips != 0: 45 | continue 46 | image_int = np.array(Image.fromarray(frame))[np.newaxis, ...].astype( 47 | "uint8" 48 | ) 49 | # images_int = (image * 255).astype("uint8") 50 | clipscore = clip_score_fn( 51 | torch.from_numpy(image_int).permute(0, 3, 1, 2), [prompt] 52 | ).detach() 53 | clipscore_per_video.append(clipscore) 54 | mean_clipscore_video = np.array(clipscore_per_video).mean() 55 | 56 | msg = f"{f:02d}/{len(video_paths)} |" 57 | msg += f"CS: {mean_clipscore_video:.2f} |" 58 | msg += f"Prompt: {prompt} |" 59 | msg += f"Config: {os.path.basename(video_path)}" 60 | log.info(msg) 61 | mean_clipscore += np.array(clipscore_per_video).mean() 62 | return round(float(mean_clipscore) / len(video_paths), 4) 63 | -------------------------------------------------------------------------------- /TrailBlazer/Metrics/ClipScore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/TrailBlazer/Metrics/ClipScore/__init__.py -------------------------------------------------------------------------------- /TrailBlazer/Metrics/Metrics.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import tqdm 3 | import glob 4 | import imageio 5 | import numpy as np 6 | import torch 7 | from PIL import Image 8 | 9 | from CommonMetricsOnVideoQuality.calculate_fvd import calculate_fvd as calc_fvd 10 | from InceptionScorePytorch.inception_score import inception_score 11 | from OwlVit.main import compute_miou 12 | from PytorchFid.src.pytorch_fid.fid_score import calculate_fid_given_paths 13 | from ClipScore.ClipScore import calculate_clip_score 14 | from CleanFid.cleanfid import fid 15 | 16 | from TrailBlazer.Misc import Logger as log 17 | 18 | 19 | def to_torch(path): 20 | arr = np.load(path) 21 | arr = np.transpose(arr, [0, 3, 1, 2]) 22 | return torch.from_numpy(arr) 23 | 24 | 25 | def calculate_kid(paths): 26 | paths = [os.path.join(p, "images") for p in paths] 27 | score = fid.compute_kid(paths[0], paths[1]) 28 | print("\n -----> KID:", score) 29 | return score 30 | 31 | def calculate_clipscore(path): 32 | clip_score = calculate_clip_score(path, frame_skips=1) 33 | print("\n -----> ClipScore:", clip_score) 34 | return clip_score 35 | 36 | 37 | def calculate_fvd(paths): 38 | 39 | paths = [os.path.join(p, "out.npy") for p in paths] 40 | videos1 = torch.unsqueeze(to_torch(paths[0]), 0) 41 | videos2 = 
torch.unsqueeze(to_torch(paths[1]), 0) 42 | 43 | # NUMBER_OF_VIDEOS = 8 44 | # VIDEO_LENGTH = 50 45 | # CHANNEL = 3 46 | # SIZE = 64 47 | # videos1 = torch.zeros( 48 | # NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False 49 | # ) 50 | # videos2 = torch.ones( 51 | # NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False 52 | # ) 53 | device = torch.device("cuda") 54 | # device = torch.device("cpu") 55 | 56 | import json 57 | 58 | result = calc_fvd(videos1, videos2, device, method="videogpt") 59 | fvd_value = sum(result["value"].values()) / len(result["value"]) 60 | print("\n -----> FVD:", fvd_value) 61 | return fvd_value 62 | 63 | def calculate_is(path): 64 | 65 | import torchvision.datasets as dset 66 | import torchvision.transforms as transforms 67 | from torch.utils.data import Dataset, DataLoader 68 | 69 | class MyDataset(Dataset): 70 | def __init__(self, data, transform=None): 71 | self.data = data 72 | # self.targets = torch.LongTensor(targets) 73 | self.transform = transform 74 | 75 | def __getitem__(self, index): 76 | x = self.data[index] 77 | # y = self.targets[index] 78 | 79 | # if self.transform: 80 | # x = Image.fromarray(self.data[index].astype(np.uint8).transpose(1,2,0)) 81 | # x = self.transform(x) 82 | 83 | return x 84 | 85 | def __len__(self): 86 | return len(self.data) 87 | 88 | path = os.path.join(path, "out.npy") 89 | data = to_torch(path) 90 | 91 | dataset = MyDataset(data) 92 | result = inception_score(dataset, cuda=True, batch_size=32, resize=True, splits=10) 93 | print( 94 | "\n -----> IS(Mean/Std): ", 95 | result 96 | ) 97 | result = [round(v, 4) for v in result] 98 | return result 99 | 100 | 101 | def calculate_miou(path): 102 | miou = compute_miou(path) 103 | print("\n -----> mIOU: ", miou) 104 | return miou 105 | 106 | def calculate_fid(paths, batch_size=64, device="cuda", dims=2048, num_workers=8): 107 | paths = [os.path.join(p, "images") for p in paths] 108 | fid_value = calculate_fid_given_paths(paths, batch_size, device, dims, num_workers) 109 | print("\n -----> FID: ", fid_value) 110 | return fid_value 111 | 112 | def prepare_assets( 113 | video_folder, frame_skips=1, res_skips=1, out_filename="out.npy", max_n_videos=-1 114 | ): 115 | 116 | video_paths = glob.glob(os.path.join(video_folder, "*.mp4")) 117 | image_folder = os.path.join(video_folder, "images") 118 | if not os.path.exists(image_folder): 119 | os.makedirs(image_folder) 120 | 121 | videos = [] 122 | total_frame = 0 123 | target_shape = () 124 | msg = "Pre-analyzing the video folders by " 125 | msg += f"frame skips every {frame_skips}, " 126 | msg += f"res skips {res_skips}, " 127 | msg += f"and maximum videos {max_n_videos}" 128 | log.info(msg) 129 | n = max_n_videos if max_n_videos > 0 else len(video_paths) 130 | for f, video_path in tqdm.tqdm(enumerate(video_paths), total=n, leave=False): 131 | if max_n_videos > 0 and f > max_n_videos: 132 | break 133 | vid = imageio.get_reader(video_path) 134 | shape = vid.get_data(0).shape 135 | frames = np.zeros((vid.count_frames(), *shape), dtype=np.float16) 136 | if frame_skips > 1: 137 | frames = frames[::frame_skips] 138 | if res_skips > 1: 139 | frames = frames[:, ::res_skips, ::res_skips, :] 140 | total_frame += frames.shape[0] 141 | if not target_shape: 142 | target_shape = frames.shape[1:] 143 | log.info(f"Analyzed, total frame: {total_frame}, target res: {target_shape}") 144 | 145 | videos_arr = np.zeros((total_frame, *target_shape), dtype=np.float16) 146 | total_frame = 0 147 | current_total_frame_ids = 0 148 
| log.info("Start feeding the video data...") 149 | for f, video_path in tqdm.tqdm(enumerate(video_paths), total=n, leave=False): 150 | if max_n_videos > 0 and f > max_n_videos: 151 | break 152 | vid = imageio.get_reader(video_path) 153 | shape = vid.get_data(0).shape 154 | frames = np.zeros((vid.count_frames(), *shape), dtype=np.float16) 155 | for i, frame in enumerate(vid): 156 | frames[i] = frame 157 | if i % frame_skips == 0: 158 | filename = ( 159 | os.path.splitext(os.path.basename(video_path))[0] + f".{i:04d}.jpg" 160 | ) 161 | image = Image.fromarray(frame) 162 | image_filepath = os.path.join(image_folder, filename) 163 | image.save(image_filepath) 164 | 165 | if frame_skips > 1: 166 | frames = frames[::frame_skips] 167 | if res_skips > 1: 168 | frames = frames[:, ::res_skips, ::res_skips, :] 169 | 170 | total_frame += frames.shape[0] 171 | 172 | videos_arr[ 173 | current_total_frame_ids : current_total_frame_ids + frames.shape[0] 174 | ] = frames 175 | current_total_frame_ids += frames.shape[0] 176 | # msg = "" 177 | # msg += f"Prog: {f:05d}/{len(video_paths)} " 178 | # msg += f"File: {filename} " 179 | # msg += f"#Fr(U/O): {frames.shape[0]:03d}/{vid.count_frames():03d} " 180 | # msg += ( 181 | # f"Res(U/O): ({frames.shape[1]},{frames.shape[2]})/({shape[0]},{shape[1]}) " 182 | # ) 183 | # msg += f"Current total frames: {total_frame}" 184 | # print(msg) 185 | 186 | # videos = np.concatenate(videos, dtype=np.float16) 187 | out_filepath = os.path.join(video_folder, out_filename) 188 | np.save(out_filepath, videos_arr) 189 | log.info(f"Saved [{out_filepath}]") 190 | log.info( 191 | f"Array size: {videos_arr.shape} File size: {os.path.getsize(out_filepath)/(1024**2):.2f}MB" 192 | ) 193 | return videos 194 | 195 | 196 | def batch_evaluate(real_folder, fake_folder): 197 | paths = [fake_folder, real_folder] 198 | scores = {} 199 | scores["FID"] = calculate_fid(paths) 200 | scores["FVD"] = calculate_fvd(paths) 201 | scores["IS"] = calculate_is(paths[0]) 202 | scores["KID"] = calculate_kid(paths) 203 | scores["CLIPSim"] = calculate_clipscore(paths[0]) 204 | if "Text2VideoZero" not in fake_folder: 205 | scores["mIOU"] = calculate_miou(paths[0]) 206 | return scores 207 | 208 | 209 | def make_dataset(path): 210 | prepare_assets( 211 | # f"/home/kma/Workspace/Project/Trailblazer/ECCV/Metrics-Dyna/{tester}", 212 | path, 213 | frame_skips=1, 214 | res_skips=2, 215 | max_n_videos=400, 216 | ) 217 | 218 | 219 | if __name__ == "__main__": 220 | pass 221 | # paths = ["/home/kma/Workspace/Project/Data/Test2", "/home/kma/Workspace/Project/Data/Test1"] 222 | # paths = ["/home/kma/Workspace/Project/Data/Test3"] 223 | # prepare_assets("/home/kma/Workspace/Project/Trailblazer/ECCV/Supp/TrailBlazer") 224 | # prepare_assets("/home/kma/Workspace/Project/Trailblazer/ECCV/Supp/Peekaboo") 225 | # quit() 226 | # calculate_clipscore("/home/kma/Workspace/Project/Trailblazer/ECCV/Metrics/TrailBlazer") 227 | testers = ["Text2VideoZero", "Peekaboo", "TrailBlazer"] 228 | testers = ["Peekaboo", "TrailBlazer"] 229 | # calculate_clipscore(f"/home/kma/Workspace/Project/Trailblazer/ECCV/Metrics/{tester}") 230 | real_folder = "/home/kma/Workspace/Project/Data/AnimalKingdom/video/" 231 | 232 | all_scores = {} 233 | for tester in testers: 234 | print("\n\n\n") 235 | log.info(f"===== {tester} =====") 236 | # fake_folder = ( 237 | # f"/home/kma/Workspace/Project/Trailblazer/ECCV/Metrics-Static/{tester}" 238 | # ) 239 | fake_folder = f"/home/kma/Workspace/Project/Trailblazer/ECCV/Supp/{tester}" 240 | all_scores[tester] = 
batch_evaluate(real_folder, fake_folder) 241 | 242 | log.info("===== Done =====") 243 | log.info("") 244 | log.info("") 245 | log.info("") 246 | log.info("==== Summary ====") 247 | log.info("") 248 | for tester in all_scores.keys(): 249 | log.info(f"[[{tester}]]") 250 | for s in all_scores[tester].keys(): 251 | log.info(f" {s}:{all_scores[tester][s]}") 252 | log.info("------------------------") 253 | -------------------------------------------------------------------------------- /TrailBlazer/Metrics/OwlVit/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image 3 | import torch 4 | import tqdm 5 | import os 6 | from transformers import OwlViTProcessor, OwlViTForObjectDetection 7 | import glob 8 | import imageio 9 | 10 | 11 | def compute_miou(path): 12 | 13 | # model 14 | model_root = "/home/kma/Workspace/Project/Models" 15 | model_id = "google/owlvit-base-patch32" 16 | model_path = os.path.join(model_root, model_id) 17 | processor = OwlViTProcessor.from_pretrained(model_path) 18 | model = OwlViTForObjectDetection.from_pretrained(model_path) 19 | 20 | video_paths = glob.glob(os.path.join(path, "*.mp4")) 21 | mean_iou = 0 22 | for f, video_path in enumerate(video_paths): 23 | vid = imageio.get_reader(video_path) 24 | shape = vid.get_data(0).shape 25 | 26 | metadata_path = os.path.splitext(video_path)[0] + ".pt" 27 | metadata = torch.load(metadata_path) 28 | 29 | prompt = metadata["bundle"]["keyframe"][0]["prompt"] 30 | token_ids = metadata["bundle"]["token_inds"] 31 | gt_bboxes = metadata["bbox"] 32 | texts = [ 33 | [ 34 | " ".join( 35 | [p for i, p in enumerate(prompt.split(" ")) if i + 1 in token_ids] 36 | ) 37 | ] 38 | ] 39 | 40 | total_frames = 0 41 | mean_iou_video = 0.0 42 | print(video_path) 43 | for i, frame in tqdm.tqdm( 44 | enumerate(vid), 45 | total=vid.count_frames(), 46 | leave=False, 47 | bar_format="{l_bar}{bar:15}{r_bar}{bar:-15b}", 48 | ): 49 | 50 | image = Image.fromarray(frame) 51 | inputs = processor(text=texts, images=image, return_tensors="pt") 52 | outputs = model(**inputs) 53 | # Target image sizes (height, width) to rescale box predictions [batch_size, 2] 54 | target_sizes = torch.Tensor([image.size[::-1]]) 55 | # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) 56 | results = processor.post_process_object_detection( 57 | outputs=outputs, target_sizes=target_sizes, threshold=0.1 58 | ) 59 | 60 | text = texts[0] 61 | boxes, scores, labels = ( 62 | results[0]["boxes"], 63 | results[0]["scores"], 64 | results[0]["labels"], 65 | ) 66 | 67 | try: 68 | box = [v.item() for v in boxes[torch.argmax(scores)]] 69 | except: 70 | continue 71 | 72 | gt_box = gt_bboxes[i] 73 | gt_box[0] = gt_box[0] * metadata["bundle"]["width"] 74 | gt_box[1] = gt_box[1] * metadata["bundle"]["height"] 75 | gt_box[2] = gt_box[2] * metadata["bundle"]["width"] 76 | gt_box[3] = gt_box[3] * metadata["bundle"]["height"] 77 | score = scores[torch.argmax(scores)].item() 78 | 79 | area_overlap = (min(box[2], gt_box[2]) - max(box[0], gt_box[0])) * ( 80 | min(box[3], gt_box[3]) - max(box[1], gt_box[1]) 81 | ) 82 | 83 | area_union = ( 84 | (box[2] - box[0]) * (box[3] - box[1]) 85 | + (gt_box[2] - gt_box[0]) * (gt_box[3] - gt_box[1]) 86 | - area_overlap 87 | ) 88 | 89 | iou = area_overlap / area_union 90 | # print(iou) 91 | if iou < 0: 92 | iou = 0 93 | # print( 94 | # f"Score {score:.2f} Box(L/T/R/B) {box[0]:.2f} {box[1]:.2f} {box[2]:.2f} {box[3]:.2f}" 95 | # ) 96 | # print(box) 97 | 
# print(gt_box) 98 | # print("iou", iou) 99 | mean_iou_video += iou 100 | total_frames += 1 101 | 102 | # quit() 103 | # for box, score, label in zip(boxes, scores, labels): 104 | # box = [round(i, 2) for i in box.tolist()] 105 | # print( 106 | # f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}" 107 | # ) 108 | if total_frames: 109 | mean_iou_video = mean_iou_video/total_frames 110 | mean_iou += mean_iou_video 111 | 112 | return round(mean_iou / len(video_paths), 4) 113 | -------------------------------------------------------------------------------- /TrailBlazer/Metrics/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/TrailBlazer/Metrics/README.md -------------------------------------------------------------------------------- /TrailBlazer/Metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/TrailBlazer/Metrics/__init__.py -------------------------------------------------------------------------------- /TrailBlazer/Misc/BBox.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | import torch 4 | 5 | 6 | class BoundingBox: 7 | """A rectangular bounding box determines the directed regions.""" 8 | 9 | def __init__(self, dim_x, dim_y, box_ratios, margin=0.01): 10 | """ 11 | Args: 12 | resolution(int): the resolution of the 2d spatial input 13 | box_ratios(List[float]): 14 | Returns: 15 | """ 16 | assert ( 17 | box_ratios[1] < box_ratios[3] 18 | ), "the boundary top ratio should be less than bottom" 19 | assert ( 20 | box_ratios[0] < box_ratios[2] 21 | ), "the boundary left ratio should be less than right" 22 | self.left = int((box_ratios[0] + margin) * dim_x) 23 | self.right = int((box_ratios[2] - margin) * dim_x) 24 | self.top = int((box_ratios[1] + margin) * dim_y) 25 | self.bottom = int((box_ratios[3] - margin) * dim_y) 26 | 27 | self.height = self.bottom - self.top 28 | self.width = self.right - self.left 29 | if self.height == 0: 30 | self.height = 1 31 | if self.width == 0: 32 | self.width = 1 33 | 34 | def sliced_tensor_in_bbox(self, tensor: torch.tensor) -> torch.tensor: 35 | """ slicing the tensor with bbox area 36 | 37 | Args: 38 | tensor(torch.tensor): the original tensor in 4d 39 | Returns: 40 | (torch.tensor): the reduced tensor inside bbox 41 | """ 42 | return tensor[:, self.top : self.bottom, self.left : self.right, :] 43 | 44 | def mask_reweight_out_bbox( 45 | self, tensor: torch.tensor, value: float = 0.0 46 | ) -> torch.tensor: 47 | """reweighting value outside bbox 48 | 49 | Args: 50 | tensor(torch.tensor): the original tensor in 4d 51 | value(float): reweighting factor default with 0.0 52 | Returns: 53 | (torch.tensor): the reweighted tensor 54 | """ 55 | mask = torch.ones_like(tensor).to(tensor.device) * value 56 | mask[:, self.top : self.bottom, self.left : self.right, :] = 1 57 | return tensor * mask 58 | 59 | def mask_reweight_in_bbox( 60 | self, tensor: torch.tensor, value: float = 0.0 61 | ) -> torch.tensor: 62 | """reweighting value within bbox 63 | 64 | Args: 65 | tensor(torch.tensor): the original tensor in 4d 66 | value(float): reweighting factor default with 0.0 67 | Returns: 68 | (torch.tensor): the reweighted tensor 69 | """ 70 | mask = torch.ones_like(tensor).to(tensor.device) 71 | 
mask[:, self.top : self.bottom, self.left : self.right, :] = value 72 | return tensor * mask 73 | 74 | def __str__(self): 75 | """prints Box(L:%d, R:%d, T:%d, B:%d) for readability""" 76 | return f"Box(L:{self.left}, R:{self.right}, T:{self.top}, B:{self.bottom})" 77 | 78 | def __repr__(self): 79 | """ """ 80 | return f"Box(L:{self.left}, R:{self.right}, T:{self.top}, B:{self.bottom})" 81 | 82 | 83 | if __name__ == "__main__": 84 | # Example: second quadrant 85 | input_res = 32 86 | left = 0.0 87 | top = 0.0 88 | right = 0.5 89 | bottom = 0.5 90 | box_ratios = [left, top, right, bottom] 91 | bbox = BoundingBox(dim_x=input_res, dim_y=input_res, box_ratios=box_ratios) 92 | 93 | print(bbox) 94 | # Box(L:0, R:15, T:0, B:15) (the default margin=0.01 trims the edges) 95 | -------------------------------------------------------------------------------- /TrailBlazer/Misc/ConfigIO.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | def config_loader(filepath): 4 | data = None 5 | with open(filepath, "r") as yamlfile: 6 | data = yaml.load(yamlfile, Loader=yaml.FullLoader) 7 | yamlfile.close() 8 | return data 9 | 10 | def config_saver(data, filepath): 11 | with open(filepath, 'w') as yamlfile: 12 | yaml.dump(data, yamlfile) 13 | yamlfile.close() 14 | -------------------------------------------------------------------------------- /TrailBlazer/Misc/Const.py: -------------------------------------------------------------------------------- 1 | # https://okuha.com/best-stable-diffusion-prompts/ 2 | 3 | NEGATIVE_PROMPT = "bad anatomy, bad proportions, blurry, cloned face, cropped, deformed, dehydrated, disfigured, duplicate, error, extra arms, extra fingers, extra legs, extra limbs, fused fingers, gross proportions, jpeg artifacts, long neck, low quality, lowres, malformed limbs, missing arms, missing legs, morbid, mutated hands, mutation, mutilated, out of frame, poorly drawn face, poorly drawn hands, signature, text, too many fingers, ugly, username, watermark, worst quality, Amputee, Autograph, Bad anatomy, Bad illustration, Bad proportions, Beyond the borders, Blank background, Blurry, Body out of frame, Boring background, Branding, Cropped, Cut off, Deformed, Disfigured, Dismembered, Disproportioned, Distorted, Draft, Duplicate, Duplicated features, Extra arms, Extra fingers, Extra hands, Extra legs, Extra limbs, Fault, Flaw, Fused fingers, Grains, Grainy, Gross proportions, Hazy, Identifying mark, Improper scale, Incorrect physiology, Incorrect ratio, Indistinct, Kitsch, Logo, Long neck, Low quality, Low resolution, Macabre, Malformed, Mark, Misshapen, Missing arms, Missing fingers, Missing hands, Missing legs, Mistake, Morbid, Mutated hands, Mutation, Mutilated, Off-screen, Out of frame, Outside the picture, Pixelated, Poorly drawn face, Poorly drawn feet, Poorly drawn hands, Printed words, Render, Repellent, Replicate, Reproduce, Revolting dimensions, Script, Shortened, Sign, Signature, Split image, Squint, Storyboard, Text, Tiling, Trimmed, Ugly, Unfocused, Unattractive, Unnatural pose, Unreal engine, Unsightly, Watermark, Written language, Absent limbs, Additional appendages, Additional digits, Additional limbs, Altered appendages, Amputee, Asymmetric, Asymmetric ears, Bad anatomy, Bad ears, Bad eyes, Bad face, Bad proportions, Broken finger, Broken hand, Broken leg, Broken wrist, Cartoon, Cloned face, Cloned head, Collapsed eyeshadow, Combined appendages, Conjoined, Copied visage, Corpse, Cripple, Cropped head, Cross-eyed, Depressed, Desiccated, Disconnected limb, Disfigured,
Dismembered, Disproportionate, Double face, Duplicated features, Eerie, Elongated throat, lowres, low quality, jpeg, artifacts, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, drawing, painting, crayon, sketch, graphite, impressionist, noisy, soft, extra tails" 4 | 5 | 6 | POSITIVE_PROMPT = "; masterpiece, best quality, intricate, detailed, sharp, focused, intricate details, hyperdetailed, 8k, RAW photo, realistic style, national geography, fantasy, hyper-realistic, rich colors, realistic texture" 7 | 8 | DEFAULT_HEIGHT = 320 9 | DEFAULT_WIDTH = 576 10 | -------------------------------------------------------------------------------- /TrailBlazer/Misc/Logger.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | 4 | from io import StringIO # Python3 5 | 6 | import sys 7 | 8 | class SilencedStdOut: 9 | # https://stackoverflow.com/questions/65608502/is-there-a-way-to-force-any-function-to-not-be-verbose-in-python 10 | def __enter__(self): 11 | self.old_stdout = sys.stdout 12 | self.result = StringIO() 13 | sys.stdout = self.result 14 | 15 | def __exit__(self, *args, **kwargs): 16 | 17 | sys.stdout = self.old_stdout 18 | result_string = self.result.getvalue() # use if you want or discard. 19 | 20 | class CustomFormatter(logging.Formatter): 21 | 22 | GRAY = "\x1b[38m" 23 | YELLOW = "\x1b[33m" 24 | CYAN = "\x1b[36m" 25 | RED = "\x1b[31m" 26 | BOLD_RED = "\x1b[31;1m" 27 | RESET = "\x1b[0m" 28 | FORMAT = "[%(asctime)s - %(name)s - %(levelname)8s] - %(message)s (%(filename)s:%(lineno)d)" 29 | 30 | FORMATS = { 31 | logging.DEBUG: CYAN + FORMAT + RESET, 32 | logging.INFO: GRAY + FORMAT + RESET, 33 | logging.WARNING: YELLOW + FORMAT + RESET, 34 | logging.ERROR: RED + FORMAT + RESET, 35 | logging.CRITICAL: BOLD_RED + FORMAT + RESET, 36 | 37 | } 38 | 39 | def format(self, record): 40 | log_fmt = self.FORMATS.get(record.levelno) 41 | formatter = logging.Formatter(log_fmt) 42 | return formatter.format(record) 43 | 44 | # create the TrailBlazer logger 45 | 46 | logger = logging.getLogger("TrailBlazer") 47 | logger.handlers = [] 48 | logger.setLevel(logging.DEBUG) 49 | # create console handler with a higher log level 50 | console_handler = logging.StreamHandler() 51 | console_handler.setLevel(logging.DEBUG) 52 | 53 | console_handler.setFormatter(CustomFormatter()) 54 | logger.addHandler(console_handler) 55 | 56 | critical = logger.critical 57 | fatal = logger.fatal 58 | error = logger.error 59 | warning = logger.warning 60 | warn = logger.warning 61 | info = logger.info 62 | debug = logger.debug 63 | 64 | if __name__ == "__main__": 65 | from TrailBlazer.Misc import Logger as log 66 | log.info("info message") 67 | log.warning("warning message") 68 | log.error("error message") 69 | log.debug("debug message") 70 | log.critical("critical message") 71 | -------------------------------------------------------------------------------- /TrailBlazer/Misc/Painter.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | import torch 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import torch.nn.functional as nnf 7 | import torchvision 8 |
import einops 9 | import matplotlib.pyplot as plt 10 | import scipy.stats as st 11 | from PIL import Image, ImageFont, ImageDraw 12 | 13 | plt.rcParams["figure.figsize"] = [ 14 | float(v) * 1.5 for v in plt.rcParams["figure.figsize"] 15 | ] 16 | 17 | 18 | class CrossAttnPainter: 19 | 20 | def __init__(self, bundle, pipe, root="/tmp"): 21 | self.dim = 64 22 | self.folder = root 23 | self.pipe = pipe 24 | def plot_frames(self, video_frames): 25 | folder = self.folder 26 | from PIL import Image 27 | for i, f in enumerate(video_frames): 28 | img = Image.fromarray(f) 29 | filepath = "{}/recons.{:04d}.jpg".format(folder, i) 30 | img.save(filepath) 31 | 32 | 33 | def plot_spatial_attn(self): 34 | 35 | arr = ( 36 | self.pipe.unet.up_blocks[1] 37 | .attentions[0] 38 | .transformer_blocks[0] 39 | .attn2.processor.cross_attention_map 40 | ) 41 | heads = self.pipe.unet.up_blocks[1].attentions[0].transformer_blocks[0].attn2.heads 42 | arr = torch.transpose(arr, 1, 3) 43 | arr = nnf.interpolate(arr, size=(64, 64), mode='bicubic', align_corners=False) 44 | arr = torch.transpose(arr, 1, 3) 45 | arr = arr.cpu().numpy() 46 | arr = arr.reshape(24, heads, 64, 64, 77) 47 | arr = arr.mean(axis=1) 48 | n = arr.shape[0] 49 | for i in range(n): 50 | filename = "/tmp/spatialca.{:04d}.jpg".format(i) 51 | plt.clf() 52 | plt.imshow(arr[i, :, :, 2], cmap="jet") 53 | plt.gca().set_axis_off() 54 | plt.subplots_adjust(top = 1, bottom = 0, right = 1, left = 0, 55 | hspace = 0, wspace = 0) 56 | plt.margins(0,0) 57 | plt.gca().xaxis.set_major_locator(plt.NullLocator()) 58 | plt.gca().yaxis.set_major_locator(plt.NullLocator()) 59 | plt.savefig(filename, bbox_inches = 'tight',pad_inches = 0) 60 | print(filename) 61 | 62 | def plot_temporal_attn(self): 63 | 64 | # arr = pipe.unet.mid_block.temp_attentions[0].transformer_blocks[0].attn2.processor.cross_attention_map 65 | import matplotlib.pyplot as plt 66 | import torch.nn.functional as nnf 67 | arr = ( 68 | self.pipe.unet.up_blocks[2] 69 | .temp_attentions[1] 70 | .transformer_blocks[0] 71 | .attn2.processor.cross_attention_map 72 | ) 73 | #arr = pipe.unet.transformer_in.transformer_blocks[0].attn2.processor.cross_attention_map 74 | arr = torch.transpose(arr, 0, 2).transpose(1, 3) 75 | arr = nnf.interpolate(arr, size=(64, 64), mode="bicubic", align_corners=False) 76 | arr = torch.transpose(arr, 0, 2).transpose(1, 3) 77 | arr = arr.cpu().numpy() 78 | n = arr.shape[-1] 79 | for i in range(n-2): 80 | filename = "/tmp/tempcaiip2.{:04d}.jpg".format(i) 81 | plt.clf() 82 | plt.imshow(arr[..., i+2, i], cmap="jet") 83 | plt.gca().set_axis_off() 84 | plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) 85 | plt.margins(0, 0) 86 | plt.gca().xaxis.set_major_locator(plt.NullLocator()) 87 | plt.gca().yaxis.set_major_locator(plt.NullLocator()) 88 | plt.savefig(filename, bbox_inches="tight", pad_inches=0) 89 | print(filename) 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | def plot_latent_noise(latents, mode): 101 | 102 | for i in range(latents.shape[0]): 103 | tensor = latents[i].cpu() 104 | min_val = torch.min(tensor) 105 | max_val = torch.max(tensor) 106 | scale = 255 / (max_val - min_val) 107 | tensor = scale * (tensor - min_val) 108 | tensor = tensor.type(torch.uint8) 109 | tensor = einops.rearrange(tensor, "c w h -> w h c") 110 | if mode == "RGB": 111 | tensor = tensor[...,:3] 112 | mode_ = "RGB" 113 | elif mode == "RGBA": 114 | mode_ = "RGBA" 115 | pass 116 | elif mode == "GRAY": 117 | tensor = tensor[...,0] 118 | mode_ = "L" 119 | 120 | x = tensor.numpy() 121 | 122 | img = Image.fromarray(x, mode_) 123
| img = img.resize((256, 256), resample=Image.NEAREST ) 124 | filepath = f"/tmp/out.{i:04d}.jpg" 125 | img.save(filepath) 126 | 127 | tensor = latents[i].cpu() 128 | x = tensor.flatten().numpy() 129 | x /= x.max() 130 | plt.hist(x, density=True, bins=20, range=[-1, 1]) 131 | mn, mx = plt.xlim() 132 | plt.xlim(mn, mx) 133 | kde_xs = np.linspace(mn, mx, 300) 134 | kde = st.gaussian_kde(x) 135 | plt.plot(kde_xs, kde.pdf(kde_xs), label="PDF") 136 | filepath = f"/tmp/hist.{i:04d}.jpg" 137 | plt.savefig(filepath) 138 | plt.clf() 139 | 140 | print(i) 141 | 142 | 143 | def plot_activation(cross_attn, prompt, filepath="", plot_with_trailings=False, n_trailing=2): 144 | splitted_prompt = prompt.split(" ") 145 | n = len(splitted_prompt) 146 | start = 0 147 | arrs = [] 148 | if plot_with_trailings: 149 | for j in range(n_trailing): 150 | arr = [] 151 | for i in range(start, start + n): 152 | cross_attn_sliced = cross_attn[..., i + 1] 153 | arr.append(cross_attn_sliced.T) 154 | start += n 155 | arr = np.hstack(arr) 156 | arrs.append(arr) 157 | arrs = np.vstack(arrs).T 158 | else: 159 | arr = [] 160 | for i in range(start, start + n): 161 | cross_attn_sliced = cross_attn[..., i + 1] 162 | arr.append(cross_attn_sliced) 163 | arrs = np.vstack(arr) 164 | plt.imshow(arrs, cmap="jet", vmin=0.0, vmax=.5) 165 | plt.title(prompt) 166 | if filepath: 167 | plt.savefig(filepath) 168 | else: 169 | plt.show() 170 | 171 | 172 | def draw_dd_metadata(img, bbox, text="", target_res=1024): 173 | img = img.resize((target_res, target_res)) 174 | image_editable = ImageDraw.Draw(img) 175 | 176 | for region in [bbox]: 177 | x0 = region[0] * target_res 178 | y0 = region[2] * target_res 179 | x1 = region[1] * target_res 180 | y1 = region[3] * target_res 181 | image_editable.rectangle(xy=[x0, y0, x1, y1], outline=(255, 0, 0, 255), width=5) 182 | if text: 183 | font = ImageFont.truetype("./assets/JetBrainsMono-Bold.ttf", size=13) 184 | image_editable.multiline_text( 185 | (15, 15), 186 | text, 187 | (255, 255, 255, 0), 188 | font=font, 189 | stroke_width=2, 190 | stroke_fill=(0, 0, 0, 255), 191 | spacing=0, 192 | ) 193 | return img 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | if __name__ == "__main__": 223 | latents = torch.load("assets/experiments/a-cat-sitting-on-a-car_230615-144611/latents.pt") 224 | plot_latent_noise(latents, "GRAY") 225 | -------------------------------------------------------------------------------- /TrailBlazer/Misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/TrailBlazer/Misc/__init__.py -------------------------------------------------------------------------------- /TrailBlazer/Pipeline/UNet3DConditionModelCall.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved. 2 | # Copyright 2023 The ModelScope Team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from dataclasses import dataclass 16 | from typing import Any, Dict, List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | import torch.utils.checkpoint 21 | 22 | from diffusers.configuration_utils import ConfigMixin, register_to_config 23 | from diffusers.loaders import UNet2DConditionLoadersMixin 24 | from diffusers.utils import BaseOutput, logging 25 | from diffusers.models.attention_processor import ( 26 | ADDED_KV_ATTENTION_PROCESSORS, 27 | CROSS_ATTENTION_PROCESSORS, 28 | AttentionProcessor, 29 | AttnAddedKVProcessor, 30 | AttnProcessor, 31 | ) 32 | from diffusers.models.embeddings import TimestepEmbedding, Timesteps 33 | from diffusers.models.modeling_utils import ModelMixin 34 | from diffusers.models.transformer_temporal import TransformerTemporalModel 35 | from diffusers.models.unet_3d_blocks import ( 36 | CrossAttnDownBlock3D, 37 | CrossAttnUpBlock3D, 38 | DownBlock3D, 39 | UNetMidBlock3DCrossAttn, 40 | UpBlock3D, 41 | get_down_block, 42 | get_up_block, 43 | ) 44 | from diffusers.models.unet_3d_condition import UNet3DConditionOutput 45 | 46 | 47 | 48 | def unet3d_condition_model_forward( 49 | self, 50 | sample: torch.FloatTensor, 51 | timestep: Union[torch.Tensor, float, int], 52 | encoder_hidden_states: torch.Tensor, 53 | class_labels: Optional[torch.Tensor] = None, 54 | timestep_cond: Optional[torch.Tensor] = None, 55 | attention_mask: Optional[torch.Tensor] = None, 56 | cross_attention_kwargs: Optional[Dict[str, Any]] = None, 57 | down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, 58 | mid_block_additional_residual: Optional[torch.Tensor] = None, 59 | return_dict: bool = True, 60 | ) -> Union[UNet3DConditionOutput, Tuple]: 61 | r""" 62 | The [`UNet3DConditionModel`] forward method. 63 | 64 | Args: 65 | sample (`torch.FloatTensor`): 66 | The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`. 67 | timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. 68 | encoder_hidden_states (`torch.FloatTensor`): 69 | The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. 70 | return_dict (`bool`, *optional*, defaults to `True`): 71 | Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain 72 | tuple. 73 | cross_attention_kwargs (`dict`, *optional*): 74 | A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. 75 | 76 | Returns: 77 | [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`: 78 | If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise 79 | a `tuple` is returned where the first element is the sample tensor. 80 | """ 81 | # By default samples have to be AT least a multiple of the overall upsampling factor. 82 | # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). 83 | # However, the upsampling interpolation output size can be forced to fit any upsampling size 84 | # on the fly if necessary. 
85 | default_overall_up_factor = 2**self.num_upsamplers 86 | 87 | # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` 88 | forward_upsample_size = False 89 | upsample_size = None 90 | 91 | if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): 92 | logger.info("Forward upsample size to force interpolation output size.") 93 | forward_upsample_size = True 94 | 95 | # prepare attention_mask 96 | if attention_mask is not None: 97 | attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 98 | attention_mask = attention_mask.unsqueeze(1) 99 | 100 | # 1. time 101 | timesteps = timestep 102 | if not torch.is_tensor(timesteps): 103 | # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can 104 | # This would be a good case for the `match` statement (Python 3.10+) 105 | is_mps = sample.device.type == "mps" 106 | if isinstance(timestep, float): 107 | dtype = torch.float32 if is_mps else torch.float64 108 | else: 109 | dtype = torch.int32 if is_mps else torch.int64 110 | timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) 111 | elif len(timesteps.shape) == 0: 112 | timesteps = timesteps[None].to(sample.device) 113 | 114 | # broadcast to batch dimension in a way that's compatible with ONNX/Core ML 115 | num_frames = sample.shape[2] 116 | timesteps = timesteps.expand(sample.shape[0]) 117 | 118 | t_emb = self.time_proj(timesteps) 119 | 120 | # timesteps does not contain any weights and will always return f32 tensors 121 | # but time_embedding might actually be running in fp16. so we need to cast here. 122 | # there might be better ways to encapsulate this. 123 | t_emb = t_emb.to(dtype=self.dtype) 124 | 125 | emb = self.time_embedding(t_emb, timestep_cond) 126 | emb = emb.repeat_interleave(repeats=num_frames, dim=0) 127 | # encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) 128 | # print(encoder_hidden_states.shape) 129 | # quit() 130 | 131 | # 2. pre-process 132 | sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:]) 133 | sample = self.conv_in(sample) 134 | 135 | sample = self.transformer_in( 136 | sample, 137 | num_frames=num_frames, 138 | cross_attention_kwargs=cross_attention_kwargs, 139 | return_dict=False, 140 | )[0] 141 | 142 | # 3. down 143 | down_block_res_samples = (sample,) 144 | for downsample_block in self.down_blocks: 145 | if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: 146 | sample, res_samples = downsample_block( 147 | hidden_states=sample, 148 | temb=emb, 149 | encoder_hidden_states=encoder_hidden_states, 150 | attention_mask=attention_mask, 151 | num_frames=num_frames, 152 | cross_attention_kwargs=cross_attention_kwargs, 153 | ) 154 | else: 155 | sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) 156 | 157 | down_block_res_samples += res_samples 158 | 159 | if down_block_additional_residuals is not None: 160 | new_down_block_res_samples = () 161 | 162 | for down_block_res_sample, down_block_additional_residual in zip( 163 | down_block_res_samples, down_block_additional_residuals 164 | ): 165 | down_block_res_sample = down_block_res_sample + down_block_additional_residual 166 | new_down_block_res_samples += (down_block_res_sample,) 167 | 168 | down_block_res_samples = new_down_block_res_samples 169 | 170 | # 4. 
mid
171 |     if self.mid_block is not None:
172 |         sample = self.mid_block(
173 |             sample,
174 |             emb,
175 |             encoder_hidden_states=encoder_hidden_states,
176 |             attention_mask=attention_mask,
177 |             num_frames=num_frames,
178 |             cross_attention_kwargs=cross_attention_kwargs,
179 |         )
180 |
181 |     if mid_block_additional_residual is not None:
182 |         sample = sample + mid_block_additional_residual
183 |
184 |     # 5. up
185 |     for i, upsample_block in enumerate(self.up_blocks):
186 |         is_final_block = i == len(self.up_blocks) - 1
187 |
188 |         res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
189 |         down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
190 |
191 |         # if we have not reached the final block and need to forward the
192 |         # upsample size, we do it here
193 |         if not is_final_block and forward_upsample_size:
194 |             upsample_size = down_block_res_samples[-1].shape[2:]
195 |
196 |         if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
197 |             sample = upsample_block(
198 |                 hidden_states=sample,
199 |                 temb=emb,
200 |                 res_hidden_states_tuple=res_samples,
201 |                 encoder_hidden_states=encoder_hidden_states,
202 |                 upsample_size=upsample_size,
203 |                 attention_mask=attention_mask,
204 |                 num_frames=num_frames,
205 |                 cross_attention_kwargs=cross_attention_kwargs,
206 |             )
207 |         else:
208 |             sample = upsample_block(
209 |                 hidden_states=sample,
210 |                 temb=emb,
211 |                 res_hidden_states_tuple=res_samples,
212 |                 upsample_size=upsample_size,
213 |                 num_frames=num_frames,
214 |             )
215 |
216 |     # 6. post-process
217 |     if self.conv_norm_out:
218 |         sample = self.conv_norm_out(sample)
219 |         sample = self.conv_act(sample)
220 |
221 |     sample = self.conv_out(sample)
222 |
223 |     # reshape to (batch, channel, framerate, width, height)
224 |     sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4)
225 |
226 |     if not return_dict:
227 |         return (sample,)
228 |
229 |     return UNet3DConditionOutput(sample=sample)
230 |
--------------------------------------------------------------------------------
/TrailBlazer/Pipeline/Utils.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from typing import Any, Callable, Dict, List, Optional, Union
3 |
4 | import numpy as np
5 | import torch
6 | from transformers import CLIPTextModel, CLIPTokenizer
7 | from dataclasses import dataclass
8 |
9 | from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
10 | from diffusers.models import AutoencoderKL, UNet3DConditionModel
11 | from diffusers.models.lora import adjust_lora_scale_text_encoder
12 | from diffusers.schedulers import KarrasDiffusionSchedulers
13 | from diffusers.utils import (
14 |     deprecate,
15 |     logging,
16 |     replace_example_docstring,
17 |     BaseOutput,
18 | )
19 | from diffusers.utils.torch_utils import randn_tensor
20 | from diffusers.pipeline_utils import DiffusionPipeline
21 | from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth import (
22 |     tensor2vid,
23 | )
24 | from ..CrossAttn.InjecterProc import InjecterProcessor
25 | from ..Misc import Logger as log
26 | from ..Misc import Const
27 |
28 |
29 |
30 |
31 | def use_dd_temporal(unet, use=True):
32 |     """Toggle temporal cross-attention editing for the current denoising step.
33 |     """
34 |     for name, module in unet.named_modules():
35 |         module_name = type(module).__name__
36 |         if module_name == "Attention" and "attn2" in name:
37 |             module.processor.use_dd_temporal = use
38 |
39 |
40 | def
use_dd(unet, use=True):
41 |     """Toggle spatial cross-attention editing for the current denoising step.
42 |     """
43 |     for name, module in unet.named_modules():
44 |         module_name = type(module).__name__
45 |         # if module_name == "CrossAttention" and "attn2" in name:
46 |         if module_name == "Attention" and "attn2" in name:
47 |             module.processor.use_dd = use
48 |
49 |
50 | def initiailization(unet, bundle, bbox_per_frame):
51 |     log.info("Initialization")
52 |     # Install an InjecterProcessor on every cross-attention ("attn2") module.
53 |     for name, module in unet.named_modules():
54 |         module_name = type(module).__name__
55 |         if module_name == "Attention" and "attn2" in name:
56 |             if "temp_attentions" in name:
57 |                 processor = InjecterProcessor(
58 |                     bundle=bundle,
59 |                     bbox_per_frame=bbox_per_frame,
60 |                     strengthen_scale=bundle["trailblazer"]["temp_strengthen_scale"],
61 |                     weaken_scale=bundle["trailblazer"]["temp_weaken_scale"],
62 |                     is_text2vidzero=False,
63 |                     name=name,
64 |                 )
65 |             else:
66 |                 processor = InjecterProcessor(
67 |                     bundle=bundle,
68 |                     bbox_per_frame=bbox_per_frame,
69 |                     strengthen_scale=bundle["trailblazer"]["spatial_strengthen_scale"],
70 |                     weaken_scale=bundle["trailblazer"]["spatial_weaken_scale"],
71 |                     is_text2vidzero=False,
72 |                     name=name,
73 |                 )
74 |             module.processor = processor
75 |
76 |     log.info("Initialized")
77 |
78 |
79 | def keyframed_prompt_embeds(bundle, encode_prompt_func, device):
80 |     num_frames = bundle["keyframe"][-1]["frame"] + 1
81 |     keyframe = bundle["keyframe"]
82 |     f = lambda start, end, index: (1 - index) * start + index * end  # linear interpolation
83 |     n = len(keyframe)
84 |     keyed_prompt_embeds = []
85 |     for i in range(n - 1):
86 |         if i == 0:
87 |             start_fr = keyframe[i]["frame"]
88 |         else:
89 |             start_fr = keyframe[i]["frame"] + 1
90 |         end_fr = keyframe[i + 1]["frame"]
91 |
92 |         start_prompt = keyframe[i]["prompt"] + Const.POSITIVE_PROMPT
93 |         end_prompt = keyframe[i + 1]["prompt"] + Const.POSITIVE_PROMPT
94 |         clip_length = end_fr - start_fr + 1
95 |
96 |         start_prompt_embeds, _ = encode_prompt_func(
97 |             start_prompt,
98 |             device=device,
99 |             num_images_per_prompt=1,
100 |             do_classifier_free_guidance=True,
101 |             negative_prompt=Const.NEGATIVE_PROMPT,
102 |         )
103 |
104 |         end_prompt_embeds, negative_prompt_embeds = encode_prompt_func(
105 |             end_prompt,
106 |             device=device,
107 |             num_images_per_prompt=1,
108 |             do_classifier_free_guidance=True,
109 |             negative_prompt=Const.NEGATIVE_PROMPT,
110 |         )
111 |
112 |         for fr in range(clip_length):
113 |             index = float(fr) / max(clip_length - 1, 1)  # position in [0, 1]; guards single-frame clips
114 |             keyed_prompt_embeds.append(f(start_prompt_embeds, end_prompt_embeds, index))
115 |     assert len(keyed_prompt_embeds) == num_frames
116 |
117 |     return torch.cat(keyed_prompt_embeds), negative_prompt_embeds.repeat_interleave(
118 |         num_frames, dim=0
119 |     )
120 |
121 |
122 | def keyframed_bbox(bundle):
123 |
124 |     keyframe = bundle["keyframe"]
125 |     bbox_per_frame = []
126 |     f = lambda start, end, index: (1 - index) * start + index * end  # linear interpolation
127 |     n = len(keyframe)
128 |     for i in range(n - 1):
129 |         if i == 0:
130 |             start_fr = keyframe[i]["frame"]
131 |         else:
132 |             start_fr = keyframe[i]["frame"] + 1
133 |         end_fr = keyframe[i + 1]["frame"]
134 |         start_bbox = keyframe[i]["bbox_ratios"]
135 |         end_bbox = keyframe[i + 1]["bbox_ratios"]
136 |         clip_length = end_fr - start_fr + 1
137 |         for fr in range(clip_length):
138 |             index = float(fr) / max(clip_length - 1, 1)  # position in [0, 1]; guards single-frame clips
139 |             bbox = []
140 |             for j in range(4):
141 |                 bbox.append(f(start_bbox[j], end_bbox[j], index))
142 |             bbox_per_frame.append(bbox)
143 |
144 |     return bbox_per_frame
145 |
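Both keyframe helpers above expand each pair of consecutive keyframes into one linearly interpolated value per frame. A minimal usage sketch of keyframed_bbox with a hypothetical two-keyframe bundle (bbox ratios are [left, top, right, bottom], matching Setting/Keyframe.py):

    # Hypothetical bundle: a subject gliding left-to-right across a 5-frame clip.
    bundle = {
        "keyframe": [
            {"frame": 0, "bbox_ratios": [0.0, 0.35, 0.3, 0.65], "prompt": "a cat"},
            {"frame": 4, "bbox_ratios": [0.7, 0.35, 1.0, 0.65], "prompt": "a cat"},
        ]
    }
    bbox_per_frame = keyframed_bbox(bundle)
    # len(bbox_per_frame) == 5; the left edge advances 0.0, 0.175, 0.35, 0.525, 0.7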
--------------------------------------------------------------------------------
/TrailBlazer/Pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/TrailBlazer/Pipeline/__init__.py
--------------------------------------------------------------------------------
/TrailBlazer/README.md:
--------------------------------------------------------------------------------
1 | # TrailBlazer - Codebase
2 |
--------------------------------------------------------------------------------
/TrailBlazer/Setting/Config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 |
4 | DEVICE = "cuda"
5 | GUIDANCE_SCALE = 7.5
6 | WIDTH = 512
7 | HEIGHT = 512
8 | NUM_BACKWARD_STEPS = 50
9 | STEPS = 50
10 | DTYPE = torch.float16
11 |
12 | MODEL_HOME = f"{os.path.expanduser('~')}/Workspace/Project/Models"
13 |
14 | NEGATIVE_PROMPT = "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic"
15 | POSITIVE_PROMPT = "best quality, extremely detailed, HD, ultra-realistic, 8K, HQ, masterpiece, trending on artstation, art, smooth"
16 |
17 |
18 | SD_V1_5_ID = "runwayml/stable-diffusion-v1-5"
19 | SD_V1_5_PATH = f"{MODEL_HOME}/{SD_V1_5_ID}"
20 | CNET_CANNY_ID = "lllyasviel/sd-controlnet-canny"
21 | CNET_CANNY_PATH = f"{MODEL_HOME}/{CNET_CANNY_ID}"
22 | CNET_OPENPOSE_ID = "lllyasviel/sd-controlnet-openpose"
23 | CNET_OPENPOSE_PATH = f"{MODEL_HOME}/{CNET_OPENPOSE_ID}"
24 |
--------------------------------------------------------------------------------
/TrailBlazer/Setting/Const.py:
--------------------------------------------------------------------------------
1 | RECONS_NAME = "recons.jpg"
2 | LATENTS_NAME = "latents.pt"
3 | CATTN_NAME = "cattn.pt"
4 | CATTN_VIZ_NAME = "cattn.jpg"
5 |
--------------------------------------------------------------------------------
/TrailBlazer/Setting/Keyframe.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def get_stt_keyframe(prompt):  # static ("stt") keyframes: the same bbox at frame 0 and 24
5 |     keyframe = []
6 |     left = round(np.random.uniform(low=0.0, high=0.5), 2)
7 |     top = round(np.random.uniform(low=0.0, high=0.5), 2)
8 |     right = round(left + np.random.uniform(low=0.2, high=0.5), 2)
9 |     bottom = round(top + np.random.uniform(low=0.2, high=0.5), 2)
10 |     bbox_ratio = [left, top, right, bottom]
11 |     start = {"bbox_ratios": bbox_ratio, "frame": 0, "prompt": prompt}
12 |     end = {"bbox_ratios": bbox_ratio, "frame": 24, "prompt": prompt}
13 |     keyframe.append(start)
14 |     keyframe.append(end)
15 |     return keyframe
16 |
17 |
18 | def get_dyn_keyframe(prompt):  # dynamic keyframes bouncing between two random boxes
19 |
20 |     num_keyframes = np.random.randint(5) + 2
21 |     frames = [int(v) for v in np.linspace(0, 24, num_keyframes)]
22 |     bbox_ratios = []
23 |     choice = np.random.randint(6)
24 |     # left to right
25 |     if choice == 0:
26 |         start = [0.0, 0.35, 0.3, 0.65]
27 |         end = [0.7, 0.35, 1.0, 0.65]
28 |     # right to left
29 |     elif choice == 1:
30 |         start = [0.7, 0.35, 1.0, 0.65]
31 |         end = [0.0, 0.35, 0.3, 0.65]
32 |     # top left (small) to bottom right (large)
33 |     elif choice == 2:
34 |         start = [0.0, 0.0, 0.2, 0.2]
35 |         end = [0.5, 0.5, 1.0, 1.0]
36 |     # top left to bottom right
37 |     elif choice == 3:
38 |         start = [0.0, 0.0, 0.5, 0.5]
39 |         end = [0.5, 0.5, 1.0, 1.0]
40 |     # bottom right (large) to top left (small)
41 |     elif
choice == 4:
42 |         start = [0.5, 0.5, 1.0, 1.0]
43 |         end = [0.0, 0.0, 0.2, 0.2]
44 |     # bottom right to top left
45 |     elif choice == 5:
46 |         start = [0.5, 0.5, 1.0, 1.0]
47 |         end = [0.0, 0.0, 0.5, 0.5]
48 |
49 |     for i in range(num_keyframes):
50 |         if i % 2 == 0:
51 |             bbox_ratios.append(start)
52 |         else:
53 |             bbox_ratios.append(end)
54 |
55 |     keyframe = []
56 |     for i in range(num_keyframes):
57 |         keyframe.append(
58 |             {"bbox_ratios": bbox_ratios[i], "frame": frames[i], "prompt": prompt}
59 |         )
60 |     return keyframe
61 |
--------------------------------------------------------------------------------
/TrailBlazer/Setting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/TrailBlazer/Setting/__init__.py
--------------------------------------------------------------------------------
/TrailBlazer/__init__.py:
--------------------------------------------------------------------------------
1 | # # VideoDiffusion
2 | # from .Pipeline.Dumnmy import DummyPipeline
3 | # from .Pipeline.Standard import StandardPipeline
4 | # from .Pipeline.ControlNet import ControlNetPipeline
5 | # from .Pipeline.Img2Img import Img2ImgPipeline
6 | # from .Pipeline.Video import VideoPipeline
7 |
8 | # from .Pipeline.TestMayaNoise import TestMayaNoisePipeline
9 |
--------------------------------------------------------------------------------
/assets/gradio/Cat2Dog.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/gradio/Cat2Dog.mp4
--------------------------------------------------------------------------------
/assets/gradio/cat-LRLR.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/gradio/cat-LRLR.mp4
--------------------------------------------------------------------------------
/assets/gradio/fish-RL.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/gradio/fish-RL.mp4
--------------------------------------------------------------------------------
/assets/gradio/fish-TL2BR.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/gradio/fish-TL2BR.mp4
--------------------------------------------------------------------------------
/assets/gradio/gradio-bbox.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/gradio/gradio-bbox.jpg
--------------------------------------------------------------------------------
/assets/gradio/gradio.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/gradio/gradio.jpg
--------------------------------------------------------------------------------
/assets/gradio/tiger-TL2BR.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/gradio/tiger-TL2BR.mp4 -------------------------------------------------------------------------------- /assets/teaser.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/teaser.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo-Repro/2_of_50_2_peekaboo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo-Repro/2_of_50_2_peekaboo.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo-Repro/Peekaboo-Reproduce.0000-by-Peekaboo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo-Repro/Peekaboo-Reproduce.0000-by-Peekaboo.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo-Repro/Peekaboo-Reproduce.0000-by-TrailBlazer.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo-Repro/Peekaboo-Reproduce.0000-by-TrailBlazer.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo-Repro/mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo-Repro/mask.png -------------------------------------------------------------------------------- /assets/v1-Peekaboo/2ndKeyFast.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/2ndKeyFast.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/ChangingFish.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/ChangingFish.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/CrazyHorse.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/CrazyHorse.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/FastDog.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/FastDog.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/PerspBR2TL-Tiger.0000.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/PerspBR2TL-Tiger.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/PerspTL2BR-Tiger.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/PerspTL2BR-Tiger.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/RigidMoving-Astronaut.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/RigidMoving-Astronaut.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/RigidMoving-Bee.0006.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/RigidMoving-Bee.0006.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/RigidMoving-Cat.0010.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/RigidMoving-Cat.0010.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/RigidMoving-Clownfish.0001.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/RigidMoving-Clownfish.0001.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/Speed2ndKey-Astronaut.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/Speed2ndKey-Astronaut.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/SpeedKeys-Cat.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/SpeedKeys-Cat.0000.gif -------------------------------------------------------------------------------- /assets/v1-Peekaboo/TinyFish.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-Peekaboo/TinyFish.0000.gif -------------------------------------------------------------------------------- /assets/v1-T2VZero/PerspBR2TL-Tiger.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-T2VZero/PerspBR2TL-Tiger.0000.gif -------------------------------------------------------------------------------- /assets/v1-T2VZero/PerspTL2BR-Tiger.0000.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-T2VZero/PerspTL2BR-Tiger.0000.gif -------------------------------------------------------------------------------- /assets/v1-T2VZero/RigidMoving-Astronaut.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-T2VZero/RigidMoving-Astronaut.0000.gif -------------------------------------------------------------------------------- /assets/v1-T2VZero/RigidMoving-Bee.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-T2VZero/RigidMoving-Bee.0000.gif -------------------------------------------------------------------------------- /assets/v1-T2VZero/RigidMoving-Cat.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-T2VZero/RigidMoving-Cat.0000.gif -------------------------------------------------------------------------------- /assets/v1-T2VZero/RigidMoving-Clownfish.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-T2VZero/RigidMoving-Clownfish.0000.gif -------------------------------------------------------------------------------- /assets/v1-T2VZero/Speed2ndKey-Astronaut.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-T2VZero/Speed2ndKey-Astronaut.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/2ndKeyFast.0003.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/2ndKeyFast.0003.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/Cat2Dog.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/Cat2Dog.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/Cat2Fish.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/Cat2Fish.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/ChangingFish.0009.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/ChangingFish.0009.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/CrazyHorse.0007.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/CrazyHorse.0007.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/CrazyHorse.0008.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/CrazyHorse.0008.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/FastDog.0003.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/FastDog.0003.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/MultiSubject-Cat.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/MultiSubject-Cat.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/MultiSubject-Dog.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/MultiSubject-Dog.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/MultiSubjects.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/MultiSubjects.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/Parrot2Penguin.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/Parrot2Penguin.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/Peekaboo-Reproduce.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/Peekaboo-Reproduce.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/PerspBR2TL-Tiger.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/PerspBR2TL-Tiger.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/PerspTL2BR-Tiger.0000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/PerspTL2BR-Tiger.0000.gif -------------------------------------------------------------------------------- /assets/v1-TrailBlazer/RigidMoving-Astronaut.0000.gif: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/RigidMoving-Astronaut.0000.gif
--------------------------------------------------------------------------------
/assets/v1-TrailBlazer/RigidMoving-Bee.0000.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/RigidMoving-Bee.0000.gif
--------------------------------------------------------------------------------
/assets/v1-TrailBlazer/RigidMoving-Cat.0000.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/RigidMoving-Cat.0000.gif
--------------------------------------------------------------------------------
/assets/v1-TrailBlazer/RigidMoving-Clownfish.0000.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/RigidMoving-Clownfish.0000.gif
--------------------------------------------------------------------------------
/assets/v1-TrailBlazer/SpeedKeys-Cat.0000.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/SpeedKeys-Cat.0000.gif
--------------------------------------------------------------------------------
/assets/v1-TrailBlazer/Tiger2Elephant.0000.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/Tiger2Elephant.0000.gif
--------------------------------------------------------------------------------
/assets/v1-TrailBlazer/TinyFish.0008.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/assets/v1-TrailBlazer/TinyFish.0008.gif
--------------------------------------------------------------------------------
/bin/CmdMetric.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/bin/CmdMetric.py
--------------------------------------------------------------------------------
/bin/CmdPeekaboo.py:
--------------------------------------------------------------------------------
1 | """This code is adapted from the following implementation, at the latest commit
2 | 5fc80c4 of the file:
3 |
4 | https://github.com/microsoft/Peekaboo/blob/main/src/generate.py
5 |
6 | The main changes are as follows:
7 |
8 | 1) We incorporate TrailBlazer configuration settings into the Peekaboo project
9 | instead of hard-coding them.
10 |
11 | 2) We introduce additional parser flags, such as the model root, for greater
12 | flexibility.
13 |
14 | 3) The same configuration can now be used for both TrailBlazer and Peekaboo,
15 | enabling a more direct comparison.
16 |
17 | How to use this command:
18 |
19 | python bin/CmdPeekaboo.py -mr /path/to/diffusion/model/folder --config path/to/valid/config.yaml
20 |
21 | To reproduce the command used in the Peekaboo README.md:
22 |
23 | python bin/CmdPeekaboo.py -mr /home/kma/Workspace/Project/Models --config config/Peekaboo/Peekaboo-Reproduce.yaml
24 | """
25 |
26 |
27 | import os
28 | import glob
29 | import sys
30 | import copy
31 | from pprint import pprint
32 | sys.path.insert(1, os.path.join(sys.path[0], ".."))
33 |
34 | import warnings
35 |
36 | import cv2
37 | import numpy as np
38 | import tqdm
39 | import torch
40 | import torch.nn.functional as F
41 | import torchvision.io as vision_io
42 |
43 | from TrailBlazer.Baseline.Peekaboo.src.models.pipelines import (
44 |     TextToVideoSDPipelineSpatialAware,
45 | )
46 | from TrailBlazer.Pipeline.Utils import keyframed_bbox
47 | from TrailBlazer.Misc import ConfigIO
48 | from TrailBlazer.Misc import Logger as log
49 | from TrailBlazer.Setting import Keyframe
50 |
51 | from diffusers.utils import export_to_video
52 | from PIL import Image
53 | import torchvision
54 |
55 | import argparse
56 |
57 |
58 | warnings.filterwarnings("ignore")
59 |
60 | DTYPE = torch.float32
61 |
62 |
63 | def get_parser():
64 |     parser = argparse.ArgumentParser(
65 |         description="Generate videos with different prompts and fg objects"
66 |     )
67 |     parser.add_argument(
68 |         "-c", "--config", help="Input config file", required=True, type=str
69 |     )
70 |     parser.add_argument(
71 |         "-mr", "--model-root", help="Model root directory", default="./", type=str
72 |     )
73 |     parser.add_argument(
74 |         "-s",
75 |         "--search",
76 |         type=str,
77 |         default="",
78 |         help="Config key to sweep; its value is offset over range(-5, 5)",
79 |     )
80 |     parser.add_argument(
81 |         "--seed", type=int, default=2, help="Seed for random number generation"
82 |     )
83 |
84 |     parser.add_argument(
85 |         "--output-path",
86 |         type=str,
87 |         default="/tmp",
88 |         help="Path to save the generated videos",
89 |     )
90 |     return parser
91 |
92 |
93 | def generate_video(
94 |     pipe,
95 |     overall_prompt,
96 |     latents,
97 |     get_latents=False,
98 |     num_frames=24,
99 |     num_inference_steps=50,
100 |     fg_masks=None,
101 |     fg_masked_latents=None,
102 |     frozen_steps=0,
103 |     custom_attention_mask=None,
104 |     fg_prompt=None,
105 |     height=320,
106 |     width=576,
107 | ):
108 |
109 |     video_frames = pipe(
110 |         overall_prompt,
111 |         num_frames=num_frames,
112 |         latents=latents,
113 |         num_inference_steps=num_inference_steps,
114 |         frozen_mask=fg_masks,
115 |         frozen_steps=frozen_steps,
116 |         latents_all_input=fg_masked_latents,
117 |         custom_attention_mask=custom_attention_mask,
118 |         fg_prompt=fg_prompt,
119 |         make_attention_mask_2d=True,
120 |         attention_mask_block_diagonal=True,
121 |         height=height,
122 |         width=width,
123 |     ).frames
124 |     if get_latents:
125 |         video_latents = pipe(
126 |             overall_prompt,
127 |             num_frames=num_frames,
128 |             latents=latents,
129 |             num_inference_steps=num_inference_steps,
130 |             output_type="latent",
131 |         ).frames
132 |         return video_frames, video_latents
133 |
134 |     return video_frames
135 |
136 |
137 | def save_frames(path):
138 |     video, audio, video_info = vision_io.read_video(f"{path}.mp4", pts_unit="sec")
139 |
140 |     # Number of frames
141 |     num_frames = video.size(0)
142 |
143 |     # Save each frame
144 |     os.makedirs(f"{path}", exist_ok=True)
145 |     for i in range(num_frames):
146 |         frame = video[i, :, :, :].numpy()
147 |         # read_video already returns H x W x C frames, so no permute is needed
148 |         # frame =
frame.permute(1, 2, 0).numpy() 149 | img = Image.fromarray(frame.astype("uint8")) 150 | img.save(f"{path}/frame_{i:04d}.png") 151 | 152 | 153 | class ClearCache: 154 | def __enter__(self): 155 | torch.cuda.empty_cache() 156 | 157 | def __exit__(self, exc_type, exc_val, exc_tb): 158 | torch.cuda.empty_cache() 159 | 160 | 161 | if __name__ == "__main__": 162 | parser = get_parser() 163 | args = parser.parse_args() 164 | 165 | 166 | torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 167 | model_root = os.environ.get("ZEROSCOPE_MODEL_ROOT") 168 | if not model_root: 169 | model_root = args.model_root 170 | model_id = "cerspense/zeroscope_v2_576w" 171 | model_path = os.path.join(model_root, model_id) 172 | 173 | pipe = TextToVideoSDPipelineSpatialAware.from_pretrained( 174 | model_path, torch_dtype=DTYPE, variant="fp32" 175 | ).to(torch_device) 176 | 177 | if DTYPE == torch.float16: 178 | pipe.unet = pipe.unet.half() 179 | 180 | experiment_bundles = [] 181 | log.info("Loading config..") 182 | if os.path.isdir(args.config): 183 | configs = glob.glob(f"{args.config}/*.yaml") 184 | for cfg in configs: 185 | log.info(cfg) 186 | bundle = ConfigIO.config_loader(cfg) 187 | experiment_bundles.append([bundle, cfg]) 188 | else: 189 | log.info(args.config) 190 | bundle = ConfigIO.config_loader(args.config) 191 | experiment_bundles.append([bundle, args.config]) 192 | if args.search: 193 | for i in range(-5,5): 194 | bundle_new = copy.deepcopy(bundle) 195 | bundle_new[args.search] = bundle[args.search] + i 196 | print(bundle_new[args.search]) 197 | experiment_bundles.append([bundle_new, args.config]) 198 | 199 | for bundle, config in experiment_bundles: 200 | 201 | if not bundle.get("keyframe"): 202 | bundle["keyframe"] = Keyframe.get_stt_keyframe(bundle["prompt"]) 203 | 204 | num_frames = 24 205 | height = int(bundle["height"] // 8) 206 | width = int(bundle["width"] // 8) 207 | 208 | bbox_mask = torch.zeros( 209 | [num_frames, 1, height, width], 210 | device=torch_device, 211 | ) 212 | bbox_mask.fill_(0.1) 213 | fg_masks = torch.zeros( 214 | [num_frames, 1, height, width], 215 | device=torch_device, 216 | ) 217 | 218 | if not bundle.get("peekaboo"): 219 | log.warn("No [peekaboo] field found in the config file. 
Abort.") 220 | continue 221 | 222 | bbox = keyframed_bbox(bundle) 223 | seed = bundle["seed"] 224 | random_latents = torch.randn( 225 | [1, 4, num_frames, height, width], 226 | generator=torch.Generator().manual_seed(seed), 227 | dtype=DTYPE, 228 | ).to(torch_device) 229 | 230 | y_start = [int(b[0] * width) for b in bbox] 231 | y_end = [int(b[2] * width) for b in bbox] 232 | x_start = [int(b[1] * height) for b in bbox] 233 | x_end = [int(b[3] * height) for b in bbox] 234 | 235 | # Populate the bbox_mask tensor with ones where the bounding box is located 236 | for i in range(num_frames): 237 | bbox_mask[i, :, x_start[i] : x_end[i], y_start[i] : y_end[i]] = 1 238 | fg_masks[i, :, x_start[i] : x_end[i], y_start[i] : y_end[i]] = 1 239 | 240 | fg_masked_latents = None 241 | 242 | overall_prompt = bundle["keyframe"][0]["prompt"] 243 | fg_object = " ".join( 244 | [overall_prompt.split(" ")[i - 1] for i in bundle["token_inds"]] 245 | ) 246 | save_path = "Peekaboo" 247 | log.info(f"Generating video for prompt: [{overall_prompt}]") 248 | log.info(f"FG object: [{fg_object}]") 249 | 250 | frozen_steps = bundle["peekaboo"]["frozen_steps"] 251 | num_inference_steps = ( 252 | 50 253 | if not bundle.get("num_inference_steps") 254 | else bundle.get("num_inference_steps") 255 | ) 256 | log.info(f"Frozen steps: [{frozen_steps}]") 257 | assert ( 258 | frozen_steps <= num_inference_steps 259 | ), "Frozen steps should be less than or equal to the number of inference steps" 260 | pprint("=================================") 261 | pprint(bundle) 262 | pprint("=================================") 263 | 264 | # NOTE: The Peekaboo entry 265 | video_frames = generate_video( 266 | pipe, 267 | overall_prompt, 268 | random_latents, 269 | get_latents=False, 270 | num_frames=num_frames, 271 | num_inference_steps=num_inference_steps, 272 | fg_masks=fg_masks, 273 | fg_masked_latents=fg_masked_latents, 274 | frozen_steps=frozen_steps, 275 | fg_prompt=fg_object, 276 | height=bundle["height"], 277 | width=bundle["width"], 278 | ) 279 | # Save video frames 280 | output_folder = os.path.join(args.output_path, save_path) 281 | os.makedirs(output_folder, exist_ok=True) 282 | task_name = os.path.splitext(os.path.basename(config))[0] 283 | output_video_path = os.path.join(output_folder, f"{task_name}.0000.mp4") 284 | if os.path.exists(output_video_path): 285 | import glob 286 | 287 | repeated = os.path.join(output_folder, task_name + "*mp4") 288 | num_reapts = len(glob.glob(repeated)) 289 | output_video_path = os.path.join( 290 | output_folder, task_name + ".{:04d}.mp4".format(num_reapts) 291 | ) 292 | 293 | video_path = export_to_video(video_frames, output_video_path) 294 | mask_folder = os.path.join( 295 | output_folder, os.path.splitext(output_video_path)[0] + "-mask" 296 | ) 297 | os.makedirs(mask_folder, exist_ok=True) 298 | for i in range(num_frames): 299 | filepath = os.path.join(mask_folder, f"frame.{i:04d}.png") 300 | torchvision.utils.save_image(bbox_mask[i], filepath) 301 | 302 | config_path = os.path.splitext(output_video_path)[0] + ".yaml" 303 | ConfigIO.config_saver(bundle, config_path) 304 | 305 | log.info(f"Video saved at {output_video_path}") 306 | -------------------------------------------------------------------------------- /bin/CmdText2VidZero.py: -------------------------------------------------------------------------------- 1 | """ This is the comparison used in our paper 2 | """ 3 | import argparse 4 | import torch 5 | import imageio 6 | import glob 7 | import os 8 | import numpy as np 9 | from diffusers 
import TextToVideoZeroPipeline
10 | from diffusers.utils import export_to_video
11 |
12 | from TrailBlazer.Misc import Logger as log
13 | from TrailBlazer.Misc import ConfigIO
14 |
15 |
16 | def get_args():
17 |     """Parse command-line arguments.
18 |     Args:
19 |     Returns:
20 |     """
21 |     parser = argparse.ArgumentParser(description="Directed Video Diffusion")
22 |     # parser.add_argument('--foobar', action='store_true')
23 |     parser.add_argument(
24 |         "-c", "--config", help="Input config file", required=True, type=str
25 |     )
26 |     parser.add_argument(
27 |         "-mr", "--model-root", help="Model root directory", default="./", type=str
28 |     )
29 |     parser.add_argument(
30 |         "--output-path",
31 |         type=str,
32 |         default="/tmp",
33 |         help="Path to save the generated videos",
34 |     )
35 |     parser.add_argument(
36 |         "-xl", "--zeroscope-xl", help="Upscale the result with the Zeroscope XL model", action="store_true"
37 |     )
38 |
39 |     return parser.parse_args()
40 |
41 |
42 | if __name__ == "__main__":
43 |
44 |     args = get_args()
45 |
46 |     experiment_bundles = []
47 |     log.info("Loading config..")
48 |     if os.path.isdir(args.config):
49 |         configs = sorted(glob.glob(f"{args.config}/*.yaml"))
50 |         for cfg in configs:
51 |             log.info(cfg)
52 |             bundle = ConfigIO.config_loader(cfg)
53 |             experiment_bundles.append([bundle, cfg])
54 |     else:
55 |         log.info(args.config)
56 |         bundle = ConfigIO.config_loader(args.config)
57 |         experiment_bundles.append([bundle, args.config])
58 |
59 |     model_id = "runwayml/stable-diffusion-v1-5"
60 |     model_path = os.path.join(args.model_root, model_id)
61 |     pipe = TextToVideoZeroPipeline.from_pretrained(
62 |         model_path, torch_dtype=torch.float16
63 |     ).to("cuda")
64 |
65 |     for bundle, config in experiment_bundles:
66 |
67 |         if not bundle.get("text2vidzero"):
68 |             log.warn("No [text2vidzero] field found in the config file. Abort.")
69 |             continue
70 |         if not bundle.get("keyframe"):
71 |             prompt = bundle["prompt"]
72 |         else:
73 |             prompt = bundle["keyframe"][0]["prompt"]
74 |         motion_field_strength_x = bundle["text2vidzero"]["motion_field_strength_x"]
75 |         motion_field_strength_y = bundle["text2vidzero"]["motion_field_strength_y"]
76 |         motion_field_strength_x = 0.  # NOTE: overrides the config value read above
77 |         motion_field_strength_y = 0.  # NOTE: overrides the config value read above
78 |         height = bundle["height"]
79 |         width = bundle["width"]
80 |         num_inference_steps = bundle.get("num_inference_steps")
81 |         kwargs = {}
82 |         if num_inference_steps:
83 |             kwargs.update({"num_inference_steps": num_inference_steps})
84 |         result = pipe(
85 |             prompt=prompt,
86 |             motion_field_strength_x=motion_field_strength_x,
87 |             motion_field_strength_y=motion_field_strength_y,
88 |             video_length=24,
89 |             height=height,
90 |             width=width,
91 |             **kwargs,
92 |         ).images
93 |
94 |         # Save video frames
95 |         save_path = "Text2VideoZero"
96 |         output_folder = os.path.join(args.output_path, save_path)
97 |         os.makedirs(output_folder, exist_ok=True)
98 |         task_name = os.path.splitext(os.path.basename(config))[0]
99 |         output_video_path = os.path.join(output_folder, f"{task_name}.0000.mp4")
100 |         if os.path.exists(output_video_path):
101 |
102 |
103 |             repeated = os.path.join(output_folder, task_name + "*mp4")
104 |             num_repeats = len(glob.glob(repeated))
105 |             output_video_path = os.path.join(
106 |                 output_folder, task_name + ".{:04d}.mp4".format(num_repeats)
107 |             )
108 |         result = [(r * 255).astype("uint8") for r in result]
109 |         imageio.mimsave(output_video_path, result, fps=4)
110 |
111 |         data = {
112 |             "latents": {},
113 |             "bundle": bundle,
114 |             "bbox": {},
115 |         }
116 |         latent_path = os.path.splitext(output_video_path)[0] + ".pt"
117 |         torch.save(data, latent_path)
118 |         config_path = os.path.splitext(output_video_path)[0] + ".yaml"
119 |         ConfigIO.config_saver(bundle, config_path)
120 |
121 |         log.info(f"Video saved at {output_video_path}")
122 |
--------------------------------------------------------------------------------
/bin/CmdTrailBlazer.py:
--------------------------------------------------------------------------------
1 | """Run the TrailBlazer pipeline on a config file or a folder of configs.
2 |
3 | """
4 | #!/usr/bin/env python
5 | import argparse
6 | import copy
7 | import os
8 | import glob
9 | import torch
10 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
11 | from diffusers.pipelines import TextToVideoSDPipeline
12 | from diffusers.utils import export_to_video
13 | from PIL import Image
14 |
15 | from TrailBlazer.Misc import ConfigIO
16 | from TrailBlazer.Setting import Keyframe
17 | from TrailBlazer.Misc import Logger as log
18 | from TrailBlazer.Misc import Const
19 | from TrailBlazer.Pipeline.TextToVideoSDPipelineCall import (
20 |     text_to_video_sd_pipeline_call,
21 | )
22 | from TrailBlazer.Pipeline.UNet3DConditionModelCall import (
23 |     unet3d_condition_model_forward,
24 | )
25 |
26 | TextToVideoSDPipeline.__call__ = text_to_video_sd_pipeline_call
27 | from diffusers.models.unet_3d_condition import UNet3DConditionModel
28 |
29 | unet3d_condition_model_forward_copy = UNet3DConditionModel.forward
30 | UNet3DConditionModel.forward = unet3d_condition_model_forward
31 |
32 |
33 | def get_args():
34 |     """Parse command-line arguments.
35 |     Args:
36 |     Returns:
37 |     """
38 |     parser = argparse.ArgumentParser(description="Directed Video Diffusion")
39 |     # parser.add_argument('--foobar', action='store_true')
40 |     parser.add_argument(
41 |         "-c", "--config", help="Input config file", required=True, type=str
42 |     )
43 |     parser.add_argument(
44 |         "-mr", "--model-root", help="Model root directory", default="", type=str
45 |     )
46 |     parser.add_argument(
47 |         "-cr", "--config-recover", help="Input saved data path", type=str
48 |     )
49 |     parser.add_argument(
50 |         "-s",
51 |         "--search",
52 |         help="Sweep trailing_length over offsets in range(-3, 4)",
53 |         action="store_true",
54 |     )
55 |     parser.add_argument(
56 |         "--output-path",
57 |         type=str,
| default="/tmp", 59 | help="Path to save the generated videos", 60 | ) 61 | parser.add_argument( 62 | "-xl", "--zeroscope-xl", help="Search parameter", action="store_true" 63 | ) 64 | 65 | return parser.parse_args() 66 | 67 | 68 | def main(): 69 | """The entry point to execute this program 70 | Args: 71 | Returns: 72 | """ 73 | args = get_args() 74 | video_frames = None 75 | 76 | output_folder = os.path.join(args.output_path, "TrailBlazer") 77 | if not os.path.exists(output_folder): 78 | os.makedirs(output_folder) 79 | 80 | if args.config_recover: 81 | if not os.path.exists(args.config_recover): 82 | log.info("Path [{}] is invalid.".format(args.config_recover)) 83 | 84 | data = torch.load(args.config_recover)["bundle"] 85 | 86 | filepath = os.path.splitext(os.path.basename(args.config_recover))[0] + ".yaml" 87 | ConfigIO.config_saver(data, filepath=filepath) 88 | log.info("Config recovered in [{}]".format(filepath)) 89 | 90 | elif args.config: 91 | 92 | experiment_bundles = [] 93 | log.info("Loading config..") 94 | if os.path.isdir(args.config): 95 | configs = sorted(glob.glob(f"{args.config}/*.yaml")) 96 | for cfg in configs: 97 | log.info(cfg) 98 | bundle = ConfigIO.config_loader(cfg) 99 | experiment_bundles.append([bundle, cfg]) 100 | else: 101 | log.info(args.config) 102 | bundle = ConfigIO.config_loader(args.config) 103 | experiment_bundles.append([bundle, args.config]) 104 | 105 | 106 | model_root = os.environ.get("ZEROSCOPE_MODEL_ROOT") 107 | if not model_root: 108 | model_root = args.model_root 109 | 110 | model_id = "cerspense/zeroscope_v2_576w" 111 | model_path = os.path.join(model_root, model_id) 112 | pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16) 113 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 114 | pipe.enable_model_cpu_offload() 115 | 116 | def run(bundle, config): 117 | # Note: We use Const module in attention processor as well and that's why here 118 | Const.DEFAULT_HEIGHT = bundle["height"] 119 | Const.DEFAULT_WIDTH = bundle["width"] 120 | print(Const.DEFAULT_HEIGHT, Const.DEFAULT_WIDTH) 121 | num_inference_steps = ( 122 | 40 123 | if not bundle.get("num_inference_steps") 124 | else bundle.get("num_inference_steps") 125 | ) 126 | generator = torch.Generator().manual_seed(bundle["seed"]) 127 | result = pipe( 128 | bundle=bundle, 129 | height=Const.DEFAULT_HEIGHT, 130 | width=Const.DEFAULT_WIDTH, 131 | generator=generator, 132 | num_inference_steps=num_inference_steps, 133 | ) 134 | video_frames = result.frames 135 | video_latent = result.latents 136 | task_name = os.path.splitext(os.path.basename(config))[0] 137 | output_video_path = os.path.join(output_folder, task_name + ".0000.mp4") 138 | if os.path.exists(output_video_path): 139 | import glob 140 | repeated = os.path.join(output_folder, task_name + "*mp4") 141 | num_reapts = len(glob.glob(repeated)) 142 | output_video_path = os.path.join( 143 | output_folder, task_name + ".{:04d}.mp4".format(num_reapts) 144 | ) 145 | export_to_video(video_frames, output_video_path=output_video_path) 146 | data = { 147 | "latents": result.latents, 148 | "bundle": bundle, 149 | "bbox": result.bbox_per_frame, 150 | } 151 | latent_path = os.path.splitext(output_video_path)[0] + ".pt" 152 | torch.save(data, latent_path) 153 | config_path = os.path.splitext(output_video_path)[0] + ".yaml" 154 | ConfigIO.config_saver(bundle, config_path) 155 | log.info(latent_path) 156 | log.info(output_video_path) 157 | log.info(config_path) 158 | log.info("Done") 159 | return 
160 |
161 |         if args.search:
162 |             log.info(
163 |                 "Searching trailing_length over offsets in range(-3, 4) around {}".format(
164 |                     bundle["trailblazer"]["trailing_length"]
165 |                 )
166 |             )
167 |             for i in range(-3, 4):
168 |                 bundle_new = copy.deepcopy(bundle)
169 |                 bundle_new["trailblazer"]["trailing_length"] += i
170 |                 run(bundle_new, args.config)
171 |         else:
172 |             for bundle, config in experiment_bundles:
173 |
174 |                 if not bundle.get("keyframe"):
175 |                     bundle["keyframe"] = Keyframe.get_dyn_keyframe(bundle["prompt"])
176 |                     # TODO:
177 |                     bundle["trailblazer"]["spatial_strengthen_scale"] = 0.125
178 |                     bundle["trailblazer"]["temp_strengthen_scale"] = 0.125
179 |                     bundle["trailblazer"]["trailing_length"] = 15
180 |
181 |                 if not bundle.get("trailblazer"):
182 |                     log.warn("No [trailblazer] field found in the config file. Abort.")
183 |                     continue
184 |                 video_frames = run(bundle, config)
185 |
186 |     if args.zeroscope_xl:
187 |
188 |         UNet3DConditionModel.forward = unet3d_condition_model_forward_copy
189 |
190 |         if video_frames is None:
191 |             log.error(
192 |                 "Cannot find the cache of video_frames. Did you run the base zeroscope?"
193 |             )
194 |             return
195 |         model_id = "cerspense/zeroscope_v2_XL"
196 |         model_root = os.environ.get("ZEROSCOPE_MODEL_ROOT") or args.model_root
197 |         model_path = os.path.join(model_root, model_id)
198 |         pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
199 |         pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
200 |         pipe.enable_model_cpu_offload()
201 |         # memory optimization
202 |         pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
203 |         pipe.enable_vae_slicing()
204 |         video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
205 |         video_frames = pipe(bundle["prompt_base"], video=video, strength=0.8).frames
206 |         video_path = export_to_video(video_frames)
207 |         log.info(video_path)
208 |
209 |
210 | if __name__ == "__main__":
211 |     main()
212 |
--------------------------------------------------------------------------------
/bin/CmdTrailBlazerMulti.py:
--------------------------------------------------------------------------------
1 | """
2 | The implementation here is mostly the same as CmdTrailBlazer.py. Generate the
3 | individual subjects first, then compose them:
4 |
5 |
6 | python bin/CmdTrailBlazer.py --config config/Multi/MultiSubject-Dog.yaml
7 | python bin/CmdTrailBlazer.py --config config/Multi/MultiSubject-Cat.yaml
8 |
9 | python bin/CmdTrailBlazerMulti.py --config config/Multi/MultiSubjects.yaml
10 |
11 |
12 | """
13 | #!/usr/bin/env python
14 | import argparse
15 | import copy
16 | import os
17 | import glob
18 | import torch
19 | from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
20 | from diffusers.pipelines import TextToVideoSDPipeline
21 | from diffusers.utils import export_to_video
22 | from PIL import Image
23 |
24 | from TrailBlazer.Misc import ConfigIO
25 | from TrailBlazer.Setting import Keyframe
26 | from TrailBlazer.Misc import Logger as log
27 | from TrailBlazer.Misc import Const
28 | from TrailBlazer.Pipeline.TextToVideoSDMultiPipelineCall import (
29 |     text_to_video_sd_multi_pipeline_call,
30 | )
31 | from TrailBlazer.Pipeline.UNet3DConditionModelCall import (
32 |     unet3d_condition_model_forward,
33 | )
34 |
35 | TextToVideoSDPipeline.__call__ = text_to_video_sd_multi_pipeline_call
36 | from diffusers.models.unet_3d_condition import UNet3DConditionModel
37 |
38 | unet3d_condition_model_forward_copy = UNet3DConditionModel.forward
39 | UNet3DConditionModel.forward = unet3d_condition_model_forward
40 |
41 |
42 | def get_args():
43 |     """Parse command-line arguments.
44 |     Args:
45 |     Returns:
46 |     """
""" 47 | parser = argparse.ArgumentParser(description="Directed Video Diffusion") 48 | # parser.add_argument('--foobar', action='store_true') 49 | parser.add_argument( 50 | "-mr", "--model-root", help="Model root directory", default="", type=str 51 | ) 52 | parser.add_argument( 53 | "-s", 54 | "--search", 55 | help="Search parameter based on the number of trailing attention", 56 | action="store_true", 57 | ) 58 | parser.add_argument( 59 | "-c", "--config", help="Input config file", required=True, type=str 60 | ) 61 | parser.add_argument( 62 | "--output-path", 63 | type=str, 64 | default="/tmp", 65 | help="Path to save the generated videos", 66 | ) 67 | parser.add_argument( 68 | "-xl", "--zeroscope-xl", help="Search parameter", action="store_true" 69 | ) 70 | 71 | return parser.parse_args() 72 | 73 | 74 | def main(): 75 | """The entry point to execute this program 76 | Args: 77 | Returns: 78 | """ 79 | args = get_args() 80 | video_frames = None 81 | 82 | output_folder = os.path.join(args.output_path, "TrailBlazer") 83 | if not os.path.exists(output_folder): 84 | os.makedirs(output_folder) 85 | 86 | if args.config: 87 | 88 | experiment_bundles = [] 89 | log.info("Loading config..") 90 | if os.path.isdir(args.config): 91 | configs = sorted(glob.glob(f"{args.config}/*.yaml")) 92 | for cfg in configs: 93 | log.info(cfg) 94 | bundle = ConfigIO.config_loader(cfg) 95 | experiment_bundles.append([bundle, cfg]) 96 | else: 97 | log.info(args.config) 98 | bundle = ConfigIO.config_loader(args.config) 99 | experiment_bundles.append([bundle, args.config]) 100 | 101 | model_root = os.environ.get("ZEROSCOPE_MODEL_ROOT") 102 | if not model_root: 103 | model_root = args.model_root 104 | 105 | model_id = "cerspense/zeroscope_v2_576w" 106 | model_path = os.path.join(model_root, model_id) 107 | pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16) 108 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 109 | pipe.enable_model_cpu_offload() 110 | 111 | def run(bundle, config): 112 | # Note: We use Const module in attention processor as well and that's why here 113 | 114 | 115 | # TODO: 116 | peek = torch.load(bundle["multisubs"]["subjects"][0]) 117 | 118 | Const.DEFAULT_HEIGHT = peek["bundle"]["height"] 119 | Const.DEFAULT_WIDTH = peek["bundle"]["width"] 120 | num_inference_steps = ( 121 | 40 122 | if not bundle.get("num_inference_steps") 123 | else bundle.get("num_inference_steps") 124 | ) 125 | generator = torch.Generator().manual_seed(bundle["multisubs"]["seed"]) 126 | result = pipe( 127 | bundle=bundle, 128 | height=Const.DEFAULT_HEIGHT, 129 | width=Const.DEFAULT_WIDTH, 130 | generator=generator, 131 | num_inference_steps=num_inference_steps, 132 | ) 133 | video_frames = result.frames 134 | video_latent = result.latents 135 | task_name = os.path.splitext(os.path.basename(config))[0] 136 | output_video_path = os.path.join(output_folder, task_name + ".0000.mp4") 137 | if os.path.exists(output_video_path): 138 | import glob 139 | repeated = os.path.join(output_folder, task_name + "*mp4") 140 | num_reapts = len(glob.glob(repeated)) 141 | output_video_path = os.path.join( 142 | output_folder, task_name + ".{:04d}.mp4".format(num_reapts) 143 | ) 144 | export_to_video(video_frames, output_video_path=output_video_path) 145 | data = { 146 | "latents": result.latents, 147 | "bundle": bundle, 148 | "bbox": result.bbox_per_frame, 149 | } 150 | latent_path = os.path.splitext(output_video_path)[0] + ".pt" 151 | torch.save(data, latent_path) 152 | config_path = 
154 |             ConfigIO.config_saver(bundle, config_path)
155 |             log.info(latent_path)
156 |             log.info(output_video_path)
157 |             log.info(config_path)
158 |             log.info("Done")
159 |             return video_frames
160 | 
161 |         if args.search:
162 |             for bundle_copy, config in experiment_bundles:
163 |                 base = bundle_copy["trailblazer"]["trailing_length"]
164 |                 log.info(
165 |                     "Searching trailing length by range (-3, 4) of given {}".format(base)
166 |                 )
167 |                 for i in range(-3, 4):
168 |                     bundle = copy.deepcopy(bundle_copy)
169 |                     bundle["trailblazer"]["trailing_length"] = base + i
170 |                     run(bundle, config)
171 |         else:
172 |             for bundle, config in experiment_bundles:
173 |                 video_frames = run(bundle, config)
174 | 
175 |     if args.zeroscope_xl:
176 | 
177 |         UNet3DConditionModel.forward = unet3d_condition_model_forward_copy
178 |         # the stock forward is restored above before running the XL upscaler
179 |         if video_frames is None:
180 |             log.error(
181 |                 "Cannot find the cache of video_frames. Did you run the base zeroscope?"
182 |             )
183 |             return
184 |         model_id = "cerspense/zeroscope_v2_XL"
185 |         model_root = os.environ.get("ZEROSCOPE_MODEL_ROOT") or args.model_root
186 |         model_path = os.path.join(model_root, model_id)
187 |         pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
188 |         pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
189 |         pipe.enable_model_cpu_offload()
190 |         # memory optimization
191 |         pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
192 |         pipe.enable_vae_slicing()
193 |         video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
194 |         video_frames = pipe(bundle["prompt_base"], video=video, strength=0.8).frames
195 |         video_path = export_to_video(video_frames)
196 |         log.info(video_path)
197 | 
198 | 
199 | if __name__ == "__main__":
200 |     main()
201 | 
-------------------------------------------------------------------------------- /bin/TestDyn.py: --------------------------------------------------------------------------------
1 | import gradio as gr
2 | 
3 | 
4 | cnt = 0
5 | 
6 | def words(num):
7 | 
8 |     cnt = num
9 |     print(num)
10 |     update_show = []
11 |     update_hide = []
12 |     for i in range(cnt):
13 |         text = gr.Label(str(i), visible=True)
14 |         row = gr.Button(visible=True)
15 |         update_show.append(text)
16 |         update_show.append(row)
17 |     for i in range(10 - cnt):
18 |         text = gr.Label(str(i), visible=False)
19 |         row = gr.Button(visible=False)
20 |         update_hide.append(text)
21 |         update_hide.append(row)
22 |     print(update_show, update_hide, cnt)
23 |     return update_show + update_hide
24 | 
25 | rows = []
26 | 
27 | with gr.Blocks() as demo:
28 |     with gr.Column():
29 |         for i in range(10):
30 |             with gr.Row():
31 |                 text = gr.Label(str(i), visible=False)
32 |                 btn = gr.Button(visible=False)
33 |                 #row.add(btn)
34 |                 rows.append(text)
35 |                 rows.append(btn)
36 | 
37 |     dropdown = gr.Dropdown(choices=list(range(2, 10)))
38 |     dropdown.change(words, dropdown, rows)
39 | 
40 | demo.launch()
41 | 
-------------------------------------------------------------------------------- /bin/TestMakeCache.py: --------------------------------------------------------------------------------
1 | import torch, os, sys, glob
2 | from TrailBlazer.Misc import ConfigIO
3 | from TrailBlazer.Pipeline.Utils import keyframed_bbox
4 | 
5 | path = "/home/kma/Workspace/Project/Trailblazer/ECCV/Supp/Peekaboo"
6 | configs = sorted(glob.glob(f"{path}/*.yaml"))
7 | for cfg in configs:
8 |     bundle = ConfigIO.config_loader(cfg)
9 |     keyframe_bboxes = keyframed_bbox(bundle)
10 |     # latents left empty: only the keyframed bboxes are needed in this cache
11 |     latent_path = os.path.splitext(cfg)[0] + ".pt"
12 |     data = {
13 |         "latents": {},
14 |         "bundle": bundle,
15 |         "bbox": keyframe_bboxes,
16 |     }
17 |
torch.save(data, latent_path) 18 | print(latent_path) 19 | -------------------------------------------------------------------------------- /config/Archive/2ndKey-astronaut.0001.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.3 6 | - 0.75 7 | frame: 0 8 | prompt: an astronaut standing on the moon 9 | - bbox_ratios: 10 | - 0.7 11 | - 0.35 12 | - 1.0 13 | - 0.75 14 | frame: 24 15 | prompt: an astronaut walking on the moon 16 | num_dd_spatial_steps: 5 17 | num_dd_temporal_steps: 5 18 | num_frames: 24 19 | seed: 123451232531 20 | spatial_strengthen_scale: 0.15 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.1 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | trailing_length: 15 28 | -------------------------------------------------------------------------------- /config/Archive/2ndKey-astronaut.0002.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.3 6 | - 0.75 7 | frame: 0 8 | prompt: an astronaut standing on the moon 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.35 12 | - 0.3 13 | - 0.75 14 | frame: 6 15 | prompt: an astronaut standing on the moon 16 | - bbox_ratios: 17 | - 0.7 18 | - 0.35 19 | - 1.0 20 | - 0.75 21 | frame: 24 22 | prompt: an astronaut walking on the moon 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | num_frames: 24 26 | seed: 123451232531 27 | spatial_strengthen_scale: 0.15 28 | spatial_weaken_scale: 0.001 29 | temp_strengthen_scale: 0.1 30 | temp_weaken_scale: 0.001 31 | token_inds: 32 | - 1 33 | - 2 34 | trailing_length: 15 35 | -------------------------------------------------------------------------------- /config/Archive/2ndKey-astronaut.0003.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.3 6 | - 0.75 7 | frame: 0 8 | prompt: an astronaut standing on the moon 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.35 12 | - 0.3 13 | - 0.75 14 | frame: 12 15 | prompt: an astronaut standing on the moon 16 | - bbox_ratios: 17 | - 0.7 18 | - 0.35 19 | - 1.0 20 | - 0.75 21 | frame: 24 22 | prompt: an astronaut walking on the moon 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | num_frames: 24 26 | seed: 123451232531 27 | spatial_strengthen_scale: 0.125 28 | spatial_weaken_scale: 0.001 29 | temp_strengthen_scale: 0.1 30 | temp_weaken_scale: 0.001 31 | token_inds: 32 | - 1 33 | - 2 34 | trailing_length: 10 35 | -------------------------------------------------------------------------------- /config/Archive/2ndKey-astronaut.0004.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.3 6 | - 0.75 7 | frame: 0 8 | prompt: an astronaut standing on the moon 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.35 12 | - 0.3 13 | - 0.75 14 | frame: 18 15 | prompt: an astronaut standing on the moon 16 | - bbox_ratios: 17 | - 0.7 18 | - 0.35 19 | - 1.0 20 | - 0.75 21 | frame: 24 22 | prompt: an astronaut walking on the moon 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | num_frames: 24 26 | seed: 123451232531 27 | spatial_strengthen_scale: 0.15 28 | spatial_weaken_scale: 0.001 29 | temp_strengthen_scale: 0.1 30 | temp_weaken_scale: 0.001 31 | token_inds: 32 | - 1 33 | - 2 34 | trailing_length: 11 35 | 
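36 | # A rough sketch (hedged; not part of the v0.0.2 release) of how these flat
37 | # keys map onto the nested layout used by config/Main/*.yaml, where the
38 | # denoising knobs sit under a trailblazer section. The 576x320 resolution is
39 | # assumed from the Main configs:
40 | #
41 | # keyframe: ...                   # unchanged
42 | # seed: 123451232531
43 | # token_inds: [1, 2]
44 | # width: 576
45 | # height: 320
46 | # trailblazer:
47 | #   num_dd_spatial_steps: 5
48 | #   num_dd_temporal_steps: 5
49 | #   spatial_strengthen_scale: 0.15
50 | #   spatial_weaken_scale: 0.001
51 | #   temp_strengthen_scale: 0.1
52 | #   temp_weaken_scale: 0.001
53 | #   trailing_length: 11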
-------------------------------------------------------------------------------- /config/Archive/BR2TL-fish.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.5 4 | - 0.5 5 | - 1.0 6 | - 1.0 7 | frame: 0 8 | prompt: A fish swimming in the ocean 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.0 12 | - 0.1 13 | - 0.1 14 | frame: 24 15 | prompt: A fish swimming in the ocean 16 | num_dd_spatial_steps: 5 17 | num_dd_temporal_steps: 5 18 | num_frames: 24 19 | seed: 123451232532 20 | spatial_strengthen_scale: 0.15 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.1 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | trailing_length: 15 28 | -------------------------------------------------------------------------------- /config/Archive/BR2TL-tiger.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.5 4 | - 0.5 5 | - 1.0 6 | - 1.0 7 | frame: 0 8 | prompt: A tiger walking alone down the street 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.0 12 | - 0.1 13 | - 0.1 14 | frame: 24 15 | prompt: A tiger walking alone down the street 16 | num_dd_spatial_steps: 7 17 | num_dd_temporal_steps: 7 18 | num_frames: 24 19 | seed: 123451232532 20 | spatial_strengthen_scale: 0.1 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.1 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | trailing_length: 14 28 | -------------------------------------------------------------------------------- /config/Archive/L2R-fish.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.5 4 | - 0.35 5 | - 1.0 6 | - 0.65 7 | frame: 0 8 | prompt: A clown fish swimming in a coral reef 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.35 12 | - 0.5 13 | - 0.65 14 | frame: 24 15 | prompt: A clown fish swimming in a coral reef 16 | num_dd_spatial_steps: 5 17 | num_dd_temporal_steps: 5 18 | num_frames: 24 19 | seed: 123451232531 20 | spatial_strengthen_scale: 0.1 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.1 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | - 3 28 | trailing_length: 15 29 | -------------------------------------------------------------------------------- /config/Archive/L2R-horse.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.3 6 | - 0.65 7 | frame: 0 8 | prompt: A white horse galloping on a street 9 | - bbox_ratios: 10 | - 0.7 11 | - 0.35 12 | - 1.0 13 | - 0.65 14 | frame: 24 15 | prompt: A white horse galloping on a street 16 | num_dd_spatial_steps: 5 17 | num_dd_temporal_steps: 5 18 | num_frames: 24 19 | seed: 123451232532 20 | spatial_strengthen_scale: 0.15 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.15 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | - 3 28 | trailing_length: 16 29 | -------------------------------------------------------------------------------- /config/Archive/Omg-CatDog.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.7 4 | - 0.4 5 | - 1.0 6 | - 0.65 7 | frame: 0 8 | prompt: A white cat walking on the grass 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.4 12 | - 0.3 13 | - 0.65 14 | frame: 24 15 | prompt: A yellow dog walking on the grass 16 | num_dd_spatial_steps: 5 17 | num_dd_temporal_steps: 5 18 | 
num_frames: 24 19 | seed: 123451232531 20 | spatial_strengthen_scale: 0.1 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.1 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | - 3 28 | trailing_length: 16 29 | -------------------------------------------------------------------------------- /config/Archive/Omg-IrrPath.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.4 6 | - 0.3 7 | frame: 0 8 | prompt: A horse fast galloping on a street 9 | - bbox_ratios: 10 | - 0.6 11 | - 0.0 12 | - 1.0 13 | - 0.3 14 | frame: 8 15 | prompt: A horse fast galloping on a street 16 | - bbox_ratios: 17 | - 0.0 18 | - 0.3 19 | - 0.4 20 | - 0.6 21 | frame: 10 22 | prompt: A horse fast galloping on a street 23 | - bbox_ratios: 24 | - 0.6 25 | - 0.3 26 | - 1.0 27 | - 0.6 28 | frame: 16 29 | prompt: A horse fast galloping on a street 30 | - bbox_ratios: 31 | - 0.0 32 | - 0.7 33 | - 0.4 34 | - 1.0 35 | frame: 18 36 | prompt: A horse fast galloping on a street 37 | - bbox_ratios: 38 | - 0.6 39 | - 0.7 40 | - 1.0 41 | - 1.0 42 | frame: 24 43 | prompt: A horse fast galloping on a street 44 | num_dd_spatial_steps: 5 45 | num_dd_temporal_steps: 5 46 | seed: 5 47 | spatial_strengthen_scale: 0.1 48 | spatial_weaken_scale: 0.01 49 | temp_strengthen_scale: 0.1 50 | temp_weaken_scale: 0.01 51 | token_inds: 52 | - 1 53 | - 2 54 | trailing_length: 15 55 | -------------------------------------------------------------------------------- /config/Archive/Omg-Speed-sloth.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A sloth is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A sloth is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A sloth is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A sloth is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A sloth is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Omg-Speed-snail.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A snail is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A snail is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A snail is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A snail is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A snail is running on the grass 38 | num_dd_spatial_steps: 6 39 | 
num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Omg-Speed-tortoise.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A tortoise is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A tortoise is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A tortoise is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A tortoise is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A tortoise is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Peekapoo-default.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.14 4 | - 0.13 5 | - 0.34 6 | - 0.75 7 | frame: 0 8 | prompt: A panda eating bamboo in a lush bamboo forest 9 | - bbox_ratios: 10 | - 0.14 11 | - 0.13 12 | - 0.34 13 | - 0.75 14 | frame: 24 15 | prompt: A panda eating bamboo in a lush bamboo forest 16 | seed: 2 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 50 22 | trailblazer: 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | spatial_strengthen_scale: 0.15 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.15 28 | temp_weaken_scale: 0.001 29 | trailing_length: 13 30 | text2vidzero: 31 | motion_field_strength_x: 8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 2 35 | -------------------------------------------------------------------------------- /config/Archive/Perspective-fish.0001.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.2 6 | - 0.2 7 | frame: 0 8 | prompt: A fish swimming in the sea 9 | - bbox_ratios: 10 | - 0.2 11 | - 0.2 12 | - 0.8 13 | - 0.8 14 | frame: 12 15 | prompt: A fish swimming in the sea 16 | - bbox_ratios: 17 | - 0.8 18 | - 0.8 19 | - 1.0 20 | - 1.0 21 | frame: 23 22 | prompt: A fish swimming in the sea 23 | num_dd_spatial_steps: 7 24 | num_dd_temporal_steps: 7 25 | seed: 53 26 | spatial_strengthen_scale: 0.075 27 | spatial_weaken_scale: 0.0001 28 | temp_strengthen_scale: 0.075 29 | temp_weaken_scale: 0.0001 30 | token_inds: 31 | - 1 32 | - 2 33 | - 3 34 | trailing_length: 13 35 | -------------------------------------------------------------------------------- /config/Archive/Perspective.0002.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.2 6 | - 0.2 7 | frame: 0 8 | prompt: A fish swimming in the sea 9 | - bbox_ratios: 10 | - 0.2 
11 |   - 0.2
12 |   - 0.8
13 |   - 0.8
14 |   frame: 12
15 |   prompt: A fish swimming in the sea
16 | - bbox_ratios:
17 |   - 0.5
18 |   - 0.5
19 |   - 1.0
20 |   - 1.0
21 |   frame: 23
22 |   prompt: A fish swimming in the sea
23 | num_dd_spatial_steps: 7
24 | num_dd_temporal_steps: 7
25 | seed: 53
26 | spatial_strengthen_scale: 0.075
27 | spatial_weaken_scale: 0.0001
28 | temp_strengthen_scale: 0.075
29 | temp_weaken_scale: 0.0001
30 | token_inds:
31 | - 1
32 | - 2
33 | - 3
34 | trailing_length: 13
35 | 
-------------------------------------------------------------------------------- /config/Archive/R2L-fish.yaml: --------------------------------------------------------------------------------
1 | keyframe:
2 | - bbox_ratios:
3 |   - 0.0
4 |   - 0.35
5 |   - 0.5
6 |   - 0.65
7 |   frame: 0
8 |   prompt: A clown fish swimming in a coral reef
9 | - bbox_ratios:
10 |   - 0.5
11 |   - 0.35
12 |   - 1.0
13 |   - 0.65
14 |   frame: 24
15 |   prompt: A clown fish swimming in a coral reef
16 | num_dd_spatial_steps: 5
17 | num_dd_temporal_steps: 5
18 | num_frames: 24
19 | seed: 123451232531
20 | spatial_strengthen_scale: 0.1
21 | spatial_weaken_scale: 0.001
22 | temp_strengthen_scale: 0.1
23 | temp_weaken_scale: 0.001
24 | token_inds:
25 | - 1
26 | - 2
27 | - 3
28 | trailing_length: 15
29 | 
-------------------------------------------------------------------------------- /config/Archive/R2L-horse.yaml: --------------------------------------------------------------------------------
1 | keyframe:
2 | - bbox_ratios:
3 |   - 0.7
4 |   - 0.35
5 |   - 1.0
6 |   - 0.65
7 |   frame: 0
8 |   prompt: A white horse galloping on a street
9 | - bbox_ratios:
10 |   - 0.0
11 |   - 0.35
12 |   - 0.3
13 |   - 0.65
14 |   frame: 24
15 |   prompt: A white horse galloping on a street
16 | num_dd_spatial_steps: 5
17 | num_dd_temporal_steps: 5
18 | num_frames: 24
19 | seed: 123451232538
20 | spatial_strengthen_scale: 0.1
21 | spatial_weaken_scale: 0.001
22 | temp_strengthen_scale: 0.1
23 | temp_weaken_scale: 0.001
24 | token_inds:
25 | - 1
26 | - 2
27 | - 3
28 | trailing_length: 15
29 | 
-------------------------------------------------------------------------------- /config/Archive/README.md: --------------------------------------------------------------------------------
1 | 
2 | ## Archived Results
3 | 
4 | Please note that these results were obtained with our TrailBlazer (v0.0.2)
5 | release, which uses our original config structure and is not compatible
6 | with the latest updates.
7 | 
8 | - **L2R**: Standard test moving the subject from left to right.
9 | 
10 | **L2R-fish.yaml**
11 | 
12 | ![L2R-fish.0000](../assets/figs/L2R-fish.0000.gif)
13 | 
14 | **L2R-horse.yaml**
15 | 
16 | ![L2R-horse.0000](../assets/figs/L2R-horse.0000.gif)
17 | 
18 | - **R2L**: Standard test moving the subject from right to left.
19 | 
20 | **R2L-fish.yaml**
21 | 
22 | ![R2L-fish.0000](../assets/figs/R2L-fish.0000.gif)
23 | 
24 | **R2L-horse.yaml**
25 | 
26 | ![R2L-horse.0000](../assets/figs/R2L-horse.0000.gif)
27 | 
28 | - **2ndKey**: To vary the speed by adjusting the timing of the middle keyframe.
29 | 
30 | **2ndKey-astronaut.0001.yaml**
31 | 
32 | ![2ndKey-astronaut.0001](../assets/figs/2ndKey-astronaut.0001.0000.gif)
33 | 
34 | **2ndKey-astronaut.0002.yaml**
35 | 
36 | ![2ndKey-astronaut.0002](../assets/figs/2ndKey-astronaut.0002.0000.gif)
37 | 
38 | **2ndKey-astronaut.0003.yaml**
39 | 
40 | ![2ndKey-astronaut.0003](../assets/figs/2ndKey-astronaut.0003.0000.gif)
41 | 
42 | **2ndKey-astronaut.0004.yaml**
43 | 
44 | ![2ndKey-astronaut.0004](../assets/figs/2ndKey-astronaut.0004.0000.gif)
45 | 
46 | - **BR2TL**: To move the subject from bottom right (BR) to top left (TL).
47 | 
48 | **BR2TL-fish.yaml**
49 | 
50 | ![BR2TL-fish.0000](../assets/figs/BR2TL-fish.0000.gif)
51 | 
52 | **BR2TL-tiger.yaml**
53 | 
54 | ![BR2TL-tiger.0000](../assets/figs/BR2TL-tiger.0000.gif)
55 | 
56 | - **TL2BR**: To move the subject from top left (TL) to bottom right (BR).
57 | 
58 | **TL2BR-fish.yaml**
59 | 
60 | ![TL2BR-fish.0000](../assets/figs/TL2BR-fish.0000.gif)
61 | 
62 | **TL2BR-tiger.yaml**
63 | 
64 | ![TL2BR-tiger.0000](../assets/figs/TL2BR-tiger.0000.gif)
65 | 
66 | - **Perspective-fish**: A comparison between changing and not changing the fish's bbox size to emulate perspective changes.
67 | 
68 | **Perspective-fish.0001.yaml**
69 | 
70 | ![FNF-fish.0001](../assets/figs/FNF-fish.0001.0000.gif)
71 | 
72 | **Perspective-fish.0002.yaml**
73 | 
74 | ![FNF-fish.0002](../assets/figs/FNF-fish.0002.0000.gif)
75 | 
76 | - **Speed-cat**: To see how a different number of keyframes controls the poor cat.
77 | 
78 | **Speed-cat.0001.yaml**
79 | 
80 | ![Speed-cat.0001](../assets/figs/Speed-cat.0001.0000.gif)
81 | 
82 | **Speed-cat.0002.yaml**
83 | 
84 | ![Speed-cat.0002](../assets/figs/Speed-cat.0002.0001.gif)
85 | 
86 | **Speed-cat.0003.yaml**
87 | 
88 | ![Speed-cat.0003](../assets/figs/Speed-cat.0003.0000.gif)
89 | 
90 | **Speed-cat.0004.yaml**
91 | 
92 | ![Speed-cat.0004](../assets/figs/Speed-cat.0004.0000.gif)
93 | 
94 | - **Speed-{animal}.0004**: To see how other animals run back and forth.
95 | 
96 | **Speed-dog.0004.yaml**
97 | 
98 | ![Speed-dog.0004](../assets/figs/Speed-dog.0004.0000.gif)
99 | 
100 | **Speed-horse.0004.yaml**
101 | 
102 | ![Speed-horse.0004](../assets/figs/Speed-horse.0004.0000.gif)
103 | 
104 | **Speed-tiger.0004.yaml**
105 | 
106 | ![Speed-tiger.0004](../assets/figs/Speed-tiger.0004.0000.gif)
107 | 
108 | **Speed-reindeer.0004.yaml**
109 | 
110 | ![Speed-reindeer.0004](../assets/figs/Speed-reindeer.0004.0000.gif)
111 | 
112 | - **Omg**: Configs that defy common sense can lead to unexpected and
113 | potentially problematic behavior. Oh my god.
114 | 
115 | **Omg-CatDog.yaml**: Consistency in the subject is crucial. You will witness how a cat transforms into a dog if the prompt word for the subject is inconsistent.
116 | 
117 | ![Omg-CatDog.0004](../assets/figs/Omg-CatDog.0003.gif)
118 | 
119 | 
120 | **Omg-IrrPath.yaml**: It is advised to avoid a discontinuous keyframe bbox path. In such a case, you will observe three different horses appearing from the left.
121 | 
122 | ![Omg-IrrPath.0004](../assets/figs/Omg-IrrPath.0003.gif)
123 | 
124 | 
125 | **Omg-Speed-sloth.0004.yaml**: The sloth, tortoise, and snail are recognized for their slow movements. Applying fast keyframing to these creatures may yield unexpected results.
126 | 
127 | ![Speed-sloth.0004](../assets/figs/Speed-sloth.0004.0000.gif)
128 | 
129 | **Omg-Speed-tortoise.0004.yaml**: I believe Master Roshi (Dragon Ball) is hiding in the shell.
130 | 131 | ![Speed-tortoise.0004](../assets/figs/Speed-tortoise.0004.0000.gif) 132 | 133 | **Omg-Speed-snail.0004.yaml**: It feels like the snail is rolling without using his body. 134 | 135 | ![Speed-snail.0004](../assets/figs/Speed-snail.0004.0000.gif) 136 | -------------------------------------------------------------------------------- /config/Archive/Speed-cat.0001.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.2 5 | - 0.4 6 | - 0.7 7 | frame: 0 8 | prompt: A cat is running on the grass 9 | - bbox_ratios: 10 | - 0.6 11 | - 0.2 12 | - 1.0 13 | - 0.7 14 | frame: 24 15 | prompt: A cat is running on the grass 16 | num_dd_spatial_steps: 5 17 | num_dd_temporal_steps: 5 18 | seed: 20 19 | spatial_strengthen_scale: 0.15 20 | spatial_weaken_scale: 0.001 21 | temp_strengthen_scale: 0.15 22 | temp_weaken_scale: 0.001 23 | token_inds: 24 | - 1 25 | - 2 26 | trailing_length: 15 27 | -------------------------------------------------------------------------------- /config/Archive/Speed-cat.0002.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.5 6 | - 0.65 7 | frame: 0 8 | prompt: A cat is running on the grass 9 | - bbox_ratios: 10 | - 0.5 11 | - 0.35 12 | - 1.0 13 | - 0.65 14 | frame: 12 15 | prompt: A cat is running on the grass 16 | - bbox_ratios: 17 | - 0.0 18 | - 0.35 19 | - 0.5 20 | - 0.65 21 | frame: 24 22 | prompt: A cat is running on the grass 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | seed: 20 26 | spatial_strengthen_scale: 0.15 27 | spatial_weaken_scale: 0.001 28 | temp_strengthen_scale: 0.15 29 | temp_weaken_scale: 0.001 30 | token_inds: 31 | - 1 32 | - 2 33 | trailing_length: 10 34 | -------------------------------------------------------------------------------- /config/Archive/Speed-cat.0003.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A cat is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 8 16 | prompt: A cat is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 16 23 | prompt: A cat is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 24 30 | prompt: A cat is running on the grass 31 | num_dd_spatial_steps: 5 32 | num_dd_temporal_steps: 5 33 | seed: 20 34 | spatial_strengthen_scale: 0.15 35 | spatial_weaken_scale: 0.001 36 | temp_strengthen_scale: 0.15 37 | temp_weaken_scale: 0.001 38 | token_inds: 39 | - 1 40 | - 2 41 | trailing_length: 10 42 | -------------------------------------------------------------------------------- /config/Archive/Speed-cat.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A cat is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A cat is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A cat is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: 
A cat is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A cat is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Speed-cheetah.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A cheetah is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A cheetah is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A cheetah is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A cheetah is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A cheetah is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Speed-dog.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A dog is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A dog is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A dog is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A dog is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A dog is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Speed-horse.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A horse is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A horse is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A horse is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A horse is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | 
prompt: A horse is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Speed-reindeer.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A reindeer is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A reindeer is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A reindeer is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A reindeer is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A reindeer is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/Speed-tiger.0004.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.4 7 | - 0.65 8 | frame: 0 9 | prompt: A tiger is running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A tiger is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.4 21 | - 0.65 22 | frame: 12 23 | prompt: A tiger is running on the grass 24 | - bbox_ratios: 25 | - 0.6 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 18 30 | prompt: A tiger is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.4 35 | - 0.65 36 | frame: 24 37 | prompt: A tiger is running on the grass 38 | num_dd_spatial_steps: 6 39 | num_dd_temporal_steps: 6 40 | seed: 1 41 | spatial_strengthen_scale: 0.1 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.1 44 | temp_weaken_scale: 0.01 45 | token_inds: 46 | - 1 47 | - 2 48 | trailing_length: 20 49 | -------------------------------------------------------------------------------- /config/Archive/TL2BR-fish.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.1 6 | - 0.1 7 | frame: 0 8 | prompt: A fish swimming in the ocean 9 | - bbox_ratios: 10 | - 0.5 11 | - 0.5 12 | - 1.0 13 | - 1.0 14 | frame: 24 15 | prompt: A fish swimming in the ocean 16 | num_dd_spatial_steps: 7 17 | num_dd_temporal_steps: 7 18 | num_frames: 24 19 | seed: 123451232532 20 | spatial_strengthen_scale: 0.1 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.1 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | trailing_length: 14 28 | -------------------------------------------------------------------------------- /config/Archive/TL2BR-tiger.yaml: -------------------------------------------------------------------------------- 1 | 
keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.1 6 | - 0.1 7 | frame: 0 8 | prompt: A tiger walking alone down the street 9 | - bbox_ratios: 10 | - 0.5 11 | - 0.5 12 | - 1.0 13 | - 1.0 14 | frame: 24 15 | prompt: A tiger walking alone down the street 16 | num_dd_spatial_steps: 7 17 | num_dd_temporal_steps: 7 18 | num_frames: 24 19 | seed: 123451232532 20 | spatial_strengthen_scale: 0.1 21 | spatial_weaken_scale: 0.001 22 | temp_strengthen_scale: 0.1 23 | temp_weaken_scale: 0.001 24 | token_inds: 25 | - 1 26 | - 2 27 | trailing_length: 14 28 | -------------------------------------------------------------------------------- /config/Main/PerspBR2TL-Tiger.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.5 4 | - 0.5 5 | - 1.0 6 | - 1.0 7 | frame: 0 8 | prompt: A tiger walking alone down the street 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.0 12 | - 0.1 13 | - 0.1 14 | frame: 24 15 | prompt: A tiger walking alone down the street 16 | seed: 123451232531 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | trailblazer: 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | spatial_strengthen_scale: 0.15 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.15 28 | temp_weaken_scale: 0.001 29 | trailing_length: 13 30 | text2vidzero: 31 | motion_field_strength_x: -8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 2 35 | -------------------------------------------------------------------------------- /config/Main/PerspTL2BR-Tiger.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.1 6 | - 0.1 7 | frame: 0 8 | prompt: A tiger walking alone down the street 9 | - bbox_ratios: 10 | - 0.5 11 | - 0.5 12 | - 1.0 13 | - 1.0 14 | frame: 24 15 | prompt: A tiger walking alone down the street 16 | seed: 123451232531 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | trailblazer: 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | spatial_strengthen_scale: 0.15 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.15 28 | temp_weaken_scale: 0.001 29 | trailing_length: 13 30 | text2vidzero: 31 | motion_field_strength_x: -8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 2 35 | -------------------------------------------------------------------------------- /config/Main/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hohonu-vicml/TrailBlazer/de2696ef50537a473ab5afc57de89a5edc97c00b/config/Main/README.md -------------------------------------------------------------------------------- /config/Main/RigidMoving-Astronaut.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.3 6 | - 0.65 7 | frame: 0 8 | prompt: An astronaut walking on the moon 9 | - bbox_ratios: 10 | - 0.7 11 | - 0.35 12 | - 1.0 13 | - 0.65 14 | frame: 24 15 | prompt: An astronaut walking on the moon 16 | seed: 1234 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | trailblazer: 23 | num_dd_spatial_steps: 7 24 | num_dd_temporal_steps: 7 25 | spatial_strengthen_scale: 0.125 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.125 28 | temp_weaken_scale: 0.001 29 | trailing_length: 10 30 | text2vidzero: 31 | 
motion_field_strength_x: -8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 2 35 | -------------------------------------------------------------------------------- /config/Main/RigidMoving-Bee.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.3 5 | - 0.3 6 | - 0.7 7 | frame: 0 8 | prompt: A macro video of a bee pollinating a flower 9 | - bbox_ratios: 10 | - 0.7 11 | - 0.3 12 | - 1.0 13 | - 0.7 14 | frame: 24 15 | prompt: A macro video of a bee pollinating a flower 16 | seed: 123451232532 17 | token_inds: 18 | - 6 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | trailblazer: 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | spatial_strengthen_scale: 0.15 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.15 28 | temp_weaken_scale: 0.001 29 | trailing_length: 13 30 | text2vidzero: 31 | motion_field_strength_x: -8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 3 35 | -------------------------------------------------------------------------------- /config/Main/RigidMoving-Cat.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.1 4 | - 0.3 5 | - 0.4 6 | - 0.7 7 | frame: 0 8 | prompt: A cat walking on the grass 9 | - bbox_ratios: 10 | - 0.6 11 | - 0.3 12 | - 0.9 13 | - 0.7 14 | frame: 24 15 | prompt: A cat walking on the grass 16 | seed: 4 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | trailblazer: 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | spatial_strengthen_scale: 0.15 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.15 28 | temp_weaken_scale: 0.001 29 | trailing_length: 15 30 | text2vidzero: 31 | motion_field_strength_x: -8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 2 35 | -------------------------------------------------------------------------------- /config/Main/RigidMoving-Clownfish.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.3 6 | - 0.65 7 | frame: 0 8 | prompt: A clownfish swimming in a coral reef 9 | - bbox_ratios: 10 | - 0.7 11 | - 0.35 12 | - 1.0 13 | - 0.65 14 | frame: 24 15 | prompt: A clownfish swimming in a coral reef 16 | seed: 123451232531 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | trailblazer: 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | spatial_strengthen_scale: 0.15 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.15 28 | temp_weaken_scale: 0.001 29 | trailing_length: 13 30 | text2vidzero: 31 | motion_field_strength_x: -8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 2 35 | -------------------------------------------------------------------------------- /config/Main/SpeedKeys-Cat.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, bottom, right 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.3 7 | - 0.65 8 | frame: 0 9 | prompt: A cat is running on the grass 10 | - bbox_ratios: 11 | - 0.7 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 6 16 | prompt: A cat is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.3 21 | - 0.65 22 | frame: 12 23 | prompt: A cat is running on the grass 24 | - bbox_ratios: 25 | - 0.7 26 | - 0.35 27 | - 1.0 28 | - 
0.65 29 | frame: 18 30 | prompt: A cat is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.3 35 | - 0.65 36 | frame: 24 37 | prompt: A cat is running on the grass 38 | seed: 123451232532 39 | token_inds: 40 | - 2 41 | width: 576 42 | height: 320 43 | num_inference_steps: 44 | trailblazer: 45 | num_dd_spatial_steps: 5 46 | num_dd_temporal_steps: 5 47 | spatial_strengthen_scale: 0.125 48 | spatial_weaken_scale: 0.05 49 | temp_strengthen_scale: 0.125 50 | temp_weaken_scale: 0.05 51 | trailing_length: 15 52 | peekaboo: 53 | frozen_steps: 2 54 | -------------------------------------------------------------------------------- /config/Metric/AirBalloon.yaml: -------------------------------------------------------------------------------- 1 | prompt: A hot air balloon drifting across a clear sky 2 | seed: 4 3 | token_inds: 4 | - 4 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/AirBalloon2.yaml: -------------------------------------------------------------------------------- 1 | prompt: A colorful hot air balloon tethered to the ground 2 | seed: 4 3 | token_inds: 4 | - 3 5 | - 4 6 | - 5 7 | width: 576 8 | height: 320 9 | num_inference_steps: 10 | trailblazer: 11 | num_dd_spatial_steps: 5 12 | num_dd_temporal_steps: 5 13 | spatial_strengthen_scale: 0.1 14 | spatial_weaken_scale: 0.001 15 | temp_strengthen_scale: 0.1 16 | temp_weaken_scale: 0.001 17 | trailing_length: 13 18 | text2vidzero: 19 | motion_field_strength_x: -8 20 | motion_field_strength_y: 0 21 | peekaboo: 22 | frozen_steps: 2 23 | -------------------------------------------------------------------------------- /config/Metric/Bear.yaml: -------------------------------------------------------------------------------- 1 | prompt: A bear climbing down a tree after spotting a threat 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Bird.yaml: -------------------------------------------------------------------------------- 1 | prompt: A bird diving towards the water to catch fish 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Bus.yaml: 
-------------------------------------------------------------------------------- 1 | prompt: A red double-decker bus moving through London streets 2 | seed: 4 3 | token_inds: 4 | - 2 5 | - 3 6 | - 4 7 | width: 576 8 | height: 320 9 | num_inference_steps: 10 | trailblazer: 11 | num_dd_spatial_steps: 5 12 | num_dd_temporal_steps: 5 13 | spatial_strengthen_scale: 0.1 14 | spatial_weaken_scale: 0.001 15 | temp_strengthen_scale: 0.1 16 | temp_weaken_scale: 0.001 17 | trailing_length: 13 18 | text2vidzero: 19 | motion_field_strength_x: -8 20 | motion_field_strength_y: 0 21 | peekaboo: 22 | frozen_steps: 2 23 | -------------------------------------------------------------------------------- /config/Metric/Camel.yaml: -------------------------------------------------------------------------------- 1 | prompt: A camel resting in a desert landscape 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Deer.yaml: -------------------------------------------------------------------------------- 1 | prompt: A deer standing in a snowy field 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Dolphin.yaml: -------------------------------------------------------------------------------- 1 | prompt: A dolphin just breaking the ocean surface 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Duck.yaml: -------------------------------------------------------------------------------- 1 | prompt: A duck diving underwater in search of food 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Fox.yaml: -------------------------------------------------------------------------------- 1 | 
prompt: A fox sitting in a forest clearing 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Frog.yaml: -------------------------------------------------------------------------------- 1 | prompt: A frog leaping up to catch a fly 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Helicopter.yaml: -------------------------------------------------------------------------------- 1 | prompt: A helicopter hovering above a cityscape 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/House.yaml: -------------------------------------------------------------------------------- 1 | prompt: A horse grazing in a meadow 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Jet.yaml: -------------------------------------------------------------------------------- 1 | prompt: A jet plane flying high in the sky 2 | seed: 4 3 | token_inds: 4 | - 2 5 | - 3 6 | width: 576 7 | height: 320 8 | num_inference_steps: 9 | trailblazer: 10 | num_dd_spatial_steps: 5 11 | num_dd_temporal_steps: 5 12 | spatial_strengthen_scale: 0.1 13 | spatial_weaken_scale: 0.001 14 | temp_strengthen_scale: 0.1 15 | temp_weaken_scale: 0.001 16 | trailing_length: 13 17 | text2vidzero: 18 | motion_field_strength_x: -8 19 | motion_field_strength_y: 0 20 | peekaboo: 21 | frozen_steps: 2 22 | -------------------------------------------------------------------------------- /config/Metric/Kangaroo.yaml: -------------------------------------------------------------------------------- 1 | prompt: A kangaroo hopping down a gentle slope 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | 
num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Kangaroo2.yaml: -------------------------------------------------------------------------------- 1 | prompt: A kangaroo standing in the Australian outback 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Leaf.yaml: -------------------------------------------------------------------------------- 1 | prompt: A leaf falling gently from a tree 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Lion.yaml: -------------------------------------------------------------------------------- 1 | prompt: A lion lying in the savanna grass 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Owl.yaml: -------------------------------------------------------------------------------- 1 | prompt: An owl swooping down on its prey during the night 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Owl2.yaml: -------------------------------------------------------------------------------- 1 | prompt: An owl perched silently in a tree at night 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | 
spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Panda.yaml: -------------------------------------------------------------------------------- 1 | prompt: A panda munching bamboo in a bamboo forest 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Paper.yaml: -------------------------------------------------------------------------------- 1 | prompt: A paper plane gliding in the air 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Parrot.yaml: -------------------------------------------------------------------------------- 1 | prompt: A parrot flying upwards towards the treetops 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Penguin.yaml: -------------------------------------------------------------------------------- 1 | prompt: A penguin standing on an iceberg 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Rabbit.yaml: -------------------------------------------------------------------------------- 1 | prompt: A rabbit burrowing downwards into its warren 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | 
temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Rocket.yaml: -------------------------------------------------------------------------------- 1 | prompt: A rocket launching into space from a launchpad 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Roller.yaml: -------------------------------------------------------------------------------- 1 | prompt: A woodpecker climbing up a tree trunk 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/RollerCoaster.yaml: -------------------------------------------------------------------------------- 1 | prompt: A roller coaster looping in an amusement park 2 | seed: 4 3 | token_inds: 4 | - 2 5 | - 3 6 | width: 576 7 | height: 320 8 | num_inference_steps: 9 | trailblazer: 10 | num_dd_spatial_steps: 5 11 | num_dd_temporal_steps: 5 12 | spatial_strengthen_scale: 0.1 13 | spatial_weaken_scale: 0.001 14 | temp_strengthen_scale: 0.1 15 | temp_weaken_scale: 0.001 16 | trailing_length: 13 17 | text2vidzero: 18 | motion_field_strength_x: -8 19 | motion_field_strength_y: 0 20 | peekaboo: 21 | frozen_steps: 2 22 | -------------------------------------------------------------------------------- /config/Metric/Satellite.yaml: -------------------------------------------------------------------------------- 1 | prompt: A satellite orbiting Earth in outer space 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Skateboarder.yaml: -------------------------------------------------------------------------------- 1 | prompt: A skateboarder performing tricks at a skate park 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | 
text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Squirrel.yaml: -------------------------------------------------------------------------------- 1 | prompt: A squirrel descending a tree after gathering nuts. 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Squirrel2.yaml: -------------------------------------------------------------------------------- 1 | prompt: A squirrel jumping from one tree to another 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/StreetCar.yaml: -------------------------------------------------------------------------------- 1 | prompt: A streetcar trundling down tracks in a historic district 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Swan.yaml: -------------------------------------------------------------------------------- 1 | prompt: A swan floating gracefully on a lake 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Metric/Woodpecker.yaml: -------------------------------------------------------------------------------- 1 | prompt: A woodpecker climbing up a tree trunk 2 | seed: 4 3 | token_inds: 4 | - 2 5 | width: 576 6 | height: 320 7 | num_inference_steps: 8 | trailblazer: 9 | num_dd_spatial_steps: 5 10 | num_dd_temporal_steps: 5 11 | spatial_strengthen_scale: 0.1 12 | spatial_weaken_scale: 0.001 13 | temp_strengthen_scale: 0.1 14 | temp_weaken_scale: 0.001 15 | trailing_length: 13 16 | text2vidzero: 17 | motion_field_strength_x: -8 18 | 
motion_field_strength_y: 0 19 | peekaboo: 20 | frozen_steps: 2 21 | -------------------------------------------------------------------------------- /config/Morphin/Cat2Dog.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.7 4 | - 0.4 5 | - 1.0 6 | - 0.65 7 | frame: 0 8 | prompt: A white cat walking on the grass 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.4 12 | - 0.3 13 | - 0.65 14 | frame: 24 15 | prompt: A yellow dog walking on the grass 16 | seed: 123451232531 17 | token_inds: 18 | - 1 19 | - 2 20 | - 3 21 | width: 512 22 | height: 512 23 | trailblazer: 24 | num_dd_spatial_steps: 5 25 | num_dd_temporal_steps: 5 26 | spatial_strengthen_scale: 0.1 27 | spatial_weaken_scale: 0.001 28 | temp_strengthen_scale: 0.1 29 | temp_weaken_scale: 0.001 30 | trailing_length: 16 31 | -------------------------------------------------------------------------------- /config/Morphin/Cat2Fish.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.7 4 | - 0.4 5 | - 1.0 6 | - 0.65 7 | frame: 0 8 | prompt: A white cat walking on the grass 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.4 12 | - 0.3 13 | - 0.65 14 | frame: 24 15 | prompt: A golden fish walking on the grass 16 | seed: 123451232531 17 | token_inds: 18 | - 1 19 | - 2 20 | - 3 21 | width: 512 22 | height: 512 23 | trailblazer: 24 | num_dd_spatial_steps: 5 25 | num_dd_temporal_steps: 5 26 | spatial_strengthen_scale: 0.1 27 | spatial_weaken_scale: 0.001 28 | temp_strengthen_scale: 0.1 29 | temp_weaken_scale: 0.001 30 | trailing_length: 16 31 | -------------------------------------------------------------------------------- /config/Morphin/Parrot2Penguin.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.55 5 | - 0.3 6 | - 0.65 7 | frame: 0 8 | prompt: A colorful parrots walking on the beach 9 | - bbox_ratios: 10 | - 0.3 11 | - 0.55 12 | - 0.6 13 | - 0.65 14 | frame: 4 15 | prompt: A colorful parrots walking on the beach 16 | - bbox_ratios: 17 | - 0.4 18 | - 0.35 19 | - 0.7 20 | - 0.65 21 | frame: 12 22 | prompt: a king penguin walking on the beach 23 | - bbox_ratios: 24 | - 0.7 25 | - 0.35 26 | - 1.0 27 | - 0.65 28 | frame: 24 29 | prompt: a king penguin walking on the beach 30 | seed: 123451232578 31 | token_inds: 32 | - 1 33 | - 2 34 | - 3 35 | width: 512 36 | height: 512 37 | num_inference_steps: 38 | trailblazer: 39 | num_dd_spatial_steps: 3 40 | num_dd_temporal_steps: 3 41 | spatial_strengthen_scale: 0.25 42 | spatial_weaken_scale: 0.01 43 | temp_strengthen_scale: 0.25 44 | temp_weaken_scale: 0.01 45 | trailing_length: 10 46 | -------------------------------------------------------------------------------- /config/Morphin/Tiger2Elephant.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.5 5 | - 0.3 6 | - 0.7 7 | frame: 0 8 | prompt: A tiger walking in the wildpark 9 | - bbox_ratios: 10 | - 0.7 11 | - 0.3 12 | - 1.0 13 | - 0.7 14 | frame: 24 15 | prompt: An elephant walking in the wildpark 16 | seed: 123451232579 17 | token_inds: 18 | - 1 19 | - 2 20 | width: 512 21 | height: 512 22 | num_inference_steps: 23 | trailblazer: 24 | num_dd_spatial_steps: 5 25 | num_dd_temporal_steps: 5 26 | spatial_strengthen_scale: 0.1 27 | spatial_weaken_scale: 0.001 28 | temp_strengthen_scale: 0.1 29 | temp_weaken_scale: 0.001 30 | trailing_length: 13 31 | 
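# Note: the two keyframes above morph the subject prompt from "A tiger" (frame 0) 32 | # to "An elephant" (frame 24) while the bbox travels from the left side to the 33 | # right side of the frame; each bbox_ratios tuple is (left, top, right, bottom) 34 | # in normalized [0, 1] coordinates.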
-------------------------------------------------------------------------------- /config/Multi/MultiSubject-Cat.yaml: -------------------------------------------------------------------------------- 1 | # See line 92 - line 95 in generate.py for how Peekaboo generates the bbox 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.2 7 | - 0.65 8 | frame: 0 9 | prompt: A white cat running on the grass 10 | - bbox_ratios: 11 | - 0.3 12 | - 0.35 13 | - 0.5 14 | - 0.65 15 | frame: 24 16 | prompt: A white cat running on the grass 17 | token_inds: 18 | - 1 19 | - 2 20 | - 3 21 | width: 512 22 | height: 512 23 | num_inference_steps: 24 | seed: 3 25 | trailblazer: 26 | num_dd_spatial_steps: 5 27 | num_dd_temporal_steps: 5 28 | spatial_strengthen_scale: 0.5 29 | spatial_weaken_scale: 0.01 30 | temp_strengthen_scale: 0.5 31 | temp_weaken_scale: 0.01 32 | trailing_length: 4 33 | -------------------------------------------------------------------------------- /config/Multi/MultiSubject-Dog.yaml: -------------------------------------------------------------------------------- 1 | # See line 92 - line 95 in generate.py for how Peekaboo generates the bbox 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.8 5 | - 0.35 6 | - 1.0 7 | - 0.65 8 | frame: 0 9 | prompt: A yellow dog running on the grass 10 | - bbox_ratios: 11 | - 0.6 12 | - 0.35 13 | - 0.8 14 | - 0.65 15 | frame: 24 16 | prompt: A yellow dog running on the grass 17 | token_inds: 18 | - 1 19 | - 2 20 | - 3 21 | width: 512 22 | height: 512 23 | num_inference_steps: 24 | seed: 2 25 | trailblazer: 26 | num_dd_spatial_steps: 5 27 | num_dd_temporal_steps: 5 28 | spatial_strengthen_scale: 0.5 29 | spatial_weaken_scale: 0.05 30 | temp_strengthen_scale: 0.5 31 | temp_weaken_scale: 0.05 32 | trailing_length: 4 33 | -------------------------------------------------------------------------------- /config/Multi/MultiSubjects.yaml: -------------------------------------------------------------------------------- 1 | multisubs: 2 | seed: 12345611 3 | num_integration_steps: 20 4 | prompt: a white cat and a yellow dog running in the botanic garden 5 | subjects: 6 | - /tmp/TrailBlazer/MultiSubject-Dog.0000.pt 7 | - /tmp/TrailBlazer/MultiSubject-Cat.0000.pt 8 | -------------------------------------------------------------------------------- /config/Peekaboo/2ndKeyFast.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.35 5 | - 0.25 6 | - 0.75 7 | frame: 0 8 | prompt: an elephant standing on the moon 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.35 12 | - 0.25 13 | - 0.75 14 | frame: 18 15 | prompt: an elephant standing on the moon 16 | - bbox_ratios: 17 | - 0.75 18 | - 0.35 19 | - 1.0 20 | - 0.75 21 | frame: 24 22 | prompt: an elephant running on the moon 23 | token_inds: 24 | - 2 25 | width: 576 26 | height: 320 27 | num_inference_steps: 28 | seed: 2 29 | trailblazer: 30 | num_dd_spatial_steps: 5 31 | num_dd_temporal_steps: 5 32 | spatial_strengthen_scale: 0.175 33 | spatial_weaken_scale: 0.05 34 | temp_strengthen_scale: 0.175 35 | temp_weaken_scale: 0.05 36 | trailing_length: 15 37 | peekaboo: 38 | frozen_steps: 2 39 | -------------------------------------------------------------------------------- /config/Peekaboo/ChangingFish.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.2 6 | - 0.2 7 | frame: 0 8 | prompt: a photorealistic whale jumping out of water while smoking a cigar 9 | -
bbox_ratios: 10 | - 0.3 11 | - 0.2 12 | - 0.7 13 | - 0.8 14 | frame: 12 15 | prompt: a photorealistic whale jumping out of water while smoking a cigar 16 | - bbox_ratios: 17 | - 0.8 18 | - 0.8 19 | - 1.0 20 | - 1.0 21 | frame: 24 22 | prompt: a photorealistic whale jumping out of water while smoking a cigar 23 | token_inds: 24 | - 3 25 | width: 576 26 | height: 320 27 | num_inference_steps: 28 | seed: 2 29 | trailblazer: 30 | num_dd_spatial_steps: 5 31 | num_dd_temporal_steps: 5 32 | spatial_strengthen_scale: 0.1 33 | spatial_weaken_scale: 0.01 34 | temp_strengthen_scale: 0.1 35 | temp_weaken_scale: 0.01 36 | trailing_length: 20 37 | peekaboo: 38 | frozen_steps: 2 39 | -------------------------------------------------------------------------------- /config/Peekaboo/CrazyHorse.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.0 4 | - 0.0 5 | - 0.25 6 | - 0.3 7 | frame: 0 8 | prompt: A horse fast galloping on a street 9 | - bbox_ratios: 10 | - 0.0 11 | - 0.6 12 | - 0.25 13 | - 1.0 14 | frame: 5 15 | prompt: A horse fast galloping on a street 16 | - bbox_ratios: 17 | - 0.4 18 | - 0.0 19 | - 0.65 20 | - 0.3 21 | frame: 10 22 | prompt: A horse fast galloping on a street 23 | - bbox_ratios: 24 | - 0.4 25 | - 0.6 26 | - 0.65 27 | - 1.0 28 | frame: 15 29 | prompt: A horse fast galloping on a street 30 | - bbox_ratios: 31 | - 0.75 32 | - 0.0 33 | - 1.0 34 | - 0.3 35 | frame: 20 36 | prompt: A horse fast galloping on a street 37 | - bbox_ratios: 38 | - 0.75 39 | - 0.6 40 | - 1.0 41 | - 1.0 42 | frame: 24 43 | prompt: A horse fast galloping on a street 44 | token_inds: 45 | - 2 46 | width: 576 47 | height: 320 48 | num_inference_steps: 49 | seed: 2 50 | trailblazer: 51 | num_dd_spatial_steps: 5 52 | num_dd_temporal_steps: 5 53 | spatial_strengthen_scale: 0.2 54 | spatial_weaken_scale: 0.05 55 | temp_strengthen_scale: 0.2 56 | temp_weaken_scale: 0.05 57 | trailing_length: 10 58 | peekaboo: 59 | frozen_steps: 2 60 | -------------------------------------------------------------------------------- /config/Peekaboo/FastDog.yaml: -------------------------------------------------------------------------------- 1 | # bbox_ratios: left, top, right, bottom 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.0 5 | - 0.35 6 | - 0.3 7 | - 0.65 8 | frame: 0 9 | prompt: A dog is running on the grass 10 | - bbox_ratios: 11 | - 0.7 12 | - 0.35 13 | - 1.0 14 | - 0.65 15 | frame: 3 16 | prompt: A dog is running on the grass 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.35 20 | - 0.3 21 | - 0.65 22 | frame: 6 23 | prompt: A dog is running on the grass 24 | - bbox_ratios: 25 | - 0.7 26 | - 0.35 27 | - 1.0 28 | - 0.65 29 | frame: 9 30 | prompt: A dog is running on the grass 31 | - bbox_ratios: 32 | - 0.0 33 | - 0.35 34 | - 0.3 35 | - 0.65 36 | frame: 12 37 | prompt: A dog is running on the grass 38 | - bbox_ratios: 39 | - 0.7 40 | - 0.35 41 | - 1.0 42 | - 0.65 43 | frame: 15 44 | prompt: A dog is running on the grass 45 | - bbox_ratios: 46 | - 0.0 47 | - 0.35 48 | - 0.3 49 | - 0.65 50 | frame: 18 51 | prompt: A dog is running on the grass 52 | - bbox_ratios: 53 | - 0.7 54 | - 0.35 55 | - 1.0 56 | - 0.65 57 | frame: 21 58 | prompt: A dog is running on the grass 59 | - bbox_ratios: 60 | - 0.0 61 | - 0.35 62 | - 0.3 63 | - 0.65 64 | frame: 24 65 | prompt: A dog is running on the grass 66 | seed: 123451232532 67 | token_inds: 68 | - 2 69 | width: 576 70 | height: 320 71 | num_inference_steps: 72 | trailblazer: 73 | num_dd_spatial_steps: 5 74 | num_dd_temporal_steps:
5 75 | spatial_strengthen_scale: 0.2 76 | spatial_weaken_scale: 0.05 77 | temp_strengthen_scale: 0.2 78 | temp_weaken_scale: 0.05 79 | trailing_length: 15 80 | peekaboo: 81 | frozen_steps: 2 82 | -------------------------------------------------------------------------------- /config/Peekaboo/Peekaboo-Reproduce.yaml: -------------------------------------------------------------------------------- 1 | # See line 92 - line 95 in generate.py for how Peekaboo generates the bbox 2 | keyframe: 3 | - bbox_ratios: 4 | - 0.138 5 | - 0.25 6 | - 0.416 7 | - 0.625 8 | frame: 0 9 | prompt: A panda eating bamboo in a lush bamboo forest 10 | - bbox_ratios: 11 | - 0.138 12 | - 0.25 13 | - 0.416 14 | - 0.625 15 | frame: 24 16 | prompt: A panda eating bamboo in a lush bamboo forest 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | seed: 2 23 | trailblazer: 24 | num_dd_spatial_steps: 5 25 | num_dd_temporal_steps: 5 26 | spatial_strengthen_scale: 0.175 27 | spatial_weaken_scale: 0.05 28 | temp_strengthen_scale: 0.175 29 | temp_weaken_scale: 0.05 30 | trailing_length: 15 31 | peekaboo: 32 | frozen_steps: 2 33 | -------------------------------------------------------------------------------- /config/Peekaboo/README.md: -------------------------------------------------------------------------------- 1 | # Peekaboo comparison 2 | -------------------------------------------------------------------------------- /config/Peekaboo/TinyFish.yaml: -------------------------------------------------------------------------------- 1 | keyframe: 2 | - bbox_ratios: 3 | - 0.475 4 | - 0.45 5 | - 0.525 6 | - 0.65 7 | frame: 0 8 | prompt: A clownfish swimming in a coral reef 9 | - bbox_ratios: 10 | - 0.475 11 | - 0.45 12 | - 0.525 13 | - 0.65 14 | frame: 24 15 | prompt: A clownfish swimming in a coral reef 16 | seed: 123451232531 17 | token_inds: 18 | - 2 19 | width: 576 20 | height: 320 21 | num_inference_steps: 22 | trailblazer: 23 | num_dd_spatial_steps: 5 24 | num_dd_temporal_steps: 5 25 | spatial_strengthen_scale: 0.3 26 | spatial_weaken_scale: 0.001 27 | temp_strengthen_scale: 0.3 28 | temp_weaken_scale: 0.001 29 | trailing_length: 35 30 | text2vidzero: 31 | motion_field_strength_x: -8 32 | motion_field_strength_y: 0 33 | peekaboo: 34 | frozen_steps: 2 35 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | # TrailBlazer: Config 2 | 3 | Below, we display icons of the results, reduced to 20% of the 4 | original resolution to keep the repository size small.
5 | 6 | 7 | ## Main - TrailBlazer 8 | 9 | ### Fig.5 Main result: Rigid bbox moving from left to right 10 | 11 | Main/RigidMoving-Astronaut.yaml 12 | 13 | ![RigidMoving-Astronaut.0000.gif](../assets/v1-TrailBlazer/RigidMoving-Astronaut.0000.gif) 14 | 15 | Main/RigidMoving-Bee.yaml 16 | 17 | ![RigidMoving-Bee.0000.gif](../assets/v1-TrailBlazer/RigidMoving-Bee.0000.gif) 18 | 19 | Main/RigidMoving-Cat.yaml 20 | 21 | ![RigidMoving-Cat.0000.gif](../assets/v1-TrailBlazer/RigidMoving-Cat.0000.gif) 22 | 23 | Main/RigidMoving-Clownfish.yaml 24 | 25 | ![RigidMoving-Clownfish.0000.gif](../assets/v1-TrailBlazer/RigidMoving-Clownfish.0000.gif) 26 | 27 | ### Fig.6 Main result: Dynamic moving bbox 28 | 29 | Main/PerspTL2BR-Tiger.yaml 30 | 31 | ![PerspTL2BR-Tiger.0000.gif](../assets/v1-TrailBlazer/PerspTL2BR-Tiger.0000.gif) 32 | 33 | Main/PerspBR2TL-Tiger.yaml 34 | 35 | ![PerspBR2TL-Tiger.0000.gif](../assets/v1-TrailBlazer/PerspBR2TL-Tiger.0000.gif) 36 | 37 | Main/SpeedKeys-Cat.yaml 38 | 39 | ![SpeedKeys-Cat.0000.gif](../assets/v1-TrailBlazer/SpeedKeys-Cat.0000.gif) 40 | 41 | ## Main - Peekaboo 42 | 43 | ### Fig.5 Main result: Rigid bbox moving from left to right 44 | 45 | Main/RigidMoving-Astronaut.yaml 46 | 47 | ![RigidMoving-Astronaut.0000.gif](../assets/v1-Peekaboo/RigidMoving-Astronaut.0000.gif) 48 | 49 | Main/RigidMoving-Bee.yaml 50 | 51 | ![RigidMoving-Bee.0006.gif](../assets/v1-Peekaboo/RigidMoving-Bee.0006.gif) 52 | 53 | Main/RigidMoving-Cat.yaml 54 | 55 | ![RigidMoving-Cat.0010.gif](../assets/v1-Peekaboo/RigidMoving-Cat.0010.gif) 56 | 57 | Main/RigidMoving-Clownfish.yaml 58 | 59 | ![RigidMoving-Clownfish.0001.gif](../assets/v1-Peekaboo/RigidMoving-Clownfish.0001.gif) 60 | 61 | ### Fig.6 Main result: Dynamic moving bbox 62 | 63 | Main/PerspTL2BR-Tiger.yaml 64 | 65 | ![PerspTL2BR-Tiger.0000.gif](../assets/v1-Peekaboo/PerspTL2BR-Tiger.0000.gif) 66 | 67 | Main/PerspBR2TL-Tiger.yaml 68 | 69 | ![PerspBR2TL-Tiger.0000.gif](../assets/v1-Peekaboo/PerspBR2TL-Tiger.0000.gif) 70 | 71 | Main/SpeedKeys-Cat.yaml 72 | 73 | ![SpeedKeys-Cat.0000.gif](../assets/v1-Peekaboo/SpeedKeys-Cat.0000.gif) 74 | 75 | ## Main - Text2Video-Zero 76 | 77 | ### Fig.5 Main result: Rigid bbox moving from left to right 78 | 79 | Main/RigidMoving-Astronaut.yaml 80 | 81 | ![RigidMoving-Astronaut.0000.gif](../assets/v1-T2VZero/RigidMoving-Astronaut.0000.gif) 82 | 83 | Main/RigidMoving-Bee.yaml 84 | 85 | ![RigidMoving-Bee.0000.gif](../assets/v1-T2VZero/RigidMoving-Bee.0000.gif) 86 | 87 | Main/RigidMoving-Cat.yaml 88 | 89 | ![RigidMoving-Cat.0000.gif](../assets/v1-T2VZero/RigidMoving-Cat.0000.gif) 90 | 91 | Main/RigidMoving-Clownfish.yaml 92 | 93 | ![RigidMoving-Clownfish.0000.gif](../assets/v1-T2VZero/RigidMoving-Clownfish.0000.gif) 94 | 95 | ## Morphin 96 | 97 | We refrain from discussing these results in the paper, but we provide a glimpse of 98 | what can be experimented with.
99 | 100 | Morphin/Tiger2Elephant.yaml 101 | 102 | ![Tiger2Elephant.0000.gif](../assets/v1-TrailBlazer/Tiger2Elephant.0000.gif) 103 | 104 | Morphin/Parrot2Penguin.yaml 105 | 106 | ![Parrot2Penguin.0000.gif](../assets/v1-TrailBlazer/Parrot2Penguin.0000.gif) 107 | 108 | Morphin/Cat2Fish.yaml 109 | 110 | ![Cat2Fish.0000.gif](../assets/v1-TrailBlazer/Cat2Fish.0000.gif) 111 | 112 | Morphin/Cat2Dog.yaml 113 | 114 | ![Cat2Dog.0000.gif](../assets/v1-TrailBlazer/Cat2Dog.0000.gif) 115 | 116 | ## Multi 117 | 118 | Multi/MultiSubject-Cat.yaml 119 | 120 | ![MultiSubject-Cat.0000.gif](../assets/v1-TrailBlazer/MultiSubject-Cat.0000.gif) 121 | 122 | Multi/MultiSubject-Dog.yaml 123 | 124 | ![MultiSubject-Dog.0000.gif](../assets/v1-TrailBlazer/MultiSubject-Dog.0000.gif) 125 | 126 | Multi/MultiSubjects.yaml 127 | 128 | ![MultiSubjects.0000.gif](../assets/v1-TrailBlazer/MultiSubjects.0000.gif) 129 | 130 | ## (Supp Fig.4) Extreme comparison - TrailBlazer 131 | 132 | Peekaboo/2ndKeyFast.yaml 133 | 134 | ![2ndKeyFast.0003.gif](../assets/v1-TrailBlazer/2ndKeyFast.0003.gif) 135 | 136 | Peekaboo/ChangingFish.yaml 137 | 138 | ![ChangingFish.0009.gif](../assets/v1-TrailBlazer/ChangingFish.0009.gif) 139 | 140 | Peekaboo/CrazyHorse.yaml 141 | 142 | ![CrazyHorse.0007.gif](../assets/v1-TrailBlazer/CrazyHorse.0007.gif) 143 | 144 | Peekaboo/FastDog.yaml 145 | 146 | ![FastDog.0003.gif](../assets/v1-TrailBlazer/FastDog.0003.gif) 147 | 148 | Peekaboo/TinyFish.yaml 149 | 150 | ![TinyFish.0008.gif](../assets/v1-TrailBlazer/TinyFish.0008.gif) 151 | 152 | ## (Supp Fig.4) Extreme comparison - Peekaboo 153 | 154 | Peekaboo/2ndKeyFast.yaml 155 | 156 | ![2ndKeyFast.0000.gif](../assets/v1-Peekaboo/2ndKeyFast.0000.gif) 157 | 158 | Peekaboo/ChangingFish.yaml 159 | 160 | ![ChangingFish.0000.gif](../assets/v1-Peekaboo/ChangingFish.0000.gif) 161 | 162 | Peekaboo/CrazyHorse.yaml 163 | 164 | ![CrazyHorse.0000.gif](../assets/v1-Peekaboo/CrazyHorse.0000.gif) 165 | 166 | Peekaboo/FastDog.yaml 167 | 168 | ![FastDog.0000.gif](../assets/v1-Peekaboo/FastDog.0000.gif) 169 | 170 | Peekaboo/TinyFish.yaml 171 | 172 | ![TinyFish.0000.gif](../assets/v1-Peekaboo/TinyFish.0000.gif) 173 | -------------------------------------------------------------------------------- /doc/Command.md: -------------------------------------------------------------------------------- 1 | # TrailBlazer: Commands 2 | 3 | Here we illustrate how to make your own config under the current implementation. 4 | Please use the following commands to run our TrailBlazer. The config structure 5 | is detailed [here](Config.md). 6 | 7 | ## bin/CmdTrailBlazer.py 8 | 9 | ```bash 10 | python bin/CmdTrailBlazer.py --config ${YOUR_CONFIG_FILEPATH} -mr ${YOUR_MODEL_ROOT} 11 | e.g., python bin/CmdTrailBlazer.py --config config/Main/PerspBR2TL-Tiger.yaml -mr ${YOUR_MODEL_ROOT} 12 | ``` 13 | 14 | If you find the flag -mr (--model-root) cumbersome, you can create a shell 15 | environment variable named ZEROSCOPE_MODEL_ROOT so that the flag can be omitted. 16 | It is the folder path that contains cerspense/zeroscope_v2_576w. 17 | 18 | ```bash 19 | export ZEROSCOPE_MODEL_ROOT=/your/huggingface/model/root/ 20 | # Note: then we expect the zeroscope model is here /your/huggingface/model/root/cerspense/zeroscope_v2_576w/ 21 | ``` 22 | 23 | The above command then becomes 24 | 25 | ```bash 26 | python bin/CmdTrailBlazer.py --config ${YOUR_CONFIG_FILEPATH} 27 | e.g., python bin/CmdTrailBlazer.py --config config/Main/PerspBR2TL-Tiger.yaml 28 | ``` 29 | 30 | For your convenience, you could run the following command to get the results 31 | from all config YAMLs in the given folder: 32 | 33 | ```bash 34 | python bin/CmdTrailBlazer.py --config ${YOUR_CONFIG_FOLDER} 35 | # say, you want to execute all yaml files in the provided config folder 36 | e.g., python bin/CmdTrailBlazer.py --config config/Main/ 37 | ``` 38 | 39 | ## bin/CmdTrailBlazerMulti.py 40 | 41 | As outlined in the paper, the individual objects are initially generated 42 | independently before being integrated. For example, if one wishes to guide 43 | both a dog and a cat based on the prompt "a dog and a cat running in the park," 44 | each needs to be processed separately using our **CmdTrailBlazer** command: 45 | 46 | ```bash 47 | python bin/CmdTrailBlazer.py --config config/Multi/MultiSubject-Dog.yaml 48 | python bin/CmdTrailBlazer.py --config config/Multi/MultiSubject-Cat.yaml 49 | ``` 50 | 51 | Upon execution, you will receive the reconstructed video as usual. Additionally, 52 | a .pt file will be generated containing the bounding box at each frame and the 53 | latent vectors at each step. For example: 54 | 55 | ```bash 56 | /tmp/TrailBlazer/MultiSubject-Dog.0000.pt 57 | /tmp/TrailBlazer/MultiSubject-Cat.0000.pt 58 | ```
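If you are curious what such a subject file holds, the hedged sketch below shows one way to peek inside it; the key names and layout printed are whatever the TrailBlazer implementation saved, which this sketch deliberately does not assume:

```python
import torch

# Hypothetical inspection of a subject .pt file produced by CmdTrailBlazer.
# We only assume the file unpickles to a Python object; its actual structure
# (dict keys, tensor shapes) is defined by the TrailBlazer implementation.
data = torch.load("/tmp/TrailBlazer/MultiSubject-Dog.0000.pt", map_location="cpu")
if isinstance(data, dict):
    for key, value in data.items():
        print(key, type(value).__name__, getattr(value, "shape", ""))
else:
    print(type(data).__name__)
```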
59 | 60 | After placing these .pt files in the "subjects" key within the configuration, 61 | proceed to execute the distinct command **CmdTrailBlazerMulti** to obtain the final 62 | result. 63 | 64 | ```bash 65 | python bin/CmdTrailBlazerMulti.py --config config/Multi/MultiSubjects.yaml 66 | ``` 67 | 68 | ## bin/CmdPeekaboo.py 69 | 70 | We expose the entry point of Peekaboo through our executable script 71 | **CmdPeekaboo**, making their implementation compatible with our configuration 72 | files. Please check out the page [Peekaboo.md](Peekaboo.md) for more 73 | information about the reproducibility, and [Config.md](Config.md) for the 74 | configuration design for Peekaboo. 75 | 76 | ```bash 77 | python bin/CmdPeekaboo.py --config config/config.yaml 78 | ``` 79 | 80 | Note that the Peekaboo implementation uses full precision (i.e., torch.float; see 82 | [src/generate.py#L76](https://github.com/microsoft/Peekaboo/blob/main/src/generate.py#L76)), 83 | which exceeds our VRAM when running the batch execution (i.e., passing a folder path to the 84 | --config flag). If you want to run all configs under a specific folder, here is an 85 | alternative way: 86 | 87 | ```bash 88 | for f in config/*.yaml; do python bin/CmdPeekaboo.py --config $f; done 89 | ``` 90 | 91 | Get some coffee after executing it :) 92 | -------------------------------------------------------------------------------- /doc/Config.md: -------------------------------------------------------------------------------- 1 | # TrailBlazer: Config 2 | 3 | ## Single object 4 | 5 | The following example pseudo-config covers what is needed to run our TrailBlazer for 6 | a single object.
7 | 8 | ```yaml 9 | keyframe: 10 | - bbox_ratios: 11 | - 0.5 12 | - 0.5 13 | - 1.0 14 | - 1.0 15 | frame: 0 16 | prompt: A tiger walking alone down the street 17 | - bbox_ratios: 18 | - 0.0 19 | - 0.0 20 | - 0.1 21 | - 0.1 22 | frame: 24 23 | prompt: A tiger walking alone down the street 24 | seed: 123451232531 25 | token_inds: 26 | - 2 27 | width: 576 28 | height: 320 29 | num_inference_steps: 30 | trailblazer: 31 | num_dd_spatial_steps: 5 32 | num_dd_temporal_steps: 5 33 | spatial_strengthen_scale: 0.15 34 | spatial_weaken_scale: 0.001 35 | temp_strengthen_scale: 0.15 36 | temp_weaken_scale: 0.001 37 | trailing_length: 13 38 | text2vidzero: 39 | motion_field_strength_x: -8 40 | motion_field_strength_y: 0 41 | peekaboo: 42 | frozen_steps: 2 43 | ``` 44 | 45 | Some requirements: 46 | 47 | - At least two keyframes are required, for the initial and the end frame. The 48 | initial frame index must be 0. 49 | 50 | - It's recommended to set the end frame at 24, as this is how the ZeroScope model was 51 | trained (see [here](https://zeroscope.replicate.dev/)). In this example, the 52 | second keyframe is set at 24. 53 | 54 | - Each keyframe contains bbox_ratios, frame, and prompt. The consistency between 55 | each component should be maintained conceptually by the user. 56 | 57 | - In our experience, the trailing_length is a parameter that needs frequent 58 | adjustment for optimal results. 59 | 60 | - The tuple of floats in the bbox is the left, top, right, and bottom of the 61 | boundary relative to the normalized image space between 0 and 1. The bbox 62 | should be reasonably specified, i.e., b_left < b_right and b_top < b_bottom in 63 | OpenCV style (e.g., 0.0,0.0,0.5,0.5 is the top-left, i.e., second, quadrant); see the sketch at the end of this page. 64 | 65 | - There are three sections in the config called **trailblazer**, 66 | **text2vidzero**, and **peekaboo**. The arguments under each category are the 67 | hyper-parameters of each method. For text2vidzero, please check out the external 68 | [link](https://huggingface.co/docs/diffusers/en/api/pipelines/text_to_video_zero). 69 | 70 | - For Peekaboo, the method used for the main comparison in our paper, the key 71 | frozen_steps is the only hyper-parameter in the 72 | [implementation](https://github.com/microsoft/Peekaboo/blob/main/src/generate.py#L30). 73 | Please refer to [Peekaboo.md](Peekaboo.md) for more information. 74 | 75 | 76 | ## Multiple object 77 | 78 | The multiple-object generation config needs the following: 79 | 80 | ```yaml 81 | multisubs: 82 | seed: 12345611 83 | num_integration_steps: 20 84 | prompt: a white cat and a yellow dog running in the botanic garden 85 | subjects: 86 | - /tmp/TrailBlazer/MultiSubject-Dog.0000.pt 87 | - /tmp/TrailBlazer/MultiSubject-Cat.0000.pt 88 | ``` 89 | 90 | The **num_integration_steps** key is the number of steps used for integrating 91 | the latents between the subjects listed under the **subjects** key, obtained from the 92 | single-object generation in TrailBlazer. 93 |
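To make the bbox_ratios convention above concrete, here is a minimal sketch of the normalized-to-pixel conversion (a hypothetical helper written for this page, not a function shipped in this repository):

```python
# Hypothetical helper: convert a keyframe's normalized bbox_ratios tuple
# (left, top, right, bottom) into pixel coordinates for a given frame size,
# e.g. the 576x320 used in the single-object example above.
def bbox_to_pixels(bbox_ratios, width=576, height=320):
    left, top, right, bottom = bbox_ratios
    assert 0.0 <= left < right <= 1.0, "expects b_left < b_right"
    assert 0.0 <= top < bottom <= 1.0, "expects b_top < b_bottom"
    return (int(left * width), int(top * height),
            int(right * width), int(bottom * height))

# (0.0, 0.0, 0.5, 0.5) covers the top-left quadrant of the frame:
print(bbox_to_pixels((0.0, 0.0, 0.5, 0.5)))  # -> (0, 0, 288, 160)
```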
-------------------------------------------------------------------------------- /doc/Gradio.md: -------------------------------------------------------------------------------- 1 | # Gradio App 2 | 3 | TrailBlazer provides the web app on Huggingface Space using Gradio 4 | ([link](https://huggingface.co/spaces/hohonu-vicml/Trailblazer)). The following 5 | screenshot is the overview of the page: ![gradio](../assets/gradio/gradio.jpg) 6 | 7 | Alternatively, you can launch the app from the terminal, then open it in your 8 | local browser with the provided URL (e.g., http://127.0.0.1:xxxx) 9 | 10 | ```bash 11 | python bin/CmdGradio.py ${MODEL_ROOT} 12 | ``` 13 | 14 | 15 | We also offer a handy tool for constructing bounding boxes through drawing. You 16 | can find it on the second tab called "SketchPadHelper." To use it, simply choose 17 | the pen/eraser (1.) to draw a single box on the canvas. Afterward, press the 18 | green check button (2.), shown grayed out in the screenshot. Then 19 | TrailBlazer will process it, displaying the bbox string in (4.) and a preview in 20 | the next canvas. 21 | 22 | Please keep in mind that our implementation of bbox is linked to the layering 23 | located at (3.). If you have multiple bboxes to process, you can press the + 24 | sign, select the new layer, and then start drawing. 25 | 26 | ![gradio](../assets/gradio/gradio-bbox.jpg) 27 | 28 | Similar to GLIGEN (CVPR 2023), we noticed that Gradio doesn't have a convenient feature 29 | for drawing bounding boxes (see their doc 30 | [here](https://github.com/gligen/GLIGEN/tree/master/demo#question-how-do-you-draw-bounding-boxes-using-gradio-sketchpad)). 31 | However, the latest Gradio version 4.x.x is incompatible with their 32 | implementation, so we adopt a similar approach that examines the minimum and 33 | maximum values of the user's drawing on each layer to generate the bounding box 34 | information. 35 |
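The sketch below illustrates that min/max idea, assuming each layer arrives as a binary mask of drawn pixels (hypothetical code for illustration, not the app's actual implementation):

```python
import numpy as np

# Hypothetical illustration of the min/max approach: given a binary mask of
# the user's strokes on one layer, take the extremes of the drawn pixels as
# the bounding box, then normalize by the canvas size.
def layer_to_bbox_ratios(mask: np.ndarray):
    ys, xs = np.nonzero(mask)   # coordinates of the drawn pixels
    if len(xs) == 0:
        return None             # nothing drawn on this layer
    h, w = mask.shape
    return (xs.min() / w, ys.min() / h, xs.max() / w, ys.max() / h)

canvas = np.zeros((320, 576), dtype=bool)
canvas[80:160, 100:300] = True  # pretend the user drew a box here
print(layer_to_bbox_ratios(canvas))  # approx (0.17, 0.25, 0.52, 0.50)
```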
-------------------------------------------------------------------------------- /doc/Peekaboo.md: -------------------------------------------------------------------------------- 1 | # TrailBlazer: Peekaboo comparison 2 | 3 | In the TrailBlazer repository, we integrate the latest Peekaboo 4 | development for better and easier comparison. We clone Peekaboo 5 | at commit 6564274d5329644b51c75f4e4f6f86d56edf96a9 as one of our 6 | package modules, [**TrailBlazer/Baseline/Peekaboo**](../TrailBlazer/Baseline). 7 | 8 | To download their repository, please do: 9 | ```bash 10 | # at TrailBlazer root 11 | git clone https://github.com/microsoft/Peekaboo.git TrailBlazer/Baseline/Peekaboo 12 | # just make sure it is at that commit 13 | cd TrailBlazer/Baseline/Peekaboo && git checkout 6564274d5329644b51c75f4e4f6f86d56edf96a9 14 | ``` 15 | 16 | ## CmdPeekaboo 17 | 18 | 19 | For your convenience, we've configured **CmdPeekaboo** as the entry point for 20 | Peekaboo, based on their 21 | [src/generate.py](https://github.com/microsoft/Peekaboo/blob/main/src/generate.py). 22 | This command accepts our configurations, allowing you to easily manipulate the bbox, 23 | which is hard-coded in their implementation 24 | [here](https://github.com/microsoft/Peekaboo/blob/main/src/generate.py#L92). To 25 | run Peekaboo, simply follow our convention: 26 | 27 | ```bash 28 | # assume ZEROSCOPE_MODEL_ROOT is set 29 | python bin/CmdPeekaboo.py --config config/config.yaml 30 | ``` 31 | 32 | 33 | To ensure reproducibility, you can replace the following code at their 34 | [src/generate.py#L74](https://github.com/microsoft/Peekaboo/blob/main/src/generate.py#L74) 35 | so that the model can be loaded from a customized path: 36 | 37 | ```python 38 | if args.model == "zeroscope": 39 | model_id = "cerspense/zeroscope_v2_576w" 40 | model_folder = "/your/model/root" 41 | model_path = os.path.join(model_folder, model_id) 42 | pipe = TextToVideoSDPipelineSpatialAware.from_pretrained( 43 | model_path, torch_dtype=torch.float, variant="fp32" 44 | ).to(torch_device) 45 | ``` 46 | 47 | Then run the suggested command from their page: 48 | 49 | ```bash 50 | python src/generate.py --model zeroscope --prompt "A panda eating bamboo in a lush bamboo forest" --fg_object "panda" 51 | # Result: src/demo/zeroscope/A_panda_eating_bamboo_in_a_lush_bamboo_forest/2_of_50_2_peekaboo.mp4 52 | ``` 53 | 54 | Then run the refurbished alternative, CmdPeekaboo, with the config that mimics the 55 | default bbox in their code: 56 | 57 | ```bash 58 | python bin/CmdPeekaboo.py --config config/Peekaboo/Peekaboo-Reproduce.yaml 59 | # Result: /tmp/Peekaboo/Peekaboo-Reproduce.0000.mp4 60 | ``` 61 | 62 | ## Visual comparison 63 | 64 | Here we show the visual comparison between the original Peekaboo implementation, our 65 | CmdPeekaboo reproduction, and our TrailBlazer method. Specifically: 66 | 67 | The following result is produced by src/generate.py, the original Peekaboo command: 68 | 69 | 70 | 71 | This is the reproduction from our CmdPeekaboo, using 72 | config/Peekaboo/Peekaboo-Reproduce.yaml. Please be aware that the noticeable 73 | difference may stem from the fact that we employ static bounding boxes 74 | without the jittering in their implementation. Nonetheless, both results appear 75 | similar. 76 | 77 | 78 | 79 | If you're curious, here is the result obtained from the TrailBlazer command, CmdTrailBlazer: 80 | 81 | 82 | 83 | The corresponding masks used in the default setting, generated from 84 | [src/generate.py#L110](https://github.com/microsoft/Peekaboo/blob/main/src/generate.py#L110), are: 85 | 86 | 87 | 88 | Sweet! 89 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # TrailBlazer: Wiki 2 | 3 | Here we have a couple of pages describing the features of this repository to 4 | support our TrailBlazer research. It includes: 5 | 6 | * [**Command.md**](Command.md): To showcase the commands used in TrailBlazer 7 | from scratch. 8 | 9 | * [**Config.md**](Config.md): To illustrate the config structure for 10 | single/multiple-object synthesis. For single-object synthesis, it supports 11 | T2V-Zero, Peekaboo (new), and our TrailBlazer. 12 | 13 | * [**Gradio.md**](Gradio.md): A visual guide to see how TrailBlazer works in the 14 | Gradio app. 15 | 16 | * [**Peekaboo.md**](Peekaboo.md): To show how we integrate Peekaboo into our framework. 17 | --------------------------------------------------------------------------------