├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── assets ├── AnyV2V-SlidesShow-GIF-1080P-02.gif └── AnyV2V-SlidesShow-MP4-1080P.mp4 ├── black_box_image_edit ├── __init__.py ├── cosxl │ ├── custom_pipeline.py │ └── utils.py ├── cosxl_edit.py ├── instantstyle.py ├── instructpix2pix.py ├── ip_adapter │ ├── __init__.py │ ├── attention_processor.py │ ├── ip_adapter.py │ ├── resampler.py │ └── utils.py └── utils.py ├── cog.yaml ├── consisti2v ├── README.md ├── configs │ ├── pipeline_256 │ │ ├── ddim_inversion_256.yaml │ │ └── pnp_edit.yaml │ └── pipeline_512 │ │ ├── ddim_inversion_512.yaml │ │ └── pnp_edit.yaml ├── consisti2v │ ├── data │ │ └── dataset.py │ ├── models │ │ ├── rotary_embedding.py │ │ ├── videoldm_attention.py │ │ ├── videoldm_transformer_blocks.py │ │ ├── videoldm_unet.py │ │ └── videoldm_unet_blocks.py │ ├── pipelines │ │ ├── pipeline_autoregress_animation.py │ │ ├── pipeline_conditional_animation.py │ │ └── pipeline_video_editing.py │ └── utils │ │ ├── frameinit_utils.py │ │ └── util.py ├── ddim_inverse_scheduler.py ├── environment.yaml ├── pnp_utils.py ├── run_ddim_inversion.py ├── run_pnp_edit.py └── utils.py ├── demo ├── A Couple In A Public Display Of Affection.mp4 ├── A Couple In A Public Display Of Affection │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ ├── Sketch style.png │ │ └── Snowing.png ├── A kitten turning its head on a wooden floor.mp4 ├── A kitten turning its head on a wooden floor │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ └── A dog turning its head on a wooden floor.png ├── An Old Man Doing Exercises For The Body And Mind.mp4 ├── An Old Man Doing Exercises For The Body And Mind │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ ├── add a party hat on his head.png │ │ ├── cyberpunk style.png │ │ ├── give him a punk hair style.png │ │ ├── helmet.png │ │ ├── hinton.png │ │ ├── jack ma.png │ │ ├── starry night style.png │ │ ├── turn his hair white.png │ │ └── turn man into robot.png ├── Ballet.mp4 ├── Ballet │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ └── van gogh style.png ├── Man Walking.mp4 ├── Man Walking │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ ├── ElonMusk_02.png │ │ ├── Yann LeCun Walking.png │ │ ├── add a cowboy hat.png │ │ ├── change his clothes to red.png │ │ ├── policeman costume.png 
│ │ ├── turn him into an astronaut.png │ │ ├── turn him into batman.png │ │ └── turn the man into darth vader.png ├── Your-Video-Name │ └── edited_first_frame │ │ └── Your-edited-first-frame └── Your-Video-mp4 ├── edit_image.py ├── gradio_demo.py ├── gradio_demo_cosxl.py ├── gradio_demo_style.py ├── i2vgen-xl ├── __init__.py ├── configs │ ├── group_ddim_inversion │ │ ├── group_config.json │ │ └── template.yaml │ └── group_pnp_edit │ │ ├── group_config.json │ │ └── template.yaml ├── demo.ipynb ├── environment.yml ├── pipelines │ ├── __init__.py │ └── pipeline_i2vgen_xl.py ├── pnp_utils.py ├── run_group_ddim_inversion.py ├── run_group_pnp_edit.py ├── scripts │ ├── run_group_ddim_inversion.sh │ └── run_group_pnp_edit.sh └── utils.py ├── predict.py ├── prepare_video.py └── seine ├── README.md ├── configs ├── ddim_inversion.yaml └── pnp_edit.yaml ├── datasets └── video_transforms.py ├── diffusion ├── __init__.py ├── diffusion_utils.py ├── gaussian_diffusion.py ├── respace.py └── timestep_sampler.py ├── models ├── __init__.py ├── attention.py ├── clip.py ├── resnet.py ├── unet.py ├── unet_blocks.py └── utils.py ├── pnp_utils.py ├── requirement.txt ├── run_ddim_inversion.py ├── run_pnp_edit.py └── seine_utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Version [e.g. 22] 29 | 30 | **Software Packages (please complete the following information):** 31 | - Torch version: 32 | - Diffusers version: 33 | 34 | **Additional context** 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Extra 2 | _demo_temp 3 | sdxl_models 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | m3ku@uwaterloo.ca. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 TIGER Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /assets/AnyV2V-SlidesShow-GIF-1080P-02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/assets/AnyV2V-SlidesShow-GIF-1080P-02.gif -------------------------------------------------------------------------------- /assets/AnyV2V-SlidesShow-MP4-1080P.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/assets/AnyV2V-SlidesShow-MP4-1080P.mp4 -------------------------------------------------------------------------------- /black_box_image_edit/__init__.py: -------------------------------------------------------------------------------- 1 | from .instructpix2pix import InstructPix2Pix, MagicBrush 2 | from .cosxl_edit import CosXLEdit 3 | 4 | from typing import Union, Optional, Tuple 5 | import numpy as np 6 | from PIL import Image, ImageOps 7 | import os 8 | import requests 9 | 10 | 11 | 12 | 13 | def load_image(image: Union[str, Image.Image], format: str = "RGB", size: Optional[Tuple] = None) -> Image.Image: 14 | """ 15 | Load an image from a given path or URL and convert it to a PIL Image. 16 | 17 | Args: 18 | image (Union[str, Image.Image]): The image path, URL, or a PIL Image object to be loaded. 19 | format (str, optional): Desired color format of the resulting image. Defaults to "RGB". 20 | size (Optional[Tuple], optional): Desired size for resizing the image. Defaults to None. 21 | 22 | Returns: 23 | Image.Image: A PIL Image in the specified format and size. 24 | 25 | Raises: 26 | ValueError: If the provided image format is not recognized. 
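    Example (illustrative sketch; the frame path and target size below are placeholders
    taken from this repo's demo assets):

        >>> frame = load_image("demo/Ballet/00000.png", format="RGB", size=(512, 512))
        >>> frame.size
        (512, 512)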
27 | """ 28 | if isinstance(image, str): 29 | if image.startswith("http://") or image.startswith("https://"): 30 | image = Image.open(requests.get(image, stream=True).raw) 31 | elif os.path.isfile(image): 32 | image = Image.open(image) 33 | else: 34 | raise ValueError( 35 | f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" 36 | ) 37 | elif isinstance(image, Image.Image): 38 | image = image 39 | else: 40 | raise ValueError( 41 | "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." 42 | ) 43 | image = ImageOps.exif_transpose(image) 44 | image = image.convert(format) 45 | if (size != None): 46 | image = image.resize(size, Image.LANCZOS) 47 | return image 48 | -------------------------------------------------------------------------------- /black_box_image_edit/cosxl/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | 5 | def set_timesteps_patched(self, num_inference_steps: int, device = None): 6 | self.num_inference_steps = num_inference_steps 7 | 8 | ramp = np.linspace(0, 1, self.num_inference_steps) 9 | sigmas = torch.linspace(math.log(self.config.sigma_min), math.log(self.config.sigma_max), len(ramp)).exp().flip(0) 10 | 11 | sigmas = (sigmas).to(dtype=torch.float32, device=device) 12 | self.timesteps = self.precondition_noise(sigmas) 13 | 14 | self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) 15 | self._step_index = None 16 | self._begin_index = None 17 | self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication 18 | -------------------------------------------------------------------------------- /black_box_image_edit/cosxl_edit.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import hf_hub_download 3 | import torch 4 | import PIL 5 | 6 | class CosXLEdit(): 7 | """ 8 | Edit Cos Stable Diffusion XL 1.0 Base is tuned to use a Cosine-Continuous EDM VPred schedule, and then upgraded to perform instructed image editing. 9 | Reference: https://huggingface.co/stabilityai/cosxl 10 | """ 11 | def __init__(self, device="cuda"): 12 | """ 13 | Attributes: 14 | pipe (CosStableDiffusionXLInstructPix2PixPipeline): The InstructPix2Pix pipeline for image transformation. 15 | 16 | Args: 17 | device (str, optional): Device on which the pipeline runs. Defaults to "cuda". 18 | """ 19 | from diffusers import EDMEulerScheduler 20 | from .cosxl.custom_pipeline import CosStableDiffusionXLInstructPix2PixPipeline 21 | from .cosxl.utils import set_timesteps_patched 22 | 23 | EDMEulerScheduler.set_timesteps = set_timesteps_patched 24 | edit_file = hf_hub_download(repo_id="stabilityai/cosxl", filename="cosxl_edit.safetensors") 25 | self.pipe = CosStableDiffusionXLInstructPix2PixPipeline.from_single_file( 26 | edit_file, num_in_channels=8 27 | ) 28 | self.pipe.scheduler = EDMEulerScheduler(sigma_min=0.002, sigma_max=120.0, sigma_data=1.0, prediction_type="v_prediction") 29 | self.pipe.to(device) 30 | 31 | def infer_one_image(self, src_image: PIL.Image.Image = None, src_prompt: str = None, target_prompt: str = None, instruct_prompt: str = None, seed: int = 42, negative_prompt=""): 32 | """ 33 | Modifies the source image based on the provided instruction prompt. 34 | 35 | Args: 36 | src_image (PIL.Image.Image): Source image in RGB format. 37 | instruct_prompt (str): Caption for editing the image. 
38 | seed (int, optional): Seed for random generator. Defaults to 42. 39 | 40 | Returns: 41 | PIL.Image.Image: The transformed image. 42 | """ 43 | src_image = src_image.convert('RGB') # force it to RGB format 44 | generator = torch.manual_seed(seed) 45 | 46 | resolution = 1024 47 | preprocessed_image = src_image.resize((resolution, resolution)) 48 | image = self.pipe(prompt=instruct_prompt, 49 | image=preprocessed_image, 50 | height=resolution, 51 | width=resolution, 52 | negative_prompt=negative_prompt, 53 | guidance_scale=7, 54 | num_inference_steps=20, 55 | generator=generator).images[0] 56 | image = image.resize((src_image.width, src_image.height)) 57 | 58 | return image 59 | -------------------------------------------------------------------------------- /black_box_image_edit/instantstyle.py: -------------------------------------------------------------------------------- 1 | from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline 2 | import cv2 3 | import torch 4 | import PIL 5 | import numpy as np 6 | import os 7 | 8 | class InstantStyle(): 9 | def __init__(self, 10 | device="cuda", 11 | weight="stabilityai/stable-diffusion-xl-base-1.0", 12 | control_weight="diffusers/controlnet-canny-sdxl-1.0", 13 | custom_sdxl_models_folder="sdxl_models"): 14 | from .ip_adapter import IPAdapterXL 15 | 16 | controlnet = ControlNetModel.from_pretrained(control_weight, 17 | use_safetensors=False, 18 | torch_dtype=torch.float16).to(device) 19 | # load SDXL pipeline 20 | sdxl_control_pipe = StableDiffusionXLControlNetPipeline.from_pretrained( 21 | weight, 22 | controlnet=controlnet, 23 | torch_dtype=torch.float16, 24 | add_watermarker=False, 25 | ) 26 | sdxl_control_pipe.enable_vae_tiling() 27 | self.ip_model = IPAdapterXL(sdxl_control_pipe, 28 | os.path.join(custom_sdxl_models_folder, "image_encoder"), 29 | os.path.join(custom_sdxl_models_folder, "ip-adapter_sdxl.bin"), 30 | device, 31 | target_blocks=["up_blocks.0.attentions.1"]) 32 | 33 | 34 | def infer_one_image(self, src_image: PIL.Image.Image = None, 35 | style_image: PIL.Image.Image = None, 36 | prompt: str = "masterpiece, best quality, high quality", 37 | seed: int = 42, 38 | negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry"): 39 | 40 | src_image = src_image.convert('RGB') # force it to RGB format 41 | style_image = style_image.convert('RGB') # force it to RGB format 42 | 43 | def pil_to_cv2(image_pil): 44 | image_np = np.array(image_pil) 45 | image_cv2 = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR) 46 | 47 | return image_cv2 48 | # control image 49 | input_image = pil_to_cv2(src_image) 50 | detected_map = cv2.Canny(input_image, 50, 200) 51 | canny_map = PIL.Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_BGR2RGB)) 52 | 53 | # generate image 54 | if prompt is None: 55 | prompt = "masterpiece, best quality, high quality" 56 | image = self.ip_model.generate(pil_image=style_image, 57 | prompt=prompt, 58 | negative_prompt=negative_prompt, 59 | scale=1.0, 60 | guidance_scale=5, 61 | num_samples=1, 62 | num_inference_steps=30, 63 | seed=seed, 64 | image=canny_map, 65 | controlnet_conditioning_scale=0.6, 66 | )[0] 67 | return image -------------------------------------------------------------------------------- /black_box_image_edit/instructpix2pix.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import PIL 3 | 4 | from diffusers import StableDiffusionInstructPix2PixPipeline, 
EulerAncestralDiscreteScheduler 5 | 6 | class InstructPix2Pix(): 7 | """ 8 | A wrapper around the StableDiffusionInstructPix2PixPipeline for guided image transformation. 9 | 10 | This class uses the Pix2Pix pipeline to transform an image based on an instruction prompt. 11 | Reference: https://huggingface.co/docs/diffusers/api/pipelines/pix2pix 12 | """ 13 | def __init__(self, device="cuda", weight="timbrooks/instruct-pix2pix"): 14 | """ 15 | Attributes: 16 | pipe (StableDiffusionInstructPix2PixPipeline): The Pix2Pix pipeline for image transformation. 17 | 18 | Args: 19 | device (str, optional): Device on which the pipeline runs. Defaults to "cuda". 20 | weight (str, optional): Pretrained weights for the model. Defaults to "timbrooks/instruct-pix2pix". 21 | """ 22 | self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( 23 | weight, 24 | torch_dtype=torch.float16, 25 | safety_checker=None, 26 | ).to(device) 27 | self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config( 28 | self.pipe.scheduler.config) 29 | 30 | def infer_one_image(self, src_image: PIL.Image.Image = None, src_prompt: str = None, target_prompt: str = None, instruct_prompt: str = None, seed: int = 42, negative_prompt=""): 31 | """ 32 | Modifies the source image based on the provided instruction prompt. 33 | 34 | Args: 35 | src_image (PIL.Image.Image): Source image in RGB format. 36 | instruct_prompt (str): Caption for editing the image. 37 | seed (int, optional): Seed for random generator. Defaults to 42. 38 | 39 | Returns: 40 | PIL.Image.Image: The transformed image. 41 | """ 42 | src_image = src_image.convert('RGB') # force it to RGB format 43 | generator = torch.manual_seed(seed) 44 | 45 | # configs from https://github.com/timothybrooks/instruct-pix2pix/blob/main/edit_cli.py 46 | image = self.pipe(instruct_prompt, image=src_image, 47 | num_inference_steps=100, 48 | image_guidance_scale=1.5, 49 | guidance_scale=7.5, 50 | negative_prompt=negative_prompt, 51 | generator=generator 52 | ).images[0] 53 | return image 54 | 55 | class MagicBrush(InstructPix2Pix): 56 | def __init__(self, device="cuda", weight="vinesmsuic/magicbrush-jul7"): 57 | """ 58 | A class for MagicBrush. 59 | 60 | Args: 61 | device (str, optional): The device on which the model should run. Default is "cuda". 62 | weight (str, optional): The pretrained model weights for MagicBrush. Default is "vinesmsuic/magicbrush-jul7". 
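        Example (illustrative sketch; `img` is a placeholder for a PIL image loaded elsewhere,
        and the instruction is taken from this repo's demo edits):

            >>> editor = MagicBrush(device="cuda")
            >>> edited = editor.infer_one_image(src_image=img, instruct_prompt="add a party hat on his head", seed=42)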
63 | """ 64 | super().__init__(device=device, weight=weight) 65 | 66 | def infer_one_image(self, src_image: PIL.Image.Image = None, src_prompt: str = None, target_prompt: str = None, instruct_prompt: str = None, seed: int = 42, negative_prompt=""): 67 | return super().infer_one_image(src_image, src_prompt, target_prompt, instruct_prompt, seed, negative_prompt) -------------------------------------------------------------------------------- /black_box_image_edit/ip_adapter/__init__.py: -------------------------------------------------------------------------------- 1 | from .ip_adapter import IPAdapter, IPAdapterPlus, IPAdapterPlusXL, IPAdapterXL, IPAdapterFull 2 | 3 | __all__ = [ 4 | "IPAdapter", 5 | "IPAdapterPlus", 6 | "IPAdapterPlusXL", 7 | "IPAdapterXL", 8 | "IPAdapterFull", 9 | ] 10 | -------------------------------------------------------------------------------- /black_box_image_edit/ip_adapter/resampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py 2 | # and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py 3 | 4 | import math 5 | 6 | import torch 7 | import torch.nn as nn 8 | from einops import rearrange 9 | from einops.layers.torch import Rearrange 10 | 11 | 12 | # FFN 13 | def FeedForward(dim, mult=4): 14 | inner_dim = int(dim * mult) 15 | return nn.Sequential( 16 | nn.LayerNorm(dim), 17 | nn.Linear(dim, inner_dim, bias=False), 18 | nn.GELU(), 19 | nn.Linear(inner_dim, dim, bias=False), 20 | ) 21 | 22 | 23 | def reshape_tensor(x, heads): 24 | bs, length, width = x.shape 25 | # (bs, length, width) --> (bs, length, n_heads, dim_per_head) 26 | x = x.view(bs, length, heads, -1) 27 | # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) 28 | x = x.transpose(1, 2) 29 | # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) 30 | x = x.reshape(bs, heads, length, -1) 31 | return x 32 | 33 | 34 | class PerceiverAttention(nn.Module): 35 | def __init__(self, *, dim, dim_head=64, heads=8): 36 | super().__init__() 37 | self.scale = dim_head**-0.5 38 | self.dim_head = dim_head 39 | self.heads = heads 40 | inner_dim = dim_head * heads 41 | 42 | self.norm1 = nn.LayerNorm(dim) 43 | self.norm2 = nn.LayerNorm(dim) 44 | 45 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 46 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 47 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 48 | 49 | def forward(self, x, latents): 50 | """ 51 | Args: 52 | x (torch.Tensor): image features 53 | shape (b, n1, D) 54 | latent (torch.Tensor): latent features 55 | shape (b, n2, D) 56 | """ 57 | x = self.norm1(x) 58 | latents = self.norm2(latents) 59 | 60 | b, l, _ = latents.shape 61 | 62 | q = self.to_q(latents) 63 | kv_input = torch.cat((x, latents), dim=-2) 64 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 65 | 66 | q = reshape_tensor(q, self.heads) 67 | k = reshape_tensor(k, self.heads) 68 | v = reshape_tensor(v, self.heads) 69 | 70 | # attention 71 | scale = 1 / math.sqrt(math.sqrt(self.dim_head)) 72 | weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards 73 | weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) 74 | out = weight @ v 75 | 76 | out = out.permute(0, 2, 1, 3).reshape(b, l, -1) 77 | 78 | return self.to_out(out) 79 | 80 | 81 | class Resampler(nn.Module): 82 | def __init__( 83 | self, 84 | dim=1024, 85 | 
depth=8, 86 | dim_head=64, 87 | heads=16, 88 | num_queries=8, 89 | embedding_dim=768, 90 | output_dim=1024, 91 | ff_mult=4, 92 | max_seq_len: int = 257, # CLIP tokens + CLS token 93 | apply_pos_emb: bool = False, 94 | num_latents_mean_pooled: int = 0, # number of latents derived from mean pooled representation of the sequence 95 | ): 96 | super().__init__() 97 | self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None 98 | 99 | self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 100 | 101 | self.proj_in = nn.Linear(embedding_dim, dim) 102 | 103 | self.proj_out = nn.Linear(dim, output_dim) 104 | self.norm_out = nn.LayerNorm(output_dim) 105 | 106 | self.to_latents_from_mean_pooled_seq = ( 107 | nn.Sequential( 108 | nn.LayerNorm(dim), 109 | nn.Linear(dim, dim * num_latents_mean_pooled), 110 | Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled), 111 | ) 112 | if num_latents_mean_pooled > 0 113 | else None 114 | ) 115 | 116 | self.layers = nn.ModuleList([]) 117 | for _ in range(depth): 118 | self.layers.append( 119 | nn.ModuleList( 120 | [ 121 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 122 | FeedForward(dim=dim, mult=ff_mult), 123 | ] 124 | ) 125 | ) 126 | 127 | def forward(self, x): 128 | if self.pos_emb is not None: 129 | n, device = x.shape[1], x.device 130 | pos_emb = self.pos_emb(torch.arange(n, device=device)) 131 | x = x + pos_emb 132 | 133 | latents = self.latents.repeat(x.size(0), 1, 1) 134 | 135 | x = self.proj_in(x) 136 | 137 | if self.to_latents_from_mean_pooled_seq: 138 | meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool)) 139 | meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq) 140 | latents = torch.cat((meanpooled_latents, latents), dim=-2) 141 | 142 | for attn, ff in self.layers: 143 | latents = attn(x, latents) + latents 144 | latents = ff(latents) + latents 145 | 146 | latents = self.proj_out(latents) 147 | return self.norm_out(latents) 148 | 149 | 150 | def masked_mean(t, *, dim, mask=None): 151 | if mask is None: 152 | return t.mean(dim=dim) 153 | 154 | denom = mask.sum(dim=dim, keepdim=True) 155 | mask = rearrange(mask, "b n -> b n 1") 156 | masked_t = t.masked_fill(~mask, 0.0) 157 | 158 | return masked_t.sum(dim=dim) / denom.clamp(min=1e-5) 159 | -------------------------------------------------------------------------------- /black_box_image_edit/ip_adapter/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from PIL import Image 5 | 6 | attn_maps = {} 7 | def hook_fn(name): 8 | def forward_hook(module, input, output): 9 | if hasattr(module.processor, "attn_map"): 10 | attn_maps[name] = module.processor.attn_map 11 | del module.processor.attn_map 12 | 13 | return forward_hook 14 | 15 | def register_cross_attention_hook(unet): 16 | for name, module in unet.named_modules(): 17 | if name.split('.')[-1].startswith('attn2'): 18 | module.register_forward_hook(hook_fn(name)) 19 | 20 | return unet 21 | 22 | def upscale(attn_map, target_size): 23 | attn_map = torch.mean(attn_map, dim=0) 24 | attn_map = attn_map.permute(1,0) 25 | temp_size = None 26 | 27 | for i in range(0,5): 28 | scale = 2 ** i 29 | if ( target_size[0] // scale ) * ( target_size[1] // scale) == attn_map.shape[1]*64: 30 | temp_size = (target_size[0]//(scale*8), target_size[1]//(scale*8)) 31 | break 32 | 33 | assert temp_size is not None, "temp_size 
cannot is None" 34 | 35 | attn_map = attn_map.view(attn_map.shape[0], *temp_size) 36 | 37 | attn_map = F.interpolate( 38 | attn_map.unsqueeze(0).to(dtype=torch.float32), 39 | size=target_size, 40 | mode='bilinear', 41 | align_corners=False 42 | )[0] 43 | 44 | attn_map = torch.softmax(attn_map, dim=0) 45 | return attn_map 46 | def get_net_attn_map(image_size, batch_size=2, instance_or_negative=False, detach=True): 47 | 48 | idx = 0 if instance_or_negative else 1 49 | net_attn_maps = [] 50 | 51 | for name, attn_map in attn_maps.items(): 52 | attn_map = attn_map.cpu() if detach else attn_map 53 | attn_map = torch.chunk(attn_map, batch_size)[idx].squeeze() 54 | attn_map = upscale(attn_map, image_size) 55 | net_attn_maps.append(attn_map) 56 | 57 | net_attn_maps = torch.mean(torch.stack(net_attn_maps,dim=0),dim=0) 58 | 59 | return net_attn_maps 60 | 61 | def attnmaps2images(net_attn_maps): 62 | 63 | #total_attn_scores = 0 64 | images = [] 65 | 66 | for attn_map in net_attn_maps: 67 | attn_map = attn_map.cpu().numpy() 68 | #total_attn_scores += attn_map.mean().item() 69 | 70 | normalized_attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map)) * 255 71 | normalized_attn_map = normalized_attn_map.astype(np.uint8) 72 | #print("norm: ", normalized_attn_map.shape) 73 | image = Image.fromarray(normalized_attn_map) 74 | 75 | #image = fix_save_attn_map(attn_map) 76 | images.append(image) 77 | 78 | #print(total_attn_scores) 79 | return images 80 | def is_torch2_available(): 81 | return hasattr(F, "scaled_dot_product_attention") 82 | 83 | def get_generator(seed, device): 84 | 85 | if seed is not None: 86 | if isinstance(seed, list): 87 | generator = [torch.Generator(device).manual_seed(seed_item) for seed_item in seed] 88 | else: 89 | generator = torch.Generator(device).manual_seed(seed) 90 | else: 91 | generator = None 92 | 93 | return generator -------------------------------------------------------------------------------- /black_box_image_edit/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from moviepy.editor import VideoFileClip 3 | import random 4 | from PIL import Image 5 | import numpy as np 6 | 7 | def crop_and_resize_video(input_video_path, output_folder, clip_duration=None, width=None, height=None, start_time=None, end_time=None, n_frames=16, center_crop=False, x_offset=0, y_offset=0, longest_to_width=False): # Load the video file 8 | video = VideoFileClip(input_video_path) 9 | 10 | # Calculate start and end times for cropping 11 | if clip_duration is not None: 12 | if start_time is not None: 13 | start_time = float(start_time) 14 | end_time = start_time + clip_duration 15 | elif end_time is not None: 16 | end_time = float(end_time) 17 | start_time = end_time - clip_duration 18 | else: 19 | # Default to random cropping if neither start nor end time is specified 20 | video_duration = video.duration 21 | if video_duration <= clip_duration: 22 | print(f"Skipping {input_video_path}: duration is less than or equal to the clip duration.") 23 | return 24 | max_start_time = video_duration - clip_duration 25 | start_time = random.uniform(0, max_start_time) 26 | end_time = start_time + clip_duration 27 | elif start_time is not None and end_time is not None: 28 | start_time = float(start_time) 29 | end_time = float(end_time) 30 | clip_duration = int(end_time - start_time) 31 | else: 32 | raise ValueError("Either clip_duration must be provided, or both start_time and end_time must be specified.") 33 | 34 | # Crop the 
video 35 | cropped_video = video.subclip(start_time, end_time) 36 | 37 | 38 | if center_crop: 39 | # Calculate scale to ensure the desired crop size fits within the video 40 | video_width, video_height = cropped_video.size 41 | scale_width = video_width / width 42 | scale_height = video_height / height 43 | if longest_to_width: 44 | scale = max(scale_width, scale_height) 45 | else: 46 | scale = min(scale_width, scale_height) 47 | 48 | # Resize video to ensure the crop area fits within the frame 49 | # This step ensures that the smallest dimension matches or exceeds 512 pixels 50 | new_width = int(video_width / scale) 51 | new_height = int(video_height / scale) 52 | resized_video = cropped_video.resize(newsize=(new_width, new_height)) 53 | print(f"Resized video to ({new_width}, {new_height})") 54 | 55 | # Calculate crop position with offset, ensuring the crop does not go out of bounds 56 | # The offset calculation needs to ensure that the cropping area remains within the video frame 57 | offset_x = int(((x_offset + 1) / 2) * (new_width - width)) # Adjusted for [-1, 1] scale 58 | offset_y = int(((y_offset + 1) / 2) * (new_height - height)) # Adjusted for [-1, 1] scale 59 | 60 | # Ensure offsets do not push the crop area out of the video frame 61 | offset_x = max(0, min(new_width - width, offset_x)) 62 | offset_y = max(0, min(new_height - height, offset_y)) 63 | 64 | # Apply center crop with offsets 65 | cropped_video = resized_video.crop(x1=offset_x, y1=offset_y, width=width, height=height) 66 | elif width and height: 67 | # Directly resize the video to specified width and height if no center crop is specified 68 | cropped_video = cropped_video.resize(newsize=(width, height)) 69 | 70 | 71 | # After resizing and cropping, set the frame rate to fps 72 | fps = n_frames // clip_duration 73 | final_video = cropped_video.set_fps(fps) 74 | 75 | # Prepare the output video path 76 | if not os.path.exists(output_folder): 77 | os.makedirs(output_folder) 78 | filename = os.path.basename(input_video_path) 79 | output_video_path = os.path.join(output_folder, filename) 80 | 81 | # Write the result to the output file 82 | final_video.write_videofile(output_video_path, codec='libx264', audio_codec='aac', fps=fps) 83 | print(f"Processed {input_video_path}, saved to {output_video_path}") 84 | return output_video_path 85 | 86 | 87 | def infer_video_prompt(model, video_path, output_dir, prompt, prompt_type="instruct", force_512=False, seed=42, negative_prompt="", overwrite=False): 88 | """ 89 | Processes videos from the input directory, resizes them to 512x512 before feeding into the model by first frame, 90 | and saves the processed video back to its original size in the output directory. 91 | 92 | Args: 93 | model: The video editing model. 94 | input_dir (str): Path to the directory containing input videos. 95 | output_dir (str): Path to the directory where processed videos will be saved. 96 | prompt (str): Instruction prompt for video editing. 
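    Example (illustrative sketch; the editor model, demo video, and prompt are placeholders
    based on the assets shipped in this repo's `demo/` folder):

        >>> editor = InstructPix2Pix(device="cuda")
        >>> infer_video_prompt(editor, "demo/Ballet.mp4", "demo/Ballet/edited_first_frame",
        ...                    prompt="van gogh style", prompt_type="instruct", force_512=True)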
97 | """ 98 | 99 | # Create the output directory if it does not exist 100 | if not os.path.exists(output_dir): 101 | os.makedirs(output_dir) 102 | 103 | video_clip = VideoFileClip(video_path) 104 | video_filename = os.path.basename(video_path) 105 | # filename_noext = os.path.splitext(video_filename)[0] 106 | 107 | # Create the output directory if it does not exist 108 | # final_output_dir = os.path.join(output_dir, filename_noext) 109 | final_output_dir = output_dir 110 | if not os.path.exists(final_output_dir): 111 | os.makedirs(final_output_dir) 112 | 113 | result_path = os.path.join(final_output_dir, prompt + ".png") 114 | 115 | # Check if result already exists 116 | if os.path.exists(result_path) and overwrite is False: 117 | print(f"Result already exists: {result_path}") 118 | return 119 | 120 | def process_frame(image): 121 | pil_image = Image.fromarray(image) 122 | if force_512: 123 | pil_image = pil_image.resize((512, 512), Image.LANCZOS) 124 | if prompt_type == "instruct": 125 | result = model.infer_one_image(pil_image, instruct_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 126 | else: 127 | result = model.infer_one_image(pil_image, target_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 128 | if force_512: 129 | result = result.resize(video_clip.size, Image.LANCZOS) 130 | return np.array(result) 131 | 132 | # Process only the first frame 133 | first_frame = video_clip.get_frame(0) # Get the first frame 134 | processed_frame = process_frame(first_frame) # Process the first frame 135 | 136 | 137 | #Image.fromarray(first_frame).save(os.path.join(final_output_dir, "00000.png")) 138 | Image.fromarray(processed_frame).save(result_path) 139 | print(f"Processed and saved the first frame: {result_path}") 140 | return result_path 141 | 142 | def infer_video_style(model, video_path, output_dir, style_image, prompt, force_512=False, seed=42, negative_prompt="", overwrite=False): 143 | if not os.path.exists(output_dir): 144 | os.makedirs(output_dir) 145 | 146 | video_clip = VideoFileClip(video_path) 147 | video_filename = os.path.basename(video_path) 148 | final_output_dir = output_dir 149 | if not os.path.exists(final_output_dir): 150 | os.makedirs(final_output_dir) 151 | 152 | result_path = os.path.join(final_output_dir, "style" + ".png") 153 | if os.path.exists(result_path) and overwrite is False: 154 | print(f"Result already exists: {result_path}") 155 | return 156 | def process_frame(image): 157 | pil_image = Image.fromarray(image) 158 | if force_512: 159 | pil_image = pil_image.resize((512, 512), Image.LANCZOS) 160 | result = model.infer_one_image(pil_image, 161 | style_image=style_image, 162 | prompt=prompt, 163 | seed=seed, 164 | negative_prompt=negative_prompt) 165 | if force_512: 166 | result = result.resize(video_clip.size, Image.LANCZOS) 167 | return np.array(result) 168 | # Process only the first frame 169 | first_frame = video_clip.get_frame(0) # Get the first frame 170 | processed_frame = process_frame(first_frame) # Process the first frame 171 | Image.fromarray(processed_frame).save(result_path) 172 | print(f"Processed and saved the first frame: {result_path}") 173 | return result_path -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | # set to true if your model requires a GPU 6 | gpu: true 7 | system_packages: 
8 | - "libgl1-mesa-glx" 9 | - "libglib2.0-0" 10 | python_version: "3.11" 11 | python_packages: 12 | - torch==2.0.1 13 | - torchvision==0.15.2 14 | - accelerate==0.27.2 15 | - diffusers==0.26.3 16 | - moviepy 17 | - transformers==4.38.1 18 | - omegaconf==2.3.0 19 | - opencv-python 20 | - imageio 21 | run: 22 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget 23 | predict: "predict.py:Predictor" 24 | -------------------------------------------------------------------------------- /consisti2v/README.md: -------------------------------------------------------------------------------- 1 | # AnyV2V(_ConsistI2V_) 2 | 3 | Our AnyV2V(_ConsistI2V_) is a standalone version. 4 | 5 | ## Setup for ConsistI2V 6 | 7 | ### Prepare Environment 8 | ``` 9 | conda env create -f environment.yaml 10 | conda activate consisti2v 11 | ``` 12 | 13 | ## AnyV2V 14 | 15 | **Note:** due to the lower training resolution of ConsistI2V (256x256), it might perform better on 256x256 inputs. We provide configurations for running on both 256x256 and 512x512. 16 | 17 | ### Run ConsistI2V DDIM Inversion to get the initial latent 18 | Usage Example: 19 | ```shell 20 | python run_ddim_inversion.py --config configs/pipeline_256/ddim_inversion_256.yaml video_path=/path/to/your_video.mp4 video_name=your_video 21 | ``` 22 | 23 | Saved latent goes to `./ddim_version` (can be configurated in `./configs/pipeline_256(512)/ddim_inversion_256(512).yaml`). 24 | 25 | ### Run AnyV2V with ConsistI2V 26 | 27 | Your need to prepare your edited image frame first. We provided an image editing script in the root folder of AnyV2V. 28 | 29 | Usage Example: 30 | ```shell 31 | python run_pnp_edit.py --config configs/pipeline_256/pnp_edit.yaml \ 32 | video_path=/path/to/your_video.mp4 \ 33 | video_name=your_video \ 34 | edited_first_frame_path=/path/to/edited_first_frame.png \ 35 | editing_prompt="" \ 36 | ddim_latents_path=/path/to/ddim_latents 37 | ``` 38 | 39 | Saved video goes to `./anyv2v_results` (can be configurated in `./configs/pipeline_256(512)/pnp_edit.yaml`). 
40 | -------------------------------------------------------------------------------- /consisti2v/configs/pipeline_256/ddim_inversion_256.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: False # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "ddim_inversion/${exp_name}" 9 | 10 | # Data 11 | image_size: [256, 256] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: null 16 | save_frames: True 17 | 18 | # DDIM settings 19 | n_frames: 16 20 | 21 | # DDIM inversion 22 | inverse_config: 23 | image_size: ${image_size} 24 | n_frames: ${n_frames} 25 | cfg_txt: 1.0 26 | cfg_img: 1.0 27 | frame_stride: 3 28 | prompt: "" 29 | negative_prompt: "" 30 | n_steps: 500 31 | output_dir: "outputs/${exp_name}" 32 | 33 | # DDIM reconstruction 34 | recon_config: 35 | image_size: ${image_size} 36 | n_frames: ${n_frames} 37 | cfg_txt: 1.0 38 | cfg_img: 1.0 39 | frame_stride: 3 40 | prompt: "" 41 | negative_prompt: "" 42 | n_steps: 50 43 | ddim_init_latents_t_idx: 0 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 -------------------------------------------------------------------------------- /consisti2v/configs/pipeline_256/pnp_edit.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: True # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "anyv2v_results/${exp_name}" 9 | 10 | # Data 11 | image_size: [256, 256] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: "/${video_name}" 16 | edited_first_frame_path: "/.png" 17 | 18 | 19 | # Pnp Editing 20 | n_frames: 16 21 | cfg_txt: 35 22 | cfg_img: 1.0 23 | frame_stride: 3 24 | editing_prompt: "" 25 | editing_negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 26 | n_steps: 50 27 | ddim_init_latents_t_idx: 4 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 28 | ddim_inv_prompt: "" 29 | ddim_latents_path: "path/to/ddim_latents" 30 | 31 | # Pnp config 32 | pnp_f_t: 0.2 33 | pnp_spatial_attn_t: 0.2 34 | pnp_temp_attn_t: 0.5 35 | 36 | blend_ratio: 0.0 -------------------------------------------------------------------------------- /consisti2v/configs/pipeline_512/ddim_inversion_512.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: False # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "ddim_inversion/${exp_name}" 9 | 10 | # Data 11 | image_size: [512, 512] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: null 16 | save_frames: True 17 | 18 | # DDIM settings 19 | n_frames: 16 20 | 21 | # DDIM inversion 22 | inverse_config: 23 | image_size: ${image_size} 24 | n_frames: ${n_frames} 25 | cfg_txt: 1.0 26 | cfg_img: 1.0 27 | frame_stride: 3 28 | prompt: "" 29 | negative_prompt: "" 30 | n_steps: 500 31 | output_dir: "outputs/${exp_name}" 32 | 33 | # DDIM reconstruction 34 | recon_config: 35 | image_size: ${image_size} 36 | n_frames: ${n_frames} 37 | cfg_txt: 1.0 38 | cfg_img: 1.0 39 | frame_stride: 3 40 | prompt: "" 41 | negative_prompt: "" 42 | n_steps: 50 43 | ddim_init_latents_t_idx: 0 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 
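# Note on ddim_init_latents_t_idx (illustrative, assuming the usual 1000 training
# timesteps and the "leading" timestep spacing used by Stable Diffusion schedulers):
# with n_steps = 50 the DDIM timesteps are 981, 961, ..., 1 (spaced every 20), so
# ddim_init_latents_t_idx = i selects timestep 981 - 20 * i, which is where the
# mapping 0 -> 981, 3 -> 921, 9 -> 801, 20 -> 581 in the comment above comes from.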
-------------------------------------------------------------------------------- /consisti2v/configs/pipeline_512/pnp_edit.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: True # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "anyv2v_results/${exp_name}" 9 | 10 | # Data 11 | image_size: [512, 512] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: "/${video_name}" 16 | edited_first_frame_path: "/.png" 17 | 18 | 19 | # Pnp Editing 20 | n_frames: 16 21 | cfg_txt: 35 22 | cfg_img: 1.0 23 | frame_stride: 3 24 | editing_prompt: "" 25 | editing_negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 26 | n_steps: 50 27 | ddim_init_latents_t_idx: 4 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 28 | ddim_inv_prompt: "" 29 | ddim_latents_path: "path/to/ddim_latents" 30 | 31 | # Pnp config 32 | pnp_f_t: 0.2 33 | pnp_spatial_attn_t: 0.2 34 | pnp_temp_attn_t: 0.5 35 | 36 | blend_ratio: 0.0 -------------------------------------------------------------------------------- /consisti2v/consisti2v/utils/frameinit_utils.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/TianxingWu/FreeInit/blob/master/freeinit_utils.py 2 | import torch 3 | import torch.fft as fft 4 | import math 5 | 6 | 7 | def freq_mix_3d(x, noise, LPF): 8 | """ 9 | Noise reinitialization. 10 | 11 | Args: 12 | x: diffused latent 13 | noise: randomly sampled noise 14 | LPF: low pass filter 15 | """ 16 | # FFT 17 | x_freq = fft.fftn(x, dim=(-3, -2, -1)) 18 | x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1)) 19 | noise_freq = fft.fftn(noise, dim=(-3, -2, -1)) 20 | noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1)) 21 | 22 | # frequency mix 23 | HPF = 1 - LPF 24 | x_freq_low = x_freq * LPF 25 | noise_freq_high = noise_freq * HPF 26 | x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain 27 | 28 | # IFFT 29 | x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1)) 30 | x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real 31 | 32 | return x_mixed 33 | 34 | 35 | def get_freq_filter(shape, device, filter_type, n, d_s, d_t): 36 | """ 37 | Form the frequency filter for noise reinitialization. 38 | 39 | Args: 40 | shape: shape of latent (B, C, T, H, W) 41 | filter_type: type of the freq filter 42 | n: (only for butterworth) order of the filter, larger n ~ ideal, smaller n ~ gaussian 43 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 44 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 45 | """ 46 | if filter_type == "gaussian": 47 | return gaussian_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device) 48 | elif filter_type == "ideal": 49 | return ideal_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device) 50 | elif filter_type == "box": 51 | return box_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device) 52 | elif filter_type == "butterworth": 53 | return butterworth_low_pass_filter(shape=shape, n=n, d_s=d_s, d_t=d_t).to(device) 54 | else: 55 | raise NotImplementedError 56 | 57 | 58 | def gaussian_low_pass_filter(shape, d_s=0.25, d_t=0.25): 59 | """ 60 | Compute the gaussian low pass filter mask. 
61 | 62 | Args: 63 | shape: shape of the filter (volume) 64 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 65 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 66 | """ 67 | T, H, W = shape[-3], shape[-2], shape[-1] 68 | mask = torch.zeros(shape) 69 | if d_s==0 or d_t==0: 70 | return mask 71 | for t in range(T): 72 | for h in range(H): 73 | for w in range(W): 74 | d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2) 75 | mask[..., t,h,w] = math.exp(-1/(2*d_s**2) * d_square) 76 | return mask 77 | 78 | 79 | def butterworth_low_pass_filter(shape, n=4, d_s=0.25, d_t=0.25): 80 | """ 81 | Compute the butterworth low pass filter mask. 82 | 83 | Args: 84 | shape: shape of the filter (volume) 85 | n: order of the filter, larger n ~ ideal, smaller n ~ gaussian 86 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 87 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 88 | """ 89 | T, H, W = shape[-3], shape[-2], shape[-1] 90 | mask = torch.zeros(shape) 91 | if d_s==0 or d_t==0: 92 | return mask 93 | for t in range(T): 94 | for h in range(H): 95 | for w in range(W): 96 | d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2) 97 | mask[..., t,h,w] = 1 / (1 + (d_square / d_s**2)**n) 98 | return mask 99 | 100 | 101 | def ideal_low_pass_filter(shape, d_s=0.25, d_t=0.25): 102 | """ 103 | Compute the ideal low pass filter mask. 104 | 105 | Args: 106 | shape: shape of the filter (volume) 107 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 108 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 109 | """ 110 | T, H, W = shape[-3], shape[-2], shape[-1] 111 | mask = torch.zeros(shape) 112 | if d_s==0 or d_t==0: 113 | return mask 114 | for t in range(T): 115 | for h in range(H): 116 | for w in range(W): 117 | d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2) 118 | mask[..., t,h,w] = 1 if d_square <= d_s*2 else 0 119 | return mask 120 | 121 | 122 | def box_low_pass_filter(shape, d_s=0.25, d_t=0.25): 123 | """ 124 | Compute the ideal low pass filter mask (approximated version). 
125 | 126 | Args: 127 | shape: shape of the filter (volume) 128 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 129 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 130 | """ 131 | T, H, W = shape[-3], shape[-2], shape[-1] 132 | mask = torch.zeros(shape) 133 | if d_s==0 or d_t==0: 134 | return mask 135 | 136 | threshold_s = round(int(H // 2) * d_s) 137 | threshold_t = round(T // 2 * d_t) 138 | 139 | cframe, crow, ccol = T // 2, H // 2, W //2 140 | mask[..., cframe - threshold_t:cframe + threshold_t, crow - threshold_s:crow + threshold_s, ccol - threshold_s:ccol + threshold_s] = 1.0 141 | 142 | return mask -------------------------------------------------------------------------------- /consisti2v/consisti2v/utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import imageio 3 | import numpy as np 4 | from typing import Union 5 | 6 | import torch 7 | import torchvision 8 | import torch.distributed as dist 9 | import wandb 10 | 11 | from tqdm import tqdm 12 | from einops import rearrange 13 | 14 | from torchmetrics.image.fid import _compute_fid 15 | 16 | 17 | def zero_rank_print(s): 18 | if (not dist.is_initialized()) or (dist.is_initialized() and dist.get_rank() == 0): print("### " + s) 19 | 20 | 21 | def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, wandb=False, global_step=0, format="gif"): 22 | videos = rearrange(videos, "b c t h w -> t b c h w") 23 | outputs = [] 24 | for x in videos: 25 | x = torchvision.utils.make_grid(x, nrow=n_rows) 26 | x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) 27 | if rescale: 28 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 29 | x = (x * 255).numpy().astype(np.uint8) 30 | outputs.append(x) 31 | 32 | if wandb: 33 | wandb_video = wandb.Video(outputs, fps=fps) 34 | wandb.log({"val_videos": wandb_video}, step=global_step) 35 | 36 | os.makedirs(os.path.dirname(path), exist_ok=True) 37 | if format == "gif": 38 | imageio.mimsave(path, outputs, fps=fps) 39 | elif format == "mp4": 40 | torchvision.io.write_video(path, np.array(outputs), fps=fps, video_codec='h264', options={'crf': '10'}) 41 | 42 | # DDIM Inversion 43 | @torch.no_grad() 44 | def init_prompt(prompt, pipeline): 45 | uncond_input = pipeline.tokenizer( 46 | [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length, 47 | return_tensors="pt" 48 | ) 49 | uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0] 50 | text_input = pipeline.tokenizer( 51 | [prompt], 52 | padding="max_length", 53 | max_length=pipeline.tokenizer.model_max_length, 54 | truncation=True, 55 | return_tensors="pt", 56 | ) 57 | text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0] 58 | context = torch.cat([uncond_embeddings, text_embeddings]) 59 | 60 | return context 61 | 62 | 63 | def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, 64 | sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler): 65 | timestep, next_timestep = min( 66 | timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999), timestep 67 | alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod 68 | alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep] 69 | beta_prod_t = 1 - alpha_prod_t 70 | next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5 71 | next_sample_direction = (1 - 
alpha_prod_t_next) ** 0.5 * model_output 72 | next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction 73 | return next_sample 74 | 75 | 76 | def get_noise_pred_single(latents, t, context, first_frame_latents, frame_stride, unet): 77 | noise_pred = unet(latents, t, encoder_hidden_states=context, first_frame_latents=first_frame_latents, frame_stride=frame_stride).sample 78 | return noise_pred 79 | 80 | 81 | @torch.no_grad() 82 | def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt, first_frame_latents, frame_stride): 83 | context = init_prompt(prompt, pipeline) 84 | uncond_embeddings, cond_embeddings = context.chunk(2) 85 | all_latent = [latent] 86 | latent = latent.clone().detach() 87 | for i in tqdm(range(num_inv_steps)): 88 | t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1] 89 | noise_pred = get_noise_pred_single(latent, t, cond_embeddings, first_frame_latents, frame_stride, pipeline.unet) 90 | latent = next_step(noise_pred, t, latent, ddim_scheduler) 91 | all_latent.append(latent) 92 | return all_latent 93 | 94 | 95 | @torch.no_grad() 96 | def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt="", first_frame_latents=None, frame_stride=3): 97 | ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt, first_frame_latents, frame_stride) 98 | return ddim_latents 99 | 100 | 101 | def compute_fid(real_features, fake_features, num_features, device): 102 | orig_dtype = real_features.dtype 103 | 104 | mx_num_feats = (num_features, num_features) 105 | real_features_sum = torch.zeros(num_features).double().to(device) 106 | real_features_cov_sum = torch.zeros(mx_num_feats).double().to(device) 107 | real_features_num_samples = torch.tensor(0).long().to(device) 108 | 109 | fake_features_sum = torch.zeros(num_features).double().to(device) 110 | fake_features_cov_sum = torch.zeros(mx_num_feats).double().to(device) 111 | fake_features_num_samples = torch.tensor(0).long().to(device) 112 | 113 | real_features = real_features.double() 114 | fake_features = fake_features.double() 115 | 116 | real_features_sum += real_features.sum(dim=0) 117 | real_features_cov_sum += real_features.t().mm(real_features) 118 | real_features_num_samples += real_features.shape[0] 119 | 120 | fake_features_sum += fake_features.sum(dim=0) 121 | fake_features_cov_sum += fake_features.t().mm(fake_features) 122 | fake_features_num_samples += fake_features.shape[0] 123 | 124 | """Calculate FID score based on accumulated extracted features from the two distributions.""" 125 | if real_features_num_samples < 2 or fake_features_num_samples < 2: 126 | raise RuntimeError("More than one sample is required for both the real and fake distributed to compute FID") 127 | mean_real = (real_features_sum / real_features_num_samples).unsqueeze(0) 128 | mean_fake = (fake_features_sum / fake_features_num_samples).unsqueeze(0) 129 | 130 | cov_real_num = real_features_cov_sum - real_features_num_samples * mean_real.t().mm(mean_real) 131 | cov_real = cov_real_num / (real_features_num_samples - 1) 132 | cov_fake_num = fake_features_cov_sum - fake_features_num_samples * mean_fake.t().mm(mean_fake) 133 | cov_fake = cov_fake_num / (fake_features_num_samples - 1) 134 | return _compute_fid(mean_real.squeeze(0), cov_real, mean_fake.squeeze(0), cov_fake).to(orig_dtype) 135 | 136 | 137 | def compute_inception_score(gen_probs, num_splits=10): 138 | num_gen = gen_probs.shape[0] 139 | gen_probs = gen_probs.detach().cpu().numpy() 140 | scores = 
[] 141 | np.random.RandomState(42).shuffle(gen_probs) 142 | for i in range(num_splits): 143 | part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits] 144 | kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True))) 145 | kl = np.mean(np.sum(kl, axis=1)) 146 | scores.append(np.exp(kl)) 147 | return float(np.mean(scores)), float(np.std(scores)) 148 | # idx = torch.randperm(features.shape[0]) 149 | # features = features[idx] 150 | # # calculate probs and logits 151 | # prob = features.softmax(dim=1) 152 | # log_prob = features.log_softmax(dim=1) 153 | 154 | # # split into groups 155 | # prob = prob.chunk(splits, dim=0) 156 | # log_prob = log_prob.chunk(splits, dim=0) 157 | 158 | # # calculate score per split 159 | # mean_prob = [p.mean(dim=0, keepdim=True) for p in prob] 160 | # kl_ = [p * (log_p - m_p.log()) for p, log_p, m_p in zip(prob, log_prob, mean_prob)] 161 | # kl_ = [k.sum(dim=1).mean().exp() for k in kl_] 162 | # kl = torch.stack(kl_) 163 | 164 | # return mean and std 165 | # return kl.mean(), kl.std() -------------------------------------------------------------------------------- /consisti2v/environment.yaml: -------------------------------------------------------------------------------- 1 | name: consisti2v 2 | channels: 3 | - pytorch 4 | - nvidia 5 | dependencies: 6 | - python=3.10 7 | - pytorch=2.1.0 8 | - torchvision=0.16.0 9 | - torchaudio=2.1.0 10 | - pytorch-cuda=11.8 11 | - pip 12 | - pip: 13 | - diffusers==0.21.2 14 | - transformers==4.25.1 15 | - accelerate==0.23.0 16 | - imageio==2.27.0 17 | - decord==0.6.0 18 | - einops 19 | - omegaconf 20 | - safetensors 21 | - gradio==3.42.0 22 | - wandb 23 | - moviepy 24 | - scikit-learn 25 | - av 26 | - rotary_embedding_torch 27 | - torchmetrics 28 | - torch-fidelity 29 | -------------------------------------------------------------------------------- /consisti2v/run_ddim_inversion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import argparse 5 | import logging 6 | from omegaconf import OmegaConf 7 | from PIL import Image 8 | from pathlib import Path 9 | 10 | # HF imports 11 | from diffusers import DDIMScheduler 12 | from ddim_inverse_scheduler import DDIMInverseScheduler 13 | 14 | # Project imports 15 | from utils import ( 16 | seed_everything, 17 | load_video_frames, 18 | convert_video_to_frames, 19 | load_ddim_latents_at_T, 20 | load_ddim_latents_at_t, 21 | ) 22 | from consisti2v.pipelines.pipeline_video_editing import ConditionalVideoEditingPipeline 23 | from consisti2v.utils.util import save_videos_grid 24 | 25 | 26 | def ddim_inversion(config, first_frame, frame_list, pipe: ConditionalVideoEditingPipeline, inverse_scheduler, g): 27 | pipe.scheduler = inverse_scheduler 28 | video_latents_at_0 = pipe.encode_vae_video( 29 | frame_list, 30 | device=pipe._execution_device, 31 | height=config.image_size[1], 32 | width=config.image_size[0], 33 | ) 34 | ddim_latents = pipe.invert( 35 | prompt=config.prompt, 36 | first_frame_paths=first_frame, 37 | height=config.image_size[1], 38 | width=config.image_size[0], 39 | video_length=config.n_frames, 40 | num_inference_steps=config.n_steps, 41 | guidance_scale_txt=config.cfg_txt, 42 | guidance_scale_img=config.cfg_img, 43 | negative_prompt=config.negative_prompt, 44 | frame_stride=config.frame_stride, 45 | latents=video_latents_at_0, 46 | generator=g, # TODO: this is not correct 47 | return_dict=False, 48 | output_type="latent", 49 | output_dir=config.output_dir, 50 
| ).videos # [b, num_inference_steps, c, num_frames, h, w] 51 | logger.debug(f"ddim_latents.shape: {ddim_latents.shape}") 52 | ddim_latents = ddim_latents[0] # [num_inference_steps, c, num_frames, h, w] 53 | return ddim_latents 54 | 55 | 56 | def ddim_sampling( 57 | config, first_frame, ddim_latents_at_T, pipe: ConditionalVideoEditingPipeline, ddim_scheduler, g, ddim_init_latents_t_idx 58 | ): 59 | pipe.scheduler = ddim_scheduler 60 | reconstructed_video = pipe( 61 | prompt=config.prompt, 62 | first_frame_paths=first_frame, 63 | height=config.image_size[1], 64 | width=config.image_size[0], 65 | video_length=config.n_frames, 66 | num_inference_steps=config.n_steps, 67 | guidance_scale_txt=config.cfg_txt, 68 | guidance_scale_img=config.cfg_img, 69 | negative_prompt=config.negative_prompt, 70 | frame_stride=config.frame_stride, 71 | latents=ddim_latents_at_T, 72 | generator=g, # TODO: this is not correct 73 | return_dict=True, 74 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 75 | ).videos 76 | return reconstructed_video 77 | 78 | 79 | def main(config): 80 | seed_everything(config.seed) 81 | torch.set_grad_enabled(False) 82 | device = torch.device(config.device) 83 | 84 | # Initialize the pipeline 85 | # TODO: do we need the get_inverse_timesteps function? 86 | pipe = ConditionalVideoEditingPipeline.from_pretrained( 87 | "TIGER-Lab/ConsistI2V", 88 | torch_dtype=torch.float16, 89 | ) 90 | # TODO: set the model to GPU and eval mode 91 | pipe.to(device) 92 | g = torch.Generator(device=device) 93 | g = g.manual_seed(config.seed) 94 | 95 | # Initialize the DDIM inverse scheduler 96 | inverse_scheduler = DDIMInverseScheduler.from_pretrained( 97 | "TIGER-Lab/ConsistI2V", 98 | subfolder="scheduler", 99 | ) 100 | # Initialize the DDIM scheduler 101 | ddim_scheduler = DDIMScheduler.from_pretrained( 102 | "TIGER-Lab/ConsistI2V", 103 | subfolder="scheduler", 104 | ) 105 | 106 | if config.video_path: 107 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=config.save_frames, save_dir=config.output_dir) 108 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 109 | logger.debug(f"len(frame_list): {len(frame_list)}") 110 | video_name = Path(config.video_path).stem 111 | first_frame_path = os.path.join(config.output_dir, video_name, '00000.png') 112 | elif config.video_frames_path: 113 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames) 114 | first_frame_path = os.path.join(config.video_frames_path, '00000.png') 115 | else: 116 | raise ValueError("Please provide either video_path or video_frames_path") 117 | 118 | # Main pipeline 119 | ddim_latents = ddim_inversion(config.inverse_config, first_frame_path, frame_list, pipe, inverse_scheduler, g) 120 | 121 | recon_config = config.recon_config 122 | ddim_init_latents_t_idx = recon_config.ddim_init_latents_t_idx 123 | ddim_scheduler.set_timesteps(recon_config.n_steps) 124 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 125 | ddim_latents_path = config.inverse_config.output_dir 126 | ddim_latents_at_t = load_ddim_latents_at_t( 127 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], ddim_latents_path=ddim_latents_path 128 | ) 129 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 130 | 131 | reconstructed_video = ddim_sampling(recon_config, first_frame_path, ddim_latents_at_t, pipe, ddim_scheduler, g, ddim_init_latents_t_idx) 132 | 133 | # Save reconstructed frames and video 134 | 
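# Note: save_videos_grid (defined in consisti2v/utils/util.py) expects videos shaped [b, c, t, h, w]
# and writes both a GIF (via imageio) and an MP4 (via torchvision) under config.output_dir.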
os.makedirs(config.output_dir, exist_ok=True) 135 | save_videos_grid(reconstructed_video, os.path.join(config.output_dir, "ddim_reconstruction.gif"), fps=10, format="gif") 136 | save_videos_grid(reconstructed_video, os.path.join(config.output_dir, "ddim_reconstruction.mp4"), fps=10, format="mp4") 137 | logger.info(f"Saved reconstructed video to {config.output_dir}") 138 | 139 | 140 | if __name__ == "__main__": 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--config", type=str, default="configs/pipeline_256/ddim_inversion_256.yaml") 143 | parser.add_argument("optional_args", nargs='*', default=[]) 144 | args = parser.parse_args() 145 | config = OmegaConf.load(args.config) 146 | 147 | if args.optional_args: 148 | modified_config = OmegaConf.from_dotlist(args.optional_args) 149 | config = OmegaConf.merge(config, modified_config) 150 | 151 | logging_level = logging.DEBUG if config.debug else logging.INFO 152 | logging.basicConfig( 153 | level=logging_level, 154 | format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s", 155 | ) 156 | logger = logging.getLogger(__name__) 157 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 158 | 159 | main(config) 160 | -------------------------------------------------------------------------------- /consisti2v/run_pnp_edit.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import argparse 6 | import logging 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | 10 | # HF imports 11 | from diffusers import DDIMScheduler 12 | 13 | # Project imports 14 | from utils import ( 15 | seed_everything, 16 | load_video_frames, 17 | convert_video_to_frames, 18 | load_ddim_latents_at_T, 19 | load_ddim_latents_at_t, 20 | ) 21 | from consisti2v.pipelines.pipeline_video_editing import ConditionalVideoEditingPipeline 22 | from consisti2v.utils.util import save_videos_grid 23 | from pnp_utils import ( 24 | register_time, 25 | register_conv_injection, 26 | register_spatial_attention_pnp, 27 | register_temp_attention_pnp, 28 | ) 29 | 30 | 31 | def init_pnp(pipe, scheduler, config): 32 | conv_injection_t = int(config.n_steps * config.pnp_f_t) 33 | spatial_attn_qk_injection_t = int(config.n_steps * config.pnp_spatial_attn_t) 34 | temp_attn_qk_injection_t = int(config.n_steps * config.pnp_temp_attn_t) 35 | conv_injection_timesteps = scheduler.timesteps[:conv_injection_t] if conv_injection_t >= 0 else [] 36 | spatial_attn_qk_injection_timesteps = ( 37 | scheduler.timesteps[:spatial_attn_qk_injection_t] if spatial_attn_qk_injection_t >= 0 else [] 38 | ) 39 | temp_attn_qk_injection_timesteps = ( 40 | scheduler.timesteps[:temp_attn_qk_injection_t] if temp_attn_qk_injection_t >= 0 else [] 41 | ) 42 | register_conv_injection(pipe, conv_injection_timesteps) 43 | register_spatial_attention_pnp(pipe, spatial_attn_qk_injection_timesteps) 44 | register_temp_attention_pnp(pipe, temp_attn_qk_injection_timesteps) 45 | 46 | logger.debug(f"conv_injection_t: {conv_injection_t}") 47 | logger.debug(f"spatial_attn_qk_injection_t: {spatial_attn_qk_injection_t}") 48 | logger.debug(f"temp_attn_qk_injection_t: {temp_attn_qk_injection_t}") 49 | logger.debug(f"conv_injection_timesteps: {conv_injection_timesteps}") 50 | logger.debug(f"spatial_attn_qk_injection_timesteps: {spatial_attn_qk_injection_timesteps}") 51 | logger.debug(f"temp_attn_qk_injection_timesteps: {temp_attn_qk_injection_timesteps}") 52 | 53 | 54 | def main(config): 55 | # Initialize 
the pipeline 56 | pipe = ConditionalVideoEditingPipeline.from_pretrained( 57 | "TIGER-Lab/ConsistI2V", 58 | torch_dtype=torch.float16, 59 | ) 60 | pipe.to(device) 61 | 62 | # Initialize the DDIM scheduler 63 | ddim_scheduler = DDIMScheduler.from_pretrained( 64 | "TIGER-Lab/ConsistI2V", 65 | subfolder="scheduler", 66 | ) 67 | 68 | # Load first frame and source frames 69 | if config.video_path: 70 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=True) 71 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 72 | logger.debug(f"len(frame_list): {len(frame_list)}") 73 | video_name = Path(config.video_path).stem 74 | video_dir = Path(config.video_path).parent 75 | config.video_frames_path = f"{video_dir}/{video_name}" 76 | elif config.video_frames_path: 77 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames) 78 | else: 79 | raise ValueError("Please provide either video_path or video_frames_path") 80 | src_frame_list = frame_list 81 | src_1st_frame = os.path.join(config.video_frames_path, '00000.png') 82 | 83 | # Load the edited first frame 84 | edited_1st_frame = config.edited_first_frame_path 85 | 86 | # Load the initial latents at t 87 | ddim_init_latents_t_idx = config.ddim_init_latents_t_idx 88 | ddim_scheduler.set_timesteps(config.n_steps) 89 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 90 | ddim_latents_path = os.path.join(config.ddim_latents_path, config.exp_name) 91 | ddim_latents_at_t = load_ddim_latents_at_t( 92 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], ddim_latents_path=ddim_latents_path 93 | ) 94 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 95 | logger.debug(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}") 96 | 97 | # Blend the latents 98 | random_latents = torch.randn_like(ddim_latents_at_t) 99 | random_ratio = config.blend_ratio 100 | mixed_latents = random_latents * random_ratio + ddim_latents_at_t * (1 - random_ratio) 101 | 102 | # Init Pnp 103 | init_pnp(pipe, ddim_scheduler, config) 104 | 105 | # Edit video 106 | pipe.register_modules(scheduler=ddim_scheduler) 107 | edited_video = pipe.sample_with_pnp( 108 | prompt=config.editing_prompt, 109 | first_frame_paths=edited_1st_frame, 110 | height=config.image_size[1], 111 | width=config.image_size[0], 112 | video_length=config.n_frames, 113 | num_inference_steps=config.n_steps, 114 | guidance_scale_txt=config.cfg_txt, 115 | guidance_scale_img=config.cfg_img, 116 | negative_prompt=config.editing_negative_prompt, 117 | frame_stride=config.frame_stride, 118 | latents=mixed_latents, 119 | generator=torch.manual_seed(config.seed), 120 | return_dict=True, 121 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 122 | ddim_inv_latents_path=ddim_latents_path, 123 | ddim_inv_prompt=config.ddim_inv_prompt, 124 | ddim_inv_1st_frame_path=src_1st_frame, 125 | ).videos 126 | 127 | # Save video 128 | os.makedirs(config.output_dir, exist_ok=True) 129 | # Downsampling the video for space saving 130 | save_videos_grid(edited_video, os.path.join(config.output_dir, config.editing_prompt, "video.gif"), fps=8, format="gif") 131 | save_videos_grid(edited_video, os.path.join(config.output_dir, config.editing_prompt, "video.mp4"), fps=8, format="mp4") 132 | logger.info(f"Saved edited video to {config.output_dir}") 133 | 134 | 135 | if __name__ == "__main__": 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("--config", type=str, 
default="./configs/pnp_edit.yaml") 138 | parser.add_argument("optional_args", nargs='*', default=[]) 139 | args = parser.parse_args() 140 | config = OmegaConf.load(args.config) 141 | 142 | if args.optional_args: 143 | modified_config = OmegaConf.from_dotlist(args.optional_args) 144 | config = OmegaConf.merge(config, modified_config) 145 | 146 | # Set up logging 147 | logging_level = logging.DEBUG if config.debug else logging.INFO 148 | logging.basicConfig(level=logging_level, format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s") 149 | logger = logging.getLogger(__name__) 150 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 151 | 152 | # Set up device and seed 153 | device = torch.device(config.device) 154 | seed_everything(config.seed) 155 | main(config) -------------------------------------------------------------------------------- /consisti2v/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | from torchvision.io import read_video 6 | import torchvision.transforms as T 7 | from pathlib import Path 8 | from PIL import Image 9 | from diffusers.utils import load_image 10 | import torch.nn.functional as F 11 | import glob 12 | 13 | def isinstance_str(x: object, cls_name: str): 14 | """ 15 | Checks whether x has any class *named* cls_name in its ancestry. 16 | Doesn't require access to the class's implementation. 17 | 18 | Useful for patching! 19 | """ 20 | 21 | for _cls in x.__class__.__mro__: 22 | if _cls.__name__ == cls_name: 23 | return True 24 | 25 | return False 26 | 27 | 28 | def seed_everything(seed): 29 | torch.manual_seed(seed) 30 | torch.cuda.manual_seed(seed) 31 | torch.cuda.manual_seed_all(seed) 32 | random.seed(seed) 33 | np.random.seed(seed) 34 | 35 | def load_ddim_latents_at_t(t, ddim_latents_path): 36 | ddim_latents_at_t_path = os.path.join(ddim_latents_path, f"ddim_latents_{t}.pt") 37 | assert os.path.exists(ddim_latents_at_t_path), f"Missing latents at t {t} path {ddim_latents_at_t_path}" 38 | ddim_latents_at_t = torch.load(ddim_latents_at_t_path) 39 | print(f"############ Loaded ddim_latents_at_t from {ddim_latents_at_t_path}") 40 | return ddim_latents_at_t 41 | 42 | def load_ddim_latents_at_T(ddim_latents_path): 43 | noisest = max( 44 | [ 45 | int(x.split("_")[-1].split(".")[0]) 46 | for x in glob.glob(os.path.join(ddim_latents_path, f"ddim_latents_*.pt")) 47 | ] 48 | ) 49 | ddim_latents_at_T_path = os.path.join(ddim_latents_path, f"ddim_latents_{noisest}.pt") 50 | ddim_latents_at_T = torch.load(ddim_latents_at_T_path) # [b, c, f, h, w] [1, 4, 16, 40, 64] 51 | return ddim_latents_at_T 52 | 53 | 54 | # Modified from tokenflow/utils.py 55 | def convert_video_to_frames(video_path, img_size=(512, 512), save_frames=True, save_dir=None): 56 | video, _, _ = read_video(video_path, output_format="TCHW") 57 | # rotate video -90 degree if video is .mov format. 
this is a weird bug in torchvision 58 | if video_path.endswith(".mov"): 59 | video = T.functional.rotate(video, -90) 60 | if save_frames: 61 | video_name = Path(video_path).stem 62 | video_dir = Path(video_path).parent 63 | if save_dir is not None: 64 | video_dir = save_dir 65 | os.makedirs(f"{video_dir}/{video_name}", exist_ok=True) 66 | frames = [] 67 | for i in range(len(video)): 68 | ind = str(i).zfill(5) 69 | image = T.ToPILImage()(video[i]) 70 | image_resized = image.resize(img_size, resample=Image.Resampling.LANCZOS) 71 | print(f"image_resized.size, height, width: {image_resized.size}, {img_size[1]}, {img_size[0]}") 72 | if save_frames: 73 | image_resized.save(f"{video_dir}/{video_name}/{ind}.png") 74 | print(f"Saved frame {video_dir}/{video_name}/{ind}.png") 75 | frames.append(image_resized) 76 | return frames 77 | 78 | 79 | # Modified from tokenflow/utils.py 80 | def load_video_frames(frames_path, n_frames): 81 | # Load paths 82 | paths = [f"{frames_path}/%05d.png" % i for i in range(n_frames)] 83 | frames = [load_image(p) for p in paths] 84 | return paths, frames 85 | 86 | 87 | def register_spatial_attention_pnp(model, injection_schedule): 88 | def sa_forward(self): 89 | def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, use_image_num=None): 90 | batch_size, sequence_length, _dim = hidden_states.shape 91 | n_frames = batch_size // 3 # batch_size is 3*n_frames because concat[source, uncond, cond] 92 | 93 | encoder_hidden_states = encoder_hidden_states 94 | 95 | if self.group_norm is not None: 96 | hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 97 | 98 | query = self.to_q(hidden_states) # [b (h w)] f (nd * d) 99 | 100 | if self.added_kv_proj_dim is not None: 101 | print(f"[ERROR] Run into added_kv_proj_dim, which is not supported yet. 
Exiting...") 102 | key = self.to_k(hidden_states) 103 | value = self.to_v(hidden_states) 104 | encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states) 105 | encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states) 106 | 107 | key = self.reshape_heads_to_batch_dim(key) 108 | value = self.reshape_heads_to_batch_dim(value) 109 | encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj) 110 | encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj) 111 | 112 | key = torch.concat([encoder_hidden_states_key_proj, key], dim=1) 113 | value = torch.concat([encoder_hidden_states_value_proj, value], dim=1) 114 | else: 115 | encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states 116 | key = self.to_k(encoder_hidden_states) 117 | value = self.to_v(encoder_hidden_states) 118 | 119 | if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000): 120 | # inject source into unconditional 121 | query[n_frames: 2 * n_frames] = query[:n_frames] 122 | key[n_frames: 2 * n_frames] = key[:n_frames] 123 | # inject source into conditional 124 | query[2 * n_frames:] = query[:n_frames] 125 | key[2 * n_frames:] = key[:n_frames] 126 | 127 | if not self.use_relative_position: 128 | key = self.reshape_heads_to_batch_dim(key) 129 | value = self.reshape_heads_to_batch_dim(value) 130 | 131 | dim = query.shape[-1] 132 | if not self.use_relative_position: 133 | query = self.reshape_heads_to_batch_dim(query) # [b (h w) nd] f d 134 | 135 | if attention_mask is not None: 136 | if attention_mask.shape[-1] != query.shape[1]: 137 | target_length = query.shape[1] 138 | attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) 139 | attention_mask = attention_mask.repeat_interleave(self.heads, dim=0) 140 | 141 | # attention, what we cannot get enough of 142 | if self._use_memory_efficient_attention_xformers: 143 | hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask) 144 | # Some versions of xformers return output in fp32, cast it back to the dtype of the input 145 | hidden_states = hidden_states.to(query.dtype) 146 | else: 147 | if self._slice_size is None or query.shape[0] // self._slice_size == 1: 148 | hidden_states = self._attention(query, key, value, attention_mask) 149 | else: 150 | hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask) 151 | 152 | # linear proj 153 | hidden_states = self.to_out[0](hidden_states) 154 | 155 | # dropout 156 | hidden_states = self.to_out[1](hidden_states) 157 | return hidden_states 158 | 159 | return forward 160 | 161 | for _, module in model.unet.named_modules(): 162 | if isinstance_str(module, "BasicTransformerBlock"): 163 | module.attn1.forward = sa_forward(module.attn1) 164 | setattr(module.attn1, "injection_schedule", []) # Disable PNP 165 | 166 | res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} 167 | # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution 168 | for res in res_dict: 169 | for block in res_dict[res]: 170 | module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1 171 | module.forward = sa_forward(module) 172 | setattr(module, "injection_schedule", injection_schedule) 173 | -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection.mp4 -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00000.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00001.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00002.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00003.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00004.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00005.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00006.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00007.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public 
Display Of Affection/00008.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00009.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00010.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00011.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00012.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00013.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00014.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00015.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/edited_first_frame/Sketch style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/edited_first_frame/Sketch style.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png 
-------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor.mp4 -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00000.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00001.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00002.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00003.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00004.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00005.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00006.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00007.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00008.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00008.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00009.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00010.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00011.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00012.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00013.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00014.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00015.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind.mp4 -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00000.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00001.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00002.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00003.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00004.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00005.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00006.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00007.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00008.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00008.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00009.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00010.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00011.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00012.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00013.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00014.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00015.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/add a party hat on his head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/add a party hat on his head.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And 
Mind/edited_first_frame/cyberpunk style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/cyberpunk style.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/give him a punk hair style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/give him a punk hair style.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/helmet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/helmet.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/hinton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/hinton.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/starry night style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/starry night style.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn his hair white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn his hair white.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn man into robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn man into robot.png -------------------------------------------------------------------------------- /demo/Ballet.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet.mp4 -------------------------------------------------------------------------------- /demo/Ballet/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00000.png -------------------------------------------------------------------------------- /demo/Ballet/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00001.png -------------------------------------------------------------------------------- /demo/Ballet/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00002.png -------------------------------------------------------------------------------- /demo/Ballet/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00003.png -------------------------------------------------------------------------------- /demo/Ballet/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00004.png -------------------------------------------------------------------------------- /demo/Ballet/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00005.png -------------------------------------------------------------------------------- /demo/Ballet/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00006.png -------------------------------------------------------------------------------- /demo/Ballet/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00007.png -------------------------------------------------------------------------------- /demo/Ballet/00008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00008.png -------------------------------------------------------------------------------- /demo/Ballet/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00009.png -------------------------------------------------------------------------------- /demo/Ballet/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00010.png 
-------------------------------------------------------------------------------- /demo/Ballet/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00011.png -------------------------------------------------------------------------------- /demo/Ballet/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00012.png -------------------------------------------------------------------------------- /demo/Ballet/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00013.png -------------------------------------------------------------------------------- /demo/Ballet/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00014.png -------------------------------------------------------------------------------- /demo/Ballet/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00015.png -------------------------------------------------------------------------------- /demo/Ballet/edited_first_frame/van gogh style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/edited_first_frame/van gogh style.png -------------------------------------------------------------------------------- /demo/Man Walking.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking.mp4 -------------------------------------------------------------------------------- /demo/Man Walking/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00000.png -------------------------------------------------------------------------------- /demo/Man Walking/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00001.png -------------------------------------------------------------------------------- /demo/Man Walking/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00002.png -------------------------------------------------------------------------------- /demo/Man Walking/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00003.png -------------------------------------------------------------------------------- /demo/Man Walking/00004.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00004.png -------------------------------------------------------------------------------- /demo/Man Walking/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00005.png -------------------------------------------------------------------------------- /demo/Man Walking/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00006.png -------------------------------------------------------------------------------- /demo/Man Walking/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00007.png -------------------------------------------------------------------------------- /demo/Man Walking/00008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00008.png -------------------------------------------------------------------------------- /demo/Man Walking/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00009.png -------------------------------------------------------------------------------- /demo/Man Walking/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00010.png -------------------------------------------------------------------------------- /demo/Man Walking/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00011.png -------------------------------------------------------------------------------- /demo/Man Walking/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00012.png -------------------------------------------------------------------------------- /demo/Man Walking/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00013.png -------------------------------------------------------------------------------- /demo/Man Walking/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00014.png -------------------------------------------------------------------------------- /demo/Man Walking/00015.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00015.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/ElonMusk_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/ElonMusk_02.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/Yann LeCun Walking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/Yann LeCun Walking.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/add a cowboy hat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/add a cowboy hat.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/change his clothes to red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/change his clothes to red.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/policeman costume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/policeman costume.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/turn him into an astronaut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/turn him into an astronaut.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/turn him into batman.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/turn him into batman.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/turn the man into darth vader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/turn the man into darth vader.png -------------------------------------------------------------------------------- /demo/Your-Video-Name/edited_first_frame/Your-edited-first-frame: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Your-Video-Name/edited_first_frame/Your-edited-first-frame -------------------------------------------------------------------------------- /demo/Your-Video-mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Your-Video-mp4 -------------------------------------------------------------------------------- /edit_image.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from PIL import Image 4 | import json 5 | from moviepy.editor import VideoFileClip 6 | import numpy as np 7 | 8 | import black_box_image_edit as image_edit 9 | 10 | def infer_video(model, video_path, output_dir, prompt, prompt_type="instruct", force_512=False, seed=42, negative_prompt="", overwrite=False): 11 | """ 12 | Processes videos from the input directory, resizes them to 512x512 before feeding into the model by first frame, 13 | and saves the processed video back to its original size in the output directory. 14 | 15 | Args: 16 | model: The video editing model. 17 | input_dir (str): Path to the directory containing input videos. 18 | output_dir (str): Path to the directory where processed videos will be saved. 19 | prompt (str): Instruction prompt for video editing. 20 | """ 21 | 22 | # Create the output directory if it does not exist 23 | if not os.path.exists(output_dir): 24 | os.makedirs(output_dir) 25 | 26 | video_clip = VideoFileClip(video_path) 27 | video_filename = os.path.basename(video_path) 28 | # filename_noext = os.path.splitext(video_filename)[0] 29 | 30 | # Create the output directory if it does not exist 31 | # final_output_dir = os.path.join(output_dir, filename_noext) 32 | final_output_dir = output_dir 33 | if not os.path.exists(final_output_dir): 34 | os.makedirs(final_output_dir) 35 | 36 | result_path = os.path.join(final_output_dir, prompt + ".png") 37 | 38 | # Check if result already exists 39 | if os.path.exists(result_path) and overwrite is False: 40 | print(f"Result already exists: {result_path}") 41 | return 42 | 43 | def process_frame(image): 44 | pil_image = Image.fromarray(image) 45 | if force_512: 46 | pil_image = pil_image.resize((512, 512), Image.LANCZOS) 47 | if prompt_type == "instruct": 48 | result = model.infer_one_image(pil_image, instruct_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 49 | else: 50 | result = model.infer_one_image(pil_image, target_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 51 | if force_512: 52 | result = result.resize(video_clip.size, Image.LANCZOS) 53 | return np.array(result) 54 | 55 | # Process only the first frame 56 | first_frame = video_clip.get_frame(0) # Get the first frame 57 | processed_frame = process_frame(first_frame) # Process the first frame 58 | 59 | 60 | #Image.fromarray(first_frame).save(os.path.join(final_output_dir, "00000.png")) 61 | Image.fromarray(processed_frame).save(result_path) 62 | print(f"Processed and saved the first frame: {result_path}") 63 | return result_path 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser(description='Process some images.') 68 | parser.add_argument('--model', type=str, default='instructpix2pix', choices=['magicbrush','instructpix2pix', 'cosxl'], help='Name of the image editing model') 69 | parser.add_argument('--video_path', type=str, required=False, 
help='Name of the video', default=None) 70 | parser.add_argument('--input_dir', type=str, required=False, help='Directory containing the video', default="./demo/") 71 | parser.add_argument('--output_dir', type=str, required=False, help='Directory to save the processed images', default=None) 72 | parser.add_argument('--prompt', type=str, required=False, help='Instruction prompt for editing', default="turn the man into darth vader") 73 | parser.add_argument('--force_512', action='store_true', help='Force resize to 512x512 when feeding into image model') 74 | parser.add_argument('--dict_file', type=str, required=False, help='JSON file containing files, instructions etc.', default=None) 75 | parser.add_argument('--seed', type=int, required=False, help='Seed for random number generator', default=42) 76 | parser.add_argument('--negative_prompt', type=str, required=False, help='Negative prompt for editing', default=None) 77 | args = parser.parse_args() 78 | 79 | if args.negative_prompt is None: 80 | negative_prompt = "worst quality, normal quality, low quality, low res, blurry, watermark, jpeg artifacts" 81 | else: 82 | negative_prompt = args.negative_prompt 83 | 84 | if args.dict_file: 85 | with open(args.dict_file, 'r') as json_file: 86 | folders_info = json.load(json_file) 87 | 88 | for video_name, video_infos in folders_info.items(): 89 | input_dir = args.input_dir 90 | video_path = os.path.join(input_dir, video_name) 91 | 92 | for video_info in video_infos: 93 | model_name = video_info.get('image_model', None) 94 | instruction = video_info.get('instruction', None) 95 | target_caption = video_info.get('target_caption', None) 96 | 97 | if instruction is None and target_caption is None: 98 | continue 99 | 100 | if model_name == 'magicbrush': 101 | model = image_edit.MagicBrush() 102 | prompt_type = "instruct" 103 | prompt = instruction 104 | elif model_name == 'instructpix2pix': 105 | model = image_edit.InstructPix2Pix() 106 | prompt_type = "instruct" 107 | prompt = instruction 108 | elif model_name == 'cosxl': 109 | model = image_edit.CosXLEdit() 110 | prompt_type = "instruct" 111 | prompt = instruction 112 | else: 113 | prompt_type = "target" 114 | prompt = target_caption 115 | 116 | 117 | if args.output_dir is None: 118 | video_filename = os.path.basename(video_path) 119 | filename_noext = os.path.splitext(video_filename)[0] 120 | output_dir = os.path.dirname(video_path) 121 | else: 122 | output_dir = args.output_dir 123 | 124 | infer_video(model, video_path, output_dir, prompt, prompt_type, args.force_512, args.seed, negative_prompt) 125 | else: 126 | if args.model == 'magicbrush': 127 | model = image_edit.MagicBrush() 128 | prompt_type = "instruct" 129 | elif args.model == 'instructpix2pix': 130 | model = image_edit.InstructPix2Pix() 131 | prompt_type = "instruct" 132 | elif args.model == 'cosxl': 133 | model = image_edit.CosXLEdit() 134 | prompt_type = "instruct" 135 | 136 | video_path = args.video_path 137 | 138 | if args.output_dir is None: 139 | video_filename = os.path.basename(video_path) 140 | filename_noext = os.path.splitext(video_filename)[0] 141 | output_dir = os.path.dirname(video_path) 142 | else: 143 | output_dir = args.output_dir 144 | 145 | print("video_filename", video_filename) 146 | print("output_dir", output_dir) 147 | 148 | infer_video(model, video_path, output_dir, args.prompt, prompt_type, args.force_512, args.seed, negative_prompt) 149 | -------------------------------------------------------------------------------- /i2vgen-xl/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/i2vgen-xl/__init__.py -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_ddim_inversion/group_config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "force_recompute_latents": false, 5 | "video_name": "An Old Man Doing Exercises For The Body And Mind", 6 | "recon_config": 7 | { 8 | "enable_recon": true 9 | } 10 | }, 11 | { 12 | "active": false, 13 | "force_recompute_latents": false, 14 | "video_name": "A Couple In A Public Display Of Affection" 15 | }, 16 | { 17 | "active": false, 18 | "force_recompute_latents": false, 19 | "video_name": "Ballet" 20 | }, 21 | { 22 | "active": false, 23 | "force_recompute_latents": false, 24 | "video_name": "Man Walking" 25 | }, 26 | { 27 | "active": false, 28 | "force_recompute_latents": false, 29 | "video_name":"A kitten turning its head on a wooden floor", 30 | "image_size": [512, 512] 31 | }, 32 | { 33 | "active": false, 34 | "force_recompute_latents": false, 35 | "video_name":"Your-Video-Name", 36 | "image_size": [512, 512], 37 | "recon_config": 38 | { 39 | "enable_recon": false 40 | } 41 | } 42 | ] -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_ddim_inversion/template.yaml: -------------------------------------------------------------------------------- 1 | # "ReplaceMe" will be overwritten by the values in group_config.json 2 | 3 | # General 4 | seed: 8888 5 | device: "cuda:7" # <-- change this to the GPU you want to use 6 | debug: False # For logging DEBUG level messages otherwise INFO 7 | 8 | # Dir 9 | data_dir: ".." 
# <-- change this to the path of the data directory, if you cloned the repo, leave it as "..", the inversion latents will be saved in AnyV2V/ 10 | model_name: "i2vgen-xl" 11 | exp_name: "${video_name}" 12 | output_dir: "${data_dir}/inversions/${model_name}/${exp_name}" 13 | 14 | # Data 15 | #image_size: [1280, 704] 16 | image_size: [512, 512] 17 | video_dir: "${data_dir}/demo" 18 | video_name: "ReplaceMe" 19 | video_path: "ReplaceMe" 20 | video_frames_path: "ReplaceMe" 21 | 22 | # DDIM settings 23 | n_frames: 16 24 | 25 | # DDIM inversion 26 | inverse_config: 27 | image_size: ${image_size} 28 | n_frames: ${n_frames} 29 | cfg: 1.0 30 | target_fps: 8 31 | prompt: "" 32 | negative_prompt: "" 33 | n_steps: 500 34 | output_dir: "${output_dir}/ddim_latents" 35 | inverse_static_video: False 36 | null_image_inversion: False 37 | 38 | # DDIM reconstruction 39 | recon_config: 40 | enable_recon: False 41 | image_size: ${image_size} 42 | n_frames: ${n_frames} 43 | cfg: 9.0 44 | target_fps: 8 45 | prompt: "" 46 | negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 47 | n_steps: 50 48 | ddim_init_latents_t_idx: 3 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 49 | ddim_latents_path: "${inverse_config.output_dir}" 50 | -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_pnp_edit/group_config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "task_name": "Prompt-Based-Editing", 5 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 6 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn man into robot.png", 7 | "editing_prompt":"a man doing exercises for the body and mind", 8 | "edited_video_name": "a robot doing exercises for the body and mind", 9 | "ddim_init_latents_t_idx": 0, 10 | "pnp_f_t": 1.0, 11 | "pnp_spatial_attn_t": 1.0, 12 | "pnp_temp_attn_t":1.0 13 | }, 14 | { 15 | "active": false, 16 | "task_name": "Prompt-Based-Editing", 17 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 18 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn his hair white.png", 19 | "editing_prompt":"a man with white hair doing exercises for the body and mind", 20 | "edited_video_name": "a man with white hair doing exercises for the body and mind", 21 | "pnp_temp_attn_t": 1.0 22 | }, 23 | { 24 | "active": false, 25 | "task_name": "Prompt-Based-Editing", 26 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 27 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/add a party hat on his head.png", 28 | "editing_prompt":"a man with a party hat doing exercises for the body and mind", 29 | "edited_video_name": "a man with a party hat doing exercises for the body and mind", 30 | "ddim_init_latents_t_idx": 0, 31 | "pnp_f_t": 0.1, 32 | "pnp_spatial_attn_t": 0.1, 33 | "pnp_temp_attn_t":1.0 34 | }, 35 | { 36 | "active": false, 37 | "task_name": "Style-Transfer", 38 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 39 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/starry night style.png", 40 | "editing_prompt":"an old man doing exercises for the body and mind, in a style of starry night", 41 | "edited_video_name": "an old 
man doing exercises for the body and mind, in a style of starry night" 42 | }, 43 | { 44 | "active": false, 45 | "task_name": "Style-Transfer", 46 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 47 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/cyberpunk style.png", 48 | "editing_prompt":"an old man doing exercises for the body and mind, in a style of cyberpunk", 49 | "edited_video_name": "an old man doing exercises for the body and mind, in a style of cyberpunk", 50 | "ddim_init_latents_t_idx": 0, 51 | "pnp_f_t": 1.0, 52 | "pnp_spatial_attn_t": 1.0, 53 | "pnp_temp_attn_t":1.0 54 | }, 55 | { 56 | "active": false, 57 | "task_name": "Identity-Manipulation", 58 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 59 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", 60 | "editing_prompt":"a man doing exercises for the body and mind", 61 | "edited_video_name": "Middle Aged Jack Ma Doing Exercises For The Body And Mind-pnp_temp_attn_t_1.0", 62 | "ddim_init_latents_t_idx": 0, 63 | "pnp_f_t": 0.8, 64 | "pnp_spatial_attn_t": 0.8, 65 | "pnp_temp_attn_t": 1.0 66 | }, 67 | { 68 | "active": false, 69 | "task_name": "Identity-Manipulation", 70 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 71 | "edited_first_frame_path":"demos/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/hinton.png", 72 | "editing_prompt":"a young man doing exercises for the body and mind", 73 | "edited_video_name": "an old man doing exercises for the body and mind-hinton_01", 74 | "ddim_init_latents_t_idx": 0, 75 | "pnp_f_t": 1.0, 76 | "pnp_spatial_attn_t": 1.0, 77 | "pnp_temp_attn_t":1.0 78 | }, 79 | { 80 | "active": false, 81 | "task_name": "Subject-Driven-Editing", 82 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 83 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/helmet.png", 84 | "editing_prompt":"a man doing exercises for the body and mind", 85 | "edited_video_name": "a robot doing exercises for the body and mind-helmet", 86 | "ddim_init_latents_t_idx": 0, 87 | "pnp_f_t": 0.2, 88 | "pnp_spatial_attn_t": 0.2, 89 | "pnp_temp_attn_t":1.0 90 | }, 91 | { 92 | "active": false, 93 | "task_name": "Prompt-Based-Editing", 94 | "video_name":"A Couple In A Public Display Of Affection", 95 | "edited_first_frame_path":"demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png", 96 | "editing_prompt":"A couple in a public display of affection, snowing", 97 | "edited_video_name": "A couple in a public display of affection, snowing", 98 | "ddim_init_latents_t_idx": 0, 99 | "pnp_f_t": 0.3, 100 | "pnp_spatial_attn_t": 0.3, 101 | "pnp_temp_attn_t":1.0 102 | }, 103 | { 104 | "active": false, 105 | "task_name": "Style-Transfer", 106 | "video_name":"Ballet", 107 | "edited_first_frame_path":"demo/Ballet/edited_first_frame/van gogh style.png", 108 | "editing_prompt":"girl dancing ballet, in the style of van gogh", 109 | "edited_video_name": "girl dancing ballet, in the style of van gogh", 110 | "ddim_init_latents_t_idx": 0, 111 | "pnp_f_t": 1.0, 112 | "pnp_spatial_attn_t": 1.0, 113 | "pnp_temp_attn_t":1.0 114 | }, 115 | { 116 | "active": false, 117 | "task_name": "Prompt-Based-Editing", 118 | "video_name":"Man Walking", 119 | "edited_first_frame_path":"demo/Man Walking/edited_first_frame/turn the man into darth vader.png", 120 | "editing_prompt":"man walking", 121 | 
"edited_video_name": "darth vader walking", 122 | "ddim_init_latents_t_idx": 0, 123 | "pnp_f_t": 0.1, 124 | "pnp_spatial_attn_t": 0.1, 125 | "pnp_temp_attn_t": 1.0 126 | }, 127 | { 128 | "active": false, 129 | "video_name":"Man Walking", 130 | "task_name": "Identity-Manipulation", 131 | "edited_first_frame_path":"demo/Man Walking/edited_first_frame/ElonMusk_02.png", 132 | "editing_prompt":"a man walking in autumn", 133 | "edited_video_name": "Elon Musk walking in autumn", 134 | "ddim_init_latents_t_idx": 0, 135 | "pnp_f_t": 0.1, 136 | "pnp_spatial_attn_t": 0.1, 137 | "pnp_temp_attn_t": 1.0 138 | }, 139 | { 140 | "active": false, 141 | "task_name": "Identity-Manipulation", 142 | "video_name":"Man Walking", 143 | "edited_first_frame_path":"demo/Man Walking/edited_first_frame/Yann LeCun Walking.png", 144 | "editing_prompt":"a man walking", 145 | "edited_video_name": "Yann LeCun walking", 146 | "ddim_init_latents_t_idx": 0, 147 | "pnp_f_t": 0.0, 148 | "pnp_spatial_attn_t": 0.0, 149 | "pnp_temp_attn_t": 1.0 150 | }, 151 | { 152 | "active": false, 153 | "task_name": "Subject-Driven-Editing", 154 | "video_name":"A kitten turning its head on a wooden floor", 155 | "edited_first_frame_path":"demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", 156 | "editing_prompt":"A dog turning its head on a wooden floor", 157 | "edited_video_name": "A dog turning its head on a wooden floor", 158 | "ddim_init_latents_t_idx": 0, 159 | "pnp_f_t": 0.2, 160 | "pnp_spatial_attn_t": 0.2, 161 | "pnp_temp_attn_t":0.5 162 | }, 163 | { 164 | "active": false, 165 | "task_name": "Your-Task from the list[Prompt-Based-Editing, Style-Transfer, Identity-Manipulation, Subject-Driven-Editing]", 166 | "video_name":"Your-Video-Name", 167 | "edited_first_frame_path":"demo/Your-Video-Name/edited_first_frame/Your-Edited-First-Frame.png", 168 | "editing_prompt":"Your-Editing-Prompt", 169 | "edited_video_name": "Your-Edited-Video-Name", 170 | "ddim_init_latents_t_idx": 0, 171 | "pnp_f_t": 0, 172 | "pnp_spatial_attn_t": 0, 173 | "pnp_temp_attn_t":0 174 | } 175 | ] -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_pnp_edit/template.yaml: -------------------------------------------------------------------------------- 1 | # "ReplaceMe" will be overwritten by the values in group_config.json 2 | 3 | # General 4 | seed: 8888 5 | device: "cuda:4" # <-- change this to the GPU you want to use 6 | debug: False # For logging DEBUG level messages otherwise INFO 7 | 8 | # Dir 9 | data_dir: ".." 
# <-- change this to the path of the data directory, if you cloned the repo, leave it as "..", the inversion latents will be saved in AnyV2V/ 10 | model_name: "i2vgen-xl" 11 | task_name: "Prompt-Based-Editing" 12 | edited_video_name: "ReplaceMe" 13 | output_dir: "${data_dir}/Results/${task_name}/${model_name}/${video_name}/${edited_video_name}/" 14 | 15 | # Data 16 | image_size: [512, 512] 17 | video_dir: "${data_dir}/demo" 18 | video_name: "ReplaceMe" 19 | video_path: "ReplaceMe" 20 | video_frames_path: "ReplaceMe" 21 | edited_first_frame_path: "ReplaceMe" 22 | ddim_latents_path: "${data_dir}/inversions/${model_name}/${video_name}/ddim_latents" # Same as inverse_config.output_dir 23 | 24 | # Pnp Editing 25 | n_frames: 16 26 | cfg: 9.0 27 | target_fps: 8 28 | editing_prompt: "ReplaceMe" 29 | editing_negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 30 | n_steps: 50 31 | ddim_init_latents_t_idx: 1 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 32 | ddim_inv_prompt: "" 33 | random_ratio: 0.0 34 | 35 | # Pnp config 36 | pnp_f_t: 0.2 37 | pnp_spatial_attn_t: 0.2 38 | pnp_temp_attn_t: 0.5 -------------------------------------------------------------------------------- /i2vgen-xl/environment.yml: -------------------------------------------------------------------------------- 1 | name: anyv2v-i2vgen-xl 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - python=3.9 8 | - pytorch 9 | - torchvision 10 | - torchaudio 11 | - pytorch-cuda=11.8 12 | - pip 13 | - pip: 14 | - accelerate 15 | - diffusers==0.26.3 16 | - transformers 17 | - omegaconf 18 | - opencv-python 19 | - ipython 20 | - moviepy 21 | - notebook 22 | -------------------------------------------------------------------------------- /i2vgen-xl/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/i2vgen-xl/pipelines/__init__.py -------------------------------------------------------------------------------- /i2vgen-xl/run_group_ddim_inversion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import argparse 6 | import logging 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | import json 10 | 11 | # HF imports 12 | from diffusers import ( 13 | DDIMInverseScheduler, 14 | DDIMScheduler, 15 | ) 16 | from diffusers.utils import load_image, export_to_video, export_to_gif 17 | 18 | # Project imports 19 | from utils import ( 20 | seed_everything, 21 | load_video_frames, 22 | convert_video_to_frames, 23 | load_ddim_latents_at_T, 24 | load_ddim_latents_at_t, 25 | ) 26 | from pipelines.pipeline_i2vgen_xl import I2VGenXLPipeline 27 | 28 | 29 | def ddim_inversion(config, first_frame, frame_list, pipe: I2VGenXLPipeline, inverse_scheduler, g): 30 | pipe.scheduler = inverse_scheduler 31 | video_latents_at_0 = pipe.encode_vae_video( 32 | frame_list, 33 | device=pipe._execution_device, 34 | height=config.image_size[1], 35 | width=config.image_size[0], 36 | ) 37 | ddim_latents = pipe.invert( 38 | prompt=config.prompt, 39 | image=first_frame, 40 | height=config.image_size[1], 41 | width=config.image_size[0], 42 | num_frames=config.n_frames, 43 | num_inference_steps=config.n_steps, 44 | guidance_scale=config.cfg, 45 | 
negative_prompt=config.negative_prompt, 46 | target_fps=config.target_fps, 47 | latents=video_latents_at_0, 48 | generator=g, # TODO: this is not correct 49 | return_dict=False, 50 | output_dir=config.output_dir, 51 | ) # [b, num_inference_steps, c, num_frames, h, w] 52 | logger = logging.getLogger(__name__) 53 | logger.debug(f"ddim_latents.shape: {ddim_latents.shape}") 54 | ddim_latents = ddim_latents[0] # [num_inference_steps, c, num_frames, h, w] 55 | return ddim_latents 56 | 57 | 58 | def ddim_sampling( 59 | config, first_frame, ddim_latents_at_T, pipe: I2VGenXLPipeline, ddim_scheduler, ddim_init_latents_t_idx, g 60 | ): 61 | pipe.scheduler = ddim_scheduler 62 | reconstructed_video = pipe( 63 | prompt=config.prompt, 64 | image=first_frame, 65 | height=config.image_size[1], 66 | width=config.image_size[0], 67 | num_frames=config.n_frames, 68 | num_inference_steps=config.n_steps, 69 | guidance_scale=config.cfg, 70 | negative_prompt=config.negative_prompt, 71 | target_fps=config.target_fps, 72 | latents=ddim_latents_at_T, 73 | generator=g, # TODO: this is not correct 74 | return_dict=True, 75 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 76 | ).frames[0] 77 | return reconstructed_video 78 | 79 | 80 | def main(template_config, configs_list): 81 | # Initialize the pipeline 82 | pipe = I2VGenXLPipeline.from_pretrained( 83 | "ali-vilab/i2vgen-xl", 84 | torch_dtype=torch.float16, 85 | variant="fp16", 86 | ) 87 | pipe.to(device) 88 | g = torch.Generator(device=device) 89 | g = g.manual_seed(template_config.seed) 90 | 91 | # Initialize the DDIM inverse scheduler 92 | inverse_scheduler = DDIMInverseScheduler.from_pretrained( 93 | "ali-vilab/i2vgen-xl", 94 | subfolder="scheduler", 95 | ) 96 | # Initialize the DDIM scheduler 97 | ddim_scheduler = DDIMScheduler.from_pretrained( 98 | "ali-vilab/i2vgen-xl", 99 | subfolder="scheduler", 100 | ) 101 | 102 | video_dir = template_config.video_dir 103 | assert os.path.exists(video_dir), f"video_dir: {video_dir} does not exist" 104 | # loop through the video_dir and process every mp4 file 105 | for config_entry in configs_list: 106 | if config_entry["active"] == False: 107 | logger.info(f"Skipping config_entry: {config_entry}") 108 | continue 109 | logger.info(f"Processing config_entry: {config_entry}") 110 | 111 | # Override the config with the data_meta_entry 112 | config = OmegaConf.merge(template_config, OmegaConf.create(config_entry)) 113 | 114 | config.video_path = os.path.join(config.video_dir, config.video_name + ".mp4") 115 | config.video_frames_path = os.path.join(config.video_dir, config.video_name) 116 | 117 | # If already computed the latents, skip 118 | if os.path.exists(config.output_dir) and not config.force_recompute_latents: 119 | logger.info(f"### Skipping !!! {config.output_dir} already exists. 
") 120 | continue 121 | 122 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 123 | 124 | # This is the same as run_ddim_inversion.py 125 | try: 126 | logger.info(f"Loading frames from: {config.video_frames_path}") 127 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames, config.image_size) 128 | except: 129 | logger.error(f"Failed to load frames from: {config.video_frames_path}") 130 | logger.info(f"Converting mp4 video to frames: {config.video_path}") 131 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=True) 132 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 133 | logger.debug(f"len(frame_list): {len(frame_list)}") 134 | # Save the source frames as GIF 135 | export_to_gif( 136 | frame_list, 137 | os.path.join(config.video_frames_path, config.video_name + ".gif") 138 | ) 139 | logger.info(f"Saved source video as gif to {config.video_frames_path}") 140 | first_frame = frame_list[0] # Is a PIL image 141 | 142 | # Produce static video 143 | if config.inverse_config.inverse_static_video: 144 | logger.info("### Inverse a static video!") 145 | frame_list = [frame_list[0]] * config.n_frames 146 | 147 | # Null image inversion 148 | if config.inverse_config.null_image_inversion: 149 | logger.info("### Inverse a null image!") 150 | first_frame = Image.new("RGB", (config.image_size[0], config.image_size[1]), (0, 0, 0)) 151 | 152 | # Main pipeline 153 | # Inversion 154 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 155 | _ddim_latents = ddim_inversion(config.inverse_config, first_frame, frame_list, pipe, inverse_scheduler, g) 156 | 157 | # Reconstruction 158 | recon_config = config.recon_config 159 | if recon_config.enable_recon: 160 | ddim_init_latents_t_idx = recon_config.ddim_init_latents_t_idx 161 | ddim_scheduler.set_timesteps(recon_config.n_steps) 162 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 163 | ddim_latents_path = recon_config.ddim_latents_path 164 | ddim_latents_at_t = load_ddim_latents_at_t( 165 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], 166 | ddim_latents_path=ddim_latents_path, 167 | ) 168 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 169 | reconstructed_video = ddim_sampling( 170 | recon_config, 171 | first_frame, 172 | ddim_latents_at_t, 173 | pipe, 174 | ddim_scheduler, 175 | ddim_init_latents_t_idx, 176 | g, 177 | ) 178 | 179 | # Save the reconstructed video 180 | os.makedirs(config.output_dir, exist_ok=True) 181 | # Downsampling the video for space saving 182 | reconstructed_video = [frame.resize((512, 512), resample=Image.LANCZOS) for frame in reconstructed_video] 183 | export_to_video( 184 | reconstructed_video, 185 | os.path.join(config.output_dir, "ddim_reconstruction.mp4"), 186 | fps=10, 187 | ) 188 | export_to_gif( 189 | reconstructed_video, 190 | os.path.join(config.output_dir, "ddim_reconstruction.gif"), 191 | ) 192 | logger.info(f"Saved reconstructed video to {config.output_dir}") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = argparse.ArgumentParser() 197 | parser.add_argument("--template_config", type=str, default="./configs/group_ddim_inversion/template.yaml") 198 | parser.add_argument("--configs_json", type=str, default="./configs/group_config.json") # This is going to override the template_config 199 | 200 | args = parser.parse_args() 201 | template_config = OmegaConf.load(args.template_config) 202 | 203 | # Set up logging 204 | logging_level = logging.DEBUG if 
template_config.debug else logging.INFO 205 | logging.basicConfig(level=logging_level, format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s") 206 | logger = logging.getLogger(__name__) 207 | logger.info(f"template_config: {OmegaConf.to_yaml(template_config)}") 208 | 209 | # Load data jsonl into list 210 | configs_json = args.configs_json 211 | assert Path(configs_json).exists() 212 | with open(configs_json, 'r') as file: 213 | configs_list = json.load(file) 214 | logger.info(f"Loaded {len(configs_list)} configs from {configs_json}") 215 | 216 | # Set up device and seed 217 | device = torch.device(template_config.device) 218 | torch.set_grad_enabled(False) 219 | seed_everything(template_config.seed) 220 | main(template_config, configs_list) 221 | -------------------------------------------------------------------------------- /i2vgen-xl/run_group_pnp_edit.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import argparse 6 | import logging 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | import json 10 | 11 | # HF imports 12 | from diffusers import ( 13 | DDIMInverseScheduler, 14 | DDIMScheduler, 15 | ) 16 | from diffusers.utils import load_image, export_to_video, export_to_gif 17 | 18 | # Project imports 19 | from utils import ( 20 | seed_everything, 21 | load_video_frames, 22 | convert_video_to_frames, 23 | load_ddim_latents_at_T, 24 | load_ddim_latents_at_t, 25 | ) 26 | from pipelines.pipeline_i2vgen_xl import I2VGenXLPipeline 27 | from pnp_utils import ( 28 | register_time, 29 | register_conv_injection, 30 | register_spatial_attention_pnp, 31 | register_temp_attention_pnp, 32 | ) 33 | 34 | 35 | def init_pnp(pipe, scheduler, config): 36 | conv_injection_t = int(config.n_steps * config.pnp_f_t) 37 | spatial_attn_qk_injection_t = int(config.n_steps * config.pnp_spatial_attn_t) 38 | temp_attn_qk_injection_t = int(config.n_steps * config.pnp_temp_attn_t) 39 | conv_injection_timesteps = scheduler.timesteps[:conv_injection_t] if conv_injection_t >= 0 else [] 40 | spatial_attn_qk_injection_timesteps = ( 41 | scheduler.timesteps[:spatial_attn_qk_injection_t] if spatial_attn_qk_injection_t >= 0 else [] 42 | ) 43 | temp_attn_qk_injection_timesteps = ( 44 | scheduler.timesteps[:temp_attn_qk_injection_t] if temp_attn_qk_injection_t >= 0 else [] 45 | ) 46 | register_conv_injection(pipe, conv_injection_timesteps) 47 | register_spatial_attention_pnp(pipe, spatial_attn_qk_injection_timesteps) 48 | register_temp_attention_pnp(pipe, temp_attn_qk_injection_timesteps) 49 | 50 | logger = logging.getLogger(__name__) 51 | logger.debug(f"conv_injection_t: {conv_injection_t}") 52 | logger.debug(f"spatial_attn_qk_injection_t: {spatial_attn_qk_injection_t}") 53 | logger.debug(f"temp_attn_qk_injection_t: {temp_attn_qk_injection_t}") 54 | logger.debug(f"conv_injection_timesteps: {conv_injection_timesteps}") 55 | logger.debug(f"spatial_attn_qk_injection_timesteps: {spatial_attn_qk_injection_timesteps}") 56 | logger.debug(f"temp_attn_qk_injection_timesteps: {temp_attn_qk_injection_timesteps}") 57 | 58 | 59 | def main(template_config, configs_list): 60 | # Initialize the pipeline 61 | pipe = I2VGenXLPipeline.from_pretrained( 62 | "ali-vilab/i2vgen-xl", 63 | torch_dtype=torch.float16, 64 | variant="fp16", 65 | ) 66 | pipe.to(device) 67 | 68 | # Initialize the DDIM scheduler 69 | ddim_scheduler = DDIMScheduler.from_pretrained( 70 | "ali-vilab/i2vgen-xl", 71 | 
subfolder="scheduler", 72 | ) 73 | 74 | for config_entry in configs_list: 75 | if config_entry["active"] == False: 76 | logger.info(f"Skipping config_entry: {config_entry}") 77 | continue 78 | logger.info(f"Processing config_entry: {config_entry}") 79 | 80 | # Override the config with the data_meta_entry 81 | config = OmegaConf.merge(template_config, OmegaConf.create(config_entry)) 82 | 83 | # Update the related paths to absolute paths 84 | config.video_path = os.path.join(config.video_dir, config.video_name + ".mp4") 85 | config.video_frames_path = os.path.join(config.video_dir, config.video_name) 86 | config.edited_first_frame_path = os.path.join(config.data_dir, config.edited_first_frame_path) 87 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 88 | 89 | # Check if there are fields contain "ReplaceMe" 90 | for k, v in config.items(): 91 | if "ReplaceMe" in str(v): 92 | logger.error(f"Field {k} contains 'ReplaceMe'") 93 | continue 94 | 95 | # This is the same as run_pnp_edit.py 96 | # Load first frame and source frames 97 | try: 98 | logger.info(f"Loading frames from: {config.video_frames_path}") 99 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames, config.image_size) 100 | except: 101 | logger.error(f"Failed to load frames from: {config.video_frames_path}") 102 | logger.info(f"Converting mp4 video to frames: {config.video_path}") 103 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=True) 104 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 105 | logger.debug(f"len(frame_list): {len(frame_list)}") 106 | src_frame_list = frame_list 107 | src_1st_frame = src_frame_list[0] # Is a PIL image 108 | 109 | # Load the edited first frame 110 | edited_1st_frame = load_image(config.edited_first_frame_path) 111 | edited_1st_frame = edited_1st_frame.resize(config.image_size, resample=Image.Resampling.LANCZOS) 112 | 113 | # Load the initial latents at t 114 | ddim_init_latents_t_idx = config.ddim_init_latents_t_idx 115 | ddim_scheduler.set_timesteps(config.n_steps) 116 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 117 | ddim_latents_at_t = load_ddim_latents_at_t( 118 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], ddim_latents_path=config.ddim_latents_path 119 | ) 120 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 121 | logger.debug(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}") 122 | 123 | # Blend the latents 124 | random_latents = torch.randn_like(ddim_latents_at_t) 125 | logger.info(f"Blending random_ratio (1 means random latent): {config.random_ratio}") 126 | mixed_latents = random_latents * config.random_ratio + ddim_latents_at_t * (1 - config.random_ratio) 127 | 128 | # Init Pnp 129 | init_pnp(pipe, ddim_scheduler, config) 130 | 131 | # Edit video 132 | pipe.register_modules(scheduler=ddim_scheduler) 133 | edited_video = pipe.sample_with_pnp( 134 | prompt=config.editing_prompt, 135 | image=edited_1st_frame, 136 | height=config.image_size[1], 137 | width=config.image_size[0], 138 | num_frames=config.n_frames, 139 | num_inference_steps=config.n_steps, 140 | guidance_scale=config.cfg, 141 | negative_prompt=config.editing_negative_prompt, 142 | target_fps=config.target_fps, 143 | latents=mixed_latents, 144 | generator=torch.manual_seed(config.seed), 145 | return_dict=True, 146 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 147 | ddim_inv_latents_path=config.ddim_latents_path, 148 | 
ddim_inv_prompt=config.ddim_inv_prompt, 149 | ddim_inv_1st_frame=src_1st_frame, 150 | ).frames[0] 151 | 152 | # Save video 153 | # Add the config to the output_dir, TODO: make this more elegant 154 | config_suffix = ( 155 | "ddim_init_latents_t_idx_" 156 | + str(ddim_init_latents_t_idx) 157 | + "_nsteps_" 158 | + str(config.n_steps) 159 | + "_cfg_" 160 | + str(config.cfg) 161 | + "_pnpf" 162 | + str(config.pnp_f_t) 163 | + "_pnps" 164 | + str(config.pnp_spatial_attn_t) 165 | + "_pnpt" 166 | + str(config.pnp_temp_attn_t) 167 | ) 168 | output_dir = os.path.join(config.output_dir, config_suffix) 169 | os.makedirs(output_dir, exist_ok=True) 170 | edited_video = [frame.resize(config.image_size, resample=Image.LANCZOS) for frame in edited_video] 171 | # Downsampling the video for space saving 172 | # edited_video = [frame.resize((512, 512), resample=Image.LANCZOS) for frame in edited_video] 173 | # if config.pnp_f_t == 0.0 and config.pnp_spatial_attn_t == 0.0 and config.pnp_temp_attn_t == 0.0: 174 | # edited_video_file_name = "ddim_edit" 175 | # else: 176 | # edited_video_file_name = "pnp_edit" 177 | edited_video_file_name = "video" 178 | export_to_video(edited_video, os.path.join(output_dir, f"{edited_video_file_name}.mp4"), fps=config.target_fps) 179 | export_to_gif(edited_video, os.path.join(output_dir, f"{edited_video_file_name}.gif")) 180 | logger.info(f"Saved video to: {os.path.join(output_dir, f'{edited_video_file_name}.mp4')}") 181 | logger.info(f"Saved gif to: {os.path.join(output_dir, f'{edited_video_file_name}.gif')}") 182 | for i, frame in enumerate(edited_video): 183 | frame.save(os.path.join(output_dir, f"{edited_video_file_name}_{i:05d}.png")) 184 | logger.info(f"Saved frames to: {os.path.join(output_dir, f'{edited_video_file_name}_{i:05d}.png')}") 185 | 186 | 187 | if __name__ == "__main__": 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument("--template_config", type=str, default="./configs/group_pnp_edit/template.yaml") 190 | parser.add_argument( 191 | "--configs_json", type=str, default="./configs/group_config.json" 192 | ) # This is going to override the template_config 193 | 194 | args = parser.parse_args() 195 | template_config = OmegaConf.load(args.template_config) 196 | 197 | # Set up logging 198 | logging_level = logging.DEBUG if template_config.debug else logging.INFO 199 | logging.basicConfig(level=logging_level, format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s") 200 | logger = logging.getLogger(__name__) 201 | logger.info(f"template_config: {OmegaConf.to_yaml(template_config)}") 202 | 203 | # Load data jsonl into list 204 | configs_json = args.configs_json 205 | assert Path(configs_json).exists() 206 | with open(configs_json, "r") as file: 207 | configs_list = json.load(file) 208 | logger.info(f"Loaded {len(configs_list)} configs from {configs_json}") 209 | 210 | # Set up device and seed 211 | device = torch.device(template_config.device) 212 | torch.set_grad_enabled(False) 213 | seed_everything(template_config.seed) 214 | main(template_config, configs_list) 215 | -------------------------------------------------------------------------------- /i2vgen-xl/scripts/run_group_ddim_inversion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source /home/YourName/miniconda3/etc/profile.d/conda.sh #<-- change this to your own miniconda path 3 | conda activate anyv2v-i2vgen-xl 4 | 5 | cd .. 
6 | python run_group_ddim_inversion.py \ 7 | --template_config "configs/group_ddim_inversion/template.yaml" \ 8 | --configs_json "configs/group_ddim_inversion/group_config.json" -------------------------------------------------------------------------------- /i2vgen-xl/scripts/run_group_pnp_edit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source /home/YourName/miniconda3/etc/profile.d/conda.sh #<-- change this to your own miniconda path 3 | conda activate anyv2v-i2vgen-xl 4 | 5 | cd .. 6 | python run_group_pnp_edit.py \ 7 | --template_config "configs/group_pnp_edit/template.yaml" \ 8 | --configs_json "configs/group_pnp_edit/group_config.json" -------------------------------------------------------------------------------- /i2vgen-xl/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | from torchvision.io import read_video 6 | import torchvision.transforms as T 7 | from pathlib import Path 8 | from PIL import Image 9 | from diffusers.utils import load_image 10 | import glob 11 | 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def seed_everything(seed): 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | torch.cuda.manual_seed_all(seed) 21 | random.seed(seed) 22 | np.random.seed(seed) 23 | 24 | 25 | def load_ddim_latents_at_t(t, ddim_latents_path): 26 | ddim_latents_at_t_path = os.path.join(ddim_latents_path, f"ddim_latents_{t}.pt") 27 | assert os.path.exists(ddim_latents_at_t_path), f"Missing latents at t {t} path {ddim_latents_at_t_path}" 28 | ddim_latents_at_t = torch.load(ddim_latents_at_t_path) 29 | logger.debug(f"Loaded ddim_latents_at_t from {ddim_latents_at_t_path}") 30 | return ddim_latents_at_t 31 | 32 | 33 | def load_ddim_latents_at_T(ddim_latents_path): 34 | noisest = max( 35 | [int(x.split("_")[-1].split(".")[0]) for x in glob.glob(os.path.join(ddim_latents_path, f"ddim_latents_*.pt"))] 36 | ) 37 | ddim_latents_at_T_path = os.path.join(ddim_latents_path, f"ddim_latents_{noisest}.pt") 38 | ddim_latents_at_T = torch.load(ddim_latents_at_T_path) # [b, c, f, h, w] [1, 4, 16, 40, 64] 39 | return ddim_latents_at_T 40 | 41 | 42 | # Modified from tokenflow/utils.py 43 | def convert_video_to_frames(video_path, img_size=(512, 512), save_frames=True): 44 | video, _, _ = read_video(video_path, output_format="TCHW") 45 | # rotate video -90 degree if video is .mov format. 
this is a weird bug in torchvision 46 | if video_path.endswith(".mov"): 47 | video = T.functional.rotate(video, -90) 48 | if save_frames: 49 | video_name = Path(video_path).stem 50 | video_dir = Path(video_path).parent 51 | os.makedirs(f"{video_dir}/{video_name}", exist_ok=True) 52 | frames = [] 53 | for i in range(len(video)): 54 | ind = str(i).zfill(5) 55 | image = T.ToPILImage()(video[i]) 56 | logger.info(f"Original video frame size: {image.size}") 57 | if image.size != img_size: 58 | image_resized = image.resize(img_size, resample=Image.Resampling.LANCZOS) 59 | logger.info(f"Resized video frame, height, width: {image_resized.size}, {img_size[1]}, {img_size[0]}") 60 | else: 61 | image_resized = image 62 | if save_frames: 63 | image_resized.save(f"{video_dir}/{video_name}/{ind}.png") 64 | print(f"Saved frame {video_dir}/{video_name}/{ind}.png") 65 | frames.append(image_resized) 66 | return frames 67 | 68 | 69 | # Modified from tokenflow/utils.py 70 | def load_video_frames(frames_path, n_frames, image_size=(512, 512)): 71 | # Load paths 72 | paths = [f"{frames_path}/%05d.png" % i for i in range(n_frames)] 73 | frames = [load_image(p) for p in paths] 74 | # Check if the frames are the right size 75 | for f in frames: 76 | if f.size != image_size: 77 | logger.error(f"Frame size {f.size} does not match config.image_size {image_size}") 78 | raise ValueError(f"Frame size {f.size} does not match config.image_size {image_size}") 79 | return paths, frames 80 | 81 | -------------------------------------------------------------------------------- /prepare_video.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from moviepy.editor import VideoFileClip 3 | import os 4 | import glob 5 | import random 6 | import numpy as np 7 | from PIL import Image 8 | 9 | def extract_frames(video_path, frame_count=16): 10 | clip = VideoFileClip(video_path) 11 | duration = clip.duration 12 | frames = [] 13 | 14 | # Calculate the time interval at which to extract frames 15 | times = np.linspace(0, duration, frame_count, endpoint=False) 16 | 17 | for t in times: 18 | # Extract the frame at the specific timestamp 19 | frame = clip.get_frame(t) 20 | # Convert the frame (numpy array) to a PIL Image 21 | pil_img = Image.fromarray(frame) 22 | frames.append(pil_img) 23 | 24 | return frames 25 | 26 | def crop_and_resize_video(input_video_path, output_folder, clip_duration, width=None, height=None, start_time=None, end_time=None, n_frames=16, center_crop=False, x_offset=0, y_offset=0, longest_to_width=False, use_full_clip=False): # Load the video file 27 | video = VideoFileClip(input_video_path) 28 | 29 | if use_full_clip: 30 | cropped_video = video 31 | else: 32 | # Calculate start and end times for cropping 33 | if start_time is not None: 34 | start_time = float(start_time) 35 | end_time = start_time + clip_duration 36 | elif end_time is not None: 37 | end_time = float(end_time) 38 | start_time = end_time - clip_duration 39 | else: 40 | # Default to random cropping if neither start nor end time is specified 41 | video_duration = video.duration 42 | if video_duration <= clip_duration: 43 | print(f"Skipping {input_video_path}: duration is less than or equal to the clip duration.") 44 | return 45 | max_start_time = video_duration - clip_duration 46 | start_time = random.uniform(0, max_start_time) 47 | end_time = start_time + clip_duration 48 | cropped_video = video.subclip(start_time, end_time) 49 | 50 | if center_crop: 51 | # Calculate scale to ensure the desired crop 
size fits within the video 52 | video_width, video_height = cropped_video.size 53 | scale_width = video_width / width 54 | scale_height = video_height / height 55 | if longest_to_width: 56 | scale = max(scale_width, scale_height) 57 | else: 58 | scale = min(scale_width, scale_height) 59 | 60 | # Resize video to ensure the crop area fits within the frame 61 | # This step ensures that the smallest dimension matches or exceeds 512 pixels 62 | new_width = int(video_width / scale) 63 | new_height = int(video_height / scale) 64 | resized_video = cropped_video.resize(newsize=(new_width, new_height)) 65 | print(f"Resized video to ({new_width}, {new_height})") 66 | 67 | # Calculate crop position with offset, ensuring the crop does not go out of bounds 68 | # The offset calculation needs to ensure that the cropping area remains within the video frame 69 | offset_x = int(((x_offset + 1) / 2) * (new_width - width)) # Adjusted for [-1, 1] scale 70 | offset_y = int(((y_offset + 1) / 2) * (new_height - height)) # Adjusted for [-1, 1] scale 71 | 72 | # Ensure offsets do not push the crop area out of the video frame 73 | offset_x = max(0, min(new_width - width, offset_x)) 74 | offset_y = max(0, min(new_height - height, offset_y)) 75 | 76 | # Apply center crop with offsets 77 | cropped_video = resized_video.crop(x1=offset_x, y1=offset_y, width=width, height=height) 78 | elif width and height: 79 | # Directly resize the video to specified width and height if no center crop is specified 80 | cropped_video = cropped_video.resize(newsize=(width, height)) 81 | 82 | 83 | # After resizing and cropping, set the frame rate to fps 84 | fps = n_frames // clip_duration 85 | final_video = cropped_video.set_fps(fps) 86 | 87 | # Prepare the output video path 88 | if not os.path.exists(output_folder): 89 | os.makedirs(output_folder) 90 | filename = os.path.basename(input_video_path) 91 | output_video_path = os.path.join(output_folder, filename) 92 | 93 | # Write the result to the output file 94 | final_video.write_videofile(output_video_path, codec='libx264', audio_codec='aac', fps=fps) 95 | print(f"Processed {input_video_path}, saved to {output_video_path}") 96 | return output_video_path 97 | 98 | def process_videos(input_folder, output_base_folder, clip_duration, width=None, height=None, start_time=None, end_time=None, n_frames=16, center_crop=False, x_offset=0, y_offset=0, longest_to_width=False, use_full_clip=False): 99 | video_files = glob.glob(os.path.join(input_folder, '*.mp4')) # Adjust the pattern if needed 100 | if video_files == []: 101 | print(f"No video files found in {input_folder}") 102 | return 103 | 104 | for video_file in video_files: 105 | crop_and_resize_video(video_file, output_base_folder, clip_duration, width, height, start_time, end_time, n_frames, center_crop, x_offset, y_offset, longest_to_width, use_full_clip) 106 | return 107 | 108 | def main(): 109 | parser = argparse.ArgumentParser(description='Crop and resize video segments.') 110 | parser.add_argument('--input_folder', type=str, help='Path to the input folder containing video files') 111 | parser.add_argument('--video_path', type=str, default=None, required=False, help='Path to the input video file') 112 | parser.add_argument('--output_folder', type=str, default="processed_video_data", help='Path to the folder for the output videos') 113 | parser.add_argument('--clip_duration', type=int, default=2, required=False, help='Duration of the video clips in seconds') 114 | parser.add_argument('--width', type=int, default=512, help='Width of the 
output video (optional)') 115 | parser.add_argument('--height', type=int, default=512, help='Height of the output video (optional)') 116 | parser.add_argument('--start_time', type=float, help='Start time for cropping (optional)') 117 | parser.add_argument('--end_time', type=float, help='End time for cropping (optional)') 118 | parser.add_argument('--n_frames', type=int, default=16, help='Number of frames to extract from each video') 119 | parser.add_argument('--center_crop', action='store_true', help='Center crop the video') 120 | parser.add_argument('--x_offset', type=float, default=0, required=False, help='Horizontal offset for center cropping, range -1 to 1 (optional)') 121 | parser.add_argument('--y_offset', type=float, default=0, required=False, help='Vertical offset for center cropping, range -1 to 1 (optional)') 122 | parser.add_argument('--longest_to_width', action='store_true', help='Resize the longest dimension to the specified width') 123 | parser.add_argument('--use_full_clip', action='store_true', help='Use the full video clip without trimming') 124 | args = parser.parse_args() 125 | 126 | if args.start_time and args.end_time: 127 | print("Please specify only one of start_time or end_time, not both.") 128 | return 129 | 130 | if args.video_path: 131 | crop_and_resize_video(args.video_path, 132 | args.output_folder, 133 | args.clip_duration, 134 | args.width, args.height, 135 | args.start_time, args.end_time, 136 | args.n_frames, 137 | args.center_crop, args.x_offset, args.y_offset, args.longest_to_width, 138 | args.use_full_clip) 139 | else: 140 | process_videos(args.input_folder, 141 | args.output_folder, 142 | args.clip_duration, 143 | args.width, args.height, 144 | args.start_time, args.end_time, 145 | args.n_frames, 146 | args.center_crop, args.x_offset, args.y_offset, args.longest_to_width, 147 | args.use_full_clip) 148 | 149 | if __name__ == "__main__": 150 | main() 151 | -------------------------------------------------------------------------------- /seine/README.md: -------------------------------------------------------------------------------- 1 | # AnyV2V(_SEINE_) 2 | 3 | Our AnyV2V(_SEINE_) is a standalone version. 4 | 5 | ## Setup for SEINE 6 | 7 | ### Prepare Environment 8 | ``` 9 | conda create -n seine python==3.9.16 10 | conda activate seine 11 | pip install -r requirement.txt 12 | ``` 13 | 14 | ### Download SEINE model and T2I base model 15 | 16 | The SEINE model is based on Stable Diffusion v1.4. Download [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) into the ```pretrained``` directory. 17 | 18 | Download the SEINE model checkpoint (from [Google Drive](https://drive.google.com/drive/folders/1cWfeDzKJhpb0m6HA5DoMOH0_ItuUY95b?usp=sharing) or [Hugging Face](https://huggingface.co/xinyuanc91/SEINE/tree/main)) and save it to the ```pretrained``` directory as well. 19 | 20 | 21 | Now under `./pretrained`, you should be able to see the following: 22 | ``` 23 | ├── pretrained 24 | │ ├── seine.pt 25 | │ ├── stable-diffusion-v1-4 26 | │ │ ├── ... 27 | └── └── ├── ... 28 | ├── ... 29 | ``` 30 | 31 | ## AnyV2V 32 | 33 | ### Configure paths for SEINE models 34 | 35 | Edit the model paths in both yaml files: 36 | * `./configs/ddim_inversion.yaml` 37 | * `./configs/pnp_edit.yaml` 38 | 39 | ```yaml 40 | # Model 41 | model_name: "seine" 42 | sd_path: "/stable-diffusion-v1-4" 43 | ckpt_path: "/SEINE/seine.pt" 44 | model_key: "/stable-diffusion-v1-4" 45 | ``` 46 | 47 | Theoretically, `` should equal `./pretrained`.
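For reference, if you keep the default `./pretrained` layout above, the filled-in paths should match the defaults that already ship in `./configs/ddim_inversion.yaml` and `./configs/pnp_edit.yaml` (adjust `ckpt_path` to wherever you actually saved `seine.pt`):

```yaml
# Model
model_name: "seine"
sd_path: "./pretrained/stable-diffusion-v1-4"
ckpt_path: "./pretrained/SEINE/seine.pt"
model_key: "./pretrained/stable-diffusion-v1-4"
```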
48 | 49 | 50 | ### Run SEINE DDIM Inversion to get the initial latent 51 | ```shell 52 | usage: run_ddim_inversion.py [-h] [--config CONFIG] [--video_path VIDEO_PATH] [--gpu GPU] 53 | [--width WIDTH] [--height HEIGHT] 54 | 55 | options: 56 | -h, --help show this help message and exit 57 | --config CONFIG 58 | --video_path VIDEO_PATH 59 | Path to the video to invert. 60 | --gpu GPU GPU number to use. 61 | --width WIDTH 62 | --height HEIGHT 63 | ``` 64 | 65 | Usage Example: 66 | ```shell 67 | python run_ddim_inversion.py --gpu 0 --video_path "../demo/Man Walking.mp4" --width 512 --height 512 68 | ``` 69 | 70 | The saved latents go to `./ddim-inversion` (this can be configured in `./configs/ddim_inversion.yaml`). 71 | 72 | ### Run AnyV2V with SEINE 73 | 74 | You need to prepare the edited first frame before running this step. We provide an image editing script in the root folder of AnyV2V. 75 | 76 | ```shell 77 | python run_pnp_edit.py --config ./configs/pnp_edit.yaml \ 78 | src_video_path="your_video.mp4" \ 79 | edited_first_frame_path="your edited first frame image.png" \ 80 | prompt="your prompt" \ 81 | device="cuda:0" 82 | ``` 83 | 84 | Usage Example: 85 | ```shell 86 | python run_pnp_edit.py --config ./configs/pnp_edit.yaml \ 87 | src_video_path="../demo/Man Walking.mp4" \ 88 | edited_first_frame_path="../demo/Man Walking/edited_first_frame/turn the man into darth vader.png" \ 89 | prompt="Darth Vader Walking" 90 | ``` 91 | 92 | The output video goes to `./anyv2v_results` (this can be configured in `./configs/pnp_edit.yaml`). 93 | -------------------------------------------------------------------------------- /seine/configs/ddim_inversion.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 1 3 | device: "cuda:0" 4 | debug: True # For logging 5 | 6 | # Dir 7 | exp_name: "default" 8 | output_dir: "ddim-inversion/${exp_name}" 9 | 10 | # Data 11 | data_dir: "/data" 12 | src_video_path: "${data_dir}/woman-running.mp4" #Override it with video_path 13 | image_size: [512, 512] 14 | save_video_frames: False 15 | 16 | # Model 17 | model_name: "seine" 18 | sd_path: "./pretrained/stable-diffusion-v1-4" 19 | ckpt_path: "./pretrained/SEINE/seine.pt" 20 | model_key: "./pretrained/stable-diffusion-v1-4" 21 | enable_xformers_memory_efficient_attention: True 22 | use_fp16: True 23 | 24 | # Scheduler 25 | beta_start: 0.0001 26 | beta_end: 0.02 27 | beta_schedule: "linear" 28 | 29 | # DDIM inversion 30 | n_steps: 500 31 | n_save_steps: 250 32 | n_frame_to_invert: 16 33 | inversion_prompt: "" 34 | batch_size: 1 # TODO: batchsize is always 1 for inversion, we can remove this -------------------------------------------------------------------------------- /seine/configs/pnp_edit.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 1 3 | device: "cuda:0" 4 | debug: False # For logging 5 | 6 | # Dir 7 | exp_name: "default" 8 | output_dir: "anyv2v_results/${exp_name}" 9 | 10 | # Data 11 | data_dir: "/data" 12 | src_video_path: "${data_dir}/video.mp4" #Override it with src_video_path 13 | ddim_inversion_dir: 'ddim-inversion/default/' 14 | n_ddim_inversion_steps: 500 # for retrieving the latents of the inversion 15 | n_frame_inverted: 16 16 | n_frames: 16 17 | edited_first_frame_path: '/edited_first_frame.png' #Override it with edited_first_frame_path 18 | image_size: [512, 512] 19 | 20 | # Model 21 | model_name: "seine" 22 | sd_path: "./pretrained/stable-diffusion-v1-4" 23 | ckpt_path: "./pretrained/SEINE/seine.pt" 24 |
model_key: "./pretrained/stable-diffusion-v1-4" 25 | enable_xformers_memory_efficient_attention: True 26 | use_fp16: True 27 | 28 | # Schedular 29 | sample_method: 'ddpm' 30 | beta_start: 0.0001 31 | beta_end: 0.02 32 | beta_schedule: "linear" 33 | 34 | # Diffusion 35 | cfg_scale: 4 36 | n_steps: 50 37 | init_with_ddim_inversion: True 38 | prompt: "" #Override it with prompt 39 | negative_prompt: "" 40 | batch_size: 1 # TODO: batchsize is always 1, we can remove this 41 | 42 | # Pnp params -- injection thresholds ∈ [0, 1] 43 | enable_pnp: True 44 | pnp_f_t: 0.2 45 | pnp_spatial_attn_t: 0.2 46 | pnp_temp_attn_t: 0.5 47 | pnp_cross_attn_t: 0.0 48 | -------------------------------------------------------------------------------- /seine/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | from . import gaussian_diffusion as gd 7 | from .respace import SpacedDiffusion, space_timesteps 8 | 9 | 10 | def create_diffusion( 11 | timestep_respacing, 12 | noise_schedule="linear", 13 | use_kl=False, 14 | sigma_small=False, 15 | predict_xstart=False, 16 | # learn_sigma=True, 17 | learn_sigma=False, # for unet 18 | rescale_learned_sigmas=False, 19 | diffusion_steps=1000 20 | ): 21 | betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps) 22 | if use_kl: 23 | loss_type = gd.LossType.RESCALED_KL 24 | elif rescale_learned_sigmas: 25 | loss_type = gd.LossType.RESCALED_MSE 26 | else: 27 | loss_type = gd.LossType.MSE 28 | if timestep_respacing is None or timestep_respacing == "": 29 | timestep_respacing = [diffusion_steps] 30 | return SpacedDiffusion( 31 | use_timesteps=space_timesteps(diffusion_steps, timestep_respacing), 32 | betas=betas, 33 | model_mean_type=( 34 | gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X 35 | ), 36 | model_var_type=( 37 | ( 38 | gd.ModelVarType.FIXED_LARGE 39 | if not sigma_small 40 | else gd.ModelVarType.FIXED_SMALL 41 | ) 42 | if not learn_sigma 43 | else gd.ModelVarType.LEARNED_RANGE 44 | ), 45 | loss_type=loss_type 46 | # rescale_timesteps=rescale_timesteps, 47 | ) 48 | -------------------------------------------------------------------------------- /seine/diffusion/diffusion_utils.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | import torch as th 7 | import numpy as np 8 | 9 | 10 | def normal_kl(mean1, logvar1, mean2, logvar2): 11 | """ 12 | Compute the KL divergence between two gaussians. 13 | Shapes are automatically broadcasted, so batches can be compared to 14 | scalars, among other use cases. 15 | """ 16 | tensor = None 17 | for obj in (mean1, logvar1, mean2, logvar2): 18 | if isinstance(obj, th.Tensor): 19 | tensor = obj 20 | break 21 | assert tensor is not None, "at least one argument must be a Tensor" 22 | 23 | # Force variances to be Tensors. 
Broadcasting helps convert scalars to 24 | # Tensors, but it does not work for th.exp(). 25 | logvar1, logvar2 = [ 26 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) 27 | for x in (logvar1, logvar2) 28 | ] 29 | 30 | return 0.5 * ( 31 | -1.0 32 | + logvar2 33 | - logvar1 34 | + th.exp(logvar1 - logvar2) 35 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 36 | ) 37 | 38 | 39 | def approx_standard_normal_cdf(x): 40 | """ 41 | A fast approximation of the cumulative distribution function of the 42 | standard normal. 43 | """ 44 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 45 | 46 | 47 | def continuous_gaussian_log_likelihood(x, *, means, log_scales): 48 | """ 49 | Compute the log-likelihood of a continuous Gaussian distribution. 50 | :param x: the targets 51 | :param means: the Gaussian mean Tensor. 52 | :param log_scales: the Gaussian log stddev Tensor. 53 | :return: a tensor like x of log probabilities (in nats). 54 | """ 55 | centered_x = x - means 56 | inv_stdv = th.exp(-log_scales) 57 | normalized_x = centered_x * inv_stdv 58 | log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x) 59 | return log_probs 60 | 61 | 62 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 63 | """ 64 | Compute the log-likelihood of a Gaussian distribution discretizing to a 65 | given image. 66 | :param x: the target images. It is assumed that this was uint8 values, 67 | rescaled to the range [-1, 1]. 68 | :param means: the Gaussian mean Tensor. 69 | :param log_scales: the Gaussian log stddev Tensor. 70 | :return: a tensor like x of log probabilities (in nats). 71 | """ 72 | assert x.shape == means.shape == log_scales.shape 73 | centered_x = x - means 74 | inv_stdv = th.exp(-log_scales) 75 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 76 | cdf_plus = approx_standard_normal_cdf(plus_in) 77 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 78 | cdf_min = approx_standard_normal_cdf(min_in) 79 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 80 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 81 | cdf_delta = cdf_plus - cdf_min 82 | log_probs = th.where( 83 | x < -0.999, 84 | log_cdf_plus, 85 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 86 | ) 87 | assert log_probs.shape == x.shape 88 | return log_probs 89 | -------------------------------------------------------------------------------- /seine/diffusion/respace.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | import torch 6 | import numpy as np 7 | import torch as th 8 | 9 | from .gaussian_diffusion import GaussianDiffusion 10 | 11 | 12 | def space_timesteps(num_timesteps, section_counts): 13 | """ 14 | Create a list of timesteps to use from an original diffusion process, 15 | given the number of timesteps we want to take from equally-sized portions 16 | of the original process. 17 | For example, if there's 300 timesteps and the section counts are [10,15,20] 18 | then the first 100 timesteps are strided to be 10 timesteps, the second 100 19 | are strided to be 15 timesteps, and the final 100 are strided to be 20. 
20 | If the stride is a string starting with "ddim", then the fixed striding 21 | from the DDIM paper is used, and only one section is allowed. 22 | :param num_timesteps: the number of diffusion steps in the original 23 | process to divide up. 24 | :param section_counts: either a list of numbers, or a string containing 25 | comma-separated numbers, indicating the step count 26 | per section. As a special case, use "ddimN" where N 27 | is a number of steps to use the striding from the 28 | DDIM paper. 29 | :return: a set of diffusion steps from the original process to use. 30 | """ 31 | if isinstance(section_counts, str): 32 | if section_counts.startswith("ddim"): 33 | desired_count = int(section_counts[len("ddim") :]) 34 | for i in range(1, num_timesteps): 35 | if len(range(0, num_timesteps, i)) == desired_count: 36 | return set(range(0, num_timesteps, i)) 37 | raise ValueError( 38 | f"cannot create exactly {num_timesteps} steps with an integer stride" 39 | ) 40 | section_counts = [int(x) for x in section_counts.split(",")] 41 | size_per = num_timesteps // len(section_counts) 42 | extra = num_timesteps % len(section_counts) 43 | start_idx = 0 44 | all_steps = [] 45 | for i, section_count in enumerate(section_counts): 46 | size = size_per + (1 if i < extra else 0) 47 | if size < section_count: 48 | raise ValueError( 49 | f"cannot divide section of {size} steps into {section_count}" 50 | ) 51 | if section_count <= 1: 52 | frac_stride = 1 53 | else: 54 | frac_stride = (size - 1) / (section_count - 1) 55 | cur_idx = 0.0 56 | taken_steps = [] 57 | for _ in range(section_count): 58 | taken_steps.append(start_idx + round(cur_idx)) 59 | cur_idx += frac_stride 60 | all_steps += taken_steps 61 | start_idx += size 62 | return set(all_steps) 63 | 64 | 65 | class SpacedDiffusion(GaussianDiffusion): 66 | """ 67 | A diffusion process which can skip steps in a base diffusion process. 68 | :param use_timesteps: a collection (sequence or set) of timesteps from the 69 | original diffusion process to retain. 70 | :param kwargs: the kwargs to create the base diffusion process. 
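For example, SpacedDiffusion(use_timesteps=space_timesteps(1000, "ddim50"), betas=betas) keeps 50 evenly strided timesteps out of the original 1000, recomputes the betas for those steps, and remaps the model's timestep inputs onto them via timestep_map.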
71 | """ 72 | 73 | def __init__(self, use_timesteps, **kwargs): 74 | self.use_timesteps = set(use_timesteps) 75 | self.timestep_map = [] 76 | self.original_num_steps = len(kwargs["betas"]) 77 | 78 | base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa 79 | last_alpha_cumprod = 1.0 80 | new_betas = [] 81 | for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): 82 | if i in self.use_timesteps: 83 | new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) 84 | last_alpha_cumprod = alpha_cumprod 85 | self.timestep_map.append(i) 86 | kwargs["betas"] = np.array(new_betas) 87 | super().__init__(**kwargs) 88 | 89 | def p_mean_variance( 90 | self, model, *args, **kwargs 91 | ): # pylint: disable=signature-differs 92 | return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) 93 | 94 | # @torch.compile 95 | def training_losses( 96 | self, model, *args, **kwargs 97 | ): # pylint: disable=signature-differs 98 | return super().training_losses(self._wrap_model(model), *args, **kwargs) 99 | 100 | def condition_mean(self, cond_fn, *args, **kwargs): 101 | return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) 102 | 103 | def condition_score(self, cond_fn, *args, **kwargs): 104 | return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) 105 | 106 | def _wrap_model(self, model): 107 | if isinstance(model, _WrappedModel): 108 | return model 109 | return _WrappedModel( 110 | model, self.timestep_map, self.original_num_steps 111 | ) 112 | 113 | def _scale_timesteps(self, t): 114 | # Scaling is done by the wrapped model. 115 | return t 116 | 117 | 118 | class _WrappedModel: 119 | def __init__(self, model, timestep_map, original_num_steps): 120 | self.model = model 121 | self.timestep_map = timestep_map 122 | # self.rescale_timesteps = rescale_timesteps 123 | self.original_num_steps = original_num_steps 124 | 125 | def __call__(self, x, ts, **kwargs): 126 | map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) 127 | new_ts = map_tensor[ts] 128 | # if self.rescale_timesteps: 129 | # new_ts = new_ts.float() * (1000.0 / self.original_num_steps) 130 | return self.model(x, new_ts, **kwargs) 131 | -------------------------------------------------------------------------------- /seine/diffusion/timestep_sampler.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | from abc import ABC, abstractmethod 7 | 8 | import numpy as np 9 | import torch as th 10 | import torch.distributed as dist 11 | 12 | 13 | def create_named_schedule_sampler(name, diffusion): 14 | """ 15 | Create a ScheduleSampler from a library of pre-defined samplers. 16 | :param name: the name of the sampler. 17 | :param diffusion: the diffusion object to sample for. 18 | """ 19 | if name == "uniform": 20 | return UniformSampler(diffusion) 21 | elif name == "loss-second-moment": 22 | return LossSecondMomentResampler(diffusion) 23 | else: 24 | raise NotImplementedError(f"unknown schedule sampler: {name}") 25 | 26 | 27 | class ScheduleSampler(ABC): 28 | """ 29 | A distribution over timesteps in the diffusion process, intended to reduce 30 | variance of the objective. 
31 | By default, samplers perform unbiased importance sampling, in which the 32 | objective's mean is unchanged. 33 | However, subclasses may override sample() to change how the resampled 34 | terms are reweighted, allowing for actual changes in the objective. 35 | """ 36 | 37 | @abstractmethod 38 | def weights(self): 39 | """ 40 | Get a numpy array of weights, one per diffusion step. 41 | The weights needn't be normalized, but must be positive. 42 | """ 43 | 44 | def sample(self, batch_size, device): 45 | """ 46 | Importance-sample timesteps for a batch. 47 | :param batch_size: the number of timesteps. 48 | :param device: the torch device to save to. 49 | :return: a tuple (timesteps, weights): 50 | - timesteps: a tensor of timestep indices. 51 | - weights: a tensor of weights to scale the resulting losses. 52 | """ 53 | w = self.weights() 54 | p = w / np.sum(w) 55 | indices_np = np.random.choice(len(p), size=(batch_size,), p=p) 56 | indices = th.from_numpy(indices_np).long().to(device) 57 | weights_np = 1 / (len(p) * p[indices_np]) 58 | weights = th.from_numpy(weights_np).float().to(device) 59 | return indices, weights 60 | 61 | 62 | class UniformSampler(ScheduleSampler): 63 | def __init__(self, diffusion): 64 | self.diffusion = diffusion 65 | self._weights = np.ones([diffusion.num_timesteps]) 66 | 67 | def weights(self): 68 | return self._weights 69 | 70 | 71 | class LossAwareSampler(ScheduleSampler): 72 | def update_with_local_losses(self, local_ts, local_losses): 73 | """ 74 | Update the reweighting using losses from a model. 75 | Call this method from each rank with a batch of timesteps and the 76 | corresponding losses for each of those timesteps. 77 | This method will perform synchronization to make sure all of the ranks 78 | maintain the exact same reweighting. 79 | :param local_ts: an integer Tensor of timesteps. 80 | :param local_losses: a 1D Tensor of losses. 81 | """ 82 | batch_sizes = [ 83 | th.tensor([0], dtype=th.int32, device=local_ts.device) 84 | for _ in range(dist.get_world_size()) 85 | ] 86 | dist.all_gather( 87 | batch_sizes, 88 | th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), 89 | ) 90 | 91 | # Pad all_gather batches to be the maximum batch size. 92 | batch_sizes = [x.item() for x in batch_sizes] 93 | max_bs = max(batch_sizes) 94 | 95 | timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes] 96 | loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes] 97 | dist.all_gather(timestep_batches, local_ts) 98 | dist.all_gather(loss_batches, local_losses) 99 | timesteps = [ 100 | x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs] 101 | ] 102 | losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] 103 | self.update_with_all_losses(timesteps, losses) 104 | 105 | @abstractmethod 106 | def update_with_all_losses(self, ts, losses): 107 | """ 108 | Update the reweighting using losses from a model. 109 | Sub-classes should override this method to update the reweighting 110 | using losses from the model. 111 | This method directly updates the reweighting without synchronizing 112 | between workers. It is called by update_with_local_losses from all 113 | ranks with identical arguments. Thus, it should have deterministic 114 | behavior to maintain state across workers. 115 | :param ts: a list of int timesteps. 116 | :param losses: a list of float losses, one per timestep. 
117 | """ 118 | 119 | 120 | class LossSecondMomentResampler(LossAwareSampler): 121 | def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001): 122 | self.diffusion = diffusion 123 | self.history_per_term = history_per_term 124 | self.uniform_prob = uniform_prob 125 | self._loss_history = np.zeros( 126 | [diffusion.num_timesteps, history_per_term], dtype=np.float64 127 | ) 128 | self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64) 129 | 130 | def weights(self): 131 | if not self._warmed_up(): 132 | return np.ones([self.diffusion.num_timesteps], dtype=np.float64) 133 | weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1)) 134 | weights /= np.sum(weights) 135 | weights *= 1 - self.uniform_prob 136 | weights += self.uniform_prob / len(weights) 137 | return weights 138 | 139 | def update_with_all_losses(self, ts, losses): 140 | for t, loss in zip(ts, losses): 141 | if self._loss_counts[t] == self.history_per_term: 142 | # Shift out the oldest loss term. 143 | self._loss_history[t, :-1] = self._loss_history[t, 1:] 144 | self._loss_history[t, -1] = loss 145 | else: 146 | self._loss_history[t, self._loss_counts[t]] = loss 147 | self._loss_counts[t] += 1 148 | 149 | def _warmed_up(self): 150 | return (self._loss_counts == self.history_per_term).all() 151 | -------------------------------------------------------------------------------- /seine/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.split(sys.path[0])[0]) 4 | 5 | from .unet import UNet3DConditionModel 6 | from torch.optim.lr_scheduler import LambdaLR 7 | 8 | def customized_lr_scheduler(optimizer, warmup_steps=5000): # 5000 from u-vit 9 | from torch.optim.lr_scheduler import LambdaLR 10 | def fn(step): 11 | if warmup_steps > 0: 12 | return min(step / warmup_steps, 1) 13 | else: 14 | return 1 15 | return LambdaLR(optimizer, fn) 16 | 17 | 18 | def get_lr_scheduler(optimizer, name, **kwargs): 19 | if name == 'warmup': 20 | return customized_lr_scheduler(optimizer, **kwargs) 21 | elif name == 'cosine': 22 | from torch.optim.lr_scheduler import CosineAnnealingLR 23 | return CosineAnnealingLR(optimizer, **kwargs) 24 | else: 25 | raise NotImplementedError(name) 26 | 27 | def get_models(args): 28 | if 'UNet' in args.model: 29 | pretrained_model_path = args.pretrained_model_path 30 | return UNet3DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet", use_concat=args.use_mask) 31 | else: 32 | raise NotImplementedError('{} Model Not Supported!'.format(args.model)) 33 | -------------------------------------------------------------------------------- /seine/models/clip.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import torch.nn as nn 3 | from transformers import CLIPTokenizer, CLIPTextModel 4 | 5 | import transformers 6 | transformers.logging.set_verbosity_error() 7 | 8 | """ 9 | You will encounter the following warning: 10 | - This IS expected if you are initializing CLIPTextModel from the checkpoint of a model trained on another task 11 | or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). 12 | - This IS NOT expected if you are initializing CLIPTextModel from the checkpoint of a model 13 | that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
14 | 15 | https://github.com/CompVis/stable-diffusion/issues/97 16 | according to this issue, this warning is safe. 17 | 18 | This is expected since the vision backbone of the CLIP model is not needed to run Stable Diffusion. 19 | You can safely ignore the warning, it is not an error. 20 | 21 | This clip usage is from U-ViT and same with Stable Diffusion. 22 | """ 23 | 24 | class AbstractEncoder(nn.Module): 25 | def __init__(self): 26 | super().__init__() 27 | 28 | def encode(self, *args, **kwargs): 29 | raise NotImplementedError 30 | 31 | 32 | class FrozenCLIPEmbedder(AbstractEncoder): 33 | """Uses the CLIP transformer encoder for text (from Hugging Face)""" 34 | # def __init__(self, version="openai/clip-vit-huge-patch14", device="cuda", max_length=77): 35 | def __init__(self, path, device="cuda", max_length=77): 36 | super().__init__() 37 | self.tokenizer = CLIPTokenizer.from_pretrained(path, subfolder="tokenizer") 38 | self.transformer = CLIPTextModel.from_pretrained(path, subfolder='text_encoder') 39 | self.device = device 40 | self.max_length = max_length 41 | self.freeze() 42 | 43 | def freeze(self): 44 | self.transformer = self.transformer.eval() 45 | for param in self.parameters(): 46 | param.requires_grad = False 47 | 48 | def forward(self, text): 49 | batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, 50 | return_overflowing_tokens=False, padding="max_length", return_tensors="pt") 51 | tokens = batch_encoding["input_ids"].to(self.device) 52 | outputs = self.transformer(input_ids=tokens) 53 | 54 | z = outputs.last_hidden_state 55 | return z 56 | 57 | def encode(self, text): 58 | return self(text) 59 | 60 | 61 | class TextEmbedder(nn.Module): 62 | """ 63 | Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance. 64 | """ 65 | def __init__(self, path, device='cuda', dropout_prob=0.1): # Modified 66 | super().__init__() 67 | self.text_encodder = FrozenCLIPEmbedder(path=path, device=device) # Modified 68 | self.dropout_prob = dropout_prob 69 | 70 | def token_drop(self, text_prompts, force_drop_ids=None): 71 | """ 72 | Drops text to enable classifier-free guidance. 
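When force_drop_ids is None, each prompt is independently replaced with an empty string with probability dropout_prob; otherwise, the prompts whose force-drop id equals 1 are dropped.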
73 | """ 74 | if force_drop_ids is None: 75 | drop_ids = numpy.random.uniform(0, 1, len(text_prompts)) < self.dropout_prob 76 | else: 77 | # TODO 78 | drop_ids = force_drop_ids == 1 79 | labels = list(numpy.where(drop_ids, "", text_prompts)) 80 | # print(labels) 81 | return labels 82 | 83 | def forward(self, text_prompts, train, force_drop_ids=None): 84 | use_dropout = self.dropout_prob > 0 85 | if (train and use_dropout) or (force_drop_ids is not None): 86 | text_prompts = self.token_drop(text_prompts, force_drop_ids) 87 | embeddings = self.text_encodder(text_prompts) 88 | return embeddings 89 | 90 | 91 | if __name__ == '__main__': 92 | 93 | r""" 94 | Returns: 95 | 96 | Examples from CLIPTextModel: 97 | 98 | ```python 99 | >>> from transformers import AutoTokenizer, CLIPTextModel 100 | 101 | >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") 102 | >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") 103 | 104 | >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") 105 | 106 | >>> outputs = model(**inputs) 107 | >>> last_hidden_state = outputs.last_hidden_state 108 | >>> pooled_output = outputs.pooler_output # pooled (EOS token) states 109 | ```""" 110 | 111 | import torch 112 | 113 | device = "cuda" if torch.cuda.is_available() else "cpu" 114 | 115 | text_encoder = TextEmbedder(path='/mnt/petrelfs/maxin/work/pretrained/stable-diffusion-2-1-base', 116 | dropout_prob=0.00001).to(device) 117 | 118 | text_prompt = [["a photo of a cat", "a photo of a cat"], ["a photo of a dog", "a photo of a cat"], ['a photo of a dog human', "a photo of a cat"]] 119 | # text_prompt = ('None', 'None', 'None') 120 | output = text_encoder(text_prompts=text_prompt, train=False) 121 | # print(output) 122 | print(output.shape) 123 | # print(output.shape) -------------------------------------------------------------------------------- /seine/models/resnet.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py 2 | import os 3 | import sys 4 | sys.path.append(os.path.split(sys.path[0])[0]) 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | from einops import rearrange 11 | 12 | 13 | class InflatedConv3d(nn.Conv2d): 14 | def forward(self, x): 15 | video_length = x.shape[2] 16 | 17 | x = rearrange(x, "b c f h w -> (b f) c h w") 18 | x = super().forward(x) 19 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 20 | 21 | return x 22 | 23 | 24 | class Upsample3D(nn.Module): 25 | def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"): 26 | super().__init__() 27 | self.channels = channels 28 | self.out_channels = out_channels or channels 29 | self.use_conv = use_conv 30 | self.use_conv_transpose = use_conv_transpose 31 | self.name = name 32 | 33 | conv = None 34 | if use_conv_transpose: 35 | raise NotImplementedError 36 | elif use_conv: 37 | conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) 38 | 39 | if name == "conv": 40 | self.conv = conv 41 | else: 42 | self.Conv2d_0 = conv 43 | 44 | def forward(self, hidden_states, output_size=None): 45 | assert hidden_states.shape[1] == self.channels 46 | 47 | if self.use_conv_transpose: 48 | raise NotImplementedError 49 | 50 | # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 51 | dtype = hidden_states.dtype 52 | 
if dtype == torch.bfloat16: 53 | hidden_states = hidden_states.to(torch.float32) 54 | 55 | # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 56 | if hidden_states.shape[0] >= 64: 57 | hidden_states = hidden_states.contiguous() 58 | 59 | # if `output_size` is passed we force the interpolation output 60 | # size and do not make use of `scale_factor=2` 61 | if output_size is None: 62 | hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest") 63 | else: 64 | hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") 65 | 66 | # If the input is bfloat16, we cast back to bfloat16 67 | if dtype == torch.bfloat16: 68 | hidden_states = hidden_states.to(dtype) 69 | 70 | if self.use_conv: 71 | if self.name == "conv": 72 | hidden_states = self.conv(hidden_states) 73 | else: 74 | hidden_states = self.Conv2d_0(hidden_states) 75 | 76 | return hidden_states 77 | 78 | 79 | class Downsample3D(nn.Module): 80 | def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): 81 | super().__init__() 82 | self.channels = channels 83 | self.out_channels = out_channels or channels 84 | self.use_conv = use_conv 85 | self.padding = padding 86 | stride = 2 87 | self.name = name 88 | 89 | if use_conv: 90 | conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding) 91 | else: 92 | raise NotImplementedError 93 | 94 | if name == "conv": 95 | self.Conv2d_0 = conv 96 | self.conv = conv 97 | elif name == "Conv2d_0": 98 | self.conv = conv 99 | else: 100 | self.conv = conv 101 | 102 | def forward(self, hidden_states): 103 | assert hidden_states.shape[1] == self.channels 104 | if self.use_conv and self.padding == 0: 105 | raise NotImplementedError 106 | 107 | assert hidden_states.shape[1] == self.channels 108 | hidden_states = self.conv(hidden_states) 109 | 110 | return hidden_states 111 | 112 | 113 | class ResnetBlock3D(nn.Module): 114 | def __init__( 115 | self, 116 | *, 117 | in_channels, 118 | out_channels=None, 119 | conv_shortcut=False, 120 | dropout=0.0, 121 | temb_channels=512, 122 | groups=32, 123 | groups_out=None, 124 | pre_norm=True, 125 | eps=1e-6, 126 | non_linearity="swish", 127 | time_embedding_norm="default", 128 | output_scale_factor=1.0, 129 | use_in_shortcut=None, 130 | ): 131 | super().__init__() 132 | self.pre_norm = pre_norm 133 | self.pre_norm = True 134 | self.in_channels = in_channels 135 | out_channels = in_channels if out_channels is None else out_channels 136 | self.out_channels = out_channels 137 | self.use_conv_shortcut = conv_shortcut 138 | self.time_embedding_norm = time_embedding_norm 139 | self.output_scale_factor = output_scale_factor 140 | 141 | if groups_out is None: 142 | groups_out = groups 143 | 144 | self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 145 | 146 | self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) 147 | 148 | if temb_channels is not None: 149 | if self.time_embedding_norm == "default": 150 | time_emb_proj_out_channels = out_channels 151 | elif self.time_embedding_norm == "scale_shift": 152 | time_emb_proj_out_channels = out_channels * 2 153 | else: 154 | raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") 155 | 156 | self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels) 157 | else: 158 | self.time_emb_proj = None 159 | 160 | self.norm2 = 
torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 161 | self.dropout = torch.nn.Dropout(dropout) 162 | self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) 163 | 164 | if non_linearity == "swish": 165 | self.nonlinearity = lambda x: F.silu(x) 166 | elif non_linearity == "mish": 167 | self.nonlinearity = Mish() 168 | elif non_linearity == "silu": 169 | self.nonlinearity = nn.SiLU() 170 | 171 | self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut 172 | 173 | self.conv_shortcut = None 174 | if self.use_in_shortcut: 175 | self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) 176 | 177 | def forward(self, input_tensor, temb): 178 | hidden_states = input_tensor 179 | 180 | hidden_states = self.norm1(hidden_states) 181 | hidden_states = self.nonlinearity(hidden_states) 182 | 183 | hidden_states = self.conv1(hidden_states) 184 | 185 | if temb is not None: 186 | temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None] 187 | 188 | if temb is not None and self.time_embedding_norm == "default": 189 | hidden_states = hidden_states + temb 190 | 191 | hidden_states = self.norm2(hidden_states) 192 | 193 | if temb is not None and self.time_embedding_norm == "scale_shift": 194 | scale, shift = torch.chunk(temb, 2, dim=1) 195 | hidden_states = hidden_states * (1 + scale) + shift 196 | 197 | hidden_states = self.nonlinearity(hidden_states) 198 | 199 | hidden_states = self.dropout(hidden_states) 200 | hidden_states = self.conv2(hidden_states) 201 | 202 | if self.conv_shortcut is not None: 203 | input_tensor = self.conv_shortcut(input_tensor) 204 | 205 | output_tensor = (input_tensor + hidden_states) / self.output_scale_factor 206 | 207 | return output_tensor 208 | 209 | 210 | class Mish(torch.nn.Module): 211 | def forward(self, hidden_states): 212 | return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states)) -------------------------------------------------------------------------------- /seine/models/utils.py: -------------------------------------------------------------------------------- 1 | # adopted from 2 | # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 3 | # and 4 | # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py 5 | # and 6 | # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py 7 | # 8 | # thanks! 9 | 10 | 11 | import os 12 | import math 13 | import torch 14 | 15 | import numpy as np 16 | import torch.nn as nn 17 | 18 | from einops import repeat 19 | 20 | 21 | ################################################################################# 22 | # Unet Utils # 23 | ################################################################################# 24 | 25 | def checkpoint(func, inputs, params, flag): 26 | """ 27 | Evaluate a function without caching intermediate activations, allowing for 28 | reduced memory at the expense of extra compute in the backward pass. 29 | :param func: the function to evaluate. 30 | :param inputs: the argument sequence to pass to `func`. 31 | :param params: a sequence of parameters `func` depends on but does not 32 | explicitly take as arguments. 33 | :param flag: if False, disable gradient checkpointing. 
34 | """ 35 | if flag: 36 | args = tuple(inputs) + tuple(params) 37 | return CheckpointFunction.apply(func, len(inputs), *args) 38 | else: 39 | return func(*inputs) 40 | 41 | 42 | class CheckpointFunction(torch.autograd.Function): 43 | @staticmethod 44 | def forward(ctx, run_function, length, *args): 45 | ctx.run_function = run_function 46 | ctx.input_tensors = list(args[:length]) 47 | ctx.input_params = list(args[length:]) 48 | 49 | with torch.no_grad(): 50 | output_tensors = ctx.run_function(*ctx.input_tensors) 51 | return output_tensors 52 | 53 | @staticmethod 54 | def backward(ctx, *output_grads): 55 | ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] 56 | with torch.enable_grad(): 57 | # Fixes a bug where the first op in run_function modifies the 58 | # Tensor storage in place, which is not allowed for detach()'d 59 | # Tensors. 60 | shallow_copies = [x.view_as(x) for x in ctx.input_tensors] 61 | output_tensors = ctx.run_function(*shallow_copies) 62 | input_grads = torch.autograd.grad( 63 | output_tensors, 64 | ctx.input_tensors + ctx.input_params, 65 | output_grads, 66 | allow_unused=True, 67 | ) 68 | del ctx.input_tensors 69 | del ctx.input_params 70 | del output_tensors 71 | return (None, None) + input_grads 72 | 73 | 74 | def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): 75 | """ 76 | Create sinusoidal timestep embeddings. 77 | :param timesteps: a 1-D Tensor of N indices, one per batch element. 78 | These may be fractional. 79 | :param dim: the dimension of the output. 80 | :param max_period: controls the minimum frequency of the embeddings. 81 | :return: an [N x dim] Tensor of positional embeddings. 82 | """ 83 | if not repeat_only: 84 | half = dim // 2 85 | freqs = torch.exp( 86 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 87 | ).to(device=timesteps.device) 88 | args = timesteps[:, None].float() * freqs[None] 89 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 90 | if dim % 2: 91 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 92 | else: 93 | embedding = repeat(timesteps, 'b -> b d', d=dim).contiguous() 94 | return embedding 95 | 96 | 97 | def zero_module(module): 98 | """ 99 | Zero out the parameters of a module and return it. 100 | """ 101 | for p in module.parameters(): 102 | p.detach().zero_() 103 | return module 104 | 105 | 106 | def scale_module(module, scale): 107 | """ 108 | Scale the parameters of a module and return it. 109 | """ 110 | for p in module.parameters(): 111 | p.detach().mul_(scale) 112 | return module 113 | 114 | 115 | def mean_flat(tensor): 116 | """ 117 | Take the mean over all non-batch dimensions. 118 | """ 119 | return tensor.mean(dim=list(range(1, len(tensor.shape)))) 120 | 121 | 122 | def normalization(channels): 123 | """ 124 | Make a standard normalization layer. 125 | :param channels: number of input channels. 126 | :return: an nn.Module for normalization. 127 | """ 128 | return GroupNorm32(32, channels) 129 | 130 | 131 | # PyTorch 1.7 has SiLU, but we support PyTorch 1.5. 132 | class SiLU(nn.Module): 133 | def forward(self, x): 134 | return x * torch.sigmoid(x) 135 | 136 | 137 | class GroupNorm32(nn.GroupNorm): 138 | def forward(self, x): 139 | return super().forward(x.float()).type(x.dtype) 140 | 141 | def conv_nd(dims, *args, **kwargs): 142 | """ 143 | Create a 1D, 2D, or 3D convolution module. 
144 | """ 145 | if dims == 1: 146 | return nn.Conv1d(*args, **kwargs) 147 | elif dims == 2: 148 | return nn.Conv2d(*args, **kwargs) 149 | elif dims == 3: 150 | return nn.Conv3d(*args, **kwargs) 151 | raise ValueError(f"unsupported dimensions: {dims}") 152 | 153 | 154 | def linear(*args, **kwargs): 155 | """ 156 | Create a linear module. 157 | """ 158 | return nn.Linear(*args, **kwargs) 159 | 160 | 161 | def avg_pool_nd(dims, *args, **kwargs): 162 | """ 163 | Create a 1D, 2D, or 3D average pooling module. 164 | """ 165 | if dims == 1: 166 | return nn.AvgPool1d(*args, **kwargs) 167 | elif dims == 2: 168 | return nn.AvgPool2d(*args, **kwargs) 169 | elif dims == 3: 170 | return nn.AvgPool3d(*args, **kwargs) 171 | raise ValueError(f"unsupported dimensions: {dims}") 172 | 173 | 174 | # class HybridConditioner(nn.Module): 175 | 176 | # def __init__(self, c_concat_config, c_crossattn_config): 177 | # super().__init__() 178 | # self.concat_conditioner = instantiate_from_config(c_concat_config) 179 | # self.crossattn_conditioner = instantiate_from_config(c_crossattn_config) 180 | 181 | # def forward(self, c_concat, c_crossattn): 182 | # c_concat = self.concat_conditioner(c_concat) 183 | # c_crossattn = self.crossattn_conditioner(c_crossattn) 184 | # return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} 185 | 186 | 187 | def noise_like(shape, device, repeat=False): 188 | repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1))) 189 | noise = lambda: torch.randn(shape, device=device) 190 | return repeat_noise() if repeat else noise() 191 | 192 | def count_flops_attn(model, _x, y): 193 | """ 194 | A counter for the `thop` package to count the operations in an 195 | attention operation. 196 | Meant to be used like: 197 | macs, params = thop.profile( 198 | model, 199 | inputs=(inputs, timestamps), 200 | custom_ops={QKVAttention: QKVAttention.count_flops}, 201 | ) 202 | """ 203 | b, c, *spatial = y[0].shape 204 | num_spatial = int(np.prod(spatial)) 205 | # We perform two matmuls with the same number of ops. 206 | # The first computes the weight matrix, the second computes 207 | # the combination of the value vectors. 
208 | matmul_ops = 2 * b * (num_spatial ** 2) * c 209 | model.total_ops += torch.DoubleTensor([matmul_ops]) 210 | 211 | def count_params(model, verbose=False): 212 | total_params = sum(p.numel() for p in model.parameters()) 213 | if verbose: 214 | print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.") 215 | return total_params -------------------------------------------------------------------------------- /seine/requirement.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchaudio==2.0.2 3 | torchvision==0.15.2 4 | decord==0.6.0 5 | diffusers==0.15.0 6 | imageio==2.29.0 7 | transformers==4.29.2 8 | xformers==0.0.20 9 | einops 10 | omegaconf 11 | tensorboard==2.15.1 12 | timm==0.9.10 13 | rotary-embedding-torch==0.3.5 14 | natsort==8.4.0 15 | moviepy -------------------------------------------------------------------------------- /seine/seine_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | # Adopt from SEINE/utils.py 5 | def mask_generation_before(mask_type, shape, dtype, device, dropout_prob=0.0, use_image_num=0): 6 | b, f, c, h, w = shape 7 | if mask_type.startswith("first"): 8 | num = int(mask_type.split("first")[-1]) 9 | mask_f = torch.cat( 10 | [ 11 | torch.zeros(1, num, 1, 1, 1, dtype=dtype, device=device), 12 | torch.ones(1, f - num, 1, 1, 1, dtype=dtype, device=device), 13 | ], 14 | dim=1, 15 | ) 16 | mask = mask_f.expand(b, -1, c, h, w) 17 | elif mask_type.startswith("all"): 18 | mask = torch.ones(b, f, c, h, w, dtype=dtype, device=device) 19 | elif mask_type.startswith("onelast"): 20 | num = int(mask_type.split("onelast")[-1]) 21 | mask_one = torch.zeros(1, 1, 1, 1, 1, dtype=dtype, device=device) 22 | mask_mid = torch.ones(1, f - 2 * num, 1, 1, 1, dtype=dtype, device=device) 23 | mask_last = torch.zeros_like(mask_one) 24 | mask = torch.cat([mask_one] * num + [mask_mid] + [mask_last] * num, dim=1) 25 | mask = mask.expand(b, -1, c, h, w) 26 | else: 27 | raise ValueError(f"Invalid mask type: {mask_type}") 28 | return mask 29 | 30 | --------------------------------------------------------------------------------
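A minimal usage sketch for `mask_generation_before` above (the latent shape, dtype, and mask type below are illustrative assumptions, not values taken from the AnyV2V pipeline code):

```python
import torch

from seine_utils import mask_generation_before  # import path assumed

# Illustrative latent shape: (batch, frames, channels, height, width)
shape = (1, 16, 4, 64, 64)
mask = mask_generation_before("first1", shape, dtype=torch.float16, device="cpu")

# "first1" marks the first frame as conditioning (0) and the remaining frames as to-be-generated (1)
assert mask.shape == shape
assert mask[:, 0].max() == 0 and mask[:, 1:].min() == 1
```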