├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── assets ├── AnyV2V-SlidesShow-GIF-1080P-02.gif └── AnyV2V-SlidesShow-MP4-1080P.mp4 ├── black_box_image_edit ├── __init__.py ├── cosxl │ ├── custom_pipeline.py │ └── utils.py ├── cosxl_edit.py ├── instantstyle.py ├── instructpix2pix.py ├── ip_adapter │ ├── __init__.py │ ├── attention_processor.py │ ├── ip_adapter.py │ ├── resampler.py │ └── utils.py └── utils.py ├── cog.yaml ├── consisti2v ├── README.md ├── configs │ ├── pipeline_256 │ │ ├── ddim_inversion_256.yaml │ │ └── pnp_edit.yaml │ └── pipeline_512 │ │ ├── ddim_inversion_512.yaml │ │ └── pnp_edit.yaml ├── consisti2v │ ├── data │ │ └── dataset.py │ ├── models │ │ ├── rotary_embedding.py │ │ ├── videoldm_attention.py │ │ ├── videoldm_transformer_blocks.py │ │ ├── videoldm_unet.py │ │ └── videoldm_unet_blocks.py │ ├── pipelines │ │ ├── pipeline_autoregress_animation.py │ │ ├── pipeline_conditional_animation.py │ │ └── pipeline_video_editing.py │ └── utils │ │ ├── frameinit_utils.py │ │ └── util.py ├── ddim_inverse_scheduler.py ├── environment.yaml ├── pnp_utils.py ├── run_ddim_inversion.py ├── run_pnp_edit.py └── utils.py ├── demo ├── A Couple In A Public Display Of Affection.mp4 ├── A Couple In A Public Display Of Affection │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ ├── Sketch style.png │ │ └── Snowing.png ├── A kitten turning its head on a wooden floor.mp4 ├── A kitten turning its head on a wooden floor │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ └── A dog turning its head on a wooden floor.png ├── An Old Man Doing Exercises For The Body And Mind.mp4 ├── An Old Man Doing Exercises For The Body And Mind │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ ├── add a party hat on his head.png │ │ ├── cyberpunk style.png │ │ ├── give him a punk hair style.png │ │ ├── helmet.png │ │ ├── hinton.png │ │ ├── jack ma.png │ │ ├── starry night style.png │ │ ├── turn his hair white.png │ │ └── turn man into robot.png ├── Ballet.mp4 ├── Ballet │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ └── van gogh style.png ├── Man Walking.mp4 ├── Man Walking │ ├── 00000.png │ ├── 00001.png │ ├── 00002.png │ ├── 00003.png │ ├── 00004.png │ ├── 00005.png │ ├── 00006.png │ ├── 00007.png │ ├── 00008.png │ ├── 00009.png │ ├── 00010.png │ ├── 00011.png │ ├── 00012.png │ ├── 00013.png │ ├── 00014.png │ ├── 00015.png │ └── edited_first_frame │ │ ├── ElonMusk_02.png │ │ ├── Yann LeCun Walking.png │ │ ├── add a cowboy hat.png │ │ ├── change his clothes to red.png │ │ ├── policeman costume.png 
│ │ ├── turn him into an astronaut.png │ │ ├── turn him into batman.png │ │ └── turn the man into darth vader.png ├── Your-Video-Name │ └── edited_first_frame │ │ └── Your-edited-first-frame └── Your-Video-mp4 ├── edit_image.py ├── gradio_demo.py ├── gradio_demo_cosxl.py ├── gradio_demo_style.py ├── i2vgen-xl ├── __init__.py ├── configs │ ├── group_ddim_inversion │ │ ├── group_config.json │ │ └── template.yaml │ └── group_pnp_edit │ │ ├── group_config.json │ │ └── template.yaml ├── demo.ipynb ├── environment.yml ├── pipelines │ ├── __init__.py │ └── pipeline_i2vgen_xl.py ├── pnp_utils.py ├── run_group_ddim_inversion.py ├── run_group_pnp_edit.py ├── scripts │ ├── run_group_ddim_inversion.sh │ └── run_group_pnp_edit.sh └── utils.py ├── predict.py ├── prepare_video.py └── seine ├── README.md ├── configs ├── ddim_inversion.yaml └── pnp_edit.yaml ├── datasets └── video_transforms.py ├── diffusion ├── __init__.py ├── diffusion_utils.py ├── gaussian_diffusion.py ├── respace.py └── timestep_sampler.py ├── models ├── __init__.py ├── attention.py ├── clip.py ├── resnet.py ├── unet.py ├── unet_blocks.py └── utils.py ├── pnp_utils.py ├── requirement.txt ├── run_ddim_inversion.py ├── run_pnp_edit.py └── seine_utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Version [e.g. 22] 29 | 30 | **Software Packages (please complete the following information):** 31 | - Torch version: 32 | - Diffusers version: 33 | 34 | **Additional context** 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Extra 2 | _demo_temp 3 | sdxl_models 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | m3ku@uwaterloo.ca. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 TIGER Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /assets/AnyV2V-SlidesShow-GIF-1080P-02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/assets/AnyV2V-SlidesShow-GIF-1080P-02.gif -------------------------------------------------------------------------------- /assets/AnyV2V-SlidesShow-MP4-1080P.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/assets/AnyV2V-SlidesShow-MP4-1080P.mp4 -------------------------------------------------------------------------------- /black_box_image_edit/__init__.py: -------------------------------------------------------------------------------- 1 | from .instructpix2pix import InstructPix2Pix, MagicBrush 2 | from .cosxl_edit import CosXLEdit 3 | 4 | from typing import Union, Optional, Tuple 5 | import numpy as np 6 | from PIL import Image, ImageOps 7 | import os 8 | import requests 9 | 10 | 11 | 12 | 13 | def load_image(image: Union[str, Image.Image], format: str = "RGB", size: Optional[Tuple] = None) -> Image.Image: 14 | """ 15 | Load an image from a given path or URL and convert it to a PIL Image. 16 | 17 | Args: 18 | image (Union[str, Image.Image]): The image path, URL, or a PIL Image object to be loaded. 19 | format (str, optional): Desired color format of the resulting image. Defaults to "RGB". 20 | size (Optional[Tuple], optional): Desired size for resizing the image. Defaults to None. 21 | 22 | Returns: 23 | Image.Image: A PIL Image in the specified format and size. 24 | 25 | Raises: 26 | ValueError: If the provided image format is not recognized. 
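    Example (illustrative sketch; the frame path and target size below are placeholders
    taken from this repo's demo assets):

        >>> frame = load_image("demo/Ballet/00000.png", format="RGB", size=(512, 512))
        >>> frame.size
        (512, 512)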
27 | """ 28 | if isinstance(image, str): 29 | if image.startswith("http://") or image.startswith("https://"): 30 | image = Image.open(requests.get(image, stream=True).raw) 31 | elif os.path.isfile(image): 32 | image = Image.open(image) 33 | else: 34 | raise ValueError( 35 | f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" 36 | ) 37 | elif isinstance(image, Image.Image): 38 | image = image 39 | else: 40 | raise ValueError( 41 | "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." 42 | ) 43 | image = ImageOps.exif_transpose(image) 44 | image = image.convert(format) 45 | if (size != None): 46 | image = image.resize(size, Image.LANCZOS) 47 | return image 48 | -------------------------------------------------------------------------------- /black_box_image_edit/cosxl/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | 5 | def set_timesteps_patched(self, num_inference_steps: int, device = None): 6 | self.num_inference_steps = num_inference_steps 7 | 8 | ramp = np.linspace(0, 1, self.num_inference_steps) 9 | sigmas = torch.linspace(math.log(self.config.sigma_min), math.log(self.config.sigma_max), len(ramp)).exp().flip(0) 10 | 11 | sigmas = (sigmas).to(dtype=torch.float32, device=device) 12 | self.timesteps = self.precondition_noise(sigmas) 13 | 14 | self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) 15 | self._step_index = None 16 | self._begin_index = None 17 | self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication 18 | -------------------------------------------------------------------------------- /black_box_image_edit/cosxl_edit.py: -------------------------------------------------------------------------------- 1 | 2 | from huggingface_hub import hf_hub_download 3 | import torch 4 | import PIL 5 | 6 | class CosXLEdit(): 7 | """ 8 | Edit Cos Stable Diffusion XL 1.0 Base is tuned to use a Cosine-Continuous EDM VPred schedule, and then upgraded to perform instructed image editing. 9 | Reference: https://huggingface.co/stabilityai/cosxl 10 | """ 11 | def __init__(self, device="cuda"): 12 | """ 13 | Attributes: 14 | pipe (CosStableDiffusionXLInstructPix2PixPipeline): The InstructPix2Pix pipeline for image transformation. 15 | 16 | Args: 17 | device (str, optional): Device on which the pipeline runs. Defaults to "cuda". 18 | """ 19 | from diffusers import EDMEulerScheduler 20 | from .cosxl.custom_pipeline import CosStableDiffusionXLInstructPix2PixPipeline 21 | from .cosxl.utils import set_timesteps_patched 22 | 23 | EDMEulerScheduler.set_timesteps = set_timesteps_patched 24 | edit_file = hf_hub_download(repo_id="stabilityai/cosxl", filename="cosxl_edit.safetensors") 25 | self.pipe = CosStableDiffusionXLInstructPix2PixPipeline.from_single_file( 26 | edit_file, num_in_channels=8 27 | ) 28 | self.pipe.scheduler = EDMEulerScheduler(sigma_min=0.002, sigma_max=120.0, sigma_data=1.0, prediction_type="v_prediction") 29 | self.pipe.to(device) 30 | 31 | def infer_one_image(self, src_image: PIL.Image.Image = None, src_prompt: str = None, target_prompt: str = None, instruct_prompt: str = None, seed: int = 42, negative_prompt=""): 32 | """ 33 | Modifies the source image based on the provided instruction prompt. 34 | 35 | Args: 36 | src_image (PIL.Image.Image): Source image in RGB format. 37 | instruct_prompt (str): Caption for editing the image. 
38 | seed (int, optional): Seed for random generator. Defaults to 42. 39 | 40 | Returns: 41 | PIL.Image.Image: The transformed image. 42 | """ 43 | src_image = src_image.convert('RGB') # force it to RGB format 44 | generator = torch.manual_seed(seed) 45 | 46 | resolution = 1024 47 | preprocessed_image = src_image.resize((resolution, resolution)) 48 | image = self.pipe(prompt=instruct_prompt, 49 | image=preprocessed_image, 50 | height=resolution, 51 | width=resolution, 52 | negative_prompt=negative_prompt, 53 | guidance_scale=7, 54 | num_inference_steps=20, 55 | generator=generator).images[0] 56 | image = image.resize((src_image.width, src_image.height)) 57 | 58 | return image 59 | -------------------------------------------------------------------------------- /black_box_image_edit/instantstyle.py: -------------------------------------------------------------------------------- 1 | from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline 2 | import cv2 3 | import torch 4 | import PIL 5 | import numpy as np 6 | import os 7 | 8 | class InstantStyle(): 9 | def __init__(self, 10 | device="cuda", 11 | weight="stabilityai/stable-diffusion-xl-base-1.0", 12 | control_weight="diffusers/controlnet-canny-sdxl-1.0", 13 | custom_sdxl_models_folder="sdxl_models"): 14 | from .ip_adapter import IPAdapterXL 15 | 16 | controlnet = ControlNetModel.from_pretrained(control_weight, 17 | use_safetensors=False, 18 | torch_dtype=torch.float16).to(device) 19 | # load SDXL pipeline 20 | sdxl_control_pipe = StableDiffusionXLControlNetPipeline.from_pretrained( 21 | weight, 22 | controlnet=controlnet, 23 | torch_dtype=torch.float16, 24 | add_watermarker=False, 25 | ) 26 | sdxl_control_pipe.enable_vae_tiling() 27 | self.ip_model = IPAdapterXL(sdxl_control_pipe, 28 | os.path.join(custom_sdxl_models_folder, "image_encoder"), 29 | os.path.join(custom_sdxl_models_folder, "ip-adapter_sdxl.bin"), 30 | device, 31 | target_blocks=["up_blocks.0.attentions.1"]) 32 | 33 | 34 | def infer_one_image(self, src_image: PIL.Image.Image = None, 35 | style_image: PIL.Image.Image = None, 36 | prompt: str = "masterpiece, best quality, high quality", 37 | seed: int = 42, 38 | negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry"): 39 | 40 | src_image = src_image.convert('RGB') # force it to RGB format 41 | style_image = style_image.convert('RGB') # force it to RGB format 42 | 43 | def pil_to_cv2(image_pil): 44 | image_np = np.array(image_pil) 45 | image_cv2 = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR) 46 | 47 | return image_cv2 48 | # control image 49 | input_image = pil_to_cv2(src_image) 50 | detected_map = cv2.Canny(input_image, 50, 200) 51 | canny_map = PIL.Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_BGR2RGB)) 52 | 53 | # generate image 54 | if prompt is None: 55 | prompt = "masterpiece, best quality, high quality" 56 | image = self.ip_model.generate(pil_image=style_image, 57 | prompt=prompt, 58 | negative_prompt=negative_prompt, 59 | scale=1.0, 60 | guidance_scale=5, 61 | num_samples=1, 62 | num_inference_steps=30, 63 | seed=seed, 64 | image=canny_map, 65 | controlnet_conditioning_scale=0.6, 66 | )[0] 67 | return image -------------------------------------------------------------------------------- /black_box_image_edit/instructpix2pix.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import PIL 3 | 4 | from diffusers import StableDiffusionInstructPix2PixPipeline, 
EulerAncestralDiscreteScheduler 5 | 6 | class InstructPix2Pix(): 7 | """ 8 | A wrapper around the StableDiffusionInstructPix2PixPipeline for guided image transformation. 9 | 10 | This class uses the Pix2Pix pipeline to transform an image based on an instruction prompt. 11 | Reference: https://huggingface.co/docs/diffusers/api/pipelines/pix2pix 12 | """ 13 | def __init__(self, device="cuda", weight="timbrooks/instruct-pix2pix"): 14 | """ 15 | Attributes: 16 | pipe (StableDiffusionInstructPix2PixPipeline): The Pix2Pix pipeline for image transformation. 17 | 18 | Args: 19 | device (str, optional): Device on which the pipeline runs. Defaults to "cuda". 20 | weight (str, optional): Pretrained weights for the model. Defaults to "timbrooks/instruct-pix2pix". 21 | """ 22 | self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( 23 | weight, 24 | torch_dtype=torch.float16, 25 | safety_checker=None, 26 | ).to(device) 27 | self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config( 28 | self.pipe.scheduler.config) 29 | 30 | def infer_one_image(self, src_image: PIL.Image.Image = None, src_prompt: str = None, target_prompt: str = None, instruct_prompt: str = None, seed: int = 42, negative_prompt=""): 31 | """ 32 | Modifies the source image based on the provided instruction prompt. 33 | 34 | Args: 35 | src_image (PIL.Image.Image): Source image in RGB format. 36 | instruct_prompt (str): Caption for editing the image. 37 | seed (int, optional): Seed for random generator. Defaults to 42. 38 | 39 | Returns: 40 | PIL.Image.Image: The transformed image. 41 | """ 42 | src_image = src_image.convert('RGB') # force it to RGB format 43 | generator = torch.manual_seed(seed) 44 | 45 | # configs from https://github.com/timothybrooks/instruct-pix2pix/blob/main/edit_cli.py 46 | image = self.pipe(instruct_prompt, image=src_image, 47 | num_inference_steps=100, 48 | image_guidance_scale=1.5, 49 | guidance_scale=7.5, 50 | negative_prompt=negative_prompt, 51 | generator=generator 52 | ).images[0] 53 | return image 54 | 55 | class MagicBrush(InstructPix2Pix): 56 | def __init__(self, device="cuda", weight="vinesmsuic/magicbrush-jul7"): 57 | """ 58 | A class for MagicBrush. 59 | 60 | Args: 61 | device (str, optional): The device on which the model should run. Default is "cuda". 62 | weight (str, optional): The pretrained model weights for MagicBrush. Default is "vinesmsuic/magicbrush-jul7". 
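        Example (illustrative sketch; `img` is a placeholder for a PIL image loaded elsewhere,
        and the instruction is taken from this repo's demo edits):

            >>> editor = MagicBrush(device="cuda")
            >>> edited = editor.infer_one_image(src_image=img, instruct_prompt="add a party hat on his head", seed=42)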
63 | """ 64 | super().__init__(device=device, weight=weight) 65 | 66 | def infer_one_image(self, src_image: PIL.Image.Image = None, src_prompt: str = None, target_prompt: str = None, instruct_prompt: str = None, seed: int = 42, negative_prompt=""): 67 | return super().infer_one_image(src_image, src_prompt, target_prompt, instruct_prompt, seed, negative_prompt) -------------------------------------------------------------------------------- /black_box_image_edit/ip_adapter/__init__.py: -------------------------------------------------------------------------------- 1 | from .ip_adapter import IPAdapter, IPAdapterPlus, IPAdapterPlusXL, IPAdapterXL, IPAdapterFull 2 | 3 | __all__ = [ 4 | "IPAdapter", 5 | "IPAdapterPlus", 6 | "IPAdapterPlusXL", 7 | "IPAdapterXL", 8 | "IPAdapterFull", 9 | ] 10 | -------------------------------------------------------------------------------- /black_box_image_edit/ip_adapter/resampler.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py 2 | # and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py 3 | 4 | import math 5 | 6 | import torch 7 | import torch.nn as nn 8 | from einops import rearrange 9 | from einops.layers.torch import Rearrange 10 | 11 | 12 | # FFN 13 | def FeedForward(dim, mult=4): 14 | inner_dim = int(dim * mult) 15 | return nn.Sequential( 16 | nn.LayerNorm(dim), 17 | nn.Linear(dim, inner_dim, bias=False), 18 | nn.GELU(), 19 | nn.Linear(inner_dim, dim, bias=False), 20 | ) 21 | 22 | 23 | def reshape_tensor(x, heads): 24 | bs, length, width = x.shape 25 | # (bs, length, width) --> (bs, length, n_heads, dim_per_head) 26 | x = x.view(bs, length, heads, -1) 27 | # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) 28 | x = x.transpose(1, 2) 29 | # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) 30 | x = x.reshape(bs, heads, length, -1) 31 | return x 32 | 33 | 34 | class PerceiverAttention(nn.Module): 35 | def __init__(self, *, dim, dim_head=64, heads=8): 36 | super().__init__() 37 | self.scale = dim_head**-0.5 38 | self.dim_head = dim_head 39 | self.heads = heads 40 | inner_dim = dim_head * heads 41 | 42 | self.norm1 = nn.LayerNorm(dim) 43 | self.norm2 = nn.LayerNorm(dim) 44 | 45 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 46 | self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) 47 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 48 | 49 | def forward(self, x, latents): 50 | """ 51 | Args: 52 | x (torch.Tensor): image features 53 | shape (b, n1, D) 54 | latent (torch.Tensor): latent features 55 | shape (b, n2, D) 56 | """ 57 | x = self.norm1(x) 58 | latents = self.norm2(latents) 59 | 60 | b, l, _ = latents.shape 61 | 62 | q = self.to_q(latents) 63 | kv_input = torch.cat((x, latents), dim=-2) 64 | k, v = self.to_kv(kv_input).chunk(2, dim=-1) 65 | 66 | q = reshape_tensor(q, self.heads) 67 | k = reshape_tensor(k, self.heads) 68 | v = reshape_tensor(v, self.heads) 69 | 70 | # attention 71 | scale = 1 / math.sqrt(math.sqrt(self.dim_head)) 72 | weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards 73 | weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) 74 | out = weight @ v 75 | 76 | out = out.permute(0, 2, 1, 3).reshape(b, l, -1) 77 | 78 | return self.to_out(out) 79 | 80 | 81 | class Resampler(nn.Module): 82 | def __init__( 83 | self, 84 | dim=1024, 85 | 
depth=8, 86 | dim_head=64, 87 | heads=16, 88 | num_queries=8, 89 | embedding_dim=768, 90 | output_dim=1024, 91 | ff_mult=4, 92 | max_seq_len: int = 257, # CLIP tokens + CLS token 93 | apply_pos_emb: bool = False, 94 | num_latents_mean_pooled: int = 0, # number of latents derived from mean pooled representation of the sequence 95 | ): 96 | super().__init__() 97 | self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) if apply_pos_emb else None 98 | 99 | self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) 100 | 101 | self.proj_in = nn.Linear(embedding_dim, dim) 102 | 103 | self.proj_out = nn.Linear(dim, output_dim) 104 | self.norm_out = nn.LayerNorm(output_dim) 105 | 106 | self.to_latents_from_mean_pooled_seq = ( 107 | nn.Sequential( 108 | nn.LayerNorm(dim), 109 | nn.Linear(dim, dim * num_latents_mean_pooled), 110 | Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled), 111 | ) 112 | if num_latents_mean_pooled > 0 113 | else None 114 | ) 115 | 116 | self.layers = nn.ModuleList([]) 117 | for _ in range(depth): 118 | self.layers.append( 119 | nn.ModuleList( 120 | [ 121 | PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), 122 | FeedForward(dim=dim, mult=ff_mult), 123 | ] 124 | ) 125 | ) 126 | 127 | def forward(self, x): 128 | if self.pos_emb is not None: 129 | n, device = x.shape[1], x.device 130 | pos_emb = self.pos_emb(torch.arange(n, device=device)) 131 | x = x + pos_emb 132 | 133 | latents = self.latents.repeat(x.size(0), 1, 1) 134 | 135 | x = self.proj_in(x) 136 | 137 | if self.to_latents_from_mean_pooled_seq: 138 | meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool)) 139 | meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq) 140 | latents = torch.cat((meanpooled_latents, latents), dim=-2) 141 | 142 | for attn, ff in self.layers: 143 | latents = attn(x, latents) + latents 144 | latents = ff(latents) + latents 145 | 146 | latents = self.proj_out(latents) 147 | return self.norm_out(latents) 148 | 149 | 150 | def masked_mean(t, *, dim, mask=None): 151 | if mask is None: 152 | return t.mean(dim=dim) 153 | 154 | denom = mask.sum(dim=dim, keepdim=True) 155 | mask = rearrange(mask, "b n -> b n 1") 156 | masked_t = t.masked_fill(~mask, 0.0) 157 | 158 | return masked_t.sum(dim=dim) / denom.clamp(min=1e-5) 159 | -------------------------------------------------------------------------------- /black_box_image_edit/ip_adapter/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from PIL import Image 5 | 6 | attn_maps = {} 7 | def hook_fn(name): 8 | def forward_hook(module, input, output): 9 | if hasattr(module.processor, "attn_map"): 10 | attn_maps[name] = module.processor.attn_map 11 | del module.processor.attn_map 12 | 13 | return forward_hook 14 | 15 | def register_cross_attention_hook(unet): 16 | for name, module in unet.named_modules(): 17 | if name.split('.')[-1].startswith('attn2'): 18 | module.register_forward_hook(hook_fn(name)) 19 | 20 | return unet 21 | 22 | def upscale(attn_map, target_size): 23 | attn_map = torch.mean(attn_map, dim=0) 24 | attn_map = attn_map.permute(1,0) 25 | temp_size = None 26 | 27 | for i in range(0,5): 28 | scale = 2 ** i 29 | if ( target_size[0] // scale ) * ( target_size[1] // scale) == attn_map.shape[1]*64: 30 | temp_size = (target_size[0]//(scale*8), target_size[1]//(scale*8)) 31 | break 32 | 33 | assert temp_size is not None, "temp_size 
cannot is None" 34 | 35 | attn_map = attn_map.view(attn_map.shape[0], *temp_size) 36 | 37 | attn_map = F.interpolate( 38 | attn_map.unsqueeze(0).to(dtype=torch.float32), 39 | size=target_size, 40 | mode='bilinear', 41 | align_corners=False 42 | )[0] 43 | 44 | attn_map = torch.softmax(attn_map, dim=0) 45 | return attn_map 46 | def get_net_attn_map(image_size, batch_size=2, instance_or_negative=False, detach=True): 47 | 48 | idx = 0 if instance_or_negative else 1 49 | net_attn_maps = [] 50 | 51 | for name, attn_map in attn_maps.items(): 52 | attn_map = attn_map.cpu() if detach else attn_map 53 | attn_map = torch.chunk(attn_map, batch_size)[idx].squeeze() 54 | attn_map = upscale(attn_map, image_size) 55 | net_attn_maps.append(attn_map) 56 | 57 | net_attn_maps = torch.mean(torch.stack(net_attn_maps,dim=0),dim=0) 58 | 59 | return net_attn_maps 60 | 61 | def attnmaps2images(net_attn_maps): 62 | 63 | #total_attn_scores = 0 64 | images = [] 65 | 66 | for attn_map in net_attn_maps: 67 | attn_map = attn_map.cpu().numpy() 68 | #total_attn_scores += attn_map.mean().item() 69 | 70 | normalized_attn_map = (attn_map - np.min(attn_map)) / (np.max(attn_map) - np.min(attn_map)) * 255 71 | normalized_attn_map = normalized_attn_map.astype(np.uint8) 72 | #print("norm: ", normalized_attn_map.shape) 73 | image = Image.fromarray(normalized_attn_map) 74 | 75 | #image = fix_save_attn_map(attn_map) 76 | images.append(image) 77 | 78 | #print(total_attn_scores) 79 | return images 80 | def is_torch2_available(): 81 | return hasattr(F, "scaled_dot_product_attention") 82 | 83 | def get_generator(seed, device): 84 | 85 | if seed is not None: 86 | if isinstance(seed, list): 87 | generator = [torch.Generator(device).manual_seed(seed_item) for seed_item in seed] 88 | else: 89 | generator = torch.Generator(device).manual_seed(seed) 90 | else: 91 | generator = None 92 | 93 | return generator -------------------------------------------------------------------------------- /black_box_image_edit/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from moviepy.editor import VideoFileClip 3 | import random 4 | from PIL import Image 5 | import numpy as np 6 | 7 | def crop_and_resize_video(input_video_path, output_folder, clip_duration=None, width=None, height=None, start_time=None, end_time=None, n_frames=16, center_crop=False, x_offset=0, y_offset=0, longest_to_width=False): # Load the video file 8 | video = VideoFileClip(input_video_path) 9 | 10 | # Calculate start and end times for cropping 11 | if clip_duration is not None: 12 | if start_time is not None: 13 | start_time = float(start_time) 14 | end_time = start_time + clip_duration 15 | elif end_time is not None: 16 | end_time = float(end_time) 17 | start_time = end_time - clip_duration 18 | else: 19 | # Default to random cropping if neither start nor end time is specified 20 | video_duration = video.duration 21 | if video_duration <= clip_duration: 22 | print(f"Skipping {input_video_path}: duration is less than or equal to the clip duration.") 23 | return 24 | max_start_time = video_duration - clip_duration 25 | start_time = random.uniform(0, max_start_time) 26 | end_time = start_time + clip_duration 27 | elif start_time is not None and end_time is not None: 28 | start_time = float(start_time) 29 | end_time = float(end_time) 30 | clip_duration = int(end_time - start_time) 31 | else: 32 | raise ValueError("Either clip_duration must be provided, or both start_time and end_time must be specified.") 33 | 34 | # Crop the 
video 35 | cropped_video = video.subclip(start_time, end_time) 36 | 37 | 38 | if center_crop: 39 | # Calculate scale to ensure the desired crop size fits within the video 40 | video_width, video_height = cropped_video.size 41 | scale_width = video_width / width 42 | scale_height = video_height / height 43 | if longest_to_width: 44 | scale = max(scale_width, scale_height) 45 | else: 46 | scale = min(scale_width, scale_height) 47 | 48 | # Resize video to ensure the crop area fits within the frame 49 | # This step ensures that the smallest dimension matches or exceeds 512 pixels 50 | new_width = int(video_width / scale) 51 | new_height = int(video_height / scale) 52 | resized_video = cropped_video.resize(newsize=(new_width, new_height)) 53 | print(f"Resized video to ({new_width}, {new_height})") 54 | 55 | # Calculate crop position with offset, ensuring the crop does not go out of bounds 56 | # The offset calculation needs to ensure that the cropping area remains within the video frame 57 | offset_x = int(((x_offset + 1) / 2) * (new_width - width)) # Adjusted for [-1, 1] scale 58 | offset_y = int(((y_offset + 1) / 2) * (new_height - height)) # Adjusted for [-1, 1] scale 59 | 60 | # Ensure offsets do not push the crop area out of the video frame 61 | offset_x = max(0, min(new_width - width, offset_x)) 62 | offset_y = max(0, min(new_height - height, offset_y)) 63 | 64 | # Apply center crop with offsets 65 | cropped_video = resized_video.crop(x1=offset_x, y1=offset_y, width=width, height=height) 66 | elif width and height: 67 | # Directly resize the video to specified width and height if no center crop is specified 68 | cropped_video = cropped_video.resize(newsize=(width, height)) 69 | 70 | 71 | # After resizing and cropping, set the frame rate to fps 72 | fps = n_frames // clip_duration 73 | final_video = cropped_video.set_fps(fps) 74 | 75 | # Prepare the output video path 76 | if not os.path.exists(output_folder): 77 | os.makedirs(output_folder) 78 | filename = os.path.basename(input_video_path) 79 | output_video_path = os.path.join(output_folder, filename) 80 | 81 | # Write the result to the output file 82 | final_video.write_videofile(output_video_path, codec='libx264', audio_codec='aac', fps=fps) 83 | print(f"Processed {input_video_path}, saved to {output_video_path}") 84 | return output_video_path 85 | 86 | 87 | def infer_video_prompt(model, video_path, output_dir, prompt, prompt_type="instruct", force_512=False, seed=42, negative_prompt="", overwrite=False): 88 | """ 89 | Processes videos from the input directory, resizes them to 512x512 before feeding into the model by first frame, 90 | and saves the processed video back to its original size in the output directory. 91 | 92 | Args: 93 | model: The video editing model. 94 | input_dir (str): Path to the directory containing input videos. 95 | output_dir (str): Path to the directory where processed videos will be saved. 96 | prompt (str): Instruction prompt for video editing. 
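    Example (illustrative sketch; the editor model, demo video, and prompt are placeholders
    based on the assets shipped in this repo's `demo/` folder):

        >>> editor = InstructPix2Pix(device="cuda")
        >>> infer_video_prompt(editor, "demo/Ballet.mp4", "demo/Ballet/edited_first_frame",
        ...                    prompt="van gogh style", prompt_type="instruct", force_512=True)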
97 | """ 98 | 99 | # Create the output directory if it does not exist 100 | if not os.path.exists(output_dir): 101 | os.makedirs(output_dir) 102 | 103 | video_clip = VideoFileClip(video_path) 104 | video_filename = os.path.basename(video_path) 105 | # filename_noext = os.path.splitext(video_filename)[0] 106 | 107 | # Create the output directory if it does not exist 108 | # final_output_dir = os.path.join(output_dir, filename_noext) 109 | final_output_dir = output_dir 110 | if not os.path.exists(final_output_dir): 111 | os.makedirs(final_output_dir) 112 | 113 | result_path = os.path.join(final_output_dir, prompt + ".png") 114 | 115 | # Check if result already exists 116 | if os.path.exists(result_path) and overwrite is False: 117 | print(f"Result already exists: {result_path}") 118 | return 119 | 120 | def process_frame(image): 121 | pil_image = Image.fromarray(image) 122 | if force_512: 123 | pil_image = pil_image.resize((512, 512), Image.LANCZOS) 124 | if prompt_type == "instruct": 125 | result = model.infer_one_image(pil_image, instruct_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 126 | else: 127 | result = model.infer_one_image(pil_image, target_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 128 | if force_512: 129 | result = result.resize(video_clip.size, Image.LANCZOS) 130 | return np.array(result) 131 | 132 | # Process only the first frame 133 | first_frame = video_clip.get_frame(0) # Get the first frame 134 | processed_frame = process_frame(first_frame) # Process the first frame 135 | 136 | 137 | #Image.fromarray(first_frame).save(os.path.join(final_output_dir, "00000.png")) 138 | Image.fromarray(processed_frame).save(result_path) 139 | print(f"Processed and saved the first frame: {result_path}") 140 | return result_path 141 | 142 | def infer_video_style(model, video_path, output_dir, style_image, prompt, force_512=False, seed=42, negative_prompt="", overwrite=False): 143 | if not os.path.exists(output_dir): 144 | os.makedirs(output_dir) 145 | 146 | video_clip = VideoFileClip(video_path) 147 | video_filename = os.path.basename(video_path) 148 | final_output_dir = output_dir 149 | if not os.path.exists(final_output_dir): 150 | os.makedirs(final_output_dir) 151 | 152 | result_path = os.path.join(final_output_dir, "style" + ".png") 153 | if os.path.exists(result_path) and overwrite is False: 154 | print(f"Result already exists: {result_path}") 155 | return 156 | def process_frame(image): 157 | pil_image = Image.fromarray(image) 158 | if force_512: 159 | pil_image = pil_image.resize((512, 512), Image.LANCZOS) 160 | result = model.infer_one_image(pil_image, 161 | style_image=style_image, 162 | prompt=prompt, 163 | seed=seed, 164 | negative_prompt=negative_prompt) 165 | if force_512: 166 | result = result.resize(video_clip.size, Image.LANCZOS) 167 | return np.array(result) 168 | # Process only the first frame 169 | first_frame = video_clip.get_frame(0) # Get the first frame 170 | processed_frame = process_frame(first_frame) # Process the first frame 171 | Image.fromarray(processed_frame).save(result_path) 172 | print(f"Processed and saved the first frame: {result_path}") 173 | return result_path -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | # set to true if your model requires a GPU 6 | gpu: true 7 | system_packages: 
8 | - "libgl1-mesa-glx" 9 | - "libglib2.0-0" 10 | python_version: "3.11" 11 | python_packages: 12 | - torch==2.0.1 13 | - torchvision==0.15.2 14 | - accelerate==0.27.2 15 | - diffusers==0.26.3 16 | - moviepy 17 | - transformers==4.38.1 18 | - omegaconf==2.3.0 19 | - opencv-python 20 | - imageio 21 | run: 22 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget 23 | predict: "predict.py:Predictor" 24 | -------------------------------------------------------------------------------- /consisti2v/README.md: -------------------------------------------------------------------------------- 1 | # AnyV2V(_ConsistI2V_) 2 | 3 | Our AnyV2V(_ConsistI2V_) is a standalone version. 4 | 5 | ## Setup for ConsistI2V 6 | 7 | ### Prepare Environment 8 | ``` 9 | conda env create -f environment.yaml 10 | conda activate consisti2v 11 | ``` 12 | 13 | ## AnyV2V 14 | 15 | **Note:** due to the lower training resolution of ConsistI2V (256x256), it might perform better on 256x256 inputs. We provide configurations for running on both 256x256 and 512x512. 16 | 17 | ### Run ConsistI2V DDIM Inversion to get the initial latent 18 | Usage Example: 19 | ```shell 20 | python run_ddim_inversion.py --config configs/pipeline_256/ddim_inversion_256.yaml video_path=/path/to/your_video.mp4 video_name=your_video 21 | ``` 22 | 23 | Saved latent goes to `./ddim_version` (can be configurated in `./configs/pipeline_256(512)/ddim_inversion_256(512).yaml`). 24 | 25 | ### Run AnyV2V with ConsistI2V 26 | 27 | Your need to prepare your edited image frame first. We provided an image editing script in the root folder of AnyV2V. 28 | 29 | Usage Example: 30 | ```shell 31 | python run_pnp_edit.py --config configs/pipeline_256/pnp_edit.yaml \ 32 | video_path=/path/to/your_video.mp4 \ 33 | video_name=your_video \ 34 | edited_first_frame_path=/path/to/edited_first_frame.png \ 35 | editing_prompt="" \ 36 | ddim_latents_path=/path/to/ddim_latents 37 | ``` 38 | 39 | Saved video goes to `./anyv2v_results` (can be configurated in `./configs/pipeline_256(512)/pnp_edit.yaml`). 
40 | -------------------------------------------------------------------------------- /consisti2v/configs/pipeline_256/ddim_inversion_256.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: False # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "ddim_inversion/${exp_name}" 9 | 10 | # Data 11 | image_size: [256, 256] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: null 16 | save_frames: True 17 | 18 | # DDIM settings 19 | n_frames: 16 20 | 21 | # DDIM inversion 22 | inverse_config: 23 | image_size: ${image_size} 24 | n_frames: ${n_frames} 25 | cfg_txt: 1.0 26 | cfg_img: 1.0 27 | frame_stride: 3 28 | prompt: "" 29 | negative_prompt: "" 30 | n_steps: 500 31 | output_dir: "outputs/${exp_name}" 32 | 33 | # DDIM reconstruction 34 | recon_config: 35 | image_size: ${image_size} 36 | n_frames: ${n_frames} 37 | cfg_txt: 1.0 38 | cfg_img: 1.0 39 | frame_stride: 3 40 | prompt: "" 41 | negative_prompt: "" 42 | n_steps: 50 43 | ddim_init_latents_t_idx: 0 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 -------------------------------------------------------------------------------- /consisti2v/configs/pipeline_256/pnp_edit.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: True # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "anyv2v_results/${exp_name}" 9 | 10 | # Data 11 | image_size: [256, 256] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: "/${video_name}" 16 | edited_first_frame_path: "/.png" 17 | 18 | 19 | # Pnp Editing 20 | n_frames: 16 21 | cfg_txt: 35 22 | cfg_img: 1.0 23 | frame_stride: 3 24 | editing_prompt: "" 25 | editing_negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 26 | n_steps: 50 27 | ddim_init_latents_t_idx: 4 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 28 | ddim_inv_prompt: "" 29 | ddim_latents_path: "path/to/ddim_latents" 30 | 31 | # Pnp config 32 | pnp_f_t: 0.2 33 | pnp_spatial_attn_t: 0.2 34 | pnp_temp_attn_t: 0.5 35 | 36 | blend_ratio: 0.0 -------------------------------------------------------------------------------- /consisti2v/configs/pipeline_512/ddim_inversion_512.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: False # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "ddim_inversion/${exp_name}" 9 | 10 | # Data 11 | image_size: [512, 512] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: null 16 | save_frames: True 17 | 18 | # DDIM settings 19 | n_frames: 16 20 | 21 | # DDIM inversion 22 | inverse_config: 23 | image_size: ${image_size} 24 | n_frames: ${n_frames} 25 | cfg_txt: 1.0 26 | cfg_img: 1.0 27 | frame_stride: 3 28 | prompt: "" 29 | negative_prompt: "" 30 | n_steps: 500 31 | output_dir: "outputs/${exp_name}" 32 | 33 | # DDIM reconstruction 34 | recon_config: 35 | image_size: ${image_size} 36 | n_frames: ${n_frames} 37 | cfg_txt: 1.0 38 | cfg_img: 1.0 39 | frame_stride: 3 40 | prompt: "" 41 | negative_prompt: "" 42 | n_steps: 50 43 | ddim_init_latents_t_idx: 0 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 
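# Note on ddim_init_latents_t_idx (illustrative, assuming the usual 1000 training
# timesteps and the "leading" timestep spacing used by Stable Diffusion schedulers):
# with n_steps = 50 the DDIM timesteps are 981, 961, ..., 1 (spaced every 20), so
# ddim_init_latents_t_idx = i selects timestep 981 - 20 * i, which is where the
# mapping 0 -> 981, 3 -> 921, 9 -> 801, 20 -> 581 in the comment above comes from.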
-------------------------------------------------------------------------------- /consisti2v/configs/pipeline_512/pnp_edit.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 8888 3 | device: "cuda:0" 4 | debug: True # For logging 5 | 6 | # Dir 7 | exp_name: "${video_name}" 8 | output_dir: "anyv2v_results/${exp_name}" 9 | 10 | # Data 11 | image_size: [512, 512] 12 | data_dir: null 13 | video_name: "" 14 | video_path: "/${video_name}.mp4" 15 | video_frames_path: "/${video_name}" 16 | edited_first_frame_path: "/.png" 17 | 18 | 19 | # Pnp Editing 20 | n_frames: 16 21 | cfg_txt: 35 22 | cfg_img: 1.0 23 | frame_stride: 3 24 | editing_prompt: "" 25 | editing_negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 26 | n_steps: 50 27 | ddim_init_latents_t_idx: 4 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 28 | ddim_inv_prompt: "" 29 | ddim_latents_path: "path/to/ddim_latents" 30 | 31 | # Pnp config 32 | pnp_f_t: 0.2 33 | pnp_spatial_attn_t: 0.2 34 | pnp_temp_attn_t: 0.5 35 | 36 | blend_ratio: 0.0 -------------------------------------------------------------------------------- /consisti2v/consisti2v/utils/frameinit_utils.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/TianxingWu/FreeInit/blob/master/freeinit_utils.py 2 | import torch 3 | import torch.fft as fft 4 | import math 5 | 6 | 7 | def freq_mix_3d(x, noise, LPF): 8 | """ 9 | Noise reinitialization. 10 | 11 | Args: 12 | x: diffused latent 13 | noise: randomly sampled noise 14 | LPF: low pass filter 15 | """ 16 | # FFT 17 | x_freq = fft.fftn(x, dim=(-3, -2, -1)) 18 | x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1)) 19 | noise_freq = fft.fftn(noise, dim=(-3, -2, -1)) 20 | noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1)) 21 | 22 | # frequency mix 23 | HPF = 1 - LPF 24 | x_freq_low = x_freq * LPF 25 | noise_freq_high = noise_freq * HPF 26 | x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain 27 | 28 | # IFFT 29 | x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1)) 30 | x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real 31 | 32 | return x_mixed 33 | 34 | 35 | def get_freq_filter(shape, device, filter_type, n, d_s, d_t): 36 | """ 37 | Form the frequency filter for noise reinitialization. 38 | 39 | Args: 40 | shape: shape of latent (B, C, T, H, W) 41 | filter_type: type of the freq filter 42 | n: (only for butterworth) order of the filter, larger n ~ ideal, smaller n ~ gaussian 43 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 44 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 45 | """ 46 | if filter_type == "gaussian": 47 | return gaussian_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device) 48 | elif filter_type == "ideal": 49 | return ideal_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device) 50 | elif filter_type == "box": 51 | return box_low_pass_filter(shape=shape, d_s=d_s, d_t=d_t).to(device) 52 | elif filter_type == "butterworth": 53 | return butterworth_low_pass_filter(shape=shape, n=n, d_s=d_s, d_t=d_t).to(device) 54 | else: 55 | raise NotImplementedError 56 | 57 | 58 | def gaussian_low_pass_filter(shape, d_s=0.25, d_t=0.25): 59 | """ 60 | Compute the gaussian low pass filter mask. 
61 | 62 | Args: 63 | shape: shape of the filter (volume) 64 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 65 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 66 | """ 67 | T, H, W = shape[-3], shape[-2], shape[-1] 68 | mask = torch.zeros(shape) 69 | if d_s==0 or d_t==0: 70 | return mask 71 | for t in range(T): 72 | for h in range(H): 73 | for w in range(W): 74 | d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2) 75 | mask[..., t,h,w] = math.exp(-1/(2*d_s**2) * d_square) 76 | return mask 77 | 78 | 79 | def butterworth_low_pass_filter(shape, n=4, d_s=0.25, d_t=0.25): 80 | """ 81 | Compute the butterworth low pass filter mask. 82 | 83 | Args: 84 | shape: shape of the filter (volume) 85 | n: order of the filter, larger n ~ ideal, smaller n ~ gaussian 86 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 87 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 88 | """ 89 | T, H, W = shape[-3], shape[-2], shape[-1] 90 | mask = torch.zeros(shape) 91 | if d_s==0 or d_t==0: 92 | return mask 93 | for t in range(T): 94 | for h in range(H): 95 | for w in range(W): 96 | d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2) 97 | mask[..., t,h,w] = 1 / (1 + (d_square / d_s**2)**n) 98 | return mask 99 | 100 | 101 | def ideal_low_pass_filter(shape, d_s=0.25, d_t=0.25): 102 | """ 103 | Compute the ideal low pass filter mask. 104 | 105 | Args: 106 | shape: shape of the filter (volume) 107 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 108 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 109 | """ 110 | T, H, W = shape[-3], shape[-2], shape[-1] 111 | mask = torch.zeros(shape) 112 | if d_s==0 or d_t==0: 113 | return mask 114 | for t in range(T): 115 | for h in range(H): 116 | for w in range(W): 117 | d_square = (((d_s/d_t)*(2*t/T-1))**2 + (2*h/H-1)**2 + (2*w/W-1)**2) 118 | mask[..., t,h,w] = 1 if d_square <= d_s*2 else 0 119 | return mask 120 | 121 | 122 | def box_low_pass_filter(shape, d_s=0.25, d_t=0.25): 123 | """ 124 | Compute the ideal low pass filter mask (approximated version). 
125 | 126 | Args: 127 | shape: shape of the filter (volume) 128 | d_s: normalized stop frequency for spatial dimensions (0.0-1.0) 129 | d_t: normalized stop frequency for temporal dimension (0.0-1.0) 130 | """ 131 | T, H, W = shape[-3], shape[-2], shape[-1] 132 | mask = torch.zeros(shape) 133 | if d_s==0 or d_t==0: 134 | return mask 135 | 136 | threshold_s = round(int(H // 2) * d_s) 137 | threshold_t = round(T // 2 * d_t) 138 | 139 | cframe, crow, ccol = T // 2, H // 2, W //2 140 | mask[..., cframe - threshold_t:cframe + threshold_t, crow - threshold_s:crow + threshold_s, ccol - threshold_s:ccol + threshold_s] = 1.0 141 | 142 | return mask -------------------------------------------------------------------------------- /consisti2v/consisti2v/utils/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import imageio 3 | import numpy as np 4 | from typing import Union 5 | 6 | import torch 7 | import torchvision 8 | import torch.distributed as dist 9 | import wandb 10 | 11 | from tqdm import tqdm 12 | from einops import rearrange 13 | 14 | from torchmetrics.image.fid import _compute_fid 15 | 16 | 17 | def zero_rank_print(s): 18 | if (not dist.is_initialized()) or (dist.is_initialized() and dist.get_rank() == 0): print("### " + s) 19 | 20 | 21 | def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, wandb=False, global_step=0, format="gif"): 22 | videos = rearrange(videos, "b c t h w -> t b c h w") 23 | outputs = [] 24 | for x in videos: 25 | x = torchvision.utils.make_grid(x, nrow=n_rows) 26 | x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) 27 | if rescale: 28 | x = (x + 1.0) / 2.0 # -1,1 -> 0,1 29 | x = (x * 255).numpy().astype(np.uint8) 30 | outputs.append(x) 31 | 32 | if wandb: 33 | wandb_video = wandb.Video(outputs, fps=fps) 34 | wandb.log({"val_videos": wandb_video}, step=global_step) 35 | 36 | os.makedirs(os.path.dirname(path), exist_ok=True) 37 | if format == "gif": 38 | imageio.mimsave(path, outputs, fps=fps) 39 | elif format == "mp4": 40 | torchvision.io.write_video(path, np.array(outputs), fps=fps, video_codec='h264', options={'crf': '10'}) 41 | 42 | # DDIM Inversion 43 | @torch.no_grad() 44 | def init_prompt(prompt, pipeline): 45 | uncond_input = pipeline.tokenizer( 46 | [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length, 47 | return_tensors="pt" 48 | ) 49 | uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0] 50 | text_input = pipeline.tokenizer( 51 | [prompt], 52 | padding="max_length", 53 | max_length=pipeline.tokenizer.model_max_length, 54 | truncation=True, 55 | return_tensors="pt", 56 | ) 57 | text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0] 58 | context = torch.cat([uncond_embeddings, text_embeddings]) 59 | 60 | return context 61 | 62 | 63 | def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, 64 | sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler): 65 | timestep, next_timestep = min( 66 | timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999), timestep 67 | alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod 68 | alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep] 69 | beta_prod_t = 1 - alpha_prod_t 70 | next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5 71 | next_sample_direction = (1 - 
alpha_prod_t_next) ** 0.5 * model_output 72 | next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction 73 | return next_sample 74 | 75 | 76 | def get_noise_pred_single(latents, t, context, first_frame_latents, frame_stride, unet): 77 | noise_pred = unet(latents, t, encoder_hidden_states=context, first_frame_latents=first_frame_latents, frame_stride=frame_stride).sample 78 | return noise_pred 79 | 80 | 81 | @torch.no_grad() 82 | def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt, first_frame_latents, frame_stride): 83 | context = init_prompt(prompt, pipeline) 84 | uncond_embeddings, cond_embeddings = context.chunk(2) 85 | all_latent = [latent] 86 | latent = latent.clone().detach() 87 | for i in tqdm(range(num_inv_steps)): 88 | t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1] 89 | noise_pred = get_noise_pred_single(latent, t, cond_embeddings, first_frame_latents, frame_stride, pipeline.unet) 90 | latent = next_step(noise_pred, t, latent, ddim_scheduler) 91 | all_latent.append(latent) 92 | return all_latent 93 | 94 | 95 | @torch.no_grad() 96 | def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt="", first_frame_latents=None, frame_stride=3): 97 | ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt, first_frame_latents, frame_stride) 98 | return ddim_latents 99 | 100 | 101 | def compute_fid(real_features, fake_features, num_features, device): 102 | orig_dtype = real_features.dtype 103 | 104 | mx_num_feats = (num_features, num_features) 105 | real_features_sum = torch.zeros(num_features).double().to(device) 106 | real_features_cov_sum = torch.zeros(mx_num_feats).double().to(device) 107 | real_features_num_samples = torch.tensor(0).long().to(device) 108 | 109 | fake_features_sum = torch.zeros(num_features).double().to(device) 110 | fake_features_cov_sum = torch.zeros(mx_num_feats).double().to(device) 111 | fake_features_num_samples = torch.tensor(0).long().to(device) 112 | 113 | real_features = real_features.double() 114 | fake_features = fake_features.double() 115 | 116 | real_features_sum += real_features.sum(dim=0) 117 | real_features_cov_sum += real_features.t().mm(real_features) 118 | real_features_num_samples += real_features.shape[0] 119 | 120 | fake_features_sum += fake_features.sum(dim=0) 121 | fake_features_cov_sum += fake_features.t().mm(fake_features) 122 | fake_features_num_samples += fake_features.shape[0] 123 | 124 | """Calculate FID score based on accumulated extracted features from the two distributions.""" 125 | if real_features_num_samples < 2 or fake_features_num_samples < 2: 126 | raise RuntimeError("More than one sample is required for both the real and fake distributed to compute FID") 127 | mean_real = (real_features_sum / real_features_num_samples).unsqueeze(0) 128 | mean_fake = (fake_features_sum / fake_features_num_samples).unsqueeze(0) 129 | 130 | cov_real_num = real_features_cov_sum - real_features_num_samples * mean_real.t().mm(mean_real) 131 | cov_real = cov_real_num / (real_features_num_samples - 1) 132 | cov_fake_num = fake_features_cov_sum - fake_features_num_samples * mean_fake.t().mm(mean_fake) 133 | cov_fake = cov_fake_num / (fake_features_num_samples - 1) 134 | return _compute_fid(mean_real.squeeze(0), cov_real, mean_fake.squeeze(0), cov_fake).to(orig_dtype) 135 | 136 | 137 | def compute_inception_score(gen_probs, num_splits=10): 138 | num_gen = gen_probs.shape[0] 139 | gen_probs = gen_probs.detach().cpu().numpy() 140 | scores = 
[] 141 | np.random.RandomState(42).shuffle(gen_probs) 142 | for i in range(num_splits): 143 | part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits] 144 | kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True))) 145 | kl = np.mean(np.sum(kl, axis=1)) 146 | scores.append(np.exp(kl)) 147 | return float(np.mean(scores)), float(np.std(scores)) 148 | # idx = torch.randperm(features.shape[0]) 149 | # features = features[idx] 150 | # # calculate probs and logits 151 | # prob = features.softmax(dim=1) 152 | # log_prob = features.log_softmax(dim=1) 153 | 154 | # # split into groups 155 | # prob = prob.chunk(splits, dim=0) 156 | # log_prob = log_prob.chunk(splits, dim=0) 157 | 158 | # # calculate score per split 159 | # mean_prob = [p.mean(dim=0, keepdim=True) for p in prob] 160 | # kl_ = [p * (log_p - m_p.log()) for p, log_p, m_p in zip(prob, log_prob, mean_prob)] 161 | # kl_ = [k.sum(dim=1).mean().exp() for k in kl_] 162 | # kl = torch.stack(kl_) 163 | 164 | # return mean and std 165 | # return kl.mean(), kl.std() -------------------------------------------------------------------------------- /consisti2v/environment.yaml: -------------------------------------------------------------------------------- 1 | name: consisti2v 2 | channels: 3 | - pytorch 4 | - nvidia 5 | dependencies: 6 | - python=3.10 7 | - pytorch=2.1.0 8 | - torchvision=0.16.0 9 | - torchaudio=2.1.0 10 | - pytorch-cuda=11.8 11 | - pip 12 | - pip: 13 | - diffusers==0.21.2 14 | - transformers==4.25.1 15 | - accelerate==0.23.0 16 | - imageio==2.27.0 17 | - decord==0.6.0 18 | - einops 19 | - omegaconf 20 | - safetensors 21 | - gradio==3.42.0 22 | - wandb 23 | - moviepy 24 | - scikit-learn 25 | - av 26 | - rotary_embedding_torch 27 | - torchmetrics 28 | - torch-fidelity 29 | -------------------------------------------------------------------------------- /consisti2v/run_ddim_inversion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import argparse 5 | import logging 6 | from omegaconf import OmegaConf 7 | from PIL import Image 8 | from pathlib import Path 9 | 10 | # HF imports 11 | from diffusers import DDIMScheduler 12 | from ddim_inverse_scheduler import DDIMInverseScheduler 13 | 14 | # Project imports 15 | from utils import ( 16 | seed_everything, 17 | load_video_frames, 18 | convert_video_to_frames, 19 | load_ddim_latents_at_T, 20 | load_ddim_latents_at_t, 21 | ) 22 | from consisti2v.pipelines.pipeline_video_editing import ConditionalVideoEditingPipeline 23 | from consisti2v.utils.util import save_videos_grid 24 | 25 | 26 | def ddim_inversion(config, first_frame, frame_list, pipe: ConditionalVideoEditingPipeline, inverse_scheduler, g): 27 | pipe.scheduler = inverse_scheduler 28 | video_latents_at_0 = pipe.encode_vae_video( 29 | frame_list, 30 | device=pipe._execution_device, 31 | height=config.image_size[1], 32 | width=config.image_size[0], 33 | ) 34 | ddim_latents = pipe.invert( 35 | prompt=config.prompt, 36 | first_frame_paths=first_frame, 37 | height=config.image_size[1], 38 | width=config.image_size[0], 39 | video_length=config.n_frames, 40 | num_inference_steps=config.n_steps, 41 | guidance_scale_txt=config.cfg_txt, 42 | guidance_scale_img=config.cfg_img, 43 | negative_prompt=config.negative_prompt, 44 | frame_stride=config.frame_stride, 45 | latents=video_latents_at_0, 46 | generator=g, # TODO: this is not correct 47 | return_dict=False, 48 | output_type="latent", 49 | output_dir=config.output_dir, 50 
| ).videos # [b, num_inference_steps, c, num_frames, h, w] 51 | logger.debug(f"ddim_latents.shape: {ddim_latents.shape}") 52 | ddim_latents = ddim_latents[0] # [num_inference_steps, c, num_frames, h, w] 53 | return ddim_latents 54 | 55 | 56 | def ddim_sampling( 57 | config, first_frame, ddim_latents_at_T, pipe: ConditionalVideoEditingPipeline, ddim_scheduler, g, ddim_init_latents_t_idx 58 | ): 59 | pipe.scheduler = ddim_scheduler 60 | reconstructed_video = pipe( 61 | prompt=config.prompt, 62 | first_frame_paths=first_frame, 63 | height=config.image_size[1], 64 | width=config.image_size[0], 65 | video_length=config.n_frames, 66 | num_inference_steps=config.n_steps, 67 | guidance_scale_txt=config.cfg_txt, 68 | guidance_scale_img=config.cfg_img, 69 | negative_prompt=config.negative_prompt, 70 | frame_stride=config.frame_stride, 71 | latents=ddim_latents_at_T, 72 | generator=g, # TODO: this is not correct 73 | return_dict=True, 74 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 75 | ).videos 76 | return reconstructed_video 77 | 78 | 79 | def main(config): 80 | seed_everything(config.seed) 81 | torch.set_grad_enabled(False) 82 | device = torch.device(config.device) 83 | 84 | # Initialize the pipeline 85 | # TODO: do we need the get_inverse_timesteps function? 86 | pipe = ConditionalVideoEditingPipeline.from_pretrained( 87 | "TIGER-Lab/ConsistI2V", 88 | torch_dtype=torch.float16, 89 | ) 90 | # TODO: set the model to GPU and eval mode 91 | pipe.to(device) 92 | g = torch.Generator(device=device) 93 | g = g.manual_seed(config.seed) 94 | 95 | # Initialize the DDIM inverse scheduler 96 | inverse_scheduler = DDIMInverseScheduler.from_pretrained( 97 | "TIGER-Lab/ConsistI2V", 98 | subfolder="scheduler", 99 | ) 100 | # Initialize the DDIM scheduler 101 | ddim_scheduler = DDIMScheduler.from_pretrained( 102 | "TIGER-Lab/ConsistI2V", 103 | subfolder="scheduler", 104 | ) 105 | 106 | if config.video_path: 107 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=config.save_frames, save_dir=config.output_dir) 108 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 109 | logger.debug(f"len(frame_list): {len(frame_list)}") 110 | video_name = Path(config.video_path).stem 111 | first_frame_path = os.path.join(config.output_dir, video_name, '00000.png') 112 | elif config.video_frames_path: 113 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames) 114 | first_frame_path = os.path.join(config.video_frames_path, '00000.png') 115 | else: 116 | raise ValueError("Please provide either video_path or video_frames_path") 117 | 118 | # Main pipeline 119 | ddim_latents = ddim_inversion(config.inverse_config, first_frame_path, frame_list, pipe, inverse_scheduler, g) 120 | 121 | recon_config = config.recon_config 122 | ddim_init_latents_t_idx = recon_config.ddim_init_latents_t_idx 123 | ddim_scheduler.set_timesteps(recon_config.n_steps) 124 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 125 | ddim_latents_path = config.inverse_config.output_dir 126 | ddim_latents_at_t = load_ddim_latents_at_t( 127 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], ddim_latents_path=ddim_latents_path 128 | ) 129 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 130 | 131 | reconstructed_video = ddim_sampling(recon_config, first_frame_path, ddim_latents_at_t, pipe, ddim_scheduler, g, ddim_init_latents_t_idx) 132 | 133 | # Save reconstructed frames and video 134 | 
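# Note: save_videos_grid (defined in consisti2v/utils/util.py) expects videos shaped [b, c, t, h, w]
# and writes both a GIF (via imageio) and an MP4 (via torchvision) under config.output_dir.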
os.makedirs(config.output_dir, exist_ok=True) 135 | save_videos_grid(reconstructed_video, os.path.join(config.output_dir, "ddim_reconstruction.gif"), fps=10, format="gif") 136 | save_videos_grid(reconstructed_video, os.path.join(config.output_dir, "ddim_reconstruction.mp4"), fps=10, format="mp4") 137 | logger.info(f"Saved reconstructed video to {config.output_dir}") 138 | 139 | 140 | if __name__ == "__main__": 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--config", type=str, default="configs/pipeline_256/ddim_inversion_256.yaml") 143 | parser.add_argument("optional_args", nargs='*', default=[]) 144 | args = parser.parse_args() 145 | config = OmegaConf.load(args.config) 146 | 147 | if args.optional_args: 148 | modified_config = OmegaConf.from_dotlist(args.optional_args) 149 | config = OmegaConf.merge(config, modified_config) 150 | 151 | logging_level = logging.DEBUG if config.debug else logging.INFO 152 | logging.basicConfig( 153 | level=logging_level, 154 | format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s", 155 | ) 156 | logger = logging.getLogger(__name__) 157 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 158 | 159 | main(config) 160 | -------------------------------------------------------------------------------- /consisti2v/run_pnp_edit.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import argparse 6 | import logging 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | 10 | # HF imports 11 | from diffusers import DDIMScheduler 12 | 13 | # Project imports 14 | from utils import ( 15 | seed_everything, 16 | load_video_frames, 17 | convert_video_to_frames, 18 | load_ddim_latents_at_T, 19 | load_ddim_latents_at_t, 20 | ) 21 | from consisti2v.pipelines.pipeline_video_editing import ConditionalVideoEditingPipeline 22 | from consisti2v.utils.util import save_videos_grid 23 | from pnp_utils import ( 24 | register_time, 25 | register_conv_injection, 26 | register_spatial_attention_pnp, 27 | register_temp_attention_pnp, 28 | ) 29 | 30 | 31 | def init_pnp(pipe, scheduler, config): 32 | conv_injection_t = int(config.n_steps * config.pnp_f_t) 33 | spatial_attn_qk_injection_t = int(config.n_steps * config.pnp_spatial_attn_t) 34 | temp_attn_qk_injection_t = int(config.n_steps * config.pnp_temp_attn_t) 35 | conv_injection_timesteps = scheduler.timesteps[:conv_injection_t] if conv_injection_t >= 0 else [] 36 | spatial_attn_qk_injection_timesteps = ( 37 | scheduler.timesteps[:spatial_attn_qk_injection_t] if spatial_attn_qk_injection_t >= 0 else [] 38 | ) 39 | temp_attn_qk_injection_timesteps = ( 40 | scheduler.timesteps[:temp_attn_qk_injection_t] if temp_attn_qk_injection_t >= 0 else [] 41 | ) 42 | register_conv_injection(pipe, conv_injection_timesteps) 43 | register_spatial_attention_pnp(pipe, spatial_attn_qk_injection_timesteps) 44 | register_temp_attention_pnp(pipe, temp_attn_qk_injection_timesteps) 45 | 46 | logger.debug(f"conv_injection_t: {conv_injection_t}") 47 | logger.debug(f"spatial_attn_qk_injection_t: {spatial_attn_qk_injection_t}") 48 | logger.debug(f"temp_attn_qk_injection_t: {temp_attn_qk_injection_t}") 49 | logger.debug(f"conv_injection_timesteps: {conv_injection_timesteps}") 50 | logger.debug(f"spatial_attn_qk_injection_timesteps: {spatial_attn_qk_injection_timesteps}") 51 | logger.debug(f"temp_attn_qk_injection_timesteps: {temp_attn_qk_injection_timesteps}") 52 | 53 | 54 | def main(config): 55 | # Initialize 
the pipeline 56 | pipe = ConditionalVideoEditingPipeline.from_pretrained( 57 | "TIGER-Lab/ConsistI2V", 58 | torch_dtype=torch.float16, 59 | ) 60 | pipe.to(device) 61 | 62 | # Initialize the DDIM scheduler 63 | ddim_scheduler = DDIMScheduler.from_pretrained( 64 | "TIGER-Lab/ConsistI2V", 65 | subfolder="scheduler", 66 | ) 67 | 68 | # Load first frame and source frames 69 | if config.video_path: 70 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=True) 71 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 72 | logger.debug(f"len(frame_list): {len(frame_list)}") 73 | video_name = Path(config.video_path).stem 74 | video_dir = Path(config.video_path).parent 75 | config.video_frames_path = f"{video_dir}/{video_name}" 76 | elif config.video_frames_path: 77 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames) 78 | else: 79 | raise ValueError("Please provide either video_path or video_frames_path") 80 | src_frame_list = frame_list 81 | src_1st_frame = os.path.join(config.video_frames_path, '00000.png') 82 | 83 | # Load the edited first frame 84 | edited_1st_frame = config.edited_first_frame_path 85 | 86 | # Load the initial latents at t 87 | ddim_init_latents_t_idx = config.ddim_init_latents_t_idx 88 | ddim_scheduler.set_timesteps(config.n_steps) 89 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 90 | ddim_latents_path = os.path.join(config.ddim_latents_path, config.exp_name) 91 | ddim_latents_at_t = load_ddim_latents_at_t( 92 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], ddim_latents_path=ddim_latents_path 93 | ) 94 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 95 | logger.debug(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}") 96 | 97 | # Blend the latents 98 | random_latents = torch.randn_like(ddim_latents_at_t) 99 | random_ratio = config.blend_ratio 100 | mixed_latents = random_latents * random_ratio + ddim_latents_at_t * (1 - random_ratio) 101 | 102 | # Init Pnp 103 | init_pnp(pipe, ddim_scheduler, config) 104 | 105 | # Edit video 106 | pipe.register_modules(scheduler=ddim_scheduler) 107 | edited_video = pipe.sample_with_pnp( 108 | prompt=config.editing_prompt, 109 | first_frame_paths=edited_1st_frame, 110 | height=config.image_size[1], 111 | width=config.image_size[0], 112 | video_length=config.n_frames, 113 | num_inference_steps=config.n_steps, 114 | guidance_scale_txt=config.cfg_txt, 115 | guidance_scale_img=config.cfg_img, 116 | negative_prompt=config.editing_negative_prompt, 117 | frame_stride=config.frame_stride, 118 | latents=mixed_latents, 119 | generator=torch.manual_seed(config.seed), 120 | return_dict=True, 121 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 122 | ddim_inv_latents_path=ddim_latents_path, 123 | ddim_inv_prompt=config.ddim_inv_prompt, 124 | ddim_inv_1st_frame_path=src_1st_frame, 125 | ).videos 126 | 127 | # Save video 128 | os.makedirs(config.output_dir, exist_ok=True) 129 | # Downsampling the video for space saving 130 | save_videos_grid(edited_video, os.path.join(config.output_dir, config.editing_prompt, "video.gif"), fps=8, format="gif") 131 | save_videos_grid(edited_video, os.path.join(config.output_dir, config.editing_prompt, "video.mp4"), fps=8, format="mp4") 132 | logger.info(f"Saved edited video to {config.output_dir}") 133 | 134 | 135 | if __name__ == "__main__": 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("--config", type=str, 
default="./configs/pnp_edit.yaml") 138 | parser.add_argument("optional_args", nargs='*', default=[]) 139 | args = parser.parse_args() 140 | config = OmegaConf.load(args.config) 141 | 142 | if args.optional_args: 143 | modified_config = OmegaConf.from_dotlist(args.optional_args) 144 | config = OmegaConf.merge(config, modified_config) 145 | 146 | # Set up logging 147 | logging_level = logging.DEBUG if config.debug else logging.INFO 148 | logging.basicConfig(level=logging_level, format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s") 149 | logger = logging.getLogger(__name__) 150 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 151 | 152 | # Set up device and seed 153 | device = torch.device(config.device) 154 | seed_everything(config.seed) 155 | main(config) -------------------------------------------------------------------------------- /consisti2v/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | from torchvision.io import read_video 6 | import torchvision.transforms as T 7 | from pathlib import Path 8 | from PIL import Image 9 | from diffusers.utils import load_image 10 | import torch.nn.functional as F 11 | import glob 12 | 13 | def isinstance_str(x: object, cls_name: str): 14 | """ 15 | Checks whether x has any class *named* cls_name in its ancestry. 16 | Doesn't require access to the class's implementation. 17 | 18 | Useful for patching! 19 | """ 20 | 21 | for _cls in x.__class__.__mro__: 22 | if _cls.__name__ == cls_name: 23 | return True 24 | 25 | return False 26 | 27 | 28 | def seed_everything(seed): 29 | torch.manual_seed(seed) 30 | torch.cuda.manual_seed(seed) 31 | torch.cuda.manual_seed_all(seed) 32 | random.seed(seed) 33 | np.random.seed(seed) 34 | 35 | def load_ddim_latents_at_t(t, ddim_latents_path): 36 | ddim_latents_at_t_path = os.path.join(ddim_latents_path, f"ddim_latents_{t}.pt") 37 | assert os.path.exists(ddim_latents_at_t_path), f"Missing latents at t {t} path {ddim_latents_at_t_path}" 38 | ddim_latents_at_t = torch.load(ddim_latents_at_t_path) 39 | print(f"############ Loaded ddim_latents_at_t from {ddim_latents_at_t_path}") 40 | return ddim_latents_at_t 41 | 42 | def load_ddim_latents_at_T(ddim_latents_path): 43 | noisest = max( 44 | [ 45 | int(x.split("_")[-1].split(".")[0]) 46 | for x in glob.glob(os.path.join(ddim_latents_path, f"ddim_latents_*.pt")) 47 | ] 48 | ) 49 | ddim_latents_at_T_path = os.path.join(ddim_latents_path, f"ddim_latents_{noisest}.pt") 50 | ddim_latents_at_T = torch.load(ddim_latents_at_T_path) # [b, c, f, h, w] [1, 4, 16, 40, 64] 51 | return ddim_latents_at_T 52 | 53 | 54 | # Modified from tokenflow/utils.py 55 | def convert_video_to_frames(video_path, img_size=(512, 512), save_frames=True, save_dir=None): 56 | video, _, _ = read_video(video_path, output_format="TCHW") 57 | # rotate video -90 degree if video is .mov format. 
this is a weird bug in torchvision 58 | if video_path.endswith(".mov"): 59 | video = T.functional.rotate(video, -90) 60 | if save_frames: 61 | video_name = Path(video_path).stem 62 | video_dir = Path(video_path).parent 63 | if save_dir is not None: 64 | video_dir = save_dir 65 | os.makedirs(f"{video_dir}/{video_name}", exist_ok=True) 66 | frames = [] 67 | for i in range(len(video)): 68 | ind = str(i).zfill(5) 69 | image = T.ToPILImage()(video[i]) 70 | image_resized = image.resize(img_size, resample=Image.Resampling.LANCZOS) 71 | print(f"image_resized.size, height, width: {image_resized.size}, {img_size[1]}, {img_size[0]}") 72 | if save_frames: 73 | image_resized.save(f"{video_dir}/{video_name}/{ind}.png") 74 | print(f"Saved frame {video_dir}/{video_name}/{ind}.png") 75 | frames.append(image_resized) 76 | return frames 77 | 78 | 79 | # Modified from tokenflow/utils.py 80 | def load_video_frames(frames_path, n_frames): 81 | # Load paths 82 | paths = [f"{frames_path}/%05d.png" % i for i in range(n_frames)] 83 | frames = [load_image(p) for p in paths] 84 | return paths, frames 85 | 86 | 87 | def register_spatial_attention_pnp(model, injection_schedule): 88 | def sa_forward(self): 89 | def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, use_image_num=None): 90 | batch_size, sequence_length, _dim = hidden_states.shape 91 | n_frames = batch_size // 3 # batch_size is 3*n_frames because concat[source, uncond, cond] 92 | 93 | encoder_hidden_states = encoder_hidden_states 94 | 95 | if self.group_norm is not None: 96 | hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) 97 | 98 | query = self.to_q(hidden_states) # [b (h w)] f (nd * d) 99 | 100 | if self.added_kv_proj_dim is not None: 101 | print(f"[ERROR] Run into added_kv_proj_dim, which is not supported yet. 
Exiting...") 102 | key = self.to_k(hidden_states) 103 | value = self.to_v(hidden_states) 104 | encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states) 105 | encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states) 106 | 107 | key = self.reshape_heads_to_batch_dim(key) 108 | value = self.reshape_heads_to_batch_dim(value) 109 | encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj) 110 | encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj) 111 | 112 | key = torch.concat([encoder_hidden_states_key_proj, key], dim=1) 113 | value = torch.concat([encoder_hidden_states_value_proj, value], dim=1) 114 | else: 115 | encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states 116 | key = self.to_k(encoder_hidden_states) 117 | value = self.to_v(encoder_hidden_states) 118 | 119 | if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000): 120 | # inject source into unconditional 121 | query[n_frames: 2 * n_frames] = query[:n_frames] 122 | key[n_frames: 2 * n_frames] = key[:n_frames] 123 | # inject source into conditional 124 | query[2 * n_frames:] = query[:n_frames] 125 | key[2 * n_frames:] = key[:n_frames] 126 | 127 | if not self.use_relative_position: 128 | key = self.reshape_heads_to_batch_dim(key) 129 | value = self.reshape_heads_to_batch_dim(value) 130 | 131 | dim = query.shape[-1] 132 | if not self.use_relative_position: 133 | query = self.reshape_heads_to_batch_dim(query) # [b (h w) nd] f d 134 | 135 | if attention_mask is not None: 136 | if attention_mask.shape[-1] != query.shape[1]: 137 | target_length = query.shape[1] 138 | attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) 139 | attention_mask = attention_mask.repeat_interleave(self.heads, dim=0) 140 | 141 | # attention, what we cannot get enough of 142 | if self._use_memory_efficient_attention_xformers: 143 | hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask) 144 | # Some versions of xformers return output in fp32, cast it back to the dtype of the input 145 | hidden_states = hidden_states.to(query.dtype) 146 | else: 147 | if self._slice_size is None or query.shape[0] // self._slice_size == 1: 148 | hidden_states = self._attention(query, key, value, attention_mask) 149 | else: 150 | hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask) 151 | 152 | # linear proj 153 | hidden_states = self.to_out[0](hidden_states) 154 | 155 | # dropout 156 | hidden_states = self.to_out[1](hidden_states) 157 | return hidden_states 158 | 159 | return forward 160 | 161 | for _, module in model.unet.named_modules(): 162 | if isinstance_str(module, "BasicTransformerBlock"): 163 | module.attn1.forward = sa_forward(module.attn1) 164 | setattr(module.attn1, "injection_schedule", []) # Disable PNP 165 | 166 | res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} 167 | # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution 168 | for res in res_dict: 169 | for block in res_dict[res]: 170 | module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1 171 | module.forward = sa_forward(module) 172 | setattr(module, "injection_schedule", injection_schedule) 173 | -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection.mp4 -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00000.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00001.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00002.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00003.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00004.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00005.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00006.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00007.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public 
Display Of Affection/00008.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00009.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00010.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00011.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00012.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00013.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00014.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/00015.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/edited_first_frame/Sketch style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/edited_first_frame/Sketch style.png -------------------------------------------------------------------------------- /demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png 
-------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor.mp4 -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00000.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00001.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00002.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00003.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00004.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00005.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00006.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00007.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00008.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00008.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00009.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00010.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00011.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00012.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00013.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00014.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/00015.png -------------------------------------------------------------------------------- /demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind.mp4 -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00000.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00001.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00002.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00003.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00004.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00005.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00006.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00007.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00008.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00008.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00009.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00010.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00011.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00012.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00013.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00014.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/00015.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/add a party hat on his head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/add a party hat on his head.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And 
Mind/edited_first_frame/cyberpunk style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/cyberpunk style.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/give him a punk hair style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/give him a punk hair style.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/helmet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/helmet.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/hinton.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/hinton.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/starry night style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/starry night style.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn his hair white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn his hair white.png -------------------------------------------------------------------------------- /demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn man into robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn man into robot.png -------------------------------------------------------------------------------- /demo/Ballet.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet.mp4 -------------------------------------------------------------------------------- /demo/Ballet/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00000.png -------------------------------------------------------------------------------- /demo/Ballet/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00001.png -------------------------------------------------------------------------------- /demo/Ballet/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00002.png -------------------------------------------------------------------------------- /demo/Ballet/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00003.png -------------------------------------------------------------------------------- /demo/Ballet/00004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00004.png -------------------------------------------------------------------------------- /demo/Ballet/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00005.png -------------------------------------------------------------------------------- /demo/Ballet/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00006.png -------------------------------------------------------------------------------- /demo/Ballet/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00007.png -------------------------------------------------------------------------------- /demo/Ballet/00008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00008.png -------------------------------------------------------------------------------- /demo/Ballet/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00009.png -------------------------------------------------------------------------------- /demo/Ballet/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00010.png 
-------------------------------------------------------------------------------- /demo/Ballet/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00011.png -------------------------------------------------------------------------------- /demo/Ballet/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00012.png -------------------------------------------------------------------------------- /demo/Ballet/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00013.png -------------------------------------------------------------------------------- /demo/Ballet/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00014.png -------------------------------------------------------------------------------- /demo/Ballet/00015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/00015.png -------------------------------------------------------------------------------- /demo/Ballet/edited_first_frame/van gogh style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Ballet/edited_first_frame/van gogh style.png -------------------------------------------------------------------------------- /demo/Man Walking.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking.mp4 -------------------------------------------------------------------------------- /demo/Man Walking/00000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00000.png -------------------------------------------------------------------------------- /demo/Man Walking/00001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00001.png -------------------------------------------------------------------------------- /demo/Man Walking/00002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00002.png -------------------------------------------------------------------------------- /demo/Man Walking/00003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00003.png -------------------------------------------------------------------------------- /demo/Man Walking/00004.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00004.png -------------------------------------------------------------------------------- /demo/Man Walking/00005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00005.png -------------------------------------------------------------------------------- /demo/Man Walking/00006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00006.png -------------------------------------------------------------------------------- /demo/Man Walking/00007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00007.png -------------------------------------------------------------------------------- /demo/Man Walking/00008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00008.png -------------------------------------------------------------------------------- /demo/Man Walking/00009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00009.png -------------------------------------------------------------------------------- /demo/Man Walking/00010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00010.png -------------------------------------------------------------------------------- /demo/Man Walking/00011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00011.png -------------------------------------------------------------------------------- /demo/Man Walking/00012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00012.png -------------------------------------------------------------------------------- /demo/Man Walking/00013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00013.png -------------------------------------------------------------------------------- /demo/Man Walking/00014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00014.png -------------------------------------------------------------------------------- /demo/Man Walking/00015.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/00015.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/ElonMusk_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/ElonMusk_02.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/Yann LeCun Walking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/Yann LeCun Walking.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/add a cowboy hat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/add a cowboy hat.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/change his clothes to red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/change his clothes to red.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/policeman costume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/policeman costume.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/turn him into an astronaut.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/turn him into an astronaut.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/turn him into batman.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/turn him into batman.png -------------------------------------------------------------------------------- /demo/Man Walking/edited_first_frame/turn the man into darth vader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Man Walking/edited_first_frame/turn the man into darth vader.png -------------------------------------------------------------------------------- /demo/Your-Video-Name/edited_first_frame/Your-edited-first-frame: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Your-Video-Name/edited_first_frame/Your-edited-first-frame -------------------------------------------------------------------------------- /demo/Your-Video-mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/demo/Your-Video-mp4 -------------------------------------------------------------------------------- /edit_image.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from PIL import Image 4 | import json 5 | from moviepy.editor import VideoFileClip 6 | import numpy as np 7 | 8 | import black_box_image_edit as image_edit 9 | 10 | def infer_video(model, video_path, output_dir, prompt, prompt_type="instruct", force_512=False, seed=42, negative_prompt="", overwrite=False): 11 | """ 12 | Processes videos from the input directory, resizes them to 512x512 before feeding into the model by first frame, 13 | and saves the processed video back to its original size in the output directory. 14 | 15 | Args: 16 | model: The video editing model. 17 | input_dir (str): Path to the directory containing input videos. 18 | output_dir (str): Path to the directory where processed videos will be saved. 19 | prompt (str): Instruction prompt for video editing. 20 | """ 21 | 22 | # Create the output directory if it does not exist 23 | if not os.path.exists(output_dir): 24 | os.makedirs(output_dir) 25 | 26 | video_clip = VideoFileClip(video_path) 27 | video_filename = os.path.basename(video_path) 28 | # filename_noext = os.path.splitext(video_filename)[0] 29 | 30 | # Create the output directory if it does not exist 31 | # final_output_dir = os.path.join(output_dir, filename_noext) 32 | final_output_dir = output_dir 33 | if not os.path.exists(final_output_dir): 34 | os.makedirs(final_output_dir) 35 | 36 | result_path = os.path.join(final_output_dir, prompt + ".png") 37 | 38 | # Check if result already exists 39 | if os.path.exists(result_path) and overwrite is False: 40 | print(f"Result already exists: {result_path}") 41 | return 42 | 43 | def process_frame(image): 44 | pil_image = Image.fromarray(image) 45 | if force_512: 46 | pil_image = pil_image.resize((512, 512), Image.LANCZOS) 47 | if prompt_type == "instruct": 48 | result = model.infer_one_image(pil_image, instruct_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 49 | else: 50 | result = model.infer_one_image(pil_image, target_prompt=prompt, seed=seed, negative_prompt=negative_prompt) 51 | if force_512: 52 | result = result.resize(video_clip.size, Image.LANCZOS) 53 | return np.array(result) 54 | 55 | # Process only the first frame 56 | first_frame = video_clip.get_frame(0) # Get the first frame 57 | processed_frame = process_frame(first_frame) # Process the first frame 58 | 59 | 60 | #Image.fromarray(first_frame).save(os.path.join(final_output_dir, "00000.png")) 61 | Image.fromarray(processed_frame).save(result_path) 62 | print(f"Processed and saved the first frame: {result_path}") 63 | return result_path 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser(description='Process some images.') 68 | parser.add_argument('--model', type=str, default='instructpix2pix', choices=['magicbrush','instructpix2pix', 'cosxl'], help='Name of the image editing model') 69 | parser.add_argument('--video_path', type=str, required=False, 
help='Name of the video', default=None) 70 | parser.add_argument('--input_dir', type=str, required=False, help='Directory containing the video', default="./demo/") 71 | parser.add_argument('--output_dir', type=str, required=False, help='Directory to save the processed images', default=None) 72 | parser.add_argument('--prompt', type=str, required=False, help='Instruction prompt for editing', default="turn the man into darth vader") 73 | parser.add_argument('--force_512', action='store_true', help='Force resize to 512x512 when feeding into image model') 74 | parser.add_argument('--dict_file', type=str, required=False, help='JSON file containing files, instructions etc.', default=None) 75 | parser.add_argument('--seed', type=int, required=False, help='Seed for random number generator', default=42) 76 | parser.add_argument('--negative_prompt', type=str, required=False, help='Negative prompt for editing', default=None) 77 | args = parser.parse_args() 78 | 79 | if args.negative_prompt is None: 80 | negative_prompt = "worst quality, normal quality, low quality, low res, blurry, watermark, jpeg artifacts" 81 | else: 82 | negative_prompt = args.negative_prompt 83 | 84 | if args.dict_file: 85 | with open(args.dict_file, 'r') as json_file: 86 | folders_info = json.load(json_file) 87 | 88 | for video_name, video_infos in folders_info.items(): 89 | input_dir = args.input_dir 90 | video_path = os.path.join(input_dir, video_name) 91 | 92 | for video_info in video_infos: 93 | model_name = video_info.get('image_model', None) 94 | instruction = video_info.get('instruction', None) 95 | target_caption = video_info.get('target_caption', None) 96 | 97 | if instruction is None and target_caption is None: 98 | continue 99 | 100 | if model_name == 'magicbrush': 101 | model = image_edit.MagicBrush() 102 | prompt_type = "instruct" 103 | prompt = instruction 104 | elif model_name == 'instructpix2pix': 105 | model = image_edit.InstructPix2Pix() 106 | prompt_type = "instruct" 107 | prompt = instruction 108 | elif model_name == 'cosxl': 109 | model = image_edit.CosXLEdit() 110 | prompt_type = "instruct" 111 | prompt = instruction 112 | else: 113 | prompt_type = "target" 114 | prompt = target_caption 115 | 116 | 117 | if args.output_dir is None: 118 | video_filename = os.path.basename(video_path) 119 | filename_noext = os.path.splitext(video_filename)[0] 120 | output_dir = os.path.dirname(video_path) 121 | else: 122 | output_dir = args.output_dir 123 | 124 | infer_video(model, video_path, output_dir, prompt, prompt_type, args.force_512, args.seed, negative_prompt) 125 | else: 126 | if args.model == 'magicbrush': 127 | model = image_edit.MagicBrush() 128 | prompt_type = "instruct" 129 | elif args.model == 'instructpix2pix': 130 | model = image_edit.InstructPix2Pix() 131 | prompt_type = "instruct" 132 | elif args.model == 'cosxl': 133 | model = image_edit.CosXLEdit() 134 | prompt_type = "instruct" 135 | 136 | video_path = args.video_path 137 | 138 | if args.output_dir is None: 139 | video_filename = os.path.basename(video_path) 140 | filename_noext = os.path.splitext(video_filename)[0] 141 | output_dir = os.path.dirname(video_path) 142 | else: 143 | output_dir = args.output_dir 144 | 145 | print("video_filename", video_filename) 146 | print("output_dir", output_dir) 147 | 148 | infer_video(model, video_path, output_dir, args.prompt, prompt_type, args.force_512, args.seed, negative_prompt) 149 | -------------------------------------------------------------------------------- /i2vgen-xl/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/i2vgen-xl/__init__.py -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_ddim_inversion/group_config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "force_recompute_latents": false, 5 | "video_name": "An Old Man Doing Exercises For The Body And Mind", 6 | "recon_config": 7 | { 8 | "enable_recon": true 9 | } 10 | }, 11 | { 12 | "active": false, 13 | "force_recompute_latents": false, 14 | "video_name": "A Couple In A Public Display Of Affection" 15 | }, 16 | { 17 | "active": false, 18 | "force_recompute_latents": false, 19 | "video_name": "Ballet" 20 | }, 21 | { 22 | "active": false, 23 | "force_recompute_latents": false, 24 | "video_name": "Man Walking" 25 | }, 26 | { 27 | "active": false, 28 | "force_recompute_latents": false, 29 | "video_name":"A kitten turning its head on a wooden floor", 30 | "image_size": [512, 512] 31 | }, 32 | { 33 | "active": false, 34 | "force_recompute_latents": false, 35 | "video_name":"Your-Video-Name", 36 | "image_size": [512, 512], 37 | "recon_config": 38 | { 39 | "enable_recon": false 40 | } 41 | } 42 | ] -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_ddim_inversion/template.yaml: -------------------------------------------------------------------------------- 1 | # "ReplaceMe" will be overwritten by the values in group_config.json 2 | 3 | # General 4 | seed: 8888 5 | device: "cuda:7" # <-- change this to the GPU you want to use 6 | debug: False # For logging DEBUG level messages otherwise INFO 7 | 8 | # Dir 9 | data_dir: ".." 
# <-- change this to the path of the data directory, if you cloned the repo, leave it as "..", the inversion latents will be saved in AnyV2V/ 10 | model_name: "i2vgen-xl" 11 | exp_name: "${video_name}" 12 | output_dir: "${data_dir}/inversions/${model_name}/${exp_name}" 13 | 14 | # Data 15 | #image_size: [1280, 704] 16 | image_size: [512, 512] 17 | video_dir: "${data_dir}/demo" 18 | video_name: "ReplaceMe" 19 | video_path: "ReplaceMe" 20 | video_frames_path: "ReplaceMe" 21 | 22 | # DDIM settings 23 | n_frames: 16 24 | 25 | # DDIM inversion 26 | inverse_config: 27 | image_size: ${image_size} 28 | n_frames: ${n_frames} 29 | cfg: 1.0 30 | target_fps: 8 31 | prompt: "" 32 | negative_prompt: "" 33 | n_steps: 500 34 | output_dir: "${output_dir}/ddim_latents" 35 | inverse_static_video: False 36 | null_image_inversion: False 37 | 38 | # DDIM reconstruction 39 | recon_config: 40 | enable_recon: False 41 | image_size: ${image_size} 42 | n_frames: ${n_frames} 43 | cfg: 9.0 44 | target_fps: 8 45 | prompt: "" 46 | negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 47 | n_steps: 50 48 | ddim_init_latents_t_idx: 3 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 49 | ddim_latents_path: "${inverse_config.output_dir}" 50 | -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_pnp_edit/group_config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "task_name": "Prompt-Based-Editing", 5 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 6 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn man into robot.png", 7 | "editing_prompt":"a man doing exercises for the body and mind", 8 | "edited_video_name": "a robot doing exercises for the body and mind", 9 | "ddim_init_latents_t_idx": 0, 10 | "pnp_f_t": 1.0, 11 | "pnp_spatial_attn_t": 1.0, 12 | "pnp_temp_attn_t":1.0 13 | }, 14 | { 15 | "active": false, 16 | "task_name": "Prompt-Based-Editing", 17 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 18 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/turn his hair white.png", 19 | "editing_prompt":"a man with white hair doing exercises for the body and mind", 20 | "edited_video_name": "a man with white hair doing exercises for the body and mind", 21 | "pnp_temp_attn_t": 1.0 22 | }, 23 | { 24 | "active": false, 25 | "task_name": "Prompt-Based-Editing", 26 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 27 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/add a party hat on his head.png", 28 | "editing_prompt":"a man with a party hat doing exercises for the body and mind", 29 | "edited_video_name": "a man with a party hat doing exercises for the body and mind", 30 | "ddim_init_latents_t_idx": 0, 31 | "pnp_f_t": 0.1, 32 | "pnp_spatial_attn_t": 0.1, 33 | "pnp_temp_attn_t":1.0 34 | }, 35 | { 36 | "active": false, 37 | "task_name": "Style-Transfer", 38 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 39 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/starry night style.png", 40 | "editing_prompt":"an old man doing exercises for the body and mind, in a style of starry night", 41 | "edited_video_name": "an old 
man doing exercises for the body and mind, in a style of starry night" 42 | }, 43 | { 44 | "active": false, 45 | "task_name": "Style-Transfer", 46 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 47 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/cyberpunk style.png", 48 | "editing_prompt":"an old man doing exercises for the body and mind, in a style of cyberpunk", 49 | "edited_video_name": "an old man doing exercises for the body and mind, in a style of cyberpunk", 50 | "ddim_init_latents_t_idx": 0, 51 | "pnp_f_t": 1.0, 52 | "pnp_spatial_attn_t": 1.0, 53 | "pnp_temp_attn_t":1.0 54 | }, 55 | { 56 | "active": false, 57 | "task_name": "Identity-Manipulation", 58 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 59 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", 60 | "editing_prompt":"a man doing exercises for the body and mind", 61 | "edited_video_name": "Middle Aged Jack Ma Doing Exercises For The Body And Mind-pnp_temp_attn_t_1.0", 62 | "ddim_init_latents_t_idx": 0, 63 | "pnp_f_t": 0.8, 64 | "pnp_spatial_attn_t": 0.8, 65 | "pnp_temp_attn_t": 1.0 66 | }, 67 | { 68 | "active": false, 69 | "task_name": "Identity-Manipulation", 70 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 71 | "edited_first_frame_path":"demos/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/hinton.png", 72 | "editing_prompt":"a young man doing exercises for the body and mind", 73 | "edited_video_name": "an old man doing exercises for the body and mind-hinton_01", 74 | "ddim_init_latents_t_idx": 0, 75 | "pnp_f_t": 1.0, 76 | "pnp_spatial_attn_t": 1.0, 77 | "pnp_temp_attn_t":1.0 78 | }, 79 | { 80 | "active": false, 81 | "task_name": "Subject-Driven-Editing", 82 | "video_name":"An Old Man Doing Exercises For The Body And Mind", 83 | "edited_first_frame_path":"demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/helmet.png", 84 | "editing_prompt":"a man doing exercises for the body and mind", 85 | "edited_video_name": "a robot doing exercises for the body and mind-helmet", 86 | "ddim_init_latents_t_idx": 0, 87 | "pnp_f_t": 0.2, 88 | "pnp_spatial_attn_t": 0.2, 89 | "pnp_temp_attn_t":1.0 90 | }, 91 | { 92 | "active": false, 93 | "task_name": "Prompt-Based-Editing", 94 | "video_name":"A Couple In A Public Display Of Affection", 95 | "edited_first_frame_path":"demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png", 96 | "editing_prompt":"A couple in a public display of affection, snowing", 97 | "edited_video_name": "A couple in a public display of affection, snowing", 98 | "ddim_init_latents_t_idx": 0, 99 | "pnp_f_t": 0.3, 100 | "pnp_spatial_attn_t": 0.3, 101 | "pnp_temp_attn_t":1.0 102 | }, 103 | { 104 | "active": false, 105 | "task_name": "Style-Transfer", 106 | "video_name":"Ballet", 107 | "edited_first_frame_path":"demo/Ballet/edited_first_frame/van gogh style.png", 108 | "editing_prompt":"girl dancing ballet, in the style of van gogh", 109 | "edited_video_name": "girl dancing ballet, in the style of van gogh", 110 | "ddim_init_latents_t_idx": 0, 111 | "pnp_f_t": 1.0, 112 | "pnp_spatial_attn_t": 1.0, 113 | "pnp_temp_attn_t":1.0 114 | }, 115 | { 116 | "active": false, 117 | "task_name": "Prompt-Based-Editing", 118 | "video_name":"Man Walking", 119 | "edited_first_frame_path":"demo/Man Walking/edited_first_frame/turn the man into darth vader.png", 120 | "editing_prompt":"man walking", 121 | 
"edited_video_name": "darth vader walking", 122 | "ddim_init_latents_t_idx": 0, 123 | "pnp_f_t": 0.1, 124 | "pnp_spatial_attn_t": 0.1, 125 | "pnp_temp_attn_t": 1.0 126 | }, 127 | { 128 | "active": false, 129 | "video_name":"Man Walking", 130 | "task_name": "Identity-Manipulation", 131 | "edited_first_frame_path":"demo/Man Walking/edited_first_frame/ElonMusk_02.png", 132 | "editing_prompt":"a man walking in autumn", 133 | "edited_video_name": "Elon Musk walking in autumn", 134 | "ddim_init_latents_t_idx": 0, 135 | "pnp_f_t": 0.1, 136 | "pnp_spatial_attn_t": 0.1, 137 | "pnp_temp_attn_t": 1.0 138 | }, 139 | { 140 | "active": false, 141 | "task_name": "Identity-Manipulation", 142 | "video_name":"Man Walking", 143 | "edited_first_frame_path":"demo/Man Walking/edited_first_frame/Yann LeCun Walking.png", 144 | "editing_prompt":"a man walking", 145 | "edited_video_name": "Yann LeCun walking", 146 | "ddim_init_latents_t_idx": 0, 147 | "pnp_f_t": 0.0, 148 | "pnp_spatial_attn_t": 0.0, 149 | "pnp_temp_attn_t": 1.0 150 | }, 151 | { 152 | "active": false, 153 | "task_name": "Subject-Driven-Editing", 154 | "video_name":"A kitten turning its head on a wooden floor", 155 | "edited_first_frame_path":"demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", 156 | "editing_prompt":"A dog turning its head on a wooden floor", 157 | "edited_video_name": "A dog turning its head on a wooden floor", 158 | "ddim_init_latents_t_idx": 0, 159 | "pnp_f_t": 0.2, 160 | "pnp_spatial_attn_t": 0.2, 161 | "pnp_temp_attn_t":0.5 162 | }, 163 | { 164 | "active": false, 165 | "task_name": "Your-Task from the list[Prompt-Based-Editing, Style-Transfer, Identity-Manipulation, Subject-Driven-Editing]", 166 | "video_name":"Your-Video-Name", 167 | "edited_first_frame_path":"demo/Your-Video-Name/edited_first_frame/Your-Edited-First-Frame.png", 168 | "editing_prompt":"Your-Editing-Prompt", 169 | "edited_video_name": "Your-Edited-Video-Name", 170 | "ddim_init_latents_t_idx": 0, 171 | "pnp_f_t": 0, 172 | "pnp_spatial_attn_t": 0, 173 | "pnp_temp_attn_t":0 174 | } 175 | ] -------------------------------------------------------------------------------- /i2vgen-xl/configs/group_pnp_edit/template.yaml: -------------------------------------------------------------------------------- 1 | # "ReplaceMe" will be overwritten by the values in group_config.json 2 | 3 | # General 4 | seed: 8888 5 | device: "cuda:4" # <-- change this to the GPU you want to use 6 | debug: False # For logging DEBUG level messages otherwise INFO 7 | 8 | # Dir 9 | data_dir: ".." 
# <-- change this to the path of the data directory, if you cloned the repo, leave it as "..", the inversion latents will be saved in AnyV2V/ 10 | model_name: "i2vgen-xl" 11 | task_name: "Prompt-Based-Editing" 12 | edited_video_name: "ReplaceMe" 13 | output_dir: "${data_dir}/Results/${task_name}/${model_name}/${video_name}/${edited_video_name}/" 14 | 15 | # Data 16 | image_size: [512, 512] 17 | video_dir: "${data_dir}/demo" 18 | video_name: "ReplaceMe" 19 | video_path: "ReplaceMe" 20 | video_frames_path: "ReplaceMe" 21 | edited_first_frame_path: "ReplaceMe" 22 | ddim_latents_path: "${data_dir}/inversions/${model_name}/${video_name}/ddim_latents" # Same as inverse_config.output_dir 23 | 24 | # Pnp Editing 25 | n_frames: 16 26 | cfg: 9.0 27 | target_fps: 8 28 | editing_prompt: "ReplaceMe" 29 | editing_negative_prompt: "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" 30 | n_steps: 50 31 | ddim_init_latents_t_idx: 1 # 0 for 981, 3 for 921, 9 for 801, 20 for 581 if n_steps=50 32 | ddim_inv_prompt: "" 33 | random_ratio: 0.0 34 | 35 | # Pnp config 36 | pnp_f_t: 0.2 37 | pnp_spatial_attn_t: 0.2 38 | pnp_temp_attn_t: 0.5 -------------------------------------------------------------------------------- /i2vgen-xl/environment.yml: -------------------------------------------------------------------------------- 1 | name: anyv2v-i2vgen-xl 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - defaults 6 | dependencies: 7 | - python=3.9 8 | - pytorch 9 | - torchvision 10 | - torchaudio 11 | - pytorch-cuda=11.8 12 | - pip 13 | - pip: 14 | - accelerate 15 | - diffusers==0.26.3 16 | - transformers 17 | - omegaconf 18 | - opencv-python 19 | - ipython 20 | - moviepy 21 | - notebook 22 | -------------------------------------------------------------------------------- /i2vgen-xl/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/AnyV2V/bc540befacafddb9689ee86a396e7738bfed0e4f/i2vgen-xl/pipelines/__init__.py -------------------------------------------------------------------------------- /i2vgen-xl/run_group_ddim_inversion.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import argparse 6 | import logging 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | import json 10 | 11 | # HF imports 12 | from diffusers import ( 13 | DDIMInverseScheduler, 14 | DDIMScheduler, 15 | ) 16 | from diffusers.utils import load_image, export_to_video, export_to_gif 17 | 18 | # Project imports 19 | from utils import ( 20 | seed_everything, 21 | load_video_frames, 22 | convert_video_to_frames, 23 | load_ddim_latents_at_T, 24 | load_ddim_latents_at_t, 25 | ) 26 | from pipelines.pipeline_i2vgen_xl import I2VGenXLPipeline 27 | 28 | 29 | def ddim_inversion(config, first_frame, frame_list, pipe: I2VGenXLPipeline, inverse_scheduler, g): 30 | pipe.scheduler = inverse_scheduler 31 | video_latents_at_0 = pipe.encode_vae_video( 32 | frame_list, 33 | device=pipe._execution_device, 34 | height=config.image_size[1], 35 | width=config.image_size[0], 36 | ) 37 | ddim_latents = pipe.invert( 38 | prompt=config.prompt, 39 | image=first_frame, 40 | height=config.image_size[1], 41 | width=config.image_size[0], 42 | num_frames=config.n_frames, 43 | num_inference_steps=config.n_steps, 44 | guidance_scale=config.cfg, 45 | 
negative_prompt=config.negative_prompt, 46 | target_fps=config.target_fps, 47 | latents=video_latents_at_0, 48 | generator=g, # TODO: this is not correct 49 | return_dict=False, 50 | output_dir=config.output_dir, 51 | ) # [b, num_inference_steps, c, num_frames, h, w] 52 | logger = logging.getLogger(__name__) 53 | logger.debug(f"ddim_latents.shape: {ddim_latents.shape}") 54 | ddim_latents = ddim_latents[0] # [num_inference_steps, c, num_frames, h, w] 55 | return ddim_latents 56 | 57 | 58 | def ddim_sampling( 59 | config, first_frame, ddim_latents_at_T, pipe: I2VGenXLPipeline, ddim_scheduler, ddim_init_latents_t_idx, g 60 | ): 61 | pipe.scheduler = ddim_scheduler 62 | reconstructed_video = pipe( 63 | prompt=config.prompt, 64 | image=first_frame, 65 | height=config.image_size[1], 66 | width=config.image_size[0], 67 | num_frames=config.n_frames, 68 | num_inference_steps=config.n_steps, 69 | guidance_scale=config.cfg, 70 | negative_prompt=config.negative_prompt, 71 | target_fps=config.target_fps, 72 | latents=ddim_latents_at_T, 73 | generator=g, # TODO: this is not correct 74 | return_dict=True, 75 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 76 | ).frames[0] 77 | return reconstructed_video 78 | 79 | 80 | def main(template_config, configs_list): 81 | # Initialize the pipeline 82 | pipe = I2VGenXLPipeline.from_pretrained( 83 | "ali-vilab/i2vgen-xl", 84 | torch_dtype=torch.float16, 85 | variant="fp16", 86 | ) 87 | pipe.to(device) 88 | g = torch.Generator(device=device) 89 | g = g.manual_seed(template_config.seed) 90 | 91 | # Initialize the DDIM inverse scheduler 92 | inverse_scheduler = DDIMInverseScheduler.from_pretrained( 93 | "ali-vilab/i2vgen-xl", 94 | subfolder="scheduler", 95 | ) 96 | # Initialize the DDIM scheduler 97 | ddim_scheduler = DDIMScheduler.from_pretrained( 98 | "ali-vilab/i2vgen-xl", 99 | subfolder="scheduler", 100 | ) 101 | 102 | video_dir = template_config.video_dir 103 | assert os.path.exists(video_dir), f"video_dir: {video_dir} does not exist" 104 | # loop through the video_dir and process every mp4 file 105 | for config_entry in configs_list: 106 | if config_entry["active"] == False: 107 | logger.info(f"Skipping config_entry: {config_entry}") 108 | continue 109 | logger.info(f"Processing config_entry: {config_entry}") 110 | 111 | # Override the config with the data_meta_entry 112 | config = OmegaConf.merge(template_config, OmegaConf.create(config_entry)) 113 | 114 | config.video_path = os.path.join(config.video_dir, config.video_name + ".mp4") 115 | config.video_frames_path = os.path.join(config.video_dir, config.video_name) 116 | 117 | # If already computed the latents, skip 118 | if os.path.exists(config.output_dir) and not config.force_recompute_latents: 119 | logger.info(f"### Skipping !!! {config.output_dir} already exists. 
") 120 | continue 121 | 122 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 123 | 124 | # This is the same as run_ddim_inversion.py 125 | try: 126 | logger.info(f"Loading frames from: {config.video_frames_path}") 127 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames, config.image_size) 128 | except: 129 | logger.error(f"Failed to load frames from: {config.video_frames_path}") 130 | logger.info(f"Converting mp4 video to frames: {config.video_path}") 131 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=True) 132 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 133 | logger.debug(f"len(frame_list): {len(frame_list)}") 134 | # Save the source frames as GIF 135 | export_to_gif( 136 | frame_list, 137 | os.path.join(config.video_frames_path, config.video_name + ".gif") 138 | ) 139 | logger.info(f"Saved source video as gif to {config.video_frames_path}") 140 | first_frame = frame_list[0] # Is a PIL image 141 | 142 | # Produce static video 143 | if config.inverse_config.inverse_static_video: 144 | logger.info("### Inverse a static video!") 145 | frame_list = [frame_list[0]] * config.n_frames 146 | 147 | # Null image inversion 148 | if config.inverse_config.null_image_inversion: 149 | logger.info("### Inverse a null image!") 150 | first_frame = Image.new("RGB", (config.image_size[0], config.image_size[1]), (0, 0, 0)) 151 | 152 | # Main pipeline 153 | # Inversion 154 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 155 | _ddim_latents = ddim_inversion(config.inverse_config, first_frame, frame_list, pipe, inverse_scheduler, g) 156 | 157 | # Reconstruction 158 | recon_config = config.recon_config 159 | if recon_config.enable_recon: 160 | ddim_init_latents_t_idx = recon_config.ddim_init_latents_t_idx 161 | ddim_scheduler.set_timesteps(recon_config.n_steps) 162 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 163 | ddim_latents_path = recon_config.ddim_latents_path 164 | ddim_latents_at_t = load_ddim_latents_at_t( 165 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], 166 | ddim_latents_path=ddim_latents_path, 167 | ) 168 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 169 | reconstructed_video = ddim_sampling( 170 | recon_config, 171 | first_frame, 172 | ddim_latents_at_t, 173 | pipe, 174 | ddim_scheduler, 175 | ddim_init_latents_t_idx, 176 | g, 177 | ) 178 | 179 | # Save the reconstructed video 180 | os.makedirs(config.output_dir, exist_ok=True) 181 | # Downsampling the video for space saving 182 | reconstructed_video = [frame.resize((512, 512), resample=Image.LANCZOS) for frame in reconstructed_video] 183 | export_to_video( 184 | reconstructed_video, 185 | os.path.join(config.output_dir, "ddim_reconstruction.mp4"), 186 | fps=10, 187 | ) 188 | export_to_gif( 189 | reconstructed_video, 190 | os.path.join(config.output_dir, "ddim_reconstruction.gif"), 191 | ) 192 | logger.info(f"Saved reconstructed video to {config.output_dir}") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = argparse.ArgumentParser() 197 | parser.add_argument("--template_config", type=str, default="./configs/group_ddim_inversion/template.yaml") 198 | parser.add_argument("--configs_json", type=str, default="./configs/group_config.json") # This is going to override the template_config 199 | 200 | args = parser.parse_args() 201 | template_config = OmegaConf.load(args.template_config) 202 | 203 | # Set up logging 204 | logging_level = logging.DEBUG if 
template_config.debug else logging.INFO 205 | logging.basicConfig(level=logging_level, format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s") 206 | logger = logging.getLogger(__name__) 207 | logger.info(f"template_config: {OmegaConf.to_yaml(template_config)}") 208 | 209 | # Load data jsonl into list 210 | configs_json = args.configs_json 211 | assert Path(configs_json).exists() 212 | with open(configs_json, 'r') as file: 213 | configs_list = json.load(file) 214 | logger.info(f"Loaded {len(configs_list)} configs from {configs_json}") 215 | 216 | # Set up device and seed 217 | device = torch.device(template_config.device) 218 | torch.set_grad_enabled(False) 219 | seed_everything(template_config.seed) 220 | main(template_config, configs_list) 221 | -------------------------------------------------------------------------------- /i2vgen-xl/run_group_pnp_edit.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | import torch 5 | import argparse 6 | import logging 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | import json 10 | 11 | # HF imports 12 | from diffusers import ( 13 | DDIMInverseScheduler, 14 | DDIMScheduler, 15 | ) 16 | from diffusers.utils import load_image, export_to_video, export_to_gif 17 | 18 | # Project imports 19 | from utils import ( 20 | seed_everything, 21 | load_video_frames, 22 | convert_video_to_frames, 23 | load_ddim_latents_at_T, 24 | load_ddim_latents_at_t, 25 | ) 26 | from pipelines.pipeline_i2vgen_xl import I2VGenXLPipeline 27 | from pnp_utils import ( 28 | register_time, 29 | register_conv_injection, 30 | register_spatial_attention_pnp, 31 | register_temp_attention_pnp, 32 | ) 33 | 34 | 35 | def init_pnp(pipe, scheduler, config): 36 | conv_injection_t = int(config.n_steps * config.pnp_f_t) 37 | spatial_attn_qk_injection_t = int(config.n_steps * config.pnp_spatial_attn_t) 38 | temp_attn_qk_injection_t = int(config.n_steps * config.pnp_temp_attn_t) 39 | conv_injection_timesteps = scheduler.timesteps[:conv_injection_t] if conv_injection_t >= 0 else [] 40 | spatial_attn_qk_injection_timesteps = ( 41 | scheduler.timesteps[:spatial_attn_qk_injection_t] if spatial_attn_qk_injection_t >= 0 else [] 42 | ) 43 | temp_attn_qk_injection_timesteps = ( 44 | scheduler.timesteps[:temp_attn_qk_injection_t] if temp_attn_qk_injection_t >= 0 else [] 45 | ) 46 | register_conv_injection(pipe, conv_injection_timesteps) 47 | register_spatial_attention_pnp(pipe, spatial_attn_qk_injection_timesteps) 48 | register_temp_attention_pnp(pipe, temp_attn_qk_injection_timesteps) 49 | 50 | logger = logging.getLogger(__name__) 51 | logger.debug(f"conv_injection_t: {conv_injection_t}") 52 | logger.debug(f"spatial_attn_qk_injection_t: {spatial_attn_qk_injection_t}") 53 | logger.debug(f"temp_attn_qk_injection_t: {temp_attn_qk_injection_t}") 54 | logger.debug(f"conv_injection_timesteps: {conv_injection_timesteps}") 55 | logger.debug(f"spatial_attn_qk_injection_timesteps: {spatial_attn_qk_injection_timesteps}") 56 | logger.debug(f"temp_attn_qk_injection_timesteps: {temp_attn_qk_injection_timesteps}") 57 | 58 | 59 | def main(template_config, configs_list): 60 | # Initialize the pipeline 61 | pipe = I2VGenXLPipeline.from_pretrained( 62 | "ali-vilab/i2vgen-xl", 63 | torch_dtype=torch.float16, 64 | variant="fp16", 65 | ) 66 | pipe.to(device) 67 | 68 | # Initialize the DDIM scheduler 69 | ddim_scheduler = DDIMScheduler.from_pretrained( 70 | "ali-vilab/i2vgen-xl", 71 | 
subfolder="scheduler", 72 | ) 73 | 74 | for config_entry in configs_list: 75 | if config_entry["active"] == False: 76 | logger.info(f"Skipping config_entry: {config_entry}") 77 | continue 78 | logger.info(f"Processing config_entry: {config_entry}") 79 | 80 | # Override the config with the data_meta_entry 81 | config = OmegaConf.merge(template_config, OmegaConf.create(config_entry)) 82 | 83 | # Update the related paths to absolute paths 84 | config.video_path = os.path.join(config.video_dir, config.video_name + ".mp4") 85 | config.video_frames_path = os.path.join(config.video_dir, config.video_name) 86 | config.edited_first_frame_path = os.path.join(config.data_dir, config.edited_first_frame_path) 87 | logger.info(f"config: {OmegaConf.to_yaml(config)}") 88 | 89 | # Check if there are fields contain "ReplaceMe" 90 | for k, v in config.items(): 91 | if "ReplaceMe" in str(v): 92 | logger.error(f"Field {k} contains 'ReplaceMe'") 93 | continue 94 | 95 | # This is the same as run_pnp_edit.py 96 | # Load first frame and source frames 97 | try: 98 | logger.info(f"Loading frames from: {config.video_frames_path}") 99 | _, frame_list = load_video_frames(config.video_frames_path, config.n_frames, config.image_size) 100 | except: 101 | logger.error(f"Failed to load frames from: {config.video_frames_path}") 102 | logger.info(f"Converting mp4 video to frames: {config.video_path}") 103 | frame_list = convert_video_to_frames(config.video_path, config.image_size, save_frames=True) 104 | frame_list = frame_list[: config.n_frames] # 16 frames for img2vid 105 | logger.debug(f"len(frame_list): {len(frame_list)}") 106 | src_frame_list = frame_list 107 | src_1st_frame = src_frame_list[0] # Is a PIL image 108 | 109 | # Load the edited first frame 110 | edited_1st_frame = load_image(config.edited_first_frame_path) 111 | edited_1st_frame = edited_1st_frame.resize(config.image_size, resample=Image.Resampling.LANCZOS) 112 | 113 | # Load the initial latents at t 114 | ddim_init_latents_t_idx = config.ddim_init_latents_t_idx 115 | ddim_scheduler.set_timesteps(config.n_steps) 116 | logger.info(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}") 117 | ddim_latents_at_t = load_ddim_latents_at_t( 118 | ddim_scheduler.timesteps[ddim_init_latents_t_idx], ddim_latents_path=config.ddim_latents_path 119 | ) 120 | logger.debug(f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}") 121 | logger.debug(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}") 122 | 123 | # Blend the latents 124 | random_latents = torch.randn_like(ddim_latents_at_t) 125 | logger.info(f"Blending random_ratio (1 means random latent): {config.random_ratio}") 126 | mixed_latents = random_latents * config.random_ratio + ddim_latents_at_t * (1 - config.random_ratio) 127 | 128 | # Init Pnp 129 | init_pnp(pipe, ddim_scheduler, config) 130 | 131 | # Edit video 132 | pipe.register_modules(scheduler=ddim_scheduler) 133 | edited_video = pipe.sample_with_pnp( 134 | prompt=config.editing_prompt, 135 | image=edited_1st_frame, 136 | height=config.image_size[1], 137 | width=config.image_size[0], 138 | num_frames=config.n_frames, 139 | num_inference_steps=config.n_steps, 140 | guidance_scale=config.cfg, 141 | negative_prompt=config.editing_negative_prompt, 142 | target_fps=config.target_fps, 143 | latents=mixed_latents, 144 | generator=torch.manual_seed(config.seed), 145 | return_dict=True, 146 | ddim_init_latents_t_idx=ddim_init_latents_t_idx, 147 | ddim_inv_latents_path=config.ddim_latents_path, 148 | 
ddim_inv_prompt=config.ddim_inv_prompt, 149 | ddim_inv_1st_frame=src_1st_frame, 150 | ).frames[0] 151 | 152 | # Save video 153 | # Add the config to the output_dir, TODO: make this more elegant 154 | config_suffix = ( 155 | "ddim_init_latents_t_idx_" 156 | + str(ddim_init_latents_t_idx) 157 | + "_nsteps_" 158 | + str(config.n_steps) 159 | + "_cfg_" 160 | + str(config.cfg) 161 | + "_pnpf" 162 | + str(config.pnp_f_t) 163 | + "_pnps" 164 | + str(config.pnp_spatial_attn_t) 165 | + "_pnpt" 166 | + str(config.pnp_temp_attn_t) 167 | ) 168 | output_dir = os.path.join(config.output_dir, config_suffix) 169 | os.makedirs(output_dir, exist_ok=True) 170 | edited_video = [frame.resize(config.image_size, resample=Image.LANCZOS) for frame in edited_video] 171 | # Downsampling the video for space saving 172 | # edited_video = [frame.resize((512, 512), resample=Image.LANCZOS) for frame in edited_video] 173 | # if config.pnp_f_t == 0.0 and config.pnp_spatial_attn_t == 0.0 and config.pnp_temp_attn_t == 0.0: 174 | # edited_video_file_name = "ddim_edit" 175 | # else: 176 | # edited_video_file_name = "pnp_edit" 177 | edited_video_file_name = "video" 178 | export_to_video(edited_video, os.path.join(output_dir, f"{edited_video_file_name}.mp4"), fps=config.target_fps) 179 | export_to_gif(edited_video, os.path.join(output_dir, f"{edited_video_file_name}.gif")) 180 | logger.info(f"Saved video to: {os.path.join(output_dir, f'{edited_video_file_name}.mp4')}") 181 | logger.info(f"Saved gif to: {os.path.join(output_dir, f'{edited_video_file_name}.gif')}") 182 | for i, frame in enumerate(edited_video): 183 | frame.save(os.path.join(output_dir, f"{edited_video_file_name}_{i:05d}.png")) 184 | logger.info(f"Saved frames to: {os.path.join(output_dir, f'{edited_video_file_name}_{i:05d}.png')}") 185 | 186 | 187 | if __name__ == "__main__": 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument("--template_config", type=str, default="./configs/group_pnp_edit/template.yaml") 190 | parser.add_argument( 191 | "--configs_json", type=str, default="./configs/group_config.json" 192 | ) # This is going to override the template_config 193 | 194 | args = parser.parse_args() 195 | template_config = OmegaConf.load(args.template_config) 196 | 197 | # Set up logging 198 | logging_level = logging.DEBUG if template_config.debug else logging.INFO 199 | logging.basicConfig(level=logging_level, format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s") 200 | logger = logging.getLogger(__name__) 201 | logger.info(f"template_config: {OmegaConf.to_yaml(template_config)}") 202 | 203 | # Load data jsonl into list 204 | configs_json = args.configs_json 205 | assert Path(configs_json).exists() 206 | with open(configs_json, "r") as file: 207 | configs_list = json.load(file) 208 | logger.info(f"Loaded {len(configs_list)} configs from {configs_json}") 209 | 210 | # Set up device and seed 211 | device = torch.device(template_config.device) 212 | torch.set_grad_enabled(False) 213 | seed_everything(template_config.seed) 214 | main(template_config, configs_list) 215 | -------------------------------------------------------------------------------- /i2vgen-xl/scripts/run_group_ddim_inversion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source /home/YourName/miniconda3/etc/profile.d/conda.sh #<-- change this to your own miniconda path 3 | conda activate anyv2v-i2vgen-xl 4 | 5 | cd .. 
6 | python run_group_ddim_inversion.py \ 7 | --template_config "configs/group_ddim_inversion/template.yaml" \ 8 | --configs_json "configs/group_ddim_inversion/group_config.json" -------------------------------------------------------------------------------- /i2vgen-xl/scripts/run_group_pnp_edit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source /home/YourName/miniconda3/etc/profile.d/conda.sh #<-- change this to your own miniconda path 3 | conda activate anyv2v-i2vgen-xl 4 | 5 | cd .. 6 | python run_group_pnp_edit.py \ 7 | --template_config "configs/group_pnp_edit/template.yaml" \ 8 | --configs_json "configs/group_pnp_edit/group_config.json" -------------------------------------------------------------------------------- /i2vgen-xl/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | from torchvision.io import read_video 6 | import torchvision.transforms as T 7 | from pathlib import Path 8 | from PIL import Image 9 | from diffusers.utils import load_image 10 | import glob 11 | 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def seed_everything(seed): 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | torch.cuda.manual_seed_all(seed) 21 | random.seed(seed) 22 | np.random.seed(seed) 23 | 24 | 25 | def load_ddim_latents_at_t(t, ddim_latents_path): 26 | ddim_latents_at_t_path = os.path.join(ddim_latents_path, f"ddim_latents_{t}.pt") 27 | assert os.path.exists(ddim_latents_at_t_path), f"Missing latents at t {t} path {ddim_latents_at_t_path}" 28 | ddim_latents_at_t = torch.load(ddim_latents_at_t_path) 29 | logger.debug(f"Loaded ddim_latents_at_t from {ddim_latents_at_t_path}") 30 | return ddim_latents_at_t 31 | 32 | 33 | def load_ddim_latents_at_T(ddim_latents_path): 34 | noisest = max( 35 | [int(x.split("_")[-1].split(".")[0]) for x in glob.glob(os.path.join(ddim_latents_path, f"ddim_latents_*.pt"))] 36 | ) 37 | ddim_latents_at_T_path = os.path.join(ddim_latents_path, f"ddim_latents_{noisest}.pt") 38 | ddim_latents_at_T = torch.load(ddim_latents_at_T_path) # [b, c, f, h, w] [1, 4, 16, 40, 64] 39 | return ddim_latents_at_T 40 | 41 | 42 | # Modified from tokenflow/utils.py 43 | def convert_video_to_frames(video_path, img_size=(512, 512), save_frames=True): 44 | video, _, _ = read_video(video_path, output_format="TCHW") 45 | # rotate video -90 degree if video is .mov format. 
this is a weird bug in torchvision 46 | if video_path.endswith(".mov"): 47 | video = T.functional.rotate(video, -90) 48 | if save_frames: 49 | video_name = Path(video_path).stem 50 | video_dir = Path(video_path).parent 51 | os.makedirs(f"{video_dir}/{video_name}", exist_ok=True) 52 | frames = [] 53 | for i in range(len(video)): 54 | ind = str(i).zfill(5) 55 | image = T.ToPILImage()(video[i]) 56 | logger.info(f"Original video frame size: {image.size}") 57 | if image.size != img_size: 58 | image_resized = image.resize(img_size, resample=Image.Resampling.LANCZOS) 59 | logger.info(f"Resized video frame, height, width: {image_resized.size}, {img_size[1]}, {img_size[0]}") 60 | else: 61 | image_resized = image 62 | if save_frames: 63 | image_resized.save(f"{video_dir}/{video_name}/{ind}.png") 64 | print(f"Saved frame {video_dir}/{video_name}/{ind}.png") 65 | frames.append(image_resized) 66 | return frames 67 | 68 | 69 | # Modified from tokenflow/utils.py 70 | def load_video_frames(frames_path, n_frames, image_size=(512, 512)): 71 | # Load paths 72 | paths = [f"{frames_path}/%05d.png" % i for i in range(n_frames)] 73 | frames = [load_image(p) for p in paths] 74 | # Check if the frames are the right size 75 | for f in frames: 76 | if f.size != image_size: 77 | logger.error(f"Frame size {f.size} does not match config.image_size {image_size}") 78 | raise ValueError(f"Frame size {f.size} does not match config.image_size {image_size}") 79 | return paths, frames 80 | 81 | -------------------------------------------------------------------------------- /prepare_video.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from moviepy.editor import VideoFileClip 3 | import os 4 | import glob 5 | import random 6 | import numpy as np 7 | from PIL import Image 8 | 9 | def extract_frames(video_path, frame_count=16): 10 | clip = VideoFileClip(video_path) 11 | duration = clip.duration 12 | frames = [] 13 | 14 | # Calculate the time interval at which to extract frames 15 | times = np.linspace(0, duration, frame_count, endpoint=False) 16 | 17 | for t in times: 18 | # Extract the frame at the specific timestamp 19 | frame = clip.get_frame(t) 20 | # Convert the frame (numpy array) to a PIL Image 21 | pil_img = Image.fromarray(frame) 22 | frames.append(pil_img) 23 | 24 | return frames 25 | 26 | def crop_and_resize_video(input_video_path, output_folder, clip_duration, width=None, height=None, start_time=None, end_time=None, n_frames=16, center_crop=False, x_offset=0, y_offset=0, longest_to_width=False, use_full_clip=False): # Load the video file 27 | video = VideoFileClip(input_video_path) 28 | 29 | if use_full_clip: 30 | cropped_video = video 31 | else: 32 | # Calculate start and end times for cropping 33 | if start_time is not None: 34 | start_time = float(start_time) 35 | end_time = start_time + clip_duration 36 | elif end_time is not None: 37 | end_time = float(end_time) 38 | start_time = end_time - clip_duration 39 | else: 40 | # Default to random cropping if neither start nor end time is specified 41 | video_duration = video.duration 42 | if video_duration <= clip_duration: 43 | print(f"Skipping {input_video_path}: duration is less than or equal to the clip duration.") 44 | return 45 | max_start_time = video_duration - clip_duration 46 | start_time = random.uniform(0, max_start_time) 47 | end_time = start_time + clip_duration 48 | cropped_video = video.subclip(start_time, end_time) 49 | 50 | if center_crop: 51 | # Calculate scale to ensure the desired crop 
size fits within the video 52 | video_width, video_height = cropped_video.size 53 | scale_width = video_width / width 54 | scale_height = video_height / height 55 | if longest_to_width: 56 | scale = max(scale_width, scale_height) 57 | else: 58 | scale = min(scale_width, scale_height) 59 | 60 | # Resize video to ensure the crop area fits within the frame 61 | # This step ensures that the smallest dimension matches or exceeds 512 pixels 62 | new_width = int(video_width / scale) 63 | new_height = int(video_height / scale) 64 | resized_video = cropped_video.resize(newsize=(new_width, new_height)) 65 | print(f"Resized video to ({new_width}, {new_height})") 66 | 67 | # Calculate crop position with offset, ensuring the crop does not go out of bounds 68 | # The offset calculation needs to ensure that the cropping area remains within the video frame 69 | offset_x = int(((x_offset + 1) / 2) * (new_width - width)) # Adjusted for [-1, 1] scale 70 | offset_y = int(((y_offset + 1) / 2) * (new_height - height)) # Adjusted for [-1, 1] scale 71 | 72 | # Ensure offsets do not push the crop area out of the video frame 73 | offset_x = max(0, min(new_width - width, offset_x)) 74 | offset_y = max(0, min(new_height - height, offset_y)) 75 | 76 | # Apply center crop with offsets 77 | cropped_video = resized_video.crop(x1=offset_x, y1=offset_y, width=width, height=height) 78 | elif width and height: 79 | # Directly resize the video to specified width and height if no center crop is specified 80 | cropped_video = cropped_video.resize(newsize=(width, height)) 81 | 82 | 83 | # After resizing and cropping, set the frame rate to fps 84 | fps = n_frames // clip_duration 85 | final_video = cropped_video.set_fps(fps) 86 | 87 | # Prepare the output video path 88 | if not os.path.exists(output_folder): 89 | os.makedirs(output_folder) 90 | filename = os.path.basename(input_video_path) 91 | output_video_path = os.path.join(output_folder, filename) 92 | 93 | # Write the result to the output file 94 | final_video.write_videofile(output_video_path, codec='libx264', audio_codec='aac', fps=fps) 95 | print(f"Processed {input_video_path}, saved to {output_video_path}") 96 | return output_video_path 97 | 98 | def process_videos(input_folder, output_base_folder, clip_duration, width=None, height=None, start_time=None, end_time=None, n_frames=16, center_crop=False, x_offset=0, y_offset=0, longest_to_width=False, use_full_clip=False): 99 | video_files = glob.glob(os.path.join(input_folder, '*.mp4')) # Adjust the pattern if needed 100 | if video_files == []: 101 | print(f"No video files found in {input_folder}") 102 | return 103 | 104 | for video_file in video_files: 105 | crop_and_resize_video(video_file, output_base_folder, clip_duration, width, height, start_time, end_time, n_frames, center_crop, x_offset, y_offset, longest_to_width, use_full_clip) 106 | return 107 | 108 | def main(): 109 | parser = argparse.ArgumentParser(description='Crop and resize video segments.') 110 | parser.add_argument('--input_folder', type=str, help='Path to the input folder containing video files') 111 | parser.add_argument('--video_path', type=str, default=None, required=False, help='Path to the input video file') 112 | parser.add_argument('--output_folder', type=str, default="processed_video_data", help='Path to the folder for the output videos') 113 | parser.add_argument('--clip_duration', type=int, default=2, required=False, help='Duration of the video clips in seconds') 114 | parser.add_argument('--width', type=int, default=512, help='Width of the 
output video (optional)') 115 | parser.add_argument('--height', type=int, default=512, help='Height of the output video (optional)') 116 | parser.add_argument('--start_time', type=float, help='Start time for cropping (optional)') 117 | parser.add_argument('--end_time', type=float, help='End time for cropping (optional)') 118 | parser.add_argument('--n_frames', type=int, default=16, help='Number of frames to extract from each video') 119 | parser.add_argument('--center_crop', action='store_true', help='Center crop the video') 120 | parser.add_argument('--x_offset', type=float, default=0, required=False, help='Horizontal offset for center cropping, range -1 to 1 (optional)') 121 | parser.add_argument('--y_offset', type=float, default=0, required=False, help='Vertical offset for center cropping, range -1 to 1 (optional)') 122 | parser.add_argument('--longest_to_width', action='store_true', help='Resize the longest dimension to the specified width') 123 | parser.add_argument('--use_full_clip', action='store_true', help='Use the full video clip without trimming') 124 | args = parser.parse_args() 125 | 126 | if args.start_time and args.end_time: 127 | print("Please specify only one of start_time or end_time, not both.") 128 | return 129 | 130 | if args.video_path: 131 | crop_and_resize_video(args.video_path, 132 | args.output_folder, 133 | args.clip_duration, 134 | args.width, args.height, 135 | args.start_time, args.end_time, 136 | args.n_frames, 137 | args.center_crop, args.x_offset, args.y_offset, args.longest_to_width, 138 | args.use_full_clip) 139 | else: 140 | process_videos(args.input_folder, 141 | args.output_folder, 142 | args.clip_duration, 143 | args.width, args.height, 144 | args.start_time, args.end_time, 145 | args.n_frames, 146 | args.center_crop, args.x_offset, args.y_offset, args.longest_to_width, 147 | args.use_full_clip) 148 | 149 | if __name__ == "__main__": 150 | main() 151 | -------------------------------------------------------------------------------- /seine/README.md: -------------------------------------------------------------------------------- 1 | # AnyV2V(_SEINE_) 2 | 3 | Our AnyV2V(_SEINE_) is a standalone version. 4 | 5 | ## Setup for SEINE 6 | 7 | ### Prepare Environment 8 | ``` 9 | conda create -n seine python==3.9.16 10 | conda activate seine 11 | pip install -r requirement.txt 12 | ``` 13 | 14 | ### Download SEINE model and T2I base model 15 | 16 | The SEINE model is based on Stable Diffusion v1.4. Download [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) into the ```pretrained``` directory. 17 | 18 | Download the SEINE model checkpoint (from [Google Drive](https://drive.google.com/drive/folders/1cWfeDzKJhpb0m6HA5DoMOH0_ItuUY95b?usp=sharing) or [Hugging Face](https://huggingface.co/xinyuanc91/SEINE/tree/main)) and save it to the ```pretrained``` directory as well. 19 | 20 | 21 | Now under `./pretrained`, you should be able to see the following: 22 | ``` 23 | ├── pretrained 24 | │ ├── seine.pt 25 | │ ├── stable-diffusion-v1-4 26 | │ │ ├── ... 27 | └── └── ├── ... 28 | ├── ... 29 | ``` 30 | 31 | ## AnyV2V 32 | 33 | ### Configure paths for SEINE models 34 | 35 | Edit the model paths in both yaml files: 36 | * `./configs/ddim_inversion.yaml` 37 | * `./configs/pnp_edit.yaml` 38 | 39 | ```yaml 40 | # Model 41 | model_name: "seine" 42 | sd_path: "/stable-diffusion-v1-4" 43 | ckpt_path: "/SEINE/seine.pt" 44 | model_key: "/stable-diffusion-v1-4" 45 | ``` 46 | 47 | Theoretically, `` should equal `./pretrained`.
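For reference, if you keep the default `./pretrained` layout above, the filled-in paths should match the defaults that already ship in `./configs/ddim_inversion.yaml` and `./configs/pnp_edit.yaml` (adjust `ckpt_path` to wherever you actually saved `seine.pt`):

```yaml
# Model
model_name: "seine"
sd_path: "./pretrained/stable-diffusion-v1-4"
ckpt_path: "./pretrained/SEINE/seine.pt"
model_key: "./pretrained/stable-diffusion-v1-4"
```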
48 | 49 | 50 | ### Run SEINE DDIM Inversion to get the initial latent 51 | ```shell 52 | usage: run_ddim_inversion.py [-h] [--config CONFIG] [--video_path VIDEO_PATH] [--gpu GPU] 53 | [--width WIDTH] [--height HEIGHT] 54 | 55 | options: 56 | -h, --help show this help message and exit 57 | --config CONFIG 58 | --video_path VIDEO_PATH 59 | Path to the video to invert. 60 | --gpu GPU GPU number to use. 61 | --width WIDTH 62 | --height HEIGHT 63 | ``` 64 | 65 | Usage Example: 66 | ```shell 67 | python run_ddim_inversion.py --gpu 0 --video_path "../demo/Man Walking.mp4" --width 512 --height 512 68 | ``` 69 | 70 | The saved latents go to `./ddim-inversion` (this can be configured in `./configs/ddim_inversion.yaml`). 71 | 72 | ### Run AnyV2V with SEINE 73 | 74 | You need to prepare the edited first frame before running this step. We provide an image editing script in the root folder of AnyV2V. 75 | 76 | ```shell 77 | python run_pnp_edit.py --config ./configs/pnp_edit.yaml \ 78 | src_video_path="your_video.mp4" \ 79 | edited_first_frame_path="your edited first frame image.png" \ 80 | prompt="your prompt" \ 81 | device="cuda:0" 82 | ``` 83 | 84 | Usage Example: 85 | ```shell 86 | python run_pnp_edit.py --config ./configs/pnp_edit.yaml \ 87 | src_video_path="../demo/Man Walking.mp4" \ 88 | edited_first_frame_path="../demo/Man Walking/edited_first_frame/turn the man into darth vader.png" \ 89 | prompt="Darth Vader Walking" 90 | ``` 91 | 92 | The output video goes to `./anyv2v_results` (this can be configured in `./configs/pnp_edit.yaml`). 93 | -------------------------------------------------------------------------------- /seine/configs/ddim_inversion.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 1 3 | device: "cuda:0" 4 | debug: True # For logging 5 | 6 | # Dir 7 | exp_name: "default" 8 | output_dir: "ddim-inversion/${exp_name}" 9 | 10 | # Data 11 | data_dir: "/data" 12 | src_video_path: "${data_dir}/woman-running.mp4" #Override it with video_path 13 | image_size: [512, 512] 14 | save_video_frames: False 15 | 16 | # Model 17 | model_name: "seine" 18 | sd_path: "./pretrained/stable-diffusion-v1-4" 19 | ckpt_path: "./pretrained/SEINE/seine.pt" 20 | model_key: "./pretrained/stable-diffusion-v1-4" 21 | enable_xformers_memory_efficient_attention: True 22 | use_fp16: True 23 | 24 | # Scheduler 25 | beta_start: 0.0001 26 | beta_end: 0.02 27 | beta_schedule: "linear" 28 | 29 | # DDIM inversion 30 | n_steps: 500 31 | n_save_steps: 250 32 | n_frame_to_invert: 16 33 | inversion_prompt: "" 34 | batch_size: 1 # TODO: batchsize is always 1 for inversion, we can remove this -------------------------------------------------------------------------------- /seine/configs/pnp_edit.yaml: -------------------------------------------------------------------------------- 1 | # General 2 | seed: 1 3 | device: "cuda:0" 4 | debug: False # For logging 5 | 6 | # Dir 7 | exp_name: "default" 8 | output_dir: "anyv2v_results/${exp_name}" 9 | 10 | # Data 11 | data_dir: "/data" 12 | src_video_path: "${data_dir}/video.mp4" #Override it with src_video_path 13 | ddim_inversion_dir: 'ddim-inversion/default/' 14 | n_ddim_inversion_steps: 500 # for retrieving the latents of the inversion 15 | n_frame_inverted: 16 16 | n_frames: 16 17 | edited_first_frame_path: '/edited_first_frame.png' #Override it with edited_first_frame_path 18 | image_size: [512, 512] 19 | 20 | # Model 21 | model_name: "seine" 22 | sd_path: "./pretrained/stable-diffusion-v1-4" 23 | ckpt_path: "./pretrained/SEINE/seine.pt" 24 |
model_key: "./pretrained/stable-diffusion-v1-4" 25 | enable_xformers_memory_efficient_attention: True 26 | use_fp16: True 27 | 28 | # Schedular 29 | sample_method: 'ddpm' 30 | beta_start: 0.0001 31 | beta_end: 0.02 32 | beta_schedule: "linear" 33 | 34 | # Diffusion 35 | cfg_scale: 4 36 | n_steps: 50 37 | init_with_ddim_inversion: True 38 | prompt: "" #Override it with prompt 39 | negative_prompt: "" 40 | batch_size: 1 # TODO: batchsize is always 1, we can remove this 41 | 42 | # Pnp params -- injection thresholds ∈ [0, 1] 43 | enable_pnp: True 44 | pnp_f_t: 0.2 45 | pnp_spatial_attn_t: 0.2 46 | pnp_temp_attn_t: 0.5 47 | pnp_cross_attn_t: 0.0 48 | -------------------------------------------------------------------------------- /seine/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | from . import gaussian_diffusion as gd 7 | from .respace import SpacedDiffusion, space_timesteps 8 | 9 | 10 | def create_diffusion( 11 | timestep_respacing, 12 | noise_schedule="linear", 13 | use_kl=False, 14 | sigma_small=False, 15 | predict_xstart=False, 16 | # learn_sigma=True, 17 | learn_sigma=False, # for unet 18 | rescale_learned_sigmas=False, 19 | diffusion_steps=1000 20 | ): 21 | betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps) 22 | if use_kl: 23 | loss_type = gd.LossType.RESCALED_KL 24 | elif rescale_learned_sigmas: 25 | loss_type = gd.LossType.RESCALED_MSE 26 | else: 27 | loss_type = gd.LossType.MSE 28 | if timestep_respacing is None or timestep_respacing == "": 29 | timestep_respacing = [diffusion_steps] 30 | return SpacedDiffusion( 31 | use_timesteps=space_timesteps(diffusion_steps, timestep_respacing), 32 | betas=betas, 33 | model_mean_type=( 34 | gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X 35 | ), 36 | model_var_type=( 37 | ( 38 | gd.ModelVarType.FIXED_LARGE 39 | if not sigma_small 40 | else gd.ModelVarType.FIXED_SMALL 41 | ) 42 | if not learn_sigma 43 | else gd.ModelVarType.LEARNED_RANGE 44 | ), 45 | loss_type=loss_type 46 | # rescale_timesteps=rescale_timesteps, 47 | ) 48 | -------------------------------------------------------------------------------- /seine/diffusion/diffusion_utils.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | import torch as th 7 | import numpy as np 8 | 9 | 10 | def normal_kl(mean1, logvar1, mean2, logvar2): 11 | """ 12 | Compute the KL divergence between two gaussians. 13 | Shapes are automatically broadcasted, so batches can be compared to 14 | scalars, among other use cases. 15 | """ 16 | tensor = None 17 | for obj in (mean1, logvar1, mean2, logvar2): 18 | if isinstance(obj, th.Tensor): 19 | tensor = obj 20 | break 21 | assert tensor is not None, "at least one argument must be a Tensor" 22 | 23 | # Force variances to be Tensors. 
Broadcasting helps convert scalars to 24 | # Tensors, but it does not work for th.exp(). 25 | logvar1, logvar2 = [ 26 | x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) 27 | for x in (logvar1, logvar2) 28 | ] 29 | 30 | return 0.5 * ( 31 | -1.0 32 | + logvar2 33 | - logvar1 34 | + th.exp(logvar1 - logvar2) 35 | + ((mean1 - mean2) ** 2) * th.exp(-logvar2) 36 | ) 37 | 38 | 39 | def approx_standard_normal_cdf(x): 40 | """ 41 | A fast approximation of the cumulative distribution function of the 42 | standard normal. 43 | """ 44 | return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) 45 | 46 | 47 | def continuous_gaussian_log_likelihood(x, *, means, log_scales): 48 | """ 49 | Compute the log-likelihood of a continuous Gaussian distribution. 50 | :param x: the targets 51 | :param means: the Gaussian mean Tensor. 52 | :param log_scales: the Gaussian log stddev Tensor. 53 | :return: a tensor like x of log probabilities (in nats). 54 | """ 55 | centered_x = x - means 56 | inv_stdv = th.exp(-log_scales) 57 | normalized_x = centered_x * inv_stdv 58 | log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x) 59 | return log_probs 60 | 61 | 62 | def discretized_gaussian_log_likelihood(x, *, means, log_scales): 63 | """ 64 | Compute the log-likelihood of a Gaussian distribution discretizing to a 65 | given image. 66 | :param x: the target images. It is assumed that this was uint8 values, 67 | rescaled to the range [-1, 1]. 68 | :param means: the Gaussian mean Tensor. 69 | :param log_scales: the Gaussian log stddev Tensor. 70 | :return: a tensor like x of log probabilities (in nats). 71 | """ 72 | assert x.shape == means.shape == log_scales.shape 73 | centered_x = x - means 74 | inv_stdv = th.exp(-log_scales) 75 | plus_in = inv_stdv * (centered_x + 1.0 / 255.0) 76 | cdf_plus = approx_standard_normal_cdf(plus_in) 77 | min_in = inv_stdv * (centered_x - 1.0 / 255.0) 78 | cdf_min = approx_standard_normal_cdf(min_in) 79 | log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) 80 | log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) 81 | cdf_delta = cdf_plus - cdf_min 82 | log_probs = th.where( 83 | x < -0.999, 84 | log_cdf_plus, 85 | th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), 86 | ) 87 | assert log_probs.shape == x.shape 88 | return log_probs 89 | -------------------------------------------------------------------------------- /seine/diffusion/respace.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | import torch 6 | import numpy as np 7 | import torch as th 8 | 9 | from .gaussian_diffusion import GaussianDiffusion 10 | 11 | 12 | def space_timesteps(num_timesteps, section_counts): 13 | """ 14 | Create a list of timesteps to use from an original diffusion process, 15 | given the number of timesteps we want to take from equally-sized portions 16 | of the original process. 17 | For example, if there's 300 timesteps and the section counts are [10,15,20] 18 | then the first 100 timesteps are strided to be 10 timesteps, the second 100 19 | are strided to be 15 timesteps, and the final 100 are strided to be 20. 
20 | If the stride is a string starting with "ddim", then the fixed striding 21 | from the DDIM paper is used, and only one section is allowed. 22 | :param num_timesteps: the number of diffusion steps in the original 23 | process to divide up. 24 | :param section_counts: either a list of numbers, or a string containing 25 | comma-separated numbers, indicating the step count 26 | per section. As a special case, use "ddimN" where N 27 | is a number of steps to use the striding from the 28 | DDIM paper. 29 | :return: a set of diffusion steps from the original process to use. 30 | """ 31 | if isinstance(section_counts, str): 32 | if section_counts.startswith("ddim"): 33 | desired_count = int(section_counts[len("ddim") :]) 34 | for i in range(1, num_timesteps): 35 | if len(range(0, num_timesteps, i)) == desired_count: 36 | return set(range(0, num_timesteps, i)) 37 | raise ValueError( 38 | f"cannot create exactly {num_timesteps} steps with an integer stride" 39 | ) 40 | section_counts = [int(x) for x in section_counts.split(",")] 41 | size_per = num_timesteps // len(section_counts) 42 | extra = num_timesteps % len(section_counts) 43 | start_idx = 0 44 | all_steps = [] 45 | for i, section_count in enumerate(section_counts): 46 | size = size_per + (1 if i < extra else 0) 47 | if size < section_count: 48 | raise ValueError( 49 | f"cannot divide section of {size} steps into {section_count}" 50 | ) 51 | if section_count <= 1: 52 | frac_stride = 1 53 | else: 54 | frac_stride = (size - 1) / (section_count - 1) 55 | cur_idx = 0.0 56 | taken_steps = [] 57 | for _ in range(section_count): 58 | taken_steps.append(start_idx + round(cur_idx)) 59 | cur_idx += frac_stride 60 | all_steps += taken_steps 61 | start_idx += size 62 | return set(all_steps) 63 | 64 | 65 | class SpacedDiffusion(GaussianDiffusion): 66 | """ 67 | A diffusion process which can skip steps in a base diffusion process. 68 | :param use_timesteps: a collection (sequence or set) of timesteps from the 69 | original diffusion process to retain. 70 | :param kwargs: the kwargs to create the base diffusion process. 
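For example, SpacedDiffusion(use_timesteps=space_timesteps(1000, "ddim50"), betas=betas) keeps 50 evenly strided timesteps out of the original 1000, recomputes the betas for those steps, and remaps the model's timestep inputs onto them via timestep_map.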
71 | """ 72 | 73 | def __init__(self, use_timesteps, **kwargs): 74 | self.use_timesteps = set(use_timesteps) 75 | self.timestep_map = [] 76 | self.original_num_steps = len(kwargs["betas"]) 77 | 78 | base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa 79 | last_alpha_cumprod = 1.0 80 | new_betas = [] 81 | for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): 82 | if i in self.use_timesteps: 83 | new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) 84 | last_alpha_cumprod = alpha_cumprod 85 | self.timestep_map.append(i) 86 | kwargs["betas"] = np.array(new_betas) 87 | super().__init__(**kwargs) 88 | 89 | def p_mean_variance( 90 | self, model, *args, **kwargs 91 | ): # pylint: disable=signature-differs 92 | return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) 93 | 94 | # @torch.compile 95 | def training_losses( 96 | self, model, *args, **kwargs 97 | ): # pylint: disable=signature-differs 98 | return super().training_losses(self._wrap_model(model), *args, **kwargs) 99 | 100 | def condition_mean(self, cond_fn, *args, **kwargs): 101 | return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) 102 | 103 | def condition_score(self, cond_fn, *args, **kwargs): 104 | return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) 105 | 106 | def _wrap_model(self, model): 107 | if isinstance(model, _WrappedModel): 108 | return model 109 | return _WrappedModel( 110 | model, self.timestep_map, self.original_num_steps 111 | ) 112 | 113 | def _scale_timesteps(self, t): 114 | # Scaling is done by the wrapped model. 115 | return t 116 | 117 | 118 | class _WrappedModel: 119 | def __init__(self, model, timestep_map, original_num_steps): 120 | self.model = model 121 | self.timestep_map = timestep_map 122 | # self.rescale_timesteps = rescale_timesteps 123 | self.original_num_steps = original_num_steps 124 | 125 | def __call__(self, x, ts, **kwargs): 126 | map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) 127 | new_ts = map_tensor[ts] 128 | # if self.rescale_timesteps: 129 | # new_ts = new_ts.float() * (1000.0 / self.original_num_steps) 130 | return self.model(x, new_ts, **kwargs) 131 | -------------------------------------------------------------------------------- /seine/diffusion/timestep_sampler.py: -------------------------------------------------------------------------------- 1 | # Modified from OpenAI's diffusion repos 2 | # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py 3 | # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion 4 | # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 5 | 6 | from abc import ABC, abstractmethod 7 | 8 | import numpy as np 9 | import torch as th 10 | import torch.distributed as dist 11 | 12 | 13 | def create_named_schedule_sampler(name, diffusion): 14 | """ 15 | Create a ScheduleSampler from a library of pre-defined samplers. 16 | :param name: the name of the sampler. 17 | :param diffusion: the diffusion object to sample for. 18 | """ 19 | if name == "uniform": 20 | return UniformSampler(diffusion) 21 | elif name == "loss-second-moment": 22 | return LossSecondMomentResampler(diffusion) 23 | else: 24 | raise NotImplementedError(f"unknown schedule sampler: {name}") 25 | 26 | 27 | class ScheduleSampler(ABC): 28 | """ 29 | A distribution over timesteps in the diffusion process, intended to reduce 30 | variance of the objective. 
31 | By default, samplers perform unbiased importance sampling, in which the 32 | objective's mean is unchanged. 33 | However, subclasses may override sample() to change how the resampled 34 | terms are reweighted, allowing for actual changes in the objective. 35 | """ 36 | 37 | @abstractmethod 38 | def weights(self): 39 | """ 40 | Get a numpy array of weights, one per diffusion step. 41 | The weights needn't be normalized, but must be positive. 42 | """ 43 | 44 | def sample(self, batch_size, device): 45 | """ 46 | Importance-sample timesteps for a batch. 47 | :param batch_size: the number of timesteps. 48 | :param device: the torch device to save to. 49 | :return: a tuple (timesteps, weights): 50 | - timesteps: a tensor of timestep indices. 51 | - weights: a tensor of weights to scale the resulting losses. 52 | """ 53 | w = self.weights() 54 | p = w / np.sum(w) 55 | indices_np = np.random.choice(len(p), size=(batch_size,), p=p) 56 | indices = th.from_numpy(indices_np).long().to(device) 57 | weights_np = 1 / (len(p) * p[indices_np]) 58 | weights = th.from_numpy(weights_np).float().to(device) 59 | return indices, weights 60 | 61 | 62 | class UniformSampler(ScheduleSampler): 63 | def __init__(self, diffusion): 64 | self.diffusion = diffusion 65 | self._weights = np.ones([diffusion.num_timesteps]) 66 | 67 | def weights(self): 68 | return self._weights 69 | 70 | 71 | class LossAwareSampler(ScheduleSampler): 72 | def update_with_local_losses(self, local_ts, local_losses): 73 | """ 74 | Update the reweighting using losses from a model. 75 | Call this method from each rank with a batch of timesteps and the 76 | corresponding losses for each of those timesteps. 77 | This method will perform synchronization to make sure all of the ranks 78 | maintain the exact same reweighting. 79 | :param local_ts: an integer Tensor of timesteps. 80 | :param local_losses: a 1D Tensor of losses. 81 | """ 82 | batch_sizes = [ 83 | th.tensor([0], dtype=th.int32, device=local_ts.device) 84 | for _ in range(dist.get_world_size()) 85 | ] 86 | dist.all_gather( 87 | batch_sizes, 88 | th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), 89 | ) 90 | 91 | # Pad all_gather batches to be the maximum batch size. 92 | batch_sizes = [x.item() for x in batch_sizes] 93 | max_bs = max(batch_sizes) 94 | 95 | timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes] 96 | loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes] 97 | dist.all_gather(timestep_batches, local_ts) 98 | dist.all_gather(loss_batches, local_losses) 99 | timesteps = [ 100 | x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs] 101 | ] 102 | losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] 103 | self.update_with_all_losses(timesteps, losses) 104 | 105 | @abstractmethod 106 | def update_with_all_losses(self, ts, losses): 107 | """ 108 | Update the reweighting using losses from a model. 109 | Sub-classes should override this method to update the reweighting 110 | using losses from the model. 111 | This method directly updates the reweighting without synchronizing 112 | between workers. It is called by update_with_local_losses from all 113 | ranks with identical arguments. Thus, it should have deterministic 114 | behavior to maintain state across workers. 115 | :param ts: a list of int timesteps. 116 | :param losses: a list of float losses, one per timestep. 
117 | """ 118 | 119 | 120 | class LossSecondMomentResampler(LossAwareSampler): 121 | def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001): 122 | self.diffusion = diffusion 123 | self.history_per_term = history_per_term 124 | self.uniform_prob = uniform_prob 125 | self._loss_history = np.zeros( 126 | [diffusion.num_timesteps, history_per_term], dtype=np.float64 127 | ) 128 | self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64) 129 | 130 | def weights(self): 131 | if not self._warmed_up(): 132 | return np.ones([self.diffusion.num_timesteps], dtype=np.float64) 133 | weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1)) 134 | weights /= np.sum(weights) 135 | weights *= 1 - self.uniform_prob 136 | weights += self.uniform_prob / len(weights) 137 | return weights 138 | 139 | def update_with_all_losses(self, ts, losses): 140 | for t, loss in zip(ts, losses): 141 | if self._loss_counts[t] == self.history_per_term: 142 | # Shift out the oldest loss term. 143 | self._loss_history[t, :-1] = self._loss_history[t, 1:] 144 | self._loss_history[t, -1] = loss 145 | else: 146 | self._loss_history[t, self._loss_counts[t]] = loss 147 | self._loss_counts[t] += 1 148 | 149 | def _warmed_up(self): 150 | return (self._loss_counts == self.history_per_term).all() 151 | -------------------------------------------------------------------------------- /seine/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.split(sys.path[0])[0]) 4 | 5 | from .unet import UNet3DConditionModel 6 | from torch.optim.lr_scheduler import LambdaLR 7 | 8 | def customized_lr_scheduler(optimizer, warmup_steps=5000): # 5000 from u-vit 9 | from torch.optim.lr_scheduler import LambdaLR 10 | def fn(step): 11 | if warmup_steps > 0: 12 | return min(step / warmup_steps, 1) 13 | else: 14 | return 1 15 | return LambdaLR(optimizer, fn) 16 | 17 | 18 | def get_lr_scheduler(optimizer, name, **kwargs): 19 | if name == 'warmup': 20 | return customized_lr_scheduler(optimizer, **kwargs) 21 | elif name == 'cosine': 22 | from torch.optim.lr_scheduler import CosineAnnealingLR 23 | return CosineAnnealingLR(optimizer, **kwargs) 24 | else: 25 | raise NotImplementedError(name) 26 | 27 | def get_models(args): 28 | if 'UNet' in args.model: 29 | pretrained_model_path = args.pretrained_model_path 30 | return UNet3DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet", use_concat=args.use_mask) 31 | else: 32 | raise NotImplementedError('{} Model Not Supported!'.format(args.model)) 33 | -------------------------------------------------------------------------------- /seine/models/clip.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import torch.nn as nn 3 | from transformers import CLIPTokenizer, CLIPTextModel 4 | 5 | import transformers 6 | transformers.logging.set_verbosity_error() 7 | 8 | """ 9 | You will encounter the following warning: 10 | - This IS expected if you are initializing CLIPTextModel from the checkpoint of a model trained on another task 11 | or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). 12 | - This IS NOT expected if you are initializing CLIPTextModel from the checkpoint of a model 13 | that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
14 | 15 | https://github.com/CompVis/stable-diffusion/issues/97 16 | according to this issue, this warning is safe. 17 | 18 | This is expected since the vision backbone of the CLIP model is not needed to run Stable Diffusion. 19 | You can safely ignore the warning, it is not an error. 20 | 21 | This clip usage is from U-ViT and same with Stable Diffusion. 22 | """ 23 | 24 | class AbstractEncoder(nn.Module): 25 | def __init__(self): 26 | super().__init__() 27 | 28 | def encode(self, *args, **kwargs): 29 | raise NotImplementedError 30 | 31 | 32 | class FrozenCLIPEmbedder(AbstractEncoder): 33 | """Uses the CLIP transformer encoder for text (from Hugging Face)""" 34 | # def __init__(self, version="openai/clip-vit-huge-patch14", device="cuda", max_length=77): 35 | def __init__(self, path, device="cuda", max_length=77): 36 | super().__init__() 37 | self.tokenizer = CLIPTokenizer.from_pretrained(path, subfolder="tokenizer") 38 | self.transformer = CLIPTextModel.from_pretrained(path, subfolder='text_encoder') 39 | self.device = device 40 | self.max_length = max_length 41 | self.freeze() 42 | 43 | def freeze(self): 44 | self.transformer = self.transformer.eval() 45 | for param in self.parameters(): 46 | param.requires_grad = False 47 | 48 | def forward(self, text): 49 | batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, 50 | return_overflowing_tokens=False, padding="max_length", return_tensors="pt") 51 | tokens = batch_encoding["input_ids"].to(self.device) 52 | outputs = self.transformer(input_ids=tokens) 53 | 54 | z = outputs.last_hidden_state 55 | return z 56 | 57 | def encode(self, text): 58 | return self(text) 59 | 60 | 61 | class TextEmbedder(nn.Module): 62 | """ 63 | Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance. 64 | """ 65 | def __init__(self, path, device='cuda', dropout_prob=0.1): # Modified 66 | super().__init__() 67 | self.text_encodder = FrozenCLIPEmbedder(path=path, device=device) # Modified 68 | self.dropout_prob = dropout_prob 69 | 70 | def token_drop(self, text_prompts, force_drop_ids=None): 71 | """ 72 | Drops text to enable classifier-free guidance. 
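When force_drop_ids is None, each prompt is independently replaced with an empty string with probability dropout_prob; otherwise, the prompts whose force-drop id equals 1 are dropped.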
73 | """ 74 | if force_drop_ids is None: 75 | drop_ids = numpy.random.uniform(0, 1, len(text_prompts)) < self.dropout_prob 76 | else: 77 | # TODO 78 | drop_ids = force_drop_ids == 1 79 | labels = list(numpy.where(drop_ids, "", text_prompts)) 80 | # print(labels) 81 | return labels 82 | 83 | def forward(self, text_prompts, train, force_drop_ids=None): 84 | use_dropout = self.dropout_prob > 0 85 | if (train and use_dropout) or (force_drop_ids is not None): 86 | text_prompts = self.token_drop(text_prompts, force_drop_ids) 87 | embeddings = self.text_encodder(text_prompts) 88 | return embeddings 89 | 90 | 91 | if __name__ == '__main__': 92 | 93 | r""" 94 | Returns: 95 | 96 | Examples from CLIPTextModel: 97 | 98 | ```python 99 | >>> from transformers import AutoTokenizer, CLIPTextModel 100 | 101 | >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") 102 | >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") 103 | 104 | >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") 105 | 106 | >>> outputs = model(**inputs) 107 | >>> last_hidden_state = outputs.last_hidden_state 108 | >>> pooled_output = outputs.pooler_output # pooled (EOS token) states 109 | ```""" 110 | 111 | import torch 112 | 113 | device = "cuda" if torch.cuda.is_available() else "cpu" 114 | 115 | text_encoder = TextEmbedder(path='/mnt/petrelfs/maxin/work/pretrained/stable-diffusion-2-1-base', 116 | dropout_prob=0.00001).to(device) 117 | 118 | text_prompt = [["a photo of a cat", "a photo of a cat"], ["a photo of a dog", "a photo of a cat"], ['a photo of a dog human', "a photo of a cat"]] 119 | # text_prompt = ('None', 'None', 'None') 120 | output = text_encoder(text_prompts=text_prompt, train=False) 121 | # print(output) 122 | print(output.shape) 123 | # print(output.shape) -------------------------------------------------------------------------------- /seine/models/resnet.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py 2 | import os 3 | import sys 4 | sys.path.append(os.path.split(sys.path[0])[0]) 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | from einops import rearrange 11 | 12 | 13 | class InflatedConv3d(nn.Conv2d): 14 | def forward(self, x): 15 | video_length = x.shape[2] 16 | 17 | x = rearrange(x, "b c f h w -> (b f) c h w") 18 | x = super().forward(x) 19 | x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) 20 | 21 | return x 22 | 23 | 24 | class Upsample3D(nn.Module): 25 | def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"): 26 | super().__init__() 27 | self.channels = channels 28 | self.out_channels = out_channels or channels 29 | self.use_conv = use_conv 30 | self.use_conv_transpose = use_conv_transpose 31 | self.name = name 32 | 33 | conv = None 34 | if use_conv_transpose: 35 | raise NotImplementedError 36 | elif use_conv: 37 | conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) 38 | 39 | if name == "conv": 40 | self.conv = conv 41 | else: 42 | self.Conv2d_0 = conv 43 | 44 | def forward(self, hidden_states, output_size=None): 45 | assert hidden_states.shape[1] == self.channels 46 | 47 | if self.use_conv_transpose: 48 | raise NotImplementedError 49 | 50 | # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 51 | dtype = hidden_states.dtype 52 | 
if dtype == torch.bfloat16: 53 | hidden_states = hidden_states.to(torch.float32) 54 | 55 | # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 56 | if hidden_states.shape[0] >= 64: 57 | hidden_states = hidden_states.contiguous() 58 | 59 | # if `output_size` is passed we force the interpolation output 60 | # size and do not make use of `scale_factor=2` 61 | if output_size is None: 62 | hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest") 63 | else: 64 | hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") 65 | 66 | # If the input is bfloat16, we cast back to bfloat16 67 | if dtype == torch.bfloat16: 68 | hidden_states = hidden_states.to(dtype) 69 | 70 | if self.use_conv: 71 | if self.name == "conv": 72 | hidden_states = self.conv(hidden_states) 73 | else: 74 | hidden_states = self.Conv2d_0(hidden_states) 75 | 76 | return hidden_states 77 | 78 | 79 | class Downsample3D(nn.Module): 80 | def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): 81 | super().__init__() 82 | self.channels = channels 83 | self.out_channels = out_channels or channels 84 | self.use_conv = use_conv 85 | self.padding = padding 86 | stride = 2 87 | self.name = name 88 | 89 | if use_conv: 90 | conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding) 91 | else: 92 | raise NotImplementedError 93 | 94 | if name == "conv": 95 | self.Conv2d_0 = conv 96 | self.conv = conv 97 | elif name == "Conv2d_0": 98 | self.conv = conv 99 | else: 100 | self.conv = conv 101 | 102 | def forward(self, hidden_states): 103 | assert hidden_states.shape[1] == self.channels 104 | if self.use_conv and self.padding == 0: 105 | raise NotImplementedError 106 | 107 | assert hidden_states.shape[1] == self.channels 108 | hidden_states = self.conv(hidden_states) 109 | 110 | return hidden_states 111 | 112 | 113 | class ResnetBlock3D(nn.Module): 114 | def __init__( 115 | self, 116 | *, 117 | in_channels, 118 | out_channels=None, 119 | conv_shortcut=False, 120 | dropout=0.0, 121 | temb_channels=512, 122 | groups=32, 123 | groups_out=None, 124 | pre_norm=True, 125 | eps=1e-6, 126 | non_linearity="swish", 127 | time_embedding_norm="default", 128 | output_scale_factor=1.0, 129 | use_in_shortcut=None, 130 | ): 131 | super().__init__() 132 | self.pre_norm = pre_norm 133 | self.pre_norm = True 134 | self.in_channels = in_channels 135 | out_channels = in_channels if out_channels is None else out_channels 136 | self.out_channels = out_channels 137 | self.use_conv_shortcut = conv_shortcut 138 | self.time_embedding_norm = time_embedding_norm 139 | self.output_scale_factor = output_scale_factor 140 | 141 | if groups_out is None: 142 | groups_out = groups 143 | 144 | self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) 145 | 146 | self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) 147 | 148 | if temb_channels is not None: 149 | if self.time_embedding_norm == "default": 150 | time_emb_proj_out_channels = out_channels 151 | elif self.time_embedding_norm == "scale_shift": 152 | time_emb_proj_out_channels = out_channels * 2 153 | else: 154 | raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") 155 | 156 | self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels) 157 | else: 158 | self.time_emb_proj = None 159 | 160 | self.norm2 = 
torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) 161 | self.dropout = torch.nn.Dropout(dropout) 162 | self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) 163 | 164 | if non_linearity == "swish": 165 | self.nonlinearity = lambda x: F.silu(x) 166 | elif non_linearity == "mish": 167 | self.nonlinearity = Mish() 168 | elif non_linearity == "silu": 169 | self.nonlinearity = nn.SiLU() 170 | 171 | self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut 172 | 173 | self.conv_shortcut = None 174 | if self.use_in_shortcut: 175 | self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) 176 | 177 | def forward(self, input_tensor, temb): 178 | hidden_states = input_tensor 179 | 180 | hidden_states = self.norm1(hidden_states) 181 | hidden_states = self.nonlinearity(hidden_states) 182 | 183 | hidden_states = self.conv1(hidden_states) 184 | 185 | if temb is not None: 186 | temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None] 187 | 188 | if temb is not None and self.time_embedding_norm == "default": 189 | hidden_states = hidden_states + temb 190 | 191 | hidden_states = self.norm2(hidden_states) 192 | 193 | if temb is not None and self.time_embedding_norm == "scale_shift": 194 | scale, shift = torch.chunk(temb, 2, dim=1) 195 | hidden_states = hidden_states * (1 + scale) + shift 196 | 197 | hidden_states = self.nonlinearity(hidden_states) 198 | 199 | hidden_states = self.dropout(hidden_states) 200 | hidden_states = self.conv2(hidden_states) 201 | 202 | if self.conv_shortcut is not None: 203 | input_tensor = self.conv_shortcut(input_tensor) 204 | 205 | output_tensor = (input_tensor + hidden_states) / self.output_scale_factor 206 | 207 | return output_tensor 208 | 209 | 210 | class Mish(torch.nn.Module): 211 | def forward(self, hidden_states): 212 | return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states)) -------------------------------------------------------------------------------- /seine/models/utils.py: -------------------------------------------------------------------------------- 1 | # adopted from 2 | # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py 3 | # and 4 | # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py 5 | # and 6 | # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py 7 | # 8 | # thanks! 9 | 10 | 11 | import os 12 | import math 13 | import torch 14 | 15 | import numpy as np 16 | import torch.nn as nn 17 | 18 | from einops import repeat 19 | 20 | 21 | ################################################################################# 22 | # Unet Utils # 23 | ################################################################################# 24 | 25 | def checkpoint(func, inputs, params, flag): 26 | """ 27 | Evaluate a function without caching intermediate activations, allowing for 28 | reduced memory at the expense of extra compute in the backward pass. 29 | :param func: the function to evaluate. 30 | :param inputs: the argument sequence to pass to `func`. 31 | :param params: a sequence of parameters `func` depends on but does not 32 | explicitly take as arguments. 33 | :param flag: if False, disable gradient checkpointing. 
34 | """ 35 | if flag: 36 | args = tuple(inputs) + tuple(params) 37 | return CheckpointFunction.apply(func, len(inputs), *args) 38 | else: 39 | return func(*inputs) 40 | 41 | 42 | class CheckpointFunction(torch.autograd.Function): 43 | @staticmethod 44 | def forward(ctx, run_function, length, *args): 45 | ctx.run_function = run_function 46 | ctx.input_tensors = list(args[:length]) 47 | ctx.input_params = list(args[length:]) 48 | 49 | with torch.no_grad(): 50 | output_tensors = ctx.run_function(*ctx.input_tensors) 51 | return output_tensors 52 | 53 | @staticmethod 54 | def backward(ctx, *output_grads): 55 | ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] 56 | with torch.enable_grad(): 57 | # Fixes a bug where the first op in run_function modifies the 58 | # Tensor storage in place, which is not allowed for detach()'d 59 | # Tensors. 60 | shallow_copies = [x.view_as(x) for x in ctx.input_tensors] 61 | output_tensors = ctx.run_function(*shallow_copies) 62 | input_grads = torch.autograd.grad( 63 | output_tensors, 64 | ctx.input_tensors + ctx.input_params, 65 | output_grads, 66 | allow_unused=True, 67 | ) 68 | del ctx.input_tensors 69 | del ctx.input_params 70 | del output_tensors 71 | return (None, None) + input_grads 72 | 73 | 74 | def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): 75 | """ 76 | Create sinusoidal timestep embeddings. 77 | :param timesteps: a 1-D Tensor of N indices, one per batch element. 78 | These may be fractional. 79 | :param dim: the dimension of the output. 80 | :param max_period: controls the minimum frequency of the embeddings. 81 | :return: an [N x dim] Tensor of positional embeddings. 82 | """ 83 | if not repeat_only: 84 | half = dim // 2 85 | freqs = torch.exp( 86 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 87 | ).to(device=timesteps.device) 88 | args = timesteps[:, None].float() * freqs[None] 89 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 90 | if dim % 2: 91 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 92 | else: 93 | embedding = repeat(timesteps, 'b -> b d', d=dim).contiguous() 94 | return embedding 95 | 96 | 97 | def zero_module(module): 98 | """ 99 | Zero out the parameters of a module and return it. 100 | """ 101 | for p in module.parameters(): 102 | p.detach().zero_() 103 | return module 104 | 105 | 106 | def scale_module(module, scale): 107 | """ 108 | Scale the parameters of a module and return it. 109 | """ 110 | for p in module.parameters(): 111 | p.detach().mul_(scale) 112 | return module 113 | 114 | 115 | def mean_flat(tensor): 116 | """ 117 | Take the mean over all non-batch dimensions. 118 | """ 119 | return tensor.mean(dim=list(range(1, len(tensor.shape)))) 120 | 121 | 122 | def normalization(channels): 123 | """ 124 | Make a standard normalization layer. 125 | :param channels: number of input channels. 126 | :return: an nn.Module for normalization. 127 | """ 128 | return GroupNorm32(32, channels) 129 | 130 | 131 | # PyTorch 1.7 has SiLU, but we support PyTorch 1.5. 132 | class SiLU(nn.Module): 133 | def forward(self, x): 134 | return x * torch.sigmoid(x) 135 | 136 | 137 | class GroupNorm32(nn.GroupNorm): 138 | def forward(self, x): 139 | return super().forward(x.float()).type(x.dtype) 140 | 141 | def conv_nd(dims, *args, **kwargs): 142 | """ 143 | Create a 1D, 2D, or 3D convolution module. 
144 | """ 145 | if dims == 1: 146 | return nn.Conv1d(*args, **kwargs) 147 | elif dims == 2: 148 | return nn.Conv2d(*args, **kwargs) 149 | elif dims == 3: 150 | return nn.Conv3d(*args, **kwargs) 151 | raise ValueError(f"unsupported dimensions: {dims}") 152 | 153 | 154 | def linear(*args, **kwargs): 155 | """ 156 | Create a linear module. 157 | """ 158 | return nn.Linear(*args, **kwargs) 159 | 160 | 161 | def avg_pool_nd(dims, *args, **kwargs): 162 | """ 163 | Create a 1D, 2D, or 3D average pooling module. 164 | """ 165 | if dims == 1: 166 | return nn.AvgPool1d(*args, **kwargs) 167 | elif dims == 2: 168 | return nn.AvgPool2d(*args, **kwargs) 169 | elif dims == 3: 170 | return nn.AvgPool3d(*args, **kwargs) 171 | raise ValueError(f"unsupported dimensions: {dims}") 172 | 173 | 174 | # class HybridConditioner(nn.Module): 175 | 176 | # def __init__(self, c_concat_config, c_crossattn_config): 177 | # super().__init__() 178 | # self.concat_conditioner = instantiate_from_config(c_concat_config) 179 | # self.crossattn_conditioner = instantiate_from_config(c_crossattn_config) 180 | 181 | # def forward(self, c_concat, c_crossattn): 182 | # c_concat = self.concat_conditioner(c_concat) 183 | # c_crossattn = self.crossattn_conditioner(c_crossattn) 184 | # return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} 185 | 186 | 187 | def noise_like(shape, device, repeat=False): 188 | repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1))) 189 | noise = lambda: torch.randn(shape, device=device) 190 | return repeat_noise() if repeat else noise() 191 | 192 | def count_flops_attn(model, _x, y): 193 | """ 194 | A counter for the `thop` package to count the operations in an 195 | attention operation. 196 | Meant to be used like: 197 | macs, params = thop.profile( 198 | model, 199 | inputs=(inputs, timestamps), 200 | custom_ops={QKVAttention: QKVAttention.count_flops}, 201 | ) 202 | """ 203 | b, c, *spatial = y[0].shape 204 | num_spatial = int(np.prod(spatial)) 205 | # We perform two matmuls with the same number of ops. 206 | # The first computes the weight matrix, the second computes 207 | # the combination of the value vectors. 
208 | matmul_ops = 2 * b * (num_spatial ** 2) * c 209 | model.total_ops += torch.DoubleTensor([matmul_ops]) 210 | 211 | def count_params(model, verbose=False): 212 | total_params = sum(p.numel() for p in model.parameters()) 213 | if verbose: 214 | print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.") 215 | return total_params -------------------------------------------------------------------------------- /seine/requirement.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchaudio==2.0.2 3 | torchvision==0.15.2 4 | decord==0.6.0 5 | diffusers==0.15.0 6 | imageio==2.29.0 7 | transformers==4.29.2 8 | xformers==0.0.20 9 | einops 10 | omegaconf 11 | tensorboard==2.15.1 12 | timm==0.9.10 13 | rotary-embedding-torch==0.3.5 14 | natsort==8.4.0 15 | moviepy -------------------------------------------------------------------------------- /seine/seine_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | # Adopt from SEINE/utils.py 5 | def mask_generation_before(mask_type, shape, dtype, device, dropout_prob=0.0, use_image_num=0): 6 | b, f, c, h, w = shape 7 | if mask_type.startswith("first"): 8 | num = int(mask_type.split("first")[-1]) 9 | mask_f = torch.cat( 10 | [ 11 | torch.zeros(1, num, 1, 1, 1, dtype=dtype, device=device), 12 | torch.ones(1, f - num, 1, 1, 1, dtype=dtype, device=device), 13 | ], 14 | dim=1, 15 | ) 16 | mask = mask_f.expand(b, -1, c, h, w) 17 | elif mask_type.startswith("all"): 18 | mask = torch.ones(b, f, c, h, w, dtype=dtype, device=device) 19 | elif mask_type.startswith("onelast"): 20 | num = int(mask_type.split("onelast")[-1]) 21 | mask_one = torch.zeros(1, 1, 1, 1, 1, dtype=dtype, device=device) 22 | mask_mid = torch.ones(1, f - 2 * num, 1, 1, 1, dtype=dtype, device=device) 23 | mask_last = torch.zeros_like(mask_one) 24 | mask = torch.cat([mask_one] * num + [mask_mid] + [mask_last] * num, dim=1) 25 | mask = mask.expand(b, -1, c, h, w) 26 | else: 27 | raise ValueError(f"Invalid mask type: {mask_type}") 28 | return mask 29 | 30 | --------------------------------------------------------------------------------
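A minimal usage sketch for `mask_generation_before` above (the latent shape, dtype, and mask type below are illustrative assumptions, not values taken from the AnyV2V pipeline code):

```python
import torch

from seine_utils import mask_generation_before  # import path assumed

# Illustrative latent shape: (batch, frames, channels, height, width)
shape = (1, 16, 4, 64, 64)
mask = mask_generation_before("first1", shape, dtype=torch.float16, device="cpu")

# "first1" marks the first frame as conditioning (0) and the remaining frames as to-be-generated (1)
assert mask.shape == shape
assert mask[:, 0].max() == 0 and mask[:, 1:].min() == 1
```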