├── .gitignore ├── ACKNOWLEDGEMENTS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── alignment ├── aesthetic_scorer.py ├── assets │ ├── DrawBench Prompts.csv │ ├── HPDv2 │ │ ├── benchmark_anime.json │ │ ├── benchmark_concept-art.json │ │ ├── benchmark_paintings.json │ │ └── benchmark_photo.json │ ├── activities.txt │ ├── activities_v0.txt │ ├── drawbench.json │ ├── imagenet_classes.txt │ ├── sac+logos+ava1-l14-linearMSE.pth │ └── simple_animals.txt ├── diffusers_patch │ ├── ddim_with_logprob.py │ └── pipeline_with_logprob.py ├── flow.py ├── model_configs │ └── ViT-H-14.json ├── prompts.py ├── rewards.py └── utils.py ├── config └── sd.yaml ├── scripts ├── distributed.py └── train_gfn.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | .idea/ 3 | .vscode/ 4 | *.mps 5 | wandb/ 6 | .psync 7 | *.out 8 | *.pt 9 | *.pkl 10 | *.ipynb_checkpoints 11 | .ipynb 12 | *.pdf 13 | *.png 14 | *.whl 15 | *.tar.gz 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | pip-wheel-metadata/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | -------------------------------------------------------------------------------- /ACKNOWLEDGEMENTS: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | Portions of this Diffusion Alignment GFlowNet Software may utilize the following copyrighted 3 | material, the use of which is hereby acknowledged. 4 | 5 | _____________________ 6 | 7 | Kevin Black (ddpo-pytorch) 8 | 9 | Copyright (c) 2023 Kevin Black 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the 
standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, 71 | available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository. 4 | 5 | While we welcome new pull requests and issues please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged. 6 | 7 | ## Before you get started 8 | 9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, 10 | and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE). 11 | 12 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2024 Apple Inc. All Rights Reserved. 2 | 3 | IMPORTANT: This Apple software is supplied to you by Apple 4 | Inc. ("Apple") in consideration of your agreement to the following 5 | terms, and your use, installation, modification or redistribution of 6 | this Apple software constitutes acceptance of these terms. If you do 7 | not agree with these terms, please do not use, install, modify or 8 | redistribute this Apple software. 
9 | 10 | In consideration of your agreement to abide by the following terms, and 11 | subject to these terms, Apple grants you a personal, non-exclusive 12 | license, under Apple's copyrights in this original Apple software (the 13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple 14 | Software, with or without modifications, in source and/or binary forms; 15 | provided that if you redistribute the Apple Software in its entirety and 16 | without modifications, you must retain this notice and the following 17 | text and disclaimers in all such redistributions of the Apple Software. 18 | Neither the name, trademarks, service marks or logos of Apple Inc. may 19 | be used to endorse or promote products derived from the Apple Software 20 | without specific prior written permission from Apple. Except as 21 | expressly stated in this notice, no other rights or licenses, express or 22 | implied, are granted by Apple herein, including but not limited to any 23 | patent rights that may be infringed by your derivative works or by other 24 | works in which the Apple Software may be incorporated. 25 | 26 | The Apple Software is provided by Apple on an "AS IS" basis. APPLE 27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION 28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND 30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. 
31 | 32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL 33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, 36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED 37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), 38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. 40 | 41 | ------------------------------------------------------------------------------- 42 | SOFTWARE DISTRIBUTED WITH Diffusion Alignment GFlowNet: 43 | 44 | This software includes a number of subcomponents with separate 45 | copyright notices and license terms - please see the file ACKNOWLEDGEMENTS. 46 | ------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Diffusion Alignment GFlowNet 2 | 3 | This is the PyTorch implementation for our paper [Improving GFlowNets for Text-to-Image Diffusion Alignment 4 | ](https://arxiv.org/abs/2406.00633). 5 | 6 | This work proposes methods to align text-to-image diffusion models with given reward functions 7 | through the algorithmic framework of GFlowNet. 8 | We provide code for DAG (diffusion alignment with gflownet) 9 | and DAG-KL (DAG with KL divergence based gradient). For more details, we refer to our paper. 10 | 11 | ## Installation 12 | 13 | ```bash 14 | pip install -e . 15 | ``` 16 | 17 | ## Usage 18 | 19 | Diffusion alignment training with GFlowNet on Stable Diffusion: 20 | ```bash 21 | torchrun --standalone --nproc_per_node=8 scripts/train_gfn.py 22 | ``` 23 | To use DAG-KL, set `config['train']['klpf']` in `config/sd.yaml` to a positive coefficient. 
24 | 25 | 26 | ## Important Hyperparameters 27 | 28 | A detailed explanation of all the hyperparameters can be found in `config/sd.yaml`. 29 | 30 | ### prompt_fn and reward_fn 31 | At a high level, the problem of finetuning a diffusion model is defined by 2 things: 32 | a set of prompts to generate images, and a reward function to evaluate those images. 33 | The prompts are defined by a `prompt_fn` which takes no arguments and 34 | generates a random prompt each time it is called. 35 | The reward function is defined by a `reward_fn` which takes in a batch of images and returns 36 | a batch of rewards for those images. All of the prompt and reward functions currently implemented can be 37 | found in `alignment/prompts.py` and `alignment/rewards.py`, respectively. 38 | 39 | ## Acknowledgements 40 | 41 | We thank the authors of the [ddpo-pytorch](https://github.com/kvablack/ddpo-pytorch) repository for open-sourcing their code, 42 | on which part of our code is based. 43 | 44 | 45 | ## Citation 46 | If you find this code useful, please consider citing our paper: 47 | ``` 48 | @article{diffusion_alignment_gfn, 49 | title={Improving GFlowNets for Text-to-Image Diffusion Alignment}, 50 | author={Dinghuai Zhang and Yizhe Zhang and Jiatao Gu and Ruixiang Zhang and Josh Susskind and Navdeep Jaitly and Shuangfei Zhai}, 51 | journal={Arxiv}, 52 | year={2024}, 53 | url={https://arxiv.org/abs/2406.00633}, 54 | } 55 | ``` -------------------------------------------------------------------------------- /alignment/aesthetic_scorer.py: -------------------------------------------------------------------------------- 1 | # For licensing see accompanying LICENSE file. 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved. 
3 | 4 | # Based on https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/fe88a163f4661b4ddabba0751ff645e2e620746e/simple_inference.py 5 | 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from transformers import CLIPModel, CLIPProcessor 10 | from PIL import Image 11 | 12 | import torch.distributed as dist 13 | from scripts.distributed import get_local_rank 14 | 15 | import sys 16 | if sys.version_info < (3, 9): 17 | from importlib_resources import files 18 | else: 19 | from importlib.resources import files 20 | ASSETS_PATH = files("alignment.assets") 21 | 22 | 23 | class MLP(nn.Module): 24 | def __init__(self): 25 | super().__init__() 26 | self.layers = nn.Sequential( 27 | nn.Linear(768, 1024), 28 | nn.Dropout(0.2), 29 | nn.Linear(1024, 128), 30 | nn.Dropout(0.2), 31 | nn.Linear(128, 64), 32 | nn.Dropout(0.1), 33 | nn.Linear(64, 16), 34 | nn.Linear(16, 1), 35 | ) 36 | 37 | # @torch.no_grad() 38 | def forward(self, embed): 39 | return self.layers(embed) 40 | 41 | 42 | class AestheticScorer(torch.nn.Module): 43 | def __init__(self, dtype, distributed=True): 44 | super().__init__() 45 | if distributed: 46 | if get_local_rank() == 0: # only download once 47 | self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") 48 | self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") 49 | dist.barrier() 50 | self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") 51 | self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") 52 | 53 | self.mlp = MLP() 54 | state_dict = torch.load( 55 | ASSETS_PATH.joinpath("sac+logos+ava1-l14-linearMSE.pth") 56 | ) 57 | self.mlp.load_state_dict(state_dict) 58 | self.dtype = dtype 59 | self.eval() 60 | 61 | # @torch.no_grad() 62 | def __call__(self, images): 63 | device = next(self.parameters()).device 64 | inputs = self.processor(images=images, return_tensors="pt") 65 | inputs = {k: v.to(self.dtype).to(device) for k, v in 
inputs.items()} 66 | embed = self.clip.get_image_features(**inputs) 67 | # normalize embedding 68 | embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True) 69 | return self.mlp(embed).squeeze(1) -------------------------------------------------------------------------------- /alignment/assets/DrawBench Prompts.csv: -------------------------------------------------------------------------------- 1 | Prompts,Category 2 | A red colored car.,Colors 3 | A black colored car.,Colors 4 | A pink colored car.,Colors 5 | A black colored dog.,Colors 6 | A red colored dog.,Colors 7 | A blue colored dog.,Colors 8 | A green colored banana.,Colors 9 | A red colored banana.,Colors 10 | A black colored banana.,Colors 11 | A white colored sandwich.,Colors 12 | A black colored sandwich.,Colors 13 | An orange colored sandwich.,Colors 14 | A pink colored giraffe.,Colors 15 | A yellow colored giraffe.,Colors 16 | A brown colored giraffe.,Colors 17 | A red car and a white sheep.,Colors 18 | A blue bird and a brown bear.,Colors 19 | A green apple and a black backpack.,Colors 20 | A green cup and a blue cell phone.,Colors 21 | A yellow book and a red vase.,Colors 22 | A white car and a red sheep.,Colors 23 | A brown bird and a blue bear.,Colors 24 | A black apple and a green backpack.,Colors 25 | A blue cup and a green cell phone.,Colors 26 | A red book and a yellow vase.,Colors 27 | A horse riding an astronaut.,Conflicting 28 | A pizza cooking an oven.,Conflicting 29 | A bird scaring a scarecrow.,Conflicting 30 | A blue coloured pizza.,Conflicting 31 | Hovering cow abducting aliens.,Conflicting 32 | A panda making latte art.,Conflicting 33 | A shark in the desert.,Conflicting 34 | An elephant under the sea.,Conflicting 35 | Rainbow coloured penguin.,Conflicting 36 | A fish eating a pelican.,Conflicting 37 | One car on the street.,Counting 38 | Two cars on the street.,Counting 39 | Three cars on the street.,Counting 40 | Four cars on the street.,Counting 41 | Five cars on the 
street.,Counting 42 | One dog on the street.,Counting 43 | Two dogs on the street.,Counting 44 | Three dogs on the street.,Counting 45 | Four dogs on the street.,Counting 46 | Five dogs on the street.,Counting 47 | One cat and one dog sitting on the grass.,Counting 48 | One cat and two dogs sitting on the grass.,Counting 49 | One cat and three dogs sitting on the grass.,Counting 50 | Two cats and one dog sitting on the grass.,Counting 51 | Two cats and two dogs sitting on the grass.,Counting 52 | Two cats and three dogs sitting on the grass.,Counting 53 | Three cats and one dog sitting on the grass.,Counting 54 | Three cats and two dogs sitting on the grass.,Counting 55 | Three cats and three dogs sitting on the grass.,Counting 56 | A triangular purple flower pot. A purple flower pot in the shape of a triangle.,DALL-E 57 | A triangular orange picture frame. An orange picture frame in the shape of a triangle.,DALL-E 58 | A triangular pink stop sign. A pink stop sign in the shape of a triangle.,DALL-E 59 | A cube made of denim. A cube with the texture of denim.,DALL-E 60 | A sphere made of kitchen tile. A sphere with the texture of kitchen tile.,DALL-E 61 | A cube made of brick. A cube with the texture of brick.,DALL-E 62 | A collection of nail is sitting on a table.,DALL-E 63 | A single clock is sitting on a table.,DALL-E 64 | A couple of glasses are sitting on a table.,DALL-E 65 | An illustration of a large red elephant sitting on a small blue mouse.,DALL-E 66 | An illustration of a small green elephant standing behind a large red mouse.,DALL-E 67 | A small blue book sitting on a large red book.,DALL-E 68 | "A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.",DALL-E 69 | "A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. 
The green cube is on the bottom.",DALL-E 70 | "A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.",DALL-E 71 | "An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.",DALL-E 72 | "An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.",DALL-E 73 | A fisheye lens view of a turtle sitting in a forest.,DALL-E 74 | A side view of an owl sitting in a field.,DALL-E 75 | A cross-section view of a brain.,DALL-E 76 | "A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.",Descriptions 77 | "A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.",Descriptions 78 | "A small vessel propelled on water by oars, sails, or an engine.",Descriptions 79 | A connection point by which firefighters can tap into a water supply.,Descriptions 80 | "A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.",Descriptions 81 | "A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.",Descriptions 82 | "A separate seat for one person, typically with a back and four legs.",Descriptions 83 | An appliance or compartment which is artificially kept cool and used to store food and drink.,Descriptions 84 | A mechanical or electrical device for measuring time.,Descriptions 85 | "An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.",Descriptions 86 | "A large plant-eating 
domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.",Descriptions 87 | A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.,Descriptions 88 | "A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.",Descriptions 89 | "A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.",Descriptions 90 | "An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.",Descriptions 91 | "An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.",Descriptions 92 | "A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.",Descriptions 93 | "A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.",Descriptions 94 | "A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.",Descriptions 95 | A machine resembling a human being and able to replicate certain human movements and functions automatically.,Descriptions 96 | Paying for a quarter-sized pizza with a pizza-sized quarter.,Gary Marcus et al. 
97 | An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.,Gary Marcus et al. 98 | "A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.",Gary Marcus et al. 99 | "In late afternoon in January in New England, a man stands in the shadow of a maple tree.",Gary Marcus et al. 100 | An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.,Gary Marcus et al. 101 | A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.,Gary Marcus et al. 102 | A pear cut into seven pieces arranged in a ring.,Gary Marcus et al. 103 | "A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.",Gary Marcus et al. 104 | "Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.",Gary Marcus et al. 105 | Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.,Gary Marcus et al. 106 | Tcennis rpacket.,Misspellings 107 | Bzaseball galove.,Misspellings 108 | Rbefraigerator.,Misspellings 109 | Dininrg tablez.,Misspellings 110 | Pafrking metr.,Misspellings 111 | "A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie.",Misspellings 112 | "A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. 
It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked.",Misspellings 113 | "An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes.",Misspellings 114 | "A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche.",Misspellings 115 | "A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs.",Misspellings 116 | A train on top of a surfboard.,Positional 117 | A wine glass on top of a dog.,Positional 118 | A bicycle on top of a boat.,Positional 119 | An umbrella on top of a spoon.,Positional 120 | A laptop on top of a teddy bear.,Positional 121 | A giraffe underneath a microwave.,Positional 122 | A donut underneath a toilet.,Positional 123 | A hair drier underneath a sheep.,Positional 124 | A tennis racket underneath a traffic light.,Positional 125 | A zebra underneath a broccoli.,Positional 126 | A banana on the left of an apple.,Positional 127 | A couch on the left of a chair.,Positional 128 | A car on the left of a bus.,Positional 129 | A cat on the left of a dog.,Positional 130 | A carrot on the left of a broccoli.,Positional 131 | A pizza on the right of a suitcase.,Positional 132 | A cat on the right of a tennis racket.,Positional 133 | A stop sign on the right of a refrigerator.,Positional 134 | A sheep to the right of a wine glass.,Positional 135 | A zebra to the right of a fire hydrant.,Positional 136 | Acersecomicke.,Rare Words 137 | Jentacular.,Rare 
Words 138 | Matutinal.,Rare Words 139 | Peristeronic.,Rare Words 140 | Artophagous.,Rare Words 141 | Backlotter.,Rare Words 142 | Octothorpe.,Rare Words 143 | A church with stained glass windows depicting a hamburger and french fries.,Reddit 144 | "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.",Reddit 145 | "A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.",Reddit 146 | A photo of a confused grizzly bear in calculus class.,Reddit 147 | An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.,Reddit 148 | "A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.",Reddit 149 | "A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.",Reddit 150 | A 1960s yearbook photo with animals dressed as humans.,Reddit 151 | Lego Arnold Schwarzenegger.,Reddit 152 | A yellow and black bus cruising through the rainforest.,Reddit 153 | A medieval painting of the wifi not working.,Reddit 154 | "An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. 
Found in the Baths of Trajan, 1506.",Reddit 155 | "35mm macro shot a kitten licking a baby duck, studio lighting.",Reddit 156 | McDonalds Church.,Reddit 157 | Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.,Reddit 158 | Greek statue of a man tripping over a cat.,Reddit 159 | "An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.",Reddit 160 | Photo of a cat singing in a barbershop quartet.,Reddit 161 | "A painting by Grant Wood of an astronaut couple, american gothic style.",Reddit 162 | An oil painting portrait of the regal Burger King posing with a Whopper.,Reddit 163 | "A keyboard made of water, the water is made of light, the light is turned off.",Reddit 164 | Painting of Mona Lisa but the view is from behind of Mona Lisa.,Reddit 165 | Hyper-realistic photo of an abandoned industrial site during a storm.,Reddit 166 | A screenshot of an iOS app for ordering different types of milk.,Reddit 167 | "A real life photography of super mario, 8k Ultra HD.",Reddit 168 | Colouring page of large cats climbing the eifel tower in a cyberpunk future.,Reddit 169 | Photo of a mega Lego space station inside a kid's bedroom.,Reddit 170 | A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.,Reddit 171 | A photocopy of a photograph of a painting of a sculpture of a giraffe.,Reddit 172 | "A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.",Reddit 173 | "A maglev train going vertically downward in high speed, New York Times photojournalism.",Reddit 174 | A magnifying glass over a page of a 1950s batman comic.,Reddit 175 | "A car playing soccer, digital art.",Reddit 176 | Darth Vader playing with raccoon in Mars during sunset.,Reddit 177 | A 1960s poster warning against climate change.,Reddit 178 | Illustration of a mouse using a mushroom as an umbrella.,Reddit 179 | A realistic photo of a Pomeranian dressed up like a 1980s 
professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.,Reddit 180 | A pyramid made of falafel with a partial solar eclipse in the background.,Reddit 181 | A storefront with 'Hello World' written on it.,Text 182 | A storefront with 'Diffusion' written on it.,Text 183 | A storefront with 'Text to Image' written on it.,Text 184 | A storefront with 'NeurIPS' written on it.,Text 185 | A storefront with 'Deep Learning' written on it.,Text 186 | A storefront with 'Google Brain Toronto' written on it.,Text 187 | A storefront with 'Google Research Pizza Cafe' written on it.,Text 188 | A sign that says 'Hello World'.,Text 189 | A sign that says 'Diffusion'.,Text 190 | A sign that says 'Text to Image'.,Text 191 | A sign that says 'NeurIPS'.,Text 192 | A sign that says 'Deep Learning'.,Text 193 | A sign that says 'Google Brain Toronto'.,Text 194 | A sign that says 'Google Research Pizza Cafe'.,Text 195 | New York Skyline with 'Hello World' written with fireworks on the sky.,Text 196 | New York Skyline with 'Diffusion' written with fireworks on the sky.,Text 197 | New York Skyline with 'Text to Image' written with fireworks on the sky.,Text 198 | New York Skyline with 'NeurIPS' written with fireworks on the sky.,Text 199 | New York Skyline with 'Deep Learning' written with fireworks on the sky.,Text 200 | New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.,Text 201 | New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.,Text -------------------------------------------------------------------------------- /alignment/assets/HPDv2/benchmark_photo.json: -------------------------------------------------------------------------------- 1 | ["A man taking a drink from a water fountain.", "Fruit in a jar filled with liquid sitting on a wooden table.", "A bathroom sink cluttered with multiple personal care items.", "A smiling man is cooking in his kitchen.", 
"A beautiful blue and pink sky overlooking the beach.", "A man smiles as he stirs his food in the pot.", "Several bikers are lined up in a parking lot.", "There is no picture or image sorry sorry", "A small car parked in by a vespa.", "Several people around some motorcycles on the side of a road.", "A black and white cat looking out a window over another cat.", "A woman in a purple top pulling food out of a oven", "Fighter jets on display in front of a museum.", "An empty road with buildings on each side.", "Two vespas parked next to a light post.", "A peak into a long bathroom with a toilet, but no shower.", "A face car driving past a parked motorcycle.", "A computer monitor glows on a wooden desk that has a black computer chair near it.", "a medium sized plane on an air port run way", "A bicycle chained to a pole on a snowy day", "A half eaten dessert and half empty cup.", "A blue airplane in a blue, cloudless sky", "A corner view of a kitchen with white appliances and dark wood cabinets.", "a cat laying on the floor of a kitchen", "A man and his dog riding on a bike. ", "A bathroom with a toilet and sink inside.", "A bathroom stall containing an empty toilet in it.", "A brown and black dog sticking its head out a window.", "A white busted up toilet sitting on it's side.", "a hairy man lying on a bench besides a bush", "A bunch of people waiting in line by a rail.", "A counter in a coffee house with choices of coffee and syrup flavors.", "Motorcycles parked on the sidewalk next to a road.", "A dresser in a room that is painted bright yellow.", "A person with his head out of a window while on a train. 
", "a tiled bathroom with a toilet and sink inside of it ", "A man sitting in a chair, in a black and white photo.", "A bathroom with clear glass shower door and tile floor.", "A dog sitting in a bathroom with a urinal and a torn wall.", "A white expensive car parked on top of a cement slab.", "An airplane flying past the Moon in the sky.", "A woman sitting under an umbrella in the middle of a restaurant.", "A woman getting ready to cook some food in a small kitchen.", "A car sitting in the middle of the grass in the rain.", "A man and woman riding on the back of a motorcycle.", "Foods are being put in to the mason jars", "A man sitting on a bench in a lobby.", "A motorcycle is parked next to the fire hydrant", "A bike parked on top of a boat.", "A for of four urinals mounted to a wall.", "A couple of old fashioned oak wood dining tables.", "A magazine with a couple of cat around a toilet on it's cover.", "this is a very dark picture of a room with a shelf", "Meat left out on the kitchen counter could spoil.", "People leaning out the windows of a train as it goes through the countryside.", "a small plane with a propellor sitting on a runway", "An out house with the door opened sitting in a field.", "A man sitting on a modern bench talking on a phone.", "A woman wearing a hair net cutting a large sheet cake.", "a toilet sits next to a shower and sink ", "Chopped meat laid out on towels in a home kitchen, in preparation for cooking.", "A cat standing on a toilet seat in a bathroom.", "A small engine plane sitting on a runway.", "A bicycle parked and leaning against a brick building.", "there is a small kitten inside of a sink", "A kitchen with a wooden table with a cat sleeping on top of it.", "A small kitchen does have plenty of cabinets.", "A white bathroom with a white toilet and sink.", "a small propeller plane sits on a run way ", "A white toilet sitting next to a large window.", "A large fire place sitting next to a doorway.", "A man wearing a black neck tie 
and glasses.", "Large shower sectional of a bathroom in a brown and white photograph.", "A bike sitting next to a brick wall in the open.", "A group of motor bikes on a street.", "there are many people trying to avoid the rain", "The small, single engine airplane is parked on the tarmac. ", "A woman eating vegetables in front of a stove.", "The blue shower curtains are inside of the bathtub next to the toilet. ", "A group of people with umbrellas standing around a white car.", "A man hanging his head out of the side of a train.", "A couple of small rooms in a house.", "Personal computer desk room with large glass double doors.", "A modern style bathroom with a large tub and shower and tile floor.", "A golden bicycle with a basket next to a brick wall.", "A man getting food ready while people watch.", "A wire fence containing various hair clips with a building in the background.", "a vintage photo of some people sitting on a bench ", "An elderly lady pours some cups of tea on a tray.", "A white jet airliner parked on a runway at night.", "there is a chocolate cake and ice cream on a plate", "An outhouse sitting in the middle of a field.", "A bunch of birds that are sitting on steps.", "A city filled with lots of tall white buildings.", "A bathroom sink that is under a mirror.", "there is a mirror and a picture on the wall ", "A man that is sitting on a couch.", "there us a woman and a young child sitting on a bench", "A woman that is sitting under an umbrella.", "A woman that is standing near an open oven.", "there is a white toilet and a sink in this bathroom ", "A group of people posing with festive items.", "A woman in an orange vest and blue helmet riding a horse up a flight of stairs.", "A group of people that are sitting on bikes in the street.", "A group of motorcycles are parked next together.", "A black motorcycle parked on a brick sidewalk next to a road.", "Cars, people, buildings and street lamps on a city street.", "there is a chef making food as people 
watch", "An empty kitchen with lots of tile blue counter top space.", "there is a man sticking his head out of a train window", "a tiled bathroom with a toilet and scale in it ", "A bunch of airplanes lined up in a row at an airport.", "A desk sitting next to a showroom of cars in it.", "An elderly man is sitting on a couch.", "a bunch of glasses with some food inside of it ", "A city at night filled with lots of traffic.", "Careful bicycle riders add florescents to their clothes for safety in the dark.", "this is a dark picture of a large kitchen", "A white toilet sitting next to a shower in a bathroom.", "A crowd of people watching an airplane on a runway.", "A man sitting on a black and yellow bench on the phone.", "A woman taking a photo over the shoulder of a man on a bike.", "this kitchen has a white and black stove in it", "The dirt bike has seen many hill climbs in its history.", "A plane flies in the sky in front of a silhouette of a moon.", "a cluttered room with a table and shelf on the wall.", "there are many men playing soccer in a field", "A woman forks vegetables out of a bowl into her mouth. ", "A woman taking a picture of herself in a mirror.", "A couple of men riding a motorcycle down a street.", "Portable toilet in a wooden box area of a field.", "A motorcycle bike leaning against a white trailer.", "The view of a bathroom tub, shower, and toilet.", "A bathroom with a toilet and a scale.", "A jet flies in the distance with the moon in the background. 
", "there is a man wearing a suit sitting on a bench", "A group of Navy cooks standing around a giant cake.", "A group of people in suits standing in a kitchen.", "A white toilet sitting under a window next to a chair.", "A dog is staring at a picture on a flat screen tv.", "A man is sitting on a public bench on a busy city street.", "Man talking on personal cell phone on a yellow and black bench.", "a counter top with food sitting on some towels", "A bathroom with a sink, vanity and shower stall.", "A view of a very dark lit kitchen from the other side of the room.", "A motorcycle parked on a sidewalk near a street light.", "There is traffic on a busy city street. ", "a blue bicycle a blender sand and a person", "Two people on a motorcycle with tone taking a photo", "The cat is sitting on the old butcher block.", "a dirt bike laying against a trailer in a grassy field", "A tabby cat sleeping on a wooden island in an old looking kitchen.", "view of tall city buildings with cars and people walking by", "a man standing in front of a big display case of donuts ", "Woman walking down the side walk of a busy night city.", "A single propellor aircraft that is parked on an airport apron with vehicles and another plane in the background.", "Woman eating an assortment of mixed vegetables in a bowl.", "There are orange slices in canning jars without lids.", "a bathroom with a stand up shower and tub.", "some people driving down the road with their bikes ", "A brown cat crouches and arches its back in a white sink.", "A group of waiters standing in a line. 
", "A beach area with a bicycle that has a blender attached to the front, parked on the sand.", "Meats being prepared for cooking on kitchen counter.", "A woman sits under the sheet on a mattress on the floor.", "Large dog looking at television show in living room.", "A man driving a motorcycle with a woman holding a cell phone.", "A young woman standing in a kitchen eats a plate of vegetables.", "The motorcyclist in a helmet is looking over the side of a bridge. ", "At night on a street with a group of a bicycle riders riding down the road together.", "The woman sitting at the table looks bored.", "The woman in the kitchen is tending to her food.", "some people holding umbrellas and standing by a car in the rain", "A bike parked in front of a doorway.", "A person is riding his motorcycle on the dirt road.", "A bathroom that has a door just for the toilet area.", "Eight jars are being filled with orange slices. ", "Woman with a motorcycle staring over a bridge at a wetlands. ", "A bathroom has pink tiles and a black toilet.", "A group of people holding umbrellas stand near a car.", "there are two woman that are riding motorcycles ", "Some men and women in white shirts and bow ties standing in a row.", "A container of antibacterial wipes in a bathroom.", "A monitor screen, printer, couch and chair in the room", "A very dimly lite kitchen in someone's house at night.", "this kitchen is very big and has wood cainets", "A bunch of uncooked food on a counter.", "this is a wood table in a cluttered kitchen", "A bathtub that is in a bathroom under a wooden object.", "A bunch of people standing around and posing for a picture.", "A shelf of various cups and glasses mounted to the wall.", "A man standing by his motorcycle is looking out to take in the view. ", "A light that is on above a mirror.", "a bathroom with a tub next to a fancy shower stall ", "A automobile with multiple bicycles on a roof rack. 
", "this small bathroom has white sink and a toilet", "A man standing behind the counter at a doughnut shop.", "a toilet a tub some pipes and a window", "A view of a table with a bunch of cakes and tea on it.", "Large sized kitchen with a dining room section.", "There is a cyclist riding above all the pigeons.", "there is a woman that is cutting a white cake", "Some people are enjoying time on a beach. ", "Pile of strings and books next to a laptop computer.", "A man is standing in front of a case filled with pastries.\n", "A woman marking a cake with the back of a chef's knife. ", "A bicycle that is stored in someone's closet in the apartment. ", "A woman eating fresh vegetables from a bowl.", "A large kitchen with a lot of cabinets and counter space.", " a bathroom with a picture of a bookshelf above the urinals", "Line of men and three woman standing in front of a kitchen.", "kitchen with a wooden kitchen island and checkered floor", "there is a woman staring in the kitchen pouring tea", "this man is riding a board near a field", "A man on a motorcycle riding in the desert.", "A dining room with hard wood floors that is very fancy. 
", "A group of young bicyclists on a city street at night.", "The bath tub and toilet in this bathroom are black.", "Pots and pans that are on the side of a sink.", "there is a all black motorcycle that is parked on the street", "a black toilet in a wood floored bathroom", "The jars on the table are full of oranges.", "A cat sits on an open toilet in a bathroom.", "a female standing in the bathroom and taking a photo with her phone", "two men on a scooter riding down the roadway", "A bathroom, showing the shower, toilet and sink.", "A wooden table sitting in the center of a kitchen.", "a jet airplane sitting on a runway next to a building", "There is an airplane on the runway in the distance.", "a group of people sitting on the sand with a lake in the background", "A small powder room with a sink and vanity, toilet, mirror, and an empty towel bar.", "Various kitchen dishes are arranged on many different shelves. ", "A man on a bicycle above spectator stands, where pigeons graze.", "There is a cat standing on the toilet seat.", "A man in a helmet and jacket riding a motorcycle in the desert.", "Many objects are sitting on a counter in a kitchen.", "a man sitting on a motorcycle in the desert", "A line of urinals against a wall with bookshelves above.", "A woman holding a colorful kite on top of a green field.", "a bathroom with tiled floor and a circular window ", "this is a bench out near a field", "a bathroom view of a tiolet and sink ", "A white sink sitting under a mirror next to a toilet.", "A woman sitting at a table next to an umbrella.", "A woman standing in a kitchen baking bread.", "A series of shelves holding colorful glassware and dishes.", "a guy in the desert sitting on his motorcycle", "A kitchen in a camp with gear and coats laid out.", "Three people sit on a bench looking out over the water. ", "A person on a bike is next to a train on the tracks. 
", "a group of boys playing in a field next to a forrest", " A blender sitting on top of a table.", "A public restroom with toilet, sink and a grab bar.", "A very tall clock tower sitting above a building.", "five restaurant wait staff and two mangers ", "A plane traveling down a run way, near the highway.", "Two people standing in a small kitchen with an arched passage.", "A man rides a motorcycle down a dirt road. ", "i table filled with cups and a plate of food.", "A motorcycle parked on a stone cobble road, in the sun.", "A man standing in front of a bunch of doughnuts.", "Open shelves hold an assortment of glasses, cups, and bowls. ", "Old photo of man sitting on his motorcycle", "Two people ride motorcycles down a city street.", "a mirror a sink a toilet and a blue basket", "A family riding their bikes next to the streetlight. ", "A top down view of a bathroom with a scale and toilet.", "Dessert for two is placed on a table.", "An intersection with cars is pictured in this image.", "A table topped with lots of food and drinks.", "An old motorcycle with a side car attached.", "the kitchen has a stove and sink with pots and pans", "The urinals are sitting below the shelves full of books,", "A cat sitting inside a sink in a bathroom ", "A kitchen with a lot of kitchen furniture and accessories", "A bicycle sits parked in front of a bookstore.", "A colorful kite is ready for launch on a blue sky day", "A cyclist pedals past a flock of birds perched on a grating.", "A mirror shows another light in a background of a wonderful bathroom", "a white table with sandwiches and cups of tea and people and sivlerware", "A bike rider traveling down a road, in the desert.", "Two kittens are cuddling and enjoying a soft pillow", "A man on a skateboard rides down a narrow road.", "A kitchen with a stove, table, cabinets, and other items ", "A plane flies in the sky passing over the moon.", "A kitchen with many of the appliances removed with blue and white tile.", "A bathroom 
with sink, toilet, and bathtub and black and white floor tiles.", "this is a red bike on a dirt path", "A man waits to cross the railroad tracks as two trains cross.", "A person rides an electric bike on a desert trail.", "A bathroom with a sink and other items. ", "this is a toilet and trash can and a sink", "A stop sign sits in front of a billboard in a quiet area.", "A bike parked in front of a book shelf.", "A view of a kitchen with a burner top stove.", "a black toilet some toilet paper and brown tiles", "A boy wearing a suit riding a skateboard down the road", "a bathroom with a big mirror above the sink", "A TV sitting on top of a counter inside of a store.", "a bathroom with a glass sink base with a bowl on top", "A motorcycle with a flat rear tire sits in a workshop, while a person stands behind it, facing away from the camera.", "A bathroom with a toilet, sink and shower stall.", "A busy street with cars and buses on it.", "some people an airport a runway and a jet", "A man and a woman looking at cell phones.", "A man with a fro riding a skateboard down a road.", "A kitchen with and island and several counters in it.", "A couple of sinks with brown tile and a decorative mirror.", "A view of a messy room, with shelves on the wall.", "The motorcycle is parked on the side of the paved road. ", "a bathroom view of a stand up shower and toilet with a sink near by", "A bathroom with a toilet and a scale on the floor.", "a couple of people standing inside a kitchen.", "There are a lot of cupboards and refrigerator in the room. 
", "Night is falling on an empty city street.", "A vintage antique motorcycle sitting in a shop being worked on.", "a black gray and white cat a toilet sink and mirror", "The plane is taking off into the yellow sky.", "A curly haired boy rides a skateboard down a road.", "A woman is seen in the rear view mirror of a motorcycle.", "A woman pouring coffee into cups on a counter.", "a bike resting in the sand with a blender built on top", "A PICTURE OF A KITCHEN WITH TILE COUNTER TOP", "A walk in shower sitting next to a bath tub.", "A city street filled with lots of traffic.", "Person riding a four wheeler on a beach towards a bridge.", "A PICTURE OF A BATHROOM WITH SLIDING SHOWER ", "Black motorcycle with a side car in the middle of the street. ", "Group of people standing around each other in the middle of a city street. ", "a kitchen with a stove sitting on a hard wood floor and cabinets", "A man walking around with his dog and sheep.", "there are two cats that are laying inside of a tub", "there is a small dog that is looking threw the glass", "A man taking a picture of himself in front of three huge beer bottles", "A PICTURE OF A MAN WITH BEER BEHIND HIM ", "there is a small out house that is made of wood", "Several people smile for the camera at night.", "this is a bathroom that has a sink and toilet", "A black dog sitting in front of a TV.", "A PICTURE OF ALL WHITE IN A BATHROOM ", "A person wearing a safety vest rides a horse up the staircase.", "a man sitting in a chair on a tiled floor next to a heater", "this is a clock on top of a tower", "Bike leaned against a wall of books inside and establishment.", "this is a group of people standing near a river\n", "a bathroom with a toilet and sink and a bath tub sitting on a hardfloor", "A person rides a vehicle on the beach.", "this kitchen has a white stove and all white cabinets", "Three people sit on a bench together facing away.", "Two cats sitting together in an empty bathtub.", "A puppy staring through a 
red sectioned window.", "a toilet sitting on a tiled floor in enclosed bathroom stall", "People are walking and cars are driving in a city.", "A kitchen with a stove, microwave and cabinets.", "Two teams compete at a sport in a park.", "A bathroom with a toilet, counter, and mirror.", "A lidless toilet is shown caked in dirt or other filth.", "A tiled bathroom is shown with a compact style toilet.", "there is a police man riding a tav on the beach", "A small white car with a small white dog riding in it.", "a toilet a sink a towel a light and a mirror", "A man riding an ATV next to the ocean.", "Two people riding a motorcycle near a group of people.", "A group of people walking down a walkway.", "a woman a white mat and pillow and white wall", "A man riding a motorcycle down a road near a forest.", "A little girl holding a brown stuffed animal.", "there is a very beautiful view out of this bathroom window", "A dog stands close to a television looking at it.", "this bathroom is very big and has lots of room", "A pair of cats sit in an empty bathtub.", "A dog looks through ribbed glass in a red door.", "there is a old black motorcycle inside of a garage", "A motorcyclist parked near a railing looks out over the water.", "Two people are looking at a truck while a dog is being walked.", "Several people are seen sitting around and smoking.", "this is a man sitting on a green couch", "A juicer attached to the top of a bike.", "A cramped bathroom with a sink in the corner.", "A crowded street filled with British traffic and buses.", "A small clock is seen on the side of a church.", "this is an airplane sitting on the runway", "A girl is holding a large kite on a grassy field.", "Two people sitting on a motorcycle that parked on the road.", "A composite image of an office desk, cars and buildings.", "A man wearing a hat in front of large bottles.", "a church with a tall tower with a clock built into it", "A group of young men jump in the air playing a game.", "some shelves 
filled with bowls and cups ", "A dog sits in front of and watches the television.", "An old rusting toilet with the lid up. ", "some peeled oranges sitting in a clear blender", "Two people standing in a kitchen near a stove.", "A young girl walking barefoot carries a stuffed animal.", "a man reflected in a rear view mirror of a motorcycle", "A native American couple on a bike pose for a photo.", "A person laying on a bathtub with their feet sticking out.", "A wooden outhouse sitting in the grass near trees.", "A toilet sitting in a stall on tile.", "A kite flying in the sky on a cloudy day.", "Two cats occupy a bathtub, one sitting and one lying down. ", "A dog looks out through a lined window. ", "A group of people are standing in the snow on skis.", "A person is sitting on a motorcycle looking in the mirror.", "a bathroom wall missing some pink wall tiles ", "A man walking across a field holding a wand near a dog.", "a bathroom with towels under a sink and a big mirror above it", "Adults and children gather near a dock on the beach.", "A green, red, yellow and blue kite fly's through the sky.", "a man standing next to a laptop and bottles of beer", "two cats resting side by side on a bed", "A large jetliner sitting on top of an airport runway.", "A sad woman laying on a mattress on a hardwood floor.", "A red bus parked next to a crowd of people.", "Looking through the window of showroom at car dealership.", "Two people standing next to each other in a kitchen.", "a bathroom view of a sink toilet on a tiled floor", "a group of people standing in the snow with gear on", "a church with a clock built into the side of it", "A daytime view of a messy kitchen corner.", "a colorful kite flying high on a cloudy day", "A man riding a red scooter down the street.", "An old toilet outside against an old painted wall.", "A kitchen counter top with a white bowl sitting next to another white bowl.", "A toilet filled with nasty grime sitting up against a bathroom wall.", "A 
woman standing between a motor bike and a striped wall over a river.", "A group of people sitting on top of a bench.", "A group of men standing around a luggage cart.", "a person in a bathroom having a reflection in the mirror", "a kitchen with a microwave, a stove, and cabinets.", "a street with cars lined with poles and wires.", " two men and one woman standing in a kitchen", "A dark and cluttered storage area with wood walls.", "Three people sitting on a bench looking at the ocean.", "people sitting on a bench facing the water.", "a cat sitting in a sink with its eyes open", "two cars parked side by side on a show room floor", "two cats chill in the bathtub one is laying down", "a dog who looks sad stares outside of the window of a red door ", "a lady holding a kite and walking in a grassy area", "An old toilet with a rotten lid next to a rusted pipe.", "some piled oranges in a glass blender ready to be blended", "A room with a chair and pictures mounted on the wall. ", "a sink in a bathroom with a shaver and personal hygeine items on the counter top", "a street with people and vehicles in the middle of it", "a room showing a wooden table and a capboard", "Several cars parked near a desk holding a computer.", "A parked white car with and open door and a dog inside.", "A black motorcycle with a sidecar parked on cobblestone.", "a group of vehicles parked next to a firehydrant", "A woman standing on grass holding a colorful kite.", "A white kitchen with a gas stove and microwave.", "a motor bike carrying very many people on the street", "A row of urinals with a well-stocked bookshelf in front. ", "A woman lying on a thin mattress on the floor with her knees up.", "A plane riding down a runway of an airport.", "a cake and two spoons on a plate", "Toilet in a bathroom in an international location with a basket.", "Group of horses in a field with a pinto in the foreground.", "A dim lit room consisting of many objects put together. 
", "a man standing in a bathroom looking into a mirror", "a street view of people walking down the sidewalk ", "A airplane sitting on a runway at a small airport.", "A room with a sink and a skeleton foot. ", "Sun shining through the blinds into a white bathroom.", "A man in a blue hat on a bike behind a train. ", "An airplane on the runway of an airport.", "Two motorcycles going down a city street with woman drivers", "A black and white still life of a branch with flowers in a vase", "A clean odd little bathroom with a white porcelain toilet.", "A paint horse and other breeds in the background grazing in a green field.", "A kitchen area that has items on the counter tops. ", "a tall red bus is by the curb in a city", "A restroom with a toilet and a mirror. ", "A tiled floor bathroom with a red and black shower curtain.", "An older woman pouring tea in the kitchen.", "A bicycle is placed behind an open door.", "A man and two dogs are riding a scooter.", "A busy street with traffic moving in both directions and several two level buses on the street with people around.", "A bathroom has a toilet and a scale.", "A bathroom with outdated fixtures and a clothes hamper in the middle of the floor.", "The tiles are falling off the wall in this old bathroom", "A dog sits in a white car with the door open.", "a white toilet is in the corner of a bathroom", "Top view of a few skinned oranges inside of a blender", "a sink well cleaned and some drawers and hand wash", "a bunch of people are standing on a snowy hill", "View of toilet with a dirty lid and a missing cover to it's tank", "A street with a few people walking and cars in the road. 
", "a coupe of people are sitting outside on a bench", "A little girl is carrying a stuffed animal.", "A man and a woman are riding a motorcycle.", "a couple of bathroom items sitting on a sink", "a couple of motorcyclists are driving down the road", "three people sitting on a motorcycle in a street", "a couple of vehicles are parked in a lot", "A messy kitchen with dirty dishes and white cabinets", "A minimalist room features white appliances and beige walls.", "A kichen with dirty dishes in the sink.", "A table with a plate holding several sandwiches, tea cups and condiments. ", "A bathroom sink that is surrounded by various toiletries.", "Two motorcycles are parked on the shoulder of a mountainous freeway.", "An intersection is shown on a cloudy day.", "Pink bike sits on a guard rail by the river.", "people on the street with their cars moving", "This is a state of the art bathroom where the appliances don't look like they should", "The man who uses this bathroom shaved this morning", "A bathroom with a white toilet, tub, and tile floor.", "A man riding a scooter with a dog on it. ", "a t.v. that is sitting on a shelf with some lights near by", "a bath room with its door open and light on", "a bike that is leaning up against a book rack", "Motorcycles parked in a row in the street. ", "A group of people are standing together at night.", "a little white car that has a dog in it", "A man is standing in a field with a dog and goat.", "a bunch of different electronics all on one big pile. ", "Two motorcycles sit on the side of a secluded road.", "a airplane that is on a runway by some grass", "A pink bicycle leaning against a green railing next to a canal.", "Three people on a motor bile that is riding in a street, with one of them wearing a helmet.", "A man looks at himself in the mirror of a motorcycle.", "a room filled with white furniture and books on the ground. 
", "A bicycle leaned against the hallway wall in a house", "two different kinds of lights in a bath room. ", "a room with wood and ivory furniture inside. ", "THERE IS A PLATE WITH SWEET DESSERTS ON THE PLATE ", "a road sign showing stop and a vehicle moving", "Two lights shine above a messy bathroom toilet.", "A chair sits against a wall in a wood floored room.", "a couple of men that are next to some boxes", "Several people are standing around watching a band perform on stage.", "A small bathroom has a port hole window.", "A young girl with a stuffed toy in a park.", "A black and whit cat sitting in a sink.", "A person is taking a picture of a bathroom with a toilet in it.", "Some cakes are on a white plate with spoons.", "A man looks into the mirror as he styles his hair.", "a couple of sinks in a bright colored bathroom", "a man on a motorcycle that is in some grass", "Some people are next to a pier on the sand.", "A car is illegally parked near a fire hydrant.", "A couple of dead, stuffed giraffe on display.", "A quaint toilet in a room with no door, a chair sitting outside of the area.", "A green and blue motorcycle parked on the side of a road.", "A very simple bathroom with beige and cream colored decor.", "a man in a room with a camera with a toilet", "A modern bathroom with a toilet and sink area.", "A man on his bicycle waits for two trains to pass by.", "a bath room with a trash can next to the tolit. ", "A purple bicycle is parked on a fence next to a river.", "A kitchen with a lot of counter space, a sink, stove and refrigerator in it. 
", "A man on a motorcycle is looking in his mirror.", "A little red headed girl walking with a stuffed puppy.", "A white towel is at the edge of a white bathtub.", "A bathroom is shown with a glass counter and cone-shaped sink.", "A man walking down the street with a cane while others sit on a bench.", "a motorcycle that is parked in side a buliding", "A view from a bus shows people on bicycles and another bus in traffic.", "a bathroom that has a tub and a shower", "a vase with a flower growing very well", "a small little toilet that is in a corner", "a couple of horse that are eating some grass", "a man that is riding a motorcycle on a road", "a couple of motorcycles are off the side of the street", "A tea kettle sits on the burner of stove.", "a black cat that is sitting in a sink", "a room tha has a toilet and a sink in it", "A blender filled with three peeled oranges sitting on a counter.", "a couple of motorcycles that are next to a road", "A man is in a yard on a motorcycle.", "A truck traveling down the street near a fire hydrant.", "Two small cats are sleeping on white sheets.", "a group of people with bikes posing for a photo ", "Toilet with raised lid with tub and chair in old bathroom. ", "A person riding a four wheel on the beach.", "a bright light sitting in front of a tv ", "Clocks are brightly lit on a huge tower.", "a group of people that are smoking on a bench", "A bathroom with shower stall, toilet, and bathtub.", "A man is training a sheepdog for a sheepdog trial.", "Looking down on a stony surface shows a bowl with an orange in it and what looks like a large piece of red plastic.", "Two motorcycles ride down a street in a city.", "A little girl is making a huge mess with a birthday cake. 
", "A very large kitchen area in a building.", "A yellow bike sits on a wall in the hallway.", "This is a photo of someones bathroom in their home and there are feet hanging out the side of the tub.", "a small little bathroom with a toilet in it", "Asian man and woman sitting and looking at cell phones", "Someone is juicing an orange on a juicer.", "A bicycle leaned against an outdoor magazine stand.", "A black and white photo of a steam of flowers inside a vase.", "A bathroom with white toliet and sink visible", "A kitchen with tile back splash and stainless steel appliances.", "Looking through a door and seeing a toilet and sink.", "Some guys are standing over an old antique truck and someone is walking a dog nearby. ", "A large jetliner sitting on top of a tarmac.", "A baby with a bib eats a cake.", "A stop sign out in the middle of nowhere ", "A group of police officer standing in front of a red bus.", "A woman holding two rainbow slices of cake.", "A group of Frisbee players are running around a field. ", "A toilet that has been covered in filth.", "The clock on the side of the metal building is gold and black. ", "The motorcyclist has his hands at his side while riding swiftly down the road. ", "A modern restroom with a weird looking sink, toilet, and shower.", "Small groups of people, including a person walking a dog, are scattered about an outdoor area, encompassing some streets, that is filled with classic cars. 
", "a bunch of crates on a air plane run way", "A sky view looking up at a jumbo jet plane.", "A bathroom showing toilet, sink, and shower ", " room with a book and a white carpet", "A scooter with a helmet hanging off it's handlebars.", "A truck driving on a crowded street past several parked cars.", "A bunch of people walking around in a street", "people riding bikes near a beach and others swimming", "Two kittens curled up in a white sheet that looks soft.", "A cat laying on the seat of a motorcycle ", "a piece of orange in a bowl next to a concrete edge ", "A road lined with rock-face shows a man and a woman, both wearing hats, astride a red, white and blue decorated bike. ", "A kitchen has white cabinets and stainless steel appliances.", "A crowd of people walking and riding their bikes.", "A crowd of people are gathered outdoors on the street.", "Sheepherders move their sheep across a highway as vehicular traffic passes between their flock.", "A bathroom with sink, toilet, and tub ", "A crowd of people at an outdoor concert.", "A woman sitting on a bench with cars behind her.", "A cat is alseep on a motorcycle seat.", "A kite flying in a partly cloudy sky ", "white toilet and sink with mirror on white wall", "A small kitchen is shown with a stove, dishwasher and sink.", "A toilet that is has been colored black.", "A small baby bird on a piece of metal.", "a jet airliner wing that has two jet engines", "two sinks under a mirror and a light on a wall", "human hands juicing an orange on a counter top", "young man looking a different image of himself in the mirror", "A man riding a bike down a dirt road.", "a black and white photo with a vase and flower coming out of it", "Three people are riding down the street on one motorcycle. 
", "Seven people on a biking trip in front of a large city.", "A wooden table sitting in the middle of a room.", "Three bikers by a red bus on the street.", "Three people are standing in the same kitchen area.", "A view of wing with two jet engines are on a runway while people watch.", "Man and dog on scooter in city street on sunny day.", "Two Asian people inside a train looking at their mobile phones.", "A white toilet tin a bathroom sitting next to a sink.", "The view of a restroom toilet, and sink area.", "The motorcycle is tilting as he turns through a cave. ", "A view of an airplane traveling across the bright sky.", "The kitchen counter and sink have dishes on them.", "A tower with a clock is displayed in the evening.", "A giraffe and fence design are painted onto the wall.", "A man wearing a helmet posing on top of a motorcycle.", "A very large black and gold clock mounted to the side of a building.", "A man riding on the back of a motorcycle down a highway.", "A huge commercial airplane goes down the landing strip.", "A bike is chained to the post on the sidewalk", "The black and white cat is sitting in a bathroom sink.", "The show girl is posing on a blue motorcycle on display. ", "Two fake looking giraffes are on display at an exhibit.", "A young baby is eating and playing with some cake.", "Two adults and a child ride a motorcycle together.", "A small eating area with a table and cabinets next to a window.", "A small wooden toy car has an elephant sitting inside.", "Cement ledge with orange in bowl and red plastic bag below. ", "An old classic church is in front a big blue sky.", "Kitchen area with modern appliances and plenty of cabinets.", "A man with a baseball cap and glasses seated in front of three large beer bottles.", "A bathroom with a small sink and toilet. 
", "A bathroom with mirror, toilet, and sink ", "This is a photo of someones bathroom in their home.", "A child in a booster chair eating a cake ", "A woman sitting on top of a purple motorcycle.", "A bathroom scene with a toilet and a sink.", "The top of a steeped church building with clocks and small windows. ", "Two messy toilet stalls with toilets where one lid is raised. ", "a man wearing a helmet while riding a motorcycle ", "Man with golf club and a dog and a goat", "Two turbines on the wing of an airplane", "An empty bench along a sidewalk in neighborhood.", "there is a man riding a bike up the road", "A brick ally way with an old wooden bench with people sitting and smoking on it. ", "there is a very tall giraffe inside of a building", "A couple of airplanes sitting on top of a runway.", "a bathroom with a littlt tub and a clothes hamper by the toilet", "A large jet flying through a cloudy blue sky.", "A close up of the face of a clock on a building.", "A man riding on a motorcycle on the road.", "there is a very large black and gold clock on a building", "there is a man riding a motorcycle and not holding the handles", "there is a person making freshly squeezed orange juice", "A man riding on the back of a motorcycle on top of a grass field.", "A christmas wreath is hanging from the door", "group of bikers posing for a picture ", "A black bench that is by a sidewalk on a street.", "A bottle of wine sitting on top of a table next to a glass of wine.", "A dog sitting in front of an open door looking outside.", "A toy elephant sits in a toy wooden car.", "A group of bikers parked in the middle of a street.", "A wreath with a red bow on it hanging on a white door.", "A white toilet sitting in the corner of a room.", "A lush green field with horses standing on top of it.", "Several cars drive down the road on a cloudy day.", "A crowd of people riding bikes down a street.", "there is a woman sitting on a bench in front of cars", "There is an orange in the cup 
and a bag in the water.", "an empty bench sitting on the side of a sidewalk", "A person sits on a motorcycle while wearing riding gear.", "A plane is on display near the water.", "A mans reflection in a side view mirror.", "Two people wearing hats riding a motorcycle together.", "there is a dog that is sitting in a car", "A couple of white bathroom sinks mounted to a wall.", "A pink bicycle leaning against a fence near a river.", "there is a man crossing the tracks on a bike", "tan colored bathroom with white toilet and mirror", "A closed toilet seat in a bathroom next to a checkered curtain.", "A bathroom vanity with a large mirror hanging on the wall", "A colorful kite flying in a cloudy blue sky.", "A road with two vehicles out in the middle of nowhere with animals climbing up a hill on the left.", "The numbers and hands on the clock are gold.", "The man on the motorcycle does not have his hands on the handlebars.", "A white stove top oven inside of a kitchen.", "A line of motorcycles parked on the side of a street.", "A small elephant toy sitting inside of a wooden car.", "some one in the bath room laying in the bath", "Men are unloading the trolley of luggage on the runway.", "A small bird sitting in a metal wheel ", "Someone is riding a motorcycle through a grassy field. ", "a bunch of people in a kitchen getting food ready", "A billboard posed by the side of a street in a rural town.", "A picture of a man sitting on a motorcycle on a dirt road.", "A woman juicing oranges on top of a manual juicer.", "A small cute cat sitting in the bathroom sink.", "Several people standing next to each other that are snow skiing.", "some cut up fruit is sitting in a blender", "A small and plain white bathroom with a toilet and a tub.", "A man in riding gear, riding a red motorcycle down a road.", "A bunch of airplanes are parked on the runway. 
", "this plane has two large fans on its wings", "This is a photo of a bathroom in someones home.", "This is a large statue in someones living room.", "An old propeller airplane is displayed near the water.", "A man and a woman using their cellphones simultaneously.", "there are many people walking along this street", "A kitchen showing marble tile and wood cabinets.", "A line of motorcycles are all parked next to each other.", "A man, woman, and child preparing food in a kitchen.", "A black and white photo of a flowing growing out of a vase.", "A passenger jet being serviced on a runway in an airport.", "Three people are preparing a meal in a small kitchen.", "A pair of planes parked in a small rural airfield.", "A bathroom with a stand alone shower and a peep window.", "Several vehicles with pieces of luggage on them with planes off to the side.", "a black motorcycle is parked by the side of the road", "A small bathroom with a tub, toilet, sink, and a laundry basket are shown.", "A bus stopped on the side of the road while people board it.", "A bunch of people posing with some bikes.", "a jet engine on the wing of a plane", "A bunch of bicycles parked on the street with items sitting around them ", "A dog standing in front of a doorway.", "Two small planes sitting near each other on a run way.", "there is a bus that has a bike attached to the front", "A bird that is sitting in the rim of a tire.", "The black motorcycle is parked on the sidewalk.", "A corner of a rest room with a big shower.", "a dog with a plate of food on the ground", "there is a very large plane that is stopped at the airport ", "Bicycles with back packs parked in a public place.", "A white walled bathroom features beige appliances and furniture.", "Several bicycles sit parked nest to each other.", "Some big commercial planes all parked by each other.", "a woman holding a plate of cake in her hand", "yellow and red motorcycle with a man riding on it next to grass", "A motorcycle stands in front 
of three people on a sidewalk.", "classic cars on a city street with people and a dog", "People getting on a bus in the city", "A large commercial airliner silhoetted in the sun.", "Residential bathroom with modern design and tile floor.", "a bus with a view of a lot of traffic and the back of another bus with a billboard on the back end", "A young man riding through the air on top of a skateboard.", "A toy elephant is sitting inside a wooden car toy.", "A motorized bicycle covered with greens and beans.", "A man sitting at a table in front of bowls of spices.", "there is a bathroom that has a lot of things on the floor", "A passenger jet aircraft flying in the sky.", "An eye level counter-view shows blue tile, a faucet, dish scrubbers, bowls, a squirt bottle and similar kitchen items. ", "A TV sitting on top of a wooden stand.", "A person sitting on a motorcycle in the grass.", "A white toilet in a generic public bathroom stall.", "a couple of people in uniforms are sitting together", "A group of giraffe standing around each other.", "Street merchant with bowls of grains and other products. 
", "A man driving a luggage cart sitting on top of a runway.", "Residential bathroom with commode and shower and plain white walls.", "Ornate archway inset with matching fireplace in room.", "there is a red bus that has a mans face on it", "a wooden skate with a toy elephant inside of it ", "a bunch of people on skiing on a hill"] -------------------------------------------------------------------------------- /alignment/assets/activities.txt: -------------------------------------------------------------------------------- 1 | washing the dishes 2 | riding a bike 3 | playing chess -------------------------------------------------------------------------------- /alignment/assets/activities_v0.txt: -------------------------------------------------------------------------------- 1 | washing the dishes 2 | riding a bike 3 | playing chess -------------------------------------------------------------------------------- /alignment/assets/drawbench.json: -------------------------------------------------------------------------------- 1 | { 2 | "A red colored car.": { 3 | "category": "Colors" 4 | }, 5 | "A black colored car.": { 6 | "category": "Colors" 7 | }, 8 | "A pink colored car.": { 9 | "category": "Colors" 10 | }, 11 | "A black colored dog.": { 12 | "category": "Colors" 13 | }, 14 | "A red colored dog.": { 15 | "category": "Colors" 16 | }, 17 | "A blue colored dog.": { 18 | "category": "Colors" 19 | }, 20 | "A green colored banana.": { 21 | "category": "Colors" 22 | }, 23 | "A red colored banana.": { 24 | "category": "Colors" 25 | }, 26 | "A black colored banana.": { 27 | "category": "Colors" 28 | }, 29 | "A white colored sandwich.": { 30 | "category": "Colors" 31 | }, 32 | "A black colored sandwich.": { 33 | "category": "Colors" 34 | }, 35 | "An orange colored sandwich.": { 36 | "category": "Colors" 37 | }, 38 | "A pink colored giraffe.": { 39 | "category": "Colors" 40 | }, 41 | "A yellow colored giraffe.": { 42 | "category": "Colors" 43 | }, 44 | "A brown colored 
giraffe.": { 45 | "category": "Colors" 46 | }, 47 | "A red car and a white sheep.": { 48 | "category": "Colors" 49 | }, 50 | "A blue bird and a brown bear.": { 51 | "category": "Colors" 52 | }, 53 | "A green apple and a black backpack.": { 54 | "category": "Colors" 55 | }, 56 | "A green cup and a blue cell phone.": { 57 | "category": "Colors" 58 | }, 59 | "A yellow book and a red vase.": { 60 | "category": "Colors" 61 | }, 62 | "A white car and a red sheep.": { 63 | "category": "Colors" 64 | }, 65 | "A brown bird and a blue bear.": { 66 | "category": "Colors" 67 | }, 68 | "A black apple and a green backpack.": { 69 | "category": "Colors" 70 | }, 71 | "A blue cup and a green cell phone.": { 72 | "category": "Colors" 73 | }, 74 | "A red book and a yellow vase.": { 75 | "category": "Colors" 76 | }, 77 | "A horse riding an astronaut.": { 78 | "category": "Conflicting" 79 | }, 80 | "A pizza cooking an oven.": { 81 | "category": "Conflicting" 82 | }, 83 | "A bird scaring a scarecrow.": { 84 | "category": "Conflicting" 85 | }, 86 | "A blue coloured pizza.": { 87 | "category": "Conflicting" 88 | }, 89 | "Hovering cow abducting aliens.": { 90 | "category": "Conflicting" 91 | }, 92 | "A panda making latte art.": { 93 | "category": "Conflicting" 94 | }, 95 | "A shark in the desert.": { 96 | "category": "Conflicting" 97 | }, 98 | "An elephant under the sea.": { 99 | "category": "Conflicting" 100 | }, 101 | "Rainbow coloured penguin.": { 102 | "category": "Conflicting" 103 | }, 104 | "A fish eating a pelican.": { 105 | "category": "Conflicting" 106 | }, 107 | "One car on the street.": { 108 | "category": "Counting" 109 | }, 110 | "Two cars on the street.": { 111 | "category": "Counting" 112 | }, 113 | "Three cars on the street.": { 114 | "category": "Counting" 115 | }, 116 | "Four cars on the street.": { 117 | "category": "Counting" 118 | }, 119 | "Five cars on the street.": { 120 | "category": "Counting" 121 | }, 122 | "One dog on the street.": { 123 | "category": "Counting" 
124 | }, 125 | "Two dogs on the street.": { 126 | "category": "Counting" 127 | }, 128 | "Three dogs on the street.": { 129 | "category": "Counting" 130 | }, 131 | "Four dogs on the street.": { 132 | "category": "Counting" 133 | }, 134 | "Five dogs on the street.": { 135 | "category": "Counting" 136 | }, 137 | "One cat and one dog sitting on the grass.": { 138 | "category": "Counting" 139 | }, 140 | "One cat and two dogs sitting on the grass.": { 141 | "category": "Counting" 142 | }, 143 | "One cat and three dogs sitting on the grass.": { 144 | "category": "Counting" 145 | }, 146 | "Two cats and one dog sitting on the grass.": { 147 | "category": "Counting" 148 | }, 149 | "Two cats and two dogs sitting on the grass.": { 150 | "category": "Counting" 151 | }, 152 | "Two cats and three dogs sitting on the grass.": { 153 | "category": "Counting" 154 | }, 155 | "Three cats and one dog sitting on the grass.": { 156 | "category": "Counting" 157 | }, 158 | "Three cats and two dogs sitting on the grass.": { 159 | "category": "Counting" 160 | }, 161 | "Three cats and three dogs sitting on the grass.": { 162 | "category": "Counting" 163 | }, 164 | "A triangular purple flower pot. A purple flower pot in the shape of a triangle.": { 165 | "category": "DALL-E" 166 | }, 167 | "A triangular orange picture frame. An orange picture frame in the shape of a triangle.": { 168 | "category": "DALL-E" 169 | }, 170 | "A triangular pink stop sign. A pink stop sign in the shape of a triangle.": { 171 | "category": "DALL-E" 172 | }, 173 | "A cube made of denim. A cube with the texture of denim.": { 174 | "category": "DALL-E" 175 | }, 176 | "A sphere made of kitchen tile. A sphere with the texture of kitchen tile.": { 177 | "category": "DALL-E" 178 | }, 179 | "A cube made of brick. 
A cube with the texture of brick.": { 180 | "category": "DALL-E" 181 | }, 182 | "A collection of nail is sitting on a table.": { 183 | "category": "DALL-E" 184 | }, 185 | "A single clock is sitting on a table.": { 186 | "category": "DALL-E" 187 | }, 188 | "A couple of glasses are sitting on a table.": { 189 | "category": "DALL-E" 190 | }, 191 | "An illustration of a large red elephant sitting on a small blue mouse.": { 192 | "category": "DALL-E" 193 | }, 194 | "An illustration of a small green elephant standing behind a large red mouse.": { 195 | "category": "DALL-E" 196 | }, 197 | "A small blue book sitting on a large red book.": { 198 | "category": "DALL-E" 199 | }, 200 | "A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.": { 201 | "category": "DALL-E" 202 | }, 203 | "A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.": { 204 | "category": "DALL-E" 205 | }, 206 | "A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. 
The blue book is on the bottom.": { 207 | "category": "DALL-E" 208 | }, 209 | "An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.": { 210 | "category": "DALL-E" 211 | }, 212 | "An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.": { 213 | "category": "DALL-E" 214 | }, 215 | "A fisheye lens view of a turtle sitting in a forest.": { 216 | "category": "DALL-E" 217 | }, 218 | "A side view of an owl sitting in a field.": { 219 | "category": "DALL-E" 220 | }, 221 | "A cross-section view of a brain.": { 222 | "category": "DALL-E" 223 | }, 224 | "A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.": { 225 | "category": "Descriptions" 226 | }, 227 | "A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.": { 228 | "category": "Descriptions" 229 | }, 230 | "A small vessel propelled on water by oars, sails, or an engine.": { 231 | "category": "Descriptions" 232 | }, 233 | "A connection point by which firefighters can tap into a water supply.": { 234 | "category": "Descriptions" 235 | }, 236 | "A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.": { 237 | "category": "Descriptions" 238 | }, 239 | "A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.": { 240 | "category": "Descriptions" 241 | }, 242 | "A separate seat for one person, typically with a back and four legs.": { 243 | "category": "Descriptions" 244 | }, 245 | "An appliance or compartment which is artificially kept cool and used to store food and drink.": { 246 | "category": "Descriptions" 247 | }, 248 | "A mechanical or electrical device for measuring time.": { 249 | "category": "Descriptions" 
250 | }, 251 | "An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.": { 252 | "category": "Descriptions" 253 | }, 254 | "A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.": { 255 | "category": "Descriptions" 256 | }, 257 | "A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.": { 258 | "category": "Descriptions" 259 | }, 260 | "A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.": { 261 | "category": "Descriptions" 262 | }, 263 | "A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.": { 264 | "category": "Descriptions" 265 | }, 266 | "An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.": { 267 | "category": "Descriptions" 268 | }, 269 | "An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.": { 270 | "category": "Descriptions" 271 | }, 272 | "A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. 
The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.": { 273 | "category": "Descriptions" 274 | }, 275 | "A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.": { 276 | "category": "Descriptions" 277 | }, 278 | "A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.": { 279 | "category": "Descriptions" 280 | }, 281 | "A machine resembling a human being and able to replicate certain human movements and functions automatically.": { 282 | "category": "Descriptions" 283 | }, 284 | "Paying for a quarter-sized pizza with a pizza-sized quarter.": { 285 | "category": "Gary Marcus et al. " 286 | }, 287 | "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.": { 288 | "category": "Gary Marcus et al. " 289 | }, 290 | "A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.": { 291 | "category": "Gary Marcus et al. " 292 | }, 293 | "In late afternoon in January in New England, a man stands in the shadow of a maple tree.": { 294 | "category": "Gary Marcus et al. " 295 | }, 296 | "An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.": { 297 | "category": "Gary Marcus et al. " 298 | }, 299 | "A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.": { 300 | "category": "Gary Marcus et al. " 301 | }, 302 | "A pear cut into seven pieces arranged in a ring.": { 303 | "category": "Gary Marcus et al. " 304 | }, 305 | "A donkey and an octopus are playing a game. 
The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.": { 306 | "category": "Gary Marcus et al. " 307 | }, 308 | "Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.": { 309 | "category": "Gary Marcus et al. " 310 | }, 311 | "Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.": { 312 | "category": "Gary Marcus et al. " 313 | }, 314 | "A train on top of a surfboard.": { 315 | "category": "Positional" 316 | }, 317 | "A wine glass on top of a dog.": { 318 | "category": "Positional" 319 | }, 320 | "A bicycle on top of a boat.": { 321 | "category": "Positional" 322 | }, 323 | "An umbrella on top of a spoon.": { 324 | "category": "Positional" 325 | }, 326 | "A laptop on top of a teddy bear.": { 327 | "category": "Positional" 328 | }, 329 | "A giraffe underneath a microwave.": { 330 | "category": "Positional" 331 | }, 332 | "A donut underneath a toilet.": { 333 | "category": "Positional" 334 | }, 335 | "A hair drier underneath a sheep.": { 336 | "category": "Positional" 337 | }, 338 | "A tennis racket underneath a traffic light.": { 339 | "category": "Positional" 340 | }, 341 | "A zebra underneath a broccoli.": { 342 | "category": "Positional" 343 | }, 344 | "A banana on the left of an apple.": { 345 | "category": "Positional" 346 | }, 347 | "A couch on the left of a chair.": { 348 | "category": "Positional" 349 | }, 350 | "A car on the left of a bus.": { 351 | "category": "Positional" 352 | }, 353 | "A cat on the left of a dog.": { 354 | "category": "Positional" 355 | }, 356 | "A carrot on the left of a broccoli.": { 357 | "category": "Positional" 358 | }, 359 | "A pizza on the right of a suitcase.": { 360 | "category": "Positional" 361 | }, 362 | "A cat on the right of a tennis racket.": { 363 | "category": "Positional" 364 | }, 365 | "A stop sign 
on the right of a refrigerator.": { 366 | "category": "Positional" 367 | }, 368 | "A sheep to the right of a wine glass.": { 369 | "category": "Positional" 370 | }, 371 | "A zebra to the right of a fire hydrant.": { 372 | "category": "Positional" 373 | }, 374 | "A church with stained glass windows depicting a hamburger and french fries.": { 375 | "category": "Reddit" 376 | }, 377 | "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.": { 378 | "category": "Reddit" 379 | }, 380 | "A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.": { 381 | "category": "Reddit" 382 | }, 383 | "A photo of a confused grizzly bear in calculus class.": { 384 | "category": "Reddit" 385 | }, 386 | "An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.": { 387 | "category": "Reddit" 388 | }, 389 | "A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.": { 390 | "category": "Reddit" 391 | }, 392 | "A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.": { 393 | "category": "Reddit" 394 | }, 395 | "A 1960s yearbook photo with animals dressed as humans.": { 396 | "category": "Reddit" 397 | }, 398 | "Lego Arnold Schwarzenegger.": { 399 | "category": "Reddit" 400 | }, 401 | "A yellow and black bus cruising through the rainforest.": { 402 | "category": "Reddit" 403 | }, 404 | "A medieval painting of the wifi not working.": { 405 | "category": "Reddit" 406 | }, 407 | "An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. 
Found in the Baths of Trajan, 1506.": { 408 | "category": "Reddit" 409 | }, 410 | "35mm macro shot a kitten licking a baby duck, studio lighting.": { 411 | "category": "Reddit" 412 | }, 413 | "McDonalds Church.": { 414 | "category": "Reddit" 415 | }, 416 | "Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.": { 417 | "category": "Reddit" 418 | }, 419 | "Greek statue of a man tripping over a cat.": { 420 | "category": "Reddit" 421 | }, 422 | "An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.": { 423 | "category": "Reddit" 424 | }, 425 | "Photo of a cat singing in a barbershop quartet.": { 426 | "category": "Reddit" 427 | }, 428 | "A painting by Grant Wood of an astronaut couple, american gothic style.": { 429 | "category": "Reddit" 430 | }, 431 | "An oil painting portrait of the regal Burger King posing with a Whopper.": { 432 | "category": "Reddit" 433 | }, 434 | "A keyboard made of water, the water is made of light, the light is turned off.": { 435 | "category": "Reddit" 436 | }, 437 | "Painting of Mona Lisa but the view is from behind of Mona Lisa.": { 438 | "category": "Reddit" 439 | }, 440 | "Hyper-realistic photo of an abandoned industrial site during a storm.": { 441 | "category": "Reddit" 442 | }, 443 | "A screenshot of an iOS app for ordering different types of milk.": { 444 | "category": "Reddit" 445 | }, 446 | "A real life photography of super mario, 8k Ultra HD.": { 447 | "category": "Reddit" 448 | }, 449 | "Colouring page of large cats climbing the eifel tower in a cyberpunk future.": { 450 | "category": "Reddit" 451 | }, 452 | "Photo of a mega Lego space station inside a kid's bedroom.": { 453 | "category": "Reddit" 454 | }, 455 | "A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.": { 456 | "category": "Reddit" 457 | }, 458 | "A photocopy of a photograph of a painting of a sculpture of a giraffe.": { 459 | "category": 
"Reddit" 460 | }, 461 | "A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.": { 462 | "category": "Reddit" 463 | }, 464 | "A maglev train going vertically downward in high speed, New York Times photojournalism.": { 465 | "category": "Reddit" 466 | }, 467 | "A magnifying glass over a page of a 1950s batman comic.": { 468 | "category": "Reddit" 469 | }, 470 | "A car playing soccer, digital art.": { 471 | "category": "Reddit" 472 | }, 473 | "Darth Vader playing with raccoon in Mars during sunset.": { 474 | "category": "Reddit" 475 | }, 476 | "A 1960s poster warning against climate change.": { 477 | "category": "Reddit" 478 | }, 479 | "Illustration of a mouse using a mushroom as an umbrella.": { 480 | "category": "Reddit" 481 | }, 482 | "A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.": { 483 | "category": "Reddit" 484 | }, 485 | "A pyramid made of falafel with a partial solar eclipse in the background.": { 486 | "category": "Reddit" 487 | }, 488 | "A storefront with 'Hello World' written on it.": { 489 | "category": "Text" 490 | }, 491 | "A storefront with 'Diffusion' written on it.": { 492 | "category": "Text" 493 | }, 494 | "A storefront with 'Text to Image' written on it.": { 495 | "category": "Text" 496 | }, 497 | "A storefront with 'NeurIPS' written on it.": { 498 | "category": "Text" 499 | }, 500 | "A storefront with 'Deep Learning' written on it.": { 501 | "category": "Text" 502 | }, 503 | "A storefront with 'Google Brain Toronto' written on it.": { 504 | "category": "Text" 505 | }, 506 | "A storefront with 'Google Research Pizza Cafe' written on it.": { 507 | "category": "Text" 508 | }, 509 | "A sign that says 'Hello World'.": { 510 | "category": "Text" 511 | }, 512 | "A sign that says 'Diffusion'.": { 513 | "category": "Text" 514 | }, 515 | "A sign that says 'Text to Image'.": { 516 | 
"category": "Text" 517 | }, 518 | "A sign that says 'NeurIPS'.": { 519 | "category": "Text" 520 | }, 521 | "A sign that says 'Deep Learning'.": { 522 | "category": "Text" 523 | }, 524 | "A sign that says 'Google Brain Toronto'.": { 525 | "category": "Text" 526 | }, 527 | "A sign that says 'Google Research Pizza Cafe'.": { 528 | "category": "Text" 529 | }, 530 | "New York Skyline with 'Hello World' written with fireworks on the sky.": { 531 | "category": "Text" 532 | }, 533 | "New York Skyline with 'Diffusion' written with fireworks on the sky.": { 534 | "category": "Text" 535 | }, 536 | "New York Skyline with 'Text to Image' written with fireworks on the sky.": { 537 | "category": "Text" 538 | }, 539 | "New York Skyline with 'NeurIPS' written with fireworks on the sky.": { 540 | "category": "Text" 541 | }, 542 | "New York Skyline with 'Deep Learning' written with fireworks on the sky.": { 543 | "category": "Text" 544 | }, 545 | "New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.": { 546 | "category": "Text" 547 | }, 548 | "New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.": { 549 | "category": "Text" 550 | } 551 | } -------------------------------------------------------------------------------- /alignment/assets/imagenet_classes.txt: -------------------------------------------------------------------------------- 1 | tench, Tinca tinca 2 | goldfish, Carassius auratus 3 | great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias 4 | tiger shark, Galeocerdo cuvieri 5 | hammerhead, hammerhead shark 6 | electric ray, crampfish, numbfish, torpedo 7 | stingray 8 | cock 9 | hen 10 | ostrich, Struthio camelus 11 | brambling, Fringilla montifringilla 12 | goldfinch, Carduelis carduelis 13 | house finch, linnet, Carpodacus mexicanus 14 | junco, snowbird 15 | indigo bunting, indigo finch, indigo bird, Passerina cyanea 16 | robin, American robin, Turdus migratorius 17 | bulbul 18 | jay 
19 | magpie 20 | chickadee 21 | water ouzel, dipper 22 | kite 23 | bald eagle, American eagle, Haliaeetus leucocephalus 24 | vulture 25 | great grey owl, great gray owl, Strix nebulosa 26 | European fire salamander, Salamandra salamandra 27 | common newt, Triturus vulgaris 28 | eft 29 | spotted salamander, Ambystoma maculatum 30 | axolotl, mud puppy, Ambystoma mexicanum 31 | bullfrog, Rana catesbeiana 32 | tree frog, tree-frog 33 | tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui 34 | loggerhead, loggerhead turtle, Caretta caretta 35 | leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea 36 | mud turtle 37 | terrapin 38 | box turtle, box tortoise 39 | banded gecko 40 | common iguana, iguana, Iguana iguana 41 | American chameleon, anole, Anolis carolinensis 42 | whiptail, whiptail lizard 43 | agama 44 | frilled lizard, Chlamydosaurus kingi 45 | alligator lizard 46 | Gila monster, Heloderma suspectum 47 | green lizard, Lacerta viridis 48 | African chameleon, Chamaeleo chamaeleon 49 | Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis 50 | African crocodile, Nile crocodile, Crocodylus niloticus 51 | American alligator, Alligator mississipiensis 52 | triceratops 53 | thunder snake, worm snake, Carphophis amoenus 54 | ringneck snake, ring-necked snake, ring snake 55 | hognose snake, puff adder, sand viper 56 | green snake, grass snake 57 | king snake, kingsnake 58 | garter snake, grass snake 59 | water snake 60 | vine snake 61 | night snake, Hypsiglena torquata 62 | boa constrictor, Constrictor constrictor 63 | rock python, rock snake, Python sebae 64 | Indian cobra, Naja naja 65 | green mamba 66 | sea snake 67 | horned viper, cerastes, sand viper, horned asp, Cerastes cornutus 68 | diamondback, diamondback rattlesnake, Crotalus adamanteus 69 | sidewinder, horned rattlesnake, Crotalus cerastes 70 | trilobite 71 | harvestman, daddy longlegs, Phalangium opilio 72 | scorpion 73 | black and gold garden spider, 
Argiope aurantia 74 | barn spider, Araneus cavaticus 75 | garden spider, Aranea diademata 76 | black widow, Latrodectus mactans 77 | tarantula 78 | wolf spider, hunting spider 79 | tick 80 | centipede 81 | black grouse 82 | ptarmigan 83 | ruffed grouse, partridge, Bonasa umbellus 84 | prairie chicken, prairie grouse, prairie fowl 85 | peacock 86 | quail 87 | partridge 88 | African grey, African gray, Psittacus erithacus 89 | macaw 90 | sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita 91 | lorikeet 92 | coucal 93 | bee eater 94 | hornbill 95 | hummingbird 96 | jacamar 97 | toucan 98 | drake 99 | red-breasted merganser, Mergus serrator 100 | goose 101 | black swan, Cygnus atratus 102 | tusker 103 | echidna, spiny anteater, anteater 104 | platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus 105 | wallaby, brush kangaroo 106 | koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus 107 | wombat 108 | jellyfish 109 | sea anemone, anemone 110 | brain coral 111 | flatworm, platyhelminth 112 | nematode, nematode worm, roundworm 113 | conch 114 | snail 115 | slug 116 | sea slug, nudibranch 117 | chiton, coat-of-mail shell, sea cradle, polyplacophore 118 | chambered nautilus, pearly nautilus, nautilus 119 | Dungeness crab, Cancer magister 120 | rock crab, Cancer irroratus 121 | fiddler crab 122 | king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica 123 | American lobster, Northern lobster, Maine lobster, Homarus americanus 124 | spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish 125 | crayfish, crawfish, crawdad, crawdaddy 126 | hermit crab 127 | isopod 128 | white stork, Ciconia ciconia 129 | black stork, Ciconia nigra 130 | spoonbill 131 | flamingo 132 | little blue heron, Egretta caerulea 133 | American egret, great white heron, Egretta albus 134 | bittern 135 | crane 136 | limpkin, Aramus pictus 137 | European gallinule, Porphyrio porphyrio 138 | American 
coot, marsh hen, mud hen, water hen, Fulica americana 139 | bustard 140 | ruddy turnstone, Arenaria interpres 141 | red-backed sandpiper, dunlin, Erolia alpina 142 | redshank, Tringa totanus 143 | dowitcher 144 | oystercatcher, oyster catcher 145 | pelican 146 | king penguin, Aptenodytes patagonica 147 | albatross, mollymawk 148 | grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus 149 | killer whale, killer, orca, grampus, sea wolf, Orcinus orca 150 | dugong, Dugong dugon 151 | sea lion 152 | Chihuahua 153 | Japanese spaniel 154 | Maltese dog, Maltese terrier, Maltese 155 | Pekinese, Pekingese, Peke 156 | Shih-Tzu 157 | Blenheim spaniel 158 | papillon 159 | toy terrier 160 | Rhodesian ridgeback 161 | Afghan hound, Afghan 162 | basset, basset hound 163 | beagle 164 | bloodhound, sleuthhound 165 | bluetick 166 | black-and-tan coonhound 167 | Walker hound, Walker foxhound 168 | English foxhound 169 | redbone 170 | borzoi, Russian wolfhound 171 | Irish wolfhound 172 | Italian greyhound 173 | whippet 174 | Ibizan hound, Ibizan Podenco 175 | Norwegian elkhound, elkhound 176 | otterhound, otter hound 177 | Saluki, gazelle hound 178 | Scottish deerhound, deerhound 179 | Weimaraner 180 | Staffordshire bullterrier, Staffordshire bull terrier 181 | American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier 182 | Bedlington terrier 183 | Border terrier 184 | Kerry blue terrier 185 | Irish terrier 186 | Norfolk terrier 187 | Norwich terrier 188 | Yorkshire terrier 189 | wire-haired fox terrier 190 | Lakeland terrier 191 | Sealyham terrier, Sealyham 192 | Airedale, Airedale terrier 193 | cairn, cairn terrier 194 | Australian terrier 195 | Dandie Dinmont, Dandie Dinmont terrier 196 | Boston bull, Boston terrier 197 | miniature schnauzer 198 | giant schnauzer 199 | standard schnauzer 200 | Scotch terrier, Scottish terrier, Scottie 201 | Tibetan terrier, chrysanthemum dog 202 | silky terrier, Sydney silky 203 | 
soft-coated wheaten terrier 204 | West Highland white terrier 205 | Lhasa, Lhasa apso 206 | flat-coated retriever 207 | curly-coated retriever 208 | golden retriever 209 | Labrador retriever 210 | Chesapeake Bay retriever 211 | German short-haired pointer 212 | vizsla, Hungarian pointer 213 | English setter 214 | Irish setter, red setter 215 | Gordon setter 216 | Brittany spaniel 217 | clumber, clumber spaniel 218 | English springer, English springer spaniel 219 | Welsh springer spaniel 220 | cocker spaniel, English cocker spaniel, cocker 221 | Sussex spaniel 222 | Irish water spaniel 223 | kuvasz 224 | schipperke 225 | groenendael 226 | malinois 227 | briard 228 | kelpie 229 | komondor 230 | Old English sheepdog, bobtail 231 | Shetland sheepdog, Shetland sheep dog, Shetland 232 | collie 233 | Border collie 234 | Bouvier des Flandres, Bouviers des Flandres 235 | Rottweiler 236 | German shepherd, German shepherd dog, German police dog, alsatian 237 | Doberman, Doberman pinscher 238 | miniature pinscher 239 | Greater Swiss Mountain dog 240 | Bernese mountain dog 241 | Appenzeller 242 | EntleBucher 243 | boxer 244 | bull mastiff 245 | Tibetan mastiff 246 | French bulldog 247 | Great Dane 248 | Saint Bernard, St Bernard 249 | Eskimo dog, husky 250 | malamute, malemute, Alaskan malamute 251 | Siberian husky 252 | dalmatian, coach dog, carriage dog 253 | affenpinscher, monkey pinscher, monkey dog 254 | basenji 255 | pug, pug-dog 256 | Leonberg 257 | Newfoundland, Newfoundland dog 258 | Great Pyrenees 259 | Samoyed, Samoyede 260 | Pomeranian 261 | chow, chow chow 262 | keeshond 263 | Brabancon griffon 264 | Pembroke, Pembroke Welsh corgi 265 | Cardigan, Cardigan Welsh corgi 266 | toy poodle 267 | miniature poodle 268 | standard poodle 269 | Mexican hairless 270 | timber wolf, grey wolf, gray wolf, Canis lupus 271 | white wolf, Arctic wolf, Canis lupus tundrarum 272 | red wolf, maned wolf, Canis rufus, Canis niger 273 | coyote, prairie wolf, brush wolf, Canis latrans 274 | 
dingo, warrigal, warragal, Canis dingo 275 | dhole, Cuon alpinus 276 | African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus 277 | hyena, hyaena 278 | red fox, Vulpes vulpes 279 | kit fox, Vulpes macrotis 280 | Arctic fox, white fox, Alopex lagopus 281 | grey fox, gray fox, Urocyon cinereoargenteus 282 | tabby, tabby cat 283 | tiger cat 284 | Persian cat 285 | Siamese cat, Siamese 286 | Egyptian cat 287 | cougar, puma, catamount, mountain lion, painter, panther, Felis concolor 288 | lynx, catamount 289 | leopard, Panthera pardus 290 | snow leopard, ounce, Panthera uncia 291 | jaguar, panther, Panthera onca, Felis onca 292 | lion, king of beasts, Panthera leo 293 | tiger, Panthera tigris 294 | cheetah, chetah, Acinonyx jubatus 295 | brown bear, bruin, Ursus arctos 296 | American black bear, black bear, Ursus americanus, Euarctos americanus 297 | ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus 298 | sloth bear, Melursus ursinus, Ursus ursinus 299 | mongoose 300 | meerkat, mierkat 301 | tiger beetle 302 | ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle 303 | ground beetle, carabid beetle 304 | long-horned beetle, longicorn, longicorn beetle 305 | leaf beetle, chrysomelid 306 | dung beetle 307 | rhinoceros beetle 308 | weevil 309 | fly 310 | bee 311 | ant, emmet, pismire 312 | grasshopper, hopper 313 | cricket 314 | walking stick, walkingstick, stick insect 315 | cockroach, roach 316 | mantis, mantid 317 | cicada, cicala 318 | leafhopper 319 | lacewing, lacewing fly 320 | dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk 321 | damselfly 322 | admiral 323 | ringlet, ringlet butterfly 324 | monarch, monarch butterfly, milkweed butterfly, Danaus plexippus 325 | cabbage butterfly 326 | sulphur butterfly, sulfur butterfly 327 | lycaenid, lycaenid butterfly 328 | starfish, sea star 329 | sea urchin 330 | sea cucumber, holothurian 331 | wood rabbit, cottontail, cottontail 
rabbit 332 | hare 333 | Angora, Angora rabbit 334 | hamster 335 | porcupine, hedgehog 336 | fox squirrel, eastern fox squirrel, Sciurus niger 337 | marmot 338 | beaver 339 | guinea pig, Cavia cobaya 340 | sorrel 341 | zebra 342 | hog, pig, grunter, squealer, Sus scrofa 343 | wild boar, boar, Sus scrofa 344 | warthog 345 | hippopotamus, hippo, river horse, Hippopotamus amphibius 346 | ox 347 | water buffalo, water ox, Asiatic buffalo, Bubalus bubalis 348 | bison 349 | ram, tup 350 | bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis 351 | ibex, Capra ibex 352 | hartebeest 353 | impala, Aepyceros melampus 354 | gazelle 355 | Arabian camel, dromedary, Camelus dromedarius 356 | llama 357 | weasel 358 | mink 359 | polecat, fitch, foulmart, foumart, Mustela putorius 360 | black-footed ferret, ferret, Mustela nigripes 361 | otter 362 | skunk, polecat, wood pussy 363 | badger 364 | armadillo 365 | three-toed sloth, ai, Bradypus tridactylus 366 | orangutan, orang, orangutang, Pongo pygmaeus 367 | gorilla, Gorilla gorilla 368 | chimpanzee, chimp, Pan troglodytes 369 | gibbon, Hylobates lar 370 | siamang, Hylobates syndactylus, Symphalangus syndactylus 371 | guenon, guenon monkey 372 | patas, hussar monkey, Erythrocebus patas 373 | baboon 374 | macaque 375 | langur 376 | colobus, colobus monkey 377 | proboscis monkey, Nasalis larvatus 378 | marmoset 379 | capuchin, ringtail, Cebus capucinus 380 | howler monkey, howler 381 | titi, titi monkey 382 | spider monkey, Ateles geoffroyi 383 | squirrel monkey, Saimiri sciureus 384 | Madagascar cat, ring-tailed lemur, Lemur catta 385 | indri, indris, Indri indri, Indri brevicaudatus 386 | Indian elephant, Elephas maximus 387 | African elephant, Loxodonta africana 388 | lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens 389 | giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca 390 | barracouta, snoek 391 | eel 392 | coho, cohoe, coho salmon, blue jack, silver 
salmon, Oncorhynchus kisutch 393 | rock beauty, Holocanthus tricolor 394 | anemone fish 395 | sturgeon 396 | gar, garfish, garpike, billfish, Lepisosteus osseus 397 | lionfish 398 | puffer, pufferfish, blowfish, globefish 399 | abacus 400 | abaya 401 | academic gown, academic robe, judge's robe 402 | accordion, piano accordion, squeeze box 403 | acoustic guitar 404 | aircraft carrier, carrier, flattop, attack aircraft carrier 405 | airliner 406 | airship, dirigible 407 | altar 408 | ambulance 409 | amphibian, amphibious vehicle 410 | analog clock 411 | apiary, bee house 412 | apron 413 | ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin 414 | assault rifle, assault gun 415 | backpack, back pack, knapsack, packsack, rucksack, haversack 416 | bakery, bakeshop, bakehouse 417 | balance beam, beam 418 | balloon 419 | ballpoint, ballpoint pen, ballpen, Biro 420 | Band Aid 421 | banjo 422 | bannister, banister, balustrade, balusters, handrail 423 | barbell 424 | barber chair 425 | barbershop 426 | barn 427 | barometer 428 | barrel, cask 429 | barrow, garden cart, lawn cart, wheelbarrow 430 | baseball 431 | basketball 432 | bassinet 433 | bassoon 434 | bathing cap, swimming cap 435 | bath towel 436 | bathtub, bathing tub, bath, tub 437 | beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon 438 | beacon, lighthouse, beacon light, pharos 439 | beaker 440 | bearskin, busby, shako 441 | beer bottle 442 | beer glass 443 | bell cote, bell cot 444 | bib 445 | bicycle-built-for-two, tandem bicycle, tandem 446 | bikini, two-piece 447 | binder, ring-binder 448 | binoculars, field glasses, opera glasses 449 | birdhouse 450 | boathouse 451 | bobsled, bobsleigh, bob 452 | bolo tie, bolo, bola tie, bola 453 | bonnet, poke bonnet 454 | bookcase 455 | bookshop, bookstore, bookstall 456 | bottlecap 457 | bow 458 | bow tie, bow-tie, bowtie 459 | brass, memorial tablet, plaque 460 | brassiere, bra, bandeau 461 
| breakwater, groin, groyne, mole, bulwark, seawall, jetty 462 | breastplate, aegis, egis 463 | broom 464 | bucket, pail 465 | buckle 466 | bulletproof vest 467 | bullet train, bullet 468 | butcher shop, meat market 469 | cab, hack, taxi, taxicab 470 | caldron, cauldron 471 | candle, taper, wax light 472 | cannon 473 | canoe 474 | can opener, tin opener 475 | cardigan 476 | car mirror 477 | carousel, carrousel, merry-go-round, roundabout, whirligig 478 | carpenter's kit, tool kit 479 | carton 480 | car wheel 481 | cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM 482 | cassette 483 | cassette player 484 | castle 485 | catamaran 486 | CD player 487 | cello, violoncello 488 | cellular telephone, cellular phone, cellphone, cell, mobile phone 489 | chain 490 | chainlink fence 491 | chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour 492 | chain saw, chainsaw 493 | chest 494 | chiffonier, commode 495 | chime, bell, gong 496 | china cabinet, china closet 497 | Christmas stocking 498 | church, church building 499 | cinema, movie theater, movie theatre, movie house, picture palace 500 | cleaver, meat cleaver, chopper 501 | cliff dwelling 502 | cloak 503 | clog, geta, patten, sabot 504 | cocktail shaker 505 | coffee mug 506 | coffeepot 507 | coil, spiral, volute, whorl, helix 508 | combination lock 509 | computer keyboard, keypad 510 | confectionery, confectionary, candy store 511 | container ship, containership, container vessel 512 | convertible 513 | corkscrew, bottle screw 514 | cornet, horn, trumpet, trump 515 | cowboy boot 516 | cowboy hat, ten-gallon hat 517 | cradle 518 | crane 519 | crash helmet 520 | crate 521 | crib, cot 522 | Crock Pot 523 | croquet ball 524 | crutch 525 | cuirass 526 | dam, dike, dyke 527 | desk 528 | desktop computer 529 | dial telephone, dial phone 530 | diaper, nappy, napkin 531 | digital clock 532 | digital watch 533 | dining table, board 534 
| dishrag, dishcloth 535 | dishwasher, dish washer, dishwashing machine 536 | disk brake, disc brake 537 | dock, dockage, docking facility 538 | dogsled, dog sled, dog sleigh 539 | dome 540 | doormat, welcome mat 541 | drilling platform, offshore rig 542 | drum, membranophone, tympan 543 | drumstick 544 | dumbbell 545 | Dutch oven 546 | electric fan, blower 547 | electric guitar 548 | electric locomotive 549 | entertainment center 550 | envelope 551 | espresso maker 552 | face powder 553 | feather boa, boa 554 | file, file cabinet, filing cabinet 555 | fireboat 556 | fire engine, fire truck 557 | fire screen, fireguard 558 | flagpole, flagstaff 559 | flute, transverse flute 560 | folding chair 561 | football helmet 562 | forklift 563 | fountain 564 | fountain pen 565 | four-poster 566 | freight car 567 | French horn, horn 568 | frying pan, frypan, skillet 569 | fur coat 570 | garbage truck, dustcart 571 | gasmask, respirator, gas helmet 572 | gas pump, gasoline pump, petrol pump, island dispenser 573 | goblet 574 | go-kart 575 | golf ball 576 | golfcart, golf cart 577 | gondola 578 | gong, tam-tam 579 | gown 580 | grand piano, grand 581 | greenhouse, nursery, glasshouse 582 | grille, radiator grille 583 | grocery store, grocery, food market, market 584 | guillotine 585 | hair slide 586 | hair spray 587 | half track 588 | hammer 589 | hamper 590 | hand blower, blow dryer, blow drier, hair dryer, hair drier 591 | hand-held computer, hand-held microcomputer 592 | handkerchief, hankie, hanky, hankey 593 | hard disc, hard disk, fixed disk 594 | harmonica, mouth organ, harp, mouth harp 595 | harp 596 | harvester, reaper 597 | hatchet 598 | holster 599 | home theater, home theatre 600 | honeycomb 601 | hook, claw 602 | hoopskirt, crinoline 603 | horizontal bar, high bar 604 | horse cart, horse-cart 605 | hourglass 606 | iPod 607 | iron, smoothing iron 608 | jack-o'-lantern 609 | jean, blue jean, denim 610 | jeep, landrover 611 | jersey, T-shirt, tee shirt 612 | jigsaw 
puzzle 613 | jinrikisha, ricksha, rickshaw 614 | joystick 615 | kimono 616 | knee pad 617 | knot 618 | lab coat, laboratory coat 619 | ladle 620 | lampshade, lamp shade 621 | laptop, laptop computer 622 | lawn mower, mower 623 | lens cap, lens cover 624 | letter opener, paper knife, paperknife 625 | library 626 | lifeboat 627 | lighter, light, igniter, ignitor 628 | limousine, limo 629 | liner, ocean liner 630 | lipstick, lip rouge 631 | Loafer 632 | lotion 633 | loudspeaker, speaker, speaker unit, loudspeaker system, speaker system 634 | loupe, jeweler's loupe 635 | lumbermill, sawmill 636 | magnetic compass 637 | mailbag, postbag 638 | mailbox, letter box 639 | maillot 640 | maillot, tank suit 641 | manhole cover 642 | maraca 643 | marimba, xylophone 644 | mask 645 | matchstick 646 | maypole 647 | maze, labyrinth 648 | measuring cup 649 | medicine chest, medicine cabinet 650 | megalith, megalithic structure 651 | microphone, mike 652 | microwave, microwave oven 653 | military uniform 654 | milk can 655 | minibus 656 | miniskirt, mini 657 | minivan 658 | missile 659 | mitten 660 | mixing bowl 661 | mobile home, manufactured home 662 | Model T 663 | modem 664 | monastery 665 | monitor 666 | moped 667 | mortar 668 | mortarboard 669 | mosque 670 | mosquito net 671 | motor scooter, scooter 672 | mountain bike, all-terrain bike, off-roader 673 | mountain tent 674 | mouse, computer mouse 675 | mousetrap 676 | moving van 677 | muzzle 678 | nail 679 | neck brace 680 | necklace 681 | nipple 682 | notebook, notebook computer 683 | obelisk 684 | oboe, hautboy, hautbois 685 | ocarina, sweet potato 686 | odometer, hodometer, mileometer, milometer 687 | oil filter 688 | organ, pipe organ 689 | oscilloscope, scope, cathode-ray oscilloscope, CRO 690 | overskirt 691 | oxcart 692 | oxygen mask 693 | packet 694 | paddle, boat paddle 695 | paddlewheel, paddle wheel 696 | padlock 697 | paintbrush 698 | pajama, pyjama, pj's, jammies 699 | palace 700 | panpipe, pandean pipe, syrinx 701 
| paper towel 702 | parachute, chute 703 | parallel bars, bars 704 | park bench 705 | parking meter 706 | passenger car, coach, carriage 707 | patio, terrace 708 | pay-phone, pay-station 709 | pedestal, plinth, footstall 710 | pencil box, pencil case 711 | pencil sharpener 712 | perfume, essence 713 | Petri dish 714 | photocopier 715 | pick, plectrum, plectron 716 | pickelhaube 717 | picket fence, paling 718 | pickup, pickup truck 719 | pier 720 | piggy bank, penny bank 721 | pill bottle 722 | pillow 723 | ping-pong ball 724 | pinwheel 725 | pirate, pirate ship 726 | pitcher, ewer 727 | plane, carpenter's plane, woodworking plane 728 | planetarium 729 | plastic bag 730 | plate rack 731 | plow, plough 732 | plunger, plumber's helper 733 | Polaroid camera, Polaroid Land camera 734 | pole 735 | police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria 736 | poncho 737 | pool table, billiard table, snooker table 738 | pop bottle, soda bottle 739 | pot, flowerpot 740 | potter's wheel 741 | power drill 742 | prayer rug, prayer mat 743 | printer 744 | prison, prison house 745 | projectile, missile 746 | projector 747 | puck, hockey puck 748 | punching bag, punch bag, punching ball, punchball 749 | purse 750 | quill, quill pen 751 | quilt, comforter, comfort, puff 752 | racer, race car, racing car 753 | racket, racquet 754 | radiator 755 | radio, wireless 756 | radio telescope, radio reflector 757 | rain barrel 758 | recreational vehicle, RV, R.V. 
759 | reel 760 | reflex camera 761 | refrigerator, icebox 762 | remote control, remote 763 | restaurant, eating house, eating place, eatery 764 | revolver, six-gun, six-shooter 765 | rifle 766 | rocking chair, rocker 767 | rotisserie 768 | rubber eraser, rubber, pencil eraser 769 | rugby ball 770 | rule, ruler 771 | running shoe 772 | safe 773 | safety pin 774 | saltshaker, salt shaker 775 | sandal 776 | sarong 777 | sax, saxophone 778 | scabbard 779 | scale, weighing machine 780 | school bus 781 | schooner 782 | scoreboard 783 | screen, CRT screen 784 | screw 785 | screwdriver 786 | seat belt, seatbelt 787 | sewing machine 788 | shield, buckler 789 | shoe shop, shoe-shop, shoe store 790 | shoji 791 | shopping basket 792 | shopping cart 793 | shovel 794 | shower cap 795 | shower curtain 796 | ski 797 | ski mask 798 | sleeping bag 799 | slide rule, slipstick 800 | sliding door 801 | slot, one-armed bandit 802 | snorkel 803 | snowmobile 804 | snowplow, snowplough 805 | soap dispenser 806 | soccer ball 807 | sock 808 | solar dish, solar collector, solar furnace 809 | sombrero 810 | soup bowl 811 | space bar 812 | space heater 813 | space shuttle 814 | spatula 815 | speedboat 816 | spider web, spider's web 817 | spindle 818 | sports car, sport car 819 | spotlight, spot 820 | stage 821 | steam locomotive 822 | steel arch bridge 823 | steel drum 824 | stethoscope 825 | stole 826 | stone wall 827 | stopwatch, stop watch 828 | stove 829 | strainer 830 | streetcar, tram, tramcar, trolley, trolley car 831 | stretcher 832 | studio couch, day bed 833 | stupa, tope 834 | submarine, pigboat, sub, U-boat 835 | suit, suit of clothes 836 | sundial 837 | sunglass 838 | sunglasses, dark glasses, shades 839 | sunscreen, sunblock, sun blocker 840 | suspension bridge 841 | swab, swob, mop 842 | sweatshirt 843 | swimming trunks, bathing trunks 844 | swing 845 | switch, electric switch, electrical switch 846 | syringe 847 | table lamp 848 | tank, army tank, armored combat vehicle, 
armoured combat vehicle 849 | tape player 850 | teapot 851 | teddy, teddy bear 852 | television, television system 853 | tennis ball 854 | thatch, thatched roof 855 | theater curtain, theatre curtain 856 | thimble 857 | thresher, thrasher, threshing machine 858 | throne 859 | tile roof 860 | toaster 861 | tobacco shop, tobacconist shop, tobacconist 862 | toilet seat 863 | torch 864 | totem pole 865 | tow truck, tow car, wrecker 866 | toyshop 867 | tractor 868 | trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi 869 | tray 870 | trench coat 871 | tricycle, trike, velocipede 872 | trimaran 873 | tripod 874 | triumphal arch 875 | trolleybus, trolley coach, trackless trolley 876 | trombone 877 | tub, vat 878 | turnstile 879 | typewriter keyboard 880 | umbrella 881 | unicycle, monocycle 882 | upright, upright piano 883 | vacuum, vacuum cleaner 884 | vase 885 | vault 886 | velvet 887 | vending machine 888 | vestment 889 | viaduct 890 | violin, fiddle 891 | volleyball 892 | waffle iron 893 | wall clock 894 | wallet, billfold, notecase, pocketbook 895 | wardrobe, closet, press 896 | warplane, military plane 897 | washbasin, handbasin, washbowl, lavabo, wash-hand basin 898 | washer, automatic washer, washing machine 899 | water bottle 900 | water jug 901 | water tower 902 | whiskey jug 903 | whistle 904 | wig 905 | window screen 906 | window shade 907 | Windsor tie 908 | wine bottle 909 | wing 910 | wok 911 | wooden spoon 912 | wool, woolen, woollen 913 | worm fence, snake fence, snake-rail fence, Virginia fence 914 | wreck 915 | yawl 916 | yurt 917 | web site, website, internet site, site 918 | comic book 919 | crossword puzzle, crossword 920 | street sign 921 | traffic light, traffic signal, stoplight 922 | book jacket, dust cover, dust jacket, dust wrapper 923 | menu 924 | plate 925 | guacamole 926 | consomme 927 | hot pot, hotpot 928 | trifle 929 | ice cream, icecream 930 | ice lolly, lolly, lollipop, popsicle 931 | French loaf 932 | bagel, 
beigel 933 | pretzel 934 | cheeseburger 935 | hotdog, hot dog, red hot 936 | mashed potato 937 | head cabbage 938 | broccoli 939 | cauliflower 940 | zucchini, courgette 941 | spaghetti squash 942 | acorn squash 943 | butternut squash 944 | cucumber, cuke 945 | artichoke, globe artichoke 946 | bell pepper 947 | cardoon 948 | mushroom 949 | Granny Smith 950 | strawberry 951 | orange 952 | lemon 953 | fig 954 | pineapple, ananas 955 | banana 956 | jackfruit, jak, jack 957 | custard apple 958 | pomegranate 959 | hay 960 | carbonara 961 | chocolate sauce, chocolate syrup 962 | dough 963 | meat loaf, meatloaf 964 | pizza, pizza pie 965 | potpie 966 | burrito 967 | red wine 968 | espresso 969 | cup 970 | eggnog 971 | alp 972 | bubble 973 | cliff, drop, drop-off 974 | coral reef 975 | geyser 976 | lakeside, lakeshore 977 | promontory, headland, head, foreland 978 | sandbar, sand bar 979 | seashore, coast, seacoast, sea-coast 980 | valley, vale 981 | volcano 982 | ballplayer, baseball player 983 | groom, bridegroom 984 | scuba diver 985 | rapeseed 986 | daisy 987 | yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum 988 | corn 989 | acorn 990 | hip, rose hip, rosehip 991 | buckeye, horse chestnut, conker 992 | coral fungus 993 | agaric 994 | gyromitra 995 | stinkhorn, carrion fungus 996 | earthstar 997 | hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa 998 | bolete 999 | ear, spike, capitulum 1000 | toilet tissue, toilet paper, bathroom tissue -------------------------------------------------------------------------------- /alignment/assets/sac+logos+ava1-l14-linearMSE.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-diffusion-alignment-gflownet/12c59520669b195e919540b940a510cef6f46ae7/alignment/assets/sac+logos+ava1-l14-linearMSE.pth -------------------------------------------------------------------------------- 
/alignment/assets/simple_animals.txt: -------------------------------------------------------------------------------- 1 | cat 2 | dog 3 | horse 4 | monkey 5 | rabbit 6 | zebra 7 | spider 8 | bird 9 | sheep 10 | deer 11 | cow 12 | goat 13 | lion 14 | tiger 15 | bear 16 | raccoon 17 | fox 18 | wolf 19 | lizard 20 | beetle 21 | ant 22 | butterfly 23 | fish 24 | shark 25 | whale 26 | dolphin 27 | squirrel 28 | mouse 29 | rat 30 | snake 31 | turtle 32 | frog 33 | chicken 34 | duck 35 | goose 36 | bee 37 | pig 38 | turkey 39 | fly 40 | llama 41 | camel 42 | bat 43 | gorilla 44 | hedgehog 45 | kangaroo 46 | -------------------------------------------------------------------------------- /alignment/diffusers_patch/ddim_with_logprob.py: -------------------------------------------------------------------------------- 1 | # For licensing see accompanying LICENSE file. 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved. 3 | 4 | # Modified from https://github.com/huggingface/diffusers/blob/fc6acb6b97e93d58cb22b5fee52d884d77ce84d8/src/diffusers/schedulers/scheduling_ddim.py 5 | 6 | from typing import Optional, Tuple, Union 7 | 8 | import math 9 | import torch 10 | 11 | try: 12 | from diffusers.utils import randn_tensor 13 | except ImportError: 14 | from diffusers.utils.torch_utils import randn_tensor 15 | from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput, DDIMScheduler 16 | 17 | 18 | def _left_broadcast(t, shape): 19 | assert t.ndim <= len(shape) 20 | return t.reshape(t.shape + (1,) * (len(shape) - t.ndim)).broadcast_to(shape) 21 | 22 | 23 | def _get_variance(self, timestep, prev_timestep): 24 | alpha_prod_t = torch.gather(self.alphas_cumprod, 0, timestep.cpu()).to( 25 | timestep.device 26 | ) 27 | alpha_prod_t_prev = torch.where( 28 | prev_timestep.cpu() >= 0, 29 | self.alphas_cumprod.gather(0, prev_timestep.cpu()), 30 | self.final_alpha_cumprod, 31 | ).to(timestep.device) 32 | beta_prod_t = 1 - alpha_prod_t 33 | beta_prod_t_prev = 1 - alpha_prod_t_prev 
def ddim_step_with_logprob(
    self: DDIMScheduler,
    model_output: torch.FloatTensor,
    timestep: torch.Tensor,
    sample: torch.FloatTensor,
    eta: float = 1.0,
    use_clipped_model_output: bool = False,
    generator=None,
    prev_sample: Optional[torch.FloatTensor] = None,

    calculate_pb: bool = False, logp_mean: bool = True,
    prev_timestep: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, ...]:
    """
    Run one reverse DDIM step x_t -> x_{t-1} and return the log-probability of the step.

    The step follows formulas (12) and (16) of the DDIM paper
    (https://arxiv.org/pdf/2010.02502.pdf). In addition to the next sample, the
    element-wise Gaussian log-density of `prev_sample` under
    N(prev_sample_mean, std_dev_t^2) is computed and reduced over all non-batch
    dimensions.

    Args:
        model_output (`torch.FloatTensor`): direct output from the learned diffusion model.
        timestep (`torch.Tensor`): integer index (or indices) of the current timestep. It is used to
            gather from `self.alphas_cumprod`, so it must be a tensor rather than a Python int.
        sample (`torch.FloatTensor`): current sample x_t.
        eta (`float`): scale applied to the DDIM standard deviation (sigma_t = eta * sqrt(variance)).
            NOTE: with `eta == 0` the standard deviation is zero and the log-probability expression
            below divides by zero.
        use_clipped_model_output (`bool`): if `True`, re-derive the predicted noise from the (possibly
            clipped / thresholded) predicted original sample.
        generator: random number generator used when `prev_sample` has to be drawn.
        prev_sample (`torch.FloatTensor`, *optional*): x_{t-1} (closer to the clean image). When given,
            no noise is drawn and the log-probability of this sample is evaluated instead. Cannot be
            combined with `generator`.
        calculate_pb (`bool`): if `True`, additionally return `log_pb`, the Gaussian log-density of
            `sample` under N(sqrt(a_t / a_prev) * prev_sample, 1 - a_t / a_prev). Both `sample` and the
            mean are detached in that expression.
        logp_mean (`bool`): reduce the log-densities with a mean (`True`) or a sum (`False`) over all
            dimensions except the first.
        prev_timestep (`torch.Tensor`, *optional*): index of the timestep stepped to. Defaults to
            `timestep - num_train_timesteps // num_inference_steps`. It is clamped to
            `[0, num_train_timesteps - 1]` in either case.

    Returns:
        `(prev_sample, log_prob)`, or `(prev_sample, log_prob, log_pb)` when `calculate_pb` is `True`.
        `prev_sample` is cast to `sample.dtype`.

    Raises:
        ValueError: if `set_timesteps` has not been run, if `self.config.prediction_type` is unknown,
            or if both `generator` and `prev_sample` are given.
    """
    assert isinstance(self, DDIMScheduler)
    if self.num_inference_steps is None:
        raise ValueError(
            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
        )

    # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
    # Ideally, read DDIM paper in-detail understanding

    # Notation (variable name -> name in the paper)
    # - pred_noise_t -> e_theta(x_t, t)
    # - pred_original_sample -> f_theta(x_t, t) or x_0
    # - std_dev_t -> sigma_t
    # - eta -> η
    # - pred_sample_direction -> "direction pointing to x_t"
    # - pred_prev_sample -> "x_{t-1}"

    # 1. get previous step value (=t-1)
    if prev_timestep is None:
        prev_timestep = (
            timestep - self.config.num_train_timesteps // self.num_inference_steps
        )
    # to prevent OOB on gather
    # NOTE: after this clamp prev_timestep is never negative, so the
    # `final_alpha_cumprod` branch of the `torch.where` below is never selected.
    prev_timestep = torch.clamp(prev_timestep, 0, self.config.num_train_timesteps - 1)

    # 2. compute alphas, betas
    # the lookup is done with CPU copies of the indices; results are moved to
    # sample.device after broadcasting
    alpha_prod_t = self.alphas_cumprod.gather(0, timestep.cpu())
    alpha_prod_t_prev = torch.where(
        prev_timestep.cpu() >= 0,
        self.alphas_cumprod.gather(0, prev_timestep.cpu()),
        self.final_alpha_cumprod,
    )
    alpha_prod_t = _left_broadcast(alpha_prod_t, sample.shape).to(sample.device)
    alpha_prod_t_prev = _left_broadcast(alpha_prod_t_prev, sample.shape).to(
        sample.device
    )
    # the alphas are deliberately not cast to sample.dtype here

    beta_prod_t = 1 - alpha_prod_t

    # 3. compute predicted original sample from predicted noise also called
    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
    if self.config.prediction_type == "epsilon":
        pred_original_sample = (
            sample - beta_prod_t ** (0.5) * model_output
        ) / alpha_prod_t ** (0.5)
        pred_epsilon = model_output
    elif self.config.prediction_type == "sample":
        pred_original_sample = model_output
        pred_epsilon = (
            sample - alpha_prod_t ** (0.5) * pred_original_sample
        ) / beta_prod_t ** (0.5)
    elif self.config.prediction_type == "v_prediction":
        pred_original_sample = (alpha_prod_t**0.5) * sample - (
            beta_prod_t**0.5
        ) * model_output
        pred_epsilon = (alpha_prod_t**0.5) * model_output + (
            beta_prod_t**0.5
        ) * sample
    else:
        raise ValueError(
            f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
            " `v_prediction`"
        )

    # 4. Clip or threshold "predicted x_0"
    # (original author note) cifar ddpm: thresholding = False, clip_sample_range = 1.0
    # (original author note) SD: thresholding = False, clip_sample = False
    if self.config.thresholding:
        pred_original_sample = self._threshold_sample(pred_original_sample)
    elif self.config.clip_sample:
        pred_original_sample = pred_original_sample.clamp(
            -self.config.clip_sample_range, self.config.clip_sample_range
        )

    # 5. compute variance: "sigma_t(η)" -> see formula (16)
    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
    variance = _get_variance(self, timestep, prev_timestep)
    std_dev_t = eta * variance ** (0.5)
    std_dev_t = _left_broadcast(std_dev_t, sample.shape).to(sample.device)

    if use_clipped_model_output:
        # the pred_epsilon is always re-derived from the clipped x_0 in Glide
        pred_epsilon = (
            sample - alpha_prod_t ** (0.5) * pred_original_sample
        ) / beta_prod_t ** (0.5)

    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (
        0.5
    ) * pred_epsilon

    # 7. compute the mean of x_{t-1}, i.e. formula (12) without the "random noise" term
    prev_sample_mean = (
        alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
    )

    if prev_sample is not None and generator is not None:
        raise ValueError(
            "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
            " `prev_sample` stays `None`."
        )

    # draw x_{t-1} only when the caller did not supply it
    if prev_sample is None:
        variance_noise = randn_tensor(
            model_output.shape,
            generator=generator,
            device=model_output.device,
            dtype=model_output.dtype,
        )
        prev_sample = prev_sample_mean + std_dev_t * variance_noise

    # log prob of prev_sample given prev_sample_mean and std_dev_t;
    # prev_sample is detached, so gradients flow only through prev_sample_mean
    log_prob = (
        -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std_dev_t**2))
        - torch.log(std_dev_t)
        - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
    )
    if logp_mean:
        # mean along all but batch dimension
        log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
    else:
        # sum along all but batch dimension
        log_prob = log_prob.sum(dim=tuple(range(1, log_prob.ndim)))

    if calculate_pb:
        # always true at this point: prev_sample was either supplied or drawn above
        assert prev_sample is not None
        # ratio a_t / a_prev, already broadcast to sample.shape
        alpha_ddim = alpha_prod_t / alpha_prod_t_prev
        pb_mean = alpha_ddim.sqrt() * prev_sample
        pb_std = (1 - alpha_ddim).sqrt()
        # Gaussian log-density of x_t given x_{t-1}; both operands are detached
        log_pb = (
            -((sample.detach() - pb_mean.detach()) ** 2) / (2 * (pb_std ** 2))
            - torch.log(pb_std)
            - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
        )
        if logp_mean:
            log_pb = log_pb.mean(dim=tuple(range(1, sample.ndim)))
        else:
            log_pb = log_pb.sum(dim=tuple(range(1, sample.ndim)))
        return prev_sample.type(sample.dtype), log_prob, log_pb

    else:
        # prev_sample is cast back to sample.dtype; per the original author's note
        # the intermediate result is float32 because the alphas are float32
        return prev_sample.type(sample.dtype), log_prob
@torch.no_grad()
def pred_orig_latent(self: DDIMScheduler, model_output, sample: torch.FloatTensor, timestep: int):
    """Return the scheduler's estimate of the clean sample x_0 from ``model_output``.

    The cumulative alpha for ``timestep`` is gathered from ``self.alphas_cumprod``
    (using a CPU copy of the index), broadcast to ``sample.shape`` and cast to
    ``sample.dtype`` and ``sample.device``. How ``model_output`` is interpreted
    depends on ``self.config.prediction_type`` (``epsilon``, ``sample`` or
    ``v_prediction``).

    Raises:
        ValueError: if ``self.config.prediction_type`` is none of the three
            supported values.
    """
    alpha_bar = self.alphas_cumprod.gather(0, timestep.cpu())
    alpha_bar = _left_broadcast(alpha_bar, sample.shape).to(sample.device)
    alpha_bar = alpha_bar.to(sample.dtype)  # float32 -> dtype of the latents
    beta_bar = 1 - alpha_bar

    kind = self.config.prediction_type
    if kind == "epsilon":
        # invert x_t = sqrt(a) * x_0 + sqrt(1 - a) * eps
        return (sample - beta_bar ** (0.5) * model_output) / alpha_bar ** (0.5)
    if kind == "sample":
        # the network predicts x_0 directly
        return model_output
    if kind == "v_prediction":
        return (alpha_bar**0.5) * sample - (beta_bar**0.5) * model_output

    raise ValueError(
        f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
        " `v_prediction`"
    )
def compute_snr(noise_scheduler, timesteps):
    """
    Return the signal-to-noise ratio ``alphas_cumprod / (1 - alphas_cumprod)`` at ``timesteps``.

    Computed as ``(sqrt(alphas_cumprod) / sqrt(1 - alphas_cumprod)) ** 2`` following
    https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849

    The result lives on ``timesteps.device``, is float32 and has the shape of ``timesteps``.
    """

    def _lookup(table):
        # Index the per-timestep table, then pad/expand to the shape of `timesteps`.
        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
        picked = table.to(device=timesteps.device)[timesteps].float()
        for _ in range(len(timesteps.shape) - len(picked.shape)):
            picked = picked[..., None]
        return picked.expand(timesteps.shape)

    cumulative_alphas = noise_scheduler.alphas_cumprod
    alpha = _lookup(cumulative_alphas**0.5)
    sigma = _lookup((1.0 - cumulative_alphas) ** 0.5)

    return (alpha / sigma) ** 2
# given x_{t-1} "prev_sample", compute x_t "sample"
def step_backward(self: DDIMScheduler,
                  timestep: int,
                  prev_sample: torch.FloatTensor,
                  generator=None,):
    """Sample the noisier x_t from x_{t-1} (``prev_sample``).

    With ``r = alphas_cumprod[timestep] / alphas_cumprod[prev_timestep]`` the
    result is ``sqrt(r) * prev_sample + sqrt(1 - r) * noise``, where ``noise``
    comes from ``randn_tensor`` with the shape, device and dtype of
    ``prev_sample``. ``prev_timestep`` is ``timestep`` minus the scheduler
    stride, clamped to ``[0, num_train_timesteps - 1]``.
    """
    train_steps = self.config.num_train_timesteps
    stride = train_steps // self.num_inference_steps
    # clamping prevents an out-of-bounds gather below
    prev_timestep = torch.clamp(timestep - stride, 0, train_steps - 1)

    # look the cumulative alphas up with CPU copies of the indices
    prev_index = prev_timestep.cpu()
    alpha_t = self.alphas_cumprod.gather(0, timestep.cpu())
    alpha_prev = torch.where(
        prev_index >= 0,
        self.alphas_cumprod.gather(0, prev_index),
        self.final_alpha_cumprod,
    )
    target_shape = prev_sample.shape
    alpha_t = _left_broadcast(alpha_t, target_shape).to(prev_sample.device)
    alpha_prev = _left_broadcast(alpha_prev, target_shape).to(prev_sample.device)

    ratio = alpha_t / alpha_prev
    mean = ratio.sqrt() * prev_sample
    std = (1 - ratio).sqrt()

    noise = randn_tensor(
        target_shape,
        generator=generator,
        device=prev_sample.device,
        dtype=prev_sample.dtype,
    )
    return mean + std * noise
@torch.no_grad()
def pipeline_with_logprob(
    self: StableDiffusionPipeline,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    guidance_scale: float = 5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,

    batch_size=None, dtype=None,
    device=None,
    calculate_pb=False, logp_mean=True,
    return_unetoutput=False,
):
    r"""
    Run the denoising loop and record every intermediate latent and its log-probability.

    Two modes are supported. When `prompt` or `prompt_embeds` is given, the pipeline runs
    text-conditioned: the prompt is encoded, latents come from `self.prepare_latents`, and the
    UNet receives `encoder_hidden_states`. When neither is given, it runs unconditionally: the
    UNet is called with the latents and timestep only, and the starting latents are always
    drawn with `randn_tensor` (a `latents` argument is ignored in this mode).

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. Alternatively pass `prompt_embeds`.
        height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            Height passed to `prepare_latents`; in unconditional mode it is used directly as the
            height of the sampled tensor.
        width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            Width, handled like `height`.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. If `None`, the scheduler's current timesteps are used.
        guidance_scale (`float`, *optional*, defaults to 5):
            Classifier-free guidance weight `w` (https://arxiv.org/abs/2207.12598). Guidance is
            enabled when `guidance_scale > 1`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt (text-conditioned mode only).
        eta (`float`, *optional*, defaults to 0.0):
            Parameter eta (η) of the DDIM paper (https://arxiv.org/abs/2010.02502); forwarded to
            `ddim_step_with_logprob`.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            Generator(s) used to make sampling deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents (text-conditioned mode only).
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings.
        output_type (`str`, *optional*, defaults to `"pil"`):
            `"latent"` skips the VAE decode and safety checker and returns the final latents;
            any other value decodes them and is forwarded to the image post-processor.
        return_dict (`bool`, *optional*, defaults to `True`):
            Unused; kept for signature compatibility. A tuple is always returned.
        callback (`Callable`, *optional*):
            Called as `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which `callback` is called.
        cross_attention_kwargs (`dict`, *optional*):
            Passed to the UNet; its `"scale"` entry is also used as the text-encoder LoRA scale.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Guidance rescale factor `φ` of https://arxiv.org/pdf/2305.08891.pdf; only applied when
            greater than 0 and guidance is enabled.
        batch_size (`int`, *optional*):
            Number of prompts/samples. Inferred from `prompt` or `prompt_embeds` when omitted.
        dtype (*optional*):
            dtype of the starting latents in unconditional mode.
        device (*optional*):
            Device to run on; defaults to `self._execution_device`.
        calculate_pb (`bool`, defaults to `False`):
            Also collect the backward log-probability of every step.
        logp_mean (`bool`, defaults to `True`):
            Forwarded to `ddim_step_with_logprob` when `calculate_pb` is set.
        return_unetoutput (`bool`, defaults to `False`):
            Also collect the (guided, detached) UNet output of every step. Mutually exclusive with
            `calculate_pb`.

    Returns:
        `tuple`: `(image, has_nsfw_concept, all_latents, all_log_probs)`, followed by `all_log_pbs`
        when `calculate_pb` is set, or by `unet_outputs` when `return_unetoutput` is set.
        `all_latents` holds the starting latents plus one entry per step; the other lists hold
        one entry per step.

    Raises:
        AssertionError: if both `calculate_pb` and `return_unetoutput` are set.
        ValueError: if `batch_size` is omitted and can be inferred from neither `prompt` nor
            `prompt_embeds`.
    """
    # Reject the unsupported flag combination before doing any sampling work
    # (previously this was only detected after the whole denoising loop had run).
    assert not (calculate_pb and return_unetoutput), "Cannot return both log_pb and unet_outputs"

    # 0. Default height and width to unet
    if height is None:
        height = self.unet.config.sample_size * self.vae_scale_factor
    if width is None:
        width = self.unet.config.sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    if hasattr(self, "check_inputs"):  # DDPMPipeline does not have this method
        self.check_inputs(
            prompt,
            height,
            width,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
        )

    # 2. Define call parameters
    if batch_size is None:
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        elif prompt_embeds is not None:
            batch_size = prompt_embeds.shape[0]
        else:
            raise ValueError(
                "`batch_size` must be given when neither `prompt` nor `prompt_embeds` is provided"
            )

    if device is None:
        device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    # Encode whenever any text conditioning is supplied. Gating this on `prompt_embeds`
    # alone silently dropped a plain `prompt` and fell through to the unconditional path.
    if prompt is not None or prompt_embeds is not None:
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None)
            if cross_attention_kwargs is not None
            else None
        )
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

    # 4. Prepare timesteps
    if num_inference_steps is None:
        timesteps = self.scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    if prompt_embeds is not None:
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    else:
        # unconditional mode: sample the starting point directly
        shape = (batch_size, num_channels_latents, height, width)
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        extra_step_kwargs = {'eta': eta, 'generator': generator}

    # 7. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

    all_latents = [latents]
    all_log_probs = []
    all_log_pbs = []
    unet_outputs = []
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = (
                torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            )
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            if prompt_embeds is not None:
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]
            else:
                noise_pred = self.unet(
                    latent_model_input, t, return_dict=False
                )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond
                )
            if return_unetoutput:
                # recorded after guidance but before any guidance rescaling
                unet_outputs.append(noise_pred.detach())

            # by default not used (as guidance_rescale = 0.0)
            if do_classifier_free_guidance and guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(
                    noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
                )

            # compute the previous noisy sample x_t -> x_t-1
            # the last step passes None so the scheduler step derives the target itself
            prev_timestep = timesteps[i + 1] if i < num_inference_steps - 1 else None
            if calculate_pb:
                latents, log_prob, log_pb = ddim_step_with_logprob(
                    self.scheduler, noise_pred, t, latents,
                    calculate_pb=calculate_pb, logp_mean=logp_mean,
                    prev_timestep=prev_timestep,
                    **extra_step_kwargs
                )
                all_log_pbs.append(log_pb)
            else:
                latents, log_prob = ddim_step_with_logprob(
                    self.scheduler, noise_pred, t, latents,
                    prev_timestep=prev_timestep,
                    **extra_step_kwargs
                )

            all_latents.append(latents)
            all_log_probs.append(log_prob)

            # call the callback, if provided
            if i == len(timesteps) - 1 or (
                (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
            ):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    if not output_type == "latent":
        image = self.vae.decode(
            latents / self.vae.config.scaling_factor, return_dict=False
        )[0]
        image, has_nsfw_concept = self.run_safety_checker(
            image, device, prompt_embeds.dtype
        )
    else:
        image = latents
        has_nsfw_concept = None

    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    # At least for the cifar10 DDPM, the generated image is in [-1, 1],
    # so we need this postprocessing to make it [0, 1]
    if prompt_embeds is not None:
        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )
        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()
    else:
        image = image_postprocess(image)

    if calculate_pb:
        return image, has_nsfw_concept, all_latents, all_log_probs, all_log_pbs
    if return_unetoutput:
        return image, has_nsfw_concept, all_latents, all_log_probs, unet_outputs

    return image, has_nsfw_concept, all_latents, all_log_probs
# https://github.com/huggingface/diffusers/blob/v0.17.1-patch/src/diffusers/models/unet_2d_condition.py
class ConditionalFlow(torch.nn.Module):
    """Scalar head built from the down-sampling half of a conditional UNet.

    The module embeds the timestep, applies an input convolution followed by
    the configured down blocks (each with down-sampling enabled), average-pools
    the result, and maps it to one scalar per sample with a linear layer.

    The linear layer takes ``block_out_channels[-1]`` features, so the pooled
    feature map must be 1x1 spatially. Per the original author's note this
    corresponds to ``(bs, 4, 64, 64)`` inputs with the default four blocks.

    ``only_cross_attention``, ``encoder_hid_dim`` and ``encoder_hid_dim_type``
    are accepted but not used.
    """

    def __init__(self,
                 in_channels: int = 4,
                 flip_sin_to_cos: bool = True,
                 freq_shift: int = 0,
                 down_block_types: Tuple[str] = ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"),
                 only_cross_attention: Union[bool, Tuple[bool]] = False,
                 block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
                 layers_per_block: Union[int, Tuple[int]] = 2,
                 downsample_padding: int = 1,
                 act_fn: str = "silu",
                 norm_num_groups: Optional[int] = 32,
                 norm_eps: float = 1e-5,
                 cross_attention_dim: Union[int, Tuple[int]] = 1280,
                 encoder_hid_dim: Optional[int] = None,
                 encoder_hid_dim_type: Optional[str] = None,
                 attention_head_dim: Union[int, Tuple[int]] = 8,
                 timestep_post_act: Optional[str] = None,
                 time_cond_proj_dim: Optional[int] = None,
                 conv_in_kernel: int = 3,
                 class_embeddings_concat: bool = False,
                 ):
        super().__init__()

        # timestep embedding: sinusoidal projection followed by an MLP
        timestep_input_dim = block_out_channels[0]
        self.time_proj = Timesteps(block_out_channels[0],
                                   flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=freq_shift)
        time_embed_dim = block_out_channels[0] * 4
        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim,
                                                act_fn=act_fn, post_act_fn=timestep_post_act,
                                                cond_proj_dim=time_cond_proj_dim)

        # padding chosen so that the input convolution keeps the spatial size
        conv_in_padding = (conv_in_kernel - 1) // 2
        self.conv_in = nn.Conv2d(
            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
        )
        self.encoder_hid_proj = None

        self.down_blocks = nn.ModuleList([])

        # expand scalar settings to one value per down block
        if isinstance(attention_head_dim, int):
            attention_head_dim = (attention_head_dim,) * len(down_block_types)

        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

        if isinstance(layers_per_block, int):
            layers_per_block = [layers_per_block] * len(down_block_types)

        if class_embeddings_concat:
            # The time embeddings are concatenated with the class embeddings. The dimension of the
            # time embeddings passed to the blocks is twice the dimension of the
            # regular time embeddings
            blocks_time_embed_dim = time_embed_dim * 2
        else:
            blocks_time_embed_dim = time_embed_dim

        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=blocks_time_embed_dim,
                # unlike the UNet, every block (including the last) down-samples
                add_downsample=True,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim[i],
                num_attention_heads=attention_head_dim[i],
                attention_head_dim=attention_head_dim[i],
                downsample_padding=downsample_padding,
            )
            self.down_blocks.append(down_block)

        # (bs, 4, 64, 64) -> downsample 4 times -> (bs, ..., 4, 4) -> pooled to 1x1
        self.pool = nn.AvgPool2d(4, stride=4)
        self.fc = nn.Linear(block_out_channels[-1], 1)

    def forward(self, sample, timesteps, encoder_hidden_states,
                attention_mask: Optional[torch.Tensor] = None,
                cross_attention_kwargs: Optional[Dict[str, Any]] = None,
                encoder_attention_mask: Optional[torch.Tensor] = None,
                ):
        """Return one scalar per sample, as a tensor of shape ``(batch,)``.

        Args:
            sample: input feature map fed to the input convolution.
            timesteps: timesteps passed to the sinusoidal projection.
            encoder_hidden_states: conditioning passed to the cross-attention
                down blocks.
            attention_mask, cross_attention_kwargs, encoder_attention_mask:
                forwarded unchanged to the cross-attention down blocks.
        """
        # the sinusoidal projection has no parameters; cast it to the blocks' dtype
        dtype = next(self.down_blocks.parameters()).dtype

        t_emb = self.time_proj(timesteps)
        t_emb = t_emb.to(dtype=dtype)
        emb = self.time_embedding(t_emb)

        sample = self.conv_in(sample)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, _ = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                    encoder_attention_mask=encoder_attention_mask,
                )
            else:
                sample, _ = downsample_block(hidden_states=sample, temb=emb)

        sample = self.pool(sample)
        sample = sample.view(sample.size(0), -1)
        # Drop only the trailing feature axis. A bare `.squeeze()` would also drop
        # the batch axis when the batch holds a single sample.
        sample = self.fc(sample).squeeze(-1)
        return sample
= self.pool(sample) 169 | sample = sample.view(sample.size(0), -1) 170 | sample = self.fc(sample).squeeze() 171 | return sample 172 | -------------------------------------------------------------------------------- /alignment/model_configs/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /alignment/prompts.py: -------------------------------------------------------------------------------- 1 | # For licensing see accompanying LICENSE file. 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved. 3 | 4 | from importlib import resources 5 | import os 6 | import functools 7 | import random 8 | import inflect 9 | 10 | IE = inflect.engine() 11 | 12 | import sys 13 | if sys.version_info < (3, 9): 14 | from importlib_resources import files 15 | else: 16 | from importlib.resources import files 17 | ASSETS_PATH = files("alignment.assets") 18 | 19 | 20 | @functools.lru_cache() # will remember previous 128 calls 21 | def _load_lines(path): 22 | """ 23 | Load lines from a file. First tries to load from `path` directly, and if that doesn't exist, searches the 24 | `alignment/assets` directory for a file named `path`. 
def from_file(path, low=None, high=None):
    """Pick one random prompt from a line-per-prompt file.

    The lines come from `_load_lines(path)`; only the slice `[low:high]` of
    them is eligible. Returns `(prompt, {})`, i.e. the chosen line together
    with an empty metadata dict.
    """
    every_line = _load_lines(path)
    eligible = every_line[low:high]
    chosen = random.choice(eligible)
    return chosen, {}
def _sample_hpd_styles(*styles):
    """Return `(prompt, {})` with the prompt drawn from the union of `styles`."""
    # read_hpd is memoised with lru_cache, so the list it returns is the cached
    # object itself. Build a fresh list here: extending the cached list in
    # place would make it grow on every call.
    pool = [prompt for style in styles for prompt in read_hpd(style)]
    return random.choice(pool), {}


def hpd_photo_painting():
    """Sample one HPDv2 prompt from the "photo" and "paintings" styles."""
    return _sample_hpd_styles("photo", "paintings")  # not "painting"


def hpd_photo_anime():
    """Sample one HPDv2 prompt from the "photo" and "anime" styles."""
    return _sample_hpd_styles("photo", "anime")


def hpd_photo_concept():
    """Sample one HPDv2 prompt from the "photo" and "concept-art" styles."""
    return _sample_hpd_styles("photo", "concept-art")
def counting(nouns_file, low, high):
    """Build a "<number> <plural noun>" prompt with question/answer metadata.

    A count is drawn uniformly from `[low, high]` (inclusive, via
    `random.randint`) and spelled out in words, then a noun is drawn from the
    lines of `nouns_file`. Returns `(prompt, metadata)` where `metadata` has
    the parallel lists "questions" and "answers".
    """
    vocabulary = _load_lines(nouns_file)
    # The count is drawn first, then the noun.
    count_words = IE.number_to_words(random.randint(low, high))
    singular = random.choice(vocabulary)
    plural = IE.plural(singular)

    questions = [
        f"How many {plural} are there in this image?",
        "What animal is in this image?",
    ]
    answers = [count_words, singular]
    return f"{count_words} {plural}", {"questions": questions, "answers": answers}
def imagereward(dtype=torch.float32, device="cuda"):
    """Build a reward function backed by the ImageReward-v1.0 model.

    Args:
        dtype: dtype the reward model is cast to.
        device: device the reward model is moved to.

    Returns:
        `_fn(images, prompts, metadata)` returning `(rewards, {})`, where
        `rewards` is a float32 tensor with one entry per image.
    """
    # Let local rank 0 load (and, if needed, download) the checkpoint first;
    # the other ranks wait at the barrier and load afterwards. Every rank hits
    # the barrier exactly once and loads the model exactly once. Without an
    # initialised process group there is nothing to synchronise with.
    synchronise = dist.is_available() and dist.is_initialized()
    loads_first = get_local_rank() == 0
    if synchronise and not loads_first:
        dist.barrier()
    reward_model = RM.load("ImageReward-v1.0")
    if synchronise and loads_first:
        dist.barrier()
    reward_model.to(dtype).to(device)

    rm_preprocess = Compose([
        Resize(224, interpolation=BICUBIC),
        CenterCrop(224),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])

    def _fn(images, prompts, metadata):
        tokenizer = reward_model.blip.tokenizer
        tokens = tokenizer(
            prompts,
            padding='max_length', truncation=True, return_tensors="pt",
            max_length=tokenizer.model_max_length,
        )
        # Token tensors follow the images' device, not the factory argument.
        images_device = images.device
        input_ids = tokens.input_ids.to(images_device)
        attention_mask = tokens.attention_mask.to(images_device)
        reward = reward_model.score_gard(input_ids, attention_mask, rm_preprocess(images))
        # Always hand back float32, whatever dtype the model runs in.
        return reward.reshape(images.shape[0]).float(), {}

    return _fn
image_batch: 148 | img = Image.fromarray(image) 149 | buffer = BytesIO() 150 | img.save(buffer, format="JPEG", quality=80) 151 | jpeg_images.append(buffer.getvalue()) 152 | 153 | # format for LLaVA server 154 | data = { 155 | "images": jpeg_images, 156 | "queries": [m["questions"] for m in metadata_batch], 157 | } 158 | data_bytes = pickle.dumps(data) 159 | 160 | # send a request to the llava server 161 | response = sess.post(url, data=data_bytes, timeout=120) 162 | 163 | response_data = pickle.loads(response.content) 164 | 165 | correct = np.array( 166 | [ 167 | [ans in resp for ans, resp in zip(m["answers"], responses)] 168 | for m, responses in zip(metadata_batch, response_data["outputs"]) 169 | ] 170 | ) 171 | scores = correct.mean(axis=-1) 172 | 173 | all_scores += scores.tolist() 174 | all_info["answers"] += response_data["outputs"] 175 | 176 | return np.array(all_scores), {k: np.array(v) for k, v in all_info.items()} 177 | 178 | return _fn 179 | 180 | 181 | def llava_bertscore(dtype=torch.float32, device="cuda"): 182 | """Submits images to LLaVA and computes a reward by comparing the responses to the prompts using BERTScore. See 183 | https://github.com/kvablack/LLaVA-server for server-side code. 
184 | """ 185 | import requests 186 | from requests.adapters import HTTPAdapter, Retry 187 | from io import BytesIO 188 | import pickle 189 | 190 | batch_size = 16 191 | url = "http://127.0.0.1:8085" 192 | sess = requests.Session() 193 | retries = Retry( 194 | total=1000, backoff_factor=1, status_forcelist=[500], allowed_methods=False 195 | ) 196 | sess.mount("http://", HTTPAdapter(max_retries=retries)) 197 | 198 | def _fn(images, prompts, metadata): 199 | del metadata 200 | if isinstance(images, torch.Tensor): 201 | images = (images * 255).round().clamp(0, 255).to(torch.uint8).cpu().numpy() 202 | images = images.transpose(0, 2, 3, 1) # NCHW -> NHWC 203 | 204 | images_batched = np.array_split(images, np.ceil(len(images) / batch_size)) 205 | prompts_batched = np.array_split(prompts, np.ceil(len(prompts) / batch_size)) 206 | 207 | all_scores = [] 208 | all_info = { 209 | "precision": [], 210 | "f1": [], 211 | "outputs": [], 212 | } 213 | for image_batch, prompt_batch in zip(images_batched, prompts_batched): 214 | jpeg_images = [] 215 | 216 | # Compress the images using JPEG 217 | for image in image_batch: 218 | img = Image.fromarray(image) 219 | buffer = BytesIO() 220 | img.save(buffer, format="JPEG", quality=80) 221 | jpeg_images.append(buffer.getvalue()) 222 | 223 | # format for LLaVA server 224 | data = { 225 | "images": jpeg_images, 226 | "queries": [["Answer concisely: what is going on in this image?"]] 227 | * len(image_batch), 228 | "answers": [ 229 | [f"The image contains {prompt}"] for prompt in prompt_batch 230 | ], 231 | } 232 | data_bytes = pickle.dumps(data) 233 | 234 | # send a request to the llava server 235 | response = sess.post(url, data=data_bytes, timeout=120) 236 | 237 | response_data = pickle.loads(response.content) 238 | 239 | # use the recall score as the reward 240 | scores = np.array(response_data["recall"]).squeeze() 241 | all_scores += scores.tolist() 242 | 243 | # save the precision and f1 scores for analysis 244 | all_info["precision"] 
def image_postprocess(x):
    """Rescale values from [-1, 1] to [0, 1], clamping anything outside."""
    rescaled = (x + 1) / 2  # x / 2 + 0.5
    return torch.clamp(rescaled, min=0, max=1)


def soft_update(target, source, tau):
    """Blend `source` parameters into `target` in place.

    Each target parameter becomes `(1 - tau) * target + tau * source`.
    Parameters are paired positionally with `zip`.
    """
    keep = 1.0 - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * keep + src.data * tau
        dst.data.copy_(blended)


def hard_update(target, source):
    """Copy every `source` parameter into the matching `target` parameter."""
    for dst, src in zip(target.parameters(), source.parameters()):
        dst.data.copy_(src.data)
10 | use_lora: True 11 | 12 | pretrained: 13 | model: "runwayml/stable-diffusion-v1-5" 14 | # model: "CompVis/stable-diffusion-v1-4" # similar to v1.5 15 | revision: "main" 16 | 17 | sample: 18 | num_steps: 50 19 | # eta parameter for the DDIM sampler. this controls the amount of noise injected into the sampling process, with 0.0 20 | # being fully deterministic and 1.0 being equivalent to the DDPM sampler. 21 | eta: 1.0 22 | guidance_scale: 5.0 23 | # batch size (per GPU!) to use for sampling. 24 | # number of batches to sample per epoch. the total number of samples per epoch is `num_batches_per_epoch * 25 | # batch_size * num_gpus`. 26 | batch_size: 16 27 | num_batches_per_epoch: 4 28 | 29 | train: 30 | # whether to use the 8bit Adam optimizer from bitsandbytes. 31 | use_8bit_adam: False 32 | learning_rate: 3.0e-4 33 | adam_beta1: 0.9 34 | adam_beta2: 0.999 35 | adam_weight_decay: 1.0e-4 36 | adam_epsilon: 1.e-8 37 | max_grad_norm: 1.0 38 | # number of inner epochs per outer epoch. each inner epoch is one iteration through the data collected during one 39 | # outer epoch's round of sampling. 40 | num_inner_epochs: 1 41 | # whether or not to use classifier-free guidance during training. if enabled, the same guidance scale used during 42 | # sampling will be used during training. 43 | cfg: True 44 | # clip advantages to the range [-adv_clip_max, adv_clip_max]. 45 | adv_clip_max: 5 46 | # the PPO clip range. 47 | clip_range: 1.e-4 48 | # the fraction of timesteps to train on. if set to less than 1.0, the model will be trained on a subset of the 49 | # timesteps for each sample. this will speed up training but reduce the accuracy of policy gradient estimates. 
50 | timestep_fraction: 1.0 # does not affect GPU memory occupation 51 | lora_rank: 4 52 | 53 | batch_size: 8 # 8 (fp16) or 4 (fp32) -> 60000 MB GPU memory 54 | gradient_accumulation_steps: 4 55 | 56 | ########### for GFN 57 | reward_exp: 1.0e+2 58 | flow_learning_rate: 3.0e-4 59 | anneal: linear 60 | unetreg: 1.0e+0 61 | # whether to use GFN-DB with REINFORCE gradient 62 | klpf: -1. 63 | 64 | seed: 0 65 | num_epochs: 100 66 | wandb: False 67 | 68 | # prompt_fn: "simple_animals" # for aesthetic_score 69 | # prompt_fn: "imagenet_all" # for compression 70 | prompt_fn: "drawbench" # for imagereward 71 | # prompt_fn: "hpd" # for HPSv2 72 | # prompt_fn: "hpd_photo" # for HPSv2 73 | # prompt_fn: "hpd_photo_painting" # for HPSv2 74 | # prompt_fn: "hpd_photo_anime" # for HPSv2 75 | # prompt_fn: "hpd_photo_concept" # for HPSv2 76 | 77 | # reward_fn: "aesthetic_score" 78 | # reward_fn: "jpeg_compressibility" 79 | # reward_fn: "jpeg_incompressibility" 80 | reward_fn: "imagereward" 81 | prompt_fn_kwargs: { } -------------------------------------------------------------------------------- /scripts/distributed.py: -------------------------------------------------------------------------------- 1 | # For licensing see accompanying LICENSE file. 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved. 
def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process

    After the call, `print(...)` is a no-op unless `is_master` is true or the
    caller passes `force=True`. The function may be called more than once; the
    most recent `is_master` value wins.
    """
    import builtins as __builtin__

    # If print is already one of our wrappers, recover the function it wraps.
    # Wrapping a wrapper would leave an earlier is_master=False in the chain,
    # and it would also drop `force`, which the outer wrapper pops first.
    builtin_print = getattr(__builtin__.print, "_wrapped_print", __builtin__.print)

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    print._wrapped_print = builtin_print
    __builtin__.print = print
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Also exports `PYTHONHASHSEED`, disables cuDNN autotuning
    (`cudnn.benchmark`), empties the CUDA cache and logs the seed.
    """
    import random
    import numpy as np

    os.environ['PYTHONHASHSEED'] = str(seed)

    cpu_seeders = (random.seed, np.random.seed, torch.manual_seed, torch.random.manual_seed)
    for seeder in cpu_seeders:
        seeder(seed)

    if torch.cuda.is_available():
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    torch.cuda.empty_cache()

    logging.info(f'Using seed: {seed}')
% 0: 'cuda:%d' % rank} 136 | # ddp_model.load_attn_procs( # ? 137 | ddp_model.load_state_dict( 138 | torch.load(CHECKPOINT_PATH, map_location=map_location)) 139 | -------------------------------------------------------------------------------- /scripts/train_gfn.py: -------------------------------------------------------------------------------- 1 | # For licensing see accompanying LICENSE file. 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved. 3 | 4 | import os, sys 5 | from collections import defaultdict 6 | import contextlib 7 | import datetime 8 | import time 9 | from concurrent import futures 10 | import wandb 11 | from functools import partial 12 | import tempfile 13 | from PIL import Image 14 | import tqdm 15 | tqdm = partial(tqdm.tqdm, dynamic_ncols=True) 16 | import logging 17 | import yaml 18 | from termcolor import colored 19 | import copy 20 | import math 21 | import pickle, gzip 22 | 23 | import diffusers 24 | from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel 25 | from diffusers.optimization import get_scheduler 26 | from diffusers.training_utils import cast_training_params 27 | from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available 28 | from diffusers.utils.torch_utils import is_compiled_module 29 | 30 | import datasets 31 | from packaging import version 32 | from peft import LoraConfig 33 | from peft.utils import get_peft_model_state_dict 34 | import transformers 35 | from transformers import CLIPTextModel, CLIPTokenizer 36 | 37 | import numpy as np 38 | import torch 39 | import torch.nn.functional as F 40 | import torch.utils.checkpoint 41 | import torch.distributed as dist 42 | from torch.nn.parallel import DistributedDataParallel as DDP 43 | from scripts.distributed import init_distributed_singlenode, set_seed, load_distributed, setup_for_distributed 44 | 45 | import alignment.prompts 46 | import alignment.rewards 47 | from 
def unwrap_model(model):
    """Return the module underneath any DDP and torch.compile wrappers.

    The DDP wrapper is removed first (`.module`), then the compiled-module
    wrapper (`._orig_mod`). A model with neither wrapper is returned as is.
    """
    if isinstance(model, DDP):
        model = model.module
    if is_compiled_module(model):
        model = model._orig_mod
    return model
87 | weight_dtype = torch.float32 88 | if config['mixed_precision'] == "fp16": 89 | weight_dtype = torch.float16 90 | elif config['mixed_precision'] == "bf16": 91 | weight_dtype = torch.bfloat16 92 | device = torch.device(local_rank) 93 | 94 | pipeline = StableDiffusionPipeline.from_pretrained( 95 | config['pretrained']['model'], revision=config['pretrained']['revision'], torch_dtype=weight_dtype, 96 | ) 97 | scheduler_config = {} 98 | scheduler_config.update(pipeline.scheduler.config) 99 | pipeline.scheduler = DDIMScheduler.from_config(scheduler_config) 100 | pipeline.vae.requires_grad_(False) 101 | pipeline.text_encoder.requires_grad_(False) 102 | pipeline.vae.to(device, dtype=weight_dtype) 103 | pipeline.text_encoder.to(device, dtype=weight_dtype) 104 | 105 | pipeline.safety_checker = None 106 | pipeline.set_progress_bar_config( 107 | position=1, 108 | disable=not is_local_main_process, 109 | leave=False, 110 | desc="Timestep", 111 | dynamic_ncols=True, 112 | ) 113 | 114 | unet = pipeline.unet 115 | unet.requires_grad_(False) 116 | for param in unet.parameters(): 117 | param.requires_grad_(False) 118 | assert config['use_lora'] 119 | unet.to(device, dtype=weight_dtype) 120 | unet_lora_config = LoraConfig( 121 | r=config['train']['lora_rank'], lora_alpha=config['train']['lora_rank'], 122 | init_lora_weights="gaussian", target_modules=["to_k", "to_q", "to_v", "to_out.0"], 123 | ) 124 | unet.add_adapter(unet_lora_config) 125 | if config['mixed_precision'] in ["fp16", "bf16"]: 126 | # only upcast trainable parameters (LoRA) into fp32 127 | cast_training_params(unet, dtype=torch.float32) 128 | lora_layers = filter(lambda p: p.requires_grad, unet.parameters()) 129 | 130 | scaler = None 131 | if config['mixed_precision'] in ["fp16", "bf16"]: 132 | scaler = torch.cuda.amp.GradScaler() 133 | 134 | # Enable TF32 for faster training on Ampere GPUs, 135 | # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices 136 | if 
config['allow_tf32']: 137 | torch.backends.cuda.matmul.allow_tf32 = True 138 | # torch.backends.cudnn.allow_tf32 is True by default 139 | torch.backends.cudnn.benchmark = True 140 | 141 | if config['train']['use_8bit_adam']: 142 | try: 143 | import bitsandbytes as bnb 144 | except ImportError: 145 | raise ImportError( 146 | "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`" 147 | ) 148 | optimizer_cls = bnb.optim.AdamW8bit 149 | else: 150 | optimizer_cls = torch.optim.AdamW 151 | 152 | # prepare prompt and reward fn 153 | prompt_fn = getattr(alignment.prompts, config['prompt_fn']) 154 | reward_fn = getattr(alignment.rewards, config['reward_fn'])(weight_dtype, device) 155 | 156 | # generate negative prompt embeddings 157 | neg_prompt_embed = pipeline.text_encoder( 158 | pipeline.tokenizer( 159 | [""], 160 | return_tensors="pt", 161 | padding="max_length", 162 | truncation=True, 163 | max_length=pipeline.tokenizer.model_max_length, # 77 164 | ).input_ids.to(device) 165 | )[0] 166 | sample_neg_prompt_embeds = neg_prompt_embed.repeat(config['sample']['batch_size'], 1, 1) 167 | train_neg_prompt_embeds = neg_prompt_embed.repeat(config['train']['batch_size'], 1, 1) 168 | 169 | # for some reason, autocast is necessary for non-lora training but for lora training it isn't necessary and it uses 170 | # more memory 171 | def func_autocast(): 172 | return torch.cuda.amp.autocast(dtype=weight_dtype) 173 | if config['use_lora']: 174 | # LoRA weights are actually float32, but other part of SD are in bf16/fp16 175 | autocast = contextlib.nullcontext 176 | else: 177 | autocast = func_autocast 178 | 179 | unet.to(device) 180 | unet = DDP(unet, device_ids=[local_rank]) 181 | 182 | ####################################################### 183 | #################### FOR GFN ########################## 184 | def decode(latents): 185 | image = pipeline.vae.decode( 186 | latents / pipeline.vae.config.scaling_factor, return_dict=False 187 | 
)[0] 188 | # image, has_nsfw_concept = pipeline.run_safety_checker( 189 | # image, device, prompt_embeds.dtype 190 | # ) 191 | do_denormalize = [True] * image.shape[0] 192 | image = pipeline.image_processor.postprocess(image, 193 | output_type="pt", do_denormalize=do_denormalize) 194 | return image 195 | 196 | flow_model = ConditionalFlow(in_channels=4, block_out_channels=(64, 128, 256, 256), 197 | layers_per_block=1, cross_attention_dim=pipeline.text_encoder.config.hidden_size) # hidden_size=768 is SD's text enconder output size 198 | flow_model = flow_model.to(device, dtype=torch.float32) 199 | autocast_flow = func_autocast 200 | 201 | flow_model = DDP(flow_model, device_ids=[local_rank]) 202 | params = [ 203 | {"params": lora_layers, "lr": config['train']['learning_rate']}, 204 | {"params": flow_model.parameters(), "lr": config['train']['learning_rate']} 205 | ] 206 | optimizer = optimizer_cls( 207 | params, 208 | betas=(config['train']['adam_beta1'], config['train']['adam_beta2']), 209 | weight_decay=config['train']['adam_weight_decay'], 210 | eps=config['train']['adam_epsilon'], 211 | ) 212 | 213 | result = defaultdict(dict) 214 | result["config"] = config 215 | start_time = time.time() 216 | 217 | ####################################################### 218 | # Start! 
219 | samples_per_epoch = ( 220 | config['sample']['batch_size'] * num_processes 221 | * config['sample']['num_batches_per_epoch'] 222 | ) 223 | total_train_batch_size = ( 224 | config['train']['batch_size'] * num_processes 225 | * config['train']['gradient_accumulation_steps'] 226 | ) 227 | 228 | logger.info("***** Running training *****") 229 | logger.info(f" Num Epochs = {config['num_epochs']}") 230 | logger.info(f" Sample batch size per device = {config['sample']['batch_size']}") 231 | logger.info(f" Train batch size per device = {config['train']['batch_size']}") 232 | logger.info( 233 | f" Gradient Accumulation steps = {config['train']['gradient_accumulation_steps']}" 234 | ) 235 | logger.info("") 236 | logger.info(f" Total number of samples per epoch = test_bs * num_batch_per_epoch * num_process = {samples_per_epoch}") 237 | logger.info( 238 | f" Total train batch size (w. parallel, distributed & accumulation) = train_bs * grad_accumul * num_process = {total_train_batch_size}" 239 | ) 240 | logger.info( 241 | f" Number of gradient updates per inner epoch = samples_per_epoch // total_train_batch_size = {samples_per_epoch // total_train_batch_size}" 242 | ) 243 | logger.info(f" Number of inner epochs = {config['train']['num_inner_epochs']}") 244 | 245 | assert config['sample']['batch_size'] >= config['train']['batch_size'] 246 | assert config['sample']['batch_size'] % config['train']['batch_size'] == 0 # not necessary 247 | assert samples_per_epoch % total_train_batch_size == 0 248 | 249 | first_epoch = 0 250 | global_step = 0 251 | for epoch in range(first_epoch, config['num_epochs']): 252 | if config['train']['anneal'] in ["linear"]: 253 | ratio = min(1, epoch / (0.5 * config['num_epochs'])) 254 | else: 255 | ratio = 1. 
256 | reward_exp_ep = config['train']['reward_exp'] * ratio 257 | def reward_transform(value): 258 | return value * reward_exp_ep 259 | 260 | num_diffusion_steps = config['sample']['num_steps'] 261 | pipeline.scheduler.set_timesteps(num_diffusion_steps, device=device) # set_timesteps(): 1000 steps -> 50 steps 262 | scheduler_dt = pipeline.scheduler.timesteps[0] - pipeline.scheduler.timesteps[1] 263 | num_train_timesteps = int(num_diffusion_steps * config['train']['timestep_fraction']) 264 | accumulation_steps = config['train']['gradient_accumulation_steps'] * num_train_timesteps 265 | 266 | #################### SAMPLING #################### 267 | torch.cuda.empty_cache() 268 | unet.zero_grad() 269 | unet.eval() 270 | flow_model.zero_grad() 271 | 272 | if True: 273 | with torch.inference_mode(): # similar to torch.no_grad() but also disables autograd.grad() 274 | samples = [] 275 | prompts = [] 276 | for i in tqdm( 277 | range(config['sample']['num_batches_per_epoch']), 278 | desc=f"Epoch {epoch}: sampling", 279 | disable=not is_local_main_process, 280 | position=0, 281 | ): 282 | # generate prompts 283 | prompts, prompt_metadata = zip( 284 | *[ 285 | prompt_fn(**config['prompt_fn_kwargs']) 286 | for _ in range(config['sample']['batch_size']) 287 | ] 288 | ) 289 | 290 | # encode prompts 291 | prompt_ids = pipeline.tokenizer( 292 | prompts, 293 | return_tensors="pt", 294 | padding="max_length", 295 | truncation=True, 296 | max_length=pipeline.tokenizer.model_max_length, 297 | ).input_ids.to(device) 298 | prompt_embeds = pipeline.text_encoder(prompt_ids)[0] 299 | 300 | # sample 301 | with autocast(): 302 | ret_tuple = pipeline_with_logprob( 303 | pipeline, 304 | prompt_embeds=prompt_embeds, 305 | negative_prompt_embeds=sample_neg_prompt_embeds, 306 | num_inference_steps=num_diffusion_steps, 307 | guidance_scale=config['sample']['guidance_scale'], 308 | eta=config['sample']['eta'], 309 | output_type="pt", 310 | 311 | return_unetoutput=config['train']['unetreg'] > 0., 
312 | ) 313 | if config['train']['unetreg'] > 0: 314 | images, _, latents, log_probs, unet_outputs = ret_tuple 315 | unet_outputs = torch.stack(unet_outputs, dim=1) # (batch_size, num_steps, 3, 32, 32) 316 | else: 317 | images, _, latents, log_probs = ret_tuple 318 | 319 | latents = torch.stack(latents, dim=1) # (batch_size, num_steps + 1, 4, 64, 64) 320 | log_probs = torch.stack(log_probs, dim=1) # (batch_size, num_steps, 1) 321 | timesteps = pipeline.scheduler.timesteps.repeat( 322 | config['sample']['batch_size'], 1 323 | ) # (bs, num_steps) (981, 961, ..., 21, 1) corresponds to "next_latents" 324 | 325 | rewards = reward_fn(images, prompts, prompt_metadata) # (reward, reward_metadata) 326 | 327 | samples.append( 328 | { 329 | "prompts": prompts, # tuple of strings 330 | "prompt_metadata": prompt_metadata, 331 | 332 | "prompt_ids": prompt_ids, 333 | "prompt_embeds": prompt_embeds, 334 | "timesteps": timesteps, 335 | "latents": latents[ 336 | :, :-1 337 | ], # each entry is the latent before timestep t 338 | "next_latents": latents[ 339 | :, 1: 340 | ], # each entry is the latent after timestep t 341 | "log_probs": log_probs, 342 | "rewards": rewards, 343 | } 344 | ) 345 | if config['train']['unetreg'] > 0: 346 | samples[-1]["unet_outputs"] = unet_outputs 347 | 348 | # wait for all rewards to be computed 349 | for sample in tqdm( 350 | samples, 351 | desc="Waiting for rewards", 352 | disable=not is_local_main_process, 353 | position=0, 354 | ): 355 | rewards, reward_metadata = sample["rewards"] 356 | sample["rewards"] = torch.as_tensor(rewards, device=device) 357 | 358 | # collate samples into dict where each entry has shape (num_batches_per_epoch * sample.batch_size, ...) 359 | new_samples = {} 360 | for k in samples[0].keys(): 361 | if k in ["prompts", "prompt_metadata"]: 362 | # list of tuples [('cat', 'dog'), ('cat', 'tiger'), ...] -> list ['cat', 'dog', 'cat', 'tiger', ...] 
363 | new_samples[k] = [item for s in samples for item in s[k]] 364 | else: 365 | new_samples[k] = torch.cat([s[k] for s in samples]) 366 | samples = new_samples 367 | 368 | # this is a hack to force wandb to log the images as JPEGs instead of PNGs 369 | with tempfile.TemporaryDirectory() as tmpdir: 370 | for i, image in enumerate(images): 371 | # bf16 cannot be converted to numpy directly 372 | pil = Image.fromarray( 373 | (image.cpu().float().numpy().transpose(1, 2, 0) * 255).astype(np.uint8) 374 | ) 375 | pil = pil.resize((256, 256)) 376 | pil.save(os.path.join(tmpdir, f"{i}.jpg")) 377 | if config['wandb'] and is_local_main_process: 378 | wandb.log( 379 | { 380 | "images": [ 381 | wandb.Image( 382 | os.path.join(tmpdir, f"{i}.jpg"), 383 | caption=f"{prompt} | {reward:.2f}", 384 | ) 385 | for i, (prompt, reward) in enumerate( 386 | zip(prompts, rewards) 387 | ) 388 | ], 389 | }, 390 | step=global_step, 391 | ) 392 | 393 | rewards = torch.zeros(world_size * len(samples["rewards"]), 394 | dtype=samples["rewards"].dtype, device=device) 395 | dist.all_gather_into_tensor(rewards, samples["rewards"]) 396 | rewards = rewards.cpu().float().numpy() 397 | result["reward_mean"][global_step] = rewards.mean() 398 | result["reward_std"][global_step] = rewards.std() 399 | 400 | if is_local_main_process: 401 | logger.info(f"global_step: {global_step} rewards: {rewards.mean().item():.3f}") 402 | if config['wandb']: 403 | wandb.log( 404 | { 405 | "reward_mean": rewards.mean(), # samples["rewards"].mean() 406 | "reward_std": rewards.std(), 407 | }, 408 | step=global_step, 409 | ) 410 | 411 | del samples["prompt_ids"] 412 | 413 | total_batch_size, num_timesteps = samples["timesteps"].shape 414 | assert ( 415 | total_batch_size 416 | == config['sample']['batch_size'] * config['sample']['num_batches_per_epoch'] 417 | ) 418 | assert num_timesteps == num_diffusion_steps 419 | 420 | #################### TRAINING #################### 421 | for inner_epoch in 
range(config['train']['num_inner_epochs']): 422 | # shuffle samples along batch dimension 423 | perm = torch.randperm(total_batch_size, device=device) 424 | for k, v in samples.items(): 425 | if k in ["prompts", "prompt_metadata"]: 426 | samples[k] = [v[i] for i in perm] 427 | elif k in ["unet_outputs"]: 428 | samples[k] = v[perm] 429 | else: 430 | samples[k] = v[perm] 431 | 432 | perms = torch.stack( 433 | [ 434 | torch.randperm(num_timesteps, device=device) 435 | for _ in range(total_batch_size) 436 | ] 437 | ) # (total_batch_size, num_steps) 438 | # "prompts" & "prompt_metadata" are constant along time dimension 439 | key_ls = ["timesteps", "latents", "next_latents", "log_probs"] 440 | for key in key_ls: 441 | samples[key] = samples[key][torch.arange(total_batch_size, device=device)[:, None], perms] 442 | if config['train']['unetreg'] > 0: 443 | samples["unet_outputs"] = \ 444 | samples["unet_outputs"][torch.arange(total_batch_size, device=device)[:, None], perms] 445 | 446 | ### rebatch for training 447 | samples_batched = {} 448 | for k, v in samples.items(): 449 | if k in ["prompts", "prompt_metadata"]: 450 | samples_batched[k] = [v[i:i + config['train']['batch_size']] 451 | for i in range(0, len(v), config['train']['batch_size'])] 452 | elif k in ["unet_outputs"]: 453 | samples_batched[k] = v.reshape(-1, config['train']['batch_size'], *v.shape[1:]) 454 | else: 455 | samples_batched[k] = v.reshape(-1, config['train']['batch_size'], *v.shape[1:]) 456 | 457 | # dict of lists -> list of dicts for easier iteration 458 | samples_batched = [ 459 | dict(zip(samples_batched, x)) for x in zip(*samples_batched.values()) 460 | ] # len = sample_bs * num_batches_per_epoch // train_bs = num_train_batches_per_epoch 461 | 462 | unet.train() 463 | flow_model.train() 464 | info = defaultdict(list) 465 | for i, sample in tqdm( 466 | list(enumerate(samples_batched)), 467 | desc=f"Epoch {epoch}.{inner_epoch}: training", 468 | position=0, 469 | disable=not is_local_main_process, 
470 | ): 471 | """ 472 | sample: [ 473 | ('prompts', list of strings, len=train_bs), ('prompt_metadata', list of dicts), 474 | (bf16) ('prompt_embeds', torch.Size([1, 77, 768])), 475 | (int64) ('timesteps', torch.Size([1, 50])), 476 | (bf16) ('latents', torch.Size([1, 50, 4, 64, 64])), ('next_latents', torch.Size([1, 50, 4, 64, 64])), 477 | ('log_probs', torch.Size([1, 50])), 478 | ] 479 | """ 480 | if config['train']['cfg']: 481 | # concat negative prompts to sample prompts to avoid two forward passes 482 | embeds = torch.cat( 483 | [train_neg_prompt_embeds, sample["prompt_embeds"]] 484 | ) 485 | else: 486 | embeds = sample["prompt_embeds"] 487 | 488 | for j in tqdm(range(num_train_timesteps), desc="Timestep", position=1, leave=False, disable=not is_local_main_process): 489 | with autocast(): 490 | if config['train']['cfg']: 491 | noise_pred = unet( 492 | torch.cat([sample["latents"][:, j]] * 2), 493 | torch.cat([sample["timesteps"][:, j]] * 2), 494 | embeds, 495 | ).sample 496 | noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) 497 | noise_pred = ( 498 | noise_pred_uncond 499 | + config['sample']['guidance_scale'] 500 | * (noise_pred_text - noise_pred_uncond) 501 | ) 502 | if config['train']['unetreg'] > 0: 503 | unetreg = (noise_pred - sample["unet_outputs"][:, j])**2 504 | unetreg = torch.mean(unetreg, dim=(1, 2, 3)) 505 | 506 | else: 507 | noise_pred = unet( 508 | sample["latents"][:, j], 509 | sample["timesteps"][:, j], 510 | embeds, 511 | ).sample # (bs, 4, 64, 64) 512 | if config['train']['unetreg'] > 0: 513 | unetreg = (noise_pred - sample["unet_outputs"][:, j])**2 514 | 515 | _, log_pf, log_pb = ddim_step_with_logprob( 516 | pipeline.scheduler, noise_pred, 517 | sample["timesteps"][:, j], # (train_bs, 50) -> (train_bs,) 518 | sample["latents"][:, j], eta=config['sample']['eta'], 519 | prev_sample=sample["next_latents"][:, j], calculate_pb=True, 520 | ) # log_pf :(bs,) 521 | 522 | ####################################################### 523 | 
#################### GFN ALGORITHM #################### 524 | ####################################################### 525 | with autocast_flow(): 526 | flow = flow_model(sample["latents"][:, j], sample["timesteps"][:, j], sample["prompt_embeds"]) 527 | timestep_next = torch.clamp(sample["timesteps"][:, j] - scheduler_dt, min=0) 528 | flow_next = flow_model(sample["next_latents"][:, j], timestep_next, sample["prompt_embeds"]) 529 | 530 | with autocast(), torch.no_grad(): 531 | unet_output = unet(sample["latents"][:, j], sample["timesteps"][:, j], sample["prompt_embeds"]).sample 532 | latent = pred_orig_latent(pipeline.scheduler, unet_output, sample["latents"][:, j], sample["timesteps"][:, j]) 533 | with torch.inference_mode(): 534 | logr_tmp = reward_fn(decode(latent), sample["prompts"], sample["prompt_metadata"])[0] # tuple -> tensor 535 | logr = reward_transform(logr_tmp) 536 | flow = flow + logr # bf16 + float32 -> float32 537 | 538 | with autocast(), torch.no_grad(): 539 | unet_output = unet(sample["next_latents"][:, j], timestep_next, sample["prompt_embeds"]).sample 540 | latent_next = pred_orig_latent(pipeline.scheduler, unet_output, sample["next_latents"][:, j], timestep_next) 541 | with torch.inference_mode(): 542 | logr_next_tmp = reward_fn(decode(latent_next), sample["prompts"], sample["prompt_metadata"])[0] 543 | logr_next = reward_transform(logr_next_tmp) 544 | flow_next = flow_next + logr_next 545 | end_mask = sample["timesteps"][:, j] == pipeline.scheduler.timesteps[-1] # RHS is 1 546 | flow_next[end_mask] = reward_transform(sample['rewards'][end_mask].to(flow_next)) 547 | 548 | info["log_pf"].append(torch.mean(log_pf).detach()) 549 | info["flow"].append(torch.mean(flow).detach()) 550 | info["log_pb"].append(torch.mean(log_pb).detach()) 551 | 552 | if config['train']['klpf'] > 0: 553 | losses_flow = (flow + log_pf.detach() - log_pb.detach() - flow_next) ** 2 554 | 555 | flow_next_klpf = flow_next.detach() 556 | log_pb_klpf, log_pf_klpf = 
log_pb.detach(), log_pf.detach() 557 | reward_db = (flow_next_klpf + log_pb_klpf - log_pf_klpf - flow).detach() 558 | 559 | # different gpu has different states, so cannot share a baseline 560 | assert len(reward_db) > 1 561 | rloo_baseline = (reward_db.sum() - reward_db) / (len(reward_db) - 1) 562 | reward_ = (reward_db - rloo_baseline) ** 2 563 | rloo_var = (reward_.sum() - reward_) / (len(reward_db) - 1) 564 | advantages = (reward_db - rloo_baseline) / (rloo_var.sqrt() + 1e-8) 565 | advantages = torch.clamp(advantages, -config['train']['adv_clip_max'], config['train']['adv_clip_max']) 566 | 567 | ratio = torch.exp(log_pf - sample["log_probs"][:, j]) 568 | unclipped_losses = -advantages * ratio 569 | clipped_losses = -advantages * torch.clamp( 570 | ratio, 571 | 1.0 - config['train']['clip_range'], 572 | 1.0 + config['train']['clip_range'], 573 | ) 574 | losses_klpf = torch.maximum(unclipped_losses, clipped_losses) 575 | info["ratio"].append(torch.mean(ratio).detach()) 576 | 577 | losses = losses_flow + config['train']['klpf'] * losses_klpf 578 | info["loss"].append(losses_flow.mean().detach()) 579 | info["loss_klpf"].append(losses_klpf.mean().detach()) 580 | torch.cuda.empty_cache() # clear comp graph for log_pf_next 581 | else: 582 | losses_gfn = (flow + log_pf - log_pb - flow_next) ** 2 # (bs,) 583 | info["loss"].append(losses_gfn.mean().detach()) 584 | losses = losses_gfn 585 | 586 | if config['train']['unetreg'] > 0: 587 | losses = losses + config['train']['unetreg'] * unetreg 588 | info["unetreg"].append(unetreg.mean().detach()) 589 | loss = torch.mean(losses) 590 | 591 | if logr_tmp is not None: 592 | info["logr"].append(torch.mean(logr_tmp).detach()) 593 | 594 | loss = loss / accumulation_steps 595 | if scaler: 596 | # Backward passes under autocast are not recommended 597 | scaler.scale(loss).backward() 598 | else: 599 | loss.backward() 600 | 601 | # prevent OOM 602 | image_next = image = prev_sample_klpf = unet_output = latent = latent_next = 
latent_next_next = None 603 | noise_pred_next_uncond = noise_pred_next_text = noise_pred_uncond = noise_pred_text = noise_pred = noise_pred_next = None 604 | flow = flow_next = flow_next_next = logr = logr_next = logr_next_next = logr_next_tmp = logr_tmp = reward_db = advantages = None 605 | _ = log_pf = log_pb = log_pf_next = log_pb_next = log_pf_klpf = log_pb_klpf = None 606 | unetreg = unetreg_initial = losses = losses_flow = losses_klpf = losses_gfn = None 607 | 608 | if ((j == num_train_timesteps - 1) and 609 | (i + 1) % config['train']['gradient_accumulation_steps'] == 0): 610 | if scaler: 611 | scaler.unscale_(optimizer) 612 | torch.nn.utils.clip_grad_norm_(unet.parameters(), config['train']['max_grad_norm']) 613 | torch.nn.utils.clip_grad_norm_(flow_model.parameters(), config['train']['max_grad_norm']) 614 | scaler.step(optimizer) 615 | scaler.update() 616 | else: 617 | torch.nn.utils.clip_grad_norm_(unet.parameters(), config['train']['max_grad_norm']) 618 | torch.nn.utils.clip_grad_norm_(flow_model.parameters(), config['train']['max_grad_norm']) 619 | optimizer.step() 620 | optimizer.zero_grad() 621 | global_step += 1 622 | 623 | info = {k: torch.mean(torch.stack(v)) for k, v in info.items()} 624 | dist.barrier() 625 | for k, v in info.items(): 626 | dist.all_reduce(v, op=dist.ReduceOp.SUM) 627 | info = {k: v / num_processes for k, v in info.items()} 628 | for k, v in info.items(): 629 | result[k][global_step] = v.item() 630 | 631 | info.update({"epoch": epoch}) 632 | result["epoch"][global_step] = epoch 633 | result["time"][global_step] = time.time() - start_time 634 | 635 | if is_local_main_process: 636 | if config['wandb']: 637 | wandb.log(info, step=global_step) 638 | logger.info(f"global_step={global_step} " + 639 | " ".join([f"{k}={v:.3f}" for k, v in info.items()])) 640 | info = defaultdict(list) # reset info dict 641 | 642 | if is_local_main_process: 643 | pickle.dump(result, gzip.open(os.path.join(output_dir, f"result.json"), 'wb')) 644 | 
dist.barrier() 645 | 646 | if epoch % config['save_freq'] == 0 or epoch == config['num_epochs'] - 1: 647 | if is_local_main_process: 648 | save_path = os.path.join(output_dir, f"checkpoint_epoch{epoch}") 649 | unwrapped_unet = unwrap_model(unet) 650 | unet_lora_state_dict = convert_state_dict_to_diffusers( 651 | get_peft_model_state_dict(unwrapped_unet) 652 | ) 653 | StableDiffusionPipeline.save_lora_weights( 654 | save_directory=save_path, 655 | unet_lora_layers=unet_lora_state_dict, 656 | is_main_process=is_local_main_process, 657 | safe_serialization=True, 658 | ) 659 | logger.info(f"Saved state to {save_path}") 660 | 661 | dist.barrier() 662 | 663 | if config['wandb'] and is_local_main_process: 664 | wandb.finish() 665 | 666 | 667 | if __name__ == "__main__": 668 | main() 669 | dist.destroy_process_group() 670 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # For licensing see accompanying LICENSE file. 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved. 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name="diffusion-alignment-pytorch", 8 | version="0.0.1", 9 | python_requires=">=3.8", 10 | install_requires=[ 11 | "ml-collections", 12 | "absl-py", 13 | "diffusers[torch]>=0.29.0", # 0.29.0 supports SD3 14 | "accelerate", 15 | "torchvision", 16 | "inflect==6.0.4", 17 | "pydantic==1.10.13", 18 | 19 | "wandb", 20 | "ipdb", 21 | "line_profiler", 22 | "timm", 23 | "termcolor", 24 | "openai-clip", 25 | "image-reward", 26 | "ipykernel", 27 | "clint", 28 | "torchmetrics[image]>=1.4.0", # using [image] to install torch-fidelity 29 | "peft>=0.6.0", 30 | "transformers>=4.41.2" 31 | "einops", 32 | "torchdiffeq", 33 | ], 34 | ) 35 | --------------------------------------------------------------------------------