├── .gitignore
├── ACKNOWLEDGEMENTS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── alignment
    ├── aesthetic_scorer.py
    ├── assets
    │   ├── DrawBench Prompts.csv
    │   ├── HPDv2
    │   │   ├── benchmark_anime.json
    │   │   ├── benchmark_concept-art.json
    │   │   ├── benchmark_paintings.json
    │   │   └── benchmark_photo.json
    │   ├── activities.txt
    │   ├── activities_v0.txt
    │   ├── drawbench.json
    │   ├── imagenet_classes.txt
    │   ├── sac+logos+ava1-l14-linearMSE.pth
    │   └── simple_animals.txt
    ├── diffusers_patch
    │   ├── ddim_with_logprob.py
    │   └── pipeline_with_logprob.py
    ├── flow.py
    ├── model_configs
    │   └── ViT-H-14.json
    ├── prompts.py
    ├── rewards.py
    └── utils.py
├── config
    └── sd.yaml
├── scripts
    ├── distributed.py
    └── train_gfn.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | *.DS_Store
  2 | .idea/
  3 | .vscode/
  4 | *.mps
  5 | wandb/
  6 | .psync
  7 | *.out
  8 | *.pt
  9 | *.pkl
 10 | *.ipynb_checkpoints
 11 | .ipynb
 12 | *.pdf
 13 | *.png
 14 | *.whl
 15 | *.tar.gz
 16 | 
 17 | # Byte-compiled / optimized / DLL files
 18 | __pycache__/
 19 | *.py[cod]
 20 | *$py.class
 21 | 
 22 | # C extensions
 23 | *.so
 24 | 
 25 | # Distribution / packaging
 26 | .Python
 27 | build/
 28 | develop-eggs/
 29 | dist/
 30 | downloads/
 31 | eggs/
 32 | .eggs/
 33 | lib/
 34 | lib64/
 35 | parts/
 36 | sdist/
 37 | var/
 38 | wheels/
 39 | pip-wheel-metadata/
 40 | share/python-wheels/
 41 | *.egg-info/
 42 | .installed.cfg
 43 | *.egg
 44 | MANIFEST
 45 | 
 46 | # PyInstaller
 47 | #  Usually these files are written by a python script from a template
 48 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 49 | *.manifest
 50 | *.spec
 51 | 
 52 | # Installer logs
 53 | pip-log.txt
 54 | pip-delete-this-directory.txt
 55 | 
 56 | # Unit test / coverage reports
 57 | htmlcov/
 58 | .tox/
 59 | .nox/
 60 | .coverage
 61 | .coverage.*
 62 | .cache
 63 | nosetests.xml
 64 | coverage.xml
 65 | *.cover
 66 | *.py,cover
 67 | .hypothesis/
 68 | .pytest_cache/
 69 | 
 70 | # Translations
 71 | *.mo
 72 | *.pot
 73 | 
 74 | # Django stuff:
 75 | *.log
 76 | local_settings.py
 77 | db.sqlite3
 78 | db.sqlite3-journal
 79 | 
 80 | # Flask stuff:
 81 | instance/
 82 | .webassets-cache
 83 | 
 84 | # Scrapy stuff:
 85 | .scrapy
 86 | 
 87 | # Sphinx documentation
 88 | docs/_build/
 89 | 
 90 | # PyBuilder
 91 | target/
 92 | 
 93 | # Jupyter Notebook
 94 | .ipynb_checkpoints
 95 | 
 96 | # IPython
 97 | profile_default/
 98 | ipython_config.py
 99 | 
100 | # pyenv
101 | .python-version
102 | 
103 | # pipenv
104 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
106 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
107 | #   install all needed dependencies.
108 | #Pipfile.lock
109 | 
110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
111 | __pypackages__/
112 | 
113 | # Celery stuff
114 | celerybeat-schedule
115 | celerybeat.pid
116 | 
117 | # SageMath parsed files
118 | *.sage.py
119 | 
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 | 
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 | 
133 | # Rope project settings
134 | .ropeproject
135 | 
136 | # mkdocs documentation
137 | /site
138 | 
139 | # mypy
140 | .mypy_cache/
141 | .dmypy.json
142 | dmypy.json
143 | 
144 | # Pyre type checker
145 | .pyre/
146 | 


--------------------------------------------------------------------------------
/ACKNOWLEDGEMENTS:
--------------------------------------------------------------------------------
 1 | Acknowledgements
 2 | Portions of this Diffusion Alignment GFlowNet Software may utilize the following copyrighted
 3 | material, the use of which is hereby acknowledged.
 4 | 
 5 | _____________________
 6 | 
 7 | Kevin Black (ddpo-pytorch)
 8 | 
 9 |         Copyright (c) 2023 Kevin Black
10 | 
11 |         Permission is hereby granted, free of charge, to any person obtaining a copy
12 |         of this software and associated documentation files (the "Software"), to deal
13 |         in the Software without restriction, including without limitation the rights
14 |         to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 |         copies of the Software, and to permit persons to whom the Software is
16 |         furnished to do so, subject to the following conditions:
17 | 
18 |         The above copyright notice and this permission notice shall be included in all
19 |         copies or substantial portions of the Software.
20 | 
21 |         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 |         IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 |         FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 |         AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 |         LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 |         OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 |         SOFTWARE.


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
71 | available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contribution Guide
 2 | 
 3 | Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.
 4 | 
  5 | While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.
 6 | 
 7 | ## Before you get started
 8 | 
 9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community,
10 | and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).
11 | 
12 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).
13 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2024 Apple Inc. All Rights Reserved.
 2 | 
 3 | IMPORTANT:  This Apple software is supplied to you by Apple
 4 | Inc. ("Apple") in consideration of your agreement to the following
 5 | terms, and your use, installation, modification or redistribution of
 6 | this Apple software constitutes acceptance of these terms.  If you do
 7 | not agree with these terms, please do not use, install, modify or
 8 | redistribute this Apple software.
 9 | 
10 | In consideration of your agreement to abide by the following terms, and
11 | subject to these terms, Apple grants you a personal, non-exclusive
12 | license, under Apple's copyrights in this original Apple software (the
13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple
14 | Software, with or without modifications, in source and/or binary forms;
15 | provided that if you redistribute the Apple Software in its entirety and
16 | without modifications, you must retain this notice and the following
17 | text and disclaimers in all such redistributions of the Apple Software.
18 | Neither the name, trademarks, service marks or logos of Apple Inc. may
19 | be used to endorse or promote products derived from the Apple Software
20 | without specific prior written permission from Apple.  Except as
21 | expressly stated in this notice, no other rights or licenses, express or
22 | implied, are granted by Apple herein, including but not limited to any
23 | patent rights that may be infringed by your derivative works or by other
24 | works in which the Apple Software may be incorporated.
25 | 
26 | The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
31 | 
32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
39 | POSSIBILITY OF SUCH DAMAGE.
40 | 
41 | -------------------------------------------------------------------------------
42 | SOFTWARE DISTRIBUTED WITH Diffusion Alignment GFlowNet:
43 | 
44 | This software includes a number of subcomponents with separate
45 | copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
46 | -------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Diffusion Alignment GFlowNet
 2 | 
 3 | This is the PyTorch implementation for our paper [Improving GFlowNets for Text-to-Image Diffusion Alignment
 4 | ](https://arxiv.org/abs/2406.00633).
 5 | 
 6 | This work proposes methods to align text-to-image diffusion models with given reward functions 
 7 | through the algorithmic framework of GFlowNet. 
 8 | We provide code for DAG (diffusion alignment with gflownet) 
 9 | and DAG-KL (DAG with KL divergence based gradient). For more details, we refer to our paper.
10 | 
11 | ## Installation
12 | 
13 | ```bash
14 | pip install -e .
15 | ```
16 | 
17 | ## Usage
18 | 
19 | Diffusion alignment training with GFlowNet on Stable Diffusion:
20 | ```bash
21 | torchrun --standalone --nproc_per_node=8 scripts/train_gfn.py
22 | ```
 23 | To use DAG-KL, set `config['train']['klpf']` in `config/sd.yaml` to a positive coefficient.
24 | 
25 | 
26 | ## Important Hyperparameters
27 | 
28 | A detailed explanation of all the hyperparameters can be found in `config/sd.yaml`. 
29 | 
30 | ### prompt_fn and reward_fn
31 | At a high level, the problem of finetuning a diffusion model is defined by 2 things: 
32 | a set of prompts to generate images, and a reward function to evaluate those images. 
33 | The prompts are defined by a `prompt_fn` which takes no arguments and 
34 | generates a random prompt each time it is called. 
35 | The reward function is defined by a `reward_fn` which takes in a batch of images and returns 
36 | a batch of rewards for those images. All of the prompt and reward functions currently implemented can be
37 | found in `alignment/prompts.py` and `alignment/rewards.py`, respectively.
38 | 
39 | ## Acknowledgements
40 | 
 41 | We thank the authors of the [ddpo-pytorch](https://github.com/kvablack/ddpo-pytorch) repository for open-sourcing their code,
 42 | on which part of our code is based.
43 | 
44 | 
45 | # Citation
46 | If you find this code useful, please consider citing our paper:
47 | ```
48 | @article{diffusion_alignment_gfn,
49 |   title={Improving GFlowNets for Text-to-Image Diffusion Alignment},
50 |   author={Dinghuai Zhang and Yizhe Zhang and Jiatao Gu and Ruixiang Zhang and Josh Susskind and Navdeep Jaitly and Shuangfei Zhai},
51 |   journal={Arxiv},
52 |   year={2024},
53 |   url={https://arxiv.org/abs/2406.00633}, 
54 | }
55 | ```


--------------------------------------------------------------------------------
/alignment/aesthetic_scorer.py:
--------------------------------------------------------------------------------
 1 | # For licensing see accompanying LICENSE file.
 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
 3 | 
 4 | # Based on https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/fe88a163f4661b4ddabba0751ff645e2e620746e/simple_inference.py
 5 | 
 6 | import torch
 7 | import torch.nn as nn
 8 | import numpy as np
 9 | from transformers import CLIPModel, CLIPProcessor
10 | from PIL import Image
11 | 
12 | import torch.distributed as dist
13 | from scripts.distributed import get_local_rank
14 | 
import sys
# `files` is imported from the stdlib `importlib.resources` on Python >= 3.9;
# older interpreters fall back to the `importlib_resources` package instead.
if sys.version_info < (3, 9):
    from importlib_resources import files
else:
    from importlib.resources import files
# Resource handle for the `alignment.assets` package; the scorer below joins
# the MLP checkpoint file name onto it.
ASSETS_PATH = files("alignment.assets")
21 | 
22 | 
class MLP(nn.Module):
    """Stack of linear layers reducing a 768-dim embedding to one scalar.

    The stack contains only ``nn.Linear`` and ``nn.Dropout`` modules (no
    activation functions). The position of every module inside the
    ``nn.Sequential`` determines its state-dict key (``layers.0``,
    ``layers.2``, ...), so the ordering below must not change if previously
    saved weights are to be loaded.
    """

    def __init__(self):
        super().__init__()
        stack = [
            nn.Linear(768, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        ]
        self.layers = nn.Sequential(*stack)

    # Gradient tracking is intentionally left enabled (no ``torch.no_grad``).
    def forward(self, embed):
        """Run ``embed`` through the layer stack and return the result."""
        score = self.layers(embed)
        return score
40 | 
41 | 
class AestheticScorer(torch.nn.Module):
    """Score images with CLIP image features followed by the ``MLP`` head.

    Args:
        dtype: dtype the preprocessed inputs are cast to before they are
            passed to the CLIP model.
        distributed: when True, local rank 0 loads the pretrained CLIP
            weights first while the other ranks wait on a barrier, so the
            download happens only once. This calls ``dist.barrier()`` and
            therefore presumably requires an initialised process group --
            confirm against the launcher.
    """

    def __init__(self, dtype, distributed=True):
        super().__init__()
        clip_name = "openai/clip-vit-large-patch14"

        # "Rank 0 first" pattern: every rank loads the model exactly once and
        # hits the barrier exactly once. Non-zero local ranks block *before*
        # loading; local rank 0 releases them *after* it has finished loading
        # (and, if necessary, downloading) the weights.
        waits_for_download = distributed and get_local_rank() != 0
        if waits_for_download:
            dist.barrier()
        self.clip = CLIPModel.from_pretrained(clip_name)
        self.processor = CLIPProcessor.from_pretrained(clip_name)
        if distributed and not waits_for_download:
            dist.barrier()

        self.mlp = MLP()
        # Load onto CPU regardless of the device the checkpoint was saved
        # from; the module is moved to its final device by the caller.
        state_dict = torch.load(
            ASSETS_PATH.joinpath("sac+logos+ava1-l14-linearMSE.pth"),
            map_location="cpu",
        )
        self.mlp.load_state_dict(state_dict)
        self.dtype = dtype
        self.eval()

    # Gradient tracking is intentionally left enabled (no ``torch.no_grad``).
    def forward(self, images):
        """Return one score per image.

        Args:
            images: images in any form accepted by the CLIP processor's
                ``images=`` argument.

        Returns:
            The MLP output with dimension 1 squeezed away.
        """
        device = next(self.parameters()).device
        inputs = self.processor(images=images, return_tensors="pt")
        inputs = {k: v.to(self.dtype).to(device) for k, v in inputs.items()}
        embed = self.clip.get_image_features(**inputs)
        # normalize embedding to unit L2 norm along the feature dimension
        embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True)
        return self.mlp(embed).squeeze(1)


--------------------------------------------------------------------------------
/alignment/assets/DrawBench Prompts.csv:
--------------------------------------------------------------------------------
  1 | Prompts,Category
  2 | A red colored car.,Colors
  3 | A black colored car.,Colors
  4 | A pink colored car.,Colors
  5 | A black colored dog.,Colors
  6 | A red colored dog.,Colors
  7 | A blue colored dog.,Colors
  8 | A green colored banana.,Colors
  9 | A red colored banana.,Colors
 10 | A black colored banana.,Colors
 11 | A white colored sandwich.,Colors
 12 | A black colored sandwich.,Colors
 13 | An orange colored sandwich.,Colors
 14 | A pink colored giraffe.,Colors
 15 | A yellow colored giraffe.,Colors
 16 | A brown colored giraffe.,Colors
 17 | A red car and a white sheep.,Colors
 18 | A blue bird and a brown bear.,Colors
 19 | A green apple and a black backpack.,Colors
 20 | A green cup and a blue cell phone.,Colors
 21 | A yellow book and a red vase.,Colors
 22 | A white car and a red sheep.,Colors
 23 | A brown bird and a blue bear.,Colors
 24 | A black apple and a green backpack.,Colors
 25 | A blue cup and a green cell phone.,Colors
 26 | A red book and a yellow vase.,Colors
 27 | A horse riding an astronaut.,Conflicting
 28 | A pizza cooking an oven.,Conflicting
 29 | A bird scaring a scarecrow.,Conflicting
 30 | A blue coloured pizza.,Conflicting
 31 | Hovering cow abducting aliens.,Conflicting
 32 | A panda making latte art.,Conflicting
 33 | A shark in the desert.,Conflicting
 34 | An elephant under the sea.,Conflicting
 35 | Rainbow coloured penguin.,Conflicting
 36 | A fish eating a pelican.,Conflicting
 37 | One car on the street.,Counting
 38 | Two cars on the street.,Counting
 39 | Three cars on the street.,Counting
 40 | Four cars on the street.,Counting
 41 | Five cars on the street.,Counting
 42 | One dog on the street.,Counting
 43 | Two dogs on the street.,Counting
 44 | Three dogs on the street.,Counting
 45 | Four dogs on the street.,Counting
 46 | Five dogs on the street.,Counting
 47 | One cat and one dog sitting on the grass.,Counting
 48 | One cat and two dogs sitting on the grass.,Counting
 49 | One cat and three dogs sitting on the grass.,Counting
 50 | Two cats and one dog sitting on the grass.,Counting
 51 | Two cats and two dogs sitting on the grass.,Counting
 52 | Two cats and three dogs sitting on the grass.,Counting
 53 | Three cats and one dog sitting on the grass.,Counting
 54 | Three cats and two dogs sitting on the grass.,Counting
 55 | Three cats and three dogs sitting on the grass.,Counting
 56 | A triangular purple flower pot. A purple flower pot in the shape of a triangle.,DALL-E
 57 | A triangular orange picture frame. An orange picture frame in the shape of a triangle.,DALL-E
 58 | A triangular pink stop sign. A pink stop sign in the shape of a triangle.,DALL-E
 59 | A cube made of denim. A cube with the texture of denim.,DALL-E
 60 | A sphere made of kitchen tile. A sphere with the texture of kitchen tile.,DALL-E
 61 | A cube made of brick. A cube with the texture of brick.,DALL-E
 62 | A collection of nail is sitting on a table.,DALL-E
 63 | A single clock is sitting on a table.,DALL-E
 64 | A couple of glasses are sitting on a table.,DALL-E
 65 | An illustration of a large red elephant sitting on a small blue mouse.,DALL-E
 66 | An illustration of a small green elephant standing behind a large red mouse.,DALL-E
 67 | A small blue book sitting on a large red book.,DALL-E
 68 | "A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.",DALL-E
 69 | "A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.",DALL-E
 70 | "A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.",DALL-E
 71 | "An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.",DALL-E
 72 | "An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.",DALL-E
 73 | A fisheye lens view of a turtle sitting in a forest.,DALL-E
 74 | A side view of an owl sitting in a field.,DALL-E
 75 | A cross-section view of a brain.,DALL-E
 76 | "A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.",Descriptions
 77 | "A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.",Descriptions
 78 | "A small vessel propelled on water by oars, sails, or an engine.",Descriptions
 79 | A connection point by which firefighters can tap into a water supply.,Descriptions
 80 | "A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.",Descriptions
 81 | "A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.",Descriptions
 82 | "A separate seat for one person, typically with a back and four legs.",Descriptions
 83 | An appliance or compartment which is artificially kept cool and used to store food and drink.,Descriptions
 84 | A mechanical or electrical device for measuring time.,Descriptions
 85 | "An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.",Descriptions
 86 | "A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.",Descriptions
 87 | A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.,Descriptions
 88 | "A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.",Descriptions
 89 | "A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.",Descriptions
 90 | "An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.",Descriptions
 91 | "An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.",Descriptions
 92 | "A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.",Descriptions
 93 | "A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.",Descriptions
 94 | "A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.",Descriptions
 95 | A machine resembling a human being and able to replicate certain human movements and functions automatically.,Descriptions
 96 | Paying for a quarter-sized pizza with a pizza-sized quarter.,Gary Marcus et al. 
 97 | An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.,Gary Marcus et al. 
 98 | "A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.",Gary Marcus et al. 
 99 | "In late afternoon in January in New England, a man stands in the shadow of a maple tree.",Gary Marcus et al. 
100 | An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.,Gary Marcus et al. 
101 | A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.,Gary Marcus et al. 
102 | A pear cut into seven pieces arranged in a ring.,Gary Marcus et al. 
103 | "A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.",Gary Marcus et al. 
104 | "Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.",Gary Marcus et al. 
105 | Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.,Gary Marcus et al. 
106 | Tcennis rpacket.,Misspellings
107 | Bzaseball galove.,Misspellings
108 | Rbefraigerator.,Misspellings
109 | Dininrg tablez.,Misspellings
110 | Pafrking metr.,Misspellings
111 | "A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie.",Misspellings
112 | "A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked.",Misspellings
113 | "An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes.",Misspellings
114 | "A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche.",Misspellings
115 | "A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs.",Misspellings
116 | A train on top of a surfboard.,Positional
117 | A wine glass on top of a dog.,Positional
118 | A bicycle on top of a boat.,Positional
119 | An umbrella on top of a spoon.,Positional
120 | A laptop on top of a teddy bear.,Positional
121 | A giraffe underneath a microwave.,Positional
122 | A donut underneath a toilet.,Positional
123 | A hair drier underneath a sheep.,Positional
124 | A tennis racket underneath a traffic light.,Positional
125 | A zebra underneath a broccoli.,Positional
126 | A banana on the left of an apple.,Positional
127 | A couch on the left of a chair.,Positional
128 | A car on the left of a bus.,Positional
129 | A cat on the left of a dog.,Positional
130 | A carrot on the left of a broccoli.,Positional
131 | A pizza on the right of a suitcase.,Positional
132 | A cat on the right of a tennis racket.,Positional
133 | A stop sign on the right of a refrigerator.,Positional
134 | A sheep to the right of a wine glass.,Positional
135 | A zebra to the right of a fire hydrant.,Positional
136 | Acersecomicke.,Rare Words
137 | Jentacular.,Rare Words
138 | Matutinal.,Rare Words
139 | Peristeronic.,Rare Words
140 | Artophagous.,Rare Words
141 | Backlotter.,Rare Words
142 | Octothorpe.,Rare Words
143 | A church with stained glass windows depicting a hamburger and french fries.,Reddit
144 | "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.",Reddit
145 | "A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.",Reddit
146 | A photo of a confused grizzly bear in calculus class.,Reddit
147 | An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.,Reddit
148 | "A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.",Reddit
149 | "A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.",Reddit
150 | A 1960s yearbook photo with animals dressed as humans.,Reddit
151 | Lego Arnold Schwarzenegger.,Reddit
152 | A yellow and black bus cruising through the rainforest.,Reddit
153 | A medieval painting of the wifi not working.,Reddit
154 | "An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.",Reddit
155 | "35mm macro shot a kitten licking a baby duck, studio lighting.",Reddit
156 | McDonalds Church.,Reddit
157 | Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.,Reddit
158 | Greek statue of a man tripping over a cat.,Reddit
159 | "An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.",Reddit
160 | Photo of a cat singing in a barbershop quartet.,Reddit
161 | "A painting by Grant Wood of an astronaut couple, american gothic style.",Reddit
162 | An oil painting portrait of the regal Burger King posing with a Whopper.,Reddit
163 | "A keyboard made of water, the water is made of light, the light is turned off.",Reddit
164 | Painting of Mona Lisa but the view is from behind of Mona Lisa.,Reddit
165 | Hyper-realistic photo of an abandoned industrial site during a storm.,Reddit
166 | A screenshot of an iOS app for ordering different types of milk.,Reddit
167 | "A real life photography of super mario, 8k Ultra HD.",Reddit
168 | Colouring page of large cats climbing the eifel tower in a cyberpunk future.,Reddit
169 | Photo of a mega Lego space station inside a kid's bedroom.,Reddit
170 | A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.,Reddit
171 | A photocopy of a photograph of a painting of a sculpture of a giraffe.,Reddit
172 | "A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.",Reddit
173 | "A maglev train going vertically downward in high speed, New York Times photojournalism.",Reddit
174 | A magnifying glass over a page of a 1950s batman comic.,Reddit
175 | "A car playing soccer, digital art.",Reddit
176 | Darth Vader playing with raccoon in Mars during sunset.,Reddit
177 | A 1960s poster warning against climate change.,Reddit
178 | Illustration of a mouse using a mushroom as an umbrella.,Reddit
179 | A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.,Reddit
180 | A pyramid made of falafel with a partial solar eclipse in the background.,Reddit
181 | A storefront with 'Hello World' written on it.,Text
182 | A storefront with 'Diffusion' written on it.,Text
183 | A storefront with 'Text to Image' written on it.,Text
184 | A storefront with 'NeurIPS' written on it.,Text
185 | A storefront with 'Deep Learning' written on it.,Text
186 | A storefront with 'Google Brain Toronto' written on it.,Text
187 | A storefront with 'Google Research Pizza Cafe' written on it.,Text
188 | A sign that says 'Hello World'.,Text
189 | A sign that says 'Diffusion'.,Text
190 | A sign that says 'Text to Image'.,Text
191 | A sign that says 'NeurIPS'.,Text
192 | A sign that says 'Deep Learning'.,Text
193 | A sign that says 'Google Brain Toronto'.,Text
194 | A sign that says 'Google Research Pizza Cafe'.,Text
195 | New York Skyline with 'Hello World' written with fireworks on the sky.,Text
196 | New York Skyline with 'Diffusion' written with fireworks on the sky.,Text
197 | New York Skyline with 'Text to Image' written with fireworks on the sky.,Text
198 | New York Skyline with 'NeurIPS' written with fireworks on the sky.,Text
199 | New York Skyline with 'Deep Learning' written with fireworks on the sky.,Text
200 | New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.,Text
201 | New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.,Text


--------------------------------------------------------------------------------
/alignment/assets/HPDv2/benchmark_photo.json:
--------------------------------------------------------------------------------
1 | ["A man taking a drink from a water fountain.", "Fruit in a jar filled with liquid sitting on a wooden table.", "A bathroom sink cluttered with multiple personal care items.", "A smiling man is cooking in his kitchen.", "A beautiful blue and pink sky overlooking the beach.", "A man smiles as he stirs his food in the pot.", "Several bikers are lined up in a parking lot.", "There is no picture or image sorry sorry", "A small car parked in by a vespa.", "Several people around some motorcycles on the side of a road.", "A black and white cat looking out a window over another cat.", "A woman in a purple top pulling food out of a oven", "Fighter jets on display in front of a museum.", "An empty road with buildings on each side.", "Two vespas parked next to a light post.", "A peak into a long bathroom with a toilet, but no shower.", "A face car driving past a parked motorcycle.", "A computer monitor glows on a wooden desk that has a black computer chair near it.", "a medium sized plane on an air port run way", "A bicycle chained to a pole on a snowy day", "A half eaten dessert and half empty cup.", "A blue airplane in a blue, cloudless sky", "A corner view of a kitchen with white appliances and dark wood cabinets.", "a cat laying on the floor of a kitchen", "A man and his dog riding on a bike. ", "A bathroom with a toilet and sink inside.", "A bathroom stall containing an empty toilet in it.", "A brown and black dog sticking its head out a window.", "A white busted up toilet sitting on it's side.", "a hairy man lying on a bench besides a bush", "A bunch of people waiting in line by a rail.", "A counter in a coffee house with choices of coffee and syrup flavors.", "Motorcycles parked on the sidewalk next to a road.", "A dresser in a room that is painted bright yellow.", "A person with his head out of a window while on a train. 
", "a tiled bathroom with a toilet and sink inside of it ", "A man sitting in a chair, in a black and white photo.", "A bathroom with clear glass shower door and tile floor.", "A dog sitting in a bathroom with a urinal and a torn wall.", "A white expensive car parked on top of a cement slab.", "An airplane flying past the Moon in the sky.", "A woman sitting under an umbrella in the middle of a restaurant.", "A woman getting ready to cook some food in a small kitchen.", "A car sitting in the middle of the grass in the rain.", "A man and woman riding on the back of a motorcycle.", "Foods are being put in to the mason jars", "A man sitting on a bench in a lobby.", "A motorcycle is parked next to the fire hydrant", "A bike parked on top of a boat.", "A for of four urinals mounted to a wall.", "A couple of old fashioned oak wood dining tables.", "A magazine with a couple of cat around a toilet on it's cover.", "this is a very dark picture of a room with a shelf", "Meat left out on the kitchen counter could spoil.", "People leaning out the windows of a train as it goes through the countryside.", "a small plane with a propellor sitting on a runway", "An out house with the door opened sitting in a field.", "A man sitting on a modern bench talking on a phone.", "A woman wearing a hair net cutting a large sheet cake.", "a toilet sits next to a shower and sink ", "Chopped meat laid out on towels in a home kitchen, in preparation for cooking.", "A cat standing on a toilet seat in a bathroom.", "A small engine plane sitting on a runway.", "A bicycle parked and leaning against a brick building.", "there is a small kitten inside of a sink", "A kitchen with a wooden table  with a cat sleeping on top of it.", "A small kitchen does have plenty of cabinets.", "A white bathroom with a  white toilet and sink.", "a small propeller plane sits on a run way ", "A white toilet sitting next to a large window.", "A large fire place sitting next to a doorway.", "A man wearing a black neck tie 
and glasses.", "Large shower sectional of a bathroom in a brown and white photograph.", "A bike sitting next to a brick wall in the open.", "A group of motor bikes on a street.", "there are many people trying to avoid the rain", "The small, single engine airplane is parked on the tarmac. ", "A woman eating vegetables in front of a stove.", "The blue shower curtains are inside of the bathtub next to the toilet. ", "A group of people with umbrellas standing around a white car.", "A man hanging his head out of the side of a train.", "A couple of small rooms in a house.", "Personal computer desk room with large glass double doors.", "A modern style bathroom with a large tub and shower and tile floor.", "A golden bicycle with a basket next to a brick wall.", "A man getting food ready while people watch.", "A wire fence containing various hair clips with a building in the background.", "a vintage photo of some people sitting on a bench ", "An elderly lady pours some cups of tea on a tray.", "A white jet airliner parked on a runway at night.", "there is a chocolate cake and ice cream on a plate", "An outhouse sitting in the middle of a field.", "A bunch of birds that are sitting on steps.", "A city filled with lots of tall white buildings.", "A bathroom sink that is under a mirror.", "there is a mirror and a picture on the wall ", "A man that is sitting on a couch.", "there us a woman and a young child sitting on a bench", "A woman that is sitting under an umbrella.", "A woman that is standing near an open oven.", "there is a white toilet and a sink in this bathroom ", "A group of people posing with festive items.", "A woman in an orange vest and blue helmet riding a horse up a flight of stairs.", "A group of people that are sitting on bikes in the street.", "A group of motorcycles are parked next together.", "A black motorcycle parked on a brick sidewalk next to a road.", "Cars, people, buildings and street lamps on a city street.", "there is a chef making food as people 
watch", "An empty kitchen with lots of tile blue counter top space.", "there is a man sticking his head out of a train window", "a tiled bathroom with a toilet and scale in it ", "A bunch of airplanes lined up in a row at an airport.", "A desk sitting next to a showroom of cars in it.", "An elderly man is sitting on a couch.", "a bunch of glasses with some food inside of it ", "A city at night filled with lots of traffic.", "Careful bicycle riders add florescents to their clothes for safety in the dark.", "this is a dark picture of a large kitchen", "A white toilet sitting next to a shower in a bathroom.", "A crowd of people watching an airplane on a runway.", "A man sitting on a black and yellow bench on the phone.", "A woman taking a photo over the shoulder of a man on a bike.", "this kitchen has a white and black stove in it", "The dirt bike has seen many hill climbs in its history.", "A plane flies in the sky in front of a silhouette of a moon.", "a cluttered room with a table and shelf on the wall.", "there are many men playing soccer in a field", "A woman forks vegetables out of a bowl into her mouth. ", "A woman taking a picture of herself in a mirror.", "A couple of men riding a motorcycle down a street.", "Portable toilet in a wooden box area of a field.", "A motorcycle bike leaning against a white trailer.", "The view of a bathroom tub, shower, and toilet.", "A bathroom with a toilet and a scale.", "A jet flies in the distance with the moon in the background. 
", "there is a man wearing a suit sitting on a bench", "A group of Navy cooks standing around a giant cake.", "A group of people in suits standing in a kitchen.", "A white toilet sitting under a window next to a chair.", "A dog is staring at a picture on a flat screen tv.", "A man is sitting on a public bench on a busy city street.", "Man talking on personal cell phone on a yellow and black bench.", "a counter top with food sitting on some towels", "A bathroom with a sink, vanity and shower stall.", "A view of a very dark lit kitchen from the other side of the room.", "A motorcycle parked on a sidewalk near a street light.", "There is traffic on a busy city street. ", "a blue bicycle a blender sand and a person", "Two people on a motorcycle with tone taking a photo", "The cat is sitting on the old butcher block.", "a dirt bike laying against a trailer in a grassy field", "A tabby cat sleeping on a wooden island in an old looking kitchen.", "view of tall city buildings with cars and people walking by", "a man standing in front of a big display case of donuts ", "Woman walking down the side walk of a busy night city.", "A single propellor aircraft that is parked on an airport apron with vehicles and another plane in the background.", "Woman eating an assortment of mixed vegetables in a bowl.", "There are orange slices in canning jars without lids.", "a bathroom with a stand up shower and tub.", "some people driving down the road with their bikes ", "A brown cat crouches and arches its back in a white sink.", "A group of waiters standing in a line. 
", "A beach area with a bicycle that has a blender attached to the front, parked on the sand.", "Meats being prepared for cooking on kitchen counter.", "A woman sits under the sheet on a mattress on the floor.", "Large dog looking at television show in living room.", "A man driving a motorcycle with a woman holding a cell phone.", "A young woman standing in a kitchen eats a plate of vegetables.", "The motorcyclist in a helmet is looking over the side of a bridge. ", "At night on a street with a group of a bicycle riders riding down the road together.", "The woman sitting at the table looks bored.", "The woman in the kitchen is tending to her food.", "some people holding umbrellas and  standing by a car in the rain", "A bike parked in front of a doorway.", "A person is riding his motorcycle on the dirt road.", "A bathroom that has a door just for the toilet area.", "Eight jars are being filled with orange slices. ", "Woman with a motorcycle staring over a bridge at a wetlands. ", "A bathroom has pink tiles and a black toilet.", "A group of people holding umbrellas stand near a car.", "there are two woman that are riding motorcycles ", "Some men and women in white shirts and bow ties standing in a row.", "A container of antibacterial wipes in a bathroom.", "A monitor screen, printer, couch and chair in the room", "A very dimly lite kitchen in someone's house at night.", "this kitchen is very big and has wood cainets", "A bunch of uncooked food on a counter.", "this is a wood table in a cluttered kitchen", "A bathtub that is in a bathroom under a wooden object.", "A bunch of people standing around and posing for a picture.", "A shelf of various cups and glasses mounted to the wall.", "A man standing by his motorcycle is looking out to take in the view. ", "A light that is on above a mirror.", "a bathroom with a tub next to a fancy shower stall ", "A automobile with multiple bicycles on a roof rack. 
", "this small bathroom has white sink and a toilet", "A man standing behind the counter at a doughnut shop.", "a toilet a tub  some pipes and a window", "A view of a table with a bunch of cakes and tea on it.", "Large sized kitchen with a dining room section.", "There is a cyclist riding above all the pigeons.", "there is a woman that is cutting a white cake", "Some people are enjoying time on a beach.  ", "Pile of strings and books next to a laptop computer.", "A man is standing in front of a case filled with pastries.\n", "A woman marking a cake with the back of a chef's knife. ", "A bicycle that is stored in someone's closet in the apartment. ", "A woman eating fresh vegetables from a bowl.", "A large kitchen with a lot of cabinets and counter space.", " a bathroom with a picture of a bookshelf  above the urinals", "Line of men and three woman standing in front of a kitchen.", "kitchen with a wooden kitchen island and checkered floor", "there is a woman staring in the kitchen pouring tea", "this man is riding a board near a field", "A man on a motorcycle riding in the desert.", "A dining room with hard wood floors that is very fancy. 
", "A group of young bicyclists on a city street at night.", "The bath tub and toilet in this bathroom are black.", "Pots and pans that are on the side of a sink.", "there is a all black motorcycle that is parked on the street", "a black toilet in a wood floored bathroom", "The jars on the table are full of oranges.", "A cat sits on an open toilet in a bathroom.", "a female standing in the bathroom and taking a photo with her phone", "two men on a scooter riding down the roadway", "A bathroom, showing the shower, toilet and sink.", "A wooden table sitting in the center of a kitchen.", "a jet airplane sitting on a runway next to a building", "There is an airplane on the runway in the distance.", "a group of people sitting on the sand with a lake in the background", "A small powder room with a sink and vanity, toilet, mirror, and an empty towel bar.", "Various kitchen dishes are arranged on many different shelves. ", "A man on a bicycle above spectator stands, where pigeons graze.", "There is a cat standing on the toilet seat.", "A man in a helmet and jacket riding a motorcycle in the desert.", "Many objects are sitting on a counter in a kitchen.", "a man sitting on a motorcycle in the desert", "A line of urinals against a wall with bookshelves above.", "A woman holding a colorful kite on top of a green field.", "a bathroom with tiled floor and a circular window ", "this is a bench out near a field", "a bathroom view of a tiolet and sink ", "A white sink sitting under a mirror next to a toilet.", "A woman sitting at a table next to an umbrella.", "A woman standing in a  kitchen baking bread.", "A series of shelves holding colorful glassware and dishes.", "a guy in the desert  sitting on his motorcycle", "A kitchen in a camp with gear and coats laid out.", "Three people sit on a bench looking out over the water. ", "A person on a bike is next to a train on the tracks. 
", "a group of boys playing in a field next to a forrest", " A blender sitting on top of a table.", "A public restroom with toilet, sink and a grab bar.", "A very tall clock tower sitting above a building.", "five restaurant wait staff and two mangers ", "A plane traveling down a run way, near the highway.", "Two people standing in a small kitchen with an arched passage.", "A man rides a motorcycle down a dirt road. ", "i table filled with cups and a plate of food.", "A motorcycle parked on a stone cobble road, in the sun.", "A man standing in front of a bunch of doughnuts.", "Open shelves hold an assortment of glasses, cups, and bowls. ", "Old photo of man sitting on his motorcycle", "Two people ride motorcycles down a city street.", "a mirror a sink a toilet  and a blue basket", "A family riding their bikes next to the streetlight. ", "A top down view of a bathroom with a scale and toilet.", "Dessert for two is placed on a table.", "An intersection with cars is pictured in this image.", "A table topped with  lots of food and drinks.", "An old motorcycle with a side car attached.", "the kitchen has a stove and sink with pots and pans", "The urinals are sitting below the shelves full of books,", "A cat sitting inside a sink in a bathroom ", "A kitchen with a lot of kitchen furniture and accessories", "A bicycle sits parked in front of a bookstore.", "A colorful kite is ready for launch on a blue sky day", "A cyclist pedals past a flock of birds perched on a grating.", "A mirror shows another light in a background of a wonderful bathroom", "a white table with sandwiches and cups of tea and people and sivlerware", "A bike rider traveling down a road, in the desert.", "Two kittens are cuddling and enjoying a soft pillow", "A man on a skateboard rides down a narrow road.", "A kitchen with a stove, table, cabinets, and other items ", "A plane flies in the sky passing over the moon.", "A kitchen with many of the appliances removed with blue and white tile.", "A bathroom 
with sink, toilet, and bathtub and black and white floor tiles.", "this is a red bike on a dirt path", "A man waits to cross the railroad tracks as two trains cross.", "A person rides an electric bike on a desert trail.", "A bathroom with a sink and other items. ", "this is a toilet and trash can and a sink", "A stop sign sits in front of a billboard in a quiet area.", "A bike parked in front of a book shelf.", "A view of a kitchen with a burner top stove.", "a black toilet some toilet paper and brown tiles", "A boy wearing a suit riding a skateboard down the road", "a bathroom with a big mirror above the sink", "A TV sitting on top of a counter inside of a store.", "a bathroom with a glass sink base with a bowl on top", "A motorcycle with a flat rear tire sits in a workshop, while a person stands behind it, facing away from the camera.", "A bathroom with a toilet, sink and shower stall.", "A busy street with cars and buses on it.", "some people an airport a runway and a jet", "A man and a woman looking at cell phones.", "A man with a fro riding a skateboard down a road.", "A kitchen with and island and several counters in it.", "A couple of sinks with brown tile and a decorative mirror.", "A view of a messy room, with shelves on the wall.", "The motorcycle is parked on the side of the paved road. ", "a bathroom view of a stand up shower and toilet with a sink near by", "A bathroom with a toilet and a scale on the floor.", "a couple of people standing inside a kitchen.", "There are a lot of cupboards and  refrigerator in the room. 
", "Night is falling on an empty city street.", "A vintage antique motorcycle sitting in a shop being worked on.", "a black gray and white cat a toilet sink and mirror", "The plane is taking off into the yellow sky.", "A curly haired boy rides a skateboard down a road.", "A woman is seen in the rear view mirror of a motorcycle.", "A woman pouring coffee into cups on a counter.", "a bike resting in the sand with a blender built on top", "A PICTURE OF A KITCHEN WITH TILE COUNTER TOP", "A walk in shower sitting next to a bath tub.", "A city street filled with lots of traffic.", "Person riding a four wheeler on a beach towards a bridge.", "A PICTURE OF A BATHROOM WITH SLIDING SHOWER ", "Black motorcycle with a side car in the middle of the street. ", "Group of people standing around each other in the middle of a city street. ", "a kitchen with  a stove sitting on a hard wood floor and cabinets", "A man walking around with his dog and sheep.", "there are two cats that are laying inside of a tub", "there is a small dog that is looking threw the glass", "A man taking a picture of himself in front of three huge beer bottles", "A PICTURE OF A MAN WITH BEER BEHIND HIM ", "there is a small out house that is made of wood", "Several people smile for the camera at night.", "this is a bathroom that has a sink and toilet", "A black dog sitting in front of a TV.", "A PICTURE OF ALL WHITE IN A BATHROOM ", "A person wearing a safety vest rides a horse up the staircase.", "a man sitting in a chair on a tiled floor next to a heater", "this is a clock on top of a tower", "Bike leaned against a wall of books inside and establishment.", "this is a group of people standing near a river\n", "a bathroom with a toilet and sink and a bath tub sitting on a hardfloor", "A person rides a vehicle on the beach.", "this kitchen has a white stove and all white cabinets", "Three people sit on a bench together facing away.", "Two cats sitting together in an empty bathtub.", "A puppy staring through a 
red sectioned window.", "a toilet sitting on a tiled floor in enclosed bathroom stall", "People are walking and cars are driving in a city.", "A kitchen with a stove, microwave and cabinets.", "Two teams compete at a sport in a park.", "A bathroom with a toilet, counter, and mirror.", "A lidless toilet is shown caked in dirt or other filth.", "A tiled bathroom is shown with a compact style toilet.", "there is a police man riding a tav on the beach", "A small white car with a small white dog riding in it.", "a toilet a sink a towel a light and a mirror", "A man riding an ATV next to the ocean.", "Two people riding a motorcycle near a group of people.", "A group of people walking down a walkway.", "a woman a white mat and pillow and white wall", "A man riding a motorcycle down a road near a forest.", "A little girl holding a brown stuffed animal.", "there is a very beautiful view out of this bathroom window", "A dog stands close to a television looking at it.", "this bathroom is very big and has lots of room", "A pair of cats sit in an empty bathtub.", "A dog looks through ribbed glass in a red door.", "there is a old black motorcycle inside of a garage", "A motorcyclist parked near a railing looks out over the water.", "Two people are looking at a truck while a dog is being walked.", "Several people are seen sitting around and smoking.", "this is a man sitting on a green couch", "A juicer attached to the top of a bike.", "A cramped bathroom with a sink in the corner.", "A crowded street filled with British traffic and buses.", "A small clock is seen on the side of a church.", "this is an airplane sitting on the runway", "A girl is holding a large kite on a grassy field.", "Two people sitting on a motorcycle that parked on the road.", "A composite image of an office desk, cars and buildings.", "A man wearing a hat in front of large bottles.", "a church with a tall tower with a clock built into it", "A group of young men jump in the air playing a game.", "some shelves 
filled with bowls and cups ", "A dog sits in front of and watches the television.", "An old rusting toilet with the lid up. ", "some peeled oranges sitting in a clear blender", "Two people standing in a kitchen near a stove.", "A young girl walking barefoot carries a stuffed animal.", "a man reflected in a rear view mirror of a motorcycle", "A native American couple on a bike pose for a photo.", "A person laying on a bathtub with their feet sticking out.", "A wooden outhouse sitting in the grass near trees.", "A toilet sitting in a stall on tile.", "A kite flying in the sky on a cloudy day.", "Two cats occupy a bathtub, one sitting and one lying down. ", "A dog looks out through a lined window. ", "A group of people are standing in the snow on skis.", "A person is sitting on a motorcycle looking in the mirror.", "a bathroom wall missing some pink wall tiles ", "A man walking across a field holding a wand near a dog.", "a bathroom with towels under a sink and a big mirror above it", "Adults and children gather near a dock on the beach.", "A green, red, yellow and blue kite fly's through the sky.", "a man standing next to a laptop and bottles of beer", "two cats resting side by side on a bed", "A large jetliner sitting on top of an airport runway.", "A sad woman laying on a mattress on a hardwood floor.", "A red bus parked next to a crowd of people.", "Looking through the window of showroom at car dealership.", "Two people standing next to each other in a kitchen.", "a bathroom view of a sink toilet on a tiled floor", "a group of people standing in the snow  with gear on", "a church with a clock built into the side of it", "A daytime view of a messy kitchen corner.", "a colorful  kite flying high on a cloudy day", "A man riding a red scooter down the street.", "An old toilet outside against an old painted wall.", "A kitchen counter top with a white bowl sitting next to another white bowl.", "A toilet filled with nasty grime sitting up against a bathroom wall.", "A 
woman standing between a motor bike and a striped wall over a river.", "A group of people sitting on top of a bench.", "A group of men standing around a luggage cart.", "a person in a bathroom having a reflection in the mirror", "a kitchen with a microwave, a stove, and cabinets.", "a street with cars lined with poles and wires.", " two men and one woman standing in a kitchen", "A dark and cluttered storage area with wood walls.", "Three people sitting on a bench looking at the ocean.", "people sitting on a bench facing the water.", "a cat sitting in a sink with its eyes open", "two cars parked side by side on a show room floor", "two cats chill in the bathtub one is laying down", "a dog who looks sad stares outside of the window of a red door ", "a lady holding a kite and walking in a grassy area", "An old toilet with a rotten lid next to a rusted pipe.", "some piled oranges in a glass blender ready to be blended", "A room with a chair and pictures mounted on the wall. ", "a sink in a bathroom with a shaver and personal hygeine items on the counter top", "a street with people and vehicles in the middle of it", "a room showing a wooden table and a capboard", "Several cars parked near a desk holding a computer.", "A parked white car with and open door and a dog inside.", "A black motorcycle with a sidecar parked on cobblestone.", "a group of vehicles parked next to a firehydrant", "A woman standing on grass holding a colorful kite.", "A white kitchen with a gas stove and microwave.", "a motor bike carrying very many people on the street", "A row of urinals with a well-stocked bookshelf in front. ", "A woman lying on a thin mattress on the floor with her knees up.", "A plane riding down a runway of an airport.", "a cake and two spoons on a plate", "Toilet in a bathroom in an international location with a basket.", "Group of horses in a field with a pinto in the foreground.", "A dim lit room consisting of many objects put together. 
", "a man standing in a bathroom looking into a mirror", "a street view of people walking down the sidewalk ", "A airplane sitting on a runway at a small airport.", "A room with a sink and a skeleton foot. ", "Sun shining through the blinds into a white bathroom.", "A man in a blue hat on a bike behind a train. ", "An airplane on the runway of an airport.", "Two motorcycles going down a city street with woman drivers", "A black and white still life of a branch with flowers in a vase", "A clean odd little bathroom with a white porcelain toilet.", "A paint horse and other breeds in the background grazing in a green field.", "A kitchen area that has items on the counter tops. ", "a tall red bus is by the curb in a city", "A restroom with a toilet and a mirror. ", "A tiled floor bathroom with a red and black shower curtain.", "An older woman pouring tea in the kitchen.", "A bicycle is placed behind an open door.", "A man and two dogs are riding a scooter.", "A busy street with traffic moving in both directions and several two level buses on the street with people around.", "A bathroom has a toilet and a scale.", "A bathroom with outdated fixtures and a clothes hamper in the middle of the floor.", "The tiles are falling off the wall in this old bathroom", "A dog sits in a white car with the door open.", "a white toilet is in the corner of a bathroom", "Top view of a few skinned oranges inside of a blender", "a sink well cleaned and some drawers and hand wash", "a bunch of people are standing on a snowy hill", "View of toilet with a dirty lid and a missing cover to it's tank", "A street with a few people walking and cars in the road. 
", "a coupe of people are sitting outside on a bench", "A little girl is carrying a stuffed animal.", "A man and a woman are riding a motorcycle.", "a couple of bathroom items sitting on a sink", "a couple of motorcyclists are driving down the road", "three people sitting on a motorcycle in a street", "a couple of vehicles are parked in a lot", "A messy kitchen with dirty dishes and white cabinets", "A minimalist room features white appliances and beige walls.", "A kichen with dirty dishes in the sink.", "A table with a plate holding several sandwiches, tea cups and condiments. ", "A bathroom sink that is surrounded by various toiletries.", "Two motorcycles are parked on the shoulder of a mountainous freeway.", "An intersection is shown on a cloudy day.", "Pink bike sits on a guard rail by the river.", "people on the street with their cars  moving", "This is a state of the art bathroom where the appliances don't look like they should", "The man who uses this bathroom shaved this morning", "A bathroom with a white toilet, tub, and tile floor.", "A man riding a scooter with a dog on it. ", "a t.v. that is sitting on a shelf with some lights near by", "a bath room with its door open and light on", "a bike that is leaning up against a book rack", "Motorcycles parked in a row in the street. ", "A group of people are standing together at night.", "a little white car that has a dog in it", "A man is standing in a field with a dog and goat.", "a bunch of different electronics all on one big pile. ", "Two motorcycles sit on the side of a secluded road.", "a airplane that is on a runway by some grass", "A pink bicycle leaning against a green railing next to a canal.", "Three people on a motor bile that is riding in a street, with one of them wearing a helmet.", "A man looks at himself in the mirror of a motorcycle.", "a room filled with white furniture and books on the ground. 
", "A bicycle leaned against the hallway wall in a house", "two different kinds of lights in a bath room. ", "a room with wood and ivory furniture inside. ", "THERE IS A PLATE WITH SWEET DESSERTS ON THE PLATE ", "a road sign showing stop and a vehicle moving", "Two lights shine above a messy bathroom toilet.", "A chair sits against a wall in a wood floored room.", "a couple of men that are next to some boxes", "Several people are standing around watching a band perform on stage.", "A small bathroom has a port hole window.", "A young girl with a stuffed toy in a park.", "A black and whit cat sitting in a sink.", "A person is taking a picture of a bathroom with a toilet in it.", "Some cakes are on a white plate with spoons.", "A man looks into the mirror as he styles his hair.", "a couple of sinks in a bright colored bathroom", "a man on a motorcycle that is in some grass", "Some people are next to a pier on the sand.", "A car is illegally parked near a fire hydrant.", "A couple of dead, stuffed giraffe on display.", "A quaint toilet in a room with no door, a chair sitting outside of the area.", "A green and blue motorcycle parked on the side of a road.", "A very simple bathroom with beige and cream colored decor.", "a man in a room with a camera with a toilet", "A modern bathroom with a toilet and sink area.", "A man on his bicycle waits for two trains to pass by.", "a bath room with a trash can next to the tolit. ", "A purple bicycle is parked on a fence next to a river.", "A kitchen with a lot of counter space, a sink, stove and refrigerator in it. 
", "A man on a motorcycle is looking in his mirror.", "A little red headed girl walking with a stuffed puppy.", "A white towel is at the edge of a white bathtub.", "A bathroom is shown with a glass counter and cone-shaped sink.", "A man walking down the street with a cane while others sit on a bench.", "a motorcycle that is parked in side a buliding", "A view from a bus shows people on bicycles and another bus in traffic.", "a bathroom that has a tub and a shower", "a vase with a flower growing very well", "a small little toilet that is in a corner", "a couple of horse that are eating some grass", "a man that is riding a motorcycle on a road", "a couple of motorcycles are off the side of the street", "A tea kettle sits on the burner of stove.", "a black cat that is sitting in a sink", "a room tha has a toilet and a sink in it", "A blender filled with three peeled oranges sitting on a counter.", "a couple of motorcycles that are next to a road", "A man is in a yard on a motorcycle.", "A truck traveling down the street near a fire hydrant.", "Two small cats are sleeping on white sheets.", "a group of people with bikes posing for a photo ", "Toilet with raised lid with tub and chair in old bathroom. ", "A person riding a four wheel on the beach.", "a bright light sitting in front of a tv ", "Clocks are brightly lit on a huge tower.", "a group of people that are smoking on a bench", "A bathroom with shower stall, toilet, and bathtub.", "A man is training a sheepdog for a sheepdog trial.", "Looking down on a stony surface shows a bowl with an orange in it and what looks like a large piece of red plastic.", "Two motorcycles ride down a street in a city.", "A little girl is making a huge mess with a birthday cake. 
", "A very large kitchen area in a building.", "A yellow bike sits on a wall in the hallway.", "This is a photo of someones bathroom in their home and there are feet hanging out the side of the tub.", "a small little bathroom with a toilet in it", "Asian man and woman sitting and looking at cell phones", "Someone is juicing an orange on a juicer.", "A bicycle leaned against an outdoor magazine stand.", "A black and white photo of a steam of flowers inside a vase.", "A bathroom with white toliet and sink visible", "A kitchen with tile back splash and stainless steel appliances.", "Looking through a door and seeing a toilet and sink.", "Some guys are standing over an old antique truck and someone is walking a dog nearby. ", "A large jetliner sitting on top of a tarmac.", "A baby with a bib eats a cake.", "A stop sign out in the middle of nowhere ", "A group of police officer standing in front of a red bus.", "A woman holding two rainbow slices of cake.", "A group of Frisbee players are running around a field. ", "A toilet that has been covered in filth.", "The clock on the side of the metal building is gold and black. ", "The motorcyclist has his hands at his side while riding swiftly down the road. ", "A modern restroom with a weird looking sink, toilet, and shower.", "Small groups of people, including a person walking a dog, are scattered about an outdoor area, encompassing some streets, that is filled with classic cars. 
", "a bunch of crates on a air plane run way", "A sky view looking up at a jumbo jet plane.", "A bathroom showing toilet, sink, and shower ", " room with a book and a white carpet", "A scooter with a helmet hanging off it's handlebars.", "A truck driving on a crowded street past several parked cars.", "A bunch of people walking around in a street", "people riding bikes near a beach and others swimming", "Two kittens curled up in a white sheet that looks soft.", "A cat laying on the seat of a motorcycle ", "a piece of orange in a bowl next to a concrete edge ", "A road lined with rock-face shows a man and a woman, both wearing hats, astride a red, white and blue decorated bike. ", "A kitchen has white cabinets and stainless steel appliances.", "A crowd of people walking and riding their bikes.", "A crowd of people are gathered outdoors on the street.", "Sheepherders move their sheep across a highway as vehicular traffic passes between their flock.", "A bathroom with sink, toilet, and tub ", "A crowd of people at an outdoor concert.", "A woman sitting on a bench with cars behind her.", "A cat is alseep on a motorcycle seat.", "A kite flying in a partly cloudy sky ", "white toilet and sink with mirror on white wall", "A small kitchen is shown with a stove, dishwasher and sink.", "A toilet that is has been colored black.", "A small baby bird on a piece of metal.", "a jet airliner wing that has two jet engines", "two sinks under a mirror and a light on a wall", "human hands juicing an orange on a counter top", "young man looking a different image of himself in the mirror", "A man riding a bike down a dirt road.", "a black and white photo with a vase and flower coming out of it", "Three people are riding down the street on one motorcycle. 
", "Seven people on a biking trip in front of a large city.", "A wooden table sitting in the middle of a room.", "Three bikers by a red bus on the street.", "Three people are standing in the same kitchen area.", "A view of wing with two jet engines are on a runway while people watch.", "Man and dog on scooter in city street on sunny day.", "Two Asian people inside a train looking at their mobile phones.", "A white toilet tin a bathroom sitting next to a sink.", "The view of a restroom toilet, and sink area.", "The motorcycle is tilting as he turns through a cave. ", "A view of an airplane traveling across the bright sky.", "The kitchen counter and sink have dishes on them.", "A tower with a clock is displayed in the evening.", "A giraffe and fence design are painted onto the wall.", "A man wearing a helmet posing on top of a motorcycle.", "A very large black and gold clock mounted to the side of a building.", "A man riding on the back of a motorcycle down a highway.", "A huge commercial airplane goes down the landing strip.", "A bike is chained to the post on the sidewalk", "The black and white cat is sitting in a bathroom sink.", "The show girl is posing on a blue motorcycle on display. ", "Two fake looking giraffes are on display at an exhibit.", "A young baby is eating and playing with some cake.", "Two adults and a child ride a motorcycle together.", "A small eating area with a table and cabinets next to a window.", "A small wooden toy car has an elephant sitting inside.", "Cement ledge with orange in bowl and red plastic bag below. ", "An old classic church is in front a big blue sky.", "Kitchen area with modern appliances and plenty of cabinets.", "A man with a baseball cap and glasses seated in front of three large beer bottles.", "A bathroom with a small sink and toilet. 
", "A bathroom with mirror, toilet, and sink ", "This is a photo of someones bathroom in their home.", "A child in a booster chair eating a cake ", "A woman sitting on top of a purple motorcycle.", "A bathroom scene with a toilet and a sink.", "The top of a steeped church building with clocks and small windows.  ", "Two messy toilet stalls with toilets where one lid is raised. ", "a man wearing a helmet while riding a motorcycle ", "Man with golf club and a dog and a goat", "Two turbines on the wing of an airplane", "An empty bench along a sidewalk in neighborhood.", "there is a man riding a bike up the road", "A brick ally way with an old wooden bench with people sitting and smoking on it.  ", "there is a very tall giraffe inside of a building", "A couple of airplanes sitting on top of a runway.", "a bathroom with a littlt tub and a clothes hamper by the toilet", "A large jet flying through a cloudy blue sky.", "A close up of the face of a clock on a building.", "A man riding on a motorcycle on the road.", "there is a very large black and gold clock on a building", "there is a man riding a motorcycle and not holding the handles", "there is a person making freshly squeezed orange juice", "A man riding on the back of a motorcycle on top of a grass field.", "A christmas wreath is hanging from the door", "group of bikers posing for a picture ", "A black bench that is by a sidewalk on a street.", "A bottle of wine sitting on top of a table next to a glass of wine.", "A dog sitting in front of an open door looking outside.", "A toy elephant sits in a toy wooden car.", "A group of bikers parked in the middle of a street.", "A wreath with a red bow on it hanging on a white door.", "A white toilet sitting in the corner of a room.", "A lush green field with horses standing on top of it.", "Several cars drive down the road on a cloudy day.", "A crowd of people riding bikes down a street.", "there is a woman sitting on a bench in front of cars", "There is an orange in the cup 
and a bag in the water.", "an empty bench sitting on the side of a sidewalk", "A person sits on a motorcycle while wearing riding gear.", "A plane is on display near the water.", "A mans reflection in a side view mirror.", "Two people wearing hats riding a motorcycle together.", "there is a dog that is sitting in a car", "A couple of white bathroom sinks mounted to a wall.", "A pink bicycle leaning against a fence near a river.", "there is a man crossing the tracks on a bike", "tan colored bathroom with white toilet and mirror", "A closed toilet seat in a bathroom next to a checkered curtain.", "A bathroom vanity with a large mirror hanging on the wall", "A colorful kite flying in a cloudy blue sky.", "A road with two vehicles out in the middle of nowhere with animals climbing up a hill on the left.", "The numbers and hands on the clock are gold.", "The man on the motorcycle does not have his hands on the handlebars.", "A white stove top oven inside of a kitchen.", "A line of motorcycles parked on the side of a street.", "A small elephant toy sitting inside of a wooden car.", "some one in the bath room laying in the bath", "Men are unloading the trolley of luggage on the runway.", "A small bird sitting in a metal wheel ", "Someone is riding a motorcycle through a grassy field. ", "a bunch of people in a kitchen getting food ready", "A billboard posed by the side of a street in a rural town.", "A picture of a man sitting on a motorcycle on a dirt road.", "A woman juicing oranges on top of a manual juicer.", "A small cute cat sitting in the bathroom sink.", "Several people standing next to each other that are snow skiing.", "some cut up fruit is sitting in a blender", "A small and plain white bathroom with a toilet and a tub.", "A man in riding gear, riding a red motorcycle down a road.", "A bunch of airplanes are parked on the runway. 
", "this plane has two large fans on its wings", "This is a photo of a bathroom in someones home.", "This is a large statue in someones living room.", "An old propeller airplane is displayed near the water.", "A man and a woman using their cellphones simultaneously.", "there are many people walking along this street", "A kitchen showing marble tile and wood cabinets.", "A line of motorcycles are all parked next to each other.", "A man, woman, and child preparing food in a kitchen.", "A black and white photo of a flowing growing out of a vase.", "A passenger jet being serviced on a runway in an airport.", "Three people are preparing a meal in a small kitchen.", "A pair of planes parked in a small rural airfield.", "A bathroom with a stand alone shower and a peep window.", "Several vehicles with pieces of luggage on them with planes off to the side.", "a black motorcycle is parked by the side of the road", "A small bathroom with a tub, toilet, sink, and a laundry basket are shown.", "A bus stopped on the side of the road while people board it.", "A bunch of people posing with some bikes.", "a jet engine on the wing of a plane", "A bunch of bicycles parked on the street with items sitting around them ", "A dog standing in front of a doorway.", "Two small planes sitting near each other on a run way.", "there is a bus that has a bike attached to the front", "A bird that is sitting in the rim of a tire.", "The black motorcycle is parked on the sidewalk.", "A corner of a rest room with a big shower.", "a dog with a plate of food on the ground", "there is a very large plane that is stopped at the airport ", "Bicycles with back packs parked in a public place.", "A white walled bathroom features beige appliances and furniture.", "Several bicycles sit parked nest to each other.", "Some big commercial planes all parked by each other.", "a woman holding a plate of cake in her hand", "yellow and red motorcycle with a man riding on it next to grass", "A motorcycle stands in front 
of three people on a sidewalk.", "classic cars on a city street with people and a dog", "People getting on a bus in the city", "A large commercial airliner silhoetted in the sun.", "Residential bathroom with modern design and tile floor.", "a bus with a view of a lot of traffic and the back of another bus with a billboard on the back end", "A young man riding through the air on top of a skateboard.", "A toy elephant is sitting inside a wooden car toy.", "A motorized bicycle covered with greens and beans.", "A man sitting at a table in front of bowls of spices.", "there is a bathroom that has a lot of things on the floor", "A passenger jet aircraft flying in the sky.", "An eye level counter-view shows blue tile, a faucet, dish scrubbers, bowls, a squirt bottle and similar kitchen items. ", "A TV sitting on top of a wooden stand.", "A person sitting on a motorcycle in the grass.", "A white toilet in a generic public bathroom stall.", "a couple of people in uniforms are sitting together", "A group of giraffe standing around each other.", "Street merchant with bowls of grains and other products. ", "A man driving a luggage cart sitting on top of a runway.", "Residential bathroom with commode and shower and plain white walls.", "Ornate archway inset with matching fireplace in room.", "there is a red bus that has a mans face on it", "a wooden skate with a toy elephant inside of it ", "a bunch of people on skiing on a hill"]


--------------------------------------------------------------------------------
/alignment/assets/activities.txt:
--------------------------------------------------------------------------------
1 | washing the dishes
2 | riding a bike
3 | playing chess


--------------------------------------------------------------------------------
/alignment/assets/activities_v0.txt:
--------------------------------------------------------------------------------
1 | washing the dishes
2 | riding a bike
3 | playing chess


--------------------------------------------------------------------------------
/alignment/assets/drawbench.json:
--------------------------------------------------------------------------------
  1 | {
  2 |    "A red colored car.": {
  3 |       "category": "Colors"
  4 |    },
  5 |    "A black colored car.": {
  6 |       "category": "Colors"
  7 |    },
  8 |    "A pink colored car.": {
  9 |       "category": "Colors"
 10 |    },
 11 |    "A black colored dog.": {
 12 |       "category": "Colors"
 13 |    },
 14 |    "A red colored dog.": {
 15 |       "category": "Colors"
 16 |    },
 17 |    "A blue colored dog.": {
 18 |       "category": "Colors"
 19 |    },
 20 |    "A green colored banana.": {
 21 |       "category": "Colors"
 22 |    },
 23 |    "A red colored banana.": {
 24 |       "category": "Colors"
 25 |    },
 26 |    "A black colored banana.": {
 27 |       "category": "Colors"
 28 |    },
 29 |    "A white colored sandwich.": {
 30 |       "category": "Colors"
 31 |    },
 32 |    "A black colored sandwich.": {
 33 |       "category": "Colors"
 34 |    },
 35 |    "An orange colored sandwich.": {
 36 |       "category": "Colors"
 37 |    },
 38 |    "A pink colored giraffe.": {
 39 |       "category": "Colors"
 40 |    },
 41 |    "A yellow colored giraffe.": {
 42 |       "category": "Colors"
 43 |    },
 44 |    "A brown colored giraffe.": {
 45 |       "category": "Colors"
 46 |    },
 47 |    "A red car and a white sheep.": {
 48 |       "category": "Colors"
 49 |    },
 50 |    "A blue bird and a brown bear.": {
 51 |       "category": "Colors"
 52 |    },
 53 |    "A green apple and a black backpack.": {
 54 |       "category": "Colors"
 55 |    },
 56 |    "A green cup and a blue cell phone.": {
 57 |       "category": "Colors"
 58 |    },
 59 |    "A yellow book and a red vase.": {
 60 |       "category": "Colors"
 61 |    },
 62 |    "A white car and a red sheep.": {
 63 |       "category": "Colors"
 64 |    },
 65 |    "A brown bird and a blue bear.": {
 66 |       "category": "Colors"
 67 |    },
 68 |    "A black apple and a green backpack.": {
 69 |       "category": "Colors"
 70 |    },
 71 |    "A blue cup and a green cell phone.": {
 72 |       "category": "Colors"
 73 |    },
 74 |    "A red book and a yellow vase.": {
 75 |       "category": "Colors"
 76 |    },
 77 |    "A horse riding an astronaut.": {
 78 |       "category": "Conflicting"
 79 |    },
 80 |    "A pizza cooking an oven.": {
 81 |       "category": "Conflicting"
 82 |    },
 83 |    "A bird scaring a scarecrow.": {
 84 |       "category": "Conflicting"
 85 |    },
 86 |    "A blue coloured pizza.": {
 87 |       "category": "Conflicting"
 88 |    },
 89 |    "Hovering cow abducting aliens.": {
 90 |       "category": "Conflicting"
 91 |    },
 92 |    "A panda making latte art.": {
 93 |       "category": "Conflicting"
 94 |    },
 95 |    "A shark in the desert.": {
 96 |       "category": "Conflicting"
 97 |    },
 98 |    "An elephant under the sea.": {
 99 |       "category": "Conflicting"
100 |    },
101 |    "Rainbow coloured penguin.": {
102 |       "category": "Conflicting"
103 |    },
104 |    "A fish eating a pelican.": {
105 |       "category": "Conflicting"
106 |    },
107 |    "One car on the street.": {
108 |       "category": "Counting"
109 |    },
110 |    "Two cars on the street.": {
111 |       "category": "Counting"
112 |    },
113 |    "Three cars on the street.": {
114 |       "category": "Counting"
115 |    },
116 |    "Four cars on the street.": {
117 |       "category": "Counting"
118 |    },
119 |    "Five cars on the street.": {
120 |       "category": "Counting"
121 |    },
122 |    "One dog on the street.": {
123 |       "category": "Counting"
124 |    },
125 |    "Two dogs on the street.": {
126 |       "category": "Counting"
127 |    },
128 |    "Three dogs on the street.": {
129 |       "category": "Counting"
130 |    },
131 |    "Four dogs on the street.": {
132 |       "category": "Counting"
133 |    },
134 |    "Five dogs on the street.": {
135 |       "category": "Counting"
136 |    },
137 |    "One cat and one dog sitting on the grass.": {
138 |       "category": "Counting"
139 |    },
140 |    "One cat and two dogs sitting on the grass.": {
141 |       "category": "Counting"
142 |    },
143 |    "One cat and three dogs sitting on the grass.": {
144 |       "category": "Counting"
145 |    },
146 |    "Two cats and one dog sitting on the grass.": {
147 |       "category": "Counting"
148 |    },
149 |    "Two cats and two dogs sitting on the grass.": {
150 |       "category": "Counting"
151 |    },
152 |    "Two cats and three dogs sitting on the grass.": {
153 |       "category": "Counting"
154 |    },
155 |    "Three cats and one dog sitting on the grass.": {
156 |       "category": "Counting"
157 |    },
158 |    "Three cats and two dogs sitting on the grass.": {
159 |       "category": "Counting"
160 |    },
161 |    "Three cats and three dogs sitting on the grass.": {
162 |       "category": "Counting"
163 |    },
164 |    "A triangular purple flower pot. A purple flower pot in the shape of a triangle.": {
165 |       "category": "DALL-E"
166 |    },
167 |    "A triangular orange picture frame. An orange picture frame in the shape of a triangle.": {
168 |       "category": "DALL-E"
169 |    },
170 |    "A triangular pink stop sign. A pink stop sign in the shape of a triangle.": {
171 |       "category": "DALL-E"
172 |    },
173 |    "A cube made of denim. A cube with the texture of denim.": {
174 |       "category": "DALL-E"
175 |    },
176 |    "A sphere made of kitchen tile. A sphere with the texture of kitchen tile.": {
177 |       "category": "DALL-E"
178 |    },
179 |    "A cube made of brick. A cube with the texture of brick.": {
180 |       "category": "DALL-E"
181 |    },
182 |    "A collection of nail is sitting on a table.": {
183 |       "category": "DALL-E"
184 |    },
185 |    "A single clock is sitting on a table.": {
186 |       "category": "DALL-E"
187 |    },
188 |    "A couple of glasses are sitting on a table.": {
189 |       "category": "DALL-E"
190 |    },
191 |    "An illustration of a large red elephant sitting on a small blue mouse.": {
192 |       "category": "DALL-E"
193 |    },
194 |    "An illustration of a small green elephant standing behind a large red mouse.": {
195 |       "category": "DALL-E"
196 |    },
197 |    "A small blue book sitting on a large red book.": {
198 |       "category": "DALL-E"
199 |    },
200 |    "A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.": {
201 |       "category": "DALL-E"
202 |    },
203 |    "A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.": {
204 |       "category": "DALL-E"
205 |    },
206 |    "A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.": {
207 |       "category": "DALL-E"
208 |    },
209 |    "An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.": {
210 |       "category": "DALL-E"
211 |    },
212 |    "An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.": {
213 |       "category": "DALL-E"
214 |    },
215 |    "A fisheye lens view of a turtle sitting in a forest.": {
216 |       "category": "DALL-E"
217 |    },
218 |    "A side view of an owl sitting in a field.": {
219 |       "category": "DALL-E"
220 |    },
221 |    "A cross-section view of a brain.": {
222 |       "category": "DALL-E"
223 |    },
224 |    "A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.": {
225 |       "category": "Descriptions"
226 |    },
227 |    "A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.": {
228 |       "category": "Descriptions"
229 |    },
230 |    "A small vessel propelled on water by oars, sails, or an engine.": {
231 |       "category": "Descriptions"
232 |    },
233 |    "A connection point by which firefighters can tap into a water supply.": {
234 |       "category": "Descriptions"
235 |    },
236 |    "A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.": {
237 |       "category": "Descriptions"
238 |    },
239 |    "A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.": {
240 |       "category": "Descriptions"
241 |    },
242 |    "A separate seat for one person, typically with a back and four legs.": {
243 |       "category": "Descriptions"
244 |    },
245 |    "An appliance or compartment which is artificially kept cool and used to store food and drink.": {
246 |       "category": "Descriptions"
247 |    },
248 |    "A mechanical or electrical device for measuring time.": {
249 |       "category": "Descriptions"
250 |    },
251 |    "An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.": {
252 |       "category": "Descriptions"
253 |    },
254 |    "A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.": {
255 |       "category": "Descriptions"
256 |    },
257 |    "A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.": {
258 |       "category": "Descriptions"
259 |    },
260 |    "A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.": {
261 |       "category": "Descriptions"
262 |    },
263 |    "A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.": {
264 |       "category": "Descriptions"
265 |    },
266 |    "An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.": {
267 |       "category": "Descriptions"
268 |    },
269 |    "An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.": {
270 |       "category": "Descriptions"
271 |    },
272 |    "A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.": {
273 |       "category": "Descriptions"
274 |    },
275 |    "A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.": {
276 |       "category": "Descriptions"
277 |    },
278 |    "A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.": {
279 |       "category": "Descriptions"
280 |    },
281 |    "A machine resembling a human being and able to replicate certain human movements and functions automatically.": {
282 |       "category": "Descriptions"
283 |    },
284 |    "Paying for a quarter-sized pizza with a pizza-sized quarter.": {
285 |       "category": "Gary Marcus et al. "
286 |    },
287 |    "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.": {
288 |       "category": "Gary Marcus et al. "
289 |    },
290 |    "A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.": {
291 |       "category": "Gary Marcus et al. "
292 |    },
293 |    "In late afternoon in January in New England, a man stands in the shadow of a maple tree.": {
294 |       "category": "Gary Marcus et al. "
295 |    },
296 |    "An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.": {
297 |       "category": "Gary Marcus et al. "
298 |    },
299 |    "A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.": {
300 |       "category": "Gary Marcus et al. "
301 |    },
302 |    "A pear cut into seven pieces arranged in a ring.": {
303 |       "category": "Gary Marcus et al. "
304 |    },
305 |    "A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.": {
306 |       "category": "Gary Marcus et al. "
307 |    },
308 |    "Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.": {
309 |       "category": "Gary Marcus et al. "
310 |    },
311 |    "Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.": {
312 |       "category": "Gary Marcus et al. "
313 |    },
314 |    "A train on top of a surfboard.": {
315 |       "category": "Positional"
316 |    },
317 |    "A wine glass on top of a dog.": {
318 |       "category": "Positional"
319 |    },
320 |    "A bicycle on top of a boat.": {
321 |       "category": "Positional"
322 |    },
323 |    "An umbrella on top of a spoon.": {
324 |       "category": "Positional"
325 |    },
326 |    "A laptop on top of a teddy bear.": {
327 |       "category": "Positional"
328 |    },
329 |    "A giraffe underneath a microwave.": {
330 |       "category": "Positional"
331 |    },
332 |    "A donut underneath a toilet.": {
333 |       "category": "Positional"
334 |    },
335 |    "A hair drier underneath a sheep.": {
336 |       "category": "Positional"
337 |    },
338 |    "A tennis racket underneath a traffic light.": {
339 |       "category": "Positional"
340 |    },
341 |    "A zebra underneath a broccoli.": {
342 |       "category": "Positional"
343 |    },
344 |    "A banana on the left of an apple.": {
345 |       "category": "Positional"
346 |    },
347 |    "A couch on the left of a chair.": {
348 |       "category": "Positional"
349 |    },
350 |    "A car on the left of a bus.": {
351 |       "category": "Positional"
352 |    },
353 |    "A cat on the left of a dog.": {
354 |       "category": "Positional"
355 |    },
356 |    "A carrot on the left of a broccoli.": {
357 |       "category": "Positional"
358 |    },
359 |    "A pizza on the right of a suitcase.": {
360 |       "category": "Positional"
361 |    },
362 |    "A cat on the right of a tennis racket.": {
363 |       "category": "Positional"
364 |    },
365 |    "A stop sign on the right of a refrigerator.": {
366 |       "category": "Positional"
367 |    },
368 |    "A sheep to the right of a wine glass.": {
369 |       "category": "Positional"
370 |    },
371 |    "A zebra to the right of a fire hydrant.": {
372 |       "category": "Positional"
373 |    },
374 |    "A church with stained glass windows depicting a hamburger and french fries.": {
375 |       "category": "Reddit"
376 |    },
377 |    "Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.": {
378 |       "category": "Reddit"
379 |    },
380 |    "A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.": {
381 |       "category": "Reddit"
382 |    },
383 |    "A photo of a confused grizzly bear in calculus class.": {
384 |       "category": "Reddit"
385 |    },
386 |    "An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.": {
387 |       "category": "Reddit"
388 |    },
389 |    "A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.": {
390 |       "category": "Reddit"
391 |    },
392 |    "A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.": {
393 |       "category": "Reddit"
394 |    },
395 |    "A 1960s yearbook photo with animals dressed as humans.": {
396 |       "category": "Reddit"
397 |    },
398 |    "Lego Arnold Schwarzenegger.": {
399 |       "category": "Reddit"
400 |    },
401 |    "A yellow and black bus cruising through the rainforest.": {
402 |       "category": "Reddit"
403 |    },
404 |    "A medieval painting of the wifi not working.": {
405 |       "category": "Reddit"
406 |    },
407 |    "An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.": {
408 |       "category": "Reddit"
409 |    },
410 |    "35mm macro shot a kitten licking a baby duck, studio lighting.": {
411 |       "category": "Reddit"
412 |    },
413 |    "McDonalds Church.": {
414 |       "category": "Reddit"
415 |    },
416 |    "Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.": {
417 |       "category": "Reddit"
418 |    },
419 |    "Greek statue of a man tripping over a cat.": {
420 |       "category": "Reddit"
421 |    },
422 |    "An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.": {
423 |       "category": "Reddit"
424 |    },
425 |    "Photo of a cat singing in a barbershop quartet.": {
426 |       "category": "Reddit"
427 |    },
428 |    "A painting by Grant Wood of an astronaut couple, american gothic style.": {
429 |       "category": "Reddit"
430 |    },
431 |    "An oil painting portrait of the regal Burger King posing with a Whopper.": {
432 |       "category": "Reddit"
433 |    },
434 |    "A keyboard made of water, the water is made of light, the light is turned off.": {
435 |       "category": "Reddit"
436 |    },
437 |    "Painting of Mona Lisa but the view is from behind of Mona Lisa.": {
438 |       "category": "Reddit"
439 |    },
440 |    "Hyper-realistic photo of an abandoned industrial site during a storm.": {
441 |       "category": "Reddit"
442 |    },
443 |    "A screenshot of an iOS app for ordering different types of milk.": {
444 |       "category": "Reddit"
445 |    },
446 |    "A real life photography of super mario, 8k Ultra HD.": {
447 |       "category": "Reddit"
448 |    },
449 |    "Colouring page of large cats climbing the eifel tower in a cyberpunk future.": {
450 |       "category": "Reddit"
451 |    },
452 |    "Photo of a mega Lego space station inside a kid's bedroom.": {
453 |       "category": "Reddit"
454 |    },
455 |    "A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.": {
456 |       "category": "Reddit"
457 |    },
458 |    "A photocopy of a photograph of a painting of a sculpture of a giraffe.": {
459 |       "category": "Reddit"
460 |    },
461 |    "A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.": {
462 |       "category": "Reddit"
463 |    },
464 |    "A maglev train going vertically downward in high speed, New York Times photojournalism.": {
465 |       "category": "Reddit"
466 |    },
467 |    "A magnifying glass over a page of a 1950s batman comic.": {
468 |       "category": "Reddit"
469 |    },
470 |    "A car playing soccer, digital art.": {
471 |       "category": "Reddit"
472 |    },
473 |    "Darth Vader playing with raccoon in Mars during sunset.": {
474 |       "category": "Reddit"
475 |    },
476 |    "A 1960s poster warning against climate change.": {
477 |       "category": "Reddit"
478 |    },
479 |    "Illustration of a mouse using a mushroom as an umbrella.": {
480 |       "category": "Reddit"
481 |    },
482 |    "A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.": {
483 |       "category": "Reddit"
484 |    },
485 |    "A pyramid made of falafel with a partial solar eclipse in the background.": {
486 |       "category": "Reddit"
487 |    },
488 |    "A storefront with 'Hello World' written on it.": {
489 |       "category": "Text"
490 |    },
491 |    "A storefront with 'Diffusion' written on it.": {
492 |       "category": "Text"
493 |    },
494 |    "A storefront with 'Text to Image' written on it.": {
495 |       "category": "Text"
496 |    },
497 |    "A storefront with 'NeurIPS' written on it.": {
498 |       "category": "Text"
499 |    },
500 |    "A storefront with 'Deep Learning' written on it.": {
501 |       "category": "Text"
502 |    },
503 |    "A storefront with 'Google Brain Toronto' written on it.": {
504 |       "category": "Text"
505 |    },
506 |    "A storefront with 'Google Research Pizza Cafe' written on it.": {
507 |       "category": "Text"
508 |    },
509 |    "A sign that says 'Hello World'.": {
510 |       "category": "Text"
511 |    },
512 |    "A sign that says 'Diffusion'.": {
513 |       "category": "Text"
514 |    },
515 |    "A sign that says 'Text to Image'.": {
516 |       "category": "Text"
517 |    },
518 |    "A sign that says 'NeurIPS'.": {
519 |       "category": "Text"
520 |    },
521 |    "A sign that says 'Deep Learning'.": {
522 |       "category": "Text"
523 |    },
524 |    "A sign that says 'Google Brain Toronto'.": {
525 |       "category": "Text"
526 |    },
527 |    "A sign that says 'Google Research Pizza Cafe'.": {
528 |       "category": "Text"
529 |    },
530 |    "New York Skyline with 'Hello World' written with fireworks on the sky.": {
531 |       "category": "Text"
532 |    },
533 |    "New York Skyline with 'Diffusion' written with fireworks on the sky.": {
534 |       "category": "Text"
535 |    },
536 |    "New York Skyline with 'Text to Image' written with fireworks on the sky.": {
537 |       "category": "Text"
538 |    },
539 |    "New York Skyline with 'NeurIPS' written with fireworks on the sky.": {
540 |       "category": "Text"
541 |    },
542 |    "New York Skyline with 'Deep Learning' written with fireworks on the sky.": {
543 |       "category": "Text"
544 |    },
545 |    "New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.": {
546 |       "category": "Text"
547 |    },
548 |    "New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.": {
549 |       "category": "Text"
550 |    }
551 | }


--------------------------------------------------------------------------------
/alignment/assets/imagenet_classes.txt:
--------------------------------------------------------------------------------
   1 | tench, Tinca tinca
   2 | goldfish, Carassius auratus
   3 | great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
   4 | tiger shark, Galeocerdo cuvieri
   5 | hammerhead, hammerhead shark
   6 | electric ray, crampfish, numbfish, torpedo
   7 | stingray
   8 | cock
   9 | hen
  10 | ostrich, Struthio camelus
  11 | brambling, Fringilla montifringilla
  12 | goldfinch, Carduelis carduelis
  13 | house finch, linnet, Carpodacus mexicanus
  14 | junco, snowbird
  15 | indigo bunting, indigo finch, indigo bird, Passerina cyanea
  16 | robin, American robin, Turdus migratorius
  17 | bulbul
  18 | jay
  19 | magpie
  20 | chickadee
  21 | water ouzel, dipper
  22 | kite
  23 | bald eagle, American eagle, Haliaeetus leucocephalus
  24 | vulture
  25 | great grey owl, great gray owl, Strix nebulosa
  26 | European fire salamander, Salamandra salamandra
  27 | common newt, Triturus vulgaris
  28 | eft
  29 | spotted salamander, Ambystoma maculatum
  30 | axolotl, mud puppy, Ambystoma mexicanum
  31 | bullfrog, Rana catesbeiana
  32 | tree frog, tree-frog
  33 | tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
  34 | loggerhead, loggerhead turtle, Caretta caretta
  35 | leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
  36 | mud turtle
  37 | terrapin
  38 | box turtle, box tortoise
  39 | banded gecko
  40 | common iguana, iguana, Iguana iguana
  41 | American chameleon, anole, Anolis carolinensis
  42 | whiptail, whiptail lizard
  43 | agama
  44 | frilled lizard, Chlamydosaurus kingi
  45 | alligator lizard
  46 | Gila monster, Heloderma suspectum
  47 | green lizard, Lacerta viridis
  48 | African chameleon, Chamaeleo chamaeleon
  49 | Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
  50 | African crocodile, Nile crocodile, Crocodylus niloticus
  51 | American alligator, Alligator mississipiensis
  52 | triceratops
  53 | thunder snake, worm snake, Carphophis amoenus
  54 | ringneck snake, ring-necked snake, ring snake
  55 | hognose snake, puff adder, sand viper
  56 | green snake, grass snake
  57 | king snake, kingsnake
  58 | garter snake, grass snake
  59 | water snake
  60 | vine snake
  61 | night snake, Hypsiglena torquata
  62 | boa constrictor, Constrictor constrictor
  63 | rock python, rock snake, Python sebae
  64 | Indian cobra, Naja naja
  65 | green mamba
  66 | sea snake
  67 | horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
  68 | diamondback, diamondback rattlesnake, Crotalus adamanteus
  69 | sidewinder, horned rattlesnake, Crotalus cerastes
  70 | trilobite
  71 | harvestman, daddy longlegs, Phalangium opilio
  72 | scorpion
  73 | black and gold garden spider, Argiope aurantia
  74 | barn spider, Araneus cavaticus
  75 | garden spider, Aranea diademata
  76 | black widow, Latrodectus mactans
  77 | tarantula
  78 | wolf spider, hunting spider
  79 | tick
  80 | centipede
  81 | black grouse
  82 | ptarmigan
  83 | ruffed grouse, partridge, Bonasa umbellus
  84 | prairie chicken, prairie grouse, prairie fowl
  85 | peacock
  86 | quail
  87 | partridge
  88 | African grey, African gray, Psittacus erithacus
  89 | macaw
  90 | sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
  91 | lorikeet
  92 | coucal
  93 | bee eater
  94 | hornbill
  95 | hummingbird
  96 | jacamar
  97 | toucan
  98 | drake
  99 | red-breasted merganser, Mergus serrator
 100 | goose
 101 | black swan, Cygnus atratus
 102 | tusker
 103 | echidna, spiny anteater, anteater
 104 | platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
 105 | wallaby, brush kangaroo
 106 | koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
 107 | wombat
 108 | jellyfish
 109 | sea anemone, anemone
 110 | brain coral
 111 | flatworm, platyhelminth
 112 | nematode, nematode worm, roundworm
 113 | conch
 114 | snail
 115 | slug
 116 | sea slug, nudibranch
 117 | chiton, coat-of-mail shell, sea cradle, polyplacophore
 118 | chambered nautilus, pearly nautilus, nautilus
 119 | Dungeness crab, Cancer magister
 120 | rock crab, Cancer irroratus
 121 | fiddler crab
 122 | king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
 123 | American lobster, Northern lobster, Maine lobster, Homarus americanus
 124 | spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
 125 | crayfish, crawfish, crawdad, crawdaddy
 126 | hermit crab
 127 | isopod
 128 | white stork, Ciconia ciconia
 129 | black stork, Ciconia nigra
 130 | spoonbill
 131 | flamingo
 132 | little blue heron, Egretta caerulea
 133 | American egret, great white heron, Egretta albus
 134 | bittern
 135 | crane
 136 | limpkin, Aramus pictus
 137 | European gallinule, Porphyrio porphyrio
 138 | American coot, marsh hen, mud hen, water hen, Fulica americana
 139 | bustard
 140 | ruddy turnstone, Arenaria interpres
 141 | red-backed sandpiper, dunlin, Erolia alpina
 142 | redshank, Tringa totanus
 143 | dowitcher
 144 | oystercatcher, oyster catcher
 145 | pelican
 146 | king penguin, Aptenodytes patagonica
 147 | albatross, mollymawk
 148 | grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
 149 | killer whale, killer, orca, grampus, sea wolf, Orcinus orca
 150 | dugong, Dugong dugon
 151 | sea lion
 152 | Chihuahua
 153 | Japanese spaniel
 154 | Maltese dog, Maltese terrier, Maltese
 155 | Pekinese, Pekingese, Peke
 156 | Shih-Tzu
 157 | Blenheim spaniel
 158 | papillon
 159 | toy terrier
 160 | Rhodesian ridgeback
 161 | Afghan hound, Afghan
 162 | basset, basset hound
 163 | beagle
 164 | bloodhound, sleuthhound
 165 | bluetick
 166 | black-and-tan coonhound
 167 | Walker hound, Walker foxhound
 168 | English foxhound
 169 | redbone
 170 | borzoi, Russian wolfhound
 171 | Irish wolfhound
 172 | Italian greyhound
 173 | whippet
 174 | Ibizan hound, Ibizan Podenco
 175 | Norwegian elkhound, elkhound
 176 | otterhound, otter hound
 177 | Saluki, gazelle hound
 178 | Scottish deerhound, deerhound
 179 | Weimaraner
 180 | Staffordshire bullterrier, Staffordshire bull terrier
 181 | American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
 182 | Bedlington terrier
 183 | Border terrier
 184 | Kerry blue terrier
 185 | Irish terrier
 186 | Norfolk terrier
 187 | Norwich terrier
 188 | Yorkshire terrier
 189 | wire-haired fox terrier
 190 | Lakeland terrier
 191 | Sealyham terrier, Sealyham
 192 | Airedale, Airedale terrier
 193 | cairn, cairn terrier
 194 | Australian terrier
 195 | Dandie Dinmont, Dandie Dinmont terrier
 196 | Boston bull, Boston terrier
 197 | miniature schnauzer
 198 | giant schnauzer
 199 | standard schnauzer
 200 | Scotch terrier, Scottish terrier, Scottie
 201 | Tibetan terrier, chrysanthemum dog
 202 | silky terrier, Sydney silky
 203 | soft-coated wheaten terrier
 204 | West Highland white terrier
 205 | Lhasa, Lhasa apso
 206 | flat-coated retriever
 207 | curly-coated retriever
 208 | golden retriever
 209 | Labrador retriever
 210 | Chesapeake Bay retriever
 211 | German short-haired pointer
 212 | vizsla, Hungarian pointer
 213 | English setter
 214 | Irish setter, red setter
 215 | Gordon setter
 216 | Brittany spaniel
 217 | clumber, clumber spaniel
 218 | English springer, English springer spaniel
 219 | Welsh springer spaniel
 220 | cocker spaniel, English cocker spaniel, cocker
 221 | Sussex spaniel
 222 | Irish water spaniel
 223 | kuvasz
 224 | schipperke
 225 | groenendael
 226 | malinois
 227 | briard
 228 | kelpie
 229 | komondor
 230 | Old English sheepdog, bobtail
 231 | Shetland sheepdog, Shetland sheep dog, Shetland
 232 | collie
 233 | Border collie
 234 | Bouvier des Flandres, Bouviers des Flandres
 235 | Rottweiler
 236 | German shepherd, German shepherd dog, German police dog, alsatian
 237 | Doberman, Doberman pinscher
 238 | miniature pinscher
 239 | Greater Swiss Mountain dog
 240 | Bernese mountain dog
 241 | Appenzeller
 242 | EntleBucher
 243 | boxer
 244 | bull mastiff
 245 | Tibetan mastiff
 246 | French bulldog
 247 | Great Dane
 248 | Saint Bernard, St Bernard
 249 | Eskimo dog, husky
 250 | malamute, malemute, Alaskan malamute
 251 | Siberian husky
 252 | dalmatian, coach dog, carriage dog
 253 | affenpinscher, monkey pinscher, monkey dog
 254 | basenji
 255 | pug, pug-dog
 256 | Leonberg
 257 | Newfoundland, Newfoundland dog
 258 | Great Pyrenees
 259 | Samoyed, Samoyede
 260 | Pomeranian
 261 | chow, chow chow
 262 | keeshond
 263 | Brabancon griffon
 264 | Pembroke, Pembroke Welsh corgi
 265 | Cardigan, Cardigan Welsh corgi
 266 | toy poodle
 267 | miniature poodle
 268 | standard poodle
 269 | Mexican hairless
 270 | timber wolf, grey wolf, gray wolf, Canis lupus
 271 | white wolf, Arctic wolf, Canis lupus tundrarum
 272 | red wolf, maned wolf, Canis rufus, Canis niger
 273 | coyote, prairie wolf, brush wolf, Canis latrans
 274 | dingo, warrigal, warragal, Canis dingo
 275 | dhole, Cuon alpinus
 276 | African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
 277 | hyena, hyaena
 278 | red fox, Vulpes vulpes
 279 | kit fox, Vulpes macrotis
 280 | Arctic fox, white fox, Alopex lagopus
 281 | grey fox, gray fox, Urocyon cinereoargenteus
 282 | tabby, tabby cat
 283 | tiger cat
 284 | Persian cat
 285 | Siamese cat, Siamese
 286 | Egyptian cat
 287 | cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
 288 | lynx, catamount
 289 | leopard, Panthera pardus
 290 | snow leopard, ounce, Panthera uncia
 291 | jaguar, panther, Panthera onca, Felis onca
 292 | lion, king of beasts, Panthera leo
 293 | tiger, Panthera tigris
 294 | cheetah, chetah, Acinonyx jubatus
 295 | brown bear, bruin, Ursus arctos
 296 | American black bear, black bear, Ursus americanus, Euarctos americanus
 297 | ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
 298 | sloth bear, Melursus ursinus, Ursus ursinus
 299 | mongoose
 300 | meerkat, mierkat
 301 | tiger beetle
 302 | ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
 303 | ground beetle, carabid beetle
 304 | long-horned beetle, longicorn, longicorn beetle
 305 | leaf beetle, chrysomelid
 306 | dung beetle
 307 | rhinoceros beetle
 308 | weevil
 309 | fly
 310 | bee
 311 | ant, emmet, pismire
 312 | grasshopper, hopper
 313 | cricket
 314 | walking stick, walkingstick, stick insect
 315 | cockroach, roach
 316 | mantis, mantid
 317 | cicada, cicala
 318 | leafhopper
 319 | lacewing, lacewing fly
 320 | dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
 321 | damselfly
 322 | admiral
 323 | ringlet, ringlet butterfly
 324 | monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
 325 | cabbage butterfly
 326 | sulphur butterfly, sulfur butterfly
 327 | lycaenid, lycaenid butterfly
 328 | starfish, sea star
 329 | sea urchin
 330 | sea cucumber, holothurian
 331 | wood rabbit, cottontail, cottontail rabbit
 332 | hare
 333 | Angora, Angora rabbit
 334 | hamster
 335 | porcupine, hedgehog
 336 | fox squirrel, eastern fox squirrel, Sciurus niger
 337 | marmot
 338 | beaver
 339 | guinea pig, Cavia cobaya
 340 | sorrel
 341 | zebra
 342 | hog, pig, grunter, squealer, Sus scrofa
 343 | wild boar, boar, Sus scrofa
 344 | warthog
 345 | hippopotamus, hippo, river horse, Hippopotamus amphibius
 346 | ox
 347 | water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
 348 | bison
 349 | ram, tup
 350 | bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
 351 | ibex, Capra ibex
 352 | hartebeest
 353 | impala, Aepyceros melampus
 354 | gazelle
 355 | Arabian camel, dromedary, Camelus dromedarius
 356 | llama
 357 | weasel
 358 | mink
 359 | polecat, fitch, foulmart, foumart, Mustela putorius
 360 | black-footed ferret, ferret, Mustela nigripes
 361 | otter
 362 | skunk, polecat, wood pussy
 363 | badger
 364 | armadillo
 365 | three-toed sloth, ai, Bradypus tridactylus
 366 | orangutan, orang, orangutang, Pongo pygmaeus
 367 | gorilla, Gorilla gorilla
 368 | chimpanzee, chimp, Pan troglodytes
 369 | gibbon, Hylobates lar
 370 | siamang, Hylobates syndactylus, Symphalangus syndactylus
 371 | guenon, guenon monkey
 372 | patas, hussar monkey, Erythrocebus patas
 373 | baboon
 374 | macaque
 375 | langur
 376 | colobus, colobus monkey
 377 | proboscis monkey, Nasalis larvatus
 378 | marmoset
 379 | capuchin, ringtail, Cebus capucinus
 380 | howler monkey, howler
 381 | titi, titi monkey
 382 | spider monkey, Ateles geoffroyi
 383 | squirrel monkey, Saimiri sciureus
 384 | Madagascar cat, ring-tailed lemur, Lemur catta
 385 | indri, indris, Indri indri, Indri brevicaudatus
 386 | Indian elephant, Elephas maximus
 387 | African elephant, Loxodonta africana
 388 | lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
 389 | giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
 390 | barracouta, snoek
 391 | eel
 392 | coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
 393 | rock beauty, Holocanthus tricolor
 394 | anemone fish
 395 | sturgeon
 396 | gar, garfish, garpike, billfish, Lepisosteus osseus
 397 | lionfish
 398 | puffer, pufferfish, blowfish, globefish
 399 | abacus
 400 | abaya
 401 | academic gown, academic robe, judge's robe
 402 | accordion, piano accordion, squeeze box
 403 | acoustic guitar
 404 | aircraft carrier, carrier, flattop, attack aircraft carrier
 405 | airliner
 406 | airship, dirigible
 407 | altar
 408 | ambulance
 409 | amphibian, amphibious vehicle
 410 | analog clock
 411 | apiary, bee house
 412 | apron
 413 | ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
 414 | assault rifle, assault gun
 415 | backpack, back pack, knapsack, packsack, rucksack, haversack
 416 | bakery, bakeshop, bakehouse
 417 | balance beam, beam
 418 | balloon
 419 | ballpoint, ballpoint pen, ballpen, Biro
 420 | Band Aid
 421 | banjo
 422 | bannister, banister, balustrade, balusters, handrail
 423 | barbell
 424 | barber chair
 425 | barbershop
 426 | barn
 427 | barometer
 428 | barrel, cask
 429 | barrow, garden cart, lawn cart, wheelbarrow
 430 | baseball
 431 | basketball
 432 | bassinet
 433 | bassoon
 434 | bathing cap, swimming cap
 435 | bath towel
 436 | bathtub, bathing tub, bath, tub
 437 | beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
 438 | beacon, lighthouse, beacon light, pharos
 439 | beaker
 440 | bearskin, busby, shako
 441 | beer bottle
 442 | beer glass
 443 | bell cote, bell cot
 444 | bib
 445 | bicycle-built-for-two, tandem bicycle, tandem
 446 | bikini, two-piece
 447 | binder, ring-binder
 448 | binoculars, field glasses, opera glasses
 449 | birdhouse
 450 | boathouse
 451 | bobsled, bobsleigh, bob
 452 | bolo tie, bolo, bola tie, bola
 453 | bonnet, poke bonnet
 454 | bookcase
 455 | bookshop, bookstore, bookstall
 456 | bottlecap
 457 | bow
 458 | bow tie, bow-tie, bowtie
 459 | brass, memorial tablet, plaque
 460 | brassiere, bra, bandeau
 461 | breakwater, groin, groyne, mole, bulwark, seawall, jetty
 462 | breastplate, aegis, egis
 463 | broom
 464 | bucket, pail
 465 | buckle
 466 | bulletproof vest
 467 | bullet train, bullet
 468 | butcher shop, meat market
 469 | cab, hack, taxi, taxicab
 470 | caldron, cauldron
 471 | candle, taper, wax light
 472 | cannon
 473 | canoe
 474 | can opener, tin opener
 475 | cardigan
 476 | car mirror
 477 | carousel, carrousel, merry-go-round, roundabout, whirligig
 478 | carpenter's kit, tool kit
 479 | carton
 480 | car wheel
 481 | cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
 482 | cassette
 483 | cassette player
 484 | castle
 485 | catamaran
 486 | CD player
 487 | cello, violoncello
 488 | cellular telephone, cellular phone, cellphone, cell, mobile phone
 489 | chain
 490 | chainlink fence
 491 | chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
 492 | chain saw, chainsaw
 493 | chest
 494 | chiffonier, commode
 495 | chime, bell, gong
 496 | china cabinet, china closet
 497 | Christmas stocking
 498 | church, church building
 499 | cinema, movie theater, movie theatre, movie house, picture palace
 500 | cleaver, meat cleaver, chopper
 501 | cliff dwelling
 502 | cloak
 503 | clog, geta, patten, sabot
 504 | cocktail shaker
 505 | coffee mug
 506 | coffeepot
 507 | coil, spiral, volute, whorl, helix
 508 | combination lock
 509 | computer keyboard, keypad
 510 | confectionery, confectionary, candy store
 511 | container ship, containership, container vessel
 512 | convertible
 513 | corkscrew, bottle screw
 514 | cornet, horn, trumpet, trump
 515 | cowboy boot
 516 | cowboy hat, ten-gallon hat
 517 | cradle
 518 | crane
 519 | crash helmet
 520 | crate
 521 | crib, cot
 522 | Crock Pot
 523 | croquet ball
 524 | crutch
 525 | cuirass
 526 | dam, dike, dyke
 527 | desk
 528 | desktop computer
 529 | dial telephone, dial phone
 530 | diaper, nappy, napkin
 531 | digital clock
 532 | digital watch
 533 | dining table, board
 534 | dishrag, dishcloth
 535 | dishwasher, dish washer, dishwashing machine
 536 | disk brake, disc brake
 537 | dock, dockage, docking facility
 538 | dogsled, dog sled, dog sleigh
 539 | dome
 540 | doormat, welcome mat
 541 | drilling platform, offshore rig
 542 | drum, membranophone, tympan
 543 | drumstick
 544 | dumbbell
 545 | Dutch oven
 546 | electric fan, blower
 547 | electric guitar
 548 | electric locomotive
 549 | entertainment center
 550 | envelope
 551 | espresso maker
 552 | face powder
 553 | feather boa, boa
 554 | file, file cabinet, filing cabinet
 555 | fireboat
 556 | fire engine, fire truck
 557 | fire screen, fireguard
 558 | flagpole, flagstaff
 559 | flute, transverse flute
 560 | folding chair
 561 | football helmet
 562 | forklift
 563 | fountain
 564 | fountain pen
 565 | four-poster
 566 | freight car
 567 | French horn, horn
 568 | frying pan, frypan, skillet
 569 | fur coat
 570 | garbage truck, dustcart
 571 | gasmask, respirator, gas helmet
 572 | gas pump, gasoline pump, petrol pump, island dispenser
 573 | goblet
 574 | go-kart
 575 | golf ball
 576 | golfcart, golf cart
 577 | gondola
 578 | gong, tam-tam
 579 | gown
 580 | grand piano, grand
 581 | greenhouse, nursery, glasshouse
 582 | grille, radiator grille
 583 | grocery store, grocery, food market, market
 584 | guillotine
 585 | hair slide
 586 | hair spray
 587 | half track
 588 | hammer
 589 | hamper
 590 | hand blower, blow dryer, blow drier, hair dryer, hair drier
 591 | hand-held computer, hand-held microcomputer
 592 | handkerchief, hankie, hanky, hankey
 593 | hard disc, hard disk, fixed disk
 594 | harmonica, mouth organ, harp, mouth harp
 595 | harp
 596 | harvester, reaper
 597 | hatchet
 598 | holster
 599 | home theater, home theatre
 600 | honeycomb
 601 | hook, claw
 602 | hoopskirt, crinoline
 603 | horizontal bar, high bar
 604 | horse cart, horse-cart
 605 | hourglass
 606 | iPod
 607 | iron, smoothing iron
 608 | jack-o'-lantern
 609 | jean, blue jean, denim
 610 | jeep, landrover
 611 | jersey, T-shirt, tee shirt
 612 | jigsaw puzzle
 613 | jinrikisha, ricksha, rickshaw
 614 | joystick
 615 | kimono
 616 | knee pad
 617 | knot
 618 | lab coat, laboratory coat
 619 | ladle
 620 | lampshade, lamp shade
 621 | laptop, laptop computer
 622 | lawn mower, mower
 623 | lens cap, lens cover
 624 | letter opener, paper knife, paperknife
 625 | library
 626 | lifeboat
 627 | lighter, light, igniter, ignitor
 628 | limousine, limo
 629 | liner, ocean liner
 630 | lipstick, lip rouge
 631 | Loafer
 632 | lotion
 633 | loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
 634 | loupe, jeweler's loupe
 635 | lumbermill, sawmill
 636 | magnetic compass
 637 | mailbag, postbag
 638 | mailbox, letter box
 639 | maillot
 640 | maillot, tank suit
 641 | manhole cover
 642 | maraca
 643 | marimba, xylophone
 644 | mask
 645 | matchstick
 646 | maypole
 647 | maze, labyrinth
 648 | measuring cup
 649 | medicine chest, medicine cabinet
 650 | megalith, megalithic structure
 651 | microphone, mike
 652 | microwave, microwave oven
 653 | military uniform
 654 | milk can
 655 | minibus
 656 | miniskirt, mini
 657 | minivan
 658 | missile
 659 | mitten
 660 | mixing bowl
 661 | mobile home, manufactured home
 662 | Model T
 663 | modem
 664 | monastery
 665 | monitor
 666 | moped
 667 | mortar
 668 | mortarboard
 669 | mosque
 670 | mosquito net
 671 | motor scooter, scooter
 672 | mountain bike, all-terrain bike, off-roader
 673 | mountain tent
 674 | mouse, computer mouse
 675 | mousetrap
 676 | moving van
 677 | muzzle
 678 | nail
 679 | neck brace
 680 | necklace
 681 | nipple
 682 | notebook, notebook computer
 683 | obelisk
 684 | oboe, hautboy, hautbois
 685 | ocarina, sweet potato
 686 | odometer, hodometer, mileometer, milometer
 687 | oil filter
 688 | organ, pipe organ
 689 | oscilloscope, scope, cathode-ray oscilloscope, CRO
 690 | overskirt
 691 | oxcart
 692 | oxygen mask
 693 | packet
 694 | paddle, boat paddle
 695 | paddlewheel, paddle wheel
 696 | padlock
 697 | paintbrush
 698 | pajama, pyjama, pj's, jammies
 699 | palace
 700 | panpipe, pandean pipe, syrinx
 701 | paper towel
 702 | parachute, chute
 703 | parallel bars, bars
 704 | park bench
 705 | parking meter
 706 | passenger car, coach, carriage
 707 | patio, terrace
 708 | pay-phone, pay-station
 709 | pedestal, plinth, footstall
 710 | pencil box, pencil case
 711 | pencil sharpener
 712 | perfume, essence
 713 | Petri dish
 714 | photocopier
 715 | pick, plectrum, plectron
 716 | pickelhaube
 717 | picket fence, paling
 718 | pickup, pickup truck
 719 | pier
 720 | piggy bank, penny bank
 721 | pill bottle
 722 | pillow
 723 | ping-pong ball
 724 | pinwheel
 725 | pirate, pirate ship
 726 | pitcher, ewer
 727 | plane, carpenter's plane, woodworking plane
 728 | planetarium
 729 | plastic bag
 730 | plate rack
 731 | plow, plough
 732 | plunger, plumber's helper
 733 | Polaroid camera, Polaroid Land camera
 734 | pole
 735 | police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
 736 | poncho
 737 | pool table, billiard table, snooker table
 738 | pop bottle, soda bottle
 739 | pot, flowerpot
 740 | potter's wheel
 741 | power drill
 742 | prayer rug, prayer mat
 743 | printer
 744 | prison, prison house
 745 | projectile, missile
 746 | projector
 747 | puck, hockey puck
 748 | punching bag, punch bag, punching ball, punchball
 749 | purse
 750 | quill, quill pen
 751 | quilt, comforter, comfort, puff
 752 | racer, race car, racing car
 753 | racket, racquet
 754 | radiator
 755 | radio, wireless
 756 | radio telescope, radio reflector
 757 | rain barrel
 758 | recreational vehicle, RV, R.V.
 759 | reel
 760 | reflex camera
 761 | refrigerator, icebox
 762 | remote control, remote
 763 | restaurant, eating house, eating place, eatery
 764 | revolver, six-gun, six-shooter
 765 | rifle
 766 | rocking chair, rocker
 767 | rotisserie
 768 | rubber eraser, rubber, pencil eraser
 769 | rugby ball
 770 | rule, ruler
 771 | running shoe
 772 | safe
 773 | safety pin
 774 | saltshaker, salt shaker
 775 | sandal
 776 | sarong
 777 | sax, saxophone
 778 | scabbard
 779 | scale, weighing machine
 780 | school bus
 781 | schooner
 782 | scoreboard
 783 | screen, CRT screen
 784 | screw
 785 | screwdriver
 786 | seat belt, seatbelt
 787 | sewing machine
 788 | shield, buckler
 789 | shoe shop, shoe-shop, shoe store
 790 | shoji
 791 | shopping basket
 792 | shopping cart
 793 | shovel
 794 | shower cap
 795 | shower curtain
 796 | ski
 797 | ski mask
 798 | sleeping bag
 799 | slide rule, slipstick
 800 | sliding door
 801 | slot, one-armed bandit
 802 | snorkel
 803 | snowmobile
 804 | snowplow, snowplough
 805 | soap dispenser
 806 | soccer ball
 807 | sock
 808 | solar dish, solar collector, solar furnace
 809 | sombrero
 810 | soup bowl
 811 | space bar
 812 | space heater
 813 | space shuttle
 814 | spatula
 815 | speedboat
 816 | spider web, spider's web
 817 | spindle
 818 | sports car, sport car
 819 | spotlight, spot
 820 | stage
 821 | steam locomotive
 822 | steel arch bridge
 823 | steel drum
 824 | stethoscope
 825 | stole
 826 | stone wall
 827 | stopwatch, stop watch
 828 | stove
 829 | strainer
 830 | streetcar, tram, tramcar, trolley, trolley car
 831 | stretcher
 832 | studio couch, day bed
 833 | stupa, tope
 834 | submarine, pigboat, sub, U-boat
 835 | suit, suit of clothes
 836 | sundial
 837 | sunglass
 838 | sunglasses, dark glasses, shades
 839 | sunscreen, sunblock, sun blocker
 840 | suspension bridge
 841 | swab, swob, mop
 842 | sweatshirt
 843 | swimming trunks, bathing trunks
 844 | swing
 845 | switch, electric switch, electrical switch
 846 | syringe
 847 | table lamp
 848 | tank, army tank, armored combat vehicle, armoured combat vehicle
 849 | tape player
 850 | teapot
 851 | teddy, teddy bear
 852 | television, television system
 853 | tennis ball
 854 | thatch, thatched roof
 855 | theater curtain, theatre curtain
 856 | thimble
 857 | thresher, thrasher, threshing machine
 858 | throne
 859 | tile roof
 860 | toaster
 861 | tobacco shop, tobacconist shop, tobacconist
 862 | toilet seat
 863 | torch
 864 | totem pole
 865 | tow truck, tow car, wrecker
 866 | toyshop
 867 | tractor
 868 | trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
 869 | tray
 870 | trench coat
 871 | tricycle, trike, velocipede
 872 | trimaran
 873 | tripod
 874 | triumphal arch
 875 | trolleybus, trolley coach, trackless trolley
 876 | trombone
 877 | tub, vat
 878 | turnstile
 879 | typewriter keyboard
 880 | umbrella
 881 | unicycle, monocycle
 882 | upright, upright piano
 883 | vacuum, vacuum cleaner
 884 | vase
 885 | vault
 886 | velvet
 887 | vending machine
 888 | vestment
 889 | viaduct
 890 | violin, fiddle
 891 | volleyball
 892 | waffle iron
 893 | wall clock
 894 | wallet, billfold, notecase, pocketbook
 895 | wardrobe, closet, press
 896 | warplane, military plane
 897 | washbasin, handbasin, washbowl, lavabo, wash-hand basin
 898 | washer, automatic washer, washing machine
 899 | water bottle
 900 | water jug
 901 | water tower
 902 | whiskey jug
 903 | whistle
 904 | wig
 905 | window screen
 906 | window shade
 907 | Windsor tie
 908 | wine bottle
 909 | wing
 910 | wok
 911 | wooden spoon
 912 | wool, woolen, woollen
 913 | worm fence, snake fence, snake-rail fence, Virginia fence
 914 | wreck
 915 | yawl
 916 | yurt
 917 | web site, website, internet site, site
 918 | comic book
 919 | crossword puzzle, crossword
 920 | street sign
 921 | traffic light, traffic signal, stoplight
 922 | book jacket, dust cover, dust jacket, dust wrapper
 923 | menu
 924 | plate
 925 | guacamole
 926 | consomme
 927 | hot pot, hotpot
 928 | trifle
 929 | ice cream, icecream
 930 | ice lolly, lolly, lollipop, popsicle
 931 | French loaf
 932 | bagel, beigel
 933 | pretzel
 934 | cheeseburger
 935 | hotdog, hot dog, red hot
 936 | mashed potato
 937 | head cabbage
 938 | broccoli
 939 | cauliflower
 940 | zucchini, courgette
 941 | spaghetti squash
 942 | acorn squash
 943 | butternut squash
 944 | cucumber, cuke
 945 | artichoke, globe artichoke
 946 | bell pepper
 947 | cardoon
 948 | mushroom
 949 | Granny Smith
 950 | strawberry
 951 | orange
 952 | lemon
 953 | fig
 954 | pineapple, ananas
 955 | banana
 956 | jackfruit, jak, jack
 957 | custard apple
 958 | pomegranate
 959 | hay
 960 | carbonara
 961 | chocolate sauce, chocolate syrup
 962 | dough
 963 | meat loaf, meatloaf
 964 | pizza, pizza pie
 965 | potpie
 966 | burrito
 967 | red wine
 968 | espresso
 969 | cup
 970 | eggnog
 971 | alp
 972 | bubble
 973 | cliff, drop, drop-off
 974 | coral reef
 975 | geyser
 976 | lakeside, lakeshore
 977 | promontory, headland, head, foreland
 978 | sandbar, sand bar
 979 | seashore, coast, seacoast, sea-coast
 980 | valley, vale
 981 | volcano
 982 | ballplayer, baseball player
 983 | groom, bridegroom
 984 | scuba diver
 985 | rapeseed
 986 | daisy
 987 | yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
 988 | corn
 989 | acorn
 990 | hip, rose hip, rosehip
 991 | buckeye, horse chestnut, conker
 992 | coral fungus
 993 | agaric
 994 | gyromitra
 995 | stinkhorn, carrion fungus
 996 | earthstar
 997 | hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
 998 | bolete
 999 | ear, spike, capitulum
1000 | toilet tissue, toilet paper, bathroom tissue


--------------------------------------------------------------------------------
/alignment/assets/sac+logos+ava1-l14-linearMSE.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apple/ml-diffusion-alignment-gflownet/12c59520669b195e919540b940a510cef6f46ae7/alignment/assets/sac+logos+ava1-l14-linearMSE.pth


--------------------------------------------------------------------------------
/alignment/assets/simple_animals.txt:
--------------------------------------------------------------------------------
 1 | cat
 2 | dog
 3 | horse
 4 | monkey
 5 | rabbit
 6 | zebra
 7 | spider
 8 | bird
 9 | sheep
10 | deer
11 | cow
12 | goat
13 | lion
14 | tiger
15 | bear
16 | raccoon
17 | fox
18 | wolf
19 | lizard
20 | beetle
21 | ant
22 | butterfly
23 | fish
24 | shark
25 | whale
26 | dolphin
27 | squirrel
28 | mouse
29 | rat
30 | snake
31 | turtle
32 | frog
33 | chicken
34 | duck
35 | goose
36 | bee
37 | pig
38 | turkey
39 | fly
40 | llama
41 | camel
42 | bat
43 | gorilla
44 | hedgehog
45 | kangaroo
46 | 


--------------------------------------------------------------------------------
/alignment/diffusers_patch/ddim_with_logprob.py:
--------------------------------------------------------------------------------
  1 | # For licensing see accompanying LICENSE file.
  2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  3 | 
  4 | # Modified from https://github.com/huggingface/diffusers/blob/fc6acb6b97e93d58cb22b5fee52d884d77ce84d8/src/diffusers/schedulers/scheduling_ddim.py
  5 | 
  6 | from typing import Optional, Tuple, Union
  7 | 
  8 | import math
  9 | import torch
 10 | 
 11 | try:
 12 |     from diffusers.utils import randn_tensor
 13 | except ImportError:
 14 |     from diffusers.utils.torch_utils import randn_tensor
 15 | from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput, DDIMScheduler
 16 | 
 17 | 
 18 | def _left_broadcast(t, shape):
 19 |     assert t.ndim <= len(shape)
 20 |     return t.reshape(t.shape + (1,) * (len(shape) - t.ndim)).broadcast_to(shape)
 21 | 
 22 | 
 23 | def _get_variance(self, timestep, prev_timestep):
 24 |     alpha_prod_t = torch.gather(self.alphas_cumprod, 0, timestep.cpu()).to(
 25 |         timestep.device
 26 |     )
 27 |     alpha_prod_t_prev = torch.where(
 28 |         prev_timestep.cpu() >= 0,
 29 |         self.alphas_cumprod.gather(0, prev_timestep.cpu()),
 30 |         self.final_alpha_cumprod,
 31 |     ).to(timestep.device)
 32 |     beta_prod_t = 1 - alpha_prod_t
 33 |     beta_prod_t_prev = 1 - alpha_prod_t_prev
 34 | 
 35 |     variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
 36 |     return variance
 37 | 
 38 | 
def ddim_step_with_logprob(
    self: DDIMScheduler,
    model_output: torch.FloatTensor,
    timestep: torch.Tensor,
    sample: torch.FloatTensor,
    eta: float = 1.0,
    use_clipped_model_output: bool = False,
    generator=None,
    prev_sample: Optional[torch.FloatTensor] = None,

    calculate_pb: bool = False, logp_mean: bool = True,
    prev_timestep: Optional[torch.Tensor] = None,
) -> Union[DDIMSchedulerOutput, Tuple]:
    """
    Perform one DDIM denoising step (x_t -> x_{t-1}) and return the log-probability of the resulting
    `prev_sample` under the Gaussian transition predicted by the model. Optionally also return the
    log-probability of `sample` under the backward (noising) Gaussian given `prev_sample`.

    Args:
        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
        timestep (`torch.Tensor`): integer tensor of current discrete timesteps in the diffusion chain. It is
            used as a gather index into `self.alphas_cumprod` (presumably one entry per batch element --
            confirm against callers).
        sample (`torch.FloatTensor`):
            current instance of sample being created by diffusion process (x_t).
        eta (`float`): weight of noise for added noise in diffusion step. Defaults to 1.0 here.
        use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
            predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
            `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
            coincide with the one provided as input and `use_clipped_model_output` will have no effect.
        generator: random number generator used to draw the step noise when `prev_sample` is not given.
        prev_sample (`torch.FloatTensor`, *optional*): x_{t-1} (closer to clean image). If given, no noise is
            drawn and the log-probability of this tensor is evaluated instead. Must not be combined with
            `generator`.
        calculate_pb (`bool`): if `True`, additionally return `log_pb`, the Gaussian log-density of `sample`
            with mean `sqrt(alpha_t / alpha_{t-1}) * prev_sample` and std `sqrt(1 - alpha_t / alpha_{t-1})`.
        logp_mean (`bool`): if `True`, log-densities are averaged over all non-batch dimensions; otherwise
            they are summed.
        prev_timestep (`torch.Tensor`, *optional*): timestep of `prev_sample`. Defaults to
            `timestep - num_train_timesteps // num_inference_steps`. It is always clamped to
            `[0, num_train_timesteps - 1]`.

    Returns:
        `tuple`: `(prev_sample, log_prob)`, or `(prev_sample, log_prob, log_pb)` when `calculate_pb` is `True`.
        `prev_sample` is cast to `sample.dtype`; `log_prob` and `log_pb` have one entry per batch element.

    Raises:
        ValueError: if `set_timesteps` has not been run, if `prediction_type` is unsupported, or if both
            `generator` and `prev_sample` are passed.

    """
    assert isinstance(self, DDIMScheduler)
    if self.num_inference_steps is None:
        raise ValueError(
            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
        )

    # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
    # Ideally, read the DDIM paper for an in-depth understanding

    # Notation (<variable name> -> <name in paper>)
    # - pred_noise_t -> e_theta(x_t, t)
    # - pred_original_sample -> f_theta(x_t, t) or x_0
    # - std_dev_t -> sigma_t
    # - eta -> η
    # - pred_sample_direction -> "direction pointing to x_t"
    # - pred_prev_sample -> "x_{t-1}"

    # 1. get previous step value (=t-1)
    if prev_timestep is None:
        prev_timestep = (
            timestep - self.config.num_train_timesteps // self.num_inference_steps
        )
    # to prevent OOB on gather
    # (after this clamp prev_timestep is never negative, so the torch.where below always
    # selects the gathered value rather than final_alpha_cumprod)
    prev_timestep = torch.clamp(prev_timestep, 0, self.config.num_train_timesteps - 1)

    # 2. compute alphas, betas
    # self.alphas_cumprod  torch.Size([1000])
    alpha_prod_t = self.alphas_cumprod.gather(0, timestep.cpu())  # torch scalar
    alpha_prod_t_prev = torch.where(
        prev_timestep.cpu() >= 0,
        self.alphas_cumprod.gather(0, prev_timestep.cpu()),
        self.final_alpha_cumprod,
    )
    # expand the per-timestep scalars to the full sample shape and move them to the sample's device
    alpha_prod_t = _left_broadcast(alpha_prod_t, sample.shape).to(sample.device)
    alpha_prod_t_prev = _left_broadcast(alpha_prod_t_prev, sample.shape).to(
        sample.device
    )
    # alpha_prod_t = alpha_prod_t.to(sample.dtype)  # float32 -> bf16
    # alpha_prod_t_prev = alpha_prod_t_prev.to(sample.dtype)  # float32 -> bf16

    beta_prod_t = 1 - alpha_prod_t

    # 3. compute predicted original sample from predicted noise also called
    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
    if self.config.prediction_type == "epsilon":
        pred_original_sample = (
            sample - beta_prod_t ** (0.5) * model_output
        ) / alpha_prod_t ** (0.5)
        pred_epsilon = model_output
    elif self.config.prediction_type == "sample":
        pred_original_sample = model_output
        pred_epsilon = (
            sample - alpha_prod_t ** (0.5) * pred_original_sample
        ) / beta_prod_t ** (0.5)
    elif self.config.prediction_type == "v_prediction":
        pred_original_sample = (alpha_prod_t**0.5) * sample - (
            beta_prod_t**0.5
        ) * model_output
        pred_epsilon = (alpha_prod_t**0.5) * model_output + (
            beta_prod_t**0.5
        ) * sample
    else:
        raise ValueError(
            f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
            " `v_prediction`"
        )

    # 4. Clip or threshold "predicted x_0"
    # cifar ddpm: self.config.thresholding = False, self.config.clip_sample_range = 1.0
    # SD: self.config.thresholding = False, self.config.clip_sample = False
    if self.config.thresholding:
        pred_original_sample = self._threshold_sample(pred_original_sample)
    elif self.config.clip_sample:
        pred_original_sample = pred_original_sample.clamp(
            -self.config.clip_sample_range, self.config.clip_sample_range
        )

    # 5. compute variance: "sigma_t(η)" -> see formula (16)
    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
    variance = _get_variance(self, timestep, prev_timestep)
    std_dev_t = eta * variance ** (0.5)  # eta defaults to 1.0 in this function
    std_dev_t = _left_broadcast(std_dev_t, sample.shape).to(sample.device)

    if use_clipped_model_output: # not used?
        # the pred_epsilon is always re-derived from the clipped x_0 in Glide
        pred_epsilon = (
            sample - alpha_prod_t ** (0.5) * pred_original_sample
        ) / beta_prod_t ** (0.5)

    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (
        0.5
    ) * pred_epsilon

    # 7. compute the mean of x_{t-1}, i.e. formula (12) from https://arxiv.org/pdf/2010.02502.pdf
    # without the "random noise" term
    prev_sample_mean = (
        alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
    )

    if prev_sample is not None and generator is not None:
        raise ValueError(
            "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
            " `prev_sample` stays `None`."
        )

    # sample x_{t-1} only when the caller did not supply one
    if prev_sample is None:
        variance_noise = randn_tensor(
            model_output.shape,
            generator=generator,
            device=model_output.device,
            dtype=model_output.dtype,
        )
        prev_sample = prev_sample_mean + std_dev_t * variance_noise

    # log prob of prev_sample given prev_sample_mean and std_dev_t
    # (prev_sample is detached, so gradients reach log_prob only via prev_sample_mean / std_dev_t)
    log_prob = (
        -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std_dev_t**2))
        - torch.log(std_dev_t)
        - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
    )
    if logp_mean:
        # mean along all but batch dimension
        log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
    else:
        # sum along all but batch dimension
        log_prob = log_prob.sum(dim=tuple(range(1, log_prob.ndim)))

    if calculate_pb:
        # backward (noising) kernel: x_t ~ N(sqrt(alpha_ddim) * x_{t-1}, (1 - alpha_ddim) I)
        assert prev_sample is not None
        alpha_ddim = alpha_prod_t / alpha_prod_t_prev  # (bs, 4, 64, 64)
        pb_mean = alpha_ddim.sqrt() * prev_sample
        pb_std = (1 - alpha_ddim).sqrt()
        # both sample and pb_mean are detached in the squared-error term
        log_pb = (
                -((sample.detach() - pb_mean.detach()) ** 2) / (2 * (pb_std ** 2))
                - torch.log(pb_std)
                - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
        )
        if logp_mean:
            log_pb = log_pb.mean(dim=tuple(range(1, sample.ndim)))
        else:
            log_pb = log_pb.sum(dim=tuple(range(1, sample.ndim)))
        return prev_sample.type(sample.dtype), log_prob, log_pb

    else:
        return prev_sample.type(sample.dtype), log_prob
    # NOTE: prev_sample is cast back to sample.dtype above; the log-probabilities are not cast
    # (presumably float32 because self.alphas_cumprod is float32 -- confirm)
226 | 
227 | 
@torch.no_grad()
def pred_orig_latent(self: DDIMScheduler, model_output, sample: torch.FloatTensor, timestep: int):
    """Predict the clean latent x_0 from the model output at ``timestep``.

    Runs without gradient tracking. The cumulative alpha product for each
    timestep is looked up, broadcast to ``sample.shape`` and cast to the
    sample's device and dtype before the prediction is formed.

    Args:
        self: scheduler providing ``alphas_cumprod`` and ``config.prediction_type``.
        model_output: raw output of the diffusion model.
        sample: current noisy latent x_t.
        timestep: tensor of timestep indices used to gather from ``alphas_cumprod``.

    Returns:
        The predicted original sample, computed according to
        ``self.config.prediction_type``.

    Raises:
        ValueError: if ``prediction_type`` is not one of ``epsilon``,
            ``sample`` or ``v_prediction``.
    """
    # self.alphas_cumprod is a 1-D lookup table indexed on the CPU.
    gathered = self.alphas_cumprod.gather(0, timestep.cpu())
    # float32 -> sample dtype (e.g. bf16), on the sample's device
    alpha_bar = _left_broadcast(gathered, sample.shape).to(sample.device).to(sample.dtype)
    one_minus_alpha_bar = 1 - alpha_bar

    mode = self.config.prediction_type
    if mode == "epsilon":
        return (sample - one_minus_alpha_bar ** (0.5) * model_output) / alpha_bar ** (0.5)
    if mode == "sample":
        return model_output
    if mode == "v_prediction":
        return (alpha_bar**0.5) * sample - (one_minus_alpha_bar**0.5) * model_output
    raise ValueError(
        f"prediction_type given as {mode} must be one of `epsilon`, `sample`, or `v_prediction`"
    )
253 | 
254 | 
def compute_snr(noise_scheduler, timesteps):
    """Return the signal-to-noise ratio of the noise schedule at ``timesteps``.

    SNR is ``(sqrt(alpha_bar) / sqrt(1 - alpha_bar)) ** 2`` evaluated per
    timestep, following
    https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849

    Args:
        noise_scheduler: object exposing an ``alphas_cumprod`` tensor.
        timesteps: integer tensor of timestep indices.

    Returns:
        Float tensor with the same shape as ``timesteps``.
    """

    def _lookup(table):
        # Index the schedule on the timesteps' device, then expand to the
        # timesteps' shape (adapted from the Min-SNR reference implementation:
        # https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026).
        picked = table.to(device=timesteps.device)[timesteps].float()
        for _ in range(len(timesteps.shape) - len(picked.shape)):
            picked = picked[..., None]
        return picked.expand(timesteps.shape)

    cumprod = noise_scheduler.alphas_cumprod
    signal = _lookup(cumprod**0.5)
    noise = _lookup((1.0 - cumprod) ** 0.5)
    return (signal / noise) ** 2
279 | 
280 | 
281 | # given x_{t-1} "prev_sample", compute x_t "sample"
def step_backward(self: DDIMScheduler,
    timestep: int,
    prev_sample: torch.FloatTensor,
    generator=None,):
    """Draw x_t ("sample") from x_{t-1} ("prev_sample") with the noising kernel.

    The sample is drawn from a Gaussian with mean
    ``sqrt(alpha_t / alpha_prev) * prev_sample`` and standard deviation
    ``sqrt(1 - alpha_t / alpha_prev)``, where ``alpha_t`` / ``alpha_prev`` are
    the cumulative alpha products at ``timestep`` and at the previous timestep
    (``timestep - num_train_timesteps // num_inference_steps``, clamped to
    ``[0, num_train_timesteps - 1]``).

    Args:
        self: scheduler on which ``set_timesteps`` has been called.
        timestep: tensor of timestep indices for the sample to produce.
        prev_sample: the less noisy latent x_{t-1}.
        generator: optional random generator for the injected noise.

    Returns:
        The noisier latent x_t, with the same shape as ``prev_sample``.
    """
    n_train = self.config.num_train_timesteps
    stride = n_train // self.num_inference_steps
    # to prevent OOB on gather
    t_prev = torch.clamp(timestep - stride, 0, n_train - 1)
    t_prev_cpu = t_prev.cpu()

    # self.alphas_cumprod is a 1-D lookup table indexed on the CPU.
    cumprod_t = self.alphas_cumprod.gather(0, timestep.cpu())
    cumprod_prev = torch.where(
        t_prev_cpu >= 0,
        self.alphas_cumprod.gather(0, t_prev_cpu),
        self.final_alpha_cumprod,
    )

    out_shape = prev_sample.shape
    out_device = prev_sample.device
    cumprod_t = _left_broadcast(cumprod_t, out_shape).to(out_device)
    cumprod_prev = _left_broadcast(cumprod_prev, out_shape).to(out_device)

    ratio = cumprod_t / cumprod_prev  # (bs, 4, 64, 64)
    mean = ratio.sqrt() * prev_sample
    std = (1 - ratio).sqrt()

    noise = randn_tensor(
        out_shape,
        generator=generator,
        device=out_device,
        dtype=prev_sample.dtype,
    )
    return mean + std * noise


--------------------------------------------------------------------------------
/alignment/diffusers_patch/pipeline_with_logprob.py:
--------------------------------------------------------------------------------
  1 | # For licensing see accompanying LICENSE file.
  2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  3 | 
  4 | # Modified from https://github.com/huggingface/diffusers/blob/fc6acb6b97e93d58cb22b5fee52d884d77ce84d8/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
  5 | 
  6 | from typing import Any, Callable, Dict, List, Optional, Union
  7 | 
  8 | import torch
  9 | 
 10 | from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
 11 |     StableDiffusionPipeline,
 12 |     rescale_noise_cfg,
 13 | )
 14 | try:
 15 |     from diffusers.utils import randn_tensor
 16 | except ImportError:
 17 |     from diffusers.utils.torch_utils import randn_tensor
 18 | from .ddim_with_logprob import ddim_step_with_logprob
 19 | from ..utils import image_postprocess
 20 | 
 21 | @torch.no_grad()
 22 | def pipeline_with_logprob(
 23 |     self: StableDiffusionPipeline,
 24 |     prompt: Union[str, List[str]] = None,
 25 |     height: Optional[int] = None,
 26 |     width: Optional[int] = None,
 27 |     num_inference_steps: int = 50,
 28 |     guidance_scale: float = 5,
 29 |     negative_prompt: Optional[Union[str, List[str]]] = None,
 30 |     num_images_per_prompt: Optional[int] = 1,
 31 |     eta: float = 0.0,
 32 |     generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
 33 |     latents: Optional[torch.FloatTensor] = None,
 34 |     prompt_embeds: Optional[torch.FloatTensor] = None,
 35 |     negative_prompt_embeds: Optional[torch.FloatTensor] = None,
 36 |     output_type: Optional[str] = "pil",
 37 |     return_dict: bool = True,
 38 |     callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
 39 |     callback_steps: int = 1,
 40 |     cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 41 |     guidance_rescale: float = 0.0,
 42 | 
 43 |     batch_size = None, dtype=None,
 44 |     device = None,
 45 |     calculate_pb = False, logp_mean = True,
 46 |     return_unetoutput = False,
 47 | ):
 48 |     r"""
 49 |     Function invoked when calling the pipeline for generation.
 50 | 
 51 |     Args:
 52 |         prompt (`str` or `List[str]`, *optional*):
 53 |             The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
 54 |             instead.
 55 |         height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
 56 |             The height in pixels of the generated image.
 57 |         width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
 58 |             The width in pixels of the generated image.
 59 |         num_inference_steps (`int`, *optional*, defaults to 50):
 60 |             The number of denoising steps. More denoising steps usually lead to a higher quality image at the
 61 |             expense of slower inference.
 62 |         guidance_scale (`float`, *optional*, defaults to 7.5):
 63 |             Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
 64 |             `guidance_scale` is defined as `w` of equation 2. of [Imagen
 65 |             Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
 66 |             1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
 67 |             usually at the expense of lower image quality.
 68 |         negative_prompt (`str` or `List[str]`, *optional*):
 69 |             The prompt or prompts not to guide the image generation. If not defined, one has to pass
 70 |             `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
 71 |             less than `1`).
 72 |         num_images_per_prompt (`int`, *optional*, defaults to 1):
 73 |             The number of images to generate per prompt.
 74 |         eta (`float`, *optional*, defaults to 0.0):
 75 |             Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
 76 |             [`schedulers.DDIMScheduler`], will be ignored for others.
 77 |         generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
 78 |             One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
 79 |             to make generation deterministic.
 80 |         latents (`torch.FloatTensor`, *optional*):
 81 |             Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
 82 |             generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
 83 |             tensor will ge generated by sampling using the supplied random `generator`.
 84 |         prompt_embeds (`torch.FloatTensor`, *optional*):
 85 |             Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
 86 |             provided, text embeddings will be generated from `prompt` input argument.
 87 |         negative_prompt_embeds (`torch.FloatTensor`, *optional*):
 88 |             Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
 89 |             weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
 90 |             argument.
 91 |         output_type (`str`, *optional*, defaults to `"pil"`):
 92 |             The output format of the generate image. Choose between
 93 |             [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
 94 |         return_dict (`bool`, *optional*, defaults to `True`):
 95 |             Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
 96 |             plain tuple.
 97 |         callback (`Callable`, *optional*):
 98 |             A function that will be called every `callback_steps` steps during inference. The function will be
 99 |             called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
100 |         callback_steps (`int`, *optional*, defaults to 1):
101 |             The frequency at which the `callback` function will be called. If not specified, the callback will be
102 |             called at every step.
103 |         cross_attention_kwargs (`dict`, *optional*):
104 |             A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
105 |             `self.processor` in
106 |             [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
107 |         guidance_rescale (`float`, *optional*, defaults to 0.7):
108 |             Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
109 |             Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
110 |             [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
111 |             Guidance rescale factor should fix overexposure when using zero terminal SNR.
112 | 
113 |     Examples:
114 | 
115 |     Returns:
116 |         [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
117 |         [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
118 |         When returning a tuple, the first element is a list with the generated images, and the second element is a
119 |         list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
120 |         (nsfw) content, according to the `safety_checker`.
121 |     """
122 |     # 0. Default height and width to unet
123 |     if height is None:
124 |         height = height or self.unet.config.sample_size * self.vae_scale_factor
125 |     if width is None:
126 |         width = width or self.unet.config.sample_size * self.vae_scale_factor
127 | 
128 |     # 1. Check inputs. Raise error if not correct
129 |     if hasattr(self, "check_inputs"):  # DDPMPipeline does not have this method
130 |         self.check_inputs(
131 |             prompt,
132 |             height,
133 |             width,
134 |             callback_steps,
135 |             negative_prompt,
136 |             prompt_embeds,
137 |             negative_prompt_embeds,
138 |         )
139 | 
140 |     # 2. Define call parameters
141 |     if batch_size is None:
142 |         if prompt is not None and isinstance(prompt, str):
143 |             batch_size = 1
144 |         elif prompt is not None and isinstance(prompt, list):
145 |             batch_size = len(prompt)
146 |         else:
147 |             batch_size = prompt_embeds.shape[0]
148 | 
149 |     if device is None:
150 |         device = self._execution_device
151 |     # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
152 |     # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
153 |     # corresponds to doing no classifier free guidance.
154 |     do_classifier_free_guidance = guidance_scale > 1.0
155 | 
156 |     # 3. Encode input prompt
157 |     if prompt_embeds is not None:
158 |         text_encoder_lora_scale = (
159 |             cross_attention_kwargs.get("scale", None)
160 |             if cross_attention_kwargs is not None
161 |             else None
162 |         )
163 |         prompt_embeds = self._encode_prompt(
164 |             prompt,
165 |             device,
166 |             num_images_per_prompt,
167 |             do_classifier_free_guidance,
168 |             negative_prompt,
169 |             prompt_embeds=prompt_embeds,
170 |             negative_prompt_embeds=negative_prompt_embeds,
171 |             lora_scale=text_encoder_lora_scale,
172 |         )
173 | 
174 |     # 4. Prepare timesteps
175 |     if num_inference_steps is None:
176 |         timesteps = self.scheduler.timesteps
177 |         num_inference_steps = len(timesteps)
178 |     else:
179 |         self.scheduler.set_timesteps(num_inference_steps, device=device)
180 |         timesteps = self.scheduler.timesteps
181 | 
182 |     # 5. Prepare latent variables
183 |     num_channels_latents = self.unet.config.in_channels
184 |     if prompt_embeds is not None:
185 |         latents = self.prepare_latents(
186 |             batch_size * num_images_per_prompt,
187 |             num_channels_latents,
188 |             height,
189 |             width,
190 |             prompt_embeds.dtype,
191 |             device,
192 |             generator,
193 |             latents,
194 |         )
195 | 
196 |         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
197 |         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # {'eta': 1.0, 'generator': None}
198 | 
199 |     else:
200 |         shape = (batch_size, num_channels_latents, height, width)
201 |         latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
202 |         extra_step_kwargs = {'eta': eta, 'generator': generator}
203 | 
204 |     # 7. Denoising loop
205 |     # self.scheduler.order is 1, not sure what it does
206 |     num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
207 | 
208 |     all_latents = [latents]
209 |     all_log_probs = []
210 |     all_log_pbs = []
211 |     unet_outputs = []
212 |     with self.progress_bar(total=num_inference_steps) as progress_bar:
213 |         for i, t in enumerate(timesteps):
214 |             # expand the latents if we are doing classifier free guidance
215 |             latent_model_input = (
216 |                 torch.cat([latents] * 2) if do_classifier_free_guidance else latents
217 |             )
218 |             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
219 | 
220 |             # predict the noise residual
221 |             if prompt_embeds is not None:
222 |                 noise_pred = self.unet(
223 |                     latent_model_input,
224 |                     t,
225 |                     encoder_hidden_states=prompt_embeds,
226 |                     cross_attention_kwargs=cross_attention_kwargs,
227 |                     return_dict=False,
228 |                 )[0]
229 |             else:
230 |                 noise_pred = self.unet(
231 |                     latent_model_input, t, return_dict=False
232 |                 )[0]
233 | 
234 |             # perform guidance
235 |             if do_classifier_free_guidance:
236 |                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
237 |                 noise_pred = noise_pred_uncond + guidance_scale * (
238 |                     noise_pred_text - noise_pred_uncond
239 |                 )
240 |             if return_unetoutput:
241 |                 unet_outputs.append(noise_pred.detach())
242 | 
243 |             # by default not used (as guidance_rescale = 0.0)
244 |             if do_classifier_free_guidance and guidance_rescale > 0.0:
245 |                 # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
246 |                 noise_pred = rescale_noise_cfg(
247 |                     noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
248 |                 )
249 | 
250 |             # compute the previous noisy sample x_t -> x_t-1
251 |             prev_timestep = timesteps[i + 1] if i < num_inference_steps-1 else None
252 |             if calculate_pb:
253 |                 latents, log_prob, log_pb = ddim_step_with_logprob(
254 |                     self.scheduler, noise_pred, t, latents,
255 |                     calculate_pb=calculate_pb, logp_mean=logp_mean,
256 |                     prev_timestep=prev_timestep, #
257 |                     **extra_step_kwargs
258 |                 )
259 |                 all_log_pbs.append(log_pb)
260 |             else:
261 |                 latents, log_prob = ddim_step_with_logprob(
262 |                     self.scheduler, noise_pred, t, latents,
263 |                     prev_timestep=prev_timestep, #
264 |                     **extra_step_kwargs
265 |                 )
266 | 
267 |             all_latents.append(latents)
268 |             all_log_probs.append(log_prob)
269 | 
270 |             # call the callback, if provided
271 |             if i == len(timesteps) - 1 or (
272 |                 (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
273 |             ):
274 |                 progress_bar.update()
275 |                 if callback is not None and i % callback_steps == 0:
276 |                     callback(i, t, latents)
277 | 
278 |     if not output_type == "latent":
279 |         image = self.vae.decode(
280 |             latents / self.vae.config.scaling_factor, return_dict=False
281 |         )[0]
282 |         image, has_nsfw_concept = self.run_safety_checker(
283 |             image, device, prompt_embeds.dtype
284 |         )
285 |     else:
286 |         image = latents
287 |         has_nsfw_concept = None
288 | 
289 |     if has_nsfw_concept is None:
290 |         do_denormalize = [True] * image.shape[0]
291 |     else:
292 |         do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
293 | 
294 |     # At least for the cifar10 DDPM, the generated image is in [-1, 1],
295 |     # so we need this postprocessing to make it [0, 1]
296 |     if prompt_embeds is not None:
297 |         image = self.image_processor.postprocess(
298 |             image, output_type=output_type, do_denormalize=do_denormalize
299 |         )
300 |         # Offload last model to CPU
301 |         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
302 |             self.final_offload_hook.offload()
303 |     else:
304 |         # image = (image / 2 + 0.5).clamp(0, 1)
305 |         image = image_postprocess(image)
306 | 
307 |     assert not (calculate_pb and return_unetoutput), "Cannot return both log_pb and unet_outputs"
308 |     if calculate_pb:
309 |         return image, has_nsfw_concept, all_latents, all_log_probs, all_log_pbs
310 |     if return_unetoutput:
311 |         return image, has_nsfw_concept, all_latents, all_log_probs, unet_outputs
312 | 
313 |     return image, has_nsfw_concept, all_latents, all_log_probs
314 | 


--------------------------------------------------------------------------------
/alignment/flow.py:
--------------------------------------------------------------------------------
  1 | # For licensing see accompanying LICENSE file.
  2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  3 | 
  4 | # Adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/fe88a163f4661b4ddabba0751ff645e2e620746e/simple_inference.py
  5 | import torch
  6 | import torch.nn as nn
  7 | import numpy as np
  8 | from transformers import CLIPModel, CLIPProcessor
  9 | from PIL import Image
 10 | 
 11 | import sys
 12 | if sys.version_info < (3, 9):
 13 |     from importlib_resources import files
 14 | else:
 15 |     from importlib.resources import files
 16 | ASSETS_PATH = files("alignment.assets")
 17 | 
 18 | 
 19 | from dataclasses import dataclass
 20 | from typing import Any, Dict, List, Optional, Tuple, Union
 21 | from diffusers.models.embeddings import TimestepEmbedding, Timesteps, GaussianFourierProjection
 22 | from diffusers.models.unets.unet_2d_blocks import get_down_block, DownBlock2D, CrossAttnDownBlock2D
 23 | 
 24 | 
 25 | # https://github.com/huggingface/diffusers/blob/v0.17.1-patch/src/diffusers/models/unet_2d_condition.py
class ConditionalFlow(torch.nn.Module):
    """Scalar-output network assembled from the encoder (down-sampling) half of a diffusers conditional UNet.

    The module consists of a sinusoidal timestep projection followed by a timestep-embedding MLP, an input
    convolution, one diffusers down block per entry of `down_block_types` (every block is built with
    `add_downsample=True`, including the last one), a 4x4 average pool and a final linear layer with a single
    output unit.

    NOTE(review): `only_cross_attention`, `encoder_hid_dim` and `encoder_hid_dim_type` are accepted by
    `__init__` but never referenced in its body; `self.encoder_hid_proj` is always `None` and is not used by
    `forward`.
    """
    def __init__(self,
        # sample_size: Optional[int] = None,
        in_channels: int = 4,
        # center_input_sample: bool = False,
        flip_sin_to_cos: bool = True,
        freq_shift: int = 0,
        down_block_types: Tuple[str] = ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"),
        only_cross_attention: Union[bool, Tuple[bool]] = False,
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        layers_per_block: Union[int, Tuple[int]] = 2,
        downsample_padding: int = 1,
        act_fn: str = "silu",
        norm_num_groups: Optional[int] = 32,
        norm_eps: float = 1e-5,
        cross_attention_dim: Union[int, Tuple[int]] = 1280,
        encoder_hid_dim: Optional[int] = None,
        encoder_hid_dim_type: Optional[str] = None,
        attention_head_dim: Union[int, Tuple[int]] = 8,
        # dual_cross_attention: bool = False,
        # use_linear_projection: bool = False,
        # class_embed_type: Optional[str] = None,
        # addition_embed_type: Optional[str] = None,
        # num_class_embeds: Optional[int] = None,
        # upcast_attention: bool = False,
        # resnet_time_scale_shift: str = "default",
        # resnet_skip_time_act: bool = False,
        # resnet_out_scale_factor: int = 1.0,
        # time_embedding_type: str = "positional",
        # time_embedding_dim: Optional[int] = None,
        # time_embedding_act_fn: Optional[str] = None,
        timestep_post_act: Optional[str] = None,
        time_cond_proj_dim: Optional[int] = None,
        conv_in_kernel: int = 3,
        # conv_out_kernel: int = 3,
        # projection_class_embeddings_input_dim: Optional[int] = None,
        class_embeddings_concat: bool = False,
        # mid_block_only_cross_attention: Optional[bool] = None,
        # cross_attention_norm: Optional[str] = None,
        ):
        """Build the time embedding, input convolution, down blocks, pooling layer and output head.

        Scalar values given for `attention_head_dim`, `cross_attention_dim` and `layers_per_block` are
        broadcast to one value per entry of `down_block_types`. `block_out_channels[i]` is the output width
        of the i-th down block; `block_out_channels[0]` also sets the width of the input convolution and of
        the timestep projection. The commented-out parameters mirror options of the diffusers UNet this
        class was adapted from and are not supported here.
        """

        super().__init__()

        # Timestep embedding: sinusoidal projection of width block_out_channels[0], then an MLP to 4x that width.
        timestep_input_dim = block_out_channels[0]
        self.time_proj = Timesteps(block_out_channels[0],
               flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=freq_shift)
        time_embed_dim = block_out_channels[0] * 4
        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim,
                act_fn=act_fn, post_act_fn=timestep_post_act, cond_proj_dim=time_cond_proj_dim)

        # Padding of (k - 1) // 2 keeps the spatial size unchanged for odd kernel sizes.
        conv_in_padding = (conv_in_kernel - 1) // 2
        self.conv_in = nn.Conv2d(
            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
        )
        # Never assigned anything else in this class and not read by forward().
        self.encoder_hid_proj = None

        self.down_blocks = nn.ModuleList([])
        # only_cross_attention = [only_cross_attention] * len(down_block_types)

        # Broadcast scalar hyper-parameters to one value per down block.
        if isinstance(attention_head_dim, int):
            attention_head_dim = (attention_head_dim,) * len(down_block_types)

        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

        if isinstance(layers_per_block, int):
            layers_per_block = [layers_per_block] * len(down_block_types)

        if class_embeddings_concat:
            # The time embeddings are concatenated with the class embeddings. The dimension of the
            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
            # regular time embeddings
            # NOTE(review): no class embedding is created or concatenated anywhere in this class, so enabling
            # this flag only changes the width the blocks expect — confirm it is meant to be supported.
            blocks_time_embed_dim = time_embed_dim * 2
        else:
            blocks_time_embed_dim = time_embed_dim

        # Chain the blocks: each block consumes the previous block's output width.
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            # is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=blocks_time_embed_dim,
                # add_downsample=not is_final_block,
                add_downsample=True,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim[i],
                # attn_num_head_channels=attention_head_dim[i], # old diffusers version
                num_attention_heads=attention_head_dim[i],
                attention_head_dim=attention_head_dim[i], # can be annotated
                downsample_padding=downsample_padding,
                # dual_cross_attention=dual_cross_attention,
                # use_linear_projection=use_linear_projection,
                # only_cross_attention=only_cross_attention[i],
                # upcast_attention=upcast_attention,
                # resnet_time_scale_shift=resnet_time_scale_shift,
                # resnet_skip_time_act=resnet_skip_time_act,
                # resnet_out_scale_factor=resnet_out_scale_factor,
                # cross_attention_norm=cross_attention_norm,
            )
            self.down_blocks.append(down_block)

        self.pool = nn.AvgPool2d(4, stride=4) # (bs, 4, 64, 64) -> downsample 4 times -> (bs, ..., 4, 4)
        # The head takes exactly block_out_channels[-1] features, so after pooling and flattening each sample
        # must have that many elements (i.e. a 1x1 pooled map, as in the 64x64 case described above).
        self.fc = nn.Linear(block_out_channels[-1], 1)

    def forward(self, sample, timesteps, encoder_hidden_states,
                attention_mask: Optional[torch.Tensor] = None,
                cross_attention_kwargs: Optional[Dict[str, Any]] = None,
                encoder_attention_mask: Optional[torch.Tensor] = None,
                ):
        """Run the encoder stack and reduce each sample to a single value.

        Args:
            sample: input feature map fed to `conv_in` — assumes (bs, in_channels, H, W); TODO confirm.
            timesteps: tensor passed straight to the timestep projection — presumably one entry per
                sample (the broadcasting line below is commented out); verify against the caller.
            encoder_hidden_states: conditioning passed to the cross-attention down blocks.
            attention_mask, cross_attention_kwargs, encoder_attention_mask: forwarded unchanged to the
                cross-attention down blocks.

        Returns:
            The output of the linear head after `.squeeze()`. Because `squeeze()` is called without a
            dimension it removes every size-1 axis, so a batch of one yields a 0-dim tensor rather than
            shape (1,).
        """
        # bs = sample.shape[0]
        # Cast the timestep features to the parameter dtype of the down blocks before the embedding MLP.
        dtype = next(self.down_blocks.parameters()).dtype
        # device = next(self.down_blocks.parameters()).device

        # timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)
        t_emb = self.time_proj(timesteps)
        t_emb = t_emb.to(dtype=dtype)
        emb = self.time_embedding(t_emb)

        sample = self.conv_in(sample)  # (bs, 320, 64, 64)
        # down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            # Only cross-attention blocks accept the conditioning / mask arguments.
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                    encoder_attention_mask=encoder_attention_mask,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
            # down_block_res_samples += res_samples
            # The residual (skip) outputs are discarded: there is no decoder half to consume them.

        sample = self.pool(sample)
        sample = sample.view(sample.size(0), -1)  # flatten everything except the batch axis
        sample = self.fc(sample).squeeze()
        return sample
172 | 


--------------------------------------------------------------------------------
/alignment/model_configs/ViT-H-14.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "embed_dim": 1024,
 3 |     "vision_cfg": {
 4 |         "image_size": 224,
 5 |         "layers": 32,
 6 |         "width": 1280,
 7 |         "head_width": 80,
 8 |         "patch_size": 14
 9 |     },
10 |     "text_cfg": {
11 |         "context_length": 77,
12 |         "vocab_size": 49408,
13 |         "width": 1024,
14 |         "heads": 16,
15 |         "layers": 24
16 |     }
17 | }


--------------------------------------------------------------------------------
/alignment/prompts.py:
--------------------------------------------------------------------------------
  1 | # For licensing see accompanying LICENSE file.
  2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  3 | 
  4 | from importlib import resources
  5 | import os
  6 | import functools
  7 | import random
  8 | import inflect
  9 | 
 10 | IE = inflect.engine()
 11 | 
 12 | import sys
 13 | if sys.version_info < (3, 9):
 14 |     from importlib_resources import files
 15 | else:
 16 |     from importlib.resources import files
 17 | ASSETS_PATH = files("alignment.assets")
 18 | 
 19 | 
 20 | @functools.lru_cache() # will remember previous 128 calls
 21 | def _load_lines(path):
 22 |     """
 23 |     Load lines from a file. First tries to load from `path` directly, and if that doesn't exist, searches the
 24 |     `alignment/assets` directory for a file named `path`.
 25 |     """
 26 |     if not os.path.exists(path):
 27 |         newpath = ASSETS_PATH.joinpath(path)
 28 |     if not os.path.exists(newpath):
 29 |         raise FileNotFoundError(f"Could not find {path} or alignment.assets/{path}")
 30 |     path = newpath
 31 |     with open(path, "r") as f:
 32 |         return [line.strip() for line in f.readlines()]
 33 | 
 34 | def from_file(path, low=None, high=None):
 35 |     prompts = _load_lines(path)[low:high]
 36 |     return random.choice(prompts), {}
 37 | 
 38 | 
# Abbreviation for each prompt function defined in this module (key = function name).
# NOTE(review): presumably used to build compact run/experiment names — confirm against the caller.
short_names = {
    # ImageNet / simple word-list prompts
    "imagenet_all": "inall",
    "imagenet_animals": "inanm",
    "imagenet_dogs": "indog",
    "simple_animals": "simanm",
    "drawbench": "drawb",

    # HPDv2 benchmark prompts (all styles, or "photo" combined with one other style)
    "hpd": "hpd",
    "hpd_photo": "hppho",
    "hpd_photo_painting": "hpphopa",
    "hpd_photo_anime": "hpphoan",
    "hpd_photo_concept": "hpphoct",

    # Synthetic templated prompts
    "nouns_activities": "nounact",
    "counting": "count",
}
 55 | 
 56 | def imagenet_all():
 57 |     return from_file("imagenet_classes.txt")
 58 | 
 59 | 
 60 | def imagenet_animals():
 61 |     return from_file("imagenet_classes.txt", 0, 398)
 62 | 
 63 | 
 64 | def imagenet_dogs():
 65 |     return from_file("imagenet_classes.txt", 151, 269)
 66 | 
 67 | 
 68 | def simple_animals():
 69 |     return from_file("simple_animals.txt")
 70 | 
 71 | 
 72 | import csv
 73 | import collections
 74 | @functools.lru_cache()
 75 | def read_csv(path):
 76 |     # reader = csv.DictReader(open(path))
 77 |     with open (path, 'r') as f:
 78 |         reader = csv.DictReader(f)
 79 |         reader = [row for row in reader]
 80 | 
 81 |     info = collections.defaultdict(list)
 82 |     for row in reader:
 83 |         info[row["Category"]].append(row["Prompts"])
 84 |     """
 85 |     [(k, len(v)) for k, v in info.items()]
 86 |     [('Colors', 25), ('Conflicting', 10), ('Counting', 19), ('DALL-E', 20), ('Descriptions', 20), ('Gary Marcus et al. ', 10),
 87 |      ('Misspellings', 10), ('Positional', 20), ('Rare Words', 7), ('Reddit', 38), ('Text', 21)]
 88 |     """
 89 | 
 90 |     filtered_info = {}
 91 |     for k, v in info.items():
 92 |         if k in ["Misspellings", "Rare Words"]: # filter out, rest 183
 93 |             continue
 94 |         filtered_info[k] = v[2:] # saved for test
 95 |     drawbench_prompt_ls = sum(filtered_info.values(), [])
 96 |     return drawbench_prompt_ls  # len=165
 97 | 
 98 | def drawbench():
 99 |     drawbench_prompt_ls = read_csv(ASSETS_PATH.joinpath("DrawBench Prompts.csv"))
100 |     return random.choice(drawbench_prompt_ls), {}
101 | 
102 | 
103 | import json
@functools.lru_cache()
def _read_hpd_cached(style=None):
    """Read the HPDv2 training prompts for `style` once and cache them as an immutable tuple."""
    if style is None:
        # 800 prompts for each of the 4 styles
        styles = ["anime", "concept-art", "paintings", "photo"]
    else:
        styles = [style]
    prompts = []
    for style_name in styles:
        with open(ASSETS_PATH.joinpath(f"HPDv2/benchmark_{style_name}.json"), "r") as f:
            # each file is a JSON list of strings
            prompts.extend(json.load(f)[10:])  # 790 for train, 10 for test
    return tuple(prompts)


def read_hpd(style=None):
    """Return the HPDv2 training prompts for one style, or for all four styles.

    Args:
        style: one of "anime", "concept-art", "paintings", "photo"; `None` concatenates all four
            in that order.

    Returns:
        A new list of prompt strings on every call (the first 10 entries of each benchmark file
        are skipped and kept for testing).

    The file contents are cached, but callers receive a fresh copy: several callers in this module
    `.extend()` the returned list, which previously mutated the cached object so that it grew (and
    leaked prompts of other styles) on every call.
    """
    return list(_read_hpd_cached(style))
119 | 
def hpd():
    """Sample a random HPDv2 training prompt across all styles; metadata is always empty."""
    candidates = read_hpd()
    chosen = random.choice(candidates)
    return chosen, {}
123 | 
def hpd_photo():
    """Sample a random HPDv2 training prompt of the "photo" style; metadata is always empty."""
    candidates = read_hpd("photo")
    chosen = random.choice(candidates)
    return chosen, {}
127 | 
def hpd_photo_painting():
    """Sample a random HPDv2 training prompt from the "photo" and "paintings" styles combined.

    Returns a `(prompt, metadata)` pair; the metadata dict is always empty.
    """
    # Build a new list instead of extending in place: read_hpd() is memoised, so `.extend()` on
    # its return value would append the paintings prompts to the cached "photo" list on every call.
    prompts_ls = read_hpd("photo") + read_hpd("paintings")  # not "painting"
    return random.choice(prompts_ls), {}
132 | 
def hpd_photo_anime():
    """Sample a random HPDv2 training prompt from the "photo" and "anime" styles combined.

    Returns a `(prompt, metadata)` pair; the metadata dict is always empty.
    """
    # Build a new list instead of extending in place: read_hpd() is memoised, so `.extend()` on
    # its return value would append the anime prompts to the cached "photo" list on every call.
    prompts_ls = read_hpd("photo") + read_hpd("anime")
    return random.choice(prompts_ls), {}
137 | 
def hpd_photo_concept():
    """Sample a random HPDv2 training prompt from the "photo" and "concept-art" styles combined.

    Returns a `(prompt, metadata)` pair; the metadata dict is always empty.
    """
    # Build a new list instead of extending in place: read_hpd() is memoised, so `.extend()` on
    # its return value would append the concept-art prompts to the cached "photo" list on every call.
    prompts_ls = read_hpd("photo") + read_hpd("concept-art")
    return random.choice(prompts_ls), {}
142 | 
def nouns_activities(nouns_file, activities_file):
    """Compose a "<a/an noun> <activity>" prompt from two word-list files.

    One random line is drawn from each file; the noun is prefixed with its indefinite article
    via `inflect`. Returns a `(prompt, metadata)` pair with empty metadata.
    """
    noun_pool = _load_lines(nouns_file)
    activity_pool = _load_lines(activities_file)
    # The noun is drawn before the activity, so seeded runs consume randomness in a fixed order.
    subject = IE.a(random.choice(noun_pool))
    action = random.choice(activity_pool)
    return f"{subject} {action}", {}
147 | 
148 | 
def counting(nouns_file, low, high):
    """Compose a "<number word> <plural noun>" prompt together with question/answer metadata.

    A count is drawn uniformly from `[low, high]` (inclusive) and spelled out in words, and a noun
    is drawn from `nouns_file` and pluralised via `inflect`. The metadata carries two questions and
    their expected answers (the number word and the singular noun).
    """
    noun_pool = _load_lines(nouns_file)
    # The count is drawn before the noun, so seeded runs consume randomness in a fixed order.
    count_word = IE.number_to_words(random.randint(low, high))
    singular = random.choice(noun_pool)
    plural = IE.plural(singular)

    questions = [
        f"How many {plural} are there in this image?",
        f"What animal is in this image?",
    ]
    answers = [count_word, singular]
    return f"{count_word} {plural}", {"questions": questions, "answers": answers}


--------------------------------------------------------------------------------
/alignment/rewards.py:
--------------------------------------------------------------------------------
  1 | # For licensing see accompanying LICENSE file.
  2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  3 | 
  4 | import os
  5 | from PIL import Image
  6 | import io
  7 | import numpy as np
  8 | import time
  9 | import requests
 10 | 
 11 | import torch
 12 | import torch.distributed as dist
 13 | 
 14 | from scripts.distributed import get_local_rank
 15 | 
 16 | 
# Abbreviation for each reward factory defined in this module (key = function name).
# NOTE(review): presumably used to build compact run/experiment names — confirm against the caller.
short_names = {
    "jpeg_incompressibility": "incomp",
    "jpeg_compressibility": "comp",
    "aesthetic_score": "aes",
    "imagereward": "imgr",
    "llava_strict_satisfaction": "llava_strict",
    "llava_bertscore": "llava",
}
# Flags, per reward name, whether the reward reads the text prompt: the JPEG and aesthetic
# rewards below ignore their `prompts` argument, `imagereward` tokenizes it.
# NOTE(review): the two llava_* rewards have no entry here, so a lookup for them raises KeyError —
# confirm whether that is intentional.
use_prompt = {
    "jpeg_incompressibility": False,
    "jpeg_compressibility": False,
    "aesthetic_score": False,
    "imagereward": True,
}
 31 | 
 32 | def jpeg_incompressibility(dtype=torch.float32, device="cuda"):
 33 |     def _fn(images, prompts, metadata):
 34 |         if isinstance(images, torch.Tensor):
 35 |             images = (images * 255).round().clamp(0, 255).to(torch.uint8).cpu().numpy()
 36 |             images = images.transpose(0, 2, 3, 1)  # NCHW -> NHWC
 37 |         images = [Image.fromarray(image) for image in images]
 38 |         buffers = [io.BytesIO() for _ in images]
 39 |         for image, buffer in zip(images, buffers):
 40 |             image.save(buffer, format="JPEG", quality=95)
 41 |         sizes = [buffer.tell() / 1000 for buffer in buffers]
 42 |         sizes = np.array(sizes)
 43 |         return torch.from_numpy(sizes).cuda(), {}
 44 | 
 45 |     return _fn
 46 | 
 47 | 
 48 | def jpeg_compressibility(dtype=torch.float32, device="cuda"):
 49 |     jpeg_fn = jpeg_incompressibility(dtype, device)
 50 | 
 51 |     def _fn(images, prompts, metadata):
 52 |         rew, meta = jpeg_fn(images, prompts, metadata)
 53 |         return -rew, meta
 54 | 
 55 |     return _fn
 56 | 
 57 | 
 58 | def aesthetic_score(dtype=torch.float32, device="cuda", distributed=True):
 59 |     from alignment.aesthetic_scorer import AestheticScorer
 60 |     # why cuda() doesn't cause a bug?
 61 |     scorer = AestheticScorer(dtype=torch.float32, distributed=distributed).cuda() # ignore type;
 62 | 
 63 |     # @torch.no_grad() # original AestheticScorer already has no_grad()
 64 |     def _fn(images, prompts, metadata):
 65 |         if isinstance(images, torch.Tensor):
 66 |             images = (images * 255).round().clamp(0, 255).to(torch.uint8)
 67 |         else:
 68 |             images = images.transpose(0, 3, 1, 2)  # NHWC -> NCHW
 69 |             images = torch.tensor(images, dtype=torch.uint8)
 70 |         scores = scorer(images)
 71 |         return scores, {}
 72 | 
 73 |     return _fn
 74 | 
 75 | 
 76 | # For ImageReward
 77 | import ImageReward as RM
 78 | from PIL import Image
 79 | from torchvision.transforms import Compose, Resize, CenterCrop, Normalize
 80 | try:
 81 |     from torchvision.transforms import InterpolationMode
 82 |     BICUBIC = InterpolationMode.BICUBIC
 83 | except ImportError:
 84 |     BICUBIC = Image.BICUBIC
 85 | 
 86 | def imagereward(dtype=torch.float32, device="cuda"):
 87 |     # aesthetic = RM.load_score("Aesthetic", device=device)
 88 |     if get_local_rank() == 0:  # only download once
 89 |         reward_model = RM.load("ImageReward-v1.0")
 90 |     dist.barrier()
 91 |     reward_model = RM.load("ImageReward-v1.0")
 92 |     reward_model.to(dtype).to(device)
 93 | 
 94 |     rm_preprocess = Compose([
 95 |             Resize(224, interpolation=BICUBIC),
 96 |             CenterCrop(224),
 97 |             Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
 98 |         ])
 99 | 
100 |     def _fn(images, prompts, metadata):
101 |         dic = reward_model.blip.tokenizer(prompts,
102 |                 padding='max_length', truncation=True,  return_tensors="pt",
103 |                 max_length=reward_model.blip.tokenizer.model_max_length) # max_length=512
104 |         device = images.device
105 |         input_ids, attention_mask = dic.input_ids.to(device), dic.attention_mask.to(device)
106 |         reward = reward_model.score_gard(input_ids, attention_mask, rm_preprocess(images))
107 |         return reward.reshape(images.shape[0]).float(), {} # bf16 -> f32
108 | 
109 |     return _fn
110 | 
111 | 
def llava_strict_satisfaction(dtype=torch.float32, device="cuda"):
    """Submits images to LLaVA and computes a reward by matching the responses to ground truth answers directly without
    using BERTScore. Prompt metadata must have "questions" and "answers" keys. See
    https://github.com/kvablack/LLaVA-server for server-side code.

    `dtype` and `device` are accepted but not used. The returned `_fn(images, prompts, metadata)` ignores
    `prompts` and returns `(scores, info)` as NumPy arrays (not torch tensors): `scores` holds, per image,
    the fraction of its questions whose expected answer occurs in the server's response, and
    `info["answers"]` holds the raw responses.
    """
    import requests
    from requests.adapters import HTTPAdapter, Retry
    from io import BytesIO
    import pickle

    batch_size = 4
    url = "http://127.0.0.1:8085"  # the server is expected to run on the local machine
    sess = requests.Session()
    # Retry up to 1000 times on HTTP 500; per urllib3, a false `allowed_methods` retries on any
    # HTTP verb, which is needed here because the request is a POST.
    retries = Retry(
        total=1000, backoff_factor=1, status_forcelist=[500], allowed_methods=False
    )
    sess.mount("http://", HTTPAdapter(max_retries=retries))

    def _fn(images, prompts, metadata):
        del prompts
        if isinstance(images, torch.Tensor):
            images = (images * 255).round().clamp(0, 255).to(torch.uint8).cpu().numpy()
            images = images.transpose(0, 2, 3, 1)  # NCHW -> NHWC

        # Split into ceil(n / batch_size) near-equal chunks, so no chunk exceeds batch_size.
        # NOTE(review): an empty `images` gives 0 sections, which np.array_split rejects — confirm
        # callers never pass an empty batch.
        images_batched = np.array_split(images, np.ceil(len(images) / batch_size))
        metadata_batched = np.array_split(metadata, np.ceil(len(metadata) / batch_size))

        all_scores = []
        all_info = {
            "answers": [],
        }
        for image_batch, metadata_batch in zip(images_batched, metadata_batched):
            jpeg_images = []

            # Compress the images using JPEG
            for image in image_batch:
                img = Image.fromarray(image)
                buffer = BytesIO()
                img.save(buffer, format="JPEG", quality=80)
                jpeg_images.append(buffer.getvalue())

            # format for LLaVA server
            data = {
                "images": jpeg_images,
                "queries": [m["questions"] for m in metadata_batch],
            }
            data_bytes = pickle.dumps(data)

            # send a request to the llava server
            response = sess.post(url, data=data_bytes, timeout=120)

            # NOTE(review): pickle.loads on a network response can execute arbitrary code; this is only
            # acceptable because the endpoint is the hard-coded localhost server — do not point `url`
            # at an untrusted host.
            response_data = pickle.loads(response.content)

            # correct[i][j] is True when the j-th expected answer of image i is contained in the
            # j-th response for that image (`in` containment test).
            correct = np.array(
                [
                    [ans in resp for ans, resp in zip(m["answers"], responses)]
                    for m, responses in zip(metadata_batch, response_data["outputs"])
                ]
            )
            # Per-image fraction of questions answered correctly.
            scores = correct.mean(axis=-1)

            all_scores += scores.tolist()
            all_info["answers"] += response_data["outputs"]

        return np.array(all_scores), {k: np.array(v) for k, v in all_info.items()}

    return _fn
179 | 
180 | 
def llava_bertscore(dtype=torch.float32, device="cuda"):
    """Submits images to LLaVA and computes a reward by comparing the responses to the prompts using BERTScore. See
    https://github.com/kvablack/LLaVA-server for server-side code.

    `dtype` and `device` are accepted but not used. The returned `_fn(images, prompts, metadata)` ignores
    `metadata` and returns `(scores, info)` as NumPy arrays (not torch tensors): `scores` is the "recall"
    field of the server response, and `info` carries its "precision", "f1" and "outputs" fields.
    """
    import requests
    from requests.adapters import HTTPAdapter, Retry
    from io import BytesIO
    import pickle

    batch_size = 16
    url = "http://127.0.0.1:8085"  # the server is expected to run on the local machine
    sess = requests.Session()
    # Retry up to 1000 times on HTTP 500; per urllib3, a false `allowed_methods` retries on any
    # HTTP verb, which is needed here because the request is a POST.
    retries = Retry(
        total=1000, backoff_factor=1, status_forcelist=[500], allowed_methods=False
    )
    sess.mount("http://", HTTPAdapter(max_retries=retries))

    def _fn(images, prompts, metadata):
        del metadata
        if isinstance(images, torch.Tensor):
            images = (images * 255).round().clamp(0, 255).to(torch.uint8).cpu().numpy()
            images = images.transpose(0, 2, 3, 1)  # NCHW -> NHWC

        # Split into ceil(n / batch_size) near-equal chunks, so no chunk exceeds batch_size.
        # NOTE(review): an empty `images` gives 0 sections, which np.array_split rejects — confirm
        # callers never pass an empty batch.
        images_batched = np.array_split(images, np.ceil(len(images) / batch_size))
        prompts_batched = np.array_split(prompts, np.ceil(len(prompts) / batch_size))

        all_scores = []
        all_info = {
            "precision": [],
            "f1": [],
            "outputs": [],
        }
        for image_batch, prompt_batch in zip(images_batched, prompts_batched):
            jpeg_images = []

            # Compress the images using JPEG
            for image in image_batch:
                img = Image.fromarray(image)
                buffer = BytesIO()
                img.save(buffer, format="JPEG", quality=80)
                jpeg_images.append(buffer.getvalue())

            # format for LLaVA server
            # Every image gets the same single query; the reference answer embeds the image's prompt.
            data = {
                "images": jpeg_images,
                "queries": [["Answer concisely: what is going on in this image?"]]
                * len(image_batch),
                "answers": [
                    [f"The image contains {prompt}"] for prompt in prompt_batch
                ],
            }
            data_bytes = pickle.dumps(data)

            # send a request to the llava server
            response = sess.post(url, data=data_bytes, timeout=120)

            # NOTE(review): pickle.loads on a network response can execute arbitrary code; this is only
            # acceptable because the endpoint is the hard-coded localhost server — do not point `url`
            # at an untrusted host.
            response_data = pickle.loads(response.content)

            # use the recall score as the reward
            # NOTE(review): squeeze() on a batch holding a single image yields a 0-dim array, and
            # `list += scalar` then fails — confirm batches always hold more than one image.
            scores = np.array(response_data["recall"]).squeeze()
            all_scores += scores.tolist()

            # save the precision and f1 scores for analysis
            all_info["precision"] += (
                np.array(response_data["precision"]).squeeze().tolist()
            )
            all_info["f1"] += np.array(response_data["f1"]).squeeze().tolist()
            all_info["outputs"] += np.array(response_data["outputs"]).squeeze().tolist()

        return np.array(all_scores), {k: np.array(v) for k, v in all_info.items()}

    return _fn
253 | 


--------------------------------------------------------------------------------
/alignment/utils.py:
--------------------------------------------------------------------------------
 1 | # For licensing see accompanying LICENSE file.
 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
 3 | 
 4 | import sys
 5 | import time
 6 | 
 7 | import numpy as np
 8 | import torch
 9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | 
def image_postprocess(x):
    """Map values from [-1, 1] to [0, 1] and clip whatever falls outside that range."""
    rescaled = (x + 1) / 2  # x / 2 + 0.5
    return torch.clamp(rescaled, min=0, max=1)
15 | 
def soft_update(target, source, tau):
    """Move ``target``'s parameters a fraction ``tau`` towards ``source``'s, in place.

    Each target parameter becomes ``(1 - tau) * target + tau * source``.
    Parameters are paired positionally with ``zip``, so any surplus
    parameters on either side are ignored.

    Args:
        target: object exposing ``parameters()``; its parameters are overwritten.
        source: object exposing ``parameters()``; read only.
        tau: interpolation weight given to ``source``.
    """
    for dst, src in zip(target.parameters(), source.parameters()):
        # Work on ``.data`` so the update is not recorded by autograd.
        mixed = (1.0 - tau) * dst.data + tau * src.data
        dst.data.copy_(mixed)
19 | 
def hard_update(target, source):
    """Overwrite ``target``'s parameters with ``source``'s values, in place.

    Parameters are paired positionally with ``zip``, so any surplus
    parameters on either side are ignored.

    Args:
        target: object exposing ``parameters()``; its parameters are overwritten.
        source: object exposing ``parameters()``; read only.
    """
    pairs = zip(target.parameters(), source.parameters())
    for dst, src in pairs:
        # Copy through ``.data`` so autograd does not track the assignment.
        dst.data.copy_(src.data)
23 | 
24 | 


--------------------------------------------------------------------------------
/config/sd.yaml:
--------------------------------------------------------------------------------
 1 | parameters:
 2 |   save_freq: 10
 3 |   num_checkpoint_limit: 5
 4 |   mixed_precision: "bf16"
 5 |   allow_tf32: True
 6 |   # whether or not to use LoRA. LoRA reduces memory usage significantly by injecting small weight matrices into the
 7 |   # attention layers of the UNet. with LoRA, fp16, and a batch size of 1, finetuning Stable Diffusion should take
 8 |   # about 10GB of GPU memory. beware that if LoRA is disabled, training will take a lot of memory and saved checkpoint
 9 |   # files will also be large.
10 |   use_lora: True
11 | 
12 |   pretrained:
13 |     model: "runwayml/stable-diffusion-v1-5"
14 | #    model: "CompVis/stable-diffusion-v1-4" # similar to v1.5
15 |     revision: "main"
16 | 
17 |   sample:
18 |     num_steps: 50
19 |     # eta parameter for the DDIM sampler. this controls the amount of noise injected into the sampling process, with 0.0
20 |     # being fully deterministic and 1.0 being equivalent to the DDPM sampler.
21 |     eta: 1.0
22 |     guidance_scale: 5.0
23 |     # batch size (per GPU!) to use for sampling.
24 |     # number of batches to sample per epoch. the total number of samples per epoch is `num_batches_per_epoch *
25 |     # batch_size * num_gpus`.
26 |     batch_size: 16
27 |     num_batches_per_epoch: 4
28 | 
29 |   train:
30 |     # whether to use the 8bit Adam optimizer from bitsandbytes.
31 |     use_8bit_adam: False
32 |     learning_rate: 3.0e-4
33 |     adam_beta1: 0.9
34 |     adam_beta2: 0.999
35 |     adam_weight_decay: 1.0e-4
36 |     adam_epsilon: 1.e-8
37 |     max_grad_norm: 1.0
38 |     # number of inner epochs per outer epoch. each inner epoch is one iteration through the data collected during one
39 |     # outer epoch's round of sampling.
40 |     num_inner_epochs: 1
41 |     # whether or not to use classifier-free guidance during training. if enabled, the same guidance scale used during
42 |     # sampling will be used during training.
43 |     cfg: True
44 |     # clip advantages to the range [-adv_clip_max, adv_clip_max].
45 |     adv_clip_max: 5
46 |     # the PPO clip range.
47 |     clip_range: 1.e-4
48 |     # the fraction of timesteps to train on. if set to less than 1.0, the model will be trained on a subset of the
49 |     # timesteps for each sample. this will speed up training but reduce the accuracy of policy gradient estimates.
50 |     timestep_fraction: 1.0  # does not affect GPU memory occupation
51 |     lora_rank: 4
52 | 
53 |     batch_size: 8 # 8 (fp16) or 4 (fp32) -> 60000 MB GPU memory
54 |     gradient_accumulation_steps: 4
55 | 
56 |     ########### for GFN
57 |     reward_exp: 1.0e+2
58 |     flow_learning_rate: 3.0e-4
59 |     anneal: linear
60 |     unetreg: 1.0e+0
61 |     # whether to use GFN-DB with REINFORCE gradient
62 |     klpf: -1.
63 | 
64 |   seed: 0
65 |   num_epochs: 100
66 |   wandb: False
67 | 
68 | #  prompt_fn: "simple_animals" # for aesthetic_score
69 | #  prompt_fn: "imagenet_all" # for compression
70 |   prompt_fn: "drawbench"   # for imagereward
71 | #  prompt_fn: "hpd" # for HPSv2
72 | #  prompt_fn: "hpd_photo" # for HPSv2
73 | #  prompt_fn: "hpd_photo_painting" # for HPSv2
74 | #  prompt_fn: "hpd_photo_anime" # for HPSv2
75 | #  prompt_fn: "hpd_photo_concept" # for HPSv2
76 | 
77 | #  reward_fn: "aesthetic_score"
78 | #  reward_fn: "jpeg_compressibility"
79 | #  reward_fn: "jpeg_incompressibility"
80 |   reward_fn: "imagereward"
81 |   prompt_fn_kwargs: { }


--------------------------------------------------------------------------------
/scripts/distributed.py:
--------------------------------------------------------------------------------
  1 | # For licensing see accompanying LICENSE file.
  2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  3 | 
  4 | import datetime
  5 | import os
  6 | import logging
  7 | import torch
  8 | import torch.distributed as dist
  9 | import irisctl.api as irisctl
 10 | 
 11 | 
 12 | def setup_for_distributed(is_master):
 13 |     """
 14 |     This function disables printing when not in master process
 15 |     """
 16 |     import builtins as __builtin__
 17 |     builtin_print = __builtin__.print
 18 | 
 19 |     def print(*args, **kwargs):
 20 |         force = kwargs.pop('force', False)
 21 |         if is_master or force:
 22 |             builtin_print(*args, **kwargs)
 23 | 
 24 |     __builtin__.print = print
 25 | 
 26 | 
 27 | def init_distributed_multinode(timeout=0):
 28 |     master_host = ""
 29 |     world_size = 0
 30 |     for tasklet in irisctl.distributed_tasklets():
 31 |         if tasklet.role_rank == 0:
 32 |             master_host = f"{tasklet.host_ip_address}:{tasklet.distributed_port}"
 33 |         world_size += 1
 34 |     print(
 35 |         f"Init PyTorch DDP with master host {master_host}, "
 36 |         f"world size {world_size}, rank {irisctl.role_rank()}"
 37 |     )
 38 |     if timeout == 0:
 39 |         timeout = dist.default_pg_timeout
 40 |     else:
 41 |         timeout = datetime.timedelta(seconds=timeout)
 42 | 
 43 |     logging.info(f'Default timeout: {timeout}')
 44 |     if world_size >= 1:
 45 |         torch.distributed.init_process_group(
 46 |             backend="nccl",
 47 |             init_method="tcp://" + master_host,
 48 |             world_size=world_size,
 49 |             timeout=timeout,
 50 |             rank=irisctl.role_rank(),
 51 |         )
 52 | 
 53 |     logging.info("Starting {} workers with rank {}".format(world_size, irisctl.role_rank()))
 54 |     # Pick a GPU based on the local rank
 55 |     torch.cuda.set_device(irisctl.local_rank())
 56 | 
 57 |     dist.barrier()
 58 |     setup_for_distributed(irisctl.local_rank() == 0)
 59 |     return irisctl.local_rank(), irisctl.role_rank(), world_size
 60 | 
 61 | 
 62 | def init_distributed_singlenode(timeout=0):
 63 |     # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
 64 |     dist_url = "env://"  # default
 65 | 
 66 |     # only works with torch.distributed.launch // torch.run
 67 |     rank = int(os.environ["RANK"])
 68 |     world_size = int(os.environ['WORLD_SIZE'])
 69 |     local_rank = int(os.environ['LOCAL_RANK'])
 70 | 
 71 |     if timeout == 0:
 72 |         timeout = dist.default_pg_timeout
 73 |     else:
 74 |         timeout = datetime.timedelta(seconds=timeout)
 75 | 
 76 |     logging.info(f'Default timeout: {timeout}')
 77 |     dist.init_process_group(
 78 |         backend="nccl",
 79 |         init_method=dist_url,
 80 |         world_size=world_size,
 81 |         timeout=timeout,
 82 |         rank=rank)
 83 | 
 84 |     # this will make all .cuda() calls work properly
 85 |     torch.cuda.set_device(local_rank)
 86 |     # synchronizes all the threads to reach this point before moving on
 87 |     dist.barrier()
 88 |     logging.info(f'setting up local_rank {local_rank} global_rank {rank} world size {world_size}')
 89 |     setup_for_distributed(rank == 0)
 90 |     return local_rank, rank, world_size
 91 | 
 92 | 
 93 | def get_rank():
 94 |     return torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
 95 | 
 96 | 
 97 | def get_local_rank():
 98 |     return int(os.environ.get('LOCAL_RANK', '0'))
 99 | 
100 | 
101 | # ----------------------------------------------------------------------------
102 | 
def get_world_size():
    """Return the process-group world size, or 1 when no group is initialised."""
    if not torch.distributed.is_initialized():
        return 1
    return torch.distributed.get_world_size()
105 | 
106 | 
def print0(*args, **kwargs):
    """Forward the arguments to ``print`` only on the process whose rank is 0."""
    if get_rank() != 0:
        return
    print(*args, **kwargs)
110 | 
111 | 
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Besides seeding, this exports ``PYTHONHASHSEED``, turns on
    ``cudnn.deterministic``, turns off ``cudnn.benchmark``, empties the CUDA
    cache and logs the seed that was used.

    Args:
        seed: value passed to every seeding function.
    """
    import random

    import numpy as np

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # Both torch entry points are invoked, exactly as the original code did.
    for seed_torch in (torch.manual_seed, torch.random.manual_seed):
        seed_torch(seed)
    if torch.cuda.is_available():
        for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            seed_cuda(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    torch.cuda.empty_cache()

    logging.info(f'Using seed: {seed}')
131 | 
132 | 
def load_distributed(ddp_model, CHECKPOINT_PATH, rank=0):
    """Load a checkpoint into ``ddp_model``, remapping ``cuda:0`` tensors to ``cuda:<rank>``.

    Args:
        ddp_model: object exposing ``load_state_dict``.
        CHECKPOINT_PATH: location passed to ``torch.load``.
        rank: index of the CUDA device that tensors saved on ``cuda:0``
            should be loaded onto.
    """
    # Only storages recorded as living on cuda:0 are redirected.
    device_remap = {'cuda:0': 'cuda:%d' % rank}
    state_dict = torch.load(CHECKPOINT_PATH, map_location=device_remap)
    # NOTE(review): the original kept a commented-out `load_attn_procs(`
    # alternative here; `load_state_dict` is what actually runs.
    ddp_model.load_state_dict(state_dict)
139 | 


--------------------------------------------------------------------------------
/scripts/train_gfn.py:
--------------------------------------------------------------------------------
  1 | # For licensing see accompanying LICENSE file.
  2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  3 | 
  4 | import os, sys
  5 | from collections import defaultdict
  6 | import contextlib
  7 | import datetime
  8 | import time
  9 | from concurrent import futures
 10 | import wandb
 11 | from functools import partial
 12 | import tempfile
 13 | from PIL import Image
 14 | import tqdm
 15 | tqdm = partial(tqdm.tqdm, dynamic_ncols=True)
 16 | import logging
 17 | import yaml
 18 | from termcolor import colored
 19 | import copy
 20 | import math
 21 | import pickle, gzip
 22 | 
 23 | import diffusers
 24 | from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 25 | from diffusers.optimization import get_scheduler
 26 | from diffusers.training_utils import cast_training_params
 27 | from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available
 28 | from diffusers.utils.torch_utils import is_compiled_module
 29 | 
 30 | import datasets
 31 | from packaging import version
 32 | from peft import LoraConfig
 33 | from peft.utils import get_peft_model_state_dict
 34 | import transformers
 35 | from transformers import CLIPTextModel, CLIPTokenizer
 36 | 
 37 | import numpy as np
 38 | import torch
 39 | import torch.nn.functional as F
 40 | import torch.utils.checkpoint
 41 | import torch.distributed as dist
 42 | from torch.nn.parallel import DistributedDataParallel as DDP
 43 | from scripts.distributed import init_distributed_singlenode, set_seed, load_distributed, setup_for_distributed
 44 | 
 45 | import alignment.prompts
 46 | import alignment.rewards
 47 | from alignment.diffusers_patch.pipeline_with_logprob import pipeline_with_logprob
 48 | from alignment.diffusers_patch.ddim_with_logprob import ddim_step_with_logprob, pred_orig_latent
 49 | from alignment.flow import ConditionalFlow
 50 | 
 51 | 
 52 | def unwrap_model(model):
 53 |     model = model.module if isinstance(model, DDP) else model
 54 |     model = model._orig_mod if is_compiled_module(model) else model
 55 |     return model
 56 | 
 57 | def main():
 58 |     logging.basicConfig(
 59 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
 60 |         datefmt="%m/%d/%Y %H:%M:%S",
 61 |         level=logging.INFO,
 62 |     )
 63 |     logger = logging.getLogger(__name__)
 64 | 
 65 |     config = yaml.safe_load(open("config/sd.yaml"))['parameters']
 66 | 
 67 |     local_rank, global_rank, world_size = init_distributed_singlenode(timeout=36000)
 68 |     num_processes = world_size
 69 |     is_local_main_process = local_rank == 0
 70 |     setup_for_distributed(is_local_main_process)
 71 | 
 72 |     config['gpu_type'] = torch.cuda.get_device_name() \
 73 |                             if torch.cuda.is_available() else "CPU"
 74 |     logger.info(f"GPU type: {config['gpu_type']}")
 75 | 
 76 |     output_dir = os.path.join("./output")
 77 |     os.makedirs(output_dir, exist_ok=True)
 78 |     if config['wandb']:
 79 |         wandb.init(project="gflownet-alignment SD", config=config,
 80 |            save_code=True, mode="online" if is_local_main_process else "disabled")
 81 | 
 82 |     logger.info(f"\n{config}")
 83 |     set_seed(config['seed'])
 84 | 
 85 |     # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
 86 |     # as these weights are only used for inference, keeping weights in full precision is not required.
 87 |     weight_dtype = torch.float32
 88 |     if config['mixed_precision'] == "fp16":
 89 |         weight_dtype = torch.float16
 90 |     elif config['mixed_precision'] == "bf16":
 91 |         weight_dtype = torch.bfloat16
 92 |     device = torch.device(local_rank)
 93 | 
 94 |     pipeline = StableDiffusionPipeline.from_pretrained(
 95 |         config['pretrained']['model'], revision=config['pretrained']['revision'], torch_dtype=weight_dtype,
 96 |     )
 97 |     scheduler_config = {}
 98 |     scheduler_config.update(pipeline.scheduler.config)
 99 |     pipeline.scheduler = DDIMScheduler.from_config(scheduler_config)
100 |     pipeline.vae.requires_grad_(False)
101 |     pipeline.text_encoder.requires_grad_(False)
102 |     pipeline.vae.to(device, dtype=weight_dtype)
103 |     pipeline.text_encoder.to(device, dtype=weight_dtype)
104 | 
105 |     pipeline.safety_checker = None
106 |     pipeline.set_progress_bar_config(
107 |         position=1,
108 |         disable=not is_local_main_process,
109 |         leave=False,
110 |         desc="Timestep",
111 |         dynamic_ncols=True,
112 |     )
113 | 
114 |     unet = pipeline.unet
115 |     unet.requires_grad_(False)
116 |     for param in unet.parameters():
117 |         param.requires_grad_(False)
118 |     assert config['use_lora']
119 |     unet.to(device, dtype=weight_dtype)
120 |     unet_lora_config = LoraConfig(
121 |         r=config['train']['lora_rank'], lora_alpha=config['train']['lora_rank'],
122 |         init_lora_weights="gaussian", target_modules=["to_k", "to_q", "to_v", "to_out.0"],
123 |     )
124 |     unet.add_adapter(unet_lora_config)
125 |     if config['mixed_precision'] in ["fp16", "bf16"]:
126 |         # only upcast trainable parameters (LoRA) into fp32
127 |         cast_training_params(unet, dtype=torch.float32)
128 |     lora_layers = filter(lambda p: p.requires_grad, unet.parameters())
129 | 
130 |     scaler = None
131 |     if config['mixed_precision'] in ["fp16", "bf16"]:
132 |         scaler = torch.cuda.amp.GradScaler()
133 | 
134 |     # Enable TF32 for faster training on Ampere GPUs,
135 |     # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
136 |     if config['allow_tf32']:
137 |         torch.backends.cuda.matmul.allow_tf32 = True
138 |         # torch.backends.cudnn.allow_tf32 is True by default
139 |         torch.backends.cudnn.benchmark = True
140 | 
141 |     if config['train']['use_8bit_adam']:
142 |         try:
143 |             import bitsandbytes as bnb
144 |         except ImportError:
145 |             raise ImportError(
146 |                 "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
147 |             )
148 |         optimizer_cls = bnb.optim.AdamW8bit
149 |     else:
150 |         optimizer_cls = torch.optim.AdamW
151 | 
152 |     # prepare prompt and reward fn
153 |     prompt_fn = getattr(alignment.prompts, config['prompt_fn'])
154 |     reward_fn = getattr(alignment.rewards, config['reward_fn'])(weight_dtype, device)
155 | 
156 |     # generate negative prompt embeddings
157 |     neg_prompt_embed = pipeline.text_encoder(
158 |         pipeline.tokenizer(
159 |             [""],
160 |             return_tensors="pt",
161 |             padding="max_length",
162 |             truncation=True,
163 |             max_length=pipeline.tokenizer.model_max_length, # 77
164 |         ).input_ids.to(device)
165 |     )[0]
166 |     sample_neg_prompt_embeds = neg_prompt_embed.repeat(config['sample']['batch_size'], 1, 1)
167 |     train_neg_prompt_embeds = neg_prompt_embed.repeat(config['train']['batch_size'], 1, 1)
168 | 
169 |     # for some reason, autocast is necessary for non-lora training but for lora training it isn't necessary and it uses
170 |     # more memory
171 |     def func_autocast():
172 |         return torch.cuda.amp.autocast(dtype=weight_dtype)
173 |     if config['use_lora']:
174 |         # LoRA weights are actually float32, but other part of SD are in bf16/fp16
175 |         autocast = contextlib.nullcontext
176 |     else:
177 |         autocast = func_autocast
178 | 
179 |     unet.to(device)
180 |     unet = DDP(unet, device_ids=[local_rank])
181 | 
182 |     #######################################################
183 |     #################### FOR GFN ##########################
184 |     def decode(latents):
185 |         image = pipeline.vae.decode(
186 |             latents / pipeline.vae.config.scaling_factor, return_dict=False
187 |         )[0]
188 |         # image, has_nsfw_concept = pipeline.run_safety_checker(
189 |         #     image, device, prompt_embeds.dtype
190 |         # )
191 |         do_denormalize = [True] * image.shape[0]
192 |         image = pipeline.image_processor.postprocess(image,
193 |                      output_type="pt", do_denormalize=do_denormalize)
194 |         return image
195 | 
196 |     flow_model = ConditionalFlow(in_channels=4, block_out_channels=(64, 128, 256, 256),
197 |          layers_per_block=1, cross_attention_dim=pipeline.text_encoder.config.hidden_size) # hidden_size=768 is SD's text enconder output size
198 |     flow_model = flow_model.to(device, dtype=torch.float32)
199 |     autocast_flow = func_autocast
200 | 
201 |     flow_model = DDP(flow_model, device_ids=[local_rank])
202 |     params = [
203 |         {"params": lora_layers, "lr": config['train']['learning_rate']},
204 |         {"params": flow_model.parameters(), "lr": config['train']['learning_rate']}
205 |     ]
206 |     optimizer = optimizer_cls(
207 |         params,
208 |         betas=(config['train']['adam_beta1'], config['train']['adam_beta2']),
209 |         weight_decay=config['train']['adam_weight_decay'],
210 |         eps=config['train']['adam_epsilon'],
211 |     )
212 | 
213 |     result = defaultdict(dict)
214 |     result["config"] = config
215 |     start_time = time.time()
216 | 
217 |     #######################################################
218 |     # Start!
219 |     samples_per_epoch = (
220 |         config['sample']['batch_size'] * num_processes
221 |         * config['sample']['num_batches_per_epoch']
222 |     )
223 |     total_train_batch_size = (
224 |         config['train']['batch_size'] * num_processes
225 |         * config['train']['gradient_accumulation_steps']
226 |     )
227 | 
228 |     logger.info("***** Running training *****")
229 |     logger.info(f"  Num Epochs = {config['num_epochs']}")
230 |     logger.info(f"  Sample batch size per device = {config['sample']['batch_size']}")
231 |     logger.info(f"  Train batch size per device = {config['train']['batch_size']}")
232 |     logger.info(
233 |         f"  Gradient Accumulation steps = {config['train']['gradient_accumulation_steps']}"
234 |     )
235 |     logger.info("")
236 |     logger.info(f"  Total number of samples per epoch = test_bs * num_batch_per_epoch * num_process = {samples_per_epoch}")
237 |     logger.info(
238 |         f"  Total train batch size (w. parallel, distributed & accumulation) = train_bs * grad_accumul * num_process = {total_train_batch_size}"
239 |     )
240 |     logger.info(
241 |         f"  Number of gradient updates per inner epoch = samples_per_epoch // total_train_batch_size = {samples_per_epoch // total_train_batch_size}"
242 |     )
243 |     logger.info(f"  Number of inner epochs = {config['train']['num_inner_epochs']}")
244 | 
245 |     assert config['sample']['batch_size'] >= config['train']['batch_size']
246 |     assert config['sample']['batch_size'] % config['train']['batch_size'] == 0 # not necessary
247 |     assert samples_per_epoch % total_train_batch_size == 0
248 | 
249 |     first_epoch = 0
250 |     global_step = 0
251 |     for epoch in range(first_epoch, config['num_epochs']):
252 |         if config['train']['anneal'] in ["linear"]:
253 |             ratio = min(1, epoch / (0.5 * config['num_epochs']))
254 |         else:
255 |             ratio = 1.
256 |         reward_exp_ep = config['train']['reward_exp'] * ratio
257 |         def reward_transform(value):
258 |             return value * reward_exp_ep
259 | 
260 |         num_diffusion_steps = config['sample']['num_steps']
261 |         pipeline.scheduler.set_timesteps(num_diffusion_steps, device=device)  # set_timesteps(): 1000 steps -> 50 steps
262 |         scheduler_dt = pipeline.scheduler.timesteps[0] - pipeline.scheduler.timesteps[1]
263 |         num_train_timesteps = int(num_diffusion_steps * config['train']['timestep_fraction'])
264 |         accumulation_steps = config['train']['gradient_accumulation_steps'] * num_train_timesteps
265 | 
266 |         #################### SAMPLING ####################
267 |         torch.cuda.empty_cache()
268 |         unet.zero_grad()
269 |         unet.eval()
270 |         flow_model.zero_grad()
271 | 
272 |         if True:
273 |             with torch.inference_mode(): # similar to torch.no_grad() but also disables autograd.grad()
274 |                 samples = []
275 |                 prompts = []
276 |                 for i in tqdm(
277 |                     range(config['sample']['num_batches_per_epoch']),
278 |                     desc=f"Epoch {epoch}: sampling",
279 |                     disable=not is_local_main_process,
280 |                     position=0,
281 |                 ):
282 |                     # generate prompts
283 |                     prompts, prompt_metadata = zip(
284 |                         *[
285 |                             prompt_fn(**config['prompt_fn_kwargs'])
286 |                             for _ in range(config['sample']['batch_size'])
287 |                         ]
288 |                     )
289 | 
290 |                     # encode prompts
291 |                     prompt_ids = pipeline.tokenizer(
292 |                         prompts,
293 |                         return_tensors="pt",
294 |                         padding="max_length",
295 |                         truncation=True,
296 |                         max_length=pipeline.tokenizer.model_max_length,
297 |                     ).input_ids.to(device)
298 |                     prompt_embeds = pipeline.text_encoder(prompt_ids)[0]
299 | 
300 |                     # sample
301 |                     with autocast():
302 |                         ret_tuple = pipeline_with_logprob(
303 |                             pipeline,
304 |                             prompt_embeds=prompt_embeds,
305 |                             negative_prompt_embeds=sample_neg_prompt_embeds,
306 |                             num_inference_steps=num_diffusion_steps,
307 |                             guidance_scale=config['sample']['guidance_scale'],
308 |                             eta=config['sample']['eta'],
309 |                             output_type="pt",
310 | 
311 |                             return_unetoutput=config['train']['unetreg'] > 0.,
312 |                         )
313 |                     if config['train']['unetreg'] > 0:
314 |                         images, _, latents, log_probs, unet_outputs = ret_tuple
315 |                         unet_outputs = torch.stack(unet_outputs, dim=1)  # (batch_size, num_steps, 3, 32, 32)
316 |                     else:
317 |                         images, _, latents, log_probs = ret_tuple
318 | 
319 |                     latents = torch.stack(latents, dim=1)  # (batch_size, num_steps + 1, 4, 64, 64)
320 |                     log_probs = torch.stack(log_probs, dim=1)  # (batch_size, num_steps, 1)
321 |                     timesteps = pipeline.scheduler.timesteps.repeat(
322 |                         config['sample']['batch_size'], 1
323 |                     )  # (bs, num_steps)  (981, 961, ..., 21, 1) corresponds to "next_latents"
324 | 
325 |                     rewards = reward_fn(images, prompts, prompt_metadata) # (reward, reward_metadata)
326 | 
327 |                     samples.append(
328 |                         {
329 |                             "prompts": prompts, # tuple of strings
330 |                             "prompt_metadata": prompt_metadata,
331 | 
332 |                             "prompt_ids": prompt_ids,
333 |                             "prompt_embeds": prompt_embeds,
334 |                             "timesteps": timesteps,
335 |                             "latents": latents[
336 |                                 :, :-1
337 |                             ],  # each entry is the latent before timestep t
338 |                             "next_latents": latents[
339 |                                 :, 1:
340 |                             ],  # each entry is the latent after timestep t
341 |                             "log_probs": log_probs,
342 |                             "rewards": rewards,
343 |                         }
344 |                     )
345 |                     if config['train']['unetreg'] > 0:
346 |                         samples[-1]["unet_outputs"] = unet_outputs
347 | 
348 |                 # wait for all rewards to be computed
349 |                 for sample in tqdm(
350 |                     samples,
351 |                     desc="Waiting for rewards",
352 |                     disable=not is_local_main_process,
353 |                     position=0,
354 |                 ):
355 |                     rewards, reward_metadata = sample["rewards"]
356 |                     sample["rewards"] = torch.as_tensor(rewards, device=device)
357 | 
358 |             # collate samples into dict where each entry has shape (num_batches_per_epoch * sample.batch_size, ...)
359 |             new_samples = {}
360 |             for k in samples[0].keys():
361 |                 if k in ["prompts", "prompt_metadata"]:
362 |                     # list of tuples [('cat', 'dog'), ('cat', 'tiger'), ...] -> list ['cat', 'dog', 'cat', 'tiger', ...]
363 |                     new_samples[k] = [item for s in samples for item in s[k]]
364 |                 else:
365 |                     new_samples[k] = torch.cat([s[k] for s in samples])
366 |             samples = new_samples
367 | 
368 |             # this is a hack to force wandb to log the images as JPEGs instead of PNGs
369 |             with tempfile.TemporaryDirectory() as tmpdir:
370 |                 for i, image in enumerate(images):
371 |                     # bf16 cannot be converted to numpy directly
372 |                     pil = Image.fromarray(
373 |                         (image.cpu().float().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
374 |                     )
375 |                     pil = pil.resize((256, 256))
376 |                     pil.save(os.path.join(tmpdir, f"{i}.jpg"))
377 |                 if config['wandb'] and is_local_main_process:
378 |                     wandb.log(
379 |                         {
380 |                             "images": [
381 |                                 wandb.Image(
382 |                                     os.path.join(tmpdir, f"{i}.jpg"),
383 |                                     caption=f"{prompt} | {reward:.2f}",
384 |                                 )
385 |                                 for i, (prompt, reward) in enumerate(
386 |                                     zip(prompts, rewards)
387 |                                 )
388 |                             ],
389 |                         },
390 |                         step=global_step,
391 |                     )
392 | 
393 |             rewards = torch.zeros(world_size * len(samples["rewards"]),
394 |                           dtype=samples["rewards"].dtype, device=device)
395 |             dist.all_gather_into_tensor(rewards, samples["rewards"])
396 |             rewards = rewards.cpu().float().numpy()
397 |             result["reward_mean"][global_step] = rewards.mean()
398 |             result["reward_std"][global_step] = rewards.std()
399 | 
400 |             if is_local_main_process:
401 |                 logger.info(f"global_step: {global_step}  rewards: {rewards.mean().item():.3f}")
402 |                 if config['wandb']:
403 |                     wandb.log(
404 |                         {
405 |                             "reward_mean": rewards.mean(), # samples["rewards"].mean()
406 |                             "reward_std": rewards.std(),
407 |                         },
408 |                         step=global_step,
409 |                     )
410 | 
411 |             del samples["prompt_ids"]
412 | 
413 |             total_batch_size, num_timesteps = samples["timesteps"].shape
414 |             assert (
415 |                 total_batch_size
416 |                 == config['sample']['batch_size'] * config['sample']['num_batches_per_epoch']
417 |             )
418 |             assert num_timesteps == num_diffusion_steps
419 | 
420 |         #################### TRAINING ####################
421 |         for inner_epoch in range(config['train']['num_inner_epochs']):
422 |             # shuffle samples along batch dimension
423 |             perm = torch.randperm(total_batch_size, device=device)
424 |             for k, v in samples.items():
425 |                 if k in ["prompts", "prompt_metadata"]:
426 |                     samples[k] = [v[i] for i in perm]
427 |                 elif k in ["unet_outputs"]:
428 |                     samples[k] = v[perm]
429 |                 else:
430 |                     samples[k] = v[perm]
431 | 
432 |             perms = torch.stack(
433 |                 [
434 |                     torch.randperm(num_timesteps, device=device)
435 |                     for _ in range(total_batch_size)
436 |                 ]
437 |             ) # (total_batch_size, num_steps)
438 |             # "prompts" & "prompt_metadata" are constant along time dimension
439 |             key_ls = ["timesteps", "latents", "next_latents", "log_probs"]
440 |             for key in key_ls:
441 |                 samples[key] = samples[key][torch.arange(total_batch_size, device=device)[:, None], perms]
442 |             if config['train']['unetreg'] > 0:
443 |                 samples["unet_outputs"] = \
444 |                     samples["unet_outputs"][torch.arange(total_batch_size, device=device)[:, None], perms]
445 | 
446 |             ### rebatch for training
447 |             samples_batched = {}
448 |             for k, v in samples.items():
449 |                 if k in ["prompts", "prompt_metadata"]:
450 |                     samples_batched[k] = [v[i:i + config['train']['batch_size']]
451 |                                 for i in range(0, len(v), config['train']['batch_size'])]
452 |                 elif k in ["unet_outputs"]:
453 |                     samples_batched[k] = v.reshape(-1, config['train']['batch_size'], *v.shape[1:])
454 |                 else:
455 |                     samples_batched[k] = v.reshape(-1, config['train']['batch_size'], *v.shape[1:])
456 | 
457 |             # dict of lists -> list of dicts for easier iteration
458 |             samples_batched = [
459 |                 dict(zip(samples_batched, x)) for x in zip(*samples_batched.values())
460 |             ] # len = sample_bs * num_batches_per_epoch // train_bs = num_train_batches_per_epoch
461 | 
462 |             unet.train()
463 |             flow_model.train()
464 |             info = defaultdict(list)
465 |             for i, sample in tqdm(
466 |                 list(enumerate(samples_batched)),
467 |                 desc=f"Epoch {epoch}.{inner_epoch}: training",
468 |                 position=0,
469 |                 disable=not is_local_main_process,
470 |             ):
471 |                 """
472 |                 sample: [
473 |                 ('prompts', list of strings, len=train_bs), ('prompt_metadata', list of dicts),
474 |                 (bf16) ('prompt_embeds', torch.Size([1, 77, 768])), 
475 |                 (int64) ('timesteps', torch.Size([1, 50])), 
476 |                 (bf16) ('latents', torch.Size([1, 50, 4, 64, 64])), ('next_latents', torch.Size([1, 50, 4, 64, 64])), 
477 |                 ('log_probs', torch.Size([1, 50])), 
478 |                 ]
479 |                 """
480 |                 if config['train']['cfg']:
481 |                     # concat negative prompts to sample prompts to avoid two forward passes
482 |                     embeds = torch.cat(
483 |                         [train_neg_prompt_embeds, sample["prompt_embeds"]]
484 |                     )
485 |                 else:
486 |                     embeds = sample["prompt_embeds"]
487 | 
488 |                 for j in tqdm(range(num_train_timesteps), desc="Timestep", position=1, leave=False, disable=not is_local_main_process):
489 |                     with autocast():
490 |                         if config['train']['cfg']:
491 |                             noise_pred = unet(
492 |                                 torch.cat([sample["latents"][:, j]] * 2),
493 |                                 torch.cat([sample["timesteps"][:, j]] * 2),
494 |                                 embeds,
495 |                             ).sample
496 |                             noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
497 |                             noise_pred = (
498 |                                     noise_pred_uncond
499 |                                     + config['sample']['guidance_scale']
500 |                                     * (noise_pred_text - noise_pred_uncond)
501 |                             )
502 |                             if config['train']['unetreg'] > 0:
503 |                                 unetreg = (noise_pred - sample["unet_outputs"][:, j])**2
504 |                                 unetreg = torch.mean(unetreg, dim=(1, 2, 3))
505 | 
506 |                         else:
507 |                             noise_pred = unet(
508 |                                 sample["latents"][:, j],
509 |                                 sample["timesteps"][:, j],
510 |                                 embeds,
511 |                             ).sample   # (bs, 4, 64, 64)
512 |                             if config['train']['unetreg'] > 0:
513 |                                 unetreg = (noise_pred - sample["unet_outputs"][:, j])**2
514 | 
515 |                         _, log_pf, log_pb = ddim_step_with_logprob(
516 |                             pipeline.scheduler, noise_pred,
517 |                             sample["timesteps"][:, j], # (train_bs, 50) -> (train_bs,)
518 |                             sample["latents"][:, j], eta=config['sample']['eta'],
519 |                             prev_sample=sample["next_latents"][:, j], calculate_pb=True,
520 |                         ) # log_pf :(bs,)
521 | 
522 |                     #######################################################
523 |                     #################### GFN ALGORITHM ####################
524 |                     #######################################################
525 |                     with autocast_flow():
526 |                         flow = flow_model(sample["latents"][:, j], sample["timesteps"][:, j], sample["prompt_embeds"])
527 |                         timestep_next = torch.clamp(sample["timesteps"][:, j] - scheduler_dt, min=0)
528 |                         flow_next = flow_model(sample["next_latents"][:, j], timestep_next, sample["prompt_embeds"])
529 | 
530 |                     with autocast(), torch.no_grad():
531 |                         unet_output = unet(sample["latents"][:, j], sample["timesteps"][:, j], sample["prompt_embeds"]).sample
532 |                         latent = pred_orig_latent(pipeline.scheduler, unet_output, sample["latents"][:, j], sample["timesteps"][:, j])
533 |                     with torch.inference_mode():
534 |                         logr_tmp = reward_fn(decode(latent), sample["prompts"], sample["prompt_metadata"])[0]   # tuple -> tensor
535 |                     logr = reward_transform(logr_tmp)
536 |                     flow = flow + logr # bf16 + float32 -> float32
537 | 
538 |                     with autocast(), torch.no_grad():
539 |                         unet_output = unet(sample["next_latents"][:, j], timestep_next, sample["prompt_embeds"]).sample
540 |                         latent_next = pred_orig_latent(pipeline.scheduler, unet_output, sample["next_latents"][:, j], timestep_next)
541 |                     with torch.inference_mode():
542 |                         logr_next_tmp = reward_fn(decode(latent_next), sample["prompts"], sample["prompt_metadata"])[0]
543 |                     logr_next = reward_transform(logr_next_tmp)
544 |                     flow_next = flow_next + logr_next
545 |                     end_mask = sample["timesteps"][:, j] == pipeline.scheduler.timesteps[-1] # RHS is 1
546 |                     flow_next[end_mask] = reward_transform(sample['rewards'][end_mask].to(flow_next))
547 | 
548 |                     info["log_pf"].append(torch.mean(log_pf).detach())
549 |                     info["flow"].append(torch.mean(flow).detach())
550 |                     info["log_pb"].append(torch.mean(log_pb).detach())
551 | 
552 |                     if config['train']['klpf'] > 0:
553 |                         losses_flow = (flow + log_pf.detach() - log_pb.detach() - flow_next) ** 2
554 | 
555 |                         flow_next_klpf = flow_next.detach()
556 |                         log_pb_klpf, log_pf_klpf = log_pb.detach(), log_pf.detach()
557 |                         reward_db = (flow_next_klpf + log_pb_klpf - log_pf_klpf - flow).detach()
558 | 
559 |                         # different gpu has different states, so cannot share a baseline
560 |                         assert len(reward_db) > 1
561 |                         rloo_baseline = (reward_db.sum() - reward_db) / (len(reward_db) - 1)
562 |                         reward_ = (reward_db - rloo_baseline) ** 2
563 |                         rloo_var = (reward_.sum() - reward_) / (len(reward_db) - 1)
564 |                         advantages = (reward_db - rloo_baseline) / (rloo_var.sqrt() + 1e-8)
565 |                         advantages = torch.clamp(advantages, -config['train']['adv_clip_max'], config['train']['adv_clip_max'])
566 | 
567 |                         ratio = torch.exp(log_pf - sample["log_probs"][:, j])
568 |                         unclipped_losses = -advantages * ratio
569 |                         clipped_losses = -advantages * torch.clamp(
570 |                             ratio,
571 |                             1.0 - config['train']['clip_range'],
572 |                             1.0 + config['train']['clip_range'],
573 |                             )
574 |                         losses_klpf = torch.maximum(unclipped_losses, clipped_losses)
575 |                         info["ratio"].append(torch.mean(ratio).detach())
576 | 
577 |                         losses = losses_flow + config['train']['klpf'] * losses_klpf
578 |                         info["loss"].append(losses_flow.mean().detach())
579 |                         info["loss_klpf"].append(losses_klpf.mean().detach())
580 |                         torch.cuda.empty_cache() # clear comp graph for log_pf_next
581 |                     else:
582 |                         losses_gfn = (flow + log_pf - log_pb - flow_next) ** 2  # (bs,)
583 |                         info["loss"].append(losses_gfn.mean().detach())
584 |                         losses = losses_gfn
585 | 
586 |                     if config['train']['unetreg'] > 0:
587 |                         losses = losses + config['train']['unetreg'] * unetreg
588 |                         info["unetreg"].append(unetreg.mean().detach())
589 |                     loss = torch.mean(losses)
590 | 
591 |                     if logr_tmp is not None:
592 |                         info["logr"].append(torch.mean(logr_tmp).detach())
593 | 
594 |                     loss = loss / accumulation_steps
595 |                     if scaler:
596 |                         # Backward passes under autocast are not recommended
597 |                         scaler.scale(loss).backward()
598 |                     else:
599 |                         loss.backward()
600 | 
601 |                     # prevent OOM
602 |                     image_next = image = prev_sample_klpf = unet_output = latent = latent_next = latent_next_next = None
603 |                     noise_pred_next_uncond = noise_pred_next_text = noise_pred_uncond = noise_pred_text = noise_pred = noise_pred_next = None
604 |                     flow = flow_next = flow_next_next = logr = logr_next = logr_next_next = logr_next_tmp = logr_tmp = reward_db = advantages = None
605 |                     _ = log_pf = log_pb = log_pf_next = log_pb_next = log_pf_klpf = log_pb_klpf = None
606 |                     unetreg = unetreg_initial = losses = losses_flow = losses_klpf = losses_gfn = None
607 | 
608 |                 if ((j == num_train_timesteps - 1) and
609 |                         (i + 1) % config['train']['gradient_accumulation_steps'] == 0):
610 |                     if scaler:
611 |                         scaler.unscale_(optimizer)
612 |                         torch.nn.utils.clip_grad_norm_(unet.parameters(), config['train']['max_grad_norm'])
613 |                         torch.nn.utils.clip_grad_norm_(flow_model.parameters(), config['train']['max_grad_norm'])
614 |                         scaler.step(optimizer)
615 |                         scaler.update()
616 |                     else:
617 |                         torch.nn.utils.clip_grad_norm_(unet.parameters(), config['train']['max_grad_norm'])
618 |                         torch.nn.utils.clip_grad_norm_(flow_model.parameters(), config['train']['max_grad_norm'])
619 |                         optimizer.step()
620 |                     optimizer.zero_grad()
621 |                     global_step += 1
622 | 
623 |                     info = {k: torch.mean(torch.stack(v)) for k, v in info.items()}
624 |                     dist.barrier()
625 |                     for k, v in info.items():
626 |                         dist.all_reduce(v, op=dist.ReduceOp.SUM)
627 |                     info = {k: v / num_processes for k, v in info.items()}
628 |                     for k, v in info.items():
629 |                         result[k][global_step] = v.item()
630 | 
631 |                     info.update({"epoch": epoch})
632 |                     result["epoch"][global_step] = epoch
633 |                     result["time"][global_step] = time.time() - start_time
634 | 
635 |                     if is_local_main_process:
636 |                         if config['wandb']:
637 |                             wandb.log(info, step=global_step)
638 |                         logger.info(f"global_step={global_step}  " +
639 |                               " ".join([f"{k}={v:.3f}" for k, v in info.items()]))
640 |                     info = defaultdict(list) # reset info dict
641 | 
642 |         if is_local_main_process:
643 |             pickle.dump(result, gzip.open(os.path.join(output_dir, f"result.json"), 'wb'))
644 |         dist.barrier()
645 | 
646 |         if epoch % config['save_freq'] == 0 or epoch == config['num_epochs'] - 1:
647 |             if is_local_main_process:
648 |                 save_path = os.path.join(output_dir, f"checkpoint_epoch{epoch}")
649 |                 unwrapped_unet = unwrap_model(unet)
650 |                 unet_lora_state_dict = convert_state_dict_to_diffusers(
651 |                     get_peft_model_state_dict(unwrapped_unet)
652 |                 )
653 |                 StableDiffusionPipeline.save_lora_weights(
654 |                     save_directory=save_path,
655 |                     unet_lora_layers=unet_lora_state_dict,
656 |                     is_main_process=is_local_main_process,
657 |                     safe_serialization=True,
658 |                 )
659 |                 logger.info(f"Saved state to {save_path}")
660 | 
661 |             dist.barrier()
662 | 
663 |     if config['wandb'] and is_local_main_process:
664 |         wandb.finish()
665 | 
666 | 
667 | if __name__ == "__main__":  # script entry point: only runs when this file is executed directly, not on import
668 |     main()  # run the full training loop defined above
669 |     dist.destroy_process_group()  # tear down the process group after main() returns; `dist` is presumably torch.distributed (import not visible here) — note this is skipped if main() raises
670 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # For licensing see accompanying LICENSE file.
 2 | # Copyright (C) 2024 Apple Inc. All Rights Reserved.
 3 | 
 4 | from setuptools import setup, find_packages
 5 | 
 6 | setup(
 7 |     name="diffusion-alignment-pytorch",
 8 |     version="0.0.1",
 9 |     python_requires=">=3.8",
10 |     install_requires=[
11 |         "ml-collections",
12 |         "absl-py",
13 |         "diffusers[torch]>=0.29.0", # 0.29.0 supports SD3
14 |         "accelerate",
15 |         "torchvision",
16 |         "inflect==6.0.4",
17 |         "pydantic==1.10.13",
18 | 
19 |         "wandb",
20 |         "ipdb",
21 |         "line_profiler",
22 |         "timm",
23 |         "termcolor",
24 |         "openai-clip",
25 |         "image-reward",
26 |         "ipykernel",
27 |         "clint",
28 |         "torchmetrics[image]>=1.4.0", # using [image] to install torch-fidelity
29 |         "peft>=0.6.0",
30 |         "transformers>=4.41.2"
31 |         "einops",
32 |         "torchdiffeq",
33 |     ],
34 | )
35 | 


--------------------------------------------------------------------------------