├── LICENSE
├── README.md
├── assets
    ├── a-painting-of-a-fire.png
    ├── a-photograph-of-a-fire.png
    ├── a-shirt-with-a-fire-printed-on-it.png
    ├── a-shirt-with-the-inscription-'fire'.png
    ├── a-watercolor-painting-of-a-fire.png
    ├── birdhouse.png
    ├── fire.png
    ├── inpainting.png
    ├── modelfigure.png
    ├── rdm-preview.jpg
    ├── reconstruction1.png
    ├── reconstruction2.png
    ├── results.gif
    ├── the-earth-is-on-fire,-oil-on-canvas.png
    ├── txt2img-convsample.png
    └── txt2img-preview.png
├── configs
    ├── autoencoder
    │   ├── autoencoder_kl_16x16x16.yaml
    │   ├── autoencoder_kl_32x32x4.yaml
    │   ├── autoencoder_kl_64x64x3.yaml
    │   └── autoencoder_kl_8x8x64.yaml
    ├── latent-diffusion
    │   ├── celebahq-ldm-vq-4.yaml
    │   ├── cin-ldm-vq-f8.yaml
    │   ├── cin256-v2.yaml
    │   ├── ffhq-ldm-vq-4.yaml
    │   ├── lsun_bedrooms-ldm-vq-4.yaml
    │   ├── lsun_churches-ldm-kl-8.yaml
    │   └── txt2img-1p4B-eval.yaml
    └── retrieval-augmented-diffusion
    │   └── 768x768.yaml
├── data
    ├── DejaVuSans.ttf
    ├── example_conditioning
    │   ├── superresolution
    │   │   └── sample_0.jpg
    │   └── text_conditional
    │   │   └── sample_0.txt
    ├── imagenet_clsidx_to_label.txt
    ├── imagenet_train_hr_indices.p
    ├── imagenet_val_hr_indices.p
    ├── index_synset.yaml
    └── inpainting_examples
    │   ├── 6458524847_2f4c361183_k.png
    │   ├── 6458524847_2f4c361183_k_mask.png
    │   ├── 8399166846_f6fb4e4b8e_k.png
    │   ├── 8399166846_f6fb4e4b8e_k_mask.png
    │   ├── alex-iby-G_Pk4D9rMLs.png
    │   ├── alex-iby-G_Pk4D9rMLs_mask.png
    │   ├── bench2.png
    │   ├── bench2_mask.png
    │   ├── bertrand-gabioud-CpuFzIsHYJ0.png
    │   ├── bertrand-gabioud-CpuFzIsHYJ0_mask.png
    │   ├── billow926-12-Wc-Zgx6Y.png
    │   ├── billow926-12-Wc-Zgx6Y_mask.png
    │   ├── overture-creations-5sI6fQgYIuo.png
    │   ├── overture-creations-5sI6fQgYIuo_mask.png
    │   ├── photo-1583445095369-9c651e7e5d34.png
    │   └── photo-1583445095369-9c651e7e5d34_mask.png
├── environment.yaml
├── ldm
    ├── data
    │   ├── __init__.py
    │   ├── base.py
    │   ├── imagenet.py
    │   └── lsun.py
    ├── lr_scheduler.py
    ├── models
    │   ├── autoencoder.py
    │   └── diffusion
    │   │   ├── __init__.py
    │   │   ├── classifier.py
    │   │   ├── ddim.py
    │   │   ├── ddpm.py
    │   │   └── plms.py
    ├── modules
    │   ├── attention.py
    │   ├── diffusionmodules
    │   │   ├── __init__.py
    │   │   ├── model.py
    │   │   ├── openaimodel.py
    │   │   └── util.py
    │   ├── distributions
    │   │   ├── __init__.py
    │   │   └── distributions.py
    │   ├── ema.py
    │   ├── encoders
    │   │   ├── __init__.py
    │   │   └── modules.py
    │   ├── image_degradation
    │   │   ├── __init__.py
    │   │   ├── bsrgan.py
    │   │   ├── bsrgan_light.py
    │   │   ├── utils
    │   │   │   └── test.png
    │   │   └── utils_image.py
    │   ├── losses
    │   │   ├── __init__.py
    │   │   ├── contperceptual.py
    │   │   └── vqperceptual.py
    │   └── x_transformer.py
    └── util.py
├── main.py
├── models
    ├── first_stage_models
    │   ├── kl-f16
    │   │   └── config.yaml
    │   ├── kl-f32
    │   │   └── config.yaml
    │   ├── kl-f4
    │   │   └── config.yaml
    │   ├── kl-f8
    │   │   └── config.yaml
    │   ├── vq-f16
    │   │   └── config.yaml
    │   ├── vq-f4-noattn
    │   │   └── config.yaml
    │   ├── vq-f4
    │   │   └── config.yaml
    │   ├── vq-f8-n256
    │   │   └── config.yaml
    │   └── vq-f8
    │   │   └── config.yaml
    └── ldm
    │   ├── bsr_sr
    │       └── config.yaml
    │   ├── celeba256
    │       └── config.yaml
    │   ├── cin256
    │       └── config.yaml
    │   ├── ffhq256
    │       └── config.yaml
    │   ├── inpainting_big
    │       └── config.yaml
    │   ├── layout2img-openimages256
    │       └── config.yaml
    │   ├── lsun_beds256
    │       └── config.yaml
    │   ├── lsun_churches256
    │       └── config.yaml
    │   ├── semantic_synthesis256
    │       └── config.yaml
    │   ├── semantic_synthesis512
    │       └── config.yaml
    │   └── text2img256
    │       └── config.yaml
├── notebook_helpers.py
├── scripts
    ├── download_first_stages.sh
    ├── download_models.sh
    ├── inpaint.py
    ├── knn2img.py
    ├── latent_imagenet_diffusion.ipynb
    ├── sample_diffusion.py
    ├── train_searcher.py
    └── txt2img.py
└── setup.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/assets/a-painting-of-a-fire.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/a-painting-of-a-fire.png


--------------------------------------------------------------------------------
/assets/a-photograph-of-a-fire.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/a-photograph-of-a-fire.png


--------------------------------------------------------------------------------
/assets/a-shirt-with-a-fire-printed-on-it.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/a-shirt-with-a-fire-printed-on-it.png


--------------------------------------------------------------------------------
/assets/a-shirt-with-the-inscription-'fire'.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/a-shirt-with-the-inscription-'fire'.png


--------------------------------------------------------------------------------
/assets/a-watercolor-painting-of-a-fire.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/a-watercolor-painting-of-a-fire.png


--------------------------------------------------------------------------------
/assets/birdhouse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/birdhouse.png


--------------------------------------------------------------------------------
/assets/fire.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/fire.png


--------------------------------------------------------------------------------
/assets/inpainting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/inpainting.png


--------------------------------------------------------------------------------
/assets/modelfigure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/modelfigure.png


--------------------------------------------------------------------------------
/assets/rdm-preview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/rdm-preview.jpg


--------------------------------------------------------------------------------
/assets/reconstruction1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/reconstruction1.png


--------------------------------------------------------------------------------
/assets/reconstruction2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/reconstruction2.png


--------------------------------------------------------------------------------
/assets/results.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/results.gif


--------------------------------------------------------------------------------
/assets/the-earth-is-on-fire,-oil-on-canvas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/the-earth-is-on-fire,-oil-on-canvas.png


--------------------------------------------------------------------------------
/assets/txt2img-convsample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/txt2img-convsample.png


--------------------------------------------------------------------------------
/assets/txt2img-preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/assets/txt2img-preview.png


--------------------------------------------------------------------------------
/configs/autoencoder/autoencoder_kl_16x16x16.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-6
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: "val/rec_loss"
 6 |     embed_dim: 16
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 0.000001
12 |         disc_weight: 0.5
13 | 
14 |     ddconfig:
15 |       double_z: True
16 |       z_channels: 16
17 |       resolution: 256
18 |       in_channels: 3
19 |       out_ch: 3
20 |       ch: 128
21 |       ch_mult: [ 1,1,2,2,4]  # num_down = len(ch_mult)-1
22 |       num_res_blocks: 2
23 |       attn_resolutions: [16]
24 |       dropout: 0.0
25 | 
26 | 
27 | data:
28 |   target: main.DataModuleFromConfig
29 |   params:
30 |     batch_size: 12
31 |     wrap: True
32 |     train:
33 |       target: ldm.data.imagenet.ImageNetSRTrain
34 |       params:
35 |         size: 256
36 |         degradation: pil_nearest
37 |     validation:
38 |       target: ldm.data.imagenet.ImageNetSRValidation
39 |       params:
40 |         size: 256
41 |         degradation: pil_nearest
42 | 
43 | lightning:
44 |   callbacks:
45 |     image_logger:
46 |       target: main.ImageLogger
47 |       params:
48 |         batch_frequency: 1000
49 |         max_images: 8
50 |         increase_log_steps: True
51 | 
52 |   trainer:
53 |     benchmark: True
54 |     accumulate_grad_batches: 2
55 | 


--------------------------------------------------------------------------------
/configs/autoencoder/autoencoder_kl_32x32x4.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-6
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: "val/rec_loss"
 6 |     embed_dim: 4
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 0.000001
12 |         disc_weight: 0.5
13 | 
14 |     ddconfig:
15 |       double_z: True
16 |       z_channels: 4
17 |       resolution: 256
18 |       in_channels: 3
19 |       out_ch: 3
20 |       ch: 128
21 |       ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
22 |       num_res_blocks: 2
23 |       attn_resolutions: [ ]
24 |       dropout: 0.0
25 | 
26 | data:
27 |   target: main.DataModuleFromConfig
28 |   params:
29 |     batch_size: 12
30 |     wrap: True
31 |     train:
32 |       target: ldm.data.imagenet.ImageNetSRTrain
33 |       params:
34 |         size: 256
35 |         degradation: pil_nearest
36 |     validation:
37 |       target: ldm.data.imagenet.ImageNetSRValidation
38 |       params:
39 |         size: 256
40 |         degradation: pil_nearest
41 | 
42 | lightning:
43 |   callbacks:
44 |     image_logger:
45 |       target: main.ImageLogger
46 |       params:
47 |         batch_frequency: 1000
48 |         max_images: 8
49 |         increase_log_steps: True
50 | 
51 |   trainer:
52 |     benchmark: True
53 |     accumulate_grad_batches: 2
54 | 


--------------------------------------------------------------------------------
/configs/autoencoder/autoencoder_kl_64x64x3.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-6
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: "val/rec_loss"
 6 |     embed_dim: 3
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 0.000001
12 |         disc_weight: 0.5
13 | 
14 |     ddconfig:
15 |       double_z: True
16 |       z_channels: 3
17 |       resolution: 256
18 |       in_channels: 3
19 |       out_ch: 3
20 |       ch: 128
21 |       ch_mult: [ 1,2,4 ]  # num_down = len(ch_mult)-1
22 |       num_res_blocks: 2
23 |       attn_resolutions: [ ]
24 |       dropout: 0.0
25 | 
26 | 
27 | data:
28 |   target: main.DataModuleFromConfig
29 |   params:
30 |     batch_size: 12
31 |     wrap: True
32 |     train:
33 |       target: ldm.data.imagenet.ImageNetSRTrain
34 |       params:
35 |         size: 256
36 |         degradation: pil_nearest
37 |     validation:
38 |       target: ldm.data.imagenet.ImageNetSRValidation
39 |       params:
40 |         size: 256
41 |         degradation: pil_nearest
42 | 
43 | lightning:
44 |   callbacks:
45 |     image_logger:
46 |       target: main.ImageLogger
47 |       params:
48 |         batch_frequency: 1000
49 |         max_images: 8
50 |         increase_log_steps: True
51 | 
52 |   trainer:
53 |     benchmark: True
54 |     accumulate_grad_batches: 2
55 | 


--------------------------------------------------------------------------------
/configs/autoencoder/autoencoder_kl_8x8x64.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-6
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: "val/rec_loss"
 6 |     embed_dim: 64
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 0.000001
12 |         disc_weight: 0.5
13 | 
14 |     ddconfig:
15 |       double_z: True
16 |       z_channels: 64
17 |       resolution: 256
18 |       in_channels: 3
19 |       out_ch: 3
20 |       ch: 128
21 |       ch_mult: [ 1,1,2,2,4,4]  # num_down = len(ch_mult)-1
22 |       num_res_blocks: 2
23 |       attn_resolutions: [16,8]
24 |       dropout: 0.0
25 | 
26 | data:
27 |   target: main.DataModuleFromConfig
28 |   params:
29 |     batch_size: 12
30 |     wrap: True
31 |     train:
32 |       target: ldm.data.imagenet.ImageNetSRTrain
33 |       params:
34 |         size: 256
35 |         degradation: pil_nearest
36 |     validation:
37 |       target: ldm.data.imagenet.ImageNetSRValidation
38 |       params:
39 |         size: 256
40 |         degradation: pil_nearest
41 | 
42 | lightning:
43 |   callbacks:
44 |     image_logger:
45 |       target: main.ImageLogger
46 |       params:
47 |         batch_frequency: 1000
48 |         max_images: 8
49 |         increase_log_steps: True
50 | 
51 |   trainer:
52 |     benchmark: True
53 |     accumulate_grad_batches: 2
54 | 


--------------------------------------------------------------------------------
/configs/latent-diffusion/celebahq-ldm-vq-4.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     image_size: 64
12 |     channels: 3
13 |     monitor: val/loss_simple_ema
14 | 
15 |     unet_config:
16 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
17 |       params:
18 |         image_size: 64
19 |         in_channels: 3
20 |         out_channels: 3
21 |         model_channels: 224
22 |         attention_resolutions:
23 |         # note: this isn't actually the resolution but
24 |         # the downsampling factor, i.e. this corresponds to
25 |         # attention on spatial resolution 8,16,32, as the
26 |         # spatial resolution of the latents is 64 for f4
27 |         - 8
28 |         - 4
29 |         - 2
30 |         num_res_blocks: 2
31 |         channel_mult:
32 |         - 1
33 |         - 2
34 |         - 3
35 |         - 4
36 |         num_head_channels: 32
37 |     first_stage_config:
38 |       target: ldm.models.autoencoder.VQModelInterface
39 |       params:
40 |         embed_dim: 3
41 |         n_embed: 8192
42 |         ckpt_path: models/first_stage_models/vq-f4/model.ckpt
43 |         ddconfig:
44 |           double_z: false
45 |           z_channels: 3
46 |           resolution: 256
47 |           in_channels: 3
48 |           out_ch: 3
49 |           ch: 128
50 |           ch_mult:
51 |           - 1
52 |           - 2
53 |           - 4
54 |           num_res_blocks: 2
55 |           attn_resolutions: []
56 |           dropout: 0.0
57 |         lossconfig:
58 |           target: torch.nn.Identity
59 |     cond_stage_config: __is_unconditional__
60 | data:
61 |   target: main.DataModuleFromConfig
62 |   params:
63 |     batch_size: 48
64 |     num_workers: 5
65 |     wrap: false
66 |     train:
67 |       target: taming.data.faceshq.CelebAHQTrain
68 |       params:
69 |         size: 256
70 |     validation:
71 |       target: taming.data.faceshq.CelebAHQValidation
72 |       params:
73 |         size: 256
74 | 
75 | 
76 | lightning:
77 |   callbacks:
78 |     image_logger:
79 |       target: main.ImageLogger
80 |       params:
81 |         batch_frequency: 5000
82 |         max_images: 8
83 |         increase_log_steps: False
84 | 
85 |   trainer:
86 |     benchmark: True


--------------------------------------------------------------------------------
/configs/latent-diffusion/cin-ldm-vq-f8.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 1.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: class_label
12 |     image_size: 32
13 |     channels: 4
14 |     cond_stage_trainable: true
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     unet_config:
18 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
19 |       params:
20 |         image_size: 32
21 |         in_channels: 4
22 |         out_channels: 4
23 |         model_channels: 256
24 |         attention_resolutions:
25 |         # note: this isn't actually the resolution but
26 |         # the downsampling factor, i.e. this corresponds to
27 |         # attention on spatial resolution 8,16,32, as the
28 |         # spatial resolution of the latents is 32 for f8
29 |         - 4
30 |         - 2
31 |         - 1
32 |         num_res_blocks: 2
33 |         channel_mult:
34 |         - 1
35 |         - 2
36 |         - 4
37 |         num_head_channels: 32
38 |         use_spatial_transformer: true
39 |         transformer_depth: 1
40 |         context_dim: 512
41 |     first_stage_config:
42 |       target: ldm.models.autoencoder.VQModelInterface
43 |       params:
44 |         embed_dim: 4
45 |         n_embed: 16384
46 |         ckpt_path: configs/first_stage_models/vq-f8/model.yaml
47 |         ddconfig:
48 |           double_z: false
49 |           z_channels: 4
50 |           resolution: 256
51 |           in_channels: 3
52 |           out_ch: 3
53 |           ch: 128
54 |           ch_mult:
55 |           - 1
56 |           - 2
57 |           - 2
58 |           - 4
59 |           num_res_blocks: 2
60 |           attn_resolutions:
61 |           - 32
62 |           dropout: 0.0
63 |         lossconfig:
64 |           target: torch.nn.Identity
65 |     cond_stage_config:
66 |       target: ldm.modules.encoders.modules.ClassEmbedder
67 |       params:
68 |         embed_dim: 512
69 |         key: class_label
70 | data:
71 |   target: main.DataModuleFromConfig
72 |   params:
73 |     batch_size: 64
74 |     num_workers: 12
75 |     wrap: false
76 |     train:
77 |       target: ldm.data.imagenet.ImageNetTrain
78 |       params:
79 |         config:
80 |           size: 256
81 |     validation:
82 |       target: ldm.data.imagenet.ImageNetValidation
83 |       params:
84 |         config:
85 |           size: 256
86 | 
87 | 
88 | lightning:
89 |   callbacks:
90 |     image_logger:
91 |       target: main.ImageLogger
92 |       params:
93 |         batch_frequency: 5000
94 |         max_images: 8
95 |         increase_log_steps: False
96 | 
97 |   trainer:
98 |     benchmark: True


--------------------------------------------------------------------------------
/configs/latent-diffusion/cin256-v2.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 0.0001
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: class_label
12 |     image_size: 64
13 |     channels: 3
14 |     cond_stage_trainable: true
15 |     conditioning_key: crossattn
16 |     monitor: val/loss
17 |     use_ema: False
18 |     
19 |     unet_config:
20 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
21 |       params:
22 |         image_size: 64
23 |         in_channels: 3
24 |         out_channels: 3
25 |         model_channels: 192
26 |         attention_resolutions:
27 |         - 8
28 |         - 4
29 |         - 2
30 |         num_res_blocks: 2
31 |         channel_mult:
32 |         - 1
33 |         - 2
34 |         - 3
35 |         - 5
36 |         num_heads: 1
37 |         use_spatial_transformer: true
38 |         transformer_depth: 1
39 |         context_dim: 512
40 |     
41 |     first_stage_config:
42 |       target: ldm.models.autoencoder.VQModelInterface
43 |       params:
44 |         embed_dim: 3
45 |         n_embed: 8192
46 |         ddconfig:
47 |           double_z: false
48 |           z_channels: 3
49 |           resolution: 256
50 |           in_channels: 3
51 |           out_ch: 3
52 |           ch: 128
53 |           ch_mult:
54 |           - 1
55 |           - 2
56 |           - 4
57 |           num_res_blocks: 2
58 |           attn_resolutions: []
59 |           dropout: 0.0
60 |         lossconfig:
61 |           target: torch.nn.Identity
62 |     
63 |     cond_stage_config:
64 |       target: ldm.modules.encoders.modules.ClassEmbedder
65 |       params:
66 |         n_classes: 1001
67 |         embed_dim: 512
68 |         key: class_label
69 | 


--------------------------------------------------------------------------------
/configs/latent-diffusion/ffhq-ldm-vq-4.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     image_size: 64
12 |     channels: 3
13 |     monitor: val/loss_simple_ema
14 |     unet_config:
15 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
16 |       params:
17 |         image_size: 64
18 |         in_channels: 3
19 |         out_channels: 3
20 |         model_channels: 224
21 |         attention_resolutions:
22 |         # note: this isn't actually the resolution but
23 |         # the downsampling factor, i.e. this corresponds to
24 |         # attention on spatial resolution 8,16,32, as the
25 |         # spatial resolution of the latents is 64 for f4
26 |         - 8
27 |         - 4
28 |         - 2
29 |         num_res_blocks: 2
30 |         channel_mult:
31 |         - 1
32 |         - 2
33 |         - 3
34 |         - 4
35 |         num_head_channels: 32
36 |     first_stage_config:
37 |       target: ldm.models.autoencoder.VQModelInterface
38 |       params:
39 |         embed_dim: 3
40 |         n_embed: 8192
41 |         ckpt_path: configs/first_stage_models/vq-f4/model.yaml
42 |         ddconfig:
43 |           double_z: false
44 |           z_channels: 3
45 |           resolution: 256
46 |           in_channels: 3
47 |           out_ch: 3
48 |           ch: 128
49 |           ch_mult:
50 |           - 1
51 |           - 2
52 |           - 4
53 |           num_res_blocks: 2
54 |           attn_resolutions: []
55 |           dropout: 0.0
56 |         lossconfig:
57 |           target: torch.nn.Identity
58 |     cond_stage_config: __is_unconditional__
59 | data:
60 |   target: main.DataModuleFromConfig
61 |   params:
62 |     batch_size: 42
63 |     num_workers: 5
64 |     wrap: false
65 |     train:
66 |       target: taming.data.faceshq.FFHQTrain
67 |       params:
68 |         size: 256
69 |     validation:
70 |       target: taming.data.faceshq.FFHQValidation
71 |       params:
72 |         size: 256
73 | 
74 | 
75 | lightning:
76 |   callbacks:
77 |     image_logger:
78 |       target: main.ImageLogger
79 |       params:
80 |         batch_frequency: 5000
81 |         max_images: 8
82 |         increase_log_steps: False
83 | 
84 |   trainer:
85 |     benchmark: True


--------------------------------------------------------------------------------
/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     image_size: 64
12 |     channels: 3
13 |     monitor: val/loss_simple_ema
14 |     unet_config:
15 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
16 |       params:
17 |         image_size: 64
18 |         in_channels: 3
19 |         out_channels: 3
20 |         model_channels: 224
21 |         attention_resolutions:
22 |         # note: this isn't actually the resolution but
23 |         # the downsampling factor, i.e. this corresponds to
24 |         # attention on spatial resolution 8,16,32, as the
25 |         # spatial resolution of the latents is 64 for f4
26 |         - 8
27 |         - 4
28 |         - 2
29 |         num_res_blocks: 2
30 |         channel_mult:
31 |         - 1
32 |         - 2
33 |         - 3
34 |         - 4
35 |         num_head_channels: 32
36 |     first_stage_config:
37 |       target: ldm.models.autoencoder.VQModelInterface
38 |       params:
39 |         ckpt_path: configs/first_stage_models/vq-f4/model.yaml
40 |         embed_dim: 3
41 |         n_embed: 8192
42 |         ddconfig:
43 |           double_z: false
44 |           z_channels: 3
45 |           resolution: 256
46 |           in_channels: 3
47 |           out_ch: 3
48 |           ch: 128
49 |           ch_mult:
50 |           - 1
51 |           - 2
52 |           - 4
53 |           num_res_blocks: 2
54 |           attn_resolutions: []
55 |           dropout: 0.0
56 |         lossconfig:
57 |           target: torch.nn.Identity
58 |     cond_stage_config: __is_unconditional__
59 | data:
60 |   target: main.DataModuleFromConfig
61 |   params:
62 |     batch_size: 48
63 |     num_workers: 5
64 |     wrap: false
65 |     train:
66 |       target: ldm.data.lsun.LSUNBedroomsTrain
67 |       params:
68 |         size: 256
69 |     validation:
70 |       target: ldm.data.lsun.LSUNBedroomsValidation
71 |       params:
72 |         size: 256
73 | 
74 | 
75 | lightning:
76 |   callbacks:
77 |     image_logger:
78 |       target: main.ImageLogger
79 |       params:
80 |         batch_frequency: 5000
81 |         max_images: 8
82 |         increase_log_steps: False
83 | 
84 |   trainer:
85 |     benchmark: True


--------------------------------------------------------------------------------
/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 5.0e-5   # set to target_lr by starting main.py with '--scale_lr False'
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0155
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     loss_type: l1
11 |     first_stage_key: "image"
12 |     cond_stage_key: "image"
13 |     image_size: 32
14 |     channels: 4
15 |     cond_stage_trainable: False
16 |     concat_mode: False
17 |     scale_by_std: True
18 |     monitor: 'val/loss_simple_ema'
19 | 
20 |     scheduler_config: # 10000 warmup steps
21 |       target: ldm.lr_scheduler.LambdaLinearScheduler
22 |       params:
23 |         warm_up_steps: [10000]
24 |         cycle_lengths: [10000000000000]
25 |         f_start: [1.e-6]
26 |         f_max: [1.]
27 |         f_min: [ 1.]
28 | 
29 |     unet_config:
30 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
31 |       params:
32 |         image_size: 32
33 |         in_channels: 4
34 |         out_channels: 4
35 |         model_channels: 192
36 |         attention_resolutions: [ 1, 2, 4, 8 ]   # 32, 16, 8, 4
37 |         num_res_blocks: 2
38 |         channel_mult: [ 1,2,2,4,4 ]  # 32, 16, 8, 4, 2
39 |         num_heads: 8
40 |         use_scale_shift_norm: True
41 |         resblock_updown: True
42 | 
43 |     first_stage_config:
44 |       target: ldm.models.autoencoder.AutoencoderKL
45 |       params:
46 |         embed_dim: 4
47 |         monitor: "val/rec_loss"
48 |         ckpt_path: "models/first_stage_models/kl-f8/model.ckpt"
49 |         ddconfig:
50 |           double_z: True
51 |           z_channels: 4
52 |           resolution: 256
53 |           in_channels: 3
54 |           out_ch: 3
55 |           ch: 128
56 |           ch_mult: [ 1,2,4,4 ]  # num_down = len(ch_mult)-1
57 |           num_res_blocks: 2
58 |           attn_resolutions: [ ]
59 |           dropout: 0.0
60 |         lossconfig:
61 |           target: torch.nn.Identity
62 | 
63 |     cond_stage_config: "__is_unconditional__"
64 | 
65 | data:
66 |   target: main.DataModuleFromConfig
67 |   params:
68 |     batch_size: 96
69 |     num_workers: 5
70 |     wrap: False
71 |     train:
72 |       target: ldm.data.lsun.LSUNChurchesTrain
73 |       params:
74 |         size: 256
75 |     validation:
76 |       target: ldm.data.lsun.LSUNChurchesValidation
77 |       params:
78 |         size: 256
79 | 
80 | lightning:
81 |   callbacks:
82 |     image_logger:
83 |       target: main.ImageLogger
84 |       params:
85 |         batch_frequency: 5000
86 |         max_images: 8
87 |         increase_log_steps: False
88 | 
89 | 
90 |   trainer:
91 |     benchmark: True


--------------------------------------------------------------------------------
/configs/latent-diffusion/txt2img-1p4B-eval.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 5.0e-05
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.00085
 6 |     linear_end: 0.012
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: caption
12 |     image_size: 32
13 |     channels: 4
14 |     cond_stage_trainable: true
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     scale_factor: 0.18215
18 |     use_ema: False
19 | 
20 |     unet_config:
21 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
22 |       params:
23 |         image_size: 32
24 |         in_channels: 4
25 |         out_channels: 4
26 |         model_channels: 320
27 |         attention_resolutions:
28 |         - 4
29 |         - 2
30 |         - 1
31 |         num_res_blocks: 2
32 |         channel_mult:
33 |         - 1
34 |         - 2
35 |         - 4
36 |         - 4
37 |         num_heads: 8
38 |         use_spatial_transformer: true
39 |         transformer_depth: 1
40 |         context_dim: 1280
41 |         use_checkpoint: true
42 |         legacy: False
43 | 
44 |     first_stage_config:
45 |       target: ldm.models.autoencoder.AutoencoderKL
46 |       params:
47 |         embed_dim: 4
48 |         monitor: val/rec_loss
49 |         ddconfig:
50 |           double_z: true
51 |           z_channels: 4
52 |           resolution: 256
53 |           in_channels: 3
54 |           out_ch: 3
55 |           ch: 128
56 |           ch_mult:
57 |           - 1
58 |           - 2
59 |           - 4
60 |           - 4
61 |           num_res_blocks: 2
62 |           attn_resolutions: []
63 |           dropout: 0.0
64 |         lossconfig:
65 |           target: torch.nn.Identity
66 | 
67 |     cond_stage_config:
68 |       target: ldm.modules.encoders.modules.BERTEmbedder
69 |       params:
70 |         n_embed: 1280
71 |         n_layer: 32
72 | 


--------------------------------------------------------------------------------
/configs/retrieval-augmented-diffusion/768x768.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 0.0001
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.015
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: jpg
11 |     cond_stage_key: nix
12 |     image_size: 48
13 |     channels: 16
14 |     cond_stage_trainable: false
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     scale_by_std: false
18 |     scale_factor: 0.22765929
19 |     unet_config:
20 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
21 |       params:
22 |         image_size: 48
23 |         in_channels: 16
24 |         out_channels: 16
25 |         model_channels: 448
26 |         attention_resolutions:
27 |         - 4
28 |         - 2
29 |         - 1
30 |         num_res_blocks: 2
31 |         channel_mult:
32 |         - 1
33 |         - 2
34 |         - 3
35 |         - 4
36 |         use_scale_shift_norm: false
37 |         resblock_updown: false
38 |         num_head_channels: 32
39 |         use_spatial_transformer: true
40 |         transformer_depth: 1
41 |         context_dim: 768
42 |         use_checkpoint: true
43 |     first_stage_config:
44 |       target: ldm.models.autoencoder.AutoencoderKL
45 |       params:
46 |         monitor: val/rec_loss
47 |         embed_dim: 16
48 |         ddconfig:
49 |           double_z: true
50 |           z_channels: 16
51 |           resolution: 256
52 |           in_channels: 3
53 |           out_ch: 3
54 |           ch: 128
55 |           ch_mult:
56 |           - 1
57 |           - 1
58 |           - 2
59 |           - 2
60 |           - 4
61 |           num_res_blocks: 2
62 |           attn_resolutions:
63 |           - 16
64 |           dropout: 0.0
65 |         lossconfig:
66 |           target: torch.nn.Identity
67 |     cond_stage_config:
68 |       target: torch.nn.Identity


--------------------------------------------------------------------------------
/data/DejaVuSans.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/DejaVuSans.ttf


--------------------------------------------------------------------------------
/data/example_conditioning/superresolution/sample_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/example_conditioning/superresolution/sample_0.jpg


--------------------------------------------------------------------------------
/data/example_conditioning/text_conditional/sample_0.txt:
--------------------------------------------------------------------------------
1 | A basket of cherries
2 | 


--------------------------------------------------------------------------------
/data/imagenet_train_hr_indices.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/imagenet_train_hr_indices.p


--------------------------------------------------------------------------------
/data/imagenet_val_hr_indices.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/imagenet_val_hr_indices.p


--------------------------------------------------------------------------------
/data/inpainting_examples/6458524847_2f4c361183_k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/6458524847_2f4c361183_k.png


--------------------------------------------------------------------------------
/data/inpainting_examples/6458524847_2f4c361183_k_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/6458524847_2f4c361183_k_mask.png


--------------------------------------------------------------------------------
/data/inpainting_examples/8399166846_f6fb4e4b8e_k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/8399166846_f6fb4e4b8e_k.png


--------------------------------------------------------------------------------
/data/inpainting_examples/8399166846_f6fb4e4b8e_k_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/8399166846_f6fb4e4b8e_k_mask.png


--------------------------------------------------------------------------------
/data/inpainting_examples/alex-iby-G_Pk4D9rMLs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/alex-iby-G_Pk4D9rMLs.png


--------------------------------------------------------------------------------
/data/inpainting_examples/alex-iby-G_Pk4D9rMLs_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/alex-iby-G_Pk4D9rMLs_mask.png


--------------------------------------------------------------------------------
/data/inpainting_examples/bench2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/bench2.png


--------------------------------------------------------------------------------
/data/inpainting_examples/bench2_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/bench2_mask.png


--------------------------------------------------------------------------------
/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0.png


--------------------------------------------------------------------------------
/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/bertrand-gabioud-CpuFzIsHYJ0_mask.png


--------------------------------------------------------------------------------
/data/inpainting_examples/billow926-12-Wc-Zgx6Y.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/billow926-12-Wc-Zgx6Y.png


--------------------------------------------------------------------------------
/data/inpainting_examples/billow926-12-Wc-Zgx6Y_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/billow926-12-Wc-Zgx6Y_mask.png


--------------------------------------------------------------------------------
/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png


--------------------------------------------------------------------------------
/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png


--------------------------------------------------------------------------------
/data/inpainting_examples/photo-1583445095369-9c651e7e5d34.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/photo-1583445095369-9c651e7e5d34.png


--------------------------------------------------------------------------------
/data/inpainting_examples/photo-1583445095369-9c651e7e5d34_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/data/inpainting_examples/photo-1583445095369-9c651e7e5d34_mask.png


--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
 1 | name: ldm
 2 | channels:
 3 |   - pytorch
 4 |   - defaults
 5 | dependencies:
 6 |   - python=3.8.5
 7 |   - pip=20.3
 8 |   - cudatoolkit=11.0
 9 |   - pytorch=1.7.0
10 |   - torchvision=0.8.1
11 |   - numpy=1.19.2
12 |   - pip:
13 |     - albumentations==0.4.3
14 |     - opencv-python==4.1.2.30
15 |     - pudb==2019.2
16 |     - imageio==2.9.0
17 |     - imageio-ffmpeg==0.4.2
18 |     - pytorch-lightning==1.4.2
19 |     - omegaconf==2.1.1
20 |     - test-tube>=0.7.5
21 |     - streamlit>=0.73.1
22 |     - einops==0.3.0
23 |     - torch-fidelity==0.3.0
24 |     - transformers==4.3.1
25 |     - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
26 |     - -e git+https://github.com/openai/CLIP.git@main#egg=clip
27 |     - -e .


--------------------------------------------------------------------------------
/ldm/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/ldm/data/__init__.py


--------------------------------------------------------------------------------
/ldm/data/base.py:
--------------------------------------------------------------------------------
 1 | from abc import abstractmethod
 2 | from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
 3 | 
 4 | 
 5 | class Txt2ImgIterableBaseDataset(IterableDataset):
 6 |     '''
 7 |     Define an interface to make the IterableDatasets for text2img data chainable
 8 |     '''
 9 |     def __init__(self, num_records=0, valid_ids=None, size=256):
10 |         super().__init__()
11 |         self.num_records = num_records
12 |         self.valid_ids = valid_ids
13 |         self.sample_ids = valid_ids
14 |         self.size = size
15 | 
16 |         print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')
17 | 
18 |     def __len__(self):
19 |         return self.num_records
20 | 
21 |     @abstractmethod
22 |     def __iter__(self):
23 |         pass


--------------------------------------------------------------------------------
/ldm/data/lsun.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import PIL
 4 | from PIL import Image
 5 | from torch.utils.data import Dataset
 6 | from torchvision import transforms
 7 | 
 8 | 
 9 | class LSUNBase(Dataset):
10 |     def __init__(self,
11 |                  txt_file,
12 |                  data_root,
13 |                  size=None,
14 |                  interpolation="bicubic",
15 |                  flip_p=0.5
16 |                  ):
17 |         self.data_paths = txt_file
18 |         self.data_root = data_root
19 |         with open(self.data_paths, "r") as f:
20 |             self.image_paths = f.read().splitlines()
21 |         self._length = len(self.image_paths)
22 |         self.labels = {
23 |             "relative_file_path_": [l for l in self.image_paths],
24 |             "file_path_": [os.path.join(self.data_root, l)
25 |                            for l in self.image_paths],
26 |         }
27 | 
28 |         self.size = size
29 |         self.interpolation = {"linear": PIL.Image.LINEAR,
30 |                               "bilinear": PIL.Image.BILINEAR,
31 |                               "bicubic": PIL.Image.BICUBIC,
32 |                               "lanczos": PIL.Image.LANCZOS,
33 |                               }[interpolation]
34 |         self.flip = transforms.RandomHorizontalFlip(p=flip_p)
35 | 
36 |     def __len__(self):
37 |         return self._length
38 | 
39 |     def __getitem__(self, i):
40 |         example = dict((k, self.labels[k][i]) for k in self.labels)
41 |         image = Image.open(example["file_path_"])
42 |         if not image.mode == "RGB":
43 |             image = image.convert("RGB")
44 | 
45 |         # default to score-sde preprocessing
46 |         img = np.array(image).astype(np.uint8)
47 |         crop = min(img.shape[0], img.shape[1])
48 |         h, w, = img.shape[0], img.shape[1]
49 |         img = img[(h - crop) // 2:(h + crop) // 2,
50 |               (w - crop) // 2:(w + crop) // 2]
51 | 
52 |         image = Image.fromarray(img)
53 |         if self.size is not None:
54 |             image = image.resize((self.size, self.size), resample=self.interpolation)
55 | 
56 |         image = self.flip(image)
57 |         image = np.array(image).astype(np.uint8)
58 |         example["image"] = (image / 127.5 - 1.0).astype(np.float32)
59 |         return example
60 | 
61 | 
class LSUNChurchesTrain(LSUNBase):
    """LSUN churches dataset reading its file list from ``church_outdoor_train.txt``."""

    def __init__(self, **kwargs):
        super().__init__(
            txt_file="data/lsun/church_outdoor_train.txt",
            data_root="data/lsun/churches",
            **kwargs
        )
65 | 
66 | 
class LSUNChurchesValidation(LSUNBase):
    """
    LSUN churches dataset reading its file list from ``church_outdoor_val.txt``.

    The flip probability defaults to 0 here, so images are not flipped unless
    ``flip_p`` is passed explicitly.
    """

    def __init__(self, flip_p=0., **kwargs):
        super().__init__(
            txt_file="data/lsun/church_outdoor_val.txt",
            data_root="data/lsun/churches",
            flip_p=flip_p,
            **kwargs
        )
71 | 
72 | 
class LSUNBedroomsTrain(LSUNBase):
    """LSUN bedrooms dataset reading its file list from ``bedrooms_train.txt``."""

    def __init__(self, **kwargs):
        super().__init__(
            txt_file="data/lsun/bedrooms_train.txt",
            data_root="data/lsun/bedrooms",
            **kwargs
        )
76 | 
77 | 
class LSUNBedroomsValidation(LSUNBase):
    """
    LSUN bedrooms dataset reading its file list from ``bedrooms_val.txt``.

    The flip probability defaults to 0 here, so images are not flipped unless
    ``flip_p`` is passed explicitly.
    """

    def __init__(self, flip_p=0.0, **kwargs):
        super().__init__(
            txt_file="data/lsun/bedrooms_val.txt",
            data_root="data/lsun/bedrooms",
            flip_p=flip_p,
            **kwargs
        )
82 | 
83 | 
class LSUNCatsTrain(LSUNBase):
    """LSUN cats dataset reading its file list from ``cat_train.txt``."""

    def __init__(self, **kwargs):
        super().__init__(
            txt_file="data/lsun/cat_train.txt",
            data_root="data/lsun/cats",
            **kwargs
        )
87 | 
88 | 
class LSUNCatsValidation(LSUNBase):
    """
    LSUN cats dataset reading its file list from ``cat_val.txt``.

    The flip probability defaults to 0 here, so images are not flipped unless
    ``flip_p`` is passed explicitly.
    """

    def __init__(self, flip_p=0., **kwargs):
        super().__init__(
            txt_file="data/lsun/cat_val.txt",
            data_root="data/lsun/cats",
            flip_p=flip_p,
            **kwargs
        )
93 | 


--------------------------------------------------------------------------------
/ldm/lr_scheduler.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | class LambdaWarmUpCosineScheduler:
 5 |     """
 6 |     note: use with a base_lr of 1.0
 7 |     """
 8 |     def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
 9 |         self.lr_warm_up_steps = warm_up_steps
10 |         self.lr_start = lr_start
11 |         self.lr_min = lr_min
12 |         self.lr_max = lr_max
13 |         self.lr_max_decay_steps = max_decay_steps
14 |         self.last_lr = 0.
15 |         self.verbosity_interval = verbosity_interval
16 | 
17 |     def schedule(self, n, **kwargs):
18 |         if self.verbosity_interval > 0:
19 |             if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
20 |         if n < self.lr_warm_up_steps:
21 |             lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
22 |             self.last_lr = lr
23 |             return lr
24 |         else:
25 |             t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
26 |             t = min(t, 1.0)
27 |             lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
28 |                     1 + np.cos(t * np.pi))
29 |             self.last_lr = lr
30 |             return lr
31 | 
32 |     def __call__(self, n, **kwargs):
33 |         return self.schedule(n,**kwargs)
34 | 
35 | 
class LambdaWarmUpCosineScheduler2:
    """
    Warm-up + cosine-decay multiplier schedule that can be repeated over
    several cycles.

    supports repeated iterations, configurable via lists
    note: use with a base_lr of 1.0.

    All list arguments must have the same length, one entry per cycle:

    Args:
        warm_up_steps: number of linear warm-up steps of each cycle.
        f_min: multiplier reached at the end of each cycle.
        f_max: multiplier reached at the end of each warm-up.
        f_start: multiplier at the first step of each cycle.
        cycle_lengths: total number of steps of each cycle.
        verbosity_interval: if > 0, print a status line every that many
            (cycle-local) steps.
    """

    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
        self.lr_warm_up_steps = warm_up_steps
        self.f_start = f_start
        self.f_min = f_min
        self.f_max = f_max
        self.cycle_lengths = cycle_lengths
        # cum_cycles[i] is the global step at which cycle i starts
        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
        self.last_f = 0.
        self.verbosity_interval = verbosity_interval

    def find_in_interval(self, n):
        """
        Return the index of the cycle that global step ``n`` belongs to.

        A step equal to a cycle's end boundary still counts towards that cycle.

        Raises:
            ValueError: if ``n`` lies beyond the end of the last cycle. The
                loop used to fall through and return ``None`` in that case,
                which only surfaced later as an opaque indexing error.
        """
        for interval, cycle_end in enumerate(self.cum_cycles[1:]):
            if n <= cycle_end:
                return interval
        raise ValueError(
            f"step {n} lies beyond the end of the last cycle "
            f"(the configured cycles cover {self.cum_cycles[-1]} steps in total)"
        )

    def schedule(self, n, **kwargs):
        """Return the multiplier for global step ``n`` and remember it in ``last_f``."""
        cycle = self.find_in_interval(n)
        # from here on ``n`` counts steps relative to the start of the cycle
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0:
            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                                                       f"current cycle {cycle}")
        if n < self.lr_warm_up_steps[cycle]:
            # linear warm-up from f_start to f_max
            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
            # cosine decay from f_max to f_min over the remainder of the cycle
            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
            t = min(t, 1.0)
            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
                    1 + np.cos(t * np.pi))
            self.last_f = f
            return f

    def __call__(self, n, **kwargs):
        """Make the scheduler usable as a plain ``lr_lambda`` callable."""
        return self.schedule(n, **kwargs)
79 | 
80 | 
class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
    """
    Same cycle handling and warm-up as the parent class, but the multiplier
    after the warm-up follows a straight line instead of a cosine.
    """

    def schedule(self, n, **kwargs):
        """Return the multiplier for global step ``n`` and remember it in ``last_f``."""
        cycle = self.find_in_interval(n)
        # from here on ``n`` counts steps relative to the start of the cycle
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
            print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
                  f"current cycle {cycle}")

        warm_up = self.lr_warm_up_steps[cycle]
        if n < warm_up:
            # linear warm-up from f_start to f_max
            f = (self.f_max[cycle] - self.f_start[cycle]) / warm_up * n + self.f_start[cycle]
        else:
            # linear interpolation that arrives at f_min when n equals the cycle length
            cycle_length = self.cycle_lengths[cycle]
            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (cycle_length - n) / (cycle_length)
        self.last_f = f
        return f
98 | 
99 | 


--------------------------------------------------------------------------------
/ldm/models/diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/ldm/models/diffusion/__init__.py


--------------------------------------------------------------------------------
/ldm/models/diffusion/classifier.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import pytorch_lightning as pl
  4 | from omegaconf import OmegaConf
  5 | from torch.nn import functional as F
  6 | from torch.optim import AdamW
  7 | from torch.optim.lr_scheduler import LambdaLR
  8 | from copy import deepcopy
  9 | from einops import rearrange
 10 | from glob import glob
 11 | from natsort import natsorted
 12 | 
 13 | from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
 14 | from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
 15 | 
# Lookup table from label key to the network class that is instantiated as the
# classifier in ``NoisyLatentImageClassifier.load_classifier``. Label keys
# missing from this table are rejected with ``NotImplementedError`` in the
# classifier's constructor.
__models__ = {
    'class_label': EncoderUNetModel,
    'segmentation': UNetModel
}
 20 | 
 21 | 
 22 | def disabled_train(self, mode=True):
 23 |     """Overwrite model.train with this function to make sure train/eval mode
 24 |     does not change anymore."""
 25 |     return self
 26 | 
 27 | 
 28 | class NoisyLatentImageClassifier(pl.LightningModule):
 29 | 
    def __init__(self,
                 diffusion_path,
                 num_classes,
                 ckpt_path=None,
                 pool='attention',
                 label_key=None,
                 diffusion_ckpt_path=None,
                 scheduler_config=None,
                 weight_decay=1.e-2,
                 log_steps=10,
                 monitor='val/loss',
                 *args,
                 **kwargs):
        """
        Build a classifier on top of a frozen diffusion model.

        Args:
            diffusion_path: directory whose ``configs`` sub-folder holds the
                ``*-project.yaml`` files of the diffusion model; the last one
                in natural sort order is used.
            num_classes: used as ``out_channels`` of the classifier network.
            ckpt_path: optional classifier checkpoint to restore from.
            pool: forwarded to the classifier config when the label key is
                ``'class_label'``.
            label_key: fallback label key, only consulted when the diffusion
                model has no ``cond_stage_key`` attribute.
            diffusion_ckpt_path: written into the diffusion config as its
                ``ckpt_path`` before the diffusion model is instantiated.
            scheduler_config: stored as-is; ``use_scheduler`` is True when it
                is not ``None``.
            weight_decay: stored as-is.
            log_steps: stored, and used to derive ``log_time_interval`` as
                ``num_timesteps // log_steps``.
            monitor: stored as-is.
            *args, **kwargs: passed on to the parent constructor.

        Raises:
            IndexError: if no ``*-project.yaml`` file is found.
            AssertionError: if no label key can be determined.
            NotImplementedError: if the label key is not in ``__models__``.
        """
        super().__init__(*args, **kwargs)
        self.num_classes = num_classes
        # get latest config of diffusion model
        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
        self.diffusion_config = OmegaConf.load(diffusion_config).model
        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
        # must run before anything below touches ``self.diffusion_model``
        self.load_diffusion()

        self.monitor = monitor
        # number of times ``get_conditioning`` halves segmentation targets
        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
        self.log_steps = log_steps

        # the diffusion model's ``cond_stage_key`` takes precedence over ``label_key``
        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
            else self.diffusion_model.cond_stage_key

        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'

        if self.label_key not in __models__:
            raise NotImplementedError()

        # needs ``self.label_key`` and ``self.num_classes`` to be set
        self.load_classifier(ckpt_path, pool)

        self.scheduler_config = scheduler_config
        self.use_scheduler = self.scheduler_config is not None
        self.weight_decay = weight_decay
 69 | 
 70 |     def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
 71 |         sd = torch.load(path, map_location="cpu")
 72 |         if "state_dict" in list(sd.keys()):
 73 |             sd = sd["state_dict"]
 74 |         keys = list(sd.keys())
 75 |         for k in keys:
 76 |             for ik in ignore_keys:
 77 |                 if k.startswith(ik):
 78 |                     print("Deleting key {} from state_dict.".format(k))
 79 |                     del sd[k]
 80 |         missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
 81 |             sd, strict=False)
 82 |         print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
 83 |         if len(missing) > 0:
 84 |             print(f"Missing Keys: {missing}")
 85 |         if len(unexpected) > 0:
 86 |             print(f"Unexpected Keys: {unexpected}")
 87 | 
 88 |     def load_diffusion(self):
 89 |         model = instantiate_from_config(self.diffusion_config)
 90 |         self.diffusion_model = model.eval()
 91 |         self.diffusion_model.train = disabled_train
 92 |         for param in self.diffusion_model.parameters():
 93 |             param.requires_grad = False
 94 | 
 95 |     def load_classifier(self, ckpt_path, pool):
 96 |         model_config = deepcopy(self.diffusion_config.params.unet_config.params)
 97 |         model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
 98 |         model_config.out_channels = self.num_classes
 99 |         if self.label_key == 'class_label':
100 |             model_config.pool = pool
101 | 
102 |         self.model = __models__[self.label_key](**model_config)
103 |         if ckpt_path is not None:
104 |             print('#####################################################################')
105 |             print(f'load from ckpt "{ckpt_path}"')
106 |             print('#####################################################################')
107 |             self.init_from_ckpt(ckpt_path)
108 | 
109 |     @torch.no_grad()
110 |     def get_x_noisy(self, x, t, noise=None):
111 |         noise = default(noise, lambda: torch.randn_like(x))
112 |         continuous_sqrt_alpha_cumprod = None
113 |         if self.diffusion_model.use_continuous_noise:
114 |             continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
115 |             # todo: make sure t+1 is correct here
116 | 
117 |         return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
118 |                                              continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
119 | 
120 |     def forward(self, x_noisy, t, *args, **kwargs):
121 |         return self.model(x_noisy, t)
122 | 
123 |     @torch.no_grad()
124 |     def get_input(self, batch, k):
125 |         x = batch[k]
126 |         if len(x.shape) == 3:
127 |             x = x[..., None]
128 |         x = rearrange(x, 'b h w c -> b c h w')
129 |         x = x.to(memory_format=torch.contiguous_format).float()
130 |         return x
131 | 
132 |     @torch.no_grad()
133 |     def get_conditioning(self, batch, k=None):
134 |         if k is None:
135 |             k = self.label_key
136 |         assert k is not None, 'Needs to provide label key'
137 | 
138 |         targets = batch[k].to(self.device)
139 | 
140 |         if self.label_key == 'segmentation':
141 |             targets = rearrange(targets, 'b h w c -> b c h w')
142 |             for down in range(self.numd):
143 |                 h, w = targets.shape[-2:]
144 |                 targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
145 | 
146 |             # targets = rearrange(targets,'b c h w -> b h w c')
147 | 
148 |         return targets
149 | 
150 |     def compute_top_k(self, logits, labels, k, reduction="mean"):
151 |         _, top_ks = torch.topk(logits, k, dim=1)
152 |         if reduction == "mean":
153 |             return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
154 |         elif reduction == "none":
155 |             return (top_ks == labels[:, None]).float().sum(dim=-1)
156 | 
157 |     def on_train_epoch_start(self):
158 |         # save some memory
159 |         self.diffusion_model.model.to('cpu')
160 | 
161 |     @torch.no_grad()
162 |     def write_logs(self, loss, logits, targets):
163 |         log_prefix = 'train' if self.training else 'val'
164 |         log = {}
165 |         log[f"{log_prefix}/loss"] = loss.mean()
166 |         log[f"{log_prefix}/acc@1"] = self.compute_top_k(
167 |             logits, targets, k=1, reduction="mean"
168 |         )
169 |         log[f"{log_prefix}/acc@5"] = self.compute_top_k(
170 |             logits, targets, k=5, reduction="mean"
171 |         )
172 | 
173 |         self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
174 |         self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
175 |         self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
176 |         lr = self.optimizers().param_groups[0]['lr']
177 |         self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
178 | 
179 |     def shared_step(self, batch, t=None):
180 |         x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
181 |         targets = self.get_conditioning(batch)
182 |         if targets.dim() == 4:
183 |             targets = targets.argmax(dim=1)
184 |         if t is None:
185 |             t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
186 |         else:
187 |             t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
188 |         x_noisy = self.get_x_noisy(x, t)
189 |         logits = self(x_noisy, t)
190 | 
191 |         loss = F.cross_entropy(logits, targets, reduction='none')
192 | 
193 |         self.write_logs(loss.detach(), logits.detach(), targets.detach())
194 | 
195 |         loss = loss.mean()
196 |         return loss, logits, x_noisy, targets
197 | 
198 |     def training_step(self, batch, batch_idx):
199 |         loss, *_ = self.shared_step(batch)
200 |         return loss
201 | 
202 |     def reset_noise_accs(self):
203 |         self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
204 |                           range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
205 | 
206 |     def on_validation_start(self):
207 |         self.reset_noise_accs()
208 | 
209 |     @torch.no_grad()
210 |     def validation_step(self, batch, batch_idx):
211 |         loss, *_ = self.shared_step(batch)
212 | 
213 |         for t in self.noisy_acc:
214 |             _, logits, _, targets = self.shared_step(batch, t)
215 |             self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
216 |             self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
217 | 
218 |         return loss
219 | 
220 |     def configure_optimizers(self):
221 |         optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
222 | 
223 |         if self.use_scheduler:
224 |             scheduler = instantiate_from_config(self.scheduler_config)
225 | 
226 |             print("Setting up LambdaLR scheduler...")
227 |             scheduler = [
228 |                 {
229 |                     'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
230 |                     'interval': 'step',
231 |                     'frequency': 1
232 |                 }]
233 |             return [optimizer], scheduler
234 | 
235 |         return optimizer
236 | 
237 |     @torch.no_grad()
238 |     def log_images(self, batch, N=8, *args, **kwargs):
239 |         log = dict()
240 |         x = self.get_input(batch, self.diffusion_model.first_stage_key)
241 |         log['inputs'] = x
242 | 
243 |         y = self.get_conditioning(batch)
244 | 
245 |         if self.label_key == 'class_label':
246 |             y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
247 |             log['labels'] = y
248 | 
249 |         if ismap(y):
250 |             log['labels'] = self.diffusion_model.to_rgb(y)
251 | 
252 |             for step in range(self.log_steps):
253 |                 current_time = step * self.log_time_interval
254 | 
255 |                 _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)
256 | 
257 |                 log[f'inputs@t{current_time}'] = x_noisy
258 | 
259 |                 pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
260 |                 pred = rearrange(pred, 'b h w c -> b c h w')
261 | 
262 |                 log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
263 | 
264 |         for key in log:
265 |             log[key] = log[key][:N]
266 | 
267 |         return log
268 | 


--------------------------------------------------------------------------------
/ldm/models/diffusion/ddim.py:
--------------------------------------------------------------------------------
  1 | """SAMPLING ONLY."""
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | from tqdm import tqdm
  6 | from functools import partial
  7 | 
  8 | from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
  9 | 
 10 | 
 11 | class DDIMSampler(object):
 12 |     def __init__(self, model, schedule="linear", **kwargs):
 13 |         super().__init__()
 14 |         self.model = model
 15 |         self.ddpm_num_timesteps = model.num_timesteps
 16 |         self.schedule = schedule
 17 | 
 18 |     def register_buffer(self, name, attr):
 19 |         if type(attr) == torch.Tensor:
 20 |             if attr.device != torch.device("cuda"):
 21 |                 attr = attr.to(torch.device("cuda"))
 22 |         setattr(self, name, attr)
 23 | 
 24 |     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
 25 |         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
 26 |                                                   num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
 27 |         alphas_cumprod = self.model.alphas_cumprod
 28 |         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
 29 |         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
 30 | 
 31 |         self.register_buffer('betas', to_torch(self.model.betas))
 32 |         self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
 33 |         self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
 34 | 
 35 |         # calculations for diffusion q(x_t | x_{t-1}) and others
 36 |         self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
 37 |         self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
 38 |         self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
 39 |         self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
 40 |         self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
 41 | 
 42 |         # ddim sampling parameters
 43 |         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
 44 |                                                                                    ddim_timesteps=self.ddim_timesteps,
 45 |                                                                                    eta=ddim_eta,verbose=verbose)
 46 |         self.register_buffer('ddim_sigmas', ddim_sigmas)
 47 |         self.register_buffer('ddim_alphas', ddim_alphas)
 48 |         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
 49 |         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
 50 |         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
 51 |             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
 52 |                         1 - self.alphas_cumprod / self.alphas_cumprod_prev))
 53 |         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
 54 | 
 55 |     @torch.no_grad()
 56 |     def sample(self,
 57 |                S,
 58 |                batch_size,
 59 |                shape,
 60 |                conditioning=None,
 61 |                callback=None,
 62 |                normals_sequence=None,
 63 |                img_callback=None,
 64 |                quantize_x0=False,
 65 |                eta=0.,
 66 |                mask=None,
 67 |                x0=None,
 68 |                temperature=1.,
 69 |                noise_dropout=0.,
 70 |                score_corrector=None,
 71 |                corrector_kwargs=None,
 72 |                verbose=True,
 73 |                x_T=None,
 74 |                log_every_t=100,
 75 |                unconditional_guidance_scale=1.,
 76 |                unconditional_conditioning=None,
 77 |                # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
 78 |                **kwargs
 79 |                ):
 80 |         if conditioning is not None:
 81 |             if isinstance(conditioning, dict):
 82 |                 cbs = conditioning[list(conditioning.keys())[0]].shape[0]
 83 |                 if cbs != batch_size:
 84 |                     print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
 85 |             else:
 86 |                 if conditioning.shape[0] != batch_size:
 87 |                     print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
 88 | 
 89 |         self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
 90 |         # sampling
 91 |         C, H, W = shape
 92 |         size = (batch_size, C, H, W)
 93 |         print(f'Data shape for DDIM sampling is {size}, eta {eta}')
 94 | 
 95 |         samples, intermediates = self.ddim_sampling(conditioning, size,
 96 |                                                     callback=callback,
 97 |                                                     img_callback=img_callback,
 98 |                                                     quantize_denoised=quantize_x0,
 99 |                                                     mask=mask, x0=x0,
100 |                                                     ddim_use_original_steps=False,
101 |                                                     noise_dropout=noise_dropout,
102 |                                                     temperature=temperature,
103 |                                                     score_corrector=score_corrector,
104 |                                                     corrector_kwargs=corrector_kwargs,
105 |                                                     x_T=x_T,
106 |                                                     log_every_t=log_every_t,
107 |                                                     unconditional_guidance_scale=unconditional_guidance_scale,
108 |                                                     unconditional_conditioning=unconditional_conditioning,
109 |                                                     )
110 |         return samples, intermediates
111 | 
112 |     @torch.no_grad()
113 |     def ddim_sampling(self, cond, shape,
114 |                       x_T=None, ddim_use_original_steps=False,
115 |                       callback=None, timesteps=None, quantize_denoised=False,
116 |                       mask=None, x0=None, img_callback=None, log_every_t=100,
117 |                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
118 |                       unconditional_guidance_scale=1., unconditional_conditioning=None,):
119 |         device = self.model.betas.device
120 |         b = shape[0]
121 |         if x_T is None:
122 |             img = torch.randn(shape, device=device)
123 |         else:
124 |             img = x_T
125 | 
126 |         if timesteps is None:
127 |             timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
128 |         elif timesteps is not None and not ddim_use_original_steps:
129 |             subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
130 |             timesteps = self.ddim_timesteps[:subset_end]
131 | 
132 |         intermediates = {'x_inter': [img], 'pred_x0': [img]}
133 |         time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
134 |         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
135 |         print(f"Running DDIM Sampling with {total_steps} timesteps")
136 | 
137 |         iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
138 | 
139 |         for i, step in enumerate(iterator):
140 |             index = total_steps - i - 1
141 |             ts = torch.full((b,), step, device=device, dtype=torch.long)
142 | 
143 |             if mask is not None:
144 |                 assert x0 is not None
145 |                 img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
146 |                 img = img_orig * mask + (1. - mask) * img
147 | 
148 |             outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
149 |                                       quantize_denoised=quantize_denoised, temperature=temperature,
150 |                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
151 |                                       corrector_kwargs=corrector_kwargs,
152 |                                       unconditional_guidance_scale=unconditional_guidance_scale,
153 |                                       unconditional_conditioning=unconditional_conditioning)
154 |             img, pred_x0 = outs
155 |             if callback: callback(i)
156 |             if img_callback: img_callback(pred_x0, i)
157 | 
158 |             if index % log_every_t == 0 or index == total_steps - 1:
159 |                 intermediates['x_inter'].append(img)
160 |                 intermediates['pred_x0'].append(pred_x0)
161 | 
162 |         return img, intermediates
163 | 
164 |     @torch.no_grad()
165 |     def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
166 |                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
167 |                       unconditional_guidance_scale=1., unconditional_conditioning=None):
168 |         b, *_, device = *x.shape, x.device
169 | 
170 |         if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
171 |             e_t = self.model.apply_model(x, t, c)
172 |         else:
173 |             x_in = torch.cat([x] * 2)
174 |             t_in = torch.cat([t] * 2)
175 |             c_in = torch.cat([unconditional_conditioning, c])
176 |             e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
177 |             e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
178 | 
179 |         if score_corrector is not None:
180 |             assert self.model.parameterization == "eps"
181 |             e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
182 | 
183 |         alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
184 |         alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
185 |         sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
186 |         sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
187 |         # select parameters corresponding to the currently considered timestep
188 |         a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
189 |         a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
190 |         sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
191 |         sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
192 | 
193 |         # current prediction for x_0
194 |         pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
195 |         if quantize_denoised:
196 |             pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
197 |         # direction pointing to x_t
198 |         dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
199 |         noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
200 |         if noise_dropout > 0.:
201 |             noise = torch.nn.functional.dropout(noise, p=noise_dropout)
202 |         x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
203 |         return x_prev, pred_x0
204 | 


--------------------------------------------------------------------------------
/ldm/models/diffusion/plms.py:
--------------------------------------------------------------------------------
  1 | """SAMPLING ONLY."""
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | from tqdm import tqdm
  6 | from functools import partial
  7 | 
  8 | from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
  9 | 
 10 | 
 11 | class PLMSSampler(object):
 12 |     def __init__(self, model, schedule="linear", **kwargs):
 13 |         super().__init__()
 14 |         self.model = model
 15 |         self.ddpm_num_timesteps = model.num_timesteps
 16 |         self.schedule = schedule
 17 | 
 18 |     def register_buffer(self, name, attr):
 19 |         if type(attr) == torch.Tensor:
 20 |             if attr.device != torch.device("cuda"):
 21 |                 attr = attr.to(torch.device("cuda"))
 22 |         setattr(self, name, attr)
 23 | 
 24 |     def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
 25 |         if ddim_eta != 0:
 26 |             raise ValueError('ddim_eta must be 0 for PLMS')
 27 |         self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
 28 |                                                   num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
 29 |         alphas_cumprod = self.model.alphas_cumprod
 30 |         assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
 31 |         to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
 32 | 
 33 |         self.register_buffer('betas', to_torch(self.model.betas))
 34 |         self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
 35 |         self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
 36 | 
 37 |         # calculations for diffusion q(x_t | x_{t-1}) and others
 38 |         self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
 39 |         self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
 40 |         self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
 41 |         self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
 42 |         self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
 43 | 
 44 |         # ddim sampling parameters
 45 |         ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
 46 |                                                                                    ddim_timesteps=self.ddim_timesteps,
 47 |                                                                                    eta=ddim_eta,verbose=verbose)
 48 |         self.register_buffer('ddim_sigmas', ddim_sigmas)
 49 |         self.register_buffer('ddim_alphas', ddim_alphas)
 50 |         self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
 51 |         self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
 52 |         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
 53 |             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
 54 |                         1 - self.alphas_cumprod / self.alphas_cumprod_prev))
 55 |         self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
 56 | 
 57 |     @torch.no_grad()
 58 |     def sample(self,
 59 |                S,
 60 |                batch_size,
 61 |                shape,
 62 |                conditioning=None,
 63 |                callback=None,
 64 |                normals_sequence=None,
 65 |                img_callback=None,
 66 |                quantize_x0=False,
 67 |                eta=0.,
 68 |                mask=None,
 69 |                x0=None,
 70 |                temperature=1.,
 71 |                noise_dropout=0.,
 72 |                score_corrector=None,
 73 |                corrector_kwargs=None,
 74 |                verbose=True,
 75 |                x_T=None,
 76 |                log_every_t=100,
 77 |                unconditional_guidance_scale=1.,
 78 |                unconditional_conditioning=None,
 79 |                # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
 80 |                **kwargs
 81 |                ):
 82 |         if conditioning is not None:
 83 |             if isinstance(conditioning, dict):
 84 |                 cbs = conditioning[list(conditioning.keys())[0]].shape[0]
 85 |                 if cbs != batch_size:
 86 |                     print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
 87 |             else:
 88 |                 if conditioning.shape[0] != batch_size:
 89 |                     print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
 90 | 
 91 |         self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
 92 |         # sampling
 93 |         C, H, W = shape
 94 |         size = (batch_size, C, H, W)
 95 |         print(f'Data shape for PLMS sampling is {size}')
 96 | 
 97 |         samples, intermediates = self.plms_sampling(conditioning, size,
 98 |                                                     callback=callback,
 99 |                                                     img_callback=img_callback,
100 |                                                     quantize_denoised=quantize_x0,
101 |                                                     mask=mask, x0=x0,
102 |                                                     ddim_use_original_steps=False,
103 |                                                     noise_dropout=noise_dropout,
104 |                                                     temperature=temperature,
105 |                                                     score_corrector=score_corrector,
106 |                                                     corrector_kwargs=corrector_kwargs,
107 |                                                     x_T=x_T,
108 |                                                     log_every_t=log_every_t,
109 |                                                     unconditional_guidance_scale=unconditional_guidance_scale,
110 |                                                     unconditional_conditioning=unconditional_conditioning,
111 |                                                     )
112 |         return samples, intermediates
113 | 
114 |     @torch.no_grad()
115 |     def plms_sampling(self, cond, shape,
116 |                       x_T=None, ddim_use_original_steps=False,
117 |                       callback=None, timesteps=None, quantize_denoised=False,
118 |                       mask=None, x0=None, img_callback=None, log_every_t=100,
119 |                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
120 |                       unconditional_guidance_scale=1., unconditional_conditioning=None,):
121 |         device = self.model.betas.device
122 |         b = shape[0]
123 |         if x_T is None:
124 |             img = torch.randn(shape, device=device)
125 |         else:
126 |             img = x_T
127 | 
128 |         if timesteps is None:
129 |             timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
130 |         elif timesteps is not None and not ddim_use_original_steps:
131 |             subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
132 |             timesteps = self.ddim_timesteps[:subset_end]
133 | 
134 |         intermediates = {'x_inter': [img], 'pred_x0': [img]}
135 |         time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
136 |         total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
137 |         print(f"Running PLMS Sampling with {total_steps} timesteps")
138 | 
139 |         iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
140 |         old_eps = []
141 | 
142 |         for i, step in enumerate(iterator):
143 |             index = total_steps - i - 1
144 |             ts = torch.full((b,), step, device=device, dtype=torch.long)
145 |             ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
146 | 
147 |             if mask is not None:
148 |                 assert x0 is not None
149 |                 img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
150 |                 img = img_orig * mask + (1. - mask) * img
151 | 
152 |             outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
153 |                                       quantize_denoised=quantize_denoised, temperature=temperature,
154 |                                       noise_dropout=noise_dropout, score_corrector=score_corrector,
155 |                                       corrector_kwargs=corrector_kwargs,
156 |                                       unconditional_guidance_scale=unconditional_guidance_scale,
157 |                                       unconditional_conditioning=unconditional_conditioning,
158 |                                       old_eps=old_eps, t_next=ts_next)
159 |             img, pred_x0, e_t = outs
160 |             old_eps.append(e_t)
161 |             if len(old_eps) >= 4:
162 |                 old_eps.pop(0)
163 |             if callback: callback(i)
164 |             if img_callback: img_callback(pred_x0, i)
165 | 
166 |             if index % log_every_t == 0 or index == total_steps - 1:
167 |                 intermediates['x_inter'].append(img)
168 |                 intermediates['pred_x0'].append(pred_x0)
169 | 
170 |         return img, intermediates
171 | 
172 |     @torch.no_grad()
173 |     def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
174 |                       temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
175 |                       unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
176 |         b, *_, device = *x.shape, x.device
177 | 
178 |         def get_model_output(x, t):
179 |             if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
180 |                 e_t = self.model.apply_model(x, t, c)
181 |             else:
182 |                 x_in = torch.cat([x] * 2)
183 |                 t_in = torch.cat([t] * 2)
184 |                 c_in = torch.cat([unconditional_conditioning, c])
185 |                 e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
186 |                 e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
187 | 
188 |             if score_corrector is not None:
189 |                 assert self.model.parameterization == "eps"
190 |                 e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
191 | 
192 |             return e_t
193 | 
194 |         alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
195 |         alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
196 |         sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
197 |         sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
198 | 
199 |         def get_x_prev_and_pred_x0(e_t, index):
200 |             # select parameters corresponding to the currently considered timestep
201 |             a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
202 |             a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
203 |             sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
204 |             sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
205 | 
206 |             # current prediction for x_0
207 |             pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
208 |             if quantize_denoised:
209 |                 pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
210 |             # direction pointing to x_t
211 |             dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
212 |             noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
213 |             if noise_dropout > 0.:
214 |                 noise = torch.nn.functional.dropout(noise, p=noise_dropout)
215 |             x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
216 |             return x_prev, pred_x0
217 | 
218 |         e_t = get_model_output(x, t)
219 |         if len(old_eps) == 0:
220 |             # Pseudo Improved Euler (2nd order)
221 |             x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
222 |             e_t_next = get_model_output(x_prev, t_next)
223 |             e_t_prime = (e_t + e_t_next) / 2
224 |         elif len(old_eps) == 1:
225 |             # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
226 |             e_t_prime = (3 * e_t - old_eps[-1]) / 2
227 |         elif len(old_eps) == 2:
228 |             # 3nd order Pseudo Linear Multistep (Adams-Bashforth)
229 |             e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
230 |         elif len(old_eps) >= 3:
231 |             # 4nd order Pseudo Linear Multistep (Adams-Bashforth)
232 |             e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
233 | 
234 |         x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
235 | 
236 |         return x_prev, pred_x0, e_t
237 | 


--------------------------------------------------------------------------------
/ldm/modules/attention.py:
--------------------------------------------------------------------------------
  1 | from inspect import isfunction
  2 | import math
  3 | import torch
  4 | import torch.nn.functional as F
  5 | from torch import nn, einsum
  6 | from einops import rearrange, repeat
  7 | 
  8 | from ldm.modules.diffusionmodules.util import checkpoint
  9 | 
 10 | 
 11 | def exists(val):
 12 |     return val is not None
 13 | 
 14 | 
 15 | def uniq(arr):
 16 |     return{el: True for el in arr}.keys()
 17 | 
 18 | 
 19 | def default(val, d):
 20 |     if exists(val):
 21 |         return val
 22 |     return d() if isfunction(d) else d
 23 | 
 24 | 
 25 | def max_neg_value(t):
 26 |     return -torch.finfo(t.dtype).max
 27 | 
 28 | 
 29 | def init_(tensor):
 30 |     dim = tensor.shape[-1]
 31 |     std = 1 / math.sqrt(dim)
 32 |     tensor.uniform_(-std, std)
 33 |     return tensor
 34 | 
 35 | 
 36 | # feedforward
 37 | class GEGLU(nn.Module):
 38 |     def __init__(self, dim_in, dim_out):
 39 |         super().__init__()
 40 |         self.proj = nn.Linear(dim_in, dim_out * 2)
 41 | 
 42 |     def forward(self, x):
 43 |         x, gate = self.proj(x).chunk(2, dim=-1)
 44 |         return x * F.gelu(gate)
 45 | 
 46 | 
 47 | class FeedForward(nn.Module):
 48 |     def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
 49 |         super().__init__()
 50 |         inner_dim = int(dim * mult)
 51 |         dim_out = default(dim_out, dim)
 52 |         project_in = nn.Sequential(
 53 |             nn.Linear(dim, inner_dim),
 54 |             nn.GELU()
 55 |         ) if not glu else GEGLU(dim, inner_dim)
 56 | 
 57 |         self.net = nn.Sequential(
 58 |             project_in,
 59 |             nn.Dropout(dropout),
 60 |             nn.Linear(inner_dim, dim_out)
 61 |         )
 62 | 
 63 |     def forward(self, x):
 64 |         return self.net(x)
 65 | 
 66 | 
 67 | def zero_module(module):
 68 |     """
 69 |     Zero out the parameters of a module and return it.
 70 |     """
 71 |     for p in module.parameters():
 72 |         p.detach().zero_()
 73 |     return module
 74 | 
 75 | 
 76 | def Normalize(in_channels):
 77 |     return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
 78 | 
 79 | 
 80 | class LinearAttention(nn.Module):
 81 |     def __init__(self, dim, heads=4, dim_head=32):
 82 |         super().__init__()
 83 |         self.heads = heads
 84 |         hidden_dim = dim_head * heads
 85 |         self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
 86 |         self.to_out = nn.Conv2d(hidden_dim, dim, 1)
 87 | 
 88 |     def forward(self, x):
 89 |         b, c, h, w = x.shape
 90 |         qkv = self.to_qkv(x)
 91 |         q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
 92 |         k = k.softmax(dim=-1)  
 93 |         context = torch.einsum('bhdn,bhen->bhde', k, v)
 94 |         out = torch.einsum('bhde,bhdn->bhen', context, q)
 95 |         out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
 96 |         return self.to_out(out)
 97 | 
 98 | 
 99 | class SpatialSelfAttention(nn.Module):
100 |     def __init__(self, in_channels):
101 |         super().__init__()
102 |         self.in_channels = in_channels
103 | 
104 |         self.norm = Normalize(in_channels)
105 |         self.q = torch.nn.Conv2d(in_channels,
106 |                                  in_channels,
107 |                                  kernel_size=1,
108 |                                  stride=1,
109 |                                  padding=0)
110 |         self.k = torch.nn.Conv2d(in_channels,
111 |                                  in_channels,
112 |                                  kernel_size=1,
113 |                                  stride=1,
114 |                                  padding=0)
115 |         self.v = torch.nn.Conv2d(in_channels,
116 |                                  in_channels,
117 |                                  kernel_size=1,
118 |                                  stride=1,
119 |                                  padding=0)
120 |         self.proj_out = torch.nn.Conv2d(in_channels,
121 |                                         in_channels,
122 |                                         kernel_size=1,
123 |                                         stride=1,
124 |                                         padding=0)
125 | 
126 |     def forward(self, x):
127 |         h_ = x
128 |         h_ = self.norm(h_)
129 |         q = self.q(h_)
130 |         k = self.k(h_)
131 |         v = self.v(h_)
132 | 
133 |         # compute attention
134 |         b,c,h,w = q.shape
135 |         q = rearrange(q, 'b c h w -> b (h w) c')
136 |         k = rearrange(k, 'b c h w -> b c (h w)')
137 |         w_ = torch.einsum('bij,bjk->bik', q, k)
138 | 
139 |         w_ = w_ * (int(c)**(-0.5))
140 |         w_ = torch.nn.functional.softmax(w_, dim=2)
141 | 
142 |         # attend to values
143 |         v = rearrange(v, 'b c h w -> b c (h w)')
144 |         w_ = rearrange(w_, 'b i j -> b j i')
145 |         h_ = torch.einsum('bij,bjk->bik', v, w_)
146 |         h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
147 |         h_ = self.proj_out(h_)
148 | 
149 |         return x+h_
150 | 
151 | 
class CrossAttention(nn.Module):
    """
    Multi-head scaled dot-product attention. Queries come from `x`; keys and
    values come from `context`, or from `x` itself when no context is given.

    :param query_dim: feature size of the query input (and of the output).
    :param context_dim: feature size of the context; falls back to `query_dim`.
    :param heads: number of attention heads.
    :param dim_head: feature size per head.
    :param dropout: dropout probability applied after the output projection.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.scale = dim_head ** -0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))

    def forward(self, x, context=None, mask=None):
        n_heads = self.heads

        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        # fold the head axis into the batch axis
        q, k, v = (rearrange(t, 'b n (h d) -> (b h) n d', h=n_heads) for t in (q, k, v))

        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale

        if exists(mask):
            # positions where the mask is False get the most negative finite score
            mask = rearrange(mask, 'b ... -> b (...)')
            fill_value = -torch.finfo(sim.dtype).max
            mask = repeat(mask, 'b j -> (b h) () j', h=n_heads)
            sim.masked_fill_(~mask, fill_value)

        attn = sim.softmax(dim=-1)

        out = einsum('b i j, b j d -> b i d', attn, v)
        out = rearrange(out, '(b h) n d -> b n (h d)', h=n_heads)
        return self.to_out(out)
194 | 
195 | 
class BasicTransformerBlock(nn.Module):
    """
    Pre-norm transformer block: self-attention, then attention over
    `context`, then a feed-forward layer, each wrapped in a residual
    connection. The forward pass is routed through `checkpoint`, which is
    switched on or off by the `checkpoint` constructor flag.
    """

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True):
        super().__init__()
        # attn1 gets no context_dim, so it always attends over its own input
        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        # attn2 attends over `context`; it is self-attention if context is None
        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
                                    heads=n_heads, dim_head=d_head, dropout=dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
        inputs = (x, context)
        return checkpoint(self._forward, inputs, self.parameters(), self.checkpoint)

    def _forward(self, x, context=None):
        x = x + self.attn1(self.norm1(x)) if False else self.attn1(self.norm1(x)) + x
        x = self.attn2(self.norm2(x), context=context) + x
        x = self.ff(self.norm3(x)) + x
        return x
216 | 
217 | 
class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data.
    The input is normalized, projected with a 1x1 convolution and flattened
    from (b, c, h, w) to (b, h*w, c). The transformer blocks run on that
    sequence, which is then reshaped back to an image, projected to the input
    channel count and added to the original input.
    """
    def __init__(self, in_channels, n_heads, d_head,
                 depth=1, dropout=0., context_dim=None):
        super().__init__()
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)

        self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        blocks = [
            BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
            for _ in range(depth)
        ]
        self.transformer_blocks = nn.ModuleList(blocks)

        # the output projection has its parameters zeroed by zero_module
        out_conv = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = zero_module(out_conv)

    def forward(self, x, context=None):
        # note: if no context is given, cross-attention defaults to self-attention
        b, c, h, w = x.shape
        residual = x
        hidden = self.proj_in(self.norm(x))
        hidden = rearrange(hidden, 'b c h w -> b (h w) c')
        for block in self.transformer_blocks:
            hidden = block(hidden, context=context)
        hidden = rearrange(hidden, 'b (h w) c -> b c h w', h=h, w=w)
        hidden = self.proj_out(hidden)
        return hidden + residual


--------------------------------------------------------------------------------
/ldm/modules/diffusionmodules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/ldm/modules/diffusionmodules/__init__.py


--------------------------------------------------------------------------------
/ldm/modules/diffusionmodules/util.py:
--------------------------------------------------------------------------------
  1 | # adopted from
  2 | # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
  3 | # and
  4 | # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
  5 | # and
  6 | # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
  7 | #
  8 | # thanks!
  9 | 
 10 | 
 11 | import os
 12 | import math
 13 | import torch
 14 | import torch.nn as nn
 15 | import numpy as np
 16 | from einops import repeat
 17 | 
 18 | from ldm.util import instantiate_from_config
 19 | 
 20 | 
 21 | def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
 22 |     if schedule == "linear":
 23 |         betas = (
 24 |                 torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
 25 |         )
 26 | 
 27 |     elif schedule == "cosine":
 28 |         timesteps = (
 29 |                 torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
 30 |         )
 31 |         alphas = timesteps / (1 + cosine_s) * np.pi / 2
 32 |         alphas = torch.cos(alphas).pow(2)
 33 |         alphas = alphas / alphas[0]
 34 |         betas = 1 - alphas[1:] / alphas[:-1]
 35 |         betas = np.clip(betas, a_min=0, a_max=0.999)
 36 | 
 37 |     elif schedule == "sqrt_linear":
 38 |         betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
 39 |     elif schedule == "sqrt":
 40 |         betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
 41 |     else:
 42 |         raise ValueError(f"schedule '{schedule}' unknown.")
 43 |     return betas.numpy()
 44 | 
 45 | 
 46 | def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
 47 |     if ddim_discr_method == 'uniform':
 48 |         c = num_ddpm_timesteps // num_ddim_timesteps
 49 |         ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
 50 |     elif ddim_discr_method == 'quad':
 51 |         ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
 52 |     else:
 53 |         raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
 54 | 
 55 |     # assert ddim_timesteps.shape[0] == num_ddim_timesteps
 56 |     # add one to get the final alpha values right (the ones from first scale to data during sampling)
 57 |     steps_out = ddim_timesteps + 1
 58 |     if verbose:
 59 |         print(f'Selected timesteps for ddim sampler: {steps_out}')
 60 |     return steps_out
 61 | 
 62 | 
 63 | def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
 64 |     # select alphas for computing the variance schedule
 65 |     alphas = alphacums[ddim_timesteps]
 66 |     alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
 67 | 
 68 |     # according the the formula provided in https://arxiv.org/abs/2010.02502
 69 |     sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
 70 |     if verbose:
 71 |         print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
 72 |         print(f'For the chosen value of eta, which is {eta}, '
 73 |               f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
 74 |     return sigmas, alphas, alphas_prev
 75 | 
 76 | 
 77 | def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
 78 |     """
 79 |     Create a beta schedule that discretizes the given alpha_t_bar function,
 80 |     which defines the cumulative product of (1-beta) over time from t = [0,1].
 81 |     :param num_diffusion_timesteps: the number of betas to produce.
 82 |     :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
 83 |                       produces the cumulative product of (1-beta) up to that
 84 |                       part of the diffusion process.
 85 |     :param max_beta: the maximum beta to use; use values lower than 1 to
 86 |                      prevent singularities.
 87 |     """
 88 |     betas = []
 89 |     for i in range(num_diffusion_timesteps):
 90 |         t1 = i / num_diffusion_timesteps
 91 |         t2 = (i + 1) / num_diffusion_timesteps
 92 |         betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
 93 |     return np.array(betas)
 94 | 
 95 | 
 96 | def extract_into_tensor(a, t, x_shape):
 97 |     b, *_ = t.shape
 98 |     out = a.gather(-1, t)
 99 |     return out.reshape(b, *((1,) * (len(x_shape) - 1)))
100 | 
101 | 
def checkpoint(func, inputs, params, flag):
    """
    Run `func(*inputs)`, optionally through CheckpointFunction so that
    intermediate activations are not cached (less memory, extra compute in
    the backward pass).
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing and call `func`
                 directly.
    """
    if not flag:
        return func(*inputs)
    # inputs come first; their count tells CheckpointFunction where params start
    packed = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *packed)
117 | 
118 | 
class CheckpointFunction(torch.autograd.Function):
    """
    Custom autograd function that runs `run_function` without recording a
    graph in the forward pass and re-runs it with gradients enabled in the
    backward pass to obtain the input and parameter gradients.
    """

    @staticmethod
    def forward(ctx, run_function, length, *args):
        """
        :param ctx: autograd context; used here to stash the function and its arguments.
        :param run_function: the callable to evaluate.
        :param length: how many leading entries of `args` are inputs to
                       `run_function`; the rest are kept as parameters.
        :param args: the inputs followed by the parameters.
        :return: whatever `run_function` returns for the inputs.
        """
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        # no graph is built here; backward() recomputes the function instead
        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        """
        Re-evaluate `run_function` with gradients enabled and return the
        gradients for the stored inputs and parameters.
        """
        # fresh detached copies of the inputs that track gradients
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        # allow_unused=True: entries that did not contribute to the output get None
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        # drop the stashed references
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        # the two leading None entries line up with `run_function` and `length`
        return (None, None) + input_grads
149 | 
150 | 
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: if True, skip the sinusoids and just repeat each
                        timestep value `dim` times.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if repeat_only:
        return repeat(timesteps, 'b -> b d', d=dim)

    half = dim // 2
    exponents = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
    freqs = torch.exp(exponents).to(device=timesteps.device)
    angles = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
    if dim % 2:
        # odd dim: pad with one zero column to reach the requested width
        padding = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, padding], dim=-1)
    return embedding
172 | 
173 | 
def zero_module(module):
    """
    Overwrite all parameters of `module` with zeros (in place) and return
    the same module.
    """
    for weight in module.parameters():
        weight.detach().zero_()
    return module
181 | 
182 | 
def scale_module(module, scale):
    """
    Multiply all parameters of `module` by `scale` (in place) and return
    the same module.
    """
    for weight in module.parameters():
        detached = weight.detach()
        detached.mul_(scale)
    return module
190 | 
191 | 
def mean_flat(tensor):
    """
    Average over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
197 | 
198 | 
def normalization(channels):
    """
    Make a standard normalization layer: a GroupNorm32 with 32 groups.
    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    layer = GroupNorm32(32, channels)
    return layer
206 | 
207 | 
208 | # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
class SiLU(nn.Module):
    """SiLU activation: the input multiplied by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
212 | 
213 | 
class GroupNorm32(nn.GroupNorm):
    """GroupNorm that normalizes in float32 and casts the result back to the input dtype."""

    def forward(self, x):
        normed = super().forward(x.float())
        return normed.type(x.dtype)
217 | 
def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module; the remaining arguments are
    forwarded to the chosen nn.ConvNd constructor.
    """
    candidates = ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d))
    for rank, conv_cls in candidates:
        if dims == rank:
            return conv_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
229 | 
230 | 
def linear(*args, **kwargs):
    """
    Create a linear module; all arguments are forwarded to nn.Linear.
    """
    layer = nn.Linear(*args, **kwargs)
    return layer
236 | 
237 | 
def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module; the remaining arguments
    are forwarded to the chosen nn.AvgPoolNd constructor.
    """
    candidates = ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d))
    for rank, pool_cls in candidates:
        if dims == rank:
            return pool_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
249 | 
250 | 
class HybridConditioner(nn.Module):
    """
    Holds two conditioners built from configs, one for concatenation
    conditioning and one for cross-attention conditioning, and returns
    their outputs in a dict.
    """

    def __init__(self, c_concat_config, c_crossattn_config):
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)

    def forward(self, c_concat, c_crossattn):
        concat_out = self.concat_conditioner(c_concat)
        crossattn_out = self.crossattn_conditioner(c_crossattn)
        # each output is wrapped in a single-element list
        return {'c_concat': [concat_out], 'c_crossattn': [crossattn_out]}
262 | 
263 | 
def noise_like(shape, device, repeat=False):
    """
    Draw standard-normal noise of the given shape on `device`.
    With `repeat=True` a single sample of shape (1, *shape[1:]) is drawn and
    tiled along the first axis, so every batch entry shares the same noise.
    """
    if repeat:
        single = torch.randn((1, *shape[1:]), device=device)
        tiling = (shape[0],) + (1,) * (len(shape) - 1)
        return single.repeat(*tiling)
    return torch.randn(shape, device=device)


--------------------------------------------------------------------------------
/ldm/modules/distributions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/ldm/modules/distributions/__init__.py


--------------------------------------------------------------------------------
/ldm/modules/distributions/distributions.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | 
 4 | 
 5 | class AbstractDistribution:
 6 |     def sample(self):
 7 |         raise NotImplementedError()
 8 | 
 9 |     def mode(self):
10 |         raise NotImplementedError()
11 | 
12 | 
class DiracDistribution(AbstractDistribution):
    """Distribution concentrated on a single stored value."""

    def __init__(self, value):
        self.value = value

    def sample(self):
        """Return the stored value."""
        return self.value

    def mode(self):
        """Return the stored value."""
        return self.value
22 | 
23 | 
class DiagonalGaussianDistribution(object):
    """
    Gaussian with diagonal covariance, parameterized by a tensor whose
    dimension 1 holds the mean in its first half and the log-variance in its
    second half.

    :param parameters: tensor that is split in two along dim 1 into
                       (mean, logvar).
    :param deterministic: if True, variance and std are forced to zero, and
                          `kl`/`nll` return a single zero.
    """

    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        # keep exp(logvar) within a numerically safe range
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def _zero(self):
        # One-element zero on the same device and dtype as the mean, so the
        # result can be combined with other loss terms without a device
        # mismatch (a bare torch.Tensor([0.]) always lives on the CPU).
        return self.mean.new_zeros(1)

    def sample(self):
        """Return mean + std * eps with eps drawn from a standard normal."""
        # noise is drawn with the default generator and then moved to the
        # parameters' device
        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
        return x

    def kl(self, other=None):
        """
        KL divergence to `other`, or to a standard normal when `other` is
        None, summed over dims 1, 2 and 3. Returns a single zero when the
        distribution is deterministic.
        """
        if self.deterministic:
            return self._zero()
        if other is None:
            return 0.5 * torch.sum(torch.pow(self.mean, 2)
                                   + self.var - 1.0 - self.logvar,
                                   dim=[1, 2, 3])
        return 0.5 * torch.sum(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var - 1.0 - self.logvar + other.logvar,
            dim=[1, 2, 3])

    def nll(self, sample, dims=(1, 2, 3)):
        """
        Negative log-likelihood of `sample`, summed over `dims`. Returns a
        single zero when the distribution is deterministic.
        """
        if self.deterministic:
            return self._zero()
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=list(dims))

    def mode(self):
        """Return the mean, which is the mode of a Gaussian."""
        return self.mean
63 | 
64 | 
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    Elementwise KL divergence between two gaussians given by their means and
    log-variances. Inputs broadcast against each other, so batches can be
    compared to scalars; at least one argument must be a Tensor.
    """
    # first Tensor argument, used as the dtype/device reference below
    tensor = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, torch.Tensor)),
        None,
    )
    assert tensor is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for torch.exp().
    def as_tensor(value):
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value).to(tensor)

    logvar1 = as_tensor(logvar1)
    logvar2 = as_tensor(logvar2)

    squared_diff = (mean1 - mean2) ** 2
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + squared_diff * torch.exp(-logvar2)
    )
93 | 


--------------------------------------------------------------------------------
/ldm/modules/ema.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import nn
 3 | 
 4 | 
 5 | class LitEma(nn.Module):
 6 |     def __init__(self, model, decay=0.9999, use_num_upates=True):
 7 |         super().__init__()
 8 |         if decay < 0.0 or decay > 1.0:
 9 |             raise ValueError('Decay must be between 0 and 1')
10 | 
11 |         self.m_name2s_name = {}
12 |         self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
13 |         self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
14 |                              else torch.tensor(-1,dtype=torch.int))
15 | 
16 |         for name, p in model.named_parameters():
17 |             if p.requires_grad:
18 |                 #remove as '.'-character is not allowed in buffers
19 |                 s_name = name.replace('.','')
20 |                 self.m_name2s_name.update({name:s_name})
21 |                 self.register_buffer(s_name,p.clone().detach().data)
22 | 
23 |         self.collected_params = []
24 | 
25 |     def forward(self,model):
26 |         decay = self.decay
27 | 
28 |         if self.num_updates >= 0:
29 |             self.num_updates += 1
30 |             decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
31 | 
32 |         one_minus_decay = 1.0 - decay
33 | 
34 |         with torch.no_grad():
35 |             m_param = dict(model.named_parameters())
36 |             shadow_params = dict(self.named_buffers())
37 | 
38 |             for key in m_param:
39 |                 if m_param[key].requires_grad:
40 |                     sname = self.m_name2s_name[key]
41 |                     shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
42 |                     shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
43 |                 else:
44 |                     assert not key in self.m_name2s_name
45 | 
46 |     def copy_to(self, model):
47 |         m_param = dict(model.named_parameters())
48 |         shadow_params = dict(self.named_buffers())
49 |         for key in m_param:
50 |             if m_param[key].requires_grad:
51 |                 m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
52 |             else:
53 |                 assert not key in self.m_name2s_name
54 | 
55 |     def store(self, parameters):
56 |         """
57 |         Save the current parameters for restoring later.
58 |         Args:
59 |           parameters: Iterable of `torch.nn.Parameter`; the parameters to be
60 |             temporarily stored.
61 |         """
62 |         self.collected_params = [param.clone() for param in parameters]
63 | 
64 |     def restore(self, parameters):
65 |         """
66 |         Restore the parameters stored with the `store` method.
67 |         Useful to validate the model with EMA parameters without affecting the
68 |         original optimization process. Store the parameters before the
69 |         `copy_to` method. After validation (or model saving), use this to
70 |         restore the former parameters.
71 |         Args:
72 |           parameters: Iterable of `torch.nn.Parameter`; the parameters to be
73 |             updated with the stored parameters.
74 |         """
75 |         for c_param, param in zip(self.collected_params, parameters):
76 |             param.data.copy_(c_param.data)
77 | 


--------------------------------------------------------------------------------
/ldm/modules/encoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/ldm/modules/encoders/__init__.py


--------------------------------------------------------------------------------
/ldm/modules/encoders/modules.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from functools import partial
  4 | import clip
  5 | from einops import rearrange, repeat
  6 | import kornia
  7 | 
  8 | 
  9 | from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
 10 | 
 11 | 
 12 | class AbstractEncoder(nn.Module):
 13 |     def __init__(self):
 14 |         super().__init__()
 15 | 
 16 |     def encode(self, *args, **kwargs):
 17 |         raise NotImplementedError
 18 | 
 19 | 
 20 | 
 21 | class ClassEmbedder(nn.Module):
 22 |     def __init__(self, embed_dim, n_classes=1000, key='class'):
 23 |         super().__init__()
 24 |         self.key = key
 25 |         self.embedding = nn.Embedding(n_classes, embed_dim)
 26 | 
 27 |     def forward(self, batch, key=None):
 28 |         if key is None:
 29 |             key = self.key
 30 |         # this is for use in crossattn
 31 |         c = batch[key][:, None]
 32 |         c = self.embedding(c)
 33 |         return c
 34 | 
 35 | 
 36 | class TransformerEmbedder(AbstractEncoder):
 37 |     """Some transformer encoder layers"""
 38 |     def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
 39 |         super().__init__()
 40 |         self.device = device
 41 |         self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
 42 |                                               attn_layers=Encoder(dim=n_embed, depth=n_layer))
 43 | 
 44 |     def forward(self, tokens):
 45 |         tokens = tokens.to(self.device)  # meh
 46 |         z = self.transformer(tokens, return_embeddings=True)
 47 |         return z
 48 | 
 49 |     def encode(self, x):
 50 |         return self(x)
 51 | 
 52 | 
 53 | class BERTTokenizer(AbstractEncoder):
 54 |     """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
 55 |     def __init__(self, device="cuda", vq_interface=True, max_length=77):
 56 |         super().__init__()
 57 |         from transformers import BertTokenizerFast  # TODO: add to reuquirements
 58 |         self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
 59 |         self.device = device
 60 |         self.vq_interface = vq_interface
 61 |         self.max_length = max_length
 62 | 
 63 |     def forward(self, text):
 64 |         batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
 65 |                                         return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
 66 |         tokens = batch_encoding["input_ids"].to(self.device)
 67 |         return tokens
 68 | 
 69 |     @torch.no_grad()
 70 |     def encode(self, text):
 71 |         tokens = self(text)
 72 |         if not self.vq_interface:
 73 |             return tokens
 74 |         return None, None, [None, None, tokens]
 75 | 
 76 |     def decode(self, text):
 77 |         return text
 78 | 
 79 | 
 80 | class BERTEmbedder(AbstractEncoder):
 81 |     """Uses the BERT tokenizr model and add some transformer encoder layers"""
 82 |     def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
 83 |                  device="cuda",use_tokenizer=True, embedding_dropout=0.0):
 84 |         super().__init__()
 85 |         self.use_tknz_fn = use_tokenizer
 86 |         if self.use_tknz_fn:
 87 |             self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
 88 |         self.device = device
 89 |         self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
 90 |                                               attn_layers=Encoder(dim=n_embed, depth=n_layer),
 91 |                                               emb_dropout=embedding_dropout)
 92 | 
 93 |     def forward(self, text):
 94 |         if self.use_tknz_fn:
 95 |             tokens = self.tknz_fn(text)#.to(self.device)
 96 |         else:
 97 |             tokens = text
 98 |         z = self.transformer(tokens, return_embeddings=True)
 99 |         return z
100 | 
101 |     def encode(self, text):
102 |         # output of length 77
103 |         return self(text)
104 | 
105 | 
class SpatialRescaler(nn.Module):
    """Rescales its input ``n_stages`` times and optionally remaps the channels.

    Each stage calls ``torch.nn.functional.interpolate`` with
    ``scale_factor=multiplier`` and the chosen ``method``. When
    ``out_channels`` is given, a 1x1 ``nn.Conv2d`` maps ``in_channels`` to
    ``out_channels`` after the resizing.
    """

    def __init__(self,
                 n_stages=1,
                 method='bilinear',
                 multiplier=0.5,
                 in_channels=3,
                 out_channels=None,
                 bias=False):
        super().__init__()
        supported_modes = ('nearest', 'linear', 'bilinear', 'trilinear', 'bicubic', 'area')
        self.n_stages = n_stages
        assert self.n_stages >= 0
        assert method in supported_modes
        self.multiplier = multiplier
        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
        self.remap_output = out_channels is not None
        if not self.remap_output:
            return
        print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
        self.channel_mapper = nn.Conv2d(in_channels, out_channels, 1, bias=bias)

    def forward(self, x):
        """Apply all rescaling stages, then the channel mapper if one exists."""
        rescaled = x
        for _ in range(self.n_stages):
            rescaled = self.interpolator(rescaled, scale_factor=self.multiplier)
        if not self.remap_output:
            return rescaled
        return self.channel_mapper(rescaled)

    def encode(self, x):
        """Same as calling the module on ``x``."""
        return self(x)
136 | 
137 | 
class FrozenCLIPTextEmbedder(nn.Module):
    """
    Text embedder backed by the CLIP text encoder.

    The constructor does not call ``freeze``; parameters only stop requiring
    gradients once ``freeze`` is invoked.
    """
    def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
        super().__init__()
        # The CLIP model itself is loaded on the CPU; `device` is only used for the tokens.
        loaded_model, _ = clip.load(version, jit=False, device="cpu")
        self.model = loaded_model
        self.device = device
        self.max_length = max_length
        self.n_repeat = n_repeat
        self.normalize = normalize

    def freeze(self):
        """Switch the CLIP model to eval mode and disable gradients for all parameters."""
        self.model = self.model.eval()
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, text):
        """Tokenize ``text`` and return its CLIP text encoding, normalized along dim 1 if requested."""
        tokens = clip.tokenize(text).to(self.device)
        z = self.model.encode_text(tokens)
        if not self.normalize:
            return z
        return z / torch.linalg.norm(z, dim=1, keepdim=True)

    def encode(self, text):
        """Return the encoding of ``text`` with its middle axis repeated ``n_repeat`` times."""
        z = self(text)
        if z.ndim == 2:
            # add the singleton axis that is repeated below
            z = z.unsqueeze(1)
        return repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
168 | 
169 | 
class FrozenClipImageEmbedder(nn.Module):
    """
    Image embedder backed by the CLIP image encoder.
    """
    def __init__(
            self,
            model,
            jit=False,
            device='cuda' if torch.cuda.is_available() else 'cpu',
            antialias=False,
        ):
        super().__init__()
        loaded_model, _ = clip.load(name=model, device=device, jit=jit)
        self.model = loaded_model

        self.antialias = antialias

        # Per-channel statistics used for the CLIP renormalization in `preprocess`;
        # registered as non-persistent buffers.
        clip_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073])
        clip_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711])
        self.register_buffer('mean', clip_mean, persistent=False)
        self.register_buffer('std', clip_std, persistent=False)

    def preprocess(self, x):
        """Resize ``x`` to 224x224, shift it to [0,1] and renormalize it for CLIP."""
        resized = kornia.geometry.resize(
            x,
            (224, 224),
            interpolation='bicubic',
            align_corners=True,
            antialias=self.antialias,
        )
        # normalize to [0,1] (input is assumed to be in [-1,1], see `forward`)
        unit_range = (resized + 1.) / 2.
        # renormalize according to clip
        return kornia.enhance.normalize(unit_range, self.mean, self.std)

    def forward(self, x):
        """Return the CLIP image encoding of ``x``."""
        # x is assumed to be in range [-1,1]
        prepared = self.preprocess(x)
        return self.model.encode_image(prepared)
202 | 
203 | 


--------------------------------------------------------------------------------
/ldm/modules/image_degradation/__init__.py:
--------------------------------------------------------------------------------
1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
3 | 


--------------------------------------------------------------------------------
/ldm/modules/image_degradation/utils/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CompVis/latent-diffusion/a506df5756472e2ebaf9078affdde2c4f1502cd4/ldm/modules/image_degradation/utils/test.png


--------------------------------------------------------------------------------
/ldm/modules/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator


--------------------------------------------------------------------------------
/ldm/modules/losses/contperceptual.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | 
  4 | from taming.modules.losses.vqperceptual import *  # TODO: taming dependency yes/no?
  5 | 
  6 | 
class LPIPSWithDiscriminator(nn.Module):
    """Autoencoder loss: reconstruction (L1 + LPIPS) + KL + adversarial term.

    The reconstruction error is scaled by a learned log variance
    (``self.logvar``). The adversarial term uses an ``NLayerDiscriminator``
    and is weighted adaptively from gradient norms, see
    ``calculate_adaptive_weight``.

    Constructor arguments:
        disc_start: stored as ``discriminator_iter_start`` and passed as
            ``threshold`` to ``adopt_weight`` together with ``global_step``.
        logvar_init: initial value of the learned scalar ``logvar``.
        kl_weight: factor applied to the KL term in the generator loss.
        pixelloss_weight: stored as ``pixel_weight``; not read anywhere in
            this class.
        disc_num_layers, disc_in_channels, use_actnorm: forwarded to
            ``NLayerDiscriminator`` (``n_layers``, ``input_nc``, ``use_actnorm``).
        disc_factor: base factor for the adversarial / discriminator losses,
            passed through ``adopt_weight``.
        disc_weight: multiplier applied to the adaptive weight.
        perceptual_weight: factor applied to the LPIPS term; LPIPS is skipped
            when it is not greater than 0.
        disc_conditional: whether ``forward`` expects a ``cond`` tensor
            (checked with asserts on the generator pass).
        disc_loss: "hinge" or "vanilla".
    """
    def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
                 disc_loss="hinge"):

        super().__init__()
        assert disc_loss in ["hinge", "vanilla"]
        self.kl_weight = kl_weight
        self.pixel_weight = pixelloss_weight
        self.perceptual_loss = LPIPS().eval()
        self.perceptual_weight = perceptual_weight
        # output log variance
        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)

        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
                                                 n_layers=disc_num_layers,
                                                 use_actnorm=use_actnorm
                                                 ).apply(weights_init)
        self.discriminator_iter_start = disc_start
        self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
        self.disc_factor = disc_factor
        self.discriminator_weight = disc_weight
        self.disc_conditional = disc_conditional

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
        """Return the weight for the generator's adversarial loss.

        The weight is the ratio of the gradient norms of ``nll_loss`` and
        ``g_loss`` with respect to ``last_layer``, clamped to [0, 1e4],
        detached and multiplied by ``self.discriminator_weight``.
        """
        if last_layer is not None:
            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
        else:
            # NOTE(review): `self.last_layer` is not assigned anywhere in this class,
            # so this branch appears to depend on an attribute set from outside - confirm.
            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]

        # 1e-4 keeps the division finite when the generator gradient vanishes
        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
        d_weight = d_weight * self.discriminator_weight
        return d_weight

    def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
                global_step, last_layer=None, cond=None, split="train",
                weights=None):
        """Compute the loss for the generator (``optimizer_idx == 0``) or the
        discriminator (``optimizer_idx == 1``) pass.

        Args:
            inputs: target tensor.
            reconstructions: reconstructed tensor, compared against ``inputs``.
            posteriors: object whose ``kl()`` method yields the KL term.
            optimizer_idx: selects the pass, see above.
            global_step: current step, passed to ``adopt_weight``.
            last_layer: tensor the adaptive weight's gradients are taken
                with respect to.
            cond: optional conditioning, concatenated to the discriminator
                input along dim 1.
            split: prefix for the keys of the returned log dict.
            weights: optional factor multiplied onto the NLL term.

        Returns:
            ``(loss, log)`` where ``log`` maps "<split>/<name>" keys to detached
            values. For any other ``optimizer_idx`` no branch runs and the
            method implicitly returns ``None``.
        """
        # absolute (L1) reconstruction error, optionally augmented with LPIPS
        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
        if self.perceptual_weight > 0:
            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
            rec_loss = rec_loss + self.perceptual_weight * p_loss

        # scale the reconstruction error by the learned log variance
        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
        weighted_nll_loss = nll_loss
        if weights is not None:
            weighted_nll_loss = weights*nll_loss
        # sum over all elements, divided by the size of the first dimension
        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
        kl_loss = posteriors.kl()
        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]

        # now the GAN part
        if optimizer_idx == 0:
            # generator update
            if cond is None:
                assert not self.disc_conditional
                logits_fake = self.discriminator(reconstructions.contiguous())
            else:
                assert self.disc_conditional
                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
            g_loss = -torch.mean(logits_fake)

            if self.disc_factor > 0.0:
                try:
                    # uses the unweighted nll_loss, not weighted_nll_loss
                    d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
                except RuntimeError:
                    # a RuntimeError here is only tolerated outside of training mode
                    assert not self.training
                    d_weight = torch.tensor(0.0)
            else:
                d_weight = torch.tensor(0.0)

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss

            log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(),
                   "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(),
                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
                   "{}/d_weight".format(split): d_weight.detach(),
                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
                   "{}/g_loss".format(split): g_loss.detach().mean(),
                   }
            return loss, log

        if optimizer_idx == 1:
            # second pass for discriminator update
            # inputs and reconstructions are detached, so only the discriminator receives gradients
            if cond is None:
                logits_real = self.discriminator(inputs.contiguous().detach())
                logits_fake = self.discriminator(reconstructions.contiguous().detach())
            else:
                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))

            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
                   "{}/logits_real".format(split): logits_real.detach().mean(),
                   "{}/logits_fake".format(split): logits_fake.detach().mean()
                   }
            return d_loss, log
111 | 
112 | 


--------------------------------------------------------------------------------
/ldm/modules/losses/vqperceptual.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import nn
  3 | import torch.nn.functional as F
  4 | from einops import repeat
  5 | 
  6 | from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
  7 | from taming.modules.losses.lpips import LPIPS
  8 | from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
  9 | 
 10 | 
 11 | def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
 12 |     assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]
 13 |     loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3])
 14 |     loss_fake = torch.mean(F.relu(1. + logits_fake), dim=[1,2,3])
 15 |     loss_real = (weights * loss_real).sum() / weights.sum()
 16 |     loss_fake = (weights * loss_fake).sum() / weights.sum()
 17 |     d_loss = 0.5 * (loss_real + loss_fake)
 18 |     return d_loss
 19 | 
 20 | def adopt_weight(weight, global_step, threshold=0, value=0.):
 21 |     if global_step < threshold:
 22 |         weight = value
 23 |     return weight
 24 | 
 25 | 
 26 | def measure_perplexity(predicted_indices, n_embed):
 27 |     # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
 28 |     # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
 29 |     encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
 30 |     avg_probs = encodings.mean(0)
 31 |     perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp()
 32 |     cluster_use = torch.sum(avg_probs > 0)
 33 |     return perplexity, cluster_use
 34 | 
 35 | def l1(x, y):
 36 |     return torch.abs(x-y)
 37 | 
 38 | 
 39 | def l2(x, y):
 40 |     return torch.pow((x-y), 2)
 41 | 
 42 | 
 43 | class VQLPIPSWithDiscriminator(nn.Module):
 44 |     def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
 45 |                  disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
 46 |                  perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
 47 |                  disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
 48 |                  pixel_loss="l1"):
 49 |         super().__init__()
 50 |         assert disc_loss in ["hinge", "vanilla"]
 51 |         assert perceptual_loss in ["lpips", "clips", "dists"]
 52 |         assert pixel_loss in ["l1", "l2"]
 53 |         self.codebook_weight = codebook_weight
 54 |         self.pixel_weight = pixelloss_weight
 55 |         if perceptual_loss == "lpips":
 56 |             print(f"{self.__class__.__name__}: Running with LPIPS.")
 57 |             self.perceptual_loss = LPIPS().eval()
 58 |         else:
 59 |             raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
 60 |         self.perceptual_weight = perceptual_weight
 61 | 
 62 |         if pixel_loss == "l1":
 63 |             self.pixel_loss = l1
 64 |         else:
 65 |             self.pixel_loss = l2
 66 | 
 67 |         self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
 68 |                                                  n_layers=disc_num_layers,
 69 |                                                  use_actnorm=use_actnorm,
 70 |                                                  ndf=disc_ndf
 71 |                                                  ).apply(weights_init)
 72 |         self.discriminator_iter_start = disc_start
 73 |         if disc_loss == "hinge":
 74 |             self.disc_loss = hinge_d_loss
 75 |         elif disc_loss == "vanilla":
 76 |             self.disc_loss = vanilla_d_loss
 77 |         else:
 78 |             raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
 79 |         print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
 80 |         self.disc_factor = disc_factor
 81 |         self.discriminator_weight = disc_weight
 82 |         self.disc_conditional = disc_conditional
 83 |         self.n_classes = n_classes
 84 | 
 85 |     def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
 86 |         if last_layer is not None:
 87 |             nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
 88 |             g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
 89 |         else:
 90 |             nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
 91 |             g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
 92 | 
 93 |         d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
 94 |         d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
 95 |         d_weight = d_weight * self.discriminator_weight
 96 |         return d_weight
 97 | 
 98 |     def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
 99 |                 global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
100 |         if not exists(codebook_loss):
101 |             codebook_loss = torch.tensor([0.]).to(inputs.device)
102 |         #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
103 |         rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
104 |         if self.perceptual_weight > 0:
105 |             p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
106 |             rec_loss = rec_loss + self.perceptual_weight * p_loss
107 |         else:
108 |             p_loss = torch.tensor([0.0])
109 | 
110 |         nll_loss = rec_loss
111 |         #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
112 |         nll_loss = torch.mean(nll_loss)
113 | 
114 |         # now the GAN part
115 |         if optimizer_idx == 0:
116 |             # generator update
117 |             if cond is None:
118 |                 assert not self.disc_conditional
119 |                 logits_fake = self.discriminator(reconstructions.contiguous())
120 |             else:
121 |                 assert self.disc_conditional
122 |                 logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
123 |             g_loss = -torch.mean(logits_fake)
124 | 
125 |             try:
126 |                 d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
127 |             except RuntimeError:
128 |                 assert not self.training
129 |                 d_weight = torch.tensor(0.0)
130 | 
131 |             disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
132 |             loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
133 | 
134 |             log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
135 |                    "{}/quant_loss".format(split): codebook_loss.detach().mean(),
136 |                    "{}/nll_loss".format(split): nll_loss.detach().mean(),
137 |                    "{}/rec_loss".format(split): rec_loss.detach().mean(),
138 |                    "{}/p_loss".format(split): p_loss.detach().mean(),
139 |                    "{}/d_weight".format(split): d_weight.detach(),
140 |                    "{}/disc_factor".format(split): torch.tensor(disc_factor),
141 |                    "{}/g_loss".format(split): g_loss.detach().mean(),
142 |                    }
143 |             if predicted_indices is not None:
144 |                 assert self.n_classes is not None
145 |                 with torch.no_grad():
146 |                     perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
147 |                 log[f"{split}/perplexity"] = perplexity
148 |                 log[f"{split}/cluster_usage"] = cluster_usage
149 |             return loss, log
150 | 
151 |         if optimizer_idx == 1:
152 |             # second pass for discriminator update
153 |             if cond is None:
154 |                 logits_real = self.discriminator(inputs.contiguous().detach())
155 |                 logits_fake = self.discriminator(reconstructions.contiguous().detach())
156 |             else:
157 |                 logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
158 |                 logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
159 | 
160 |             disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
161 |             d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
162 | 
163 |             log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
164 |                    "{}/logits_real".format(split): logits_real.detach().mean(),
165 |                    "{}/logits_fake".format(split): logits_fake.detach().mean()
166 |                    }
167 |             return d_loss, log
168 | 


--------------------------------------------------------------------------------
/ldm/util.py:
--------------------------------------------------------------------------------
  1 | import importlib
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | from collections import abc
  6 | from einops import rearrange
  7 | from functools import partial
  8 | 
  9 | import multiprocessing as mp
 10 | from threading import Thread
 11 | from queue import Queue
 12 | 
 13 | from inspect import isfunction
 14 | from PIL import Image, ImageDraw, ImageFont
 15 | 
 16 | 
 17 | def log_txt_as_img(wh, xc, size=10):
 18 |     # wh a tuple of (width, height)
 19 |     # xc a list of captions to plot
 20 |     b = len(xc)
 21 |     txts = list()
 22 |     for bi in range(b):
 23 |         txt = Image.new("RGB", wh, color="white")
 24 |         draw = ImageDraw.Draw(txt)
 25 |         font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
 26 |         nc = int(40 * (wh[0] / 256))
 27 |         lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
 28 | 
 29 |         try:
 30 |             draw.text((0, 0), lines, fill="black", font=font)
 31 |         except UnicodeEncodeError:
 32 |             print("Cant encode string for logging. Skipping.")
 33 | 
 34 |         txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
 35 |         txts.append(txt)
 36 |     txts = np.stack(txts)
 37 |     txts = torch.tensor(txts)
 38 |     return txts
 39 | 
 40 | 
 41 | def ismap(x):
 42 |     if not isinstance(x, torch.Tensor):
 43 |         return False
 44 |     return (len(x.shape) == 4) and (x.shape[1] > 3)
 45 | 
 46 | 
 47 | def isimage(x):
 48 |     if not isinstance(x, torch.Tensor):
 49 |         return False
 50 |     return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
 51 | 
 52 | 
 53 | def exists(x):
 54 |     return x is not None
 55 | 
 56 | 
 57 | def default(val, d):
 58 |     if exists(val):
 59 |         return val
 60 |     return d() if isfunction(d) else d
 61 | 
 62 | 
 63 | def mean_flat(tensor):
 64 |     """
 65 |     https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
 66 |     Take the mean over all non-batch dimensions.
 67 |     """
 68 |     return tensor.mean(dim=list(range(1, len(tensor.shape))))
 69 | 
 70 | 
 71 | def count_params(model, verbose=False):
 72 |     total_params = sum(p.numel() for p in model.parameters())
 73 |     if verbose:
 74 |         print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
 75 |     return total_params
 76 | 
 77 | 
 78 | def instantiate_from_config(config):
 79 |     if not "target" in config:
 80 |         if config == '__is_first_stage__':
 81 |             return None
 82 |         elif config == "__is_unconditional__":
 83 |             return None
 84 |         raise KeyError("Expected key `target` to instantiate.")
 85 |     return get_obj_from_str(config["target"])(**config.get("params", dict()))
 86 | 
 87 | 
 88 | def get_obj_from_str(string, reload=False):
 89 |     module, cls = string.rsplit(".", 1)
 90 |     if reload:
 91 |         module_imp = importlib.import_module(module)
 92 |         importlib.reload(module_imp)
 93 |     return getattr(importlib.import_module(module, package=None), cls)
 94 | 
 95 | 
 96 | def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False):
 97 |     # create dummy dataset instance
 98 | 
 99 |     # run prefetching
100 |     if idx_to_fn:
101 |         res = func(data, worker_id=idx)
102 |     else:
103 |         res = func(data)
104 |     Q.put([idx, res])
105 |     Q.put("Done")
106 | 
107 | 
108 | def parallel_data_prefetch(
109 |         func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False
110 | ):
111 |     # if target_data_type not in ["ndarray", "list"]:
112 |     #     raise ValueError(
113 |     #         "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray."
114 |     #     )
115 |     if isinstance(data, np.ndarray) and target_data_type == "list":
116 |         raise ValueError("list expected but function got ndarray.")
117 |     elif isinstance(data, abc.Iterable):
118 |         if isinstance(data, dict):
119 |             print(
120 |                 f'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.'
121 |             )
122 |             data = list(data.values())
123 |         if target_data_type == "ndarray":
124 |             data = np.asarray(data)
125 |         else:
126 |             data = list(data)
127 |     else:
128 |         raise TypeError(
129 |             f"The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}."
130 |         )
131 | 
132 |     if cpu_intensive:
133 |         Q = mp.Queue(1000)
134 |         proc = mp.Process
135 |     else:
136 |         Q = Queue(1000)
137 |         proc = Thread
138 |     # spawn processes
139 |     if target_data_type == "ndarray":
140 |         arguments = [
141 |             [func, Q, part, i, use_worker_id]
142 |             for i, part in enumerate(np.array_split(data, n_proc))
143 |         ]
144 |     else:
145 |         step = (
146 |             int(len(data) / n_proc + 1)
147 |             if len(data) % n_proc != 0
148 |             else int(len(data) / n_proc)
149 |         )
150 |         arguments = [
151 |             [func, Q, part, i, use_worker_id]
152 |             for i, part in enumerate(
153 |                 [data[i: i + step] for i in range(0, len(data), step)]
154 |             )
155 |         ]
156 |     processes = []
157 |     for i in range(n_proc):
158 |         p = proc(target=_do_parallel_data_prefetch, args=arguments[i])
159 |         processes += [p]
160 | 
161 |     # start processes
162 |     print(f"Start prefetching...")
163 |     import time
164 | 
165 |     start = time.time()
166 |     gather_res = [[] for _ in range(n_proc)]
167 |     try:
168 |         for p in processes:
169 |             p.start()
170 | 
171 |         k = 0
172 |         while k < n_proc:
173 |             # get result
174 |             res = Q.get()
175 |             if res == "Done":
176 |                 k += 1
177 |             else:
178 |                 gather_res[res[0]] = res[1]
179 | 
180 |     except Exception as e:
181 |         print("Exception: ", e)
182 |         for p in processes:
183 |             p.terminate()
184 | 
185 |         raise e
186 |     finally:
187 |         for p in processes:
188 |             p.join()
189 |         print(f"Prefetching complete. [{time.time() - start} sec.]")
190 | 
191 |     if target_data_type == 'ndarray':
192 |         if not isinstance(gather_res[0], np.ndarray):
193 |             return np.concatenate([np.asarray(r) for r in gather_res], axis=0)
194 | 
195 |         # order outputs
196 |         return np.concatenate(gather_res, axis=0)
197 |     elif target_data_type == 'list':
198 |         out = []
199 |         for r in gather_res:
200 |             out.extend(r)
201 |         return out
202 |     else:
203 |         return gather_res
204 | 


--------------------------------------------------------------------------------
/models/first_stage_models/kl-f16/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: val/rec_loss
 6 |     embed_dim: 16
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 1.0e-06
12 |         disc_weight: 0.5
13 |     ddconfig:
14 |       double_z: true
15 |       z_channels: 16
16 |       resolution: 256
17 |       in_channels: 3
18 |       out_ch: 3
19 |       ch: 128
20 |       ch_mult:
21 |       - 1
22 |       - 1
23 |       - 2
24 |       - 2
25 |       - 4
26 |       num_res_blocks: 2
27 |       attn_resolutions:
28 |       - 16
29 |       dropout: 0.0
30 | data:
31 |   target: main.DataModuleFromConfig
32 |   params:
33 |     batch_size: 6
34 |     wrap: true
35 |     train:
36 |       target: ldm.data.openimages.FullOpenImagesTrain
37 |       params:
38 |         size: 384
39 |         crop_size: 256
40 |     validation:
41 |       target: ldm.data.openimages.FullOpenImagesValidation
42 |       params:
43 |         size: 384
44 |         crop_size: 256
45 | 


--------------------------------------------------------------------------------
/models/first_stage_models/kl-f32/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: val/rec_loss
 6 |     embed_dim: 64
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 1.0e-06
12 |         disc_weight: 0.5
13 |     ddconfig:
14 |       double_z: true
15 |       z_channels: 64
16 |       resolution: 256
17 |       in_channels: 3
18 |       out_ch: 3
19 |       ch: 128
20 |       ch_mult:
21 |       - 1
22 |       - 1
23 |       - 2
24 |       - 2
25 |       - 4
26 |       - 4
27 |       num_res_blocks: 2
28 |       attn_resolutions:
29 |       - 16
30 |       - 8
31 |       dropout: 0.0
32 | data:
33 |   target: main.DataModuleFromConfig
34 |   params:
35 |     batch_size: 6
36 |     wrap: true
37 |     train:
38 |       target: ldm.data.openimages.FullOpenImagesTrain
39 |       params:
40 |         size: 384
41 |         crop_size: 256
42 |     validation:
43 |       target: ldm.data.openimages.FullOpenImagesValidation
44 |       params:
45 |         size: 384
46 |         crop_size: 256
47 | 


--------------------------------------------------------------------------------
/models/first_stage_models/kl-f4/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: val/rec_loss
 6 |     embed_dim: 3
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 1.0e-06
12 |         disc_weight: 0.5
13 |     ddconfig:
14 |       double_z: true
15 |       z_channels: 3
16 |       resolution: 256
17 |       in_channels: 3
18 |       out_ch: 3
19 |       ch: 128
20 |       ch_mult:
21 |       - 1
22 |       - 2
23 |       - 4
24 |       num_res_blocks: 2
25 |       attn_resolutions: []
26 |       dropout: 0.0
27 | data:
28 |   target: main.DataModuleFromConfig
29 |   params:
30 |     batch_size: 10
31 |     wrap: true
32 |     train:
33 |       target: ldm.data.openimages.FullOpenImagesTrain
34 |       params:
35 |         size: 384
36 |         crop_size: 256
37 |     validation:
38 |       target: ldm.data.openimages.FullOpenImagesValidation
39 |       params:
40 |         size: 384
41 |         crop_size: 256
42 | 


--------------------------------------------------------------------------------
/models/first_stage_models/kl-f8/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.AutoencoderKL
 4 |   params:
 5 |     monitor: val/rec_loss
 6 |     embed_dim: 4
 7 |     lossconfig:
 8 |       target: ldm.modules.losses.LPIPSWithDiscriminator
 9 |       params:
10 |         disc_start: 50001
11 |         kl_weight: 1.0e-06
12 |         disc_weight: 0.5
13 |     ddconfig:
14 |       double_z: true
15 |       z_channels: 4
16 |       resolution: 256
17 |       in_channels: 3
18 |       out_ch: 3
19 |       ch: 128
20 |       ch_mult:
21 |       - 1
22 |       - 2
23 |       - 4
24 |       - 4
25 |       num_res_blocks: 2
26 |       attn_resolutions: []
27 |       dropout: 0.0
28 | data:
29 |   target: main.DataModuleFromConfig
30 |   params:
31 |     batch_size: 4
32 |     wrap: true
33 |     train:
34 |       target: ldm.data.openimages.FullOpenImagesTrain
35 |       params:
36 |         size: 384
37 |         crop_size: 256
38 |     validation:
39 |       target: ldm.data.openimages.FullOpenImagesValidation
40 |       params:
41 |         size: 384
42 |         crop_size: 256
43 | 


--------------------------------------------------------------------------------
/models/first_stage_models/vq-f16/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.VQModel
 4 |   params:
 5 |     embed_dim: 8
 6 |     n_embed: 16384
 7 |     ddconfig:
 8 |       double_z: false
 9 |       z_channels: 8
10 |       resolution: 256
11 |       in_channels: 3
12 |       out_ch: 3
13 |       ch: 128
14 |       ch_mult:
15 |       - 1
16 |       - 1
17 |       - 2
18 |       - 2
19 |       - 4
20 |       num_res_blocks: 2
21 |       attn_resolutions:
22 |       - 16
23 |       dropout: 0.0
24 |     lossconfig:
25 |       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
26 |       params:
27 |         disc_conditional: false
28 |         disc_in_channels: 3
29 |         disc_start: 250001
30 |         disc_weight: 0.75
31 |         disc_num_layers: 2
32 |         codebook_weight: 1.0
33 | 
34 | data:
35 |   target: main.DataModuleFromConfig
36 |   params:
37 |     batch_size: 14
38 |     num_workers: 20
39 |     wrap: true
40 |     train:
41 |       target: ldm.data.openimages.FullOpenImagesTrain
42 |       params:
43 |         size: 384
44 |         crop_size: 256
45 |     validation:
46 |       target: ldm.data.openimages.FullOpenImagesValidation
47 |       params:
48 |         size: 384
49 |         crop_size: 256
50 | 


--------------------------------------------------------------------------------
/models/first_stage_models/vq-f4-noattn/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.VQModel
 4 |   params:
 5 |     embed_dim: 3
 6 |     n_embed: 8192
 7 |     monitor: val/rec_loss
 8 | 
 9 |     ddconfig:
10 |       attn_type: none
11 |       double_z: false
12 |       z_channels: 3
13 |       resolution: 256
14 |       in_channels: 3
15 |       out_ch: 3
16 |       ch: 128
17 |       ch_mult:
18 |       - 1
19 |       - 2
20 |       - 4
21 |       num_res_blocks: 2
22 |       attn_resolutions: []
23 |       dropout: 0.0
24 |     lossconfig:
25 |       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
26 |       params:
27 |         disc_conditional: false
28 |         disc_in_channels: 3
29 |         disc_start: 11
30 |         disc_weight: 0.75
31 |         codebook_weight: 1.0
32 | 
33 | data:
34 |   target: main.DataModuleFromConfig
35 |   params:
36 |     batch_size: 8
37 |     num_workers: 12
38 |     wrap: true
39 |     train:
40 |       target: ldm.data.openimages.FullOpenImagesTrain
41 |       params:
42 |         crop_size: 256
43 |     validation:
44 |       target: ldm.data.openimages.FullOpenImagesValidation
45 |       params:
46 |         crop_size: 256
47 | 


--------------------------------------------------------------------------------
/models/first_stage_models/vq-f4/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.VQModel
 4 |   params:
 5 |     embed_dim: 3
 6 |     n_embed: 8192
 7 |     monitor: val/rec_loss
 8 | 
 9 |     ddconfig:
10 |       double_z: false
11 |       z_channels: 3
12 |       resolution: 256
13 |       in_channels: 3
14 |       out_ch: 3
15 |       ch: 128
16 |       ch_mult:
17 |       - 1
18 |       - 2
19 |       - 4
20 |       num_res_blocks: 2
21 |       attn_resolutions: []
22 |       dropout: 0.0
23 |     lossconfig:
24 |       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
25 |       params:
26 |         disc_conditional: false
27 |         disc_in_channels: 3
28 |         disc_start: 0
29 |         disc_weight: 0.75
30 |         codebook_weight: 1.0
31 | 
32 | data:
33 |   target: main.DataModuleFromConfig
34 |   params:
35 |     batch_size: 8
36 |     num_workers: 16
37 |     wrap: true
38 |     train:
39 |       target: ldm.data.openimages.FullOpenImagesTrain
40 |       params:
41 |         crop_size: 256
42 |     validation:
43 |       target: ldm.data.openimages.FullOpenImagesValidation
44 |       params:
45 |         crop_size: 256
46 | 


--------------------------------------------------------------------------------
/models/first_stage_models/vq-f8-n256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.VQModel
 4 |   params:
 5 |     embed_dim: 4
 6 |     n_embed: 256
 7 |     monitor: val/rec_loss
 8 |     ddconfig:
 9 |       double_z: false
10 |       z_channels: 4
11 |       resolution: 256
12 |       in_channels: 3
13 |       out_ch: 3
14 |       ch: 128
15 |       ch_mult:
16 |       - 1
17 |       - 2
18 |       - 2
19 |       - 4
20 |       num_res_blocks: 2
21 |       attn_resolutions:
22 |       - 32
23 |       dropout: 0.0
24 |     lossconfig:
25 |       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
26 |       params:
27 |         disc_conditional: false
28 |         disc_in_channels: 3
29 |         disc_start: 250001
30 |         disc_weight: 0.75
31 |         codebook_weight: 1.0
32 | 
33 | data:
34 |   target: main.DataModuleFromConfig
35 |   params:
36 |     batch_size: 10
37 |     num_workers: 20
38 |     wrap: true
39 |     train:
40 |       target: ldm.data.openimages.FullOpenImagesTrain
41 |       params:
42 |         size: 384
43 |         crop_size: 256
44 |     validation:
45 |       target: ldm.data.openimages.FullOpenImagesValidation
46 |       params:
47 |         size: 384
48 |         crop_size: 256
49 | 


--------------------------------------------------------------------------------
/models/first_stage_models/vq-f8/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 4.5e-06
 3 |   target: ldm.models.autoencoder.VQModel
 4 |   params:
 5 |     embed_dim: 4
 6 |     n_embed: 16384
 7 |     monitor: val/rec_loss
 8 |     ddconfig:
 9 |       double_z: false
10 |       z_channels: 4
11 |       resolution: 256
12 |       in_channels: 3
13 |       out_ch: 3
14 |       ch: 128
15 |       ch_mult:
16 |       - 1
17 |       - 2
18 |       - 2
19 |       - 4
20 |       num_res_blocks: 2
21 |       attn_resolutions:
22 |       - 32
23 |       dropout: 0.0
24 |     lossconfig:
25 |       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
26 |       params:
27 |         disc_conditional: false
28 |         disc_in_channels: 3
29 |         disc_num_layers: 2
30 |         disc_start: 1
31 |         disc_weight: 0.6
32 |         codebook_weight: 1.0
33 | data:
34 |   target: main.DataModuleFromConfig
35 |   params:
36 |     batch_size: 10
37 |     num_workers: 20
38 |     wrap: true
39 |     train:
40 |       target: ldm.data.openimages.FullOpenImagesTrain
41 |       params:
42 |         size: 384
43 |         crop_size: 256
44 |     validation:
45 |       target: ldm.data.openimages.FullOpenImagesValidation
46 |       params:
47 |         size: 384
48 |         crop_size: 256
49 | 


--------------------------------------------------------------------------------
/models/ldm/bsr_sr/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 1.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0155
 7 |     log_every_t: 100
 8 |     timesteps: 1000
 9 |     loss_type: l2
10 |     first_stage_key: image
11 |     cond_stage_key: LR_image
12 |     image_size: 64
13 |     channels: 3
14 |     concat_mode: true
15 |     cond_stage_trainable: false
16 |     unet_config:
17 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
18 |       params:
19 |         image_size: 64
20 |         in_channels: 6
21 |         out_channels: 3
22 |         model_channels: 160
23 |         attention_resolutions:
24 |         - 16
25 |         - 8
26 |         num_res_blocks: 2
27 |         channel_mult:
28 |         - 1
29 |         - 2
30 |         - 2
31 |         - 4
32 |         num_head_channels: 32
33 |     first_stage_config:
34 |       target: ldm.models.autoencoder.VQModelInterface
35 |       params:
36 |         embed_dim: 3
37 |         n_embed: 8192
38 |         monitor: val/rec_loss
39 |         ddconfig:
40 |           double_z: false
41 |           z_channels: 3
42 |           resolution: 256
43 |           in_channels: 3
44 |           out_ch: 3
45 |           ch: 128
46 |           ch_mult:
47 |           - 1
48 |           - 2
49 |           - 4
50 |           num_res_blocks: 2
51 |           attn_resolutions: []
52 |           dropout: 0.0
53 |         lossconfig:
54 |           target: torch.nn.Identity
55 |     cond_stage_config:
56 |       target: torch.nn.Identity
57 | data:
58 |   target: main.DataModuleFromConfig
59 |   params:
60 |     batch_size: 64
61 |     wrap: false
62 |     num_workers: 12
63 |     train:
64 |       target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain
65 |       params:
66 |         size: 256
67 |         degradation: bsrgan_light
68 |         downscale_f: 4
69 |         min_crop_f: 0.5
70 |         max_crop_f: 1.0
71 |         random_crop: true
72 |     validation:
73 |       target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation
74 |       params:
75 |         size: 256
76 |         degradation: bsrgan_light
77 |         downscale_f: 4
78 |         min_crop_f: 0.5
79 |         max_crop_f: 1.0
80 |         random_crop: true
81 | 


--------------------------------------------------------------------------------
/models/ldm/celeba256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: class_label
12 |     image_size: 64
13 |     channels: 3
14 |     cond_stage_trainable: false
15 |     concat_mode: false
16 |     monitor: val/loss
17 |     unet_config:
18 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
19 |       params:
20 |         image_size: 64
21 |         in_channels: 3
22 |         out_channels: 3
23 |         model_channels: 224
24 |         attention_resolutions:
25 |         - 8
26 |         - 4
27 |         - 2
28 |         num_res_blocks: 2
29 |         channel_mult:
30 |         - 1
31 |         - 2
32 |         - 3
33 |         - 4
34 |         num_head_channels: 32
35 |     first_stage_config:
36 |       target: ldm.models.autoencoder.VQModelInterface
37 |       params:
38 |         embed_dim: 3
39 |         n_embed: 8192
40 |         ddconfig:
41 |           double_z: false
42 |           z_channels: 3
43 |           resolution: 256
44 |           in_channels: 3
45 |           out_ch: 3
46 |           ch: 128
47 |           ch_mult:
48 |           - 1
49 |           - 2
50 |           - 4
51 |           num_res_blocks: 2
52 |           attn_resolutions: []
53 |           dropout: 0.0
54 |         lossconfig:
55 |           target: torch.nn.Identity
56 |     cond_stage_config: __is_unconditional__
57 | data:
58 |   target: main.DataModuleFromConfig
59 |   params:
60 |     batch_size: 48
61 |     num_workers: 5
62 |     wrap: false
63 |     train:
64 |       target: ldm.data.faceshq.CelebAHQTrain
65 |       params:
66 |         size: 256
67 |     validation:
68 |       target: ldm.data.faceshq.CelebAHQValidation
69 |       params:
70 |         size: 256
71 | 


--------------------------------------------------------------------------------
/models/ldm/cin256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 1.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: class_label
12 |     image_size: 32
13 |     channels: 4
14 |     cond_stage_trainable: true
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     unet_config:
18 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
19 |       params:
20 |         image_size: 32
21 |         in_channels: 4
22 |         out_channels: 4
23 |         model_channels: 256
24 |         attention_resolutions:
25 |         - 4
26 |         - 2
27 |         - 1
28 |         num_res_blocks: 2
29 |         channel_mult:
30 |         - 1
31 |         - 2
32 |         - 4
33 |         num_head_channels: 32
34 |         use_spatial_transformer: true
35 |         transformer_depth: 1
36 |         context_dim: 512
37 |     first_stage_config:
38 |       target: ldm.models.autoencoder.VQModelInterface
39 |       params:
40 |         embed_dim: 4
41 |         n_embed: 16384
42 |         ddconfig:
43 |           double_z: false
44 |           z_channels: 4
45 |           resolution: 256
46 |           in_channels: 3
47 |           out_ch: 3
48 |           ch: 128
49 |           ch_mult:
50 |           - 1
51 |           - 2
52 |           - 2
53 |           - 4
54 |           num_res_blocks: 2
55 |           attn_resolutions:
56 |           - 32
57 |           dropout: 0.0
58 |         lossconfig:
59 |           target: torch.nn.Identity
60 |     cond_stage_config:
61 |       target: ldm.modules.encoders.modules.ClassEmbedder
62 |       params:
63 |         embed_dim: 512
64 |         key: class_label
65 | data:
66 |   target: main.DataModuleFromConfig
67 |   params:
68 |     batch_size: 64
69 |     num_workers: 12
70 |     wrap: false
71 |     train:
72 |       target: ldm.data.imagenet.ImageNetTrain
73 |       params:
74 |         config:
75 |           size: 256
76 |     validation:
77 |       target: ldm.data.imagenet.ImageNetValidation
78 |       params:
79 |         config:
80 |           size: 256
81 | 


--------------------------------------------------------------------------------
/models/ldm/ffhq256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: class_label
12 |     image_size: 64
13 |     channels: 3
14 |     cond_stage_trainable: false
15 |     concat_mode: false
16 |     monitor: val/loss
17 |     unet_config:
18 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
19 |       params:
20 |         image_size: 64
21 |         in_channels: 3
22 |         out_channels: 3
23 |         model_channels: 224
24 |         attention_resolutions:
25 |         - 8
26 |         - 4
27 |         - 2
28 |         num_res_blocks: 2
29 |         channel_mult:
30 |         - 1
31 |         - 2
32 |         - 3
33 |         - 4
34 |         num_head_channels: 32
35 |     first_stage_config:
36 |       target: ldm.models.autoencoder.VQModelInterface
37 |       params:
38 |         embed_dim: 3
39 |         n_embed: 8192
40 |         ddconfig:
41 |           double_z: false
42 |           z_channels: 3
43 |           resolution: 256
44 |           in_channels: 3
45 |           out_ch: 3
46 |           ch: 128
47 |           ch_mult:
48 |           - 1
49 |           - 2
50 |           - 4
51 |           num_res_blocks: 2
52 |           attn_resolutions: []
53 |           dropout: 0.0
54 |         lossconfig:
55 |           target: torch.nn.Identity
56 |     cond_stage_config: __is_unconditional__
57 | data:
58 |   target: main.DataModuleFromConfig
59 |   params:
60 |     batch_size: 42
61 |     num_workers: 5
62 |     wrap: false
63 |     train:
64 |       target: ldm.data.faceshq.FFHQTrain
65 |       params:
66 |         size: 256
67 |     validation:
68 |       target: ldm.data.faceshq.FFHQValidation
69 |       params:
70 |         size: 256
71 | 


--------------------------------------------------------------------------------
/models/ldm/inpainting_big/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 1.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0205
 7 |     log_every_t: 100
 8 |     timesteps: 1000
 9 |     loss_type: l1
10 |     first_stage_key: image
11 |     cond_stage_key: masked_image
12 |     image_size: 64
13 |     channels: 3
14 |     concat_mode: true
15 |     monitor: val/loss
16 |     scheduler_config:
17 |       target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
18 |       params:
19 |         verbosity_interval: 0
20 |         warm_up_steps: 1000
21 |         max_decay_steps: 50000
22 |         lr_start: 0.001
23 |         lr_max: 0.1
24 |         lr_min: 0.0001
25 |     unet_config:
26 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
27 |       params:
28 |         image_size: 64
29 |         in_channels: 7
30 |         out_channels: 3
31 |         model_channels: 256
32 |         attention_resolutions:
33 |         - 8
34 |         - 4
35 |         - 2
36 |         num_res_blocks: 2
37 |         channel_mult:
38 |         - 1
39 |         - 2
40 |         - 3
41 |         - 4
42 |         num_heads: 8
43 |         resblock_updown: true
44 |     first_stage_config:
45 |       target: ldm.models.autoencoder.VQModelInterface
46 |       params:
47 |         embed_dim: 3
48 |         n_embed: 8192
49 |         monitor: val/rec_loss
50 |         ddconfig:
51 |           attn_type: none
52 |           double_z: false
53 |           z_channels: 3
54 |           resolution: 256
55 |           in_channels: 3
56 |           out_ch: 3
57 |           ch: 128
58 |           ch_mult:
59 |           - 1
60 |           - 2
61 |           - 4
62 |           num_res_blocks: 2
63 |           attn_resolutions: []
64 |           dropout: 0.0
65 |         lossconfig:
66 |           target: ldm.modules.losses.contperceptual.DummyLoss
67 |     cond_stage_config: __is_first_stage__
68 | 


--------------------------------------------------------------------------------
/models/ldm/layout2img-openimages256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0205
 7 |     log_every_t: 100
 8 |     timesteps: 1000
 9 |     loss_type: l1
10 |     first_stage_key: image
11 |     cond_stage_key: coordinates_bbox
12 |     image_size: 64
13 |     channels: 3
14 |     conditioning_key: crossattn
15 |     cond_stage_trainable: true
16 |     unet_config:
17 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
18 |       params:
19 |         image_size: 64
20 |         in_channels: 3
21 |         out_channels: 3
22 |         model_channels: 128
23 |         attention_resolutions:
24 |         - 8
25 |         - 4
26 |         - 2
27 |         num_res_blocks: 2
28 |         channel_mult:
29 |         - 1
30 |         - 2
31 |         - 3
32 |         - 4
33 |         num_head_channels: 32
34 |         use_spatial_transformer: true
35 |         transformer_depth: 3
36 |         context_dim: 512
37 |     first_stage_config:
38 |       target: ldm.models.autoencoder.VQModelInterface
39 |       params:
40 |         embed_dim: 3
41 |         n_embed: 8192
42 |         monitor: val/rec_loss
43 |         ddconfig:
44 |           double_z: false
45 |           z_channels: 3
46 |           resolution: 256
47 |           in_channels: 3
48 |           out_ch: 3
49 |           ch: 128
50 |           ch_mult:
51 |           - 1
52 |           - 2
53 |           - 4
54 |           num_res_blocks: 2
55 |           attn_resolutions: []
56 |           dropout: 0.0
57 |         lossconfig:
58 |           target: torch.nn.Identity
59 |     cond_stage_config:
60 |       target: ldm.modules.encoders.modules.BERTEmbedder
61 |       params:
62 |         n_embed: 512
63 |         n_layer: 16
64 |         vocab_size: 8192
65 |         max_seq_len: 92
66 |         use_tokenizer: false
67 |     monitor: val/loss_simple_ema
68 | data:
69 |   target: main.DataModuleFromConfig
70 |   params:
71 |     batch_size: 24
72 |     wrap: false
73 |     num_workers: 10
74 |     train:
75 |       target: ldm.data.openimages.OpenImagesBBoxTrain
76 |       params:
77 |         size: 256
78 |     validation:
79 |       target: ldm.data.openimages.OpenImagesBBoxValidation
80 |       params:
81 |         size: 256
82 | 


--------------------------------------------------------------------------------
/models/ldm/lsun_beds256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: class_label
12 |     image_size: 64
13 |     channels: 3
14 |     cond_stage_trainable: false
15 |     concat_mode: false
16 |     monitor: val/loss
17 |     unet_config:
18 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
19 |       params:
20 |         image_size: 64
21 |         in_channels: 3
22 |         out_channels: 3
23 |         model_channels: 224
24 |         attention_resolutions:
25 |         - 8
26 |         - 4
27 |         - 2
28 |         num_res_blocks: 2
29 |         channel_mult:
30 |         - 1
31 |         - 2
32 |         - 3
33 |         - 4
34 |         num_head_channels: 32
35 |     first_stage_config:
36 |       target: ldm.models.autoencoder.VQModelInterface
37 |       params:
38 |         embed_dim: 3
39 |         n_embed: 8192
40 |         ddconfig:
41 |           double_z: false
42 |           z_channels: 3
43 |           resolution: 256
44 |           in_channels: 3
45 |           out_ch: 3
46 |           ch: 128
47 |           ch_mult:
48 |           - 1
49 |           - 2
50 |           - 4
51 |           num_res_blocks: 2
52 |           attn_resolutions: []
53 |           dropout: 0.0
54 |         lossconfig:
55 |           target: torch.nn.Identity
56 |     cond_stage_config: __is_unconditional__
57 | data:
58 |   target: main.DataModuleFromConfig
59 |   params:
60 |     batch_size: 48
61 |     num_workers: 5
62 |     wrap: false
63 |     train:
64 |       target: ldm.data.lsun.LSUNBedroomsTrain
65 |       params:
66 |         size: 256
67 |     validation:
68 |       target: ldm.data.lsun.LSUNBedroomsValidation
69 |       params:
70 |         size: 256
71 | 


--------------------------------------------------------------------------------
/models/ldm/lsun_churches256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 5.0e-05
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0155
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     loss_type: l1
11 |     first_stage_key: image
12 |     cond_stage_key: image
13 |     image_size: 32
14 |     channels: 4
15 |     cond_stage_trainable: false
16 |     concat_mode: false
17 |     scale_by_std: true
18 |     monitor: val/loss_simple_ema
19 |     scheduler_config:
20 |       target: ldm.lr_scheduler.LambdaLinearScheduler
21 |       params:
22 |         warm_up_steps:
23 |         - 10000
24 |         cycle_lengths:
25 |         - 10000000000000
26 |         f_start:
27 |         - 1.0e-06
28 |         f_max:
29 |         - 1.0
30 |         f_min:
31 |         - 1.0
32 |     unet_config:
33 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
34 |       params:
35 |         image_size: 32
36 |         in_channels: 4
37 |         out_channels: 4
38 |         model_channels: 192
39 |         attention_resolutions:
40 |         - 1
41 |         - 2
42 |         - 4
43 |         - 8
44 |         num_res_blocks: 2
45 |         channel_mult:
46 |         - 1
47 |         - 2
48 |         - 2
49 |         - 4
50 |         - 4
51 |         num_heads: 8
52 |         use_scale_shift_norm: true
53 |         resblock_updown: true
54 |     first_stage_config:
55 |       target: ldm.models.autoencoder.AutoencoderKL
56 |       params:
57 |         embed_dim: 4
58 |         monitor: val/rec_loss
59 |         ddconfig:
60 |           double_z: true
61 |           z_channels: 4
62 |           resolution: 256
63 |           in_channels: 3
64 |           out_ch: 3
65 |           ch: 128
66 |           ch_mult:
67 |           - 1
68 |           - 2
69 |           - 4
70 |           - 4
71 |           num_res_blocks: 2
72 |           attn_resolutions: []
73 |           dropout: 0.0
74 |         lossconfig:
75 |           target: torch.nn.Identity
76 | 
77 |     cond_stage_config: '__is_unconditional__'
78 | 
79 | data:
80 |   target: main.DataModuleFromConfig
81 |   params:
82 |     batch_size: 96
83 |     num_workers: 5
84 |     wrap: false
85 |     train:
86 |       target: ldm.data.lsun.LSUNChurchesTrain
87 |       params:
88 |         size: 256
89 |     validation:
90 |       target: ldm.data.lsun.LSUNChurchesValidation
91 |       params:
92 |         size: 256
93 | 


--------------------------------------------------------------------------------
/models/ldm/semantic_synthesis256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 1.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0205
 7 |     log_every_t: 100
 8 |     timesteps: 1000
 9 |     loss_type: l1
10 |     first_stage_key: image
11 |     cond_stage_key: segmentation
12 |     image_size: 64
13 |     channels: 3
14 |     concat_mode: true
15 |     cond_stage_trainable: true
16 |     unet_config:
17 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
18 |       params:
19 |         image_size: 64
20 |         in_channels: 6
21 |         out_channels: 3
22 |         model_channels: 128
23 |         attention_resolutions:
24 |         - 32
25 |         - 16
26 |         - 8
27 |         num_res_blocks: 2
28 |         channel_mult:
29 |         - 1
30 |         - 4
31 |         - 8
32 |         num_heads: 8
33 |     first_stage_config:
34 |       target: ldm.models.autoencoder.VQModelInterface
35 |       params:
36 |         embed_dim: 3
37 |         n_embed: 8192
38 |         ddconfig:
39 |           double_z: false
40 |           z_channels: 3
41 |           resolution: 256
42 |           in_channels: 3
43 |           out_ch: 3
44 |           ch: 128
45 |           ch_mult:
46 |           - 1
47 |           - 2
48 |           - 4
49 |           num_res_blocks: 2
50 |           attn_resolutions: []
51 |           dropout: 0.0
52 |         lossconfig:
53 |           target: torch.nn.Identity
54 |     cond_stage_config:
55 |       target: ldm.modules.encoders.modules.SpatialRescaler
56 |       params:
57 |         n_stages: 2
58 |         in_channels: 182
59 |         out_channels: 3
60 | 


--------------------------------------------------------------------------------
/models/ldm/semantic_synthesis512/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 1.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0205
 7 |     log_every_t: 100
 8 |     timesteps: 1000
 9 |     loss_type: l1
10 |     first_stage_key: image
11 |     cond_stage_key: segmentation
12 |     image_size: 128
13 |     channels: 3
14 |     concat_mode: true
15 |     cond_stage_trainable: true
16 |     unet_config:
17 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
18 |       params:
19 |         image_size: 128
20 |         in_channels: 6
21 |         out_channels: 3
22 |         model_channels: 128
23 |         attention_resolutions:
24 |         - 32
25 |         - 16
26 |         - 8
27 |         num_res_blocks: 2
28 |         channel_mult:
29 |         - 1
30 |         - 4
31 |         - 8
32 |         num_heads: 8
33 |     first_stage_config:
34 |       target: ldm.models.autoencoder.VQModelInterface
35 |       params:
36 |         embed_dim: 3
37 |         n_embed: 8192
38 |         monitor: val/rec_loss
39 |         ddconfig:
40 |           double_z: false
41 |           z_channels: 3
42 |           resolution: 256
43 |           in_channels: 3
44 |           out_ch: 3
45 |           ch: 128
46 |           ch_mult:
47 |           - 1
48 |           - 2
49 |           - 4
50 |           num_res_blocks: 2
51 |           attn_resolutions: []
52 |           dropout: 0.0
53 |         lossconfig:
54 |           target: torch.nn.Identity
55 |     cond_stage_config:
56 |       target: ldm.modules.encoders.modules.SpatialRescaler
57 |       params:
58 |         n_stages: 2
59 |         in_channels: 182
60 |         out_channels: 3
61 | data:
62 |   target: main.DataModuleFromConfig
63 |   params:
64 |     batch_size: 8
65 |     wrap: false
66 |     num_workers: 10
67 |     train:
68 |       target: ldm.data.landscapes.RFWTrain
69 |       params:
70 |         size: 768
71 |         crop_size: 512
72 |         segmentation_to_float32: true
73 |     validation:
74 |       target: ldm.data.landscapes.RFWValidation
75 |       params:
76 |         size: 768
77 |         crop_size: 512
78 |         segmentation_to_float32: true
79 | 


--------------------------------------------------------------------------------
/models/ldm/text2img256/config.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   base_learning_rate: 2.0e-06
 3 |   target: ldm.models.diffusion.ddpm.LatentDiffusion
 4 |   params:
 5 |     linear_start: 0.0015
 6 |     linear_end: 0.0195
 7 |     num_timesteps_cond: 1
 8 |     log_every_t: 200
 9 |     timesteps: 1000
10 |     first_stage_key: image
11 |     cond_stage_key: caption
12 |     image_size: 64
13 |     channels: 3
14 |     cond_stage_trainable: true
15 |     conditioning_key: crossattn
16 |     monitor: val/loss_simple_ema
17 |     unet_config:
18 |       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
19 |       params:
20 |         image_size: 64
21 |         in_channels: 3
22 |         out_channels: 3
23 |         model_channels: 192
24 |         attention_resolutions:
25 |         - 8
26 |         - 4
27 |         - 2
28 |         num_res_blocks: 2
29 |         channel_mult:
30 |         - 1
31 |         - 2
32 |         - 3
33 |         - 5
34 |         num_head_channels: 32
35 |         use_spatial_transformer: true
36 |         transformer_depth: 1
37 |         context_dim: 640
38 |     first_stage_config:
39 |       target: ldm.models.autoencoder.VQModelInterface
40 |       params:
41 |         embed_dim: 3
42 |         n_embed: 8192
43 |         ddconfig:
44 |           double_z: false
45 |           z_channels: 3
46 |           resolution: 256
47 |           in_channels: 3
48 |           out_ch: 3
49 |           ch: 128
50 |           ch_mult:
51 |           - 1
52 |           - 2
53 |           - 4
54 |           num_res_blocks: 2
55 |           attn_resolutions: []
56 |           dropout: 0.0
57 |         lossconfig:
58 |           target: torch.nn.Identity
59 |     cond_stage_config:
60 |       target: ldm.modules.encoders.modules.BERTEmbedder
61 |       params:
62 |         n_embed: 640
63 |         n_layer: 32
64 | data:
65 |   target: main.DataModuleFromConfig
66 |   params:
67 |     batch_size: 28
68 |     num_workers: 10
69 |     wrap: false
70 |     train:
71 |       target: ldm.data.previews.pytorch_dataset.PreviewsTrain
72 |       params:
73 |         size: 256
74 |     validation:
75 |       target: ldm.data.previews.pytorch_dataset.PreviewsValidation
76 |       params:
77 |         size: 256
78 | 


--------------------------------------------------------------------------------
/notebook_helpers.py:
--------------------------------------------------------------------------------
  1 | from torchvision.datasets.utils import download_url
  2 | from ldm.util import instantiate_from_config
  3 | import torch
  4 | import os
  5 | # todo ?
  6 | from google.colab import files
  7 | from IPython.display import Image as ipyimg
  8 | import ipywidgets as widgets
  9 | from PIL import Image
 10 | from numpy import asarray
 11 | from einops import rearrange, repeat
 12 | import torch, torchvision
 13 | from ldm.models.diffusion.ddim import DDIMSampler
 14 | from ldm.util import ismap
 15 | import time
 16 | from omegaconf import OmegaConf
 17 | 
 18 | 
 19 | def download_models(mode):
 20 | 
 21 |     if mode == "superresolution":
 22 |         # this is the small bsr light model
 23 |         url_conf = 'https://heibox.uni-heidelberg.de/f/31a76b13ea27482981b4/?dl=1'
 24 |         url_ckpt = 'https://heibox.uni-heidelberg.de/f/578df07c8fc04ffbadf3/?dl=1'
 25 | 
 26 |         path_conf = 'logs/diffusion/superresolution_bsr/configs/project.yaml'
 27 |         path_ckpt = 'logs/diffusion/superresolution_bsr/checkpoints/last.ckpt'
 28 | 
 29 |         download_url(url_conf, path_conf)
 30 |         download_url(url_ckpt, path_ckpt)
 31 | 
 32 |         path_conf = path_conf + '/?dl=1' # fix it
 33 |         path_ckpt = path_ckpt + '/?dl=1' # fix it
 34 |         return path_conf, path_ckpt
 35 | 
 36 |     else:
 37 |         raise NotImplementedError
 38 | 
 39 | 
 40 | def load_model_from_config(config, ckpt):
 41 |     print(f"Loading model from {ckpt}")
 42 |     pl_sd = torch.load(ckpt, map_location="cpu")
 43 |     global_step = pl_sd["global_step"]
 44 |     sd = pl_sd["state_dict"]
 45 |     model = instantiate_from_config(config.model)
 46 |     m, u = model.load_state_dict(sd, strict=False)
 47 |     model.cuda()
 48 |     model.eval()
 49 |     return {"model": model}, global_step
 50 | 
 51 | 
 52 | def get_model(mode):
 53 |     path_conf, path_ckpt = download_models(mode)
 54 |     config = OmegaConf.load(path_conf)
 55 |     model, step = load_model_from_config(config, path_ckpt)
 56 |     return model
 57 | 
 58 | 
 59 | def get_custom_cond(mode):
 60 |     dest = "data/example_conditioning"
 61 | 
 62 |     if mode == "superresolution":
 63 |         uploaded_img = files.upload()
 64 |         filename = next(iter(uploaded_img))
 65 |         name, filetype = filename.split(".") # todo assumes just one dot in name !
 66 |         os.rename(f"{filename}", f"{dest}/{mode}/custom_{name}.{filetype}")
 67 | 
 68 |     elif mode == "text_conditional":
 69 |         w = widgets.Text(value='A cake with cream!', disabled=True)
 70 |         display(w)
 71 | 
 72 |         with open(f"{dest}/{mode}/custom_{w.value[:20]}.txt", 'w') as f:
 73 |             f.write(w.value)
 74 | 
 75 |     elif mode == "class_conditional":
 76 |         w = widgets.IntSlider(min=0, max=1000)
 77 |         display(w)
 78 |         with open(f"{dest}/{mode}/custom.txt", 'w') as f:
 79 |             f.write(w.value)
 80 | 
 81 |     else:
 82 |         raise NotImplementedError(f"cond not implemented for mode{mode}")
 83 | 
 84 | 
 85 | def get_cond_options(mode):
 86 |     path = "data/example_conditioning"
 87 |     path = os.path.join(path, mode)
 88 |     onlyfiles = [f for f in sorted(os.listdir(path))]
 89 |     return path, onlyfiles
 90 | 
 91 | 
 92 | def select_cond_path(mode):
 93 |     path = "data/example_conditioning"  # todo
 94 |     path = os.path.join(path, mode)
 95 |     onlyfiles = [f for f in sorted(os.listdir(path))]
 96 | 
 97 |     selected = widgets.RadioButtons(
 98 |         options=onlyfiles,
 99 |         description='Select conditioning:',
100 |         disabled=False
101 |     )
102 |     display(selected)
103 |     selected_path = os.path.join(path, selected.value)
104 |     return selected_path
105 | 
106 | 
def get_cond(mode, selected_path):
    """Build the input batch dict for ``mode`` from the file at ``selected_path``.

    For ``"superresolution"`` the image is displayed, loaded as a tensor and
    stored twice, both in ``1 h w c`` layout:

    * ``"LR_image"``: the loaded image rescaled with ``2 * x - 1`` and moved to CUDA.
    * ``"image"``: the loaded image resized by a factor of 4 (antialiased);
      it is neither rescaled nor moved to CUDA here.

    For any other mode an empty dict is returned.
    """
    example = {}
    if mode != "superresolution":
        return example

    up_f = 4
    visualize_cond_img(selected_path)

    pil_img = Image.open(selected_path)
    low_res = torchvision.transforms.ToTensor()(pil_img).unsqueeze(0)
    target_size = [up_f * low_res.shape[2], up_f * low_res.shape[3]]
    upsampled = torchvision.transforms.functional.resize(low_res, size=target_size, antialias=True)

    upsampled = rearrange(upsampled, '1 c h w -> 1 h w c')
    low_res = rearrange(low_res, '1 c h w -> 1 h w c')
    low_res = 2. * low_res - 1.

    example["LR_image"] = low_res.to(torch.device("cuda"))
    example["image"] = upsampled
    return example
125 | 
126 | 
def visualize_cond_img(path):
    """Display the image file at ``path`` inline in the notebook."""
    preview = ipyimg(filename=path)
    display(preview)
129 | 
130 | 
def run(model, selected_path, task, custom_steps, resize_enabled=False, classifier_ckpt=None, global_step=None):
    """Sample once from ``model`` conditioned on the file at ``selected_path``.

    The conditioning batch is built with ``get_cond(task, selected_path)``.
    When the ``"image"`` entry is at least 128 pixels in both height and
    width, patch-wise processing is switched on by setting
    ``model.split_input_params``; otherwise that attribute is removed if
    present. Sampling uses DDIM with ``custom_steps`` steps and ``eta=1``.

    ``classifier_ckpt`` and ``global_step`` are accepted but not used.

    Returns the log dict produced by ``make_convolutional_sample``.
    """
    example = get_cond(task, selected_path)

    # Fixed settings for the demo run.
    n_runs = 1
    custom_shape = None
    sampling_options = dict(
        mode='ddim', custom_steps=custom_steps,
        eta=1., swap_mode=False, masked=False,
        invert_mask=False, quantize_x0=False,
        custom_schedule=None, decode_interval=10,
        resize_enabled=resize_enabled, custom_shape=custom_shape,
        temperature=1., noise_dropout=0.,
        corrector=None, corrector_kwargs=None,
        save_intermediate_vid=False,
        make_progrow=True, ddim_use_x0_pred=False,
    )

    height, width = example["image"].shape[1:3]
    if height >= 128 and width >= 128:
        patch = 128
        step = 64
        model.split_input_params = {
            "ks": (patch, patch),
            "stride": (step, step),
            "vqf": 4,
            "patch_distributed_vq": True,
            "tie_braker": False,
            "clip_max_weight": 0.5,
            "clip_min_weight": 0.01,
            "clip_max_tie_weight": 0.5,
            "clip_min_tie_weight": 0.01,
        }
    elif hasattr(model, "split_input_params"):
        delattr(model, "split_input_params")

    x_T = None
    for _ in range(n_runs):
        if custom_shape is not None:
            # Draw one noise sample and repeat it across the batch dimension.
            x_T = torch.randn(1, custom_shape[1], custom_shape[2], custom_shape[3]).to(model.device)
            x_T = repeat(x_T, '1 c h w -> b c h w', b=custom_shape[0])

        logs = make_convolutional_sample(example, model, x_T=x_T, **sampling_options)
    return logs
185 | 
186 | 
@torch.no_grad()
def convsample_ddim(model, cond, steps, shape, eta=1.0, callback=None, normals_sequence=None,
                    mask=None, x0=None, quantize_x0=False, img_callback=None,
                    temperature=1., noise_dropout=0., score_corrector=None,
                    corrector_kwargs=None, x_T=None, log_every_t=None
                    ):
    """Run ``DDIMSampler(model).sample`` for ``steps`` steps and return its two outputs.

    ``shape`` includes the batch dimension: its first entry is passed as
    ``batch_size`` and the remainder as the per-sample shape.

    ``img_callback``, ``noise_dropout`` and ``log_every_t`` are accepted but
    not forwarded to the sampler.
    """
    sampler = DDIMSampler(model)
    batch_size, sample_shape = shape[0], shape[1:]  # split off the batch dim
    print(f"Sampling with eta = {eta}; steps: {steps}")

    sampler_kwargs = dict(
        batch_size=batch_size, shape=sample_shape, conditioning=cond, callback=callback,
        normals_sequence=normals_sequence, quantize_x0=quantize_x0, eta=eta,
        mask=mask, x0=x0, temperature=temperature, verbose=False,
        score_corrector=score_corrector,
        corrector_kwargs=corrector_kwargs, x_T=x_T,
    )
    result, trace = sampler.sample(steps, **sampler_kwargs)
    return result, trace
205 | 
206 | 
@torch.no_grad()
def make_convolutional_sample(batch, model, mode="vanilla", custom_steps=None, eta=1.0, swap_mode=False, masked=False,
                              invert_mask=True, quantize_x0=False, custom_schedule=None, decode_interval=1000,
                              resize_enabled=False, custom_shape=None, temperature=1., noise_dropout=0., corrector=None,
                              corrector_kwargs=None, x_T=None, save_intermediate_vid=False, make_progrow=True,ddim_use_x0_pred=False):
    """Draw one conditional DDIM sample for ``batch`` and collect everything in a log dict.

    The batch is encoded through ``model.get_input``; sampling always goes
    through ``convsample_ddim`` with ``custom_steps`` steps, inside the
    model's EMA scope. The sampled latent is decoded with
    ``model.decode_first_stage``.

    ``mode``, ``swap_mode``, ``masked``, ``invert_mask``, ``custom_schedule``,
    ``decode_interval``, ``resize_enabled`` and ``make_progrow`` are accepted
    but not used in this function.

    Returns a dict with the keys ``"input"``, ``"reconstruction"``,
    ``"original_conditioning"``, ``"sample"`` and ``"time"`` (seconds spent in
    the sampler); optionally the conditioning under ``model.cond_stage_key``,
    and ``"sample_noquant"`` / ``"sample_diff"`` when the non-quantized decode
    succeeds.
    """
    log = dict()

    z, c, x, xrec, xc = model.get_input(batch, model.first_stage_key,
                                        return_first_stage_outputs=True,
                                        force_c_encode=not (hasattr(model, 'split_input_params')
                                                            and model.cond_stage_key == 'coordinates_bbox'),
                                        return_original_cond=True)

    log_every_t = 1 if save_intermediate_vid else None

    if custom_shape is not None:
        # Only z.shape is used below, so this just overrides the sample shape.
        z = torch.randn(custom_shape)
        print(f"Generating {custom_shape[0]} samples of shape {custom_shape[1:]}")

    log["input"] = x
    log["reconstruction"] = xrec

    if ismap(xc):
        log["original_conditioning"] = model.to_rgb(xc)
        if hasattr(model, 'cond_stage_key'):
            log[model.cond_stage_key] = model.to_rgb(xc)

    else:
        log["original_conditioning"] = xc if xc is not None else torch.zeros_like(x)
        if model.cond_stage_model:
            log[model.cond_stage_key] = xc if xc is not None else torch.zeros_like(x)
            if model.cond_stage_key =='class_label':
                log[model.cond_stage_key] = xc[model.cond_stage_key]

    with model.ema_scope("Plotting"):
        t0 = time.time()

        sample, intermediates = convsample_ddim(model, c, steps=custom_steps, shape=z.shape,
                                                eta=eta,
                                                quantize_x0=quantize_x0, img_callback=None, mask=None, x0=None,
                                                temperature=temperature, noise_dropout=noise_dropout,
                                                score_corrector=corrector, corrector_kwargs=corrector_kwargs,
                                                x_T=x_T, log_every_t=log_every_t)
        t1 = time.time()

        if ddim_use_x0_pred:
            # Use the last x0 prediction instead of the final sampler state.
            sample = intermediates['pred_x0'][-1]

    x_sample = model.decode_first_stage(sample)

    try:
        x_sample_noquant = model.decode_first_stage(sample, force_not_quantize=True)
        log["sample_noquant"] = x_sample_noquant
        log["sample_diff"] = torch.abs(x_sample_noquant - x_sample)
    except Exception:
        # Best effort: the non-quantized decode is optional extra logging.
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate.
        pass

    log["sample"] = x_sample
    log["time"] = t1 - t0

    return log


--------------------------------------------------------------------------------
/scripts/download_first_stages.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | wget -O models/first_stage_models/kl-f4/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f4.zip
 3 | wget -O models/first_stage_models/kl-f8/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f8.zip
 4 | wget -O models/first_stage_models/kl-f16/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f16.zip
 5 | wget -O models/first_stage_models/kl-f32/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f32.zip
 6 | wget -O models/first_stage_models/vq-f4/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4.zip
 7 | wget -O models/first_stage_models/vq-f4-noattn/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4-noattn.zip
 8 | wget -O models/first_stage_models/vq-f8/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8.zip
 9 | wget -O models/first_stage_models/vq-f8-n256/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8-n256.zip
10 | wget -O models/first_stage_models/vq-f16/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f16.zip
11 | 
12 | 
13 | 
14 | cd models/first_stage_models/kl-f4
15 | unzip -o model.zip
16 | 
17 | cd ../kl-f8
18 | unzip -o model.zip
19 | 
20 | cd ../kl-f16
21 | unzip -o model.zip
22 | 
23 | cd ../kl-f32
24 | unzip -o model.zip
25 | 
26 | cd ../vq-f4
27 | unzip -o model.zip
28 | 
29 | cd ../vq-f4-noattn
30 | unzip -o model.zip
31 | 
32 | cd ../vq-f8
33 | unzip -o model.zip
34 | 
35 | cd ../vq-f8-n256
36 | unzip -o model.zip
37 | 
38 | cd ../vq-f16
39 | unzip -o model.zip
40 | 
41 | cd ../..


--------------------------------------------------------------------------------
/scripts/download_models.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | wget -O models/ldm/celeba256/celeba-256.zip https://ommer-lab.com/files/latent-diffusion/celeba.zip
 3 | wget -O models/ldm/ffhq256/ffhq-256.zip https://ommer-lab.com/files/latent-diffusion/ffhq.zip
 4 | wget -O models/ldm/lsun_churches256/lsun_churches-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_churches.zip
 5 | wget -O models/ldm/lsun_beds256/lsun_beds-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_bedrooms.zip
 6 | wget -O models/ldm/text2img256/model.zip https://ommer-lab.com/files/latent-diffusion/text2img.zip
 7 | wget -O models/ldm/cin256/model.zip https://ommer-lab.com/files/latent-diffusion/cin.zip
 8 | wget -O models/ldm/semantic_synthesis512/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis.zip
 9 | wget -O models/ldm/semantic_synthesis256/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis256.zip
10 | wget -O models/ldm/bsr_sr/model.zip https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip
11 | wget -O models/ldm/layout2img-openimages256/model.zip https://ommer-lab.com/files/latent-diffusion/layout2img_model.zip
12 | wget -O models/ldm/inpainting_big/model.zip https://ommer-lab.com/files/latent-diffusion/inpainting_big.zip
13 | 
14 | 
15 | 
16 | cd models/ldm/celeba256
17 | unzip -o celeba-256.zip
18 | 
19 | cd ../ffhq256
20 | unzip -o ffhq-256.zip
21 | 
22 | cd ../lsun_churches256
23 | unzip -o lsun_churches-256.zip
24 | 
25 | cd ../lsun_beds256
26 | unzip -o lsun_beds-256.zip
27 | 
28 | cd ../text2img256
29 | unzip -o model.zip
30 | 
31 | cd ../cin256
32 | unzip -o model.zip
33 | 
34 | cd ../semantic_synthesis512
35 | unzip -o model.zip
36 | 
37 | cd ../semantic_synthesis256
38 | unzip -o model.zip
39 | 
40 | cd ../bsr_sr
41 | unzip -o model.zip
42 | 
43 | cd ../layout2img-openimages256
44 | unzip -o model.zip
45 | 
46 | cd ../inpainting_big
47 | unzip -o model.zip
48 | 
49 | cd ../..
50 | 


--------------------------------------------------------------------------------
/scripts/inpaint.py:
--------------------------------------------------------------------------------
 1 | import argparse, os, sys, glob
 2 | from omegaconf import OmegaConf
 3 | from PIL import Image
 4 | from tqdm import tqdm
 5 | import numpy as np
 6 | import torch
 7 | from main import instantiate_from_config
 8 | from ldm.models.diffusion.ddim import DDIMSampler
 9 | 
10 | 
def make_batch(image, mask, device):
    """Load an image/mask pair from disk and return the inpainting batch dict.

    ``image`` is read as RGB and scaled to [0, 1]; ``mask`` is read as
    grayscale, scaled to [0, 1] and binarized at 0.5. The masked image is
    ``(1 - mask) * image``, i.e. pixels where the mask is 1 are zeroed.

    Every entry (``"image"``, ``"mask"``, ``"masked_image"``) is moved to
    ``device`` and then mapped with ``x * 2 - 1`` -- this includes the mask.
    """
    rgb = np.array(Image.open(image).convert("RGB")).astype(np.float32) / 255.0
    # add a batch axis, then HWC -> CHW
    image_t = torch.from_numpy(rgb[None].transpose(0, 3, 1, 2))

    gray = np.array(Image.open(mask).convert("L")).astype(np.float32) / 255.0
    binary = (gray[None, None] >= 0.5).astype(np.float32)
    mask_t = torch.from_numpy(binary)

    tensors = {
        "image": image_t,
        "mask": mask_t,
        "masked_image": (1 - mask_t) * image_t,
    }
    return {name: t.to(device=device) * 2.0 - 1.0 for name, t in tensors.items()}
31 | 
32 | 
if __name__ == "__main__":
    # Inpaint every `<name>.png` / `<name>_mask.png` pair found in --indir with
    # the `inpainting_big` model and write the results to --outdir.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--indir",
        type=str,
        nargs="?",
        help="dir containing image-mask pairs (`example.png` and `example_mask.png`)",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        nargs="?",
        help="dir to write results to",
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=50,
        help="number of ddim sampling steps",
    )
    opt = parser.parse_args()

    # Masks drive the discovery; the image path is derived from the mask path.
    masks = sorted(glob.glob(os.path.join(opt.indir, "*_mask.png")))
    images = [x.replace("_mask.png", ".png") for x in masks]
    print(f"Found {len(masks)} inputs.")

    config = OmegaConf.load("models/ldm/inpainting_big/config.yaml")
    model = instantiate_from_config(config.model)
    # map_location="cpu": without it a checkpoint saved from GPU tensors cannot
    # be loaded on a CPU-only host, which would defeat the CPU fallback below.
    model.load_state_dict(torch.load("models/ldm/inpainting_big/last.ckpt",
                                     map_location="cpu")["state_dict"],
                          strict=False)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    sampler = DDIMSampler(model)

    os.makedirs(opt.outdir, exist_ok=True)
    with torch.no_grad():
        with model.ema_scope():
            # zip() has no length, so give tqdm the total explicitly.
            for image, mask in tqdm(zip(images, masks), total=len(masks)):
                outpath = os.path.join(opt.outdir, os.path.split(image)[1])
                batch = make_batch(image, mask, device=device)

                # encode masked image and concat downsampled mask
                c = model.cond_stage_model.encode(batch["masked_image"])
                cc = torch.nn.functional.interpolate(batch["mask"],
                                                     size=c.shape[-2:])
                c = torch.cat((c, cc), dim=1)

                # sample shape: conditioning shape minus the one mask channel
                shape = (c.shape[1]-1,)+c.shape[2:]
                samples_ddim, _ = sampler.sample(S=opt.steps,
                                                 conditioning=c,
                                                 batch_size=c.shape[0],
                                                 shape=shape,
                                                 verbose=False)
                x_samples_ddim = model.decode_first_stage(samples_ddim)

                # undo the x*2-1 scaling applied by make_batch (mask included)
                image = torch.clamp((batch["image"]+1.0)/2.0,
                                    min=0.0, max=1.0)
                mask = torch.clamp((batch["mask"]+1.0)/2.0,
                                   min=0.0, max=1.0)
                predicted_image = torch.clamp((x_samples_ddim+1.0)/2.0,
                                              min=0.0, max=1.0)

                # keep original pixels where mask == 0, use the prediction where mask == 1
                inpainted = (1-mask)*image+mask*predicted_image
                inpainted = inpainted.cpu().numpy().transpose(0,2,3,1)[0]*255
                Image.fromarray(inpainted.astype(np.uint8)).save(outpath)
99 | 


--------------------------------------------------------------------------------
/scripts/sample_diffusion.py:
--------------------------------------------------------------------------------
  1 | import argparse, os, sys, glob, datetime, yaml
  2 | import torch
  3 | import time
  4 | import numpy as np
  5 | from tqdm import trange
  6 | 
  7 | from omegaconf import OmegaConf
  8 | from PIL import Image
  9 | 
 10 | from ldm.models.diffusion.ddim import DDIMSampler
 11 | from ldm.util import instantiate_from_config
 12 | 
 13 | rescale = lambda x: (x + 1.) / 2.
 14 | 
 15 | def custom_to_pil(x):
 16 |     x = x.detach().cpu()
 17 |     x = torch.clamp(x, -1., 1.)
 18 |     x = (x + 1.) / 2.
 19 |     x = x.permute(1, 2, 0).numpy()
 20 |     x = (255 * x).astype(np.uint8)
 21 |     x = Image.fromarray(x)
 22 |     if not x.mode == "RGB":
 23 |         x = x.convert("RGB")
 24 |     return x
 25 | 
 26 | 
 27 | def custom_to_np(x):
 28 |     # saves the batch in adm style as in https://github.com/openai/guided-diffusion/blob/main/scripts/image_sample.py
 29 |     sample = x.detach().cpu()
 30 |     sample = ((sample + 1) * 127.5).clamp(0, 255).to(torch.uint8)
 31 |     sample = sample.permute(0, 2, 3, 1)
 32 |     sample = sample.contiguous()
 33 |     return sample
 34 | 
 35 | 
 36 | def logs2pil(logs, keys=["sample"]):
 37 |     imgs = dict()
 38 |     for k in logs:
 39 |         try:
 40 |             if len(logs[k].shape) == 4:
 41 |                 img = custom_to_pil(logs[k][0, ...])
 42 |             elif len(logs[k].shape) == 3:
 43 |                 img = custom_to_pil(logs[k])
 44 |             else:
 45 |                 print(f"Unknown format for key {k}. ")
 46 |                 img = None
 47 |         except:
 48 |             img = None
 49 |         imgs[k] = img
 50 |     return imgs
 51 | 
 52 | 
 53 | @torch.no_grad()
 54 | def convsample(model, shape, return_intermediates=True,
 55 |                verbose=True,
 56 |                make_prog_row=False):
 57 | 
 58 | 
 59 |     if not make_prog_row:
 60 |         return model.p_sample_loop(None, shape,
 61 |                                    return_intermediates=return_intermediates, verbose=verbose)
 62 |     else:
 63 |         return model.progressive_denoising(
 64 |             None, shape, verbose=True
 65 |         )
 66 | 
 67 | 
 68 | @torch.no_grad()
 69 | def convsample_ddim(model, steps, shape, eta=1.0
 70 |                     ):
 71 |     ddim = DDIMSampler(model)
 72 |     bs = shape[0]
 73 |     shape = shape[1:]
 74 |     samples, intermediates = ddim.sample(steps, batch_size=bs, shape=shape, eta=eta, verbose=False,)
 75 |     return samples, intermediates
 76 | 
 77 | 
 78 | @torch.no_grad()
 79 | def make_convolutional_sample(model, batch_size, vanilla=False, custom_steps=None, eta=1.0,):
 80 | 
 81 | 
 82 |     log = dict()
 83 | 
 84 |     shape = [batch_size,
 85 |              model.model.diffusion_model.in_channels,
 86 |              model.model.diffusion_model.image_size,
 87 |              model.model.diffusion_model.image_size]
 88 | 
 89 |     with model.ema_scope("Plotting"):
 90 |         t0 = time.time()
 91 |         if vanilla:
 92 |             sample, progrow = convsample(model, shape,
 93 |                                          make_prog_row=True)
 94 |         else:
 95 |             sample, intermediates = convsample_ddim(model,  steps=custom_steps, shape=shape,
 96 |                                                     eta=eta)
 97 | 
 98 |         t1 = time.time()
 99 | 
100 |     x_sample = model.decode_first_stage(sample)
101 | 
102 |     log["sample"] = x_sample
103 |     log["time"] = t1 - t0
104 |     log['throughput'] = sample.shape[0] / (t1 - t0)
105 |     print(f'Throughput for this batch: {log["throughput"]}')
106 |     return log
107 | 
def run(model, logdir, batch_size=50, vanilla=False, custom_steps=None, eta=None, n_samples=50000, nplog=None):
    """Sample images from an unconditional model, saving PNGs and one ``.npz``.

    Each batch is written as PNG files into ``logdir`` via ``save_logs``; all
    batches are additionally concatenated, cut to ``n_samples`` and stored as
    ``<shape>-samples.npz`` in ``nplog``.

    Raises:
        NotImplementedError: if the model has a conditioning stage.
    """
    if vanilla:
        print(f'Using Vanilla DDPM sampling with {model.num_timesteps} sampling steps.')
    else:
        print(f'Using DDIM sampling with {custom_steps} sampling steps and eta={eta}')

    tstart = time.time()
    # NOTE(review): the counter starts at (number of existing PNGs - 1), so an
    # empty logdir starts at -1 -- confirm whether this offset is intended.
    n_saved = len(glob.glob(os.path.join(logdir, '*.png'))) - 1
    # path = logdir
    if model.cond_stage_model is not None:
        raise NotImplementedError('Currently only sampling for unconditional models supported.')

    collected = []

    print(f"Running unconditional sampling for {n_samples} samples")
    # NOTE(review): floor division -- with n_samples < batch_size no batch is
    # drawn and the concatenation below fails on an empty list; verify.
    n_batches = n_samples // batch_size
    for _ in trange(n_batches, desc="Sampling Batches (unconditional)"):
        logs = make_convolutional_sample(model, batch_size=batch_size,
                                         vanilla=vanilla, custom_steps=custom_steps,
                                         eta=eta)
        n_saved = save_logs(logs, logdir, n_saved=n_saved, key="sample")
        collected.append(custom_to_np(logs["sample"]))
        if n_saved >= n_samples:
            print(f'Finish after generating {n_saved} samples')
            break

    all_img = np.concatenate(collected, axis=0)[:n_samples]
    shape_str = "x".join(str(dim) for dim in all_img.shape)
    np.savez(os.path.join(nplog, f"{shape_str}-samples.npz"), all_img)

    print(f"sampling of {n_saved} images finished in {(time.time() - tstart) / 60.:.2f} minutes.")
141 | 
142 | 
def save_logs(logs, path, n_saved=0, key="sample", np_path=None):
    """Write the batch stored under ``logs[key]`` to disk and return the updated counter.

    Without ``np_path`` every element of the batch is saved as
    ``<path>/<key>_<counter>.png`` (counter zero-padded to six digits) and the
    counter grows by one per image. With ``np_path`` the whole batch is saved
    as a single ``.npz`` file there and the counter grows by the batch size.

    If ``key`` is not in ``logs`` nothing is written and ``n_saved`` is
    returned unchanged.
    """
    if key not in logs:
        return n_saved

    batch = logs[key]
    if np_path is not None:
        npbatch = custom_to_np(batch)
        shape_str = "x".join(str(dim) for dim in npbatch.shape)
        np.savez(os.path.join(np_path, f"{n_saved}-{shape_str}-samples.npz"), npbatch)
        return n_saved + npbatch.shape[0]

    for x in batch:
        custom_to_pil(x).save(os.path.join(path, f"{key}_{n_saved:06}.png"))
        n_saved += 1
    return n_saved
160 | 
161 | 
def get_parser():
    """Build the command-line parser for the sampling script.

    Options: ``-r/--resume``, ``-n/--n_samples``, ``-e/--eta``,
    ``-v/--vanilla_sample``, ``-l/--logdir``, ``-c/--custom_steps`` and
    ``--batch_size``.
    """
    # (flags, keyword arguments) in the order they are registered.
    option_table = [
        (("-r", "--resume"),
         dict(type=str, nargs="?",
              help="load from logdir or checkpoint in logdir")),
        (("-n", "--n_samples"),
         dict(type=int, nargs="?",
              help="number of samples to draw",
              default=50000)),
        (("-e", "--eta"),
         dict(type=float, nargs="?",
              help="eta for ddim sampling (0.0 yields deterministic sampling)",
              default=1.0)),
        (("-v", "--vanilla_sample"),
         dict(default=False, action='store_true',
              help="vanilla sampling (default option is DDIM sampling)?")),
        (("-l", "--logdir"),
         dict(type=str, nargs="?",
              help="extra logdir",
              default="none")),
        (("-c", "--custom_steps"),
         dict(type=int, nargs="?",
              help="number of steps for ddim and fastdpm sampling",
              default=50)),
        (("--batch_size",),
         dict(type=int, nargs="?",
              help="the bs",
              default=10)),
    ]

    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser
218 | 
219 | 
def load_model_from_config(config, sd):
    """Instantiate the model described by ``config`` and optionally load weights.

    Args:
        config: config node passed to ``instantiate_from_config``.
        sd: state dict to load non-strictly, or ``None`` to keep the freshly
            initialised weights.

    Returns:
        The model, moved to CUDA and switched to eval mode.
    """
    model = instantiate_from_config(config)
    # A caller without a checkpoint passes None; load_state_dict(None) would
    # crash, so loading is skipped in that case.
    if sd is not None:
        model.load_state_dict(sd, strict=False)
    model.cuda()
    model.eval()
    return model
226 | 
227 | 
def load_model(config, ckpt, gpu, eval_mode):
    """Load the model for ``config.model``, reading weights from ``ckpt`` if given.

    With a falsy ``ckpt`` no file is read: ``None`` is handed on as the state
    dict and the returned global step is ``None``.

    ``gpu`` and ``eval_mode`` are accepted but not used here.

    Returns ``(model, global_step)``.
    """
    global_step = None
    state_dict = None
    if ckpt:
        print(f"Loading model from {ckpt}")
        checkpoint = torch.load(ckpt, map_location="cpu")
        global_step = checkpoint["global_step"]
        state_dict = checkpoint["state_dict"]

    model = load_model_from_config(config.model, state_dict)
    return model, global_step
240 | 
241 | 
if __name__ == "__main__":
    # Timestamp that becomes the last component of this run's output directory.
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    sys.path.append(os.getcwd())
    command = " ".join(sys.argv)

    parser = get_parser()
    # Arguments the parser does not know are kept and merged into the config
    # below as dotlist overrides.
    opt, unknown = parser.parse_known_args()
    ckpt = None

    # Without -r/--resume, opt.resume is None and os.path.exists() would fail
    # with an opaque TypeError; report the actual problem instead.
    if opt.resume is None:
        raise ValueError("No logdir or checkpoint given; please pass -r/--resume")
    if not os.path.exists(opt.resume):
        raise ValueError("Cannot find {}".format(opt.resume))
    if os.path.isfile(opt.resume):
        # A checkpoint file was given: everything before the last '/' is the logdir.
        logdir = '/'.join(opt.resume.split('/')[:-1])
        print(f'Logdir is {logdir}')
        ckpt = opt.resume
    else:
        assert os.path.isdir(opt.resume), f"{opt.resume} is not a directory"
        # A directory was given: the checkpoint is expected at <logdir>/model.ckpt.
        logdir = opt.resume.rstrip("/")
        ckpt = os.path.join(logdir, "model.ckpt")

    # The model config is read from <logdir>/config.yaml (if present).
    base_configs = sorted(glob.glob(os.path.join(logdir, "config.yaml")))
    opt.base = base_configs

    configs = [OmegaConf.load(cfg) for cfg in opt.base]
    cli = OmegaConf.from_dotlist(unknown)
    config = OmegaConf.merge(*configs, cli)

    gpu = True
    eval_mode = True

    if opt.logdir != "none":
        # Redirect the output to <opt.logdir>/<last component of the original logdir>.
        locallog = logdir.split(os.sep)[-1]
        if locallog == "": locallog = logdir.split(os.sep)[-2]
        print(f"Switching logdir from '{logdir}' to '{os.path.join(opt.logdir, locallog)}'")
        logdir = os.path.join(opt.logdir, locallog)

    print(config)

    model, global_step = load_model(config, ckpt, gpu, eval_mode)
    print(f"global step: {global_step}")
    print(75 * "=")
    print("logging to:")
    # Output layout: <logdir>/samples/<zero-padded global step>/<timestamp>/{img,numpy}
    logdir = os.path.join(logdir, "samples", f"{global_step:08}", now)
    imglogdir = os.path.join(logdir, "img")
    numpylogdir = os.path.join(logdir, "numpy")

    os.makedirs(imglogdir)
    os.makedirs(numpylogdir)
    print(logdir)
    print(75 * "=")

    # write config out
    sampling_file = os.path.join(logdir, "sampling_config.yaml")
    sampling_conf = vars(opt)

    with open(sampling_file, 'w') as f:
        yaml.dump(sampling_conf, f, default_flow_style=False)
    print(sampling_conf)


    run(model, imglogdir, eta=opt.eta,
        vanilla=opt.vanilla_sample,  n_samples=opt.n_samples, custom_steps=opt.custom_steps,
        batch_size=opt.batch_size, nplog=numpylogdir)

    print("done.")
314 | 


--------------------------------------------------------------------------------
/scripts/train_searcher.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | import numpy as np
  3 | import scann
  4 | import argparse
  5 | import glob
  6 | from multiprocessing import cpu_count
  7 | from tqdm import tqdm
  8 | 
  9 | from ldm.util import parallel_data_prefetch
 10 | 
 11 | 
 12 | def search_bruteforce(searcher):
 13 |     return searcher.score_brute_force().build()
 14 | 
 15 | 
 16 | def search_partioned_ah(searcher, dims_per_block, aiq_threshold, reorder_k,
 17 |                         partioning_trainsize, num_leaves, num_leaves_to_search):
 18 |     return searcher.tree(num_leaves=num_leaves,
 19 |                          num_leaves_to_search=num_leaves_to_search,
 20 |                          training_sample_size=partioning_trainsize). \
 21 |         score_ah(dims_per_block, anisotropic_quantization_threshold=aiq_threshold).reorder(reorder_k).build()
 22 | 
 23 | 
 24 | def search_ah(searcher, dims_per_block, aiq_threshold, reorder_k):
 25 |     return searcher.score_ah(dims_per_block, anisotropic_quantization_threshold=aiq_threshold).reorder(
 26 |         reorder_k).build()
 27 | 
 28 | def load_datapool(dpath):
 29 | 
 30 | 
 31 |     def load_single_file(saved_embeddings):
 32 |         compressed = np.load(saved_embeddings)
 33 |         database = {key: compressed[key] for key in compressed.files}
 34 |         return database
 35 | 
 36 |     def load_multi_files(data_archive):
 37 |         database = {key: [] for key in data_archive[0].files}
 38 |         for d in tqdm(data_archive, desc=f'Loading datapool from {len(data_archive)} individual files.'):
 39 |             for key in d.files:
 40 |                 database[key].append(d[key])
 41 | 
 42 |         return database
 43 | 
 44 |     print(f'Load saved patch embedding from "{dpath}"')
 45 |     file_content = glob.glob(os.path.join(dpath, '*.npz'))
 46 | 
 47 |     if len(file_content) == 1:
 48 |         data_pool = load_single_file(file_content[0])
 49 |     elif len(file_content) > 1:
 50 |         data = [np.load(f) for f in file_content]
 51 |         prefetched_data = parallel_data_prefetch(load_multi_files, data,
 52 |                                                  n_proc=min(len(data), cpu_count()), target_data_type='dict')
 53 | 
 54 |         data_pool = {key: np.concatenate([od[key] for od in prefetched_data], axis=1)[0] for key in prefetched_data[0].keys()}
 55 |     else:
 56 |         raise ValueError(f'No npz-files in specified path "{dpath}" is this directory existing?')
 57 | 
 58 |     print(f'Finished loading of retrieval database of length {data_pool["embedding"].shape[0]}.')
 59 |     return data_pool
 60 | 
 61 | 
 62 | def train_searcher(opt,
 63 |                    metric='dot_product',
 64 |                    partioning_trainsize=None,
 65 |                    reorder_k=None,
 66 |                    # todo tune
 67 |                    aiq_thld=0.2,
 68 |                    dims_per_block=2,
 69 |                    num_leaves=None,
 70 |                    num_leaves_to_search=None,):
 71 | 
 72 |     data_pool = load_datapool(opt.database)
 73 |     k = opt.knn
 74 | 
 75 |     if not reorder_k:
 76 |         reorder_k = 2 * k
 77 | 
 78 |     # normalize
 79 |     # embeddings =
 80 |     searcher = scann.scann_ops_pybind.builder(data_pool['embedding'] / np.linalg.norm(data_pool['embedding'], axis=1)[:, np.newaxis], k, metric)
 81 |     pool_size = data_pool['embedding'].shape[0]
 82 | 
 83 |     print(*(['#'] * 100))
 84 |     print('Initializing scaNN searcher with the following values:')
 85 |     print(f'k: {k}')
 86 |     print(f'metric: {metric}')
 87 |     print(f'reorder_k: {reorder_k}')
 88 |     print(f'anisotropic_quantization_threshold: {aiq_thld}')
 89 |     print(f'dims_per_block: {dims_per_block}')
 90 |     print(*(['#'] * 100))
 91 |     print('Start training searcher....')
 92 |     print(f'N samples in pool is {pool_size}')
 93 | 
 94 |     # this reflects the recommended design choices proposed at
 95 |     # https://github.com/google-research/google-research/blob/aca5f2e44e301af172590bb8e65711f0c9ee0cfd/scann/docs/algorithms.md
 96 |     if pool_size < 2e4:
 97 |         print('Using brute force search.')
 98 |         searcher = search_bruteforce(searcher)
 99 |     elif 2e4 <= pool_size and pool_size < 1e5:
100 |         print('Using asymmetric hashing search and reordering.')
101 |         searcher = search_ah(searcher, dims_per_block, aiq_thld, reorder_k)
102 |     else:
103 |         print('Using using partioning, asymmetric hashing search and reordering.')
104 | 
105 |         if not partioning_trainsize:
106 |             partioning_trainsize = data_pool['embedding'].shape[0] // 10
107 |         if not num_leaves:
108 |             num_leaves = int(np.sqrt(pool_size))
109 | 
110 |         if not num_leaves_to_search:
111 |             num_leaves_to_search = max(num_leaves // 20, 1)
112 | 
113 |         print('Partitioning params:')
114 |         print(f'num_leaves: {num_leaves}')
115 |         print(f'num_leaves_to_search: {num_leaves_to_search}')
116 |         # self.searcher = self.search_ah(searcher, dims_per_block, aiq_thld, reorder_k)
117 |         searcher = search_partioned_ah(searcher, dims_per_block, aiq_thld, reorder_k,
118 |                                                  partioning_trainsize, num_leaves, num_leaves_to_search)
119 | 
120 |     print('Finish training searcher')
121 |     searcher_savedir = opt.target_path
122 |     os.makedirs(searcher_savedir, exist_ok=True)
123 |     searcher.serialize(searcher_savedir)
124 |     print(f'Saved trained searcher under "{searcher_savedir}"')
125 | 
if __name__ == '__main__':
    sys.path.append(os.getcwd())

    # (option strings, default, type, help) for every command line option.
    cli_options = (
        (('--database', '-d'),
         'data/rdm/retrieval_databases/openimages',
         str,
         'path to folder containing the clip feature of the database'),
        (('--target_path', '-t'),
         'data/rdm/searchers/openimages',
         str,
         'path to the target folder where the searcher shall be stored.'),
        (('--knn', '-k'),
         20,
         int,
         'number of nearest neighbors, for which the searcher shall be optimized'),
    )

    parser = argparse.ArgumentParser()
    for flags, default_value, value_type, description in cli_options:
        parser.add_argument(*flags, default=default_value, type=value_type, help=description)

    # Unrecognized arguments are tolerated and discarded.
    opt, _ = parser.parse_known_args()

    train_searcher(opt)


--------------------------------------------------------------------------------
/scripts/txt2img.py:
--------------------------------------------------------------------------------
  1 | import argparse, os, sys, glob
  2 | import torch
  3 | import numpy as np
  4 | from omegaconf import OmegaConf
  5 | from PIL import Image
  6 | from tqdm import tqdm, trange
  7 | from einops import rearrange
  8 | from torchvision.utils import make_grid
  9 | 
 10 | from ldm.util import instantiate_from_config
 11 | from ldm.models.diffusion.ddim import DDIMSampler
 12 | from ldm.models.diffusion.plms import PLMSSampler
 13 | 
 14 | 
 15 | def load_model_from_config(config, ckpt, verbose=False):
 16 |     print(f"Loading model from {ckpt}")
 17 |     pl_sd = torch.load(ckpt, map_location="cpu")
 18 |     sd = pl_sd["state_dict"]
 19 |     model = instantiate_from_config(config.model)
 20 |     m, u = model.load_state_dict(sd, strict=False)
 21 |     if len(m) > 0 and verbose:
 22 |         print("missing keys:")
 23 |         print(m)
 24 |     if len(u) > 0 and verbose:
 25 |         print("unexpected keys:")
 26 |         print(u)
 27 | 
 28 |     model.cuda()
 29 |     model.eval()
 30 |     return model
 31 | 
 32 | 
 33 | if __name__ == "__main__":
 34 |     parser = argparse.ArgumentParser()
 35 | 
 36 |     parser.add_argument(
 37 |         "--prompt",
 38 |         type=str,
 39 |         nargs="?",
 40 |         default="a painting of a virus monster playing guitar",
 41 |         help="the prompt to render"
 42 |     )
 43 | 
 44 |     parser.add_argument(
 45 |         "--outdir",
 46 |         type=str,
 47 |         nargs="?",
 48 |         help="dir to write results to",
 49 |         default="outputs/txt2img-samples"
 50 |     )
 51 |     parser.add_argument(
 52 |         "--ddim_steps",
 53 |         type=int,
 54 |         default=200,
 55 |         help="number of ddim sampling steps",
 56 |     )
 57 | 
 58 |     parser.add_argument(
 59 |         "--plms",
 60 |         action='store_true',
 61 |         help="use plms sampling",
 62 |     )
 63 | 
 64 |     parser.add_argument(
 65 |         "--ddim_eta",
 66 |         type=float,
 67 |         default=0.0,
 68 |         help="ddim eta (eta=0.0 corresponds to deterministic sampling",
 69 |     )
 70 |     parser.add_argument(
 71 |         "--n_iter",
 72 |         type=int,
 73 |         default=1,
 74 |         help="sample this often",
 75 |     )
 76 | 
 77 |     parser.add_argument(
 78 |         "--H",
 79 |         type=int,
 80 |         default=256,
 81 |         help="image height, in pixel space",
 82 |     )
 83 | 
 84 |     parser.add_argument(
 85 |         "--W",
 86 |         type=int,
 87 |         default=256,
 88 |         help="image width, in pixel space",
 89 |     )
 90 | 
 91 |     parser.add_argument(
 92 |         "--n_samples",
 93 |         type=int,
 94 |         default=4,
 95 |         help="how many samples to produce for the given prompt",
 96 |     )
 97 | 
 98 |     parser.add_argument(
 99 |         "--scale",
100 |         type=float,
101 |         default=5.0,
102 |         help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
103 |     )
104 |     opt = parser.parse_args()
105 | 
106 | 
107 |     config = OmegaConf.load("configs/latent-diffusion/txt2img-1p4B-eval.yaml")  # TODO: Optionally download from same location as ckpt and chnage this logic
108 |     model = load_model_from_config(config, "models/ldm/text2img-large/model.ckpt")  # TODO: check path
109 | 
110 |     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
111 |     model = model.to(device)
112 | 
113 |     if opt.plms:
114 |         sampler = PLMSSampler(model)
115 |     else:
116 |         sampler = DDIMSampler(model)
117 | 
118 |     os.makedirs(opt.outdir, exist_ok=True)
119 |     outpath = opt.outdir
120 | 
121 |     prompt = opt.prompt
122 | 
123 | 
124 |     sample_path = os.path.join(outpath, "samples")
125 |     os.makedirs(sample_path, exist_ok=True)
126 |     base_count = len(os.listdir(sample_path))
127 | 
128 |     all_samples=list()
129 |     with torch.no_grad():
130 |         with model.ema_scope():
131 |             uc = None
132 |             if opt.scale != 1.0:
133 |                 uc = model.get_learned_conditioning(opt.n_samples * [""])
134 |             for n in trange(opt.n_iter, desc="Sampling"):
135 |                 c = model.get_learned_conditioning(opt.n_samples * [prompt])
136 |                 shape = [4, opt.H//8, opt.W//8]
137 |                 samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
138 |                                                  conditioning=c,
139 |                                                  batch_size=opt.n_samples,
140 |                                                  shape=shape,
141 |                                                  verbose=False,
142 |                                                  unconditional_guidance_scale=opt.scale,
143 |                                                  unconditional_conditioning=uc,
144 |                                                  eta=opt.ddim_eta)
145 | 
146 |                 x_samples_ddim = model.decode_first_stage(samples_ddim)
147 |                 x_samples_ddim = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0)
148 | 
149 |                 for x_sample in x_samples_ddim:
150 |                     x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
151 |                     Image.fromarray(x_sample.astype(np.uint8)).save(os.path.join(sample_path, f"{base_count:04}.png"))
152 |                     base_count += 1
153 |                 all_samples.append(x_samples_ddim)
154 | 
155 | 
156 |     # additionally, save as grid
157 |     grid = torch.stack(all_samples, 0)
158 |     grid = rearrange(grid, 'n b c h w -> (n b) c h w')
159 |     grid = make_grid(grid, nrow=opt.n_samples)
160 | 
161 |     # to image
162 |     grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
163 |     Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'{prompt.replace(" ", "-")}.png'))
164 | 
165 |     print(f"Your samples are ready and waiting four you here: \n{outpath} \nEnjoy.")
166 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='latent-diffusion',
 5 |     version='0.0.1',
 6 |     description='',
 7 |     packages=find_packages(),
 8 |     install_requires=[
 9 |         'torch',
10 |         'numpy',
11 |         'tqdm',
12 |     ],
13 | )


--------------------------------------------------------------------------------