├── tests
│   ├── __init__.py
│   ├── test_show.py
│   ├── test_super_resolution.py
│   ├── test_emojich_unet.py
│   ├── test_tokenizer.py
│   ├── test_image_prompts.py
│   ├── test_dalle.py
│   ├── conftest.py
│   └── test_vae.py
├── pics
│   ├── cat-ru.png
│   ├── man-0.png
│   ├── man-1.png
│   ├── man-2.png
│   ├── man-3.png
│   ├── man-4.png
│   ├── man-5.png
│   ├── man-ru.png
│   ├── cat-gan.png
│   ├── man-gan.png
│   ├── woman-gan.png
│   ├── woman-ru.png
│   ├── avocado-gan.png
│   ├── avocado-ru.png
│   ├── cat-diffusion.png
│   ├── cathedral-gan.png
│   ├── cathedral-ru.png
│   ├── emojich
│   │   ├── examples.png
│   │   ├── emoji-Donald.png
│   │   ├── emojich_rgba.png
│   │   ├── emojich-stickers.png
│   │   └── emojich_rgba_100.png
│   ├── woman-diffusion.png
│   ├── avocado-diffusion.png
│   ├── cathedral-diffusion.png
│   ├── malevich
│   │   ├── rainbow-full.png
│   │   ├── rainbow-cherry-pick.png
│   │   ├── rainbow-super-resolution.png
│   │   ├── anime-girl-super-resolution.png
│   │   └── russian-temple-image-prompt.png
│   ├── habr_eng.svg
│   └── habr.svg
├── requirements-test.txt
├── .coveragerc
├── requirements.txt
├── setup.cfg
├── .gitlab-ci.yml
├── .pre-commit-config.yaml
├── rudalle
│   ├── __init__.py
│   ├── vae
│   │   ├── vqgan.gumbelf8-sber.config.yml
│   │   ├── __init__.py
│   │   ├── decoder_dwt.py
│   │   ├── model.py
│   │   └── pytorch_wavelets_utils.py
│   ├── ruclip
│   │   ├── __init__.py
│   │   └── processor.py
│   ├── utils.py
│   ├── realesrgan
│   │   ├── __init__.py
│   │   ├── model.py
│   │   ├── utils.py
│   │   ├── rrdbnet_arch.py
│   │   └── arch_util.py
│   ├── emojich_unet
│   │   └── __init__.py
│   ├── dalle
│   │   ├── utils.py
│   │   ├── image_attention.py
│   │   ├── fp16.py
│   │   ├── __init__.py
│   │   ├── model.py
│   │   └── transformer.py
│   ├── tokenizer.py
│   ├── image_prompts.py
│   └── pipelines.py
├── setup.py
├── .gitignore
├── README.md
├── Emojich.md
└── LICENSE.txt
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pics/cat-ru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/cat-ru.png -------------------------------------------------------------------------------- /pics/man-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-0.png -------------------------------------------------------------------------------- /pics/man-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-1.png -------------------------------------------------------------------------------- /pics/man-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-2.png -------------------------------------------------------------------------------- /pics/man-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-3.png -------------------------------------------------------------------------------- /pics/man-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-4.png -------------------------------------------------------------------------------- /pics/man-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-5.png
-------------------------------------------------------------------------------- /pics/man-ru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-ru.png -------------------------------------------------------------------------------- /pics/cat-gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/cat-gan.png -------------------------------------------------------------------------------- /pics/man-gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/man-gan.png -------------------------------------------------------------------------------- /pics/woman-gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/woman-gan.png -------------------------------------------------------------------------------- /pics/woman-ru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/woman-ru.png -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest 3 | pytest-cov 4 | pre-commit 5 | -------------------------------------------------------------------------------- /pics/avocado-gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/avocado-gan.png -------------------------------------------------------------------------------- /pics/avocado-ru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/avocado-ru.png -------------------------------------------------------------------------------- /pics/cat-diffusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/cat-diffusion.png -------------------------------------------------------------------------------- /pics/cathedral-gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/cathedral-gan.png -------------------------------------------------------------------------------- /pics/cathedral-ru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/cathedral-ru.png -------------------------------------------------------------------------------- /pics/emojich/examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/emojich/examples.png -------------------------------------------------------------------------------- /pics/woman-diffusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/woman-diffusion.png -------------------------------------------------------------------------------- /pics/avocado-diffusion.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/avocado-diffusion.png -------------------------------------------------------------------------------- /pics/cathedral-diffusion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/cathedral-diffusion.png -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | # omit this single file 4 | rudalle/vae/pytorch_wavelets_utils.py 5 | -------------------------------------------------------------------------------- /pics/emojich/emoji-Donald.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/emojich/emoji-Donald.png -------------------------------------------------------------------------------- /pics/emojich/emojich_rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/emojich/emojich_rgba.png -------------------------------------------------------------------------------- /pics/malevich/rainbow-full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/malevich/rainbow-full.png -------------------------------------------------------------------------------- /pics/emojich/emojich-stickers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/emojich/emojich-stickers.png -------------------------------------------------------------------------------- /pics/emojich/emojich_rgba_100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/emojich/emojich_rgba_100.png -------------------------------------------------------------------------------- /pics/malevich/rainbow-cherry-pick.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/malevich/rainbow-cherry-pick.png -------------------------------------------------------------------------------- /pics/malevich/rainbow-super-resolution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/malevich/rainbow-super-resolution.png -------------------------------------------------------------------------------- /pics/malevich/anime-girl-super-resolution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/malevich/anime-girl-super-resolution.png -------------------------------------------------------------------------------- /pics/malevich/russian-temple-image-prompt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jack000/ru-dalle/HEAD/pics/malevich/russian-temple-image-prompt.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 
taming-transformers==0.0.1 2 | more_itertools~=8.10.0 3 | transformers~=4.10.2 4 | youtokentome~=1.0.6 5 | omegaconf>=2.0.0 6 | einops~=0.3.2 7 | PyWavelets==1.1.1 8 | segmentation-models-pytorch==0.1.3 9 | opencv-python==4.5.4.60 10 | torch 11 | torchvision 12 | matplotlib 13 | -------------------------------------------------------------------------------- /tests/test_show.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from rudalle.pipelines import show 3 | 4 | 5 | def test_show(sample_image): 6 | img = sample_image.copy() 7 | img = img.resize((256, 256)) 8 | pil_images = [img]*5 9 | show(pil_images, nrow=2, save_dir='/tmp/pics', show=False) 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [pep8] 2 | max-line-length = 120 3 | exclude = .tox,*migrations*,.json 4 | 5 | [flake8] 6 | max-line-length = 120 7 | exclude = .tox,*migrations*,.json 8 | 9 | [autopep8-wrapper] 10 | exclude = .tox,*migrations*,.json 11 | 12 | [check-docstring-first] 13 | exclude = .tox,*migrations*,.json 14 | -------------------------------------------------------------------------------- /tests/test_super_resolution.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from rudalle.pipelines import super_resolution 3 | 4 | 5 | def test_super_resolution(sample_image, realesrgan): 6 | img = sample_image.copy() 7 | img = img.resize((32, 32)) 8 | sr_img = super_resolution([img], realesrgan)[0] 9 | assert sr_img.size[0] == 32*2 10 | assert sr_img.size[1] == 32*2 11 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - test 3 | 4 | all_branch_test: 5 | stage: test 6 | tags: 7 | - docker 8 | image: python:3.9 9 | script: 10 | - apt-get update ##[edited] 11 | - apt-get install ffmpeg libsm6 libxext6 -y 12 | - pip install cython 13 | - pip install -r requirements-test.txt --no-cache-dir 14 | - pip install timm==0.4.12 15 | - pip install codecov 16 | - pytest --cov=rudalle tests/ 17 | - bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN 18 | except: 19 | - tags 20 | -------------------------------------------------------------------------------- /tests/test_emojich_unet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from rudalle.pipelines import convert_emoji_to_rgba 5 | 6 | 7 | def test_convert_emoji_to_rgba(sample_image, emojich_unet): 8 | img = sample_image.copy() 9 | img = img.resize((512, 512)) 10 | rgba_images, runs = convert_emoji_to_rgba([img], emojich_unet, score_thr=0.99) 11 | assert len(runs) == len(rgba_images) 12 | rgba_img = rgba_images[0] 13 | assert rgba_img.size[0] == 512 14 | assert rgba_img.size[1] == 512 15 | assert np.array(rgba_img).shape[-1] == 4 16 | assert runs[0] in ['unet', 'classic'] 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: check-docstring-first 6 | - id: check-merge-conflict 7 | stages: 8 | - push 9 | - id: double-quote-string-fixer 10 | 
- id: end-of-file-fixer 11 | - id: fix-encoding-pragma 12 | - id: mixed-line-ending 13 | - id: trailing-whitespace 14 | - repo: https://github.com/pycqa/flake8 15 | rev: "4.0.1" 16 | hooks: 17 | - id: flake8 18 | args: ['--config=setup.cfg'] 19 | - repo: https://github.com/pre-commit/mirrors-autopep8 20 | rev: v1.5.7 21 | hooks: 22 | - id: autopep8 23 | -------------------------------------------------------------------------------- /rudalle/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .vae import get_vae 3 | from .dalle import get_rudalle_model 4 | from .tokenizer import get_tokenizer 5 | from .realesrgan import get_realesrgan 6 | from .ruclip import get_ruclip 7 | from .emojich_unet import get_emojich_unet 8 | from . import vae, dalle, tokenizer, realesrgan, pipelines, ruclip, image_prompts 9 | 10 | 11 | __all__ = [ 12 | 'get_vae', 13 | 'get_rudalle_model', 14 | 'get_tokenizer', 15 | 'get_realesrgan', 16 | 'get_ruclip', 17 | 'get_emojich_unet', 18 | 'vae', 19 | 'dalle', 20 | 'ruclip', 21 | 'tokenizer', 22 | 'realesrgan', 23 | 'pipelines', 24 | 'image_prompts', 25 | ] 26 | 27 | __version__ = '0.4.0' 28 | -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | 5 | @pytest.mark.parametrize('text, text_seq_length, bpe_dropout', [ 6 | ('hello, how are you?', 128, 0.1), 7 | ('hello, how are you?', 128, 0.5), 8 | ('hello, how are you?', 128, 1.0), 9 | ('hello ... how are you ?', 256, 1.0), 10 | ('a person standing at a table with bottles of win', 64, 0.5), 11 | ('привет как дела???', 76, 0.0), 12 | ('клип на русском языке :)', 76, 0.1), 13 | ]) 14 | def test_encode_decode_text_yttm(yttm_tokenizer, text, text_seq_length, bpe_dropout): 15 | tokens = yttm_tokenizer.encode_text(text, text_seq_length=text_seq_length, bpe_dropout=bpe_dropout) 16 | decoded_text = yttm_tokenizer.decode_text(tokens) 17 | assert text == decoded_text 18 | -------------------------------------------------------------------------------- /tests/test_image_prompts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | from rudalle.image_prompts import ImagePrompts 5 | 6 | 7 | @pytest.mark.parametrize('borders, crop_first', [ 8 | ({'up': 4, 'right': 0, 'left': 0, 'down': 0}, False), 9 | ({'up': 4, 'right': 0, 'left': 0, 'down': 0}, True), 10 | ({'up': 4, 'right': 3, 'left': 3, 'down': 3}, False) 11 | ]) 12 | def test_image_prompts(sample_image, vae, borders, crop_first): 13 | img = sample_image.copy() 14 | img = img.resize((256, 256)) 15 | image_prompt = ImagePrompts(img, borders, vae, crop_first=crop_first) 16 | assert image_prompt.image_prompts.shape[1] == 32 * 32 17 | assert len(image_prompt.image_prompts_idx) == (borders['up'] + borders['down']) * 32 \ 18 | + (borders['left'] + borders['right']) * (32 - borders['up'] - borders['down']) 19 | -------------------------------------------------------------------------------- /rudalle/vae/vqgan.gumbelf8-sber.config.yml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: taming.models.vqgan.GumbelVQ 4 | params: 5 | kl_weight: 1.0e-08 6 | embed_dim: 256 7 | n_embed: 8192 8 | monitor: val/rec_loss 9 | temperature_scheduler_config: 10 | target: 
taming.lr_scheduler.LambdaWarmUpCosineScheduler 11 | params: 12 | warm_up_steps: 0 13 | max_decay_steps: 1000001 14 | lr_start: 0.9 15 | lr_max: 0.9 16 | lr_min: 1.0e-06 17 | ddconfig: 18 | double_z: false 19 | z_channels: 256 20 | resolution: 256 21 | in_channels: 3 22 | out_ch: 3 23 | ch: 128 24 | ch_mult: 25 | - 1 26 | - 1 27 | - 2 28 | - 4 29 | num_res_blocks: 2 30 | attn_resolutions: 31 | - 32 32 | dropout: 0.0 33 | lossconfig: 34 | target: taming.modules.losses.vqperceptual.DummyLoss 35 | -------------------------------------------------------------------------------- /rudalle/ruclip/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from transformers import CLIPModel 5 | from huggingface_hub import hf_hub_url, cached_download 6 | 7 | from .processor import RuCLIPProcessor 8 | 9 | MODELS = { 10 | 'ruclip-vit-base-patch32-v5': dict( 11 | repo_id='sberbank-ai/ru-clip', 12 | filenames=[ 13 | 'bpe.model', 'config.json', 'pytorch_model.bin' 14 | ] 15 | ), 16 | } 17 | 18 | 19 | def get_ruclip(name, cache_dir='/tmp/rudalle'): 20 | assert name in MODELS 21 | config = MODELS[name] 22 | repo_id = config['repo_id'] 23 | cache_dir = os.path.join(cache_dir, name) 24 | for filename in config['filenames']: 25 | config_file_url = hf_hub_url(repo_id=repo_id, filename=f'{name}/{filename}') 26 | cached_download(config_file_url, cache_dir=cache_dir, force_filename=filename) 27 | ruclip = CLIPModel.from_pretrained(cache_dir) 28 | ruclip_processor = RuCLIPProcessor.from_pretrained(cache_dir) 29 | print('ruclip --> ready') 30 | return ruclip, ruclip_processor 31 | -------------------------------------------------------------------------------- /rudalle/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import random 4 | 5 | import torch 6 | import torchvision 7 | import numpy as np 8 | 9 | 10 | def seed_everything(seed): 11 | random.seed(seed) 12 | os.environ['PYTHONHASHSEED'] = str(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed(seed) 16 | torch.backends.cudnn.deterministic = True 17 | torch.backends.cudnn.benchmark = True 18 | 19 | 20 | def torch_tensors_to_pil_list(input_images): 21 | out_images = [] 22 | for in_image in input_images: 23 | in_image = in_image.cpu().detach() 24 | out_image = torchvision.transforms.functional.to_pil_image(in_image).convert('RGB') 25 | out_images.append(out_image) 26 | return out_images 27 | 28 | 29 | def pil_list_to_torch_tensors(pil_images): 30 | result = [] 31 | for pil_image in pil_images: 32 | image = np.array(pil_image, dtype=np.uint8) 33 | image = torch.from_numpy(image) 34 | image = image.permute(2, 0, 1).unsqueeze(0) 35 | result.append(image) 36 | return torch.cat(result, dim=0) 37 | -------------------------------------------------------------------------------- /rudalle/realesrgan/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from huggingface_hub import hf_hub_url, cached_download 5 | 6 | from .model import RealESRGAN 7 | 8 | 9 | MODELS = { 10 | 'x2': dict( 11 | scale=2, 12 | repo_id='shonenkov/rudalle-utils', 13 | filename='RealESRGAN_x2.pth', 14 | ), 15 | 'x4': dict( 16 | scale=4, 17 | repo_id='shonenkov/rudalle-utils', 18 | filename='RealESRGAN_x4.pth', 19 | ), 20 | 'x8': dict( 21 | scale=8, 22 | repo_id='shonenkov/rudalle-utils', 23 | 
filename='RealESRGAN_x8.pth', 24 | ), 25 | } 26 | 27 | 28 | def get_realesrgan(name, device='cpu', fp16=False, cache_dir='/tmp/rudalle'): 29 | assert name in MODELS 30 | config = MODELS[name] 31 | model = RealESRGAN(device, config['scale'], fp16=fp16) 32 | cache_dir = os.path.join(cache_dir, name) 33 | config_file_url = hf_hub_url(repo_id=config['repo_id'], filename=config['filename']) 34 | cached_download(config_file_url, cache_dir=cache_dir, force_filename=config['filename']) 35 | model.load_weights(os.path.join(cache_dir, config['filename'])) 36 | print(f'{name} --> ready') 37 | return model 38 | -------------------------------------------------------------------------------- /rudalle/vae/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from os.path import dirname, abspath, join 3 | 4 | import torch 5 | from huggingface_hub import hf_hub_url, cached_download 6 | from omegaconf import OmegaConf 7 | 8 | from .model import VQGanGumbelVAE 9 | 10 | 11 | def get_vae(pretrained=True, dwt=False, cache_dir='/tmp/rudalle'): 12 | # TODO 13 | config = OmegaConf.load(join(dirname(abspath(__file__)), 'vqgan.gumbelf8-sber.config.yml')) 14 | vae = VQGanGumbelVAE(config, dwt=dwt) 15 | if pretrained: 16 | repo_id = 'shonenkov/rudalle-utils' 17 | if dwt: 18 | filename = 'vqgan.gumbelf8-sber-dwt.model.ckpt' 19 | else: 20 | filename = 'vqgan.gumbelf8-sber.model.ckpt' 21 | cache_dir = join(cache_dir, 'vae') 22 | config_file_url = hf_hub_url(repo_id=repo_id, filename=filename) 23 | cached_download(config_file_url, cache_dir=cache_dir, force_filename=filename) 24 | checkpoint = torch.load(join(cache_dir, filename), map_location='cpu') 25 | if dwt: 26 | vae.load_state_dict(checkpoint['state_dict']) 27 | else: 28 | vae.model.load_state_dict(checkpoint['state_dict'], strict=False) 29 | print('vae --> ready') 30 | return vae 31 | -------------------------------------------------------------------------------- /tests/test_dalle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import pytest 4 | 5 | from .test_vae import preprocess 6 | 7 | 8 | @pytest.mark.parametrize('text', [ 9 | 'мальчик играет с оленем', 10 | ]) 11 | def test_forward_step_and_criterion(text, sample_image, yttm_tokenizer, vae, small_dalle): 12 | bs = 4 13 | text_seq_length = small_dalle.get_param('text_seq_length') 14 | total_seq_length = small_dalle.get_param('total_seq_length') 15 | device = small_dalle.get_param('device') 16 | 17 | img = sample_image.copy() 18 | img = preprocess(img, target_image_size=256) 19 | images = img.repeat(bs, 1, 1, 1).to(device) 20 | 21 | text = text.lower().strip() 22 | text_input_ids = yttm_tokenizer.encode_text(text, text_seq_length=text_seq_length) 23 | text_input_ids = text_input_ids.unsqueeze(0).repeat(bs, 1).to(device) 24 | 25 | attention_mask = torch.tril(torch.ones((bs, 1, total_seq_length, total_seq_length), device=device)) 26 | with torch.no_grad(): 27 | image_input_ids = vae.get_codebook_indices(images) 28 | input_ids = torch.cat((text_input_ids, image_input_ids), dim=1) 29 | loss, loss_values = small_dalle.forward(input_ids, attention_mask, return_loss=True) 30 | assert type(loss.data.detach().item()) == float 31 | assert type(loss_values) == dict 32 | -------------------------------------------------------------------------------- /pics/habr_eng.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
5 | 6 | 7 | -------------------------------------------------------------------------------- /rudalle/emojich_unet/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | import torch 5 | from huggingface_hub import hf_hub_url, cached_download 6 | 7 | 8 | MODELS = { 9 | 'unet_effnetb5': dict( 10 | encoder_name='efficientnet-b5', 11 | repo_id='sberbank-ai/rudalle-Emojich', 12 | filename='pytorch_model_v2.bin', 13 | classes=2, 14 | ), 15 | } 16 | 17 | 18 | def get_emojich_unet(name, cache_dir='/tmp/rudalle'): 19 | assert name in MODELS 20 | config = MODELS[name] 21 | try: 22 | import segmentation_models_pytorch as smp 23 | except ImportError: 24 | import logging 25 | logging.warning('If you would like to use emojich_unet, you should reinstall timm package:' 26 | '"pip install timm==0.4.12"') 27 | return 28 | model = smp.Unet( 29 | encoder_name=config['encoder_name'], 30 | encoder_weights=None, 31 | in_channels=3, 32 | classes=config['classes'], 33 | ) 34 | cache_dir = os.path.join(cache_dir, name) 35 | filename = config['filename'] 36 | config_file_url = hf_hub_url(repo_id=config['repo_id'], filename=f'{name}/{filename}') 37 | cached_download(config_file_url, cache_dir=cache_dir, force_filename=filename) 38 | checkpoint = torch.load(os.path.join(cache_dir, config['filename']), map_location='cpu') 39 | model.load_state_dict(checkpoint) 40 | print(f'{name} --> ready') 41 | return model 42 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import io 3 | from os.path import abspath, dirname 4 | 5 | import PIL 6 | import pytest 7 | import requests 8 | 9 | from rudalle import get_tokenizer, get_rudalle_model, get_vae, get_realesrgan, get_emojich_unet 10 | 11 | 12 | TEST_ROOT = dirname(abspath(__file__)) 13 | 14 | 15 | @pytest.fixture(scope='module') 16 | def realesrgan(): 17 | realesrgan = get_realesrgan('x2', device='cpu') 18 | yield realesrgan 19 | 20 | 21 | @pytest.fixture(scope='module') 22 | def vae(): 23 | vae = get_vae(pretrained=False) 24 | yield vae 25 | 26 | 27 | @pytest.fixture(scope='module') 28 | def dwt_vae(): 29 | vae = get_vae(pretrained=False, dwt=True) 30 | yield vae 31 | 32 | 33 | @pytest.fixture(scope='module') 34 | def yttm_tokenizer(): 35 | tokenizer = get_tokenizer() 36 | yield tokenizer 37 | 38 | 39 | @pytest.fixture(scope='module') 40 | def sample_image(): 41 | url = 'https://cdn.kqed.org/wp-content/uploads/sites/12/2013/12/rudolph.png' 42 | resp = requests.get(url) 43 | resp.raise_for_status() 44 | image = PIL.Image.open(io.BytesIO(resp.content)) 45 | yield image 46 | 47 | 48 | @pytest.fixture(scope='module') 49 | def small_dalle(): 50 | model = get_rudalle_model('small', pretrained=False, fp16=False, device='cpu') 51 | yield model 52 | 53 | 54 | @pytest.fixture(scope='module') 55 | def emojich_unet(): 56 | model = get_emojich_unet('unet_effnetb5') 57 | yield model 58 | -------------------------------------------------------------------------------- /pics/habr.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /rudalle/dalle/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | 4 | 5 | def 
exists(val): 6 | return val is not None 7 | 8 | 9 | def is_empty(t): 10 | return t.nelement() == 0 11 | 12 | 13 | def ensure_divisibility(numerator, denominator): 14 | """Ensure that numerator is divisible by the denominator.""" 15 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 16 | numerator, denominator) 17 | 18 | 19 | def divide(numerator, denominator): 20 | """Ensure that numerator is divisible by the denominator and return 21 | the division value.""" 22 | ensure_divisibility(numerator, denominator) 23 | return numerator // denominator 24 | 25 | 26 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False): 27 | """ 28 | Split a tensor along its last dimension. 29 | Arguments: 30 | tensor: input tensor. 31 | num_partitions: number of partitions to split the tensor 32 | contiguous_split_chunks: If True, make each chunk contiguous 33 | in memory. 34 | """ 35 | # Get the size and dimension. 36 | last_dim = tensor.dim() - 1 37 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 38 | # Split. 39 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 40 | # Note: torch.split does not create contiguous tensors by default. 41 | if contiguous_split_chunks: 42 | return tuple(chunk.contiguous() for chunk in tensor_list) 43 | return tensor_list 44 | 45 | 46 | def init_method_normal(std=0.02): 47 | """Init method based on normal distribution. 48 | 49 | This is only used for embeddings. The transformer has its 50 | own initializer. 51 | """ 52 | def init_(tensor): 53 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 54 | return init_ 55 | -------------------------------------------------------------------------------- /rudalle/dalle/image_attention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import torch 4 | 5 | 6 | def _init_mask(text_tokens, image_tokens_per_dim, is_bool_mask=False): 7 | attn_size = text_tokens + image_tokens_per_dim**2 8 | mask = torch.tril(torch.ones(attn_size, attn_size, dtype=torch.bool if is_bool_mask else torch.float32)) 9 | return mask 10 | 11 | 12 | def get_row_mask(text_tokens=256, image_tokens_per_dim=32, is_bool_mask=False): 13 | mask = _init_mask(text_tokens, image_tokens_per_dim, is_bool_mask=is_bool_mask) 14 | step = image_tokens_per_dim + 1 15 | for col in range(text_tokens, mask.size(1)): 16 | mask[col + step:, col] = False if is_bool_mask else 0.0 17 | return mask 18 | 19 | 20 | def get_col_mask(text_tokens=256, image_tokens_per_dim=32, is_bool_mask=False): 21 | mask = _init_mask(text_tokens, image_tokens_per_dim, is_bool_mask=is_bool_mask) 22 | step = image_tokens_per_dim - 1 23 | for col in range(text_tokens, mask.size(1)): 24 | for i in range(1, mask.size(0), step+1): 25 | mask[col + i: col + i + step, col] = False if is_bool_mask else 0.0 26 | return mask 27 | 28 | 29 | def get_conv_mask(text_tokens=256, image_tokens_per_dim=32, kernel=11, is_bool_mask=False): 30 | mask = _init_mask(text_tokens, image_tokens_per_dim, is_bool_mask=is_bool_mask) 31 | shift = kernel // 2 32 | for pos in range(text_tokens, mask.size(1)): 33 | mask[pos+1:, pos] = False if is_bool_mask else 0.0 34 | img = torch.zeros(image_tokens_per_dim, image_tokens_per_dim) 35 | pixel_id = pos - text_tokens 36 | row = pixel_id // image_tokens_per_dim 37 | col = pixel_id % image_tokens_per_dim 38 | for r in range(-shift, shift+1): 39 | for c in range(-shift, shift+1): 40 | c_abs = (c + col) % image_tokens_per_dim 41 | r_abs = (r + 
row) % image_tokens_per_dim 42 | img[r_abs, c_abs] = 0.2 43 | cell_id = r_abs * image_tokens_per_dim + c_abs 44 | if text_tokens + cell_id > pos: 45 | mask[text_tokens + cell_id, pos] = True if is_bool_mask else 1.0 46 | 47 | img[row, col] = 1.0 48 | return mask 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import re 4 | from setuptools import setup 5 | 6 | 7 | def read(filename): 8 | with open(os.path.join(os.path.dirname(__file__), filename)) as f: 9 | file_content = f.read() 10 | return file_content 11 | 12 | 13 | def get_requirements(): 14 | requirements = [] 15 | for requirement in read('requirements.txt').splitlines(): 16 | if requirement.startswith('git+') or requirement.startswith('svn+') or requirement.startswith('hg+'): 17 | parsed_requires = re.findall(r'#egg=([\w\d\.]+)-([\d\.]+)$', requirement) 18 | if parsed_requires: 19 | package, version = parsed_requires[0] 20 | requirements.append(f'{package}=={version}') 21 | else: 22 | print('WARNING! For correct matching dependency links need to specify package name and version' 23 | 'such as #egg=-') 24 | else: 25 | requirements.append(requirement) 26 | return requirements 27 | 28 | 29 | def get_links(): 30 | return [ 31 | requirement for requirement in read('requirements.txt').splitlines() 32 | if requirement.startswith('git+') or requirement.startswith('svn+') or requirement.startswith('hg+') 33 | ] 34 | 35 | 36 | def get_version(): 37 | """ Get version from the package without actually importing it. """ 38 | init = read('rudalle/__init__.py') 39 | for line in init.split('\n'): 40 | if line.startswith('__version__'): 41 | return eval(line.split('=')[1]) 42 | 43 | 44 | setup( 45 | name='rudalle', 46 | version=get_version(), 47 | author='SberAI, SberDevices', 48 | author_email='shonenkov@phystech.edu', 49 | description='ruDALL-E generate images from texts in Russian language', 50 | packages=['rudalle', 'rudalle/dalle', 'rudalle/realesrgan', 'rudalle/ruclip', 'rudalle/vae', 51 | 'rudalle/emojich_unet'], 52 | package_data={'rudalle/vae': ['*.yml']}, 53 | install_requires=get_requirements(), 54 | dependency_links=get_links(), 55 | long_description=read('README.md'), 56 | long_description_content_type='text/markdown', 57 | ) 58 | -------------------------------------------------------------------------------- /tests/test_vae.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import PIL 3 | import pytest 4 | import torch 5 | import torchvision.transforms as T 6 | import torchvision.transforms.functional as TF 7 | 8 | 9 | @pytest.mark.parametrize('target_image_size', [128, 192, 256]) 10 | def test_decode_vae(vae, sample_image, target_image_size): 11 | img = sample_image.copy() 12 | img = preprocess(img, target_image_size=target_image_size) 13 | with torch.no_grad(): 14 | img_seq = vae.get_codebook_indices(img) 15 | out_img = vae.decode(img_seq) 16 | assert out_img.shape == (1, 3, target_image_size, target_image_size) 17 | 18 | 19 | @pytest.mark.parametrize('target_image_size', [128, 192, 256]) 20 | def test_reconstruct_vae(vae, sample_image, target_image_size): 21 | img = sample_image.copy() 22 | with torch.no_grad(): 23 | x_vqgan = preprocess(img, target_image_size=target_image_size) 24 | output = reconstruct_with_vqgan(preprocess_vqgan(x_vqgan), vae.model) 25 | assert output.shape == (1, 3, 
target_image_size, target_image_size) 26 | 27 | 28 | @pytest.mark.parametrize('target_image_size', [256]) 29 | def test_reconstruct_dwt_vae(dwt_vae, sample_image, target_image_size): 30 | img = sample_image.copy() 31 | with torch.no_grad(): 32 | x_vqgan = preprocess(img, target_image_size=target_image_size) 33 | output = reconstruct_with_vqgan(preprocess_vqgan(x_vqgan), dwt_vae.model) 34 | assert output.shape == (1, 3, target_image_size*2, target_image_size*2) 35 | 36 | 37 | def preprocess(img, target_image_size=256): 38 | s = min(img.size) 39 | if s < target_image_size: 40 | raise ValueError(f'min dim for image {s} < {target_image_size}') 41 | r = target_image_size / s 42 | s = (round(r * img.size[1]), round(r * img.size[0])) 43 | img = TF.resize(img, s, interpolation=PIL.Image.LANCZOS) 44 | img = TF.center_crop(img, output_size=2 * [target_image_size]) 45 | img = torch.unsqueeze(T.ToTensor()(img), 0) 46 | return img 47 | 48 | 49 | def preprocess_vqgan(x): 50 | x = 2.*x - 1. 51 | return x 52 | 53 | 54 | def reconstruct_with_vqgan(x, model): 55 | z, _, [_, _, _] = model.encode(x) 56 | print(f'VQGAN --- {model.__class__.__name__}: latent shape: {z.shape[2:]}') 57 | xrec = model.decode(z) 58 | return xrec 59 | -------------------------------------------------------------------------------- /rudalle/tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from os.path import join 3 | 4 | import torch 5 | import numpy as np 6 | import youtokentome as yttm 7 | from huggingface_hub import hf_hub_url, cached_download 8 | 9 | 10 | def get_tokenizer(path=None, cache_dir='/tmp/rudalle'): 11 | # TODO docstring 12 | if path is None: 13 | repo_id = 'shonenkov/rudalle-utils' 14 | filename = 'bpe.model' 15 | cache_dir = join(cache_dir, 'tokenizer') 16 | config_file_url = hf_hub_url(repo_id=repo_id, filename=filename) 17 | cached_download(config_file_url, cache_dir=cache_dir, force_filename=filename) 18 | path = join(cache_dir, filename) 19 | tokenizer = YTTMTokenizerWrapper(yttm.BPE(model=path)) 20 | print('tokenizer --> ready') 21 | return tokenizer 22 | 23 | 24 | class YTTMTokenizerWrapper: 25 | eos_id = 3 26 | bos_id = 2 27 | unk_id = 1 28 | pad_id = 0 29 | 30 | def __init__(self, tokenizer): 31 | self.tokenizer = tokenizer 32 | 33 | def __len__(self): 34 | return self.vocab_size() 35 | 36 | def get_pad_token_id(self): 37 | # TODO docstring 38 | return self.tokenizer.subword_to_id('') 39 | 40 | def vocab_size(self): 41 | # TODO docstring 42 | return self.tokenizer.vocab_size() 43 | 44 | def encode_text(self, text, text_seq_length, bpe_dropout=0.0): 45 | # TODO docstring 46 | tokens = self.tokenizer.encode([text], output_type=yttm.OutputType.ID, dropout_prob=bpe_dropout)[0] 47 | tokens = [self.bos_id] + tokens + [self.eos_id] 48 | return self.prepare_tokens(tokens, text_seq_length) 49 | 50 | def decode_text(self, encoded): 51 | # TODO docstring 52 | return self.tokenizer.decode(encoded.cpu().numpy().tolist(), ignore_ids=[ 53 | self.eos_id, self.bos_id, self.unk_id, self.pad_id 54 | ])[0] 55 | 56 | @staticmethod 57 | def prepare_tokens(tokens, text_seq_length): 58 | # TODO docstring 59 | empty_positions = text_seq_length - len(tokens) 60 | if empty_positions > 0: 61 | tokens = np.hstack((tokens, np.zeros(empty_positions))) # position tokens after text 62 | if len(tokens) > text_seq_length: 63 | tokens = tokens[:text_seq_length] 64 | return torch.tensor(tokens).long() 65 | 
-------------------------------------------------------------------------------- /rudalle/dalle/fp16.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Variable 5 | from torch.nn.parameter import Parameter 6 | 7 | FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) 8 | HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) 9 | 10 | 11 | def conversion_helper(val, conversion): 12 | """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure.""" 13 | if not isinstance(val, (tuple, list)): 14 | return conversion(val) 15 | rtn = [conversion_helper(v, conversion) for v in val] 16 | if isinstance(val, tuple): 17 | rtn = tuple(rtn) 18 | return rtn 19 | 20 | 21 | def fp32_to_fp16(val): 22 | """Convert fp32 `val` to fp16""" 23 | def half_conversion(val): 24 | val_typecheck = val 25 | if isinstance(val_typecheck, (Parameter, Variable)): 26 | val_typecheck = val.data 27 | if isinstance(val_typecheck, FLOAT_TYPES): 28 | val = val.half() 29 | return val 30 | return conversion_helper(val, half_conversion) 31 | 32 | 33 | def fp16_to_fp32(val): 34 | """Convert fp16 `val` to fp32""" 35 | def float_conversion(val): 36 | val_typecheck = val 37 | if isinstance(val_typecheck, (Parameter, Variable)): 38 | val_typecheck = val.data 39 | if isinstance(val_typecheck, HALF_TYPES): 40 | val = val.float() 41 | return val 42 | return conversion_helper(val, float_conversion) 43 | 44 | 45 | class FP16Module(nn.Module): 46 | def __init__(self, module): 47 | super(FP16Module, self).__init__() 48 | self.add_module('module', module.half()) 49 | 50 | def forward(self, *inputs, **kwargs): 51 | return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) 52 | 53 | def state_dict(self, destination=None, prefix='', keep_vars=False): 54 | return self.module.state_dict(destination, prefix, keep_vars) 55 | 56 | def load_state_dict(self, state_dict, strict=True): 57 | self.module.load_state_dict(state_dict, strict=strict) 58 | 59 | def get_param(self, item): 60 | return self.module.get_param(item) 61 | 62 | def to(self, device, *args, **kwargs): 63 | self.module.to(device) 64 | return super().to(device, *args, **kwargs) 65 | -------------------------------------------------------------------------------- /rudalle/realesrgan/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Source: https://github.com/boomb0om/Real-ESRGAN-colab 3 | 4 | import torch 5 | import numpy as np 6 | from PIL import Image 7 | 8 | from .rrdbnet_arch import RRDBNet 9 | from .utils import pad_reflect, split_image_into_overlapping_patches, stich_together, unpad_image 10 | from rudalle.dalle.fp16 import FP16Module 11 | 12 | 13 | class RealESRGAN: 14 | def __init__(self, device, scale=4, fp16=False): 15 | self.device = device 16 | self.scale = scale 17 | self.model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=scale) 18 | self.fp16 = fp16 19 | 20 | def load_weights(self, model_path): 21 | loadnet = torch.load(model_path) 22 | if 'params' in loadnet: 23 | self.model.load_state_dict(loadnet['params'], strict=True) 24 | elif 'params_ema' in loadnet: 25 | self.model.load_state_dict(loadnet['params_ema'], strict=True) 26 | else: 27 | self.model.load_state_dict(loadnet, strict=True) 28 | self.model.eval() 29 | if self.fp16: 30 | self.model = FP16Module(self.model) 31 | 
self.model.to(self.device) 32 | 33 | def predict(self, lr_image, batch_size=4, patches_size=192, 34 | padding=24, pad_size=15): 35 | scale = self.scale 36 | device = self.device 37 | lr_image = np.array(lr_image) 38 | lr_image = pad_reflect(lr_image, pad_size) 39 | 40 | patches, p_shape = split_image_into_overlapping_patches(lr_image, patch_size=patches_size, 41 | padding_size=padding) 42 | if self.fp16: 43 | img = torch.HalfTensor(patches / 255).permute((0, 3, 1, 2)).to(device).detach() 44 | else: 45 | img = torch.FloatTensor(patches / 255).permute((0, 3, 1, 2)).to(device).detach() 46 | 47 | with torch.no_grad(): 48 | res = self.model(img[0:batch_size]) 49 | for i in range(batch_size, img.shape[0], batch_size): 50 | res = torch.cat((res, self.model(img[i:i + batch_size])), 0) 51 | 52 | sr_image = res.permute((0, 2, 3, 1)).cpu().clamp_(0, 1) 53 | np_sr_image = sr_image.numpy() 54 | 55 | padded_size_scaled = tuple(np.multiply(p_shape[0:2], scale)) + (3,) 56 | scaled_image_shape = tuple(np.multiply(lr_image.shape[0:2], scale)) + (3,) 57 | np_sr_image = stich_together(np_sr_image, padded_image_shape=padded_size_scaled, 58 | target_shape=scaled_image_shape, padding_size=padding * scale) 59 | sr_img = (np_sr_image * 255).astype(np.uint8) 60 | sr_img = unpad_image(sr_img, pad_size * scale) 61 | sr_img = Image.fromarray(sr_img) 62 | 63 | return sr_img 64 | -------------------------------------------------------------------------------- /rudalle/ruclip/processor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import json 4 | import torch 5 | import youtokentome as yttm 6 | import torchvision.transforms as T 7 | from torch.nn.utils.rnn import pad_sequence 8 | 9 | 10 | class RuCLIPProcessor: 11 | eos_id = 3 12 | bos_id = 2 13 | unk_id = 1 14 | pad_id = 0 15 | 16 | def __init__(self, tokenizer_path, image_size=224, text_seq_length=76, mean=None, std=None): 17 | 18 | self.tokenizer = yttm.BPE(tokenizer_path) 19 | self.mean = mean or [0.485, 0.456, 0.406] 20 | self.std = std or [0.229, 0.224, 0.225] 21 | self.image_transform = T.Compose([ 22 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), 23 | T.RandomResizedCrop(image_size, scale=(1., 1.), ratio=(1., 1.)), 24 | T.ToTensor(), 25 | T.Normalize(mean=self.mean, std=self.std) 26 | ]) 27 | self.text_seq_length = text_seq_length 28 | self.image_size = image_size 29 | 30 | def encode_text(self, text): 31 | text = text.lower() 32 | tokens = self.tokenizer.encode([text], output_type=yttm.OutputType.ID, dropout_prob=0.0)[0] 33 | tokens = [self.bos_id] + tokens + [self.eos_id] 34 | tokens = tokens[:self.text_seq_length] 35 | mask = [1] * len(tokens) 36 | return torch.tensor(tokens).long(), torch.tensor(mask).long() 37 | 38 | def decode_text(self, encoded): 39 | return self.tokenizer.decode(encoded.cpu().numpy().tolist(), ignore_ids=[ 40 | self.eos_id, self.bos_id, self.unk_id, self.pad_id 41 | ])[0] 42 | 43 | def __call__(self, text=None, images=None, **kwargs): 44 | inputs = {} 45 | if text is not None: 46 | input_ids, masks = [], [] 47 | texts = [text] if isinstance(text, str) else text 48 | for text in texts: 49 | tokens, mask = self.encode_text(text) 50 | input_ids.append(tokens) 51 | masks.append(mask) 52 | inputs['input_ids'] = pad_sequence(input_ids, batch_first=True) 53 | inputs['attention_mask'] = pad_sequence(masks, batch_first=True) 54 | if images is not None: 55 | pixel_values = [] 56 | for i, image in enumerate(images): 57 | 
pixel_values.append(self.image_transform(image)) 58 | inputs['pixel_values'] = pad_sequence(pixel_values, batch_first=True) 59 | return inputs 60 | 61 | @classmethod 62 | def from_pretrained(cls, folder): 63 | tokenizer_path = os.path.join(folder, 'bpe.model') 64 | config = json.load(open(os.path.join(folder, 'config.json'))) 65 | image_size = config['vision_config']['image_size'] 66 | text_seq_length = config['text_config']['max_position_embeddings'] - 1 67 | mean, std = config.get('mean'), config.get('std') 68 | return cls(tokenizer_path, image_size=image_size, text_seq_length=text_seq_length, mean=mean, std=std) 69 | -------------------------------------------------------------------------------- /rudalle/image_prompts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import numpy as np 4 | 5 | 6 | class ImagePrompts: 7 | 8 | def __init__(self, pil_image, borders, vae, device='cpu', crop_first=False): 9 | """ 10 | Args: 11 | pil_image (PIL.Image): image in PIL format 12 | borders (dict[str] | int): borders that we croped from pil_image 13 | example: {'up': 4, 'right': 0, 'left': 0, 'down': 0} (1 int eq 8 pixels) 14 | vae (VQGanGumbelVAE): VQGAN model for image encoding 15 | device (str): cpu or cuda 16 | crop_first (bool): if True, croped image before VQGAN encoding 17 | """ 18 | self.device = device 19 | img = self._preprocess_img(pil_image) 20 | self.image_prompts_idx, self.image_prompts = self._get_image_prompts(img, borders, vae, crop_first) 21 | 22 | def _preprocess_img(self, pil_img): 23 | img = torch.tensor(np.array(pil_img.convert('RGB')).transpose(2, 0, 1)) / 255. 24 | img = img.unsqueeze(0).to(self.device, dtype=torch.float32) 25 | img = (2 * img) - 1 26 | return img 27 | 28 | def _get_image_prompts(self, img, borders, vae, crop_first): 29 | if crop_first: 30 | bs, _, img_w, img_h = img.shape 31 | vqg_img_w, vqg_img_h = img_w // 8, img_h // 8 32 | vqg_img = torch.zeros((bs, vqg_img_w, vqg_img_h), dtype=torch.int32, device=img.device) 33 | if borders['down'] != 0: 34 | down_border = borders['down'] * 8 35 | _, _, [_, _, down_vqg_img] = vae.model.encode(img[:, :, -down_border:, :]) 36 | vqg_img[:, -borders['down']:, :] = down_vqg_img 37 | if borders['right'] != 0: 38 | right_border = borders['right'] * 8 39 | _, _, [_, _, right_vqg_img] = vae.model.encode(img[:, :, :, -right_border:]) 40 | vqg_img[:, :, -borders['right']:] = right_vqg_img 41 | if borders['left'] != 0: 42 | left_border = borders['left'] * 8 43 | _, _, [_, _, left_vqg_img] = vae.model.encode(img[:, :, :, :left_border]) 44 | vqg_img[:, :, :borders['left']] = left_vqg_img 45 | if borders['up'] != 0: 46 | up_border = borders['up'] * 8 47 | _, _, [_, _, up_vqg_img] = vae.model.encode(img[:, :, :up_border, :]) 48 | vqg_img[:, :borders['up'], :] = up_vqg_img 49 | else: 50 | _, _, [_, _, vqg_img] = vae.model.encode(img) 51 | 52 | bs, vqg_img_w, vqg_img_h = vqg_img.shape 53 | mask = torch.zeros(vqg_img_w, vqg_img_h) 54 | if borders['up'] != 0: 55 | mask[:borders['up'], :] = 1. 56 | if borders['down'] != 0: 57 | mask[-borders['down']:, :] = 1. 58 | if borders['right'] != 0: 59 | mask[:, -borders['right']:] = 1. 60 | if borders['left'] != 0: 61 | mask[:, :borders['left']] = 1. 
62 | mask = mask.reshape(-1).bool() 63 | 64 | image_prompts = vqg_img.reshape((bs, -1)) 65 | image_prompts_idx = np.arange(vqg_img_w * vqg_img_h) 66 | image_prompts_idx = set(image_prompts_idx[mask]) 67 | 68 | return image_prompts_idx, image_prompts 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | settings/local.py 7 | logs/*.log 8 | 9 | # User-specific stuff: 10 | .idea/ 11 | 12 | # Sensitive or high-churn files: 13 | .idea/**/dataSources/ 14 | .idea/**/dataSources.ids 15 | .idea/**/dataSources.xml 16 | .idea/**/dataSources.local.xml 17 | .idea/**/sqlDataSources.xml 18 | .idea/**/dynamic.xml 19 | .idea/**/uiDesigner.xml 20 | 21 | # Gradle: 22 | .idea/**/gradle.xml 23 | .idea/**/libraries 24 | 25 | # CMake 26 | cmake-build-debug/ 27 | 28 | # Mongo Explorer plugin: 29 | .idea/**/mongoSettings.xml 30 | 31 | ## File-based project format: 32 | *.iws 33 | 34 | ## Plugin-specific files: 35 | 36 | # IntelliJ 37 | out/ 38 | 39 | # mpeltonen/sbt-idea plugin 40 | .idea_modules/ 41 | 42 | # JIRA plugin 43 | atlassian-ide-plugin.xml 44 | 45 | # Cursive Clojure plugin 46 | .idea/replstate.xml 47 | 48 | # Crashlytics plugin (for Android Studio and IntelliJ) 49 | com_crashlytics_export_strings.xml 50 | crashlytics.properties 51 | crashlytics-build.properties 52 | fabric.properties 53 | ### Python template 54 | # Byte-compiled / optimized / DLL files 55 | __pycache__/ 56 | *.py[cod] 57 | *$py.class 58 | 59 | # C extensions 60 | *.so 61 | 62 | # Distribution / packaging 63 | .Python 64 | build/ 65 | develop-eggs/ 66 | dist/ 67 | downloads/ 68 | eggs/ 69 | .eggs/ 70 | lib/ 71 | lib64/ 72 | parts/ 73 | sdist/ 74 | var/ 75 | wheels/ 76 | *.egg-info/ 77 | .installed.cfg 78 | *.egg 79 | 80 | # PyInstaller 81 | # Usually these files are written by a python script from a template 82 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
83 | *.manifest 84 | *.spec 85 | 86 | # Installer logs 87 | pip-log.txt 88 | pip-delete-this-directory.txt 89 | 90 | # Unit test / coverage reports 91 | htmlcov/ 92 | .tox/ 93 | .coverage 94 | .coverage.* 95 | .cache 96 | nosetests.xml 97 | coverage.xml 98 | *.cover 99 | .hypothesis/ 100 | 101 | # Translations 102 | *.mo 103 | *.pot 104 | 105 | # Django stuff: 106 | *.log 107 | local_settings.py 108 | 109 | # Flask stuff: 110 | instance/ 111 | .webassets-cache 112 | 113 | # Scrapy stuff: 114 | .scrapy 115 | 116 | # Sphinx documentation 117 | docs/_build/ 118 | 119 | # PyBuilder 120 | target/ 121 | 122 | # Jupyter Notebook 123 | .ipynb_checkpoints 124 | 125 | # pyenv 126 | .python-version 127 | 128 | # celery beat schedule file 129 | celerybeat-schedule 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | /tests/load_tests/logs/* 154 | /tests/.pytest_cache/ 155 | ws_test.py 156 | /.vscode/ 157 | 158 | .s3_cache/ 159 | mlruns 160 | *.pyc 161 | *.swp 162 | *.pt 163 | *.bin 164 | .vscode/ 165 | runs/ 166 | jupyters/custom_* 167 | 168 | *logs/ 169 | .DS_store 170 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # ruDALL-E diffusion 2 | 3 | Try it out on Colab: 4 | 5 | ru-dalle-diffusion colab link 6 | 7 | 8 | ruDALL-E diffusion is ruDALL-E with a diffusion decoder, similar to [dall-3](https://github.com/Jack000/DALLE-pytorch/). 9 | 10 | Decoding the VQ embeddings with a DDPM model can produce much more realistic fine-grained detail than the VQVAE or VQGAN decoder alone. 11 | 12 | The only code change to ruDALL-E is that generate_images() also returns the image tokens; the actual diffusion model lives at https://github.com/Jack000/guided-diffusion 13 | 14 | # Samples 15 | *(four side-by-side sample pairs: ruDALL-E + real-ESRGAN vs. diffusion)* 21 | 22 | Note that the results depend heavily on the seed value. 23 | 24 | base image: *(image)* 26 | 27 | diffusion-generated samples (different seeds): *(2x3 grid of images)* 32 | 33 | ### generation by ruDALL-E: 34 | ```python 35 | from rudalle.pipelines import generate_images, show, super_resolution, cherry_pick_by_clip 36 | from rudalle import get_rudalle_model, get_tokenizer, get_vae, get_realesrgan, get_ruclip 37 | from rudalle.utils import seed_everything 38 | import numpy as np 39 | 40 | # prepare models: 41 | device = 'cuda' 42 | dalle = get_rudalle_model('Malevich', pretrained=True, fp16=True, device=device) 43 | tokenizer = get_tokenizer() 44 | vae = get_vae(dwt=False).to(device) # Make sure to set dwt to False!
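# Note: super_resolution(pil_images, realesrgan) is called further down, so the upscaler has to be
# created as well; a minimal sketch using the repo's get_realesrgan loader (the 'x2' scale is an
# arbitrary choice for this example, 'x4' and 'x8' checkpoints are also available):
realesrgan = get_realesrgan('x2', device=device)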
45 | 46 | text = 'изображение радуги на фоне ночного города' 47 | 48 | seed_everything(42) 49 | pil_images = [] 50 | scores = [] 51 | codes = [] 52 | 53 | for top_k, top_p, images_num in [ 54 | (2048, 0.995, 3), 55 | (1536, 0.99, 3), 56 | (1024, 0.99, 3), 57 | (1024, 0.98, 3), 58 | (512, 0.97, 3), 59 | (384, 0.96, 3), 60 | (256, 0.95, 3), 61 | (128, 0.95, 3), 62 | ]: 63 | _pil_images, _scores, _codes = generate_images(text, tokenizer, dalle, vae, top_k=top_k, images_num=images_num, top_p=top_p, return_codes=True) 64 | pil_images += _pil_images 65 | scores += _scores 66 | codes += _codes 67 | 68 | sr_images = super_resolution(pil_images, realesrgan) 69 | 70 | for i, im in enumerate(pil_images): 71 | im.save(str(i)+'.png') 72 | sr_images[i].save(str(i)+'sr.png') 73 | with open(str(i)+'.npy', 'wb') as f: 74 | np.save(f, codes[i]) 75 | 76 | # afterward, pass .npy file to diffusion model https://github.com/Jack000/guided-diffusion 77 | ``` 78 | -------------------------------------------------------------------------------- /rudalle/dalle/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | import torch 5 | from huggingface_hub import hf_hub_url, cached_download 6 | 7 | from .model import DalleModel 8 | from .fp16 import FP16Module 9 | 10 | 11 | MODELS = { 12 | 'Malevich': dict( 13 | description='◼️ Malevich is 1.3 billion params model from the family GPT3-like, ' 14 | 'that uses Russian language and text+image multi-modality.', 15 | model_params=dict( 16 | num_layers=24, 17 | hidden_size=2048, 18 | num_attention_heads=16, 19 | embedding_dropout_prob=0.1, 20 | output_dropout_prob=0.1, 21 | attention_dropout_prob=0.1, 22 | image_tokens_per_dim=32, 23 | text_seq_length=128, 24 | cogview_sandwich_layernorm=True, 25 | cogview_pb_relax=True, 26 | vocab_size=16384+128, 27 | image_vocab_size=8192, 28 | ), 29 | repo_id='sberbank-ai/rudalle-Malevich', 30 | filename='pytorch_model_v2.bin', 31 | authors='SberAI, SberDevices', 32 | full_description='', # TODO 33 | ), 34 | 'Emojich': dict( 35 | description='😋 Emojich is a 1.3 billion params model from the family GPT3-like, ' 36 | 'it generates emoji-style images with the brain of ◾ Malevich.', 37 | model_params=dict( 38 | num_layers=24, 39 | hidden_size=2048, 40 | num_attention_heads=16, 41 | embedding_dropout_prob=0.1, 42 | output_dropout_prob=0.1, 43 | attention_dropout_prob=0.1, 44 | image_tokens_per_dim=32, 45 | text_seq_length=128, 46 | cogview_sandwich_layernorm=True, 47 | cogview_pb_relax=True, 48 | vocab_size=16384 + 128, 49 | image_vocab_size=8192, 50 | ), 51 | repo_id='sberbank-ai/rudalle-Emojich', 52 | filename='pytorch_model.bin', 53 | authors='SberAI', 54 | full_description='', # TODO 55 | ), 56 | 'small': dict( 57 | description='', 58 | model_params=dict( 59 | num_layers=12, 60 | hidden_size=768, 61 | num_attention_heads=12, 62 | embedding_dropout_prob=0.1, 63 | output_dropout_prob=0.1, 64 | attention_dropout_prob=0.1, 65 | image_tokens_per_dim=32, 66 | text_seq_length=128, 67 | cogview_sandwich_layernorm=True, 68 | cogview_pb_relax=True, 69 | vocab_size=16384+128, 70 | image_vocab_size=8192, 71 | ), 72 | repo_id='', 73 | filename='', 74 | full_description='', # TODO 75 | ), 76 | } 77 | 78 | 79 | def get_rudalle_model(name, pretrained=True, fp16=False, device='cpu', cache_dir='/tmp/rudalle', **model_kwargs): 80 | # TODO docstring 81 | assert name in MODELS 82 | 83 | if fp16 and device == 'cpu': 84 | print('Warning! Using both fp16 and cpu doesnt support. 
You can use cuda device or turn off fp16.') 85 | 86 | config = MODELS[name].copy() 87 | config['model_params'].update(model_kwargs) 88 | model = DalleModel(device=device, **config['model_params']) 89 | if pretrained: 90 | cache_dir = os.path.join(cache_dir, name) 91 | config_file_url = hf_hub_url(repo_id=config['repo_id'], filename=config['filename']) 92 | cached_download(config_file_url, cache_dir=cache_dir, force_filename=config['filename']) 93 | checkpoint = torch.load(os.path.join(cache_dir, config['filename']), map_location='cpu') 94 | model.load_state_dict(checkpoint) 95 | if fp16: 96 | model = FP16Module(model) 97 | model.eval() 98 | model = model.to(device) 99 | if config['description'] and pretrained: 100 | print(config['description']) 101 | return model 102 | -------------------------------------------------------------------------------- /rudalle/vae/decoder_dwt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pywt 3 | import torch 4 | import torch.nn as nn 5 | from taming.modules.diffusionmodules.model import Decoder 6 | 7 | from .pytorch_wavelets_utils import SFB2D, _SFB2D, prep_filt_sfb2d, mode_to_int 8 | 9 | 10 | class DecoderDWT(nn.Module): 11 | def __init__(self, ddconfig, embed_dim): 12 | super().__init__() 13 | if ddconfig.out_ch != 12: 14 | ddconfig.out_ch = 12 15 | self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig['z_channels'], 1) 16 | self.decoder = Decoder(**ddconfig) 17 | self.idwt = DWTInverse(mode='zero', wave='db1') 18 | 19 | def forward(self, x): 20 | # x = self.post_quant_conv(x) 21 | freq = self.decoder(x) 22 | img = self.dwt_to_img(freq) 23 | return img 24 | 25 | def dwt_to_img(self, img): 26 | b, c, h, w = img.size() 27 | low = img[:, :3, :, :] 28 | high = img[:, 3:, :, :].view(b, 3, 3, h, w) 29 | return self.idwt((low, [high])) 30 | 31 | 32 | class DWTInverse(nn.Module): 33 | """ Performs a 2d DWT Inverse reconstruction of an image 34 | 35 | Args: 36 | wave (str or pywt.Wavelet): Which wavelet to use 37 | C: deprecated, will be removed in future 38 | """ 39 | 40 | def __init__(self, wave='db1', mode='zero', trace_model=False): 41 | super().__init__() 42 | if isinstance(wave, str): 43 | wave = pywt.Wavelet(wave) 44 | if isinstance(wave, pywt.Wavelet): 45 | g0_col, g1_col = wave.rec_lo, wave.rec_hi 46 | g0_row, g1_row = g0_col, g1_col 47 | else: 48 | if len(wave) == 2: 49 | g0_col, g1_col = wave[0], wave[1] 50 | g0_row, g1_row = g0_col, g1_col 51 | elif len(wave) == 4: 52 | g0_col, g1_col = wave[0], wave[1] 53 | g0_row, g1_row = wave[2], wave[3] 54 | # Prepare the filters 55 | filts = prep_filt_sfb2d(g0_col, g1_col, g0_row, g1_row) 56 | self.register_buffer('g0_col', filts[0]) 57 | self.register_buffer('g1_col', filts[1]) 58 | self.register_buffer('g0_row', filts[2]) 59 | self.register_buffer('g1_row', filts[3]) 60 | self.mode = mode 61 | self.trace_model = trace_model 62 | 63 | def forward(self, coeffs): 64 | """ 65 | Args: 66 | coeffs (yl, yh): tuple of lowpass and bandpass coefficients, where: 67 | yl is a lowpass tensor of shape :math:`(N, C_{in}, H_{in}', 68 | W_{in}')` and yh is a list of bandpass tensors of shape 69 | :math:`list(N, C_{in}, 3, H_{in}'', W_{in}'')`. I.e. should match 70 | the format returned by DWTForward 71 | 72 | Returns: 73 | Reconstructed input of shape :math:`(N, C_{in}, H_{in}, W_{in})` 74 | 75 | Note: 76 | :math:`H_{in}', W_{in}', H_{in}'', W_{in}''` denote the correctly 77 | downsampled shapes of the DWT pyramid. 
78 | 79 | Note: 80 | Can have None for any of the highpass scales and will treat the 81 | values as zeros (not in an efficient way though). 82 | """ 83 | yl, yh = coeffs 84 | ll = yl 85 | mode = mode_to_int(self.mode) 86 | 87 | # Do a multilevel inverse transform 88 | for h in yh[::-1]: 89 | if h is None: 90 | h = torch.zeros(ll.shape[0], ll.shape[1], 3, ll.shape[-2], 91 | ll.shape[-1], device=ll.device) 92 | 93 | # 'Unpad' added dimensions 94 | if ll.shape[-2] > h.shape[-2]: 95 | ll = ll[..., :-1, :] 96 | if ll.shape[-1] > h.shape[-1]: 97 | ll = ll[..., :-1] 98 | if not self.trace_model: 99 | ll = SFB2D.apply(ll, h, self.g0_col, self.g1_col, self.g0_row, self.g1_row, mode) 100 | else: 101 | ll = _SFB2D(ll, h, self.g0_col, self.g1_col, self.g0_row, self.g1_row, mode) 102 | return ll 103 | -------------------------------------------------------------------------------- /rudalle/vae/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from math import sqrt, log 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch import einsum 8 | from einops import rearrange 9 | from taming.modules.diffusionmodules.model import Encoder, Decoder 10 | 11 | from .decoder_dwt import DecoderDWT 12 | 13 | 14 | class VQGanGumbelVAE(torch.nn.Module): 15 | 16 | def __init__(self, config, dwt=False): 17 | super().__init__() 18 | model = GumbelVQ( 19 | ddconfig=config.model.params.ddconfig, 20 | n_embed=config.model.params.n_embed, 21 | embed_dim=config.model.params.embed_dim, 22 | kl_weight=config.model.params.kl_weight, 23 | dwt=dwt, 24 | ) 25 | self.model = model 26 | self.num_layers = int(log(config.model.params.ddconfig.attn_resolutions[0]) / log(2)) 27 | self.image_size = 256 28 | self.num_tokens = config.model.params.n_embed 29 | 30 | @torch.no_grad() 31 | def get_codebook_indices(self, img): 32 | img = (2 * img) - 1 33 | _, _, [_, _, indices] = self.model.encode(img) 34 | return rearrange(indices, 'b h w -> b (h w)') 35 | 36 | def decode(self, img_seq): 37 | b, n = img_seq.shape 38 | one_hot_indices = torch.nn.functional.one_hot(img_seq, num_classes=self.num_tokens).float() 39 | z = (one_hot_indices @ self.model.quantize.embed.weight) 40 | z = rearrange(z, 'b (h w) c -> b c h w', h=int(sqrt(n))) 41 | img = self.model.decode(z) 42 | img = (img.clamp(-1., 1.) + 1) * 0.5 43 | return img 44 | 45 | 46 | class GumbelQuantize(nn.Module): 47 | """ 48 | credit to @karpathy: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py (thanks!) 49 | Gumbel Softmax trick quantizer 50 | Categorical Reparameterization with Gumbel-Softmax, Jang et al. 
2016 51 | https://arxiv.org/abs/1611.01144 52 | """ 53 | 54 | def __init__(self, num_hiddens, embedding_dim, n_embed, straight_through=True, 55 | kl_weight=5e-4, temp_init=1.0, use_vqinterface=True): 56 | super().__init__() 57 | self.embedding_dim = embedding_dim 58 | self.n_embed = n_embed 59 | self.straight_through = straight_through 60 | self.temperature = temp_init 61 | self.kl_weight = kl_weight 62 | self.proj = nn.Conv2d(num_hiddens, n_embed, 1) 63 | self.embed = nn.Embedding(self.n_embed, self.embedding_dim) 64 | self.use_vqinterface = use_vqinterface 65 | 66 | def forward(self, z, temp=None, return_logits=False): 67 | hard = self.straight_through if self.training else True 68 | temp = self.temperature if temp is None else temp 69 | logits = self.proj(z) 70 | soft_one_hot = F.gumbel_softmax(logits, tau=temp, dim=1, hard=hard) 71 | z_q = einsum('b n h w, n d -> b d h w', soft_one_hot, self.embed.weight) 72 | # + kl divergence to the prior loss 73 | qy = F.softmax(logits, dim=1) 74 | diff = self.kl_weight * torch.sum(qy * torch.log(qy * self.n_embed + 1e-10), dim=1).mean() 75 | ind = soft_one_hot.argmax(dim=1) 76 | if self.use_vqinterface: 77 | if return_logits: 78 | return z_q, diff, (None, None, ind), logits 79 | return z_q, diff, (None, None, ind) 80 | return z_q, diff, ind 81 | 82 | 83 | class GumbelVQ(nn.Module): 84 | 85 | def __init__(self, ddconfig, n_embed, embed_dim, dwt=False, kl_weight=1e-8): 86 | super().__init__() 87 | z_channels = ddconfig['z_channels'] 88 | self.dwt = dwt 89 | self.encoder = Encoder(**ddconfig) 90 | self.decoder = DecoderDWT(ddconfig, embed_dim) if dwt else Decoder(**ddconfig) 91 | self.quantize = GumbelQuantize(z_channels, embed_dim, n_embed=n_embed, kl_weight=kl_weight, temp_init=1.0) 92 | self.quant_conv = torch.nn.Conv2d(ddconfig['z_channels'], embed_dim, 1) 93 | self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig['z_channels'], 1) 94 | 95 | def encode(self, x): 96 | h = self.encoder(x) 97 | h = self.quant_conv(h) 98 | quant, emb_loss, info = self.quantize(h) 99 | return quant, emb_loss, info 100 | 101 | def decode(self, quant): 102 | if self.dwt: 103 | quant = self.decoder.post_quant_conv(quant) 104 | else: 105 | quant = self.post_quant_conv(quant) 106 | dec = self.decoder(quant) 107 | return dec 108 | -------------------------------------------------------------------------------- /rudalle/realesrgan/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | 5 | def pad_reflect(image, pad_size): 6 | imsize = image.shape 7 | height, width = imsize[:2] 8 | new_img = np.zeros([height + pad_size * 2, width + pad_size * 2, imsize[2]]).astype(np.uint8) 9 | new_img[pad_size:-pad_size, pad_size:-pad_size, :] = image 10 | new_img[0:pad_size, pad_size:-pad_size, :] = np.flip(image[0:pad_size, :, :], axis=0) # top 11 | new_img[-pad_size:, pad_size:-pad_size, :] = np.flip(image[-pad_size:, :, :], axis=0) # bottom 12 | new_img[:, 0:pad_size, :] = np.flip(new_img[:, pad_size:pad_size * 2, :], axis=1) # left 13 | new_img[:, -pad_size:, :] = np.flip(new_img[:, -pad_size * 2:-pad_size, :], axis=1) # right 14 | return new_img 15 | 16 | 17 | def unpad_image(image, pad_size): 18 | return image[pad_size:-pad_size, pad_size:-pad_size, :] 19 | 20 | 21 | def pad_patch(image_patch, padding_size, channel_last=True): 22 | """ Pads image_patch with with padding_size edge values. 
""" 23 | if channel_last: 24 | return np.pad( 25 | image_patch, 26 | ((padding_size, padding_size), (padding_size, padding_size), (0, 0)), 27 | 'edge', 28 | ) 29 | else: 30 | return np.pad( 31 | image_patch, 32 | ((0, 0), (padding_size, padding_size), (padding_size, padding_size)), 33 | 'edge', 34 | ) 35 | 36 | 37 | def unpad_patches(image_patches, padding_size): 38 | return image_patches[:, padding_size:-padding_size, padding_size:-padding_size, :] 39 | 40 | 41 | def split_image_into_overlapping_patches(image_array, patch_size, padding_size=2): 42 | """ Splits the image into partially overlapping patches. 43 | The patches overlap by padding_size pixels. 44 | Pads the image twice: 45 | - first to have a size multiple of the patch size, 46 | - then to have equal padding at the borders. 47 | Args: 48 | image_array: numpy array of the input image. 49 | patch_size: size of the patches from the original image (without padding). 50 | padding_size: size of the overlapping area. 51 | """ 52 | xmax, ymax, _ = image_array.shape 53 | x_remainder = xmax % patch_size 54 | y_remainder = ymax % patch_size 55 | 56 | # modulo here is to avoid extending of patch_size instead of 0 57 | x_extend = (patch_size - x_remainder) % patch_size 58 | y_extend = (patch_size - y_remainder) % patch_size 59 | 60 | # make sure the image is divisible into regular patches 61 | extended_image = np.pad(image_array, ((0, x_extend), (0, y_extend), (0, 0)), 'edge') 62 | 63 | # add padding around the image to simplify computations 64 | padded_image = pad_patch(extended_image, padding_size, channel_last=True) 65 | 66 | xmax, ymax, _ = padded_image.shape 67 | patches = [] 68 | 69 | x_lefts = range(padding_size, xmax - padding_size, patch_size) 70 | y_tops = range(padding_size, ymax - padding_size, patch_size) 71 | 72 | for x in x_lefts: 73 | for y in y_tops: 74 | x_left = x - padding_size 75 | y_top = y - padding_size 76 | x_right = x + patch_size + padding_size 77 | y_bottom = y + patch_size + padding_size 78 | patch = padded_image[x_left:x_right, y_top:y_bottom, :] 79 | patches.append(patch) 80 | 81 | return np.array(patches), padded_image.shape 82 | 83 | 84 | def stich_together(patches, padded_image_shape, target_shape, padding_size=4): 85 | """ Reconstruct the image from overlapping patches. 86 | After scaling, shapes and padding should be scaled too. 87 | Args: 88 | patches: patches obtained with split_image_into_overlapping_patches 89 | padded_image_shape: shape of the padded image contructed in split_image_into_overlapping_patches 90 | target_shape: shape of the final image 91 | padding_size: size of the overlapping area. 
92 | """ 93 | 94 | xmax, ymax, _ = padded_image_shape 95 | patches = unpad_patches(patches, padding_size) 96 | patch_size = patches.shape[1] 97 | n_patches_per_row = ymax // patch_size 98 | 99 | complete_image = np.zeros((xmax, ymax, 3)) 100 | 101 | row = -1 102 | col = 0 103 | for i in range(len(patches)): 104 | if i % n_patches_per_row == 0: 105 | row += 1 106 | col = 0 107 | complete_image[ 108 | row * patch_size: (row + 1) * patch_size, col * patch_size: (col + 1) * patch_size, : 109 | ] = patches[i] 110 | col += 1 111 | return complete_image[0: target_shape[0], 0: target_shape[1], :] 112 | -------------------------------------------------------------------------------- /Emojich.md: -------------------------------------------------------------------------------- 1 | [[Paper]](https://arxiv.org/abs/2112.02448) [[Хабр]](https://habr.com/ru/company/sberbank/blog/593893/) [[Model Card]](https://huggingface.co/sberbank-ai/rudalle-Emojich) [[Kaggle]](https://www.kaggle.com/shonenkov/emojich-rudall-e) [[Dataset]](https://www.kaggle.com/shonenkov/russian-emoji) 2 | # Emojich 3 | ![](./pics/emojich/emojich_rgba_100.png) 4 | ### generate emojis from text 5 | 6 | The model was trained by [Sber AI](https://github.com/sberbank-ai) 7 | * Task: `text2image generation` 8 | * Num Parameters: `1.3 B` 9 | * Training Data Volume: `120 million text-image pairs` & [`2749 text-emoji pairs`](https://www.kaggle.com/shonenkov/russian-emoji) 10 | 11 | [![Telegram](https://img.shields.io/badge/Telegram-Stickers-blue?style=for-the-badge&logo=data:image/svg%2bxml;base64,PHN2ZyBlbmFibGUtYmFja2dyb3VuZD0ibmV3IDAgMCAyNCAyNCIgaGVpZ2h0PSI1MTIiIHZpZXdCb3g9IjAgMCAyNCAyNCIgd2lkdGg9IjUxMiIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48cGF0aCBkPSJtOS40MTcgMTUuMTgxLS4zOTcgNS41ODRjLjU2OCAwIC44MTQtLjI0NCAxLjEwOS0uNTM3bDIuNjYzLTIuNTQ1IDUuNTE4IDQuMDQxYzEuMDEyLjU2NCAxLjcyNS4yNjcgMS45OTgtLjkzMWwzLjYyMi0xNi45NzIuMDAxLS4wMDFjLjMyMS0xLjQ5Ni0uNTQxLTIuMDgxLTEuNTI3LTEuNzE0bC0yMS4yOSA4LjE1MWMtMS40NTMuNTY0LTEuNDMxIDEuMzc0LS4yNDcgMS43NDFsNS40NDMgMS42OTMgMTIuNjQzLTcuOTExYy41OTUtLjM5NCAxLjEzNi0uMTc2LjY5MS4yMTh6IiBmaWxsPSIjMDM5YmU1Ii8+PC9zdmc+)](https://telegram.me/addstickers/SberAI_ruDALLE) 12 | 13 | ### Model Description 14 | 😋 Emojich is a 1.3 billion parameter GPT3-like model that generates emoji-style images with the brain of ◾ Malevich. 15 | 16 | 17 | ### Fine-tuning stage: 18 | 19 | The main goal of fine-tuning is to keep the generalization ability of [ruDALL-E Malevich (XL)](https://huggingface.co/sberbank-ai/rudalle-Malevich) 20 | on the text-to-emoji task. ruDALL-E Malevich is a large multi-modal pretrained transformer that works with both images and texts. 21 | Freezing the feedforward and self-attention layers of a pretrained transformer has been shown to work well when switching to a different modality. 22 | At the same time, the model has a good chance of over-fitting to the text modality and losing generalization. 23 | To deal with this problem, the coefficient of the image-codebook part of the weighted cross-entropy loss is increased by a factor of 10^3.
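For illustration only, this re-weighting can be sketched as below. This is not the project's training code (the full version is linked right after this note); the function name and the exact point where the 10^3 factor enters are assumptions, but the text/image split and the normalization mirror `DalleModel.forward` in `rudalle/dalle/model.py`.

```python
import torch.nn.functional as F


def weighted_dalle_loss(text_logits, image_logits, text_labels, image_labels, loss_img_weight=1000):
    """Hypothetical sketch of the up-weighted cross-entropy used during fine-tuning.

    text_logits / image_logits are (batch, vocab, seq) tensors, as produced in
    DalleModel.forward; loss_img_weight ~ 10^3 keeps the image-codebook term dominant,
    so the model keeps learning the image modality instead of over-fitting to text tokens.
    """
    loss_text = F.cross_entropy(text_logits, text_labels)
    loss_img = F.cross_entropy(image_logits, image_labels)
    # Same normalization as in DalleModel.forward, only with a much larger image weight.
    return (loss_text + loss_img_weight * loss_img) / (loss_img_weight + 1)
```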
24 | 25 | 26 | The full version of the training code is available on Kaggle: [![Kaggle](https://kaggle.com/static/images/open-in-kaggle.svg)](https://www.kaggle.com/shonenkov/emojich-rudall-e) 27 | 28 | ### Usage: 29 | 30 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YbEduCe8jH0DXMXKxnb8ulmT8jscJ54i?usp=sharing) 31 | 32 | ```python 33 | from rudalle.pipelines import generate_images, show 34 | from rudalle import get_rudalle_model, get_tokenizer, get_vae 35 | from rudalle.utils import seed_everything 36 | 37 | device = 'cuda' 38 | dalle = get_rudalle_model('Emojich', pretrained=True, fp16=True, device=device) 39 | tokenizer = get_tokenizer() 40 | vae = get_vae(dwt=True).to(device) 41 | 42 | text = 'Дональд Трамп из лего' # Donald Trump made of LEGO 43 | 44 | seed_everything(42) 45 | pil_images = [] 46 | for top_k, top_p, images_num in [ 47 | (2048, 0.995, 16), 48 | ]: 49 | pil_images += generate_images(text, tokenizer, dalle, vae, top_k=top_k, images_num=images_num, top_p=top_p, bs=8)[0] 50 | 51 | show(pil_images, 4) 52 | ``` 53 | ![](./pics/emojich/emoji-Donald.png) 54 | 55 | ### Super Resolution: 56 | ```python 57 | from rudalle.pipelines import super_resolution 58 | from rudalle import get_realesrgan 59 | 60 | device = 'cuda' 61 | realesrgan = get_realesrgan('x4', device=device) 62 | sr_images = super_resolution(pil_images, realesrgan) 63 | ``` 64 | 65 | ### Converting to Telegram Stickers format (512x512 RGBA) 66 | ```python 67 | from rudalle.pipelines import convert_emoji_to_rgba, show_rgba 68 | from rudalle import get_emojich_unet 69 | 70 | device = 'cuda' 71 | emojich_unet = get_emojich_unet('unet_effnetb7').to(device) 72 | rgba_images, _ = convert_emoji_to_rgba(sr_images, emojich_unet, device=device) 73 | for rgba_image in rgba_images: 74 | show_rgba(rgba_image); 75 | ``` 76 | ![](./pics/emojich/emojich-stickers.png) 77 | 78 | ### Examples of generated emojis 79 | 80 | All examples are generated automatically (without manual cherry-picking) with the following hyper-parameters: 81 | seed 42, batch size 16, top-k 2048, top-p 0.995, temperature 1.0, GPU A100. 82 | To get better emojis, use more attempts (~512) and select the best ones manually. 83 | 84 | *Remember, the great art makers became "great" after creating just one masterpiece.* 85 | 86 | ![](./pics/emojich/examples.png) 87 | 88 | 89 | ### Citation 90 | Feel free to cite our work in your research if it is helpful for you: 91 | ``` 92 | @misc{shonenkov2021emojich, 93 | title={Emojich -- zero-shot emoji generation using Russian language: a technical report}, 94 | author={Alex Shonenkov and Daria Bakshandaeva and Denis Dimitrov and Aleksandr Nikolich}, 95 | year={2021}, 96 | eprint={2112.02448}, 97 | archivePrefix={arXiv}, 98 | primaryClass={cs.CL} 99 | } 100 | ``` 101 | -------------------------------------------------------------------------------- /rudalle/realesrgan/rrdbnet_arch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | from torch import nn as nn 4 | from torch.nn import functional as F 5 | 6 | from .arch_util import default_init_weights, make_layer, pixel_unshuffle 7 | 8 | 9 | class ResidualDenseBlock(nn.Module): 10 | """Residual Dense Block. 11 | Used in RRDB block in ESRGAN. 12 | Args: 13 | num_feat (int): Channel number of intermediate features. 14 | num_grow_ch (int): Channels for each growth.
15 | """ 16 | 17 | def __init__(self, num_feat=64, num_grow_ch=32): 18 | super(ResidualDenseBlock, self).__init__() 19 | self.conv1 = nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1) 20 | self.conv2 = nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1) 21 | self.conv3 = nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1) 22 | self.conv4 = nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1) 23 | self.conv5 = nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1) 24 | 25 | self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) 26 | 27 | # initialization 28 | default_init_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1) 29 | 30 | def forward(self, x): 31 | x1 = self.lrelu(self.conv1(x)) 32 | x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1))) 33 | x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1))) 34 | x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1))) 35 | x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1)) 36 | # Emperically, we use 0.2 to scale the residual for better performance 37 | return x5 * 0.2 + x 38 | 39 | 40 | class RRDB(nn.Module): 41 | """Residual in Residual Dense Block. 42 | Used in RRDB-Net in ESRGAN. 43 | Args: 44 | num_feat (int): Channel number of intermediate features. 45 | num_grow_ch (int): Channels for each growth. 46 | """ 47 | 48 | def __init__(self, num_feat, num_grow_ch=32): 49 | super(RRDB, self).__init__() 50 | self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch) 51 | self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch) 52 | self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch) 53 | 54 | def forward(self, x): 55 | out = self.rdb1(x) 56 | out = self.rdb2(out) 57 | out = self.rdb3(out) 58 | # Emperically, we use 0.2 to scale the residual for better performance 59 | return out * 0.2 + x 60 | 61 | 62 | class RRDBNet(nn.Module): 63 | """Networks consisting of Residual in Residual Dense Block, which is used 64 | in ESRGAN. 65 | ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks. 66 | We extend ESRGAN for scale x2 and scale x1. 67 | Note: This is one option for scale 1, scale 2 in RRDBNet. 68 | We first employ the pixel-unshuffle (an inverse operation of pixelshuffle to reduce the spatial size 69 | and enlarge the channel size before feeding inputs into the main ESRGAN architecture. 70 | Args: 71 | num_in_ch (int): Channel number of inputs. 72 | num_out_ch (int): Channel number of outputs. 73 | num_feat (int): Channel number of intermediate features. 74 | Default: 64 75 | num_block (int): Block number in the trunk network. Defaults: 23 76 | num_grow_ch (int): Channels for each growth. Default: 32. 
77 | """ 78 | 79 | def __init__(self, num_in_ch, num_out_ch, scale=4, num_feat=64, num_block=23, num_grow_ch=32): 80 | super(RRDBNet, self).__init__() 81 | self.scale = scale 82 | if scale == 2: 83 | num_in_ch = num_in_ch * 4 84 | elif scale == 1: 85 | num_in_ch = num_in_ch * 16 86 | self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1) 87 | self.body = make_layer(RRDB, num_block, num_feat=num_feat, num_grow_ch=num_grow_ch) 88 | self.conv_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1) 89 | # upsample 90 | self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) 91 | self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) 92 | if scale == 8: 93 | self.conv_up3 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) 94 | self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1) 95 | self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) 96 | 97 | self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) 98 | 99 | def forward(self, x): 100 | if self.scale == 2: 101 | feat = pixel_unshuffle(x, scale=2) 102 | elif self.scale == 1: 103 | feat = pixel_unshuffle(x, scale=4) 104 | else: 105 | feat = x 106 | feat = self.conv_first(feat) 107 | body_feat = self.conv_body(self.body(feat)) 108 | feat = feat + body_feat 109 | # upsample 110 | feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest'))) 111 | feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest'))) 112 | if self.scale == 8: 113 | feat = self.lrelu(self.conv_up3(F.interpolate(feat, scale_factor=2, mode='nearest'))) 114 | out = self.conv_last(self.lrelu(self.conv_hr(feat))) 115 | return out 116 | -------------------------------------------------------------------------------- /rudalle/dalle/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.nn.functional as F 4 | from einops import rearrange 5 | 6 | from .utils import exists, is_empty, init_method_normal 7 | 8 | from .transformer import DalleTransformer 9 | 10 | 11 | class DalleModel(torch.nn.Module): 12 | def __init__(self, 13 | device, 14 | num_layers, 15 | vocab_size, 16 | hidden_size, 17 | num_attention_heads, 18 | embedding_dropout_prob, 19 | attention_dropout_prob, 20 | output_dropout_prob, 21 | text_seq_length=128, 22 | image_tokens_per_dim=32, 23 | image_vocab_size=16384, 24 | loss_img_weight=7, 25 | cogview_sandwich_layernorm=False, 26 | cogview_pb_relax=False, 27 | is_bool_mask=True, 28 | mlp_activation='gelu_jit'): 29 | super(DalleModel, self).__init__() 30 | self.device = device 31 | self.image_tokens_per_dim = image_tokens_per_dim 32 | self.image_seq_length = image_tokens_per_dim ** 2 33 | self.text_seq_length = text_seq_length 34 | self.total_seq_length = self.text_seq_length + self.image_seq_length 35 | self.total_vocab_size = vocab_size + image_vocab_size 36 | self.vocab_size = vocab_size 37 | self.loss_img_weight = loss_img_weight 38 | 39 | init_method = init_method_normal(std=0.02) 40 | 41 | self.text_embeddings = torch.nn.Embedding(vocab_size, hidden_size) 42 | self.image_embeddings = torch.nn.Embedding(image_vocab_size, hidden_size) 43 | 44 | # Position embedding (serial). 
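# Note: text positions use a single learned table of size text_seq_length + 1 (the extra slot covers the BOS token padded onto the text in forward()), while image positions are factorised into separate row and column tables of size image_tokens_per_dim each; get_image_pos_embeddings() sums the row and column embeddings for every image token.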
45 | self.text_pos_embeddings = torch.nn.Embedding(text_seq_length + 1, hidden_size) 46 | self.image_row_embeddings = torch.nn.Embedding(image_tokens_per_dim, hidden_size) 47 | self.image_col_embeddings = torch.nn.Embedding(image_tokens_per_dim, hidden_size) 48 | init_method(self.text_pos_embeddings.weight) 49 | init_method(self.image_row_embeddings.weight) 50 | init_method(self.image_col_embeddings.weight) 51 | 52 | self.to_logits = torch.nn.Sequential( 53 | torch.nn.LayerNorm(hidden_size), 54 | torch.nn.Linear(hidden_size, self.total_vocab_size), 55 | ) 56 | 57 | # Embeddings dropout 58 | self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) 59 | 60 | # Transformer 61 | self.transformer = DalleTransformer( 62 | num_layers, 63 | hidden_size, 64 | num_attention_heads, 65 | attention_dropout_prob, 66 | output_dropout_prob, 67 | text_seq_length=text_seq_length, 68 | image_tokens_per_dim=image_tokens_per_dim, 69 | cogview_sandwich_layernorm=cogview_sandwich_layernorm, 70 | cogview_pb_relax=cogview_pb_relax, 71 | mlp_activation=mlp_activation, 72 | is_bool_mask=is_bool_mask, 73 | ) 74 | 75 | def get_param(self, item): 76 | return getattr(self, item) 77 | 78 | def get_image_pos_embeddings(self, image_input_ids, past_length=0): 79 | input_shape = image_input_ids.size() 80 | row_ids = torch.arange(past_length, input_shape[-1] + past_length, 81 | dtype=torch.long, device=self.device) // self.image_tokens_per_dim 82 | row_ids = row_ids.unsqueeze(0).view(-1, input_shape[-1]) 83 | col_ids = torch.arange(past_length, input_shape[-1] + past_length, 84 | dtype=torch.long, device=self.device) % self.image_tokens_per_dim 85 | col_ids = col_ids.unsqueeze(0).view(-1, input_shape[-1]) 86 | return self.image_row_embeddings(row_ids) + self.image_col_embeddings(col_ids) 87 | 88 | def forward( 89 | self, 90 | input_ids, 91 | attention_mask, 92 | return_loss=False, 93 | has_cache=False, 94 | use_cache=False, 95 | ): 96 | text = input_ids[:, :self.text_seq_length] 97 | text_range = torch.arange(self.text_seq_length) 98 | text_range += (self.vocab_size - self.text_seq_length) 99 | text_range = text_range.to(self.device) 100 | text = torch.where(text == 0, text_range, text) 101 | # some hardcode :) 102 | text = F.pad(text, (1, 0), value=2) 103 | text_embeddings = self.text_embeddings(text) + \ 104 | self.text_pos_embeddings(torch.arange(text.shape[1], device=self.device)) 105 | 106 | image_input_ids = input_ids[:, self.text_seq_length:] 107 | 108 | if exists(image_input_ids) and not is_empty(image_input_ids): 109 | image_embeddings = self.image_embeddings(image_input_ids) + \ 110 | self.get_image_pos_embeddings(image_input_ids, past_length=0) 111 | embeddings = torch.cat((text_embeddings, image_embeddings), dim=1) 112 | else: 113 | embeddings = text_embeddings 114 | # some hardcode :) 115 | if embeddings.shape[1] > self.total_seq_length: 116 | embeddings = embeddings[:, :-1] 117 | 118 | alpha = 0.1 119 | embeddings = embeddings * alpha + embeddings.detach() * (1-alpha) 120 | 121 | attention_mask = attention_mask[:, :, :embeddings.shape[1], :embeddings.shape[1]] 122 | transformer_output, present_has_cache = self.transformer( 123 | embeddings, attention_mask, has_cache=has_cache, use_cache=use_cache) 124 | 125 | logits = self.to_logits(transformer_output) 126 | if return_loss is False: 127 | return logits, present_has_cache 128 | 129 | labels = torch.cat((text[:, 1:], image_input_ids), dim=1).contiguous().long() 130 | logits = rearrange(logits, 'b n c -> b c n') 131 | 132 | text_logits = logits[:, 
:self.vocab_size, :self.text_seq_length].contiguous().float() 133 | image_logits = logits[:, self.vocab_size:, self.text_seq_length:].contiguous().float() 134 | 135 | loss_text = F.cross_entropy( 136 | text_logits, 137 | labels[:, :self.text_seq_length]) 138 | loss_img = F.cross_entropy( 139 | image_logits, 140 | labels[:, self.text_seq_length:]) 141 | 142 | loss = (loss_text + self.loss_img_weight * loss_img) / (self.loss_img_weight + 1) 143 | return loss, {'text': loss_text.data.detach().float(), 'image': loss_img.data.detach().float()} 144 | 145 | def to(self, device, *args, **kwargs): 146 | self.device = device 147 | return super().to(device, *args, **kwargs) 148 | -------------------------------------------------------------------------------- /rudalle/realesrgan/arch_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | import torch 4 | from torch import nn as nn 5 | from torch.nn import functional as F 6 | from torch.nn import init as init 7 | from torch.nn.modules.batchnorm import _BatchNorm 8 | 9 | 10 | @torch.no_grad() 11 | def default_init_weights(module_list, scale=1, bias_fill=0, **kwargs): 12 | """Initialize network weights. 13 | Args: 14 | module_list (list[nn.Module] | nn.Module): Modules to be initialized. 15 | scale (float): Scale initialized weights, especially for residual 16 | blocks. Default: 1. 17 | bias_fill (float): The value to fill bias. Default: 0 18 | kwargs (dict): Other arguments for initialization function. 19 | """ 20 | if not isinstance(module_list, list): 21 | module_list = [module_list] 22 | for module in module_list: 23 | for m in module.modules(): 24 | if isinstance(m, nn.Conv2d): 25 | init.kaiming_normal_(m.weight, **kwargs) 26 | m.weight.data *= scale 27 | if m.bias is not None: 28 | m.bias.data.fill_(bias_fill) 29 | elif isinstance(m, nn.Linear): 30 | init.kaiming_normal_(m.weight, **kwargs) 31 | m.weight.data *= scale 32 | if m.bias is not None: 33 | m.bias.data.fill_(bias_fill) 34 | elif isinstance(m, _BatchNorm): 35 | init.constant_(m.weight, 1) 36 | if m.bias is not None: 37 | m.bias.data.fill_(bias_fill) 38 | 39 | 40 | def make_layer(basic_block, num_basic_block, **kwarg): 41 | """Make layers by stacking the same blocks. 42 | Args: 43 | basic_block (nn.module): nn.module class for basic block. 44 | num_basic_block (int): number of blocks. 45 | Returns: 46 | nn.Sequential: Stacked blocks in nn.Sequential. 47 | """ 48 | layers = [] 49 | for _ in range(num_basic_block): 50 | layers.append(basic_block(**kwarg)) 51 | return nn.Sequential(*layers) 52 | 53 | 54 | class ResidualBlockNoBN(nn.Module): 55 | """Residual block without BN. 56 | It has a style of: 57 | ---Conv-ReLU-Conv-+- 58 | |________________| 59 | Args: 60 | num_feat (int): Channel number of intermediate features. 61 | Default: 64. 62 | res_scale (float): Residual scale. Default: 1. 63 | pytorch_init (bool): If set to True, use pytorch default init, 64 | otherwise, use default_init_weights. Default: False. 
65 | """ 66 | 67 | def __init__(self, num_feat=64, res_scale=1, pytorch_init=False): 68 | super(ResidualBlockNoBN, self).__init__() 69 | self.res_scale = res_scale 70 | self.conv1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=True) 71 | self.conv2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1, bias=True) 72 | self.relu = nn.ReLU(inplace=True) 73 | 74 | if not pytorch_init: 75 | default_init_weights([self.conv1, self.conv2], 0.1) 76 | 77 | def forward(self, x): 78 | identity = x 79 | out = self.conv2(self.relu(self.conv1(x))) 80 | return identity + out * self.res_scale 81 | 82 | 83 | class Upsample(nn.Sequential): 84 | """Upsample module. 85 | Args: 86 | scale (int): Scale factor. Supported scales: 2^n and 3. 87 | num_feat (int): Channel number of intermediate features. 88 | """ 89 | 90 | def __init__(self, scale, num_feat): 91 | m = [] 92 | if (scale & (scale - 1)) == 0: # scale = 2^n 93 | for _ in range(int(math.log(scale, 2))): 94 | m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1)) 95 | m.append(nn.PixelShuffle(2)) 96 | elif scale == 3: 97 | m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1)) 98 | m.append(nn.PixelShuffle(3)) 99 | else: 100 | raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.') 101 | super(Upsample, self).__init__(*m) 102 | 103 | 104 | def flow_warp(x, flow, interp_mode='bilinear', padding_mode='zeros', align_corners=True): 105 | """Warp an image or feature map with optical flow. 106 | Args: 107 | x (Tensor): Tensor with size (n, c, h, w). 108 | flow (Tensor): Tensor with size (n, h, w, 2), normal value. 109 | interp_mode (str): 'nearest' or 'bilinear'. Default: 'bilinear'. 110 | padding_mode (str): 'zeros' or 'border' or 'reflection'. 111 | Default: 'zeros'. 112 | align_corners (bool): Before pytorch 1.3, the default value is 113 | align_corners=True. After pytorch 1.3, the default value is 114 | align_corners=False. Here, we use the True as default. 115 | Returns: 116 | Tensor: Warped image or feature map. 117 | """ 118 | assert x.size()[-2:] == flow.size()[1:3] 119 | _, _, h, w = x.size() 120 | # create mesh grid 121 | grid_y, grid_x = torch.meshgrid(torch.arange(0, h).type_as(x), torch.arange(0, w).type_as(x)) 122 | grid = torch.stack((grid_x, grid_y), 2).float() # W(x), H(y), 2 123 | grid.requires_grad = False 124 | 125 | vgrid = grid + flow 126 | # scale grid to [-1,1] 127 | vgrid_x = 2.0 * vgrid[:, :, :, 0] / max(w - 1, 1) - 1.0 128 | vgrid_y = 2.0 * vgrid[:, :, :, 1] / max(h - 1, 1) - 1.0 129 | vgrid_scaled = torch.stack((vgrid_x, vgrid_y), dim=3) 130 | output = F.grid_sample(x, vgrid_scaled, mode=interp_mode, padding_mode=padding_mode, align_corners=align_corners) 131 | 132 | # TODO, what if align_corners=False 133 | return output 134 | 135 | 136 | def resize_flow(flow, size_type, sizes, interp_mode='bilinear', align_corners=False): 137 | """Resize a flow according to ratio or shape. 138 | Args: 139 | flow (Tensor): Precomputed flow. shape [N, 2, H, W]. 140 | size_type (str): 'ratio' or 'shape'. 141 | sizes (list[int | float]): the ratio for resizing or the final output 142 | shape. 143 | 1) The order of ratio should be [ratio_h, ratio_w]. For 144 | downsampling, the ratio should be smaller than 1.0 (i.e., ratio 145 | < 1.0). For upsampling, the ratio should be larger than 1.0 (i.e., 146 | ratio > 1.0). 147 | 2) The order of output_size should be [out_h, out_w]. 148 | interp_mode (str): The mode of interpolation for resizing. 149 | Default: 'bilinear'. 150 | align_corners (bool): Whether align corners. Default: False. 
151 | Returns: 152 | Tensor: Resized flow. 153 | """ 154 | _, _, flow_h, flow_w = flow.size() 155 | if size_type == 'ratio': 156 | output_h, output_w = int(flow_h * sizes[0]), int(flow_w * sizes[1]) 157 | elif size_type == 'shape': 158 | output_h, output_w = sizes[0], sizes[1] 159 | else: 160 | raise ValueError(f'Size type should be ratio or shape, but got type {size_type}.') 161 | 162 | input_flow = flow.clone() 163 | ratio_h = output_h / flow_h 164 | ratio_w = output_w / flow_w 165 | input_flow[:, 0, :, :] *= ratio_w 166 | input_flow[:, 1, :, :] *= ratio_h 167 | resized_flow = F.interpolate( 168 | input=input_flow, size=(output_h, output_w), mode=interp_mode, align_corners=align_corners) 169 | return resized_flow 170 | 171 | 172 | # TODO: may write a cpp file 173 | def pixel_unshuffle(x, scale): 174 | """ Pixel unshuffle. 175 | Args: 176 | x (Tensor): Input feature with shape (b, c, hh, hw). 177 | scale (int): Downsample ratio. 178 | Returns: 179 | Tensor: the pixel unshuffled feature. 180 | """ 181 | b, c, hh, hw = x.size() 182 | out_channel = c * (scale**2) 183 | assert hh % scale == 0 and hw % scale == 0 184 | h = hh // scale 185 | w = hw // scale 186 | x_view = x.view(b, c, h, scale, w, scale) 187 | return x_view.permute(0, 1, 3, 5, 2, 4).reshape(b, out_channel, h, w) 188 | -------------------------------------------------------------------------------- /rudalle/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from glob import glob 4 | from os.path import join 5 | 6 | import cv2 7 | import torch 8 | import torchvision 9 | import transformers 10 | import more_itertools 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from tqdm.auto import tqdm 14 | from PIL import Image 15 | 16 | from . 
import utils 17 | 18 | 19 | def generate_images(text, tokenizer, dalle, vae, top_k, top_p, images_num, image_prompts=None, temperature=1.0, bs=4, 20 | seed=None, use_cache=True, return_codes=False): 21 | # TODO docstring 22 | if seed is not None: 23 | utils.seed_everything(seed) 24 | 25 | vocab_size = dalle.get_param('vocab_size') 26 | text_seq_length = dalle.get_param('text_seq_length') 27 | image_seq_length = dalle.get_param('image_seq_length') 28 | total_seq_length = dalle.get_param('total_seq_length') 29 | device = dalle.get_param('device') 30 | 31 | text = text.lower().strip() 32 | input_ids = tokenizer.encode_text(text, text_seq_length=text_seq_length) 33 | pil_images, scores, codes = [], [], [] 34 | for chunk in more_itertools.chunked(range(images_num), bs): 35 | chunk_bs = len(chunk) 36 | with torch.no_grad(): 37 | attention_mask = torch.tril(torch.ones((chunk_bs, 1, total_seq_length, total_seq_length), device=device)) 38 | out = input_ids.unsqueeze(0).repeat(chunk_bs, 1).to(device) 39 | has_cache = False 40 | sample_scores = [] 41 | if image_prompts is not None: 42 | prompts_idx, prompts = image_prompts.image_prompts_idx, image_prompts.image_prompts 43 | prompts = prompts.repeat(chunk_bs, 1) 44 | for idx in tqdm(range(out.shape[1], total_seq_length)): 45 | idx -= text_seq_length 46 | if image_prompts is not None and idx in prompts_idx: 47 | out = torch.cat((out, prompts[:, idx].unsqueeze(1)), dim=-1) 48 | else: 49 | logits, has_cache = dalle(out, attention_mask, 50 | has_cache=has_cache, use_cache=use_cache, return_loss=False) 51 | logits = logits[:, -1, vocab_size:] 52 | logits /= temperature 53 | filtered_logits = transformers.top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p) 54 | probs = torch.nn.functional.softmax(filtered_logits, dim=-1) 55 | sample = torch.multinomial(probs, 1) 56 | sample_scores.append(probs[torch.arange(probs.size(0)), sample.transpose(0, 1)]) 57 | out = torch.cat((out, sample), dim=-1) 58 | codebooks = out[:, -image_seq_length:] 59 | images = vae.decode(codebooks) 60 | pil_images += utils.torch_tensors_to_pil_list(images) 61 | scores += torch.cat(sample_scores).sum(0).detach().cpu().numpy().tolist() 62 | for j in range(codebooks.shape[0]): 63 | codes.append(codebooks[j].detach().cpu().numpy()) 64 | 65 | if return_codes: 66 | return pil_images, scores, codes 67 | return pil_images, scores 68 | 69 | def super_resolution(pil_images, realesrgan, batch_size=4): 70 | result = [] 71 | for pil_image in pil_images: 72 | with torch.no_grad(): 73 | sr_image = realesrgan.predict(np.array(pil_image), batch_size=batch_size) 74 | result.append(sr_image) 75 | return result 76 | 77 | 78 | def cherry_pick_by_clip(pil_images, text, ruclip, ruclip_processor, device='cpu', count=4): 79 | with torch.no_grad(): 80 | inputs = ruclip_processor(text=text, images=pil_images) 81 | for key in inputs.keys(): 82 | inputs[key] = inputs[key].to(device) 83 | outputs = ruclip(**inputs) 84 | sims = outputs.logits_per_image.view(-1).softmax(dim=0) 85 | items = [] 86 | for index, sim in enumerate(sims.cpu().numpy()): 87 | items.append({'img_index': index, 'cosine': sim}) 88 | items = sorted(items, key=lambda x: x['cosine'], reverse=True)[:count] 89 | top_pil_images = [pil_images[x['img_index']] for x in items] 90 | top_scores = [x['cosine'] for x in items] 91 | return top_pil_images, top_scores 92 | 93 | 94 | def show(pil_images, nrow=4, size=14, save_dir=None, show=True): 95 | """ 96 | :param pil_images: list of images in PIL 97 | :param nrow: number of rows 98 | :param size: size of 
the images 99 | :param save_dir: dir for separately saving of images, example: save_dir='./pics' 100 | """ 101 | if save_dir is not None: 102 | os.makedirs(save_dir, exist_ok=True) 103 | count = len(glob(join(save_dir, 'img_*.png'))) 104 | for i, pil_image in enumerate(pil_images): 105 | pil_image.save(join(save_dir, f'img_{count+i}.png')) 106 | 107 | pil_images = [pil_image.convert('RGB') for pil_image in pil_images] 108 | imgs = torchvision.utils.make_grid(utils.pil_list_to_torch_tensors(pil_images), nrow=nrow) 109 | if not isinstance(imgs, list): 110 | imgs = [imgs.cpu()] 111 | fix, axs = plt.subplots(ncols=len(imgs), squeeze=False, figsize=(size, size)) 112 | for i, img in enumerate(imgs): 113 | img = img.detach() 114 | img = torchvision.transforms.functional.to_pil_image(img) 115 | if save_dir is not None: 116 | count = len(glob(join(save_dir, 'group_*.png'))) 117 | img.save(join(save_dir, f'group_{count+i}.png')) 118 | if show: 119 | axs[0, i].imshow(np.asarray(img)) 120 | axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) 121 | if show: 122 | fix.show() 123 | plt.show() 124 | 125 | 126 | def classic_convert_emoji_to_rgba(np_image, lower_thr=240, upper_thr=255, width=2): 127 | img = np_image[:, :, :3].copy() 128 | lower = np.array([lower_thr, lower_thr, lower_thr], dtype='uint8') 129 | upper = np.array([upper_thr, upper_thr, upper_thr], dtype='uint8') 130 | mask = cv2.inRange(img, lower, upper) 131 | ret, thresh = cv2.threshold(mask, 0, 255, 0) 132 | contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) 133 | a_channel = np.ones((512, 512), dtype=np.uint8)*255 134 | if len(contours) != 0: 135 | contours = sorted(contours, key=lambda x: x.shape[0])[-7:] 136 | cv2.fillPoly(a_channel, contours, (0, 0, 0)) 137 | cv2.drawContours(a_channel, contours, -1, (0, 0, 0), width) 138 | img = cv2.cvtColor(img, cv2.COLOR_RGB2RGBA) 139 | img[:, :, 3] = a_channel 140 | return img 141 | 142 | 143 | def convert_emoji_to_rgba(pil_images, emojich_unet, device='cpu', bs=1, score_thr=0.99): 144 | final_images, runs = [], [] 145 | with torch.no_grad(): 146 | for chunk in more_itertools.chunked(pil_images, bs): 147 | images = [] 148 | for pil_image in chunk: 149 | image = np.array(pil_image.resize((512, 512)))[:, :, :3] 150 | image = image.astype(np.float32) / 255.0 151 | image = torch.from_numpy(image).permute(2, 0, 1) 152 | images.append(image) 153 | images = torch.nn.utils.rnn.pad_sequence(images, batch_first=True) 154 | pred_masks = emojich_unet(images.to(device)) 155 | pred_masks = torch.softmax(pred_masks, 1) 156 | scores, pred_masks = torch.max(pred_masks, 1) 157 | pred_masks = pred_masks.int().cpu().numpy() 158 | pred_masks = (pred_masks * 255).astype(np.uint8) 159 | for pil_image, pred_mask, score in zip(chunk, pred_masks, scores): 160 | score = score.mean().item() 161 | final_image = np.zeros((512, 512, 4), np.uint8) 162 | final_image[:, :, :3] = np.array(pil_image.resize((512, 512)))[:, :, :3] 163 | if score > score_thr: 164 | run = 'unet' 165 | final_image[:, :, -1] = pred_mask 166 | else: 167 | run = 'classic' 168 | final_image = classic_convert_emoji_to_rgba(final_image) 169 | final_image = Image.fromarray(final_image) 170 | final_images.append(final_image) 171 | runs.append(run) 172 | return final_images, runs 173 | 174 | 175 | def show_rgba(rgba_pil_image): 176 | img = np.array(rgba_pil_image) 177 | fig, ax = plt.subplots(1, 3, figsize=(10, 10), dpi=100) 178 | ax[0].imshow(img[:, :, :3]) 179 | ax[1].imshow(img[:, :, -1]) 180 | mask = 
np.repeat(np.expand_dims(img[:, :, -1] < 128, -1), 3, axis=-1) 181 | img = img[:, :, :3] 182 | img[mask[:, :, 0], 0] = 64 183 | img[mask[:, :, 0], 1] = 255 184 | img[mask[:, :, 0], 2] = 64 185 | ax[2].imshow(img) 186 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2020] [sberbank-ai] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /rudalle/vae/pytorch_wavelets_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Useful utilities for testing the 2-D DTCWT with synthetic images 4 | License: https://github.com/fbcotter/pytorch_wavelets/blob/master/LICENSE 5 | Source: https://github.com/fbcotter/pytorch_wavelets/blob/31d6ac1b51b08f811a6a70eb7b3440f106009da0/pytorch_wavelets/dwt/lowlevel.py # noqa 6 | """ 7 | 8 | import pywt 9 | import torch 10 | import numpy as np 11 | import torch.nn.functional as F 12 | from torch.autograd import Function 13 | 14 | 15 | def sfb1d(lo, hi, g0, g1, mode='zero', dim=-1): 16 | """ 1D synthesis filter bank of an image tensor 17 | """ 18 | C = lo.shape[1] 19 | d = dim % 4 20 | # If g0, g1 are not tensors, make them. If they are, then assume that they 21 | # are in the right order 22 | if not isinstance(g0, torch.Tensor): 23 | g0 = torch.tensor(np.copy(np.array(g0).ravel()), 24 | dtype=torch.float, device=lo.device) 25 | if not isinstance(g1, torch.Tensor): 26 | g1 = torch.tensor(np.copy(np.array(g1).ravel()), 27 | dtype=torch.float, device=lo.device) 28 | L = g0.numel() 29 | shape = [1, 1, 1, 1] 30 | shape[d] = L 31 | N = 2*lo.shape[d] 32 | # If g aren't in the right shape, make them so 33 | if g0.shape != tuple(shape): 34 | g0 = g0.reshape(*shape) 35 | if g1.shape != tuple(shape): 36 | g1 = g1.reshape(*shape) 37 | 38 | s = (2, 1) if d == 2 else (1, 2) 39 | g0 = torch.cat([g0]*C, dim=0) 40 | g1 = torch.cat([g1]*C, dim=0) 41 | if mode == 'per' or mode == 'periodization': 42 | y = F.conv_transpose2d(lo, g0, stride=s, groups=C) + \ 43 | F.conv_transpose2d(hi, g1, stride=s, groups=C) 44 | if d == 2: 45 | y[:, :, :L-2] = y[:, :, :L-2] + y[:, :, N:N+L-2] 46 | y = y[:, :, :N] 47 | else: 48 | y[:, :, :, :L-2] = y[:, :, :, :L-2] + y[:, :, :, N:N+L-2] 49 | y = y[:, :, :, :N] 50 | y = roll(y, 1-L//2, dim=dim) 51 | else: 52 | if mode == 'zero' or mode == 'symmetric' or mode == 'reflect' or \ 53 | mode == 'periodic': 54 | pad = (L-2, 0) if d == 2 else (0, L-2) 55 | y = F.conv_transpose2d(lo, g0, stride=s, padding=pad, groups=C) + \ 56 | F.conv_transpose2d(hi, g1, stride=s, padding=pad, groups=C) 57 | else: 58 | raise ValueError('Unkown pad type: {}'.format(mode)) 59 | 60 | return y 61 | 62 | 63 | def _SFB2D(low, highs, g0_row, g1_row, g0_col, g1_col, mode): 64 | mode = int_to_mode(mode) 65 | 66 | lh, hl, hh = torch.unbind(highs, dim=2) 67 | lo = sfb1d(low, lh, g0_col, g1_col, mode=mode, dim=2) 68 | hi = sfb1d(hl, hh, g0_col, g1_col, mode=mode, dim=2) 69 | y = sfb1d(lo, hi, g0_row, g1_row, mode=mode, dim=3) 70 | 71 | return y 72 | 73 | 74 | def roll(x, n, dim, make_even=False): 75 | if n < 0: 76 | n = x.shape[dim] + n 77 | 78 | if make_even and x.shape[dim] % 2 == 1: 79 | end = 1 80 | else: 81 | end = 0 82 | 83 | if dim == 0: 84 | return torch.cat((x[-n:], x[:-n+end]), dim=0) 85 | elif dim == 1: 86 | return torch.cat((x[:, -n:], x[:, :-n+end]), dim=1) 87 | elif dim == 2 or dim == -2: 88 | return torch.cat((x[:, :, -n:], x[:, :, :-n+end]), dim=2) 89 | elif dim == 3 or dim == -1: 90 | return torch.cat((x[:, :, :, -n:], x[:, :, :, :-n+end]), dim=3) 91 | 92 | 93 | def int_to_mode(mode): 94 | if mode == 0: 95 | return 'zero' 96 | elif mode == 1: 97 | return 'symmetric' 98 | elif mode == 2: 99 | return 'periodization' 100 | elif mode == 3: 101 | return 'constant' 102 | elif mode == 4: 103 | return 'reflect' 104 | elif mode == 5: 105 | return 
'replicate' 106 | elif mode == 6: 107 | return 'periodic' 108 | else: 109 | raise ValueError('Unkown pad type: {}'.format(mode)) 110 | 111 | 112 | def prep_filt_sfb2d(g0_col, g1_col, g0_row=None, g1_row=None, device=None): 113 | """ 114 | Prepares the filters to be of the right form for the sfb2d function. In 115 | particular, makes the tensors the right shape. It does not mirror image them 116 | as as sfb2d uses conv2d_transpose which acts like normal convolution. 117 | Inputs: 118 | g0_col (array-like): low pass column filter bank 119 | g1_col (array-like): high pass column filter bank 120 | g0_row (array-like): low pass row filter bank. If none, will assume the 121 | same as column filter 122 | g1_row (array-like): high pass row filter bank. If none, will assume the 123 | same as column filter 124 | device: which device to put the tensors on to 125 | Returns: 126 | (g0_col, g1_col, g0_row, g1_row) 127 | """ 128 | g0_col, g1_col = prep_filt_sfb1d(g0_col, g1_col, device) 129 | if g0_row is None: 130 | g0_row, g1_row = g0_col, g1_col 131 | else: 132 | g0_row, g1_row = prep_filt_sfb1d(g0_row, g1_row, device) 133 | 134 | g0_col = g0_col.reshape((1, 1, -1, 1)) 135 | g1_col = g1_col.reshape((1, 1, -1, 1)) 136 | g0_row = g0_row.reshape((1, 1, 1, -1)) 137 | g1_row = g1_row.reshape((1, 1, 1, -1)) 138 | 139 | return g0_col, g1_col, g0_row, g1_row 140 | 141 | 142 | def prep_filt_sfb1d(g0, g1, device=None): 143 | """ 144 | Prepares the filters to be of the right form for the sfb1d function. In 145 | particular, makes the tensors the right shape. It does not mirror image them 146 | as as sfb2d uses conv2d_transpose which acts like normal convolution. 147 | Inputs: 148 | g0 (array-like): low pass filter bank 149 | g1 (array-like): high pass filter bank 150 | device: which device to put the tensors on to 151 | Returns: 152 | (g0, g1) 153 | """ 154 | g0 = np.array(g0).ravel() 155 | g1 = np.array(g1).ravel() 156 | t = torch.get_default_dtype() 157 | g0 = torch.tensor(g0, device=device, dtype=t).reshape((1, 1, -1)) 158 | g1 = torch.tensor(g1, device=device, dtype=t).reshape((1, 1, -1)) 159 | 160 | return g0, g1 161 | 162 | 163 | def mode_to_int(mode): 164 | if mode == 'zero': 165 | return 0 166 | elif mode == 'symmetric': 167 | return 1 168 | elif mode == 'per' or mode == 'periodization': 169 | return 2 170 | elif mode == 'constant': 171 | return 3 172 | elif mode == 'reflect': 173 | return 4 174 | elif mode == 'replicate': 175 | return 5 176 | elif mode == 'periodic': 177 | return 6 178 | else: 179 | raise ValueError('Unkown pad type: {}'.format(mode)) 180 | 181 | 182 | def afb1d(x, h0, h1, mode='zero', dim=-1): 183 | """ 1D analysis filter bank (along one dimension only) of an image 184 | Inputs: 185 | x (tensor): 4D input with the last two dimensions the spatial input 186 | h0 (tensor): 4D input for the lowpass filter. Should have shape (1, 1, 187 | h, 1) or (1, 1, 1, w) 188 | h1 (tensor): 4D input for the highpass filter. Should have shape (1, 1, 189 | h, 1) or (1, 1, 1, w) 190 | mode (str): padding method 191 | dim (int) - dimension of filtering. d=2 is for a vertical filter (called 192 | column filtering but filters across the rows). d=3 is for a 193 | horizontal filter, (called row filtering but filters across the 194 | columns). 
195 | Returns: 196 | lohi: lowpass and highpass subbands concatenated along the channel 197 | dimension 198 | """ 199 | C = x.shape[1] 200 | # Convert the dim to positive 201 | d = dim % 4 202 | s = (2, 1) if d == 2 else (1, 2) 203 | N = x.shape[d] 204 | # If h0, h1 are not tensors, make them. If they are, then assume that they 205 | # are in the right order 206 | if not isinstance(h0, torch.Tensor): 207 | h0 = torch.tensor(np.copy(np.array(h0).ravel()[::-1]), 208 | dtype=torch.float, device=x.device) 209 | if not isinstance(h1, torch.Tensor): 210 | h1 = torch.tensor(np.copy(np.array(h1).ravel()[::-1]), 211 | dtype=torch.float, device=x.device) 212 | L = h0.numel() 213 | L2 = L // 2 214 | shape = [1, 1, 1, 1] 215 | shape[d] = L 216 | # If h aren't in the right shape, make them so 217 | if h0.shape != tuple(shape): 218 | h0 = h0.reshape(*shape) 219 | if h1.shape != tuple(shape): 220 | h1 = h1.reshape(*shape) 221 | h = torch.cat([h0, h1] * C, dim=0) 222 | 223 | if mode == 'per' or mode == 'periodization': 224 | if x.shape[dim] % 2 == 1: 225 | if d == 2: 226 | x = torch.cat((x, x[:, :, -1:]), dim=2) 227 | else: 228 | x = torch.cat((x, x[:, :, :, -1:]), dim=3) 229 | N += 1 230 | x = roll(x, -L2, dim=d) 231 | pad = (L-1, 0) if d == 2 else (0, L-1) 232 | lohi = F.conv2d(x, h, padding=pad, stride=s, groups=C) 233 | N2 = N//2 234 | if d == 2: 235 | lohi[:, :, :L2] = lohi[:, :, :L2] + lohi[:, :, N2:N2+L2] 236 | lohi = lohi[:, :, :N2] 237 | else: 238 | lohi[:, :, :, :L2] = lohi[:, :, :, :L2] + lohi[:, :, :, N2:N2+L2] 239 | lohi = lohi[:, :, :, :N2] 240 | else: 241 | # Calculate the pad size 242 | outsize = pywt.dwt_coeff_len(N, L, mode=mode) 243 | p = 2 * (outsize - 1) - N + L 244 | if mode == 'zero': 245 | # Sadly, pytorch only allows for same padding before and after, if 246 | # we need to do more padding after for odd length signals, have to 247 | # prepad 248 | if p % 2 == 1: 249 | pad = (0, 0, 0, 1) if d == 2 else (0, 1, 0, 0) 250 | x = F.pad(x, pad) 251 | pad = (p//2, 0) if d == 2 else (0, p//2) 252 | # Calculate the high and lowpass 253 | lohi = F.conv2d(x, h, padding=pad, stride=s, groups=C) 254 | elif mode == 'symmetric' or mode == 'reflect' or mode == 'periodic': 255 | pad = (0, 0, p//2, (p+1)//2) if d == 2 else (p//2, (p+1)//2, 0, 0) 256 | x = mypad(x, pad=pad, mode=mode) 257 | lohi = F.conv2d(x, h, stride=s, groups=C) 258 | else: 259 | raise ValueError('Unkown pad type: {}'.format(mode)) 260 | 261 | return lohi 262 | 263 | 264 | def mypad(x, pad, mode='constant', value=0): 265 | """ Function to do numpy like padding on tensors. Only works for 2-D 266 | padding. 267 | Inputs: 268 | x (tensor): tensor to pad 269 | pad (tuple): tuple of (left, right, top, bottom) pad sizes 270 | mode (str): 'symmetric', 'wrap', 'constant, 'reflect', 'replicate', or 271 | 'zero'. The padding technique. 
272 | """ 273 | if mode == 'symmetric': 274 | # Vertical only 275 | if pad[0] == 0 and pad[1] == 0: 276 | m1, m2 = pad[2], pad[3] 277 | l = x.shape[-2] # noqa 278 | xe = reflect(np.arange(-m1, l+m2, dtype='int32'), -0.5, l-0.5) 279 | return x[:, :, xe] 280 | # horizontal only 281 | elif pad[2] == 0 and pad[3] == 0: 282 | m1, m2 = pad[0], pad[1] 283 | l = x.shape[-1] # noqa 284 | xe = reflect(np.arange(-m1, l+m2, dtype='int32'), -0.5, l-0.5) 285 | return x[:, :, :, xe] 286 | # Both 287 | else: 288 | m1, m2 = pad[0], pad[1] 289 | l1 = x.shape[-1] 290 | xe_row = reflect(np.arange(-m1, l1+m2, dtype='int32'), -0.5, l1-0.5) 291 | m1, m2 = pad[2], pad[3] 292 | l2 = x.shape[-2] 293 | xe_col = reflect(np.arange(-m1, l2+m2, dtype='int32'), -0.5, l2-0.5) 294 | i = np.outer(xe_col, np.ones(xe_row.shape[0])) 295 | j = np.outer(np.ones(xe_col.shape[0]), xe_row) 296 | return x[:, :, i, j] 297 | elif mode == 'periodic': 298 | # Vertical only 299 | if pad[0] == 0 and pad[1] == 0: 300 | xe = np.arange(x.shape[-2]) 301 | xe = np.pad(xe, (pad[2], pad[3]), mode='wrap') 302 | return x[:, :, xe] 303 | # Horizontal only 304 | elif pad[2] == 0 and pad[3] == 0: 305 | xe = np.arange(x.shape[-1]) 306 | xe = np.pad(xe, (pad[0], pad[1]), mode='wrap') 307 | return x[:, :, :, xe] 308 | # Both 309 | else: 310 | xe_col = np.arange(x.shape[-2]) 311 | xe_col = np.pad(xe_col, (pad[2], pad[3]), mode='wrap') 312 | xe_row = np.arange(x.shape[-1]) 313 | xe_row = np.pad(xe_row, (pad[0], pad[1]), mode='wrap') 314 | i = np.outer(xe_col, np.ones(xe_row.shape[0])) 315 | j = np.outer(np.ones(xe_col.shape[0]), xe_row) 316 | return x[:, :, i, j] 317 | 318 | elif mode == 'constant' or mode == 'reflect' or mode == 'replicate': 319 | return F.pad(x, pad, mode, value) 320 | elif mode == 'zero': 321 | return F.pad(x, pad) 322 | else: 323 | raise ValueError('Unkown pad type: {}'.format(mode)) 324 | 325 | 326 | def reflect(x, minx, maxx): 327 | """Reflect the values in matrix *x* about the scalar values *minx* and 328 | *maxx*. Hence a vector *x* containing a long linearly increasing series is 329 | converted into a waveform which ramps linearly up and down between *minx* 330 | and *maxx*. If *x* contains integers and *minx* and *maxx* are (integers + 331 | 0.5), the ramps will have repeated max and min samples. 332 | .. codeauthor:: Rich Wareham , Aug 2013 333 | .. codeauthor:: Nick Kingsbury, Cambridge University, January 1999. 334 | """ 335 | x = np.asanyarray(x) 336 | rng = maxx - minx 337 | rng_by_2 = 2 * rng 338 | mod = np.fmod(x - minx, rng_by_2) 339 | normed_mod = np.where(mod < 0, mod + rng_by_2, mod) 340 | out = np.where(normed_mod >= rng, rng_by_2 - normed_mod, normed_mod) + minx 341 | return np.array(out, dtype=x.dtype) 342 | 343 | 344 | class SFB2D(Function): 345 | """ Does a single level 2d wavelet decomposition of an input. Does separate 346 | row and column filtering by two calls to 347 | :py:func:`pytorch_wavelets.dwt.lowlevel.afb1d` 348 | Needs to have the tensors in the right form. Because this function defines 349 | its own backward pass, saves on memory by not having to save the input 350 | tensors. 351 | Inputs: 352 | x (torch.Tensor): Input to decompose 353 | h0_row: row lowpass 354 | h1_row: row highpass 355 | h0_col: col lowpass 356 | h1_col: col highpass 357 | mode (int): use mode_to_int to get the int code here 358 | We encode the mode as an integer rather than a string as gradcheck causes an 359 | error when a string is provided. 
360 | Returns: 361 | y: Tensor of shape (N, C*4, H, W) 362 | """ 363 | @staticmethod 364 | def forward(ctx, low, highs, g0_row, g1_row, g0_col, g1_col, mode): 365 | mode = int_to_mode(mode) 366 | ctx.mode = mode 367 | ctx.save_for_backward(g0_row, g1_row, g0_col, g1_col) 368 | 369 | lh, hl, hh = torch.unbind(highs, dim=2) 370 | lo = sfb1d(low, lh, g0_col, g1_col, mode=mode, dim=2) 371 | hi = sfb1d(hl, hh, g0_col, g1_col, mode=mode, dim=2) 372 | y = sfb1d(lo, hi, g0_row, g1_row, mode=mode, dim=3) 373 | return y 374 | 375 | @staticmethod 376 | def backward(ctx, dy): 377 | dlow, dhigh = None, None 378 | if ctx.needs_input_grad[0]: 379 | mode = ctx.mode 380 | g0_row, g1_row, g0_col, g1_col = ctx.saved_tensors 381 | dx = afb1d(dy, g0_row, g1_row, mode=mode, dim=3) 382 | dx = afb1d(dx, g0_col, g1_col, mode=mode, dim=2) 383 | s = dx.shape 384 | dx = dx.reshape(s[0], -1, 4, s[-2], s[-1]) 385 | dlow = dx[:, :, 0].contiguous() 386 | dhigh = dx[:, :, 1:].contiguous() 387 | return dlow, dhigh, None, None, None, None, None 388 | -------------------------------------------------------------------------------- /rudalle/dalle/transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | 4 | import torch 5 | from torch.nn import LayerNorm 6 | 7 | from .utils import divide, split_tensor_along_last_dim 8 | from .image_attention import get_conv_mask, get_row_mask, get_col_mask 9 | 10 | 11 | def gelu(x): 12 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 13 | 14 | 15 | @torch.jit.script 16 | def gelu_jit(x): 17 | """OpenAI's gelu implementation.""" 18 | return gelu(x) 19 | 20 | 21 | class DalleTransformer(torch.nn.Module): 22 | """ 23 | This module takes input from embedding layer and it's output can 24 | be used directly by a logit layer. It consists of L (num-layers) 25 | blocks of: 26 | layer norm 27 | self attention 28 | residual connection 29 | layer norm 30 | mlp 31 | residual connection 32 | followed by a final layer norm. 33 | 34 | Arguments: 35 | num_layers: Number of transformer layers. 36 | hidden_size: The hidden size of the self attention. 37 | num_attention_heads: number of attention head in the self 38 | attention. 39 | attention_dropout_prob: dropout probability of the attention 40 | score in self attention. 41 | output_dropout_prob: dropout probability for the outputs 42 | after self attention and final output. 43 | layernorm_epsilon: epsilon used in layernorm to avoid 44 | division by zero. 45 | """ 46 | _mask_map = [] 47 | 48 | def __init__(self, num_layers, hidden_size, num_attention_heads, attention_dropout_prob, output_dropout_prob, 49 | text_seq_length, image_tokens_per_dim, layernorm_epsilon=1.0e-5, 50 | cogview_sandwich_layernorm=False, cogview_pb_relax=False, mlp_activation='gelu_jit', 51 | is_bool_mask=False): 52 | super(DalleTransformer, self).__init__() 53 | 54 | self.num_layers = num_layers 55 | # CogView stabilization of training features, see chapter 2.4 https://arxiv.org/pdf/2105.13290.pdf 56 | self.cogview_pb_relax = cogview_pb_relax 57 | 58 | # Transformer layers. 
59 | self.layers = torch.nn.ModuleList([ 60 | DalleTransformerLayer( 61 | hidden_size, 62 | num_attention_heads, 63 | attention_dropout_prob, 64 | output_dropout_prob, 65 | layernorm_epsilon, 66 | cogview_sandwich_layernorm=cogview_sandwich_layernorm, 67 | cogview_pb_relax=cogview_pb_relax, 68 | mlp_activation=mlp_activation, 69 | ) for _ in range(num_layers) 70 | ]) 71 | 72 | row_mask = get_row_mask(text_seq_length, image_tokens_per_dim, is_bool_mask=is_bool_mask) 73 | col_mask = get_col_mask(text_seq_length, image_tokens_per_dim, is_bool_mask=is_bool_mask) 74 | conv_mask = get_conv_mask(text_seq_length, image_tokens_per_dim, is_bool_mask=is_bool_mask) 75 | self.register_buffer('row_mask', row_mask) 76 | self.register_buffer('col_mask', col_mask) 77 | self.register_buffer('conv_mask', conv_mask) 78 | 79 | # Final layer norm before output. 80 | self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) 81 | 82 | def _get_layer_mask(self, layer_id): 83 | if ((layer_id - 1) % 4 == 0): 84 | layer_mask = self.col_mask 85 | elif layer_id != self.num_layers - 1: 86 | layer_mask = self.row_mask 87 | else: 88 | layer_mask = self.conv_mask 89 | return layer_mask 90 | 91 | def forward(self, hidden_states, attention_mask, has_cache, use_cache): 92 | for i, layer in enumerate(self.layers): 93 | mask = attention_mask 94 | layer_mask = self._get_layer_mask(i)[:mask.size(2), :mask.size(3)] 95 | mask = torch.mul(attention_mask, layer_mask) 96 | hidden_states, present_has_cache = layer(hidden_states, mask, has_cache=has_cache, use_cache=use_cache) 97 | output = self.final_layernorm(hidden_states) 98 | return output, present_has_cache 99 | 100 | 101 | class DalleTransformerLayer(torch.nn.Module): 102 | """ 103 | A single layer transformer. 104 | 105 | We use the following notation: 106 | h: hidden size 107 | n: number of attention heads 108 | b: batch size 109 | s: sequence length 110 | Transformer layer takes input with size [b, s, h] and returns an 111 | output of the same size. 112 | 113 | Arguments: 114 | hidden_size: The hidden size of the self attention. 115 | num_attention_heads: number of attention head in the self 116 | attention. 117 | attention_dropout_prob: dropout probability of the attention 118 | score in self attention. 119 | output_dropout_prob: dropout probability for the outputs 120 | after self attention and final output. 121 | layernorm_epsilon: epsilon used in layernorm to avoid 122 | division by zero. 123 | """ 124 | 125 | def __init__(self, 126 | hidden_size, 127 | num_attention_heads, 128 | attention_dropout_prob, 129 | output_dropout_prob, 130 | layernorm_epsilon, 131 | cogview_sandwich_layernorm=False, 132 | cogview_pb_relax=False, 133 | mlp_activation='gelu_jit'): 134 | super(DalleTransformerLayer, self).__init__() 135 | 136 | # CogView stabilization of training features, see chapter 2.4 https://arxiv.org/pdf/2105.13290.pdf 137 | self.cogview_sandwich_layernorm = cogview_sandwich_layernorm 138 | self.cogview_pb_relax = cogview_pb_relax 139 | 140 | # Layernorm on the input data. 141 | self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) 142 | 143 | if self.cogview_sandwich_layernorm: 144 | self.before_first_addition_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) 145 | self.before_second_addition_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) 146 | 147 | # Self attention. 
148 | self.attention = DalleSelfAttention( 149 | hidden_size, 150 | num_attention_heads, 151 | attention_dropout_prob, 152 | output_dropout_prob, 153 | cogview_pb_relax=cogview_pb_relax 154 | ) 155 | 156 | # Layernorm on the input data. 157 | self.post_attention_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) 158 | 159 | # MLP 160 | self.mlp = DalleMLP(hidden_size, output_dropout_prob, activation=mlp_activation) 161 | 162 | def forward(self, hidden_states, ltor_mask, has_cache, use_cache): 163 | # hidden_states: [b, s, h] 164 | # ltor_mask: [1, 1, s, s] 165 | 166 | # Layer norm at the begining of the transformer layer. 167 | layernorm_output = self.input_layernorm(hidden_states) 168 | 169 | # Self attention. 170 | attention_output, att_has_cache = self.attention( 171 | layernorm_output, ltor_mask, has_cache=has_cache, use_cache=use_cache) 172 | 173 | if self.cogview_sandwich_layernorm: 174 | attention_output = self.before_first_addition_layernorm(attention_output) 175 | 176 | # Residual connection. 177 | layernorm_input = hidden_states + attention_output 178 | 179 | # Layer norm post the self attention. 180 | layernorm_output = self.post_attention_layernorm(layernorm_input) 181 | 182 | # MLP. 183 | mlp_output, mlp_has_cache = self.mlp( 184 | layernorm_output, has_cache=has_cache, use_cache=use_cache) 185 | 186 | if self.cogview_sandwich_layernorm: 187 | mlp_output = self.before_second_addition_layernorm(mlp_output) 188 | 189 | # Second residual connection. 190 | output = layernorm_input + mlp_output 191 | 192 | return output, att_has_cache and mlp_has_cache 193 | 194 | 195 | class DalleSelfAttention(torch.nn.Module): 196 | """ 197 | Self-attention layer takes input with size [b, s, h] where b is 198 | the batch size, s is the sequence length, and h is the hidden size 199 | and creates output of the same size. 200 | Arguments: 201 | hidden_size: total hidden size of the layer (h). 202 | num_attention_heads: number of attention heads (n). Note that we 203 | require n to be divisible by number of GPUs 204 | used to parallelize the model. Also, we 205 | require hidden size to be divisible by n. 206 | attention_dropout_prob: dropout probability for the attention scores. 207 | output_dropout_prob: dropout probability for the output. 208 | We use the following notation: 209 | h: hidden_size 210 | n: num_attention_heads 211 | p: number of partitions 212 | np: n/p 213 | hp: h/p 214 | hn: h/n 215 | b: batch size 216 | s: sequence length 217 | """ 218 | 219 | def __init__(self, hidden_size, num_attention_heads, 220 | attention_dropout_prob, output_dropout_prob, cogview_pb_relax=False): 221 | super(DalleSelfAttention, self).__init__() 222 | 223 | # CogView stabilization of training features, see chapter 2.4 https://arxiv.org/pdf/2105.13290.pdf 224 | self.cogview_pb_relax = cogview_pb_relax 225 | 226 | self.hidden_size = hidden_size 227 | self.num_attention_heads = num_attention_heads 228 | self.hidden_size_per_attention_head = divide(hidden_size, num_attention_heads) 229 | 230 | self.query_key_value = torch.nn.Linear(hidden_size, 3 * hidden_size) 231 | self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) 232 | 233 | # Output. 
234 | self.dense = torch.nn.Linear(hidden_size, hidden_size) 235 | self.output_dropout = torch.nn.Dropout(output_dropout_prob) 236 | 237 | # Cache 238 | self.past_key = None 239 | self.past_value = None 240 | self.past_output = None 241 | 242 | def _transpose_for_scores(self, tensor): 243 | """ Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with size [b, np, s, hn]. """ 244 | new_tensor_shape = tensor.size()[:-1] + (self.num_attention_heads, self.hidden_size_per_attention_head) 245 | tensor = tensor.view(*new_tensor_shape) 246 | return tensor.permute(0, 2, 1, 3) 247 | 248 | def _calculate_attention_scores(self, query_layer, key_layer, ltor_mask): 249 | key_t = key_layer.transpose(-1, -2) 250 | if self.cogview_pb_relax: 251 | attention_scores = torch.matmul( 252 | query_layer / math.sqrt(self.hidden_size_per_attention_head), 253 | key_t 254 | ) 255 | else: 256 | attention_scores = torch.matmul(query_layer, key_t) / math.sqrt(self.hidden_size_per_attention_head) 257 | ltor_mask = ltor_mask[:, :, -attention_scores.shape[-2]:] 258 | attention_scores = torch.mul(attention_scores, ltor_mask) - 10000.0 * (1.0 - ltor_mask) 259 | if self.cogview_pb_relax: 260 | # normalize attention scores. Should not affect resulting softmax value 261 | alpha = 32 262 | attention_scores_scaled = attention_scores / alpha 263 | attention_scores_scaled_maxes, _ = attention_scores_scaled.detach().view( 264 | [attention_scores.size(0), attention_scores.size(1), -1] 265 | ).max(dim=-1) # max per head per sample 266 | attention_scores_scaled_maxes = attention_scores_scaled_maxes.unsqueeze(-1).unsqueeze(-1).expand( 267 | [-1, -1, attention_scores.size(2), attention_scores.size(3)] 268 | ) # expand to [b, np, s, s] 269 | attention_scores = (attention_scores_scaled - attention_scores_scaled_maxes) * alpha 270 | return attention_scores 271 | 272 | def forward(self, hidden_states, ltor_mask, has_cache=False, use_cache=False, ): 273 | # hidden_states: [b, s, h] 274 | # ltor_mask: [1, 1, s, s] 275 | # Attention heads. [b, s, hp] 276 | if has_cache and use_cache: 277 | mixed_x_layer = self.query_key_value(hidden_states[:, self.past_key.shape[-2]:, :]) 278 | else: 279 | mixed_x_layer = self.query_key_value(hidden_states) 280 | 281 | (mixed_query_layer, 282 | mixed_key_layer, 283 | mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) 284 | 285 | query_layer = self._transpose_for_scores(mixed_query_layer) 286 | key_layer = self._transpose_for_scores(mixed_key_layer) 287 | value_layer = self._transpose_for_scores(mixed_value_layer) 288 | 289 | # Can be simplified, but I didn't for readability's sake 290 | if use_cache and has_cache: 291 | key_layer = torch.cat((self.past_key, key_layer), dim=-2) 292 | value_layer = torch.cat((self.past_value, value_layer), dim=-2) 293 | attention_scores = self._calculate_attention_scores( 294 | query_layer=query_layer, key_layer=key_layer, ltor_mask=ltor_mask 295 | ) 296 | else: 297 | attention_scores = self._calculate_attention_scores( 298 | query_layer=query_layer, key_layer=key_layer, ltor_mask=ltor_mask 299 | ) 300 | 301 | if use_cache and has_cache: 302 | extra_cache_size = hidden_states.shape[-2] - self.past_key.shape[-2] 303 | attention_scores = attention_scores[..., -extra_cache_size:, :] 304 | 305 | if use_cache: 306 | self.past_key = key_layer 307 | self.past_value = value_layer 308 | else: 309 | self.past_key = None 310 | self.past_value = None 311 | self.past_output = None 312 | has_cache = False 313 | 314 | # Attention probabilities. 
[b, np, s, s] 315 | attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) 316 | 317 | # This is actually dropping out entire tokens to attend to, which might 318 | # seem a bit unusual, but is taken from the original Transformer paper. 319 | attention_probs = self.attention_dropout(attention_probs) 320 | 321 | # Context layer. 322 | # [b, np, s, hn] 323 | context_layer = torch.matmul(attention_probs, value_layer) 324 | 325 | # [b, s, np, hn] 326 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 327 | 328 | new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size,) 329 | # [b, s, hp] 330 | context_layer = context_layer.view(*new_context_layer_shape) 331 | 332 | # Output. [b, s, h] 333 | output = self.dense(context_layer) 334 | 335 | if use_cache: 336 | # Can be simplified, but I didn't for readability's sake 337 | if has_cache: 338 | output = torch.cat((self.past_output, output), dim=-2) 339 | self.past_output = output 340 | else: 341 | self.past_output = output 342 | has_cache = True 343 | 344 | output = self.output_dropout(output) 345 | return output, has_cache 346 | 347 | 348 | class DalleMLP(torch.nn.Module): 349 | """ 350 | MLP will take the input with h hidden state, project it to 4*h 351 | hidden dimension, perform gelu transformation, and project the 352 | state back into h hidden dimension. At the end, dropout is also 353 | applied. 354 | Arguments: 355 | hidden_size: The hidden size of the self attention. 356 | output_dropout_prob: dropout probability for the outputs 357 | after self attention and final output. 358 | """ 359 | 360 | def __init__(self, hidden_size, output_dropout_prob, activation='gelu_jit'): 361 | super(DalleMLP, self).__init__() 362 | self.activation = activation 363 | # Project to 4h. 364 | self.dense_h_to_4h = torch.nn.Linear(hidden_size, 4 * hidden_size) 365 | # Project back to h. 366 | self.dense_4h_to_h = torch.nn.Linear(4 * hidden_size, hidden_size) 367 | self.dropout = torch.nn.Dropout(output_dropout_prob) 368 | # MLP cache 369 | self.past_x = None 370 | 371 | def forward(self, hidden_states, has_cache=False, use_cache=False): 372 | if has_cache and use_cache: 373 | hidden_states = hidden_states[:, self.past_x.shape[-2]:] 374 | 375 | # [b, s, 4hp] 376 | x = self.dense_h_to_4h(hidden_states) 377 | if self.activation == 'gelu_jit': 378 | x = gelu_jit(x) 379 | elif self.activation == 'gelu': 380 | x = gelu(x) 381 | else: 382 | raise NotImplementedError('Used MLP activation is not implemented.') 383 | # [b, s, h] 384 | x = self.dense_4h_to_h(x) 385 | if use_cache: 386 | # Can be simplified, but I didn't for readability's sake 387 | if has_cache: 388 | x = torch.cat((self.past_x, x), dim=-2) 389 | self.past_x = x 390 | else: 391 | self.past_x = x 392 | 393 | has_cache = True 394 | else: 395 | self.past_x = None 396 | has_cache = False 397 | output = self.dropout(x) 398 | 399 | return output, has_cache 400 | --------------------------------------------------------------------------------
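A minimal usage sketch for the vendored synthesis filter bank in rudalle/vae/pytorch_wavelets_utils.py, assuming the package and its requirements are installed; the wavelet choice (db1/Haar), the tensor shapes and the variable names below are illustrative, not taken from the repository. prep_filt_sfb2d reshapes the 1-D reconstruction filters, mode_to_int encodes the padding mode as an integer (as the autograd Function requires), and SFB2D.apply recombines the lowpass subband with the stacked LH/HL/HH subbands into an image tensor of twice the spatial size.

import pywt
import torch

from rudalle.vae.pytorch_wavelets_utils import SFB2D, mode_to_int, prep_filt_sfb2d

# Haar reconstruction filters from PyWavelets; row filters default to the column filters.
wave = pywt.Wavelet('db1')
g0_col, g1_col, g0_row, g1_row = prep_filt_sfb2d(wave.rec_lo, wave.rec_hi)

low = torch.randn(1, 3, 16, 16)        # LL (lowpass) subband, [N, C, H, W]
highs = torch.randn(1, 3, 3, 16, 16)   # LH, HL, HH stacked along dim=2

# Single-level 2-D inverse DWT: the output is [N, C, 2H, 2W].
y = SFB2D.apply(low, highs, g0_row, g1_row, g0_col, g1_col, mode_to_int('zero'))
print(y.shape)                         # torch.Size([1, 3, 32, 32])

Because the mode is passed as an integer, gradcheck can be run through SFB2D without the string-argument issue mentioned in its docstring.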
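The PB-relax branch in DalleSelfAttention._calculate_attention_scores only shifts every attention row by a per-head constant, so it changes the numerical range of the scores without changing the softmax output. A small self-contained check of that property; the tensor sizes and the exaggerated score magnitude are made up for illustration and this is not repository code.

import torch

alpha = 32                                          # same constant as in the attention code
scores = torch.randn(1, 4, 6, 6) * 50               # exaggerated raw scores, [b, np, s, s]

# Scale down, subtract the per-head maximum, scale back up (cf. the cogview_pb_relax branch).
maxes = (scores / alpha).reshape(1, 4, -1).max(dim=-1).values[..., None, None]
relaxed = ((scores / alpha) - maxes) * alpha

# The subtracted value is constant within each head, so the softmax is unaffected,
# while the intermediate activations stay far away from fp16 overflow.
assert torch.allclose(scores.softmax(dim=-1), relaxed.softmax(dim=-1), atol=1e-4)

In the module itself the same rescaling is applied after the left-to-right mask, and the per-head maximum is taken with .detach() so it contributes no gradients.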
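DalleSelfAttention and DalleMLP share one incremental-decoding protocol: with use_cache=True the first pass processes and caches the whole sequence, and each later pass recomputes only the positions appended after the cached prefix, concatenating the cached outputs back in. A minimal sketch of that protocol using DalleMLP alone, assuming the package and its dependencies are installed; the sizes and the seed are arbitrary, and dropout is set to 0 so the cached and uncached results match exactly.

import torch

from rudalle.dalle.transformer import DalleMLP

torch.manual_seed(0)
mlp = DalleMLP(hidden_size=16, output_dropout_prob=0.0)

x = torch.randn(1, 5, 16)                                 # [b, s, h]
prefix_out, has_cache = mlp(x, has_cache=False, use_cache=True)   # caches all 5 positions

x_next = torch.cat([x, torch.randn(1, 1, 16)], dim=1)     # one new token appended
cached_out, has_cache = mlp(x_next, has_cache=has_cache, use_cache=True)  # recomputes only the new position

full_out, _ = mlp(x_next, has_cache=False, use_cache=False)       # cache disabled: full recompute
assert torch.allclose(cached_out, full_out)

DalleTransformer.forward threads the has_cache/use_cache flags through every layer in the same way, so autoregressive sampling only pays for the newly generated token at each step.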