├── sample
    └── .gitignore
├── checkpoint
    └── .gitignore
├── vqvae_560.pt
├── stage1_sample.png
├── distributed
    ├── __init__.py
    ├── launch.py
    └── distributed.py
├── README.md
├── dataset.py
├── .gitignore
├── pixelsnail_mnist.py
├── extract_code.py
├── sample.py
├── train_vqvae.py
├── train_pixelsnail.py
├── vqvae.py
├── scheduler.py
├── LICENSE
└── pixelsnail.py


/sample/.gitignore:
--------------------------------------------------------------------------------
1 | *.png
2 | 


--------------------------------------------------------------------------------
/checkpoint/.gitignore:
--------------------------------------------------------------------------------
1 | *.pt
2 | 


--------------------------------------------------------------------------------
/vqvae_560.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosinality/vq-vae-2-pytorch/HEAD/vqvae_560.pt


--------------------------------------------------------------------------------
/stage1_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rosinality/vq-vae-2-pytorch/HEAD/stage1_sample.png


--------------------------------------------------------------------------------
/distributed/__init__.py:
--------------------------------------------------------------------------------
 1 | from .distributed import (
 2 |     get_rank,
 3 |     get_local_rank,
 4 |     is_primary,
 5 |     synchronize,
 6 |     get_world_size,
 7 |     all_reduce,
 8 |     all_gather,
 9 |     reduce_dict,
10 |     data_sampler,
11 |     LOCAL_PROCESS_GROUP,
12 | )
13 | from .launch import launch
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # vq-vae-2-pytorch
 2 | Implementation of Generating Diverse High-Fidelity Images with VQ-VAE-2 in PyTorch
 3 | 
 4 | ## Update
 5 | 
 6 | * 2020-06-01
 7 | 
 8 | train_vqvae.py and vqvae.py now support distributed training. You can pass the --n_gpu [NUM_GPUS] argument to train_vqvae.py to train on [NUM_GPUS] GPUs.
 9 | 
10 | ## Requisite
11 | 
12 | * Python >= 3.6
13 | * PyTorch >= 1.1
14 | * lmdb (for storing extracted codes)
15 | 
16 | [Checkpoint of VQ-VAE pretrained on FFHQ](vqvae_560.pt)
17 | 
18 | ## Usage
19 | 
20 | Currently supports 256px (top/bottom hierarchical prior)
21 | 
22 | 1. Stage 1 (VQ-VAE)
23 | 
24 | > python train_vqvae.py [DATASET PATH]
25 | 
26 | If you use FFHQ, I highly recommend preprocessing the images first (resize and convert to JPEG).
27 | 
28 | 2. Extract codes for stage 2 training
29 | 
30 | > python extract_code.py --ckpt checkpoint/[VQ-VAE CHECKPOINT] --name [LMDB NAME] [DATASET PATH]
31 | 
32 | 3. Stage 2 (PixelSNAIL)
33 | 
34 | > python train_pixelsnail.py [LMDB NAME]
35 | 
36 | It may be better to use a larger PixelSNAIL model; the model size is currently reduced due to GPU constraints.
37 | 
38 | ## Sample
39 | 
40 | ### Stage 1
41 | 
42 | Note: This is a training sample
43 | 
44 | ![Sample from Stage 1 (VQ-VAE)](stage1_sample.png)
45 | 


--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | from collections import namedtuple
 4 | 
 5 | import torch
 6 | from torch.utils.data import Dataset
 7 | from torchvision import datasets
 8 | import lmdb
 9 | 
10 | 
11 | CodeRow = namedtuple('CodeRow', ['top', 'bottom', 'filename'])
12 | 
13 | 
class ImageFileDataset(datasets.ImageFolder):
    """``ImageFolder`` variant whose items also carry a relative file name."""

    def __getitem__(self, index):
        """Return ``(sample, target, filename)`` for ``index``.

        ``filename`` is ``<parent directory name>/<file name>``, derived from
        the path stored in ``self.samples[index]``.
        """
        image, label = super().__getitem__(index)

        full_path = self.samples[index][0]
        parent_dir, base_name = os.path.split(full_path)
        class_name = os.path.split(parent_dir)[1]

        return image, label, os.path.join(class_name, base_name)
23 | 
24 | 
class LMDBDataset(Dataset):
    """Dataset that reads pickled code rows from an LMDB environment.

    The environment is expected to contain a ``length`` key (record count as
    a UTF-8 decimal string) and one pickled row per record, keyed by the
    record's decimal index. Each row must expose ``top``, ``bottom`` (NumPy
    arrays) and ``filename`` attributes.
    """

    def __init__(self, path):
        """Open the LMDB environment at ``path`` read-only and read its length.

        Raises:
            IOError: if the environment cannot be opened or has no ``length``
                key.
        """
        self.env = lmdb.open(
            path,
            max_readers=32,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
        )

        if not self.env:
            raise IOError('Cannot open lmdb dataset', path)

        with self.env.begin(write=False) as txn:
            raw_length = txn.get('length'.encode('utf-8'))

        # A missing key would otherwise surface as an opaque AttributeError
        # on ``None.decode``.
        if raw_length is None:
            raise IOError('lmdb dataset has no "length" key', path)

        self.length = int(raw_length.decode('utf-8'))

    def __len__(self):
        """Return the record count stored under the ``length`` key."""
        return self.length

    def __getitem__(self, index):
        """Return ``(top, bottom, filename)`` for the record at ``index``.

        Raises:
            IndexError: if no record is stored under ``index``. This also lets
                plain ``for row in dataset`` iteration terminate cleanly.
        """
        key = str(index).encode('utf-8')

        with self.env.begin(write=False) as txn:
            payload = txn.get(key)

            if payload is None:
                raise IndexError(f'no record stored for index {index}')

            # NOTE(security): pickle.loads can execute arbitrary code; only
            # open LMDB files produced by a trusted extraction run.
            row = pickle.loads(payload)

        return torch.from_numpy(row.top), torch.from_numpy(row.bottom), row.filename
52 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/pixelsnail_mnist.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | from torch import nn, optim
 4 | from torch.utils.data import DataLoader
 5 | from torchvision import datasets
 6 | from tqdm import tqdm
 7 | 
 8 | from pixelsnail import PixelSNAIL
 9 | 
10 | 
def train(epoch, loader, model, optimizer, device):
    """Train ``model`` for one pass over ``loader`` with cross-entropy loss.

    Each batch image is used both as the model input and as the target. The
    progress bar shows the zero-based ``epoch`` plus one, the batch loss and
    the fraction of positions whose arg-max prediction equals the target.
    """
    loader = tqdm(loader)

    criterion = nn.CrossEntropyLoss()

    for img, _ in loader:
        model.zero_grad()

        img = img.to(device)

        out = model(img)
        # Other call sites in this project unpack ``out, cache = model(...)``;
        # accept both a bare tensor and a tuple so the loss always receives
        # the logits tensor.
        if isinstance(out, tuple):
            out = out[0]

        loss = criterion(out, img)
        loss.backward()

        optimizer.step()

        _, pred = out.max(1)
        correct = (pred == img).float()
        accuracy = correct.sum() / img.numel()

        loader.set_description(
            (f'epoch: {epoch + 1}; loss: {loss.item():.5f}; ' f'acc: {accuracy:.5f}')
        )
34 | 
35 | 
class PixelTransform:
    """Callable transform that turns array-like input into an int64 tensor."""

    def __call__(self, input):
        """Convert ``input`` to a NumPy array, then to a ``torch.int64`` tensor."""
        pixels = np.array(input)

        return torch.from_numpy(pixels).to(torch.int64)
44 | 
45 | 
if __name__ == '__main__':
    # Script entry point: train PixelSNAIL on MNIST and save one checkpoint
    # per epoch under ``checkpoint/``.
    device = 'cuda'
    epoch = 10

    dataset = datasets.MNIST('.', transform=PixelTransform(), download=True)
    loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

    model = PixelSNAIL([28, 28], 256, 128, 5, 2, 4, 128)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Drive the loop from ``epoch`` rather than a second hard-coded 10, so
    # the two values cannot drift apart.
    for i in range(epoch):
        train(i, loader, model, optimizer, device)
        torch.save(model.state_dict(), f'checkpoint/mnist_{str(i + 1).zfill(3)}.pt')
61 | 


--------------------------------------------------------------------------------
/extract_code.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import pickle
 3 | 
 4 | import torch
 5 | from torch.utils.data import DataLoader
 6 | from torchvision import transforms
 7 | import lmdb
 8 | from tqdm import tqdm
 9 | 
10 | from dataset import ImageFileDataset, CodeRow
11 | from vqvae import VQVAE
12 | 
13 | 
def extract(lmdb_env, loader, model, device):
    """Encode every image in ``loader`` and store the codes in ``lmdb_env``.

    Each image becomes one pickled ``CodeRow(top, bottom, filename)`` keyed by
    its running index (decimal string). After all batches, the total count is
    written under the ``length`` key. Everything is written in a single LMDB
    write transaction.
    """
    index = 0

    with lmdb_env.begin(write=True) as txn:
        pbar = tqdm(loader)

        for img, _, filename in pbar:
            img = img.to(device)

            # Pure inference: disabling autograd avoids building a graph and
            # holding activations for every batch.
            with torch.no_grad():
                _, _, _, id_t, id_b = model.encode(img)

            id_t = id_t.detach().cpu().numpy()
            id_b = id_b.detach().cpu().numpy()

            for file, top, bottom in zip(filename, id_t, id_b):
                row = CodeRow(top=top, bottom=bottom, filename=file)
                txn.put(str(index).encode('utf-8'), pickle.dumps(row))
                index += 1
                pbar.set_description(f'inserted: {index}')

        txn.put('length'.encode('utf-8'), str(index).encode('utf-8'))
34 | 
35 | 
if __name__ == '__main__':
    # Script entry point: load a VQ-VAE checkpoint, encode an image folder and
    # write the resulting codes to an LMDB database.
    parser = argparse.ArgumentParser()
    parser.add_argument('--size', type=int, default=256)
    # Both options are needed below (torch.load / lmdb.open), so fail at
    # argument parsing with a clear message instead of crashing on None later.
    parser.add_argument('--ckpt', type=str, required=True)
    parser.add_argument('--name', type=str, required=True)
    parser.add_argument('path', type=str)

    args = parser.parse_args()

    device = 'cuda'

    transform = transforms.Compose(
        [
            transforms.Resize(args.size),
            transforms.CenterCrop(args.size),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )

    dataset = ImageFileDataset(args.path, transform=transform)
    loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=4)

    model = VQVAE()
    model.load_state_dict(torch.load(args.ckpt))
    model = model.to(device)
    model.eval()

    # Maximum size of the LMDB memory map: 100 GiB.
    map_size = 100 * 1024 * 1024 * 1024

    env = lmdb.open(args.name, map_size=map_size)

    try:
        extract(env, loader, model, device)
    finally:
        # Release the environment handle even if extraction fails.
        env.close()
69 | 


--------------------------------------------------------------------------------
/distributed/launch.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | from torch import distributed as dist
 5 | from torch import multiprocessing as mp
 6 | 
 7 | import distributed as dist_fn
 8 | 
 9 | 
def find_free_port():
    """Return a TCP port number that was free at the time of the call.

    The socket is bound to port 0 so the OS picks an unused port, and is then
    closed. The port is not reserved: another process may claim it before
    the caller binds to it.
    """
    import socket

    # The context manager closes the socket even if bind/getsockname raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))

        return sock.getsockname()[1]
20 | 
21 | 
def launch(fn, n_gpu_per_machine, n_machine=1, machine_rank=0, dist_url=None, args=()):
    """Run ``fn(*args)``, spawning one worker process per GPU when needed.

    With a world size (``n_machine * n_gpu_per_machine``) of 1, ``fn`` is
    called directly in the current process. Otherwise ``n_gpu_per_machine``
    processes running ``distributed_worker`` are spawned.

    Args:
        fn: callable to run in every worker.
        n_gpu_per_machine: number of processes to spawn on this machine.
        n_machine: total number of machines.
        machine_rank: index of this machine.
        dist_url: init method URL; ``"auto"`` picks a free local TCP port
            (single-machine jobs only).
        args: positional arguments forwarded to ``fn``.

    Raises:
        ValueError: if ``dist_url="auto"`` or a ``file://`` URL is used in a
            multi-machine job.
    """
    world_size = n_machine * n_gpu_per_machine

    if world_size <= 1:
        fn(*args)
        return

    if "OMP_NUM_THREADS" not in os.environ:
        os.environ["OMP_NUM_THREADS"] = "1"

    if dist_url == "auto":
        if n_machine != 1:
            raise ValueError('dist_url="auto" not supported in multi-machine jobs')

        port = find_free_port()
        dist_url = f"tcp://127.0.0.1:{port}"

    # ``dist_url`` defaults to None; guard so the check cannot fail with an
    # AttributeError on ``None.startswith``.
    if n_machine > 1 and dist_url is not None and dist_url.startswith("file://"):
        raise ValueError(
            "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://"
        )

    mp.spawn(
        distributed_worker,
        nprocs=n_gpu_per_machine,
        args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args),
        daemon=False,
    )
50 | 
51 | 
def distributed_worker(
    local_rank, fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args
):
    """Per-process entry point used by ``launch``.

    Initialises the NCCL process group, selects the CUDA device matching
    ``local_rank``, creates one process group per machine (recording this
    machine's group as ``LOCAL_PROCESS_GROUP``) and finally calls
    ``fn(*args)``.

    Raises:
        OSError: if CUDA is unavailable or the process group cannot be
            initialised (the original error is chained as ``__cause__``).
        ValueError: if more GPUs are requested than are visible, or the local
            process group was already set.
    """
    if not torch.cuda.is_available():
        raise OSError("CUDA is not available. Please check your environments")

    global_rank = machine_rank * n_gpu_per_machine + local_rank

    try:
        dist.init_process_group(
            backend="NCCL",
            init_method=dist_url,
            world_size=world_size,
            rank=global_rank,
        )

    except Exception as err:
        # Chain the cause so the underlying failure stays visible.
        raise OSError("failed to initialize NCCL groups") from err

    dist_fn.synchronize()

    if n_gpu_per_machine > torch.cuda.device_count():
        raise ValueError(
            f"specified n_gpu_per_machine larger than available device ({torch.cuda.device_count()})"
        )

    torch.cuda.set_device(local_rank)

    # Check the same module attribute that is assigned below; the name
    # re-exported by the package is bound once at import time and would not
    # reflect a later assignment.
    if dist_fn.distributed.LOCAL_PROCESS_GROUP is not None:
        raise ValueError("torch.distributed.LOCAL_PROCESS_GROUP is not None")

    n_machine = world_size // n_gpu_per_machine

    # Every process creates every group; only the one for this machine is
    # kept.
    for i in range(n_machine):
        ranks_on_i = list(range(i * n_gpu_per_machine, (i + 1) * n_gpu_per_machine))
        pg = dist.new_group(ranks_on_i)

        if i == machine_rank:
            dist_fn.distributed.LOCAL_PROCESS_GROUP = pg

    fn(*args)
93 | 


--------------------------------------------------------------------------------
/sample.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | 
  4 | import torch
  5 | from torchvision.utils import save_image
  6 | from tqdm import tqdm
  7 | 
  8 | from vqvae import VQVAE
  9 | from pixelsnail import PixelSNAIL
 10 | 
 11 | 
 12 | @torch.no_grad()
 13 | def sample_model(model, device, batch, size, temperature, condition=None):
 14 |     row = torch.zeros(batch, *size, dtype=torch.int64).to(device)
 15 |     cache = {}
 16 | 
 17 |     for i in tqdm(range(size[0])):
 18 |         for j in range(size[1]):
 19 |             out, cache = model(row[:, : i + 1, :], condition=condition, cache=cache)
 20 |             prob = torch.softmax(out[:, :, i, j] / temperature, 1)
 21 |             sample = torch.multinomial(prob, 1).squeeze(-1)
 22 |             row[:, i, j] = sample
 23 | 
 24 |     return row
 25 | 
 26 | 
 27 | def load_model(model, checkpoint, device):
 28 |     ckpt = torch.load(os.path.join('checkpoint', checkpoint))
 29 | 
 30 |     
 31 |     if 'args' in ckpt:
 32 |         args = ckpt['args']
 33 | 
 34 |     if model == 'vqvae':
 35 |         model = VQVAE()
 36 | 
 37 |     elif model == 'pixelsnail_top':
 38 |         model = PixelSNAIL(
 39 |             [32, 32],
 40 |             512,
 41 |             args.channel,
 42 |             5,
 43 |             4,
 44 |             args.n_res_block,
 45 |             args.n_res_channel,
 46 |             dropout=args.dropout,
 47 |             n_out_res_block=args.n_out_res_block,
 48 |         )
 49 | 
 50 |     elif model == 'pixelsnail_bottom':
 51 |         model = PixelSNAIL(
 52 |             [64, 64],
 53 |             512,
 54 |             args.channel,
 55 |             5,
 56 |             4,
 57 |             args.n_res_block,
 58 |             args.n_res_channel,
 59 |             attention=False,
 60 |             dropout=args.dropout,
 61 |             n_cond_res_block=args.n_cond_res_block,
 62 |             cond_res_channel=args.n_res_channel,
 63 |         )
 64 |         
 65 |     if 'model' in ckpt:
 66 |         ckpt = ckpt['model']
 67 | 
 68 |     model.load_state_dict(ckpt)
 69 |     model = model.to(device)
 70 |     model.eval()
 71 | 
 72 |     return model
 73 | 
 74 | 
 75 | if __name__ == '__main__':
 76 |     device = 'cuda'
 77 | 
 78 |     parser = argparse.ArgumentParser()
 79 |     parser.add_argument('--batch', type=int, default=8)
 80 |     parser.add_argument('--vqvae', type=str)
 81 |     parser.add_argument('--top', type=str)
 82 |     parser.add_argument('--bottom', type=str)
 83 |     parser.add_argument('--temp', type=float, default=1.0)
 84 |     parser.add_argument('filename', type=str)
 85 | 
 86 |     args = parser.parse_args()
 87 | 
 88 |     model_vqvae = load_model('vqvae', args.vqvae, device)
 89 |     model_top = load_model('pixelsnail_top', args.top, device)
 90 |     model_bottom = load_model('pixelsnail_bottom', args.bottom, device)
 91 | 
 92 |     top_sample = sample_model(model_top, device, args.batch, [32, 32], args.temp)
 93 |     bottom_sample = sample_model(
 94 |         model_bottom, device, args.batch, [64, 64], args.temp, condition=top_sample
 95 |     )
 96 | 
 97 |     decoded_sample = model_vqvae.decode_code(top_sample, bottom_sample)
 98 |     decoded_sample = decoded_sample.clamp(-1, 1)
 99 | 
100 |     save_image(decoded_sample, args.filename, normalize=True, range=(-1, 1))
101 | 


--------------------------------------------------------------------------------
/distributed/distributed.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import pickle
  3 | 
  4 | import torch
  5 | from torch import distributed as dist
  6 | from torch.utils import data
  7 | 
  8 | 
  9 | LOCAL_PROCESS_GROUP = None
 10 | 
 11 | 
 12 | def is_primary():
 13 |     return get_rank() == 0
 14 | 
 15 | 
 16 | def get_rank():
 17 |     if not dist.is_available():
 18 |         return 0
 19 | 
 20 |     if not dist.is_initialized():
 21 |         return 0
 22 | 
 23 |     return dist.get_rank()
 24 | 
 25 | 
 26 | def get_local_rank():
 27 |     if not dist.is_available():
 28 |         return 0
 29 | 
 30 |     if not dist.is_initialized():
 31 |         return 0
 32 | 
 33 |     if LOCAL_PROCESS_GROUP is None:
 34 |         raise ValueError("tensorfn.distributed.LOCAL_PROCESS_GROUP is None")
 35 | 
 36 |     return dist.get_rank(group=LOCAL_PROCESS_GROUP)
 37 | 
 38 | 
 39 | def synchronize():
 40 |     if not dist.is_available():
 41 |         return
 42 | 
 43 |     if not dist.is_initialized():
 44 |         return
 45 | 
 46 |     world_size = dist.get_world_size()
 47 | 
 48 |     if world_size == 1:
 49 |         return
 50 | 
 51 |     dist.barrier()
 52 | 
 53 | 
 54 | def get_world_size():
 55 |     if not dist.is_available():
 56 |         return 1
 57 | 
 58 |     if not dist.is_initialized():
 59 |         return 1
 60 | 
 61 |     return dist.get_world_size()
 62 | 
 63 | 
 64 | def all_reduce(tensor, op=dist.ReduceOp.SUM):
 65 |     world_size = get_world_size()
 66 | 
 67 |     if world_size == 1:
 68 |         return tensor
 69 | 
 70 |     dist.all_reduce(tensor, op=op)
 71 | 
 72 |     return tensor
 73 | 
 74 | 
 75 | def all_gather(data):
 76 |     world_size = get_world_size()
 77 | 
 78 |     if world_size == 1:
 79 |         return [data]
 80 | 
 81 |     buffer = pickle.dumps(data)
 82 |     storage = torch.ByteStorage.from_buffer(buffer)
 83 |     tensor = torch.ByteTensor(storage).to("cuda")
 84 | 
 85 |     local_size = torch.IntTensor([tensor.numel()]).to("cuda")
 86 |     size_list = [torch.IntTensor([1]).to("cuda") for _ in range(world_size)]
 87 |     dist.all_gather(size_list, local_size)
 88 |     size_list = [int(size.item()) for size in size_list]
 89 |     max_size = max(size_list)
 90 | 
 91 |     tensor_list = []
 92 |     for _ in size_list:
 93 |         tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
 94 | 
 95 |     if local_size != max_size:
 96 |         padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
 97 |         tensor = torch.cat((tensor, padding), 0)
 98 | 
 99 |     dist.all_gather(tensor_list, tensor)
100 | 
101 |     data_list = []
102 | 
103 |     for size, tensor in zip(size_list, tensor_list):
104 |         buffer = tensor.cpu().numpy().tobytes()[:size]
105 |         data_list.append(pickle.loads(buffer))
106 | 
107 |     return data_list
108 | 
109 | 
def reduce_dict(input_dict, average=True):
    """Reduce the tensor values of ``input_dict`` onto rank 0.

    Values are stacked in sorted-key order and passed to ``dist.reduce`` with
    ``dst=0``. On rank 0 the result is divided by the world size when
    ``average`` is true. With fewer than two processes ``input_dict`` is
    returned as is.

    Returns:
        A dict mapping each key to its slice of the stacked tensor.
    """
    world_size = get_world_size()

    if world_size < 2:
        return input_dict

    with torch.no_grad():
        names = sorted(input_dict.keys())
        stacked = torch.stack([input_dict[name] for name in names], 0)
        dist.reduce(stacked, dst=0)

        if dist.get_rank() == 0 and average:
            stacked /= world_size

        reduced_dict = dict(zip(names, stacked))

    return reduced_dict
133 | 
134 | 
def data_sampler(dataset, shuffle, distributed):
    """Pick the sampler for ``dataset``.

    Returns a ``DistributedSampler`` (honouring ``shuffle``) when
    ``distributed`` is true, otherwise a ``RandomSampler`` if ``shuffle`` is
    true and a ``SequentialSampler`` if not.
    """
    if distributed:
        return data.distributed.DistributedSampler(dataset, shuffle=shuffle)

    sampler_cls = data.RandomSampler if shuffle else data.SequentialSampler

    return sampler_cls(dataset)
144 | 


--------------------------------------------------------------------------------
/train_vqvae.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sys
  3 | import os
  4 | 
  5 | import torch
  6 | from torch import nn, optim
  7 | from torch.utils.data import DataLoader
  8 | 
  9 | from torchvision import datasets, transforms, utils
 10 | 
 11 | from tqdm import tqdm
 12 | 
 13 | from vqvae import VQVAE
 14 | from scheduler import CycleScheduler
 15 | import distributed as dist
 16 | 
 17 | 
 18 | def train(epoch, loader, model, optimizer, scheduler, device):
 19 |     if dist.is_primary():
 20 |         loader = tqdm(loader)
 21 | 
 22 |     criterion = nn.MSELoss()
 23 | 
 24 |     latent_loss_weight = 0.25
 25 |     sample_size = 25
 26 | 
 27 |     mse_sum = 0
 28 |     mse_n = 0
 29 | 
 30 |     for i, (img, label) in enumerate(loader):
 31 |         model.zero_grad()
 32 | 
 33 |         img = img.to(device)
 34 | 
 35 |         out, latent_loss = model(img)
 36 |         recon_loss = criterion(out, img)
 37 |         latent_loss = latent_loss.mean()
 38 |         loss = recon_loss + latent_loss_weight * latent_loss
 39 |         loss.backward()
 40 | 
 41 |         if scheduler is not None:
 42 |             scheduler.step()
 43 |         optimizer.step()
 44 | 
 45 |         part_mse_sum = recon_loss.item() * img.shape[0]
 46 |         part_mse_n = img.shape[0]
 47 |         comm = {"mse_sum": part_mse_sum, "mse_n": part_mse_n}
 48 |         comm = dist.all_gather(comm)
 49 | 
 50 |         for part in comm:
 51 |             mse_sum += part["mse_sum"]
 52 |             mse_n += part["mse_n"]
 53 | 
 54 |         if dist.is_primary():
 55 |             lr = optimizer.param_groups[0]["lr"]
 56 | 
 57 |             loader.set_description(
 58 |                 (
 59 |                     f"epoch: {epoch + 1}; mse: {recon_loss.item():.5f}; "
 60 |                     f"latent: {latent_loss.item():.3f}; avg mse: {mse_sum / mse_n:.5f}; "
 61 |                     f"lr: {lr:.5f}"
 62 |                 )
 63 |             )
 64 | 
 65 |             if i % 100 == 0:
 66 |                 model.eval()
 67 | 
 68 |                 sample = img[:sample_size]
 69 | 
 70 |                 with torch.no_grad():
 71 |                     out, _ = model(sample)
 72 | 
 73 |                 utils.save_image(
 74 |                     torch.cat([sample, out], 0),
 75 |                     f"sample/{str(epoch + 1).zfill(5)}_{str(i).zfill(5)}.png",
 76 |                     nrow=sample_size,
 77 |                     normalize=True,
 78 |                     range=(-1, 1),
 79 |                 )
 80 | 
 81 |                 model.train()
 82 | 
 83 | 
 84 | def main(args):
 85 |     device = "cuda"
 86 | 
 87 |     args.distributed = dist.get_world_size() > 1
 88 | 
 89 |     transform = transforms.Compose(
 90 |         [
 91 |             transforms.Resize(args.size),
 92 |             transforms.CenterCrop(args.size),
 93 |             transforms.ToTensor(),
 94 |             transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
 95 |         ]
 96 |     )
 97 | 
 98 |     dataset = datasets.ImageFolder(args.path, transform=transform)
 99 |     sampler = dist.data_sampler(dataset, shuffle=True, distributed=args.distributed)
100 |     loader = DataLoader(
101 |         dataset, batch_size=128 // args.n_gpu, sampler=sampler, num_workers=2
102 |     )
103 | 
104 |     model = VQVAE().to(device)
105 | 
106 |     if args.distributed:
107 |         model = nn.parallel.DistributedDataParallel(
108 |             model,
109 |             device_ids=[dist.get_local_rank()],
110 |             output_device=dist.get_local_rank(),
111 |         )
112 | 
113 |     optimizer = optim.Adam(model.parameters(), lr=args.lr)
114 |     scheduler = None
115 |     if args.sched == "cycle":
116 |         scheduler = CycleScheduler(
117 |             optimizer,
118 |             args.lr,
119 |             n_iter=len(loader) * args.epoch,
120 |             momentum=None,
121 |             warmup_proportion=0.05,
122 |         )
123 | 
124 |     for i in range(args.epoch):
125 |         train(i, loader, model, optimizer, scheduler, device)
126 | 
127 |         if dist.is_primary():
128 |             torch.save(model.state_dict(), f"checkpoint/vqvae_{str(i + 1).zfill(3)}.pt")
129 | 
130 | 
if __name__ == "__main__":
    # Script entry point: parse CLI options and hand ``main`` to the launcher.
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_gpu", type=int, default=1)

    # Default rendezvous port: 2**15 + 2**14 plus an offset below 2**14
    # derived from the user id (constant 1 on Windows).
    user_key = os.getuid() if sys.platform != "win32" else 1
    port = 2 ** 15 + 2 ** 14 + hash(user_key) % 2 ** 14
    parser.add_argument("--dist_url", default=f"tcp://127.0.0.1:{port}")

    parser.add_argument("--size", type=int, default=256)
    parser.add_argument("--epoch", type=int, default=560)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--sched", type=str)
    parser.add_argument("path", type=str)

    args = parser.parse_args()

    print(args)

    # Single machine (n_machine=1, machine_rank=0) with ``args.n_gpu`` workers.
    dist.launch(main, args.n_gpu, 1, 0, args.dist_url, args=(args,))
153 | 


--------------------------------------------------------------------------------
/train_pixelsnail.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | from torch import nn, optim
  6 | from torch.utils.data import DataLoader
  7 | from tqdm import tqdm
  8 | 
  9 | try:
 10 |     from apex import amp
 11 | 
 12 | except ImportError:
 13 |     amp = None
 14 | 
 15 | from dataset import LMDBDataset
 16 | from pixelsnail import PixelSNAIL
 17 | from scheduler import CycleScheduler
 18 | 
 19 | 
 20 | def train(args, epoch, loader, model, optimizer, scheduler, device):
 21 |     loader = tqdm(loader)
 22 | 
 23 |     criterion = nn.CrossEntropyLoss()
 24 | 
 25 |     for i, (top, bottom, label) in enumerate(loader):
 26 |         model.zero_grad()
 27 | 
 28 |         top = top.to(device)
 29 | 
 30 |         if args.hier == 'top':
 31 |             target = top
 32 |             out, _ = model(top)
 33 | 
 34 |         elif args.hier == 'bottom':
 35 |             bottom = bottom.to(device)
 36 |             target = bottom
 37 |             out, _ = model(bottom, condition=top)
 38 | 
 39 |         loss = criterion(out, target)
 40 |         loss.backward()
 41 | 
 42 |         if scheduler is not None:
 43 |             scheduler.step()
 44 |         optimizer.step()
 45 | 
 46 |         _, pred = out.max(1)
 47 |         correct = (pred == target).float()
 48 |         accuracy = correct.sum() / target.numel()
 49 | 
 50 |         lr = optimizer.param_groups[0]['lr']
 51 | 
 52 |         loader.set_description(
 53 |             (
 54 |                 f'epoch: {epoch + 1}; loss: {loss.item():.5f}; '
 55 |                 f'acc: {accuracy:.5f}; lr: {lr:.5f}'
 56 |             )
 57 |         )
 58 | 
 59 | 
 60 | class PixelTransform:
 61 |     def __init__(self):
 62 |         pass
 63 | 
 64 |     def __call__(self, input):
 65 |         ar = np.array(input)
 66 | 
 67 |         return torch.from_numpy(ar).long()
 68 | 
 69 | 
 70 | if __name__ == '__main__':
 71 |     parser = argparse.ArgumentParser()
 72 |     parser.add_argument('--batch', type=int, default=32)
 73 |     parser.add_argument('--epoch', type=int, default=420)
 74 |     parser.add_argument('--hier', type=str, default='top')
 75 |     parser.add_argument('--lr', type=float, default=3e-4)
 76 |     parser.add_argument('--channel', type=int, default=256)
 77 |     parser.add_argument('--n_res_block', type=int, default=4)
 78 |     parser.add_argument('--n_res_channel', type=int, default=256)
 79 |     parser.add_argument('--n_out_res_block', type=int, default=0)
 80 |     parser.add_argument('--n_cond_res_block', type=int, default=3)
 81 |     parser.add_argument('--dropout', type=float, default=0.1)
 82 |     parser.add_argument('--amp', type=str, default='O0')
 83 |     parser.add_argument('--sched', type=str)
 84 |     parser.add_argument('--ckpt', type=str)
 85 |     parser.add_argument('path', type=str)
 86 | 
 87 |     args = parser.parse_args()
 88 | 
 89 |     print(args)
 90 | 
 91 |     device = 'cuda'
 92 | 
 93 |     dataset = LMDBDataset(args.path)
 94 |     loader = DataLoader(
 95 |         dataset, batch_size=args.batch, shuffle=True, num_workers=4, drop_last=True
 96 |     )
 97 | 
 98 |     ckpt = {}
 99 | 
100 |     if args.ckpt is not None:
101 |         ckpt = torch.load(args.ckpt)
102 |         args = ckpt['args']
103 | 
104 |     if args.hier == 'top':
105 |         model = PixelSNAIL(
106 |             [32, 32],
107 |             512,
108 |             args.channel,
109 |             5,
110 |             4,
111 |             args.n_res_block,
112 |             args.n_res_channel,
113 |             dropout=args.dropout,
114 |             n_out_res_block=args.n_out_res_block,
115 |         )
116 | 
117 |     elif args.hier == 'bottom':
118 |         model = PixelSNAIL(
119 |             [64, 64],
120 |             512,
121 |             args.channel,
122 |             5,
123 |             4,
124 |             args.n_res_block,
125 |             args.n_res_channel,
126 |             attention=False,
127 |             dropout=args.dropout,
128 |             n_cond_res_block=args.n_cond_res_block,
129 |             cond_res_channel=args.n_res_channel,
130 |         )
131 | 
132 |     if 'model' in ckpt:
133 |         model.load_state_dict(ckpt['model'])
134 | 
135 |     model = model.to(device)
136 |     optimizer = optim.Adam(model.parameters(), lr=args.lr)
137 | 
138 |     if amp is not None:
139 |         model, optimizer = amp.initialize(model, optimizer, opt_level=args.amp)
140 | 
141 |     model = nn.DataParallel(model)
142 |     model = model.to(device)
143 | 
144 |     scheduler = None
145 |     if args.sched == 'cycle':
146 |         scheduler = CycleScheduler(
147 |             optimizer, args.lr, n_iter=len(loader) * args.epoch, momentum=None
148 |         )
149 | 
150 |     for i in range(args.epoch):
151 |         train(args, i, loader, model, optimizer, scheduler, device)
152 |         torch.save(
153 |             {'model': model.module.state_dict(), 'args': args},
154 |             f'checkpoint/pixelsnail_{args.hier}_{str(i + 1).zfill(3)}.pt',
155 |         )
156 | 


--------------------------------------------------------------------------------
/vqvae.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import nn
  3 | from torch.nn import functional as F
  4 | 
  5 | import distributed as dist_fn
  6 | 
  7 | 
  8 | # Copyright 2018 The Sonnet Authors. All Rights Reserved.
  9 | #
 10 | # Licensed under the Apache License, Version 2.0 (the "License");
 11 | # you may not use this file except in compliance with the License.
 12 | # You may obtain a copy of the License at
 13 | #
 14 | #    http://www.apache.org/licenses/LICENSE-2.0
 15 | #
 16 | # Unless required by applicable law or agreed to in writing, software
 17 | # distributed under the License is distributed on an "AS IS" BASIS,
 18 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
 19 | # See the License for the specific language governing permissions and
 20 | # limitations under the License.
 21 | # ============================================================================
 22 | 
 23 | 
 24 | # Borrowed from https://github.com/deepmind/sonnet and ported to PyTorch
 25 | 
 26 | 
 27 | class Quantize(nn.Module):
 28 |     def __init__(self, dim, n_embed, decay=0.99, eps=1e-5):
 29 |         super().__init__()
 30 | 
 31 |         self.dim = dim
 32 |         self.n_embed = n_embed
 33 |         self.decay = decay
 34 |         self.eps = eps
 35 | 
 36 |         embed = torch.randn(dim, n_embed)
 37 |         self.register_buffer("embed", embed)
 38 |         self.register_buffer("cluster_size", torch.zeros(n_embed))
 39 |         self.register_buffer("embed_avg", embed.clone())
 40 | 
 41 |     def forward(self, input):
 42 |         flatten = input.reshape(-1, self.dim)
 43 |         dist = (
 44 |             flatten.pow(2).sum(1, keepdim=True)
 45 |             - 2 * flatten @ self.embed
 46 |             + self.embed.pow(2).sum(0, keepdim=True)
 47 |         )
 48 |         _, embed_ind = (-dist).max(1)
 49 |         embed_onehot = F.one_hot(embed_ind, self.n_embed).type(flatten.dtype)
 50 |         embed_ind = embed_ind.view(*input.shape[:-1])
 51 |         quantize = self.embed_code(embed_ind)
 52 | 
 53 |         if self.training:
 54 |             embed_onehot_sum = embed_onehot.sum(0)
 55 |             embed_sum = flatten.transpose(0, 1) @ embed_onehot
 56 | 
 57 |             dist_fn.all_reduce(embed_onehot_sum)
 58 |             dist_fn.all_reduce(embed_sum)
 59 | 
 60 |             self.cluster_size.data.mul_(self.decay).add_(
 61 |                 embed_onehot_sum, alpha=1 - self.decay
 62 |             )
 63 |             self.embed_avg.data.mul_(self.decay).add_(embed_sum, alpha=1 - self.decay)
 64 |             n = self.cluster_size.sum()
 65 |             cluster_size = (
 66 |                 (self.cluster_size + self.eps) / (n + self.n_embed * self.eps) * n
 67 |             )
 68 |             embed_normalized = self.embed_avg / cluster_size.unsqueeze(0)
 69 |             self.embed.data.copy_(embed_normalized)
 70 | 
 71 |         diff = (quantize.detach() - input).pow(2).mean()
 72 |         quantize = input + (quantize - input).detach()
 73 | 
 74 |         return quantize, diff, embed_ind
 75 | 
 76 |     def embed_code(self, embed_id):
 77 |         return F.embedding(embed_id, self.embed.transpose(0, 1))
 78 | 
 79 | 
 80 | class ResBlock(nn.Module):
 81 |     def __init__(self, in_channel, channel):
 82 |         super().__init__()
 83 | 
 84 |         self.conv = nn.Sequential(
 85 |             nn.ReLU(),
 86 |             nn.Conv2d(in_channel, channel, 3, padding=1),
 87 |             nn.ReLU(inplace=True),
 88 |             nn.Conv2d(channel, in_channel, 1),
 89 |         )
 90 | 
 91 |     def forward(self, input):
 92 |         out = self.conv(input)
 93 |         out += input
 94 | 
 95 |         return out
 96 | 
 97 | 
 98 | class Encoder(nn.Module):
 99 |     def __init__(self, in_channel, channel, n_res_block, n_res_channel, stride):
100 |         super().__init__()
101 | 
102 |         if stride == 4:
103 |             blocks = [
104 |                 nn.Conv2d(in_channel, channel // 2, 4, stride=2, padding=1),
105 |                 nn.ReLU(inplace=True),
106 |                 nn.Conv2d(channel // 2, channel, 4, stride=2, padding=1),
107 |                 nn.ReLU(inplace=True),
108 |                 nn.Conv2d(channel, channel, 3, padding=1),
109 |             ]
110 | 
111 |         elif stride == 2:
112 |             blocks = [
113 |                 nn.Conv2d(in_channel, channel // 2, 4, stride=2, padding=1),
114 |                 nn.ReLU(inplace=True),
115 |                 nn.Conv2d(channel // 2, channel, 3, padding=1),
116 |             ]
117 | 
118 |         for i in range(n_res_block):
119 |             blocks.append(ResBlock(channel, n_res_channel))
120 | 
121 |         blocks.append(nn.ReLU(inplace=True))
122 | 
123 |         self.blocks = nn.Sequential(*blocks)
124 | 
125 |     def forward(self, input):
126 |         return self.blocks(input)
127 | 
128 | 
class Decoder(nn.Module):
    """Residual decoder that ends in transposed-convolution upsampling.

    Args:
        in_channel: Channels of the input.
        out_channel: Channels produced by the upsampling layers.
        channel: Channels used by the internal convolution and residual blocks.
        n_res_block: Number of ``ResBlock`` modules.
        n_res_channel: Intermediate channel count of each ``ResBlock``.
        stride: Upsampling factor. 2 adds one transposed convolution, 4 adds
            two; any other value adds no upsampling layer at all.
    """

    def __init__(
        self, in_channel, out_channel, channel, n_res_block, n_res_channel, stride
    ):
        super().__init__()

        layers = [nn.Conv2d(in_channel, channel, 3, padding=1)]
        layers += [ResBlock(channel, n_res_channel) for _ in range(n_res_block)]
        layers.append(nn.ReLU(inplace=True))

        if stride == 2:
            layers.append(
                nn.ConvTranspose2d(channel, out_channel, 4, stride=2, padding=1)
            )

        elif stride == 4:
            layers += [
                nn.ConvTranspose2d(channel, channel // 2, 4, stride=2, padding=1),
                nn.ReLU(inplace=True),
                nn.ConvTranspose2d(channel // 2, out_channel, 4, stride=2, padding=1),
            ]

        self.blocks = nn.Sequential(*layers)

    def forward(self, input):
        """Run ``input`` through the decoder stack."""
        return self.blocks(input)
162 | 
163 | 
class VQVAE(nn.Module):
    """Two-level (top/bottom) vector-quantized autoencoder.

    The bottom encoder downsamples the input by 4 and the top encoder
    downsamples the bottom features by a further 2. Each level has its own
    ``Quantize`` codebook; the bottom level is quantized after being
    concatenated with the decoded top codes.

    Args:
        in_channel: Channels of the input (and of the reconstruction).
        channel: Feature channels inside the encoders and decoders.
        n_res_block: Residual blocks per encoder/decoder.
        n_res_channel: Intermediate channels of each residual block.
        embed_dim: Dimension of each codebook vector.
        n_embed: Number of codes in each codebook.
        decay: EMA decay forwarded to both ``Quantize`` layers.
    """

    def __init__(
        self,
        in_channel=3,
        channel=128,
        n_res_block=2,
        n_res_channel=32,
        embed_dim=64,
        n_embed=512,
        decay=0.99,
    ):
        super().__init__()

        self.enc_b = Encoder(in_channel, channel, n_res_block, n_res_channel, stride=4)
        self.enc_t = Encoder(channel, channel, n_res_block, n_res_channel, stride=2)
        self.quantize_conv_t = nn.Conv2d(channel, embed_dim, 1)
        # ``decay`` used to be accepted but silently dropped; forward it so a
        # non-default value actually takes effect.
        self.quantize_t = Quantize(embed_dim, n_embed, decay=decay)
        self.dec_t = Decoder(
            embed_dim, embed_dim, channel, n_res_block, n_res_channel, stride=2
        )
        self.quantize_conv_b = nn.Conv2d(embed_dim + channel, embed_dim, 1)
        self.quantize_b = Quantize(embed_dim, n_embed, decay=decay)
        self.upsample_t = nn.ConvTranspose2d(
            embed_dim, embed_dim, 4, stride=2, padding=1
        )
        self.dec = Decoder(
            embed_dim + embed_dim,
            in_channel,
            channel,
            n_res_block,
            n_res_channel,
            stride=4,
        )

    def forward(self, input):
        """Encode then decode ``input``.

        Returns:
            ``(dec, diff)``: the reconstruction and the summed quantization
            losses of both levels.
        """
        quant_t, quant_b, diff, _, _ = self.encode(input)
        dec = self.decode(quant_t, quant_b)

        return dec, diff

    def encode(self, input):
        """Encode ``input`` into quantized top and bottom feature maps.

        Returns:
            ``(quant_t, quant_b, diff, id_t, id_b)``: channel-first quantized
            features for both levels, the sum of both quantization losses
            (each unsqueezed to one element), and the code indices of both
            levels.
        """
        enc_b = self.enc_b(input)
        enc_t = self.enc_t(enc_b)

        # Quantize works on the last axis, so move channels last and back.
        quant_t = self.quantize_conv_t(enc_t).permute(0, 2, 3, 1)
        quant_t, diff_t, id_t = self.quantize_t(quant_t)
        quant_t = quant_t.permute(0, 3, 1, 2)
        diff_t = diff_t.unsqueeze(0)

        # Condition the bottom level on the decoded top codes.
        dec_t = self.dec_t(quant_t)
        enc_b = torch.cat([dec_t, enc_b], 1)

        quant_b = self.quantize_conv_b(enc_b).permute(0, 2, 3, 1)
        quant_b, diff_b, id_b = self.quantize_b(quant_b)
        quant_b = quant_b.permute(0, 3, 1, 2)
        diff_b = diff_b.unsqueeze(0)

        return quant_t, quant_b, diff_t + diff_b, id_t, id_b

    def decode(self, quant_t, quant_b):
        """Reconstruct from channel-first quantized top and bottom features."""
        upsample_t = self.upsample_t(quant_t)
        quant = torch.cat([upsample_t, quant_b], 1)
        dec = self.dec(quant)

        return dec

    def decode_code(self, code_t, code_b):
        """Reconstruct from integer code indices of the top and bottom levels."""
        quant_t = self.quantize_t.embed_code(code_t)
        quant_t = quant_t.permute(0, 3, 1, 2)
        quant_b = self.quantize_b.embed_code(code_b)
        quant_b = quant_b.permute(0, 3, 1, 2)

        dec = self.decode(quant_t, quant_b)

        return dec
239 | 


--------------------------------------------------------------------------------
/scheduler.py:
--------------------------------------------------------------------------------
  1 | from math import cos, pi, floor, sin
  2 | 
  3 | from torch.optim import lr_scheduler
  4 | 
  5 | 
  6 | class CosineLR(lr_scheduler._LRScheduler):
  7 |     def __init__(self, optimizer, lr_min, lr_max, step_size):
  8 |         self.lr_min = lr_min
  9 |         self.lr_max = lr_max
 10 |         self.step_size = step_size
 11 |         self.iteration = 0
 12 | 
 13 |         super().__init__(optimizer, -1)
 14 | 
 15 |     def get_lr(self):
 16 |         lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
 17 |             1 + cos(self.iteration / self.step_size * pi)
 18 |         )
 19 |         self.iteration += 1
 20 | 
 21 |         if self.iteration == self.step_size:
 22 |             self.iteration = 0
 23 | 
 24 |         return [lr for base_lr in self.base_lrs]
 25 | 
 26 | 
 27 | class PowerLR(lr_scheduler._LRScheduler):
 28 |     def __init__(self, optimizer, lr_min, lr_max, warmup):
 29 |         self.lr_min = lr_min
 30 |         self.lr_max = lr_max
 31 |         self.warmup = warmup
 32 |         self.iteration = 0
 33 | 
 34 |         super().__init__(optimizer, -1)
 35 | 
 36 |     def get_lr(self):
 37 |         if self.iteration < self.warmup:
 38 |             lr = (
 39 |                 self.lr_min + (self.lr_max - self.lr_min) / self.warmup * self.iteration
 40 |             )
 41 | 
 42 |         else:
 43 |             lr = self.lr_max * (self.iteration - self.warmup + 1) ** -0.5
 44 | 
 45 |         self.iteration += 1
 46 | 
 47 |         return [lr for base_lr in self.base_lrs]
 48 | 
 49 | 
 50 | class SineLR(lr_scheduler._LRScheduler):
 51 |     def __init__(self, optimizer, lr_min, lr_max, step_size):
 52 |         self.lr_min = lr_min
 53 |         self.lr_max = lr_max
 54 |         self.step_size = step_size
 55 |         self.iteration = 0
 56 | 
 57 |         super().__init__(optimizer, -1)
 58 | 
 59 |     def get_lr(self):
 60 |         lr = self.lr_min + (self.lr_max - self.lr_min) * sin(
 61 |             self.iteration / self.step_size * pi
 62 |         )
 63 |         self.iteration += 1
 64 | 
 65 |         if self.iteration == self.step_size:
 66 |             self.iteration = 0
 67 | 
 68 |         return [lr for base_lr in self.base_lrs]
 69 | 
 70 | 
 71 | class LinearLR(lr_scheduler._LRScheduler):
 72 |     def __init__(self, optimizer, lr_min, lr_max, warmup, step_size):
 73 |         self.lr_min = lr_min
 74 |         self.lr_max = lr_max
 75 |         self.step_size = step_size
 76 |         self.warmup = warmup
 77 |         self.iteration = 0
 78 | 
 79 |         super().__init__(optimizer, -1)
 80 | 
 81 |     def get_lr(self):
 82 |         if self.iteration < self.warmup:
 83 |             lr = self.lr_max
 84 | 
 85 |         else:
 86 |             lr = self.lr_max + (self.iteration - self.warmup) * (
 87 |                 self.lr_min - self.lr_max
 88 |             ) / (self.step_size - self.warmup)
 89 |         self.iteration += 1
 90 | 
 91 |         if self.iteration == self.step_size:
 92 |             self.iteration = 0
 93 | 
 94 |         return [lr for base_lr in self.base_lrs]
 95 | 
 96 | 
 97 | class CLR(lr_scheduler._LRScheduler):
 98 |     def __init__(self, optimizer, lr_min, lr_max, step_size):
 99 |         self.epoch = 0
100 |         self.lr_min = lr_min
101 |         self.lr_max = lr_max
102 |         self.current_lr = lr_min
103 |         self.step_size = step_size
104 | 
105 |         super().__init__(optimizer, -1)
106 | 
107 |     def get_lr(self):
108 |         cycle = floor(1 + self.epoch / (2 * self.step_size))
109 |         x = abs(self.epoch / self.step_size - 2 * cycle + 1)
110 |         lr = self.lr_min + (self.lr_max - self.lr_min) * max(0, 1 - x)
111 |         self.current_lr = lr
112 | 
113 |         self.epoch += 1
114 | 
115 |         return [lr for base_lr in self.base_lrs]
116 | 
117 | 
class Warmup(lr_scheduler._LRScheduler):
    """Rate that rises linearly for ``warmup`` steps and then decays as 1/sqrt(step).

    The result is scaled by ``factor * model_dim ** -0.5``. The schedule keeps
    its own 1-based ``iteration`` counter, advanced by each call to ``get_lr``.
    """

    def __init__(self, optimizer, model_dim, factor=1, warmup=16000):
        self.optimizer = optimizer
        self.iteration = 0
        self.warmup = warmup
        self.factor = factor
        self.model_dim = model_dim

        super().__init__(optimizer, -1)

    def get_lr(self):
        """Advance the counter and return the rate for every param group."""
        self.iteration += 1

        decay_term = self.iteration ** (-0.5)
        warmup_term = self.iteration * self.warmup ** (-1.5)
        lr = self.factor * self.model_dim ** (-0.5) * min(decay_term, warmup_term)

        return [lr] * len(self.base_lrs)
137 | 
138 | 
139 | # Copyright 2019 fastai
140 | 
141 | # Licensed under the Apache License, Version 2.0 (the "License");
142 | # you may not use this file except in compliance with the License.
143 | # You may obtain a copy of the License at
144 | 
145 | #     http://www.apache.org/licenses/LICENSE-2.0
146 | 
147 | # Unless required by applicable law or agreed to in writing, software
148 | # distributed under the License is distributed on an "AS IS" BASIS,
149 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
150 | # See the License for the specific language governing permissions and
151 | # limitations under the License.
152 | 
153 | 
154 | # Borrowed from https://github.com/fastai/fastai and changed to make it run like a PyTorch lr scheduler
155 | 
156 | 
class CycleAnnealScheduler:
    """One-cycle schedule: ramp up, ramp down, then a final annealing tail.

    The first ``cycle_step`` iterations raise the rate linearly from
    ``lr_max / lr_divider`` to ``lr_max``, the next ``cycle_step`` lower it
    back, and the remaining iterations up to ``step_size`` anneal it further.
    The counter wraps to 0 after ``step_size`` calls to ``step``.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts).
        lr_max: Peak learning rate.
        lr_divider: Ratio between the peak and the starting learning rate.
        cut_point: Percentage of ``step_size`` reserved for the annealing tail.
        step_size: Total number of iterations in one cycle.
        momentum: Optional ``(high, low)`` pair cycled inversely to the rate;
            ``None`` leaves momentum untouched.
    """

    def __init__(
        self, optimizer, lr_max, lr_divider, cut_point, step_size, momentum=None
    ):
        self.lr_max = lr_max
        self.lr_divider = lr_divider
        # Not read anywhere in this class; kept so the attribute still exists.
        self.cut_point = step_size // cut_point
        self.step_size = step_size
        self.iteration = 0
        self.cycle_step = int(step_size * (1 - cut_point / 100) / 2)
        self.momentum = momentum
        self.optimizer = optimizer

    def get_lr(self):
        """Return the learning rate for the current iteration."""
        if self.iteration > 2 * self.cycle_step:
            # Annealing tail: shrink from lr_max / lr_divider towards 1% of it.
            cut = (self.iteration - 2 * self.cycle_step) / (
                self.step_size - 2 * self.cycle_step
            )
            lr = self.lr_max * (1 + (cut * (1 - 100) / 100)) / self.lr_divider

        elif self.iteration > self.cycle_step:
            cut = 1 - (self.iteration - self.cycle_step) / self.cycle_step
            lr = self.lr_max * (1 + cut * (self.lr_divider - 1)) / self.lr_divider

        else:
            cut = self.iteration / self.cycle_step
            lr = self.lr_max * (1 + cut * (self.lr_divider - 1)) / self.lr_divider

        return lr

    def get_momentum(self):
        """Return the momentum for the current iteration (requires ``momentum``)."""
        if self.iteration > 2 * self.cycle_step:
            momentum = self.momentum[0]

        elif self.iteration > self.cycle_step:
            cut = 1 - (self.iteration - self.cycle_step) / self.cycle_step
            momentum = self.momentum[0] + cut * (self.momentum[1] - self.momentum[0])

        else:
            cut = self.iteration / self.cycle_step
            momentum = self.momentum[0] + cut * (self.momentum[1] - self.momentum[0])

        return momentum

    def step(self):
        """Write the current rate (and momentum) to the optimizer and advance.

        Returns:
            The learning rate that was applied.
        """
        lr = self.get_lr()
        momentum = self.get_momentum() if self.momentum is not None else None

        self.iteration += 1

        if self.iteration == self.step_size:
            self.iteration = 0

        for group in self.optimizer.param_groups:
            group['lr'] = lr

            if self.momentum is not None:
                # Adam-style groups keep momentum as betas[0]; others use a
                # plain 'momentum' entry. Writing 'betas' unconditionally
                # raised KeyError for the latter.
                if 'betas' in group:
                    group['betas'] = (momentum, group['betas'][1])

                else:
                    group['momentum'] = momentum

        return lr
219 | 
220 | 
def anneal_linear(start, end, proportion):
    """Interpolate linearly from ``start`` (proportion 0) to ``end`` (proportion 1)."""
    span = end - start

    return start + proportion * span
223 | 
224 | 
def anneal_cos(start, end, proportion):
    """Interpolate from ``start`` (proportion 0) to ``end`` (proportion 1) along a half cosine."""
    half_range = (start - end) / 2

    return end + half_range * (cos(pi * proportion) + 1)
229 | 
230 | 
class Phase:
    """One segment of a schedule: ``n_iter`` steps annealed from ``start`` to ``end``.

    ``anneal_fn`` is called as ``anneal_fn(start, end, proportion)``, where
    ``proportion`` is the fraction of steps taken so far.
    """

    def __init__(self, start, end, n_iter, anneal_fn):
        self.start = start
        self.end = end
        self.n_iter = n_iter
        self.anneal_fn = anneal_fn
        self.n = 0

    def step(self):
        """Advance by one step and return the annealed value."""
        self.n += 1
        proportion = self.n / self.n_iter

        return self.anneal_fn(self.start, self.end, proportion)

    def reset(self):
        """Rewind the phase to its first step."""
        self.n = 0

    @property
    def is_done(self):
        """Whether all ``n_iter`` steps have been taken."""
        return self.n_iter <= self.n
249 | 
250 | 
class CycleScheduler:
    """Two-phase cyclic schedule for learning rate and (optionally) momentum.

    The first phase takes ``int(n_iter * warmup_proportion)`` steps from
    ``lr_max / divider`` up to ``lr_max``; the second takes the remaining
    steps down to ``lr_max / divider / 1e4``. Momentum, when given as a
    ``(mom1, mom2)`` pair, moves from ``mom1`` to ``mom2`` and back over the
    same phases. After the last phase the schedule restarts.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts).
        lr_max: Peak learning rate.
        n_iter: Total number of steps in one cycle.
        momentum: ``(mom1, mom2)`` pair, or ``None`` to leave momentum alone.
        divider: Ratio between the peak and the starting learning rate.
        warmup_proportion: Fraction of ``n_iter`` spent in the first phase.
        phase: Names (``'linear'`` or ``'cos'``) of the annealing function
            used by the first and second phase.
    """

    def __init__(
        self,
        optimizer,
        lr_max,
        n_iter,
        momentum=(0.95, 0.85),
        divider=25,
        warmup_proportion=0.3,
        phase=('linear', 'cos'),
    ):
        self.optimizer = optimizer

        warmup_steps = int(n_iter * warmup_proportion)
        anneal_steps = n_iter - warmup_steps
        lr_min = lr_max / divider

        anneal_by_name = {'linear': anneal_linear, 'cos': anneal_cos}
        warmup_fn = anneal_by_name[phase[0]]
        anneal_fn = anneal_by_name[phase[1]]

        self.lr_phase = [
            Phase(lr_min, lr_max, warmup_steps, warmup_fn),
            Phase(lr_max, lr_min / 1e4, anneal_steps, anneal_fn),
        ]

        self.momentum = momentum

        if momentum is None:
            self.momentum_phase = []

        else:
            mom1, mom2 = momentum
            self.momentum_phase = [
                Phase(mom1, mom2, warmup_steps, warmup_fn),
                Phase(mom2, mom1, anneal_steps, anneal_fn),
            ]

        self.phase = 0

    def step(self):
        """Apply the next rate/momentum to the optimizer.

        Returns:
            ``(lr, momentum)``; ``momentum`` is ``None`` when momentum
            scheduling is disabled.
        """
        current = self.phase
        lr = self.lr_phase[current].step()

        momentum = None
        if self.momentum is not None:
            momentum = self.momentum_phase[current].step()

        for group in self.optimizer.param_groups:
            group['lr'] = lr

            if self.momentum is None:
                continue

            # Adam-style groups store momentum as betas[0].
            if 'betas' in group:
                group['betas'] = (momentum, group['betas'][1])

            else:
                group['momentum'] = momentum

        if self.lr_phase[self.phase].is_done:
            self.phase += 1

        if self.phase >= len(self.lr_phase):
            # Cycle finished: rewind every phase and start over.
            for finished in self.lr_phase + self.momentum_phase:
                finished.reset()

            self.phase = 0

        return lr, momentum
321 | 
322 | 
class LRFinder(lr_scheduler._LRScheduler):
    """Learning-rate range test sweeping from ``lr_min`` to ``lr_max``.

    The rate reaches ``lr_max`` when the internal ``iteration`` counter equals
    ``step_size``, growing either linearly or geometrically. Every rate
    produced by ``get_lr`` is appended to ``lrs``; losses passed to ``record``
    are appended to ``losses``.

    Args:
        optimizer: Wrapped optimizer.
        lr_min: Rate at iteration 0.
        lr_max: Rate at iteration ``step_size``.
        step_size: Number of iterations in the sweep.
        linear: Sweep linearly when true, geometrically otherwise.
    """

    def __init__(self, optimizer, lr_min, lr_max, step_size, linear=False):
        self.linear = linear
        self.lr_min = lr_min

        if linear:
            # Additive increment per iteration. The earlier slope
            # ``(lr_max / lr_min) / step_size`` ended the sweep at
            # ``lr_min + lr_max / lr_min`` rather than at ``lr_max``.
            self.lr_mult = (lr_max - lr_min) / step_size

        else:
            # Multiplicative factor per iteration.
            self.lr_mult = (lr_max / lr_min) ** (1 / step_size)

        self.iteration = 0
        self.lrs = []
        self.losses = []

        super().__init__(optimizer, -1)

    def get_lr(self):
        """Return the current rate for every param group, log it, and advance."""
        if self.linear:
            lr = self.lr_min + self.lr_mult * self.iteration

        else:
            lr = self.lr_min * self.lr_mult ** self.iteration

        self.iteration += 1
        self.lrs.append(lr)

        return [lr for _ in self.base_lrs]

    def record(self, loss):
        """Store the loss observed for the most recent rate."""
        self.losses.append(loss)

    def save(self, filename):
        """Write the recorded ``lr,loss`` pairs to ``filename``, one per line."""
        with open(filename, 'w') as f:
            for lr, loss in zip(self.lrs, self.losses):
                f.write('{},{}\n'.format(lr, loss))
355 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | MIT License
  2 | 
  3 | Copyright (c) 2019 Kim Seonghyeon
  4 | 
  5 | Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | of this software and associated documentation files (the "Software"), to deal
  7 | in the Software without restriction, including without limitation the rights
  8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | copies of the Software, and to permit persons to whom the Software is
 10 | furnished to do so, subject to the following conditions:
 11 | 
 12 | The above copyright notice and this permission notice shall be included in all
 13 | copies or substantial portions of the Software.
 14 | 
 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | SOFTWARE.
 22 | 
 23 | 
 24 | ==============================================================================
 25 | PixelSNAIL
 26 | ==============================================================================
 27 | 
 28 | MIT License
 29 | 
 30 | Copyright (c) 2019 Xi Chen
 31 | 
 32 | Permission is hereby granted, free of charge, to any person obtaining a copy
 33 | of this software and associated documentation files (the "Software"), to deal
 34 | in the Software without restriction, including without limitation the rights
 35 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 36 | copies of the Software, and to permit persons to whom the Software is
 37 | furnished to do so, subject to the following conditions:
 38 | 
 39 | The above copyright notice and this permission notice shall be included in all
 40 | copies or substantial portions of the Software.
 41 | 
 42 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 43 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 44 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 45 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 46 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 47 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 48 | SOFTWARE.
 49 | 
 50 | ==============================================================================
 51 | Learning rate scheduler and VQ-VAE
 52 | ==============================================================================
 53 | 
 54 | Apache License, Version 2.0 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/
 55 | 
 56 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 57 | 
 58 | 1. Definitions.
 59 | 
 60 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
 61 | 
 62 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
 63 | 
 64 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
 65 | 
 66 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
 67 | 
 68 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
 69 | 
 70 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
 71 | 
 72 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
 73 | 
 74 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
 75 | 
 76 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
 77 | 
 78 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
 79 | 
 80 | 2. Grant of Copyright License.
 81 | 
 82 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
 83 | 
 84 | 3. Grant of Patent License.
 85 | 
 86 | Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
 87 | 
 88 | 4. Redistribution.
 89 | 
 90 | You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
 91 | 
 92 | You must give any other recipients of the Work or Derivative Works a copy of this License; and You must cause any modified files to carry prominent notices stating that You changed the files; and You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
 93 | 
 94 | 5. Submission of Contributions.
 95 | 
 96 | Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
 97 | 
 98 | 6. Trademarks.
 99 | 
100 | This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
101 | 
102 | 7. Disclaimer of Warranty.
103 | 
104 | Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
105 | 
106 | 8. Limitation of Liability.
107 | 
108 | In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
109 | 
110 | 9. Accepting Warranty or Additional Liability.
111 | 
112 | While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
113 | 
114 | 


--------------------------------------------------------------------------------
/pixelsnail.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Xi Chen
  2 | #
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | # Borrowed from https://github.com/neocxi/pixelsnail-public and ported it to PyTorch
  7 | 
  8 | from math import sqrt
  9 | from functools import partial, lru_cache
 10 | 
 11 | import numpy as np
 12 | import torch
 13 | from torch import nn
 14 | from torch.nn import functional as F
 15 | 
 16 | 
 17 | def wn_linear(in_dim, out_dim):
 18 |     return nn.utils.weight_norm(nn.Linear(in_dim, out_dim))
 19 | 
 20 | 
 21 | class WNConv2d(nn.Module):
 22 |     def __init__(
 23 |         self,
 24 |         in_channel,
 25 |         out_channel,
 26 |         kernel_size,
 27 |         stride=1,
 28 |         padding=0,
 29 |         bias=True,
 30 |         activation=None,
 31 |     ):
 32 |         super().__init__()
 33 | 
 34 |         self.conv = nn.utils.weight_norm(
 35 |             nn.Conv2d(
 36 |                 in_channel,
 37 |                 out_channel,
 38 |                 kernel_size,
 39 |                 stride=stride,
 40 |                 padding=padding,
 41 |                 bias=bias,
 42 |             )
 43 |         )
 44 | 
 45 |         self.out_channel = out_channel
 46 | 
 47 |         if isinstance(kernel_size, int):
 48 |             kernel_size = [kernel_size, kernel_size]
 49 | 
 50 |         self.kernel_size = kernel_size
 51 | 
 52 |         self.activation = activation
 53 | 
 54 |     def forward(self, input):
 55 |         out = self.conv(input)
 56 | 
 57 |         if self.activation is not None:
 58 |             out = self.activation(out)
 59 | 
 60 |         return out
 61 | 
 62 | 
 63 | def shift_down(input, size=1):
 64 |     return F.pad(input, [0, 0, size, 0])[:, :, : input.shape[2], :]
 65 | 
 66 | 
 67 | def shift_right(input, size=1):
 68 |     return F.pad(input, [size, 0, 0, 0])[:, :, :, : input.shape[3]]
 69 | 
 70 | 
 71 | class CausalConv2d(nn.Module):
 72 |     def __init__(
 73 |         self,
 74 |         in_channel,
 75 |         out_channel,
 76 |         kernel_size,
 77 |         stride=1,
 78 |         padding='downright',
 79 |         activation=None,
 80 |     ):
 81 |         super().__init__()
 82 | 
 83 |         if isinstance(kernel_size, int):
 84 |             kernel_size = [kernel_size] * 2
 85 | 
 86 |         self.kernel_size = kernel_size
 87 | 
 88 |         if padding == 'downright':
 89 |             pad = [kernel_size[1] - 1, 0, kernel_size[0] - 1, 0]
 90 | 
 91 |         elif padding == 'down' or padding == 'causal':
 92 |             pad = kernel_size[1] // 2
 93 | 
 94 |             pad = [pad, pad, kernel_size[0] - 1, 0]
 95 | 
 96 |         self.causal = 0
 97 |         if padding == 'causal':
 98 |             self.causal = kernel_size[1] // 2
 99 | 
100 |         self.pad = nn.ZeroPad2d(pad)
101 | 
102 |         self.conv = WNConv2d(
103 |             in_channel,
104 |             out_channel,
105 |             kernel_size,
106 |             stride=stride,
107 |             padding=0,
108 |             activation=activation,
109 |         )
110 | 
111 |     def forward(self, input):
112 |         out = self.pad(input)
113 | 
114 |         if self.causal > 0:
115 |             self.conv.conv.weight_v.data[:, :, -1, self.causal :].zero_()
116 | 
117 |         out = self.conv(out)
118 | 
119 |         return out
120 | 
121 | 
class GatedResBlock(nn.Module):
    """Residual block with two convolutions and a GLU gate.

    The data flow is ``activation -> conv1 (+ aux) -> activation -> dropout
    -> conv2 (+ condition) -> GLU over channels -> + input``. ``conv2``
    produces ``2 * in_channel`` channels, which the GLU halves back to
    ``in_channel`` so the residual addition lines up.

    Args:
        in_channel: Channels of the block input and output.
        channel: Channels produced by the first convolution.
        kernel_size: Kernel size for both convolutions. Must be an ``int``
            when ``conv == 'wnconv2d'`` because it is halved to derive the
            padding.
        conv: Convolution flavour: ``'wnconv2d'`` (``WNConv2d`` padded by
            ``kernel_size // 2``), ``'causal_downright'`` or ``'causal'``
            (``CausalConv2d`` with the matching padding mode).
        activation: Zero-argument factory for the activation module.
        dropout: Dropout probability applied before the second convolution.
        auxiliary_channel: If positive, channels of the optional
            ``aux_input`` that is projected by a 1x1 convolution.
        condition_dim: If positive, channels of the optional ``condition``
            that is projected by a bias-free 1x1 convolution.

    Raises:
        ValueError: If ``conv`` is not one of the supported flavours.
    """

    def __init__(
        self,
        in_channel,
        channel,
        kernel_size,
        conv='wnconv2d',
        activation=nn.ELU,
        dropout=0.1,
        auxiliary_channel=0,
        condition_dim=0,
    ):
        super().__init__()

        if conv == 'wnconv2d':
            conv_module = partial(WNConv2d, padding=kernel_size // 2)

        elif conv == 'causal_downright':
            conv_module = partial(CausalConv2d, padding='downright')

        elif conv == 'causal':
            conv_module = partial(CausalConv2d, padding='causal')

        else:
            # Previously an unknown flavour left `conv_module` unbound and
            # surfaced as an opaque UnboundLocalError further down.
            raise ValueError(
                f"unsupported conv type {conv!r}; "
                "expected 'wnconv2d', 'causal_downright' or 'causal'"
            )

        self.activation = activation()
        self.conv1 = conv_module(in_channel, channel, kernel_size)

        if auxiliary_channel > 0:
            self.aux_conv = WNConv2d(auxiliary_channel, channel, 1)

        self.dropout = nn.Dropout(dropout)

        # Twice the channels: the GLU below consumes one half as the gate.
        self.conv2 = conv_module(channel, in_channel * 2, kernel_size)

        if condition_dim > 0:
            self.condition = WNConv2d(condition_dim, in_channel * 2, 1, bias=False)

        self.gate = nn.GLU(1)

    def forward(self, input, aux_input=None, condition=None):
        """Apply the block.

        Args:
            input: Block input; it is also the residual branch.
            aux_input: Optional tensor added (after activation and a 1x1
                convolution) to the first convolution's output. Requires
                ``auxiliary_channel > 0`` at construction.
            condition: Optional tensor added (after a 1x1 convolution) to the
                second convolution's output, before gating. Requires
                ``condition_dim > 0`` at construction.

        Returns:
            ``input`` plus the gated residual.
        """
        out = self.conv1(self.activation(input))

        if aux_input is not None:
            out = out + self.aux_conv(self.activation(aux_input))

        out = self.activation(out)
        out = self.dropout(out)
        out = self.conv2(out)

        if condition is not None:
            condition = self.condition(condition)
            out += condition

        out = self.gate(out)
        out += input

        return out
180 | 
181 | 
@lru_cache(maxsize=64)
def causal_mask(size):
    """Return the attention masks for a sequence of ``size`` positions.

    Results are memoised per ``size``, so the returned tensors are shared
    between calls and should not be modified in place.

    Returns:
        A ``(mask, start_mask)`` tuple:

        * ``mask``: ``uint8`` tensor of shape ``(1, size, size)`` that is 1
          strictly below the diagonal and 0 elsewhere.
        * ``start_mask``: ``float32`` tensor of shape ``(size, 1)`` that is 0
          for the first position and 1 for every other one.
    """
    # Strictly-upper triangle, transposed into a strictly-lower triangle.
    strictly_upper = np.triu(np.ones((size, size)), k=1)
    strictly_lower = strictly_upper.astype(np.uint8).T

    not_first = np.ones(size).astype(np.float32)
    not_first[0] = 0

    mask = torch.from_numpy(strictly_lower).unsqueeze(0)
    start_mask = torch.from_numpy(not_first).unsqueeze(1)

    return mask, start_mask
193 | 
194 | 
class CausalAttention(nn.Module):
    """Multi-head attention over flattened spatial positions with a causal mask.

    Args:
        query_channel: Channel count of the ``query`` feature map.
        key_channel: Channel count of the ``key`` feature map (also used to
            compute the values).
        channel: Total projected width; each head gets ``channel // n_head``.
        n_head: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, query_channel, key_channel, channel, n_head=8, dropout=0.1):
        super().__init__()

        self.query = wn_linear(query_channel, channel)
        self.key = wn_linear(key_channel, channel)
        self.value = wn_linear(key_channel, channel)

        self.dim_head = channel // n_head
        self.n_head = n_head

        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key):
        """Attend from ``query`` to ``key``.

        Both inputs are 4-D ``(batch, channel, height, width)`` maps; the
        spatial size is read from ``key``.

        Returns:
            Tensor of shape ``(batch, dim_head * n_head, height, width)``.
        """
        batch, _, height, width = key.shape
        n_head, dim_head = self.n_head, self.dim_head

        def split_heads(flat):
            # (batch, positions, channel) -> (batch, head, positions, dim_head)
            return flat.view(batch, -1, n_head, dim_head).transpose(1, 2)

        # Flatten the spatial grid into a sequence of positions.
        query_seq = query.view(batch, query.shape[1], -1).transpose(1, 2)
        key_seq = key.view(batch, key.shape[1], -1).transpose(1, 2)

        q = split_heads(self.query(query_seq))
        k_t = split_heads(self.key(key_seq)).transpose(2, 3)
        v = split_heads(self.value(key_seq))

        scores = torch.matmul(q, k_t) / sqrt(dim_head)

        mask, start_mask = causal_mask(height * width)
        mask = mask.type_as(q)
        start_mask = start_mask.type_as(q)

        # Disallowed positions get a large negative score before the softmax;
        # the first position has nothing to attend to, so its row is zeroed.
        scores = scores.masked_fill(mask == 0, -1e4)
        weights = torch.softmax(scores, 3) * start_mask
        weights = self.dropout(weights)

        context = (weights @ v).transpose(1, 2)
        context = context.reshape(batch, height, width, dim_head * n_head)

        return context.permute(0, 3, 1, 2)
235 | 
236 | 
class PixelBlock(nn.Module):
    """Stack of causal gated residual blocks followed by an output stage.

    With ``attention=True`` the output stage is a causal attention layer
    whose result is merged back through a gated residual block; otherwise it
    is a single 1x1 convolution over the features concatenated with
    ``background``.

    Args:
        in_channel: Channels of the block input and output.
        channel: Hidden channels of every residual block in the stack.
        kernel_size: Kernel size of the residual blocks in the stack.
        n_res_block: Number of residual blocks in the stack.
        attention: Whether to use the attention output stage.
        dropout: Dropout probability used throughout the block.
        condition_dim: Channels of the optional conditioning map.
    """

    def __init__(
        self,
        in_channel,
        channel,
        kernel_size,
        n_res_block,
        attention=True,
        dropout=0.1,
        condition_dim=0,
    ):
        super().__init__()

        self.resblocks = nn.ModuleList(
            [
                GatedResBlock(
                    in_channel,
                    channel,
                    kernel_size,
                    conv='causal',
                    dropout=dropout,
                    condition_dim=condition_dim,
                )
                for _ in range(n_res_block)
            ]
        )

        self.attention = attention

        if not attention:
            # The extra 2 input channels are taken by `background`.
            self.out = WNConv2d(in_channel + 2, in_channel, 1)
            return

        # Keys see the block input, the stack output and the background;
        # queries see only the stack output and the background.
        key_channel = in_channel * 2 + 2
        query_channel = in_channel + 2

        self.key_resblock = GatedResBlock(key_channel, in_channel, 1, dropout=dropout)
        self.query_resblock = GatedResBlock(
            query_channel, in_channel, 1, dropout=dropout
        )

        self.causal_attention = CausalAttention(
            query_channel, key_channel, in_channel // 2, dropout=dropout
        )

        self.out_resblock = GatedResBlock(
            in_channel,
            in_channel,
            1,
            auxiliary_channel=in_channel // 2,
            dropout=dropout,
        )

    def forward(self, input, background, condition=None):
        """Run the residual stack and the configured output stage.

        Args:
            input: Feature map with ``in_channel`` channels.
            background: Map concatenated along the channel dimension; the
                layer sizes reserve 2 channels for it.
            condition: Optional conditioning map passed to every residual
                block of the stack.
        """
        hidden = input
        for resblock in self.resblocks:
            hidden = resblock(hidden, condition=condition)

        if not self.attention:
            return self.out(torch.cat([hidden, background], 1))

        key = self.key_resblock(torch.cat([input, hidden, background], 1))
        query = self.query_resblock(torch.cat([hidden, background], 1))
        attended = self.causal_attention(query, key)

        return self.out_resblock(hidden, attended)
309 | 
310 | 
class CondResNet(nn.Module):
    """Weight-normalised convolution followed by gated residual blocks.

    Args:
        in_channel: Channels of the input map.
        channel: Channels produced by the first convolution and kept by
            every residual block.
        kernel_size: Integer kernel size shared by all layers; the first
            convolution is padded by ``kernel_size // 2``.
        n_res_block: Number of ``GatedResBlock`` layers after the first
            convolution.
    """

    def __init__(self, in_channel, channel, kernel_size, n_res_block):
        super().__init__()

        layers = [WNConv2d(in_channel, channel, kernel_size, padding=kernel_size // 2)]
        layers.extend(
            GatedResBlock(channel, channel, kernel_size) for _ in range(n_res_block)
        )

        self.blocks = nn.Sequential(*layers)

    def forward(self, input):
        """Pass ``input`` through the layers in order."""
        return self.blocks(input)
324 | 
325 | 
class PixelSNAIL(nn.Module):
    """Autoregressive model over a 2-D grid of discrete codes.

    Args:
        shape: ``(height, width)`` of the code grid.
        n_class: Number of distinct code values; inputs are one-hot encoded
            to this many channels and the output has this many channels.
        channel: Feature channels carried between pixel blocks.
        kernel_size: Kernel size of the pixel blocks. The two input
            convolutions use this value rounded up to the next odd number.
        n_block: Number of ``PixelBlock`` layers.
        n_res_block: Residual blocks inside each ``PixelBlock``.
        res_channel: Hidden channels of the residual blocks.
        attention: Whether the pixel blocks use causal attention.
        dropout: Dropout probability of the pixel blocks.
        n_cond_res_block: If positive, a ``CondResNet`` with this many
            residual blocks is created to embed the ``condition`` input.
        cond_res_channel: Channels of the conditioning features.
        cond_res_kernel: Kernel size of the conditioning network.
        n_out_res_block: Number of 1x1 gated residual blocks placed before
            the final ELU + 1x1 convolution.
    """

    def __init__(
        self,
        shape,
        n_class,
        channel,
        kernel_size,
        n_block,
        n_res_block,
        res_channel,
        attention=True,
        dropout=0.1,
        n_cond_res_block=0,
        cond_res_channel=0,
        cond_res_kernel=3,
        n_out_res_block=0,
    ):
        super().__init__()

        height, width = shape

        self.n_class = n_class

        # Round an even kernel size up to the next odd value.
        kernel = kernel_size + 1 if kernel_size % 2 == 0 else kernel_size

        self.horizontal = CausalConv2d(
            n_class, channel, [kernel // 2, kernel], padding='down'
        )
        self.vertical = CausalConv2d(
            n_class, channel, [(kernel + 1) // 2, kernel // 2], padding='downright'
        )

        # Two coordinate planes: row index and column index, each mapped to
        # the range [-0.5, 0.5).
        rows = (torch.arange(height).float() - height / 2) / height
        row_plane = rows.view(1, 1, height, 1).expand(1, 1, height, width)
        cols = (torch.arange(width).float() - width / 2) / width
        col_plane = cols.view(1, 1, 1, width).expand(1, 1, height, width)
        self.register_buffer('background', torch.cat([row_plane, col_plane], 1))

        self.blocks = nn.ModuleList(
            [
                PixelBlock(
                    channel,
                    res_channel,
                    kernel_size,
                    n_res_block,
                    attention=attention,
                    dropout=dropout,
                    condition_dim=cond_res_channel,
                )
                for _ in range(n_block)
            ]
        )

        if n_cond_res_block > 0:
            self.cond_resnet = CondResNet(
                n_class, cond_res_channel, cond_res_kernel, n_cond_res_block
            )

        head = [GatedResBlock(channel, res_channel, 1) for _ in range(n_out_res_block)]
        head.append(nn.ELU(inplace=True))
        head.append(WNConv2d(channel, n_class, 1))

        self.out = nn.Sequential(*head)

    def _to_one_hot(self, codes):
        """One-hot encode integer ``codes`` into channels-first form.

        The result is cast to the dtype/device of the ``background`` buffer.
        """
        encoded = F.one_hot(codes, self.n_class).permute(0, 3, 1, 2)

        return encoded.type_as(self.background)

    def forward(self, input, condition=None, cache=None):
        """Compute per-position class scores.

        Args:
            input: Integer tensor of shape ``(batch, height, width)``.
            condition: Optional integer code tensor. Unless cached features
                exist, it is one-hot encoded, embedded by ``cond_resnet``
                and upsampled by a factor of 2.
            cache: Optional dict. The embedded condition is stored under
                ``'condition'`` and reused on later calls that pass the same
                dict; a fresh dict is created when ``None``.

        Returns:
            ``(out, cache)`` where ``out`` has ``n_class`` channels.
        """
        cache = {} if cache is None else cache

        batch, height, width = input.shape
        codes = self._to_one_hot(input)

        out = shift_down(self.horizontal(codes)) + shift_right(self.vertical(codes))

        # Only the first `height` rows are used, so `input` may hold fewer
        # rows than the configured grid.
        background = self.background[:, :, :height, :].expand(batch, 2, height, width)

        if condition is not None:
            if 'condition' in cache:
                features = cache['condition']

            else:
                features = self.cond_resnet(self._to_one_hot(condition))
                features = F.interpolate(features, scale_factor=2)
                cache['condition'] = features.detach().clone()

            condition = features[:, :, :height, :]

        for block in self.blocks:
            out = block(out, background, condition=condition)

        return self.out(out), cache
432 | 


--------------------------------------------------------------------------------