├── pose_synthesis ├── zero123-xl.ckpt ├── ldm │ ├── data │ │ ├── __init__.py │ │ ├── inpainting │ │ │ ├── __init__.py │ │ │ └── synthetic_mask.py │ │ ├── dummy.py │ │ ├── base.py │ │ ├── lsun.py │ │ └── nerf_like.py │ ├── models │ │ └── diffusion │ │ │ ├── __init__.py │ │ │ └── sampling_util.py │ ├── modules │ │ ├── encoders │ │ │ └── __init__.py │ │ ├── distributions │ │ │ ├── __init__.py │ │ │ └── distributions.py │ │ ├── diffusionmodules │ │ │ └── __init__.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── contperceptual.py │ │ │ └── vqperceptual.py │ │ ├── image_degradation │ │ │ ├── utils │ │ │ │ └── test.png │ │ │ └── __init__.py │ │ ├── ema.py │ │ └── evaluate │ │ │ ├── ssim.py │ │ │ └── frechet_video_distance.py │ ├── thirdp │ │ └── psp │ │ │ ├── id_loss.py │ │ │ ├── model_irse.py │ │ │ └── helpers.py │ ├── extras.py │ ├── guidance.py │ ├── lr_scheduler.py │ └── util.py ├── sam_vit_h_4b8939.pth ├── elevation_estimate │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── weights │ │ │ └── .gitkeep │ │ ├── utils3d.py │ │ ├── plotting.py │ │ └── elev_est_api.py │ ├── .gitignore │ ├── loftr │ │ ├── __init__.py │ │ ├── loftr_module │ │ │ ├── __init__.py │ │ │ ├── fine_preprocess.py │ │ │ ├── linear_attention.py │ │ │ └── transformer.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── resnet_fpn.py │ │ ├── utils │ │ │ ├── cvpr_ds_config.py │ │ │ ├── position_encoding.py │ │ │ ├── geometry.py │ │ │ ├── fine_matching.py │ │ │ └── supervision.py │ │ └── loftr.py │ ├── pyproject.toml │ └── estimate_wild_imgs.py ├── download_ckpt.py ├── utils │ ├── sam_utils.py │ ├── utils.py │ └── zero123_utils.py ├── pose_synthesis_batch.py ├── configs │ └── sd-objaverse-finetune-c_concat-256.yaml ├── run.py └── README.md ├── .gitignore ├── imgs ├── demo │ ├── Intro.jpg │ └── pipeline.jpg ├── sofa_set │ ├── sofa_10_a.png │ ├── sofa_11_a.png │ ├── sofa_12_a.png │ ├── sofa_13_a.png │ ├── sofa_14_a.png │ ├── sofa_15_a.png │ ├── sofa_16_a.png │ ├── sofa_17_a.png │ ├── sofa_18_a.png │ ├── sofa_19_a.png │ ├── sofa_1_a.png │ ├── sofa_20_a.png │ ├── sofa_21_a.png │ ├── sofa_22_a.png │ ├── sofa_23_a.png │ ├── sofa_24_a.png │ ├── sofa_25_a.png │ ├── sofa_26_a.png │ ├── sofa_27_a.png │ ├── sofa_28_a.png │ ├── sofa_2_a.png │ ├── sofa_3_a.png │ ├── sofa_4_a.png │ ├── sofa_5_a.png │ ├── sofa_6_a.png │ ├── sofa_7_a.png │ ├── sofa_8_a.png │ ├── sofa_9_a.png │ ├── sofa_bg_a.png │ ├── sofa_bg_b.png │ ├── sofa_bg_f1.png │ └── sofa_bg_f2.png └── synthesized_imgs │ └── sofa_1_a │ ├── -30_0.png │ ├── 0_-10.png │ ├── 0_-20.png │ ├── 0_-30.png │ ├── 0_-40.png │ ├── 0_0.png │ ├── 0_10.png │ ├── 0_120.png │ ├── 0_20.png │ ├── 0_30.png │ ├── 0_40.png │ ├── 0_50.png │ ├── 0_60.png │ ├── 0_90.png │ ├── 10_10.png │ ├── 10_20.png │ ├── 10_40.png │ ├── 10_50.png │ ├── -10_10.png │ ├── -10_20.png │ ├── -10_40.png │ ├── -10_50.png │ ├── -30_120.png │ ├── -30_30.png │ ├── -30_60.png │ ├── -30_90.png │ ├── 10_-10.png │ ├── 10_-20.png │ ├── 10_-30.png │ └── 10_-40.png ├── obj_name_synthesis.py ├── requirements.txt ├── util.py ├── pose_estimation.py ├── README.md └── train_pose_estimator.py /pose_synthesis/zero123-xl.ckpt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/sam_vit_h_4b8939.pth: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | **/.DS_Store 3 | /tmp -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/inpainting/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/weights/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .idea/ 3 | *.egg-info/ 4 | -------------------------------------------------------------------------------- /imgs/demo/Intro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/demo/Intro.jpg -------------------------------------------------------------------------------- /imgs/demo/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/demo/pipeline.jpg -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_10_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_10_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_11_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_11_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_12_a.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_12_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_13_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_13_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_14_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_14_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_15_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_15_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_16_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_16_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_17_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_17_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_18_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_18_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_19_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_19_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_1_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_1_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_20_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_20_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_21_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_21_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_22_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_22_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_23_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_23_a.png 
-------------------------------------------------------------------------------- /imgs/sofa_set/sofa_24_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_24_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_25_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_25_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_26_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_26_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_27_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_27_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_28_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_28_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_2_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_2_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_3_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_3_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_4_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_4_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_5_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_5_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_6_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_6_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_7_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_7_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_8_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_8_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_9_a.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_9_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_b.png -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_f1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_f1.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_f2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_f2.png -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/__init__.py: -------------------------------------------------------------------------------- 1 | from .loftr import LoFTR 2 | from .utils.cvpr_ds_config import default_cfg 3 | -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_0.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_0.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_120.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_50.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_60.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_90.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_40.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_50.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_50.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_120.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_60.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_90.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-20.png 
-------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-40.png -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import LocalFeatureTransformer 2 | from .fine_preprocess import FinePreprocess 3 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/pose_synthesis/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "elevation_estimate" 3 | version = "0.1" 4 | 5 | [tool.setuptools.packages.find] 6 | exclude = ["configs", "tests"] # empty by default 7 | namespaces = false # true by default -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/estimate_wild_imgs.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from .utils.elev_est_api import elev_est_api 3 | 4 | def estimate_elev(root_dir): 5 | img_dir = osp.join(root_dir, "stage2_8") 6 | img_paths = [] 7 | for i in range(4): 8 | img_paths.append(f"{img_dir}/0_{i}.png") 9 | elev = elev_est_api(img_paths) 10 | return elev 11 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_fpn import ResNetFPN_8_2, ResNetFPN_16_4 2 | 3 | 4 | def build_backbone(config): 5 | if config['backbone_type'] == 'ResNetFPN': 6 | if config['resolution'] == (8, 2): 7 | return ResNetFPN_8_2(config['resnetfpn']) 8 | elif config['resolution'] == (16, 4): 9 | return ResNetFPN_16_4(config['resnetfpn']) 10 | else: 11 | raise ValueError(f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.") 12 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/thirdp/psp/id_loss.py: -------------------------------------------------------------------------------- 1 | # 
https://github.com/eladrich/pixel2style2pixel 2 | import torch 3 | from torch import nn 4 | from ldm.thirdp.psp.model_irse import Backbone 5 | 6 | 7 | class IDFeatures(nn.Module): 8 | def __init__(self, model_path): 9 | super(IDFeatures, self).__init__() 10 | print('Loading ResNet ArcFace') 11 | self.facenet = Backbone(input_size=112, num_layers=50, drop_ratio=0.6, mode='ir_se') 12 | self.facenet.load_state_dict(torch.load(model_path, map_location="cpu")) 13 | self.face_pool = torch.nn.AdaptiveAvgPool2d((112, 112)) 14 | self.facenet.eval() 15 | 16 | def forward(self, x, crop=False): 17 | # Not sure of the image range here 18 | if crop: 19 | x = torch.nn.functional.interpolate(x, (256, 256), mode="area") 20 | x = x[:, :, 35:223, 32:220] 21 | x = self.face_pool(x) 22 | x_feats = self.facenet(x) 23 | return x_feats 24 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/dummy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import string 4 | from torch.utils.data import Dataset, Subset 5 | 6 | class DummyData(Dataset): 7 | def __init__(self, length, size): 8 | self.length = length 9 | self.size = size 10 | 11 | def __len__(self): 12 | return self.length 13 | 14 | def __getitem__(self, i): 15 | x = np.random.randn(*self.size) 16 | letters = string.ascii_lowercase 17 | y = ''.join(random.choice(string.ascii_lowercase) for i in range(10)) 18 | return {"jpg": x, "txt": y} 19 | 20 | 21 | class DummyDataWithEmbeddings(Dataset): 22 | def __init__(self, length, size, emb_size): 23 | self.length = length 24 | self.size = size 25 | self.emb_size = emb_size 26 | 27 | def __len__(self): 28 | return self.length 29 | 30 | def __getitem__(self, i): 31 | x = np.random.randn(*self.size) 32 | y = np.random.randn(*self.emb_size).astype(np.float32) 33 | return {"jpg": x, "txt": y} 34 | 35 | -------------------------------------------------------------------------------- /obj_name_synthesis.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from transformers import AutoProcessor, Blip2ForConditionalGeneration 4 | import torch 5 | from diffusers.utils import load_image 6 | import argparse 7 | 8 | 9 | 10 | def main(input_path, output_path): 11 | 12 | processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") 13 | model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) 14 | 15 | prompt = "" 16 | image = load_image(input_path) 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | model.to(device) 20 | 21 | inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16) 22 | 23 | generated_ids = model.generate(**inputs, max_new_tokens=20) 24 | generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() 25 | 26 | with open(output_path, "w") as f: 27 | f.write(generated_text) 28 | 29 | print("image from {} captioned as {}".format(input_path, generated_text)) 30 | 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--input_path", type=str, default="./input_images_path") 35 | parser.add_argument("--output_path", type=str, default="./output_caption_path") 36 | 37 | args = parser.parse_args() 38 | main(args.input_path, args.output_path) 39 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/base.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from abc import abstractmethod 4 | from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset 5 | 6 | 7 | class Txt2ImgIterableBaseDataset(IterableDataset): 8 | ''' 9 | Define an interface to make the IterableDatasets for text2img data chainable 10 | ''' 11 | def __init__(self, num_records=0, valid_ids=None, size=256): 12 | super().__init__() 13 | self.num_records = num_records 14 | self.valid_ids = valid_ids 15 | self.sample_ids = valid_ids 16 | self.size = size 17 | 18 | print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.') 19 | 20 | def __len__(self): 21 | return self.num_records 22 | 23 | @abstractmethod 24 | def __iter__(self): 25 | pass 26 | 27 | 28 | class PRNGMixin(object): 29 | """ 30 | Adds a prng property which is a numpy RandomState which gets 31 | reinitialized whenever the pid changes to avoid synchronized sampling 32 | behavior when used in conjunction with multiprocessing. 33 | """ 34 | @property 35 | def prng(self): 36 | currentpid = os.getpid() 37 | if getattr(self, "_initpid", None) != currentpid: 38 | self._initpid = currentpid 39 | self._prng = np.random.RandomState() 40 | return self._prng 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | Pillow 3 | 4 | albumentations>=1.3.1 5 | opencv-python>=4.8.0.76 6 | pudb>=2022.1.3 7 | imageio>=2.31.1 8 | imageio-ffmpeg>=0.4.8 9 | pytorch-lightning>=2.0.6 10 | omegaconf>=2.3.0 11 | test-tube>=0.7.5 12 | streamlit>=1.25.0 13 | einops>=0.6.1 14 | torch-fidelity>=0.3.0 15 | transformers>=4.31.0 16 | kornia>=0.7.0 17 | webdataset>=0.2.48 18 | torchmetrics>=1.0.3 19 | fire>=0.5.0 20 | gradio>=3.40.1 21 | diffusers>=0.19.3 22 | datasets[vision]>=2.14.4 23 | rich>=13.5.2 24 | plotly>=5.16.0 25 | -e git+https://github.com/CompVis/taming-transformers.git#egg=taming-transformers 26 | # elev est 27 | dl_ext>=1.3.4 28 | loguru>=0.7.0 29 | matplotlib>=3.7.2 30 | multipledispatch>=1.0.0 31 | packaging>=23.1 32 | Pillow>=9.3.0 33 | PyYAML>=6.0.1 34 | scikit_image>=0.21.0 35 | scikit_learn>=1.3.0 36 | scipy>=1.11.1 37 | setuptools>=59.6.0 38 | tensorboardX>=2.6.2 39 | tqdm>=4.66.1 40 | transforms3d>=0.4.1 41 | trimesh>=3.23.1 42 | yacs>=0.1.8 43 | gdown>=4.7.1 44 | git+https://github.com/NVlabs/nvdiffrast.git 45 | git+https://github.com/openai/CLIP.git 46 | # segment anything 47 | onnxruntime>=1.15.1 48 | onnx>=1.14.0 49 | git+https://github.com/facebookresearch/segment-anything.git 50 | # rembg 51 | rembg>=2.0.50 52 | # reconstruction 53 | pyhocon>=0.3.60 54 | icecream>=2.1.3 55 | PyMCubes>=0.1.4 56 | ninja>=1.11.1 57 | # juypter 58 | jupyter>=1.0.0 59 | jupyterlab>=4.0.5 60 | ipywidgets>=8.1.0 61 | ipykernel>=6.25.1 62 | panel>=1.2.1 63 | jupyter_bokeh>=3.0.7 64 | 65 | -------------------------------------------------------------------------------- /pose_synthesis/download_ckpt.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from tqdm import tqdm 3 | 4 | def download_checkpoint(url, save_path): 5 | try: 6 | with urllib.request.urlopen(url) as response, open(save_path, 'wb') as file: 7 | file_size = int(response.info().get('Content-Length', -1)) 8 | chunk_size = 8192 9 | num_chunks = file_size // chunk_size if file_size > chunk_size else 1 10 | 11 | with 
tqdm(total=file_size, unit='B', unit_scale=True, desc='Downloading', ncols=100) as pbar: 12 | for chunk in iter(lambda: response.read(chunk_size), b''): 13 | file.write(chunk) 14 | pbar.update(len(chunk)) 15 | 16 | print(f"Checkpoint downloaded and saved to: {save_path}") 17 | except Exception as e: 18 | print(f"Error downloading checkpoint: {e}") 19 | 20 | if __name__ == "__main__": 21 | ckpts = { 22 | "sam_vit_h_4b8939.pth": "https://huggingface.co/One-2-3-45/code/resolve/main/sam_vit_h_4b8939.pth", 23 | "zero123-xl.ckpt": "https://huggingface.co/One-2-3-45/code/resolve/main/zero123-xl.ckpt", 24 | "elevation_estimate/utils/weights/indoor_ds_new.ckpt" : "https://huggingface.co/One-2-3-45/code/resolve/main/elevation_estimate/utils/weights/indoor_ds_new.ckpt" 25 | } 26 | for ckpt_name, ckpt_url in ckpts.items(): 27 | print(f"Downloading checkpoint: {ckpt_name}") 28 | download_checkpoint(ckpt_url, ckpt_name) 29 | 30 | -------------------------------------------------------------------------------- /pose_synthesis/utils/sam_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | from PIL import Image 5 | import time 6 | 7 | from segment_anything import sam_model_registry, SamPredictor 8 | 9 | def sam_init(device_id=0): 10 | sam_checkpoint = os.path.join(os.path.dirname(__file__), "../sam_vit_h_4b8939.pth") 11 | model_type = "vit_h" 12 | 13 | device = "cuda:{}".format(device_id) if torch.cuda.is_available() else "cpu" 14 | 15 | sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device=device) 16 | predictor = SamPredictor(sam) 17 | return predictor 18 | 19 | def sam_out_nosave(predictor, input_image, *bbox_sliders): 20 | bbox = np.array(bbox_sliders) 21 | image = np.asarray(input_image) 22 | 23 | start_time = time.time() 24 | predictor.set_image(image) 25 | 26 | h, w, _ = image.shape 27 | input_point = np.array([[h//2, w//2]]) 28 | input_label = np.array([1]) 29 | 30 | masks, scores, logits = predictor.predict( 31 | point_coords=input_point, 32 | point_labels=input_label, 33 | multimask_output=True, 34 | ) 35 | 36 | masks_bbox, scores_bbox, logits_bbox = predictor.predict( 37 | box=bbox, 38 | multimask_output=True 39 | ) 40 | 41 | print(f"SAM Time: {time.time() - start_time:.3f}s") 42 | opt_idx = np.argmax(scores) 43 | mask = masks[opt_idx] 44 | out_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8) 45 | out_image[:, :, :3] = image 46 | out_image_bbox = out_image.copy() 47 | out_image[:, :, 3] = mask.astype(np.uint8) * 255 48 | out_image_bbox[:, :, 3] = masks_bbox[-1].astype(np.uint8) * 255 # np.argmax(scores_bbox) 49 | torch.cuda.empty_cache() 50 | return Image.fromarray(out_image_bbox, mode='RGBA') -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/cvpr_ds_config.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | 4 | def lower_config(yacs_cfg): 5 | if not isinstance(yacs_cfg, CN): 6 | return yacs_cfg 7 | return {k.lower(): lower_config(v) for k, v in yacs_cfg.items()} 8 | 9 | 10 | _CN = CN() 11 | _CN.BACKBONE_TYPE = 'ResNetFPN' 12 | _CN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)] 13 | _CN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd 14 | _CN.FINE_CONCAT_COARSE_FEAT = True 15 | 16 | # 1. 
LoFTR-backbone (local feature CNN) config 17 | _CN.RESNETFPN = CN() 18 | _CN.RESNETFPN.INITIAL_DIM = 128 19 | _CN.RESNETFPN.BLOCK_DIMS = [128, 196, 256] # s1, s2, s3 20 | 21 | # 2. LoFTR-coarse module config 22 | _CN.COARSE = CN() 23 | _CN.COARSE.D_MODEL = 256 24 | _CN.COARSE.D_FFN = 256 25 | _CN.COARSE.NHEAD = 8 26 | _CN.COARSE.LAYER_NAMES = ['self', 'cross'] * 4 27 | _CN.COARSE.ATTENTION = 'linear' # options: ['linear', 'full'] 28 | _CN.COARSE.TEMP_BUG_FIX = False 29 | 30 | # 3. Coarse-Matching config 31 | _CN.MATCH_COARSE = CN() 32 | _CN.MATCH_COARSE.THR = 0.2 33 | _CN.MATCH_COARSE.BORDER_RM = 2 34 | _CN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn'] 35 | _CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1 36 | _CN.MATCH_COARSE.SKH_ITERS = 3 37 | _CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0 38 | _CN.MATCH_COARSE.SKH_PREFILTER = True 39 | _CN.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.4 # training tricks: save GPU memory 40 | _CN.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200 # training tricks: avoid DDP deadlock 41 | 42 | # 4. LoFTR-fine module config 43 | _CN.FINE = CN() 44 | _CN.FINE.D_MODEL = 128 45 | _CN.FINE.D_FFN = 128 46 | _CN.FINE.NHEAD = 8 47 | _CN.FINE.LAYER_NAMES = ['self', 'cross'] * 1 48 | _CN.FINE.ATTENTION = 'linear' 49 | 50 | default_cfg = lower_config(_CN) 51 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/models/diffusion/sampling_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def append_dims(x, target_dims): 6 | """Appends dimensions to the end of a tensor until it has target_dims dimensions. 7 | From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" 8 | dims_to_append = target_dims - x.ndim 9 | if dims_to_append < 0: 10 | raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') 11 | return x[(...,) + (None,) * dims_to_append] 12 | 13 | 14 | def renorm_thresholding(x0, value): 15 | # renorm 16 | pred_max = x0.max() 17 | pred_min = x0.min() 18 | pred_x0 = (x0 - pred_min) / (pred_max - pred_min) # 0 ... 1 19 | pred_x0 = 2 * pred_x0 - 1. # -1 ... 1 20 | 21 | s = torch.quantile( 22 | rearrange(pred_x0, 'b ... -> b (...)').abs(), 23 | value, 24 | dim=-1 25 | ) 26 | s.clamp_(min=1.0) 27 | s = s.view(-1, *((1,) * (pred_x0.ndim - 1))) 28 | 29 | # clip by threshold 30 | # pred_x0 = pred_x0.clamp(-s, s) / s # needs newer pytorch # TODO bring back to pure-gpu with min/max 31 | 32 | # temporary hack: numpy on cpu 33 | pred_x0 = np.clip(pred_x0.cpu().numpy(), -s.cpu().numpy(), s.cpu().numpy()) / s.cpu().numpy() 34 | pred_x0 = torch.tensor(pred_x0).to(self.model.device) 35 | 36 | # re.renorm 37 | pred_x0 = (pred_x0 + 1.) / 2. # 0 ... 
1 38 | pred_x0 = (pred_max - pred_min) * pred_x0 + pred_min # orig range 39 | return pred_x0 40 | 41 | 42 | def norm_thresholding(x0, value): 43 | s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) 44 | return x0 * (value / s) 45 | 46 | 47 | def spatial_norm_thresholding(x0, value): 48 | # b c h w 49 | s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) 50 | return x0 * (value / s) -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from PIL import Image 3 | from lang_sam import LangSAM 4 | import numpy as np 5 | import argparse 6 | 7 | 8 | def segmentation(image, text, output_path): 9 | 10 | model = LangSAM(sam_type="vit_h") 11 | 12 | def predict(image_path, text_prompt, box_threshold=0.3, text_threshold=0.25): 13 | if isinstance(image_path, str): 14 | image_pil = Image.open(image_path).convert("RGB") 15 | else: 16 | # bug here, need to be improved 17 | image_pil = image_path 18 | masks, boxes, phrases, logits = model.predict(image_pil, text_prompt, box_threshold, text_threshold) 19 | labels = [f"{phrase} {logit:.2f}" for phrase, logit in zip(phrases, logits)] 20 | image_array = np.asarray(image_pil.convert("RGBA")) 21 | 22 | output_image = np.zeros_like(image_array) 23 | output_image[:,:,3] = 255 24 | output_image[:,:,0:3] = image_array[:,:,0:3] 25 | 26 | for i in range(len(masks)): 27 | mask = masks[i] 28 | mask = np.expand_dims(mask, axis=2) 29 | mask = np.repeat(mask, 4, axis=2) 30 | mask = mask.astype(np.uint8) 31 | mask = mask * 255 32 | output_image = np.where(mask > 0, output_image, 0) 33 | 34 | output_image = Image.fromarray(np.uint8(output_image)).convert("RGBA") 35 | 36 | output_image.save(output_path) 37 | 38 | 39 | predict(image, text) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("--input_path", type=str, default="./input_images_path") 45 | parser.add_argument("--prompt", type=str, default="sofa") 46 | parser.add_argument("--output_path", type=str, default="./output_images_path") 47 | 48 | args = parser.parse_args() 49 | segmentation(args.input_path, args.prompt, args.output_path) -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/position_encoding.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | 5 | 6 | class PositionEncodingSine(nn.Module): 7 | """ 8 | This is a sinusoidal position encoding that generalized to 2-dimensional images 9 | """ 10 | 11 | def __init__(self, d_model, max_shape=(256, 256), temp_bug_fix=True): 12 | """ 13 | Args: 14 | max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels 15 | temp_bug_fix (bool): As noted in this [issue](https://github.com/zju3dv/LoFTR/issues/41), 16 | the original implementation of LoFTR includes a bug in the pos-enc impl, which has little impact 17 | on the final performance. For now, we keep both impls for backward compatability. 18 | We will remove the buggy impl after re-training all variants of our released models. 
19 | """ 20 | super().__init__() 21 | 22 | pe = torch.zeros((d_model, *max_shape)) 23 | y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0) 24 | x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0) 25 | if temp_bug_fix: 26 | div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / (d_model//2))) 27 | else: # a buggy implementation (for backward compatability only) 28 | div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / d_model//2)) 29 | div_term = div_term[:, None, None] # [C//4, 1, 1] 30 | pe[0::4, :, :] = torch.sin(x_position * div_term) 31 | pe[1::4, :, :] = torch.cos(x_position * div_term) 32 | pe[2::4, :, :] = torch.sin(y_position * div_term) 33 | pe[3::4, :, :] = torch.cos(y_position * div_term) 34 | 35 | self.register_buffer('pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W] 36 | 37 | def forward(self, x): 38 | """ 39 | Args: 40 | x: [N, C, H, W] 41 | """ 42 | return x + self.pe[:, :, :x.size(2), :x.size(3)] 43 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/utils3d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def cart_to_hom(pts): 6 | """ 7 | :param pts: (N, 3 or 2) 8 | :return pts_hom: (N, 4 or 3) 9 | """ 10 | if isinstance(pts, np.ndarray): 11 | pts_hom = np.concatenate((pts, np.ones([*pts.shape[:-1], 1], dtype=np.float32)), -1) 12 | else: 13 | ones = torch.ones([*pts.shape[:-1], 1], dtype=torch.float32, device=pts.device) 14 | pts_hom = torch.cat((pts, ones), dim=-1) 15 | return pts_hom 16 | 17 | 18 | def hom_to_cart(pts): 19 | return pts[..., :-1] / pts[..., -1:] 20 | 21 | 22 | def canonical_to_camera(pts, pose): 23 | pts = cart_to_hom(pts) 24 | pts = pts @ pose.transpose(-1, -2) 25 | pts = hom_to_cart(pts) 26 | return pts 27 | 28 | 29 | def rect_to_img(K, pts_rect): 30 | from dl_ext.vision_ext.datasets.kitti.structures import Calibration 31 | pts_2d_hom = pts_rect @ K.t() 32 | pts_img = Calibration.hom_to_cart(pts_2d_hom) 33 | return pts_img 34 | 35 | 36 | def calc_pose(phis, thetas, size, radius=1.2): 37 | import torch 38 | def normalize(vectors): 39 | return vectors / (torch.norm(vectors, dim=-1, keepdim=True) + 1e-10) 40 | 41 | device = torch.device('cuda') 42 | thetas = torch.FloatTensor(thetas).to(device) 43 | phis = torch.FloatTensor(phis).to(device) 44 | 45 | centers = torch.stack([ 46 | radius * torch.sin(thetas) * torch.sin(phis), 47 | -radius * torch.cos(thetas) * torch.sin(phis), 48 | radius * torch.cos(phis), 49 | ], dim=-1) # [B, 3] 50 | 51 | # lookat 52 | forward_vector = normalize(centers).squeeze(0) 53 | up_vector = torch.FloatTensor([0, 0, 1]).to(device).unsqueeze(0).repeat(size, 1) 54 | right_vector = normalize(torch.cross(up_vector, forward_vector, dim=-1)) 55 | if right_vector.pow(2).sum() < 0.01: 56 | right_vector = torch.FloatTensor([0, 1, 0]).to(device).unsqueeze(0).repeat(size, 1) 57 | up_vector = normalize(torch.cross(forward_vector, right_vector, dim=-1)) 58 | 59 | poses = torch.eye(4, dtype=torch.float, device=device).unsqueeze(0).repeat(size, 1, 1) 60 | poses[:, :3, :3] = torch.stack((right_vector, up_vector, forward_vector), dim=-1) 61 | poses[:, :3, 3] = centers 62 | return poses 63 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/geometry.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | @torch.no_grad() 5 | def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1): 6 | """ Warp kpts0 from I0 to I1 with depth, K and Rt 7 | Also check covisibility and depth consistency. 8 | Depth is consistent if relative error < 0.2 (hard-coded). 9 | 10 | Args: 11 | kpts0 (torch.Tensor): [N, L, 2] - , 12 | depth0 (torch.Tensor): [N, H, W], 13 | depth1 (torch.Tensor): [N, H, W], 14 | T_0to1 (torch.Tensor): [N, 3, 4], 15 | K0 (torch.Tensor): [N, 3, 3], 16 | K1 (torch.Tensor): [N, 3, 3], 17 | Returns: 18 | calculable_mask (torch.Tensor): [N, L] 19 | warped_keypoints0 (torch.Tensor): [N, L, 2] 20 | """ 21 | kpts0_long = kpts0.round().long() 22 | 23 | # Sample depth, get calculable_mask on depth != 0 24 | kpts0_depth = torch.stack( 25 | [depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]] for i in range(kpts0.shape[0])], dim=0 26 | ) # (N, L) 27 | nonzero_mask = kpts0_depth != 0 28 | 29 | # Unproject 30 | kpts0_h = torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1) * kpts0_depth[..., None] # (N, L, 3) 31 | kpts0_cam = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L) 32 | 33 | # Rigid Transform 34 | w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L) 35 | w_kpts0_depth_computed = w_kpts0_cam[:, 2, :] 36 | 37 | # Project 38 | w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3) 39 | w_kpts0 = w_kpts0_h[:, :, :2] / (w_kpts0_h[:, :, [2]] + 1e-4) # (N, L, 2), +1e-4 to avoid zero depth 40 | 41 | # Covisible Check 42 | h, w = depth1.shape[1:3] 43 | covisible_mask = (w_kpts0[:, :, 0] > 0) * (w_kpts0[:, :, 0] < w-1) * \ 44 | (w_kpts0[:, :, 1] > 0) * (w_kpts0[:, :, 1] < h-1) 45 | w_kpts0_long = w_kpts0.long() 46 | w_kpts0_long[~covisible_mask, :] = 0 47 | 48 | w_kpts0_depth = torch.stack( 49 | [depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]] for i in range(w_kpts0_long.shape[0])], dim=0 50 | ) # (N, L) 51 | consistent_mask = ((w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth).abs() < 0.2 52 | valid_mask = nonzero_mask * covisible_mask * consistent_mask 53 | 54 | return valid_mask, w_kpts0 55 | -------------------------------------------------------------------------------- /pose_synthesis/pose_synthesis_batch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import argparse 4 | from multiprocessing import Pool 5 | 6 | def generate_images(args): 7 | x, y, img_path, half_precision, output_path = args 8 | x = "["+x+"]" 9 | y = "["+y+"]" 10 | 11 | command = f"python pose_synthesis.py --x {x} --y {y} --img_path {img_path} --half_precision --output_path {output_path}" 12 | subprocess.run(command, shell=True) 13 | 14 | def main(input_dir, output_dir, x_values, y_values): 15 | if not os.path.isdir(input_dir): 16 | image_files = [] 17 | else: 18 | image_files = os.listdir(input_dir) 19 | 20 | # if input_dir is a single image 21 | if len(image_files) == 0: 22 | image_files = [input_dir.split('/')[-1]] 23 | 24 | for image_file in image_files: 25 | print("Processing image: ", image_file) 26 | image_name = os.path.splitext(image_file)[0] 27 | if len(image_files) == 1: 28 | image_path = input_dir 29 | else: 30 | image_path = os.path.join(input_dir, image_file) 31 | 32 | output_subdir = os.path.join(output_dir, image_name) 33 | if not os.path.exists(output_subdir): 34 | os.makedirs(output_subdir) 35 | 36 | # x_values = [-10,0,10] 37 | # y_values = [0,-10,-20, -30, 
-40,-50,10,20,30,40,50,60] 38 | 39 | args_list = [] 40 | args_list.append((x_values, y_values, image_path, True, output_subdir)) 41 | 42 | # use multiple threads to generate images 43 | with Pool(processes=2) as pool: 44 | pool.map(generate_images, args_list) 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument("--input_dir", type=str, default="./input_images_dir") 49 | parser.add_argument("--output_dir", type=str, default="./output_images_dir") 50 | parser.add_argument("--x_values", type=str, default="0,10", help="comma separated list of x values") 51 | parser.add_argument("--y_values", type=str, default="0,-10", help="comma separated list of y values") 52 | parser.add_argument("--pose_file_path", type=str, default=None) 53 | 54 | args = parser.parse_args() 55 | 56 | if args.pose_file_path: 57 | with open(args.pose_file_path, "r") as f: 58 | args.x_values, args.y_values = f.read().split(' ') 59 | main(args.input_dir, args.output_dir, args.x_values, args.y_values) 60 | -------------------------------------------------------------------------------- /pose_estimation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import argparse 5 | import logging 6 | from PIL import Image 7 | from transformers import AutoImageProcessor, Dinov2Model 8 | vit_model = Dinov2Model.from_pretrained("facebook/dinov2-base") 9 | processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base") 10 | 11 | class RegressionModel(nn.Module): 12 | def __init__(self): 13 | super(RegressionModel, self).__init__() 14 | self.vit = vit_model 15 | # self.fc1 = nn.Linear(768, 768) 16 | # self.fc2 = nn.Linear(768, 768) 17 | self.fc3 = nn.Linear(768, 128) 18 | self.fc4 = nn.Linear(128, 2) 19 | 20 | 21 | for param in self.vit.parameters(): 22 | param.requires_grad = True 23 | 24 | def forward(self, x): 25 | outputs = self.vit(x) 26 | sequence_output = outputs[0] 27 | x = sequence_output[:, 0, :] #[B,768] 28 | 29 | # x = F.relu(self.fc1(x)) 30 | # x = F.relu(self.fc2(x)) 31 | x = F.relu(self.fc3(x)) 32 | x = self.fc4(x) 33 | return x 34 | 35 | 36 | def main(input_path, output_path, model_path, device="cuda"): 37 | # inference 38 | # input a image, and predict R and T 39 | 40 | input_image = Image.open(input_path).convert('RGB') 41 | input_image = processor(images=input_image, return_tensors="pt") 42 | input_image = input_image['pixel_values'].to(device) 43 | 44 | model = RegressionModel().float().to(device) 45 | model.load_state_dict(torch.load(model_path, map_location=device)) 46 | model.eval() 47 | with torch.no_grad(): 48 | prediction = model(input_image)[0] 49 | predicted_R, predicted_T = prediction[0], prediction[1] 50 | # round to integer and write to output_path. 
You may change this part to round to the integer that can be divided by 10 51 | predicted_R = round(predicted_R.item()) 52 | predicted_T = round(predicted_T.item()) 53 | with open(output_path, "w") as f: 54 | f.write(f"{predicted_R} {predicted_T}") 55 | logging.info(f'Predicted R: {predicted_R}, Predicted T: {predicted_T}') 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--input_path", type=str, default="./input_images_path") 61 | parser.add_argument("--output_path", type=str, default="./output_pose_path") 62 | parser.add_argument("--model_path", type=str, default="./model_path") 63 | parser.add_argument("--device", type=str, default="cuda") 64 | 65 | args = parser.parse_args() 66 | main(args.input_path, args.output_path, args.model_path, args.device) -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/fine_preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from einops.einops import rearrange, repeat 5 | 6 | 7 | class FinePreprocess(nn.Module): 8 | def __init__(self, config): 9 | super().__init__() 10 | 11 | self.config = config 12 | self.cat_c_feat = config['fine_concat_coarse_feat'] 13 | self.W = self.config['fine_window_size'] 14 | 15 | d_model_c = self.config['coarse']['d_model'] 16 | d_model_f = self.config['fine']['d_model'] 17 | self.d_model_f = d_model_f 18 | if self.cat_c_feat: 19 | self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True) 20 | self.merge_feat = nn.Linear(2*d_model_f, d_model_f, bias=True) 21 | 22 | self._reset_parameters() 23 | 24 | def _reset_parameters(self): 25 | for p in self.parameters(): 26 | if p.dim() > 1: 27 | nn.init.kaiming_normal_(p, mode="fan_out", nonlinearity="relu") 28 | 29 | def forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data): 30 | W = self.W 31 | stride = data['hw0_f'][0] // data['hw0_c'][0] 32 | 33 | data.update({'W': W}) 34 | if data['b_ids'].shape[0] == 0: 35 | feat0 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device) 36 | feat1 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device) 37 | return feat0, feat1 38 | 39 | # 1. unfold(crop) all local windows 40 | feat_f0_unfold = F.unfold(feat_f0, kernel_size=(W, W), stride=stride, padding=W//2) 41 | feat_f0_unfold = rearrange(feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2) 42 | feat_f1_unfold = F.unfold(feat_f1, kernel_size=(W, W), stride=stride, padding=W//2) 43 | feat_f1_unfold = rearrange(feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2) 44 | 45 | # 2. 
select only the predicted matches 46 | feat_f0_unfold = feat_f0_unfold[data['b_ids'], data['i_ids']] # [n, ww, cf] 47 | feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']] 48 | 49 | # option: use coarse-level loftr feature as context: concat and linear 50 | if self.cat_c_feat: 51 | feat_c_win = self.down_proj(torch.cat([feat_c0[data['b_ids'], data['i_ids']], 52 | feat_c1[data['b_ids'], data['j_ids']]], 0)) # [2n, c] 53 | feat_cf_win = self.merge_feat(torch.cat([ 54 | torch.cat([feat_f0_unfold, feat_f1_unfold], 0), # [2n, ww, cf] 55 | repeat(feat_c_win, 'n c -> n ww c', ww=W**2), # [2n, ww, cf] 56 | ], -1)) 57 | feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0) 58 | 59 | return feat_f0_unfold, feat_f1_unfold 60 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/extras.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from omegaconf import OmegaConf 3 | import torch 4 | from ldm.util import instantiate_from_config 5 | import logging 6 | from contextlib import contextmanager 7 | 8 | from contextlib import contextmanager 9 | import logging 10 | 11 | @contextmanager 12 | def all_logging_disabled(highest_level=logging.CRITICAL): 13 | """ 14 | A context manager that will prevent any logging messages 15 | triggered during the body from being processed. 16 | 17 | :param highest_level: the maximum logging level in use. 18 | This would only need to be changed if a custom level greater than CRITICAL 19 | is defined. 20 | 21 | https://gist.github.com/simon-weber/7853144 22 | """ 23 | # two kind-of hacks here: 24 | # * can't get the highest logging level in effect => delegate to the user 25 | # * can't get the current module-level override => use an undocumented 26 | # (but non-private!) 
interface 27 | 28 | previous_level = logging.root.manager.disable 29 | 30 | logging.disable(highest_level) 31 | 32 | try: 33 | yield 34 | finally: 35 | logging.disable(previous_level) 36 | 37 | def load_training_dir(train_dir, device, epoch="last"): 38 | """Load a checkpoint and config from a training directory""" 39 | train_dir = Path(train_dir) 40 | ckpt = list(train_dir.rglob(f"*{epoch}.ckpt")) 41 | assert len(ckpt) == 1, f"found {len(ckpt)} matching ckpt files" 42 | config = list(train_dir.rglob(f"*-project.yaml")) 43 | assert len(config) > 0, f"didn't find any config in {train_dir}" 44 | if len(config) > 1: 45 | print(f"found {len(config)} matching config files") 46 | config = sorted(config)[-1] 47 | print(f"selecting {config}") 48 | else: 49 | config = config[0] 50 | 51 | 52 | config = OmegaConf.load(config) 53 | return load_model_from_config(config, ckpt[0], device) 54 | 55 | def load_model_from_config(config, ckpt, device="cpu", verbose=False): 56 | """Loads a model from a config and a ckpt; 57 | if config is a path, it is loaded with omegaconf 58 | """ 59 | if isinstance(config, (str, Path)): 60 | config = OmegaConf.load(config) 61 | 62 | with all_logging_disabled(): 63 | print(f"Loading model from {ckpt}") 64 | pl_sd = torch.load(ckpt, map_location="cpu") 65 | global_step = pl_sd["global_step"] 66 | sd = pl_sd["state_dict"] 67 | model = instantiate_from_config(config.model) 68 | m, u = model.load_state_dict(sd, strict=False) 69 | if len(m) > 0 and verbose: 70 | print("missing keys:") 71 | print(m) 72 | if len(u) > 0 and verbose: 73 | print("unexpected keys:", u) 74 | model.to(device) 75 | model.eval() 76 | model.cond_stage_model.device = device 77 | return model -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/fine_matching.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from kornia.geometry.subpix import dsnt 6 | from kornia.utils.grid import create_meshgrid 7 | 8 | 9 | class FineMatching(nn.Module): 10 | """FineMatching with s2d paradigm""" 11 | 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, feat_f0, feat_f1, data): 16 | """ 17 | Args: 18 | feat0 (torch.Tensor): [M, WW, C] 19 | feat1 (torch.Tensor): [M, WW, C] 20 | data (dict) 21 | Update: 22 | data (dict):{ 23 | 'expec_f' (torch.Tensor): [M, 3], 24 | 'mkpts0_f' (torch.Tensor): [M, 2], 25 | 'mkpts1_f' (torch.Tensor): [M, 2]} 26 | """ 27 | M, WW, C = feat_f0.shape 28 | W = int(math.sqrt(WW)) 29 | scale = data['hw0_i'][0] / data['hw0_f'][0] 30 | self.M, self.W, self.WW, self.C, self.scale = M, W, WW, C, scale 31 | 32 | # corner case: if no coarse matches found 33 | if M == 0: 34 | assert not self.training, "M is always > 0 during training, see coarse_matching.py" 35 | # logger.warning('No matches found in coarse-level.') 36 | data.update({ 37 | 'expec_f': torch.empty(0, 3, device=feat_f0.device), 38 | 'mkpts0_f': data['mkpts0_c'], 39 | 'mkpts1_f': data['mkpts1_c'], 40 | }) 41 | return 42 | 43 | feat_f0_picked = feat_f0[:, WW//2, :] 44 | sim_matrix = torch.einsum('mc,mrc->mr', feat_f0_picked, feat_f1) 45 | softmax_temp = 1.
/ C**.5 46 | heatmap = torch.softmax(softmax_temp * sim_matrix, dim=1).view(-1, W, W) 47 | 48 | # compute coordinates from heatmap 49 | coords_normalized = dsnt.spatial_expectation2d(heatmap[None], True)[0] # [M, 2] 50 | grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape(1, -1, 2) # [1, WW, 2] 51 | 52 | # compute std over 53 | var = torch.sum(grid_normalized**2 * heatmap.view(-1, WW, 1), dim=1) - coords_normalized**2 # [M, 2] 54 | std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), -1) # [M] clamp needed for numerical stability 55 | 56 | # for fine-level supervision 57 | data.update({'expec_f': torch.cat([coords_normalized, std.unsqueeze(1)], -1)}) 58 | 59 | # compute absolute kpt coords 60 | self.get_fine_match(coords_normalized, data) 61 | 62 | @torch.no_grad() 63 | def get_fine_match(self, coords_normed, data): 64 | W, WW, C, scale = self.W, self.WW, self.C, self.scale 65 | 66 | # mkpts0_f and mkpts1_f 67 | mkpts0_f = data['mkpts0_c'] 68 | scale1 = scale * data['scale1'][data['b_ids']] if 'scale0' in data else scale 69 | mkpts1_f = data['mkpts1_c'] + (coords_normed * (W // 2) * scale1)[:len(data['mconf'])] 70 | 71 | data.update({ 72 | "mkpts0_f": mkpts0_f, 73 | "mkpts1_f": mkpts1_f 74 | }) 75 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/linear_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear Transformer proposed in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention" 3 | Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py 4 | """ 5 | 6 | import torch 7 | from torch.nn import Module, Dropout 8 | 9 | 10 | def elu_feature_map(x): 11 | return torch.nn.functional.elu(x) + 1 12 | 13 | 14 | class LinearAttention(Module): 15 | def __init__(self, eps=1e-6): 16 | super().__init__() 17 | self.feature_map = elu_feature_map 18 | self.eps = eps 19 | 20 | def forward(self, queries, keys, values, q_mask=None, kv_mask=None): 21 | """ Multi-Head linear attention proposed in "Transformers are RNNs" 22 | Args: 23 | queries: [N, L, H, D] 24 | keys: [N, S, H, D] 25 | values: [N, S, H, D] 26 | q_mask: [N, L] 27 | kv_mask: [N, S] 28 | Returns: 29 | queried_values: (N, L, H, D) 30 | """ 31 | Q = self.feature_map(queries) 32 | K = self.feature_map(keys) 33 | 34 | # set padded position to zero 35 | if q_mask is not None: 36 | Q = Q * q_mask[:, :, None, None] 37 | if kv_mask is not None: 38 | K = K * kv_mask[:, :, None, None] 39 | values = values * kv_mask[:, :, None, None] 40 | 41 | v_length = values.size(1) 42 | values = values / v_length # prevent fp16 overflow 43 | KV = torch.einsum("nshd,nshv->nhdv", K, values) # (S,D)' @ S,V 44 | Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps) 45 | queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length 46 | 47 | return queried_values.contiguous() 48 | 49 | 50 | class FullAttention(Module): 51 | def __init__(self, use_dropout=False, attention_dropout=0.1): 52 | super().__init__() 53 | self.use_dropout = use_dropout 54 | self.dropout = Dropout(attention_dropout) 55 | 56 | def forward(self, queries, keys, values, q_mask=None, kv_mask=None): 57 | """ Multi-head scaled dot-product attention, a.k.a full attention. 
58 | Args: 59 | queries: [N, L, H, D] 60 | keys: [N, S, H, D] 61 | values: [N, S, H, D] 62 | q_mask: [N, L] 63 | kv_mask: [N, S] 64 | Returns: 65 | queried_values: (N, L, H, D) 66 | """ 67 | 68 | # Compute the unnormalized attention and apply the masks 69 | QK = torch.einsum("nlhd,nshd->nlsh", queries, keys) 70 | if kv_mask is not None: 71 | QK.masked_fill_(~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), float('-inf')) 72 | 73 | # Compute the attention and the weighted average 74 | softmax_temp = 1. / queries.size(3)**.5 # sqrt(D) 75 | A = torch.softmax(softmax_temp * QK, dim=2) 76 | if self.use_dropout: 77 | A = self.dropout(A) 78 | 79 | queried_values = torch.einsum("nlsh,nshd->nlhd", A, values) 80 | 81 | return queried_values.contiguous() 82 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/thirdp/psp/model_irse.py: -------------------------------------------------------------------------------- 1 | # https://github.com/eladrich/pixel2style2pixel 2 | 3 | from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Dropout, Sequential, Module 4 | from ldm.thirdp.psp.helpers import get_blocks, Flatten, bottleneck_IR, bottleneck_IR_SE, l2_norm 5 | 6 | """ 7 | Modified Backbone implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) 8 | """ 9 | 10 | 11 | class Backbone(Module): 12 | def __init__(self, input_size, num_layers, mode='ir', drop_ratio=0.4, affine=True): 13 | super(Backbone, self).__init__() 14 | assert input_size in [112, 224], "input_size should be 112 or 224" 15 | assert num_layers in [50, 100, 152], "num_layers should be 50, 100 or 152" 16 | assert mode in ['ir', 'ir_se'], "mode should be ir or ir_se" 17 | blocks = get_blocks(num_layers) 18 | if mode == 'ir': 19 | unit_module = bottleneck_IR 20 | elif mode == 'ir_se': 21 | unit_module = bottleneck_IR_SE 22 | self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False), 23 | BatchNorm2d(64), 24 | PReLU(64)) 25 | if input_size == 112: 26 | self.output_layer = Sequential(BatchNorm2d(512), 27 | Dropout(drop_ratio), 28 | Flatten(), 29 | Linear(512 * 7 * 7, 512), 30 | BatchNorm1d(512, affine=affine)) 31 | else: 32 | self.output_layer = Sequential(BatchNorm2d(512), 33 | Dropout(drop_ratio), 34 | Flatten(), 35 | Linear(512 * 14 * 14, 512), 36 | BatchNorm1d(512, affine=affine)) 37 | 38 | modules = [] 39 | for block in blocks: 40 | for bottleneck in block: 41 | modules.append(unit_module(bottleneck.in_channel, 42 | bottleneck.depth, 43 | bottleneck.stride)) 44 | self.body = Sequential(*modules) 45 | 46 | def forward(self, x): 47 | x = self.input_layer(x) 48 | x = self.body(x) 49 | x = self.output_layer(x) 50 | return l2_norm(x) 51 | 52 | 53 | def IR_50(input_size): 54 | """Constructs a ir-50 model.""" 55 | model = Backbone(input_size, num_layers=50, mode='ir', drop_ratio=0.4, affine=False) 56 | return model 57 | 58 | 59 | def IR_101(input_size): 60 | """Constructs a ir-101 model.""" 61 | model = Backbone(input_size, num_layers=100, mode='ir', drop_ratio=0.4, affine=False) 62 | return model 63 | 64 | 65 | def IR_152(input_size): 66 | """Constructs a ir-152 model.""" 67 | model = Backbone(input_size, num_layers=152, mode='ir', drop_ratio=0.4, affine=False) 68 | return model 69 | 70 | 71 | def IR_SE_50(input_size): 72 | """Constructs a ir_se-50 model.""" 73 | model = Backbone(input_size, num_layers=50, mode='ir_se', drop_ratio=0.4, affine=False) 74 | return model 75 | 76 | 77 | def IR_SE_101(input_size): 78 | """Constructs a ir_se-101 
model.""" 79 | model = Backbone(input_size, num_layers=100, mode='ir_se', drop_ratio=0.4, affine=False) 80 | return model 81 | 82 | 83 | def IR_SE_152(input_size): 84 | """Constructs a ir_se-152 model.""" 85 | model = Backbone(input_size, num_layers=152, mode='ir_se', drop_ratio=0.4, affine=False) 86 | return model -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 
74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 
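# For reference, the return expression below is the closed-form KL divergence between two diagonal Gaussians: KL(N(mean1, var1) || N(mean2, var2)) = 0.5 * (logvar2 - logvar1 + (var1 + (mean1 - mean2)**2) / var2 - 1). An illustrative sanity check: normal_kl(torch.tensor(0.), torch.tensor(0.), torch.tensor(0.), torch.tensor(0.)) gives tensor(0.), since identical Gaussians have zero divergence.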
81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /pose_synthesis/configs/sd-objaverse-finetune-c_concat-256.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-04 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "image_target" 11 | cond_stage_key: "image_cond" 12 | image_size: 32 13 | channels: 4 14 | cond_stage_trainable: false # Note: different from the one we trained before 15 | conditioning_key: hybrid 16 | monitor: val/loss_simple_ema 17 | scale_factor: 0.18215 18 | 19 | scheduler_config: # 10000 warmup steps 20 | target: ldm.lr_scheduler.LambdaLinearScheduler 21 | params: 22 | warm_up_steps: [ 100 ] 23 | cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases 24 | f_start: [ 1.e-6 ] 25 | f_max: [ 1. ] 26 | f_min: [ 1. ] 27 | 28 | unet_config: 29 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 30 | params: 31 | image_size: 32 # unused 32 | in_channels: 8 33 | out_channels: 4 34 | model_channels: 320 35 | attention_resolutions: [ 4, 2, 1 ] 36 | num_res_blocks: 2 37 | channel_mult: [ 1, 2, 4, 4 ] 38 | num_heads: 8 39 | use_spatial_transformer: True 40 | transformer_depth: 1 41 | context_dim: 768 42 | use_checkpoint: True 43 | legacy: False 44 | 45 | first_stage_config: 46 | target: ldm.models.autoencoder.AutoencoderKL 47 | params: 48 | embed_dim: 4 49 | monitor: val/rec_loss 50 | ddconfig: 51 | double_z: true 52 | z_channels: 4 53 | resolution: 256 54 | in_channels: 3 55 | out_ch: 3 56 | ch: 128 57 | ch_mult: 58 | - 1 59 | - 2 60 | - 4 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: torch.nn.Identity 67 | 68 | cond_stage_config: 69 | target: ldm.modules.encoders.modules.FrozenCLIPImageEmbedder 70 | 71 | 72 | data: 73 | target: ldm.data.simple.ObjaverseDataModuleFromConfig 74 | params: 75 | root_dir: 'views_whole_sphere' 76 | batch_size: 192 77 | num_workers: 16 78 | total_view: 4 79 | train: 80 | validation: False 81 | image_transforms: 82 | size: 256 83 | 84 | validation: 85 | validation: True 86 | image_transforms: 87 | size: 256 88 | 89 | 90 | lightning: 91 | find_unused_parameters: false 92 | metrics_over_trainsteps_checkpoint: True 93 | modelcheckpoint: 94 | params: 95 | every_n_train_steps: 5000 96 | callbacks: 97 | image_logger: 98 | target: main.ImageLogger 99 | params: 100 | batch_frequency: 500 101 | max_images: 32 102 | increase_log_steps: False 103 | log_first_step: True 104 | log_images_kwargs: 105 | use_ema_scope: False 106 | inpaint: False 107 | plot_progressive_rows: False 108 | plot_diffusion_rows: False 109 | N: 32 110 | unconditional_guidance_scale: 3.0 111 | unconditional_guidance_label: [""] 112 | 113 | trainer: 114 | benchmark: True 115 | val_check_interval: 5000000 # really sorry 116 | num_sanity_val_steps: 0 117 | accumulate_grad_batches: 1 118 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/lsun.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import PIL 4 | from PIL import Image 5 | from torch.utils.data import Dataset 6 | from torchvision import transforms 7 | 8 | 9 | class LSUNBase(Dataset): 10 | def __init__(self, 11 | txt_file, 12 | data_root, 13 | size=None, 14 | interpolation="bicubic", 15 | flip_p=0.5 16 | ): 17 | self.data_paths = txt_file 18 | self.data_root = data_root 19 | with open(self.data_paths, "r") as f: 20 | self.image_paths = f.read().splitlines() 21 | self._length = len(self.image_paths) 22 | self.labels = { 23 | "relative_file_path_": [l for l in self.image_paths], 24 | "file_path_": [os.path.join(self.data_root, l) 25 | for l in self.image_paths], 26 | } 27 | 28 | self.size = size 29 | self.interpolation = {"linear": PIL.Image.LINEAR, 30 | "bilinear": PIL.Image.BILINEAR, 31 | "bicubic": PIL.Image.BICUBIC, 32 | "lanczos": PIL.Image.LANCZOS, 33 | }[interpolation] 34 | self.flip = transforms.RandomHorizontalFlip(p=flip_p) 35 | 36 | def __len__(self): 37 | return self._length 38 | 39 | def __getitem__(self, i): 40 | example = dict((k, self.labels[k][i]) for k in self.labels) 41 | image = Image.open(example["file_path_"]) 42 | if not image.mode == "RGB": 43 | image = image.convert("RGB") 44 | 45 | # default to score-sde preprocessing 46 | img = np.array(image).astype(np.uint8) 47 | crop = min(img.shape[0], img.shape[1]) 48 | h, w, = img.shape[0], img.shape[1] 49 | img = img[(h - crop) // 2:(h + crop) // 2, 50 | (w - crop) // 2:(w + crop) // 2] 51 | 52 | image = Image.fromarray(img) 53 | if self.size is not None: 54 | image = image.resize((self.size, self.size), resample=self.interpolation) 55 | 56 | image = self.flip(image) 57 | image = np.array(image).astype(np.uint8) 58 | example["image"] = (image / 127.5 - 1.0).astype(np.float32) 59 | return example 60 | 61 | 62 | class LSUNChurchesTrain(LSUNBase): 63 | def __init__(self, **kwargs): 64 | super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs) 65 | 66 | 67 | class LSUNChurchesValidation(LSUNBase): 68 | def __init__(self, flip_p=0., **kwargs): 69 | super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches", 70 | flip_p=flip_p, **kwargs) 71 | 72 | 73 | class LSUNBedroomsTrain(LSUNBase): 74 | def __init__(self, **kwargs): 75 | super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs) 76 | 77 | 78 | class LSUNBedroomsValidation(LSUNBase): 79 | def __init__(self, flip_p=0.0, **kwargs): 80 | super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms", 81 | flip_p=flip_p, **kwargs) 82 | 83 | 84 | class LSUNCatsTrain(LSUNBase): 85 | def __init__(self, **kwargs): 86 | super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs) 87 | 88 | 89 | class LSUNCatsValidation(LSUNBase): 90 | def __init__(self, flip_p=0., **kwargs): 91 | super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats", 92 | flip_p=flip_p, **kwargs) 93 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/guidance.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | from scipy import interpolate 3 | import numpy as np 4 | import torch 5 | import matplotlib.pyplot as plt 6 | from IPython.display import clear_output 7 | import abc 8 | 9 | 10 | class 
GuideModel(torch.nn.Module, abc.ABC): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | 14 | @abc.abstractmethod 15 | def preprocess(self, x_img): 16 | pass 17 | 18 | @abc.abstractmethod 19 | def compute_loss(self, inp): 20 | pass 21 | 22 | 23 | class Guider(torch.nn.Module): 24 | def __init__(self, sampler, guide_model, scale=1.0, verbose=False): 25 | """Apply classifier guidance 26 | 27 | Specify a guidance scale as either a scalar 28 | Or a schedule as a list of tuples t = 0->1 and scale, e.g. 29 | [(0, 10), (0.5, 20), (1, 50)] 30 | """ 31 | super().__init__() 32 | self.sampler = sampler 33 | self.index = 0 34 | self.show = verbose 35 | self.guide_model = guide_model 36 | self.history = [] 37 | 38 | if isinstance(scale, (Tuple, List)): 39 | times = np.array([x[0] for x in scale]) 40 | values = np.array([x[1] for x in scale]) 41 | self.scale_schedule = {"times": times, "values": values} 42 | else: 43 | self.scale_schedule = float(scale) 44 | 45 | self.ddim_timesteps = sampler.ddim_timesteps 46 | self.ddpm_num_timesteps = sampler.ddpm_num_timesteps 47 | 48 | 49 | def get_scales(self): 50 | if isinstance(self.scale_schedule, float): 51 | return len(self.ddim_timesteps)*[self.scale_schedule] 52 | 53 | interpolater = interpolate.interp1d(self.scale_schedule["times"], self.scale_schedule["values"]) 54 | fractional_steps = np.array(self.ddim_timesteps)/self.ddpm_num_timesteps 55 | return interpolater(fractional_steps) 56 | 57 | def modify_score(self, model, e_t, x, t, c): 58 | 59 | # TODO look up index by t 60 | scale = self.get_scales()[self.index] 61 | 62 | if (scale == 0): 63 | return e_t 64 | 65 | sqrt_1ma = self.sampler.ddim_sqrt_one_minus_alphas[self.index].to(x.device) 66 | with torch.enable_grad(): 67 | x_in = x.detach().requires_grad_(True) 68 | pred_x0 = model.predict_start_from_noise(x_in, t=t, noise=e_t) 69 | x_img = model.first_stage_model.decode((1/0.18215)*pred_x0) 70 | 71 | inp = self.guide_model.preprocess(x_img) 72 | loss = self.guide_model.compute_loss(inp) 73 | grads = torch.autograd.grad(loss.sum(), x_in)[0] 74 | correction = grads * scale 75 | 76 | if self.show: 77 | clear_output(wait=True) 78 | print(loss.item(), scale, correction.abs().max().item(), e_t.abs().max().item()) 79 | self.history.append([loss.item(), scale, correction.min().item(), correction.max().item()]) 80 | plt.imshow((inp[0].detach().permute(1,2,0).clamp(-1,1).cpu()+1)/2) 81 | plt.axis('off') 82 | plt.show() 83 | plt.imshow(correction[0][0].detach().cpu()) 84 | plt.axis('off') 85 | plt.show() 86 | 87 | 88 | e_t_mod = e_t - sqrt_1ma*correction 89 | if self.show: 90 | fig, axs = plt.subplots(1, 3) 91 | axs[0].imshow(e_t[0][0].detach().cpu(), vmin=-2, vmax=+2) 92 | axs[1].imshow(e_t_mod[0][0].detach().cpu(), vmin=-2, vmax=+2) 93 | axs[2].imshow(correction[0][0].detach().cpu(), vmin=-2, vmax=+2) 94 | plt.show() 95 | self.index += 1 96 | return e_t_mod -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from einops.einops import rearrange 4 | 5 | from .backbone import build_backbone 6 | from .utils.position_encoding import PositionEncodingSine 7 | from .loftr_module import LocalFeatureTransformer, FinePreprocess 8 | from .utils.coarse_matching import CoarseMatching 9 | from .utils.fine_matching import FineMatching 10 | 11 | 12 | class LoFTR(nn.Module): 13 | def __init__(self, config): 14 | 
super().__init__() 15 | # Misc 16 | self.config = config 17 | 18 | # Modules 19 | self.backbone = build_backbone(config) 20 | self.pos_encoding = PositionEncodingSine( 21 | config['coarse']['d_model'], 22 | temp_bug_fix=config['coarse']['temp_bug_fix']) 23 | self.loftr_coarse = LocalFeatureTransformer(config['coarse']) 24 | self.coarse_matching = CoarseMatching(config['match_coarse']) 25 | self.fine_preprocess = FinePreprocess(config) 26 | self.loftr_fine = LocalFeatureTransformer(config["fine"]) 27 | self.fine_matching = FineMatching() 28 | 29 | def forward(self, data): 30 | """ 31 | Update: 32 | data (dict): { 33 | 'image0': (torch.Tensor): (N, 1, H, W) 34 | 'image1': (torch.Tensor): (N, 1, H, W) 35 | 'mask0'(optional) : (torch.Tensor): (N, H, W) '0' indicates a padded position 36 | 'mask1'(optional) : (torch.Tensor): (N, H, W) 37 | } 38 | """ 39 | # 1. Local Feature CNN 40 | data.update({ 41 | 'bs': data['image0'].size(0), 42 | 'hw0_i': data['image0'].shape[2:], 'hw1_i': data['image1'].shape[2:] 43 | }) 44 | 45 | if data['hw0_i'] == data['hw1_i']: # faster & better BN convergence 46 | feats_c, feats_f = self.backbone(torch.cat([data['image0'], data['image1']], dim=0)) 47 | (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split(data['bs']), feats_f.split(data['bs']) 48 | else: # handle different input shapes 49 | (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone(data['image0']), self.backbone(data['image1']) 50 | 51 | data.update({ 52 | 'hw0_c': feat_c0.shape[2:], 'hw1_c': feat_c1.shape[2:], 53 | 'hw0_f': feat_f0.shape[2:], 'hw1_f': feat_f1.shape[2:] 54 | }) 55 | 56 | # 2. coarse-level loftr module 57 | # add featmap with positional encoding, then flatten it to sequence [N, HW, C] 58 | feat_c0 = rearrange(self.pos_encoding(feat_c0), 'n c h w -> n (h w) c') 59 | feat_c1 = rearrange(self.pos_encoding(feat_c1), 'n c h w -> n (h w) c') 60 | 61 | mask_c0 = mask_c1 = None # mask is useful in training 62 | if 'mask0' in data: 63 | mask_c0, mask_c1 = data['mask0'].flatten(-2), data['mask1'].flatten(-2) 64 | feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, mask_c1) 65 | 66 | # 3. match coarse-level 67 | self.coarse_matching(feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1) 68 | 69 | # 4. fine-level refinement 70 | feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(feat_f0, feat_f1, feat_c0, feat_c1, data) 71 | if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted 72 | feat_f0_unfold, feat_f1_unfold = self.loftr_fine(feat_f0_unfold, feat_f1_unfold) 73 | 74 | # 5. 
match fine-level 75 | self.fine_matching(feat_f0_unfold, feat_f1_unfold, data) 76 | 77 | def load_state_dict(self, state_dict, *args, **kwargs): 78 | for k in list(state_dict.keys()): 79 | if k.startswith('matcher.'): 80 | state_dict[k.replace('matcher.', '', 1)] = state_dict.pop(k) 81 | return super().load_state_dict(state_dict, *args, **kwargs) 82 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/evaluate/ssim.py: -------------------------------------------------------------------------------- 1 | # MIT Licence 2 | 3 | # Methods to predict the SSIM, taken from 4 | # https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py 5 | 6 | from math import exp 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | 12 | def gaussian(window_size, sigma): 13 | gauss = torch.Tensor( 14 | [ 15 | exp(-((x - window_size // 2) ** 2) / float(2 * sigma ** 2)) 16 | for x in range(window_size) 17 | ] 18 | ) 19 | return gauss / gauss.sum() 20 | 21 | 22 | def create_window(window_size, channel): 23 | _1D_window = gaussian(window_size, 1.5).unsqueeze(1) 24 | _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) 25 | window = Variable( 26 | _2D_window.expand(channel, 1, window_size, window_size).contiguous() 27 | ) 28 | return window 29 | 30 | 31 | def _ssim( 32 | img1, img2, window, window_size, channel, mask=None, size_average=True 33 | ): 34 | mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) 35 | mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) 36 | 37 | mu1_sq = mu1.pow(2) 38 | mu2_sq = mu2.pow(2) 39 | mu1_mu2 = mu1 * mu2 40 | 41 | sigma1_sq = ( 42 | F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) 43 | - mu1_sq 44 | ) 45 | sigma2_sq = ( 46 | F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) 47 | - mu2_sq 48 | ) 49 | sigma12 = ( 50 | F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) 51 | - mu1_mu2 52 | ) 53 | 54 | C1 = (0.01) ** 2 55 | C2 = (0.03) ** 2 56 | 57 | ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( 58 | (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) 59 | ) 60 | 61 | if not (mask is None): 62 | b = mask.size(0) 63 | ssim_map = ssim_map.mean(dim=1, keepdim=True) * mask 64 | ssim_map = ssim_map.view(b, -1).sum(dim=1) / mask.view(b, -1).sum( 65 | dim=1 66 | ).clamp(min=1) 67 | return ssim_map 68 | 69 | import pdb 70 | 71 | pdb.set_trace 72 | 73 | if size_average: 74 | return ssim_map.mean() 75 | else: 76 | return ssim_map.mean(1).mean(1).mean(1) 77 | 78 | 79 | class SSIM(torch.nn.Module): 80 | def __init__(self, window_size=11, size_average=True): 81 | super(SSIM, self).__init__() 82 | self.window_size = window_size 83 | self.size_average = size_average 84 | self.channel = 1 85 | self.window = create_window(window_size, self.channel) 86 | 87 | def forward(self, img1, img2, mask=None): 88 | (_, channel, _, _) = img1.size() 89 | 90 | if ( 91 | channel == self.channel 92 | and self.window.data.type() == img1.data.type() 93 | ): 94 | window = self.window 95 | else: 96 | window = create_window(self.window_size, channel) 97 | 98 | if img1.is_cuda: 99 | window = window.cuda(img1.get_device()) 100 | window = window.type_as(img1) 101 | 102 | self.window = window 103 | self.channel = channel 104 | 105 | return _ssim( 106 | img1, 107 | img2, 108 | window, 109 | self.window_size, 110 | channel, 111 | mask, 112 | self.size_average, 113 | 
) 114 | 115 | 116 | def ssim(img1, img2, window_size=11, mask=None, size_average=True): 117 | (_, channel, _, _) = img1.size() 118 | window = create_window(window_size, channel) 119 | 120 | if img1.is_cuda: 121 | window = window.cuda(img1.get_device()) 122 | window = window.type_as(img1) 123 | 124 | return _ssim(img1, img2, window, window_size, channel, mask, size_average) 125 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/transformer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn as nn 4 | from .linear_attention import LinearAttention, FullAttention 5 | 6 | 7 | class LoFTREncoderLayer(nn.Module): 8 | def __init__(self, 9 | d_model, 10 | nhead, 11 | attention='linear'): 12 | super(LoFTREncoderLayer, self).__init__() 13 | 14 | self.dim = d_model // nhead 15 | self.nhead = nhead 16 | 17 | # multi-head attention 18 | self.q_proj = nn.Linear(d_model, d_model, bias=False) 19 | self.k_proj = nn.Linear(d_model, d_model, bias=False) 20 | self.v_proj = nn.Linear(d_model, d_model, bias=False) 21 | self.attention = LinearAttention() if attention == 'linear' else FullAttention() 22 | self.merge = nn.Linear(d_model, d_model, bias=False) 23 | 24 | # feed-forward network 25 | self.mlp = nn.Sequential( 26 | nn.Linear(d_model*2, d_model*2, bias=False), 27 | nn.ReLU(True), 28 | nn.Linear(d_model*2, d_model, bias=False), 29 | ) 30 | 31 | # norm and dropout 32 | self.norm1 = nn.LayerNorm(d_model) 33 | self.norm2 = nn.LayerNorm(d_model) 34 | 35 | def forward(self, x, source, x_mask=None, source_mask=None): 36 | """ 37 | Args: 38 | x (torch.Tensor): [N, L, C] 39 | source (torch.Tensor): [N, S, C] 40 | x_mask (torch.Tensor): [N, L] (optional) 41 | source_mask (torch.Tensor): [N, S] (optional) 42 | """ 43 | bs = x.size(0) 44 | query, key, value = x, source, source 45 | 46 | # multi-head attention 47 | query = self.q_proj(query).view(bs, -1, self.nhead, self.dim) # [N, L, (H, D)] 48 | key = self.k_proj(key).view(bs, -1, self.nhead, self.dim) # [N, S, (H, D)] 49 | value = self.v_proj(value).view(bs, -1, self.nhead, self.dim) 50 | message = self.attention(query, key, value, q_mask=x_mask, kv_mask=source_mask) # [N, L, (H, D)] 51 | message = self.merge(message.view(bs, -1, self.nhead*self.dim)) # [N, L, C] 52 | message = self.norm1(message) 53 | 54 | # feed-forward network 55 | message = self.mlp(torch.cat([x, message], dim=2)) 56 | message = self.norm2(message) 57 | 58 | return x + message 59 | 60 | 61 | class LocalFeatureTransformer(nn.Module): 62 | """A Local Feature Transformer (LoFTR) module.""" 63 | 64 | def __init__(self, config): 65 | super(LocalFeatureTransformer, self).__init__() 66 | 67 | self.config = config 68 | self.d_model = config['d_model'] 69 | self.nhead = config['nhead'] 70 | self.layer_names = config['layer_names'] 71 | encoder_layer = LoFTREncoderLayer(config['d_model'], config['nhead'], config['attention']) 72 | self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names))]) 73 | self._reset_parameters() 74 | 75 | def _reset_parameters(self): 76 | for p in self.parameters(): 77 | if p.dim() > 1: 78 | nn.init.xavier_uniform_(p) 79 | 80 | def forward(self, feat0, feat1, mask0=None, mask1=None): 81 | """ 82 | Args: 83 | feat0 (torch.Tensor): [N, L, C] 84 | feat1 (torch.Tensor): [N, S, C] 85 | mask0 (torch.Tensor): [N, L] (optional) 86 | mask1 (torch.Tensor): [N, S] (optional) 87 | """ 
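# Note: layers are applied in the order given by config['layer_names']; LoFTR configs typically interleave ['self', 'cross'], so each block first attends within each feature map and then across the two maps. Shapes are preserved, i.e. feat0 stays [N, L, C] and feat1 stays [N, S, C] throughout the loop below.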
88 | 89 | assert self.d_model == feat0.size(2), "the feature number of src and transformer must be equal" 90 | 91 | for layer, name in zip(self.layers, self.layer_names): 92 | if name == 'self': 93 | feat0 = layer(feat0, feat0, mask0, mask0) 94 | feat1 = layer(feat1, feat1, mask1, mask1) 95 | elif name == 'cross': 96 | feat0 = layer(feat0, feat1, mask0, mask1) 97 | feat1 = layer(feat1, feat0, mask1, mask0) 98 | else: 99 | raise KeyError 100 | 101 | return feat0, feat1 102 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/thirdp/psp/helpers.py: -------------------------------------------------------------------------------- 1 | # https://github.com/eladrich/pixel2style2pixel 2 | 3 | from collections import namedtuple 4 | import torch 5 | from torch.nn import Conv2d, BatchNorm2d, PReLU, ReLU, Sigmoid, MaxPool2d, AdaptiveAvgPool2d, Sequential, Module 6 | 7 | """ 8 | ArcFace implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) 9 | """ 10 | 11 | 12 | class Flatten(Module): 13 | def forward(self, input): 14 | return input.view(input.size(0), -1) 15 | 16 | 17 | def l2_norm(input, axis=1): 18 | norm = torch.norm(input, 2, axis, True) 19 | output = torch.div(input, norm) 20 | return output 21 | 22 | 23 | class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): 24 | """ A named tuple describing a ResNet block. """ 25 | 26 | 27 | def get_block(in_channel, depth, num_units, stride=2): 28 | return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] 29 | 30 | 31 | def get_blocks(num_layers): 32 | if num_layers == 50: 33 | blocks = [ 34 | get_block(in_channel=64, depth=64, num_units=3), 35 | get_block(in_channel=64, depth=128, num_units=4), 36 | get_block(in_channel=128, depth=256, num_units=14), 37 | get_block(in_channel=256, depth=512, num_units=3) 38 | ] 39 | elif num_layers == 100: 40 | blocks = [ 41 | get_block(in_channel=64, depth=64, num_units=3), 42 | get_block(in_channel=64, depth=128, num_units=13), 43 | get_block(in_channel=128, depth=256, num_units=30), 44 | get_block(in_channel=256, depth=512, num_units=3) 45 | ] 46 | elif num_layers == 152: 47 | blocks = [ 48 | get_block(in_channel=64, depth=64, num_units=3), 49 | get_block(in_channel=64, depth=128, num_units=8), 50 | get_block(in_channel=128, depth=256, num_units=36), 51 | get_block(in_channel=256, depth=512, num_units=3) 52 | ] 53 | else: 54 | raise ValueError("Invalid number of layers: {}. 
Must be one of [50, 100, 152]".format(num_layers)) 55 | return blocks 56 | 57 | 58 | class SEModule(Module): 59 | def __init__(self, channels, reduction): 60 | super(SEModule, self).__init__() 61 | self.avg_pool = AdaptiveAvgPool2d(1) 62 | self.fc1 = Conv2d(channels, channels // reduction, kernel_size=1, padding=0, bias=False) 63 | self.relu = ReLU(inplace=True) 64 | self.fc2 = Conv2d(channels // reduction, channels, kernel_size=1, padding=0, bias=False) 65 | self.sigmoid = Sigmoid() 66 | 67 | def forward(self, x): 68 | module_input = x 69 | x = self.avg_pool(x) 70 | x = self.fc1(x) 71 | x = self.relu(x) 72 | x = self.fc2(x) 73 | x = self.sigmoid(x) 74 | return module_input * x 75 | 76 | 77 | class bottleneck_IR(Module): 78 | def __init__(self, in_channel, depth, stride): 79 | super(bottleneck_IR, self).__init__() 80 | if in_channel == depth: 81 | self.shortcut_layer = MaxPool2d(1, stride) 82 | else: 83 | self.shortcut_layer = Sequential( 84 | Conv2d(in_channel, depth, (1, 1), stride, bias=False), 85 | BatchNorm2d(depth) 86 | ) 87 | self.res_layer = Sequential( 88 | BatchNorm2d(in_channel), 89 | Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), PReLU(depth), 90 | Conv2d(depth, depth, (3, 3), stride, 1, bias=False), BatchNorm2d(depth) 91 | ) 92 | 93 | def forward(self, x): 94 | shortcut = self.shortcut_layer(x) 95 | res = self.res_layer(x) 96 | return res + shortcut 97 | 98 | 99 | class bottleneck_IR_SE(Module): 100 | def __init__(self, in_channel, depth, stride): 101 | super(bottleneck_IR_SE, self).__init__() 102 | if in_channel == depth: 103 | self.shortcut_layer = MaxPool2d(1, stride) 104 | else: 105 | self.shortcut_layer = Sequential( 106 | Conv2d(in_channel, depth, (1, 1), stride, bias=False), 107 | BatchNorm2d(depth) 108 | ) 109 | self.res_layer = Sequential( 110 | BatchNorm2d(in_channel), 111 | Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), 112 | PReLU(depth), 113 | Conv2d(depth, depth, (3, 3), stride, 1, bias=False), 114 | BatchNorm2d(depth), 115 | SEModule(depth, 16) 116 | ) 117 | 118 | def forward(self, x): 119 | shortcut = self.shortcut_layer(x) 120 | res = self.res_layer(x) 121 | return res + shortcut -------------------------------------------------------------------------------- /pose_synthesis/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 
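# schedule() below ramps the multiplier linearly from lr_start to lr_max over warm_up_steps, then follows a half-cosine decay down to lr_min at max_decay_steps. Illustrative example (assuming warm_up_steps=100, lr_start=1e-6, lr_max=1.0, lr_min=1e-6): step 50 gives roughly 0.5, and any step past max_decay_steps gives lr_min; as the class docstring notes, this multiplier is meant to scale a base_lr of 1.0.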
15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/inpainting/synthetic_mask.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw 2 | import numpy as np 3 | 4 | settings = { 5 | "256narrow": { 6 | "p_irr": 1, 7 | "min_n_irr": 4, 8 | "max_n_irr": 50, 9 | "max_l_irr": 40, 10 | "max_w_irr": 10, 11 | "min_n_box": 
None, 12 | "max_n_box": None, 13 | "min_s_box": None, 14 | "max_s_box": None, 15 | "marg": None, 16 | }, 17 | "256train": { 18 | "p_irr": 0.5, 19 | "min_n_irr": 1, 20 | "max_n_irr": 5, 21 | "max_l_irr": 200, 22 | "max_w_irr": 100, 23 | "min_n_box": 1, 24 | "max_n_box": 4, 25 | "min_s_box": 30, 26 | "max_s_box": 150, 27 | "marg": 10, 28 | }, 29 | "512train": { # TODO: experimental 30 | "p_irr": 0.5, 31 | "min_n_irr": 1, 32 | "max_n_irr": 5, 33 | "max_l_irr": 450, 34 | "max_w_irr": 250, 35 | "min_n_box": 1, 36 | "max_n_box": 4, 37 | "min_s_box": 30, 38 | "max_s_box": 300, 39 | "marg": 10, 40 | }, 41 | "512train-large": { # TODO: experimental 42 | "p_irr": 0.5, 43 | "min_n_irr": 1, 44 | "max_n_irr": 5, 45 | "max_l_irr": 450, 46 | "max_w_irr": 400, 47 | "min_n_box": 1, 48 | "max_n_box": 4, 49 | "min_s_box": 75, 50 | "max_s_box": 450, 51 | "marg": 10, 52 | }, 53 | } 54 | 55 | 56 | def gen_segment_mask(mask, start, end, brush_width): 57 | mask = mask > 0 58 | mask = (255 * mask).astype(np.uint8) 59 | mask = Image.fromarray(mask) 60 | draw = ImageDraw.Draw(mask) 61 | draw.line([start, end], fill=255, width=brush_width, joint="curve") 62 | mask = np.array(mask) / 255 63 | return mask 64 | 65 | 66 | def gen_box_mask(mask, masked): 67 | x_0, y_0, w, h = masked 68 | mask[y_0:y_0 + h, x_0:x_0 + w] = 1 69 | return mask 70 | 71 | 72 | def gen_round_mask(mask, masked, radius): 73 | x_0, y_0, w, h = masked 74 | xy = [(x_0, y_0), (x_0 + w, y_0 + w)] 75 | 76 | mask = mask > 0 77 | mask = (255 * mask).astype(np.uint8) 78 | mask = Image.fromarray(mask) 79 | draw = ImageDraw.Draw(mask) 80 | draw.rounded_rectangle(xy, radius=radius, fill=255) 81 | mask = np.array(mask) / 255 82 | return mask 83 | 84 | 85 | def gen_large_mask(prng, img_h, img_w, 86 | marg, p_irr, min_n_irr, max_n_irr, max_l_irr, max_w_irr, 87 | min_n_box, max_n_box, min_s_box, max_s_box): 88 | """ 89 | img_h: int, an image height 90 | img_w: int, an image width 91 | marg: int, a margin for a box starting coordinate 92 | p_irr: float, 0 <= p_irr <= 1, a probability of a polygonal chain mask 93 | 94 | min_n_irr: int, min number of segments 95 | max_n_irr: int, max number of segments 96 | max_l_irr: max length of a segment in polygonal chain 97 | max_w_irr: max width of a segment in polygonal chain 98 | 99 | min_n_box: int, min bound for the number of box primitives 100 | max_n_box: int, max bound for the number of box primitives 101 | min_s_box: int, min length of a box side 102 | max_s_box: int, max length of a box side 103 | """ 104 | 105 | mask = np.zeros((img_h, img_w)) 106 | uniform = prng.randint 107 | 108 | if np.random.uniform(0, 1) < p_irr: # generate polygonal chain 109 | n = uniform(min_n_irr, max_n_irr) # sample number of segments 110 | 111 | for _ in range(n): 112 | y = uniform(0, img_h) # sample a starting point 113 | x = uniform(0, img_w) 114 | 115 | a = uniform(0, 360) # sample angle 116 | l = uniform(10, max_l_irr) # sample segment length 117 | w = uniform(5, max_w_irr) # sample a segment width 118 | 119 | # draw segment starting from (x,y) to (x_,y_) using brush of width w 120 | x_ = x + l * np.sin(a) 121 | y_ = y + l * np.cos(a) 122 | 123 | mask = gen_segment_mask(mask, start=(x, y), end=(x_, y_), brush_width=w) 124 | x, y = x_, y_ 125 | else: # generate Box masks 126 | n = uniform(min_n_box, max_n_box) # sample number of rectangles 127 | 128 | for _ in range(n): 129 | h = uniform(min_s_box, max_s_box) # sample box shape 130 | w = uniform(min_s_box, max_s_box) 131 | 132 | x_0 = uniform(marg, img_w - marg - w) # sample upper-left 
coordinates of box 133 | y_0 = uniform(marg, img_h - marg - h) 134 | 135 | if np.random.uniform(0, 1) < 0.5: 136 | mask = gen_box_mask(mask, masked=(x_0, y_0, w, h)) 137 | else: 138 | r = uniform(0, 60) # sample radius 139 | mask = gen_round_mask(mask, masked=(x_0, y_0, w, h), radius=r) 140 | return mask 141 | 142 | 143 | make_lama_mask = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["256train"]) 144 | make_narrow_lama_mask = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["256narrow"]) 145 | make_512_lama_mask = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["512train"]) 146 | make_512_lama_mask_large = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["512train-large"]) 147 | 148 | 149 | MASK_MODES = { 150 | "256train": make_lama_mask, 151 | "256narrow": make_narrow_lama_mask, 152 | "512train": make_512_lama_mask, 153 | "512train-large": make_512_lama_mask_large 154 | } 155 | 156 | if __name__ == "__main__": 157 | import sys 158 | 159 | out = sys.argv[1] 160 | 161 | prng = np.random.RandomState(1) 162 | kwargs = settings["256train"] 163 | mask = gen_large_mask(prng, 256, 256, **kwargs) 164 | mask = (255 * mask).astype(np.uint8) 165 | mask = Image.fromarray(mask) 166 | mask.save(out) 167 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/evaluate/frechet_video_distance.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python2, python3 17 | """Minimal Reference implementation for the Frechet Video Distance (FVD). 18 | 19 | FVD is a metric for the quality of video generation models. It is inspired by 20 | the FID (Frechet Inception Distance) used for images, but uses a different 21 | embedding to be better suitable for videos. 22 | """ 23 | 24 | from __future__ import absolute_import 25 | from __future__ import division 26 | from __future__ import print_function 27 | 28 | 29 | import six 30 | import tensorflow.compat.v1 as tf 31 | import tensorflow_gan as tfgan 32 | import tensorflow_hub as hub 33 | 34 | 35 | def preprocess(videos, target_resolution): 36 | """Runs some preprocessing on the videos for I3D model. 37 | 38 | Args: 39 | videos: [batch_size, num_frames, height, width, depth] The videos to be 40 | preprocessed. We don't care about the specific dtype of the videos, it can 41 | be anything that tf.image.resize_bilinear accepts. Values are expected to 42 | be in the range 0-255. 
43 | target_resolution: (width, height): target video resolution 44 | 45 | Returns: 46 | videos: [batch_size, num_frames, height, width, depth] 47 | """ 48 | videos_shape = list(videos.shape) 49 | all_frames = tf.reshape(videos, [-1] + videos_shape[-3:]) 50 | resized_videos = tf.image.resize_bilinear(all_frames, size=target_resolution) 51 | target_shape = [videos_shape[0], -1] + list(target_resolution) + [3] 52 | output_videos = tf.reshape(resized_videos, target_shape) 53 | scaled_videos = 2. * tf.cast(output_videos, tf.float32) / 255. - 1 54 | return scaled_videos 55 | 56 | 57 | def _is_in_graph(tensor_name): 58 | """Checks whether a given tensor does exist in the graph.""" 59 | try: 60 | tf.get_default_graph().get_tensor_by_name(tensor_name) 61 | except KeyError: 62 | return False 63 | return True 64 | 65 | 66 | def create_id3_embedding(videos,warmup=False,batch_size=16): 67 | """Embeds the given videos using the Inflated 3D Convolution network. 68 | 69 | Downloads the graph of the I3D from tf.hub and adds it to the graph on the 70 | first call. 71 | 72 | Args: 73 | videos: [batch_size, num_frames, height=224, width=224, depth=3]. 74 | Expected range is [-1, 1]. 75 | 76 | Returns: 77 | embedding: [batch_size, embedding_size]. embedding_size depends 78 | on the model used. 79 | 80 | Raises: 81 | ValueError: when a provided embedding_layer is not supported. 82 | """ 83 | 84 | # batch_size = 16 85 | module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1" 86 | 87 | 88 | # Making sure that we import the graph separately for 89 | # each different input video tensor. 90 | module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str( 91 | videos.name).replace(":", "_") 92 | 93 | 94 | 95 | assert_ops = [ 96 | tf.Assert( 97 | tf.reduce_max(videos) <= 1.001, 98 | ["max value in frame is > 1", videos]), 99 | tf.Assert( 100 | tf.reduce_min(videos) >= -1.001, 101 | ["min value in frame is < -1", videos]), 102 | tf.assert_equal( 103 | tf.shape(videos)[0], 104 | batch_size, ["invalid frame batch size: ", 105 | tf.shape(videos)], 106 | summarize=6), 107 | ] 108 | with tf.control_dependencies(assert_ops): 109 | videos = tf.identity(videos) 110 | 111 | module_scope = "%s_apply_default/" % module_name 112 | 113 | # To check whether the module has already been loaded into the graph, we look 114 | # for a given tensor name. If this tensor name exists, we assume the function 115 | # has been called before and the graph was imported. Otherwise we import it. 116 | # Note: in theory, the tensor could exist, but have wrong shapes. 117 | # This will happen if create_id3_embedding is called with a frames_placeholder 118 | # of wrong size/batch size, because even though that will throw a tf.Assert 119 | # at graph-execution time, it will insert the tensor (with wrong shape) into 120 | # the graph. This is why we need the following assert.
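# The warmup branch below only adds a graph-construction-time check that the placeholder's static batch dimension matches the fixed batch_size (16 by default); later calls are expected to reuse the same batch size, since the hub module is imported once for a given input tensor (see the note above).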
121 | if warmup: 122 | video_batch_size = int(videos.shape[0]) 123 | assert video_batch_size in [batch_size, -1, None], f"Invalid batch size {video_batch_size}" 124 | tensor_name = module_scope + "RGB/inception_i3d/Mean:0" 125 | if not _is_in_graph(tensor_name): 126 | i3d_model = hub.Module(module_spec, name=module_name) 127 | i3d_model(videos) 128 | 129 | # gets the kinetics-i3d-400-logits layer 130 | tensor_name = module_scope + "RGB/inception_i3d/Mean:0" 131 | tensor = tf.get_default_graph().get_tensor_by_name(tensor_name) 132 | return tensor 133 | 134 | 135 | def calculate_fvd(real_activations, 136 | generated_activations): 137 | """Returns a list of ops that compute metrics as funcs of activations. 138 | 139 | Args: 140 | real_activations: [num_samples, embedding_size] 141 | generated_activations: [num_samples, embedding_size] 142 | 143 | Returns: 144 | A scalar that contains the requested FVD. 145 | """ 146 | return tfgan.eval.frechet_classifier_distance_from_activations( 147 | real_activations, generated_activations) 148 | -------------------------------------------------------------------------------- /pose_synthesis/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | from PIL import Image 5 | from utils.zero123_utils import init_model, predict_stage1_gradio, zero123_infer 6 | from utils.sam_utils import sam_init, sam_out_nosave 7 | from utils.utils import pred_bbox, image_preprocess_nosave, gen_poses, convert_mesh_format 8 | from elevation_estimate.estimate_wild_imgs import estimate_elev 9 | 10 | 11 | def preprocess(predictor, raw_im, lower_contrast=False): 12 | raw_im.thumbnail([512, 512], Image.Resampling.LANCZOS) 13 | image_sam = sam_out_nosave(predictor, raw_im.convert("RGB"), pred_bbox(raw_im)) 14 | input_256 = image_preprocess_nosave(image_sam, lower_contrast=lower_contrast, rescale=True) 15 | torch.cuda.empty_cache() 16 | return input_256 17 | 18 | def stage1_run(model, device, exp_dir, 19 | input_im, scale, ddim_steps): 20 | # folder to save the stage 1 images 21 | stage1_dir = os.path.join(exp_dir, "stage1_8") 22 | os.makedirs(stage1_dir, exist_ok=True) 23 | 24 | # stage 1: generate 4 views at the same elevation as the input 25 | output_ims = predict_stage1_gradio(model, input_im, save_path=stage1_dir, adjust_set=list(range(4)), device=device, ddim_steps=ddim_steps, scale=scale) 26 | 27 | # stage 2 for the first image 28 | # infer 4 nearby views for an image to estimate the polar angle of the input 29 | stage2_steps = 50 # ddim_steps 30 | zero123_infer(model, exp_dir, indices=[0], device=device, ddim_steps=stage2_steps, scale=scale) 31 | # estimate the camera pose (elevation) of the input image. 
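# estimate_elev() returns a polar angle (measured from the vertical axis), which the code below converts to an elevation as 90 - polar_angle; if the estimate fails, polar_angle falls back to 90, i.e. an assumed elevation of 0 degrees.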
32 | try: 33 | polar_angle = estimate_elev(exp_dir) 34 | except: 35 | print("Failed to estimate polar angle") 36 | polar_angle = 90 37 | print("Estimated polar angle:", polar_angle) 38 | gen_poses(exp_dir, polar_angle) 39 | 40 | # stage 1: generate another 4 views at a different elevation 41 | if polar_angle <= 75: 42 | output_ims_2 = predict_stage1_gradio(model, input_im, save_path=stage1_dir, adjust_set=list(range(4,8)), device=device, ddim_steps=ddim_steps, scale=scale) 43 | else: 44 | output_ims_2 = predict_stage1_gradio(model, input_im, save_path=stage1_dir, adjust_set=list(range(8,12)), device=device, ddim_steps=ddim_steps, scale=scale) 45 | torch.cuda.empty_cache() 46 | return 90-polar_angle, output_ims+output_ims_2 47 | 48 | def stage2_run(model, device, exp_dir, 49 | elev, scale, stage2_steps=50): 50 | # stage 2 for the remaining 7 images, generate 7*4=28 views 51 | if 90-elev <= 75: 52 | zero123_infer(model, exp_dir, indices=list(range(1,8)), device=device, ddim_steps=stage2_steps, scale=scale) 53 | else: 54 | zero123_infer(model, exp_dir, indices=list(range(1,4))+list(range(8,12)), device=device, ddim_steps=stage2_steps, scale=scale) 55 | 56 | def reconstruct(exp_dir, output_format=".ply", device_idx=0, resolution=256): 57 | exp_dir = os.path.abspath(exp_dir) 58 | main_dir_path = os.path.abspath(os.path.dirname("./")) 59 | os.chdir('reconstruction/') 60 | 61 | bash_script = f'CUDA_VISIBLE_DEVICES={device_idx} python exp_runner_generic_blender_val.py \ 62 | --specific_dataset_name {exp_dir} \ 63 | --mode export_mesh \ 64 | --conf confs/one2345_lod0_val_demo.conf \ 65 | --resolution {resolution}' 66 | print(bash_script) 67 | os.system(bash_script) 68 | os.chdir(main_dir_path) 69 | 70 | ply_path = os.path.join(exp_dir, f"mesh.ply") 71 | if output_format == ".ply": 72 | return ply_path 73 | if output_format not in [".obj", ".glb"]: 74 | print("Invalid output format, must be one of .ply, .obj, .glb") 75 | return ply_path 76 | return convert_mesh_format(exp_dir, output_format=output_format) 77 | 78 | 79 | def predict_multiview(shape_dir, args): 80 | device = f"cuda:{args.gpu_idx}" 81 | 82 | # initialize the zero123 model 83 | models = init_model(device, 'zero123-xl.ckpt', half_precision=args.half_precision) 84 | model_zero123 = models["turncam"] 85 | 86 | # initialize the Segment Anything model 87 | predictor = sam_init(args.gpu_idx) 88 | input_raw = Image.open(args.img_path) 89 | 90 | # preprocess the input image 91 | input_256 = preprocess(predictor, input_raw) 92 | 93 | # generate multi-view images in two stages with Zero123. 94 | # first stage: generate N=8 views cover 360 degree of the input shape. 95 | elev, stage1_imgs = stage1_run(model_zero123, device, shape_dir, input_256, scale=3, ddim_steps=75) 96 | # second stage: 4 local views for each of the first-stage view, resulting in N*4=32 source view images. 
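# Added note: view 0 already received its 4 nearby views inside stage1_run() while the
# elevation was being estimated, so stage2_run() below only processes the remaining 7
# first-stage views (7 * 4 = 28 images), giving 8 * 4 = 32 second-stage views in total.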
97 | stage2_run(model_zero123, device, shape_dir, elev, scale=3, stage2_steps=50) 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser(description='Process some integers.') 101 | parser.add_argument('--img_path', type=str, default="./demo/demo_examples/01_wild_hydrant.png", help='Path to the input image') 102 | parser.add_argument('--gpu_idx', type=int, default=0, help='GPU index') 103 | parser.add_argument('--half_precision', action='store_true', help='Use half precision') 104 | parser.add_argument('--mesh_resolution', type=int, default=256, help='Mesh resolution') 105 | parser.add_argument('--output_format', type=str, default=".ply", help='Output format: .ply, .obj, .glb') 106 | 107 | args = parser.parse_args() 108 | 109 | assert(torch.cuda.is_available()) 110 | 111 | shape_id = args.img_path.split('/')[-1].split('.')[0] 112 | shape_dir = f"./exp/{shape_id}" 113 | os.makedirs(shape_dir, exist_ok=True) 114 | 115 | predict_multiview(shape_dir, args) 116 | 117 | # utilize cost volume-based 3D reconstruction to generate textured 3D mesh 118 | mesh_path = reconstruct(shape_dir, output_format=args.output_format, device_idx=args.gpu_idx, resolution=args.mesh_resolution) 119 | print("Mesh saved to:", mesh_path) 120 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/losses/contperceptual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no? 5 | 6 | 7 | class LPIPSWithDiscriminator(nn.Module): 8 | def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0, 9 | disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, 10 | perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, 11 | disc_loss="hinge"): 12 | 13 | super().__init__() 14 | assert disc_loss in ["hinge", "vanilla"] 15 | self.kl_weight = kl_weight 16 | self.pixel_weight = pixelloss_weight 17 | self.perceptual_loss = LPIPS().eval() 18 | self.perceptual_weight = perceptual_weight 19 | # output log variance 20 | self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) 21 | 22 | self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, 23 | n_layers=disc_num_layers, 24 | use_actnorm=use_actnorm 25 | ).apply(weights_init) 26 | self.discriminator_iter_start = disc_start 27 | self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss 28 | self.disc_factor = disc_factor 29 | self.discriminator_weight = disc_weight 30 | self.disc_conditional = disc_conditional 31 | 32 | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): 33 | if last_layer is not None: 34 | nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] 35 | g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] 36 | else: 37 | nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] 38 | g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] 39 | 40 | d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) 41 | d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() 42 | d_weight = d_weight * self.discriminator_weight 43 | return d_weight 44 | 45 | def forward(self, inputs, reconstructions, posteriors, optimizer_idx, 46 | global_step, last_layer=None, cond=None, split="train", 47 | weights=None): 48 | rec_loss = torch.abs(inputs.contiguous() - 
reconstructions.contiguous()) 49 | if self.perceptual_weight > 0: 50 | p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) 51 | rec_loss = rec_loss + self.perceptual_weight * p_loss 52 | 53 | nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar 54 | weighted_nll_loss = nll_loss 55 | if weights is not None: 56 | weighted_nll_loss = weights*nll_loss 57 | weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] 58 | nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] 59 | kl_loss = posteriors.kl() 60 | kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] 61 | 62 | # now the GAN part 63 | if optimizer_idx == 0: 64 | # generator update 65 | if cond is None: 66 | assert not self.disc_conditional 67 | logits_fake = self.discriminator(reconstructions.contiguous()) 68 | else: 69 | assert self.disc_conditional 70 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) 71 | g_loss = -torch.mean(logits_fake) 72 | 73 | if self.disc_factor > 0.0: 74 | try: 75 | d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) 76 | except RuntimeError: 77 | assert not self.training 78 | d_weight = torch.tensor(0.0) 79 | else: 80 | d_weight = torch.tensor(0.0) 81 | 82 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 83 | loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss 84 | 85 | log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(), 86 | "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(), 87 | "{}/rec_loss".format(split): rec_loss.detach().mean(), 88 | "{}/d_weight".format(split): d_weight.detach(), 89 | "{}/disc_factor".format(split): torch.tensor(disc_factor), 90 | "{}/g_loss".format(split): g_loss.detach().mean(), 91 | } 92 | return loss, log 93 | 94 | if optimizer_idx == 1: 95 | # second pass for discriminator update 96 | if cond is None: 97 | logits_real = self.discriminator(inputs.contiguous().detach()) 98 | logits_fake = self.discriminator(reconstructions.contiguous().detach()) 99 | else: 100 | logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) 101 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) 102 | 103 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 104 | d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) 105 | 106 | log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), 107 | "{}/logits_real".format(split): logits_real.detach().mean(), 108 | "{}/logits_fake".format(split): logits_fake.detach().mean() 109 | } 110 | return d_loss, log 111 | 112 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/plotting.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib 5 | 6 | 7 | def _compute_conf_thresh(data): 8 | dataset_name = data['dataset_name'][0].lower() 9 | if dataset_name == 'scannet': 10 | thr = 5e-4 11 | elif dataset_name == 'megadepth': 12 | thr = 1e-4 13 | else: 14 | raise ValueError(f'Unknown dataset: {dataset_name}') 15 | return thr 16 | 17 | 18 | # --- VISUALIZATION --- # 19 | 20 | def make_matching_figure( 21 | img0, img1, 
mkpts0, mkpts1, color, 22 | kpts0=None, kpts1=None, text=[], dpi=75, path=None): 23 | # draw image pair 24 | assert mkpts0.shape[0] == mkpts1.shape[0], f'mkpts0: {mkpts0.shape[0]} v.s. mkpts1: {mkpts1.shape[0]}' 25 | fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi) 26 | axes[0].imshow(img0, cmap='gray') 27 | axes[1].imshow(img1, cmap='gray') 28 | for i in range(2): # clear all frames 29 | axes[i].get_yaxis().set_ticks([]) 30 | axes[i].get_xaxis().set_ticks([]) 31 | for spine in axes[i].spines.values(): 32 | spine.set_visible(False) 33 | plt.tight_layout(pad=1) 34 | 35 | if kpts0 is not None: 36 | assert kpts1 is not None 37 | axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c='w', s=2) 38 | axes[1].scatter(kpts1[:, 0], kpts1[:, 1], c='w', s=2) 39 | 40 | # draw matches 41 | if mkpts0.shape[0] != 0 and mkpts1.shape[0] != 0: 42 | fig.canvas.draw() 43 | transFigure = fig.transFigure.inverted() 44 | fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0)) 45 | fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1)) 46 | fig.lines = [matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]), 47 | (fkpts0[i, 1], fkpts1[i, 1]), 48 | transform=fig.transFigure, c=color[i], linewidth=1) 49 | for i in range(len(mkpts0))] 50 | 51 | axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4) 52 | axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4) 53 | 54 | # put txts 55 | txt_color = 'k' if img0[:100, :200].mean() > 200 else 'w' 56 | fig.text( 57 | 0.01, 0.99, '\n'.join(text), transform=fig.axes[0].transAxes, 58 | fontsize=15, va='top', ha='left', color=txt_color) 59 | 60 | # save or return figure 61 | if path: 62 | plt.savefig(str(path), bbox_inches='tight', pad_inches=0) 63 | plt.close() 64 | else: 65 | return fig 66 | 67 | 68 | def _make_evaluation_figure(data, b_id, alpha='dynamic'): 69 | b_mask = data['m_bids'] == b_id 70 | conf_thr = _compute_conf_thresh(data) 71 | 72 | img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype(np.int32) 73 | img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype(np.int32) 74 | kpts0 = data['mkpts0_f'][b_mask].cpu().numpy() 75 | kpts1 = data['mkpts1_f'][b_mask].cpu().numpy() 76 | 77 | # for megadepth, we visualize matches on the resized image 78 | if 'scale0' in data: 79 | kpts0 = kpts0 / data['scale0'][b_id].cpu().numpy()[[1, 0]] 80 | kpts1 = kpts1 / data['scale1'][b_id].cpu().numpy()[[1, 0]] 81 | 82 | epi_errs = data['epi_errs'][b_mask].cpu().numpy() 83 | correct_mask = epi_errs < conf_thr 84 | precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0 85 | n_correct = np.sum(correct_mask) 86 | n_gt_matches = int(data['conf_matrix_gt'][b_id].sum().cpu()) 87 | recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches) 88 | # recall might be larger than 1, since the calculation of conf_matrix_gt 89 | # uses groundtruth depths and camera poses, but epipolar distance is used here. 
90 | 91 | # matching info 92 | if alpha == 'dynamic': 93 | alpha = dynamic_alpha(len(correct_mask)) 94 | color = error_colormap(epi_errs, conf_thr, alpha=alpha) 95 | 96 | text = [ 97 | f'#Matches {len(kpts0)}', 98 | f'Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}', 99 | f'Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}' 100 | ] 101 | 102 | # make the figure 103 | figure = make_matching_figure(img0, img1, kpts0, kpts1, 104 | color, text=text) 105 | return figure 106 | 107 | def _make_confidence_figure(data, b_id): 108 | # TODO: Implement confidence figure 109 | raise NotImplementedError() 110 | 111 | 112 | def make_matching_figures(data, config, mode='evaluation'): 113 | """ Make matching figures for a batch. 114 | 115 | Args: 116 | data (Dict): a batch updated by PL_LoFTR. 117 | config (Dict): matcher config 118 | Returns: 119 | figures (Dict[str, List[plt.figure]] 120 | """ 121 | assert mode in ['evaluation', 'confidence'] # 'confidence' 122 | figures = {mode: []} 123 | for b_id in range(data['image0'].size(0)): 124 | if mode == 'evaluation': 125 | fig = _make_evaluation_figure( 126 | data, b_id, 127 | alpha=config.TRAINER.PLOT_MATCHES_ALPHA) 128 | elif mode == 'confidence': 129 | fig = _make_confidence_figure(data, b_id) 130 | else: 131 | raise ValueError(f'Unknown plot mode: {mode}') 132 | figures[mode].append(fig) 133 | return figures 134 | 135 | 136 | def dynamic_alpha(n_matches, 137 | milestones=[0, 300, 1000, 2000], 138 | alphas=[1.0, 0.8, 0.4, 0.2]): 139 | if n_matches == 0: 140 | return 1.0 141 | ranges = list(zip(alphas, alphas[1:] + [None])) 142 | loc = bisect.bisect_right(milestones, n_matches) - 1 143 | _range = ranges[loc] 144 | if _range[1] is None: 145 | return _range[0] 146 | return _range[1] + (milestones[loc + 1] - n_matches) / ( 147 | milestones[loc + 1] - milestones[loc]) * (_range[0] - _range[1]) 148 | 149 | 150 | def error_colormap(err, thr, alpha=1.0): 151 | assert alpha <= 1.0 and alpha > 0, f"Invaid alpha value: {alpha}" 152 | x = 1 - np.clip(err / (thr * 2), 0, 1) 153 | return np.clip( 154 | np.stack([2-x*2, x*2, np.zeros_like(x), np.ones_like(x)*alpha], -1), 0, 1) 155 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/supervision.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | from loguru import logger 3 | 4 | import torch 5 | from einops import repeat 6 | from kornia.utils import create_meshgrid 7 | 8 | from .geometry import warp_kpts 9 | 10 | ############## ↓ Coarse-Level supervision ↓ ############## 11 | 12 | 13 | @torch.no_grad() 14 | def mask_pts_at_padded_regions(grid_pt, mask): 15 | """For megadepth dataset, zero-padding exists in images""" 16 | mask = repeat(mask, 'n h w -> n (h w) c', c=2) 17 | grid_pt[~mask.bool()] = 0 18 | return grid_pt 19 | 20 | 21 | @torch.no_grad() 22 | def spvs_coarse(data, config): 23 | """ 24 | Update: 25 | data (dict): { 26 | "conf_matrix_gt": [N, hw0, hw1], 27 | 'spv_b_ids': [M] 28 | 'spv_i_ids': [M] 29 | 'spv_j_ids': [M] 30 | 'spv_w_pt0_i': [N, hw0, 2], in original image resolution 31 | 'spv_pt1_i': [N, hw1, 2], in original image resolution 32 | } 33 | 34 | NOTE: 35 | - for scannet dataset, there're 3 kinds of resolution {i, c, f} 36 | - for megadepth dataset, there're 4 kinds of resolution {i, i_resize, c, f} 37 | """ 38 | # 1. 
misc 39 | device = data['image0'].device 40 | N, _, H0, W0 = data['image0'].shape 41 | _, _, H1, W1 = data['image1'].shape 42 | scale = config['LOFTR']['RESOLUTION'][0] 43 | scale0 = scale * data['scale0'][:, None] if 'scale0' in data else scale 44 | scale1 = scale * data['scale1'][:, None] if 'scale0' in data else scale 45 | h0, w0, h1, w1 = map(lambda x: x // scale, [H0, W0, H1, W1]) 46 | 47 | # 2. warp grids 48 | # create kpts in meshgrid and resize them to image resolution 49 | grid_pt0_c = create_meshgrid(h0, w0, False, device).reshape(1, h0*w0, 2).repeat(N, 1, 1) # [N, hw, 2] 50 | grid_pt0_i = scale0 * grid_pt0_c 51 | grid_pt1_c = create_meshgrid(h1, w1, False, device).reshape(1, h1*w1, 2).repeat(N, 1, 1) 52 | grid_pt1_i = scale1 * grid_pt1_c 53 | 54 | # mask padded region to (0, 0), so no need to manually mask conf_matrix_gt 55 | if 'mask0' in data: 56 | grid_pt0_i = mask_pts_at_padded_regions(grid_pt0_i, data['mask0']) 57 | grid_pt1_i = mask_pts_at_padded_regions(grid_pt1_i, data['mask1']) 58 | 59 | # warp kpts bi-directionally and resize them to coarse-level resolution 60 | # (no depth consistency check, since it leads to worse results experimentally) 61 | # (unhandled edge case: points with 0-depth will be warped to the left-up corner) 62 | _, w_pt0_i = warp_kpts(grid_pt0_i, data['depth0'], data['depth1'], data['T_0to1'], data['K0'], data['K1']) 63 | _, w_pt1_i = warp_kpts(grid_pt1_i, data['depth1'], data['depth0'], data['T_1to0'], data['K1'], data['K0']) 64 | w_pt0_c = w_pt0_i / scale1 65 | w_pt1_c = w_pt1_i / scale0 66 | 67 | # 3. check if mutual nearest neighbor 68 | w_pt0_c_round = w_pt0_c[:, :, :].round().long() 69 | nearest_index1 = w_pt0_c_round[..., 0] + w_pt0_c_round[..., 1] * w1 70 | w_pt1_c_round = w_pt1_c[:, :, :].round().long() 71 | nearest_index0 = w_pt1_c_round[..., 0] + w_pt1_c_round[..., 1] * w0 72 | 73 | # corner case: out of boundary 74 | def out_bound_mask(pt, w, h): 75 | return (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + (pt[..., 1] >= h) 76 | nearest_index1[out_bound_mask(w_pt0_c_round, w1, h1)] = 0 77 | nearest_index0[out_bound_mask(w_pt1_c_round, w0, h0)] = 0 78 | 79 | loop_back = torch.stack([nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], dim=0) 80 | correct_0to1 = loop_back == torch.arange(h0*w0, device=device)[None].repeat(N, 1) 81 | correct_0to1[:, 0] = False # ignore the top-left corner 82 | 83 | # 4. construct a gt conf_matrix 84 | conf_matrix_gt = torch.zeros(N, h0*w0, h1*w1, device=device) 85 | b_ids, i_ids = torch.where(correct_0to1 != 0) 86 | j_ids = nearest_index1[b_ids, i_ids] 87 | 88 | conf_matrix_gt[b_ids, i_ids, j_ids] = 1 89 | data.update({'conf_matrix_gt': conf_matrix_gt}) 90 | 91 | # 5. save coarse matches(gt) for training fine level 92 | if len(b_ids) == 0: 93 | logger.warning(f"No groundtruth coarse match found for: {data['pair_names']}") 94 | # this won't affect fine-level loss calculation 95 | b_ids = torch.tensor([0], device=device) 96 | i_ids = torch.tensor([0], device=device) 97 | j_ids = torch.tensor([0], device=device) 98 | 99 | data.update({ 100 | 'spv_b_ids': b_ids, 101 | 'spv_i_ids': i_ids, 102 | 'spv_j_ids': j_ids 103 | }) 104 | 105 | # 6. save intermediate results (for fast fine-level computation) 106 | data.update({ 107 | 'spv_w_pt0_i': w_pt0_i, 108 | 'spv_pt1_i': grid_pt1_i 109 | }) 110 | 111 | 112 | def compute_supervision_coarse(data, config): 113 | assert len(set(data['dataset_name'])) == 1, "Do not support mixed datasets training!" 
114 | data_source = data['dataset_name'][0] 115 | if data_source.lower() in ['scannet', 'megadepth']: 116 | spvs_coarse(data, config) 117 | else: 118 | raise ValueError(f'Unknown data source: {data_source}') 119 | 120 | 121 | ############## ↓ Fine-Level supervision ↓ ############## 122 | 123 | @torch.no_grad() 124 | def spvs_fine(data, config): 125 | """ 126 | Update: 127 | data (dict):{ 128 | "expec_f_gt": [M, 2]} 129 | """ 130 | # 1. misc 131 | # w_pt0_i, pt1_i = data.pop('spv_w_pt0_i'), data.pop('spv_pt1_i') 132 | w_pt0_i, pt1_i = data['spv_w_pt0_i'], data['spv_pt1_i'] 133 | scale = config['LOFTR']['RESOLUTION'][1] 134 | radius = config['LOFTR']['FINE_WINDOW_SIZE'] // 2 135 | 136 | # 2. get coarse prediction 137 | b_ids, i_ids, j_ids = data['b_ids'], data['i_ids'], data['j_ids'] 138 | 139 | # 3. compute gt 140 | scale = scale * data['scale1'][b_ids] if 'scale0' in data else scale 141 | # `expec_f_gt` might exceed the window, i.e. abs(*) > 1, which would be filtered later 142 | expec_f_gt = (w_pt0_i[b_ids, i_ids] - pt1_i[b_ids, j_ids]) / scale / radius # [M, 2] 143 | data.update({"expec_f_gt": expec_f_gt}) 144 | 145 | 146 | def compute_supervision_fine(data, config): 147 | data_source = data['dataset_name'][0] 148 | if data_source.lower() in ['scannet', 'megadepth']: 149 | spvs_fine(data, config) 150 | else: 151 | raise NotImplementedError 152 | -------------------------------------------------------------------------------- /pose_synthesis/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import cv2 5 | from PIL import Image 6 | from rembg import remove 7 | import trimesh 8 | 9 | # predict bbox of the foreground 10 | def pred_bbox(image): 11 | image_nobg = remove(image.convert('RGBA'), alpha_matting=True) 12 | alpha = np.asarray(image_nobg)[:,:,-1] 13 | x_nonzero = np.nonzero(alpha.sum(axis=0)) 14 | y_nonzero = np.nonzero(alpha.sum(axis=1)) 15 | x_min = int(x_nonzero[0].min()) 16 | y_min = int(y_nonzero[0].min()) 17 | x_max = int(x_nonzero[0].max()) 18 | y_max = int(y_nonzero[0].max()) 19 | return x_min, y_min, x_max, y_max 20 | 21 | def image_grid(imgs, rows, cols): 22 | assert len(imgs) == rows*cols 23 | w, h = imgs[0].size 24 | grid = Image.new('RGB', size=(cols*w, rows*h)) 25 | grid_w, grid_h = grid.size 26 | 27 | for i, img in enumerate(imgs): 28 | grid.paste(img, box=(i%cols*w, i//cols*h)) 29 | return grid 30 | 31 | def convert_mesh_format(exp_dir, output_format=".obj"): 32 | ply_path = os.path.join(exp_dir, "mesh.ply") 33 | mesh_path = os.path.join(exp_dir, f"mesh{output_format}") 34 | mesh = trimesh.load_mesh(ply_path) 35 | rotation_matrix = trimesh.transformations.rotation_matrix(np.pi/2, [1, 0, 0]) 36 | mesh.apply_transform(rotation_matrix) 37 | rotation_matrix = trimesh.transformations.rotation_matrix(np.pi, [0, 0, 1]) 38 | mesh.apply_transform(rotation_matrix) 39 | # flip x 40 | mesh.vertices[:, 0] = -mesh.vertices[:, 0] 41 | mesh.faces = np.fliplr(mesh.faces) 42 | if output_format == ".obj": 43 | # Export the mesh as .obj file with colors 44 | mesh.export(mesh_path, file_type='obj', include_color=True) 45 | else: 46 | mesh.export(mesh_path, file_type='glb') 47 | return mesh_path 48 | 49 | # contrast correction, rescale and recenter 50 | def image_preprocess_nosave(input_image, lower_contrast=True, rescale=True): 51 | 52 | image_arr = np.array(input_image) 53 | in_w, in_h = image_arr.shape[:2] 54 | 55 | if lower_contrast: 56 | alpha = 0.8 # Contrast control (1.0-3.0) 57 | 
beta = 0 # Brightness control (0-100) 58 | # Apply the contrast adjustment 59 | image_arr = cv2.convertScaleAbs(image_arr, alpha=alpha, beta=beta) 60 | image_arr[image_arr[...,-1]>200, -1] = 255 61 | 62 | ret, mask = cv2.threshold(np.array(input_image.split()[-1]), 0, 255, cv2.THRESH_BINARY) 63 | x, y, w, h = cv2.boundingRect(mask) 64 | max_size = max(w, h) 65 | ratio = 0.75 66 | if rescale: 67 | side_len = int(max_size / ratio) 68 | else: 69 | side_len = in_w 70 | padded_image = np.zeros((side_len, side_len, 4), dtype=np.uint8) 71 | center = side_len//2 72 | padded_image[center-h//2:center-h//2+h, center-w//2:center-w//2+w] = image_arr[y:y+h, x:x+w] 73 | rgba = Image.fromarray(padded_image).resize((256, 256), Image.LANCZOS) 74 | 75 | rgba_arr = np.array(rgba) / 255.0 76 | rgb = rgba_arr[...,:3] * rgba_arr[...,-1:] + (1 - rgba_arr[...,-1:]) 77 | return Image.fromarray((rgb * 255).astype(np.uint8)) 78 | 79 | # pose generation 80 | def calc_pose(phis, thetas, size, radius = 1.2, device='cuda'): 81 | import torch 82 | def normalize(vectors): 83 | return vectors / (torch.norm(vectors, dim=-1, keepdim=True) + 1e-10) 84 | thetas = torch.FloatTensor(thetas).to(device) 85 | phis = torch.FloatTensor(phis).to(device) 86 | 87 | centers = torch.stack([ 88 | radius * torch.sin(thetas) * torch.sin(phis), 89 | -radius * torch.cos(thetas) * torch.sin(phis), 90 | radius * torch.cos(phis), 91 | ], dim=-1) # [B, 3] 92 | 93 | # lookat 94 | forward_vector = normalize(centers).squeeze(0) 95 | up_vector = torch.FloatTensor([0, 0, 1]).to(device).unsqueeze(0).repeat(size, 1) 96 | right_vector = normalize(torch.cross(up_vector, forward_vector, dim=-1)) 97 | if right_vector.pow(2).sum() < 0.01: 98 | right_vector = torch.FloatTensor([0, 1, 0]).to(device).unsqueeze(0).repeat(size, 1) 99 | up_vector = normalize(torch.cross(forward_vector, right_vector, dim=-1)) 100 | 101 | poses = torch.eye(4, dtype=torch.float, device=device)[:3].unsqueeze(0).repeat(size, 1, 1) 102 | poses[:, :3, :3] = torch.stack((right_vector, up_vector, forward_vector), dim=-1) 103 | poses[:, :3, 3] = centers 104 | return poses 105 | 106 | def get_poses(init_elev): 107 | mid = init_elev 108 | deg = 10 109 | if init_elev <= 75: 110 | low = init_elev + 30 111 | # e.g. 
30, 60, 20, 40, 30, 30, 50, 70, 50, 50
112 |
113 | elevations = np.radians([mid]*4 + [low]*4 + [mid-deg,mid+deg,mid,mid]*4 + [low-deg,low+deg,low,low]*4)
114 | img_ids = [f"{num}.png" for num in range(8)] + [f"{num}_{view_num}.png" for num in range(8) for view_num in range(4)]
115 | else:
116 |
117 | high = init_elev - 30
118 | elevations = np.radians([mid]*4 + [high]*4 + [mid-deg,mid+deg,mid,mid]*4 + [high-deg,high+deg,high,high]*4)
119 | img_ids = [f"{num}.png" for num in list(range(4)) + list(range(8,12))] + \
120 | [f"{num}_{view_num}.png" for num in list(range(4)) + list(range(8,12)) for view_num in range(4)]
121 | overlook_theta = [30+x*90 for x in range(4)]
122 | eyelevel_theta = [60+x*90 for x in range(4)]
123 | source_theta_delta = [0, 0, -deg, deg]
124 | azimuths = np.radians(overlook_theta + eyelevel_theta + \
125 | [view_theta + source for view_theta in overlook_theta for source in source_theta_delta] + \
126 | [view_theta + source for view_theta in eyelevel_theta for source in source_theta_delta])
127 | return img_ids, calc_pose(elevations, azimuths, len(azimuths)).cpu().numpy()
128 |
129 |
130 | def gen_poses(shape_dir, pose_est):
131 | img_ids, input_poses = get_poses(pose_est)
132 |
133 | out_dict = {}
134 | focal = 560/2; h = w = 256
135 | out_dict['intrinsics'] = [[focal, 0, w / 2], [0, focal, h / 2], [0, 0, 1]]
136 | out_dict['near_far'] = [1.2-0.7, 1.2+0.6]
137 | out_dict['c2ws'] = {}
138 | for view_id, img_id in enumerate(img_ids):
139 | pose = input_poses[view_id]
140 | pose = pose.tolist()
141 | pose = [pose[0], pose[1], pose[2], [0, 0, 0, 1]]
142 | out_dict['c2ws'][img_id] = pose
143 | json_path = os.path.join(shape_dir, 'pose.json')
144 | with open(json_path, 'w') as f:
145 | json.dump(out_dict, f, indent=4)
146 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Integrating View Conditions for Image Synthesis
2 | This is the official implementation of the paper "Integrating View Conditions for Image Synthesis", which has been accepted by IJCAI 2024. 🎉
3 |
4 | [[Paper]](https://www.ijcai.org/proceedings/2024/840)
5 |
6 | ## Introduction
7 |
8 | This paper presents **ViewControl**, which enhances existing models with awareness of viewpoint information, thereby
9 | enabling improved control over text-to-image diffusion models, such as Stable Diffusion. This advancement leads to a
10 | more controllable approach for image editing tasks. Our proposed pipeline effectively addresses crucial aspects of image synthesis, including *consistency*, *controllability*, and *harmony*. Through both quantitative and qualitative comparisons with recently published
11 | open-source state-of-the-art methods, we demonstrate the
12 | favorable performance of our approach across multiple dimensions.
13 |
14 |
15 | ## Pipeline
16 |
17 | The pipeline of ViewControl consists of three steps: LLM Planner, Pose Estimation and Synthesis, and Image Synthesis. The LLM Planner is responsible for understanding the user's request and bridging the gap between that request and the subsequent steps. The Pose Estimation and Synthesis module estimates the pose of the object in the input image and synthesizes an image of the object at the target pose. The Image Synthesis module composes the final image by combining the synthesized image of the object with the background of the input image.
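A minimal, command-level sketch of this flow is shown below; it simply chains the CLI entry points documented under Inference, so the paths, pose values, object names, prompt, and intermediate file layout are placeholders to adapt to your own setup:

```python
# Illustrative glue script only -- every path and argument below is a placeholder.
import os
import subprocess

src_img = os.path.abspath("imgs/sofa_set/sofa_1_a.png")   # input image (placeholder)
work_dir = os.path.abspath("./exp/sofa_1_a")              # working directory (placeholder)
target_x, target_y = "0", "30"                            # target pose chosen by the LLM Planner

# Pose Estimation and Synthesis: estimate the object's current pose, then render it at the target pose.
subprocess.run(["python", "pose_estimation.py", "--image_path", src_img,
                "--output_dir", work_dir, "--model_path", "best_model.pth"], check=True)
subprocess.run(["python", "pose_synthesis_batch.py", "--input_dir", work_dir,
                "--output_dir", work_dir, "--x", target_x, "--y", target_y],
               cwd="pose_synthesis", check=True)

# Image Synthesis: composite the re-posed object back into the source background.
ref_img = os.path.join(work_dir, f"{target_x}_{target_y}.png")  # assumes the x_y.png naming convention
subprocess.run(["python", "image_synthesis.py", "--path_src_img", src_img,
                "--path_ref_img", ref_img, "--text_prompt", "a sofa in a living room",
                "--save_path", os.path.join(work_dir, "result.png"),
                "--mask_obj_name", "sofa", "--ref_obj_name", "sofa"], check=True)
```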
The pipeline of ViewControl is shown in the following figure: 18 | 19 |

20 | 21 | 22 | ## Installation 23 | First, clone the repository locally: 24 | ```bash 25 | git clone https://github.com/huggingface/diffusers.git 26 | git clone https://github.com/luca-medeiros/lang-segment-anything.git 27 | git clone https://github.com/IDEA-Research/GroundingDINO.git 28 | ``` 29 | Then, create a conda environment and install the required packages: 30 | ```bash 31 | conda create -n view_cond python=3.10 32 | conda activate view_cond 33 | 34 | cd diffusers 35 | pip install -e . 36 | 37 | cd ../lang-segment-anything 38 | pip install torch torchvision 39 | pip install -e . 40 | 41 | cd ../GroundingDINO 42 | pip install -e . 43 | mkdir weights 44 | cd weights 45 | wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth 46 | cd .. 47 | 48 | cd .. 49 | 50 | pip install -r requirements.txt 51 | cd pose_synthesis 52 | python download_ckpt.py 53 | 54 | pip install --upgrade torchaudio 55 | cd .. 56 | ``` 57 | 58 | ## Training 59 | If you want to train your own pose estimator, you can use the following command: 60 | ```bash 61 | python train_pose_estimator.py --dataset_path --output_dir 62 | ``` 63 | You may need to adjust the hyperparameters (learning rate, batch size, etc.) in the script to get the best performance. 64 | 65 | Your dataset should be organized as follows: 66 | ``` 67 | dataset 68 | ├── class_1 69 | │ ├── obj_1 70 | │ │ ├── x1_y1.png 71 | │ │ ├── x2_y2.png 72 | │ │ ├── ... 73 | │ │ └── xN_yN.png 74 | │ ├── obj_2 75 | │ │ ├── x1_y1.png 76 | │ │ ├── x2_y2.png 77 | │ │ ├── ... 78 | │ │ └── xN_yN.png 79 | │ ├── ... 80 | │ └── obj_N 81 | │ ├── ... 82 | |── class_N 83 | │ ├── ... 84 | ``` 85 | where `x1_y1.png` is the image of `obj_1` at pose `(x1, y1)`, and `class_1` is the class name of `obj_1`. The dataset can be synthetic or real. If you want to synthesize your own dataset, first prepare a set of images of the object for the same class with the same pose, then use the pose_synthesis module to synthesize the images of the object at different poses. 86 | 87 | 88 | ## Inference 89 | 90 | ### Pose Estimation 91 | To estimate the pose of a given image, you can use the following command: 92 | ```bash 93 | python pose_estimation.py --image_path --output_dir --model_path 94 | ``` 95 | ### Pose Synthesis 96 | To synthesize an image of one object from a given pose, you can use the following command: 97 | ```bash 98 | cd pose_synthesis 99 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x x_value --y y_value 100 | cd .. 101 | ``` 102 | for example: 103 | ```bash 104 | cd pose_synthesis 105 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x 0 --y 0 106 | cd .. 107 | ``` 108 | To synthesis a set of images of one object from a given set of poses, you can use the following command: 109 | ```bash 110 | cd pose_synthesis 111 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x x_values --y y_values 112 | cd .. 113 | ``` 114 | for example: 115 | ```bash 116 | cd pose_synthesis 117 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x 0,10 --y 0,-10 118 | cd .. 
119 | ```
120 | ### Image Synthesis
121 | To synthesize an image, you can use the following command:
122 | ```bash
123 | python image_synthesis.py --path_src_img --path_ref_img --text_prompt --save_path --mask_obj_name --ref_obj_name
124 | ```
125 | If you need faster inference, you can set the DreamBooth option to False, pre-train it, or switch to another lightweight personalization method such as LoRA.
126 |
127 | ### Other Utils
128 | If you need to obtain a more accurate caption or class name from an image, you can use the following command:
129 | ```bash
130 | python obj_name_synthesis.py --path_src_img --save_path
131 | ```
132 |
133 | If you need to remove the background of an image, you can use the following command:
134 | ```bash
135 | python utils.py --input_path --prompt --output_path
136 | ```
137 |
138 | ## Examples
139 | Here are some examples of the results of ViewControl:
140 |

141 | 142 |

143 | 144 | 145 | ## Citation 146 | If you find this work useful, please cite our paper: 147 | ``` 148 | @inproceedings{ijcai2024p840, 149 | title = {Integrating View Conditions for Image Synthesis}, 150 | author = {Bai, Jinbin and Dong, Zhen and Feng, Aosong and Zhang, Xiao and Ye, Tian and Zhou, Kaicheng}, 151 | booktitle = {Proceedings of the Thirty-Third International Joint Conference on 152 | Artificial Intelligence, {IJCAI-24}}, 153 | publisher = {International Joint Conferences on Artificial Intelligence Organization}, 154 | editor = {Kate Larson}, 155 | pages = {7591--7599}, 156 | year = {2024}, 157 | month = {8}, 158 | note = {AI, Arts & Creativity}, 159 | doi = {10.24963/ijcai.2024/840}, 160 | url = {https://doi.org/10.24963/ijcai.2024/840}, 161 | } 162 | ``` 163 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/nerf_like.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import os 3 | import json 4 | import numpy as np 5 | import torch 6 | import imageio 7 | import math 8 | import cv2 9 | from torchvision import transforms 10 | 11 | def cartesian_to_spherical(xyz): 12 | ptsnew = np.hstack((xyz, np.zeros(xyz.shape))) 13 | xy = xyz[:,0]**2 + xyz[:,1]**2 14 | z = np.sqrt(xy + xyz[:,2]**2) 15 | theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down 16 | #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up 17 | azimuth = np.arctan2(xyz[:,1], xyz[:,0]) 18 | return np.array([theta, azimuth, z]) 19 | 20 | 21 | def get_T(T_target, T_cond): 22 | theta_cond, azimuth_cond, z_cond = cartesian_to_spherical(T_cond[None, :]) 23 | theta_target, azimuth_target, z_target = cartesian_to_spherical(T_target[None, :]) 24 | 25 | d_theta = theta_target - theta_cond 26 | d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi) 27 | d_z = z_target - z_cond 28 | 29 | d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()]) 30 | return d_T 31 | 32 | def get_spherical(T_target, T_cond): 33 | theta_cond, azimuth_cond, z_cond = cartesian_to_spherical(T_cond[None, :]) 34 | theta_target, azimuth_target, z_target = cartesian_to_spherical(T_target[None, :]) 35 | 36 | d_theta = theta_target - theta_cond 37 | d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi) 38 | d_z = z_target - z_cond 39 | 40 | d_T = torch.tensor([math.degrees(d_theta.item()), math.degrees(d_azimuth.item()), d_z.item()]) 41 | return d_T 42 | 43 | class RTMV(Dataset): 44 | def __init__(self, root_dir='datasets/RTMV/google_scanned',\ 45 | first_K=64, resolution=256, load_target=False): 46 | self.root_dir = root_dir 47 | self.scene_list = sorted(next(os.walk(root_dir))[1]) 48 | self.resolution = resolution 49 | self.first_K = first_K 50 | self.load_target = load_target 51 | 52 | def __len__(self): 53 | return len(self.scene_list) 54 | 55 | def __getitem__(self, idx): 56 | scene_dir = os.path.join(self.root_dir, self.scene_list[idx]) 57 | with open(os.path.join(scene_dir, 'transforms.json'), "r") as f: 58 | meta = json.load(f) 59 | imgs = [] 60 | poses = [] 61 | for i_img in range(self.first_K): 62 | meta_img = meta['frames'][i_img] 63 | 64 | if i_img == 0 or self.load_target: 65 | img_path = os.path.join(scene_dir, meta_img['file_path']) 66 | img = imageio.imread(img_path) 67 | img = cv2.resize(img, (self.resolution, self.resolution), interpolation = cv2.INTER_LINEAR) 68 | 
imgs.append(img) 69 | 70 | c2w = meta_img['transform_matrix'] 71 | poses.append(c2w) 72 | 73 | imgs = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs 74 | imgs = torch.tensor(self.blend_rgba(imgs)).permute(0, 3, 1, 2) 75 | imgs = imgs * 2 - 1. # convert to stable diffusion range 76 | poses = torch.tensor(np.array(poses).astype(np.float32)) 77 | return imgs, poses 78 | 79 | def blend_rgba(self, img): 80 | img = img[..., :3] * img[..., -1:] + (1. - img[..., -1:]) # blend A to RGB 81 | return img 82 | 83 | 84 | class GSO(Dataset): 85 | def __init__(self, root_dir='datasets/GoogleScannedObjects',\ 86 | split='val', first_K=5, resolution=256, load_target=False, name='render_mvs'): 87 | self.root_dir = root_dir 88 | with open(os.path.join(root_dir, '%s.json' % split), "r") as f: 89 | self.scene_list = json.load(f) 90 | self.resolution = resolution 91 | self.first_K = first_K 92 | self.load_target = load_target 93 | self.name = name 94 | 95 | def __len__(self): 96 | return len(self.scene_list) 97 | 98 | def __getitem__(self, idx): 99 | scene_dir = os.path.join(self.root_dir, self.scene_list[idx]) 100 | with open(os.path.join(scene_dir, 'transforms_%s.json' % self.name), "r") as f: 101 | meta = json.load(f) 102 | imgs = [] 103 | poses = [] 104 | for i_img in range(self.first_K): 105 | meta_img = meta['frames'][i_img] 106 | 107 | if i_img == 0 or self.load_target: 108 | img_path = os.path.join(scene_dir, meta_img['file_path']) 109 | img = imageio.imread(img_path) 110 | img = cv2.resize(img, (self.resolution, self.resolution), interpolation = cv2.INTER_LINEAR) 111 | imgs.append(img) 112 | 113 | c2w = meta_img['transform_matrix'] 114 | poses.append(c2w) 115 | 116 | imgs = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs 117 | mask = imgs[:, :, :, -1] 118 | imgs = torch.tensor(self.blend_rgba(imgs)).permute(0, 3, 1, 2) 119 | imgs = imgs * 2 - 1. # convert to stable diffusion range 120 | poses = torch.tensor(np.array(poses).astype(np.float32)) 121 | return imgs, poses 122 | 123 | def blend_rgba(self, img): 124 | img = img[..., :3] * img[..., -1:] + (1. - img[..., -1:]) # blend A to RGB 125 | return img 126 | 127 | class WILD(Dataset): 128 | def __init__(self, root_dir='data/nerf_wild',\ 129 | first_K=33, resolution=256, load_target=False): 130 | self.root_dir = root_dir 131 | self.scene_list = sorted(next(os.walk(root_dir))[1]) 132 | self.resolution = resolution 133 | self.first_K = first_K 134 | self.load_target = load_target 135 | 136 | def __len__(self): 137 | return len(self.scene_list) 138 | 139 | def __getitem__(self, idx): 140 | scene_dir = os.path.join(self.root_dir, self.scene_list[idx]) 141 | with open(os.path.join(scene_dir, 'transforms_train.json'), "r") as f: 142 | meta = json.load(f) 143 | imgs = [] 144 | poses = [] 145 | for i_img in range(self.first_K): 146 | meta_img = meta['frames'][i_img] 147 | 148 | if i_img == 0 or self.load_target: 149 | img_path = os.path.join(scene_dir, meta_img['file_path']) 150 | img = imageio.imread(img_path + '.png') 151 | img = cv2.resize(img, (self.resolution, self.resolution), interpolation = cv2.INTER_LINEAR) 152 | imgs.append(img) 153 | 154 | c2w = meta_img['transform_matrix'] 155 | poses.append(c2w) 156 | 157 | imgs = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs 158 | imgs = torch.tensor(self.blend_rgba(imgs)).permute(0, 3, 1, 2) 159 | imgs = imgs * 2 - 1. 
# convert to stable diffusion range 160 | poses = torch.tensor(np.array(poses).astype(np.float32)) 161 | return imgs, poses 162 | 163 | def blend_rgba(self, img): 164 | img = img[..., :3] * img[..., -1:] + (1. - img[..., -1:]) # blend A to RGB 165 | return img -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/backbone/resnet_fpn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | def conv1x1(in_planes, out_planes, stride=1): 6 | """1x1 convolution without padding""" 7 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False) 8 | 9 | 10 | def conv3x3(in_planes, out_planes, stride=1): 11 | """3x3 convolution with padding""" 12 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 13 | 14 | 15 | class BasicBlock(nn.Module): 16 | def __init__(self, in_planes, planes, stride=1): 17 | super().__init__() 18 | self.conv1 = conv3x3(in_planes, planes, stride) 19 | self.conv2 = conv3x3(planes, planes) 20 | self.bn1 = nn.BatchNorm2d(planes) 21 | self.bn2 = nn.BatchNorm2d(planes) 22 | self.relu = nn.ReLU(inplace=True) 23 | 24 | if stride == 1: 25 | self.downsample = None 26 | else: 27 | self.downsample = nn.Sequential( 28 | conv1x1(in_planes, planes, stride=stride), 29 | nn.BatchNorm2d(planes) 30 | ) 31 | 32 | def forward(self, x): 33 | y = x 34 | y = self.relu(self.bn1(self.conv1(y))) 35 | y = self.bn2(self.conv2(y)) 36 | 37 | if self.downsample is not None: 38 | x = self.downsample(x) 39 | 40 | return self.relu(x+y) 41 | 42 | 43 | class ResNetFPN_8_2(nn.Module): 44 | """ 45 | ResNet+FPN, output resolution are 1/8 and 1/2. 46 | Each block has 2 layers. 47 | """ 48 | 49 | def __init__(self, config): 50 | super().__init__() 51 | # Config 52 | block = BasicBlock 53 | initial_dim = config['initial_dim'] 54 | block_dims = config['block_dims'] 55 | 56 | # Class Variable 57 | self.in_planes = initial_dim 58 | 59 | # Networks 60 | self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) 61 | self.bn1 = nn.BatchNorm2d(initial_dim) 62 | self.relu = nn.ReLU(inplace=True) 63 | 64 | self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 65 | self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 66 | self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 67 | 68 | # 3. 
FPN upsample 69 | self.layer3_outconv = conv1x1(block_dims[2], block_dims[2]) 70 | self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) 71 | self.layer2_outconv2 = nn.Sequential( 72 | conv3x3(block_dims[2], block_dims[2]), 73 | nn.BatchNorm2d(block_dims[2]), 74 | nn.LeakyReLU(), 75 | conv3x3(block_dims[2], block_dims[1]), 76 | ) 77 | self.layer1_outconv = conv1x1(block_dims[0], block_dims[1]) 78 | self.layer1_outconv2 = nn.Sequential( 79 | conv3x3(block_dims[1], block_dims[1]), 80 | nn.BatchNorm2d(block_dims[1]), 81 | nn.LeakyReLU(), 82 | conv3x3(block_dims[1], block_dims[0]), 83 | ) 84 | 85 | for m in self.modules(): 86 | if isinstance(m, nn.Conv2d): 87 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 88 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 89 | nn.init.constant_(m.weight, 1) 90 | nn.init.constant_(m.bias, 0) 91 | 92 | def _make_layer(self, block, dim, stride=1): 93 | layer1 = block(self.in_planes, dim, stride=stride) 94 | layer2 = block(dim, dim, stride=1) 95 | layers = (layer1, layer2) 96 | 97 | self.in_planes = dim 98 | return nn.Sequential(*layers) 99 | 100 | def forward(self, x): 101 | # ResNet Backbone 102 | x0 = self.relu(self.bn1(self.conv1(x))) 103 | x1 = self.layer1(x0) # 1/2 104 | x2 = self.layer2(x1) # 1/4 105 | x3 = self.layer3(x2) # 1/8 106 | 107 | # FPN 108 | x3_out = self.layer3_outconv(x3) 109 | 110 | x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True) 111 | x2_out = self.layer2_outconv(x2) 112 | x2_out = self.layer2_outconv2(x2_out+x3_out_2x) 113 | 114 | x2_out_2x = F.interpolate(x2_out, scale_factor=2., mode='bilinear', align_corners=True) 115 | x1_out = self.layer1_outconv(x1) 116 | x1_out = self.layer1_outconv2(x1_out+x2_out_2x) 117 | 118 | return [x3_out, x1_out] 119 | 120 | 121 | class ResNetFPN_16_4(nn.Module): 122 | """ 123 | ResNet+FPN, output resolution are 1/16 and 1/4. 124 | Each block has 2 layers. 125 | """ 126 | 127 | def __init__(self, config): 128 | super().__init__() 129 | # Config 130 | block = BasicBlock 131 | initial_dim = config['initial_dim'] 132 | block_dims = config['block_dims'] 133 | 134 | # Class Variable 135 | self.in_planes = initial_dim 136 | 137 | # Networks 138 | self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) 139 | self.bn1 = nn.BatchNorm2d(initial_dim) 140 | self.relu = nn.ReLU(inplace=True) 141 | 142 | self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 143 | self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 144 | self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 145 | self.layer4 = self._make_layer(block, block_dims[3], stride=2) # 1/16 146 | 147 | # 3. 
FPN upsample 148 | self.layer4_outconv = conv1x1(block_dims[3], block_dims[3]) 149 | self.layer3_outconv = conv1x1(block_dims[2], block_dims[3]) 150 | self.layer3_outconv2 = nn.Sequential( 151 | conv3x3(block_dims[3], block_dims[3]), 152 | nn.BatchNorm2d(block_dims[3]), 153 | nn.LeakyReLU(), 154 | conv3x3(block_dims[3], block_dims[2]), 155 | ) 156 | 157 | self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) 158 | self.layer2_outconv2 = nn.Sequential( 159 | conv3x3(block_dims[2], block_dims[2]), 160 | nn.BatchNorm2d(block_dims[2]), 161 | nn.LeakyReLU(), 162 | conv3x3(block_dims[2], block_dims[1]), 163 | ) 164 | 165 | for m in self.modules(): 166 | if isinstance(m, nn.Conv2d): 167 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 168 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 169 | nn.init.constant_(m.weight, 1) 170 | nn.init.constant_(m.bias, 0) 171 | 172 | def _make_layer(self, block, dim, stride=1): 173 | layer1 = block(self.in_planes, dim, stride=stride) 174 | layer2 = block(dim, dim, stride=1) 175 | layers = (layer1, layer2) 176 | 177 | self.in_planes = dim 178 | return nn.Sequential(*layers) 179 | 180 | def forward(self, x): 181 | # ResNet Backbone 182 | x0 = self.relu(self.bn1(self.conv1(x))) 183 | x1 = self.layer1(x0) # 1/2 184 | x2 = self.layer2(x1) # 1/4 185 | x3 = self.layer3(x2) # 1/8 186 | x4 = self.layer4(x3) # 1/16 187 | 188 | # FPN 189 | x4_out = self.layer4_outconv(x4) 190 | 191 | x4_out_2x = F.interpolate(x4_out, scale_factor=2., mode='bilinear', align_corners=True) 192 | x3_out = self.layer3_outconv(x3) 193 | x3_out = self.layer3_outconv2(x3_out+x4_out_2x) 194 | 195 | x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True) 196 | x2_out = self.layer2_outconv(x2) 197 | x2_out = self.layer2_outconv2(x2_out+x3_out_2x) 198 | 199 | return [x4_out, x2_out] 200 | -------------------------------------------------------------------------------- /pose_synthesis/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | 6 |

7 | [Paper] 8 | [Project] 9 | [Demo] 10 | [BibTeX] 11 |

12 | 13 |

14 | 15 | Hugging Face Spaces 16 | 17 |

18 | 19 | One-2-3-45 rethinks how to leverage 2D diffusion models for 3D AIGC and introduces a novel forward-only paradigm that avoids the time-consuming optimization. 20 | 21 | https://github.com/One-2-3-45/One-2-3-45/assets/16759292/a81d6e32-8d29-43a5-b044-b5112b9f9664 22 | 23 | 24 | 25 | https://github.com/One-2-3-45/One-2-3-45/assets/16759292/5ecd45ef-8fd3-4643-af4c-fac3050a0428 26 | 27 | 28 | ## News 29 | **[09/21/2023]** 30 | One-2-3-45 is accepted by NeurIPS 2023. See you in New Orleans! 31 | 32 | **[09/11/2023]** 33 | Training code released. 34 | 35 | **[08/18/2023]** 36 | Inference code released. 37 | 38 | **[07/24/2023]** 39 | Our demo reached the HuggingFace top 4 trending and was featured in 🤗 Spaces of the Week 🔥! Special thanks to HuggingFace 🤗 for sponsoring this demo!! 40 | 41 | **[07/11/2023]** 42 | [Online interactive demo](https://huggingface.co/spaces/One-2-3-45/One-2-3-45) released! Explore it and create your own 3D models in just 45 seconds! 43 | 44 | **[06/29/2023]** 45 | Check out our [paper](https://arxiv.org/pdf/2306.16928.pdf). [[X](https://twitter.com/_akhaliq/status/1674617785119305728)] 46 | 47 | ## Installation 48 | Hardware requirement: an NVIDIA GPU with memory >=18GB (_e.g._, RTX 3090 or A10). Tested on Ubuntu. 49 | 50 | We offer two ways to setup the environment: 51 | 52 | ### Traditional Installation 53 |
54 | Step 1: Install Debian packages. 55 | 56 | ```bash 57 | sudo apt update && sudo apt install git-lfs libsparsehash-dev build-essential 58 | ``` 59 |
60 | 61 |
62 | Step 2: Create and activate a conda environment. 63 | 64 | ```bash 65 | conda create -n One2345 python=3.10 66 | conda activate One2345 67 | ``` 68 |
69 | 70 |
71 | Step 3: Clone the repository to the local machine. 72 | 73 | ```bash 74 | # Make sure you have git-lfs installed. 75 | git lfs install 76 | git clone https://github.com/One-2-3-45/One-2-3-45 77 | cd One-2-3-45 78 | ``` 79 |
80 | 81 |
82 | Step 4: Install project dependencies using pip. 83 | 84 | ```bash 85 | # Ensure that the installed CUDA version matches the torch's cuda version. 86 | # Example: CUDA 11.8 installation 87 | wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run 88 | sudo sh cuda_11.8.0_520.61.05_linux.run 89 | export PATH="/usr/local/cuda-11.8/bin:$PATH" 90 | export LD_LIBRARY_PATH="/usr/local/cuda-11.8/lib64:$LD_LIBRARY_PATH" 91 | # Install PyTorch 2.0 92 | pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 93 | # Install dependencies 94 | pip install -r requirements.txt 95 | # Install inplace_abn and torchsparse 96 | export TORCH_CUDA_ARCH_LIST="7.0;7.2;8.0;8.6+PTX" # CUDA architectures. Modify according to your hardware. 97 | export IABN_FORCE_CUDA=1 98 | pip install inplace_abn 99 | FORCE_CUDA=1 pip install --no-cache-dir git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0 100 | ``` 101 |
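After the dependencies are installed, a quick sanity check along the following lines can confirm that the GPU build is usable (this snippet is an optional addition, not part of the original instructions):

```python
# Optional environment check: verify that PyTorch sees the GPU and that the two
# compiled extensions import cleanly.
import torch
import inplace_abn      # built with IABN_FORCE_CUDA=1 above
import torchsparse      # built with FORCE_CUDA=1 above

print(torch.__version__, "CUDA available:", torch.cuda.is_available())
```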
102 | 103 |
104 | Step 5: Download model checkpoints. 105 | 106 | ```bash 107 | python download_ckpt.py 108 | ``` 109 |
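If you want to confirm the download succeeded, a check like the one below can be used; the file names are an assumption based on this repository's layout, where `zero123-xl.ckpt` and `sam_vit_h_4b8939.pth` sit next to `download_ckpt.py`:

```python
# Optional check that the expected checkpoint files exist after running download_ckpt.py.
import os

for ckpt in ["zero123-xl.ckpt", "sam_vit_h_4b8939.pth"]:
    size_mb = os.path.getsize(ckpt) / 1e6 if os.path.exists(ckpt) else 0
    print(f"{ckpt}: {'ok' if size_mb > 0 else 'MISSING'} ({size_mb:.0f} MB)")
```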
110 | 111 | 112 | ### Installation by Docker Images 113 |
114 | Option 1: Pull and Play (environment and checkpoints). (~22.3G) 115 | 116 | ```bash 117 | # Pull the Docker image that contains the full repository. 118 | docker pull chaoxu98/one2345:demo_1.0 119 | # An interactive demo will be launched automatically upon running the container. 120 | # This will provide a public URL like XXXXXXX.gradio.live 121 | docker run --name One-2-3-45_demo --gpus all -it chaoxu98/one2345:demo_1.0 122 | ``` 123 |
124 | 125 |
126 | Option 2: Environment Only. (~7.3G) 127 | 128 | ```bash 129 | # Pull the Docker image that installed all project dependencies. 130 | docker pull chaoxu98/one2345:1.0 131 | # Start a Docker container named One2345. 132 | docker run --name One-2-3-45 --gpus all -it chaoxu98/one2345:1.0 133 | # Get a bash shell in the container. 134 | docker exec -it One-2-3-45 /bin/bash 135 | # Clone the repository to the local machine. 136 | git clone https://github.com/One-2-3-45/One-2-3-45 137 | cd One-2-3-45 138 | # Download model checkpoints. 139 | python download_ckpt.py 140 | # Refer to getting started for inference. 141 | ``` 142 |
143 | 144 | ## Getting Started (Inference) 145 | 146 | First-time running will take longer time to compile the models. 147 | 148 | Expected time cost per image: 40s on an NVIDIA A6000. 149 | ```bash 150 | # 1. Script 151 | python run.py --img_path PATH_TO_INPUT_IMG --half_precision 152 | 153 | # 2. Interactive demo (Gradio) with a friendly web interface 154 | # An URL will be provided in the output 155 | # (Local: 127.0.0.1:7860; Public: XXXXXXX.gradio.live) 156 | cd demo/ 157 | python app.py 158 | 159 | # 3. Jupyter Notebook 160 | example.ipynb 161 | ``` 162 | 163 | ## Training Your Own Model 164 | 165 | ### Data Preparation 166 | We use Objaverse-LVIS dataset for training and render the selected shapes (with CC-BY license) into 2D images with Blender. 167 | #### Download the training images. 168 | Download all One2345.zip.part-* files (5 files in total) from here and then cat them into a single .zip file using the following command: 169 | ```bash 170 | cat One2345.zip.part-* > One2345.zip 171 | ``` 172 | 173 | #### Unzip the training images zip file. 174 | Unzip the zip file into a folder specified by yourself (`YOUR_BASE_FOLDER`) with the following command: 175 | 176 | ```bash 177 | unzip One2345.zip -d YOUR_BASE_FOLDER 178 | ``` 179 | 180 | #### Download meta files. 181 | 182 | Download `One2345_training_pose.json` and `lvis_split_cc_by.json` from here and put them into the same folder as the training images (`YOUR_BASE_FOLDER`). 183 | 184 | Your file structure should look like this: 185 | ``` 186 | # One2345 is your base folder used in the previous steps 187 | 188 | One2345 189 | ├── One2345_training_pose.json 190 | ├── lvis_split_cc_by.json 191 | └── zero12345_narrow 192 | ├── 000-000 193 | ├── 000-001 194 | ├── 000-002 195 | ... 196 | └── 000-159 197 | 198 | ``` 199 | 200 | ### Training 201 | Specify the `trainpath`, `valpath`, and `testpath` in the config file `./reconstruction/confs/one2345_lod_train.conf` to be `YOUR_BASE_FOLDER` used in data preparation steps and run the following command: 202 | ```bash 203 | cd reconstruction 204 | python exp_runner_generic_blender_train.py --mode train --conf confs/one2345_lod_train.conf 205 | ``` 206 | Experiment logs and checkpoints will be saved in `./reconstruction/exp/`. 207 | 208 | ## Citation 209 | 210 | If you find our code helpful, please cite our paper: 211 | 212 | ``` 213 | @misc{liu2023one2345, 214 | title={One-2-3-45: Any Single Image to 3D Mesh in 45 Seconds without Per-Shape Optimization}, 215 | author={Minghua Liu and Chao Xu and Haian Jin and Linghao Chen and Mukund Varma T and Zexiang Xu and Hao Su}, 216 | year={2023}, 217 | eprint={2306.16928}, 218 | archivePrefix={arXiv}, 219 | primaryClass={cs.CV} 220 | } 221 | ``` 222 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/losses/vqperceptual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from einops import repeat 5 | 6 | from taming.modules.discriminator.model import NLayerDiscriminator, weights_init 7 | from taming.modules.losses.lpips import LPIPS 8 | from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss 9 | 10 | 11 | def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights): 12 | assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0] 13 | loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3]) 14 | loss_fake = torch.mean(F.relu(1. 
+ logits_fake), dim=[1,2,3]) 15 | loss_real = (weights * loss_real).sum() / weights.sum() 16 | loss_fake = (weights * loss_fake).sum() / weights.sum() 17 | d_loss = 0.5 * (loss_real + loss_fake) 18 | return d_loss 19 | 20 | def adopt_weight(weight, global_step, threshold=0, value=0.): 21 | if global_step < threshold: 22 | weight = value 23 | return weight 24 | 25 | 26 | def measure_perplexity(predicted_indices, n_embed): 27 | # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py 28 | # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally 29 | encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed) 30 | avg_probs = encodings.mean(0) 31 | perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp() 32 | cluster_use = torch.sum(avg_probs > 0) 33 | return perplexity, cluster_use 34 | 35 | def l1(x, y): 36 | return torch.abs(x-y) 37 | 38 | 39 | def l2(x, y): 40 | return torch.pow((x-y), 2) 41 | 42 | 43 | class VQLPIPSWithDiscriminator(nn.Module): 44 | def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0, 45 | disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, 46 | perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, 47 | disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips", 48 | pixel_loss="l1"): 49 | super().__init__() 50 | assert disc_loss in ["hinge", "vanilla"] 51 | assert perceptual_loss in ["lpips", "clips", "dists"] 52 | assert pixel_loss in ["l1", "l2"] 53 | self.codebook_weight = codebook_weight 54 | self.pixel_weight = pixelloss_weight 55 | if perceptual_loss == "lpips": 56 | print(f"{self.__class__.__name__}: Running with LPIPS.") 57 | self.perceptual_loss = LPIPS().eval() 58 | else: 59 | raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<") 60 | self.perceptual_weight = perceptual_weight 61 | 62 | if pixel_loss == "l1": 63 | self.pixel_loss = l1 64 | else: 65 | self.pixel_loss = l2 66 | 67 | self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, 68 | n_layers=disc_num_layers, 69 | use_actnorm=use_actnorm, 70 | ndf=disc_ndf 71 | ).apply(weights_init) 72 | self.discriminator_iter_start = disc_start 73 | if disc_loss == "hinge": 74 | self.disc_loss = hinge_d_loss 75 | elif disc_loss == "vanilla": 76 | self.disc_loss = vanilla_d_loss 77 | else: 78 | raise ValueError(f"Unknown GAN loss '{disc_loss}'.") 79 | print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.") 80 | self.disc_factor = disc_factor 81 | self.discriminator_weight = disc_weight 82 | self.disc_conditional = disc_conditional 83 | self.n_classes = n_classes 84 | 85 | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): 86 | if last_layer is not None: 87 | nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] 88 | g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] 89 | else: 90 | nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] 91 | g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] 92 | 93 | d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) 94 | d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() 95 | d_weight = d_weight * self.discriminator_weight 96 | return d_weight 97 | 98 | def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx, 99 | global_step, last_layer=None, cond=None, split="train", predicted_indices=None): 100 | if not exists(codebook_loss): 101 
| codebook_loss = torch.tensor([0.]).to(inputs.device) 102 | #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) 103 | rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous()) 104 | if self.perceptual_weight > 0: 105 | p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) 106 | rec_loss = rec_loss + self.perceptual_weight * p_loss 107 | else: 108 | p_loss = torch.tensor([0.0]) 109 | 110 | nll_loss = rec_loss 111 | #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] 112 | nll_loss = torch.mean(nll_loss) 113 | 114 | # now the GAN part 115 | if optimizer_idx == 0: 116 | # generator update 117 | if cond is None: 118 | assert not self.disc_conditional 119 | logits_fake = self.discriminator(reconstructions.contiguous()) 120 | else: 121 | assert self.disc_conditional 122 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) 123 | g_loss = -torch.mean(logits_fake) 124 | 125 | try: 126 | d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) 127 | except RuntimeError: 128 | assert not self.training 129 | d_weight = torch.tensor(0.0) 130 | 131 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 132 | loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean() 133 | 134 | log = {"{}/total_loss".format(split): loss.clone().detach().mean(), 135 | "{}/quant_loss".format(split): codebook_loss.detach().mean(), 136 | "{}/nll_loss".format(split): nll_loss.detach().mean(), 137 | "{}/rec_loss".format(split): rec_loss.detach().mean(), 138 | "{}/p_loss".format(split): p_loss.detach().mean(), 139 | "{}/d_weight".format(split): d_weight.detach(), 140 | "{}/disc_factor".format(split): torch.tensor(disc_factor), 141 | "{}/g_loss".format(split): g_loss.detach().mean(), 142 | } 143 | if predicted_indices is not None: 144 | assert self.n_classes is not None 145 | with torch.no_grad(): 146 | perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes) 147 | log[f"{split}/perplexity"] = perplexity 148 | log[f"{split}/cluster_usage"] = cluster_usage 149 | return loss, log 150 | 151 | if optimizer_idx == 1: 152 | # second pass for discriminator update 153 | if cond is None: 154 | logits_real = self.discriminator(inputs.contiguous().detach()) 155 | logits_fake = self.discriminator(reconstructions.contiguous().detach()) 156 | else: 157 | logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) 158 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) 159 | 160 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 161 | d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) 162 | 163 | log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), 164 | "{}/logits_real".format(split): logits_real.detach().mean(), 165 | "{}/logits_fake".format(split): logits_fake.detach().mean() 166 | } 167 | return d_loss, log 168 | -------------------------------------------------------------------------------- /train_pose_estimator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torchvision.transforms as transforms 5 | from torch.utils.data import DataLoader, Dataset 6 | from torchvision import models 7 | import numpy as np 8 | import os 9 | 
from PIL import Image
10 | import torch.nn.functional as F
11 | import random
12 | 
13 | name = "log_dinov2_mlp_1e-5_tmp"
14 | gpuid = 1
15 | from transformers import AutoImageProcessor, Dinov2Model
16 | 
17 | import logging
18 | import sys
19 | logging.basicConfig(encoding='utf-8', level=logging.INFO,
20 |                     handlers=[logging.FileHandler("{}.log".format(name)),
21 |                               logging.StreamHandler(sys.stdout)])
22 | device = "cuda:{}".format(gpuid)
23 | model_save_path = 'best_model_{}.pth'.format(name)
24 | processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
25 | vit_model = Dinov2Model.from_pretrained("facebook/dinov2-base")
26 | bs = 128  # 100
27 | lr = 1e-5
28 | data_folder = "./imgs/pose_estimation_train_dataset"
29 | 
30 | class CustomDataset(Dataset):
31 |     def __init__(self, root_dir, transform=None, train=True, test_split=0.2):
32 |         self.root_dir = root_dir
33 |         self.transform = transform
34 |         self.train = train
35 |         self.test_split = test_split
36 |         self.data, self.labels = self.load_data()
37 | 
38 |     def load_data(self):
39 |         data = []
40 |         labels = []
41 | 
42 |         fur_dir_list = [d for d in os.listdir(self.root_dir) if os.path.isdir(os.path.join(self.root_dir, d))]
43 |         for fur_dir in fur_dir_list:
44 |             if "dreambooth" in fur_dir and "old" not in fur_dir:
45 |                 fur_path = os.path.join(self.root_dir, fur_dir)
46 |                 deg_dir_list = [d for d in os.listdir(fur_path) if os.path.isdir(os.path.join(fur_path, d))]
47 | 
48 |                 for deg_dir in deg_dir_list:
49 |                     deg_path = os.path.join(fur_path, deg_dir)
50 |                     files = [f for f in os.listdir(deg_path) if f.endswith('.png')]
51 |                     for file in files:
52 |                         file_path = os.path.join(deg_path, file)
53 |                         r, t = file.split('_')
54 |                         label = [float(r), float(t[:-4])]  # strip the ".png" suffix from the filename
55 |                         data.append(file_path)
56 |                         labels.append(label)
57 | 
58 |         zipped = list(zip(data, labels))
59 |         random.shuffle(zipped)
60 |         data, labels = zip(*zipped)
61 | 
62 |         split_index = int(len(data) * (1 - self.test_split))
63 |         if self.train:
64 |             data = data[:split_index]
65 |             labels = labels[:split_index]
66 |         else:
67 |             data = data[split_index:]
68 |             labels = labels[split_index:]
69 | 
70 |         return data, labels
71 | 
72 |     def __len__(self):
73 |         return len(self.data)
74 | 
75 |     def __getitem__(self, idx):
76 |         img_path = self.data[idx]
77 |         label = self.labels[idx]
78 | 
79 |         img = Image.open(img_path).convert('RGB')
80 |         inputs = processor(images=img, return_tensors="pt")
81 |         inputs['pixel_values'] = inputs['pixel_values'][0]
82 |         inputs['label'] = label
83 |         return inputs
84 | 
85 | 
86 | class RegressionModel(nn.Module):
87 |     def __init__(self):
88 |         super(RegressionModel, self).__init__()
89 |         self.vit = vit_model
90 |         # self.fc1 = nn.Linear(768, 768)
91 |         # self.fc2 = nn.Linear(768, 768)
92 |         self.fc3 = nn.Linear(768, 128)
93 |         self.fc4 = nn.Linear(128, 2)
94 | 
95 | 
96 |         for param in self.vit.parameters():
97 |             param.requires_grad = True
98 | 
99 |     def forward(self, x):
100 |         outputs = self.vit(x)
101 |         sequence_output = outputs[0]
102 |         x = sequence_output[:, 0, :]  # [B, 768]
103 | 
104 |         # x = F.relu(self.fc1(x))
105 |         # x = F.relu(self.fc2(x))
106 |         x = F.relu(self.fc3(x))
107 |         x = self.fc4(x)
108 |         return x
109 | 
110 | 
111 | transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
112 | 
113 | train_dataset = CustomDataset(data_folder, transform=transform, train=True)
114 | train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, pin_memory=True, num_workers=32)
115 | test_dataset = CustomDataset(data_folder, transform=transform, train=False)
116
| test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False, pin_memory=True, num_workers=32) 117 | 118 | 119 | model = RegressionModel().float().to(device) 120 | model = model.to(device) 121 | 122 | 123 | criterion = nn.MSELoss() 124 | criterion_mae = nn.L1Loss() 125 | 126 | optimizer = optim.AdamW(model.parameters(), lr=lr) 127 | 128 | 129 | round_list =np.array (list(range(-160,170,10))) 130 | def discretize(outputs): 131 | outputs = outputs.numpy() 132 | round_cand = round_list 133 | for i in range(len(outputs.shape)): 134 | round_cand = np.expand_dims(round_cand, 0) 135 | outputs = np.expand_dims(outputs, -1) 136 | diff = np.abs(outputs-round_cand) 137 | pos = np.expand_dims(np.argmin(diff, axis = -1), axis = -1) 138 | res = round_list[pos].squeeze() 139 | return torch.tensor(res) 140 | 141 | lowest_loss = float('inf') 142 | 143 | 144 | num_epochs = 500 145 | discrete_val = [] 146 | for epoch in range(num_epochs): 147 | model.train() 148 | for batch in train_loader: 149 | inputs, labels = batch['pixel_values'], batch['label'] 150 | labels = torch.stack(labels, dim=-1) 151 | 152 | inputs = inputs.float().to(device) 153 | labels = labels.float().to(device) 154 | optimizer.zero_grad() 155 | outputs = model(inputs) 156 | 157 | # from thop import profile 158 | # flops, params = profile(model, (inputs,)) 159 | # print('flops: ', flops, 'params: ', params) 160 | 161 | loss = criterion(outputs, labels) 162 | loss_mae = criterion_mae(outputs.detach().cpu(), labels.detach().cpu()) 163 | 164 | loss.backward() 165 | optimizer.step() 166 | 167 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], train MSE: {loss.item():.2f}, mae: {loss_mae.item():.2f}, RMSE: {np.sqrt(loss.item()):.2f}') 168 | 169 | outputs_round =discretize(outputs.detach().cpu()) 170 | loss = criterion(outputs_round.detach().cpu(), labels.detach().cpu()) 171 | loss_mae = criterion_mae(outputs_round.detach().cpu(), labels.detach().cpu()) 172 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], r MSE: {loss.item():.2f}, mae: {loss_mae.item():.2f}, RMSE: {np.sqrt(loss.item()):.2f}') 173 | 174 | # After each epoch, the model will be evaluated on the test set 175 | model.eval() 176 | with torch.no_grad(): 177 | total_loss = 0 178 | total_loss_mae = 0 179 | 180 | r_total_loss = 0 181 | r_total_loss_mae = 0 182 | 183 | for batch in test_loader: 184 | inputs, labels = batch['pixel_values'], batch['label'] 185 | labels = torch.stack(labels, dim=-1) 186 | inputs = inputs.float().to(device) 187 | labels = labels.float().to(device) 188 | outputs = model(inputs) 189 | 190 | loss = criterion(outputs, labels) 191 | loss_mae = criterion_mae(outputs.detach().cpu(), labels.detach().cpu()) 192 | 193 | 194 | total_loss += loss.item() 195 | total_loss_mae += loss_mae.item() 196 | 197 | 198 | outputs_round =discretize(outputs.detach().cpu()) 199 | r_loss = criterion(outputs_round.detach().cpu(), labels.detach().cpu()) 200 | r_loss_mae = criterion_mae(outputs_round.detach().cpu(), labels.detach().cpu()) 201 | 202 | 203 | r_total_loss += r_loss.item() 204 | r_total_loss_mae += r_loss_mae.item() 205 | 206 | 207 | 208 | average_loss = total_loss / len(test_loader) 209 | average_loss_mae = total_loss_mae / len(test_loader) 210 | 211 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], Test MSE: {average_loss:.2f}, mae: {average_loss_mae:.2f}, RMSE: {np.sqrt(average_loss):.2f}') 212 | 213 | average_loss = r_total_loss / len(test_loader) 214 | average_loss_mae = r_total_loss_mae / len(test_loader) 215 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], r MSE: 
{average_loss:.2f}, mae: {average_loss_mae:.2f}, RMSE: {np.sqrt(average_loss):.2f}')
216 | 
217 |     # If the test loss of the current model is lower, save the current model
218 |     if average_loss < lowest_loss:
219 |         lowest_loss = average_loss
220 |         torch.save(model.state_dict(), model_save_path)
221 |         logging.info(f'Saved model with lowest test loss {lowest_loss}: {model_save_path}')
222 | 
223 | 
--------------------------------------------------------------------------------
/pose_synthesis/elevation_estimate/utils/elev_est_api.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import numpy as np
4 | import os.path as osp
5 | import imageio
6 | from copy import deepcopy
7 | 
8 | import loguru
9 | import torch
10 | import matplotlib.cm as cm
11 | import matplotlib.pyplot as plt
12 | 
13 | from ..loftr import LoFTR, default_cfg
14 | from . import plt_utils
15 | from .plotting import make_matching_figure
16 | from .utils3d import rect_to_img, canonical_to_camera, calc_pose
17 | 
18 | 
19 | class ElevEstHelper:
20 |     _feature_matcher = None
21 | 
22 |     @classmethod
23 |     def get_feature_matcher(cls):
24 |         if cls._feature_matcher is None:
25 |             loguru.logger.info("Loading feature matcher...")
26 |             _default_cfg = deepcopy(default_cfg)
27 |             _default_cfg['coarse']['temp_bug_fix'] = True  # set to False when using the old ckpt
28 |             matcher = LoFTR(config=_default_cfg)
29 |             current_dir = os.path.dirname(os.path.abspath(__file__))
30 |             ckpt_path = os.path.join(current_dir, "weights/indoor_ds_new.ckpt")
31 |             if not osp.exists(ckpt_path):
32 |                 loguru.logger.info("Downloading feature matcher...")
33 |                 os.makedirs("weights", exist_ok=True)
34 |                 import gdown
35 |                 gdown.cached_download(url="https://drive.google.com/uc?id=19s3QvcCWQ6g-N1PrYlDCg-2mOJZ3kkgS",
36 |                                       path=ckpt_path)
37 |             matcher.load_state_dict(torch.load(ckpt_path)['state_dict'])
38 |             matcher = matcher.eval().cuda()
39 |             cls._feature_matcher = matcher
40 |         return cls._feature_matcher
41 | 
42 | 
43 | def mask_out_bkgd(img_path, dbg=False):
44 |     img = imageio.imread_v2(img_path)
45 |     if img.shape[-1] == 4:
46 |         fg_mask = img[:, :, 3]
47 |     else:
48 |         loguru.logger.info("Image has no alpha channel, using thresholding to mask out background")
49 |         fg_mask = ~(img > 245).all(axis=-1)
50 |     if dbg:
51 |         plt.imshow(plt_utils.vis_mask(img, fg_mask.astype(np.uint8), color=[0, 255, 0]))
52 |         plt.show()
53 |     return fg_mask
54 | 
55 | 
56 | def get_feature_matching(img_paths, dbg=False):
57 |     assert len(img_paths) == 4
58 |     matcher = ElevEstHelper.get_feature_matcher()
59 |     feature_matching = {}
60 |     masks = []
61 |     for i in range(4):
62 |         mask = mask_out_bkgd(img_paths[i], dbg=dbg)
63 |         masks.append(mask)
64 |     for i in range(0, 4):
65 |         for j in range(i + 1, 4):
66 |             img0_pth = img_paths[i]
67 |             img1_pth = img_paths[j]
68 |             mask0 = masks[i]
69 |             mask1 = masks[j]
70 |             img0_raw = cv2.imread(img0_pth, cv2.IMREAD_GRAYSCALE)
71 |             img1_raw = cv2.imread(img1_pth, cv2.IMREAD_GRAYSCALE)
72 |             original_shape = img0_raw.shape
73 |             img0_raw_resized = cv2.resize(img0_raw, (480, 480))
74 |             img1_raw_resized = cv2.resize(img1_raw, (480, 480))
75 | 
76 |             img0 = torch.from_numpy(img0_raw_resized)[None][None].cuda() / 255.
77 |             img1 = torch.from_numpy(img1_raw_resized)[None][None].cuda() / 255.
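            # NOTE: LoFTR, as wired up here, consumes grayscale image tensors of shape (1, 1, H, W)
            # scaled to [0, 1]; both views were resized to 480x480 above, so the matched keypoints
            # returned below are rescaled by original_shape / 480 before the foreground masks are applied.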
78 |             batch = {'image0': img0, 'image1': img1}
79 | 
80 |             # Inference with LoFTR and get prediction
81 |             with torch.no_grad():
82 |                 matcher(batch)
83 |                 mkpts0 = batch['mkpts0_f'].cpu().numpy()
84 |                 mkpts1 = batch['mkpts1_f'].cpu().numpy()
85 |                 mconf = batch['mconf'].cpu().numpy()
86 |             mkpts0[:, 0] = mkpts0[:, 0] * original_shape[1] / 480
87 |             mkpts0[:, 1] = mkpts0[:, 1] * original_shape[0] / 480
88 |             mkpts1[:, 0] = mkpts1[:, 0] * original_shape[1] / 480
89 |             mkpts1[:, 1] = mkpts1[:, 1] * original_shape[0] / 480
90 |             keep0 = mask0[mkpts0[:, 1].astype(int), mkpts0[:, 0].astype(int)]
91 |             keep1 = mask1[mkpts1[:, 1].astype(int), mkpts1[:, 0].astype(int)]
92 |             keep = np.logical_and(keep0, keep1)
93 |             mkpts0 = mkpts0[keep]
94 |             mkpts1 = mkpts1[keep]
95 |             mconf = mconf[keep]
96 |             if dbg:
97 |                 # Draw visualization
98 |                 color = cm.jet(mconf)
99 |                 text = [
100 |                     'LoFTR',
101 |                     'Matches: {}'.format(len(mkpts0)),
102 |                 ]
103 |                 fig = make_matching_figure(img0_raw, img1_raw, mkpts0, mkpts1, color, text=text)
104 |                 fig.show()
105 |             feature_matching[f"{i}_{j}"] = np.concatenate([mkpts0, mkpts1, mconf[:, None]], axis=1)
106 | 
107 |     return feature_matching
108 | 
109 | 
110 | def gen_pose_hypothesis(center_elevation):
111 |     elevations = np.radians(
112 |         [center_elevation, center_elevation - 10, center_elevation + 10, center_elevation, center_elevation])  # 45~120
113 |     azimuths = np.radians([30, 30, 30, 20, 40])
114 |     input_poses = calc_pose(elevations, azimuths, len(azimuths))
115 |     input_poses = input_poses[1:]
116 |     input_poses[..., 1] *= -1
117 |     input_poses[..., 2] *= -1
118 |     return input_poses
119 | 
120 | 
121 | def ba_error_general(K, matches, poses):
122 |     projmat0 = K @ poses[0].inverse()[:3, :4]
123 |     projmat1 = K @ poses[1].inverse()[:3, :4]
124 |     match_01 = matches[0]
125 |     pts0 = match_01[:, :2]
126 |     pts1 = match_01[:, 2:4]
127 |     Xref = cv2.triangulatePoints(projmat0.cpu().numpy(), projmat1.cpu().numpy(),
128 |                                  pts0.cpu().numpy().T, pts1.cpu().numpy().T)
129 |     Xref = Xref[:3] / Xref[3:]
130 |     Xref = Xref.T
131 |     Xref = torch.from_numpy(Xref).cuda().float()
132 |     reproj_error = 0
133 |     for match, cp in zip(matches[1:], poses[2:]):
134 |         dist = (torch.norm(match_01[:, :2][:, None, :] - match[:, :2][None, :, :], dim=-1))
135 |         if dist.numel() > 0:
136 |             # print("dist.shape", dist.shape)
137 |             m0to2_index = dist.argmin(1)
138 |             keep = dist[torch.arange(match_01.shape[0]), m0to2_index] < 1
139 |             if keep.sum() > 0:
140 |                 xref_in2 = rect_to_img(K, canonical_to_camera(Xref, cp.inverse()))
141 |                 reproj_error2 = torch.norm(match[m0to2_index][keep][:, 2:4] - xref_in2[keep], dim=-1)
142 |                 conf02 = match[m0to2_index][keep][:, -1]
143 |                 reproj_error += (reproj_error2 * conf02).sum() / (conf02.sum())
144 | 
145 |     return reproj_error
146 | 
147 | 
148 | def find_optim_elev(elevs, nimgs, matches, K, dbg=False):
149 |     errs = []
150 |     for elev in elevs:
151 |         err = 0
152 |         cam_poses = gen_pose_hypothesis(elev)
153 |         for start in range(nimgs - 1):
154 |             batch_matches, batch_poses = [], []
155 |             for i in range(start, nimgs + start):
156 |                 ci = i % nimgs
157 |                 batch_poses.append(cam_poses[ci])
158 |             for j in range(nimgs - 1):
159 |                 key = f"{start}_{(start + j + 1) % nimgs}"
160 |                 match = matches[key]
161 |                 batch_matches.append(match)
162 |             err += ba_error_general(K, batch_matches, batch_poses)
163 |         errs.append(err)
164 |     errs = torch.tensor(errs)
165 |     if dbg:
166 |         plt.plot(elevs, errs)
167 |         plt.show()
168 |     optim_elev = elevs[torch.argmin(errs)].item()
169 |     return optim_elev
170 | 
171 | 
172 | def 
get_elev_est(feature_matching, min_elev=30, max_elev=150, K=None, dbg=False): 173 | flag = True 174 | matches = {} 175 | for i in range(4): 176 | for j in range(i + 1, 4): 177 | match_ij = feature_matching[f"{i}_{j}"] 178 | if len(match_ij) == 0: 179 | flag = False 180 | match_ji = np.concatenate([match_ij[:, 2:4], match_ij[:, 0:2], match_ij[:, 4:5]], axis=1) 181 | matches[f"{i}_{j}"] = torch.from_numpy(match_ij).float().cuda() 182 | matches[f"{j}_{i}"] = torch.from_numpy(match_ji).float().cuda() 183 | if not flag: 184 | loguru.logger.info("0 matches, could not estimate elevation") 185 | return None 186 | interval = 10 187 | elevs = np.arange(min_elev, max_elev, interval) 188 | optim_elev1 = find_optim_elev(elevs, 4, matches, K) 189 | 190 | elevs = np.arange(optim_elev1 - 10, optim_elev1 + 10, 1) 191 | optim_elev2 = find_optim_elev(elevs, 4, matches, K) 192 | 193 | return optim_elev2 194 | 195 | 196 | def elev_est_api(img_paths, min_elev=30, max_elev=150, K=None, dbg=False): 197 | feature_matching = get_feature_matching(img_paths, dbg=dbg) 198 | if K is None: 199 | loguru.logger.warning("K is not provided, using default K") 200 | K = np.array([[280.0, 0, 128.0], 201 | [0, 280.0, 128.0], 202 | [0, 0, 1]]) 203 | K = torch.from_numpy(K).cuda().float() 204 | elev = get_elev_est(feature_matching, min_elev, max_elev, K, dbg=dbg) 205 | return elev 206 | -------------------------------------------------------------------------------- /pose_synthesis/utils/zero123_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | from contextlib import nullcontext 5 | from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker 6 | from einops import rearrange 7 | from ldm.util import instantiate_from_config 8 | from ldm.models.diffusion.ddim import DDIMSampler 9 | from omegaconf import OmegaConf 10 | from PIL import Image 11 | from rich import print 12 | from transformers import CLIPImageProcessor 13 | from torch import autocast 14 | from torchvision import transforms 15 | 16 | 17 | def load_model_from_config(config, ckpt, device, verbose=False): 18 | print(f'Loading model from {ckpt}') 19 | pl_sd = torch.load(ckpt, map_location='cpu') 20 | if 'global_step' in pl_sd: 21 | print(f'Global Step: {pl_sd["global_step"]}') 22 | sd = pl_sd['state_dict'] 23 | model = instantiate_from_config(config.model) 24 | m, u = model.load_state_dict(sd, strict=False) 25 | if len(m) > 0 and verbose: 26 | print('missing keys:') 27 | print(m) 28 | if len(u) > 0 and verbose: 29 | print('unexpected keys:') 30 | print(u) 31 | 32 | model.to(device) 33 | model.eval() 34 | return model 35 | 36 | 37 | def init_model(device, ckpt, half_precision=False): 38 | config = os.path.join(os.path.dirname(__file__), '../configs/sd-objaverse-finetune-c_concat-256.yaml') 39 | config = OmegaConf.load(config) 40 | 41 | # Instantiate all models beforehand for efficiency. 
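    # 'turncam' is the Zero123 latent-diffusion model itself (compiled with torch.compile, which
    # assumes PyTorch >= 2.0, and optionally cast to fp16); 'nsfw' is the StableDiffusionSafetyChecker
    # and 'clip_fe' the CLIPImageProcessor used to prepare inputs for that checker.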
42 | models = dict() 43 | print('Instantiating LatentDiffusion...') 44 | if half_precision: 45 | models['turncam'] = torch.compile(load_model_from_config(config, ckpt, device=device)).half() 46 | else: 47 | models['turncam'] = torch.compile(load_model_from_config(config, ckpt, device=device)) 48 | print('Instantiating StableDiffusionSafetyChecker...') 49 | models['nsfw'] = StableDiffusionSafetyChecker.from_pretrained( 50 | 'CompVis/stable-diffusion-safety-checker').to(device) 51 | models['clip_fe'] = CLIPImageProcessor.from_pretrained( 52 | "openai/clip-vit-large-patch14") 53 | # We multiply all by some factor > 1 to make them less likely to be triggered. 54 | models['nsfw'].concept_embeds_weights *= 1.2 55 | models['nsfw'].special_care_embeds_weights *= 1.2 56 | 57 | return models 58 | 59 | @torch.no_grad() 60 | def sample_model_batch(model, sampler, input_im, xs, ys, n_samples=4, precision='autocast', ddim_eta=1.0, ddim_steps=75, scale=3.0, h=256, w=256): 61 | precision_scope = autocast if precision == 'autocast' else nullcontext 62 | with precision_scope("cuda"): 63 | with model.ema_scope(): 64 | c = model.get_learned_conditioning(input_im).tile(n_samples, 1, 1) 65 | T = [] 66 | for x, y in zip(xs, ys): 67 | T.append([np.radians(x), np.sin(np.radians(y)), np.cos(np.radians(y)), 0]) 68 | T = torch.tensor(np.array(T))[:, None, :].float().to(c.device) 69 | c = torch.cat([c, T], dim=-1) 70 | c = model.cc_projection(c) 71 | cond = {} 72 | cond['c_crossattn'] = [c] 73 | cond['c_concat'] = [model.encode_first_stage(input_im).mode().detach() 74 | .repeat(n_samples, 1, 1, 1)] 75 | if scale != 1.0: 76 | uc = {} 77 | uc['c_concat'] = [torch.zeros(n_samples, 4, h // 8, w // 8).to(c.device)] 78 | uc['c_crossattn'] = [torch.zeros_like(c).to(c.device)] 79 | else: 80 | uc = None 81 | 82 | shape = [4, h // 8, w // 8] 83 | samples_ddim, _ = sampler.sample(S=ddim_steps, 84 | conditioning=cond, 85 | batch_size=n_samples, 86 | shape=shape, 87 | verbose=False, 88 | unconditional_guidance_scale=scale, 89 | unconditional_conditioning=uc, 90 | eta=ddim_eta, 91 | x_T=None) 92 | # print(samples_ddim.shape) 93 | # samples_ddim = torch.nn.functional.interpolate(samples_ddim, 64, mode='nearest', antialias=False) 94 | x_samples_ddim = model.decode_first_stage(samples_ddim) 95 | ret_imgs = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0).cpu() 96 | del cond, c, x_samples_ddim, samples_ddim, uc, input_im 97 | torch.cuda.empty_cache() 98 | return ret_imgs 99 | 100 | @torch.no_grad() 101 | def predict_stage1_gradio(model, raw_im, save_path = "", adjust_set=[], device="cuda", ddim_steps=75, scale=3.0): 102 | # raw_im = raw_im.resize([256, 256], Image.LANCZOS) 103 | # input_im_init = preprocess_image(models, raw_im, preprocess=False) 104 | input_im_init = np.asarray(raw_im, dtype=np.float32) / 255.0 105 | input_im = transforms.ToTensor()(input_im_init).unsqueeze(0).to(device) 106 | input_im = input_im * 2 - 1 107 | 108 | # stage 1: 8 109 | delta_x_1_8 = [0] * 4 + [30] * 4 + [-30] * 4 110 | delta_y_1_8 = [0+90*(i%4) if i < 4 else 30+90*(i%4) for i in range(8)] + [30+90*(i%4) for i in range(4)] 111 | 112 | ret_imgs = [] 113 | sampler = DDIMSampler(model) 114 | # sampler.to(device) 115 | if adjust_set != []: 116 | x_samples_ddims_8 = sample_model_batch(model, sampler, input_im, 117 | [delta_x_1_8[i] for i in adjust_set], [delta_y_1_8[i] for i in adjust_set], 118 | n_samples=len(adjust_set), ddim_steps=ddim_steps, scale=scale) 119 | else: 120 | x_samples_ddims_8 = sample_model_batch(model, sampler, input_im, 
delta_x_1_8, delta_y_1_8, n_samples=len(delta_x_1_8), ddim_steps=ddim_steps, scale=scale) 121 | sample_idx = 0 122 | for stage1_idx in range(len(delta_x_1_8)): 123 | if adjust_set != [] and stage1_idx not in adjust_set: 124 | continue 125 | x_sample = 255.0 * rearrange(x_samples_ddims_8[sample_idx].numpy(), 'c h w -> h w c') 126 | out_image = Image.fromarray(x_sample.astype(np.uint8)) 127 | ret_imgs.append(out_image) 128 | if save_path: 129 | out_image.save(os.path.join(save_path, '%d.png'%(stage1_idx))) 130 | sample_idx += 1 131 | del x_samples_ddims_8 132 | del sampler 133 | torch.cuda.empty_cache() 134 | return ret_imgs 135 | 136 | def infer_stage_2(model, save_path_stage1, save_path_stage2, delta_x_2, delta_y_2, indices, device, ddim_steps=75, scale=3.0): 137 | for stage1_idx in indices: 138 | # save stage 1 image 139 | # x_sample = 255.0 * rearrange(x_samples_ddims[stage1_idx].cpu().numpy(), 'c h w -> h w c') 140 | # Image.fromarray(x_sample.astype(np.uint8)).save() 141 | stage1_image_path = os.path.join(save_path_stage1, '%d.png'%(stage1_idx)) 142 | 143 | raw_im = Image.open(stage1_image_path) 144 | # input_im_init = preprocess_image(models, raw_im, preprocess=False) 145 | input_im_init = np.asarray(raw_im, dtype=np.float32) #/ 255.0 146 | input_im_init[input_im_init >= 253.0] = 255.0 147 | input_im_init = input_im_init / 255.0 148 | input_im = transforms.ToTensor()(input_im_init).unsqueeze(0).to(device) 149 | input_im = input_im * 2 - 1 150 | # infer stage 2 151 | sampler = DDIMSampler(model) 152 | # sampler.to(device) 153 | # stage2_in = x_samples_ddims[stage1_idx][None, ...].to(device) * 2 - 1 154 | x_samples_ddims_stage2 = sample_model_batch(model, sampler, input_im, delta_x_2, delta_y_2, n_samples=len(delta_x_2), ddim_steps=ddim_steps, scale=scale) 155 | for stage2_idx in range(len(delta_x_2)): 156 | x_sample_stage2 = 255.0 * rearrange(x_samples_ddims_stage2[stage2_idx].numpy(), 'c h w -> h w c') 157 | Image.fromarray(x_sample_stage2.astype(np.uint8)).save(os.path.join(save_path_stage2, '%d_%d.png'%(stage1_idx, stage2_idx))) 158 | del input_im 159 | del x_samples_ddims_stage2 160 | torch.cuda.empty_cache() 161 | 162 | def zero123_infer(model, input_dir_path, start_idx=0, end_idx=12, indices=None, device="cuda", ddim_steps=75, scale=3.0): 163 | # input_img_path = os.path.join(input_dir_path, "input_256.png") 164 | save_path_8 = os.path.join(input_dir_path, "stage1_8") 165 | save_path_8_2 = os.path.join(input_dir_path, "stage2_8") 166 | os.makedirs(save_path_8_2, exist_ok=True) 167 | 168 | # raw_im = Image.open(input_img_path) 169 | # # input_im_init = preprocess_image(models, raw_im, preprocess=False) 170 | # input_im_init = np.asarray(raw_im, dtype=np.float32) / 255.0 171 | # input_im = transforms.ToTensor()(input_im_init).unsqueeze(0).to(device) 172 | # input_im = input_im * 2 - 1 173 | 174 | # stage 2: 6*4 or 8*4 175 | delta_x_2 = [-10, 10, 0, 0] 176 | delta_y_2 = [0, 0, -10, 10] 177 | 178 | infer_stage_2(model, save_path_8, save_path_8_2, delta_x_2, delta_y_2, indices=indices if indices else list(range(start_idx,end_idx)), device=device, ddim_steps=ddim_steps, scale=scale) 179 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/util.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import torch 4 | from torch import optim 5 | import numpy as np 6 | 7 | from inspect import isfunction 8 | from PIL import Image, ImageDraw, ImageFont 9 | 10 | import os 11 | import 
numpy as np
12 | import matplotlib.pyplot as plt
13 | from PIL import Image
14 | import torch
15 | import time
16 | import cv2
17 | import PIL
18 | 
19 | def pil_rectangle_crop(im):
20 |     width, height = im.size   # Get dimensions
21 | 
22 |     if width <= height:
23 |         left = 0
24 |         right = width
25 |         top = (height - width)/2
26 |         bottom = (height + width)/2
27 |     else:
28 | 
29 |         top = 0
30 |         bottom = height
31 |         left = (width - height) / 2
32 |         right = (width + height) / 2
33 | 
34 |     # Crop the center of the image
35 |     im = im.crop((left, top, right, bottom))
36 |     return im
37 | 
38 | def add_margin(pil_img, color, size=256):
39 |     width, height = pil_img.size
40 |     result = Image.new(pil_img.mode, (size, size), color)
41 |     result.paste(pil_img, ((size - width) // 2, (size - height) // 2))
42 |     return result
43 | 
44 | def load_and_preprocess(interface, input_im):
45 |     '''
46 |     :param input_im (PIL Image).
47 |     :return image (H, W, 3) array in [0, 255].
48 |     '''
49 |     # See https://github.com/Ir1d/image-background-remove-tool
50 |     image = input_im.convert('RGB')
51 | 
52 |     image_without_background = interface([image])[0]
53 |     image_without_background = np.array(image_without_background)
54 |     est_seg = image_without_background > 127
55 |     image = np.array(image)
56 |     foreground = est_seg[:, :, -1].astype(np.bool_)
57 |     image[~foreground] = [255., 255., 255.]
58 |     x, y, w, h = cv2.boundingRect(foreground.astype(np.uint8))
59 |     image = image[y:y+h, x:x+w, :]
60 |     image = PIL.Image.fromarray(np.array(image))
61 | 
62 |     # resize image such that the long edge is at most 200, then pad to 256
63 |     image.thumbnail([200, 200], Image.Resampling.LANCZOS)
64 |     image = add_margin(image, (255, 255, 255), size=256)
65 |     image = np.array(image)
66 | 
67 |     return image
68 | 
69 | 
70 | def log_txt_as_img(wh, xc, size=10):
71 |     # wh a tuple of (width, height)
72 |     # xc a list of captions to plot
73 |     b = len(xc)
74 |     txts = list()
75 |     for bi in range(b):
76 |         txt = Image.new("RGB", wh, color="white")
77 |         draw = ImageDraw.Draw(txt)
78 |         font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
79 |         nc = int(40 * (wh[0] / 256))
80 |         lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
81 | 
82 |         try:
83 |             draw.text((0, 0), lines, fill="black", font=font)
84 |         except UnicodeEncodeError:
85 |             print("Can't encode string for logging. Skipping.")
86 | 
87 |         txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
88 |         txts.append(txt)
89 |     txts = np.stack(txts)
90 |     txts = torch.tensor(txts)
91 |     return txts
92 | 
93 | 
94 | def ismap(x):
95 |     if not isinstance(x, torch.Tensor):
96 |         return False
97 |     return (len(x.shape) == 4) and (x.shape[1] > 3)
98 | 
99 | 
100 | def isimage(x):
101 |     if not isinstance(x, torch.Tensor):
102 |         return False
103 |     return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
104 | 
105 | 
106 | def exists(x):
107 |     return x is not None
108 | 
109 | 
110 | def default(val, d):
111 |     if exists(val):
112 |         return val
113 |     return d() if isfunction(d) else d
114 | 
115 | 
116 | def mean_flat(tensor):
117 |     """
118 |     https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
119 |     Take the mean over all non-batch dimensions.
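    E.g. a tensor of shape (B, C, H, W) is reduced to a length-B vector of per-sample means.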
120 | """ 121 | return tensor.mean(dim=list(range(1, len(tensor.shape)))) 122 | 123 | 124 | def count_params(model, verbose=False): 125 | total_params = sum(p.numel() for p in model.parameters()) 126 | if verbose: 127 | print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") 128 | return total_params 129 | 130 | 131 | def instantiate_from_config(config): 132 | if not "target" in config: 133 | if config == '__is_first_stage__': 134 | return None 135 | elif config == "__is_unconditional__": 136 | return None 137 | raise KeyError("Expected key `target` to instantiate.") 138 | return get_obj_from_str(config["target"])(**config.get("params", dict())) 139 | 140 | 141 | def get_obj_from_str(string, reload=False): 142 | module, cls = string.rsplit(".", 1) 143 | if reload: 144 | module_imp = importlib.import_module(module) 145 | importlib.reload(module_imp) 146 | return getattr(importlib.import_module(module, package=None), cls) 147 | 148 | 149 | class AdamWwithEMAandWings(optim.Optimizer): 150 | # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 151 | def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using 152 | weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code 153 | ema_power=1., param_names=()): 154 | """AdamW that saves EMA versions of the parameters.""" 155 | if not 0.0 <= lr: 156 | raise ValueError("Invalid learning rate: {}".format(lr)) 157 | if not 0.0 <= eps: 158 | raise ValueError("Invalid epsilon value: {}".format(eps)) 159 | if not 0.0 <= betas[0] < 1.0: 160 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 161 | if not 0.0 <= betas[1] < 1.0: 162 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 163 | if not 0.0 <= weight_decay: 164 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 165 | if not 0.0 <= ema_decay <= 1.0: 166 | raise ValueError("Invalid ema_decay value: {}".format(ema_decay)) 167 | defaults = dict(lr=lr, betas=betas, eps=eps, 168 | weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay, 169 | ema_power=ema_power, param_names=param_names) 170 | super().__init__(params, defaults) 171 | 172 | def __setstate__(self, state): 173 | super().__setstate__(state) 174 | for group in self.param_groups: 175 | group.setdefault('amsgrad', False) 176 | 177 | @torch.no_grad() 178 | def step(self, closure=None): 179 | """Performs a single optimization step. 180 | Args: 181 | closure (callable, optional): A closure that reevaluates the model 182 | and returns the loss. 
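        Returns:
            The value returned by `closure` if one is supplied, otherwise None.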
183 | """ 184 | loss = None 185 | if closure is not None: 186 | with torch.enable_grad(): 187 | loss = closure() 188 | 189 | for group in self.param_groups: 190 | params_with_grad = [] 191 | grads = [] 192 | exp_avgs = [] 193 | exp_avg_sqs = [] 194 | ema_params_with_grad = [] 195 | state_sums = [] 196 | max_exp_avg_sqs = [] 197 | state_steps = [] 198 | amsgrad = group['amsgrad'] 199 | beta1, beta2 = group['betas'] 200 | ema_decay = group['ema_decay'] 201 | ema_power = group['ema_power'] 202 | 203 | for p in group['params']: 204 | if p.grad is None: 205 | continue 206 | params_with_grad.append(p) 207 | if p.grad.is_sparse: 208 | raise RuntimeError('AdamW does not support sparse gradients') 209 | grads.append(p.grad) 210 | 211 | state = self.state[p] 212 | 213 | # State initialization 214 | if len(state) == 0: 215 | state['step'] = 0 216 | # Exponential moving average of gradient values 217 | state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) 218 | # Exponential moving average of squared gradient values 219 | state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 220 | if amsgrad: 221 | # Maintains max of all exp. moving avg. of sq. grad. values 222 | state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 223 | # Exponential moving average of parameter values 224 | state['param_exp_avg'] = p.detach().float().clone() 225 | 226 | exp_avgs.append(state['exp_avg']) 227 | exp_avg_sqs.append(state['exp_avg_sq']) 228 | ema_params_with_grad.append(state['param_exp_avg']) 229 | 230 | if amsgrad: 231 | max_exp_avg_sqs.append(state['max_exp_avg_sq']) 232 | 233 | # update the steps for each param group update 234 | state['step'] += 1 235 | # record the step after step update 236 | state_steps.append(state['step']) 237 | 238 | optim._functional.adamw(params_with_grad, 239 | grads, 240 | exp_avgs, 241 | exp_avg_sqs, 242 | max_exp_avg_sqs, 243 | state_steps, 244 | amsgrad=amsgrad, 245 | beta1=beta1, 246 | beta2=beta2, 247 | lr=group['lr'], 248 | weight_decay=group['weight_decay'], 249 | eps=group['eps'], 250 | maximize=False) 251 | 252 | cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power) 253 | for param, ema_param in zip(params_with_grad, ema_params_with_grad): 254 | ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay) 255 | 256 | return loss --------------------------------------------------------------------------------
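A minimal usage sketch for the config helpers in pose_synthesis/ldm/util.py above, assuming pose_synthesis/ is on the import path (as the repository's own modules assume). The `target`/`params` layout mirrors configs/sd-objaverse-finetune-c_concat-256.yaml; the `torch.nn.Linear` target below is purely illustrative (any importable dotted path works) and is not a class from this repository.

from ldm.util import get_obj_from_str, instantiate_from_config

# A config is just a mapping with a dotted "target" path plus optional keyword "params";
# instantiate_from_config imports the class and calls it with those arguments.
config = {
    "target": "torch.nn.Linear",                        # illustrative target, not part of this repo
    "params": {"in_features": 768, "out_features": 2},
}

layer = instantiate_from_config(config)                 # equivalent to torch.nn.Linear(768, 2)
assert type(layer) is get_obj_from_str(config["target"])

# The sentinel strings handled in instantiate_from_config simply short-circuit to None.
assert instantiate_from_config("__is_unconditional__") is None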