├── pose_synthesis ├── zero123-xl.ckpt ├── ldm │ ├── data │ │ ├── __init__.py │ │ ├── inpainting │ │ │ ├── __init__.py │ │ │ └── synthetic_mask.py │ │ ├── dummy.py │ │ ├── base.py │ │ ├── lsun.py │ │ └── nerf_like.py │ ├── models │ │ └── diffusion │ │ │ ├── __init__.py │ │ │ └── sampling_util.py │ ├── modules │ │ ├── encoders │ │ │ └── __init__.py │ │ ├── distributions │ │ │ ├── __init__.py │ │ │ └── distributions.py │ │ ├── diffusionmodules │ │ │ └── __init__.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── contperceptual.py │ │ │ └── vqperceptual.py │ │ ├── image_degradation │ │ │ ├── utils │ │ │ │ └── test.png │ │ │ └── __init__.py │ │ ├── ema.py │ │ └── evaluate │ │ │ ├── ssim.py │ │ │ └── frechet_video_distance.py │ ├── thirdp │ │ └── psp │ │ │ ├── id_loss.py │ │ │ ├── model_irse.py │ │ │ └── helpers.py │ ├── extras.py │ ├── guidance.py │ ├── lr_scheduler.py │ └── util.py ├── sam_vit_h_4b8939.pth ├── elevation_estimate │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── weights │ │ │ └── .gitkeep │ │ ├── utils3d.py │ │ ├── plotting.py │ │ └── elev_est_api.py │ ├── .gitignore │ ├── loftr │ │ ├── __init__.py │ │ ├── loftr_module │ │ │ ├── __init__.py │ │ │ ├── fine_preprocess.py │ │ │ ├── linear_attention.py │ │ │ └── transformer.py │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ └── resnet_fpn.py │ │ ├── utils │ │ │ ├── cvpr_ds_config.py │ │ │ ├── position_encoding.py │ │ │ ├── geometry.py │ │ │ ├── fine_matching.py │ │ │ └── supervision.py │ │ └── loftr.py │ ├── pyproject.toml │ └── estimate_wild_imgs.py ├── download_ckpt.py ├── utils │ ├── sam_utils.py │ ├── utils.py │ └── zero123_utils.py ├── pose_synthesis_batch.py ├── configs │ └── sd-objaverse-finetune-c_concat-256.yaml ├── run.py └── README.md ├── .gitignore ├── imgs ├── demo │ ├── Intro.jpg │ └── pipeline.jpg ├── sofa_set │ ├── sofa_10_a.png │ ├── sofa_11_a.png │ ├── sofa_12_a.png │ ├── sofa_13_a.png │ ├── sofa_14_a.png │ ├── sofa_15_a.png │ ├── sofa_16_a.png │ ├── sofa_17_a.png │ ├── sofa_18_a.png │ ├── sofa_19_a.png │ ├── sofa_1_a.png │ ├── sofa_20_a.png │ ├── sofa_21_a.png │ ├── sofa_22_a.png │ ├── sofa_23_a.png │ ├── sofa_24_a.png │ ├── sofa_25_a.png │ ├── sofa_26_a.png │ ├── sofa_27_a.png │ ├── sofa_28_a.png │ ├── sofa_2_a.png │ ├── sofa_3_a.png │ ├── sofa_4_a.png │ ├── sofa_5_a.png │ ├── sofa_6_a.png │ ├── sofa_7_a.png │ ├── sofa_8_a.png │ ├── sofa_9_a.png │ ├── sofa_bg_a.png │ ├── sofa_bg_b.png │ ├── sofa_bg_f1.png │ └── sofa_bg_f2.png └── synthesized_imgs │ └── sofa_1_a │ ├── -30_0.png │ ├── 0_-10.png │ ├── 0_-20.png │ ├── 0_-30.png │ ├── 0_-40.png │ ├── 0_0.png │ ├── 0_10.png │ ├── 0_120.png │ ├── 0_20.png │ ├── 0_30.png │ ├── 0_40.png │ ├── 0_50.png │ ├── 0_60.png │ ├── 0_90.png │ ├── 10_10.png │ ├── 10_20.png │ ├── 10_40.png │ ├── 10_50.png │ ├── -10_10.png │ ├── -10_20.png │ ├── -10_40.png │ ├── -10_50.png │ ├── -30_120.png │ ├── -30_30.png │ ├── -30_60.png │ ├── -30_90.png │ ├── 10_-10.png │ ├── 10_-20.png │ ├── 10_-30.png │ └── 10_-40.png ├── obj_name_synthesis.py ├── requirements.txt ├── util.py ├── pose_estimation.py ├── README.md └── train_pose_estimator.py /pose_synthesis/zero123-xl.ckpt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/sam_vit_h_4b8939.pth: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | **/.DS_Store 3 | /tmp -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/inpainting/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/weights/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .idea/ 3 | *.egg-info/ 4 | -------------------------------------------------------------------------------- /imgs/demo/Intro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/demo/Intro.jpg -------------------------------------------------------------------------------- /imgs/demo/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/demo/pipeline.jpg -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_10_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_10_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_11_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_11_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_12_a.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_12_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_13_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_13_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_14_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_14_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_15_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_15_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_16_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_16_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_17_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_17_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_18_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_18_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_19_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_19_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_1_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_1_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_20_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_20_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_21_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_21_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_22_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_22_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_23_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_23_a.png 
-------------------------------------------------------------------------------- /imgs/sofa_set/sofa_24_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_24_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_25_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_25_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_26_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_26_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_27_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_27_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_28_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_28_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_2_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_2_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_3_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_3_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_4_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_4_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_5_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_5_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_6_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_6_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_7_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_7_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_8_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_8_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_9_a.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_9_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_a.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_b.png -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_f1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_f1.png -------------------------------------------------------------------------------- /imgs/sofa_set/sofa_bg_f2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/sofa_set/sofa_bg_f2.png -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/__init__.py: -------------------------------------------------------------------------------- 1 | from .loftr import LoFTR 2 | from .utils.cvpr_ds_config import default_cfg 3 | -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_0.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_-40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_0.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_120.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_50.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_60.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/0_90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/0_90.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_40.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_50.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_20.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_40.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-10_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-10_50.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_120.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_60.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/-30_90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/-30_90.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-10.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-20.png 
-------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-30.png -------------------------------------------------------------------------------- /imgs/synthesized_imgs/sofa_1_a/10_-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/imgs/synthesized_imgs/sofa_1_a/10_-40.png -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import LocalFeatureTransformer 2 | from .fine_preprocess import FinePreprocess 3 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collovlabs/ViewControl/HEAD/pose_synthesis/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "elevation_estimate" 3 | version = "0.1" 4 | 5 | [tool.setuptools.packages.find] 6 | exclude = ["configs", "tests"] # empty by default 7 | namespaces = false # true by default -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/estimate_wild_imgs.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from .utils.elev_est_api import elev_est_api 3 | 4 | def estimate_elev(root_dir): 5 | img_dir = osp.join(root_dir, "stage2_8") 6 | img_paths = [] 7 | for i in range(4): 8 | img_paths.append(f"{img_dir}/0_{i}.png") 9 | elev = elev_est_api(img_paths) 10 | return elev 11 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_fpn import ResNetFPN_8_2, ResNetFPN_16_4 2 | 3 | 4 | def build_backbone(config): 5 | if config['backbone_type'] == 'ResNetFPN': 6 | if config['resolution'] == (8, 2): 7 | return ResNetFPN_8_2(config['resnetfpn']) 8 | elif config['resolution'] == (16, 4): 9 | return ResNetFPN_16_4(config['resnetfpn']) 10 | else: 11 | raise ValueError(f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.") 12 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/thirdp/psp/id_loss.py: -------------------------------------------------------------------------------- 1 | # 
https://github.com/eladrich/pixel2style2pixel 2 | import torch 3 | from torch import nn 4 | from ldm.thirdp.psp.model_irse import Backbone 5 | 6 | 7 | class IDFeatures(nn.Module): 8 | def __init__(self, model_path): 9 | super(IDFeatures, self).__init__() 10 | print('Loading ResNet ArcFace') 11 | self.facenet = Backbone(input_size=112, num_layers=50, drop_ratio=0.6, mode='ir_se') 12 | self.facenet.load_state_dict(torch.load(model_path, map_location="cpu")) 13 | self.face_pool = torch.nn.AdaptiveAvgPool2d((112, 112)) 14 | self.facenet.eval() 15 | 16 | def forward(self, x, crop=False): 17 | # Not sure of the image range here 18 | if crop: 19 | x = torch.nn.functional.interpolate(x, (256, 256), mode="area") 20 | x = x[:, :, 35:223, 32:220] 21 | x = self.face_pool(x) 22 | x_feats = self.facenet(x) 23 | return x_feats 24 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/dummy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import string 4 | from torch.utils.data import Dataset, Subset 5 | 6 | class DummyData(Dataset): 7 | def __init__(self, length, size): 8 | self.length = length 9 | self.size = size 10 | 11 | def __len__(self): 12 | return self.length 13 | 14 | def __getitem__(self, i): 15 | x = np.random.randn(*self.size) 16 | letters = string.ascii_lowercase 17 | y = ''.join(random.choice(string.ascii_lowercase) for i in range(10)) 18 | return {"jpg": x, "txt": y} 19 | 20 | 21 | class DummyDataWithEmbeddings(Dataset): 22 | def __init__(self, length, size, emb_size): 23 | self.length = length 24 | self.size = size 25 | self.emb_size = emb_size 26 | 27 | def __len__(self): 28 | return self.length 29 | 30 | def __getitem__(self, i): 31 | x = np.random.randn(*self.size) 32 | y = np.random.randn(*self.emb_size).astype(np.float32) 33 | return {"jpg": x, "txt": y} 34 | 35 | -------------------------------------------------------------------------------- /obj_name_synthesis.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from transformers import AutoProcessor, Blip2ForConditionalGeneration 4 | import torch 5 | from diffusers.utils import load_image 6 | import argparse 7 | 8 | 9 | 10 | def main(input_path, output_path): 11 | 12 | processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") 13 | model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) 14 | 15 | prompt = "" 16 | image = load_image(input_path) 17 | 18 | device = "cuda" if torch.cuda.is_available() else "cpu" 19 | model.to(device) 20 | 21 | inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16) 22 | 23 | generated_ids = model.generate(**inputs, max_new_tokens=20) 24 | generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() 25 | 26 | with open(output_path, "w") as f: 27 | f.write(generated_text) 28 | 29 | print("image from {} captioned as {}".format(input_path, generated_text)) 30 | 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument("--input_path", type=str, default="./input_images_path") 35 | parser.add_argument("--output_path", type=str, default="./output_caption_path") 36 | 37 | args = parser.parse_args() 38 | main(args.input_path, args.output_path) 39 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/base.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from abc import abstractmethod 4 | from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset 5 | 6 | 7 | class Txt2ImgIterableBaseDataset(IterableDataset): 8 | ''' 9 | Define an interface to make the IterableDatasets for text2img data chainable 10 | ''' 11 | def __init__(self, num_records=0, valid_ids=None, size=256): 12 | super().__init__() 13 | self.num_records = num_records 14 | self.valid_ids = valid_ids 15 | self.sample_ids = valid_ids 16 | self.size = size 17 | 18 | print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.') 19 | 20 | def __len__(self): 21 | return self.num_records 22 | 23 | @abstractmethod 24 | def __iter__(self): 25 | pass 26 | 27 | 28 | class PRNGMixin(object): 29 | """ 30 | Adds a prng property which is a numpy RandomState which gets 31 | reinitialized whenever the pid changes to avoid synchronized sampling 32 | behavior when used in conjunction with multiprocessing. 33 | """ 34 | @property 35 | def prng(self): 36 | currentpid = os.getpid() 37 | if getattr(self, "_initpid", None) != currentpid: 38 | self._initpid = currentpid 39 | self._prng = np.random.RandomState() 40 | return self._prng 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | Pillow 3 | 4 | albumentations>=1.3.1 5 | opencv-python>=4.8.0.76 6 | pudb>=2022.1.3 7 | imageio>=2.31.1 8 | imageio-ffmpeg>=0.4.8 9 | pytorch-lightning>=2.0.6 10 | omegaconf>=2.3.0 11 | test-tube>=0.7.5 12 | streamlit>=1.25.0 13 | einops>=0.6.1 14 | torch-fidelity>=0.3.0 15 | transformers>=4.31.0 16 | kornia>=0.7.0 17 | webdataset>=0.2.48 18 | torchmetrics>=1.0.3 19 | fire>=0.5.0 20 | gradio>=3.40.1 21 | diffusers>=0.19.3 22 | datasets[vision]>=2.14.4 23 | rich>=13.5.2 24 | plotly>=5.16.0 25 | -e git+https://github.com/CompVis/taming-transformers.git#egg=taming-transformers 26 | # elev est 27 | dl_ext>=1.3.4 28 | loguru>=0.7.0 29 | matplotlib>=3.7.2 30 | multipledispatch>=1.0.0 31 | packaging>=23.1 32 | Pillow>=9.3.0 33 | PyYAML>=6.0.1 34 | scikit_image>=0.21.0 35 | scikit_learn>=1.3.0 36 | scipy>=1.11.1 37 | setuptools>=59.6.0 38 | tensorboardX>=2.6.2 39 | tqdm>=4.66.1 40 | transforms3d>=0.4.1 41 | trimesh>=3.23.1 42 | yacs>=0.1.8 43 | gdown>=4.7.1 44 | git+https://github.com/NVlabs/nvdiffrast.git 45 | git+https://github.com/openai/CLIP.git 46 | # segment anything 47 | onnxruntime>=1.15.1 48 | onnx>=1.14.0 49 | git+https://github.com/facebookresearch/segment-anything.git 50 | # rembg 51 | rembg>=2.0.50 52 | # reconstruction 53 | pyhocon>=0.3.60 54 | icecream>=2.1.3 55 | PyMCubes>=0.1.4 56 | ninja>=1.11.1 57 | # juypter 58 | jupyter>=1.0.0 59 | jupyterlab>=4.0.5 60 | ipywidgets>=8.1.0 61 | ipykernel>=6.25.1 62 | panel>=1.2.1 63 | jupyter_bokeh>=3.0.7 64 | 65 | -------------------------------------------------------------------------------- /pose_synthesis/download_ckpt.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from tqdm import tqdm 3 | 4 | def download_checkpoint(url, save_path): 5 | try: 6 | with urllib.request.urlopen(url) as response, open(save_path, 'wb') as file: 7 | file_size = int(response.info().get('Content-Length', -1)) 8 | chunk_size = 8192 9 | num_chunks = file_size // chunk_size if file_size > chunk_size else 1 10 | 11 | with 
tqdm(total=file_size, unit='B', unit_scale=True, desc='Downloading', ncols=100) as pbar: 12 | for chunk in iter(lambda: response.read(chunk_size), b''): 13 | file.write(chunk) 14 | pbar.update(len(chunk)) 15 | 16 | print(f"Checkpoint downloaded and saved to: {save_path}") 17 | except Exception as e: 18 | print(f"Error downloading checkpoint: {e}") 19 | 20 | if __name__ == "__main__": 21 | ckpts = { 22 | "sam_vit_h_4b8939.pth": "https://huggingface.co/One-2-3-45/code/resolve/main/sam_vit_h_4b8939.pth", 23 | "zero123-xl.ckpt": "https://huggingface.co/One-2-3-45/code/resolve/main/zero123-xl.ckpt", 24 | "elevation_estimate/utils/weights/indoor_ds_new.ckpt" : "https://huggingface.co/One-2-3-45/code/resolve/main/elevation_estimate/utils/weights/indoor_ds_new.ckpt" 25 | } 26 | for ckpt_name, ckpt_url in ckpts.items(): 27 | print(f"Downloading checkpoint: {ckpt_name}") 28 | download_checkpoint(ckpt_url, ckpt_name) 29 | 30 | -------------------------------------------------------------------------------- /pose_synthesis/utils/sam_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | from PIL import Image 5 | import time 6 | 7 | from segment_anything import sam_model_registry, SamPredictor 8 | 9 | def sam_init(device_id=0): 10 | sam_checkpoint = os.path.join(os.path.dirname(__file__), "../sam_vit_h_4b8939.pth") 11 | model_type = "vit_h" 12 | 13 | device = "cuda:{}".format(device_id) if torch.cuda.is_available() else "cpu" 14 | 15 | sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device=device) 16 | predictor = SamPredictor(sam) 17 | return predictor 18 | 19 | def sam_out_nosave(predictor, input_image, *bbox_sliders): 20 | bbox = np.array(bbox_sliders) 21 | image = np.asarray(input_image) 22 | 23 | start_time = time.time() 24 | predictor.set_image(image) 25 | 26 | h, w, _ = image.shape 27 | input_point = np.array([[h//2, w//2]]) 28 | input_label = np.array([1]) 29 | 30 | masks, scores, logits = predictor.predict( 31 | point_coords=input_point, 32 | point_labels=input_label, 33 | multimask_output=True, 34 | ) 35 | 36 | masks_bbox, scores_bbox, logits_bbox = predictor.predict( 37 | box=bbox, 38 | multimask_output=True 39 | ) 40 | 41 | print(f"SAM Time: {time.time() - start_time:.3f}s") 42 | opt_idx = np.argmax(scores) 43 | mask = masks[opt_idx] 44 | out_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8) 45 | out_image[:, :, :3] = image 46 | out_image_bbox = out_image.copy() 47 | out_image[:, :, 3] = mask.astype(np.uint8) * 255 48 | out_image_bbox[:, :, 3] = masks_bbox[-1].astype(np.uint8) * 255 # np.argmax(scores_bbox) 49 | torch.cuda.empty_cache() 50 | return Image.fromarray(out_image_bbox, mode='RGBA') -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/cvpr_ds_config.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | 4 | def lower_config(yacs_cfg): 5 | if not isinstance(yacs_cfg, CN): 6 | return yacs_cfg 7 | return {k.lower(): lower_config(v) for k, v in yacs_cfg.items()} 8 | 9 | 10 | _CN = CN() 11 | _CN.BACKBONE_TYPE = 'ResNetFPN' 12 | _CN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)] 13 | _CN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd 14 | _CN.FINE_CONCAT_COARSE_FEAT = True 15 | 16 | # 1. 
LoFTR-backbone (local feature CNN) config 17 | _CN.RESNETFPN = CN() 18 | _CN.RESNETFPN.INITIAL_DIM = 128 19 | _CN.RESNETFPN.BLOCK_DIMS = [128, 196, 256] # s1, s2, s3 20 | 21 | # 2. LoFTR-coarse module config 22 | _CN.COARSE = CN() 23 | _CN.COARSE.D_MODEL = 256 24 | _CN.COARSE.D_FFN = 256 25 | _CN.COARSE.NHEAD = 8 26 | _CN.COARSE.LAYER_NAMES = ['self', 'cross'] * 4 27 | _CN.COARSE.ATTENTION = 'linear' # options: ['linear', 'full'] 28 | _CN.COARSE.TEMP_BUG_FIX = False 29 | 30 | # 3. Coarse-Matching config 31 | _CN.MATCH_COARSE = CN() 32 | _CN.MATCH_COARSE.THR = 0.2 33 | _CN.MATCH_COARSE.BORDER_RM = 2 34 | _CN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn'] 35 | _CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1 36 | _CN.MATCH_COARSE.SKH_ITERS = 3 37 | _CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0 38 | _CN.MATCH_COARSE.SKH_PREFILTER = True 39 | _CN.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.4 # training tricks: save GPU memory 40 | _CN.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200 # training tricks: avoid DDP deadlock 41 | 42 | # 4. LoFTR-fine module config 43 | _CN.FINE = CN() 44 | _CN.FINE.D_MODEL = 128 45 | _CN.FINE.D_FFN = 128 46 | _CN.FINE.NHEAD = 8 47 | _CN.FINE.LAYER_NAMES = ['self', 'cross'] * 1 48 | _CN.FINE.ATTENTION = 'linear' 49 | 50 | default_cfg = lower_config(_CN) 51 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/models/diffusion/sampling_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def append_dims(x, target_dims): 6 | """Appends dimensions to the end of a tensor until it has target_dims dimensions. 7 | From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" 8 | dims_to_append = target_dims - x.ndim 9 | if dims_to_append < 0: 10 | raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') 11 | return x[(...,) + (None,) * dims_to_append] 12 | 13 | 14 | def renorm_thresholding(x0, value): 15 | # renorm 16 | pred_max = x0.max() 17 | pred_min = x0.min() 18 | pred_x0 = (x0 - pred_min) / (pred_max - pred_min) # 0 ... 1 19 | pred_x0 = 2 * pred_x0 - 1. # -1 ... 1 20 | 21 | s = torch.quantile( 22 | rearrange(pred_x0, 'b ... -> b (...)').abs(), 23 | value, 24 | dim=-1 25 | ) 26 | s.clamp_(min=1.0) 27 | s = s.view(-1, *((1,) * (pred_x0.ndim - 1))) 28 | 29 | # clip by threshold 30 | # pred_x0 = pred_x0.clamp(-s, s) / s # needs newer pytorch # TODO bring back to pure-gpu with min/max 31 | 32 | # temporary hack: numpy on cpu 33 | pred_x0 = np.clip(pred_x0.cpu().numpy(), -s.cpu().numpy(), s.cpu().numpy()) / s.cpu().numpy() 34 | pred_x0 = torch.tensor(pred_x0).to(self.model.device) 35 | 36 | # re.renorm 37 | pred_x0 = (pred_x0 + 1.) / 2. # 0 ... 
1 38 | pred_x0 = (pred_max - pred_min) * pred_x0 + pred_min # orig range 39 | return pred_x0 40 | 41 | 42 | def norm_thresholding(x0, value): 43 | s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) 44 | return x0 * (value / s) 45 | 46 | 47 | def spatial_norm_thresholding(x0, value): 48 | # b c h w 49 | s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) 50 | return x0 * (value / s) -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from PIL import Image 3 | from lang_sam import LangSAM 4 | import numpy as np 5 | import argparse 6 | 7 | 8 | def segmentation(image, text, output_path): 9 | 10 | model = LangSAM(sam_type="vit_h") 11 | 12 | def predict(image_path, text_prompt, box_threshold=0.3, text_threshold=0.25): 13 | if isinstance(image_path, str): 14 | image_pil = Image.open(image_path).convert("RGB") 15 | else: 16 | # bug here, need to be improved 17 | image_pil = image_path 18 | masks, boxes, phrases, logits = model.predict(image_pil, text_prompt, box_threshold, text_threshold) 19 | labels = [f"{phrase} {logit:.2f}" for phrase, logit in zip(phrases, logits)] 20 | image_array = np.asarray(image_pil.convert("RGBA")) 21 | 22 | output_image = np.zeros_like(image_array) 23 | output_image[:,:,3] = 255 24 | output_image[:,:,0:3] = image_array[:,:,0:3] 25 | 26 | for i in range(len(masks)): 27 | mask = masks[i] 28 | mask = np.expand_dims(mask, axis=2) 29 | mask = np.repeat(mask, 4, axis=2) 30 | mask = mask.astype(np.uint8) 31 | mask = mask * 255 32 | output_image = np.where(mask > 0, output_image, 0) 33 | 34 | output_image = Image.fromarray(np.uint8(output_image)).convert("RGBA") 35 | 36 | output_image.save(output_path) 37 | 38 | 39 | predict(image, text) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("--input_path", type=str, default="./input_images_path") 45 | parser.add_argument("--prompt", type=str, default="sofa") 46 | parser.add_argument("--output_path", type=str, default="./output_images_path") 47 | 48 | args = parser.parse_args() 49 | segmentation(args.input_path, args.prompt, args.output_path) -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/position_encoding.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch import nn 4 | 5 | 6 | class PositionEncodingSine(nn.Module): 7 | """ 8 | This is a sinusoidal position encoding that generalized to 2-dimensional images 9 | """ 10 | 11 | def __init__(self, d_model, max_shape=(256, 256), temp_bug_fix=True): 12 | """ 13 | Args: 14 | max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels 15 | temp_bug_fix (bool): As noted in this [issue](https://github.com/zju3dv/LoFTR/issues/41), 16 | the original implementation of LoFTR includes a bug in the pos-enc impl, which has little impact 17 | on the final performance. For now, we keep both impls for backward compatability. 18 | We will remove the buggy impl after re-training all variants of our released models. 
19 | """ 20 | super().__init__() 21 | 22 | pe = torch.zeros((d_model, *max_shape)) 23 | y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0) 24 | x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0) 25 | if temp_bug_fix: 26 | div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / (d_model//2))) 27 | else: # a buggy implementation (for backward compatability only) 28 | div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / d_model//2)) 29 | div_term = div_term[:, None, None] # [C//4, 1, 1] 30 | pe[0::4, :, :] = torch.sin(x_position * div_term) 31 | pe[1::4, :, :] = torch.cos(x_position * div_term) 32 | pe[2::4, :, :] = torch.sin(y_position * div_term) 33 | pe[3::4, :, :] = torch.cos(y_position * div_term) 34 | 35 | self.register_buffer('pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W] 36 | 37 | def forward(self, x): 38 | """ 39 | Args: 40 | x: [N, C, H, W] 41 | """ 42 | return x + self.pe[:, :, :x.size(2), :x.size(3)] 43 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/utils3d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def cart_to_hom(pts): 6 | """ 7 | :param pts: (N, 3 or 2) 8 | :return pts_hom: (N, 4 or 3) 9 | """ 10 | if isinstance(pts, np.ndarray): 11 | pts_hom = np.concatenate((pts, np.ones([*pts.shape[:-1], 1], dtype=np.float32)), -1) 12 | else: 13 | ones = torch.ones([*pts.shape[:-1], 1], dtype=torch.float32, device=pts.device) 14 | pts_hom = torch.cat((pts, ones), dim=-1) 15 | return pts_hom 16 | 17 | 18 | def hom_to_cart(pts): 19 | return pts[..., :-1] / pts[..., -1:] 20 | 21 | 22 | def canonical_to_camera(pts, pose): 23 | pts = cart_to_hom(pts) 24 | pts = pts @ pose.transpose(-1, -2) 25 | pts = hom_to_cart(pts) 26 | return pts 27 | 28 | 29 | def rect_to_img(K, pts_rect): 30 | from dl_ext.vision_ext.datasets.kitti.structures import Calibration 31 | pts_2d_hom = pts_rect @ K.t() 32 | pts_img = Calibration.hom_to_cart(pts_2d_hom) 33 | return pts_img 34 | 35 | 36 | def calc_pose(phis, thetas, size, radius=1.2): 37 | import torch 38 | def normalize(vectors): 39 | return vectors / (torch.norm(vectors, dim=-1, keepdim=True) + 1e-10) 40 | 41 | device = torch.device('cuda') 42 | thetas = torch.FloatTensor(thetas).to(device) 43 | phis = torch.FloatTensor(phis).to(device) 44 | 45 | centers = torch.stack([ 46 | radius * torch.sin(thetas) * torch.sin(phis), 47 | -radius * torch.cos(thetas) * torch.sin(phis), 48 | radius * torch.cos(phis), 49 | ], dim=-1) # [B, 3] 50 | 51 | # lookat 52 | forward_vector = normalize(centers).squeeze(0) 53 | up_vector = torch.FloatTensor([0, 0, 1]).to(device).unsqueeze(0).repeat(size, 1) 54 | right_vector = normalize(torch.cross(up_vector, forward_vector, dim=-1)) 55 | if right_vector.pow(2).sum() < 0.01: 56 | right_vector = torch.FloatTensor([0, 1, 0]).to(device).unsqueeze(0).repeat(size, 1) 57 | up_vector = normalize(torch.cross(forward_vector, right_vector, dim=-1)) 58 | 59 | poses = torch.eye(4, dtype=torch.float, device=device).unsqueeze(0).repeat(size, 1, 1) 60 | poses[:, :3, :3] = torch.stack((right_vector, up_vector, forward_vector), dim=-1) 61 | poses[:, :3, 3] = centers 62 | return poses 63 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/geometry.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | @torch.no_grad() 5 | def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1): 6 | """ Warp kpts0 from I0 to I1 with depth, K and Rt 7 | Also check covisibility and depth consistency. 8 | Depth is consistent if relative error < 0.2 (hard-coded). 9 | 10 | Args: 11 | kpts0 (torch.Tensor): [N, L, 2] - , 12 | depth0 (torch.Tensor): [N, H, W], 13 | depth1 (torch.Tensor): [N, H, W], 14 | T_0to1 (torch.Tensor): [N, 3, 4], 15 | K0 (torch.Tensor): [N, 3, 3], 16 | K1 (torch.Tensor): [N, 3, 3], 17 | Returns: 18 | calculable_mask (torch.Tensor): [N, L] 19 | warped_keypoints0 (torch.Tensor): [N, L, 2] 20 | """ 21 | kpts0_long = kpts0.round().long() 22 | 23 | # Sample depth, get calculable_mask on depth != 0 24 | kpts0_depth = torch.stack( 25 | [depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]] for i in range(kpts0.shape[0])], dim=0 26 | ) # (N, L) 27 | nonzero_mask = kpts0_depth != 0 28 | 29 | # Unproject 30 | kpts0_h = torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1) * kpts0_depth[..., None] # (N, L, 3) 31 | kpts0_cam = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L) 32 | 33 | # Rigid Transform 34 | w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L) 35 | w_kpts0_depth_computed = w_kpts0_cam[:, 2, :] 36 | 37 | # Project 38 | w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3) 39 | w_kpts0 = w_kpts0_h[:, :, :2] / (w_kpts0_h[:, :, [2]] + 1e-4) # (N, L, 2), +1e-4 to avoid zero depth 40 | 41 | # Covisible Check 42 | h, w = depth1.shape[1:3] 43 | covisible_mask = (w_kpts0[:, :, 0] > 0) * (w_kpts0[:, :, 0] < w-1) * \ 44 | (w_kpts0[:, :, 1] > 0) * (w_kpts0[:, :, 1] < h-1) 45 | w_kpts0_long = w_kpts0.long() 46 | w_kpts0_long[~covisible_mask, :] = 0 47 | 48 | w_kpts0_depth = torch.stack( 49 | [depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]] for i in range(w_kpts0_long.shape[0])], dim=0 50 | ) # (N, L) 51 | consistent_mask = ((w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth).abs() < 0.2 52 | valid_mask = nonzero_mask * covisible_mask * consistent_mask 53 | 54 | return valid_mask, w_kpts0 55 | -------------------------------------------------------------------------------- /pose_synthesis/pose_synthesis_batch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import argparse 4 | from multiprocessing import Pool 5 | 6 | def generate_images(args): 7 | x, y, img_path, half_precision, output_path = args 8 | x = "["+x+"]" 9 | y = "["+y+"]" 10 | 11 | command = f"python pose_synthesis.py --x {x} --y {y} --img_path {img_path} --half_precision --output_path {output_path}" 12 | subprocess.run(command, shell=True) 13 | 14 | def main(input_dir, output_dir, x_values, y_values): 15 | if not os.path.isdir(input_dir): 16 | image_files = [] 17 | else: 18 | image_files = os.listdir(input_dir) 19 | 20 | # if input_dir is a single image 21 | if len(image_files) == 0: 22 | image_files = [input_dir.split('/')[-1]] 23 | 24 | for image_file in image_files: 25 | print("Processing image: ", image_file) 26 | image_name = os.path.splitext(image_file)[0] 27 | if len(image_files) == 1: 28 | image_path = input_dir 29 | else: 30 | image_path = os.path.join(input_dir, image_file) 31 | 32 | output_subdir = os.path.join(output_dir, image_name) 33 | if not os.path.exists(output_subdir): 34 | os.makedirs(output_subdir) 35 | 36 | # x_values = [-10,0,10] 37 | # y_values = [0,-10,-20, -30, 
-40,-50,10,20,30,40,50,60] 38 | 39 | args_list = [] 40 | args_list.append((x_values, y_values, image_path, True, output_subdir)) 41 | 42 | # use multiple threads to generate images 43 | with Pool(processes=2) as pool: 44 | pool.map(generate_images, args_list) 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument("--input_dir", type=str, default="./input_images_dir") 49 | parser.add_argument("--output_dir", type=str, default="./output_images_dir") 50 | parser.add_argument("--x_values", type=str, default="0,10", help="comma separated list of x values") 51 | parser.add_argument("--y_values", type=str, default="0,-10", help="comma separated list of y values") 52 | parser.add_argument("--pose_file_path", type=str, default=None) 53 | 54 | args = parser.parse_args() 55 | 56 | if args.pose_file_path: 57 | with open(args.pose_file_path, "r") as f: 58 | args.x_values, args.y_values = f.read().split(' ') 59 | main(args.input_dir, args.output_dir, args.x_values, args.y_values) 60 | -------------------------------------------------------------------------------- /pose_estimation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import argparse 5 | import logging 6 | from PIL import Image 7 | from transformers import AutoImageProcessor, Dinov2Model 8 | vit_model = Dinov2Model.from_pretrained("facebook/dinov2-base") 9 | processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base") 10 | 11 | class RegressionModel(nn.Module): 12 | def __init__(self): 13 | super(RegressionModel, self).__init__() 14 | self.vit = vit_model 15 | # self.fc1 = nn.Linear(768, 768) 16 | # self.fc2 = nn.Linear(768, 768) 17 | self.fc3 = nn.Linear(768, 128) 18 | self.fc4 = nn.Linear(128, 2) 19 | 20 | 21 | for param in self.vit.parameters(): 22 | param.requires_grad = True 23 | 24 | def forward(self, x): 25 | outputs = self.vit(x) 26 | sequence_output = outputs[0] 27 | x = sequence_output[:, 0, :] #[B,768] 28 | 29 | # x = F.relu(self.fc1(x)) 30 | # x = F.relu(self.fc2(x)) 31 | x = F.relu(self.fc3(x)) 32 | x = self.fc4(x) 33 | return x 34 | 35 | 36 | def main(input_path, output_path, model_path, device="cuda"): 37 | # inference 38 | # input a image, and predict R and T 39 | 40 | input_image = Image.open(input_path).convert('RGB') 41 | input_image = processor(images=input_image, return_tensors="pt") 42 | input_image = input_image['pixel_values'].to(device) 43 | 44 | model = RegressionModel().float().to(device) 45 | model.load_state_dict(torch.load(model_path, map_location=device)) 46 | model.eval() 47 | with torch.no_grad(): 48 | prediction = model(input_image)[0] 49 | predicted_R, predicted_T = prediction[0], prediction[1] 50 | # round to integer and write to output_path. 
You may change this part to round to the integer that can be divided by 10 51 | predicted_R = round(predicted_R.item()) 52 | predicted_T = round(predicted_T.item()) 53 | with open(output_path, "w") as f: 54 | f.write(f"{predicted_R} {predicted_T}") 55 | logging.info(f'Predicted R: {predicted_R}, Predicted T: {predicted_T}') 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--input_path", type=str, default="./input_images_path") 61 | parser.add_argument("--output_path", type=str, default="./output_pose_path") 62 | parser.add_argument("--model_path", type=str, default="./model_path") 63 | parser.add_argument("--device", type=str, default="cuda") 64 | 65 | args = parser.parse_args() 66 | main(args.input_path, args.output_path, args.model_path, args.device) -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/fine_preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from einops.einops import rearrange, repeat 5 | 6 | 7 | class FinePreprocess(nn.Module): 8 | def __init__(self, config): 9 | super().__init__() 10 | 11 | self.config = config 12 | self.cat_c_feat = config['fine_concat_coarse_feat'] 13 | self.W = self.config['fine_window_size'] 14 | 15 | d_model_c = self.config['coarse']['d_model'] 16 | d_model_f = self.config['fine']['d_model'] 17 | self.d_model_f = d_model_f 18 | if self.cat_c_feat: 19 | self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True) 20 | self.merge_feat = nn.Linear(2*d_model_f, d_model_f, bias=True) 21 | 22 | self._reset_parameters() 23 | 24 | def _reset_parameters(self): 25 | for p in self.parameters(): 26 | if p.dim() > 1: 27 | nn.init.kaiming_normal_(p, mode="fan_out", nonlinearity="relu") 28 | 29 | def forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data): 30 | W = self.W 31 | stride = data['hw0_f'][0] // data['hw0_c'][0] 32 | 33 | data.update({'W': W}) 34 | if data['b_ids'].shape[0] == 0: 35 | feat0 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device) 36 | feat1 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device) 37 | return feat0, feat1 38 | 39 | # 1. unfold(crop) all local windows 40 | feat_f0_unfold = F.unfold(feat_f0, kernel_size=(W, W), stride=stride, padding=W//2) 41 | feat_f0_unfold = rearrange(feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2) 42 | feat_f1_unfold = F.unfold(feat_f1, kernel_size=(W, W), stride=stride, padding=W//2) 43 | feat_f1_unfold = rearrange(feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2) 44 | 45 | # 2. 
select only the predicted matches 46 | feat_f0_unfold = feat_f0_unfold[data['b_ids'], data['i_ids']] # [n, ww, cf] 47 | feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']] 48 | 49 | # option: use coarse-level loftr feature as context: concat and linear 50 | if self.cat_c_feat: 51 | feat_c_win = self.down_proj(torch.cat([feat_c0[data['b_ids'], data['i_ids']], 52 | feat_c1[data['b_ids'], data['j_ids']]], 0)) # [2n, c] 53 | feat_cf_win = self.merge_feat(torch.cat([ 54 | torch.cat([feat_f0_unfold, feat_f1_unfold], 0), # [2n, ww, cf] 55 | repeat(feat_c_win, 'n c -> n ww c', ww=W**2), # [2n, ww, cf] 56 | ], -1)) 57 | feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0) 58 | 59 | return feat_f0_unfold, feat_f1_unfold 60 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/extras.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from omegaconf import OmegaConf 3 | import torch 4 | from ldm.util import instantiate_from_config 5 | import logging 6 | from contextlib import contextmanager 7 | 8 | from contextlib import contextmanager 9 | import logging 10 | 11 | @contextmanager 12 | def all_logging_disabled(highest_level=logging.CRITICAL): 13 | """ 14 | A context manager that will prevent any logging messages 15 | triggered during the body from being processed. 16 | 17 | :param highest_level: the maximum logging level in use. 18 | This would only need to be changed if a custom level greater than CRITICAL 19 | is defined. 20 | 21 | https://gist.github.com/simon-weber/7853144 22 | """ 23 | # two kind-of hacks here: 24 | # * can't get the highest logging level in effect => delegate to the user 25 | # * can't get the current module-level override => use an undocumented 26 | # (but non-private!) 
interface 27 | 28 | previous_level = logging.root.manager.disable 29 | 30 | logging.disable(highest_level) 31 | 32 | try: 33 | yield 34 | finally: 35 | logging.disable(previous_level) 36 | 37 | def load_training_dir(train_dir, device, epoch="last"): 38 | """Load a checkpoint and config from a training directory""" 39 | train_dir = Path(train_dir) 40 | ckpt = list(train_dir.rglob(f"*{epoch}.ckpt")) 41 | assert len(ckpt) == 1, f"found {len(ckpt)} matching ckpt files" 42 | config = list(train_dir.rglob(f"*-project.yaml")) 43 | assert len(config) > 0, f"didn't find any config in {train_dir}" 44 | if len(config) > 1: 45 | print(f"found {len(config)} matching config files") 46 | config = sorted(config)[-1] 47 | print(f"selecting {config}") 48 | else: 49 | config = config[0] 50 | 51 | 52 | config = OmegaConf.load(config) 53 | return load_model_from_config(config, ckpt[0], device) 54 | 55 | def load_model_from_config(config, ckpt, device="cpu", verbose=False): 56 | """Loads a model from a config and a ckpt; 57 | if config is a path, it is loaded with omegaconf 58 | """ 59 | if isinstance(config, (str, Path)): 60 | config = OmegaConf.load(config) 61 | 62 | with all_logging_disabled(): 63 | print(f"Loading model from {ckpt}") 64 | pl_sd = torch.load(ckpt, map_location="cpu") 65 | global_step = pl_sd["global_step"] 66 | sd = pl_sd["state_dict"] 67 | model = instantiate_from_config(config.model) 68 | m, u = model.load_state_dict(sd, strict=False) 69 | if len(m) > 0 and verbose: 70 | print("missing keys:") 71 | print(m) 72 | if len(u) > 0 and verbose: 73 | print("unexpected keys:", u) 74 | model.to(device) 75 | model.eval() 76 | model.cond_stage_model.device = device 77 | return model -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/fine_matching.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from kornia.geometry.subpix import dsnt 6 | from kornia.utils.grid import create_meshgrid 7 | 8 | 9 | class FineMatching(nn.Module): 10 | """FineMatching with s2d paradigm""" 11 | 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, feat_f0, feat_f1, data): 16 | """ 17 | Args: 18 | feat0 (torch.Tensor): [M, WW, C] 19 | feat1 (torch.Tensor): [M, WW, C] 20 | data (dict) 21 | Update: 22 | data (dict):{ 23 | 'expec_f' (torch.Tensor): [M, 3], 24 | 'mkpts0_f' (torch.Tensor): [M, 2], 25 | 'mkpts1_f' (torch.Tensor): [M, 2]} 26 | """ 27 | M, WW, C = feat_f0.shape 28 | W = int(math.sqrt(WW)) 29 | scale = data['hw0_i'][0] / data['hw0_f'][0] 30 | self.M, self.W, self.WW, self.C, self.scale = M, W, WW, C, scale 31 | 32 | # corner case: if no coarse matches found 33 | if M == 0: 34 | assert not self.training, "M is always > 0 during training, see coarse_matching.py" 35 | # logger.warning('No matches found in coarse-level.') 36 | data.update({ 37 | 'expec_f': torch.empty(0, 3, device=feat_f0.device), 38 | 'mkpts0_f': data['mkpts0_c'], 39 | 'mkpts1_f': data['mkpts1_c'], 40 | }) 41 | return 42 | 43 | feat_f0_picked = feat_f0[:, WW//2, :] 44 | sim_matrix = torch.einsum('mc,mrc->mr', feat_f0_picked, feat_f1) 45 | softmax_temp = 1.
/ C**.5 46 | heatmap = torch.softmax(softmax_temp * sim_matrix, dim=1).view(-1, W, W) 47 | 48 | # compute coordinates from heatmap 49 | coords_normalized = dsnt.spatial_expectation2d(heatmap[None], True)[0] # [M, 2] 50 | grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape(1, -1, 2) # [1, WW, 2] 51 | 52 | # compute std over 53 | var = torch.sum(grid_normalized**2 * heatmap.view(-1, WW, 1), dim=1) - coords_normalized**2 # [M, 2] 54 | std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), -1) # [M] clamp needed for numerical stability 55 | 56 | # for fine-level supervision 57 | data.update({'expec_f': torch.cat([coords_normalized, std.unsqueeze(1)], -1)}) 58 | 59 | # compute absolute kpt coords 60 | self.get_fine_match(coords_normalized, data) 61 | 62 | @torch.no_grad() 63 | def get_fine_match(self, coords_normed, data): 64 | W, WW, C, scale = self.W, self.WW, self.C, self.scale 65 | 66 | # mkpts0_f and mkpts1_f 67 | mkpts0_f = data['mkpts0_c'] 68 | scale1 = scale * data['scale1'][data['b_ids']] if 'scale0' in data else scale 69 | mkpts1_f = data['mkpts1_c'] + (coords_normed * (W // 2) * scale1)[:len(data['mconf'])] 70 | 71 | data.update({ 72 | "mkpts0_f": mkpts0_f, 73 | "mkpts1_f": mkpts1_f 74 | }) 75 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/linear_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear Transformer proposed in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention" 3 | Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py 4 | """ 5 | 6 | import torch 7 | from torch.nn import Module, Dropout 8 | 9 | 10 | def elu_feature_map(x): 11 | return torch.nn.functional.elu(x) + 1 12 | 13 | 14 | class LinearAttention(Module): 15 | def __init__(self, eps=1e-6): 16 | super().__init__() 17 | self.feature_map = elu_feature_map 18 | self.eps = eps 19 | 20 | def forward(self, queries, keys, values, q_mask=None, kv_mask=None): 21 | """ Multi-Head linear attention proposed in "Transformers are RNNs" 22 | Args: 23 | queries: [N, L, H, D] 24 | keys: [N, S, H, D] 25 | values: [N, S, H, D] 26 | q_mask: [N, L] 27 | kv_mask: [N, S] 28 | Returns: 29 | queried_values: (N, L, H, D) 30 | """ 31 | Q = self.feature_map(queries) 32 | K = self.feature_map(keys) 33 | 34 | # set padded position to zero 35 | if q_mask is not None: 36 | Q = Q * q_mask[:, :, None, None] 37 | if kv_mask is not None: 38 | K = K * kv_mask[:, :, None, None] 39 | values = values * kv_mask[:, :, None, None] 40 | 41 | v_length = values.size(1) 42 | values = values / v_length # prevent fp16 overflow 43 | KV = torch.einsum("nshd,nshv->nhdv", K, values) # (S,D)' @ S,V 44 | Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps) 45 | queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length 46 | 47 | return queried_values.contiguous() 48 | 49 | 50 | class FullAttention(Module): 51 | def __init__(self, use_dropout=False, attention_dropout=0.1): 52 | super().__init__() 53 | self.use_dropout = use_dropout 54 | self.dropout = Dropout(attention_dropout) 55 | 56 | def forward(self, queries, keys, values, q_mask=None, kv_mask=None): 57 | """ Multi-head scaled dot-product attention, a.k.a full attention. 
58 | Args: 59 | queries: [N, L, H, D] 60 | keys: [N, S, H, D] 61 | values: [N, S, H, D] 62 | q_mask: [N, L] 63 | kv_mask: [N, S] 64 | Returns: 65 | queried_values: (N, L, H, D) 66 | """ 67 | 68 | # Compute the unnormalized attention and apply the masks 69 | QK = torch.einsum("nlhd,nshd->nlsh", queries, keys) 70 | if kv_mask is not None: 71 | QK.masked_fill_(~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), float('-inf')) 72 | 73 | # Compute the attention and the weighted average 74 | softmax_temp = 1. / queries.size(3)**.5 # sqrt(D) 75 | A = torch.softmax(softmax_temp * QK, dim=2) 76 | if self.use_dropout: 77 | A = self.dropout(A) 78 | 79 | queried_values = torch.einsum("nlsh,nshd->nlhd", A, values) 80 | 81 | return queried_values.contiguous() 82 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/thirdp/psp/model_irse.py: -------------------------------------------------------------------------------- 1 | # https://github.com/eladrich/pixel2style2pixel 2 | 3 | from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Dropout, Sequential, Module 4 | from ldm.thirdp.psp.helpers import get_blocks, Flatten, bottleneck_IR, bottleneck_IR_SE, l2_norm 5 | 6 | """ 7 | Modified Backbone implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) 8 | """ 9 | 10 | 11 | class Backbone(Module): 12 | def __init__(self, input_size, num_layers, mode='ir', drop_ratio=0.4, affine=True): 13 | super(Backbone, self).__init__() 14 | assert input_size in [112, 224], "input_size should be 112 or 224" 15 | assert num_layers in [50, 100, 152], "num_layers should be 50, 100 or 152" 16 | assert mode in ['ir', 'ir_se'], "mode should be ir or ir_se" 17 | blocks = get_blocks(num_layers) 18 | if mode == 'ir': 19 | unit_module = bottleneck_IR 20 | elif mode == 'ir_se': 21 | unit_module = bottleneck_IR_SE 22 | self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False), 23 | BatchNorm2d(64), 24 | PReLU(64)) 25 | if input_size == 112: 26 | self.output_layer = Sequential(BatchNorm2d(512), 27 | Dropout(drop_ratio), 28 | Flatten(), 29 | Linear(512 * 7 * 7, 512), 30 | BatchNorm1d(512, affine=affine)) 31 | else: 32 | self.output_layer = Sequential(BatchNorm2d(512), 33 | Dropout(drop_ratio), 34 | Flatten(), 35 | Linear(512 * 14 * 14, 512), 36 | BatchNorm1d(512, affine=affine)) 37 | 38 | modules = [] 39 | for block in blocks: 40 | for bottleneck in block: 41 | modules.append(unit_module(bottleneck.in_channel, 42 | bottleneck.depth, 43 | bottleneck.stride)) 44 | self.body = Sequential(*modules) 45 | 46 | def forward(self, x): 47 | x = self.input_layer(x) 48 | x = self.body(x) 49 | x = self.output_layer(x) 50 | return l2_norm(x) 51 | 52 | 53 | def IR_50(input_size): 54 | """Constructs a ir-50 model.""" 55 | model = Backbone(input_size, num_layers=50, mode='ir', drop_ratio=0.4, affine=False) 56 | return model 57 | 58 | 59 | def IR_101(input_size): 60 | """Constructs a ir-101 model.""" 61 | model = Backbone(input_size, num_layers=100, mode='ir', drop_ratio=0.4, affine=False) 62 | return model 63 | 64 | 65 | def IR_152(input_size): 66 | """Constructs a ir-152 model.""" 67 | model = Backbone(input_size, num_layers=152, mode='ir', drop_ratio=0.4, affine=False) 68 | return model 69 | 70 | 71 | def IR_SE_50(input_size): 72 | """Constructs a ir_se-50 model.""" 73 | model = Backbone(input_size, num_layers=50, mode='ir_se', drop_ratio=0.4, affine=False) 74 | return model 75 | 76 | 77 | def IR_SE_101(input_size): 78 | """Constructs a ir_se-101 
model.""" 79 | model = Backbone(input_size, num_layers=100, mode='ir_se', drop_ratio=0.4, affine=False) 80 | return model 81 | 82 | 83 | def IR_SE_152(input_size): 84 | """Constructs a ir_se-152 model.""" 85 | model = Backbone(input_size, num_layers=152, mode='ir_se', drop_ratio=0.4, affine=False) 86 | return model -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 
74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 
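# For reference, the return expression below is the closed-form KL divergence between two diagonal Gaussians: KL(N(mean1, var1) || N(mean2, var2)) = 0.5 * (logvar2 - logvar1 + (var1 + (mean1 - mean2)**2) / var2 - 1). An illustrative sanity check: normal_kl(torch.tensor(0.), torch.tensor(0.), torch.tensor(0.), torch.tensor(0.)) gives tensor(0.), since identical Gaussians have zero divergence.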
81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /pose_synthesis/configs/sd-objaverse-finetune-c_concat-256.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 1.0e-04 3 | target: ldm.models.diffusion.ddpm.LatentDiffusion 4 | params: 5 | linear_start: 0.00085 6 | linear_end: 0.0120 7 | num_timesteps_cond: 1 8 | log_every_t: 200 9 | timesteps: 1000 10 | first_stage_key: "image_target" 11 | cond_stage_key: "image_cond" 12 | image_size: 32 13 | channels: 4 14 | cond_stage_trainable: false # Note: different from the one we trained before 15 | conditioning_key: hybrid 16 | monitor: val/loss_simple_ema 17 | scale_factor: 0.18215 18 | 19 | scheduler_config: # 10000 warmup steps 20 | target: ldm.lr_scheduler.LambdaLinearScheduler 21 | params: 22 | warm_up_steps: [ 100 ] 23 | cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases 24 | f_start: [ 1.e-6 ] 25 | f_max: [ 1. ] 26 | f_min: [ 1. ] 27 | 28 | unet_config: 29 | target: ldm.modules.diffusionmodules.openaimodel.UNetModel 30 | params: 31 | image_size: 32 # unused 32 | in_channels: 8 33 | out_channels: 4 34 | model_channels: 320 35 | attention_resolutions: [ 4, 2, 1 ] 36 | num_res_blocks: 2 37 | channel_mult: [ 1, 2, 4, 4 ] 38 | num_heads: 8 39 | use_spatial_transformer: True 40 | transformer_depth: 1 41 | context_dim: 768 42 | use_checkpoint: True 43 | legacy: False 44 | 45 | first_stage_config: 46 | target: ldm.models.autoencoder.AutoencoderKL 47 | params: 48 | embed_dim: 4 49 | monitor: val/rec_loss 50 | ddconfig: 51 | double_z: true 52 | z_channels: 4 53 | resolution: 256 54 | in_channels: 3 55 | out_ch: 3 56 | ch: 128 57 | ch_mult: 58 | - 1 59 | - 2 60 | - 4 61 | - 4 62 | num_res_blocks: 2 63 | attn_resolutions: [] 64 | dropout: 0.0 65 | lossconfig: 66 | target: torch.nn.Identity 67 | 68 | cond_stage_config: 69 | target: ldm.modules.encoders.modules.FrozenCLIPImageEmbedder 70 | 71 | 72 | data: 73 | target: ldm.data.simple.ObjaverseDataModuleFromConfig 74 | params: 75 | root_dir: 'views_whole_sphere' 76 | batch_size: 192 77 | num_workers: 16 78 | total_view: 4 79 | train: 80 | validation: False 81 | image_transforms: 82 | size: 256 83 | 84 | validation: 85 | validation: True 86 | image_transforms: 87 | size: 256 88 | 89 | 90 | lightning: 91 | find_unused_parameters: false 92 | metrics_over_trainsteps_checkpoint: True 93 | modelcheckpoint: 94 | params: 95 | every_n_train_steps: 5000 96 | callbacks: 97 | image_logger: 98 | target: main.ImageLogger 99 | params: 100 | batch_frequency: 500 101 | max_images: 32 102 | increase_log_steps: False 103 | log_first_step: True 104 | log_images_kwargs: 105 | use_ema_scope: False 106 | inpaint: False 107 | plot_progressive_rows: False 108 | plot_diffusion_rows: False 109 | N: 32 110 | unconditional_guidance_scale: 3.0 111 | unconditional_guidance_label: [""] 112 | 113 | trainer: 114 | benchmark: True 115 | val_check_interval: 5000000 # really sorry 116 | num_sanity_val_steps: 0 117 | accumulate_grad_batches: 1 118 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/lsun.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import PIL 4 | from PIL import Image 5 | from torch.utils.data import Dataset 6 | from torchvision import transforms 7 | 8 | 9 | class LSUNBase(Dataset): 10 | def __init__(self, 11 | txt_file, 12 | data_root, 13 | size=None, 14 | interpolation="bicubic", 15 | flip_p=0.5 16 | ): 17 | self.data_paths = txt_file 18 | self.data_root = data_root 19 | with open(self.data_paths, "r") as f: 20 | self.image_paths = f.read().splitlines() 21 | self._length = len(self.image_paths) 22 | self.labels = { 23 | "relative_file_path_": [l for l in self.image_paths], 24 | "file_path_": [os.path.join(self.data_root, l) 25 | for l in self.image_paths], 26 | } 27 | 28 | self.size = size 29 | self.interpolation = {"linear": PIL.Image.LINEAR, 30 | "bilinear": PIL.Image.BILINEAR, 31 | "bicubic": PIL.Image.BICUBIC, 32 | "lanczos": PIL.Image.LANCZOS, 33 | }[interpolation] 34 | self.flip = transforms.RandomHorizontalFlip(p=flip_p) 35 | 36 | def __len__(self): 37 | return self._length 38 | 39 | def __getitem__(self, i): 40 | example = dict((k, self.labels[k][i]) for k in self.labels) 41 | image = Image.open(example["file_path_"]) 42 | if not image.mode == "RGB": 43 | image = image.convert("RGB") 44 | 45 | # default to score-sde preprocessing 46 | img = np.array(image).astype(np.uint8) 47 | crop = min(img.shape[0], img.shape[1]) 48 | h, w, = img.shape[0], img.shape[1] 49 | img = img[(h - crop) // 2:(h + crop) // 2, 50 | (w - crop) // 2:(w + crop) // 2] 51 | 52 | image = Image.fromarray(img) 53 | if self.size is not None: 54 | image = image.resize((self.size, self.size), resample=self.interpolation) 55 | 56 | image = self.flip(image) 57 | image = np.array(image).astype(np.uint8) 58 | example["image"] = (image / 127.5 - 1.0).astype(np.float32) 59 | return example 60 | 61 | 62 | class LSUNChurchesTrain(LSUNBase): 63 | def __init__(self, **kwargs): 64 | super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs) 65 | 66 | 67 | class LSUNChurchesValidation(LSUNBase): 68 | def __init__(self, flip_p=0., **kwargs): 69 | super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches", 70 | flip_p=flip_p, **kwargs) 71 | 72 | 73 | class LSUNBedroomsTrain(LSUNBase): 74 | def __init__(self, **kwargs): 75 | super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs) 76 | 77 | 78 | class LSUNBedroomsValidation(LSUNBase): 79 | def __init__(self, flip_p=0.0, **kwargs): 80 | super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms", 81 | flip_p=flip_p, **kwargs) 82 | 83 | 84 | class LSUNCatsTrain(LSUNBase): 85 | def __init__(self, **kwargs): 86 | super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs) 87 | 88 | 89 | class LSUNCatsValidation(LSUNBase): 90 | def __init__(self, flip_p=0., **kwargs): 91 | super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats", 92 | flip_p=flip_p, **kwargs) 93 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/guidance.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | from scipy import interpolate 3 | import numpy as np 4 | import torch 5 | import matplotlib.pyplot as plt 6 | from IPython.display import clear_output 7 | import abc 8 | 9 | 10 | class 
GuideModel(torch.nn.Module, abc.ABC): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | 14 | @abc.abstractmethod 15 | def preprocess(self, x_img): 16 | pass 17 | 18 | @abc.abstractmethod 19 | def compute_loss(self, inp): 20 | pass 21 | 22 | 23 | class Guider(torch.nn.Module): 24 | def __init__(self, sampler, guide_model, scale=1.0, verbose=False): 25 | """Apply classifier guidance 26 | 27 | Specify a guidance scale as either a scalar 28 | Or a schedule as a list of tuples t = 0->1 and scale, e.g. 29 | [(0, 10), (0.5, 20), (1, 50)] 30 | """ 31 | super().__init__() 32 | self.sampler = sampler 33 | self.index = 0 34 | self.show = verbose 35 | self.guide_model = guide_model 36 | self.history = [] 37 | 38 | if isinstance(scale, (Tuple, List)): 39 | times = np.array([x[0] for x in scale]) 40 | values = np.array([x[1] for x in scale]) 41 | self.scale_schedule = {"times": times, "values": values} 42 | else: 43 | self.scale_schedule = float(scale) 44 | 45 | self.ddim_timesteps = sampler.ddim_timesteps 46 | self.ddpm_num_timesteps = sampler.ddpm_num_timesteps 47 | 48 | 49 | def get_scales(self): 50 | if isinstance(self.scale_schedule, float): 51 | return len(self.ddim_timesteps)*[self.scale_schedule] 52 | 53 | interpolater = interpolate.interp1d(self.scale_schedule["times"], self.scale_schedule["values"]) 54 | fractional_steps = np.array(self.ddim_timesteps)/self.ddpm_num_timesteps 55 | return interpolater(fractional_steps) 56 | 57 | def modify_score(self, model, e_t, x, t, c): 58 | 59 | # TODO look up index by t 60 | scale = self.get_scales()[self.index] 61 | 62 | if (scale == 0): 63 | return e_t 64 | 65 | sqrt_1ma = self.sampler.ddim_sqrt_one_minus_alphas[self.index].to(x.device) 66 | with torch.enable_grad(): 67 | x_in = x.detach().requires_grad_(True) 68 | pred_x0 = model.predict_start_from_noise(x_in, t=t, noise=e_t) 69 | x_img = model.first_stage_model.decode((1/0.18215)*pred_x0) 70 | 71 | inp = self.guide_model.preprocess(x_img) 72 | loss = self.guide_model.compute_loss(inp) 73 | grads = torch.autograd.grad(loss.sum(), x_in)[0] 74 | correction = grads * scale 75 | 76 | if self.show: 77 | clear_output(wait=True) 78 | print(loss.item(), scale, correction.abs().max().item(), e_t.abs().max().item()) 79 | self.history.append([loss.item(), scale, correction.min().item(), correction.max().item()]) 80 | plt.imshow((inp[0].detach().permute(1,2,0).clamp(-1,1).cpu()+1)/2) 81 | plt.axis('off') 82 | plt.show() 83 | plt.imshow(correction[0][0].detach().cpu()) 84 | plt.axis('off') 85 | plt.show() 86 | 87 | 88 | e_t_mod = e_t - sqrt_1ma*correction 89 | if self.show: 90 | fig, axs = plt.subplots(1, 3) 91 | axs[0].imshow(e_t[0][0].detach().cpu(), vmin=-2, vmax=+2) 92 | axs[1].imshow(e_t_mod[0][0].detach().cpu(), vmin=-2, vmax=+2) 93 | axs[2].imshow(correction[0][0].detach().cpu(), vmin=-2, vmax=+2) 94 | plt.show() 95 | self.index += 1 96 | return e_t_mod -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from einops.einops import rearrange 4 | 5 | from .backbone import build_backbone 6 | from .utils.position_encoding import PositionEncodingSine 7 | from .loftr_module import LocalFeatureTransformer, FinePreprocess 8 | from .utils.coarse_matching import CoarseMatching 9 | from .utils.fine_matching import FineMatching 10 | 11 | 12 | class LoFTR(nn.Module): 13 | def __init__(self, config): 14 | 
super().__init__() 15 | # Misc 16 | self.config = config 17 | 18 | # Modules 19 | self.backbone = build_backbone(config) 20 | self.pos_encoding = PositionEncodingSine( 21 | config['coarse']['d_model'], 22 | temp_bug_fix=config['coarse']['temp_bug_fix']) 23 | self.loftr_coarse = LocalFeatureTransformer(config['coarse']) 24 | self.coarse_matching = CoarseMatching(config['match_coarse']) 25 | self.fine_preprocess = FinePreprocess(config) 26 | self.loftr_fine = LocalFeatureTransformer(config["fine"]) 27 | self.fine_matching = FineMatching() 28 | 29 | def forward(self, data): 30 | """ 31 | Update: 32 | data (dict): { 33 | 'image0': (torch.Tensor): (N, 1, H, W) 34 | 'image1': (torch.Tensor): (N, 1, H, W) 35 | 'mask0'(optional) : (torch.Tensor): (N, H, W) '0' indicates a padded position 36 | 'mask1'(optional) : (torch.Tensor): (N, H, W) 37 | } 38 | """ 39 | # 1. Local Feature CNN 40 | data.update({ 41 | 'bs': data['image0'].size(0), 42 | 'hw0_i': data['image0'].shape[2:], 'hw1_i': data['image1'].shape[2:] 43 | }) 44 | 45 | if data['hw0_i'] == data['hw1_i']: # faster & better BN convergence 46 | feats_c, feats_f = self.backbone(torch.cat([data['image0'], data['image1']], dim=0)) 47 | (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split(data['bs']), feats_f.split(data['bs']) 48 | else: # handle different input shapes 49 | (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone(data['image0']), self.backbone(data['image1']) 50 | 51 | data.update({ 52 | 'hw0_c': feat_c0.shape[2:], 'hw1_c': feat_c1.shape[2:], 53 | 'hw0_f': feat_f0.shape[2:], 'hw1_f': feat_f1.shape[2:] 54 | }) 55 | 56 | # 2. coarse-level loftr module 57 | # add featmap with positional encoding, then flatten it to sequence [N, HW, C] 58 | feat_c0 = rearrange(self.pos_encoding(feat_c0), 'n c h w -> n (h w) c') 59 | feat_c1 = rearrange(self.pos_encoding(feat_c1), 'n c h w -> n (h w) c') 60 | 61 | mask_c0 = mask_c1 = None # mask is useful in training 62 | if 'mask0' in data: 63 | mask_c0, mask_c1 = data['mask0'].flatten(-2), data['mask1'].flatten(-2) 64 | feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, mask_c1) 65 | 66 | # 3. match coarse-level 67 | self.coarse_matching(feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1) 68 | 69 | # 4. fine-level refinement 70 | feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(feat_f0, feat_f1, feat_c0, feat_c1, data) 71 | if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted 72 | feat_f0_unfold, feat_f1_unfold = self.loftr_fine(feat_f0_unfold, feat_f1_unfold) 73 | 74 | # 5. 
match fine-level 75 | self.fine_matching(feat_f0_unfold, feat_f1_unfold, data) 76 | 77 | def load_state_dict(self, state_dict, *args, **kwargs): 78 | for k in list(state_dict.keys()): 79 | if k.startswith('matcher.'): 80 | state_dict[k.replace('matcher.', '', 1)] = state_dict.pop(k) 81 | return super().load_state_dict(state_dict, *args, **kwargs) 82 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/evaluate/ssim.py: -------------------------------------------------------------------------------- 1 | # MIT Licence 2 | 3 | # Methods to predict the SSIM, taken from 4 | # https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py 5 | 6 | from math import exp 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | 12 | def gaussian(window_size, sigma): 13 | gauss = torch.Tensor( 14 | [ 15 | exp(-((x - window_size // 2) ** 2) / float(2 * sigma ** 2)) 16 | for x in range(window_size) 17 | ] 18 | ) 19 | return gauss / gauss.sum() 20 | 21 | 22 | def create_window(window_size, channel): 23 | _1D_window = gaussian(window_size, 1.5).unsqueeze(1) 24 | _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) 25 | window = Variable( 26 | _2D_window.expand(channel, 1, window_size, window_size).contiguous() 27 | ) 28 | return window 29 | 30 | 31 | def _ssim( 32 | img1, img2, window, window_size, channel, mask=None, size_average=True 33 | ): 34 | mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) 35 | mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) 36 | 37 | mu1_sq = mu1.pow(2) 38 | mu2_sq = mu2.pow(2) 39 | mu1_mu2 = mu1 * mu2 40 | 41 | sigma1_sq = ( 42 | F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) 43 | - mu1_sq 44 | ) 45 | sigma2_sq = ( 46 | F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) 47 | - mu2_sq 48 | ) 49 | sigma12 = ( 50 | F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) 51 | - mu1_mu2 52 | ) 53 | 54 | C1 = (0.01) ** 2 55 | C2 = (0.03) ** 2 56 | 57 | ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( 58 | (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) 59 | ) 60 | 61 | if not (mask is None): 62 | b = mask.size(0) 63 | ssim_map = ssim_map.mean(dim=1, keepdim=True) * mask 64 | ssim_map = ssim_map.view(b, -1).sum(dim=1) / mask.view(b, -1).sum( 65 | dim=1 66 | ).clamp(min=1) 67 | return ssim_map 68 | 69 | import pdb 70 | 71 | pdb.set_trace 72 | 73 | if size_average: 74 | return ssim_map.mean() 75 | else: 76 | return ssim_map.mean(1).mean(1).mean(1) 77 | 78 | 79 | class SSIM(torch.nn.Module): 80 | def __init__(self, window_size=11, size_average=True): 81 | super(SSIM, self).__init__() 82 | self.window_size = window_size 83 | self.size_average = size_average 84 | self.channel = 1 85 | self.window = create_window(window_size, self.channel) 86 | 87 | def forward(self, img1, img2, mask=None): 88 | (_, channel, _, _) = img1.size() 89 | 90 | if ( 91 | channel == self.channel 92 | and self.window.data.type() == img1.data.type() 93 | ): 94 | window = self.window 95 | else: 96 | window = create_window(self.window_size, channel) 97 | 98 | if img1.is_cuda: 99 | window = window.cuda(img1.get_device()) 100 | window = window.type_as(img1) 101 | 102 | self.window = window 103 | self.channel = channel 104 | 105 | return _ssim( 106 | img1, 107 | img2, 108 | window, 109 | self.window_size, 110 | channel, 111 | mask, 112 | self.size_average, 113 | 
) 114 | 115 | 116 | def ssim(img1, img2, window_size=11, mask=None, size_average=True): 117 | (_, channel, _, _) = img1.size() 118 | window = create_window(window_size, channel) 119 | 120 | if img1.is_cuda: 121 | window = window.cuda(img1.get_device()) 122 | window = window.type_as(img1) 123 | 124 | return _ssim(img1, img2, window, window_size, channel, mask, size_average) 125 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/loftr_module/transformer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn as nn 4 | from .linear_attention import LinearAttention, FullAttention 5 | 6 | 7 | class LoFTREncoderLayer(nn.Module): 8 | def __init__(self, 9 | d_model, 10 | nhead, 11 | attention='linear'): 12 | super(LoFTREncoderLayer, self).__init__() 13 | 14 | self.dim = d_model // nhead 15 | self.nhead = nhead 16 | 17 | # multi-head attention 18 | self.q_proj = nn.Linear(d_model, d_model, bias=False) 19 | self.k_proj = nn.Linear(d_model, d_model, bias=False) 20 | self.v_proj = nn.Linear(d_model, d_model, bias=False) 21 | self.attention = LinearAttention() if attention == 'linear' else FullAttention() 22 | self.merge = nn.Linear(d_model, d_model, bias=False) 23 | 24 | # feed-forward network 25 | self.mlp = nn.Sequential( 26 | nn.Linear(d_model*2, d_model*2, bias=False), 27 | nn.ReLU(True), 28 | nn.Linear(d_model*2, d_model, bias=False), 29 | ) 30 | 31 | # norm and dropout 32 | self.norm1 = nn.LayerNorm(d_model) 33 | self.norm2 = nn.LayerNorm(d_model) 34 | 35 | def forward(self, x, source, x_mask=None, source_mask=None): 36 | """ 37 | Args: 38 | x (torch.Tensor): [N, L, C] 39 | source (torch.Tensor): [N, S, C] 40 | x_mask (torch.Tensor): [N, L] (optional) 41 | source_mask (torch.Tensor): [N, S] (optional) 42 | """ 43 | bs = x.size(0) 44 | query, key, value = x, source, source 45 | 46 | # multi-head attention 47 | query = self.q_proj(query).view(bs, -1, self.nhead, self.dim) # [N, L, (H, D)] 48 | key = self.k_proj(key).view(bs, -1, self.nhead, self.dim) # [N, S, (H, D)] 49 | value = self.v_proj(value).view(bs, -1, self.nhead, self.dim) 50 | message = self.attention(query, key, value, q_mask=x_mask, kv_mask=source_mask) # [N, L, (H, D)] 51 | message = self.merge(message.view(bs, -1, self.nhead*self.dim)) # [N, L, C] 52 | message = self.norm1(message) 53 | 54 | # feed-forward network 55 | message = self.mlp(torch.cat([x, message], dim=2)) 56 | message = self.norm2(message) 57 | 58 | return x + message 59 | 60 | 61 | class LocalFeatureTransformer(nn.Module): 62 | """A Local Feature Transformer (LoFTR) module.""" 63 | 64 | def __init__(self, config): 65 | super(LocalFeatureTransformer, self).__init__() 66 | 67 | self.config = config 68 | self.d_model = config['d_model'] 69 | self.nhead = config['nhead'] 70 | self.layer_names = config['layer_names'] 71 | encoder_layer = LoFTREncoderLayer(config['d_model'], config['nhead'], config['attention']) 72 | self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names))]) 73 | self._reset_parameters() 74 | 75 | def _reset_parameters(self): 76 | for p in self.parameters(): 77 | if p.dim() > 1: 78 | nn.init.xavier_uniform_(p) 79 | 80 | def forward(self, feat0, feat1, mask0=None, mask1=None): 81 | """ 82 | Args: 83 | feat0 (torch.Tensor): [N, L, C] 84 | feat1 (torch.Tensor): [N, S, C] 85 | mask0 (torch.Tensor): [N, L] (optional) 86 | mask1 (torch.Tensor): [N, S] (optional) 87 | """ 
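# Note: layers are applied in the order given by config['layer_names']; LoFTR configs typically interleave ['self', 'cross'], so each block first attends within each feature map and then across the two maps. Shapes are preserved, i.e. feat0 stays [N, L, C] and feat1 stays [N, S, C] throughout the loop below.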
88 | 89 | assert self.d_model == feat0.size(2), "the feature number of src and transformer must be equal" 90 | 91 | for layer, name in zip(self.layers, self.layer_names): 92 | if name == 'self': 93 | feat0 = layer(feat0, feat0, mask0, mask0) 94 | feat1 = layer(feat1, feat1, mask1, mask1) 95 | elif name == 'cross': 96 | feat0 = layer(feat0, feat1, mask0, mask1) 97 | feat1 = layer(feat1, feat0, mask1, mask0) 98 | else: 99 | raise KeyError 100 | 101 | return feat0, feat1 102 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/thirdp/psp/helpers.py: -------------------------------------------------------------------------------- 1 | # https://github.com/eladrich/pixel2style2pixel 2 | 3 | from collections import namedtuple 4 | import torch 5 | from torch.nn import Conv2d, BatchNorm2d, PReLU, ReLU, Sigmoid, MaxPool2d, AdaptiveAvgPool2d, Sequential, Module 6 | 7 | """ 8 | ArcFace implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) 9 | """ 10 | 11 | 12 | class Flatten(Module): 13 | def forward(self, input): 14 | return input.view(input.size(0), -1) 15 | 16 | 17 | def l2_norm(input, axis=1): 18 | norm = torch.norm(input, 2, axis, True) 19 | output = torch.div(input, norm) 20 | return output 21 | 22 | 23 | class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): 24 | """ A named tuple describing a ResNet block. """ 25 | 26 | 27 | def get_block(in_channel, depth, num_units, stride=2): 28 | return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] 29 | 30 | 31 | def get_blocks(num_layers): 32 | if num_layers == 50: 33 | blocks = [ 34 | get_block(in_channel=64, depth=64, num_units=3), 35 | get_block(in_channel=64, depth=128, num_units=4), 36 | get_block(in_channel=128, depth=256, num_units=14), 37 | get_block(in_channel=256, depth=512, num_units=3) 38 | ] 39 | elif num_layers == 100: 40 | blocks = [ 41 | get_block(in_channel=64, depth=64, num_units=3), 42 | get_block(in_channel=64, depth=128, num_units=13), 43 | get_block(in_channel=128, depth=256, num_units=30), 44 | get_block(in_channel=256, depth=512, num_units=3) 45 | ] 46 | elif num_layers == 152: 47 | blocks = [ 48 | get_block(in_channel=64, depth=64, num_units=3), 49 | get_block(in_channel=64, depth=128, num_units=8), 50 | get_block(in_channel=128, depth=256, num_units=36), 51 | get_block(in_channel=256, depth=512, num_units=3) 52 | ] 53 | else: 54 | raise ValueError("Invalid number of layers: {}. 
Must be one of [50, 100, 152]".format(num_layers)) 55 | return blocks 56 | 57 | 58 | class SEModule(Module): 59 | def __init__(self, channels, reduction): 60 | super(SEModule, self).__init__() 61 | self.avg_pool = AdaptiveAvgPool2d(1) 62 | self.fc1 = Conv2d(channels, channels // reduction, kernel_size=1, padding=0, bias=False) 63 | self.relu = ReLU(inplace=True) 64 | self.fc2 = Conv2d(channels // reduction, channels, kernel_size=1, padding=0, bias=False) 65 | self.sigmoid = Sigmoid() 66 | 67 | def forward(self, x): 68 | module_input = x 69 | x = self.avg_pool(x) 70 | x = self.fc1(x) 71 | x = self.relu(x) 72 | x = self.fc2(x) 73 | x = self.sigmoid(x) 74 | return module_input * x 75 | 76 | 77 | class bottleneck_IR(Module): 78 | def __init__(self, in_channel, depth, stride): 79 | super(bottleneck_IR, self).__init__() 80 | if in_channel == depth: 81 | self.shortcut_layer = MaxPool2d(1, stride) 82 | else: 83 | self.shortcut_layer = Sequential( 84 | Conv2d(in_channel, depth, (1, 1), stride, bias=False), 85 | BatchNorm2d(depth) 86 | ) 87 | self.res_layer = Sequential( 88 | BatchNorm2d(in_channel), 89 | Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), PReLU(depth), 90 | Conv2d(depth, depth, (3, 3), stride, 1, bias=False), BatchNorm2d(depth) 91 | ) 92 | 93 | def forward(self, x): 94 | shortcut = self.shortcut_layer(x) 95 | res = self.res_layer(x) 96 | return res + shortcut 97 | 98 | 99 | class bottleneck_IR_SE(Module): 100 | def __init__(self, in_channel, depth, stride): 101 | super(bottleneck_IR_SE, self).__init__() 102 | if in_channel == depth: 103 | self.shortcut_layer = MaxPool2d(1, stride) 104 | else: 105 | self.shortcut_layer = Sequential( 106 | Conv2d(in_channel, depth, (1, 1), stride, bias=False), 107 | BatchNorm2d(depth) 108 | ) 109 | self.res_layer = Sequential( 110 | BatchNorm2d(in_channel), 111 | Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), 112 | PReLU(depth), 113 | Conv2d(depth, depth, (3, 3), stride, 1, bias=False), 114 | BatchNorm2d(depth), 115 | SEModule(depth, 16) 116 | ) 117 | 118 | def forward(self, x): 119 | shortcut = self.shortcut_layer(x) 120 | res = self.res_layer(x) 121 | return res + shortcut -------------------------------------------------------------------------------- /pose_synthesis/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 
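# schedule() below ramps the multiplier linearly from lr_start to lr_max over warm_up_steps, then follows a half-cosine decay down to lr_min at max_decay_steps. Illustrative example (assuming warm_up_steps=100, lr_start=1e-6, lr_max=1.0, lr_min=1e-6): step 50 gives roughly 0.5, and any step past max_decay_steps gives lr_min; as the class docstring notes, this multiplier is meant to scale a base_lr of 1.0.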
15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/inpainting/synthetic_mask.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw 2 | import numpy as np 3 | 4 | settings = { 5 | "256narrow": { 6 | "p_irr": 1, 7 | "min_n_irr": 4, 8 | "max_n_irr": 50, 9 | "max_l_irr": 40, 10 | "max_w_irr": 10, 11 | "min_n_box": 
None, 12 | "max_n_box": None, 13 | "min_s_box": None, 14 | "max_s_box": None, 15 | "marg": None, 16 | }, 17 | "256train": { 18 | "p_irr": 0.5, 19 | "min_n_irr": 1, 20 | "max_n_irr": 5, 21 | "max_l_irr": 200, 22 | "max_w_irr": 100, 23 | "min_n_box": 1, 24 | "max_n_box": 4, 25 | "min_s_box": 30, 26 | "max_s_box": 150, 27 | "marg": 10, 28 | }, 29 | "512train": { # TODO: experimental 30 | "p_irr": 0.5, 31 | "min_n_irr": 1, 32 | "max_n_irr": 5, 33 | "max_l_irr": 450, 34 | "max_w_irr": 250, 35 | "min_n_box": 1, 36 | "max_n_box": 4, 37 | "min_s_box": 30, 38 | "max_s_box": 300, 39 | "marg": 10, 40 | }, 41 | "512train-large": { # TODO: experimental 42 | "p_irr": 0.5, 43 | "min_n_irr": 1, 44 | "max_n_irr": 5, 45 | "max_l_irr": 450, 46 | "max_w_irr": 400, 47 | "min_n_box": 1, 48 | "max_n_box": 4, 49 | "min_s_box": 75, 50 | "max_s_box": 450, 51 | "marg": 10, 52 | }, 53 | } 54 | 55 | 56 | def gen_segment_mask(mask, start, end, brush_width): 57 | mask = mask > 0 58 | mask = (255 * mask).astype(np.uint8) 59 | mask = Image.fromarray(mask) 60 | draw = ImageDraw.Draw(mask) 61 | draw.line([start, end], fill=255, width=brush_width, joint="curve") 62 | mask = np.array(mask) / 255 63 | return mask 64 | 65 | 66 | def gen_box_mask(mask, masked): 67 | x_0, y_0, w, h = masked 68 | mask[y_0:y_0 + h, x_0:x_0 + w] = 1 69 | return mask 70 | 71 | 72 | def gen_round_mask(mask, masked, radius): 73 | x_0, y_0, w, h = masked 74 | xy = [(x_0, y_0), (x_0 + w, y_0 + w)] 75 | 76 | mask = mask > 0 77 | mask = (255 * mask).astype(np.uint8) 78 | mask = Image.fromarray(mask) 79 | draw = ImageDraw.Draw(mask) 80 | draw.rounded_rectangle(xy, radius=radius, fill=255) 81 | mask = np.array(mask) / 255 82 | return mask 83 | 84 | 85 | def gen_large_mask(prng, img_h, img_w, 86 | marg, p_irr, min_n_irr, max_n_irr, max_l_irr, max_w_irr, 87 | min_n_box, max_n_box, min_s_box, max_s_box): 88 | """ 89 | img_h: int, an image height 90 | img_w: int, an image width 91 | marg: int, a margin for a box starting coordinate 92 | p_irr: float, 0 <= p_irr <= 1, a probability of a polygonal chain mask 93 | 94 | min_n_irr: int, min number of segments 95 | max_n_irr: int, max number of segments 96 | max_l_irr: max length of a segment in polygonal chain 97 | max_w_irr: max width of a segment in polygonal chain 98 | 99 | min_n_box: int, min bound for the number of box primitives 100 | max_n_box: int, max bound for the number of box primitives 101 | min_s_box: int, min length of a box side 102 | max_s_box: int, max length of a box side 103 | """ 104 | 105 | mask = np.zeros((img_h, img_w)) 106 | uniform = prng.randint 107 | 108 | if np.random.uniform(0, 1) < p_irr: # generate polygonal chain 109 | n = uniform(min_n_irr, max_n_irr) # sample number of segments 110 | 111 | for _ in range(n): 112 | y = uniform(0, img_h) # sample a starting point 113 | x = uniform(0, img_w) 114 | 115 | a = uniform(0, 360) # sample angle 116 | l = uniform(10, max_l_irr) # sample segment length 117 | w = uniform(5, max_w_irr) # sample a segment width 118 | 119 | # draw segment starting from (x,y) to (x_,y_) using brush of width w 120 | x_ = x + l * np.sin(a) 121 | y_ = y + l * np.cos(a) 122 | 123 | mask = gen_segment_mask(mask, start=(x, y), end=(x_, y_), brush_width=w) 124 | x, y = x_, y_ 125 | else: # generate Box masks 126 | n = uniform(min_n_box, max_n_box) # sample number of rectangles 127 | 128 | for _ in range(n): 129 | h = uniform(min_s_box, max_s_box) # sample box shape 130 | w = uniform(min_s_box, max_s_box) 131 | 132 | x_0 = uniform(marg, img_w - marg - w) # sample upper-left 
coordinates of box 133 | y_0 = uniform(marg, img_h - marg - h) 134 | 135 | if np.random.uniform(0, 1) < 0.5: 136 | mask = gen_box_mask(mask, masked=(x_0, y_0, w, h)) 137 | else: 138 | r = uniform(0, 60) # sample radius 139 | mask = gen_round_mask(mask, masked=(x_0, y_0, w, h), radius=r) 140 | return mask 141 | 142 | 143 | make_lama_mask = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["256train"]) 144 | make_narrow_lama_mask = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["256narrow"]) 145 | make_512_lama_mask = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["512train"]) 146 | make_512_lama_mask_large = lambda prng, h, w: gen_large_mask(prng, h, w, **settings["512train-large"]) 147 | 148 | 149 | MASK_MODES = { 150 | "256train": make_lama_mask, 151 | "256narrow": make_narrow_lama_mask, 152 | "512train": make_512_lama_mask, 153 | "512train-large": make_512_lama_mask_large 154 | } 155 | 156 | if __name__ == "__main__": 157 | import sys 158 | 159 | out = sys.argv[1] 160 | 161 | prng = np.random.RandomState(1) 162 | kwargs = settings["256train"] 163 | mask = gen_large_mask(prng, 256, 256, **kwargs) 164 | mask = (255 * mask).astype(np.uint8) 165 | mask = Image.fromarray(mask) 166 | mask.save(out) 167 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/evaluate/frechet_video_distance.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python2, python3 17 | """Minimal Reference implementation for the Frechet Video Distance (FVD). 18 | 19 | FVD is a metric for the quality of video generation models. It is inspired by 20 | the FID (Frechet Inception Distance) used for images, but uses a different 21 | embedding to be better suitable for videos. 22 | """ 23 | 24 | from __future__ import absolute_import 25 | from __future__ import division 26 | from __future__ import print_function 27 | 28 | 29 | import six 30 | import tensorflow.compat.v1 as tf 31 | import tensorflow_gan as tfgan 32 | import tensorflow_hub as hub 33 | 34 | 35 | def preprocess(videos, target_resolution): 36 | """Runs some preprocessing on the videos for I3D model. 37 | 38 | Args: 39 | videos: [batch_size, num_frames, height, width, depth] The videos to be 40 | preprocessed. We don't care about the specific dtype of the videos, it can 41 | be anything that tf.image.resize_bilinear accepts. Values are expected to 42 | be in the range 0-255. 
43 | target_resolution: (width, height): target video resolution 44 | 45 | Returns: 46 | videos: [batch_size, num_frames, height, width, depth] 47 | """ 48 | videos_shape = list(videos.shape) 49 | all_frames = tf.reshape(videos, [-1] + videos_shape[-3:]) 50 | resized_videos = tf.image.resize_bilinear(all_frames, size=target_resolution) 51 | target_shape = [videos_shape[0], -1] + list(target_resolution) + [3] 52 | output_videos = tf.reshape(resized_videos, target_shape) 53 | scaled_videos = 2. * tf.cast(output_videos, tf.float32) / 255. - 1 54 | return scaled_videos 55 | 56 | 57 | def _is_in_graph(tensor_name): 58 | """Checks whether a given tensor does exist in the graph.""" 59 | try: 60 | tf.get_default_graph().get_tensor_by_name(tensor_name) 61 | except KeyError: 62 | return False 63 | return True 64 | 65 | 66 | def create_id3_embedding(videos,warmup=False,batch_size=16): 67 | """Embeds the given videos using the Inflated 3D Convolution network. 68 | 69 | Downloads the graph of the I3D from tf.hub and adds it to the graph on the 70 | first call. 71 | 72 | Args: 73 | videos: [batch_size, num_frames, height=224, width=224, depth=3]. 74 | Expected range is [-1, 1]. 75 | 76 | Returns: 77 | embedding: [batch_size, embedding_size]. embedding_size depends 78 | on the model used. 79 | 80 | Raises: 81 | ValueError: when a provided embedding_layer is not supported. 82 | """ 83 | 84 | # batch_size = 16 85 | module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1" 86 | 87 | 88 | # Making sure that we import the graph separately for 89 | # each different input video tensor. 90 | module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str( 91 | videos.name).replace(":", "_") 92 | 93 | 94 | 95 | assert_ops = [ 96 | tf.Assert( 97 | tf.reduce_max(videos) <= 1.001, 98 | ["max value in frame is > 1", videos]), 99 | tf.Assert( 100 | tf.reduce_min(videos) >= -1.001, 101 | ["min value in frame is < -1", videos]), 102 | tf.assert_equal( 103 | tf.shape(videos)[0], 104 | batch_size, ["invalid frame batch size: ", 105 | tf.shape(videos)], 106 | summarize=6), 107 | ] 108 | with tf.control_dependencies(assert_ops): 109 | videos = tf.identity(videos) 110 | 111 | module_scope = "%s_apply_default/" % module_name 112 | 113 | # To check whether the module has already been loaded into the graph, we look 114 | # for a given tensor name. If this tensor name exists, we assume the function 115 | # has been called before and the graph was imported. Otherwise we import it. 116 | # Note: in theory, the tensor could exist, but have wrong shapes. 117 | # This will happen if create_id3_embedding is called with a frames_placeholder 118 | # of wrong size/batch size, because even though that will throw a tf.Assert 119 | # at graph-execution time, it will insert the tensor (with wrong shape) into 120 | # the graph. This is why we need the following assert.
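# The warmup branch below only adds a graph-construction-time check that the placeholder's static batch dimension matches the fixed batch_size (16 by default); later calls are expected to reuse the same batch size, since the hub module is imported once for a given input tensor (see the note above).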
121 | if warmup: 122 | video_batch_size = int(videos.shape[0]) 123 | assert video_batch_size in [batch_size, -1, None], f"Invalid batch size {video_batch_size}" 124 | tensor_name = module_scope + "RGB/inception_i3d/Mean:0" 125 | if not _is_in_graph(tensor_name): 126 | i3d_model = hub.Module(module_spec, name=module_name) 127 | i3d_model(videos) 128 | 129 | # gets the kinetics-i3d-400-logits layer 130 | tensor_name = module_scope + "RGB/inception_i3d/Mean:0" 131 | tensor = tf.get_default_graph().get_tensor_by_name(tensor_name) 132 | return tensor 133 | 134 | 135 | def calculate_fvd(real_activations, 136 | generated_activations): 137 | """Returns a list of ops that compute metrics as funcs of activations. 138 | 139 | Args: 140 | real_activations: [num_samples, embedding_size] 141 | generated_activations: [num_samples, embedding_size] 142 | 143 | Returns: 144 | A scalar that contains the requested FVD. 145 | """ 146 | return tfgan.eval.frechet_classifier_distance_from_activations( 147 | real_activations, generated_activations) 148 | -------------------------------------------------------------------------------- /pose_synthesis/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | from PIL import Image 5 | from utils.zero123_utils import init_model, predict_stage1_gradio, zero123_infer 6 | from utils.sam_utils import sam_init, sam_out_nosave 7 | from utils.utils import pred_bbox, image_preprocess_nosave, gen_poses, convert_mesh_format 8 | from elevation_estimate.estimate_wild_imgs import estimate_elev 9 | 10 | 11 | def preprocess(predictor, raw_im, lower_contrast=False): 12 | raw_im.thumbnail([512, 512], Image.Resampling.LANCZOS) 13 | image_sam = sam_out_nosave(predictor, raw_im.convert("RGB"), pred_bbox(raw_im)) 14 | input_256 = image_preprocess_nosave(image_sam, lower_contrast=lower_contrast, rescale=True) 15 | torch.cuda.empty_cache() 16 | return input_256 17 | 18 | def stage1_run(model, device, exp_dir, 19 | input_im, scale, ddim_steps): 20 | # folder to save the stage 1 images 21 | stage1_dir = os.path.join(exp_dir, "stage1_8") 22 | os.makedirs(stage1_dir, exist_ok=True) 23 | 24 | # stage 1: generate 4 views at the same elevation as the input 25 | output_ims = predict_stage1_gradio(model, input_im, save_path=stage1_dir, adjust_set=list(range(4)), device=device, ddim_steps=ddim_steps, scale=scale) 26 | 27 | # stage 2 for the first image 28 | # infer 4 nearby views for an image to estimate the polar angle of the input 29 | stage2_steps = 50 # ddim_steps 30 | zero123_infer(model, exp_dir, indices=[0], device=device, ddim_steps=stage2_steps, scale=scale) 31 | # estimate the camera pose (elevation) of the input image. 
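# estimate_elev() returns a polar angle (measured from the vertical axis), which the code below converts to an elevation as 90 - polar_angle; if the estimate fails, polar_angle falls back to 90, i.e. an assumed elevation of 0 degrees.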
32 | try: 33 | polar_angle = estimate_elev(exp_dir) 34 | except: 35 | print("Failed to estimate polar angle") 36 | polar_angle = 90 37 | print("Estimated polar angle:", polar_angle) 38 | gen_poses(exp_dir, polar_angle) 39 | 40 | # stage 1: generate another 4 views at a different elevation 41 | if polar_angle <= 75: 42 | output_ims_2 = predict_stage1_gradio(model, input_im, save_path=stage1_dir, adjust_set=list(range(4,8)), device=device, ddim_steps=ddim_steps, scale=scale) 43 | else: 44 | output_ims_2 = predict_stage1_gradio(model, input_im, save_path=stage1_dir, adjust_set=list(range(8,12)), device=device, ddim_steps=ddim_steps, scale=scale) 45 | torch.cuda.empty_cache() 46 | return 90-polar_angle, output_ims+output_ims_2 47 | 48 | def stage2_run(model, device, exp_dir, 49 | elev, scale, stage2_steps=50): 50 | # stage 2 for the remaining 7 images, generate 7*4=28 views 51 | if 90-elev <= 75: 52 | zero123_infer(model, exp_dir, indices=list(range(1,8)), device=device, ddim_steps=stage2_steps, scale=scale) 53 | else: 54 | zero123_infer(model, exp_dir, indices=list(range(1,4))+list(range(8,12)), device=device, ddim_steps=stage2_steps, scale=scale) 55 | 56 | def reconstruct(exp_dir, output_format=".ply", device_idx=0, resolution=256): 57 | exp_dir = os.path.abspath(exp_dir) 58 | main_dir_path = os.path.abspath(os.path.dirname("./")) 59 | os.chdir('reconstruction/') 60 | 61 | bash_script = f'CUDA_VISIBLE_DEVICES={device_idx} python exp_runner_generic_blender_val.py \ 62 | --specific_dataset_name {exp_dir} \ 63 | --mode export_mesh \ 64 | --conf confs/one2345_lod0_val_demo.conf \ 65 | --resolution {resolution}' 66 | print(bash_script) 67 | os.system(bash_script) 68 | os.chdir(main_dir_path) 69 | 70 | ply_path = os.path.join(exp_dir, f"mesh.ply") 71 | if output_format == ".ply": 72 | return ply_path 73 | if output_format not in [".obj", ".glb"]: 74 | print("Invalid output format, must be one of .ply, .obj, .glb") 75 | return ply_path 76 | return convert_mesh_format(exp_dir, output_format=output_format) 77 | 78 | 79 | def predict_multiview(shape_dir, args): 80 | device = f"cuda:{args.gpu_idx}" 81 | 82 | # initialize the zero123 model 83 | models = init_model(device, 'zero123-xl.ckpt', half_precision=args.half_precision) 84 | model_zero123 = models["turncam"] 85 | 86 | # initialize the Segment Anything model 87 | predictor = sam_init(args.gpu_idx) 88 | input_raw = Image.open(args.img_path) 89 | 90 | # preprocess the input image 91 | input_256 = preprocess(predictor, input_raw) 92 | 93 | # generate multi-view images in two stages with Zero123. 94 | # first stage: generate N=8 views cover 360 degree of the input shape. 95 | elev, stage1_imgs = stage1_run(model_zero123, device, shape_dir, input_256, scale=3, ddim_steps=75) 96 | # second stage: 4 local views for each of the first-stage view, resulting in N*4=32 source view images. 
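# Added note: view 0 already received its 4 nearby views inside stage1_run() while the
# elevation was being estimated, so stage2_run() below only processes the remaining 7
# first-stage views (7 * 4 = 28 images), giving 8 * 4 = 32 second-stage views in total.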
97 | stage2_run(model_zero123, device, shape_dir, elev, scale=3, stage2_steps=50) 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser(description='Process some integers.') 101 | parser.add_argument('--img_path', type=str, default="./demo/demo_examples/01_wild_hydrant.png", help='Path to the input image') 102 | parser.add_argument('--gpu_idx', type=int, default=0, help='GPU index') 103 | parser.add_argument('--half_precision', action='store_true', help='Use half precision') 104 | parser.add_argument('--mesh_resolution', type=int, default=256, help='Mesh resolution') 105 | parser.add_argument('--output_format', type=str, default=".ply", help='Output format: .ply, .obj, .glb') 106 | 107 | args = parser.parse_args() 108 | 109 | assert(torch.cuda.is_available()) 110 | 111 | shape_id = args.img_path.split('/')[-1].split('.')[0] 112 | shape_dir = f"./exp/{shape_id}" 113 | os.makedirs(shape_dir, exist_ok=True) 114 | 115 | predict_multiview(shape_dir, args) 116 | 117 | # utilize cost volume-based 3D reconstruction to generate textured 3D mesh 118 | mesh_path = reconstruct(shape_dir, output_format=args.output_format, device_idx=args.gpu_idx, resolution=args.mesh_resolution) 119 | print("Mesh saved to:", mesh_path) 120 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/losses/contperceptual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no? 5 | 6 | 7 | class LPIPSWithDiscriminator(nn.Module): 8 | def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0, 9 | disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, 10 | perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, 11 | disc_loss="hinge"): 12 | 13 | super().__init__() 14 | assert disc_loss in ["hinge", "vanilla"] 15 | self.kl_weight = kl_weight 16 | self.pixel_weight = pixelloss_weight 17 | self.perceptual_loss = LPIPS().eval() 18 | self.perceptual_weight = perceptual_weight 19 | # output log variance 20 | self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) 21 | 22 | self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, 23 | n_layers=disc_num_layers, 24 | use_actnorm=use_actnorm 25 | ).apply(weights_init) 26 | self.discriminator_iter_start = disc_start 27 | self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss 28 | self.disc_factor = disc_factor 29 | self.discriminator_weight = disc_weight 30 | self.disc_conditional = disc_conditional 31 | 32 | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): 33 | if last_layer is not None: 34 | nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] 35 | g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] 36 | else: 37 | nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] 38 | g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] 39 | 40 | d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) 41 | d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() 42 | d_weight = d_weight * self.discriminator_weight 43 | return d_weight 44 | 45 | def forward(self, inputs, reconstructions, posteriors, optimizer_idx, 46 | global_step, last_layer=None, cond=None, split="train", 47 | weights=None): 48 | rec_loss = torch.abs(inputs.contiguous() - 
reconstructions.contiguous()) 49 | if self.perceptual_weight > 0: 50 | p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) 51 | rec_loss = rec_loss + self.perceptual_weight * p_loss 52 | 53 | nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar 54 | weighted_nll_loss = nll_loss 55 | if weights is not None: 56 | weighted_nll_loss = weights*nll_loss 57 | weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] 58 | nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] 59 | kl_loss = posteriors.kl() 60 | kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] 61 | 62 | # now the GAN part 63 | if optimizer_idx == 0: 64 | # generator update 65 | if cond is None: 66 | assert not self.disc_conditional 67 | logits_fake = self.discriminator(reconstructions.contiguous()) 68 | else: 69 | assert self.disc_conditional 70 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) 71 | g_loss = -torch.mean(logits_fake) 72 | 73 | if self.disc_factor > 0.0: 74 | try: 75 | d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) 76 | except RuntimeError: 77 | assert not self.training 78 | d_weight = torch.tensor(0.0) 79 | else: 80 | d_weight = torch.tensor(0.0) 81 | 82 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 83 | loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss 84 | 85 | log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(), 86 | "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(), 87 | "{}/rec_loss".format(split): rec_loss.detach().mean(), 88 | "{}/d_weight".format(split): d_weight.detach(), 89 | "{}/disc_factor".format(split): torch.tensor(disc_factor), 90 | "{}/g_loss".format(split): g_loss.detach().mean(), 91 | } 92 | return loss, log 93 | 94 | if optimizer_idx == 1: 95 | # second pass for discriminator update 96 | if cond is None: 97 | logits_real = self.discriminator(inputs.contiguous().detach()) 98 | logits_fake = self.discriminator(reconstructions.contiguous().detach()) 99 | else: 100 | logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) 101 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) 102 | 103 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 104 | d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) 105 | 106 | log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), 107 | "{}/logits_real".format(split): logits_real.detach().mean(), 108 | "{}/logits_fake".format(split): logits_fake.detach().mean() 109 | } 110 | return d_loss, log 111 | 112 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/utils/plotting.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib 5 | 6 | 7 | def _compute_conf_thresh(data): 8 | dataset_name = data['dataset_name'][0].lower() 9 | if dataset_name == 'scannet': 10 | thr = 5e-4 11 | elif dataset_name == 'megadepth': 12 | thr = 1e-4 13 | else: 14 | raise ValueError(f'Unknown dataset: {dataset_name}') 15 | return thr 16 | 17 | 18 | # --- VISUALIZATION --- # 19 | 20 | def make_matching_figure( 21 | img0, img1, 
mkpts0, mkpts1, color, 22 | kpts0=None, kpts1=None, text=[], dpi=75, path=None): 23 | # draw image pair 24 | assert mkpts0.shape[0] == mkpts1.shape[0], f'mkpts0: {mkpts0.shape[0]} v.s. mkpts1: {mkpts1.shape[0]}' 25 | fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi) 26 | axes[0].imshow(img0, cmap='gray') 27 | axes[1].imshow(img1, cmap='gray') 28 | for i in range(2): # clear all frames 29 | axes[i].get_yaxis().set_ticks([]) 30 | axes[i].get_xaxis().set_ticks([]) 31 | for spine in axes[i].spines.values(): 32 | spine.set_visible(False) 33 | plt.tight_layout(pad=1) 34 | 35 | if kpts0 is not None: 36 | assert kpts1 is not None 37 | axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c='w', s=2) 38 | axes[1].scatter(kpts1[:, 0], kpts1[:, 1], c='w', s=2) 39 | 40 | # draw matches 41 | if mkpts0.shape[0] != 0 and mkpts1.shape[0] != 0: 42 | fig.canvas.draw() 43 | transFigure = fig.transFigure.inverted() 44 | fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0)) 45 | fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1)) 46 | fig.lines = [matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]), 47 | (fkpts0[i, 1], fkpts1[i, 1]), 48 | transform=fig.transFigure, c=color[i], linewidth=1) 49 | for i in range(len(mkpts0))] 50 | 51 | axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4) 52 | axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4) 53 | 54 | # put txts 55 | txt_color = 'k' if img0[:100, :200].mean() > 200 else 'w' 56 | fig.text( 57 | 0.01, 0.99, '\n'.join(text), transform=fig.axes[0].transAxes, 58 | fontsize=15, va='top', ha='left', color=txt_color) 59 | 60 | # save or return figure 61 | if path: 62 | plt.savefig(str(path), bbox_inches='tight', pad_inches=0) 63 | plt.close() 64 | else: 65 | return fig 66 | 67 | 68 | def _make_evaluation_figure(data, b_id, alpha='dynamic'): 69 | b_mask = data['m_bids'] == b_id 70 | conf_thr = _compute_conf_thresh(data) 71 | 72 | img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype(np.int32) 73 | img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype(np.int32) 74 | kpts0 = data['mkpts0_f'][b_mask].cpu().numpy() 75 | kpts1 = data['mkpts1_f'][b_mask].cpu().numpy() 76 | 77 | # for megadepth, we visualize matches on the resized image 78 | if 'scale0' in data: 79 | kpts0 = kpts0 / data['scale0'][b_id].cpu().numpy()[[1, 0]] 80 | kpts1 = kpts1 / data['scale1'][b_id].cpu().numpy()[[1, 0]] 81 | 82 | epi_errs = data['epi_errs'][b_mask].cpu().numpy() 83 | correct_mask = epi_errs < conf_thr 84 | precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0 85 | n_correct = np.sum(correct_mask) 86 | n_gt_matches = int(data['conf_matrix_gt'][b_id].sum().cpu()) 87 | recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches) 88 | # recall might be larger than 1, since the calculation of conf_matrix_gt 89 | # uses groundtruth depths and camera poses, but epipolar distance is used here. 
90 | 91 | # matching info 92 | if alpha == 'dynamic': 93 | alpha = dynamic_alpha(len(correct_mask)) 94 | color = error_colormap(epi_errs, conf_thr, alpha=alpha) 95 | 96 | text = [ 97 | f'#Matches {len(kpts0)}', 98 | f'Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}', 99 | f'Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}' 100 | ] 101 | 102 | # make the figure 103 | figure = make_matching_figure(img0, img1, kpts0, kpts1, 104 | color, text=text) 105 | return figure 106 | 107 | def _make_confidence_figure(data, b_id): 108 | # TODO: Implement confidence figure 109 | raise NotImplementedError() 110 | 111 | 112 | def make_matching_figures(data, config, mode='evaluation'): 113 | """ Make matching figures for a batch. 114 | 115 | Args: 116 | data (Dict): a batch updated by PL_LoFTR. 117 | config (Dict): matcher config 118 | Returns: 119 | figures (Dict[str, List[plt.figure]] 120 | """ 121 | assert mode in ['evaluation', 'confidence'] # 'confidence' 122 | figures = {mode: []} 123 | for b_id in range(data['image0'].size(0)): 124 | if mode == 'evaluation': 125 | fig = _make_evaluation_figure( 126 | data, b_id, 127 | alpha=config.TRAINER.PLOT_MATCHES_ALPHA) 128 | elif mode == 'confidence': 129 | fig = _make_confidence_figure(data, b_id) 130 | else: 131 | raise ValueError(f'Unknown plot mode: {mode}') 132 | figures[mode].append(fig) 133 | return figures 134 | 135 | 136 | def dynamic_alpha(n_matches, 137 | milestones=[0, 300, 1000, 2000], 138 | alphas=[1.0, 0.8, 0.4, 0.2]): 139 | if n_matches == 0: 140 | return 1.0 141 | ranges = list(zip(alphas, alphas[1:] + [None])) 142 | loc = bisect.bisect_right(milestones, n_matches) - 1 143 | _range = ranges[loc] 144 | if _range[1] is None: 145 | return _range[0] 146 | return _range[1] + (milestones[loc + 1] - n_matches) / ( 147 | milestones[loc + 1] - milestones[loc]) * (_range[0] - _range[1]) 148 | 149 | 150 | def error_colormap(err, thr, alpha=1.0): 151 | assert alpha <= 1.0 and alpha > 0, f"Invaid alpha value: {alpha}" 152 | x = 1 - np.clip(err / (thr * 2), 0, 1) 153 | return np.clip( 154 | np.stack([2-x*2, x*2, np.zeros_like(x), np.ones_like(x)*alpha], -1), 0, 1) 155 | -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/utils/supervision.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | from loguru import logger 3 | 4 | import torch 5 | from einops import repeat 6 | from kornia.utils import create_meshgrid 7 | 8 | from .geometry import warp_kpts 9 | 10 | ############## ↓ Coarse-Level supervision ↓ ############## 11 | 12 | 13 | @torch.no_grad() 14 | def mask_pts_at_padded_regions(grid_pt, mask): 15 | """For megadepth dataset, zero-padding exists in images""" 16 | mask = repeat(mask, 'n h w -> n (h w) c', c=2) 17 | grid_pt[~mask.bool()] = 0 18 | return grid_pt 19 | 20 | 21 | @torch.no_grad() 22 | def spvs_coarse(data, config): 23 | """ 24 | Update: 25 | data (dict): { 26 | "conf_matrix_gt": [N, hw0, hw1], 27 | 'spv_b_ids': [M] 28 | 'spv_i_ids': [M] 29 | 'spv_j_ids': [M] 30 | 'spv_w_pt0_i': [N, hw0, 2], in original image resolution 31 | 'spv_pt1_i': [N, hw1, 2], in original image resolution 32 | } 33 | 34 | NOTE: 35 | - for scannet dataset, there're 3 kinds of resolution {i, c, f} 36 | - for megadepth dataset, there're 4 kinds of resolution {i, i_resize, c, f} 37 | """ 38 | # 1. 
misc 39 | device = data['image0'].device 40 | N, _, H0, W0 = data['image0'].shape 41 | _, _, H1, W1 = data['image1'].shape 42 | scale = config['LOFTR']['RESOLUTION'][0] 43 | scale0 = scale * data['scale0'][:, None] if 'scale0' in data else scale 44 | scale1 = scale * data['scale1'][:, None] if 'scale0' in data else scale 45 | h0, w0, h1, w1 = map(lambda x: x // scale, [H0, W0, H1, W1]) 46 | 47 | # 2. warp grids 48 | # create kpts in meshgrid and resize them to image resolution 49 | grid_pt0_c = create_meshgrid(h0, w0, False, device).reshape(1, h0*w0, 2).repeat(N, 1, 1) # [N, hw, 2] 50 | grid_pt0_i = scale0 * grid_pt0_c 51 | grid_pt1_c = create_meshgrid(h1, w1, False, device).reshape(1, h1*w1, 2).repeat(N, 1, 1) 52 | grid_pt1_i = scale1 * grid_pt1_c 53 | 54 | # mask padded region to (0, 0), so no need to manually mask conf_matrix_gt 55 | if 'mask0' in data: 56 | grid_pt0_i = mask_pts_at_padded_regions(grid_pt0_i, data['mask0']) 57 | grid_pt1_i = mask_pts_at_padded_regions(grid_pt1_i, data['mask1']) 58 | 59 | # warp kpts bi-directionally and resize them to coarse-level resolution 60 | # (no depth consistency check, since it leads to worse results experimentally) 61 | # (unhandled edge case: points with 0-depth will be warped to the left-up corner) 62 | _, w_pt0_i = warp_kpts(grid_pt0_i, data['depth0'], data['depth1'], data['T_0to1'], data['K0'], data['K1']) 63 | _, w_pt1_i = warp_kpts(grid_pt1_i, data['depth1'], data['depth0'], data['T_1to0'], data['K1'], data['K0']) 64 | w_pt0_c = w_pt0_i / scale1 65 | w_pt1_c = w_pt1_i / scale0 66 | 67 | # 3. check if mutual nearest neighbor 68 | w_pt0_c_round = w_pt0_c[:, :, :].round().long() 69 | nearest_index1 = w_pt0_c_round[..., 0] + w_pt0_c_round[..., 1] * w1 70 | w_pt1_c_round = w_pt1_c[:, :, :].round().long() 71 | nearest_index0 = w_pt1_c_round[..., 0] + w_pt1_c_round[..., 1] * w0 72 | 73 | # corner case: out of boundary 74 | def out_bound_mask(pt, w, h): 75 | return (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + (pt[..., 1] >= h) 76 | nearest_index1[out_bound_mask(w_pt0_c_round, w1, h1)] = 0 77 | nearest_index0[out_bound_mask(w_pt1_c_round, w0, h0)] = 0 78 | 79 | loop_back = torch.stack([nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], dim=0) 80 | correct_0to1 = loop_back == torch.arange(h0*w0, device=device)[None].repeat(N, 1) 81 | correct_0to1[:, 0] = False # ignore the top-left corner 82 | 83 | # 4. construct a gt conf_matrix 84 | conf_matrix_gt = torch.zeros(N, h0*w0, h1*w1, device=device) 85 | b_ids, i_ids = torch.where(correct_0to1 != 0) 86 | j_ids = nearest_index1[b_ids, i_ids] 87 | 88 | conf_matrix_gt[b_ids, i_ids, j_ids] = 1 89 | data.update({'conf_matrix_gt': conf_matrix_gt}) 90 | 91 | # 5. save coarse matches(gt) for training fine level 92 | if len(b_ids) == 0: 93 | logger.warning(f"No groundtruth coarse match found for: {data['pair_names']}") 94 | # this won't affect fine-level loss calculation 95 | b_ids = torch.tensor([0], device=device) 96 | i_ids = torch.tensor([0], device=device) 97 | j_ids = torch.tensor([0], device=device) 98 | 99 | data.update({ 100 | 'spv_b_ids': b_ids, 101 | 'spv_i_ids': i_ids, 102 | 'spv_j_ids': j_ids 103 | }) 104 | 105 | # 6. save intermediate results (for fast fine-level computation) 106 | data.update({ 107 | 'spv_w_pt0_i': w_pt0_i, 108 | 'spv_pt1_i': grid_pt1_i 109 | }) 110 | 111 | 112 | def compute_supervision_coarse(data, config): 113 | assert len(set(data['dataset_name'])) == 1, "Do not support mixed datasets training!" 
114 | data_source = data['dataset_name'][0] 115 | if data_source.lower() in ['scannet', 'megadepth']: 116 | spvs_coarse(data, config) 117 | else: 118 | raise ValueError(f'Unknown data source: {data_source}') 119 | 120 | 121 | ############## ↓ Fine-Level supervision ↓ ############## 122 | 123 | @torch.no_grad() 124 | def spvs_fine(data, config): 125 | """ 126 | Update: 127 | data (dict):{ 128 | "expec_f_gt": [M, 2]} 129 | """ 130 | # 1. misc 131 | # w_pt0_i, pt1_i = data.pop('spv_w_pt0_i'), data.pop('spv_pt1_i') 132 | w_pt0_i, pt1_i = data['spv_w_pt0_i'], data['spv_pt1_i'] 133 | scale = config['LOFTR']['RESOLUTION'][1] 134 | radius = config['LOFTR']['FINE_WINDOW_SIZE'] // 2 135 | 136 | # 2. get coarse prediction 137 | b_ids, i_ids, j_ids = data['b_ids'], data['i_ids'], data['j_ids'] 138 | 139 | # 3. compute gt 140 | scale = scale * data['scale1'][b_ids] if 'scale0' in data else scale 141 | # `expec_f_gt` might exceed the window, i.e. abs(*) > 1, which would be filtered later 142 | expec_f_gt = (w_pt0_i[b_ids, i_ids] - pt1_i[b_ids, j_ids]) / scale / radius # [M, 2] 143 | data.update({"expec_f_gt": expec_f_gt}) 144 | 145 | 146 | def compute_supervision_fine(data, config): 147 | data_source = data['dataset_name'][0] 148 | if data_source.lower() in ['scannet', 'megadepth']: 149 | spvs_fine(data, config) 150 | else: 151 | raise NotImplementedError 152 | -------------------------------------------------------------------------------- /pose_synthesis/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import cv2 5 | from PIL import Image 6 | from rembg import remove 7 | import trimesh 8 | 9 | # predict bbox of the foreground 10 | def pred_bbox(image): 11 | image_nobg = remove(image.convert('RGBA'), alpha_matting=True) 12 | alpha = np.asarray(image_nobg)[:,:,-1] 13 | x_nonzero = np.nonzero(alpha.sum(axis=0)) 14 | y_nonzero = np.nonzero(alpha.sum(axis=1)) 15 | x_min = int(x_nonzero[0].min()) 16 | y_min = int(y_nonzero[0].min()) 17 | x_max = int(x_nonzero[0].max()) 18 | y_max = int(y_nonzero[0].max()) 19 | return x_min, y_min, x_max, y_max 20 | 21 | def image_grid(imgs, rows, cols): 22 | assert len(imgs) == rows*cols 23 | w, h = imgs[0].size 24 | grid = Image.new('RGB', size=(cols*w, rows*h)) 25 | grid_w, grid_h = grid.size 26 | 27 | for i, img in enumerate(imgs): 28 | grid.paste(img, box=(i%cols*w, i//cols*h)) 29 | return grid 30 | 31 | def convert_mesh_format(exp_dir, output_format=".obj"): 32 | ply_path = os.path.join(exp_dir, "mesh.ply") 33 | mesh_path = os.path.join(exp_dir, f"mesh{output_format}") 34 | mesh = trimesh.load_mesh(ply_path) 35 | rotation_matrix = trimesh.transformations.rotation_matrix(np.pi/2, [1, 0, 0]) 36 | mesh.apply_transform(rotation_matrix) 37 | rotation_matrix = trimesh.transformations.rotation_matrix(np.pi, [0, 0, 1]) 38 | mesh.apply_transform(rotation_matrix) 39 | # flip x 40 | mesh.vertices[:, 0] = -mesh.vertices[:, 0] 41 | mesh.faces = np.fliplr(mesh.faces) 42 | if output_format == ".obj": 43 | # Export the mesh as .obj file with colors 44 | mesh.export(mesh_path, file_type='obj', include_color=True) 45 | else: 46 | mesh.export(mesh_path, file_type='glb') 47 | return mesh_path 48 | 49 | # contrast correction, rescale and recenter 50 | def image_preprocess_nosave(input_image, lower_contrast=True, rescale=True): 51 | 52 | image_arr = np.array(input_image) 53 | in_w, in_h = image_arr.shape[:2] 54 | 55 | if lower_contrast: 56 | alpha = 0.8 # Contrast control (1.0-3.0) 57 | 
beta = 0 # Brightness control (0-100) 58 | # Apply the contrast adjustment 59 | image_arr = cv2.convertScaleAbs(image_arr, alpha=alpha, beta=beta) 60 | image_arr[image_arr[...,-1]>200, -1] = 255 61 | 62 | ret, mask = cv2.threshold(np.array(input_image.split()[-1]), 0, 255, cv2.THRESH_BINARY) 63 | x, y, w, h = cv2.boundingRect(mask) 64 | max_size = max(w, h) 65 | ratio = 0.75 66 | if rescale: 67 | side_len = int(max_size / ratio) 68 | else: 69 | side_len = in_w 70 | padded_image = np.zeros((side_len, side_len, 4), dtype=np.uint8) 71 | center = side_len//2 72 | padded_image[center-h//2:center-h//2+h, center-w//2:center-w//2+w] = image_arr[y:y+h, x:x+w] 73 | rgba = Image.fromarray(padded_image).resize((256, 256), Image.LANCZOS) 74 | 75 | rgba_arr = np.array(rgba) / 255.0 76 | rgb = rgba_arr[...,:3] * rgba_arr[...,-1:] + (1 - rgba_arr[...,-1:]) 77 | return Image.fromarray((rgb * 255).astype(np.uint8)) 78 | 79 | # pose generation 80 | def calc_pose(phis, thetas, size, radius = 1.2, device='cuda'): 81 | import torch 82 | def normalize(vectors): 83 | return vectors / (torch.norm(vectors, dim=-1, keepdim=True) + 1e-10) 84 | thetas = torch.FloatTensor(thetas).to(device) 85 | phis = torch.FloatTensor(phis).to(device) 86 | 87 | centers = torch.stack([ 88 | radius * torch.sin(thetas) * torch.sin(phis), 89 | -radius * torch.cos(thetas) * torch.sin(phis), 90 | radius * torch.cos(phis), 91 | ], dim=-1) # [B, 3] 92 | 93 | # lookat 94 | forward_vector = normalize(centers).squeeze(0) 95 | up_vector = torch.FloatTensor([0, 0, 1]).to(device).unsqueeze(0).repeat(size, 1) 96 | right_vector = normalize(torch.cross(up_vector, forward_vector, dim=-1)) 97 | if right_vector.pow(2).sum() < 0.01: 98 | right_vector = torch.FloatTensor([0, 1, 0]).to(device).unsqueeze(0).repeat(size, 1) 99 | up_vector = normalize(torch.cross(forward_vector, right_vector, dim=-1)) 100 | 101 | poses = torch.eye(4, dtype=torch.float, device=device)[:3].unsqueeze(0).repeat(size, 1, 1) 102 | poses[:, :3, :3] = torch.stack((right_vector, up_vector, forward_vector), dim=-1) 103 | poses[:, :3, 3] = centers 104 | return poses 105 | 106 | def get_poses(init_elev): 107 | mid = init_elev 108 | deg = 10 109 | if init_elev <= 75: 110 | low = init_elev + 30 111 | # e.g. 
30, 60, 20, 40, 30, 30, 50, 70, 50, 50
112 |
113 | elevations = np.radians([mid]*4 + [low]*4 + [mid-deg,mid+deg,mid,mid]*4 + [low-deg,low+deg,low,low]*4)
114 | img_ids = [f"{num}.png" for num in range(8)] + [f"{num}_{view_num}.png" for num in range(8) for view_num in range(4)]
115 | else:
116 |
117 | high = init_elev - 30
118 | elevations = np.radians([mid]*4 + [high]*4 + [mid-deg,mid+deg,mid,mid]*4 + [high-deg,high+deg,high,high]*4)
119 | img_ids = [f"{num}.png" for num in list(range(4)) + list(range(8,12))] + \
120 | [f"{num}_{view_num}.png" for num in list(range(4)) + list(range(8,12)) for view_num in range(4)]
121 | overlook_theta = [30+x*90 for x in range(4)]
122 | eyelevel_theta = [60+x*90 for x in range(4)]
123 | source_theta_delta = [0, 0, -deg, deg]
124 | azimuths = np.radians(overlook_theta + eyelevel_theta + \
125 | [view_theta + source for view_theta in overlook_theta for source in source_theta_delta] + \
126 | [view_theta + source for view_theta in eyelevel_theta for source in source_theta_delta])
127 | return img_ids, calc_pose(elevations, azimuths, len(azimuths)).cpu().numpy()
128 |
129 |
130 | def gen_poses(shape_dir, pose_est):
131 | img_ids, input_poses = get_poses(pose_est)
132 |
133 | out_dict = {}
134 | focal = 560/2; h = w = 256
135 | out_dict['intrinsics'] = [[focal, 0, w / 2], [0, focal, h / 2], [0, 0, 1]]
136 | out_dict['near_far'] = [1.2-0.7, 1.2+0.6]
137 | out_dict['c2ws'] = {}
138 | for view_id, img_id in enumerate(img_ids):
139 | pose = input_poses[view_id]
140 | pose = pose.tolist()
141 | pose = [pose[0], pose[1], pose[2], [0, 0, 0, 1]]
142 | out_dict['c2ws'][img_id] = pose
143 | json_path = os.path.join(shape_dir, 'pose.json')
144 | with open(json_path, 'w') as f:
145 | json.dump(out_dict, f, indent=4)
146 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Integrating View Conditions for Image Synthesis
2 | This is the official implementation of the paper "Integrating View Conditions for Image Synthesis", which has been accepted by IJCAI 2024. 🎉
3 |
4 | [[Paper]](https://www.ijcai.org/proceedings/2024/840)
5 |
6 | ## Introduction
7 |
8 | This paper presents **ViewControl**, which enhances existing models with awareness of viewpoint information, thereby
9 | enabling improved control over text-to-image diffusion models, such as Stable Diffusion. This advancement leads to a
10 | more controllable approach for image editing tasks. Our proposed pipeline effectively addresses crucial aspects of image synthesis, including *consistency*, *controllability*, and *harmony*. Through both quantitative and qualitative comparisons with recently published
11 | open-source state-of-the-art methods, we demonstrate the
12 | favorable performance of our approach across multiple dimensions.
13 |
14 |
15 | ## Pipeline
16 |
17 | The pipeline of ViewControl consists of three steps: LLM Planner, Pose Estimation and Synthesis, and Image Synthesis. The LLM Planner is responsible for understanding the user's request and bridging the gap between that request and the subsequent steps. The Pose Estimation and Synthesis module estimates the pose of the object in the input image and synthesizes an image of the object at the target pose. The Image Synthesis module composes the final image by combining the synthesized image of the object with the background of the input image.
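A minimal, command-level sketch of this flow is shown below; it simply chains the CLI entry points documented under Inference, so the paths, pose values, object names, prompt, and intermediate file layout are placeholders to adapt to your own setup:

```python
# Illustrative glue script only -- every path and argument below is a placeholder.
import os
import subprocess

src_img = os.path.abspath("imgs/sofa_set/sofa_1_a.png")   # input image (placeholder)
work_dir = os.path.abspath("./exp/sofa_1_a")              # working directory (placeholder)
target_x, target_y = "0", "30"                            # target pose chosen by the LLM Planner

# Pose Estimation and Synthesis: estimate the object's current pose, then render it at the target pose.
subprocess.run(["python", "pose_estimation.py", "--image_path", src_img,
                "--output_dir", work_dir, "--model_path", "best_model.pth"], check=True)
subprocess.run(["python", "pose_synthesis_batch.py", "--input_dir", work_dir,
                "--output_dir", work_dir, "--x", target_x, "--y", target_y],
               cwd="pose_synthesis", check=True)

# Image Synthesis: composite the re-posed object back into the source background.
ref_img = os.path.join(work_dir, f"{target_x}_{target_y}.png")  # assumes the x_y.png naming convention
subprocess.run(["python", "image_synthesis.py", "--path_src_img", src_img,
                "--path_ref_img", ref_img, "--text_prompt", "a sofa in a living room",
                "--save_path", os.path.join(work_dir, "result.png"),
                "--mask_obj_name", "sofa", "--ref_obj_name", "sofa"], check=True)
```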
The pipeline of ViewControl is shown in the following figure: 18 | 19 |

20 | 21 | 22 | ## Installation 23 | First, clone the repository locally: 24 | ```bash 25 | git clone https://github.com/huggingface/diffusers.git 26 | git clone https://github.com/luca-medeiros/lang-segment-anything.git 27 | git clone https://github.com/IDEA-Research/GroundingDINO.git 28 | ``` 29 | Then, create a conda environment and install the required packages: 30 | ```bash 31 | conda create -n view_cond python=3.10 32 | conda activate view_cond 33 | 34 | cd diffusers 35 | pip install -e . 36 | 37 | cd ../lang-segment-anything 38 | pip install torch torchvision 39 | pip install -e . 40 | 41 | cd ../GroundingDINO 42 | pip install -e . 43 | mkdir weights 44 | cd weights 45 | wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth 46 | cd .. 47 | 48 | cd .. 49 | 50 | pip install -r requirements.txt 51 | cd pose_synthesis 52 | python download_ckpt.py 53 | 54 | pip install --upgrade torchaudio 55 | cd .. 56 | ``` 57 | 58 | ## Training 59 | If you want to train your own pose estimator, you can use the following command: 60 | ```bash 61 | python train_pose_estimator.py --dataset_path --output_dir 62 | ``` 63 | You may need to adjust the hyperparameters (learning rate, batch size, etc.) in the script to get the best performance. 64 | 65 | Your dataset should be organized as follows: 66 | ``` 67 | dataset 68 | ├── class_1 69 | │ ├── obj_1 70 | │ │ ├── x1_y1.png 71 | │ │ ├── x2_y2.png 72 | │ │ ├── ... 73 | │ │ └── xN_yN.png 74 | │ ├── obj_2 75 | │ │ ├── x1_y1.png 76 | │ │ ├── x2_y2.png 77 | │ │ ├── ... 78 | │ │ └── xN_yN.png 79 | │ ├── ... 80 | │ └── obj_N 81 | │ ├── ... 82 | |── class_N 83 | │ ├── ... 84 | ``` 85 | where `x1_y1.png` is the image of `obj_1` at pose `(x1, y1)`, and `class_1` is the class name of `obj_1`. The dataset can be synthetic or real. If you want to synthesize your own dataset, first prepare a set of images of the object for the same class with the same pose, then use the pose_synthesis module to synthesize the images of the object at different poses. 86 | 87 | 88 | ## Inference 89 | 90 | ### Pose Estimation 91 | To estimate the pose of a given image, you can use the following command: 92 | ```bash 93 | python pose_estimation.py --image_path --output_dir --model_path 94 | ``` 95 | ### Pose Synthesis 96 | To synthesize an image of one object from a given pose, you can use the following command: 97 | ```bash 98 | cd pose_synthesis 99 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x x_value --y y_value 100 | cd .. 101 | ``` 102 | for example: 103 | ```bash 104 | cd pose_synthesis 105 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x 0 --y 0 106 | cd .. 107 | ``` 108 | To synthesis a set of images of one object from a given set of poses, you can use the following command: 109 | ```bash 110 | cd pose_synthesis 111 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x x_values --y y_values 112 | cd .. 113 | ``` 114 | for example: 115 | ```bash 116 | cd pose_synthesis 117 | python pose_synthesis_batch.py --input_dir input_dir --output_dir output_dir --x 0,10 --y 0,-10 118 | cd .. 
119 | ```
120 | ### Image Synthesis
121 | To synthesize an image, you can use the following command:
122 | ```bash
123 | python image_synthesis.py --path_src_img --path_ref_img --text_prompt --save_path --mask_obj_name --ref_obj_name
124 | ```
125 | If you need faster inference, you can set the DreamBooth option to False, pre-train it, or switch to another lightweight personalization method such as LoRA.
126 |
127 | ### Other Utils
128 | If you need to obtain a more accurate caption or class name from an image, you can use the following command:
129 | ```bash
130 | python obj_name_synthesis.py --path_src_img --save_path
131 | ```
132 |
133 | If you need to remove the background of an image, you can use the following command:
134 | ```bash
135 | python utils.py --input_path --prompt --output_path
136 | ```
137 |
138 | ## Examples
139 | Here are some examples of the results of ViewControl:
140 |

141 | 142 |

143 | 144 | 145 | ## Citation 146 | If you find this work useful, please cite our paper: 147 | ``` 148 | @inproceedings{ijcai2024p840, 149 | title = {Integrating View Conditions for Image Synthesis}, 150 | author = {Bai, Jinbin and Dong, Zhen and Feng, Aosong and Zhang, Xiao and Ye, Tian and Zhou, Kaicheng}, 151 | booktitle = {Proceedings of the Thirty-Third International Joint Conference on 152 | Artificial Intelligence, {IJCAI-24}}, 153 | publisher = {International Joint Conferences on Artificial Intelligence Organization}, 154 | editor = {Kate Larson}, 155 | pages = {7591--7599}, 156 | year = {2024}, 157 | month = {8}, 158 | note = {AI, Arts & Creativity}, 159 | doi = {10.24963/ijcai.2024/840}, 160 | url = {https://doi.org/10.24963/ijcai.2024/840}, 161 | } 162 | ``` 163 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/data/nerf_like.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import os 3 | import json 4 | import numpy as np 5 | import torch 6 | import imageio 7 | import math 8 | import cv2 9 | from torchvision import transforms 10 | 11 | def cartesian_to_spherical(xyz): 12 | ptsnew = np.hstack((xyz, np.zeros(xyz.shape))) 13 | xy = xyz[:,0]**2 + xyz[:,1]**2 14 | z = np.sqrt(xy + xyz[:,2]**2) 15 | theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down 16 | #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up 17 | azimuth = np.arctan2(xyz[:,1], xyz[:,0]) 18 | return np.array([theta, azimuth, z]) 19 | 20 | 21 | def get_T(T_target, T_cond): 22 | theta_cond, azimuth_cond, z_cond = cartesian_to_spherical(T_cond[None, :]) 23 | theta_target, azimuth_target, z_target = cartesian_to_spherical(T_target[None, :]) 24 | 25 | d_theta = theta_target - theta_cond 26 | d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi) 27 | d_z = z_target - z_cond 28 | 29 | d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()]) 30 | return d_T 31 | 32 | def get_spherical(T_target, T_cond): 33 | theta_cond, azimuth_cond, z_cond = cartesian_to_spherical(T_cond[None, :]) 34 | theta_target, azimuth_target, z_target = cartesian_to_spherical(T_target[None, :]) 35 | 36 | d_theta = theta_target - theta_cond 37 | d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi) 38 | d_z = z_target - z_cond 39 | 40 | d_T = torch.tensor([math.degrees(d_theta.item()), math.degrees(d_azimuth.item()), d_z.item()]) 41 | return d_T 42 | 43 | class RTMV(Dataset): 44 | def __init__(self, root_dir='datasets/RTMV/google_scanned',\ 45 | first_K=64, resolution=256, load_target=False): 46 | self.root_dir = root_dir 47 | self.scene_list = sorted(next(os.walk(root_dir))[1]) 48 | self.resolution = resolution 49 | self.first_K = first_K 50 | self.load_target = load_target 51 | 52 | def __len__(self): 53 | return len(self.scene_list) 54 | 55 | def __getitem__(self, idx): 56 | scene_dir = os.path.join(self.root_dir, self.scene_list[idx]) 57 | with open(os.path.join(scene_dir, 'transforms.json'), "r") as f: 58 | meta = json.load(f) 59 | imgs = [] 60 | poses = [] 61 | for i_img in range(self.first_K): 62 | meta_img = meta['frames'][i_img] 63 | 64 | if i_img == 0 or self.load_target: 65 | img_path = os.path.join(scene_dir, meta_img['file_path']) 66 | img = imageio.imread(img_path) 67 | img = cv2.resize(img, (self.resolution, self.resolution), interpolation = cv2.INTER_LINEAR) 68 | 
imgs.append(img) 69 | 70 | c2w = meta_img['transform_matrix'] 71 | poses.append(c2w) 72 | 73 | imgs = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs 74 | imgs = torch.tensor(self.blend_rgba(imgs)).permute(0, 3, 1, 2) 75 | imgs = imgs * 2 - 1. # convert to stable diffusion range 76 | poses = torch.tensor(np.array(poses).astype(np.float32)) 77 | return imgs, poses 78 | 79 | def blend_rgba(self, img): 80 | img = img[..., :3] * img[..., -1:] + (1. - img[..., -1:]) # blend A to RGB 81 | return img 82 | 83 | 84 | class GSO(Dataset): 85 | def __init__(self, root_dir='datasets/GoogleScannedObjects',\ 86 | split='val', first_K=5, resolution=256, load_target=False, name='render_mvs'): 87 | self.root_dir = root_dir 88 | with open(os.path.join(root_dir, '%s.json' % split), "r") as f: 89 | self.scene_list = json.load(f) 90 | self.resolution = resolution 91 | self.first_K = first_K 92 | self.load_target = load_target 93 | self.name = name 94 | 95 | def __len__(self): 96 | return len(self.scene_list) 97 | 98 | def __getitem__(self, idx): 99 | scene_dir = os.path.join(self.root_dir, self.scene_list[idx]) 100 | with open(os.path.join(scene_dir, 'transforms_%s.json' % self.name), "r") as f: 101 | meta = json.load(f) 102 | imgs = [] 103 | poses = [] 104 | for i_img in range(self.first_K): 105 | meta_img = meta['frames'][i_img] 106 | 107 | if i_img == 0 or self.load_target: 108 | img_path = os.path.join(scene_dir, meta_img['file_path']) 109 | img = imageio.imread(img_path) 110 | img = cv2.resize(img, (self.resolution, self.resolution), interpolation = cv2.INTER_LINEAR) 111 | imgs.append(img) 112 | 113 | c2w = meta_img['transform_matrix'] 114 | poses.append(c2w) 115 | 116 | imgs = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs 117 | mask = imgs[:, :, :, -1] 118 | imgs = torch.tensor(self.blend_rgba(imgs)).permute(0, 3, 1, 2) 119 | imgs = imgs * 2 - 1. # convert to stable diffusion range 120 | poses = torch.tensor(np.array(poses).astype(np.float32)) 121 | return imgs, poses 122 | 123 | def blend_rgba(self, img): 124 | img = img[..., :3] * img[..., -1:] + (1. - img[..., -1:]) # blend A to RGB 125 | return img 126 | 127 | class WILD(Dataset): 128 | def __init__(self, root_dir='data/nerf_wild',\ 129 | first_K=33, resolution=256, load_target=False): 130 | self.root_dir = root_dir 131 | self.scene_list = sorted(next(os.walk(root_dir))[1]) 132 | self.resolution = resolution 133 | self.first_K = first_K 134 | self.load_target = load_target 135 | 136 | def __len__(self): 137 | return len(self.scene_list) 138 | 139 | def __getitem__(self, idx): 140 | scene_dir = os.path.join(self.root_dir, self.scene_list[idx]) 141 | with open(os.path.join(scene_dir, 'transforms_train.json'), "r") as f: 142 | meta = json.load(f) 143 | imgs = [] 144 | poses = [] 145 | for i_img in range(self.first_K): 146 | meta_img = meta['frames'][i_img] 147 | 148 | if i_img == 0 or self.load_target: 149 | img_path = os.path.join(scene_dir, meta_img['file_path']) 150 | img = imageio.imread(img_path + '.png') 151 | img = cv2.resize(img, (self.resolution, self.resolution), interpolation = cv2.INTER_LINEAR) 152 | imgs.append(img) 153 | 154 | c2w = meta_img['transform_matrix'] 155 | poses.append(c2w) 156 | 157 | imgs = (np.array(imgs) / 255.).astype(np.float32) # (RGBA) imgs 158 | imgs = torch.tensor(self.blend_rgba(imgs)).permute(0, 3, 1, 2) 159 | imgs = imgs * 2 - 1. 
# convert to stable diffusion range 160 | poses = torch.tensor(np.array(poses).astype(np.float32)) 161 | return imgs, poses 162 | 163 | def blend_rgba(self, img): 164 | img = img[..., :3] * img[..., -1:] + (1. - img[..., -1:]) # blend A to RGB 165 | return img -------------------------------------------------------------------------------- /pose_synthesis/elevation_estimate/loftr/backbone/resnet_fpn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | def conv1x1(in_planes, out_planes, stride=1): 6 | """1x1 convolution without padding""" 7 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False) 8 | 9 | 10 | def conv3x3(in_planes, out_planes, stride=1): 11 | """3x3 convolution with padding""" 12 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 13 | 14 | 15 | class BasicBlock(nn.Module): 16 | def __init__(self, in_planes, planes, stride=1): 17 | super().__init__() 18 | self.conv1 = conv3x3(in_planes, planes, stride) 19 | self.conv2 = conv3x3(planes, planes) 20 | self.bn1 = nn.BatchNorm2d(planes) 21 | self.bn2 = nn.BatchNorm2d(planes) 22 | self.relu = nn.ReLU(inplace=True) 23 | 24 | if stride == 1: 25 | self.downsample = None 26 | else: 27 | self.downsample = nn.Sequential( 28 | conv1x1(in_planes, planes, stride=stride), 29 | nn.BatchNorm2d(planes) 30 | ) 31 | 32 | def forward(self, x): 33 | y = x 34 | y = self.relu(self.bn1(self.conv1(y))) 35 | y = self.bn2(self.conv2(y)) 36 | 37 | if self.downsample is not None: 38 | x = self.downsample(x) 39 | 40 | return self.relu(x+y) 41 | 42 | 43 | class ResNetFPN_8_2(nn.Module): 44 | """ 45 | ResNet+FPN, output resolution are 1/8 and 1/2. 46 | Each block has 2 layers. 47 | """ 48 | 49 | def __init__(self, config): 50 | super().__init__() 51 | # Config 52 | block = BasicBlock 53 | initial_dim = config['initial_dim'] 54 | block_dims = config['block_dims'] 55 | 56 | # Class Variable 57 | self.in_planes = initial_dim 58 | 59 | # Networks 60 | self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) 61 | self.bn1 = nn.BatchNorm2d(initial_dim) 62 | self.relu = nn.ReLU(inplace=True) 63 | 64 | self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 65 | self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 66 | self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 67 | 68 | # 3. 
FPN upsample 69 | self.layer3_outconv = conv1x1(block_dims[2], block_dims[2]) 70 | self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) 71 | self.layer2_outconv2 = nn.Sequential( 72 | conv3x3(block_dims[2], block_dims[2]), 73 | nn.BatchNorm2d(block_dims[2]), 74 | nn.LeakyReLU(), 75 | conv3x3(block_dims[2], block_dims[1]), 76 | ) 77 | self.layer1_outconv = conv1x1(block_dims[0], block_dims[1]) 78 | self.layer1_outconv2 = nn.Sequential( 79 | conv3x3(block_dims[1], block_dims[1]), 80 | nn.BatchNorm2d(block_dims[1]), 81 | nn.LeakyReLU(), 82 | conv3x3(block_dims[1], block_dims[0]), 83 | ) 84 | 85 | for m in self.modules(): 86 | if isinstance(m, nn.Conv2d): 87 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 88 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 89 | nn.init.constant_(m.weight, 1) 90 | nn.init.constant_(m.bias, 0) 91 | 92 | def _make_layer(self, block, dim, stride=1): 93 | layer1 = block(self.in_planes, dim, stride=stride) 94 | layer2 = block(dim, dim, stride=1) 95 | layers = (layer1, layer2) 96 | 97 | self.in_planes = dim 98 | return nn.Sequential(*layers) 99 | 100 | def forward(self, x): 101 | # ResNet Backbone 102 | x0 = self.relu(self.bn1(self.conv1(x))) 103 | x1 = self.layer1(x0) # 1/2 104 | x2 = self.layer2(x1) # 1/4 105 | x3 = self.layer3(x2) # 1/8 106 | 107 | # FPN 108 | x3_out = self.layer3_outconv(x3) 109 | 110 | x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True) 111 | x2_out = self.layer2_outconv(x2) 112 | x2_out = self.layer2_outconv2(x2_out+x3_out_2x) 113 | 114 | x2_out_2x = F.interpolate(x2_out, scale_factor=2., mode='bilinear', align_corners=True) 115 | x1_out = self.layer1_outconv(x1) 116 | x1_out = self.layer1_outconv2(x1_out+x2_out_2x) 117 | 118 | return [x3_out, x1_out] 119 | 120 | 121 | class ResNetFPN_16_4(nn.Module): 122 | """ 123 | ResNet+FPN, output resolution are 1/16 and 1/4. 124 | Each block has 2 layers. 125 | """ 126 | 127 | def __init__(self, config): 128 | super().__init__() 129 | # Config 130 | block = BasicBlock 131 | initial_dim = config['initial_dim'] 132 | block_dims = config['block_dims'] 133 | 134 | # Class Variable 135 | self.in_planes = initial_dim 136 | 137 | # Networks 138 | self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) 139 | self.bn1 = nn.BatchNorm2d(initial_dim) 140 | self.relu = nn.ReLU(inplace=True) 141 | 142 | self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 143 | self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 144 | self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 145 | self.layer4 = self._make_layer(block, block_dims[3], stride=2) # 1/16 146 | 147 | # 3. 
FPN upsample 148 | self.layer4_outconv = conv1x1(block_dims[3], block_dims[3]) 149 | self.layer3_outconv = conv1x1(block_dims[2], block_dims[3]) 150 | self.layer3_outconv2 = nn.Sequential( 151 | conv3x3(block_dims[3], block_dims[3]), 152 | nn.BatchNorm2d(block_dims[3]), 153 | nn.LeakyReLU(), 154 | conv3x3(block_dims[3], block_dims[2]), 155 | ) 156 | 157 | self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) 158 | self.layer2_outconv2 = nn.Sequential( 159 | conv3x3(block_dims[2], block_dims[2]), 160 | nn.BatchNorm2d(block_dims[2]), 161 | nn.LeakyReLU(), 162 | conv3x3(block_dims[2], block_dims[1]), 163 | ) 164 | 165 | for m in self.modules(): 166 | if isinstance(m, nn.Conv2d): 167 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 168 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 169 | nn.init.constant_(m.weight, 1) 170 | nn.init.constant_(m.bias, 0) 171 | 172 | def _make_layer(self, block, dim, stride=1): 173 | layer1 = block(self.in_planes, dim, stride=stride) 174 | layer2 = block(dim, dim, stride=1) 175 | layers = (layer1, layer2) 176 | 177 | self.in_planes = dim 178 | return nn.Sequential(*layers) 179 | 180 | def forward(self, x): 181 | # ResNet Backbone 182 | x0 = self.relu(self.bn1(self.conv1(x))) 183 | x1 = self.layer1(x0) # 1/2 184 | x2 = self.layer2(x1) # 1/4 185 | x3 = self.layer3(x2) # 1/8 186 | x4 = self.layer4(x3) # 1/16 187 | 188 | # FPN 189 | x4_out = self.layer4_outconv(x4) 190 | 191 | x4_out_2x = F.interpolate(x4_out, scale_factor=2., mode='bilinear', align_corners=True) 192 | x3_out = self.layer3_outconv(x3) 193 | x3_out = self.layer3_outconv2(x3_out+x4_out_2x) 194 | 195 | x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True) 196 | x2_out = self.layer2_outconv(x2) 197 | x2_out = self.layer2_outconv2(x2_out+x3_out_2x) 198 | 199 | return [x4_out, x2_out] 200 | -------------------------------------------------------------------------------- /pose_synthesis/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | 6 |

7 | [Paper] 8 | [Project] 9 | [Demo] 10 | [BibTeX] 11 |

12 | 13 |

14 | 15 | Hugging Face Spaces 16 | 17 |

18 | 19 | One-2-3-45 rethinks how to leverage 2D diffusion models for 3D AIGC and introduces a novel forward-only paradigm that avoids the time-consuming optimization. 20 | 21 | https://github.com/One-2-3-45/One-2-3-45/assets/16759292/a81d6e32-8d29-43a5-b044-b5112b9f9664 22 | 23 | 24 | 25 | https://github.com/One-2-3-45/One-2-3-45/assets/16759292/5ecd45ef-8fd3-4643-af4c-fac3050a0428 26 | 27 | 28 | ## News 29 | **[09/21/2023]** 30 | One-2-3-45 is accepted by NeurIPS 2023. See you in New Orleans! 31 | 32 | **[09/11/2023]** 33 | Training code released. 34 | 35 | **[08/18/2023]** 36 | Inference code released. 37 | 38 | **[07/24/2023]** 39 | Our demo reached the HuggingFace top 4 trending and was featured in 🤗 Spaces of the Week 🔥! Special thanks to HuggingFace 🤗 for sponsoring this demo!! 40 | 41 | **[07/11/2023]** 42 | [Online interactive demo](https://huggingface.co/spaces/One-2-3-45/One-2-3-45) released! Explore it and create your own 3D models in just 45 seconds! 43 | 44 | **[06/29/2023]** 45 | Check out our [paper](https://arxiv.org/pdf/2306.16928.pdf). [[X](https://twitter.com/_akhaliq/status/1674617785119305728)] 46 | 47 | ## Installation 48 | Hardware requirement: an NVIDIA GPU with memory >=18GB (_e.g._, RTX 3090 or A10). Tested on Ubuntu. 49 | 50 | We offer two ways to setup the environment: 51 | 52 | ### Traditional Installation 53 |
54 | Step 1: Install Debian packages. 55 | 56 | ```bash 57 | sudo apt update && sudo apt install git-lfs libsparsehash-dev build-essential 58 | ``` 59 |
60 | 61 |
62 | Step 2: Create and activate a conda environment. 63 | 64 | ```bash 65 | conda create -n One2345 python=3.10 66 | conda activate One2345 67 | ``` 68 |
69 | 70 |
71 | Step 3: Clone the repository to the local machine. 72 | 73 | ```bash 74 | # Make sure you have git-lfs installed. 75 | git lfs install 76 | git clone https://github.com/One-2-3-45/One-2-3-45 77 | cd One-2-3-45 78 | ``` 79 |
80 | 81 |
82 | Step 4: Install project dependencies using pip. 83 | 84 | ```bash 85 | # Ensure that the installed CUDA version matches the torch's cuda version. 86 | # Example: CUDA 11.8 installation 87 | wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run 88 | sudo sh cuda_11.8.0_520.61.05_linux.run 89 | export PATH="/usr/local/cuda-11.8/bin:$PATH" 90 | export LD_LIBRARY_PATH="/usr/local/cuda-11.8/lib64:$LD_LIBRARY_PATH" 91 | # Install PyTorch 2.0 92 | pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 93 | # Install dependencies 94 | pip install -r requirements.txt 95 | # Install inplace_abn and torchsparse 96 | export TORCH_CUDA_ARCH_LIST="7.0;7.2;8.0;8.6+PTX" # CUDA architectures. Modify according to your hardware. 97 | export IABN_FORCE_CUDA=1 98 | pip install inplace_abn 99 | FORCE_CUDA=1 pip install --no-cache-dir git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0 100 | ``` 101 |
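After the dependencies are installed, a quick sanity check along the following lines can confirm that the GPU build is usable (this snippet is an optional addition, not part of the original instructions):

```python
# Optional environment check: verify that PyTorch sees the GPU and that the two
# compiled extensions import cleanly.
import torch
import inplace_abn      # built with IABN_FORCE_CUDA=1 above
import torchsparse      # built with FORCE_CUDA=1 above

print(torch.__version__, "CUDA available:", torch.cuda.is_available())
```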
102 | 103 |
104 | Step 5: Download model checkpoints. 105 | 106 | ```bash 107 | python download_ckpt.py 108 | ``` 109 |
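If you want to confirm the download succeeded, a check like the one below can be used; the file names are an assumption based on this repository's layout, where `zero123-xl.ckpt` and `sam_vit_h_4b8939.pth` sit next to `download_ckpt.py`:

```python
# Optional check that the expected checkpoint files exist after running download_ckpt.py.
import os

for ckpt in ["zero123-xl.ckpt", "sam_vit_h_4b8939.pth"]:
    size_mb = os.path.getsize(ckpt) / 1e6 if os.path.exists(ckpt) else 0
    print(f"{ckpt}: {'ok' if size_mb > 0 else 'MISSING'} ({size_mb:.0f} MB)")
```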
110 | 111 | 112 | ### Installation by Docker Images 113 |
114 | Option 1: Pull and Play (environment and checkpoints). (~22.3G) 115 | 116 | ```bash 117 | # Pull the Docker image that contains the full repository. 118 | docker pull chaoxu98/one2345:demo_1.0 119 | # An interactive demo will be launched automatically upon running the container. 120 | # This will provide a public URL like XXXXXXX.gradio.live 121 | docker run --name One-2-3-45_demo --gpus all -it chaoxu98/one2345:demo_1.0 122 | ``` 123 |
124 | 125 |
126 | Option 2: Environment Only. (~7.3G) 127 | 128 | ```bash 129 | # Pull the Docker image that installed all project dependencies. 130 | docker pull chaoxu98/one2345:1.0 131 | # Start a Docker container named One2345. 132 | docker run --name One-2-3-45 --gpus all -it chaoxu98/one2345:1.0 133 | # Get a bash shell in the container. 134 | docker exec -it One-2-3-45 /bin/bash 135 | # Clone the repository to the local machine. 136 | git clone https://github.com/One-2-3-45/One-2-3-45 137 | cd One-2-3-45 138 | # Download model checkpoints. 139 | python download_ckpt.py 140 | # Refer to getting started for inference. 141 | ``` 142 |
143 | 144 | ## Getting Started (Inference) 145 | 146 | First-time running will take longer time to compile the models. 147 | 148 | Expected time cost per image: 40s on an NVIDIA A6000. 149 | ```bash 150 | # 1. Script 151 | python run.py --img_path PATH_TO_INPUT_IMG --half_precision 152 | 153 | # 2. Interactive demo (Gradio) with a friendly web interface 154 | # An URL will be provided in the output 155 | # (Local: 127.0.0.1:7860; Public: XXXXXXX.gradio.live) 156 | cd demo/ 157 | python app.py 158 | 159 | # 3. Jupyter Notebook 160 | example.ipynb 161 | ``` 162 | 163 | ## Training Your Own Model 164 | 165 | ### Data Preparation 166 | We use Objaverse-LVIS dataset for training and render the selected shapes (with CC-BY license) into 2D images with Blender. 167 | #### Download the training images. 168 | Download all One2345.zip.part-* files (5 files in total) from here and then cat them into a single .zip file using the following command: 169 | ```bash 170 | cat One2345.zip.part-* > One2345.zip 171 | ``` 172 | 173 | #### Unzip the training images zip file. 174 | Unzip the zip file into a folder specified by yourself (`YOUR_BASE_FOLDER`) with the following command: 175 | 176 | ```bash 177 | unzip One2345.zip -d YOUR_BASE_FOLDER 178 | ``` 179 | 180 | #### Download meta files. 181 | 182 | Download `One2345_training_pose.json` and `lvis_split_cc_by.json` from here and put them into the same folder as the training images (`YOUR_BASE_FOLDER`). 183 | 184 | Your file structure should look like this: 185 | ``` 186 | # One2345 is your base folder used in the previous steps 187 | 188 | One2345 189 | ├── One2345_training_pose.json 190 | ├── lvis_split_cc_by.json 191 | └── zero12345_narrow 192 | ├── 000-000 193 | ├── 000-001 194 | ├── 000-002 195 | ... 196 | └── 000-159 197 | 198 | ``` 199 | 200 | ### Training 201 | Specify the `trainpath`, `valpath`, and `testpath` in the config file `./reconstruction/confs/one2345_lod_train.conf` to be `YOUR_BASE_FOLDER` used in data preparation steps and run the following command: 202 | ```bash 203 | cd reconstruction 204 | python exp_runner_generic_blender_train.py --mode train --conf confs/one2345_lod_train.conf 205 | ``` 206 | Experiment logs and checkpoints will be saved in `./reconstruction/exp/`. 207 | 208 | ## Citation 209 | 210 | If you find our code helpful, please cite our paper: 211 | 212 | ``` 213 | @misc{liu2023one2345, 214 | title={One-2-3-45: Any Single Image to 3D Mesh in 45 Seconds without Per-Shape Optimization}, 215 | author={Minghua Liu and Chao Xu and Haian Jin and Linghao Chen and Mukund Varma T and Zexiang Xu and Hao Su}, 216 | year={2023}, 217 | eprint={2306.16928}, 218 | archivePrefix={arXiv}, 219 | primaryClass={cs.CV} 220 | } 221 | ``` 222 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/modules/losses/vqperceptual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from einops import repeat 5 | 6 | from taming.modules.discriminator.model import NLayerDiscriminator, weights_init 7 | from taming.modules.losses.lpips import LPIPS 8 | from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss 9 | 10 | 11 | def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights): 12 | assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0] 13 | loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3]) 14 | loss_fake = torch.mean(F.relu(1. 
+ logits_fake), dim=[1,2,3]) 15 | loss_real = (weights * loss_real).sum() / weights.sum() 16 | loss_fake = (weights * loss_fake).sum() / weights.sum() 17 | d_loss = 0.5 * (loss_real + loss_fake) 18 | return d_loss 19 | 20 | def adopt_weight(weight, global_step, threshold=0, value=0.): 21 | if global_step < threshold: 22 | weight = value 23 | return weight 24 | 25 | 26 | def measure_perplexity(predicted_indices, n_embed): 27 | # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py 28 | # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally 29 | encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed) 30 | avg_probs = encodings.mean(0) 31 | perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp() 32 | cluster_use = torch.sum(avg_probs > 0) 33 | return perplexity, cluster_use 34 | 35 | def l1(x, y): 36 | return torch.abs(x-y) 37 | 38 | 39 | def l2(x, y): 40 | return torch.pow((x-y), 2) 41 | 42 | 43 | class VQLPIPSWithDiscriminator(nn.Module): 44 | def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0, 45 | disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, 46 | perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, 47 | disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips", 48 | pixel_loss="l1"): 49 | super().__init__() 50 | assert disc_loss in ["hinge", "vanilla"] 51 | assert perceptual_loss in ["lpips", "clips", "dists"] 52 | assert pixel_loss in ["l1", "l2"] 53 | self.codebook_weight = codebook_weight 54 | self.pixel_weight = pixelloss_weight 55 | if perceptual_loss == "lpips": 56 | print(f"{self.__class__.__name__}: Running with LPIPS.") 57 | self.perceptual_loss = LPIPS().eval() 58 | else: 59 | raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<") 60 | self.perceptual_weight = perceptual_weight 61 | 62 | if pixel_loss == "l1": 63 | self.pixel_loss = l1 64 | else: 65 | self.pixel_loss = l2 66 | 67 | self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, 68 | n_layers=disc_num_layers, 69 | use_actnorm=use_actnorm, 70 | ndf=disc_ndf 71 | ).apply(weights_init) 72 | self.discriminator_iter_start = disc_start 73 | if disc_loss == "hinge": 74 | self.disc_loss = hinge_d_loss 75 | elif disc_loss == "vanilla": 76 | self.disc_loss = vanilla_d_loss 77 | else: 78 | raise ValueError(f"Unknown GAN loss '{disc_loss}'.") 79 | print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.") 80 | self.disc_factor = disc_factor 81 | self.discriminator_weight = disc_weight 82 | self.disc_conditional = disc_conditional 83 | self.n_classes = n_classes 84 | 85 | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): 86 | if last_layer is not None: 87 | nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] 88 | g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] 89 | else: 90 | nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] 91 | g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] 92 | 93 | d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) 94 | d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() 95 | d_weight = d_weight * self.discriminator_weight 96 | return d_weight 97 | 98 | def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx, 99 | global_step, last_layer=None, cond=None, split="train", predicted_indices=None): 100 | if not exists(codebook_loss): 101 
| codebook_loss = torch.tensor([0.]).to(inputs.device) 102 | #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) 103 | rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous()) 104 | if self.perceptual_weight > 0: 105 | p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) 106 | rec_loss = rec_loss + self.perceptual_weight * p_loss 107 | else: 108 | p_loss = torch.tensor([0.0]) 109 | 110 | nll_loss = rec_loss 111 | #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] 112 | nll_loss = torch.mean(nll_loss) 113 | 114 | # now the GAN part 115 | if optimizer_idx == 0: 116 | # generator update 117 | if cond is None: 118 | assert not self.disc_conditional 119 | logits_fake = self.discriminator(reconstructions.contiguous()) 120 | else: 121 | assert self.disc_conditional 122 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) 123 | g_loss = -torch.mean(logits_fake) 124 | 125 | try: 126 | d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) 127 | except RuntimeError: 128 | assert not self.training 129 | d_weight = torch.tensor(0.0) 130 | 131 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 132 | loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean() 133 | 134 | log = {"{}/total_loss".format(split): loss.clone().detach().mean(), 135 | "{}/quant_loss".format(split): codebook_loss.detach().mean(), 136 | "{}/nll_loss".format(split): nll_loss.detach().mean(), 137 | "{}/rec_loss".format(split): rec_loss.detach().mean(), 138 | "{}/p_loss".format(split): p_loss.detach().mean(), 139 | "{}/d_weight".format(split): d_weight.detach(), 140 | "{}/disc_factor".format(split): torch.tensor(disc_factor), 141 | "{}/g_loss".format(split): g_loss.detach().mean(), 142 | } 143 | if predicted_indices is not None: 144 | assert self.n_classes is not None 145 | with torch.no_grad(): 146 | perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes) 147 | log[f"{split}/perplexity"] = perplexity 148 | log[f"{split}/cluster_usage"] = cluster_usage 149 | return loss, log 150 | 151 | if optimizer_idx == 1: 152 | # second pass for discriminator update 153 | if cond is None: 154 | logits_real = self.discriminator(inputs.contiguous().detach()) 155 | logits_fake = self.discriminator(reconstructions.contiguous().detach()) 156 | else: 157 | logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) 158 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) 159 | 160 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 161 | d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) 162 | 163 | log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), 164 | "{}/logits_real".format(split): logits_real.detach().mean(), 165 | "{}/logits_fake".format(split): logits_fake.detach().mean() 166 | } 167 | return d_loss, log 168 | -------------------------------------------------------------------------------- /train_pose_estimator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torchvision.transforms as transforms 5 | from torch.utils.data import DataLoader, Dataset 6 | from torchvision import models 7 | import numpy as np 8 | import os 9 | 
from PIL import Image
10 | import torch.nn.functional as F
11 | import random
12 | 
13 | name = "log_dinov2_mlp_1e-5_tmp"
14 | gpuid = 1
15 | from transformers import AutoImageProcessor, Dinov2Model
16 | 
17 | import logging
18 | import sys
19 | logging.basicConfig(encoding='utf-8', level=logging.INFO,
20 |                     handlers=[logging.FileHandler("{}.log".format(name)),
21 |                               logging.StreamHandler(sys.stdout)])
22 | device = "cuda:{}".format(gpuid)
23 | model_save_path = 'best_model_{}.pth'.format(name)
24 | processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
25 | vit_model = Dinov2Model.from_pretrained("facebook/dinov2-base")
26 | bs = 128  # 100
27 | lr = 1e-5
28 | data_folder = "./imgs/pose_estimation_train_dataset"
29 | 
30 | class CustomDataset(Dataset):
31 |     def __init__(self, root_dir, transform=None, train=True, test_split=0.2):
32 |         self.root_dir = root_dir
33 |         self.transform = transform
34 |         self.train = train
35 |         self.test_split = test_split
36 |         self.data, self.labels = self.load_data()
37 | 
38 |     def load_data(self):
39 |         data = []
40 |         labels = []
41 | 
42 |         fur_dir_list = [d for d in os.listdir(self.root_dir) if os.path.isdir(os.path.join(self.root_dir, d))]
43 |         for fur_dir in fur_dir_list:
44 |             if "dreambooth" in fur_dir and "old" not in fur_dir:
45 |                 fur_path = os.path.join(self.root_dir, fur_dir)
46 |                 deg_dir_list = [d for d in os.listdir(fur_path) if os.path.isdir(os.path.join(fur_path, d))]
47 | 
48 |                 for deg_dir in deg_dir_list:
49 |                     deg_path = os.path.join(fur_path, deg_dir)
50 |                     files = [f for f in os.listdir(deg_path) if f.endswith('.png')]
51 |                     for file in files:
52 |                         file_path = os.path.join(deg_path, file)
53 |                         r, t = file.split('_')
54 |                         label = [float(r), float(t[:-4])]  # strip the ".png" suffix from the filename
55 |                         data.append(file_path)
56 |                         labels.append(label)
57 | 
58 |         zipped = list(zip(data, labels))
59 |         random.shuffle(zipped)
60 |         data, labels = zip(*zipped)
61 | 
62 |         split_index = int(len(data) * (1 - self.test_split))
63 |         if self.train:
64 |             data = data[:split_index]
65 |             labels = labels[:split_index]
66 |         else:
67 |             data = data[split_index:]
68 |             labels = labels[split_index:]
69 | 
70 |         return data, labels
71 | 
72 |     def __len__(self):
73 |         return len(self.data)
74 | 
75 |     def __getitem__(self, idx):
76 |         img_path = self.data[idx]
77 |         label = self.labels[idx]
78 | 
79 |         img = Image.open(img_path).convert('RGB')
80 |         inputs = processor(images=img, return_tensors="pt")
81 |         inputs['pixel_values'] = inputs['pixel_values'][0]
82 |         inputs['label'] = label
83 |         return inputs
84 | 
85 | 
86 | class RegressionModel(nn.Module):
87 |     def __init__(self):
88 |         super(RegressionModel, self).__init__()
89 |         self.vit = vit_model
90 |         # self.fc1 = nn.Linear(768, 768)
91 |         # self.fc2 = nn.Linear(768, 768)
92 |         self.fc3 = nn.Linear(768, 128)
93 |         self.fc4 = nn.Linear(128, 2)
94 | 
95 | 
96 |         for param in self.vit.parameters():
97 |             param.requires_grad = True
98 | 
99 |     def forward(self, x):
100 |         outputs = self.vit(x)
101 |         sequence_output = outputs[0]
102 |         x = sequence_output[:, 0, :]  # [B, 768]
103 | 
104 |         # x = F.relu(self.fc1(x))
105 |         # x = F.relu(self.fc2(x))
106 |         x = F.relu(self.fc3(x))
107 |         x = self.fc4(x)
108 |         return x
109 | 
110 | 
111 | transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
112 | 
113 | train_dataset = CustomDataset(data_folder, transform=transform, train=True)
114 | train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True, pin_memory=True, num_workers=32)
115 | test_dataset = CustomDataset(data_folder, transform=transform, train=False)
116
| test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False, pin_memory=True, num_workers=32) 117 | 118 | 119 | model = RegressionModel().float().to(device) 120 | model = model.to(device) 121 | 122 | 123 | criterion = nn.MSELoss() 124 | criterion_mae = nn.L1Loss() 125 | 126 | optimizer = optim.AdamW(model.parameters(), lr=lr) 127 | 128 | 129 | round_list =np.array (list(range(-160,170,10))) 130 | def discretize(outputs): 131 | outputs = outputs.numpy() 132 | round_cand = round_list 133 | for i in range(len(outputs.shape)): 134 | round_cand = np.expand_dims(round_cand, 0) 135 | outputs = np.expand_dims(outputs, -1) 136 | diff = np.abs(outputs-round_cand) 137 | pos = np.expand_dims(np.argmin(diff, axis = -1), axis = -1) 138 | res = round_list[pos].squeeze() 139 | return torch.tensor(res) 140 | 141 | lowest_loss = float('inf') 142 | 143 | 144 | num_epochs = 500 145 | discrete_val = [] 146 | for epoch in range(num_epochs): 147 | model.train() 148 | for batch in train_loader: 149 | inputs, labels = batch['pixel_values'], batch['label'] 150 | labels = torch.stack(labels, dim=-1) 151 | 152 | inputs = inputs.float().to(device) 153 | labels = labels.float().to(device) 154 | optimizer.zero_grad() 155 | outputs = model(inputs) 156 | 157 | # from thop import profile 158 | # flops, params = profile(model, (inputs,)) 159 | # print('flops: ', flops, 'params: ', params) 160 | 161 | loss = criterion(outputs, labels) 162 | loss_mae = criterion_mae(outputs.detach().cpu(), labels.detach().cpu()) 163 | 164 | loss.backward() 165 | optimizer.step() 166 | 167 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], train MSE: {loss.item():.2f}, mae: {loss_mae.item():.2f}, RMSE: {np.sqrt(loss.item()):.2f}') 168 | 169 | outputs_round =discretize(outputs.detach().cpu()) 170 | loss = criterion(outputs_round.detach().cpu(), labels.detach().cpu()) 171 | loss_mae = criterion_mae(outputs_round.detach().cpu(), labels.detach().cpu()) 172 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], r MSE: {loss.item():.2f}, mae: {loss_mae.item():.2f}, RMSE: {np.sqrt(loss.item()):.2f}') 173 | 174 | # After each epoch, the model will be evaluated on the test set 175 | model.eval() 176 | with torch.no_grad(): 177 | total_loss = 0 178 | total_loss_mae = 0 179 | 180 | r_total_loss = 0 181 | r_total_loss_mae = 0 182 | 183 | for batch in test_loader: 184 | inputs, labels = batch['pixel_values'], batch['label'] 185 | labels = torch.stack(labels, dim=-1) 186 | inputs = inputs.float().to(device) 187 | labels = labels.float().to(device) 188 | outputs = model(inputs) 189 | 190 | loss = criterion(outputs, labels) 191 | loss_mae = criterion_mae(outputs.detach().cpu(), labels.detach().cpu()) 192 | 193 | 194 | total_loss += loss.item() 195 | total_loss_mae += loss_mae.item() 196 | 197 | 198 | outputs_round =discretize(outputs.detach().cpu()) 199 | r_loss = criterion(outputs_round.detach().cpu(), labels.detach().cpu()) 200 | r_loss_mae = criterion_mae(outputs_round.detach().cpu(), labels.detach().cpu()) 201 | 202 | 203 | r_total_loss += r_loss.item() 204 | r_total_loss_mae += r_loss_mae.item() 205 | 206 | 207 | 208 | average_loss = total_loss / len(test_loader) 209 | average_loss_mae = total_loss_mae / len(test_loader) 210 | 211 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], Test MSE: {average_loss:.2f}, mae: {average_loss_mae:.2f}, RMSE: {np.sqrt(average_loss):.2f}') 212 | 213 | average_loss = r_total_loss / len(test_loader) 214 | average_loss_mae = r_total_loss_mae / len(test_loader) 215 | logging.info(f'Epoch [{epoch+1}/{num_epochs}], r MSE: 
{average_loss:.2f}, mae: {average_loss_mae:.2f}, RMSE: {np.sqrt(average_loss):.2f}')
216 | 
217 |     # If the test loss of the current model is lower, save the current model
218 |     if average_loss < lowest_loss:
219 |         lowest_loss = average_loss
220 |         torch.save(model.state_dict(), model_save_path)
221 |         logging.info(f'Saved model with lowest test loss {lowest_loss}: {model_save_path}')
222 | 
223 | 
--------------------------------------------------------------------------------
/pose_synthesis/elevation_estimate/utils/elev_est_api.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import numpy as np
4 | import os.path as osp
5 | import imageio
6 | from copy import deepcopy
7 | 
8 | import loguru
9 | import torch
10 | import matplotlib.cm as cm
11 | import matplotlib.pyplot as plt
12 | 
13 | from ..loftr import LoFTR, default_cfg
14 | from . import plt_utils
15 | from .plotting import make_matching_figure
16 | from .utils3d import rect_to_img, canonical_to_camera, calc_pose
17 | 
18 | 
19 | class ElevEstHelper:
20 |     _feature_matcher = None
21 | 
22 |     @classmethod
23 |     def get_feature_matcher(cls):
24 |         if cls._feature_matcher is None:
25 |             loguru.logger.info("Loading feature matcher...")
26 |             _default_cfg = deepcopy(default_cfg)
27 |             _default_cfg['coarse']['temp_bug_fix'] = True  # set to False when using the old ckpt
28 |             matcher = LoFTR(config=_default_cfg)
29 |             current_dir = os.path.dirname(os.path.abspath(__file__))
30 |             ckpt_path = os.path.join(current_dir, "weights/indoor_ds_new.ckpt")
31 |             if not osp.exists(ckpt_path):
32 |                 loguru.logger.info("Downloading feature matcher...")
33 |                 os.makedirs("weights", exist_ok=True)
34 |                 import gdown
35 |                 gdown.cached_download(url="https://drive.google.com/uc?id=19s3QvcCWQ6g-N1PrYlDCg-2mOJZ3kkgS",
36 |                                       path=ckpt_path)
37 |             matcher.load_state_dict(torch.load(ckpt_path)['state_dict'])
38 |             matcher = matcher.eval().cuda()
39 |             cls._feature_matcher = matcher
40 |         return cls._feature_matcher
41 | 
42 | 
43 | def mask_out_bkgd(img_path, dbg=False):
44 |     img = imageio.imread_v2(img_path)
45 |     if img.shape[-1] == 4:
46 |         fg_mask = img[:, :, 3]
47 |     else:
48 |         loguru.logger.info("Image has no alpha channel, using thresholding to mask out background")
49 |         fg_mask = ~(img > 245).all(axis=-1)
50 |     if dbg:
51 |         plt.imshow(plt_utils.vis_mask(img, fg_mask.astype(np.uint8), color=[0, 255, 0]))
52 |         plt.show()
53 |     return fg_mask
54 | 
55 | 
56 | def get_feature_matching(img_paths, dbg=False):
57 |     assert len(img_paths) == 4
58 |     matcher = ElevEstHelper.get_feature_matcher()
59 |     feature_matching = {}
60 |     masks = []
61 |     for i in range(4):
62 |         mask = mask_out_bkgd(img_paths[i], dbg=dbg)
63 |         masks.append(mask)
64 |     for i in range(0, 4):
65 |         for j in range(i + 1, 4):
66 |             img0_pth = img_paths[i]
67 |             img1_pth = img_paths[j]
68 |             mask0 = masks[i]
69 |             mask1 = masks[j]
70 |             img0_raw = cv2.imread(img0_pth, cv2.IMREAD_GRAYSCALE)
71 |             img1_raw = cv2.imread(img1_pth, cv2.IMREAD_GRAYSCALE)
72 |             original_shape = img0_raw.shape
73 |             img0_raw_resized = cv2.resize(img0_raw, (480, 480))
74 |             img1_raw_resized = cv2.resize(img1_raw, (480, 480))
75 | 
76 |             img0 = torch.from_numpy(img0_raw_resized)[None][None].cuda() / 255.
77 |             img1 = torch.from_numpy(img1_raw_resized)[None][None].cuda() / 255.
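            # NOTE: LoFTR, as wired up here, consumes grayscale image tensors of shape (1, 1, H, W)
            # scaled to [0, 1]; both views were resized to 480x480 above, so the matched keypoints
            # returned below are rescaled by original_shape / 480 before the foreground masks are applied.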
78 |             batch = {'image0': img0, 'image1': img1}
79 | 
80 |             # Inference with LoFTR and get prediction
81 |             with torch.no_grad():
82 |                 matcher(batch)
83 |                 mkpts0 = batch['mkpts0_f'].cpu().numpy()
84 |                 mkpts1 = batch['mkpts1_f'].cpu().numpy()
85 |                 mconf = batch['mconf'].cpu().numpy()
86 |             mkpts0[:, 0] = mkpts0[:, 0] * original_shape[1] / 480
87 |             mkpts0[:, 1] = mkpts0[:, 1] * original_shape[0] / 480
88 |             mkpts1[:, 0] = mkpts1[:, 0] * original_shape[1] / 480
89 |             mkpts1[:, 1] = mkpts1[:, 1] * original_shape[0] / 480
90 |             keep0 = mask0[mkpts0[:, 1].astype(int), mkpts0[:, 0].astype(int)]
91 |             keep1 = mask1[mkpts1[:, 1].astype(int), mkpts1[:, 0].astype(int)]
92 |             keep = np.logical_and(keep0, keep1)
93 |             mkpts0 = mkpts0[keep]
94 |             mkpts1 = mkpts1[keep]
95 |             mconf = mconf[keep]
96 |             if dbg:
97 |                 # Draw visualization
98 |                 color = cm.jet(mconf)
99 |                 text = [
100 |                     'LoFTR',
101 |                     'Matches: {}'.format(len(mkpts0)),
102 |                 ]
103 |                 fig = make_matching_figure(img0_raw, img1_raw, mkpts0, mkpts1, color, text=text)
104 |                 fig.show()
105 |             feature_matching[f"{i}_{j}"] = np.concatenate([mkpts0, mkpts1, mconf[:, None]], axis=1)
106 | 
107 |     return feature_matching
108 | 
109 | 
110 | def gen_pose_hypothesis(center_elevation):
111 |     elevations = np.radians(
112 |         [center_elevation, center_elevation - 10, center_elevation + 10, center_elevation, center_elevation])  # 45~120
113 |     azimuths = np.radians([30, 30, 30, 20, 40])
114 |     input_poses = calc_pose(elevations, azimuths, len(azimuths))
115 |     input_poses = input_poses[1:]
116 |     input_poses[..., 1] *= -1
117 |     input_poses[..., 2] *= -1
118 |     return input_poses
119 | 
120 | 
121 | def ba_error_general(K, matches, poses):
122 |     projmat0 = K @ poses[0].inverse()[:3, :4]
123 |     projmat1 = K @ poses[1].inverse()[:3, :4]
124 |     match_01 = matches[0]
125 |     pts0 = match_01[:, :2]
126 |     pts1 = match_01[:, 2:4]
127 |     Xref = cv2.triangulatePoints(projmat0.cpu().numpy(), projmat1.cpu().numpy(),
128 |                                  pts0.cpu().numpy().T, pts1.cpu().numpy().T)
129 |     Xref = Xref[:3] / Xref[3:]
130 |     Xref = Xref.T
131 |     Xref = torch.from_numpy(Xref).cuda().float()
132 |     reproj_error = 0
133 |     for match, cp in zip(matches[1:], poses[2:]):
134 |         dist = (torch.norm(match_01[:, :2][:, None, :] - match[:, :2][None, :, :], dim=-1))
135 |         if dist.numel() > 0:
136 |             # print("dist.shape", dist.shape)
137 |             m0to2_index = dist.argmin(1)
138 |             keep = dist[torch.arange(match_01.shape[0]), m0to2_index] < 1
139 |             if keep.sum() > 0:
140 |                 xref_in2 = rect_to_img(K, canonical_to_camera(Xref, cp.inverse()))
141 |                 reproj_error2 = torch.norm(match[m0to2_index][keep][:, 2:4] - xref_in2[keep], dim=-1)
142 |                 conf02 = match[m0to2_index][keep][:, -1]
143 |                 reproj_error += (reproj_error2 * conf02).sum() / (conf02.sum())
144 | 
145 |     return reproj_error
146 | 
147 | 
148 | def find_optim_elev(elevs, nimgs, matches, K, dbg=False):
149 |     errs = []
150 |     for elev in elevs:
151 |         err = 0
152 |         cam_poses = gen_pose_hypothesis(elev)
153 |         for start in range(nimgs - 1):
154 |             batch_matches, batch_poses = [], []
155 |             for i in range(start, nimgs + start):
156 |                 ci = i % nimgs
157 |                 batch_poses.append(cam_poses[ci])
158 |             for j in range(nimgs - 1):
159 |                 key = f"{start}_{(start + j + 1) % nimgs}"
160 |                 match = matches[key]
161 |                 batch_matches.append(match)
162 |             err += ba_error_general(K, batch_matches, batch_poses)
163 |         errs.append(err)
164 |     errs = torch.tensor(errs)
165 |     if dbg:
166 |         plt.plot(elevs, errs)
167 |         plt.show()
168 |     optim_elev = elevs[torch.argmin(errs)].item()
169 |     return optim_elev
170 | 
171 | 
172 | def 
get_elev_est(feature_matching, min_elev=30, max_elev=150, K=None, dbg=False): 173 | flag = True 174 | matches = {} 175 | for i in range(4): 176 | for j in range(i + 1, 4): 177 | match_ij = feature_matching[f"{i}_{j}"] 178 | if len(match_ij) == 0: 179 | flag = False 180 | match_ji = np.concatenate([match_ij[:, 2:4], match_ij[:, 0:2], match_ij[:, 4:5]], axis=1) 181 | matches[f"{i}_{j}"] = torch.from_numpy(match_ij).float().cuda() 182 | matches[f"{j}_{i}"] = torch.from_numpy(match_ji).float().cuda() 183 | if not flag: 184 | loguru.logger.info("0 matches, could not estimate elevation") 185 | return None 186 | interval = 10 187 | elevs = np.arange(min_elev, max_elev, interval) 188 | optim_elev1 = find_optim_elev(elevs, 4, matches, K) 189 | 190 | elevs = np.arange(optim_elev1 - 10, optim_elev1 + 10, 1) 191 | optim_elev2 = find_optim_elev(elevs, 4, matches, K) 192 | 193 | return optim_elev2 194 | 195 | 196 | def elev_est_api(img_paths, min_elev=30, max_elev=150, K=None, dbg=False): 197 | feature_matching = get_feature_matching(img_paths, dbg=dbg) 198 | if K is None: 199 | loguru.logger.warning("K is not provided, using default K") 200 | K = np.array([[280.0, 0, 128.0], 201 | [0, 280.0, 128.0], 202 | [0, 0, 1]]) 203 | K = torch.from_numpy(K).cuda().float() 204 | elev = get_elev_est(feature_matching, min_elev, max_elev, K, dbg=dbg) 205 | return elev 206 | -------------------------------------------------------------------------------- /pose_synthesis/utils/zero123_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | from contextlib import nullcontext 5 | from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker 6 | from einops import rearrange 7 | from ldm.util import instantiate_from_config 8 | from ldm.models.diffusion.ddim import DDIMSampler 9 | from omegaconf import OmegaConf 10 | from PIL import Image 11 | from rich import print 12 | from transformers import CLIPImageProcessor 13 | from torch import autocast 14 | from torchvision import transforms 15 | 16 | 17 | def load_model_from_config(config, ckpt, device, verbose=False): 18 | print(f'Loading model from {ckpt}') 19 | pl_sd = torch.load(ckpt, map_location='cpu') 20 | if 'global_step' in pl_sd: 21 | print(f'Global Step: {pl_sd["global_step"]}') 22 | sd = pl_sd['state_dict'] 23 | model = instantiate_from_config(config.model) 24 | m, u = model.load_state_dict(sd, strict=False) 25 | if len(m) > 0 and verbose: 26 | print('missing keys:') 27 | print(m) 28 | if len(u) > 0 and verbose: 29 | print('unexpected keys:') 30 | print(u) 31 | 32 | model.to(device) 33 | model.eval() 34 | return model 35 | 36 | 37 | def init_model(device, ckpt, half_precision=False): 38 | config = os.path.join(os.path.dirname(__file__), '../configs/sd-objaverse-finetune-c_concat-256.yaml') 39 | config = OmegaConf.load(config) 40 | 41 | # Instantiate all models beforehand for efficiency. 
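    # 'turncam' is the Zero123 latent-diffusion model itself (compiled with torch.compile, which
    # assumes PyTorch >= 2.0, and optionally cast to fp16); 'nsfw' is the StableDiffusionSafetyChecker
    # and 'clip_fe' the CLIPImageProcessor used to prepare inputs for that checker.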
42 | models = dict() 43 | print('Instantiating LatentDiffusion...') 44 | if half_precision: 45 | models['turncam'] = torch.compile(load_model_from_config(config, ckpt, device=device)).half() 46 | else: 47 | models['turncam'] = torch.compile(load_model_from_config(config, ckpt, device=device)) 48 | print('Instantiating StableDiffusionSafetyChecker...') 49 | models['nsfw'] = StableDiffusionSafetyChecker.from_pretrained( 50 | 'CompVis/stable-diffusion-safety-checker').to(device) 51 | models['clip_fe'] = CLIPImageProcessor.from_pretrained( 52 | "openai/clip-vit-large-patch14") 53 | # We multiply all by some factor > 1 to make them less likely to be triggered. 54 | models['nsfw'].concept_embeds_weights *= 1.2 55 | models['nsfw'].special_care_embeds_weights *= 1.2 56 | 57 | return models 58 | 59 | @torch.no_grad() 60 | def sample_model_batch(model, sampler, input_im, xs, ys, n_samples=4, precision='autocast', ddim_eta=1.0, ddim_steps=75, scale=3.0, h=256, w=256): 61 | precision_scope = autocast if precision == 'autocast' else nullcontext 62 | with precision_scope("cuda"): 63 | with model.ema_scope(): 64 | c = model.get_learned_conditioning(input_im).tile(n_samples, 1, 1) 65 | T = [] 66 | for x, y in zip(xs, ys): 67 | T.append([np.radians(x), np.sin(np.radians(y)), np.cos(np.radians(y)), 0]) 68 | T = torch.tensor(np.array(T))[:, None, :].float().to(c.device) 69 | c = torch.cat([c, T], dim=-1) 70 | c = model.cc_projection(c) 71 | cond = {} 72 | cond['c_crossattn'] = [c] 73 | cond['c_concat'] = [model.encode_first_stage(input_im).mode().detach() 74 | .repeat(n_samples, 1, 1, 1)] 75 | if scale != 1.0: 76 | uc = {} 77 | uc['c_concat'] = [torch.zeros(n_samples, 4, h // 8, w // 8).to(c.device)] 78 | uc['c_crossattn'] = [torch.zeros_like(c).to(c.device)] 79 | else: 80 | uc = None 81 | 82 | shape = [4, h // 8, w // 8] 83 | samples_ddim, _ = sampler.sample(S=ddim_steps, 84 | conditioning=cond, 85 | batch_size=n_samples, 86 | shape=shape, 87 | verbose=False, 88 | unconditional_guidance_scale=scale, 89 | unconditional_conditioning=uc, 90 | eta=ddim_eta, 91 | x_T=None) 92 | # print(samples_ddim.shape) 93 | # samples_ddim = torch.nn.functional.interpolate(samples_ddim, 64, mode='nearest', antialias=False) 94 | x_samples_ddim = model.decode_first_stage(samples_ddim) 95 | ret_imgs = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0).cpu() 96 | del cond, c, x_samples_ddim, samples_ddim, uc, input_im 97 | torch.cuda.empty_cache() 98 | return ret_imgs 99 | 100 | @torch.no_grad() 101 | def predict_stage1_gradio(model, raw_im, save_path = "", adjust_set=[], device="cuda", ddim_steps=75, scale=3.0): 102 | # raw_im = raw_im.resize([256, 256], Image.LANCZOS) 103 | # input_im_init = preprocess_image(models, raw_im, preprocess=False) 104 | input_im_init = np.asarray(raw_im, dtype=np.float32) / 255.0 105 | input_im = transforms.ToTensor()(input_im_init).unsqueeze(0).to(device) 106 | input_im = input_im * 2 - 1 107 | 108 | # stage 1: 8 109 | delta_x_1_8 = [0] * 4 + [30] * 4 + [-30] * 4 110 | delta_y_1_8 = [0+90*(i%4) if i < 4 else 30+90*(i%4) for i in range(8)] + [30+90*(i%4) for i in range(4)] 111 | 112 | ret_imgs = [] 113 | sampler = DDIMSampler(model) 114 | # sampler.to(device) 115 | if adjust_set != []: 116 | x_samples_ddims_8 = sample_model_batch(model, sampler, input_im, 117 | [delta_x_1_8[i] for i in adjust_set], [delta_y_1_8[i] for i in adjust_set], 118 | n_samples=len(adjust_set), ddim_steps=ddim_steps, scale=scale) 119 | else: 120 | x_samples_ddims_8 = sample_model_batch(model, sampler, input_im, 
delta_x_1_8, delta_y_1_8, n_samples=len(delta_x_1_8), ddim_steps=ddim_steps, scale=scale) 121 | sample_idx = 0 122 | for stage1_idx in range(len(delta_x_1_8)): 123 | if adjust_set != [] and stage1_idx not in adjust_set: 124 | continue 125 | x_sample = 255.0 * rearrange(x_samples_ddims_8[sample_idx].numpy(), 'c h w -> h w c') 126 | out_image = Image.fromarray(x_sample.astype(np.uint8)) 127 | ret_imgs.append(out_image) 128 | if save_path: 129 | out_image.save(os.path.join(save_path, '%d.png'%(stage1_idx))) 130 | sample_idx += 1 131 | del x_samples_ddims_8 132 | del sampler 133 | torch.cuda.empty_cache() 134 | return ret_imgs 135 | 136 | def infer_stage_2(model, save_path_stage1, save_path_stage2, delta_x_2, delta_y_2, indices, device, ddim_steps=75, scale=3.0): 137 | for stage1_idx in indices: 138 | # save stage 1 image 139 | # x_sample = 255.0 * rearrange(x_samples_ddims[stage1_idx].cpu().numpy(), 'c h w -> h w c') 140 | # Image.fromarray(x_sample.astype(np.uint8)).save() 141 | stage1_image_path = os.path.join(save_path_stage1, '%d.png'%(stage1_idx)) 142 | 143 | raw_im = Image.open(stage1_image_path) 144 | # input_im_init = preprocess_image(models, raw_im, preprocess=False) 145 | input_im_init = np.asarray(raw_im, dtype=np.float32) #/ 255.0 146 | input_im_init[input_im_init >= 253.0] = 255.0 147 | input_im_init = input_im_init / 255.0 148 | input_im = transforms.ToTensor()(input_im_init).unsqueeze(0).to(device) 149 | input_im = input_im * 2 - 1 150 | # infer stage 2 151 | sampler = DDIMSampler(model) 152 | # sampler.to(device) 153 | # stage2_in = x_samples_ddims[stage1_idx][None, ...].to(device) * 2 - 1 154 | x_samples_ddims_stage2 = sample_model_batch(model, sampler, input_im, delta_x_2, delta_y_2, n_samples=len(delta_x_2), ddim_steps=ddim_steps, scale=scale) 155 | for stage2_idx in range(len(delta_x_2)): 156 | x_sample_stage2 = 255.0 * rearrange(x_samples_ddims_stage2[stage2_idx].numpy(), 'c h w -> h w c') 157 | Image.fromarray(x_sample_stage2.astype(np.uint8)).save(os.path.join(save_path_stage2, '%d_%d.png'%(stage1_idx, stage2_idx))) 158 | del input_im 159 | del x_samples_ddims_stage2 160 | torch.cuda.empty_cache() 161 | 162 | def zero123_infer(model, input_dir_path, start_idx=0, end_idx=12, indices=None, device="cuda", ddim_steps=75, scale=3.0): 163 | # input_img_path = os.path.join(input_dir_path, "input_256.png") 164 | save_path_8 = os.path.join(input_dir_path, "stage1_8") 165 | save_path_8_2 = os.path.join(input_dir_path, "stage2_8") 166 | os.makedirs(save_path_8_2, exist_ok=True) 167 | 168 | # raw_im = Image.open(input_img_path) 169 | # # input_im_init = preprocess_image(models, raw_im, preprocess=False) 170 | # input_im_init = np.asarray(raw_im, dtype=np.float32) / 255.0 171 | # input_im = transforms.ToTensor()(input_im_init).unsqueeze(0).to(device) 172 | # input_im = input_im * 2 - 1 173 | 174 | # stage 2: 6*4 or 8*4 175 | delta_x_2 = [-10, 10, 0, 0] 176 | delta_y_2 = [0, 0, -10, 10] 177 | 178 | infer_stage_2(model, save_path_8, save_path_8_2, delta_x_2, delta_y_2, indices=indices if indices else list(range(start_idx,end_idx)), device=device, ddim_steps=ddim_steps, scale=scale) 179 | -------------------------------------------------------------------------------- /pose_synthesis/ldm/util.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import torch 4 | from torch import optim 5 | import numpy as np 6 | 7 | from inspect import isfunction 8 | from PIL import Image, ImageDraw, ImageFont 9 | 10 | import os 11 | import 
numpy as np
12 | import matplotlib.pyplot as plt
13 | from PIL import Image
14 | import torch
15 | import time
16 | import cv2
17 | import PIL
18 | 
19 | def pil_rectangle_crop(im):
20 |     width, height = im.size   # Get dimensions
21 | 
22 |     if width <= height:
23 |         left = 0
24 |         right = width
25 |         top = (height - width)/2
26 |         bottom = (height + width)/2
27 |     else:
28 | 
29 |         top = 0
30 |         bottom = height
31 |         left = (width - height) / 2
32 |         right = (width + height) / 2
33 | 
34 |     # Crop the center of the image
35 |     im = im.crop((left, top, right, bottom))
36 |     return im
37 | 
38 | def add_margin(pil_img, color, size=256):
39 |     width, height = pil_img.size
40 |     result = Image.new(pil_img.mode, (size, size), color)
41 |     result.paste(pil_img, ((size - width) // 2, (size - height) // 2))
42 |     return result
43 | 
44 | def load_and_preprocess(interface, input_im):
45 |     '''
46 |     :param input_im (PIL Image).
47 |     :return image (H, W, 3) array in [0, 255].
48 |     '''
49 |     # See https://github.com/Ir1d/image-background-remove-tool
50 |     image = input_im.convert('RGB')
51 | 
52 |     image_without_background = interface([image])[0]
53 |     image_without_background = np.array(image_without_background)
54 |     est_seg = image_without_background > 127
55 |     image = np.array(image)
56 |     foreground = est_seg[:, :, -1].astype(np.bool_)
57 |     image[~foreground] = [255., 255., 255.]
58 |     x, y, w, h = cv2.boundingRect(foreground.astype(np.uint8))
59 |     image = image[y:y+h, x:x+w, :]
60 |     image = PIL.Image.fromarray(np.array(image))
61 | 
62 |     # resize image such that the long edge is at most 200, then pad to 256
63 |     image.thumbnail([200, 200], Image.Resampling.LANCZOS)
64 |     image = add_margin(image, (255, 255, 255), size=256)
65 |     image = np.array(image)
66 | 
67 |     return image
68 | 
69 | 
70 | def log_txt_as_img(wh, xc, size=10):
71 |     # wh a tuple of (width, height)
72 |     # xc a list of captions to plot
73 |     b = len(xc)
74 |     txts = list()
75 |     for bi in range(b):
76 |         txt = Image.new("RGB", wh, color="white")
77 |         draw = ImageDraw.Draw(txt)
78 |         font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
79 |         nc = int(40 * (wh[0] / 256))
80 |         lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
81 | 
82 |         try:
83 |             draw.text((0, 0), lines, fill="black", font=font)
84 |         except UnicodeEncodeError:
85 |             print("Can't encode string for logging. Skipping.")
86 | 
87 |         txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
88 |         txts.append(txt)
89 |     txts = np.stack(txts)
90 |     txts = torch.tensor(txts)
91 |     return txts
92 | 
93 | 
94 | def ismap(x):
95 |     if not isinstance(x, torch.Tensor):
96 |         return False
97 |     return (len(x.shape) == 4) and (x.shape[1] > 3)
98 | 
99 | 
100 | def isimage(x):
101 |     if not isinstance(x, torch.Tensor):
102 |         return False
103 |     return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
104 | 
105 | 
106 | def exists(x):
107 |     return x is not None
108 | 
109 | 
110 | def default(val, d):
111 |     if exists(val):
112 |         return val
113 |     return d() if isfunction(d) else d
114 | 
115 | 
116 | def mean_flat(tensor):
117 |     """
118 |     https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
119 |     Take the mean over all non-batch dimensions.
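    E.g. a tensor of shape (B, C, H, W) is reduced to a length-B vector of per-sample means.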
120 | """ 121 | return tensor.mean(dim=list(range(1, len(tensor.shape)))) 122 | 123 | 124 | def count_params(model, verbose=False): 125 | total_params = sum(p.numel() for p in model.parameters()) 126 | if verbose: 127 | print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") 128 | return total_params 129 | 130 | 131 | def instantiate_from_config(config): 132 | if not "target" in config: 133 | if config == '__is_first_stage__': 134 | return None 135 | elif config == "__is_unconditional__": 136 | return None 137 | raise KeyError("Expected key `target` to instantiate.") 138 | return get_obj_from_str(config["target"])(**config.get("params", dict())) 139 | 140 | 141 | def get_obj_from_str(string, reload=False): 142 | module, cls = string.rsplit(".", 1) 143 | if reload: 144 | module_imp = importlib.import_module(module) 145 | importlib.reload(module_imp) 146 | return getattr(importlib.import_module(module, package=None), cls) 147 | 148 | 149 | class AdamWwithEMAandWings(optim.Optimizer): 150 | # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 151 | def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using 152 | weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code 153 | ema_power=1., param_names=()): 154 | """AdamW that saves EMA versions of the parameters.""" 155 | if not 0.0 <= lr: 156 | raise ValueError("Invalid learning rate: {}".format(lr)) 157 | if not 0.0 <= eps: 158 | raise ValueError("Invalid epsilon value: {}".format(eps)) 159 | if not 0.0 <= betas[0] < 1.0: 160 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 161 | if not 0.0 <= betas[1] < 1.0: 162 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 163 | if not 0.0 <= weight_decay: 164 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 165 | if not 0.0 <= ema_decay <= 1.0: 166 | raise ValueError("Invalid ema_decay value: {}".format(ema_decay)) 167 | defaults = dict(lr=lr, betas=betas, eps=eps, 168 | weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay, 169 | ema_power=ema_power, param_names=param_names) 170 | super().__init__(params, defaults) 171 | 172 | def __setstate__(self, state): 173 | super().__setstate__(state) 174 | for group in self.param_groups: 175 | group.setdefault('amsgrad', False) 176 | 177 | @torch.no_grad() 178 | def step(self, closure=None): 179 | """Performs a single optimization step. 180 | Args: 181 | closure (callable, optional): A closure that reevaluates the model 182 | and returns the loss. 
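        Returns:
            The value returned by `closure` if one is supplied, otherwise None.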
183 | """ 184 | loss = None 185 | if closure is not None: 186 | with torch.enable_grad(): 187 | loss = closure() 188 | 189 | for group in self.param_groups: 190 | params_with_grad = [] 191 | grads = [] 192 | exp_avgs = [] 193 | exp_avg_sqs = [] 194 | ema_params_with_grad = [] 195 | state_sums = [] 196 | max_exp_avg_sqs = [] 197 | state_steps = [] 198 | amsgrad = group['amsgrad'] 199 | beta1, beta2 = group['betas'] 200 | ema_decay = group['ema_decay'] 201 | ema_power = group['ema_power'] 202 | 203 | for p in group['params']: 204 | if p.grad is None: 205 | continue 206 | params_with_grad.append(p) 207 | if p.grad.is_sparse: 208 | raise RuntimeError('AdamW does not support sparse gradients') 209 | grads.append(p.grad) 210 | 211 | state = self.state[p] 212 | 213 | # State initialization 214 | if len(state) == 0: 215 | state['step'] = 0 216 | # Exponential moving average of gradient values 217 | state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) 218 | # Exponential moving average of squared gradient values 219 | state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 220 | if amsgrad: 221 | # Maintains max of all exp. moving avg. of sq. grad. values 222 | state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 223 | # Exponential moving average of parameter values 224 | state['param_exp_avg'] = p.detach().float().clone() 225 | 226 | exp_avgs.append(state['exp_avg']) 227 | exp_avg_sqs.append(state['exp_avg_sq']) 228 | ema_params_with_grad.append(state['param_exp_avg']) 229 | 230 | if amsgrad: 231 | max_exp_avg_sqs.append(state['max_exp_avg_sq']) 232 | 233 | # update the steps for each param group update 234 | state['step'] += 1 235 | # record the step after step update 236 | state_steps.append(state['step']) 237 | 238 | optim._functional.adamw(params_with_grad, 239 | grads, 240 | exp_avgs, 241 | exp_avg_sqs, 242 | max_exp_avg_sqs, 243 | state_steps, 244 | amsgrad=amsgrad, 245 | beta1=beta1, 246 | beta2=beta2, 247 | lr=group['lr'], 248 | weight_decay=group['weight_decay'], 249 | eps=group['eps'], 250 | maximize=False) 251 | 252 | cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power) 253 | for param, ema_param in zip(params_with_grad, ema_params_with_grad): 254 | ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay) 255 | 256 | return loss --------------------------------------------------------------------------------
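A minimal usage sketch for the config helpers in pose_synthesis/ldm/util.py above, assuming pose_synthesis/ is on the import path (as the repository's own modules assume). The `target`/`params` layout mirrors configs/sd-objaverse-finetune-c_concat-256.yaml; the `torch.nn.Linear` target below is purely illustrative (any importable dotted path works) and is not a class from this repository.

from ldm.util import get_obj_from_str, instantiate_from_config

# A config is just a mapping with a dotted "target" path plus optional keyword "params";
# instantiate_from_config imports the class and calls it with those arguments.
config = {
    "target": "torch.nn.Linear",                        # illustrative target, not part of this repo
    "params": {"in_features": 768, "out_features": 2},
}

layer = instantiate_from_config(config)                 # equivalent to torch.nn.Linear(768, 2)
assert type(layer) is get_obj_from_str(config["target"])

# The sentinel strings handled in instantiate_from_config simply short-circuit to None.
assert instantiate_from_config("__is_unconditional__") is None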