├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── __init__.py ├── bundled_sources.txt ├── ddepth_anything_v2 ├── DA-2K.md ├── LICENSE ├── README.md ├── __init__.py ├── app.py ├── depth_anything_v2 │ ├── dinov2.py │ ├── dinov2_layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── block.py │ │ ├── drop_path.py │ │ ├── layer_scale.py │ │ ├── mlp.py │ │ ├── patch_embed.py │ │ └── swiglu_ffn.py │ ├── dpt.py │ └── util │ │ ├── blocks.py │ │ └── transform.py ├── metric_depth │ ├── README.md │ ├── dataset │ │ ├── hypersim.py │ │ ├── kitti.py │ │ ├── transform.py │ │ └── vkitti2.py │ ├── depth_anything_v2 │ │ ├── dinov2.py │ │ ├── dinov2_layers │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ │ ├── dpt.py │ │ └── util │ │ │ ├── blocks.py │ │ │ └── transform.py │ ├── depth_to_pointcloud.py │ ├── dist_train.sh │ ├── requirements.txt │ ├── run.py │ ├── train.py │ └── util │ │ ├── dist_helper.py │ │ ├── loss.py │ │ ├── metric.py │ │ └── utils.py ├── requirements.txt ├── run.py └── run_video.py ├── dmarigold └── marigold │ ├── __init__.py │ ├── marigold_pipeline.py │ └── util │ ├── batchsize.py │ ├── ensemble.py │ ├── image_util.py │ └── seed_all.py ├── dmidas ├── LICENSE ├── backbones │ ├── beit.py │ ├── levit.py │ ├── next_vit.py │ ├── swin.py │ ├── swin2.py │ ├── swin_common.py │ ├── utils.py │ └── vit.py ├── base_model.py ├── blocks.py ├── dpt_depth.py ├── midas_net.py ├── midas_net_custom.py ├── model_loader.py └── transforms.py ├── dzoedepth ├── LICENSE ├── __init__.py ├── data │ ├── __init__.py │ ├── data_mono.py │ ├── ddad.py │ ├── diml_indoor_test.py │ ├── diml_outdoor_test.py │ ├── diode.py │ ├── hypersim.py │ ├── ibims.py │ ├── preprocess.py │ ├── sun_rgbd_loader.py │ ├── transforms.py │ ├── vkitti.py │ └── vkitti2.py ├── models │ ├── __init__.py │ ├── base_models │ │ ├── __init__.py │ │ └── midas.py │ ├── builder.py │ ├── depth_model.py │ ├── layers │ │ ├── __init__.py │ │ ├── attractor.py │ │ ├── dist_layers.py │ │ ├── localbins_layers.py │ │ └── patch_transformer.py │ ├── model_io.py │ ├── zoedepth │ │ ├── __init__.py │ │ ├── config_zoedepth.json │ │ ├── config_zoedepth_kitti.json │ │ └── zoedepth_v1.py │ └── zoedepth_nk │ │ ├── __init__.py │ │ ├── config_zoedepth_nk.json │ │ └── zoedepth_nk_v1.py ├── trainers │ ├── __init__.py │ ├── base_trainer.py │ ├── builder.py │ ├── loss.py │ ├── zoedepth_nk_trainer.py │ └── zoedepth_trainer.py └── utils │ ├── __init__.py │ ├── arg_utils.py │ ├── config.py │ ├── easydict │ └── __init__.py │ ├── geometry.py │ └── misc.py ├── examples.png ├── inpaint ├── DOCUMENTATION.md ├── LICENSE ├── README.md ├── __init__.py ├── argument.yml ├── bilateral_filtering.py ├── boostmonodepth_utils.py ├── download.sh ├── main.py ├── mesh.py ├── mesh_tools.py ├── networks.py ├── requirements.txt └── utils.py ├── install.py ├── javascript └── depthmap.js ├── lib ├── LICENSE ├── Resnet.py ├── Resnext_torch.py ├── __init__.py ├── multi_depth_model_woauxi.py ├── net_tools.py ├── network_auxi.py ├── spvcnn_classsification.py ├── spvcnn_utils.py └── test_utils.py ├── main.py ├── options.png ├── pix2pix ├── LICENSE ├── __init__.py ├── data │ ├── __init__.py │ ├── base_dataset.py │ ├── depthmerge_dataset.py │ └── image_folder.py ├── models │ ├── __init__.py │ ├── base_model.py │ ├── base_model_hg.py │ ├── networks.py │ └── pix2pix4depth_model.py ├── options │ ├── __init__.py │ ├── base_options.py │ ├── test_options.py │ └── 
train_options.py ├── test.py ├── train.py └── util │ ├── __init__.py │ ├── get_data.py │ ├── guidedfilter.py │ ├── html.py │ ├── image_pool.py │ ├── util.py │ └── visualizer.py ├── requirements.txt ├── scripts ├── depthmap.py └── depthmap_api.py └── src ├── backbone.py ├── common_constants.py ├── common_ui.py ├── core.py ├── depthmap_generation.py ├── gradio_args_transport.py ├── misc.py ├── normalmap_generation.py ├── stereoimage_generation.py └── video_mode.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | venv/ 3 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Bob Thiry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/__init__.py -------------------------------------------------------------------------------- /bundled_sources.txt: -------------------------------------------------------------------------------- 1 | Since commit 110549b2 this extension bundles some code from other repositories. 2 | This was done to prevent possible upstream breakage and allow fixing breakage quicker. 3 | This file provides information about the original location of the code. 4 | *** Some of the bundled code was already modified. 
*** 5 | 6 | dmidas 7 | https://github.com/isl-org/MiDaS/tree/master/midas/ 8 | 9 | dzoedepth 10 | https://github.com/isl-org/ZoeDepth/tree/main/zoedepth/ 11 | 12 | inpaint 13 | https://github.com/vt-vl-lab/3d-photo-inpainting/ 14 | 15 | lib 16 | https://github.com/aim-uofa/AdelaiDepth/tree/main/LeReS/Minist_Test/lib/ 17 | 18 | pix2pix 19 | https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/ 20 | 21 | Marigold 22 | https://github.com/prs-eth/Marigold/tree/22437a 23 | 24 | depth_anything_v2 25 | https://github.com/DepthAnything/Depth-Anything-V2/tree/bc0283 26 | -------------------------------------------------------------------------------- /ddepth_anything_v2/DA-2K.md: -------------------------------------------------------------------------------- 1 | # DA-2K Evaluation Benchmark 2 | 3 | ## Introduction 4 | 5 | ![DA-2K](assets/DA-2K.png) 6 | 7 | DA-2K is proposed in [Depth Anything V2](https://depth-anything-v2.github.io) to evaluate the relative depth estimation capability. It encompasses eight representative scenarios of `indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`. It consists of 1K diverse high-quality images and 2K precise pair-wise relative depth annotations. 8 | 9 | Please refer to our [paper](https://arxiv.org/abs/2406.09414) for details on the construction of this benchmark. 10 | 11 | 12 | ## Usage 13 | 14 | Please first [download the benchmark](https://huggingface.co/datasets/depth-anything/DA-2K/tree/main). 15 | 16 | All annotations are stored in `annotations.json`. The annotation file is a JSON object where each key is the path to an image file, and the value is a list of annotations associated with that image. Each annotation describes two points and identifies which point is closer to the camera. The structure is detailed below: 17 | 18 | ``` 19 | { 20 | "image_path": [ 21 | { 22 | "point1": [h1, w1], # (vertical position, horizontal position) 23 | "point2": [h2, w2], # (vertical position, horizontal position) 24 | "closer_point": "point1" # we always set "point1" as the closer one 25 | }, 26 | ... 27 | ], 28 | ... 29 | } 30 | ``` 31 | 32 | To visualize the annotations: 33 | ```bash 34 | python visualize.py [--scene-type <type>] 35 | ``` 36 | 37 | **Options** 38 | - `--scene-type <type>` (optional): Specify the scene type (`indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`). Skip this argument or set it to `""` to include all scene types.
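
As a quick illustration of the annotation format described above, the file can be read with only the standard library. This is a minimal sketch, not part of the benchmark tooling: the `DA-2K` root directory name is an assumption, and image paths are assumed to be relative to wherever `annotations.json` lives.

```python
import json
import os

# Minimal sketch (assumed layout): annotations.json sits in the downloaded
# benchmark root, and the JSON keys are image paths relative to that root.
benchmark_root = 'DA-2K'

with open(os.path.join(benchmark_root, 'annotations.json'), 'r') as f:
    annotations = json.load(f)

for image_path, pairs in annotations.items():
    for pair in pairs:
        h1, w1 = pair['point1']        # (vertical position, horizontal position)
        h2, w2 = pair['point2']
        closer = pair['closer_point']  # always "point1" by construction
        # Typical evaluation: a prediction is counted as correct when the model's
        # predicted depth ordering of the two points agrees with `closer`.
```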
39 | 40 | ## Citation 41 | 42 | If you find this benchmark useful, please consider citing: 43 | 44 | ```bibtex 45 | @article{depth_anything_v2, 46 | title={Depth Anything V2}, 47 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, 48 | journal={arXiv:2406.09414}, 49 | year={2024} 50 | } 51 | ``` -------------------------------------------------------------------------------- /ddepth_anything_v2/__init__.py: -------------------------------------------------------------------------------- 1 | from .depth_anything_v2.dpt import DepthAnythingV2 -------------------------------------------------------------------------------- /ddepth_anything_v2/app.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gradio as gr 3 | import matplotlib 4 | import numpy as np 5 | from PIL import Image 6 | import torch 7 | import tempfile 8 | from gradio_imageslider import ImageSlider 9 | 10 | from depth_anything_v2.dpt import DepthAnythingV2 11 | 12 | css = """ 13 | #img-display-container { 14 | max-height: 100vh; 15 | } 16 | #img-display-input { 17 | max-height: 80vh; 18 | } 19 | #img-display-output { 20 | max-height: 80vh; 21 | } 22 | #download { 23 | height: 62px; 24 | } 25 | """ 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | model_configs = { 28 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 29 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 30 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 31 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 32 | } 33 | encoder = 'vitl' 34 | model = DepthAnythingV2(**model_configs[encoder]) 35 | state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu") 36 | model.load_state_dict(state_dict) 37 | model = model.to(DEVICE).eval() 38 | 39 | title = "# Depth Anything V2" 40 | description = """Official demo for **Depth Anything V2**. 
41 | Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), or [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details.""" 42 | 43 | def predict_depth(image): 44 | return model.infer_image(image) 45 | 46 | with gr.Blocks(css=css) as demo: 47 | gr.Markdown(title) 48 | gr.Markdown(description) 49 | gr.Markdown("### Depth Prediction demo") 50 | 51 | with gr.Row(): 52 | input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input') 53 | depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5) 54 | submit = gr.Button(value="Compute Depth") 55 | gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download",) 56 | raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download",) 57 | 58 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 59 | 60 | def on_submit(image): 61 | original_image = image.copy() 62 | 63 | h, w = image.shape[:2] 64 | 65 | depth = predict_depth(image[:, :, ::-1]) 66 | 67 | raw_depth = Image.fromarray(depth.astype('uint16')) 68 | tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) 69 | raw_depth.save(tmp_raw_depth.name) 70 | 71 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 72 | depth = depth.astype(np.uint8) 73 | colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8) 74 | 75 | gray_depth = Image.fromarray(depth) 76 | tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) 77 | gray_depth.save(tmp_gray_depth.name) 78 | 79 | return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name] 80 | 81 | submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file]) 82 | 83 | example_files = glob.glob('assets/examples/*') 84 | examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file], fn=on_submit) 85 | 86 | 87 | if __name__ == '__main__': 88 | demo.queue().launch() -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 77 | x = x.reshape([B, N, C]) 78 | 79 | x = self.proj(x) 80 | x = self.proj_drop(x) 81 | return x 82 | 83 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/util/blocks.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def _make_scratch(in_shape, out_shape, groups=1, expand=False): 5 | scratch = nn.Module() 6 | 7 | out_shape1 = out_shape 8 | out_shape2 = out_shape 9 | out_shape3 = out_shape 10 | if len(in_shape) >= 4: 11 | out_shape4 = out_shape 12 | 13 | if expand: 14 | out_shape1 = out_shape 15 | out_shape2 = out_shape * 2 16 | out_shape3 = out_shape * 4 17 | if len(in_shape) >= 4: 18 | out_shape4 = out_shape * 8 19 | 20 | scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 21 | scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 22 | scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 23 | if len(in_shape) >= 4: 24 | scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 25 | 26 | return scratch 27 | 28 | 29 | class ResidualConvUnit(nn.Module): 30 | """Residual convolution module. 31 | """ 32 | 33 | def __init__(self, features, activation, bn): 34 | """Init. 
35 | 36 | Args: 37 | features (int): number of features 38 | """ 39 | super().__init__() 40 | 41 | self.bn = bn 42 | 43 | self.groups=1 44 | 45 | self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 46 | 47 | self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 48 | 49 | if self.bn == True: 50 | self.bn1 = nn.BatchNorm2d(features) 51 | self.bn2 = nn.BatchNorm2d(features) 52 | 53 | self.activation = activation 54 | 55 | self.skip_add = nn.quantized.FloatFunctional() 56 | 57 | def forward(self, x): 58 | """Forward pass. 59 | 60 | Args: 61 | x (tensor): input 62 | 63 | Returns: 64 | tensor: output 65 | """ 66 | 67 | out = self.activation(x) 68 | out = self.conv1(out) 69 | if self.bn == True: 70 | out = self.bn1(out) 71 | 72 | out = self.activation(out) 73 | out = self.conv2(out) 74 | if self.bn == True: 75 | out = self.bn2(out) 76 | 77 | if self.groups > 1: 78 | out = self.conv_merge(out) 79 | 80 | return self.skip_add.add(out, x) 81 | 82 | 83 | class FeatureFusionBlock(nn.Module): 84 | """Feature fusion block. 85 | """ 86 | 87 | def __init__( 88 | self, 89 | features, 90 | activation, 91 | deconv=False, 92 | bn=False, 93 | expand=False, 94 | align_corners=True, 95 | size=None 96 | ): 97 | """Init. 98 | 99 | Args: 100 | features (int): number of features 101 | """ 102 | super(FeatureFusionBlock, self).__init__() 103 | 104 | self.deconv = deconv 105 | self.align_corners = align_corners 106 | 107 | self.groups=1 108 | 109 | self.expand = expand 110 | out_features = features 111 | if self.expand == True: 112 | out_features = features // 2 113 | 114 | self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) 115 | 116 | self.resConfUnit1 = ResidualConvUnit(features, activation, bn) 117 | self.resConfUnit2 = ResidualConvUnit(features, activation, bn) 118 | 119 | self.skip_add = nn.quantized.FloatFunctional() 120 | 121 | self.size=size 122 | 123 | def forward(self, *xs, size=None): 124 | """Forward pass. 
125 | 126 | Returns: 127 | tensor: output 128 | """ 129 | output = xs[0] 130 | 131 | if len(xs) == 2: 132 | res = self.resConfUnit1(xs[1]) 133 | output = self.skip_add.add(output, res) 134 | 135 | output = self.resConfUnit2(output) 136 | 137 | if (size is None) and (self.size is None): 138 | modifier = {"scale_factor": 2} 139 | elif size is None: 140 | modifier = {"size": self.size} 141 | else: 142 | modifier = {"size": size} 143 | 144 | output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners) 145 | 146 | output = self.out_conv(output) 147 | 148 | return output 149 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dataset/hypersim.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import h5py 3 | import numpy as np 4 | import torch 5 | from torch.utils.data import Dataset 6 | from torchvision.transforms import Compose 7 | 8 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop 9 | 10 | 11 | def hypersim_distance_to_depth(npyDistance): 12 | intWidth, intHeight, fltFocal = 1024, 768, 886.81 13 | 14 | npyImageplaneX = np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth).reshape( 15 | 1, intWidth).repeat(intHeight, 0).astype(np.float32)[:, :, None] 16 | npyImageplaneY = np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5, 17 | intHeight).reshape(intHeight, 1).repeat(intWidth, 1).astype(np.float32)[:, :, None] 18 | npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32) 19 | npyImageplane = np.concatenate( 20 | [npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2) 21 | 22 | npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal 23 | return npyDepth 24 | 25 | 26 | class Hypersim(Dataset): 27 | def __init__(self, filelist_path, mode, size=(518, 518)): 28 | 29 | self.mode = mode 30 | self.size = size 31 | 32 | with open(filelist_path, 'r') as f: 33 | self.filelist = f.read().splitlines() 34 | 35 | net_w, net_h = size 36 | self.transform = Compose([ 37 | Resize( 38 | width=net_w, 39 | height=net_h, 40 | resize_target=True if mode == 'train' else False, 41 | keep_aspect_ratio=True, 42 | ensure_multiple_of=14, 43 | resize_method='lower_bound', 44 | image_interpolation_method=cv2.INTER_CUBIC, 45 | ), 46 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 47 | PrepareForNet(), 48 | ] + ([Crop(size[0])] if self.mode == 'train' else [])) 49 | 50 | def __getitem__(self, item): 51 | img_path = self.filelist[item].split(' ')[0] 52 | depth_path = self.filelist[item].split(' ')[1] 53 | 54 | image = cv2.imread(img_path) 55 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 56 | 57 | depth_fd = h5py.File(depth_path, "r") 58 | distance_meters = np.array(depth_fd['dataset']) 59 | depth = hypersim_distance_to_depth(distance_meters) 60 | 61 | sample = self.transform({'image': image, 'depth': depth}) 62 | 63 | sample['image'] = torch.from_numpy(sample['image']) 64 | sample['depth'] = torch.from_numpy(sample['depth']) 65 | 66 | sample['valid_mask'] = (torch.isnan(sample['depth']) == 0) 67 | sample['depth'][sample['valid_mask'] == 0] = 0 68 | 69 | sample['image_path'] = self.filelist[item].split(' ')[0] 70 | 71 | return sample 72 | 73 | def __len__(self): 74 | return len(self.filelist) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dataset/kitti.py: 
-------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch.utils.data import Dataset 4 | from torchvision.transforms import Compose 5 | 6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet 7 | 8 | 9 | class KITTI(Dataset): 10 | def __init__(self, filelist_path, mode, size=(518, 518)): 11 | if mode != 'val': 12 | raise NotImplementedError 13 | 14 | self.mode = mode 15 | self.size = size 16 | 17 | with open(filelist_path, 'r') as f: 18 | self.filelist = f.read().splitlines() 19 | 20 | net_w, net_h = size 21 | self.transform = Compose([ 22 | Resize( 23 | width=net_w, 24 | height=net_h, 25 | resize_target=True if mode == 'train' else False, 26 | keep_aspect_ratio=True, 27 | ensure_multiple_of=14, 28 | resize_method='lower_bound', 29 | image_interpolation_method=cv2.INTER_CUBIC, 30 | ), 31 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 32 | PrepareForNet(), 33 | ]) 34 | 35 | def __getitem__(self, item): 36 | img_path = self.filelist[item].split(' ')[0] 37 | depth_path = self.filelist[item].split(' ')[1] 38 | 39 | image = cv2.imread(img_path) 40 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 41 | 42 | depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype('float32') 43 | 44 | sample = self.transform({'image': image, 'depth': depth}) 45 | 46 | sample['image'] = torch.from_numpy(sample['image']) 47 | sample['depth'] = torch.from_numpy(sample['depth']) 48 | sample['depth'] = sample['depth'] / 256.0 # convert in meters 49 | 50 | sample['valid_mask'] = sample['depth'] > 0 51 | 52 | sample['image_path'] = self.filelist[item].split(' ')[0] 53 | 54 | return sample 55 | 56 | def __len__(self): 57 | return len(self.filelist) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dataset/vkitti2.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch.utils.data import Dataset 4 | from torchvision.transforms import Compose 5 | 6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop 7 | 8 | 9 | class VKITTI2(Dataset): 10 | def __init__(self, filelist_path, mode, size=(518, 518)): 11 | 12 | self.mode = mode 13 | self.size = size 14 | 15 | with open(filelist_path, 'r') as f: 16 | self.filelist = f.read().splitlines() 17 | 18 | net_w, net_h = size 19 | self.transform = Compose([ 20 | Resize( 21 | width=net_w, 22 | height=net_h, 23 | resize_target=True if mode == 'train' else False, 24 | keep_aspect_ratio=True, 25 | ensure_multiple_of=14, 26 | resize_method='lower_bound', 27 | image_interpolation_method=cv2.INTER_CUBIC, 28 | ), 29 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 30 | PrepareForNet(), 31 | ] + ([Crop(size[0])] if self.mode == 'train' else [])) 32 | 33 | def __getitem__(self, item): 34 | img_path = self.filelist[item].split(' ')[0] 35 | depth_path = self.filelist[item].split(' ')[1] 36 | 37 | image = cv2.imread(img_path) 38 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 39 | 40 | depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) / 100.0 # cm to m 41 | 42 | sample = self.transform({'image': image, 'depth': depth}) 43 | 44 | sample['image'] = torch.from_numpy(sample['image']) 45 | sample['depth'] = torch.from_numpy(sample['depth']) 46 | 47 | sample['valid_mask'] = (sample['depth'] <= 80) 48 | 49 | sample['image_path'] = self.filelist[item].split(' ')[0] 50 | 51 
| return sample 52 | 53 | def __len__(self): 54 | return len(self.filelist) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 77 | x = x.reshape([B, N, C]) 78 | 79 | x = self.proj(x) 80 | x = self.proj_drop(x) 81 | return x 82 | 83 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/drop_path.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | now=$(date +"%Y%m%d_%H%M%S") 3 | 4 | epoch=120 5 | bs=4 6 | gpus=8 7 | lr=0.000005 8 | encoder=vitl 9 | dataset=hypersim # vkitti 10 | img_size=518 11 | min_depth=0.001 12 | max_depth=20 # 80 for virtual kitti 13 | pretrained_from=../checkpoints/depth_anything_v2_${encoder}.pth 14 | save_path=exp/hypersim # exp/vkitti 15 | 16 | mkdir -p $save_path 17 | 18 | python3 -m torch.distributed.launch \ 19 | --nproc_per_node=$gpus \ 20 | --nnodes 1 \ 21 | --node_rank=0 \ 22 | --master_addr=localhost \ 23 | --master_port=20596 \ 24 | train.py --epoch $epoch --encoder $encoder --bs $bs --lr $lr --save-path $save_path --dataset $dataset \ 25 | --img-size $img_size --min-depth $min_depth --max-depth $max_depth --pretrained-from $pretrained_from \ 26 | --port 20596 2>&1 | tee -a $save_path/$now.log 27 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | opencv-python 3 | open3d 4 | torch 5 | torchvision 6 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation') 14 | 15 | parser.add_argument('--img-path', type=str) 16 | 
parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth') 21 | parser.add_argument('--max-depth', type=float, default=20) 22 | 23 | parser.add_argument('--save-numpy', dest='save_numpy', action='store_true', help='save the model raw output') 24 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 25 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 26 | 27 | args = parser.parse_args() 28 | 29 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 30 | 31 | model_configs = { 32 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 33 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 34 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 35 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 36 | } 37 | 38 | depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth}) 39 | depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu')) 40 | depth_anything = depth_anything.to(DEVICE).eval() 41 | 42 | if os.path.isfile(args.img_path): 43 | if args.img_path.endswith('txt'): 44 | with open(args.img_path, 'r') as f: 45 | filenames = f.read().splitlines() 46 | else: 47 | filenames = [args.img_path] 48 | else: 49 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) 50 | 51 | os.makedirs(args.outdir, exist_ok=True) 52 | 53 | cmap = matplotlib.colormaps.get_cmap('Spectral') 54 | 55 | for k, filename in enumerate(filenames): 56 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 57 | 58 | raw_image = cv2.imread(filename) 59 | 60 | depth = depth_anything.infer_image(raw_image, args.input_size) 61 | 62 | if args.save_numpy: 63 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '_raw_depth_meter.npy') 64 | np.save(output_path, depth) 65 | 66 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 67 | depth = depth.astype(np.uint8) 68 | 69 | if args.grayscale: 70 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) 71 | else: 72 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 73 | 74 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png') 75 | if args.pred_only: 76 | cv2.imwrite(output_path, depth) 77 | else: 78 | split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255 79 | combined_result = cv2.hconcat([raw_image, split_region, depth]) 80 | 81 | cv2.imwrite(output_path, combined_result) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/dist_helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | import torch 5 | import torch.distributed as dist 6 | 7 | 8 | def setup_distributed(backend="nccl", port=None): 9 | """AdaHessian Optimizer 10 | Lifted from https://github.com/BIGBALLON/distribuuuu/blob/master/distribuuuu/utils.py 11 | Originally 
licensed MIT, Copyright (c) 2020 Wei Li 12 | """ 13 | num_gpus = torch.cuda.device_count() 14 | 15 | if "SLURM_JOB_ID" in os.environ: 16 | rank = int(os.environ["SLURM_PROCID"]) 17 | world_size = int(os.environ["SLURM_NTASKS"]) 18 | node_list = os.environ["SLURM_NODELIST"] 19 | addr = subprocess.getoutput(f"scontrol show hostname {node_list} | head -n1") 20 | # specify master port 21 | if port is not None: 22 | os.environ["MASTER_PORT"] = str(port) 23 | elif "MASTER_PORT" not in os.environ: 24 | os.environ["MASTER_PORT"] = "10685" 25 | if "MASTER_ADDR" not in os.environ: 26 | os.environ["MASTER_ADDR"] = addr 27 | os.environ["WORLD_SIZE"] = str(world_size) 28 | os.environ["LOCAL_RANK"] = str(rank % num_gpus) 29 | os.environ["RANK"] = str(rank) 30 | else: 31 | rank = int(os.environ["RANK"]) 32 | world_size = int(os.environ["WORLD_SIZE"]) 33 | 34 | torch.cuda.set_device(rank % num_gpus) 35 | 36 | dist.init_process_group( 37 | backend=backend, 38 | world_size=world_size, 39 | rank=rank, 40 | ) 41 | return rank, world_size 42 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class SiLogLoss(nn.Module): 6 | def __init__(self, lambd=0.5): 7 | super().__init__() 8 | self.lambd = lambd 9 | 10 | def forward(self, pred, target, valid_mask): 11 | valid_mask = valid_mask.detach() 12 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) 13 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() - 14 | self.lambd * torch.pow(diff_log.mean(), 2)) 15 | 16 | return loss 17 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def eval_depth(pred, target): 5 | assert pred.shape == target.shape 6 | 7 | thresh = torch.max((target / pred), (pred / target)) 8 | 9 | d1 = torch.sum(thresh < 1.25).float() / len(thresh) 10 | d2 = torch.sum(thresh < 1.25 ** 2).float() / len(thresh) 11 | d3 = torch.sum(thresh < 1.25 ** 3).float() / len(thresh) 12 | 13 | diff = pred - target 14 | diff_log = torch.log(pred) - torch.log(target) 15 | 16 | abs_rel = torch.mean(torch.abs(diff) / target) 17 | sq_rel = torch.mean(torch.pow(diff, 2) / target) 18 | 19 | rmse = torch.sqrt(torch.mean(torch.pow(diff, 2))) 20 | rmse_log = torch.sqrt(torch.mean(torch.pow(diff_log , 2))) 21 | 22 | log10 = torch.mean(torch.abs(torch.log10(pred) - torch.log10(target))) 23 | silog = torch.sqrt(torch.pow(diff_log, 2).mean() - 0.5 * torch.pow(diff_log.mean(), 2)) 24 | 25 | return {'d1': d1.item(), 'd2': d2.item(), 'd3': d3.item(), 'abs_rel': abs_rel.item(), 'sq_rel': sq_rel.item(), 26 | 'rmse': rmse.item(), 'rmse_log': rmse_log.item(), 'log10':log10.item(), 'silog':silog.item()} -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | import logging 5 | 6 | logs = set() 7 | 8 | 9 | def init_log(name, level=logging.INFO): 10 | if (name, level) in logs: 11 | return 12 | logs.add((name, level)) 13 | logger = logging.getLogger(name) 14 | logger.setLevel(level) 15 | ch = logging.StreamHandler() 16 | ch.setLevel(level) 17 | if "SLURM_PROCID" in 
os.environ: 18 | rank = int(os.environ["SLURM_PROCID"]) 19 | logger.addFilter(lambda record: rank == 0) 20 | else: 21 | rank = 0 22 | format_str = "[%(asctime)s][%(levelname)8s] %(message)s" 23 | formatter = logging.Formatter(format_str) 24 | ch.setFormatter(formatter) 25 | logger.addHandler(ch) 26 | return logger 27 | -------------------------------------------------------------------------------- /ddepth_anything_v2/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio_imageslider 2 | gradio==4.29.0 3 | matplotlib 4 | opencv-python 5 | torch 6 | torchvision 7 | -------------------------------------------------------------------------------- /ddepth_anything_v2/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2') 14 | 15 | parser.add_argument('--img-path', type=str) 16 | parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | 21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 23 | 24 | args = parser.parse_args() 25 | 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | 28 | model_configs = { 29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 33 | } 34 | 35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder]) 36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) 37 | depth_anything = depth_anything.to(DEVICE).eval() 38 | 39 | if os.path.isfile(args.img_path): 40 | if args.img_path.endswith('txt'): 41 | with open(args.img_path, 'r') as f: 42 | filenames = f.read().splitlines() 43 | else: 44 | filenames = [args.img_path] 45 | else: 46 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) 47 | 48 | os.makedirs(args.outdir, exist_ok=True) 49 | 50 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 51 | 52 | for k, filename in enumerate(filenames): 53 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 54 | 55 | raw_image = cv2.imread(filename) 56 | 57 | depth = depth_anything.infer_image(raw_image, args.input_size) 58 | 59 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 60 | depth = depth.astype(np.uint8) 61 | 62 | if args.grayscale: 63 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) 64 | else: 65 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 66 | 67 | if args.pred_only: 68 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), depth) 69 | else: 70 | split_region = 
np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255 71 | combined_result = cv2.hconcat([raw_image, split_region, depth]) 72 | 73 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), combined_result) -------------------------------------------------------------------------------- /ddepth_anything_v2/run_video.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2') 14 | 15 | parser.add_argument('--video-path', type=str) 16 | parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_video_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | 21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 23 | 24 | args = parser.parse_args() 25 | 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | 28 | model_configs = { 29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 33 | } 34 | 35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder]) 36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) 37 | depth_anything = depth_anything.to(DEVICE).eval() 38 | 39 | if os.path.isfile(args.video_path): 40 | if args.video_path.endswith('txt'): 41 | with open(args.video_path, 'r') as f: 42 | lines = f.read().splitlines() 43 | else: 44 | filenames = [args.video_path] 45 | else: 46 | filenames = glob.glob(os.path.join(args.video_path, '**/*'), recursive=True) 47 | 48 | os.makedirs(args.outdir, exist_ok=True) 49 | 50 | margin_width = 50 51 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 52 | 53 | for k, filename in enumerate(filenames): 54 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 55 | 56 | raw_video = cv2.VideoCapture(filename) 57 | frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 58 | frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS)) 59 | 60 | if args.pred_only: 61 | output_width = frame_width 62 | else: 63 | output_width = frame_width * 2 + margin_width 64 | 65 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.mp4') 66 | out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height)) 67 | 68 | while raw_video.isOpened(): 69 | ret, raw_frame = raw_video.read() 70 | if not ret: 71 | break 72 | 73 | depth = depth_anything.infer_image(raw_frame, args.input_size) 74 | 75 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 76 | depth = depth.astype(np.uint8) 77 | 78 | if args.grayscale: 79 | depth = 
np.repeat(depth[..., np.newaxis], 3, axis=-1) 80 | else: 81 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 82 | 83 | if args.pred_only: 84 | out.write(depth) 85 | else: 86 | split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255 87 | combined_frame = cv2.hconcat([raw_frame, split_region, depth]) 88 | 89 | out.write(combined_frame) 90 | 91 | raw_video.release() 92 | out.release() 93 | -------------------------------------------------------------------------------- /dmarigold/marigold/__init__.py: -------------------------------------------------------------------------------- 1 | from .marigold_pipeline import MarigoldPipeline, MarigoldDepthOutput 2 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/batchsize.py: -------------------------------------------------------------------------------- 1 | # Author: Bingxin Ke 2 | # Last modified: 2023-12-15 3 | 4 | import torch 5 | import math 6 | 7 | 8 | # Search table for suggested max. inference batch size 9 | bs_search_table = [ 10 | # tested on A100-PCIE-80GB 11 | {"res": 768, "total_vram": 79, "bs": 35}, 12 | {"res": 1024, "total_vram": 79, "bs": 20}, 13 | # tested on A100-PCIE-40GB 14 | {"res": 768, "total_vram": 39, "bs": 15}, 15 | {"res": 1024, "total_vram": 39, "bs": 8}, 16 | # tested on RTX3090, RTX4090 17 | {"res": 512, "total_vram": 23, "bs": 20}, 18 | {"res": 768, "total_vram": 23, "bs": 7}, 19 | {"res": 1024, "total_vram": 23, "bs": 3}, 20 | # tested on GTX1080Ti 21 | {"res": 512, "total_vram": 10, "bs": 5}, 22 | {"res": 768, "total_vram": 10, "bs": 2}, 23 | ] 24 | 25 | 26 | def find_batch_size(ensemble_size: int, input_res: int) -> int: 27 | """ 28 | Automatically search for suitable operating batch size. 29 | 30 | Args: 31 | ensemble_size (int): Number of predictions to be ensembled 32 | input_res (int): Operating resolution of the input image. 33 | 34 | Returns: 35 | int: Operating batch size 36 | """ 37 | if not torch.cuda.is_available(): 38 | return 1 39 | 40 | total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3 41 | 42 | for settings in sorted(bs_search_table, key=lambda k: (k["res"], -k["total_vram"])): 43 | if input_res <= settings["res"] and total_vram >= settings["total_vram"]: 44 | bs = settings["bs"] 45 | if bs > ensemble_size: 46 | bs = ensemble_size 47 | elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size: 48 | bs = math.ceil(ensemble_size / 2) 49 | return bs 50 | 51 | return 1 52 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/ensemble.py: -------------------------------------------------------------------------------- 1 | # Test align depth images 2 | # Author: Bingxin Ke 3 | # Last modified: 2023-12-15 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from scipy.optimize import minimize 9 | 10 | 11 | def inter_distances(tensors: torch.Tensor): 12 | """ 13 | To calculate the distance between each two depth maps. 
14 | """ 15 | distances = [] 16 | for i, j in torch.combinations(torch.arange(tensors.shape[0])): 17 | arr1 = tensors[i : i + 1] 18 | arr2 = tensors[j : j + 1] 19 | distances.append(arr1 - arr2) 20 | dist = torch.concatenate(distances, dim=0) 21 | return dist 22 | 23 | 24 | def ensemble_depths( 25 | input_images: torch.Tensor, 26 | regularizer_strength: float = 0.02, 27 | max_iter: int = 2, 28 | tol: float = 1e-3, 29 | reduction: str = "median", 30 | max_res: int = None, 31 | ): 32 | """ 33 | To ensemble multiple affine-invariant depth images (up to scale and shift), 34 | by aligning estimating the scale and shift 35 | """ 36 | device = input_images.device 37 | dtype = np.float32 38 | 39 | original_input = input_images.clone() 40 | n_img = input_images.shape[0] 41 | ori_shape = input_images.shape 42 | 43 | if max_res is not None: 44 | scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:])) 45 | if scale_factor < 1: 46 | downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest") 47 | input_images = downscaler(torch.from_numpy(input_images)).numpy() 48 | 49 | # init guess 50 | _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) 51 | _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) 52 | s_init = 1.0 / (_max - _min).reshape((-1, 1, 1)) 53 | t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1)) 54 | x = np.concatenate([s_init, t_init]).reshape(-1) 55 | 56 | input_images = input_images.to(device) 57 | 58 | # objective function 59 | def closure(x): 60 | x = x.astype(dtype) 61 | l = len(x) 62 | s = x[: int(l / 2)] 63 | t = x[int(l / 2) :] 64 | s = torch.from_numpy(s).to(device) 65 | t = torch.from_numpy(t).to(device) 66 | 67 | transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view((-1, 1, 1)) 68 | dists = inter_distances(transformed_arrays) 69 | sqrt_dist = torch.sqrt(torch.mean(dists**2)) 70 | 71 | if "mean" == reduction: 72 | pred = torch.mean(transformed_arrays, dim=0) 73 | elif "median" == reduction: 74 | pred = torch.median(transformed_arrays, dim=0).values 75 | else: 76 | raise ValueError 77 | 78 | near_err = torch.sqrt((0 - torch.min(pred)) ** 2) 79 | far_err = torch.sqrt((1 - torch.max(pred)) ** 2) 80 | 81 | err = sqrt_dist + (near_err + far_err) * regularizer_strength 82 | err = err.detach().cpu().numpy() 83 | return err 84 | 85 | res = minimize( 86 | closure, x, method="BFGS", tol=tol, options={"maxiter": max_iter, "disp": False} 87 | ) 88 | x = res.x 89 | x = x.astype(dtype) 90 | l = len(x) 91 | s = x[: int(l / 2)] 92 | t = x[int(l / 2) :] 93 | 94 | # Prediction 95 | s = torch.from_numpy(s).to(device) 96 | t = torch.from_numpy(t).to(device) 97 | transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1) 98 | if "mean" == reduction: 99 | aligned_images = torch.mean(transformed_arrays, dim=0) 100 | std = torch.std(transformed_arrays, dim=0) 101 | uncertainty = std 102 | elif "median" == reduction: 103 | aligned_images = torch.median(transformed_arrays, dim=0).values 104 | # MAD (median absolute deviation) as uncertainty indicator 105 | abs_dev = torch.abs(transformed_arrays - aligned_images) 106 | mad = torch.median(abs_dev, dim=0).values 107 | uncertainty = mad 108 | else: 109 | raise ValueError(f"Unknown reduction method: {reduction}") 110 | 111 | # Scale and shift to [0, 1] 112 | _min = torch.min(aligned_images) 113 | _max = torch.max(aligned_images) 114 | aligned_images = (aligned_images - _min) / (_max - _min) 115 | uncertainty /= _max - _min 116 | 117 | return aligned_images, 
uncertainty 118 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/image_util.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import torch 4 | from PIL import Image 5 | 6 | 7 | def colorize_depth_maps( 8 | depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None 9 | ): 10 | """ 11 | Colorize depth maps. 12 | """ 13 | assert len(depth_map.shape) >= 2, "Invalid dimension" 14 | 15 | if isinstance(depth_map, torch.Tensor): 16 | depth = depth_map.detach().clone().squeeze().numpy() 17 | elif isinstance(depth_map, np.ndarray): 18 | depth = depth_map.copy().squeeze() 19 | # reshape to [ (B,) H, W ] 20 | if depth.ndim < 3: 21 | depth = depth[np.newaxis, :, :] 22 | 23 | # colorize 24 | cm = matplotlib.colormaps[cmap] 25 | depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1) 26 | img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3] # value from 0 to 1 27 | img_colored_np = np.rollaxis(img_colored_np, 3, 1) 28 | 29 | if valid_mask is not None: 30 | if isinstance(depth_map, torch.Tensor): 31 | valid_mask = valid_mask.detach().numpy() 32 | valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W] 33 | if valid_mask.ndim < 3: 34 | valid_mask = valid_mask[np.newaxis, np.newaxis, :, :] 35 | else: 36 | valid_mask = valid_mask[:, np.newaxis, :, :] 37 | valid_mask = np.repeat(valid_mask, 3, axis=1) 38 | img_colored_np[~valid_mask] = 0 39 | 40 | if isinstance(depth_map, torch.Tensor): 41 | img_colored = torch.from_numpy(img_colored_np).float() 42 | elif isinstance(depth_map, np.ndarray): 43 | img_colored = img_colored_np 44 | 45 | return img_colored 46 | 47 | 48 | def chw2hwc(chw): 49 | assert 3 == len(chw.shape) 50 | if isinstance(chw, torch.Tensor): 51 | hwc = torch.permute(chw, (1, 2, 0)) 52 | elif isinstance(chw, np.ndarray): 53 | hwc = np.moveaxis(chw, 0, -1) 54 | return hwc 55 | 56 | 57 | def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image: 58 | """ 59 | Resize image to limit maximum edge length while keeping aspect ratio 60 | 61 | Args: 62 | img (Image.Image): Image to be resized 63 | max_edge_resolution (int): Maximum edge length (px). 64 | 65 | Returns: 66 | Image.Image: Resized image. 67 | """ 68 | original_width, original_height = img.size 69 | downscale_factor = min( 70 | max_edge_resolution / original_width, max_edge_resolution / original_height 71 | ) 72 | 73 | new_width = int(original_width * downscale_factor) 74 | new_height = int(original_height * downscale_factor) 75 | 76 | resized_img = img.resize((new_width, new_height)) 77 | return resized_img 78 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/seed_all.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import torch 4 | 5 | 6 | def seed_all(seed: int = 0): 7 | """ 8 | Set random seeds of all components. 
9 | """ 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | torch.cuda.manual_seed_all(seed) 14 | -------------------------------------------------------------------------------- /dmidas/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dmidas/backbones/levit.py: -------------------------------------------------------------------------------- 1 | import timm 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | from .utils import activations, get_activation, Transpose 7 | 8 | 9 | def forward_levit(pretrained, x): 10 | pretrained.model.forward_features(x) 11 | 12 | layer_1 = pretrained.activations["1"] 13 | layer_2 = pretrained.activations["2"] 14 | layer_3 = pretrained.activations["3"] 15 | 16 | layer_1 = pretrained.act_postprocess1(layer_1) 17 | layer_2 = pretrained.act_postprocess2(layer_2) 18 | layer_3 = pretrained.act_postprocess3(layer_3) 19 | 20 | return layer_1, layer_2, layer_3 21 | 22 | 23 | def _make_levit_backbone( 24 | model, 25 | hooks=[3, 11, 21], 26 | patch_grid=[14, 14] 27 | ): 28 | pretrained = nn.Module() 29 | 30 | pretrained.model = model 31 | pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) 32 | pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) 33 | pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) 34 | 35 | pretrained.activations = activations 36 | 37 | patch_grid_size = np.array(patch_grid, dtype=int) 38 | 39 | pretrained.act_postprocess1 = nn.Sequential( 40 | Transpose(1, 2), 41 | nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) 42 | ) 43 | pretrained.act_postprocess2 = nn.Sequential( 44 | Transpose(1, 2), 45 | nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 2).astype(int)).tolist())) 46 | ) 47 | pretrained.act_postprocess3 = nn.Sequential( 48 | Transpose(1, 2), 49 | nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 4).astype(int)).tolist())) 50 | ) 51 | 52 | return pretrained 53 | 54 | 55 | class ConvTransposeNorm(nn.Sequential): 56 | """ 57 | Modification of 58 | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: ConvNorm 59 | such that ConvTranspose2d is used instead of Conv2d. 
60 | """ 61 | 62 | def __init__( 63 | self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1, 64 | groups=1, bn_weight_init=1): 65 | super().__init__() 66 | self.add_module('c', 67 | nn.ConvTranspose2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False)) 68 | self.add_module('bn', nn.BatchNorm2d(out_chs)) 69 | 70 | nn.init.constant_(self.bn.weight, bn_weight_init) 71 | 72 | @torch.no_grad() 73 | def fuse(self): 74 | c, bn = self._modules.values() 75 | w = bn.weight / (bn.running_var + bn.eps) ** 0.5 76 | w = c.weight * w[:, None, None, None] 77 | b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 78 | m = nn.ConvTranspose2d( 79 | w.size(1), w.size(0), w.shape[2:], stride=self.c.stride, 80 | padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups) 81 | m.weight.data.copy_(w) 82 | m.bias.data.copy_(b) 83 | return m 84 | 85 | 86 | def stem_b4_transpose(in_chs, out_chs, activation): 87 | """ 88 | Modification of 89 | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: stem_b16 90 | such that ConvTranspose2d is used instead of Conv2d and stem is also reduced to the half. 91 | """ 92 | return nn.Sequential( 93 | ConvTransposeNorm(in_chs, out_chs, 3, 2, 1), 94 | activation(), 95 | ConvTransposeNorm(out_chs, out_chs // 2, 3, 2, 1), 96 | activation()) 97 | 98 | 99 | def _make_pretrained_levit_384(pretrained, hooks=None): 100 | model = timm.create_model("levit_384", pretrained=pretrained) 101 | 102 | hooks = [3, 11, 21] if hooks == None else hooks 103 | return _make_levit_backbone( 104 | model, 105 | hooks=hooks 106 | ) 107 | -------------------------------------------------------------------------------- /dmidas/backbones/swin.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swinl12_384(pretrained, hooks=None): 7 | model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | -------------------------------------------------------------------------------- /dmidas/backbones/swin2.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swin2l24_384(pretrained, hooks=None): 7 | model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | 15 | 16 | def _make_pretrained_swin2b24_384(pretrained, hooks=None): 17 | model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained) 18 | 19 | hooks = [1, 1, 17, 1] if hooks == None else hooks 20 | return _make_swin_backbone( 21 | model, 22 | hooks=hooks 23 | ) 24 | 25 | 26 | def _make_pretrained_swin2t16_256(pretrained, hooks=None): 27 | model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained) 28 | 29 | hooks = [1, 1, 5, 1] if hooks == None else hooks 30 | return _make_swin_backbone( 31 | model, 32 | hooks=hooks, 33 | patch_grid=[64, 64] 34 | ) 35 | -------------------------------------------------------------------------------- /dmidas/backbones/swin_common.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | from .utils import activations, forward_default, get_activation, Transpose 7 | 8 | 9 | def forward_swin(pretrained, x): 10 | return forward_default(pretrained, x) 11 | 12 | 13 | def _make_swin_backbone( 14 | model, 15 | hooks=[1, 1, 17, 1], 16 | patch_grid=[96, 96] 17 | ): 18 | pretrained = nn.Module() 19 | 20 | pretrained.model = model 21 | pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1")) 22 | pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2")) 23 | pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3")) 24 | pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4")) 25 | 26 | pretrained.activations = activations 27 | 28 | if hasattr(model, "patch_grid"): 29 | used_patch_grid = model.patch_grid 30 | else: 31 | used_patch_grid = patch_grid 32 | 33 | patch_grid_size = np.array(used_patch_grid, dtype=int) 34 | 35 | pretrained.act_postprocess1 = nn.Sequential( 36 | Transpose(1, 2), 37 | nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) 38 | ) 39 | pretrained.act_postprocess2 = nn.Sequential( 40 | Transpose(1, 2), 41 | nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist())) 42 | ) 43 | pretrained.act_postprocess3 = nn.Sequential( 44 | Transpose(1, 2), 45 | nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist())) 46 | ) 47 | pretrained.act_postprocess4 = nn.Sequential( 48 | Transpose(1, 2), 49 | nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist())) 50 | ) 51 | 52 | return pretrained 53 | -------------------------------------------------------------------------------- /dmidas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /dmidas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. 
Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /dzoedepth/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
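
The MidasNet shown above takes a normalized N x 3 x H x W image batch (H and W divisible by 32) and returns a relative inverse-depth map with the channel dimension squeezed out. A minimal usage sketch, assuming the module is importable as dmidas.midas_net and that torch.hub can fetch the ResNeXt-101 WSL backbone; the 384 x 384 input size is only illustrative, and predictions are meaningful only when a trained MiDaS checkpoint path is passed:

import torch
from dmidas.midas_net import MidasNet

# path=None builds the network without loading a depth checkpoint;
# pass a trained MiDaS .pt file via path= for real predictions.
model = MidasNet(path=None, features=256, non_negative=True).eval()

x = torch.randn(1, 3, 384, 384)        # dummy normalized RGB batch, N x 3 x H x W
with torch.no_grad():
    prediction = model(x)              # relative inverse depth, shape (1, 384, 384)
print(prediction.shape)
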
22 | -------------------------------------------------------------------------------- /dzoedepth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/dzoedepth/__init__.py -------------------------------------------------------------------------------- /dzoedepth/data/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/data/ddad.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms 32 | 33 | 34 | class ToTensor(object): 35 | def __init__(self, resize_shape): 36 | # self.normalize = transforms.Normalize( 37 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 38 | self.normalize = lambda x : x 39 | self.resize = transforms.Resize(resize_shape) 40 | 41 | def __call__(self, sample): 42 | image, depth = sample['image'], sample['depth'] 43 | image = self.to_tensor(image) 44 | image = self.normalize(image) 45 | depth = self.to_tensor(depth) 46 | 47 | image = self.resize(image) 48 | 49 | return {'image': image, 'depth': depth, 'dataset': "ddad"} 50 | 51 | def to_tensor(self, pic): 52 | 53 | if isinstance(pic, np.ndarray): 54 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 55 | return img 56 | 57 | # # handle PIL Image 58 | if pic.mode == 'I': 59 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 60 | elif pic.mode == 'I;16': 61 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 62 | else: 63 | img = torch.ByteTensor( 64 | torch.ByteStorage.from_buffer(pic.tobytes())) 65 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 66 | if pic.mode == 'YCbCr': 67 | nchannel = 3 68 | elif pic.mode == 'I;16': 69 | nchannel = 1 70 | else: 71 | nchannel = len(pic.mode) 72 | img = img.view(pic.size[1], pic.size[0], nchannel) 73 | 74 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 75 | 76 | if isinstance(img, torch.ByteTensor): 77 | return img.float() 78 | else: 79 | return img 80 | 81 | 82 | class DDAD(Dataset): 83 | def __init__(self, data_dir_root, resize_shape): 84 | import glob 85 | 86 | # image paths are of the form /{outleft, depthmap}/*.png 87 | self.image_files = glob.glob(os.path.join(data_dir_root, '*.png')) 88 | self.depth_files = [r.replace("_rgb.png", "_depth.npy") 89 | for r in self.image_files] 90 | self.transform = ToTensor(resize_shape) 91 | 92 | def __getitem__(self, idx): 93 | 94 | image_path = self.image_files[idx] 95 | depth_path = self.depth_files[idx] 96 | 97 | image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 98 | depth = np.load(depth_path) # meters 99 | 100 | # depth[depth > 8] = -1 101 | depth = depth[..., None] 102 | 103 | sample = dict(image=image, depth=depth) 104 | sample = self.transform(sample) 105 | 106 | if idx == 0: 107 | print(sample["image"].shape) 108 | 109 | return sample 110 | 111 | def __len__(self): 112 | return len(self.image_files) 113 | 114 | 115 | def get_ddad_loader(data_dir_root, resize_shape, batch_size=1, **kwargs): 116 | dataset = DDAD(data_dir_root, resize_shape) 117 | return DataLoader(dataset, batch_size, **kwargs) 118 | -------------------------------------------------------------------------------- /dzoedepth/data/diml_outdoor_test.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following 
conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms 32 | 33 | 34 | class ToTensor(object): 35 | def __init__(self): 36 | # self.normalize = transforms.Normalize( 37 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 38 | self.normalize = lambda x : x 39 | 40 | def __call__(self, sample): 41 | image, depth = sample['image'], sample['depth'] 42 | image = self.to_tensor(image) 43 | image = self.normalize(image) 44 | depth = self.to_tensor(depth) 45 | 46 | return {'image': image, 'depth': depth, 'dataset': "diml_outdoor"} 47 | 48 | def to_tensor(self, pic): 49 | 50 | if isinstance(pic, np.ndarray): 51 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 52 | return img 53 | 54 | # # handle PIL Image 55 | if pic.mode == 'I': 56 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 57 | elif pic.mode == 'I;16': 58 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 59 | else: 60 | img = torch.ByteTensor( 61 | torch.ByteStorage.from_buffer(pic.tobytes())) 62 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 63 | if pic.mode == 'YCbCr': 64 | nchannel = 3 65 | elif pic.mode == 'I;16': 66 | nchannel = 1 67 | else: 68 | nchannel = len(pic.mode) 69 | img = img.view(pic.size[1], pic.size[0], nchannel) 70 | 71 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 72 | if isinstance(img, torch.ByteTensor): 73 | return img.float() 74 | else: 75 | return img 76 | 77 | 78 | class DIML_Outdoor(Dataset): 79 | def __init__(self, data_dir_root): 80 | import glob 81 | 82 | # image paths are of the form /{outleft, depthmap}/*.png 83 | self.image_files = glob.glob(os.path.join( 84 | data_dir_root, "*", 'outleft', '*.png')) 85 | self.depth_files = [r.replace("outleft", "depthmap") 86 | for r in self.image_files] 87 | self.transform = ToTensor() 88 | 89 | def __getitem__(self, idx): 90 | image_path = self.image_files[idx] 91 | depth_path = self.depth_files[idx] 92 | 93 | image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 94 | depth = np.asarray(Image.open(depth_path), 95 | dtype='uint16') / 1000.0 # mm to meters 96 | 97 | # depth[depth > 8] = -1 98 | depth = depth[..., None] 99 | 100 | sample = dict(image=image, depth=depth, dataset="diml_outdoor") 101 | 102 | # return sample 103 | return self.transform(sample) 104 | 105 | def __len__(self): 106 | return len(self.image_files) 107 | 108 | 109 | def get_diml_outdoor_loader(data_dir_root, batch_size=1, **kwargs): 110 | dataset = DIML_Outdoor(data_dir_root) 111 | return DataLoader(dataset, batch_size, **kwargs) 112 | 113 | # get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/HR") 114 | # get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/LR") 115 | 
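
The DDAD and DIML loaders above both yield dict batches with 'image', 'depth' and 'dataset' keys: the image is resized by the ToTensor transform, while the depth map keeps its original resolution (meters for DDAD, millimeters converted to meters for DIML outdoor). A minimal iteration sketch; the dataset root and the (352, 1216) resize shape are placeholders, and DDAD frames are expected as <name>_rgb.png with matching <name>_depth.npy files:

from dzoedepth.data.ddad import get_ddad_loader

loader = get_ddad_loader("datasets/ddad/val", resize_shape=(352, 1216), batch_size=1)

for sample in loader:
    image = sample["image"]    # resized RGB tensor
    depth = sample["depth"]    # metric depth in meters, original resolution
    print(sample["dataset"], image.shape, depth.shape)
    break
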
-------------------------------------------------------------------------------- /dzoedepth/data/ibims.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms as T 32 | 33 | 34 | class iBims(Dataset): 35 | def __init__(self, config): 36 | root_folder = config.ibims_root 37 | with open(os.path.join(root_folder, "imagelist.txt"), 'r') as f: 38 | imglist = f.read().split() 39 | 40 | samples = [] 41 | for basename in imglist: 42 | img_path = os.path.join(root_folder, 'rgb', basename + ".png") 43 | depth_path = os.path.join(root_folder, 'depth', basename + ".png") 44 | valid_mask_path = os.path.join( 45 | root_folder, 'mask_invalid', basename+".png") 46 | transp_mask_path = os.path.join( 47 | root_folder, 'mask_transp', basename+".png") 48 | 49 | samples.append( 50 | (img_path, depth_path, valid_mask_path, transp_mask_path)) 51 | 52 | self.samples = samples 53 | # self.normalize = T.Normalize( 54 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 55 | self.normalize = lambda x : x 56 | 57 | def __getitem__(self, idx): 58 | img_path, depth_path, valid_mask_path, transp_mask_path = self.samples[idx] 59 | 60 | img = np.asarray(Image.open(img_path), dtype=np.float32) / 255.0 61 | depth = np.asarray(Image.open(depth_path), 62 | dtype=np.uint16).astype('float')*50.0/65535 63 | 64 | mask_valid = np.asarray(Image.open(valid_mask_path)) 65 | mask_transp = np.asarray(Image.open(transp_mask_path)) 66 | 67 | # depth = depth * mask_valid * mask_transp 68 | depth = np.where(mask_valid * mask_transp, depth, -1) 69 | 70 | img = torch.from_numpy(img).permute(2, 0, 1) 71 | img = self.normalize(img) 72 | depth = torch.from_numpy(depth).unsqueeze(0) 73 | return dict(image=img, depth=depth, image_path=img_path, depth_path=depth_path, dataset='ibims') 74 | 75 | def __len__(self): 76 | return len(self.samples) 77 | 78 | 79 | def get_ibims_loader(config, batch_size=1, **kwargs): 80 | dataloader = DataLoader(iBims(config), batch_size=batch_size, **kwargs) 81 | return dataloader 82 | -------------------------------------------------------------------------------- 
/dzoedepth/data/sun_rgbd_loader.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms 32 | 33 | 34 | class ToTensor(object): 35 | def __init__(self): 36 | # self.normalize = transforms.Normalize( 37 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 38 | self.normalize = lambda x : x 39 | 40 | def __call__(self, sample): 41 | image, depth = sample['image'], sample['depth'] 42 | image = self.to_tensor(image) 43 | image = self.normalize(image) 44 | depth = self.to_tensor(depth) 45 | 46 | return {'image': image, 'depth': depth, 'dataset': "sunrgbd"} 47 | 48 | def to_tensor(self, pic): 49 | 50 | if isinstance(pic, np.ndarray): 51 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 52 | return img 53 | 54 | # # handle PIL Image 55 | if pic.mode == 'I': 56 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 57 | elif pic.mode == 'I;16': 58 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 59 | else: 60 | img = torch.ByteTensor( 61 | torch.ByteStorage.from_buffer(pic.tobytes())) 62 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 63 | if pic.mode == 'YCbCr': 64 | nchannel = 3 65 | elif pic.mode == 'I;16': 66 | nchannel = 1 67 | else: 68 | nchannel = len(pic.mode) 69 | img = img.view(pic.size[1], pic.size[0], nchannel) 70 | 71 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 72 | if isinstance(img, torch.ByteTensor): 73 | return img.float() 74 | else: 75 | return img 76 | 77 | 78 | class SunRGBD(Dataset): 79 | def __init__(self, data_dir_root): 80 | # test_file_dirs = loadmat(train_test_file)['alltest'].squeeze() 81 | # all_test = [t[0].replace("/n/fs/sun3d/data/", "") for t in test_file_dirs] 82 | # self.all_test = [os.path.join(data_dir_root, t) for t in all_test] 83 | import glob 84 | self.image_files = glob.glob( 85 | os.path.join(data_dir_root, 'rgb', 'rgb', '*')) 86 | self.depth_files = [ 87 | r.replace("rgb/rgb", "gt/gt").replace("jpg", "png") for r in self.image_files] 88 | self.transform = ToTensor() 89 | 90 | def __getitem__(self, idx): 91 | image_path = self.image_files[idx] 92 | depth_path = 
self.depth_files[idx] 93 | 94 | image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 95 | depth = np.asarray(Image.open(depth_path), dtype='uint16') / 1000.0 96 | depth[depth > 8] = -1 97 | depth = depth[..., None] 98 | return self.transform(dict(image=image, depth=depth)) 99 | 100 | def __len__(self): 101 | return len(self.image_files) 102 | 103 | 104 | def get_sunrgbd_loader(data_dir_root, batch_size=1, **kwargs): 105 | dataset = SunRGBD(data_dir_root) 106 | return DataLoader(dataset, batch_size, **kwargs) 107 | -------------------------------------------------------------------------------- /dzoedepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/models/base_models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
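
get_sunrgbd_loader above follows the same pattern: RGB frames come from rgb/rgb/*.jpg, ground truth from gt/gt/*.png, depths are converted from millimeters to meters, and values beyond 8 m are marked invalid with -1. A minimal sketch; the dataset root is a placeholder:

from dzoedepth.data.sun_rgbd_loader import get_sunrgbd_loader

loader = get_sunrgbd_loader("datasets/sunrgbd_test", batch_size=1)

for sample in loader:
    depth = sample["depth"]               # meters; -1 marks masked-out (> 8 m) pixels
    valid = depth > 0
    print(sample["dataset"], depth[valid].min().item(), depth[valid].max().item())
    break
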
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/models/builder.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from importlib import import_module 26 | from dzoedepth.models.depth_model import DepthModel 27 | 28 | def build_model(config) -> DepthModel: 29 | """Builds a model from a config. The model is specified by the model name and version in the config. The model is then constructed using the build_from_config function of the model interface. 30 | This function should be used to construct models for training and evaluation. 31 | 32 | Args: 33 | config (dict): Config dict. Config is constructed in utils/config.py. Each model has its own config file(s) saved in its root model folder. 34 | 35 | Returns: 36 | torch.nn.Module: Model corresponding to name and version as specified in config 37 | """ 38 | module_name = f"dzoedepth.models.{config.model}" 39 | try: 40 | module = import_module(module_name) 41 | except ModuleNotFoundError as e: 42 | # print the original error message 43 | print(e) 44 | raise ValueError( 45 | f"Model {config.model} not found. 
Refer above error for details.") from e 46 | try: 47 | get_version = getattr(module, "get_version") 48 | except AttributeError as e: 49 | raise ValueError( 50 | f"Model {config.model} has no get_version function.") from e 51 | return get_version(config.version_name).build_from_config(config) 52 | -------------------------------------------------------------------------------- /dzoedepth/models/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/dzoedepth/models/layers/__init__.py -------------------------------------------------------------------------------- /dzoedepth/models/layers/patch_transformer.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import torch 26 | import torch.nn as nn 27 | 28 | 29 | class PatchTransformerEncoder(nn.Module): 30 | def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4, use_class_token=False): 31 | """ViT-like transformer block 32 | 33 | Args: 34 | in_channels (int): Input channels 35 | patch_size (int, optional): patch size. Defaults to 10. 36 | embedding_dim (int, optional): Embedding dimension in transformer model. Defaults to 128. 37 | num_heads (int, optional): number of attention heads. Defaults to 4. 38 | use_class_token (bool, optional): Whether to use extra token at the start for global accumulation (called as "class token"). Defaults to False. 
39 | """ 40 | super(PatchTransformerEncoder, self).__init__() 41 | self.use_class_token = use_class_token 42 | encoder_layers = nn.TransformerEncoderLayer( 43 | embedding_dim, num_heads, dim_feedforward=1024) 44 | self.transformer_encoder = nn.TransformerEncoder( 45 | encoder_layers, num_layers=4) # takes shape S,N,E 46 | 47 | self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim, 48 | kernel_size=patch_size, stride=patch_size, padding=0) 49 | 50 | def positional_encoding_1d(self, sequence_length, batch_size, embedding_dim, device='cpu'): 51 | """Generate positional encodings 52 | 53 | Args: 54 | sequence_length (int): Sequence length 55 | embedding_dim (int): Embedding dimension 56 | 57 | Returns: 58 | torch.Tensor SBE: Positional encodings 59 | """ 60 | position = torch.arange( 61 | 0, sequence_length, dtype=torch.float32, device=device).unsqueeze(1) 62 | index = torch.arange( 63 | 0, embedding_dim, 2, dtype=torch.float32, device=device).unsqueeze(0) 64 | div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim)) 65 | pos_encoding = position * div_term 66 | pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1) 67 | pos_encoding = pos_encoding.unsqueeze(1).repeat(1, batch_size, 1) 68 | return pos_encoding 69 | 70 | 71 | def forward(self, x): 72 | """Forward pass 73 | 74 | Args: 75 | x (torch.Tensor - NCHW): Input feature tensor 76 | 77 | Returns: 78 | torch.Tensor - SNE: Transformer output embeddings. S - sequence length (=HW/patch_size^2), N - batch size, E - embedding dim 79 | """ 80 | embeddings = self.embedding_convPxP(x).flatten( 81 | 2) # .shape = n,c,s = n, embedding_dim, s 82 | if self.use_class_token: 83 | # extra special token at start ? 84 | embeddings = nn.functional.pad(embeddings, (1, 0)) 85 | 86 | # change to S,N,E format required by transformer 87 | embeddings = embeddings.permute(2, 0, 1) 88 | S, N, E = embeddings.shape 89 | # dtype IS ADDED, NOT PRESENT IN THE MAINLINE 90 | embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device).to(dtype=embeddings.dtype) 91 | x = self.transformer_encoder(embeddings) # .shape = S, N, E 92 | return x 93 | -------------------------------------------------------------------------------- /dzoedepth/models/model_io.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import torch 26 | 27 | def load_state_dict(model, state_dict): 28 | """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict. 29 | 30 | DataParallel prefixes state_dict keys with 'module.' when saving. 31 | If the model is not a DataParallel model but the state_dict is, then prefixes are removed. 32 | If the model is a DataParallel model but the state_dict is not, then prefixes are added. 33 | """ 34 | state_dict = state_dict.get('model', state_dict) 35 | # if model is a DataParallel model, then state_dict keys are prefixed with 'module.' 36 | 37 | do_prefix = isinstance( 38 | model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) 39 | state = {} 40 | for k, v in state_dict.items(): 41 | if k.startswith('module.') and not do_prefix: 42 | k = k[7:] 43 | 44 | if not k.startswith('module.') and do_prefix: 45 | k = 'module.' + k 46 | 47 | state[k] = v 48 | 49 | model.load_state_dict(state, strict=False) 50 | print("Loaded successfully") 51 | return model 52 | 53 | 54 | def load_wts(model, checkpoint_path): 55 | ckpt = torch.load(checkpoint_path, map_location='cpu') 56 | return load_state_dict(model, ckpt) 57 | 58 | 59 | def load_state_dict_from_url(model, url, **kwargs): 60 | state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu', **kwargs) 61 | return load_state_dict(model, state_dict) 62 | 63 | 64 | def load_state_from_resource(model, resource: str): 65 | """Loads weights to the model from a given resource. A resource can be of following types: 66 | 1. URL. Prefixed with "url::" 67 | e.g. url::http(s)://url.resource.com/ckpt.pt 68 | 69 | 2. Local path. Prefixed with "local::" 70 | e.g. 
local::/path/to/ckpt.pt 71 | 72 | 73 | Args: 74 | model (torch.nn.Module): Model 75 | resource (str): resource string 76 | 77 | Returns: 78 | torch.nn.Module: Model with loaded weights 79 | """ 80 | print(f"Using pretrained resource {resource}") 81 | 82 | if resource.startswith('url::'): 83 | url = resource.split('url::')[1] 84 | return load_state_dict_from_url(model, url, progress=True) 85 | 86 | elif resource.startswith('local::'): 87 | path = resource.split('local::')[1] 88 | return load_wts(model, path) 89 | 90 | else: 91 | raise ValueError("Invalid resource type, only url:: and local:: are supported") 92 | -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_v1 import ZoeDepth 26 | 27 | all_versions = { 28 | "v1": ZoeDepth, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth/config_zoedepth.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ZoeDepth", 4 | "version_name": "v1", 5 | "n_bins": 64, 6 | "bin_embedding_dim": 128, 7 | "bin_centers_type": "softplus", 8 | "n_attractors":[16, 8, 4, 1], 9 | "attractor_alpha": 1000, 10 | "attractor_gamma": 2, 11 | "attractor_kind" : "mean", 12 | "attractor_type" : "inv", 13 | "midas_model_type" : "DPT_BEiT_L_384", 14 | "min_temp": 0.0212, 15 | "max_temp": 50.0, 16 | "output_distribution": "logbinomial", 17 | "memory_efficient": true, 18 | "inverse_midas": false, 19 | "img_size": [384, 512] 20 | }, 21 | 22 | "train": { 23 | "train_midas": true, 24 | "use_pretrained_midas": true, 25 | "trainer": "zoedepth", 26 | "epochs": 5, 27 | "bs": 16, 28 | "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, 29 | "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, 30 | "same_lr": false, 31 | "w_si": 1, 32 | "w_domain": 0.2, 33 | "w_reg": 0, 34 | "w_grad": 0, 35 | "avoid_boundary": false, 36 | "random_crop": false, 37 | "input_width": 640, 38 | "input_height": 480, 39 | "midas_lr_factor": 1, 40 | "encoder_lr_factor":10, 41 | "pos_enc_lr_factor":10, 42 | "freeze_midas_bn": true 43 | 44 | }, 45 | 46 | "infer":{ 47 | "train_midas": false, 48 | "use_pretrained_midas": false, 49 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", 50 | "force_keep_ar": true 51 | }, 52 | 53 | "eval":{ 54 | "train_midas": false, 55 | "use_pretrained_midas": false, 56 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" 57 | } 58 | } -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth/config_zoedepth_kitti.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "bin_centers_type": "normed", 4 | "img_size": [384, 768] 5 | }, 6 | 7 | "train": { 8 | }, 9 | 10 | "infer":{ 11 | "train_midas": false, 12 | "use_pretrained_midas": false, 13 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", 14 | "force_keep_ar": true 15 | }, 16 | 17 | "eval":{ 18 | "train_midas": false, 19 | "use_pretrained_midas": false, 20 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" 21 | } 22 | } -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth_nk/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 
12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_nk_v1 import ZoeDepthNK 26 | 27 | all_versions = { 28 | "v1": ZoeDepthNK, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth_nk/config_zoedepth_nk.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ZoeDepthNK", 4 | "version_name": "v1", 5 | "bin_conf" : [ 6 | { 7 | "name": "nyu", 8 | "n_bins": 64, 9 | "min_depth": 1e-3, 10 | "max_depth": 10.0 11 | }, 12 | { 13 | "name": "kitti", 14 | "n_bins": 64, 15 | "min_depth": 1e-3, 16 | "max_depth": 80.0 17 | } 18 | ], 19 | "bin_embedding_dim": 128, 20 | "bin_centers_type": "softplus", 21 | "n_attractors":[16, 8, 4, 1], 22 | "attractor_alpha": 1000, 23 | "attractor_gamma": 2, 24 | "attractor_kind" : "mean", 25 | "attractor_type" : "inv", 26 | "min_temp": 0.0212, 27 | "max_temp": 50.0, 28 | "memory_efficient": true, 29 | "midas_model_type" : "DPT_BEiT_L_384", 30 | "img_size": [384, 512] 31 | }, 32 | 33 | "train": { 34 | "train_midas": true, 35 | "use_pretrained_midas": true, 36 | "trainer": "zoedepth_nk", 37 | "epochs": 5, 38 | "bs": 16, 39 | "optim_kwargs": {"lr": 0.0002512, "wd": 0.01}, 40 | "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, 41 | "same_lr": false, 42 | "w_si": 1, 43 | "w_domain": 100, 44 | "avoid_boundary": false, 45 | "random_crop": false, 46 | "input_width": 640, 47 | "input_height": 480, 48 | "w_grad": 0, 49 | "w_reg": 0, 50 | "midas_lr_factor": 10, 51 | "encoder_lr_factor":10, 52 | "pos_enc_lr_factor":10 53 | }, 54 | 55 | "infer": { 56 | "train_midas": false, 57 | "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", 58 | "use_pretrained_midas": false, 59 | "force_keep_ar": true 60 | }, 61 | 62 | "eval": { 63 | "train_midas": false, 64 | "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", 65 | "use_pretrained_midas": false 66 | } 67 | } -------------------------------------------------------------------------------- /dzoedepth/trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/dzoedepth/trainers/__init__.py -------------------------------------------------------------------------------- /dzoedepth/trainers/builder.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and 
associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from importlib import import_module 26 | 27 | 28 | def get_trainer(config): 29 | """Builds and returns a trainer based on the config. 30 | 31 | Args: 32 | config (dict): the config dict (typically constructed using utils.config.get_config) 33 | config.trainer (str): the name of the trainer to use. The module named "{config.trainer}_trainer" must exist in trainers root module 34 | 35 | Raises: 36 | ValueError: If the specified trainer does not exist under trainers/ folder 37 | 38 | Returns: 39 | Trainer (inherited from zoedepth.trainers.BaseTrainer): The Trainer object 40 | """ 41 | assert "trainer" in config and config.trainer is not None and config.trainer != '', "Trainer not specified. Config: {0}".format( 42 | config) 43 | try: 44 | Trainer = getattr(import_module( 45 | f"zoedepth.trainers.{config.trainer}_trainer"), 'Trainer') 46 | except ModuleNotFoundError as e: 47 | raise ValueError(f"Trainer {config.trainer}_trainer not found.") from e 48 | return Trainer 49 | -------------------------------------------------------------------------------- /dzoedepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/utils/arg_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def infer_type(x): # hacky way to infer type from string args 4 | if not isinstance(x, str): 5 | return x 6 | 7 | try: 8 | x = int(x) 9 | return x 10 | except ValueError: 11 | pass 12 | 13 | try: 14 | x = float(x) 15 | return x 16 | except ValueError: 17 | pass 18 | 19 | return x 20 | 21 | 22 | def parse_unknown(unknown_args): 23 | clean = [] 24 | for a in unknown_args: 25 | if "=" in a: 26 | k, v = a.split("=") 27 | clean.extend([k, v]) 28 | else: 29 | clean.append(a) 30 | 31 | keys = clean[::2] 32 | values = clean[1::2] 33 | return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)} 34 | -------------------------------------------------------------------------------- /dzoedepth/utils/easydict/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | EasyDict 3 | Copy/pasted from https://github.com/makinacorpus/easydict 4 | Original author: Mathieu Leplatre 5 | """ 6 | 7 | class EasyDict(dict): 8 | """ 9 | Get attributes 10 | 11 | >>> d = EasyDict({'foo':3}) 12 | >>> d['foo'] 13 | 3 14 | >>> d.foo 15 | 3 16 | >>> d.bar 17 | Traceback (most recent call last): 18 | ... 19 | AttributeError: 'EasyDict' object has no attribute 'bar' 20 | 21 | Works recursively 22 | 23 | >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}}) 24 | >>> isinstance(d.bar, dict) 25 | True 26 | >>> d.bar.x 27 | 1 28 | 29 | Bullet-proof 30 | 31 | >>> EasyDict({}) 32 | {} 33 | >>> EasyDict(d={}) 34 | {} 35 | >>> EasyDict(None) 36 | {} 37 | >>> d = {'a': 1} 38 | >>> EasyDict(**d) 39 | {'a': 1} 40 | >>> EasyDict((('a', 1), ('b', 2))) 41 | {'a': 1, 'b': 2} 42 | 43 | Set attributes 44 | 45 | >>> d = EasyDict() 46 | >>> d.foo = 3 47 | >>> d.foo 48 | 3 49 | >>> d.bar = {'prop': 'value'} 50 | >>> d.bar.prop 51 | 'value' 52 | >>> d 53 | {'foo': 3, 'bar': {'prop': 'value'}} 54 | >>> d.bar.prop = 'newer' 55 | >>> d.bar.prop 56 | 'newer' 57 | 58 | 59 | Values extraction 60 | 61 | >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]}) 62 | >>> isinstance(d.bar, list) 63 | True 64 | >>> from operator import attrgetter 65 | >>> list(map(attrgetter('x'), d.bar)) 66 | [1, 3] 67 | >>> list(map(attrgetter('y'), d.bar)) 68 | [2, 4] 69 | >>> d = EasyDict() 70 | >>> list(d.keys()) 71 | [] 72 | >>> d = EasyDict(foo=3, bar=dict(x=1, y=2)) 73 | >>> d.foo 74 | 3 75 | >>> d.bar.x 76 | 1 77 | 78 | Still like a dict though 79 | 80 | >>> o = EasyDict({'clean':True}) 81 | >>> list(o.items()) 82 | [('clean', True)] 83 | 84 | And like a class 85 | 86 | >>> class Flower(EasyDict): 87 | ... power = 1 88 | ... 89 | >>> f = Flower() 90 | >>> f.power 91 | 1 92 | >>> f = Flower({'height': 12}) 93 | >>> f.height 94 | 12 95 | >>> f['power'] 96 | 1 97 | >>> sorted(f.keys()) 98 | ['height', 'power'] 99 | 100 | update and pop items 101 | >>> d = EasyDict(a=1, b='2') 102 | >>> e = EasyDict(c=3.0, a=9.0) 103 | >>> d.update(e) 104 | >>> d.c 105 | 3.0 106 | >>> d['c'] 107 | 3.0 108 | >>> d.get('c') 109 | 3.0 110 | >>> d.update(a=4, b=4) 111 | >>> d.b 112 | 4 113 | >>> d.pop('a') 114 | 4 115 | >>> d.a 116 | Traceback (most recent call last): 117 | ... 
118 | AttributeError: 'EasyDict' object has no attribute 'a' 119 | """ 120 | def __init__(self, d=None, **kwargs): 121 | if d is None: 122 | d = {} 123 | else: 124 | d = dict(d) 125 | if kwargs: 126 | d.update(**kwargs) 127 | for k, v in d.items(): 128 | setattr(self, k, v) 129 | # Class attributes 130 | for k in self.__class__.__dict__.keys(): 131 | if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'): 132 | setattr(self, k, getattr(self, k)) 133 | 134 | def __setattr__(self, name, value): 135 | if isinstance(value, (list, tuple)): 136 | value = [self.__class__(x) 137 | if isinstance(x, dict) else x for x in value] 138 | elif isinstance(value, dict) and not isinstance(value, self.__class__): 139 | value = self.__class__(value) 140 | super(EasyDict, self).__setattr__(name, value) 141 | super(EasyDict, self).__setitem__(name, value) 142 | 143 | __setitem__ = __setattr__ 144 | 145 | def update(self, e=None, **f): 146 | d = e or dict() 147 | d.update(f) 148 | for k in d: 149 | setattr(self, k, d[k]) 150 | 151 | def pop(self, k, d=None): 152 | delattr(self, k) 153 | return super(EasyDict, self).pop(k, d) 154 | 155 | 156 | if __name__ == "__main__": 157 | import doctest 158 | doctest.testmod() -------------------------------------------------------------------------------- /dzoedepth/utils/geometry.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import numpy as np 26 | 27 | def get_intrinsics(H,W): 28 | """ 29 | Intrinsics for a pinhole camera model. 30 | Assume fov of 55 degrees and central principal point. 
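    With this convention the focal length is f = 0.5 * W / tan(0.5 * fov),
    which for fov = 55 degrees works out to roughly f = 0.96 * W.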
31 | """ 32 | f = 0.5 * W / np.tan(0.5 * 55 * np.pi / 180.0) 33 | cx = 0.5 * W 34 | cy = 0.5 * H 35 | return np.array([[f, 0, cx], 36 | [0, f, cy], 37 | [0, 0, 1]]) 38 | 39 | def depth_to_points(depth, R=None, t=None): 40 | 41 | K = get_intrinsics(depth.shape[1], depth.shape[2]) 42 | Kinv = np.linalg.inv(K) 43 | if R is None: 44 | R = np.eye(3) 45 | if t is None: 46 | t = np.zeros(3) 47 | 48 | # M converts from your coordinate to PyTorch3D's coordinate system 49 | M = np.eye(3) 50 | M[0, 0] = -1.0 51 | M[1, 1] = -1.0 52 | 53 | height, width = depth.shape[1:3] 54 | 55 | x = np.arange(width) 56 | y = np.arange(height) 57 | coord = np.stack(np.meshgrid(x, y), -1) 58 | coord = np.concatenate((coord, np.ones_like(coord)[:, :, [0]]), -1) # z=1 59 | coord = coord.astype(np.float32) 60 | # coord = torch.as_tensor(coord, dtype=torch.float32, device=device) 61 | coord = coord[None] # bs, h, w, 3 62 | 63 | D = depth[:, :, :, None, None] 64 | # print(D.shape, Kinv[None, None, None, ...].shape, coord[:, :, :, :, None].shape ) 65 | pts3D_1 = D * Kinv[None, None, None, ...] @ coord[:, :, :, :, None] 66 | # pts3D_1 live in your coordinate system. Convert them to Py3D's 67 | pts3D_1 = M[None, None, None, ...] @ pts3D_1 68 | # from reference to targe tviewpoint 69 | pts3D_2 = R[None, None, None, ...] @ pts3D_1 + t[None, None, None, :, None] 70 | # pts3D_2 = pts3D_1 71 | # depth_2 = pts3D_2[:, :, :, 2, :] # b,1,h,w 72 | return pts3D_2[:, :, :, :3, 0][0] 73 | 74 | 75 | def create_triangles(h, w, mask=None): 76 | """ 77 | Reference: https://github.com/google-research/google-research/blob/e96197de06613f1b027d20328e06d69829fa5a89/infinite_nature/render_utils.py#L68 78 | Creates mesh triangle indices from a given pixel grid size. 79 | This function is not and need not be differentiable as triangle indices are 80 | fixed. 81 | Args: 82 | h: (int) denoting the height of the image. 83 | w: (int) denoting the width of the image. 
84 | Returns: 85 | triangles: 2D numpy array of indices (int) with shape (2(W-1)(H-1) x 3) 86 | """ 87 | x, y = np.meshgrid(range(w - 1), range(h - 1)) 88 | tl = y * w + x 89 | tr = y * w + x + 1 90 | bl = (y + 1) * w + x 91 | br = (y + 1) * w + x + 1 92 | triangles = np.array([tl, bl, tr, br, tr, bl]) 93 | triangles = np.transpose(triangles, (1, 2, 0)).reshape( 94 | ((w - 1) * (h - 1) * 2, 3)) 95 | if mask is not None: 96 | mask = mask.reshape(-1) 97 | triangles = triangles[mask[triangles].all(1)] 98 | return triangles 99 | -------------------------------------------------------------------------------- /examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/examples.png -------------------------------------------------------------------------------- /inpaint/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2020 Virginia Tech Vision and Learning Lab 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | ------------------ LICENSE FOR MiDaS -------------------- 25 | 26 | MIT License 27 | 28 | Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining a copy 31 | of this software and associated documentation files (the "Software"), to deal 32 | in the Software without restriction, including without limitation the rights 33 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 34 | copies of the Software, and to permit persons to whom the Software is 35 | furnished to do so, subject to the following conditions: 36 | 37 | The above copyright notice and this permission notice shall be included in all 38 | copies or substantial portions of the Software. 39 | 40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 46 | SOFTWARE. 
47 | 48 | --------------------------- LICENSE FOR EdgeConnect -------------------------------- 49 | 50 | Attribution-NonCommercial 4.0 International -------------------------------------------------------------------------------- /inpaint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/inpaint/__init__.py -------------------------------------------------------------------------------- /inpaint/argument.yml: -------------------------------------------------------------------------------- 1 | depth_edge_model_ckpt: checkpoints/edge-model.pth 2 | depth_feat_model_ckpt: checkpoints/depth-model.pth 3 | rgb_feat_model_ckpt: checkpoints/color-model.pth 4 | MiDaS_model_ckpt: MiDaS/model.pt 5 | use_boostmonodepth: True 6 | fps: 40 7 | num_frames: 240 8 | x_shift_range: [0.00, 0.00, -0.015, -0.015] 9 | y_shift_range: [0.00, 0.00, -0.015, -0.00] 10 | z_shift_range: [-0.05, -0.05, -0.05, -0.05] 11 | traj_types: ['double-straight-line', 'double-straight-line', 'circle', 'circle'] 12 | video_postfix: ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'] 13 | specific: '' 14 | longer_side_len: 960 15 | src_folder: image 16 | depth_folder: depth 17 | mesh_folder: mesh 18 | video_folder: video 19 | load_ply: False 20 | save_ply: True 21 | inference_video: True 22 | gpu_ids: 0 23 | offscreen_rendering: False 24 | img_format: '.jpg' 25 | depth_format: '.npy' 26 | require_midas: True 27 | depth_threshold: 0.04 28 | ext_edge_threshold: 0.002 29 | sparse_iter: 5 30 | filter_size: [7, 7, 5, 5, 5] 31 | sigma_s: 4.0 32 | sigma_r: 0.5 33 | redundant_number: 12 34 | background_thickness: 70 35 | context_thickness: 140 36 | background_thickness_2: 70 37 | context_thickness_2: 70 38 | discount_factor: 1.00 39 | log_depth: True 40 | largest_size: 512 41 | depth_edge_dilate: 10 42 | depth_edge_dilate_2: 5 43 | extrapolate_border: True 44 | extrapolation_thickness: 60 45 | repeat_inpaint_edge: True 46 | crop_border: [0.03, 0.03, 0.05, 0.03] 47 | anti_flickering: True 48 | -------------------------------------------------------------------------------- /inpaint/boostmonodepth_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import numpy as np 5 | import imageio 6 | from MiDaS.MiDaS_utils import write_depth 7 | 8 | BOOST_BASE = 'BoostingMonocularDepth' 9 | 10 | BOOST_INPUTS = 'inputs' 11 | BOOST_OUTPUTS = 'outputs' 12 | 13 | def run_boostmonodepth(img_names, src_folder, depth_folder): 14 | 15 | if not isinstance(img_names, list): 16 | img_names = [img_names] 17 | 18 | # remove irrelevant files first 19 | clean_folder(os.path.join(BOOST_BASE, BOOST_INPUTS)) 20 | clean_folder(os.path.join(BOOST_BASE, BOOST_OUTPUTS)) 21 | 22 | tgt_names = [] 23 | for img_name in img_names: 24 | base_name = os.path.basename(img_name) 25 | tgt_name = os.path.join(BOOST_BASE, BOOST_INPUTS, base_name) 26 | os.system(f'cp {img_name} {tgt_name}') 27 | 28 | # keep only the file name here. 29 | # they save all depth as .png file 30 | tgt_names.append(os.path.basename(tgt_name).replace('.jpg', '.png')) 31 | 32 | os.system(f'cd {BOOST_BASE} && python run.py --Final --data_dir {BOOST_INPUTS}/ --output_dir {BOOST_OUTPUTS} --depthNet 0') 33 | 34 | for i, (img_name, tgt_name) in enumerate(zip(img_names, tgt_names)): 35 | img = imageio.imread(img_name) 36 | H, W = img.shape[:2] 37 | scale = 640. 
/ max(H, W) 38 | 39 | # resize and save depth 40 | target_height, target_width = int(round(H * scale)), int(round(W * scale)) 41 | depth = imageio.imread(os.path.join(BOOST_BASE, BOOST_OUTPUTS, tgt_name)) 42 | depth = np.array(depth).astype(np.float32) 43 | depth = resize_depth(depth, target_width, target_height) 44 | np.save(os.path.join(depth_folder, tgt_name.replace('.png', '.npy')), depth / 32768. - 1.) 45 | write_depth(os.path.join(depth_folder, tgt_name.replace('.png', '')), depth) 46 | 47 | def clean_folder(folder, img_exts=['.png', '.jpg', '.npy']): 48 | 49 | for img_ext in img_exts: 50 | paths_to_check = os.path.join(folder, f'*{img_ext}') 51 | if len(glob.glob(paths_to_check)) == 0: 52 | continue 53 | print(paths_to_check) 54 | os.system(f'rm {paths_to_check}') 55 | 56 | def resize_depth(depth, width, height): 57 | """Resize numpy (or image read by imageio) depth map 58 | 59 | Args: 60 | depth (numpy): depth 61 | width (int): image width 62 | height (int): image height 63 | 64 | Returns: 65 | array: processed depth 66 | """ 67 | depth = cv2.blur(depth, (3, 3)) 68 | return cv2.resize(depth, (width, height), interpolation=cv2.INTER_AREA) 69 | -------------------------------------------------------------------------------- /inpaint/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | fb_status=$(wget --spider -S https://filebox.ece.vt.edu/ 2>&1 | grep "HTTP/1.1 200 OK") 3 | 4 | mkdir checkpoints 5 | 6 | echo "downloading from filebox ..." 7 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth 8 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth 9 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth 10 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/model.pt 11 | 12 | mv color-model.pth checkpoints/. 13 | mv depth-model.pth checkpoints/. 14 | mv edge-model.pth checkpoints/. 15 | mv model.pt MiDaS/. 16 | 17 | echo "cloning from BoostingMonocularDepth ..." 18 | git clone https://github.com/compphoto/BoostingMonocularDepth.git 19 | mkdir -p BoostingMonocularDepth/pix2pix/checkpoints/mergemodel/ 20 | 21 | echo "downloading mergenet weights ..." 22 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/latest_net_G.pth 23 | mv latest_net_G.pth BoostingMonocularDepth/pix2pix/checkpoints/mergemodel/ 24 | wget https://github.com/intel-isl/MiDaS/releases/download/v2/model-f46da743.pt 25 | mv model-f46da743.pt BoostingMonocularDepth/midas/model.pt 26 | -------------------------------------------------------------------------------- /inpaint/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.2.0.32 2 | vispy==0.6.4 3 | moviepy==1.0.2 4 | transforms3d==0.3.1 5 | networkx==2.3 6 | cynetworkx 7 | scikit-image 8 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | # Installs dependencies 2 | # Make sure to add to requirements.txt - it can be used for the standalone mode 3 | 4 | import launch 5 | import platform 6 | import sys 7 | import importlib.metadata 8 | 9 | # TODO: some dependencies apparently being reinstalled on every run. Investigate and fix. 
10 | 11 | if sys.version_info < (3, 8): 12 | launch.run_pip("install importlib-metadata", "importlib-metadata for depthmap script") 13 | import importlib_metadata 14 | else: 15 | import importlib.metadata as importlib_metadata 16 | if not launch.is_installed('packaging'): 17 | launch.run_pip("install packaging", "packaging requirement for depthmap script") 18 | from packaging.version import Version 19 | 20 | def ensure(module_name, min_version=None): 21 | if launch.is_installed(module_name): 22 | if min_version is None or Version(importlib_metadata.version(module_name)) >= Version(min_version): 23 | return 24 | requirement = f'{module_name}>={min_version}' if min_version is not None else module_name 25 | cmd = f'install "{requirement}"' 26 | msg = f'{requirement} requirement for depthmap script' 27 | launch.run_pip(cmd, msg) 28 | 29 | 30 | ensure('timm', '0.9.2') # For midas, specified just in case 31 | 32 | ensure('matplotlib') 33 | 34 | ensure('trimesh') 35 | 36 | ensure('numba', '0.57.0') 37 | ensure('vispy', '0.13.0') 38 | 39 | ensure('rembg', '2.0.50') 40 | 41 | if not launch.is_installed("moviepy"): 42 | launch.run_pip('install "moviepy==1.0.2"', "moviepy requirement for depthmap script") 43 | ensure('transforms3d', '0.4.1') 44 | 45 | ensure('diffusers', '0.20.1') # For Marigold 46 | 47 | ensure('imageio') # 2.4.1 48 | try: # Dirty hack to not reinstall every time 49 | importlib_metadata.version('imageio-ffmpeg') 50 | except: 51 | ensure('imageio-ffmpeg') 52 | 53 | 54 | if not launch.is_installed("networkx"): 55 | launch.run_pip('install "networkx==2.5"', "networkx requirement for depthmap script") 56 | if platform.system() == 'Windows': 57 | ensure('pyqt5') 58 | 59 | if platform.system() == 'Darwin': 60 | ensure('pyqt6') 61 | ensure('PyOpenGL', '3.1.7') 62 | 63 | # Depth Anything 64 | def get_installed_version(package: str): 65 | try: 66 | return importlib.metadata.version(package) 67 | except Exception: 68 | return None 69 | def try_install_from_wheel(pkg_name: str, wheel_url: str): 70 | if get_installed_version(pkg_name) is not None: 71 | return 72 | try: 73 | launch.run_pip(f"install {wheel_url}", f" {pkg_name} requirement for depthmap script") 74 | except Exception as e: 75 | print('Failed to install wheel for Depth Anything support. It won\'t work.') 76 | try_install_from_wheel( 77 | "depth_anything", 78 | "https://github.com/huchenlei/Depth-Anything/releases/download/v1.0.0/depth_anything-2024.1.22.0-py2.py3-none-any.whl") 79 | -------------------------------------------------------------------------------- /javascript/depthmap.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/javascript/depthmap.js -------------------------------------------------------------------------------- /lib/LICENSE: -------------------------------------------------------------------------------- 1 | Adobe Research License Terms 2 | 3 | 1. You may use, reproduce, modify, and display the research materials provided under this license (the “Research 4 | Materials”) solely for noncommercial purposes. Noncommercial purposes include academic research, teaching, and 5 | testing, but do not include commercial licensing or distribution, development of commercial products, or any other 6 | activity which results in commercial gain. You may not redistribute the Research Materials. 7 | 8 | 2.
You agree to (a) comply with all laws and regulations applicable to your use of the Research Materials under this license, 9 | including but not limited to any import or export laws; (b) preserve any copyright or other notices from the Research 10 | Materials; and (c) for any Research Materials in object code, not attempt to modify, reverse engineer, or decompile 11 | such Research Materials except as permitted by applicable law. 12 | 13 | 3. THE RESEARCH MATERIALS ARE PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, AND YOU ASSUME ALL RISKS 14 | ASSOCIATED WITH THEIR USE. IN NO EVENT WILL ANYONE BE LIABLE TO YOU FOR ANY ACTUAL, INCIDENTAL, SPECIAL, 15 | OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR IN CONNECTION WITH USE OF THE RESEARCH MATERIALS. 16 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lib/multi_depth_model_woauxi.py: -------------------------------------------------------------------------------- 1 | from lib import network_auxi as network 2 | from lib.net_tools import get_func 3 | import torch 4 | import torch.nn as nn 5 | 6 | class RelDepthModel(nn.Module): 7 | def __init__(self, backbone='resnet50'): 8 | super(RelDepthModel, self).__init__() 9 | if backbone == 'resnet50': 10 | encoder = 'resnet50_stride32' 11 | elif backbone == 'resnext101': 12 | encoder = 'resnext101_stride32x8d' 13 | self.depth_model = DepthModel(encoder) 14 | 15 | def inference(self, rgb): 16 | with torch.no_grad(): 17 | input = rgb.cuda() 18 | depth = self.depth_model(input) 19 | #pred_depth_out = depth - depth.min() + 0.01 20 | return depth #pred_depth_out 21 | 22 | 23 | class DepthModel(nn.Module): 24 | def __init__(self, encoder): 25 | super(DepthModel, self).__init__() 26 | backbone = network.__name__.split('.')[-1] + '.' + encoder 27 | self.encoder_modules = get_func(backbone)() 28 | self.decoder_modules = network.Decoder() 29 | 30 | def forward(self, x): 31 | lateral_out = self.encoder_modules(x) 32 | out_logit = self.decoder_modules(lateral_out) 33 | return out_logit -------------------------------------------------------------------------------- /lib/net_tools.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import torch 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def get_func(func_name): 8 | """Helper to return a function object by name. func_name must identify a 9 | function in this module or the path to a function relative to the base 10 | 'modeling' module. 11 | """ 12 | if func_name == '': 13 | return None 14 | try: 15 | parts = func_name.split('.') 16 | # Refers to a function in this module 17 | if len(parts) == 1: 18 | return globals()[parts[0]] 19 | # Otherwise, assume we're referencing a module under modeling 20 | module_name = 'lib.' + '.'.join(parts[:-1]) 21 | module = importlib.import_module(module_name) 22 | return getattr(module, parts[-1]) 23 | except Exception: 24 | print('Failed to find function: %s' % func_name) 25 | raise 26 | 27 | def load_ckpt(args, depth_model, shift_model, focal_model): 28 | """ 29 | Load checkpoint.
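    Expects args.load_ckpt to point to a checkpoint dict containing a 'depth_model'
    state dict (and optionally 'shift_model' / 'focal_model'); a DataParallel
    'module.' prefix on the keys, if present, is stripped before loading.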
30 | """ 31 | if os.path.isfile(args.load_ckpt): 32 | print("loading checkpoint %s" % args.load_ckpt) 33 | checkpoint = torch.load(args.load_ckpt) 34 | if shift_model is not None: 35 | shift_model.load_state_dict(strip_prefix_if_present(checkpoint['shift_model'], 'module.'), 36 | strict=True) 37 | if focal_model is not None: 38 | focal_model.load_state_dict(strip_prefix_if_present(checkpoint['focal_model'], 'module.'), 39 | strict=True) 40 | depth_model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), 41 | strict=True) 42 | del checkpoint 43 | torch.cuda.empty_cache() 44 | 45 | 46 | def strip_prefix_if_present(state_dict, prefix): 47 | keys = sorted(state_dict.keys()) 48 | if not all(key.startswith(prefix) for key in keys): 49 | return state_dict 50 | stripped_state_dict = OrderedDict() 51 | for key, value in state_dict.items(): 52 | stripped_state_dict[key.replace(prefix, "")] = value 53 | return stripped_state_dict -------------------------------------------------------------------------------- /lib/spvcnn_utils.py: -------------------------------------------------------------------------------- 1 | import torchsparse.nn.functional as spf 2 | from torchsparse.point_tensor import PointTensor 3 | from torchsparse.utils.kernel_region import * 4 | from torchsparse.utils.helpers import * 5 | 6 | 7 | __all__ = ['initial_voxelize', 'point_to_voxel', 'voxel_to_point'] 8 | 9 | 10 | # z: PointTensor 11 | # return: SparseTensor 12 | def initial_voxelize(z, init_res, after_res): 13 | new_float_coord = torch.cat( 14 | [(z.C[:, :3] * init_res) / after_res, z.C[:, -1].view(-1, 1)], 1) 15 | 16 | pc_hash = spf.sphash(torch.floor(new_float_coord).int()) 17 | sparse_hash = torch.unique(pc_hash) 18 | idx_query = spf.sphashquery(pc_hash, sparse_hash) 19 | counts = spf.spcount(idx_query.int(), len(sparse_hash)) 20 | 21 | inserted_coords = spf.spvoxelize(torch.floor(new_float_coord), idx_query, 22 | counts) 23 | inserted_coords = torch.round(inserted_coords).int() 24 | inserted_feat = spf.spvoxelize(z.F, idx_query, counts) 25 | 26 | new_tensor = SparseTensor(inserted_feat, inserted_coords, 1) 27 | new_tensor.check() 28 | z.additional_features['idx_query'][1] = idx_query 29 | z.additional_features['counts'][1] = counts 30 | z.C = new_float_coord 31 | 32 | return new_tensor 33 | 34 | 35 | # x: SparseTensor, z: PointTensor 36 | # return: SparseTensor 37 | def point_to_voxel(x, z): 38 | if z.additional_features is None or z.additional_features.get('idx_query') is None\ 39 | or z.additional_features['idx_query'].get(x.s) is None: 40 | #pc_hash = hash_gpu(torch.floor(z.C).int()) 41 | pc_hash = spf.sphash( 42 | torch.cat([ 43 | torch.floor(z.C[:, :3] / x.s).int() * x.s, 44 | z.C[:, -1].int().view(-1, 1) 45 | ], 1)) 46 | sparse_hash = spf.sphash(x.C) 47 | idx_query = spf.sphashquery(pc_hash, sparse_hash) 48 | counts = spf.spcount(idx_query.int(), x.C.shape[0]) 49 | z.additional_features['idx_query'][x.s] = idx_query 50 | z.additional_features['counts'][x.s] = counts 51 | else: 52 | idx_query = z.additional_features['idx_query'][x.s] 53 | counts = z.additional_features['counts'][x.s] 54 | 55 | inserted_feat = spf.spvoxelize(z.F, idx_query, counts) 56 | new_tensor = SparseTensor(inserted_feat, x.C, x.s) 57 | new_tensor.coord_maps = x.coord_maps 58 | new_tensor.kernel_maps = x.kernel_maps 59 | 60 | return new_tensor 61 | 62 | 63 | # x: SparseTensor, z: PointTensor 64 | # return: PointTensor 65 | def voxel_to_point(x, z, nearest=False): 66 | if z.idx_query is None or z.weights is None or 
z.idx_query.get( 67 | x.s) is None or z.weights.get(x.s) is None: 68 | kr = KernelRegion(2, x.s, 1) 69 | off = kr.get_kernel_offset().to(z.F.device) 70 | #old_hash = kernel_hash_gpu(torch.floor(z.C).int(), off) 71 | old_hash = spf.sphash( 72 | torch.cat([ 73 | torch.floor(z.C[:, :3] / x.s).int() * x.s, 74 | z.C[:, -1].int().view(-1, 1) 75 | ], 1), off) 76 | pc_hash = spf.sphash(x.C.to(z.F.device)) 77 | idx_query = spf.sphashquery(old_hash, pc_hash) 78 | weights = spf.calc_ti_weights(z.C, idx_query, 79 | scale=x.s).transpose(0, 1).contiguous() 80 | idx_query = idx_query.transpose(0, 1).contiguous() 81 | if nearest: 82 | weights[:, 1:] = 0. 83 | idx_query[:, 1:] = -1 84 | new_feat = spf.spdevoxelize(x.F, idx_query, weights) 85 | new_tensor = PointTensor(new_feat, 86 | z.C, 87 | idx_query=z.idx_query, 88 | weights=z.weights) 89 | new_tensor.additional_features = z.additional_features 90 | new_tensor.idx_query[x.s] = idx_query 91 | new_tensor.weights[x.s] = weights 92 | z.idx_query[x.s] = idx_query 93 | z.weights[x.s] = weights 94 | 95 | else: 96 | new_feat = spf.spdevoxelize(x.F, z.idx_query.get(x.s), z.weights.get(x.s)) 97 | new_tensor = PointTensor(new_feat, 98 | z.C, 99 | idx_query=z.idx_query, 100 | weights=z.weights) 101 | new_tensor.additional_features = z.additional_features 102 | 103 | return new_tensor 104 | 105 | 106 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # This launches DepthMap without the AUTOMATIC1111/stable-diffusion-webui 2 | 3 | import argparse 4 | import os 5 | import pathlib 6 | 7 | import src.misc 8 | 9 | 10 | def maybe_chdir(): 11 | """Detects if DepthMap was installed as a stable-diffusion-webui script, but run without current directory set to 12 | the stable-diffusion-webui root. Changes current directory if needed. 
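    Concretely, it walks up from this file's location to the extension directory
    (src.misc.REPOSITORY_NAME); if that directory sits under 'extensions' and the
    directory two levels up looks like a stable-diffusion-webui checkout, it
    switches the working directory there.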
13 | This is to avoid re-downloading models and putting results into a wrong folder.""" 14 | try: 15 | file_path = pathlib.Path(__file__) 16 | path = file_path.parts 17 | while len(path) > 0 and path[-1] != src.misc.REPOSITORY_NAME: 18 | path = path[:-1] 19 | if len(path) >= 2 and path[-1] == src.misc.REPOSITORY_NAME and path[-2] == "extensions": 20 | path = path[:-2] 21 | listdir = os.listdir(str(pathlib.Path(*path))) 22 | if 'launch.py' in listdir and 'webui.py' in listdir: 23 | os.chdir(str(pathlib.Path(*path))) 24 | except: 25 | pass 26 | 27 | 28 | if __name__ == '__main__': 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--share", help="Create public link", action='store_true') 31 | parser.add_argument("--listen", help="Listen on 0.0.0.0, allowing access from other devices on the network", action='store_true') 32 | parser.add_argument("--no_chdir", help="Do not try to use the root of stable-diffusion-webui", action='store_true') 33 | args = parser.parse_args() 34 | 35 | print(f"{src.misc.SCRIPT_FULL_NAME} running in standalone mode!") 36 | if not args.no_chdir: 37 | maybe_chdir() 38 | server_name = "0.0.0.0" if args.listen else None 39 | import src.common_ui 40 | src.common_ui.on_ui_tabs().launch(share=args.share, server_name=server_name) 41 | -------------------------------------------------------------------------------- /options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/options.png -------------------------------------------------------------------------------- /pix2pix/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Jun-Yan Zhu and Taesung Park 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | 26 | --------------------------- LICENSE FOR pix2pix -------------------------------- 27 | BSD License 28 | 29 | For pix2pix software 30 | Copyright (c) 2016, Phillip Isola and Jun-Yan Zhu 31 | All rights reserved.
32 | 33 | Redistribution and use in source and binary forms, with or without 34 | modification, are permitted provided that the following conditions are met: 35 | 36 | * Redistributions of source code must retain the above copyright notice, this 37 | list of conditions and the following disclaimer. 38 | 39 | * Redistributions in binary form must reproduce the above copyright notice, 40 | this list of conditions and the following disclaimer in the documentation 41 | and/or other materials provided with the distribution. 42 | 43 | ----------------------------- LICENSE FOR DCGAN -------------------------------- 44 | BSD License 45 | 46 | For dcgan.torch software 47 | 48 | Copyright (c) 2015, Facebook, Inc. All rights reserved. 49 | 50 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 51 | 52 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 53 | 54 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 55 | 56 | Neither the name Facebook nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 57 | 58 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 59 | -------------------------------------------------------------------------------- /pix2pix/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/pix2pix/__init__.py -------------------------------------------------------------------------------- /pix2pix/data/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes all the modules related to data loading and preprocessing 2 | 3 | To add a custom dataset class called 'dummy', you need to add a file called 'dummy_dataset.py' and define a subclass 'DummyDataset' inherited from BaseDataset. 4 | You need to implement four functions: 5 | -- <__init__>: initialize the class, first call BaseDataset.__init__(self, opt). 6 | -- <__len__>: return the size of dataset. 7 | -- <__getitem__>: get a data point from data loader. 8 | -- <modify_commandline_options>: (optionally) add dataset-specific options and set default options. 9 | 10 | Now you can use the dataset class by specifying flag '--dataset_mode dummy'. 11 | See our template dataset class 'template_dataset.py' for more details.
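For example, within this project '--dataset_mode depthmerge' resolves to
pix2pix/data/depthmerge_dataset.py and the DepthMergeDataset class defined there.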
12 | """ 13 | import importlib 14 | import torch.utils.data 15 | from pix2pix.data.base_dataset import BaseDataset 16 | 17 | 18 | def find_dataset_using_name(dataset_name): 19 | """Import the module "data/[dataset_name]_dataset.py". 20 | 21 | In the file, the class called DatasetNameDataset() will 22 | be instantiated. It has to be a subclass of BaseDataset, 23 | and it is case-insensitive. 24 | """ 25 | dataset_filename = "pix2pix.data." + dataset_name + "_dataset" 26 | datasetlib = importlib.import_module(dataset_filename) 27 | 28 | dataset = None 29 | target_dataset_name = dataset_name.replace('_', '') + 'dataset' 30 | for name, cls in datasetlib.__dict__.items(): 31 | if name.lower() == target_dataset_name.lower() \ 32 | and issubclass(cls, BaseDataset): 33 | dataset = cls 34 | 35 | if dataset is None: 36 | raise NotImplementedError("In %s.py, there should be a subclass of BaseDataset with class name that matches %s in lowercase." % (dataset_filename, target_dataset_name)) 37 | 38 | return dataset 39 | 40 | 41 | def get_option_setter(dataset_name): 42 | """Return the static method of the dataset class.""" 43 | dataset_class = find_dataset_using_name(dataset_name) 44 | return dataset_class.modify_commandline_options 45 | 46 | 47 | def create_dataset(opt): 48 | """Create a dataset given the option. 49 | 50 | This function wraps the class CustomDatasetDataLoader. 51 | This is the main interface between this package and 'train.py'/'test.py' 52 | 53 | Example: 54 | >>> from data import create_dataset 55 | >>> dataset = create_dataset(opt) 56 | """ 57 | data_loader = CustomDatasetDataLoader(opt) 58 | dataset = data_loader.load_data() 59 | return dataset 60 | 61 | 62 | class CustomDatasetDataLoader(): 63 | """Wrapper class of Dataset class that performs multi-threaded data loading""" 64 | 65 | def __init__(self, opt): 66 | """Initialize this class 67 | 68 | Step 1: create a dataset instance given the name [dataset_mode] 69 | Step 2: create a multi-threaded data loader. 
70 | """ 71 | self.opt = opt 72 | dataset_class = find_dataset_using_name(opt.dataset_mode) 73 | self.dataset = dataset_class(opt) 74 | print("dataset [%s] was created" % type(self.dataset).__name__) 75 | self.dataloader = torch.utils.data.DataLoader( 76 | self.dataset, 77 | batch_size=opt.batch_size, 78 | shuffle=not opt.serial_batches, 79 | num_workers=int(opt.num_threads)) 80 | 81 | def load_data(self): 82 | return self 83 | 84 | def __len__(self): 85 | """Return the number of data in the dataset""" 86 | return min(len(self.dataset), self.opt.max_dataset_size) 87 | 88 | def __iter__(self): 89 | """Return a batch of data""" 90 | for i, data in enumerate(self.dataloader): 91 | if i * self.opt.batch_size >= self.opt.max_dataset_size: 92 | break 93 | yield data 94 | -------------------------------------------------------------------------------- /pix2pix/data/depthmerge_dataset.py: -------------------------------------------------------------------------------- 1 | from pix2pix.data.base_dataset import BaseDataset 2 | from pix2pix.data.image_folder import make_dataset 3 | from pix2pix.util.guidedfilter import GuidedFilter 4 | 5 | import numpy as np 6 | import os 7 | import torch 8 | from PIL import Image 9 | 10 | 11 | def normalize(img): 12 | img = img * 2 13 | img = img - 1 14 | return img 15 | 16 | 17 | def normalize01(img): 18 | return (img - torch.min(img)) / (torch.max(img)-torch.min(img)) 19 | 20 | 21 | class DepthMergeDataset(BaseDataset): 22 | def __init__(self, opt): 23 | BaseDataset.__init__(self, opt) 24 | self.dir_outer = os.path.join(opt.dataroot, opt.phase, 'outer') 25 | self.dir_inner = os.path.join(opt.dataroot, opt.phase, 'inner') 26 | self.dir_gtfake = os.path.join(opt.dataroot, opt.phase, 'gtfake') 27 | 28 | self.outer_paths = sorted(make_dataset(self.dir_outer, opt.max_dataset_size)) 29 | self.inner_paths = sorted(make_dataset(self.dir_inner, opt.max_dataset_size)) 30 | self.gtfake_paths = sorted(make_dataset(self.dir_gtfake, opt.max_dataset_size)) 31 | 32 | self.dataset_size = len(self.outer_paths) 33 | 34 | if opt.phase == 'train': 35 | self.isTrain = True 36 | else: 37 | self.isTrain = False 38 | 39 | def __getitem__(self, index): 40 | normalize_coef = np.float32(2 ** 16) 41 | 42 | data_outer = Image.open(self.outer_paths[index % self.dataset_size]) # needs to be a tensor 43 | data_outer = np.array(data_outer, dtype=np.float32) 44 | data_outer = data_outer / normalize_coef 45 | 46 | data_inner = Image.open(self.inner_paths[index % self.dataset_size]) # needs to be a tensor 47 | data_inner = np.array(data_inner, dtype=np.float32) 48 | data_inner = data_inner / normalize_coef 49 | 50 | if self.isTrain: 51 | data_gtfake = Image.open(self.gtfake_paths[index % self.dataset_size]) # needs to be a tensor 52 | data_gtfake = np.array(data_gtfake, dtype=np.float32) 53 | data_gtfake = data_gtfake / normalize_coef 54 | 55 | data_inner = GuidedFilter(data_gtfake, data_inner, 64, 0.00000001).smooth.astype('float32') 56 | data_outer = GuidedFilter(data_outer, data_gtfake, 64, 0.00000001).smooth.astype('float32') 57 | 58 | data_outer = torch.from_numpy(data_outer) 59 | data_outer = torch.unsqueeze(data_outer, 0) 60 | data_outer = normalize01(data_outer) 61 | data_outer = normalize(data_outer) 62 | 63 | data_inner = torch.from_numpy(data_inner) 64 | data_inner = torch.unsqueeze(data_inner, 0) 65 | data_inner = normalize01(data_inner) 66 | data_inner = normalize(data_inner) 67 | 68 | if self.isTrain: 69 | data_gtfake = torch.from_numpy(data_gtfake) 70 | data_gtfake = 
torch.unsqueeze(data_gtfake, 0) 71 | data_gtfake = normalize01(data_gtfake) 72 | data_gtfake = normalize(data_gtfake) 73 | 74 | image_path = self.outer_paths[index % self.dataset_size] 75 | if self.isTrain: 76 | return {'data_inner': data_inner, 'data_outer': data_outer, 77 | 'data_gtfake': data_gtfake, 'image_path': image_path} 78 | else: 79 | return {'data_inner': data_inner, 'data_outer': data_outer, 'image_path': image_path} 80 | 81 | def __len__(self): 82 | """Return the total number of images.""" 83 | return self.dataset_size 84 | -------------------------------------------------------------------------------- /pix2pix/data/image_folder.py: -------------------------------------------------------------------------------- 1 | """A modified image folder class 2 | 3 | We modify the official PyTorch image folder (https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py) 4 | so that this class can load images from both current directory and its subdirectories. 5 | """ 6 | 7 | import torch.utils.data as data 8 | 9 | from PIL import Image 10 | import os 11 | 12 | IMG_EXTENSIONS = [ 13 | '.jpg', '.JPG', '.jpeg', '.JPEG', 14 | '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', 15 | '.tif', '.TIF', '.tiff', '.TIFF', 16 | ] 17 | 18 | 19 | def is_image_file(filename): 20 | return any(filename.endswith(extension) for extension in IMG_EXTENSIONS) 21 | 22 | 23 | def make_dataset(dir, max_dataset_size=float("inf")): 24 | images = [] 25 | assert os.path.isdir(dir), '%s is not a valid directory' % dir 26 | 27 | for root, _, fnames in sorted(os.walk(dir)): 28 | for fname in fnames: 29 | if is_image_file(fname): 30 | path = os.path.join(root, fname) 31 | images.append(path) 32 | return images[:min(max_dataset_size, len(images))] 33 | 34 | 35 | def default_loader(path): 36 | return Image.open(path).convert('RGB') 37 | 38 | 39 | class ImageFolder(data.Dataset): 40 | 41 | def __init__(self, root, transform=None, return_paths=False, 42 | loader=default_loader): 43 | imgs = make_dataset(root) 44 | if len(imgs) == 0: 45 | raise(RuntimeError("Found 0 images in: " + root + "\n" 46 | "Supported image extensions are: " + ",".join(IMG_EXTENSIONS))) 47 | 48 | self.root = root 49 | self.imgs = imgs 50 | self.transform = transform 51 | self.return_paths = return_paths 52 | self.loader = loader 53 | 54 | def __getitem__(self, index): 55 | path = self.imgs[index] 56 | img = self.loader(path) 57 | if self.transform is not None: 58 | img = self.transform(img) 59 | if self.return_paths: 60 | return img, path 61 | else: 62 | return img 63 | 64 | def __len__(self): 65 | return len(self.imgs) 66 | -------------------------------------------------------------------------------- /pix2pix/models/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules related to objective functions, optimizations, and network architectures. 2 | 3 | To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. 4 | You need to implement the following five functions: 5 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). 6 | -- : unpack data from dataset and apply preprocessing. 7 | -- : produce intermediate results. 8 | -- : calculate loss, gradients, and update network weights. 9 | -- : (optionally) add model-specific options and set default options. 
10 | 11 | In the function <__init__>, you need to define four lists: 12 | -- self.loss_names (str list): specify the training losses that you want to plot and save. 13 | -- self.model_names (str list): define networks used in our training. 14 | -- self.visual_names (str list): specify the images that you want to display and save. 15 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage. 16 | 17 | Now you can use the model class by specifying flag '--model dummy'. 18 | See our template model class 'template_model.py' for more details. 19 | """ 20 | 21 | import importlib 22 | from pix2pix.models.base_model import BaseModel 23 | 24 | 25 | def find_model_using_name(model_name): 26 | """Import the module "models/[model_name]_model.py". 27 | 28 | In the file, the class called DatasetNameModel() will 29 | be instantiated. It has to be a subclass of BaseModel, 30 | and it is case-insensitive. 31 | """ 32 | model_filename = "pix2pix.models." + model_name + "_model" 33 | modellib = importlib.import_module(model_filename) 34 | model = None 35 | target_model_name = model_name.replace('_', '') + 'model' 36 | for name, cls in modellib.__dict__.items(): 37 | if name.lower() == target_model_name.lower() \ 38 | and issubclass(cls, BaseModel): 39 | model = cls 40 | 41 | if model is None: 42 | print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) 43 | exit(0) 44 | 45 | return model 46 | 47 | 48 | def get_option_setter(model_name): 49 | """Return the static method of the model class.""" 50 | model_class = find_model_using_name(model_name) 51 | return model_class.modify_commandline_options 52 | 53 | 54 | def create_model(opt): 55 | """Create a model given the option. 56 | 57 | This function warps the class CustomDatasetDataLoader. 
58 | This is the main interface between this package and 'train.py'/'test.py' 59 | 60 | Example: 61 | >>> from models import create_model 62 | >>> model = create_model(opt) 63 | """ 64 | model = find_model_using_name(opt.model) 65 | instance = model(opt) 66 | print("model [%s] was created" % type(instance).__name__) 67 | return instance 68 | -------------------------------------------------------------------------------- /pix2pix/models/base_model_hg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | class BaseModelHG(): 5 | def name(self): 6 | return 'BaseModel' 7 | 8 | def initialize(self, opt): 9 | self.opt = opt 10 | self.gpu_ids = opt.gpu_ids 11 | self.isTrain = opt.isTrain 12 | self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor 13 | self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) 14 | 15 | def set_input(self, input): 16 | self.input = input 17 | 18 | def forward(self): 19 | pass 20 | 21 | # used in test time, no backprop 22 | def test(self): 23 | pass 24 | 25 | def get_image_paths(self): 26 | pass 27 | 28 | def optimize_parameters(self): 29 | pass 30 | 31 | def get_current_visuals(self): 32 | return self.input 33 | 34 | def get_current_errors(self): 35 | return {} 36 | 37 | def save(self, label): 38 | pass 39 | 40 | # helper saving function that can be used by subclasses 41 | def save_network(self, network, network_label, epoch_label, gpu_ids): 42 | save_filename = '_%s_net_%s.pth' % (epoch_label, network_label) 43 | save_path = os.path.join(self.save_dir, save_filename) 44 | torch.save(network.cpu().state_dict(), save_path) 45 | if len(gpu_ids) and torch.cuda.is_available(): 46 | network.cuda(device_id=gpu_ids[0]) 47 | 48 | # helper loading function that can be used by subclasses 49 | def load_network(self, network, network_label, epoch_label): 50 | save_filename = '%s_net_%s.pth' % (epoch_label, network_label) 51 | save_path = os.path.join(self.save_dir, save_filename) 52 | print(save_path) 53 | model = torch.load(save_path) 54 | return model 55 | # network.load_state_dict(torch.load(save_path)) 56 | 57 | def update_learning_rate(self): 58 | pass 59 | -------------------------------------------------------------------------------- /pix2pix/options/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes option modules: training options, test options, and basic options (used in both training and test).""" 2 | -------------------------------------------------------------------------------- /pix2pix/options/test_options.py: -------------------------------------------------------------------------------- 1 | from .base_options import BaseOptions 2 | 3 | 4 | class TestOptions(BaseOptions): 5 | """This class includes test options. 6 | 7 | It also includes shared options defined in BaseOptions. 8 | """ 9 | 10 | def initialize(self, parser): 11 | parser = BaseOptions.initialize(self, parser) # define shared options 12 | parser.add_argument('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images') 13 | parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') 14 | # Dropout and BatchNorm have different behavior during training and test.
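# (model.eval() switches these layers to their inference behavior; the '--eval' flag below enables this at test time.)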
15 | parser.add_argument('--eval', action='store_true', help='use eval mode during test time.') 16 | parser.add_argument('--num_test', type=int, default=50, help='how many test images to run') 17 | # rewrite default values 18 | parser.set_defaults(model='pix2pix4depth') 19 | # To avoid cropping, the load_size should be the same as crop_size 20 | parser.set_defaults(load_size=parser.get_default('crop_size')) 21 | self.isTrain = False 22 | return parser 23 | -------------------------------------------------------------------------------- /pix2pix/options/train_options.py: -------------------------------------------------------------------------------- 1 | from .base_options import BaseOptions 2 | 3 | 4 | class TrainOptions(BaseOptions): 5 | """This class includes training options. 6 | 7 | It also includes shared options defined in BaseOptions. 8 | """ 9 | 10 | def initialize(self, parser): 11 | parser = BaseOptions.initialize(self, parser) 12 | # visdom and HTML visualization parameters 13 | parser.add_argument('--display_freq', type=int, default=2500, help='frequency of showing training results on screen') 14 | parser.add_argument('--display_ncols', type=int, default=4, help='if positive, display all images in a single visdom web panel with certain number of images per row.') 15 | parser.add_argument('--display_id', type=int, default=1, help='window id of the web display') 16 | parser.add_argument('--display_server', type=str, default="http://localhost", help='visdom server of the web display') 17 | parser.add_argument('--display_env', type=str, default='main', help='visdom display environment name (default is "main")') 18 | parser.add_argument('--display_port', type=int, default=8097, help='visdom port of the web display') 19 | parser.add_argument('--update_html_freq', type=int, default=1000, help='frequency of saving training results to html') 20 | parser.add_argument('--print_freq', type=int, default=100, help='frequency of showing training results on console') 21 | parser.add_argument('--no_html', action='store_true', help='do not save intermediate training results to [opt.checkpoints_dir]/[opt.name]/web/') 22 | # network saving and loading parameters 23 | parser.add_argument('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results') 24 | parser.add_argument('--save_epoch_freq', type=int, default=10, help='frequency of saving checkpoints at the end of epochs') 25 | parser.add_argument('--save_by_iter', action='store_true', help='whether to save the model by iteration') 26 | parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model') 27 | parser.add_argument('--epoch_count', type=int, default=1, help='the starting epoch count, we save the model by <epoch_count>, <epoch_count>+<save_latest_freq>, ...') 28 | parser.add_argument('--phase', type=str, default='train', help='train, val, test, etc') 29 | # training parameters 30 | parser.add_argument('--n_epochs', type=int, default=100, help='number of epochs with the initial learning rate') 31 | parser.add_argument('--n_epochs_decay', type=int, default=100, help='number of epochs to linearly decay learning rate to zero') 32 | parser.add_argument('--beta1', type=float, default=0.5, help='momentum term of adam') 33 | parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate for adam') 34 | parser.add_argument('--gan_mode', type=str, default='lsgan', help='the type of GAN objective. [vanilla | lsgan | wgangp].
vanilla GAN loss is the cross-entropy objective used in the original GAN paper.') 35 | parser.add_argument('--pool_size', type=int, default=50, help='the size of image buffer that stores previously generated images') 36 | parser.add_argument('--lr_policy', type=str, default='linear', help='learning rate policy. [linear | step | plateau | cosine]') 37 | parser.add_argument('--lr_decay_iters', type=int, default=50, help='multiply by a gamma every lr_decay_iters iterations') 38 | 39 | self.isTrain = True 40 | return parser 41 | -------------------------------------------------------------------------------- /pix2pix/train.py: -------------------------------------------------------------------------------- 1 | """General-purpose training script for image-to-image translation. 2 | 3 | This script works for various models (with option '--model': e.g., pix2pix, cyclegan, colorization) and 4 | different datasets (with option '--dataset_mode': e.g., aligned, unaligned, single, colorization). 5 | You need to specify the dataset ('--dataroot'), experiment name ('--name'), and model ('--model'). 6 | 7 | It first creates model, dataset, and visualizer given the option. 8 | It then does standard network training. During the training, it also visualize/save the images, print/save the loss plot, and save models. 9 | The script supports continue/resume training. Use '--continue_train' to resume your previous training. 10 | 11 | Example: 12 | Train a CycleGAN model: 13 | python train.py --dataroot ./datasets/maps --name maps_cyclegan --model cycle_gan 14 | Train a pix2pix model: 15 | python train.py --dataroot ./datasets/facades --name facades_pix2pix --model pix2pix --direction BtoA 16 | 17 | See options/base_options.py and options/train_options.py for more training options. 18 | See training and test tips at: https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/docs/tips.md 19 | See frequently asked questions at: https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/docs/qa.md 20 | """ 21 | import time 22 | from options.train_options import TrainOptions 23 | from data import create_dataset 24 | from models import create_model 25 | from util.visualizer import Visualizer 26 | 27 | if __name__ == '__main__': 28 | opt = TrainOptions().parse() # get training options 29 | # opt.serial_batches = True 30 | dataset = create_dataset(opt) # create a dataset given opt.dataset_mode and other options 31 | dataset_size = len(dataset) # get the number of images in the dataset. 32 | print('The number of training images = %d' % dataset_size) 33 | 34 | model = create_model(opt) # create a model given opt.model and other options 35 | model.setup(opt) # regular setup: load and print networks; create schedulers 36 | visualizer = Visualizer(opt) # create a visualizer that display/save images and plots 37 | 38 | for epoch in range(opt.epoch_count, opt.n_epochs + opt.n_epochs_decay + 1): # outer loop for different epochs; we save the model by , + 39 | epoch_start_time = time.time() # timer for entire epoch 40 | iter_data_time = time.time() # timer for data loading per iteration 41 | epoch_iter = 0 # the number of training iterations in current epoch, reset to 0 every epoch 42 | visualizer.reset() # reset the visualizer: make sure it saves the results to HTML at least once every epoch 43 | model.update_learning_rate() # update learning rates in the beginning of every epoch. 
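# Illustrative invocation only (the dataroot path and experiment name below are placeholders): the depth-merge network bundled with this extension would presumably be trained with its own model and dataset modes, e.g.
#   python train.py --dataroot ./datasets/mergedepth --name mergedepth_pix2pix --model pix2pix4depth --dataset_mode depthmerge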
44 | for i, data in enumerate(dataset): # inner loop within one epoch 45 | iter_start_time = time.time() # timer for computation per iteration 46 | 47 | epoch_iter += opt.batch_size 48 | model.set_input_train(data) # unpack data from dataset and apply preprocessing 49 | model.optimize_parameters() # calculate loss functions, get gradients, update network weights 50 | 51 | if epoch_iter == dataset_size: 52 | model.compute_visuals() 53 | visualizer.display_current_results(model.get_current_visuals(), epoch, True) 54 | 55 | if epoch_iter % 500 == 0 or epoch_iter == dataset_size: # print training losses and save logging information to the disk 56 | losses = model.get_current_losses() 57 | t_data = iter_start_time - iter_data_time 58 | t_comp = (time.time() - iter_start_time) / opt.batch_size 59 | visualizer.print_current_losses(epoch, epoch_iter, losses, t_comp, t_data) 60 | 61 | 62 | if epoch % opt.save_epoch_freq == 0: # cache our model every epochs 63 | print('saving the model at the end of epoch %d' % epoch) 64 | model.save_networks('latest') 65 | model.save_networks(epoch) 66 | 67 | print('End of epoch %d / %d \t Time Taken: %d sec' % (epoch, opt.n_epochs + opt.n_epochs_decay, time.time() - epoch_start_time)) 68 | -------------------------------------------------------------------------------- /pix2pix/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes a miscellaneous collection of useful helper functions.""" 2 | -------------------------------------------------------------------------------- /pix2pix/util/get_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import tarfile 4 | import requests 5 | from warnings import warn 6 | from zipfile import ZipFile 7 | from bs4 import BeautifulSoup 8 | from os.path import abspath, isdir, join, basename 9 | 10 | 11 | class GetData(object): 12 | """A Python script for downloading CycleGAN or pix2pix datasets. 13 | 14 | Parameters: 15 | technique (str) -- One of: 'cyclegan' or 'pix2pix'. 16 | verbose (bool) -- If True, print additional information. 17 | 18 | Examples: 19 | >>> from util.get_data import GetData 20 | >>> gd = GetData(technique='cyclegan') 21 | >>> new_data_path = gd.get(save_path='./datasets') # options will be displayed. 22 | 23 | Alternatively, You can use bash scripts: 'scripts/download_pix2pix_model.sh' 24 | and 'scripts/download_cyclegan_model.sh'. 
25 | """ 26 | 27 | def __init__(self, technique='cyclegan', verbose=True): 28 | url_dict = { 29 | 'pix2pix': 'http://efrosgans.eecs.berkeley.edu/pix2pix/datasets/', 30 | 'cyclegan': 'https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets' 31 | } 32 | self.url = url_dict.get(technique.lower()) 33 | self._verbose = verbose 34 | 35 | def _print(self, text): 36 | if self._verbose: 37 | print(text) 38 | 39 | @staticmethod 40 | def _get_options(r): 41 | soup = BeautifulSoup(r.text, 'lxml') 42 | options = [h.text for h in soup.find_all('a', href=True) 43 | if h.text.endswith(('.zip', 'tar.gz'))] 44 | return options 45 | 46 | def _present_options(self): 47 | r = requests.get(self.url) 48 | options = self._get_options(r) 49 | print('Options:\n') 50 | for i, o in enumerate(options): 51 | print("{0}: {1}".format(i, o)) 52 | choice = input("\nPlease enter the number of the " 53 | "dataset above you wish to download:") 54 | return options[int(choice)] 55 | 56 | def _download_data(self, dataset_url, save_path): 57 | if not isdir(save_path): 58 | os.makedirs(save_path) 59 | 60 | base = basename(dataset_url) 61 | temp_save_path = join(save_path, base) 62 | 63 | with open(temp_save_path, "wb") as f: 64 | r = requests.get(dataset_url) 65 | f.write(r.content) 66 | 67 | if base.endswith('.tar.gz'): 68 | obj = tarfile.open(temp_save_path) 69 | elif base.endswith('.zip'): 70 | obj = ZipFile(temp_save_path, 'r') 71 | else: 72 | raise ValueError("Unknown File Type: {0}.".format(base)) 73 | 74 | self._print("Unpacking Data...") 75 | obj.extractall(save_path) 76 | obj.close() 77 | os.remove(temp_save_path) 78 | 79 | def get(self, save_path, dataset=None): 80 | """ 81 | 82 | Download a dataset. 83 | 84 | Parameters: 85 | save_path (str) -- A directory to save the data to. 86 | dataset (str) -- (optional). A specific dataset to download. 87 | Note: this must include the file extension. 88 | If None, options will be presented for you 89 | to choose from. 90 | 91 | Returns: 92 | save_path_full (str) -- the absolute path to the downloaded data. 93 | 94 | """ 95 | if dataset is None: 96 | selected_dataset = self._present_options() 97 | else: 98 | selected_dataset = dataset 99 | 100 | save_path_full = join(save_path, selected_dataset.split('.')[0]) 101 | 102 | if isdir(save_path_full): 103 | warn("\n'{0}' already exists. 
Voiding Download.".format( 104 | save_path_full)) 105 | else: 106 | self._print('Downloading Data...') 107 | url = "{0}/{1}".format(self.url, selected_dataset) 108 | self._download_data(url, save_path=save_path) 109 | 110 | return abspath(save_path_full) 111 | -------------------------------------------------------------------------------- /pix2pix/util/guidedfilter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class GuidedFilter(): 4 | def __init__(self, source, reference, r=64, eps= 0.05**2): 5 | self.source = source; 6 | self.reference = reference; 7 | self.r = r 8 | self.eps = eps 9 | 10 | self.smooth = self.guidedfilter(self.source,self.reference,self.r,self.eps) 11 | 12 | def boxfilter(self,img, r): 13 | (rows, cols) = img.shape 14 | imDst = np.zeros_like(img) 15 | 16 | imCum = np.cumsum(img, 0) 17 | imDst[0 : r+1, :] = imCum[r : 2*r+1, :] 18 | imDst[r+1 : rows-r, :] = imCum[2*r+1 : rows, :] - imCum[0 : rows-2*r-1, :] 19 | imDst[rows-r: rows, :] = np.tile(imCum[rows-1, :], [r, 1]) - imCum[rows-2*r-1 : rows-r-1, :] 20 | 21 | imCum = np.cumsum(imDst, 1) 22 | imDst[:, 0 : r+1] = imCum[:, r : 2*r+1] 23 | imDst[:, r+1 : cols-r] = imCum[:, 2*r+1 : cols] - imCum[:, 0 : cols-2*r-1] 24 | imDst[:, cols-r: cols] = np.tile(imCum[:, cols-1], [r, 1]).T - imCum[:, cols-2*r-1 : cols-r-1] 25 | 26 | return imDst 27 | 28 | def guidedfilter(self,I, p, r, eps): 29 | (rows, cols) = I.shape 30 | N = self.boxfilter(np.ones([rows, cols]), r) 31 | 32 | meanI = self.boxfilter(I, r) / N 33 | meanP = self.boxfilter(p, r) / N 34 | meanIp = self.boxfilter(I * p, r) / N 35 | covIp = meanIp - meanI * meanP 36 | 37 | meanII = self.boxfilter(I * I, r) / N 38 | varI = meanII - meanI * meanI 39 | 40 | a = covIp / (varI + eps) 41 | b = meanP - a * meanI 42 | 43 | meanA = self.boxfilter(a, r) / N 44 | meanB = self.boxfilter(b, r) / N 45 | 46 | q = meanA * I + meanB 47 | return q -------------------------------------------------------------------------------- /pix2pix/util/html.py: -------------------------------------------------------------------------------- 1 | import dominate 2 | from dominate.tags import meta, h3, table, tr, td, p, a, img, br 3 | import os 4 | 5 | 6 | class HTML: 7 | """This HTML class allows us to save images and write texts into a single HTML file. 8 | 9 | It consists of functions such as (add a text header to the HTML file), 10 | (add a row of images to the HTML file), and (save the HTML to the disk). 11 | It is based on Python library 'dominate', a Python library for creating and manipulating HTML documents using a DOM API. 12 | """ 13 | 14 | def __init__(self, web_dir, title, refresh=0): 15 | """Initialize the HTML classes 16 | 17 | Parameters: 18 | web_dir (str) -- a directory that stores the webpage. 
HTML file will be created at <web_dir>/index.html; images will be saved at <web_dir>/images/ 19 | title (str) -- the webpage name 20 | refresh (int) -- how often the website refreshes itself; if 0, no refreshing 21 | """ 22 | self.title = title 23 | self.web_dir = web_dir 24 | self.img_dir = os.path.join(self.web_dir, 'images') 25 | if not os.path.exists(self.web_dir): 26 | os.makedirs(self.web_dir) 27 | if not os.path.exists(self.img_dir): 28 | os.makedirs(self.img_dir) 29 | 30 | self.doc = dominate.document(title=title) 31 | if refresh > 0: 32 | with self.doc.head: 33 | meta(http_equiv="refresh", content=str(refresh)) 34 | 35 | def get_image_dir(self): 36 | """Return the directory that stores images""" 37 | return self.img_dir 38 | 39 | def add_header(self, text): 40 | """Insert a header to the HTML file 41 | 42 | Parameters: 43 | text (str) -- the header text 44 | """ 45 | with self.doc: 46 | h3(text) 47 | 48 | def add_images(self, ims, txts, links, width=400): 49 | """add images to the HTML file 50 | 51 | Parameters: 52 | ims (str list) -- a list of image paths 53 | txts (str list) -- a list of image names shown on the website 54 | links (str list) -- a list of hyperref links; when you click an image, it will redirect you to a new page 55 | """ 56 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table 57 | self.doc.add(self.t) 58 | with self.t: 59 | with tr(): 60 | for im, txt, link in zip(ims, txts, links): 61 | with td(style="word-wrap: break-word;", halign="center", valign="top"): 62 | with p(): 63 | with a(href=os.path.join('images', link)): 64 | img(style="width:%dpx" % width, src=os.path.join('images', im)) 65 | br() 66 | p(txt) 67 | 68 | def save(self): 69 | """save the current content to the HTML file""" 70 | html_file = '%s/index.html' % self.web_dir 71 | f = open(html_file, 'wt') 72 | f.write(self.doc.render()) 73 | f.close() 74 | 75 | 76 | if __name__ == '__main__': # we show an example usage here. 77 | html = HTML('web/', 'test_html') 78 | html.add_header('hello world') 79 | 80 | ims, txts, links = [], [], [] 81 | for n in range(4): 82 | ims.append('image_%d.png' % n) 83 | txts.append('text_%d' % n) 84 | links.append('image_%d.png' % n) 85 | html.add_images(ims, txts, links) 86 | html.save() 87 | -------------------------------------------------------------------------------- /pix2pix/util/image_pool.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | 5 | class ImagePool(): 6 | """This class implements an image buffer that stores previously generated images. 7 | 8 | This buffer enables us to update discriminators using a history of generated images 9 | rather than the ones produced by the latest generators. 10 | """ 11 | 12 | def __init__(self, pool_size): 13 | """Initialize the ImagePool class 14 | 15 | Parameters: 16 | pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created 17 | """ 18 | self.pool_size = pool_size 19 | if self.pool_size > 0: # create an empty pool 20 | self.num_imgs = 0 21 | self.images = [] 22 | 23 | def query(self, images): 24 | """Return an image from the pool. 25 | 26 | Parameters: 27 | images: the latest generated images from the generator 28 | 29 | Returns images from the buffer. 30 | 31 | By 50% chance, the buffer will return input images. 32 | By 50% chance, the buffer will return images previously stored in the buffer, 33 | and insert the current images into the buffer.
34 | """ 35 | if self.pool_size == 0: # if the buffer size is 0, do nothing 36 | return images 37 | return_images = [] 38 | for image in images: 39 | image = torch.unsqueeze(image.data, 0) 40 | if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer 41 | self.num_imgs = self.num_imgs + 1 42 | self.images.append(image) 43 | return_images.append(image) 44 | else: 45 | p = random.uniform(0, 1) 46 | if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer 47 | random_id = random.randint(0, self.pool_size - 1) # randint is inclusive 48 | tmp = self.images[random_id].clone() 49 | self.images[random_id] = image 50 | return_images.append(tmp) 51 | else: # by another 50% chance, the buffer will return the current image 52 | return_images.append(image) 53 | return_images = torch.cat(return_images, 0) # collect all the images and return 54 | return return_images 55 | -------------------------------------------------------------------------------- /pix2pix/util/util.py: -------------------------------------------------------------------------------- 1 | """This module contains simple helper functions """ 2 | from __future__ import print_function 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | import os 7 | 8 | 9 | def tensor2im(input_image, imtype=np.uint16): 10 | """"Converts a Tensor array into a numpy image array. 11 | 12 | Parameters: 13 | input_image (tensor) -- the input image tensor array 14 | imtype (type) -- the desired type of the converted numpy array 15 | """ 16 | if not isinstance(input_image, np.ndarray): 17 | if isinstance(input_image, torch.Tensor): # get the data from a variable 18 | image_tensor = input_image.data 19 | else: 20 | return input_image 21 | image_numpy = torch.squeeze(image_tensor).cpu().numpy() # convert it into a numpy array 22 | image_numpy = (image_numpy + 1) / 2.0 * (2**16-1) # 23 | else: # if it is a numpy array, do nothing 24 | image_numpy = input_image 25 | return image_numpy.astype(imtype) 26 | 27 | 28 | def diagnose_network(net, name='network'): 29 | """Calculate and print the mean of average absolute(gradients) 30 | 31 | Parameters: 32 | net (torch network) -- Torch network 33 | name (str) -- the name of the network 34 | """ 35 | mean = 0.0 36 | count = 0 37 | for param in net.parameters(): 38 | if param.grad is not None: 39 | mean += torch.mean(torch.abs(param.grad.data)) 40 | count += 1 41 | if count > 0: 42 | mean = mean / count 43 | print(name) 44 | print(mean) 45 | 46 | 47 | def save_image(image_numpy, image_path, aspect_ratio=1.0): 48 | """Save a numpy image to the disk 49 | 50 | Parameters: 51 | image_numpy (numpy array) -- input numpy array 52 | image_path (str) -- the path of the image 53 | """ 54 | image_pil = Image.fromarray(image_numpy) 55 | 56 | image_pil = image_pil.convert('I;16') 57 | 58 | # image_pil = Image.fromarray(image_numpy) 59 | # h, w, _ = image_numpy.shape 60 | # 61 | # if aspect_ratio > 1.0: 62 | # image_pil = image_pil.resize((h, int(w * aspect_ratio)), Image.BICUBIC) 63 | # if aspect_ratio < 1.0: 64 | # image_pil = image_pil.resize((int(h / aspect_ratio), w), Image.BICUBIC) 65 | 66 | image_pil.save(image_path) 67 | 68 | 69 | def print_numpy(x, val=True, shp=False): 70 | """Print the mean, min, max, median, std, and size of a numpy array 71 | 72 | Parameters: 73 | val (bool) -- if print the values of the numpy array 74 | shp (bool) -- if print the shape of the numpy array 75 | """ 76 | x = 
x.astype(np.float64) 77 | if shp: 78 | print('shape,', x.shape) 79 | if val: 80 | x = x.flatten() 81 | print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % ( 82 | np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x))) 83 | 84 | 85 | def mkdirs(paths): 86 | """create empty directories if they don't exist 87 | 88 | Parameters: 89 | paths (str list) -- a list of directory paths 90 | """ 91 | if isinstance(paths, list) and not isinstance(paths, str): 92 | for path in paths: 93 | mkdir(path) 94 | else: 95 | mkdir(paths) 96 | 97 | 98 | def mkdir(path): 99 | """create a single empty directory if it didn't exist 100 | 101 | Parameters: 102 | path (str) -- a single directory path 103 | """ 104 | if not os.path.exists(path): 105 | os.makedirs(path) 106 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Requirements for running in standalone mode 2 | # First, install the corect version of PyTorch! 3 | # PyTorch Compute Platform must match the configuration of the hardware. 4 | 5 | # pip install -r requirements.txt 6 | torch 7 | gradio>=3.38.0,<4.0 # User UI 8 | timm~=0.9.2 # For midas 9 | matplotlib 10 | trimesh # For creating simple meshes 11 | numba>=0.57.0 # Speeding up CPU stereoimage generation 12 | vispy>=0.13.0 13 | rembg>=2.0.50 # Remove background 14 | moviepy>=1.0.2,<2.0 15 | transforms3d>=0.4.1 16 | imageio>=2.4.1,<3.0 17 | imageio-ffmpeg 18 | networkx>=2.5 19 | diffusers>=0.20.1 # For Marigold 20 | pyqt5; sys_platform == 'windows' 21 | pyqt6; sys_platform != 'windows' 22 | PyOpenGL>=3.1.7; sys_platform == 'darwin' 23 | https://github.com/huchenlei/Depth-Anything/releases/download/v1.0.0/depth_anything-2024.1.22.0-py2.py3-none-any.whl 24 | -------------------------------------------------------------------------------- /scripts/depthmap.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | import gradio as gr 3 | from modules import shared 4 | import modules.scripts as scripts 5 | from PIL import Image 6 | 7 | from src import backbone 8 | from src import common_ui 9 | from src.core import core_generation_funnel 10 | from src.gradio_args_transport import GradioComponentBundle 11 | from src.misc import * 12 | 13 | 14 | class Script(scripts.Script): 15 | def title(self): 16 | return SCRIPT_NAME 17 | 18 | def show(self, is_img2img): 19 | return True 20 | 21 | def ui(self, is_img2img): 22 | gr.HTML() # Work around a Gradio bug 23 | with gr.Column(variant='panel'): 24 | gr.HTML() # Work around a Gradio bug 25 | ret = common_ui.main_ui_panel(False) 26 | ret += ret.enkey_tail() 27 | return ret.enkey_body() 28 | 29 | # run from script in txt2img or img2img 30 | def run(self, p, *inputs): 31 | from modules import processing 32 | from modules.processing import create_infotext 33 | 34 | inputs = GradioComponentBundle.enkey_to_dict(inputs) 35 | 36 | # sd process 37 | processed = processing.process_images(p) 38 | processed.sampler = p.sampler # for create_infotext 39 | processed.tiling = p.tiling # for create_infotext 40 | 41 | inputimages = [] 42 | for count in range(0, len(processed.images)): 43 | # skip first grid image 44 | if count == 0 and len(processed.images) > 1 and shared.opts.return_grid: 45 | continue 46 | inputimages.append(processed.images[count]) 47 | 48 | gen_obj = core_generation_funnel(p.outpath_samples, inputimages, None, None, inputs, backbone.gather_ops()) 49 | 50 
| for input_i, type, result in gen_obj: 51 | if not isinstance(result, Image.Image): 52 | continue 53 | 54 | # get generation parameters 55 | # TODO: could reuse 56 | if hasattr(processed, 'all_prompts') and shared.opts.enable_pnginfo: 57 | info = create_infotext( 58 | p, processed.all_prompts, processed.all_seeds, processed.all_subseeds, "", 0, input_i) 59 | else: 60 | info = None 61 | 62 | processed.images.append(result) 63 | if inputs["save_outputs"]: 64 | try: 65 | suffix = "" if type == "depth" else f"{type}" 66 | backbone.save_image(result, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], 67 | prompt=processed.all_prompts[input_i], extension=shared.opts.samples_format, 68 | info=info, 69 | p=processed, 70 | suffix=suffix) 71 | except Exception as e: 72 | if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): 73 | raise e 74 | print('Catched exception: image has wrong mode!') 75 | traceback.print_exc() 76 | return processed 77 | 78 | 79 | # TODO: some of them may be put into the main ui pane 80 | # TODO: allow in standalone mode 81 | def on_ui_settings(): 82 | section = ('depthmap-script', "Depthmap extension") 83 | 84 | def add_option(name, default_value, description, name_prefix='depthmap_script'): 85 | shared.opts.add_option(f"{name_prefix}_{name}", shared.OptionInfo(default_value, description, section=section)) 86 | 87 | add_option('keepmodels', False, "Do not unload depth and pix2pix models.") 88 | 89 | add_option('boost_rmax', 1600, "Maximum wholesize for boost (Rmax)") 90 | add_option('marigold_ensembles', 5, "How many ensembles to use for Marigold") 91 | add_option('marigold_steps', 10, "How many denoising steps to use for Marigold") 92 | 93 | add_option('save_ply', False, "Save additional PLY file with 3D inpainted mesh.") 94 | add_option('show_3d', True, "Enable showing 3D Meshes in output tab. (Experimental)") 95 | add_option('show_3d_inpaint', True, "Also show 3D Inpainted Mesh in 3D Mesh output tab. (Experimental)") 96 | add_option('mesh_maxsize', 2048, "Max size for generating simple mesh.") 97 | 98 | add_option('gen_heatmap_from_ui', False, "Show an option to generate HeatMap in the UI") 99 | add_option('extra_stereomodes', False, "Enable more possible outputs for stereoimage generation") 100 | 101 | 102 | from modules import script_callbacks 103 | script_callbacks.on_ui_settings(on_ui_settings) 104 | script_callbacks.on_ui_tabs(lambda: [(common_ui.on_ui_tabs(), "Depth", "depthmap_interface")]) 105 | -------------------------------------------------------------------------------- /src/common_constants.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | 4 | class GenerationOptions(enum.Enum): 5 | """This Enum provides the options that are used in the usual generation 6 | (that is, consumed by the core_generation_funnel). 7 | Please use this to avoid typos. 
Also, this enum provides default values for these options.""" 8 | def __new__(cls, *args, **kwds): 9 | value = len(cls.__members__) + 1 10 | obj = object.__new__(cls) 11 | obj._value_ = value 12 | return obj 13 | 14 | def __init__(self, default_value=None, *args): 15 | """Saves default value as a member (called "df") of a member of this enum""" 16 | self.df = default_value 17 | 18 | COMPUTE_DEVICE = "GPU" 19 | MODEL_TYPE = "Depth Anything v2 Base" # Will become enum element 20 | BOOST = False 21 | NET_SIZE_MATCH = False 22 | NET_WIDTH = 448 23 | NET_HEIGHT = 448 24 | TILING_MODE = False 25 | 26 | DO_OUTPUT_DEPTH = True 27 | OUTPUT_DEPTH_INVERT = False 28 | OUTPUT_DEPTH_COMBINE = False 29 | OUTPUT_DEPTH_COMBINE_AXIS = "Horizontal" # Format (str) is subject to change 30 | DO_OUTPUT_DEPTH_PREDICTION = False # Hidden, do not use, subject to change 31 | 32 | CLIPDEPTH = False 33 | CLIPDEPTH_MODE = "Range" 34 | CLIPDEPTH_FAR = 0.0 35 | CLIPDEPTH_NEAR = 1.0 36 | 37 | GEN_STEREO = False 38 | STEREO_MODES = ["left-right", "red-cyan-anaglyph"] 39 | STEREO_DIVERGENCE = 2.5 40 | STEREO_SEPARATION = 0.0 41 | STEREO_FILL_ALGO = "polylines_sharp" 42 | STEREO_OFFSET_EXPONENT = 1.0 43 | STEREO_BALANCE = 0.0 44 | 45 | GEN_NORMALMAP = False 46 | NORMALMAP_PRE_BLUR = False 47 | NORMALMAP_PRE_BLUR_KERNEL = 3 48 | NORMALMAP_SOBEL = True 49 | NORMALMAP_SOBEL_KERNEL = 3 50 | NORMALMAP_POST_BLUR = False 51 | NORMALMAP_POST_BLUR_KERNEL = 3 52 | NORMALMAP_INVERT = False 53 | 54 | GEN_HEATMAP = False 55 | 56 | GEN_SIMPLE_MESH = False 57 | SIMPLE_MESH_OCCLUDE = True 58 | SIMPLE_MESH_SPHERICAL = False 59 | 60 | GEN_INPAINTED_MESH = False 61 | GEN_INPAINTED_MESH_DEMOS = False 62 | 63 | GEN_REMBG = False 64 | SAVE_BACKGROUND_REMOVAL_MASKS = False # Legacy, will be reworked 65 | PRE_DEPTH_BACKGROUND_REMOVAL = False # Legacy, will be reworked 66 | REMBG_MODEL = "u2net" 67 | -------------------------------------------------------------------------------- /src/gradio_args_transport.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | class GradioComponentBundle: 4 | """Allows easier transportation of massive ammount of named gradio inputs. 5 | Allows adding visibility rules quicker.""" 6 | def __init__(self): 7 | self.internal = {} 8 | self.internal_ignored = {} 9 | 10 | def _raw_assignment(self, key, value, ignored=False): 11 | assert key not in self.internal, f"Already bundled component with name {key}." 12 | assert key not in self.internal_ignored, f"Already bundled component with name {key}." 
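# A sketch of the intended flow, mirroring the usage in scripts/depthmap.py (the option name below is a placeholder):
#   bundle = GradioComponentBundle()
#   bundle += (GenerationOptions.SOME_OPTION, gr.Checkbox(label="Some option"))  # named input, packed into the bundle
#   bundle += bundle.enkey_tail()                          # must be added last
#   gradio_inputs = bundle.enkey_body()                    # list handed to the gradio event handler
#   values = GradioComponentBundle.enkey_to_dict(inputs)   # inside the handler: name -> value dict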
13 | if not ignored: 14 | self.internal[key] = value 15 | else: 16 | self.internal_ignored[key] = value 17 | 18 | def _append_el(self, thing, ignored=False): 19 | if isinstance(thing, tuple) and len(thing) == 2 and isinstance(thing[1], gr.blocks.Block): 20 | name = thing[0] if isinstance(thing[0], str) else thing[0].name.lower() # .name is for Enums 21 | if hasattr(thing[0], 'df') and thing[0].df is not None: 22 | thing[1].value = thing[0].df 23 | self._raw_assignment(name, thing[1], ignored) 24 | elif isinstance(thing, gr.components.Component) and thing.elem_id is not None: 25 | self._raw_assignment(thing.elem_id, thing, ignored) 26 | else: 27 | raise Exception(f"This object cannot be bundled, {str(thing)}") 28 | 29 | def __iadd__(self, els): 30 | """Add an input element that will be packed into a bundle.""" 31 | self._append_el(els, ignored=False) 32 | return self 33 | 34 | def __isub__(self, els): 35 | """Add an element that will not be packed into a bundle, but will be accessible.""" 36 | self._append_el(els, ignored=True) 37 | return self 38 | 39 | def __ior__(self, thing): 40 | """Add an extra bundle into your bundle, so you can have more bundled items in your bundle.""" 41 | assert isinstance(thing, GradioComponentBundle), "Use += or -= for bundling elements" 42 | for key in list(thing.internal.keys()): 43 | self._raw_assignment(key, thing[key], False) 44 | for key in list(thing.internal_ignored.keys()): 45 | self._raw_assignment(key, thing[key], True) 46 | return self 47 | 48 | def __getitem__(self, key): 49 | """Return the gradio component stored under the given key""" 50 | if hasattr(key, 'name'): 51 | key = key.name.lower() # for enum elements 52 | if key in self.internal_ignored: 53 | return self.internal_ignored[key] 54 | return self.internal[key] 55 | 56 | def __contains__(self, key): 57 | if hasattr(key, 'name'): 58 | key = key.name.lower() # for enum elements 59 | return key in self.internal_ignored or key in self.internal 60 | 61 | def enkey_tail(self): 62 | """Must be the last element of the bundle for unbundling to work""" 63 | keys = sorted(list(self.internal.keys())) 64 | head = gr.HTML(elem_id="zzz_depthmap_enkey", value="\u222F" + "\u222F".join(keys), visible=False) 65 | return head 66 | 67 | def enkey_body(self): 68 | """This is what should be passed into the function that is called by gradio""" 69 | return [self.internal[x] for x in sorted(list(self.internal.keys()))] 70 | 71 | def add_rule(self, first, rule, second): 72 | first = self[first] if first in self else first 73 | second = self[second] if second in self else second 74 | if rule == 'visible-if-not': 75 | second.change(fn=lambda v: first.update(visible=not v), inputs=[second], outputs=[first]) 76 | elif rule == 'visible-if': 77 | second.change(fn=lambda v: first.update(visible=v), inputs=[second], outputs=[first]) 78 | else: 79 | raise Exception(f'Unknown rule type {rule}') 80 | 81 | @staticmethod 82 | def enkey_to_dict(inp): 83 | """Unbundle: get a dictionary with the values after they are sent by gradio to the function.
84 | Enkey format: a bunch of Gradio components, 85 | then a Gradio component whose value is a concatenation of the names of the previous Gradio objects""" 86 | assert inp[-1].startswith("\u222F") 87 | ret = {} 88 | names = inp[-1].split("\u222F")[1:] 89 | assert len(names) == len(inp) - 1 90 | for i, name in enumerate(names): 91 | ret[name] = inp[i] 92 | return ret 93 | -------------------------------------------------------------------------------- /src/misc.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import pathlib 4 | import builtins 5 | 6 | def get_commit_hash(): 7 | try: 8 | file_path = pathlib.Path(__file__).parent 9 | return subprocess.check_output( 10 | [os.environ.get("GIT", "git"), "rev-parse", "HEAD"], 11 | cwd=file_path, shell=False, stderr=subprocess.DEVNULL, encoding='utf8').strip()[0:8] 12 | except Exception: 13 | return "" 14 | 15 | 16 | REPOSITORY_NAME = "stable-diffusion-webui-depthmap-script" 17 | SCRIPT_NAME = "DepthMap" 18 | SCRIPT_VERSION = "v0.4.8" 19 | SCRIPT_FULL_NAME = f"{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})" 20 | 21 | 22 | # # Returns SHA256 hash of a file 23 | # import hashlib 24 | # def sha256sum(filename): 25 | # with open(filename, 'rb', buffering=0) as f: 26 | # return hashlib.file_digest(f, 'sha256').hexdigest() 27 | def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): 28 | import torch 29 | # Do not check the hash every time - it is somewhat time-consuming 30 | if os.path.exists(filename): 31 | return 32 | 33 | if type(url) is not list: 34 | url = [url] 35 | for cur_url in url: 36 | try: 37 | print("Downloading", cur_url, "to", filename) 38 | torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) 39 | if os.path.exists(filename): 40 | return # The correct model was downloaded, no need to try more 41 | except: 42 | pass 43 | raise RuntimeError(f'Download failed. ' 44 | f'Try again later or manually download the file from {url} to {filename}.') 45 | -------------------------------------------------------------------------------- /src/normalmap_generation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from PIL import Image 4 | 5 | def create_normalmap(depthmap, 6 | pre_blur = None, sobel_gradient = 3, post_blur = None, 7 | invert=False): 8 | """Generates normalmaps. 9 | :param depthmap: depthmap that will be used to generate normalmap 10 | :param pre_blur: apply gaussian blur before taking gradient, -1 for disable, otherwise kernel size 11 | :param sobel_gradient: use Sobel gradient, None for regular gradient, otherwise kernel size 12 | :param post_blur: apply gaussian blur after taking gradient, -1 for disable, otherwise kernel size 13 | :param invert: depthmap will be inverted before calculating normalmap 14 | """ 15 | # https://stackoverflow.com/questions/53350391/surface-normal-calculation-from-depth-map-in-python 16 | # TODO: Tiling can be improved (gradients could be matched).
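# The code below treats the depth map as a height field h(x, y): it takes per-pixel gradients (Sobel or np.gradient),
# stacks them into (dh/dx, -dh/dy, 1), normalizes each vector to unit length, and finally rescales the components
# from [-1, 1] to [0, 255] for export.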
17 | # TODO: Implement bilateral filtering (16 bit deflickering) 18 | 19 | # We invert by default, maybe there is a negative sign hiding somewhere 20 | normalmap = depthmap if invert else depthmap * (-1.0) 21 | normalmap = normalmap / 256.0 22 | # pre blur (only blurs z-axis) 23 | if pre_blur is not None and pre_blur > 0: 24 | normalmap = cv2.GaussianBlur(normalmap, (pre_blur, pre_blur), pre_blur) 25 | 26 | # take gradients 27 | if sobel_gradient is not None and sobel_gradient > 0: 28 | zx = cv2.Sobel(np.float64(normalmap), cv2.CV_64F, 1, 0, ksize=sobel_gradient) 29 | zy = cv2.Sobel(np.float64(normalmap), cv2.CV_64F, 0, 1, ksize=sobel_gradient) 30 | else: 31 | zy, zx = np.gradient(normalmap) 32 | 33 | # combine and normalize gradients 34 | normal = np.dstack((zx, -zy, np.ones_like(normalmap))) 35 | # every pixel of a normal map is a normal vector, it should be a unit vector 36 | n = np.linalg.norm(normal, axis=2) 37 | normal[:, :, 0] /= n 38 | normal[:, :, 1] /= n 39 | normal[:, :, 2] /= n 40 | 41 | # TODO: this probably is not a good way to do it 42 | if post_blur is not None and post_blur > 0: 43 | normal = cv2.GaussianBlur(normal, (post_blur, post_blur), post_blur) 44 | # Normalize every vector again 45 | n = np.linalg.norm(normal, axis=2) 46 | normal[:, :, 0] /= n 47 | normal[:, :, 1] /= n 48 | normal[:, :, 2] /= n 49 | 50 | # offset and rescale values to be in 0-255, so we can export them 51 | normal += 1 52 | normal /= 2 53 | normal = np.clip(normal * 256, 0, 256 - 0.1) # Clipping from above is needed to avoid overflowing 54 | normal = normal.astype(np.uint8) 55 | 56 | return Image.fromarray(normal) 57 | --------------------------------------------------------------------------------
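A minimal usage sketch for create_normalmap above (the synthetic depth array, the import path and the output file name are assumptions made for illustration):

import numpy as np
from src.normalmap_generation import create_normalmap

depth = (np.random.rand(480, 640) * (2 ** 16 - 1)).astype(np.float32)  # placeholder depth map in 16-bit range
normal_img = create_normalmap(depth, pre_blur=None, sobel_gradient=3, post_blur=None, invert=False)  # returns a PIL Image
normal_img.save("normalmap.png")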