├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── __init__.py ├── bundled_sources.txt ├── ddepth_anything_v2 ├── DA-2K.md ├── LICENSE ├── README.md ├── __init__.py ├── app.py ├── depth_anything_v2 │ ├── dinov2.py │ ├── dinov2_layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── block.py │ │ ├── drop_path.py │ │ ├── layer_scale.py │ │ ├── mlp.py │ │ ├── patch_embed.py │ │ └── swiglu_ffn.py │ ├── dpt.py │ └── util │ │ ├── blocks.py │ │ └── transform.py ├── metric_depth │ ├── README.md │ ├── dataset │ │ ├── hypersim.py │ │ ├── kitti.py │ │ ├── transform.py │ │ └── vkitti2.py │ ├── depth_anything_v2 │ │ ├── dinov2.py │ │ ├── dinov2_layers │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ │ ├── dpt.py │ │ └── util │ │ │ ├── blocks.py │ │ │ └── transform.py │ ├── depth_to_pointcloud.py │ ├── dist_train.sh │ ├── requirements.txt │ ├── run.py │ ├── train.py │ └── util │ │ ├── dist_helper.py │ │ ├── loss.py │ │ ├── metric.py │ │ └── utils.py ├── requirements.txt ├── run.py └── run_video.py ├── dmarigold └── marigold │ ├── __init__.py │ ├── marigold_pipeline.py │ └── util │ ├── batchsize.py │ ├── ensemble.py │ ├── image_util.py │ └── seed_all.py ├── dmidas ├── LICENSE ├── backbones │ ├── beit.py │ ├── levit.py │ ├── next_vit.py │ ├── swin.py │ ├── swin2.py │ ├── swin_common.py │ ├── utils.py │ └── vit.py ├── base_model.py ├── blocks.py ├── dpt_depth.py ├── midas_net.py ├── midas_net_custom.py ├── model_loader.py └── transforms.py ├── dzoedepth ├── LICENSE ├── __init__.py ├── data │ ├── __init__.py │ ├── data_mono.py │ ├── ddad.py │ ├── diml_indoor_test.py │ ├── diml_outdoor_test.py │ ├── diode.py │ ├── hypersim.py │ ├── ibims.py │ ├── preprocess.py │ ├── sun_rgbd_loader.py │ ├── transforms.py │ ├── vkitti.py │ └── vkitti2.py ├── models │ ├── __init__.py │ ├── base_models │ │ ├── __init__.py │ │ └── midas.py │ ├── builder.py │ ├── depth_model.py │ ├── layers │ │ ├── __init__.py │ │ ├── attractor.py │ │ ├── dist_layers.py │ │ ├── localbins_layers.py │ │ └── patch_transformer.py │ ├── model_io.py │ ├── zoedepth │ │ ├── __init__.py │ │ ├── config_zoedepth.json │ │ ├── config_zoedepth_kitti.json │ │ └── zoedepth_v1.py │ └── zoedepth_nk │ │ ├── __init__.py │ │ ├── config_zoedepth_nk.json │ │ └── zoedepth_nk_v1.py ├── trainers │ ├── __init__.py │ ├── base_trainer.py │ ├── builder.py │ ├── loss.py │ ├── zoedepth_nk_trainer.py │ └── zoedepth_trainer.py └── utils │ ├── __init__.py │ ├── arg_utils.py │ ├── config.py │ ├── easydict │ └── __init__.py │ ├── geometry.py │ └── misc.py ├── examples.png ├── inpaint ├── DOCUMENTATION.md ├── LICENSE ├── README.md ├── __init__.py ├── argument.yml ├── bilateral_filtering.py ├── boostmonodepth_utils.py ├── download.sh ├── main.py ├── mesh.py ├── mesh_tools.py ├── networks.py ├── requirements.txt └── utils.py ├── install.py ├── javascript └── depthmap.js ├── lib ├── LICENSE ├── Resnet.py ├── Resnext_torch.py ├── __init__.py ├── multi_depth_model_woauxi.py ├── net_tools.py ├── network_auxi.py ├── spvcnn_classsification.py ├── spvcnn_utils.py └── test_utils.py ├── main.py ├── options.png ├── pix2pix ├── LICENSE ├── __init__.py ├── data │ ├── __init__.py │ ├── base_dataset.py │ ├── depthmerge_dataset.py │ └── image_folder.py ├── models │ ├── __init__.py │ ├── base_model.py │ ├── base_model_hg.py │ ├── networks.py │ └── pix2pix4depth_model.py ├── options │ ├── __init__.py │ ├── base_options.py │ ├── test_options.py │ └── 
train_options.py ├── test.py ├── train.py └── util │ ├── __init__.py │ ├── get_data.py │ ├── guidedfilter.py │ ├── html.py │ ├── image_pool.py │ ├── util.py │ └── visualizer.py ├── requirements.txt ├── scripts ├── depthmap.py └── depthmap_api.py └── src ├── backbone.py ├── common_constants.py ├── common_ui.py ├── core.py ├── depthmap_generation.py ├── gradio_args_transport.py ├── misc.py ├── normalmap_generation.py ├── stereoimage_generation.py └── video_mode.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | venv/ 3 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Bob Thiry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/__init__.py -------------------------------------------------------------------------------- /bundled_sources.txt: -------------------------------------------------------------------------------- 1 | Since commit 110549b2 this extension bundles some code from other repositories. 2 | This was done to prevent possible upstream breakage and allow fixing breakage quicker. 3 | This file provides information about the original location of the code. 4 | *** Some of the bundled code was already modified. 
*** 5 | 6 | dmidas 7 | https://github.com/isl-org/MiDaS/tree/master/midas/ 8 | 9 | dzoedepth 10 | https://github.com/isl-org/ZoeDepth/tree/main/zoedepth/ 11 | 12 | inpaint 13 | https://github.com/vt-vl-lab/3d-photo-inpainting/ 14 | 15 | lib 16 | https://github.com/aim-uofa/AdelaiDepth/tree/main/LeReS/Minist_Test/lib/ 17 | 18 | pix2pix 19 | https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/ 20 | 21 | Marigold 22 | https://github.com/prs-eth/Marigold/tree/22437a 23 | 24 | depth_anything_v2 25 | https://github.com/DepthAnything/Depth-Anything-V2/tree/bc0283 26 | -------------------------------------------------------------------------------- /ddepth_anything_v2/DA-2K.md: -------------------------------------------------------------------------------- 1 | # DA-2K Evaluation Benchmark 2 | 3 | ## Introduction 4 | 5 | ![DA-2K](assets/DA-2K.png) 6 | 7 | DA-2K is proposed in [Depth Anything V2](https://depth-anything-v2.github.io) to evaluate the relative depth estimation capability. It encompasses eight representative scenarios of `indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`. It consists of 1K diverse high-quality images and 2K precise pair-wise relative depth annotations. 8 | 9 | Please refer to our [paper](https://arxiv.org/abs/2406.09414) for details on the construction of this benchmark. 10 | 11 | 12 | ## Usage 13 | 14 | Please first [download the benchmark](https://huggingface.co/datasets/depth-anything/DA-2K/tree/main). 15 | 16 | All annotations are stored in `annotations.json`. The annotation file is a JSON object where each key is the path to an image file, and the value is a list of annotations associated with that image. Each annotation describes two points and identifies which point is closer to the camera. The structure is detailed below: 17 | 18 | ``` 19 | { 20 | "image_path": [ 21 | { 22 | "point1": [h1, w1], # (vertical position, horizontal position) 23 | "point2": [h2, w2], # (vertical position, horizontal position) 24 | "closer_point": "point1" # we always set "point1" as the closer one 25 | }, 26 | ... 27 | ], 28 | ... 29 | } 30 | ``` 31 | 32 | To visualize the annotations: 33 | ```bash 34 | python visualize.py [--scene-type <type>] 35 | ``` 36 | 37 | **Options** 38 | - `--scene-type <type>` (optional): Specify the scene type (`indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`). Skip this argument or set it to `""` to include all scene types.
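
As a quick illustration of the annotation format described above, the file can be read with only the standard library. This is a minimal sketch, not part of the benchmark tooling: the `DA-2K` root directory name is an assumption, and image paths are assumed to be relative to wherever `annotations.json` lives.

```python
import json
import os

# Minimal sketch (assumed layout): annotations.json sits in the downloaded
# benchmark root, and the JSON keys are image paths relative to that root.
benchmark_root = 'DA-2K'

with open(os.path.join(benchmark_root, 'annotations.json'), 'r') as f:
    annotations = json.load(f)

for image_path, pairs in annotations.items():
    for pair in pairs:
        h1, w1 = pair['point1']        # (vertical position, horizontal position)
        h2, w2 = pair['point2']
        closer = pair['closer_point']  # always "point1" by construction
        # Typical evaluation: a prediction is counted as correct when the model's
        # predicted depth ordering of the two points agrees with `closer`.
```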
39 | 40 | ## Citation 41 | 42 | If you find this benchmark useful, please consider citing: 43 | 44 | ```bibtex 45 | @article{depth_anything_v2, 46 | title={Depth Anything V2}, 47 | author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang}, 48 | journal={arXiv:2406.09414}, 49 | year={2024} 50 | } 51 | ``` -------------------------------------------------------------------------------- /ddepth_anything_v2/__init__.py: -------------------------------------------------------------------------------- 1 | from .depth_anything_v2.dpt import DepthAnythingV2 -------------------------------------------------------------------------------- /ddepth_anything_v2/app.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gradio as gr 3 | import matplotlib 4 | import numpy as np 5 | from PIL import Image 6 | import torch 7 | import tempfile 8 | from gradio_imageslider import ImageSlider 9 | 10 | from depth_anything_v2.dpt import DepthAnythingV2 11 | 12 | css = """ 13 | #img-display-container { 14 | max-height: 100vh; 15 | } 16 | #img-display-input { 17 | max-height: 80vh; 18 | } 19 | #img-display-output { 20 | max-height: 80vh; 21 | } 22 | #download { 23 | height: 62px; 24 | } 25 | """ 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | model_configs = { 28 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 29 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 30 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 31 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 32 | } 33 | encoder = 'vitl' 34 | model = DepthAnythingV2(**model_configs[encoder]) 35 | state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu") 36 | model.load_state_dict(state_dict) 37 | model = model.to(DEVICE).eval() 38 | 39 | title = "# Depth Anything V2" 40 | description = """Official demo for **Depth Anything V2**. 
41 | Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), or [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details.""" 42 | 43 | def predict_depth(image): 44 | return model.infer_image(image) 45 | 46 | with gr.Blocks(css=css) as demo: 47 | gr.Markdown(title) 48 | gr.Markdown(description) 49 | gr.Markdown("### Depth Prediction demo") 50 | 51 | with gr.Row(): 52 | input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input') 53 | depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5) 54 | submit = gr.Button(value="Compute Depth") 55 | gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download",) 56 | raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download",) 57 | 58 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 59 | 60 | def on_submit(image): 61 | original_image = image.copy() 62 | 63 | h, w = image.shape[:2] 64 | 65 | depth = predict_depth(image[:, :, ::-1]) 66 | 67 | raw_depth = Image.fromarray(depth.astype('uint16')) 68 | tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) 69 | raw_depth.save(tmp_raw_depth.name) 70 | 71 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 72 | depth = depth.astype(np.uint8) 73 | colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8) 74 | 75 | gray_depth = Image.fromarray(depth) 76 | tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False) 77 | gray_depth.save(tmp_gray_depth.name) 78 | 79 | return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name] 80 | 81 | submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file]) 82 | 83 | example_files = glob.glob('assets/examples/*') 84 | examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file], fn=on_submit) 85 | 86 | 87 | if __name__ == '__main__': 88 | demo.queue().launch() -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 77 | x = x.reshape([B, N, C]) 78 | 79 | x = self.proj(x) 80 | x = self.proj_drop(x) 81 | return x 82 | 83 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /ddepth_anything_v2/depth_anything_v2/util/blocks.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def _make_scratch(in_shape, out_shape, groups=1, expand=False): 5 | scratch = nn.Module() 6 | 7 | out_shape1 = out_shape 8 | out_shape2 = out_shape 9 | out_shape3 = out_shape 10 | if len(in_shape) >= 4: 11 | out_shape4 = out_shape 12 | 13 | if expand: 14 | out_shape1 = out_shape 15 | out_shape2 = out_shape * 2 16 | out_shape3 = out_shape * 4 17 | if len(in_shape) >= 4: 18 | out_shape4 = out_shape * 8 19 | 20 | scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 21 | scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 22 | scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 23 | if len(in_shape) >= 4: 24 | scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) 25 | 26 | return scratch 27 | 28 | 29 | class ResidualConvUnit(nn.Module): 30 | """Residual convolution module. 31 | """ 32 | 33 | def __init__(self, features, activation, bn): 34 | """Init. 
35 | 36 | Args: 37 | features (int): number of features 38 | """ 39 | super().__init__() 40 | 41 | self.bn = bn 42 | 43 | self.groups=1 44 | 45 | self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 46 | 47 | self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) 48 | 49 | if self.bn == True: 50 | self.bn1 = nn.BatchNorm2d(features) 51 | self.bn2 = nn.BatchNorm2d(features) 52 | 53 | self.activation = activation 54 | 55 | self.skip_add = nn.quantized.FloatFunctional() 56 | 57 | def forward(self, x): 58 | """Forward pass. 59 | 60 | Args: 61 | x (tensor): input 62 | 63 | Returns: 64 | tensor: output 65 | """ 66 | 67 | out = self.activation(x) 68 | out = self.conv1(out) 69 | if self.bn == True: 70 | out = self.bn1(out) 71 | 72 | out = self.activation(out) 73 | out = self.conv2(out) 74 | if self.bn == True: 75 | out = self.bn2(out) 76 | 77 | if self.groups > 1: 78 | out = self.conv_merge(out) 79 | 80 | return self.skip_add.add(out, x) 81 | 82 | 83 | class FeatureFusionBlock(nn.Module): 84 | """Feature fusion block. 85 | """ 86 | 87 | def __init__( 88 | self, 89 | features, 90 | activation, 91 | deconv=False, 92 | bn=False, 93 | expand=False, 94 | align_corners=True, 95 | size=None 96 | ): 97 | """Init. 98 | 99 | Args: 100 | features (int): number of features 101 | """ 102 | super(FeatureFusionBlock, self).__init__() 103 | 104 | self.deconv = deconv 105 | self.align_corners = align_corners 106 | 107 | self.groups=1 108 | 109 | self.expand = expand 110 | out_features = features 111 | if self.expand == True: 112 | out_features = features // 2 113 | 114 | self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) 115 | 116 | self.resConfUnit1 = ResidualConvUnit(features, activation, bn) 117 | self.resConfUnit2 = ResidualConvUnit(features, activation, bn) 118 | 119 | self.skip_add = nn.quantized.FloatFunctional() 120 | 121 | self.size=size 122 | 123 | def forward(self, *xs, size=None): 124 | """Forward pass. 
125 | 126 | Returns: 127 | tensor: output 128 | """ 129 | output = xs[0] 130 | 131 | if len(xs) == 2: 132 | res = self.resConfUnit1(xs[1]) 133 | output = self.skip_add.add(output, res) 134 | 135 | output = self.resConfUnit2(output) 136 | 137 | if (size is None) and (self.size is None): 138 | modifier = {"scale_factor": 2} 139 | elif size is None: 140 | modifier = {"size": self.size} 141 | else: 142 | modifier = {"size": size} 143 | 144 | output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners) 145 | 146 | output = self.out_conv(output) 147 | 148 | return output 149 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dataset/hypersim.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import h5py 3 | import numpy as np 4 | import torch 5 | from torch.utils.data import Dataset 6 | from torchvision.transforms import Compose 7 | 8 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop 9 | 10 | 11 | def hypersim_distance_to_depth(npyDistance): 12 | intWidth, intHeight, fltFocal = 1024, 768, 886.81 13 | 14 | npyImageplaneX = np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth).reshape( 15 | 1, intWidth).repeat(intHeight, 0).astype(np.float32)[:, :, None] 16 | npyImageplaneY = np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5, 17 | intHeight).reshape(intHeight, 1).repeat(intWidth, 1).astype(np.float32)[:, :, None] 18 | npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32) 19 | npyImageplane = np.concatenate( 20 | [npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2) 21 | 22 | npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal 23 | return npyDepth 24 | 25 | 26 | class Hypersim(Dataset): 27 | def __init__(self, filelist_path, mode, size=(518, 518)): 28 | 29 | self.mode = mode 30 | self.size = size 31 | 32 | with open(filelist_path, 'r') as f: 33 | self.filelist = f.read().splitlines() 34 | 35 | net_w, net_h = size 36 | self.transform = Compose([ 37 | Resize( 38 | width=net_w, 39 | height=net_h, 40 | resize_target=True if mode == 'train' else False, 41 | keep_aspect_ratio=True, 42 | ensure_multiple_of=14, 43 | resize_method='lower_bound', 44 | image_interpolation_method=cv2.INTER_CUBIC, 45 | ), 46 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 47 | PrepareForNet(), 48 | ] + ([Crop(size[0])] if self.mode == 'train' else [])) 49 | 50 | def __getitem__(self, item): 51 | img_path = self.filelist[item].split(' ')[0] 52 | depth_path = self.filelist[item].split(' ')[1] 53 | 54 | image = cv2.imread(img_path) 55 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 56 | 57 | depth_fd = h5py.File(depth_path, "r") 58 | distance_meters = np.array(depth_fd['dataset']) 59 | depth = hypersim_distance_to_depth(distance_meters) 60 | 61 | sample = self.transform({'image': image, 'depth': depth}) 62 | 63 | sample['image'] = torch.from_numpy(sample['image']) 64 | sample['depth'] = torch.from_numpy(sample['depth']) 65 | 66 | sample['valid_mask'] = (torch.isnan(sample['depth']) == 0) 67 | sample['depth'][sample['valid_mask'] == 0] = 0 68 | 69 | sample['image_path'] = self.filelist[item].split(' ')[0] 70 | 71 | return sample 72 | 73 | def __len__(self): 74 | return len(self.filelist) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dataset/kitti.py: 
-------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch.utils.data import Dataset 4 | from torchvision.transforms import Compose 5 | 6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet 7 | 8 | 9 | class KITTI(Dataset): 10 | def __init__(self, filelist_path, mode, size=(518, 518)): 11 | if mode != 'val': 12 | raise NotImplementedError 13 | 14 | self.mode = mode 15 | self.size = size 16 | 17 | with open(filelist_path, 'r') as f: 18 | self.filelist = f.read().splitlines() 19 | 20 | net_w, net_h = size 21 | self.transform = Compose([ 22 | Resize( 23 | width=net_w, 24 | height=net_h, 25 | resize_target=True if mode == 'train' else False, 26 | keep_aspect_ratio=True, 27 | ensure_multiple_of=14, 28 | resize_method='lower_bound', 29 | image_interpolation_method=cv2.INTER_CUBIC, 30 | ), 31 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 32 | PrepareForNet(), 33 | ]) 34 | 35 | def __getitem__(self, item): 36 | img_path = self.filelist[item].split(' ')[0] 37 | depth_path = self.filelist[item].split(' ')[1] 38 | 39 | image = cv2.imread(img_path) 40 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 41 | 42 | depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype('float32') 43 | 44 | sample = self.transform({'image': image, 'depth': depth}) 45 | 46 | sample['image'] = torch.from_numpy(sample['image']) 47 | sample['depth'] = torch.from_numpy(sample['depth']) 48 | sample['depth'] = sample['depth'] / 256.0 # convert in meters 49 | 50 | sample['valid_mask'] = sample['depth'] > 0 51 | 52 | sample['image_path'] = self.filelist[item].split(' ')[0] 53 | 54 | return sample 55 | 56 | def __len__(self): 57 | return len(self.filelist) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dataset/vkitti2.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torch.utils.data import Dataset 4 | from torchvision.transforms import Compose 5 | 6 | from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop 7 | 8 | 9 | class VKITTI2(Dataset): 10 | def __init__(self, filelist_path, mode, size=(518, 518)): 11 | 12 | self.mode = mode 13 | self.size = size 14 | 15 | with open(filelist_path, 'r') as f: 16 | self.filelist = f.read().splitlines() 17 | 18 | net_w, net_h = size 19 | self.transform = Compose([ 20 | Resize( 21 | width=net_w, 22 | height=net_h, 23 | resize_target=True if mode == 'train' else False, 24 | keep_aspect_ratio=True, 25 | ensure_multiple_of=14, 26 | resize_method='lower_bound', 27 | image_interpolation_method=cv2.INTER_CUBIC, 28 | ), 29 | NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 30 | PrepareForNet(), 31 | ] + ([Crop(size[0])] if self.mode == 'train' else [])) 32 | 33 | def __getitem__(self, item): 34 | img_path = self.filelist[item].split(' ')[0] 35 | depth_path = self.filelist[item].split(' ')[1] 36 | 37 | image = cv2.imread(img_path) 38 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 39 | 40 | depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) / 100.0 # cm to m 41 | 42 | sample = self.transform({'image': image, 'depth': depth}) 43 | 44 | sample['image'] = torch.from_numpy(sample['image']) 45 | sample['depth'] = torch.from_numpy(sample['depth']) 46 | 47 | sample['valid_mask'] = (sample['depth'] <= 80) 48 | 49 | sample['image_path'] = self.filelist[item].split(' ')[0] 50 | 51 
| return sample 52 | 53 | def __len__(self): 54 | return len(self.filelist) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py 10 | 11 | import logging 12 | 13 | from torch import Tensor 14 | from torch import nn 15 | 16 | 17 | logger = logging.getLogger("dinov2") 18 | 19 | 20 | try: 21 | from xformers.ops import memory_efficient_attention, unbind, fmha 22 | 23 | XFORMERS_AVAILABLE = True 24 | except ImportError: 25 | logger.warning("xFormers not available") 26 | XFORMERS_AVAILABLE = False 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__( 31 | self, 32 | dim: int, 33 | num_heads: int = 8, 34 | qkv_bias: bool = False, 35 | proj_bias: bool = True, 36 | attn_drop: float = 0.0, 37 | proj_drop: float = 0.0, 38 | ) -> None: 39 | super().__init__() 40 | self.num_heads = num_heads 41 | head_dim = dim // num_heads 42 | self.scale = head_dim**-0.5 43 | 44 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 45 | self.attn_drop = nn.Dropout(attn_drop) 46 | self.proj = nn.Linear(dim, dim, bias=proj_bias) 47 | self.proj_drop = nn.Dropout(proj_drop) 48 | 49 | def forward(self, x: Tensor) -> Tensor: 50 | B, N, C = x.shape 51 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 52 | 53 | q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] 54 | attn = q @ k.transpose(-2, -1) 55 | 56 | attn = attn.softmax(dim=-1) 57 | attn = self.attn_drop(attn) 58 | 59 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 60 | x = self.proj(x) 61 | x = self.proj_drop(x) 62 | return x 63 | 64 | 65 | class MemEffAttention(Attention): 66 | def forward(self, x: Tensor, attn_bias=None) -> Tensor: 67 | if not XFORMERS_AVAILABLE: 68 | assert attn_bias is None, "xFormers is required for nested tensors usage" 69 | return super().forward(x) 70 | 71 | B, N, C = x.shape 72 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) 73 | 74 | q, k, v = unbind(qkv, 2) 75 | 76 | x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) 77 | x = x.reshape([B, N, C]) 78 | 79 | x = self.proj(x) 80 | x = self.proj_drop(x) 81 | return x 82 | 83 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/drop_path.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 10 | 11 | 12 | from torch import nn 13 | 14 | 15 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 16 | if drop_prob == 0.0 or not training: 17 | return x 18 | keep_prob = 1 - drop_prob 19 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 20 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 21 | if keep_prob > 0.0: 22 | random_tensor.div_(keep_prob) 23 | output = x * random_tensor 24 | return output 25 | 26 | 27 | class DropPath(nn.Module): 28 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 29 | 30 | def __init__(self, drop_prob=None): 31 | super(DropPath, self).__init__() 32 | self.drop_prob = drop_prob 33 | 34 | def forward(self, x): 35 | return drop_path(x, self.drop_prob, self.training) 36 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 8 | 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | from torch import nn 14 | 15 | 16 | class LayerScale(nn.Module): 17 | def __init__( 18 | self, 19 | dim: int, 20 | init_values: Union[float, Tensor] = 1e-5, 21 | inplace: bool = False, 22 | ) -> None: 23 | super().__init__() 24 | self.inplace = inplace 25 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 26 | 27 | def forward(self, x: Tensor) -> Tensor: 28 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 29 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 10 | 11 | 12 | from typing import Callable, Optional 13 | 14 | from torch import Tensor, nn 15 | 16 | 17 | class Mlp(nn.Module): 18 | def __init__( 19 | self, 20 | in_features: int, 21 | hidden_features: Optional[int] = None, 22 | out_features: Optional[int] = None, 23 | act_layer: Callable[..., nn.Module] = nn.GELU, 24 | drop: float = 0.0, 25 | bias: bool = True, 26 | ) -> None: 27 | super().__init__() 28 | out_features = out_features or in_features 29 | hidden_features = hidden_features or in_features 30 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 31 | self.act = act_layer() 32 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 33 | self.drop = nn.Dropout(drop) 34 | 35 | def forward(self, x: Tensor) -> Tensor: 36 | x = self.fc1(x) 37 | x = self.act(x) 38 | x = self.drop(x) 39 | x = self.fc2(x) 40 | x = self.drop(x) 41 | return x 42 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/patch_embed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # References: 8 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 9 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py 10 | 11 | from typing import Callable, Optional, Tuple, Union 12 | 13 | from torch import Tensor 14 | import torch.nn as nn 15 | 16 | 17 | def make_2tuple(x): 18 | if isinstance(x, tuple): 19 | assert len(x) == 2 20 | return x 21 | 22 | assert isinstance(x, int) 23 | return (x, x) 24 | 25 | 26 | class PatchEmbed(nn.Module): 27 | """ 28 | 2D image to patch embedding: (B,C,H,W) -> (B,N,D) 29 | 30 | Args: 31 | img_size: Image size. 32 | patch_size: Patch token size. 33 | in_chans: Number of input image channels. 34 | embed_dim: Number of linear projection output channels. 35 | norm_layer: Normalization layer. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | img_size: Union[int, Tuple[int, int]] = 224, 41 | patch_size: Union[int, Tuple[int, int]] = 16, 42 | in_chans: int = 3, 43 | embed_dim: int = 768, 44 | norm_layer: Optional[Callable] = None, 45 | flatten_embedding: bool = True, 46 | ) -> None: 47 | super().__init__() 48 | 49 | image_HW = make_2tuple(img_size) 50 | patch_HW = make_2tuple(patch_size) 51 | patch_grid_size = ( 52 | image_HW[0] // patch_HW[0], 53 | image_HW[1] // patch_HW[1], 54 | ) 55 | 56 | self.img_size = image_HW 57 | self.patch_size = patch_HW 58 | self.patches_resolution = patch_grid_size 59 | self.num_patches = patch_grid_size[0] * patch_grid_size[1] 60 | 61 | self.in_chans = in_chans 62 | self.embed_dim = embed_dim 63 | 64 | self.flatten_embedding = flatten_embedding 65 | 66 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) 67 | self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() 68 | 69 | def forward(self, x: Tensor) -> Tensor: 70 | _, _, H, W = x.shape 71 | patch_H, patch_W = self.patch_size 72 | 73 | assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" 74 | assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" 75 | 76 | x = self.proj(x) # B C H W 77 | H, W = x.size(2), x.size(3) 78 | x = x.flatten(2).transpose(1, 2) # B HW C 79 | x = self.norm(x) 80 | if not self.flatten_embedding: 81 | x = x.reshape(-1, H, W, self.embed_dim) # B H W C 82 | return x 83 | 84 | def flops(self) -> float: 85 | Ho, Wo = self.patches_resolution 86 | flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) 87 | if self.norm is not None: 88 | flops += Ho * Wo * self.embed_dim 89 | return flops 90 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/depth_anything_v2/dinov2_layers/swiglu_ffn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Callable, Optional 8 | 9 | from torch import Tensor, nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class SwiGLUFFN(nn.Module): 14 | def __init__( 15 | self, 16 | in_features: int, 17 | hidden_features: Optional[int] = None, 18 | out_features: Optional[int] = None, 19 | act_layer: Callable[..., nn.Module] = None, 20 | drop: float = 0.0, 21 | bias: bool = True, 22 | ) -> None: 23 | super().__init__() 24 | out_features = out_features or in_features 25 | hidden_features = hidden_features or in_features 26 | self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) 27 | self.w3 = nn.Linear(hidden_features, out_features, bias=bias) 28 | 29 | def forward(self, x: Tensor) -> Tensor: 30 | x12 = self.w12(x) 31 | x1, x2 = x12.chunk(2, dim=-1) 32 | hidden = F.silu(x1) * x2 33 | return self.w3(hidden) 34 | 35 | 36 | try: 37 | from xformers.ops import SwiGLU 38 | 39 | XFORMERS_AVAILABLE = True 40 | except ImportError: 41 | SwiGLU = SwiGLUFFN 42 | XFORMERS_AVAILABLE = False 43 | 44 | 45 | class SwiGLUFFNFused(SwiGLU): 46 | def __init__( 47 | self, 48 | in_features: int, 49 | hidden_features: Optional[int] = None, 50 | out_features: Optional[int] = None, 51 | act_layer: Callable[..., nn.Module] = None, 52 | drop: float = 0.0, 53 | bias: bool = True, 54 | ) -> None: 55 | out_features = out_features or in_features 56 | hidden_features = hidden_features or in_features 57 | hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 58 | super().__init__( 59 | in_features=in_features, 60 | hidden_features=hidden_features, 61 | out_features=out_features, 62 | bias=bias, 63 | ) 64 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | now=$(date +"%Y%m%d_%H%M%S") 3 | 4 | epoch=120 5 | bs=4 6 | gpus=8 7 | lr=0.000005 8 | encoder=vitl 9 | dataset=hypersim # vkitti 10 | img_size=518 11 | min_depth=0.001 12 | max_depth=20 # 80 for virtual kitti 13 | pretrained_from=../checkpoints/depth_anything_v2_${encoder}.pth 14 | save_path=exp/hypersim # exp/vkitti 15 | 16 | mkdir -p $save_path 17 | 18 | python3 -m torch.distributed.launch \ 19 | --nproc_per_node=$gpus \ 20 | --nnodes 1 \ 21 | --node_rank=0 \ 22 | --master_addr=localhost \ 23 | --master_port=20596 \ 24 | train.py --epoch $epoch --encoder $encoder --bs $bs --lr $lr --save-path $save_path --dataset $dataset \ 25 | --img-size $img_size --min-depth $min_depth --max-depth $max_depth --pretrained-from $pretrained_from \ 26 | --port 20596 2>&1 | tee -a $save_path/$now.log 27 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | opencv-python 3 | open3d 4 | torch 5 | torchvision 6 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation') 14 | 15 | parser.add_argument('--img-path', type=str) 16 | 
parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth') 21 | parser.add_argument('--max-depth', type=float, default=20) 22 | 23 | parser.add_argument('--save-numpy', dest='save_numpy', action='store_true', help='save the model raw output') 24 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 25 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 26 | 27 | args = parser.parse_args() 28 | 29 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 30 | 31 | model_configs = { 32 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 33 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 34 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 35 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 36 | } 37 | 38 | depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth}) 39 | depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu')) 40 | depth_anything = depth_anything.to(DEVICE).eval() 41 | 42 | if os.path.isfile(args.img_path): 43 | if args.img_path.endswith('txt'): 44 | with open(args.img_path, 'r') as f: 45 | filenames = f.read().splitlines() 46 | else: 47 | filenames = [args.img_path] 48 | else: 49 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) 50 | 51 | os.makedirs(args.outdir, exist_ok=True) 52 | 53 | cmap = matplotlib.colormaps.get_cmap('Spectral') 54 | 55 | for k, filename in enumerate(filenames): 56 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 57 | 58 | raw_image = cv2.imread(filename) 59 | 60 | depth = depth_anything.infer_image(raw_image, args.input_size) 61 | 62 | if args.save_numpy: 63 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '_raw_depth_meter.npy') 64 | np.save(output_path, depth) 65 | 66 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 67 | depth = depth.astype(np.uint8) 68 | 69 | if args.grayscale: 70 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) 71 | else: 72 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 73 | 74 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png') 75 | if args.pred_only: 76 | cv2.imwrite(output_path, depth) 77 | else: 78 | split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255 79 | combined_result = cv2.hconcat([raw_image, split_region, depth]) 80 | 81 | cv2.imwrite(output_path, combined_result) -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/dist_helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | import torch 5 | import torch.distributed as dist 6 | 7 | 8 | def setup_distributed(backend="nccl", port=None): 9 | """AdaHessian Optimizer 10 | Lifted from https://github.com/BIGBALLON/distribuuuu/blob/master/distribuuuu/utils.py 11 | Originally 
licensed MIT, Copyright (c) 2020 Wei Li 12 | """ 13 | num_gpus = torch.cuda.device_count() 14 | 15 | if "SLURM_JOB_ID" in os.environ: 16 | rank = int(os.environ["SLURM_PROCID"]) 17 | world_size = int(os.environ["SLURM_NTASKS"]) 18 | node_list = os.environ["SLURM_NODELIST"] 19 | addr = subprocess.getoutput(f"scontrol show hostname {node_list} | head -n1") 20 | # specify master port 21 | if port is not None: 22 | os.environ["MASTER_PORT"] = str(port) 23 | elif "MASTER_PORT" not in os.environ: 24 | os.environ["MASTER_PORT"] = "10685" 25 | if "MASTER_ADDR" not in os.environ: 26 | os.environ["MASTER_ADDR"] = addr 27 | os.environ["WORLD_SIZE"] = str(world_size) 28 | os.environ["LOCAL_RANK"] = str(rank % num_gpus) 29 | os.environ["RANK"] = str(rank) 30 | else: 31 | rank = int(os.environ["RANK"]) 32 | world_size = int(os.environ["WORLD_SIZE"]) 33 | 34 | torch.cuda.set_device(rank % num_gpus) 35 | 36 | dist.init_process_group( 37 | backend=backend, 38 | world_size=world_size, 39 | rank=rank, 40 | ) 41 | return rank, world_size 42 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class SiLogLoss(nn.Module): 6 | def __init__(self, lambd=0.5): 7 | super().__init__() 8 | self.lambd = lambd 9 | 10 | def forward(self, pred, target, valid_mask): 11 | valid_mask = valid_mask.detach() 12 | diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) 13 | loss = torch.sqrt(torch.pow(diff_log, 2).mean() - 14 | self.lambd * torch.pow(diff_log.mean(), 2)) 15 | 16 | return loss 17 | -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def eval_depth(pred, target): 5 | assert pred.shape == target.shape 6 | 7 | thresh = torch.max((target / pred), (pred / target)) 8 | 9 | d1 = torch.sum(thresh < 1.25).float() / len(thresh) 10 | d2 = torch.sum(thresh < 1.25 ** 2).float() / len(thresh) 11 | d3 = torch.sum(thresh < 1.25 ** 3).float() / len(thresh) 12 | 13 | diff = pred - target 14 | diff_log = torch.log(pred) - torch.log(target) 15 | 16 | abs_rel = torch.mean(torch.abs(diff) / target) 17 | sq_rel = torch.mean(torch.pow(diff, 2) / target) 18 | 19 | rmse = torch.sqrt(torch.mean(torch.pow(diff, 2))) 20 | rmse_log = torch.sqrt(torch.mean(torch.pow(diff_log , 2))) 21 | 22 | log10 = torch.mean(torch.abs(torch.log10(pred) - torch.log10(target))) 23 | silog = torch.sqrt(torch.pow(diff_log, 2).mean() - 0.5 * torch.pow(diff_log.mean(), 2)) 24 | 25 | return {'d1': d1.item(), 'd2': d2.item(), 'd3': d3.item(), 'abs_rel': abs_rel.item(), 'sq_rel': sq_rel.item(), 26 | 'rmse': rmse.item(), 'rmse_log': rmse_log.item(), 'log10':log10.item(), 'silog':silog.item()} -------------------------------------------------------------------------------- /ddepth_anything_v2/metric_depth/util/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | import logging 5 | 6 | logs = set() 7 | 8 | 9 | def init_log(name, level=logging.INFO): 10 | if (name, level) in logs: 11 | return 12 | logs.add((name, level)) 13 | logger = logging.getLogger(name) 14 | logger.setLevel(level) 15 | ch = logging.StreamHandler() 16 | ch.setLevel(level) 17 | if "SLURM_PROCID" in 
os.environ: 18 | rank = int(os.environ["SLURM_PROCID"]) 19 | logger.addFilter(lambda record: rank == 0) 20 | else: 21 | rank = 0 22 | format_str = "[%(asctime)s][%(levelname)8s] %(message)s" 23 | formatter = logging.Formatter(format_str) 24 | ch.setFormatter(formatter) 25 | logger.addHandler(ch) 26 | return logger 27 | -------------------------------------------------------------------------------- /ddepth_anything_v2/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio_imageslider 2 | gradio==4.29.0 3 | matplotlib 4 | opencv-python 5 | torch 6 | torchvision 7 | -------------------------------------------------------------------------------- /ddepth_anything_v2/run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2') 14 | 15 | parser.add_argument('--img-path', type=str) 16 | parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | 21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 23 | 24 | args = parser.parse_args() 25 | 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | 28 | model_configs = { 29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 33 | } 34 | 35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder]) 36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) 37 | depth_anything = depth_anything.to(DEVICE).eval() 38 | 39 | if os.path.isfile(args.img_path): 40 | if args.img_path.endswith('txt'): 41 | with open(args.img_path, 'r') as f: 42 | filenames = f.read().splitlines() 43 | else: 44 | filenames = [args.img_path] 45 | else: 46 | filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) 47 | 48 | os.makedirs(args.outdir, exist_ok=True) 49 | 50 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 51 | 52 | for k, filename in enumerate(filenames): 53 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 54 | 55 | raw_image = cv2.imread(filename) 56 | 57 | depth = depth_anything.infer_image(raw_image, args.input_size) 58 | 59 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 60 | depth = depth.astype(np.uint8) 61 | 62 | if args.grayscale: 63 | depth = np.repeat(depth[..., np.newaxis], 3, axis=-1) 64 | else: 65 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 66 | 67 | if args.pred_only: 68 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), depth) 69 | else: 70 | split_region = 
np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255 71 | combined_result = cv2.hconcat([raw_image, split_region, depth]) 72 | 73 | cv2.imwrite(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png'), combined_result) -------------------------------------------------------------------------------- /ddepth_anything_v2/run_video.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import glob 4 | import matplotlib 5 | import numpy as np 6 | import os 7 | import torch 8 | 9 | from depth_anything_v2.dpt import DepthAnythingV2 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Depth Anything V2') 14 | 15 | parser.add_argument('--video-path', type=str) 16 | parser.add_argument('--input-size', type=int, default=518) 17 | parser.add_argument('--outdir', type=str, default='./vis_video_depth') 18 | 19 | parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg']) 20 | 21 | parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction') 22 | parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette') 23 | 24 | args = parser.parse_args() 25 | 26 | DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' 27 | 28 | model_configs = { 29 | 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, 30 | 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, 31 | 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}, 32 | 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]} 33 | } 34 | 35 | depth_anything = DepthAnythingV2(**model_configs[args.encoder]) 36 | depth_anything.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{args.encoder}.pth', map_location='cpu')) 37 | depth_anything = depth_anything.to(DEVICE).eval() 38 | 39 | if os.path.isfile(args.video_path): 40 | if args.video_path.endswith('txt'): 41 | with open(args.video_path, 'r') as f: 42 | lines = f.read().splitlines() 43 | else: 44 | filenames = [args.video_path] 45 | else: 46 | filenames = glob.glob(os.path.join(args.video_path, '**/*'), recursive=True) 47 | 48 | os.makedirs(args.outdir, exist_ok=True) 49 | 50 | margin_width = 50 51 | cmap = matplotlib.colormaps.get_cmap('Spectral_r') 52 | 53 | for k, filename in enumerate(filenames): 54 | print(f'Progress {k+1}/{len(filenames)}: {filename}') 55 | 56 | raw_video = cv2.VideoCapture(filename) 57 | frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 58 | frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS)) 59 | 60 | if args.pred_only: 61 | output_width = frame_width 62 | else: 63 | output_width = frame_width * 2 + margin_width 64 | 65 | output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.mp4') 66 | out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height)) 67 | 68 | while raw_video.isOpened(): 69 | ret, raw_frame = raw_video.read() 70 | if not ret: 71 | break 72 | 73 | depth = depth_anything.infer_image(raw_frame, args.input_size) 74 | 75 | depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 76 | depth = depth.astype(np.uint8) 77 | 78 | if args.grayscale: 79 | depth = 
np.repeat(depth[..., np.newaxis], 3, axis=-1) 80 | else: 81 | depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8) 82 | 83 | if args.pred_only: 84 | out.write(depth) 85 | else: 86 | split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255 87 | combined_frame = cv2.hconcat([raw_frame, split_region, depth]) 88 | 89 | out.write(combined_frame) 90 | 91 | raw_video.release() 92 | out.release() 93 | -------------------------------------------------------------------------------- /dmarigold/marigold/__init__.py: -------------------------------------------------------------------------------- 1 | from .marigold_pipeline import MarigoldPipeline, MarigoldDepthOutput 2 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/batchsize.py: -------------------------------------------------------------------------------- 1 | # Author: Bingxin Ke 2 | # Last modified: 2023-12-15 3 | 4 | import torch 5 | import math 6 | 7 | 8 | # Search table for suggested max. inference batch size 9 | bs_search_table = [ 10 | # tested on A100-PCIE-80GB 11 | {"res": 768, "total_vram": 79, "bs": 35}, 12 | {"res": 1024, "total_vram": 79, "bs": 20}, 13 | # tested on A100-PCIE-40GB 14 | {"res": 768, "total_vram": 39, "bs": 15}, 15 | {"res": 1024, "total_vram": 39, "bs": 8}, 16 | # tested on RTX3090, RTX4090 17 | {"res": 512, "total_vram": 23, "bs": 20}, 18 | {"res": 768, "total_vram": 23, "bs": 7}, 19 | {"res": 1024, "total_vram": 23, "bs": 3}, 20 | # tested on GTX1080Ti 21 | {"res": 512, "total_vram": 10, "bs": 5}, 22 | {"res": 768, "total_vram": 10, "bs": 2}, 23 | ] 24 | 25 | 26 | def find_batch_size(ensemble_size: int, input_res: int) -> int: 27 | """ 28 | Automatically search for suitable operating batch size. 29 | 30 | Args: 31 | ensemble_size (int): Number of predictions to be ensembled 32 | input_res (int): Operating resolution of the input image. 33 | 34 | Returns: 35 | int: Operating batch size 36 | """ 37 | if not torch.cuda.is_available(): 38 | return 1 39 | 40 | total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3 41 | 42 | for settings in sorted(bs_search_table, key=lambda k: (k["res"], -k["total_vram"])): 43 | if input_res <= settings["res"] and total_vram >= settings["total_vram"]: 44 | bs = settings["bs"] 45 | if bs > ensemble_size: 46 | bs = ensemble_size 47 | elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size: 48 | bs = math.ceil(ensemble_size / 2) 49 | return bs 50 | 51 | return 1 52 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/ensemble.py: -------------------------------------------------------------------------------- 1 | # Test align depth images 2 | # Author: Bingxin Ke 3 | # Last modified: 2023-12-15 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from scipy.optimize import minimize 9 | 10 | 11 | def inter_distances(tensors: torch.Tensor): 12 | """ 13 | To calculate the distance between each two depth maps. 
14 | """ 15 | distances = [] 16 | for i, j in torch.combinations(torch.arange(tensors.shape[0])): 17 | arr1 = tensors[i : i + 1] 18 | arr2 = tensors[j : j + 1] 19 | distances.append(arr1 - arr2) 20 | dist = torch.concatenate(distances, dim=0) 21 | return dist 22 | 23 | 24 | def ensemble_depths( 25 | input_images: torch.Tensor, 26 | regularizer_strength: float = 0.02, 27 | max_iter: int = 2, 28 | tol: float = 1e-3, 29 | reduction: str = "median", 30 | max_res: int = None, 31 | ): 32 | """ 33 | To ensemble multiple affine-invariant depth images (up to scale and shift), 34 | by aligning estimating the scale and shift 35 | """ 36 | device = input_images.device 37 | dtype = np.float32 38 | 39 | original_input = input_images.clone() 40 | n_img = input_images.shape[0] 41 | ori_shape = input_images.shape 42 | 43 | if max_res is not None: 44 | scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:])) 45 | if scale_factor < 1: 46 | downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest") 47 | input_images = downscaler(torch.from_numpy(input_images)).numpy() 48 | 49 | # init guess 50 | _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) 51 | _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) 52 | s_init = 1.0 / (_max - _min).reshape((-1, 1, 1)) 53 | t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1)) 54 | x = np.concatenate([s_init, t_init]).reshape(-1) 55 | 56 | input_images = input_images.to(device) 57 | 58 | # objective function 59 | def closure(x): 60 | x = x.astype(dtype) 61 | l = len(x) 62 | s = x[: int(l / 2)] 63 | t = x[int(l / 2) :] 64 | s = torch.from_numpy(s).to(device) 65 | t = torch.from_numpy(t).to(device) 66 | 67 | transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view((-1, 1, 1)) 68 | dists = inter_distances(transformed_arrays) 69 | sqrt_dist = torch.sqrt(torch.mean(dists**2)) 70 | 71 | if "mean" == reduction: 72 | pred = torch.mean(transformed_arrays, dim=0) 73 | elif "median" == reduction: 74 | pred = torch.median(transformed_arrays, dim=0).values 75 | else: 76 | raise ValueError 77 | 78 | near_err = torch.sqrt((0 - torch.min(pred)) ** 2) 79 | far_err = torch.sqrt((1 - torch.max(pred)) ** 2) 80 | 81 | err = sqrt_dist + (near_err + far_err) * regularizer_strength 82 | err = err.detach().cpu().numpy() 83 | return err 84 | 85 | res = minimize( 86 | closure, x, method="BFGS", tol=tol, options={"maxiter": max_iter, "disp": False} 87 | ) 88 | x = res.x 89 | x = x.astype(dtype) 90 | l = len(x) 91 | s = x[: int(l / 2)] 92 | t = x[int(l / 2) :] 93 | 94 | # Prediction 95 | s = torch.from_numpy(s).to(device) 96 | t = torch.from_numpy(t).to(device) 97 | transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1) 98 | if "mean" == reduction: 99 | aligned_images = torch.mean(transformed_arrays, dim=0) 100 | std = torch.std(transformed_arrays, dim=0) 101 | uncertainty = std 102 | elif "median" == reduction: 103 | aligned_images = torch.median(transformed_arrays, dim=0).values 104 | # MAD (median absolute deviation) as uncertainty indicator 105 | abs_dev = torch.abs(transformed_arrays - aligned_images) 106 | mad = torch.median(abs_dev, dim=0).values 107 | uncertainty = mad 108 | else: 109 | raise ValueError(f"Unknown reduction method: {reduction}") 110 | 111 | # Scale and shift to [0, 1] 112 | _min = torch.min(aligned_images) 113 | _max = torch.max(aligned_images) 114 | aligned_images = (aligned_images - _min) / (_max - _min) 115 | uncertainty /= _max - _min 116 | 117 | return aligned_images, 
uncertainty 118 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/image_util.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import torch 4 | from PIL import Image 5 | 6 | 7 | def colorize_depth_maps( 8 | depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None 9 | ): 10 | """ 11 | Colorize depth maps. 12 | """ 13 | assert len(depth_map.shape) >= 2, "Invalid dimension" 14 | 15 | if isinstance(depth_map, torch.Tensor): 16 | depth = depth_map.detach().clone().squeeze().numpy() 17 | elif isinstance(depth_map, np.ndarray): 18 | depth = depth_map.copy().squeeze() 19 | # reshape to [ (B,) H, W ] 20 | if depth.ndim < 3: 21 | depth = depth[np.newaxis, :, :] 22 | 23 | # colorize 24 | cm = matplotlib.colormaps[cmap] 25 | depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1) 26 | img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3] # value from 0 to 1 27 | img_colored_np = np.rollaxis(img_colored_np, 3, 1) 28 | 29 | if valid_mask is not None: 30 | if isinstance(depth_map, torch.Tensor): 31 | valid_mask = valid_mask.detach().numpy() 32 | valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W] 33 | if valid_mask.ndim < 3: 34 | valid_mask = valid_mask[np.newaxis, np.newaxis, :, :] 35 | else: 36 | valid_mask = valid_mask[:, np.newaxis, :, :] 37 | valid_mask = np.repeat(valid_mask, 3, axis=1) 38 | img_colored_np[~valid_mask] = 0 39 | 40 | if isinstance(depth_map, torch.Tensor): 41 | img_colored = torch.from_numpy(img_colored_np).float() 42 | elif isinstance(depth_map, np.ndarray): 43 | img_colored = img_colored_np 44 | 45 | return img_colored 46 | 47 | 48 | def chw2hwc(chw): 49 | assert 3 == len(chw.shape) 50 | if isinstance(chw, torch.Tensor): 51 | hwc = torch.permute(chw, (1, 2, 0)) 52 | elif isinstance(chw, np.ndarray): 53 | hwc = np.moveaxis(chw, 0, -1) 54 | return hwc 55 | 56 | 57 | def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image: 58 | """ 59 | Resize image to limit maximum edge length while keeping aspect ratio 60 | 61 | Args: 62 | img (Image.Image): Image to be resized 63 | max_edge_resolution (int): Maximum edge length (px). 64 | 65 | Returns: 66 | Image.Image: Resized image. 67 | """ 68 | original_width, original_height = img.size 69 | downscale_factor = min( 70 | max_edge_resolution / original_width, max_edge_resolution / original_height 71 | ) 72 | 73 | new_width = int(original_width * downscale_factor) 74 | new_height = int(original_height * downscale_factor) 75 | 76 | resized_img = img.resize((new_width, new_height)) 77 | return resized_img 78 | -------------------------------------------------------------------------------- /dmarigold/marigold/util/seed_all.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import torch 4 | 5 | 6 | def seed_all(seed: int = 0): 7 | """ 8 | Set random seeds of all components. 
9 | """ 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | torch.cuda.manual_seed_all(seed) 14 | -------------------------------------------------------------------------------- /dmidas/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dmidas/backbones/levit.py: -------------------------------------------------------------------------------- 1 | import timm 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | from .utils import activations, get_activation, Transpose 7 | 8 | 9 | def forward_levit(pretrained, x): 10 | pretrained.model.forward_features(x) 11 | 12 | layer_1 = pretrained.activations["1"] 13 | layer_2 = pretrained.activations["2"] 14 | layer_3 = pretrained.activations["3"] 15 | 16 | layer_1 = pretrained.act_postprocess1(layer_1) 17 | layer_2 = pretrained.act_postprocess2(layer_2) 18 | layer_3 = pretrained.act_postprocess3(layer_3) 19 | 20 | return layer_1, layer_2, layer_3 21 | 22 | 23 | def _make_levit_backbone( 24 | model, 25 | hooks=[3, 11, 21], 26 | patch_grid=[14, 14] 27 | ): 28 | pretrained = nn.Module() 29 | 30 | pretrained.model = model 31 | pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) 32 | pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) 33 | pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) 34 | 35 | pretrained.activations = activations 36 | 37 | patch_grid_size = np.array(patch_grid, dtype=int) 38 | 39 | pretrained.act_postprocess1 = nn.Sequential( 40 | Transpose(1, 2), 41 | nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) 42 | ) 43 | pretrained.act_postprocess2 = nn.Sequential( 44 | Transpose(1, 2), 45 | nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 2).astype(int)).tolist())) 46 | ) 47 | pretrained.act_postprocess3 = nn.Sequential( 48 | Transpose(1, 2), 49 | nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 4).astype(int)).tolist())) 50 | ) 51 | 52 | return pretrained 53 | 54 | 55 | class ConvTransposeNorm(nn.Sequential): 56 | """ 57 | Modification of 58 | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: ConvNorm 59 | such that ConvTranspose2d is used instead of Conv2d. 
60 | """ 61 | 62 | def __init__( 63 | self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1, 64 | groups=1, bn_weight_init=1): 65 | super().__init__() 66 | self.add_module('c', 67 | nn.ConvTranspose2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False)) 68 | self.add_module('bn', nn.BatchNorm2d(out_chs)) 69 | 70 | nn.init.constant_(self.bn.weight, bn_weight_init) 71 | 72 | @torch.no_grad() 73 | def fuse(self): 74 | c, bn = self._modules.values() 75 | w = bn.weight / (bn.running_var + bn.eps) ** 0.5 76 | w = c.weight * w[:, None, None, None] 77 | b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 78 | m = nn.ConvTranspose2d( 79 | w.size(1), w.size(0), w.shape[2:], stride=self.c.stride, 80 | padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups) 81 | m.weight.data.copy_(w) 82 | m.bias.data.copy_(b) 83 | return m 84 | 85 | 86 | def stem_b4_transpose(in_chs, out_chs, activation): 87 | """ 88 | Modification of 89 | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: stem_b16 90 | such that ConvTranspose2d is used instead of Conv2d and stem is also reduced to the half. 91 | """ 92 | return nn.Sequential( 93 | ConvTransposeNorm(in_chs, out_chs, 3, 2, 1), 94 | activation(), 95 | ConvTransposeNorm(out_chs, out_chs // 2, 3, 2, 1), 96 | activation()) 97 | 98 | 99 | def _make_pretrained_levit_384(pretrained, hooks=None): 100 | model = timm.create_model("levit_384", pretrained=pretrained) 101 | 102 | hooks = [3, 11, 21] if hooks == None else hooks 103 | return _make_levit_backbone( 104 | model, 105 | hooks=hooks 106 | ) 107 | -------------------------------------------------------------------------------- /dmidas/backbones/swin.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swinl12_384(pretrained, hooks=None): 7 | model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | -------------------------------------------------------------------------------- /dmidas/backbones/swin2.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | from .swin_common import _make_swin_backbone 4 | 5 | 6 | def _make_pretrained_swin2l24_384(pretrained, hooks=None): 7 | model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained) 8 | 9 | hooks = [1, 1, 17, 1] if hooks == None else hooks 10 | return _make_swin_backbone( 11 | model, 12 | hooks=hooks 13 | ) 14 | 15 | 16 | def _make_pretrained_swin2b24_384(pretrained, hooks=None): 17 | model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained) 18 | 19 | hooks = [1, 1, 17, 1] if hooks == None else hooks 20 | return _make_swin_backbone( 21 | model, 22 | hooks=hooks 23 | ) 24 | 25 | 26 | def _make_pretrained_swin2t16_256(pretrained, hooks=None): 27 | model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained) 28 | 29 | hooks = [1, 1, 5, 1] if hooks == None else hooks 30 | return _make_swin_backbone( 31 | model, 32 | hooks=hooks, 33 | patch_grid=[64, 64] 34 | ) 35 | -------------------------------------------------------------------------------- /dmidas/backbones/swin_common.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | from .utils import activations, forward_default, get_activation, Transpose 7 | 8 | 9 | def forward_swin(pretrained, x): 10 | return forward_default(pretrained, x) 11 | 12 | 13 | def _make_swin_backbone( 14 | model, 15 | hooks=[1, 1, 17, 1], 16 | patch_grid=[96, 96] 17 | ): 18 | pretrained = nn.Module() 19 | 20 | pretrained.model = model 21 | pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1")) 22 | pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2")) 23 | pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3")) 24 | pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4")) 25 | 26 | pretrained.activations = activations 27 | 28 | if hasattr(model, "patch_grid"): 29 | used_patch_grid = model.patch_grid 30 | else: 31 | used_patch_grid = patch_grid 32 | 33 | patch_grid_size = np.array(used_patch_grid, dtype=int) 34 | 35 | pretrained.act_postprocess1 = nn.Sequential( 36 | Transpose(1, 2), 37 | nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) 38 | ) 39 | pretrained.act_postprocess2 = nn.Sequential( 40 | Transpose(1, 2), 41 | nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist())) 42 | ) 43 | pretrained.act_postprocess3 = nn.Sequential( 44 | Transpose(1, 2), 45 | nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist())) 46 | ) 47 | pretrained.act_postprocess4 = nn.Sequential( 48 | Transpose(1, 2), 49 | nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist())) 50 | ) 51 | 52 | return pretrained 53 | -------------------------------------------------------------------------------- /dmidas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /dmidas/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet(BaseModel): 13 | """Network for monocular depth estimation. 14 | """ 15 | 16 | def __init__(self, path=None, features=256, non_negative=True): 17 | """Init. 18 | 19 | Args: 20 | path (str, optional): Path to saved model. Defaults to None. 21 | features (int, optional): Number of features. Defaults to 256. 22 | backbone (str, optional): Backbone network for encoder. 
Defaults to resnet50 23 | """ 24 | print("Loading weights: ", path) 25 | 26 | super(MidasNet, self).__init__() 27 | 28 | use_pretrained = False if path is None else True 29 | 30 | self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) 31 | 32 | self.scratch.refinenet4 = FeatureFusionBlock(features) 33 | self.scratch.refinenet3 = FeatureFusionBlock(features) 34 | self.scratch.refinenet2 = FeatureFusionBlock(features) 35 | self.scratch.refinenet1 = FeatureFusionBlock(features) 36 | 37 | self.scratch.output_conv = nn.Sequential( 38 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 39 | Interpolate(scale_factor=2, mode="bilinear"), 40 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 41 | nn.ReLU(True), 42 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 43 | nn.ReLU(True) if non_negative else nn.Identity(), 44 | ) 45 | 46 | if path: 47 | self.load(path) 48 | 49 | def forward(self, x): 50 | """Forward pass. 51 | 52 | Args: 53 | x (tensor): input data (image) 54 | 55 | Returns: 56 | tensor: depth 57 | """ 58 | 59 | layer_1 = self.pretrained.layer1(x) 60 | layer_2 = self.pretrained.layer2(layer_1) 61 | layer_3 = self.pretrained.layer3(layer_2) 62 | layer_4 = self.pretrained.layer4(layer_3) 63 | 64 | layer_1_rn = self.scratch.layer1_rn(layer_1) 65 | layer_2_rn = self.scratch.layer2_rn(layer_2) 66 | layer_3_rn = self.scratch.layer3_rn(layer_3) 67 | layer_4_rn = self.scratch.layer4_rn(layer_4) 68 | 69 | path_4 = self.scratch.refinenet4(layer_4_rn) 70 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 71 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 72 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 73 | 74 | out = self.scratch.output_conv(path_1) 75 | 76 | return torch.squeeze(out, dim=1) 77 | -------------------------------------------------------------------------------- /dzoedepth/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
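
The MidasNet shown above takes a normalized N x 3 x H x W image batch (H and W divisible by 32) and returns a relative inverse-depth map with the channel dimension squeezed out. A minimal usage sketch, assuming the module is importable as dmidas.midas_net and that torch.hub can fetch the ResNeXt-101 WSL backbone; the 384 x 384 input size is only illustrative, and predictions are meaningful only when a trained MiDaS checkpoint path is passed:

import torch
from dmidas.midas_net import MidasNet

# path=None builds the network without loading a depth checkpoint;
# pass a trained MiDaS .pt file via path= for real predictions.
model = MidasNet(path=None, features=256, non_negative=True).eval()

x = torch.randn(1, 3, 384, 384)        # dummy normalized RGB batch, N x 3 x H x W
with torch.no_grad():
    prediction = model(x)              # relative inverse depth, shape (1, 384, 384)
print(prediction.shape)
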
22 | -------------------------------------------------------------------------------- /dzoedepth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/dzoedepth/__init__.py -------------------------------------------------------------------------------- /dzoedepth/data/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/data/ddad.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms 32 | 33 | 34 | class ToTensor(object): 35 | def __init__(self, resize_shape): 36 | # self.normalize = transforms.Normalize( 37 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 38 | self.normalize = lambda x : x 39 | self.resize = transforms.Resize(resize_shape) 40 | 41 | def __call__(self, sample): 42 | image, depth = sample['image'], sample['depth'] 43 | image = self.to_tensor(image) 44 | image = self.normalize(image) 45 | depth = self.to_tensor(depth) 46 | 47 | image = self.resize(image) 48 | 49 | return {'image': image, 'depth': depth, 'dataset': "ddad"} 50 | 51 | def to_tensor(self, pic): 52 | 53 | if isinstance(pic, np.ndarray): 54 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 55 | return img 56 | 57 | # # handle PIL Image 58 | if pic.mode == 'I': 59 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 60 | elif pic.mode == 'I;16': 61 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 62 | else: 63 | img = torch.ByteTensor( 64 | torch.ByteStorage.from_buffer(pic.tobytes())) 65 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 66 | if pic.mode == 'YCbCr': 67 | nchannel = 3 68 | elif pic.mode == 'I;16': 69 | nchannel = 1 70 | else: 71 | nchannel = len(pic.mode) 72 | img = img.view(pic.size[1], pic.size[0], nchannel) 73 | 74 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 75 | 76 | if isinstance(img, torch.ByteTensor): 77 | return img.float() 78 | else: 79 | return img 80 | 81 | 82 | class DDAD(Dataset): 83 | def __init__(self, data_dir_root, resize_shape): 84 | import glob 85 | 86 | # image paths are of the form /{outleft, depthmap}/*.png 87 | self.image_files = glob.glob(os.path.join(data_dir_root, '*.png')) 88 | self.depth_files = [r.replace("_rgb.png", "_depth.npy") 89 | for r in self.image_files] 90 | self.transform = ToTensor(resize_shape) 91 | 92 | def __getitem__(self, idx): 93 | 94 | image_path = self.image_files[idx] 95 | depth_path = self.depth_files[idx] 96 | 97 | image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 98 | depth = np.load(depth_path) # meters 99 | 100 | # depth[depth > 8] = -1 101 | depth = depth[..., None] 102 | 103 | sample = dict(image=image, depth=depth) 104 | sample = self.transform(sample) 105 | 106 | if idx == 0: 107 | print(sample["image"].shape) 108 | 109 | return sample 110 | 111 | def __len__(self): 112 | return len(self.image_files) 113 | 114 | 115 | def get_ddad_loader(data_dir_root, resize_shape, batch_size=1, **kwargs): 116 | dataset = DDAD(data_dir_root, resize_shape) 117 | return DataLoader(dataset, batch_size, **kwargs) 118 | -------------------------------------------------------------------------------- /dzoedepth/data/diml_outdoor_test.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following 
conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms 32 | 33 | 34 | class ToTensor(object): 35 | def __init__(self): 36 | # self.normalize = transforms.Normalize( 37 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 38 | self.normalize = lambda x : x 39 | 40 | def __call__(self, sample): 41 | image, depth = sample['image'], sample['depth'] 42 | image = self.to_tensor(image) 43 | image = self.normalize(image) 44 | depth = self.to_tensor(depth) 45 | 46 | return {'image': image, 'depth': depth, 'dataset': "diml_outdoor"} 47 | 48 | def to_tensor(self, pic): 49 | 50 | if isinstance(pic, np.ndarray): 51 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 52 | return img 53 | 54 | # # handle PIL Image 55 | if pic.mode == 'I': 56 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 57 | elif pic.mode == 'I;16': 58 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 59 | else: 60 | img = torch.ByteTensor( 61 | torch.ByteStorage.from_buffer(pic.tobytes())) 62 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 63 | if pic.mode == 'YCbCr': 64 | nchannel = 3 65 | elif pic.mode == 'I;16': 66 | nchannel = 1 67 | else: 68 | nchannel = len(pic.mode) 69 | img = img.view(pic.size[1], pic.size[0], nchannel) 70 | 71 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 72 | if isinstance(img, torch.ByteTensor): 73 | return img.float() 74 | else: 75 | return img 76 | 77 | 78 | class DIML_Outdoor(Dataset): 79 | def __init__(self, data_dir_root): 80 | import glob 81 | 82 | # image paths are of the form /{outleft, depthmap}/*.png 83 | self.image_files = glob.glob(os.path.join( 84 | data_dir_root, "*", 'outleft', '*.png')) 85 | self.depth_files = [r.replace("outleft", "depthmap") 86 | for r in self.image_files] 87 | self.transform = ToTensor() 88 | 89 | def __getitem__(self, idx): 90 | image_path = self.image_files[idx] 91 | depth_path = self.depth_files[idx] 92 | 93 | image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 94 | depth = np.asarray(Image.open(depth_path), 95 | dtype='uint16') / 1000.0 # mm to meters 96 | 97 | # depth[depth > 8] = -1 98 | depth = depth[..., None] 99 | 100 | sample = dict(image=image, depth=depth, dataset="diml_outdoor") 101 | 102 | # return sample 103 | return self.transform(sample) 104 | 105 | def __len__(self): 106 | return len(self.image_files) 107 | 108 | 109 | def get_diml_outdoor_loader(data_dir_root, batch_size=1, **kwargs): 110 | dataset = DIML_Outdoor(data_dir_root) 111 | return DataLoader(dataset, batch_size, **kwargs) 112 | 113 | # get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/HR") 114 | # get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/LR") 115 | 
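
The DDAD and DIML loaders above both yield dict batches with 'image', 'depth' and 'dataset' keys: the image is resized by the ToTensor transform, while the depth map keeps its original resolution (meters for DDAD, millimeters converted to meters for DIML outdoor). A minimal iteration sketch; the dataset root and the (352, 1216) resize shape are placeholders, and DDAD frames are expected as <name>_rgb.png with matching <name>_depth.npy files:

from dzoedepth.data.ddad import get_ddad_loader

loader = get_ddad_loader("datasets/ddad/val", resize_shape=(352, 1216), batch_size=1)

for sample in loader:
    image = sample["image"]    # resized RGB tensor
    depth = sample["depth"]    # metric depth in meters, original resolution
    print(sample["dataset"], image.shape, depth.shape)
    break
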
-------------------------------------------------------------------------------- /dzoedepth/data/ibims.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms as T 32 | 33 | 34 | class iBims(Dataset): 35 | def __init__(self, config): 36 | root_folder = config.ibims_root 37 | with open(os.path.join(root_folder, "imagelist.txt"), 'r') as f: 38 | imglist = f.read().split() 39 | 40 | samples = [] 41 | for basename in imglist: 42 | img_path = os.path.join(root_folder, 'rgb', basename + ".png") 43 | depth_path = os.path.join(root_folder, 'depth', basename + ".png") 44 | valid_mask_path = os.path.join( 45 | root_folder, 'mask_invalid', basename+".png") 46 | transp_mask_path = os.path.join( 47 | root_folder, 'mask_transp', basename+".png") 48 | 49 | samples.append( 50 | (img_path, depth_path, valid_mask_path, transp_mask_path)) 51 | 52 | self.samples = samples 53 | # self.normalize = T.Normalize( 54 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 55 | self.normalize = lambda x : x 56 | 57 | def __getitem__(self, idx): 58 | img_path, depth_path, valid_mask_path, transp_mask_path = self.samples[idx] 59 | 60 | img = np.asarray(Image.open(img_path), dtype=np.float32) / 255.0 61 | depth = np.asarray(Image.open(depth_path), 62 | dtype=np.uint16).astype('float')*50.0/65535 63 | 64 | mask_valid = np.asarray(Image.open(valid_mask_path)) 65 | mask_transp = np.asarray(Image.open(transp_mask_path)) 66 | 67 | # depth = depth * mask_valid * mask_transp 68 | depth = np.where(mask_valid * mask_transp, depth, -1) 69 | 70 | img = torch.from_numpy(img).permute(2, 0, 1) 71 | img = self.normalize(img) 72 | depth = torch.from_numpy(depth).unsqueeze(0) 73 | return dict(image=img, depth=depth, image_path=img_path, depth_path=depth_path, dataset='ibims') 74 | 75 | def __len__(self): 76 | return len(self.samples) 77 | 78 | 79 | def get_ibims_loader(config, batch_size=1, **kwargs): 80 | dataloader = DataLoader(iBims(config), batch_size=batch_size, **kwargs) 81 | return dataloader 82 | -------------------------------------------------------------------------------- 
/dzoedepth/data/sun_rgbd_loader.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import os 26 | 27 | import numpy as np 28 | import torch 29 | from PIL import Image 30 | from torch.utils.data import DataLoader, Dataset 31 | from torchvision import transforms 32 | 33 | 34 | class ToTensor(object): 35 | def __init__(self): 36 | # self.normalize = transforms.Normalize( 37 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 38 | self.normalize = lambda x : x 39 | 40 | def __call__(self, sample): 41 | image, depth = sample['image'], sample['depth'] 42 | image = self.to_tensor(image) 43 | image = self.normalize(image) 44 | depth = self.to_tensor(depth) 45 | 46 | return {'image': image, 'depth': depth, 'dataset': "sunrgbd"} 47 | 48 | def to_tensor(self, pic): 49 | 50 | if isinstance(pic, np.ndarray): 51 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 52 | return img 53 | 54 | # # handle PIL Image 55 | if pic.mode == 'I': 56 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 57 | elif pic.mode == 'I;16': 58 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 59 | else: 60 | img = torch.ByteTensor( 61 | torch.ByteStorage.from_buffer(pic.tobytes())) 62 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 63 | if pic.mode == 'YCbCr': 64 | nchannel = 3 65 | elif pic.mode == 'I;16': 66 | nchannel = 1 67 | else: 68 | nchannel = len(pic.mode) 69 | img = img.view(pic.size[1], pic.size[0], nchannel) 70 | 71 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 72 | if isinstance(img, torch.ByteTensor): 73 | return img.float() 74 | else: 75 | return img 76 | 77 | 78 | class SunRGBD(Dataset): 79 | def __init__(self, data_dir_root): 80 | # test_file_dirs = loadmat(train_test_file)['alltest'].squeeze() 81 | # all_test = [t[0].replace("/n/fs/sun3d/data/", "") for t in test_file_dirs] 82 | # self.all_test = [os.path.join(data_dir_root, t) for t in all_test] 83 | import glob 84 | self.image_files = glob.glob( 85 | os.path.join(data_dir_root, 'rgb', 'rgb', '*')) 86 | self.depth_files = [ 87 | r.replace("rgb/rgb", "gt/gt").replace("jpg", "png") for r in self.image_files] 88 | self.transform = ToTensor() 89 | 90 | def __getitem__(self, idx): 91 | image_path = self.image_files[idx] 92 | depth_path = 
self.depth_files[idx] 93 | 94 | image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 95 | depth = np.asarray(Image.open(depth_path), dtype='uint16') / 1000.0 96 | depth[depth > 8] = -1 97 | depth = depth[..., None] 98 | return self.transform(dict(image=image, depth=depth)) 99 | 100 | def __len__(self): 101 | return len(self.image_files) 102 | 103 | 104 | def get_sunrgbd_loader(data_dir_root, batch_size=1, **kwargs): 105 | dataset = SunRGBD(data_dir_root) 106 | return DataLoader(dataset, batch_size, **kwargs) 107 | -------------------------------------------------------------------------------- /dzoedepth/models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/models/base_models/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
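
get_sunrgbd_loader above follows the same pattern: RGB frames come from rgb/rgb/*.jpg, ground truth from gt/gt/*.png, depths are converted from millimeters to meters, and values beyond 8 m are marked invalid with -1. A minimal sketch; the dataset root is a placeholder:

from dzoedepth.data.sun_rgbd_loader import get_sunrgbd_loader

loader = get_sunrgbd_loader("datasets/sunrgbd_test", batch_size=1)

for sample in loader:
    depth = sample["depth"]               # meters; -1 marks masked-out (> 8 m) pixels
    valid = depth > 0
    print(sample["dataset"], depth[valid].min().item(), depth[valid].max().item())
    break
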
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/models/builder.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from importlib import import_module 26 | from dzoedepth.models.depth_model import DepthModel 27 | 28 | def build_model(config) -> DepthModel: 29 | """Builds a model from a config. The model is specified by the model name and version in the config. The model is then constructed using the build_from_config function of the model interface. 30 | This function should be used to construct models for training and evaluation. 31 | 32 | Args: 33 | config (dict): Config dict. Config is constructed in utils/config.py. Each model has its own config file(s) saved in its root model folder. 34 | 35 | Returns: 36 | torch.nn.Module: Model corresponding to name and version as specified in config 37 | """ 38 | module_name = f"dzoedepth.models.{config.model}" 39 | try: 40 | module = import_module(module_name) 41 | except ModuleNotFoundError as e: 42 | # print the original error message 43 | print(e) 44 | raise ValueError( 45 | f"Model {config.model} not found. 
Refer above error for details.") from e 46 | try: 47 | get_version = getattr(module, "get_version") 48 | except AttributeError as e: 49 | raise ValueError( 50 | f"Model {config.model} has no get_version function.") from e 51 | return get_version(config.version_name).build_from_config(config) 52 | -------------------------------------------------------------------------------- /dzoedepth/models/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/dzoedepth/models/layers/__init__.py -------------------------------------------------------------------------------- /dzoedepth/models/layers/patch_transformer.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import torch 26 | import torch.nn as nn 27 | 28 | 29 | class PatchTransformerEncoder(nn.Module): 30 | def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4, use_class_token=False): 31 | """ViT-like transformer block 32 | 33 | Args: 34 | in_channels (int): Input channels 35 | patch_size (int, optional): patch size. Defaults to 10. 36 | embedding_dim (int, optional): Embedding dimension in transformer model. Defaults to 128. 37 | num_heads (int, optional): number of attention heads. Defaults to 4. 38 | use_class_token (bool, optional): Whether to use extra token at the start for global accumulation (called as "class token"). Defaults to False. 
39 | """ 40 | super(PatchTransformerEncoder, self).__init__() 41 | self.use_class_token = use_class_token 42 | encoder_layers = nn.TransformerEncoderLayer( 43 | embedding_dim, num_heads, dim_feedforward=1024) 44 | self.transformer_encoder = nn.TransformerEncoder( 45 | encoder_layers, num_layers=4) # takes shape S,N,E 46 | 47 | self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim, 48 | kernel_size=patch_size, stride=patch_size, padding=0) 49 | 50 | def positional_encoding_1d(self, sequence_length, batch_size, embedding_dim, device='cpu'): 51 | """Generate positional encodings 52 | 53 | Args: 54 | sequence_length (int): Sequence length 55 | embedding_dim (int): Embedding dimension 56 | 57 | Returns: 58 | torch.Tensor SBE: Positional encodings 59 | """ 60 | position = torch.arange( 61 | 0, sequence_length, dtype=torch.float32, device=device).unsqueeze(1) 62 | index = torch.arange( 63 | 0, embedding_dim, 2, dtype=torch.float32, device=device).unsqueeze(0) 64 | div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim)) 65 | pos_encoding = position * div_term 66 | pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1) 67 | pos_encoding = pos_encoding.unsqueeze(1).repeat(1, batch_size, 1) 68 | return pos_encoding 69 | 70 | 71 | def forward(self, x): 72 | """Forward pass 73 | 74 | Args: 75 | x (torch.Tensor - NCHW): Input feature tensor 76 | 77 | Returns: 78 | torch.Tensor - SNE: Transformer output embeddings. S - sequence length (=HW/patch_size^2), N - batch size, E - embedding dim 79 | """ 80 | embeddings = self.embedding_convPxP(x).flatten( 81 | 2) # .shape = n,c,s = n, embedding_dim, s 82 | if self.use_class_token: 83 | # extra special token at start ? 84 | embeddings = nn.functional.pad(embeddings, (1, 0)) 85 | 86 | # change to S,N,E format required by transformer 87 | embeddings = embeddings.permute(2, 0, 1) 88 | S, N, E = embeddings.shape 89 | # dtype IS ADDED, NOT PRESENT IN THE MAINLINE 90 | embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device).to(dtype=embeddings.dtype) 91 | x = self.transformer_encoder(embeddings) # .shape = S, N, E 92 | return x 93 | -------------------------------------------------------------------------------- /dzoedepth/models/model_io.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import torch 26 | 27 | def load_state_dict(model, state_dict): 28 | """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict. 29 | 30 | DataParallel prefixes state_dict keys with 'module.' when saving. 31 | If the model is not a DataParallel model but the state_dict is, then prefixes are removed. 32 | If the model is a DataParallel model but the state_dict is not, then prefixes are added. 33 | """ 34 | state_dict = state_dict.get('model', state_dict) 35 | # if model is a DataParallel model, then state_dict keys are prefixed with 'module.' 36 | 37 | do_prefix = isinstance( 38 | model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) 39 | state = {} 40 | for k, v in state_dict.items(): 41 | if k.startswith('module.') and not do_prefix: 42 | k = k[7:] 43 | 44 | if not k.startswith('module.') and do_prefix: 45 | k = 'module.' + k 46 | 47 | state[k] = v 48 | 49 | model.load_state_dict(state, strict=False) 50 | print("Loaded successfully") 51 | return model 52 | 53 | 54 | def load_wts(model, checkpoint_path): 55 | ckpt = torch.load(checkpoint_path, map_location='cpu') 56 | return load_state_dict(model, ckpt) 57 | 58 | 59 | def load_state_dict_from_url(model, url, **kwargs): 60 | state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu', **kwargs) 61 | return load_state_dict(model, state_dict) 62 | 63 | 64 | def load_state_from_resource(model, resource: str): 65 | """Loads weights to the model from a given resource. A resource can be of following types: 66 | 1. URL. Prefixed with "url::" 67 | e.g. url::http(s)://url.resource.com/ckpt.pt 68 | 69 | 2. Local path. Prefixed with "local::" 70 | e.g. 
local::/path/to/ckpt.pt 71 | 72 | 73 | Args: 74 | model (torch.nn.Module): Model 75 | resource (str): resource string 76 | 77 | Returns: 78 | torch.nn.Module: Model with loaded weights 79 | """ 80 | print(f"Using pretrained resource {resource}") 81 | 82 | if resource.startswith('url::'): 83 | url = resource.split('url::')[1] 84 | return load_state_dict_from_url(model, url, progress=True) 85 | 86 | elif resource.startswith('local::'): 87 | path = resource.split('local::')[1] 88 | return load_wts(model, path) 89 | 90 | else: 91 | raise ValueError("Invalid resource type, only url:: and local:: are supported") 92 | -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_v1 import ZoeDepth 26 | 27 | all_versions = { 28 | "v1": ZoeDepth, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth/config_zoedepth.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ZoeDepth", 4 | "version_name": "v1", 5 | "n_bins": 64, 6 | "bin_embedding_dim": 128, 7 | "bin_centers_type": "softplus", 8 | "n_attractors":[16, 8, 4, 1], 9 | "attractor_alpha": 1000, 10 | "attractor_gamma": 2, 11 | "attractor_kind" : "mean", 12 | "attractor_type" : "inv", 13 | "midas_model_type" : "DPT_BEiT_L_384", 14 | "min_temp": 0.0212, 15 | "max_temp": 50.0, 16 | "output_distribution": "logbinomial", 17 | "memory_efficient": true, 18 | "inverse_midas": false, 19 | "img_size": [384, 512] 20 | }, 21 | 22 | "train": { 23 | "train_midas": true, 24 | "use_pretrained_midas": true, 25 | "trainer": "zoedepth", 26 | "epochs": 5, 27 | "bs": 16, 28 | "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, 29 | "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, 30 | "same_lr": false, 31 | "w_si": 1, 32 | "w_domain": 0.2, 33 | "w_reg": 0, 34 | "w_grad": 0, 35 | "avoid_boundary": false, 36 | "random_crop": false, 37 | "input_width": 640, 38 | "input_height": 480, 39 | "midas_lr_factor": 1, 40 | "encoder_lr_factor":10, 41 | "pos_enc_lr_factor":10, 42 | "freeze_midas_bn": true 43 | 44 | }, 45 | 46 | "infer":{ 47 | "train_midas": false, 48 | "use_pretrained_midas": false, 49 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt", 50 | "force_keep_ar": true 51 | }, 52 | 53 | "eval":{ 54 | "train_midas": false, 55 | "use_pretrained_midas": false, 56 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_N.pt" 57 | } 58 | } -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth/config_zoedepth_kitti.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "bin_centers_type": "normed", 4 | "img_size": [384, 768] 5 | }, 6 | 7 | "train": { 8 | }, 9 | 10 | "infer":{ 11 | "train_midas": false, 12 | "use_pretrained_midas": false, 13 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", 14 | "force_keep_ar": true 15 | }, 16 | 17 | "eval":{ 18 | "train_midas": false, 19 | "use_pretrained_midas": false, 20 | "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" 21 | } 22 | } -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth_nk/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 
12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from .zoedepth_nk_v1 import ZoeDepthNK 26 | 27 | all_versions = { 28 | "v1": ZoeDepthNK, 29 | } 30 | 31 | get_version = lambda v : all_versions[v] -------------------------------------------------------------------------------- /dzoedepth/models/zoedepth_nk/config_zoedepth_nk.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "name": "ZoeDepthNK", 4 | "version_name": "v1", 5 | "bin_conf" : [ 6 | { 7 | "name": "nyu", 8 | "n_bins": 64, 9 | "min_depth": 1e-3, 10 | "max_depth": 10.0 11 | }, 12 | { 13 | "name": "kitti", 14 | "n_bins": 64, 15 | "min_depth": 1e-3, 16 | "max_depth": 80.0 17 | } 18 | ], 19 | "bin_embedding_dim": 128, 20 | "bin_centers_type": "softplus", 21 | "n_attractors":[16, 8, 4, 1], 22 | "attractor_alpha": 1000, 23 | "attractor_gamma": 2, 24 | "attractor_kind" : "mean", 25 | "attractor_type" : "inv", 26 | "min_temp": 0.0212, 27 | "max_temp": 50.0, 28 | "memory_efficient": true, 29 | "midas_model_type" : "DPT_BEiT_L_384", 30 | "img_size": [384, 512] 31 | }, 32 | 33 | "train": { 34 | "train_midas": true, 35 | "use_pretrained_midas": true, 36 | "trainer": "zoedepth_nk", 37 | "epochs": 5, 38 | "bs": 16, 39 | "optim_kwargs": {"lr": 0.0002512, "wd": 0.01}, 40 | "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, 41 | "same_lr": false, 42 | "w_si": 1, 43 | "w_domain": 100, 44 | "avoid_boundary": false, 45 | "random_crop": false, 46 | "input_width": 640, 47 | "input_height": 480, 48 | "w_grad": 0, 49 | "w_reg": 0, 50 | "midas_lr_factor": 10, 51 | "encoder_lr_factor":10, 52 | "pos_enc_lr_factor":10 53 | }, 54 | 55 | "infer": { 56 | "train_midas": false, 57 | "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", 58 | "use_pretrained_midas": false, 59 | "force_keep_ar": true 60 | }, 61 | 62 | "eval": { 63 | "train_midas": false, 64 | "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", 65 | "use_pretrained_midas": false 66 | } 67 | } -------------------------------------------------------------------------------- /dzoedepth/trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/dzoedepth/trainers/__init__.py -------------------------------------------------------------------------------- /dzoedepth/trainers/builder.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and 
associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | from importlib import import_module 26 | 27 | 28 | def get_trainer(config): 29 | """Builds and returns a trainer based on the config. 30 | 31 | Args: 32 | config (dict): the config dict (typically constructed using utils.config.get_config) 33 | config.trainer (str): the name of the trainer to use. The module named "{config.trainer}_trainer" must exist in trainers root module 34 | 35 | Raises: 36 | ValueError: If the specified trainer does not exist under trainers/ folder 37 | 38 | Returns: 39 | Trainer (inherited from zoedepth.trainers.BaseTrainer): The Trainer object 40 | """ 41 | assert "trainer" in config and config.trainer is not None and config.trainer != '', "Trainer not specified. Config: {0}".format( 42 | config) 43 | try: 44 | Trainer = getattr(import_module( 45 | f"zoedepth.trainers.{config.trainer}_trainer"), 'Trainer') 46 | except ModuleNotFoundError as e: 47 | raise ValueError(f"Trainer {config.trainer}_trainer not found.") from e 48 | return Trainer 49 | -------------------------------------------------------------------------------- /dzoedepth/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | -------------------------------------------------------------------------------- /dzoedepth/utils/arg_utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def infer_type(x): # hacky way to infer type from string args 4 | if not isinstance(x, str): 5 | return x 6 | 7 | try: 8 | x = int(x) 9 | return x 10 | except ValueError: 11 | pass 12 | 13 | try: 14 | x = float(x) 15 | return x 16 | except ValueError: 17 | pass 18 | 19 | return x 20 | 21 | 22 | def parse_unknown(unknown_args): 23 | clean = [] 24 | for a in unknown_args: 25 | if "=" in a: 26 | k, v = a.split("=") 27 | clean.extend([k, v]) 28 | else: 29 | clean.append(a) 30 | 31 | keys = clean[::2] 32 | values = clean[1::2] 33 | return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)} 34 | -------------------------------------------------------------------------------- /dzoedepth/utils/easydict/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | EasyDict 3 | Copy/pasted from https://github.com/makinacorpus/easydict 4 | Original author: Mathieu Leplatre 5 | """ 6 | 7 | class EasyDict(dict): 8 | """ 9 | Get attributes 10 | 11 | >>> d = EasyDict({'foo':3}) 12 | >>> d['foo'] 13 | 3 14 | >>> d.foo 15 | 3 16 | >>> d.bar 17 | Traceback (most recent call last): 18 | ... 19 | AttributeError: 'EasyDict' object has no attribute 'bar' 20 | 21 | Works recursively 22 | 23 | >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}}) 24 | >>> isinstance(d.bar, dict) 25 | True 26 | >>> d.bar.x 27 | 1 28 | 29 | Bullet-proof 30 | 31 | >>> EasyDict({}) 32 | {} 33 | >>> EasyDict(d={}) 34 | {} 35 | >>> EasyDict(None) 36 | {} 37 | >>> d = {'a': 1} 38 | >>> EasyDict(**d) 39 | {'a': 1} 40 | >>> EasyDict((('a', 1), ('b', 2))) 41 | {'a': 1, 'b': 2} 42 | 43 | Set attributes 44 | 45 | >>> d = EasyDict() 46 | >>> d.foo = 3 47 | >>> d.foo 48 | 3 49 | >>> d.bar = {'prop': 'value'} 50 | >>> d.bar.prop 51 | 'value' 52 | >>> d 53 | {'foo': 3, 'bar': {'prop': 'value'}} 54 | >>> d.bar.prop = 'newer' 55 | >>> d.bar.prop 56 | 'newer' 57 | 58 | 59 | Values extraction 60 | 61 | >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]}) 62 | >>> isinstance(d.bar, list) 63 | True 64 | >>> from operator import attrgetter 65 | >>> list(map(attrgetter('x'), d.bar)) 66 | [1, 3] 67 | >>> list(map(attrgetter('y'), d.bar)) 68 | [2, 4] 69 | >>> d = EasyDict() 70 | >>> list(d.keys()) 71 | [] 72 | >>> d = EasyDict(foo=3, bar=dict(x=1, y=2)) 73 | >>> d.foo 74 | 3 75 | >>> d.bar.x 76 | 1 77 | 78 | Still like a dict though 79 | 80 | >>> o = EasyDict({'clean':True}) 81 | >>> list(o.items()) 82 | [('clean', True)] 83 | 84 | And like a class 85 | 86 | >>> class Flower(EasyDict): 87 | ... power = 1 88 | ... 89 | >>> f = Flower() 90 | >>> f.power 91 | 1 92 | >>> f = Flower({'height': 12}) 93 | >>> f.height 94 | 12 95 | >>> f['power'] 96 | 1 97 | >>> sorted(f.keys()) 98 | ['height', 'power'] 99 | 100 | update and pop items 101 | >>> d = EasyDict(a=1, b='2') 102 | >>> e = EasyDict(c=3.0, a=9.0) 103 | >>> d.update(e) 104 | >>> d.c 105 | 3.0 106 | >>> d['c'] 107 | 3.0 108 | >>> d.get('c') 109 | 3.0 110 | >>> d.update(a=4, b=4) 111 | >>> d.b 112 | 4 113 | >>> d.pop('a') 114 | 4 115 | >>> d.a 116 | Traceback (most recent call last): 117 | ... 
118 | AttributeError: 'EasyDict' object has no attribute 'a' 119 | """ 120 | def __init__(self, d=None, **kwargs): 121 | if d is None: 122 | d = {} 123 | else: 124 | d = dict(d) 125 | if kwargs: 126 | d.update(**kwargs) 127 | for k, v in d.items(): 128 | setattr(self, k, v) 129 | # Class attributes 130 | for k in self.__class__.__dict__.keys(): 131 | if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'): 132 | setattr(self, k, getattr(self, k)) 133 | 134 | def __setattr__(self, name, value): 135 | if isinstance(value, (list, tuple)): 136 | value = [self.__class__(x) 137 | if isinstance(x, dict) else x for x in value] 138 | elif isinstance(value, dict) and not isinstance(value, self.__class__): 139 | value = self.__class__(value) 140 | super(EasyDict, self).__setattr__(name, value) 141 | super(EasyDict, self).__setitem__(name, value) 142 | 143 | __setitem__ = __setattr__ 144 | 145 | def update(self, e=None, **f): 146 | d = e or dict() 147 | d.update(f) 148 | for k in d: 149 | setattr(self, k, d[k]) 150 | 151 | def pop(self, k, d=None): 152 | delattr(self, k) 153 | return super(EasyDict, self).pop(k, d) 154 | 155 | 156 | if __name__ == "__main__": 157 | import doctest 158 | doctest.testmod() -------------------------------------------------------------------------------- /dzoedepth/utils/geometry.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2022 Intelligent Systems Lab Org 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | # File author: Shariq Farooq Bhat 24 | 25 | import numpy as np 26 | 27 | def get_intrinsics(H,W): 28 | """ 29 | Intrinsics for a pinhole camera model. 30 | Assume fov of 55 degrees and central principal point. 
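    With this convention the focal length is f = 0.5 * W / tan(0.5 * fov),
    which for fov = 55 degrees works out to roughly f = 0.96 * W.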
31 | """ 32 | f = 0.5 * W / np.tan(0.5 * 55 * np.pi / 180.0) 33 | cx = 0.5 * W 34 | cy = 0.5 * H 35 | return np.array([[f, 0, cx], 36 | [0, f, cy], 37 | [0, 0, 1]]) 38 | 39 | def depth_to_points(depth, R=None, t=None): 40 | 41 | K = get_intrinsics(depth.shape[1], depth.shape[2]) 42 | Kinv = np.linalg.inv(K) 43 | if R is None: 44 | R = np.eye(3) 45 | if t is None: 46 | t = np.zeros(3) 47 | 48 | # M converts from your coordinate to PyTorch3D's coordinate system 49 | M = np.eye(3) 50 | M[0, 0] = -1.0 51 | M[1, 1] = -1.0 52 | 53 | height, width = depth.shape[1:3] 54 | 55 | x = np.arange(width) 56 | y = np.arange(height) 57 | coord = np.stack(np.meshgrid(x, y), -1) 58 | coord = np.concatenate((coord, np.ones_like(coord)[:, :, [0]]), -1) # z=1 59 | coord = coord.astype(np.float32) 60 | # coord = torch.as_tensor(coord, dtype=torch.float32, device=device) 61 | coord = coord[None] # bs, h, w, 3 62 | 63 | D = depth[:, :, :, None, None] 64 | # print(D.shape, Kinv[None, None, None, ...].shape, coord[:, :, :, :, None].shape ) 65 | pts3D_1 = D * Kinv[None, None, None, ...] @ coord[:, :, :, :, None] 66 | # pts3D_1 live in your coordinate system. Convert them to Py3D's 67 | pts3D_1 = M[None, None, None, ...] @ pts3D_1 68 | # from reference to targe tviewpoint 69 | pts3D_2 = R[None, None, None, ...] @ pts3D_1 + t[None, None, None, :, None] 70 | # pts3D_2 = pts3D_1 71 | # depth_2 = pts3D_2[:, :, :, 2, :] # b,1,h,w 72 | return pts3D_2[:, :, :, :3, 0][0] 73 | 74 | 75 | def create_triangles(h, w, mask=None): 76 | """ 77 | Reference: https://github.com/google-research/google-research/blob/e96197de06613f1b027d20328e06d69829fa5a89/infinite_nature/render_utils.py#L68 78 | Creates mesh triangle indices from a given pixel grid size. 79 | This function is not and need not be differentiable as triangle indices are 80 | fixed. 81 | Args: 82 | h: (int) denoting the height of the image. 83 | w: (int) denoting the width of the image. 
84 | Returns: 85 | triangles: 2D numpy array of indices (int) with shape (2(W-1)(H-1) x 3) 86 | """ 87 | x, y = np.meshgrid(range(w - 1), range(h - 1)) 88 | tl = y * w + x 89 | tr = y * w + x + 1 90 | bl = (y + 1) * w + x 91 | br = (y + 1) * w + x + 1 92 | triangles = np.array([tl, bl, tr, br, tr, bl]) 93 | triangles = np.transpose(triangles, (1, 2, 0)).reshape( 94 | ((w - 1) * (h - 1) * 2, 3)) 95 | if mask is not None: 96 | mask = mask.reshape(-1) 97 | triangles = triangles[mask[triangles].all(1)] 98 | return triangles 99 | -------------------------------------------------------------------------------- /examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/examples.png -------------------------------------------------------------------------------- /inpaint/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2020 Virginia Tech Vision and Learning Lab 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | ------------------ LICENSE FOR MiDaS -------------------- 25 | 26 | MIT License 27 | 28 | Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining a copy 31 | of this software and associated documentation files (the "Software"), to deal 32 | in the Software without restriction, including without limitation the rights 33 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 34 | copies of the Software, and to permit persons to whom the Software is 35 | furnished to do so, subject to the following conditions: 36 | 37 | The above copyright notice and this permission notice shall be included in all 38 | copies or substantial portions of the Software. 39 | 40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 46 | SOFTWARE. 
47 | 48 | --------------------------- LICENSE FOR EdgeConnect -------------------------------- 49 | 50 | Attribution-NonCommercial 4.0 International -------------------------------------------------------------------------------- /inpaint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/inpaint/__init__.py -------------------------------------------------------------------------------- /inpaint/argument.yml: -------------------------------------------------------------------------------- 1 | depth_edge_model_ckpt: checkpoints/edge-model.pth 2 | depth_feat_model_ckpt: checkpoints/depth-model.pth 3 | rgb_feat_model_ckpt: checkpoints/color-model.pth 4 | MiDaS_model_ckpt: MiDaS/model.pt 5 | use_boostmonodepth: True 6 | fps: 40 7 | num_frames: 240 8 | x_shift_range: [0.00, 0.00, -0.015, -0.015] 9 | y_shift_range: [0.00, 0.00, -0.015, -0.00] 10 | z_shift_range: [-0.05, -0.05, -0.05, -0.05] 11 | traj_types: ['double-straight-line', 'double-straight-line', 'circle', 'circle'] 12 | video_postfix: ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'] 13 | specific: '' 14 | longer_side_len: 960 15 | src_folder: image 16 | depth_folder: depth 17 | mesh_folder: mesh 18 | video_folder: video 19 | load_ply: False 20 | save_ply: True 21 | inference_video: True 22 | gpu_ids: 0 23 | offscreen_rendering: False 24 | img_format: '.jpg' 25 | depth_format: '.npy' 26 | require_midas: True 27 | depth_threshold: 0.04 28 | ext_edge_threshold: 0.002 29 | sparse_iter: 5 30 | filter_size: [7, 7, 5, 5, 5] 31 | sigma_s: 4.0 32 | sigma_r: 0.5 33 | redundant_number: 12 34 | background_thickness: 70 35 | context_thickness: 140 36 | background_thickness_2: 70 37 | context_thickness_2: 70 38 | discount_factor: 1.00 39 | log_depth: True 40 | largest_size: 512 41 | depth_edge_dilate: 10 42 | depth_edge_dilate_2: 5 43 | extrapolate_border: True 44 | extrapolation_thickness: 60 45 | repeat_inpaint_edge: True 46 | crop_border: [0.03, 0.03, 0.05, 0.03] 47 | anti_flickering: True 48 | -------------------------------------------------------------------------------- /inpaint/boostmonodepth_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import numpy as np 5 | import imageio 6 | from MiDaS.MiDaS_utils import write_depth 7 | 8 | BOOST_BASE = 'BoostingMonocularDepth' 9 | 10 | BOOST_INPUTS = 'inputs' 11 | BOOST_OUTPUTS = 'outputs' 12 | 13 | def run_boostmonodepth(img_names, src_folder, depth_folder): 14 | 15 | if not isinstance(img_names, list): 16 | img_names = [img_names] 17 | 18 | # remove irrelevant files first 19 | clean_folder(os.path.join(BOOST_BASE, BOOST_INPUTS)) 20 | clean_folder(os.path.join(BOOST_BASE, BOOST_OUTPUTS)) 21 | 22 | tgt_names = [] 23 | for img_name in img_names: 24 | base_name = os.path.basename(img_name) 25 | tgt_name = os.path.join(BOOST_BASE, BOOST_INPUTS, base_name) 26 | os.system(f'cp {img_name} {tgt_name}') 27 | 28 | # keep only the file name here. 29 | # they save all depth as .png file 30 | tgt_names.append(os.path.basename(tgt_name).replace('.jpg', '.png')) 31 | 32 | os.system(f'cd {BOOST_BASE} && python run.py --Final --data_dir {BOOST_INPUTS}/ --output_dir {BOOST_OUTPUTS} --depthNet 0') 33 | 34 | for i, (img_name, tgt_name) in enumerate(zip(img_names, tgt_names)): 35 | img = imageio.imread(img_name) 36 | H, W = img.shape[:2] 37 | scale = 640. 
/ max(H, W) 38 | 39 | # resize and save depth 40 | target_height, target_width = int(round(H * scale)), int(round(W * scale)) 41 | depth = imageio.imread(os.path.join(BOOST_BASE, BOOST_OUTPUTS, tgt_name)) 42 | depth = np.array(depth).astype(np.float32) 43 | depth = resize_depth(depth, target_width, target_height) 44 | np.save(os.path.join(depth_folder, tgt_name.replace('.png', '.npy')), depth / 32768. - 1.) 45 | write_depth(os.path.join(depth_folder, tgt_name.replace('.png', '')), depth) 46 | 47 | def clean_folder(folder, img_exts=['.png', '.jpg', '.npy']): 48 | 49 | for img_ext in img_exts: 50 | paths_to_check = os.path.join(folder, f'*{img_ext}') 51 | if len(glob.glob(paths_to_check)) == 0: 52 | continue 53 | print(paths_to_check) 54 | os.system(f'rm {paths_to_check}') 55 | 56 | def resize_depth(depth, width, height): 57 | """Resize numpy (or image read by imageio) depth map 58 | 59 | Args: 60 | depth (numpy): depth 61 | width (int): image width 62 | height (int): image height 63 | 64 | Returns: 65 | array: processed depth 66 | """ 67 | depth = cv2.blur(depth, (3, 3)) 68 | return cv2.resize(depth, (width, height), interpolation=cv2.INTER_AREA) 69 | -------------------------------------------------------------------------------- /inpaint/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | fb_status=$(wget --spider -S https://filebox.ece.vt.edu/ 2>&1 | grep "HTTP/1.1 200 OK") 3 | 4 | mkdir checkpoints 5 | 6 | echo "downloading from filebox ..." 7 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth 8 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth 9 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth 10 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/model.pt 11 | 12 | mv color-model.pth checkpoints/. 13 | mv depth-model.pth checkpoints/. 14 | mv edge-model.pth checkpoints/. 15 | mv model.pt MiDaS/. 16 | 17 | echo "cloning from BoostingMonocularDepth ..." 18 | git clone https://github.com/compphoto/BoostingMonocularDepth.git 19 | mkdir -p BoostingMonocularDepth/pix2pix/checkpoints/mergemodel/ 20 | 21 | echo "downloading mergenet weights ..." 22 | wget https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/latest_net_G.pth 23 | mv latest_net_G.pth BoostingMonocularDepth/pix2pix/checkpoints/mergemodel/ 24 | wget https://github.com/intel-isl/MiDaS/releases/download/v2/model-f46da743.pt 25 | mv model-f46da743.pt BoostingMonocularDepth/midas/model.pt 26 | -------------------------------------------------------------------------------- /inpaint/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.2.0.32 2 | vispy==0.6.4 3 | moviepy==1.0.2 4 | transforms3d==0.3.1 5 | networkx==2.3 6 | cynetworkx 7 | scikit-image 8 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | # Installs dependencies 2 | # Make sure to add to requirements.txt - it can be used for the standalone mode 3 | 4 | import launch 5 | import platform 6 | import sys 7 | import importlib.metadata 8 | 9 | # TODO: some dependencies apparently being reinstalled on every run. Investigate and fix. 
10 | 11 | if sys.version_info < (3, 8): 12 | launch.run_pip("install importlib-metadata", "importlib-metadata for depthmap script") 13 | import importlib_metadata 14 | else: 15 | import importlib.metadata as importlib_metadata 16 | if not launch.is_installed('packaging'): 17 | launch.run_pip("install packaging", "packaging requirement for depthmap script") 18 | from packaging.version import Version 19 | 20 | def ensure(module_name, min_version=None): 21 | if launch.is_installed(module_name): 22 | if min_version is None or Version(importlib_metadata.version(module_name)) >= Version(min_version): 23 | return 24 | requirement = f'{module_name}>={min_version}' if min_version is not None else module_name 25 | cmd = f'install "{requirement}"' 26 | msg = f'{requirement} requirement for depthmap script' 27 | launch.run_pip(cmd, msg) 28 | 29 | 30 | ensure('timm', '0.9.2') # For midas, specified just in case 31 | 32 | ensure('matplotlib') 33 | 34 | ensure('trimesh') 35 | 36 | ensure('numba', '0.57.0') 37 | ensure('vispy', '0.13.0') 38 | 39 | ensure('rembg', '2.0.50') 40 | 41 | if not launch.is_installed("moviepy"): 42 | launch.run_pip('install "moviepy==1.0.2"', "moviepy requirement for depthmap script") 43 | ensure('transforms3d', '0.4.1') 44 | 45 | ensure('diffusers', '0.20.1') # For Marigold 46 | 47 | ensure('imageio') # 2.4.1 48 | try: # Dirty hack to not reinstall every time 49 | importlib_metadata.version('imageio-ffmpeg') 50 | except: 51 | ensure('imageio-ffmpeg') 52 | 53 | 54 | if not launch.is_installed("networkx"): 55 | launch.run_pip('install "networkx==2.5"', "networkx requirement for depthmap script") 56 | if platform.system() == 'Windows': 57 | ensure('pyqt5') 58 | 59 | if platform.system() == 'Darwin': 60 | ensure('pyqt6') 61 | ensure('PyOpenGL', '3.1.7') 62 | 63 | # Depth Anything 64 | def get_installed_version(package: str): 65 | try: 66 | return importlib.metadata.version(package) 67 | except Exception: 68 | return None 69 | def try_install_from_wheel(pkg_name: str, wheel_url: str): 70 | if get_installed_version(pkg_name) is not None: 71 | return 72 | try: 73 | launch.run_pip(f"install {wheel_url}", f" {pkg_name} requirement for depthmap script") 74 | except Exception as e: 75 | print('Failed to install wheel for Depth Anything support. It won\'t work.') 76 | try_install_from_wheel( 77 | "depth_anything", 78 | "https://github.com/huchenlei/Depth-Anything/releases/download/v1.0.0/depth_anything-2024.1.22.0-py2.py3-none-any.whl") 79 | -------------------------------------------------------------------------------- /javascript/depthmap.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/javascript/depthmap.js -------------------------------------------------------------------------------- /lib/LICENSE: -------------------------------------------------------------------------------- 1 | Adobe Research License Terms 2 | 3 | 1. You may use, reproduce, modify, and display the research materials provided under this license (the “Research 4 | Materials”) solely for noncommercial purposes. Noncommercial purposes include academic research, teaching, and 5 | testing, but do not include commercial licensing or distribution, development of commercial products, or any other 6 | activity which results in commercial gain. You may not redistribute the Research Materials. 7 | 8 | 2.
You agree to (a) comply with all laws and regulations applicable to your use of the Research Materials under this license, 9 | including but not limited to any import or export laws; (b) preserve any copyright or other notices from the Research 10 | Materials; and (c) for any Research Materials in object code, not attempt to modify, reverse engineer, or decompile 11 | such Research Materials except as permitted by applicable law. 12 | 13 | 3. THE RESEARCH MATERIALS ARE PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, AND YOU ASSUME ALL RISKS 14 | ASSOCIATED WITH THEIR USE. IN NO EVENT WILL ANYONE BE LIABLE TO YOU FOR ANY ACTUAL, INCIDENTAL, SPECIAL, 15 | OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR IN CONNECTION WITH USE OF THE RESEARCH MATERIALS. 16 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lib/multi_depth_model_woauxi.py: -------------------------------------------------------------------------------- 1 | from lib import network_auxi as network 2 | from lib.net_tools import get_func 3 | import torch 4 | import torch.nn as nn 5 | 6 | class RelDepthModel(nn.Module): 7 | def __init__(self, backbone='resnet50'): 8 | super(RelDepthModel, self).__init__() 9 | if backbone == 'resnet50': 10 | encoder = 'resnet50_stride32' 11 | elif backbone == 'resnext101': 12 | encoder = 'resnext101_stride32x8d' 13 | self.depth_model = DepthModel(encoder) 14 | 15 | def inference(self, rgb): 16 | with torch.no_grad(): 17 | input = rgb.cuda() 18 | depth = self.depth_model(input) 19 | #pred_depth_out = depth - depth.min() + 0.01 20 | return depth #pred_depth_out 21 | 22 | 23 | class DepthModel(nn.Module): 24 | def __init__(self, encoder): 25 | super(DepthModel, self).__init__() 26 | backbone = network.__name__.split('.')[-1] + '.' + encoder 27 | self.encoder_modules = get_func(backbone)() 28 | self.decoder_modules = network.Decoder() 29 | 30 | def forward(self, x): 31 | lateral_out = self.encoder_modules(x) 32 | out_logit = self.decoder_modules(lateral_out) 33 | return out_logit -------------------------------------------------------------------------------- /lib/net_tools.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import torch 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def get_func(func_name): 8 | """Helper to return a function object by name. func_name must identify a 9 | function in this module or the path to a function relative to the base 10 | 'modeling' module. 11 | """ 12 | if func_name == '': 13 | return None 14 | try: 15 | parts = func_name.split('.') 16 | # Refers to a function in this module 17 | if len(parts) == 1: 18 | return globals()[parts[0]] 19 | # Otherwise, assume we're referencing a module under modeling 20 | module_name = 'lib.' + '.'.join(parts[:-1]) 21 | module = importlib.import_module(module_name) 22 | return getattr(module, parts[-1]) 23 | except Exception: 24 | print('Failed to find function: %s' % func_name) 25 | raise 26 | 27 | def load_ckpt(args, depth_model, shift_model, focal_model): 28 | """ 29 | Load checkpoint.
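    Expects args.load_ckpt to point to a checkpoint dict containing a 'depth_model'
    state dict (and optionally 'shift_model' / 'focal_model'); a DataParallel
    'module.' prefix on the keys, if present, is stripped before loading.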
30 | """ 31 | if os.path.isfile(args.load_ckpt): 32 | print("loading checkpoint %s" % args.load_ckpt) 33 | checkpoint = torch.load(args.load_ckpt) 34 | if shift_model is not None: 35 | shift_model.load_state_dict(strip_prefix_if_present(checkpoint['shift_model'], 'module.'), 36 | strict=True) 37 | if focal_model is not None: 38 | focal_model.load_state_dict(strip_prefix_if_present(checkpoint['focal_model'], 'module.'), 39 | strict=True) 40 | depth_model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), 41 | strict=True) 42 | del checkpoint 43 | torch.cuda.empty_cache() 44 | 45 | 46 | def strip_prefix_if_present(state_dict, prefix): 47 | keys = sorted(state_dict.keys()) 48 | if not all(key.startswith(prefix) for key in keys): 49 | return state_dict 50 | stripped_state_dict = OrderedDict() 51 | for key, value in state_dict.items(): 52 | stripped_state_dict[key.replace(prefix, "")] = value 53 | return stripped_state_dict -------------------------------------------------------------------------------- /lib/spvcnn_utils.py: -------------------------------------------------------------------------------- 1 | import torchsparse.nn.functional as spf 2 | from torchsparse.point_tensor import PointTensor 3 | from torchsparse.utils.kernel_region import * 4 | from torchsparse.utils.helpers import * 5 | 6 | 7 | __all__ = ['initial_voxelize', 'point_to_voxel', 'voxel_to_point'] 8 | 9 | 10 | # z: PointTensor 11 | # return: SparseTensor 12 | def initial_voxelize(z, init_res, after_res): 13 | new_float_coord = torch.cat( 14 | [(z.C[:, :3] * init_res) / after_res, z.C[:, -1].view(-1, 1)], 1) 15 | 16 | pc_hash = spf.sphash(torch.floor(new_float_coord).int()) 17 | sparse_hash = torch.unique(pc_hash) 18 | idx_query = spf.sphashquery(pc_hash, sparse_hash) 19 | counts = spf.spcount(idx_query.int(), len(sparse_hash)) 20 | 21 | inserted_coords = spf.spvoxelize(torch.floor(new_float_coord), idx_query, 22 | counts) 23 | inserted_coords = torch.round(inserted_coords).int() 24 | inserted_feat = spf.spvoxelize(z.F, idx_query, counts) 25 | 26 | new_tensor = SparseTensor(inserted_feat, inserted_coords, 1) 27 | new_tensor.check() 28 | z.additional_features['idx_query'][1] = idx_query 29 | z.additional_features['counts'][1] = counts 30 | z.C = new_float_coord 31 | 32 | return new_tensor 33 | 34 | 35 | # x: SparseTensor, z: PointTensor 36 | # return: SparseTensor 37 | def point_to_voxel(x, z): 38 | if z.additional_features is None or z.additional_features.get('idx_query') is None\ 39 | or z.additional_features['idx_query'].get(x.s) is None: 40 | #pc_hash = hash_gpu(torch.floor(z.C).int()) 41 | pc_hash = spf.sphash( 42 | torch.cat([ 43 | torch.floor(z.C[:, :3] / x.s).int() * x.s, 44 | z.C[:, -1].int().view(-1, 1) 45 | ], 1)) 46 | sparse_hash = spf.sphash(x.C) 47 | idx_query = spf.sphashquery(pc_hash, sparse_hash) 48 | counts = spf.spcount(idx_query.int(), x.C.shape[0]) 49 | z.additional_features['idx_query'][x.s] = idx_query 50 | z.additional_features['counts'][x.s] = counts 51 | else: 52 | idx_query = z.additional_features['idx_query'][x.s] 53 | counts = z.additional_features['counts'][x.s] 54 | 55 | inserted_feat = spf.spvoxelize(z.F, idx_query, counts) 56 | new_tensor = SparseTensor(inserted_feat, x.C, x.s) 57 | new_tensor.coord_maps = x.coord_maps 58 | new_tensor.kernel_maps = x.kernel_maps 59 | 60 | return new_tensor 61 | 62 | 63 | # x: SparseTensor, z: PointTensor 64 | # return: PointTensor 65 | def voxel_to_point(x, z, nearest=False): 66 | if z.idx_query is None or z.weights is None or 
z.idx_query.get( 67 | x.s) is None or z.weights.get(x.s) is None: 68 | kr = KernelRegion(2, x.s, 1) 69 | off = kr.get_kernel_offset().to(z.F.device) 70 | #old_hash = kernel_hash_gpu(torch.floor(z.C).int(), off) 71 | old_hash = spf.sphash( 72 | torch.cat([ 73 | torch.floor(z.C[:, :3] / x.s).int() * x.s, 74 | z.C[:, -1].int().view(-1, 1) 75 | ], 1), off) 76 | pc_hash = spf.sphash(x.C.to(z.F.device)) 77 | idx_query = spf.sphashquery(old_hash, pc_hash) 78 | weights = spf.calc_ti_weights(z.C, idx_query, 79 | scale=x.s).transpose(0, 1).contiguous() 80 | idx_query = idx_query.transpose(0, 1).contiguous() 81 | if nearest: 82 | weights[:, 1:] = 0. 83 | idx_query[:, 1:] = -1 84 | new_feat = spf.spdevoxelize(x.F, idx_query, weights) 85 | new_tensor = PointTensor(new_feat, 86 | z.C, 87 | idx_query=z.idx_query, 88 | weights=z.weights) 89 | new_tensor.additional_features = z.additional_features 90 | new_tensor.idx_query[x.s] = idx_query 91 | new_tensor.weights[x.s] = weights 92 | z.idx_query[x.s] = idx_query 93 | z.weights[x.s] = weights 94 | 95 | else: 96 | new_feat = spf.spdevoxelize(x.F, z.idx_query.get(x.s), z.weights.get(x.s)) 97 | new_tensor = PointTensor(new_feat, 98 | z.C, 99 | idx_query=z.idx_query, 100 | weights=z.weights) 101 | new_tensor.additional_features = z.additional_features 102 | 103 | return new_tensor 104 | 105 | 106 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # This launches DepthMap without the AUTOMATIC1111/stable-diffusion-webui 2 | 3 | import argparse 4 | import os 5 | import pathlib 6 | 7 | import src.misc 8 | 9 | 10 | def maybe_chdir(): 11 | """Detects if DepthMap was installed as a stable-diffusion-webui script, but run without current directory set to 12 | the stable-diffusion-webui root. Changes current directory if needed. 
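    Concretely, it walks up from this file's location to the extension directory
    (src.misc.REPOSITORY_NAME); if that directory sits under 'extensions' and the
    directory two levels up looks like a stable-diffusion-webui checkout, it
    switches the working directory there.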
13 | This is to avoid re-downloading models and putting results into a wrong folder.""" 14 | try: 15 | file_path = pathlib.Path(__file__) 16 | path = file_path.parts 17 | while len(path) > 0 and path[-1] != src.misc.REPOSITORY_NAME: 18 | path = path[:-1] 19 | if len(path) >= 2 and path[-1] == src.misc.REPOSITORY_NAME and path[-2] == "extensions": 20 | path = path[:-2] 21 | listdir = os.listdir(str(pathlib.Path(*path))) 22 | if 'launch.py' in listdir and 'webui.py' in listdir: 23 | os.chdir(str(pathlib.Path(*path))) 24 | except: 25 | pass 26 | 27 | 28 | if __name__ == '__main__': 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--share", help="Create public link", action='store_true') 31 | parser.add_argument("--listen", help="Listen on 0.0.0.0, allowing access from other devices on the network", action='store_true') 32 | parser.add_argument("--no_chdir", help="Do not try to use the root of stable-diffusion-webui", action='store_true') 33 | args = parser.parse_args() 34 | 35 | print(f"{src.misc.SCRIPT_FULL_NAME} running in standalone mode!") 36 | if not args.no_chdir: 37 | maybe_chdir() 38 | server_name = "0.0.0.0" if args.listen else None 39 | import src.common_ui 40 | src.common_ui.on_ui_tabs().launch(share=args.share, server_name=server_name) 41 | -------------------------------------------------------------------------------- /options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/options.png -------------------------------------------------------------------------------- /pix2pix/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Jun-Yan Zhu and Taesung Park 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | 26 | --------------------------- LICENSE FOR pix2pix -------------------------------- 27 | BSD License 28 | 29 | For pix2pix software 30 | Copyright (c) 2016, Phillip Isola and Jun-Yan Zhu 31 | All rights reserved.
32 | 33 | Redistribution and use in source and binary forms, with or without 34 | modification, are permitted provided that the following conditions are met: 35 | 36 | * Redistributions of source code must retain the above copyright notice, this 37 | list of conditions and the following disclaimer. 38 | 39 | * Redistributions in binary form must reproduce the above copyright notice, 40 | this list of conditions and the following disclaimer in the documentation 41 | and/or other materials provided with the distribution. 42 | 43 | ----------------------------- LICENSE FOR DCGAN -------------------------------- 44 | BSD License 45 | 46 | For dcgan.torch software 47 | 48 | Copyright (c) 2015, Facebook, Inc. All rights reserved. 49 | 50 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 51 | 52 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 53 | 54 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 55 | 56 | Neither the name Facebook nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 57 | 58 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 59 | -------------------------------------------------------------------------------- /pix2pix/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/e4df29bc557ac86d33f0f88a9f07158b90fda39d/pix2pix/__init__.py -------------------------------------------------------------------------------- /pix2pix/data/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes all the modules related to data loading and preprocessing 2 | 3 | To add a custom dataset class called 'dummy', you need to add a file called 'dummy_dataset.py' and define a subclass 'DummyDataset' inherited from BaseDataset. 4 | You need to implement four functions: 5 | -- <__init__>: initialize the class, first call BaseDataset.__init__(self, opt). 6 | -- <__len__>: return the size of dataset. 7 | -- <__getitem__>: get a data point from data loader. 8 | -- <modify_commandline_options>: (optionally) add dataset-specific options and set default options. 9 | 10 | Now you can use the dataset class by specifying flag '--dataset_mode dummy'. 11 | See our template dataset class 'template_dataset.py' for more details.
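For example, within this project '--dataset_mode depthmerge' resolves to
pix2pix/data/depthmerge_dataset.py and the DepthMergeDataset class defined there.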
12 | """ 13 | import importlib 14 | import torch.utils.data 15 | from pix2pix.data.base_dataset import BaseDataset 16 | 17 | 18 | def find_dataset_using_name(dataset_name): 19 | """Import the module "data/[dataset_name]_dataset.py". 20 | 21 | In the file, the class called DatasetNameDataset() will 22 | be instantiated. It has to be a subclass of BaseDataset, 23 | and it is case-insensitive. 24 | """ 25 | dataset_filename = "pix2pix.data." + dataset_name + "_dataset" 26 | datasetlib = importlib.import_module(dataset_filename) 27 | 28 | dataset = None 29 | target_dataset_name = dataset_name.replace('_', '') + 'dataset' 30 | for name, cls in datasetlib.__dict__.items(): 31 | if name.lower() == target_dataset_name.lower() \ 32 | and issubclass(cls, BaseDataset): 33 | dataset = cls 34 | 35 | if dataset is None: 36 | raise NotImplementedError("In %s.py, there should be a subclass of BaseDataset with class name that matches %s in lowercase." % (dataset_filename, target_dataset_name)) 37 | 38 | return dataset 39 | 40 | 41 | def get_option_setter(dataset_name): 42 | """Return the static method of the dataset class.""" 43 | dataset_class = find_dataset_using_name(dataset_name) 44 | return dataset_class.modify_commandline_options 45 | 46 | 47 | def create_dataset(opt): 48 | """Create a dataset given the option. 49 | 50 | This function wraps the class CustomDatasetDataLoader. 51 | This is the main interface between this package and 'train.py'/'test.py' 52 | 53 | Example: 54 | >>> from data import create_dataset 55 | >>> dataset = create_dataset(opt) 56 | """ 57 | data_loader = CustomDatasetDataLoader(opt) 58 | dataset = data_loader.load_data() 59 | return dataset 60 | 61 | 62 | class CustomDatasetDataLoader(): 63 | """Wrapper class of Dataset class that performs multi-threaded data loading""" 64 | 65 | def __init__(self, opt): 66 | """Initialize this class 67 | 68 | Step 1: create a dataset instance given the name [dataset_mode] 69 | Step 2: create a multi-threaded data loader. 
70 | """ 71 | self.opt = opt 72 | dataset_class = find_dataset_using_name(opt.dataset_mode) 73 | self.dataset = dataset_class(opt) 74 | print("dataset [%s] was created" % type(self.dataset).__name__) 75 | self.dataloader = torch.utils.data.DataLoader( 76 | self.dataset, 77 | batch_size=opt.batch_size, 78 | shuffle=not opt.serial_batches, 79 | num_workers=int(opt.num_threads)) 80 | 81 | def load_data(self): 82 | return self 83 | 84 | def __len__(self): 85 | """Return the number of data in the dataset""" 86 | return min(len(self.dataset), self.opt.max_dataset_size) 87 | 88 | def __iter__(self): 89 | """Return a batch of data""" 90 | for i, data in enumerate(self.dataloader): 91 | if i * self.opt.batch_size >= self.opt.max_dataset_size: 92 | break 93 | yield data 94 | -------------------------------------------------------------------------------- /pix2pix/data/depthmerge_dataset.py: -------------------------------------------------------------------------------- 1 | from pix2pix.data.base_dataset import BaseDataset 2 | from pix2pix.data.image_folder import make_dataset 3 | from pix2pix.util.guidedfilter import GuidedFilter 4 | 5 | import numpy as np 6 | import os 7 | import torch 8 | from PIL import Image 9 | 10 | 11 | def normalize(img): 12 | img = img * 2 13 | img = img - 1 14 | return img 15 | 16 | 17 | def normalize01(img): 18 | return (img - torch.min(img)) / (torch.max(img)-torch.min(img)) 19 | 20 | 21 | class DepthMergeDataset(BaseDataset): 22 | def __init__(self, opt): 23 | BaseDataset.__init__(self, opt) 24 | self.dir_outer = os.path.join(opt.dataroot, opt.phase, 'outer') 25 | self.dir_inner = os.path.join(opt.dataroot, opt.phase, 'inner') 26 | self.dir_gtfake = os.path.join(opt.dataroot, opt.phase, 'gtfake') 27 | 28 | self.outer_paths = sorted(make_dataset(self.dir_outer, opt.max_dataset_size)) 29 | self.inner_paths = sorted(make_dataset(self.dir_inner, opt.max_dataset_size)) 30 | self.gtfake_paths = sorted(make_dataset(self.dir_gtfake, opt.max_dataset_size)) 31 | 32 | self.dataset_size = len(self.outer_paths) 33 | 34 | if opt.phase == 'train': 35 | self.isTrain = True 36 | else: 37 | self.isTrain = False 38 | 39 | def __getitem__(self, index): 40 | normalize_coef = np.float32(2 ** 16) 41 | 42 | data_outer = Image.open(self.outer_paths[index % self.dataset_size]) # needs to be a tensor 43 | data_outer = np.array(data_outer, dtype=np.float32) 44 | data_outer = data_outer / normalize_coef 45 | 46 | data_inner = Image.open(self.inner_paths[index % self.dataset_size]) # needs to be a tensor 47 | data_inner = np.array(data_inner, dtype=np.float32) 48 | data_inner = data_inner / normalize_coef 49 | 50 | if self.isTrain: 51 | data_gtfake = Image.open(self.gtfake_paths[index % self.dataset_size]) # needs to be a tensor 52 | data_gtfake = np.array(data_gtfake, dtype=np.float32) 53 | data_gtfake = data_gtfake / normalize_coef 54 | 55 | data_inner = GuidedFilter(data_gtfake, data_inner, 64, 0.00000001).smooth.astype('float32') 56 | data_outer = GuidedFilter(data_outer, data_gtfake, 64, 0.00000001).smooth.astype('float32') 57 | 58 | data_outer = torch.from_numpy(data_outer) 59 | data_outer = torch.unsqueeze(data_outer, 0) 60 | data_outer = normalize01(data_outer) 61 | data_outer = normalize(data_outer) 62 | 63 | data_inner = torch.from_numpy(data_inner) 64 | data_inner = torch.unsqueeze(data_inner, 0) 65 | data_inner = normalize01(data_inner) 66 | data_inner = normalize(data_inner) 67 | 68 | if self.isTrain: 69 | data_gtfake = torch.from_numpy(data_gtfake) 70 | data_gtfake = 
torch.unsqueeze(data_gtfake, 0) 71 | data_gtfake = normalize01(data_gtfake) 72 | data_gtfake = normalize(data_gtfake) 73 | 74 | image_path = self.outer_paths[index % self.dataset_size] 75 | if self.isTrain: 76 | return {'data_inner': data_inner, 'data_outer': data_outer, 77 | 'data_gtfake': data_gtfake, 'image_path': image_path} 78 | else: 79 | return {'data_inner': data_inner, 'data_outer': data_outer, 'image_path': image_path} 80 | 81 | def __len__(self): 82 | """Return the total number of images.""" 83 | return self.dataset_size 84 | -------------------------------------------------------------------------------- /pix2pix/data/image_folder.py: -------------------------------------------------------------------------------- 1 | """A modified image folder class 2 | 3 | We modify the official PyTorch image folder (https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py) 4 | so that this class can load images from both current directory and its subdirectories. 5 | """ 6 | 7 | import torch.utils.data as data 8 | 9 | from PIL import Image 10 | import os 11 | 12 | IMG_EXTENSIONS = [ 13 | '.jpg', '.JPG', '.jpeg', '.JPEG', 14 | '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', 15 | '.tif', '.TIF', '.tiff', '.TIFF', 16 | ] 17 | 18 | 19 | def is_image_file(filename): 20 | return any(filename.endswith(extension) for extension in IMG_EXTENSIONS) 21 | 22 | 23 | def make_dataset(dir, max_dataset_size=float("inf")): 24 | images = [] 25 | assert os.path.isdir(dir), '%s is not a valid directory' % dir 26 | 27 | for root, _, fnames in sorted(os.walk(dir)): 28 | for fname in fnames: 29 | if is_image_file(fname): 30 | path = os.path.join(root, fname) 31 | images.append(path) 32 | return images[:min(max_dataset_size, len(images))] 33 | 34 | 35 | def default_loader(path): 36 | return Image.open(path).convert('RGB') 37 | 38 | 39 | class ImageFolder(data.Dataset): 40 | 41 | def __init__(self, root, transform=None, return_paths=False, 42 | loader=default_loader): 43 | imgs = make_dataset(root) 44 | if len(imgs) == 0: 45 | raise(RuntimeError("Found 0 images in: " + root + "\n" 46 | "Supported image extensions are: " + ",".join(IMG_EXTENSIONS))) 47 | 48 | self.root = root 49 | self.imgs = imgs 50 | self.transform = transform 51 | self.return_paths = return_paths 52 | self.loader = loader 53 | 54 | def __getitem__(self, index): 55 | path = self.imgs[index] 56 | img = self.loader(path) 57 | if self.transform is not None: 58 | img = self.transform(img) 59 | if self.return_paths: 60 | return img, path 61 | else: 62 | return img 63 | 64 | def __len__(self): 65 | return len(self.imgs) 66 | -------------------------------------------------------------------------------- /pix2pix/models/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules related to objective functions, optimizations, and network architectures. 2 | 3 | To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. 4 | You need to implement the following five functions: 5 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). 6 | -- : unpack data from dataset and apply preprocessing. 7 | -- : produce intermediate results. 8 | -- : calculate loss, gradients, and update network weights. 9 | -- : (optionally) add model-specific options and set default options. 
10 | 11 | In the function <__init__>, you need to define four lists: 12 | -- self.loss_names (str list): specify the training losses that you want to plot and save. 13 | -- self.model_names (str list): define networks used in our training. 14 | -- self.visual_names (str list): specify the images that you want to display and save. 15 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage. 16 | 17 | Now you can use the model class by specifying flag '--model dummy'. 18 | See our template model class 'template_model.py' for more details. 19 | """ 20 | 21 | import importlib 22 | from pix2pix.models.base_model import BaseModel 23 | 24 | 25 | def find_model_using_name(model_name): 26 | """Import the module "models/[model_name]_model.py". 27 | 28 | In the file, the class called DatasetNameModel() will 29 | be instantiated. It has to be a subclass of BaseModel, 30 | and it is case-insensitive. 31 | """ 32 | model_filename = "pix2pix.models." + model_name + "_model" 33 | modellib = importlib.import_module(model_filename) 34 | model = None 35 | target_model_name = model_name.replace('_', '') + 'model' 36 | for name, cls in modellib.__dict__.items(): 37 | if name.lower() == target_model_name.lower() \ 38 | and issubclass(cls, BaseModel): 39 | model = cls 40 | 41 | if model is None: 42 | print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) 43 | exit(0) 44 | 45 | return model 46 | 47 | 48 | def get_option_setter(model_name): 49 | """Return the static method of the model class.""" 50 | model_class = find_model_using_name(model_name) 51 | return model_class.modify_commandline_options 52 | 53 | 54 | def create_model(opt): 55 | """Create a model given the option. 56 | 57 | This function warps the class CustomDatasetDataLoader. 
58 | This is the main interface between this package and 'train.py'/'test.py' 59 | 60 | Example: 61 | >>> from models import create_model 62 | >>> model = create_model(opt) 63 | """ 64 | model = find_model_using_name(opt.model) 65 | instance = model(opt) 66 | print("model [%s] was created" % type(instance).__name__) 67 | return instance 68 | -------------------------------------------------------------------------------- /pix2pix/models/base_model_hg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | class BaseModelHG(): 5 | def name(self): 6 | return 'BaseModel' 7 | 8 | def initialize(self, opt): 9 | self.opt = opt 10 | self.gpu_ids = opt.gpu_ids 11 | self.isTrain = opt.isTrain 12 | self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor 13 | self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) 14 | 15 | def set_input(self, input): 16 | self.input = input 17 | 18 | def forward(self): 19 | pass 20 | 21 | # used in test time, no backprop 22 | def test(self): 23 | pass 24 | 25 | def get_image_paths(self): 26 | pass 27 | 28 | def optimize_parameters(self): 29 | pass 30 | 31 | def get_current_visuals(self): 32 | return self.input 33 | 34 | def get_current_errors(self): 35 | return {} 36 | 37 | def save(self, label): 38 | pass 39 | 40 | # helper saving function that can be used by subclasses 41 | def save_network(self, network, network_label, epoch_label, gpu_ids): 42 | save_filename = '_%s_net_%s.pth' % (epoch_label, network_label) 43 | save_path = os.path.join(self.save_dir, save_filename) 44 | torch.save(network.cpu().state_dict(), save_path) 45 | if len(gpu_ids) and torch.cuda.is_available(): 46 | network.cuda(device_id=gpu_ids[0]) 47 | 48 | # helper loading function that can be used by subclasses 49 | def load_network(self, network, network_label, epoch_label): 50 | save_filename = '%s_net_%s.pth' % (epoch_label, network_label) 51 | save_path = os.path.join(self.save_dir, save_filename) 52 | print(save_path) 53 | model = torch.load(save_path) 54 | return model 55 | # network.load_state_dict(torch.load(save_path)) 56 | 57 | def update_learning_rate(self): 58 | pass 59 | -------------------------------------------------------------------------------- /pix2pix/options/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes option modules: training options, test options, and basic options (used in both training and test).""" 2 | -------------------------------------------------------------------------------- /pix2pix/options/test_options.py: -------------------------------------------------------------------------------- 1 | from .base_options import BaseOptions 2 | 3 | 4 | class TestOptions(BaseOptions): 5 | """This class includes test options. 6 | 7 | It also includes shared options defined in BaseOptions. 8 | """ 9 | 10 | def initialize(self, parser): 11 | parser = BaseOptions.initialize(self, parser) # define shared options 12 | parser.add_argument('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images') 13 | parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') 14 | # Dropout and BatchNorm have different behavior during training and test.
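# (model.eval() switches these layers to their inference behavior; the '--eval' flag below enables this at test time.)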
15 | parser.add_argument('--eval', action='store_true', help='use eval mode during test time.') 16 | parser.add_argument('--num_test', type=int, default=50, help='how many test images to run') 17 | # rewrite default values 18 | parser.set_defaults(model='pix2pix4depth') 19 | # To avoid cropping, the load_size should be the same as crop_size 20 | parser.set_defaults(load_size=parser.get_default('crop_size')) 21 | self.isTrain = False 22 | return parser 23 | -------------------------------------------------------------------------------- /pix2pix/options/train_options.py: -------------------------------------------------------------------------------- 1 | from .base_options import BaseOptions 2 | 3 | 4 | class TrainOptions(BaseOptions): 5 | """This class includes training options. 6 | 7 | It also includes shared options defined in BaseOptions. 8 | """ 9 | 10 | def initialize(self, parser): 11 | parser = BaseOptions.initialize(self, parser) 12 | # visdom and HTML visualization parameters 13 | parser.add_argument('--display_freq', type=int, default=2500, help='frequency of showing training results on screen') 14 | parser.add_argument('--display_ncols', type=int, default=4, help='if positive, display all images in a single visdom web panel with certain number of images per row.') 15 | parser.add_argument('--display_id', type=int, default=1, help='window id of the web display') 16 | parser.add_argument('--display_server', type=str, default="http://localhost", help='visdom server of the web display') 17 | parser.add_argument('--display_env', type=str, default='main', help='visdom display environment name (default is "main")') 18 | parser.add_argument('--display_port', type=int, default=8097, help='visdom port of the web display') 19 | parser.add_argument('--update_html_freq', type=int, default=1000, help='frequency of saving training results to html') 20 | parser.add_argument('--print_freq', type=int, default=100, help='frequency of showing training results on console') 21 | parser.add_argument('--no_html', action='store_true', help='do not save intermediate training results to [opt.checkpoints_dir]/[opt.name]/web/') 22 | # network saving and loading parameters 23 | parser.add_argument('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results') 24 | parser.add_argument('--save_epoch_freq', type=int, default=10, help='frequency of saving checkpoints at the end of epochs') 25 | parser.add_argument('--save_by_iter', action='store_true', help='whether to save the model by iteration') 26 | parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model') 27 | parser.add_argument('--epoch_count', type=int, default=1, help='the starting epoch count, we save the model by <epoch_count>, <epoch_count>+<save_latest_freq>, ...') 28 | parser.add_argument('--phase', type=str, default='train', help='train, val, test, etc') 29 | # training parameters 30 | parser.add_argument('--n_epochs', type=int, default=100, help='number of epochs with the initial learning rate') 31 | parser.add_argument('--n_epochs_decay', type=int, default=100, help='number of epochs to linearly decay learning rate to zero') 32 | parser.add_argument('--beta1', type=float, default=0.5, help='momentum term of adam') 33 | parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate for adam') 34 | parser.add_argument('--gan_mode', type=str, default='lsgan', help='the type of GAN objective. [vanilla | lsgan | wgangp].
vanilla GAN loss is the cross-entropy objective used in the original GAN paper.') 35 | parser.add_argument('--pool_size', type=int, default=50, help='the size of image buffer that stores previously generated images') 36 | parser.add_argument('--lr_policy', type=str, default='linear', help='learning rate policy. [linear | step | plateau | cosine]') 37 | parser.add_argument('--lr_decay_iters', type=int, default=50, help='multiply by a gamma every lr_decay_iters iterations') 38 | 39 | self.isTrain = True 40 | return parser 41 | -------------------------------------------------------------------------------- /pix2pix/train.py: -------------------------------------------------------------------------------- 1 | """General-purpose training script for image-to-image translation. 2 | 3 | This script works for various models (with option '--model': e.g., pix2pix, cyclegan, colorization) and 4 | different datasets (with option '--dataset_mode': e.g., aligned, unaligned, single, colorization). 5 | You need to specify the dataset ('--dataroot'), experiment name ('--name'), and model ('--model'). 6 | 7 | It first creates model, dataset, and visualizer given the option. 8 | It then does standard network training. During the training, it also visualize/save the images, print/save the loss plot, and save models. 9 | The script supports continue/resume training. Use '--continue_train' to resume your previous training. 10 | 11 | Example: 12 | Train a CycleGAN model: 13 | python train.py --dataroot ./datasets/maps --name maps_cyclegan --model cycle_gan 14 | Train a pix2pix model: 15 | python train.py --dataroot ./datasets/facades --name facades_pix2pix --model pix2pix --direction BtoA 16 | 17 | See options/base_options.py and options/train_options.py for more training options. 18 | See training and test tips at: https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/docs/tips.md 19 | See frequently asked questions at: https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/docs/qa.md 20 | """ 21 | import time 22 | from options.train_options import TrainOptions 23 | from data import create_dataset 24 | from models import create_model 25 | from util.visualizer import Visualizer 26 | 27 | if __name__ == '__main__': 28 | opt = TrainOptions().parse() # get training options 29 | # opt.serial_batches = True 30 | dataset = create_dataset(opt) # create a dataset given opt.dataset_mode and other options 31 | dataset_size = len(dataset) # get the number of images in the dataset. 32 | print('The number of training images = %d' % dataset_size) 33 | 34 | model = create_model(opt) # create a model given opt.model and other options 35 | model.setup(opt) # regular setup: load and print networks; create schedulers 36 | visualizer = Visualizer(opt) # create a visualizer that display/save images and plots 37 | 38 | for epoch in range(opt.epoch_count, opt.n_epochs + opt.n_epochs_decay + 1): # outer loop for different epochs; we save the model by , + 39 | epoch_start_time = time.time() # timer for entire epoch 40 | iter_data_time = time.time() # timer for data loading per iteration 41 | epoch_iter = 0 # the number of training iterations in current epoch, reset to 0 every epoch 42 | visualizer.reset() # reset the visualizer: make sure it saves the results to HTML at least once every epoch 43 | model.update_learning_rate() # update learning rates in the beginning of every epoch. 
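# Illustrative invocation only (the dataroot path and experiment name below are placeholders): the depth-merge network bundled with this extension would presumably be trained with its own model and dataset modes, e.g.
#   python train.py --dataroot ./datasets/mergedepth --name mergedepth_pix2pix --model pix2pix4depth --dataset_mode depthmerge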
44 | for i, data in enumerate(dataset): # inner loop within one epoch 45 | iter_start_time = time.time() # timer for computation per iteration 46 | 47 | epoch_iter += opt.batch_size 48 | model.set_input_train(data) # unpack data from dataset and apply preprocessing 49 | model.optimize_parameters() # calculate loss functions, get gradients, update network weights 50 | 51 | if epoch_iter == dataset_size: 52 | model.compute_visuals() 53 | visualizer.display_current_results(model.get_current_visuals(), epoch, True) 54 | 55 | if epoch_iter % 500 == 0 or epoch_iter == dataset_size: # print training losses and save logging information to the disk 56 | losses = model.get_current_losses() 57 | t_data = iter_start_time - iter_data_time 58 | t_comp = (time.time() - iter_start_time) / opt.batch_size 59 | visualizer.print_current_losses(epoch, epoch_iter, losses, t_comp, t_data) 60 | 61 | 62 | if epoch % opt.save_epoch_freq == 0: # cache our model every epochs 63 | print('saving the model at the end of epoch %d' % epoch) 64 | model.save_networks('latest') 65 | model.save_networks(epoch) 66 | 67 | print('End of epoch %d / %d \t Time Taken: %d sec' % (epoch, opt.n_epochs + opt.n_epochs_decay, time.time() - epoch_start_time)) 68 | -------------------------------------------------------------------------------- /pix2pix/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This package includes a miscellaneous collection of useful helper functions.""" 2 | -------------------------------------------------------------------------------- /pix2pix/util/get_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import tarfile 4 | import requests 5 | from warnings import warn 6 | from zipfile import ZipFile 7 | from bs4 import BeautifulSoup 8 | from os.path import abspath, isdir, join, basename 9 | 10 | 11 | class GetData(object): 12 | """A Python script for downloading CycleGAN or pix2pix datasets. 13 | 14 | Parameters: 15 | technique (str) -- One of: 'cyclegan' or 'pix2pix'. 16 | verbose (bool) -- If True, print additional information. 17 | 18 | Examples: 19 | >>> from util.get_data import GetData 20 | >>> gd = GetData(technique='cyclegan') 21 | >>> new_data_path = gd.get(save_path='./datasets') # options will be displayed. 22 | 23 | Alternatively, You can use bash scripts: 'scripts/download_pix2pix_model.sh' 24 | and 'scripts/download_cyclegan_model.sh'. 
25 | """ 26 | 27 | def __init__(self, technique='cyclegan', verbose=True): 28 | url_dict = { 29 | 'pix2pix': 'http://efrosgans.eecs.berkeley.edu/pix2pix/datasets/', 30 | 'cyclegan': 'https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets' 31 | } 32 | self.url = url_dict.get(technique.lower()) 33 | self._verbose = verbose 34 | 35 | def _print(self, text): 36 | if self._verbose: 37 | print(text) 38 | 39 | @staticmethod 40 | def _get_options(r): 41 | soup = BeautifulSoup(r.text, 'lxml') 42 | options = [h.text for h in soup.find_all('a', href=True) 43 | if h.text.endswith(('.zip', 'tar.gz'))] 44 | return options 45 | 46 | def _present_options(self): 47 | r = requests.get(self.url) 48 | options = self._get_options(r) 49 | print('Options:\n') 50 | for i, o in enumerate(options): 51 | print("{0}: {1}".format(i, o)) 52 | choice = input("\nPlease enter the number of the " 53 | "dataset above you wish to download:") 54 | return options[int(choice)] 55 | 56 | def _download_data(self, dataset_url, save_path): 57 | if not isdir(save_path): 58 | os.makedirs(save_path) 59 | 60 | base = basename(dataset_url) 61 | temp_save_path = join(save_path, base) 62 | 63 | with open(temp_save_path, "wb") as f: 64 | r = requests.get(dataset_url) 65 | f.write(r.content) 66 | 67 | if base.endswith('.tar.gz'): 68 | obj = tarfile.open(temp_save_path) 69 | elif base.endswith('.zip'): 70 | obj = ZipFile(temp_save_path, 'r') 71 | else: 72 | raise ValueError("Unknown File Type: {0}.".format(base)) 73 | 74 | self._print("Unpacking Data...") 75 | obj.extractall(save_path) 76 | obj.close() 77 | os.remove(temp_save_path) 78 | 79 | def get(self, save_path, dataset=None): 80 | """ 81 | 82 | Download a dataset. 83 | 84 | Parameters: 85 | save_path (str) -- A directory to save the data to. 86 | dataset (str) -- (optional). A specific dataset to download. 87 | Note: this must include the file extension. 88 | If None, options will be presented for you 89 | to choose from. 90 | 91 | Returns: 92 | save_path_full (str) -- the absolute path to the downloaded data. 93 | 94 | """ 95 | if dataset is None: 96 | selected_dataset = self._present_options() 97 | else: 98 | selected_dataset = dataset 99 | 100 | save_path_full = join(save_path, selected_dataset.split('.')[0]) 101 | 102 | if isdir(save_path_full): 103 | warn("\n'{0}' already exists. 
Voiding Download.".format( 104 | save_path_full)) 105 | else: 106 | self._print('Downloading Data...') 107 | url = "{0}/{1}".format(self.url, selected_dataset) 108 | self._download_data(url, save_path=save_path) 109 | 110 | return abspath(save_path_full) 111 | -------------------------------------------------------------------------------- /pix2pix/util/guidedfilter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class GuidedFilter(): 4 | def __init__(self, source, reference, r=64, eps= 0.05**2): 5 | self.source = source; 6 | self.reference = reference; 7 | self.r = r 8 | self.eps = eps 9 | 10 | self.smooth = self.guidedfilter(self.source,self.reference,self.r,self.eps) 11 | 12 | def boxfilter(self,img, r): 13 | (rows, cols) = img.shape 14 | imDst = np.zeros_like(img) 15 | 16 | imCum = np.cumsum(img, 0) 17 | imDst[0 : r+1, :] = imCum[r : 2*r+1, :] 18 | imDst[r+1 : rows-r, :] = imCum[2*r+1 : rows, :] - imCum[0 : rows-2*r-1, :] 19 | imDst[rows-r: rows, :] = np.tile(imCum[rows-1, :], [r, 1]) - imCum[rows-2*r-1 : rows-r-1, :] 20 | 21 | imCum = np.cumsum(imDst, 1) 22 | imDst[:, 0 : r+1] = imCum[:, r : 2*r+1] 23 | imDst[:, r+1 : cols-r] = imCum[:, 2*r+1 : cols] - imCum[:, 0 : cols-2*r-1] 24 | imDst[:, cols-r: cols] = np.tile(imCum[:, cols-1], [r, 1]).T - imCum[:, cols-2*r-1 : cols-r-1] 25 | 26 | return imDst 27 | 28 | def guidedfilter(self,I, p, r, eps): 29 | (rows, cols) = I.shape 30 | N = self.boxfilter(np.ones([rows, cols]), r) 31 | 32 | meanI = self.boxfilter(I, r) / N 33 | meanP = self.boxfilter(p, r) / N 34 | meanIp = self.boxfilter(I * p, r) / N 35 | covIp = meanIp - meanI * meanP 36 | 37 | meanII = self.boxfilter(I * I, r) / N 38 | varI = meanII - meanI * meanI 39 | 40 | a = covIp / (varI + eps) 41 | b = meanP - a * meanI 42 | 43 | meanA = self.boxfilter(a, r) / N 44 | meanB = self.boxfilter(b, r) / N 45 | 46 | q = meanA * I + meanB 47 | return q -------------------------------------------------------------------------------- /pix2pix/util/html.py: -------------------------------------------------------------------------------- 1 | import dominate 2 | from dominate.tags import meta, h3, table, tr, td, p, a, img, br 3 | import os 4 | 5 | 6 | class HTML: 7 | """This HTML class allows us to save images and write texts into a single HTML file. 8 | 9 | It consists of functions such as (add a text header to the HTML file), 10 | (add a row of images to the HTML file), and (save the HTML to the disk). 11 | It is based on Python library 'dominate', a Python library for creating and manipulating HTML documents using a DOM API. 12 | """ 13 | 14 | def __init__(self, web_dir, title, refresh=0): 15 | """Initialize the HTML classes 16 | 17 | Parameters: 18 | web_dir (str) -- a directory that stores the webpage. 
HTML file will be created at <web_dir>/index.html; images will be saved at <web_dir>/images/ 19 | title (str) -- the webpage name 20 | refresh (int) -- how often the website refreshes itself; if 0, no refreshing 21 | """ 22 | self.title = title 23 | self.web_dir = web_dir 24 | self.img_dir = os.path.join(self.web_dir, 'images') 25 | if not os.path.exists(self.web_dir): 26 | os.makedirs(self.web_dir) 27 | if not os.path.exists(self.img_dir): 28 | os.makedirs(self.img_dir) 29 | 30 | self.doc = dominate.document(title=title) 31 | if refresh > 0: 32 | with self.doc.head: 33 | meta(http_equiv="refresh", content=str(refresh)) 34 | 35 | def get_image_dir(self): 36 | """Return the directory that stores images""" 37 | return self.img_dir 38 | 39 | def add_header(self, text): 40 | """Insert a header to the HTML file 41 | 42 | Parameters: 43 | text (str) -- the header text 44 | """ 45 | with self.doc: 46 | h3(text) 47 | 48 | def add_images(self, ims, txts, links, width=400): 49 | """add images to the HTML file 50 | 51 | Parameters: 52 | ims (str list) -- a list of image paths 53 | txts (str list) -- a list of image names shown on the website 54 | links (str list) -- a list of hyperref links; when you click an image, it will redirect you to a new page 55 | """ 56 | self.t = table(border=1, style="table-layout: fixed;") # Insert a table 57 | self.doc.add(self.t) 58 | with self.t: 59 | with tr(): 60 | for im, txt, link in zip(ims, txts, links): 61 | with td(style="word-wrap: break-word;", halign="center", valign="top"): 62 | with p(): 63 | with a(href=os.path.join('images', link)): 64 | img(style="width:%dpx" % width, src=os.path.join('images', im)) 65 | br() 66 | p(txt) 67 | 68 | def save(self): 69 | """save the current content to the HTML file""" 70 | html_file = '%s/index.html' % self.web_dir 71 | f = open(html_file, 'wt') 72 | f.write(self.doc.render()) 73 | f.close() 74 | 75 | 76 | if __name__ == '__main__': # we show an example usage here. 77 | html = HTML('web/', 'test_html') 78 | html.add_header('hello world') 79 | 80 | ims, txts, links = [], [], [] 81 | for n in range(4): 82 | ims.append('image_%d.png' % n) 83 | txts.append('text_%d' % n) 84 | links.append('image_%d.png' % n) 85 | html.add_images(ims, txts, links) 86 | html.save() 87 | -------------------------------------------------------------------------------- /pix2pix/util/image_pool.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | 5 | class ImagePool(): 6 | """This class implements an image buffer that stores previously generated images. 7 | 8 | This buffer enables us to update discriminators using a history of generated images 9 | rather than the ones produced by the latest generators. 10 | """ 11 | 12 | def __init__(self, pool_size): 13 | """Initialize the ImagePool class 14 | 15 | Parameters: 16 | pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created 17 | """ 18 | self.pool_size = pool_size 19 | if self.pool_size > 0: # create an empty pool 20 | self.num_imgs = 0 21 | self.images = [] 22 | 23 | def query(self, images): 24 | """Return an image from the pool. 25 | 26 | Parameters: 27 | images: the latest generated images from the generator 28 | 29 | Returns images from the buffer. 30 | 31 | By 50% chance, the buffer will return input images. 32 | By 50% chance, the buffer will return images previously stored in the buffer, 33 | and insert the current images into the buffer.
34 | """ 35 | if self.pool_size == 0: # if the buffer size is 0, do nothing 36 | return images 37 | return_images = [] 38 | for image in images: 39 | image = torch.unsqueeze(image.data, 0) 40 | if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer 41 | self.num_imgs = self.num_imgs + 1 42 | self.images.append(image) 43 | return_images.append(image) 44 | else: 45 | p = random.uniform(0, 1) 46 | if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer 47 | random_id = random.randint(0, self.pool_size - 1) # randint is inclusive 48 | tmp = self.images[random_id].clone() 49 | self.images[random_id] = image 50 | return_images.append(tmp) 51 | else: # by another 50% chance, the buffer will return the current image 52 | return_images.append(image) 53 | return_images = torch.cat(return_images, 0) # collect all the images and return 54 | return return_images 55 | -------------------------------------------------------------------------------- /pix2pix/util/util.py: -------------------------------------------------------------------------------- 1 | """This module contains simple helper functions """ 2 | from __future__ import print_function 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | import os 7 | 8 | 9 | def tensor2im(input_image, imtype=np.uint16): 10 | """"Converts a Tensor array into a numpy image array. 11 | 12 | Parameters: 13 | input_image (tensor) -- the input image tensor array 14 | imtype (type) -- the desired type of the converted numpy array 15 | """ 16 | if not isinstance(input_image, np.ndarray): 17 | if isinstance(input_image, torch.Tensor): # get the data from a variable 18 | image_tensor = input_image.data 19 | else: 20 | return input_image 21 | image_numpy = torch.squeeze(image_tensor).cpu().numpy() # convert it into a numpy array 22 | image_numpy = (image_numpy + 1) / 2.0 * (2**16-1) # 23 | else: # if it is a numpy array, do nothing 24 | image_numpy = input_image 25 | return image_numpy.astype(imtype) 26 | 27 | 28 | def diagnose_network(net, name='network'): 29 | """Calculate and print the mean of average absolute(gradients) 30 | 31 | Parameters: 32 | net (torch network) -- Torch network 33 | name (str) -- the name of the network 34 | """ 35 | mean = 0.0 36 | count = 0 37 | for param in net.parameters(): 38 | if param.grad is not None: 39 | mean += torch.mean(torch.abs(param.grad.data)) 40 | count += 1 41 | if count > 0: 42 | mean = mean / count 43 | print(name) 44 | print(mean) 45 | 46 | 47 | def save_image(image_numpy, image_path, aspect_ratio=1.0): 48 | """Save a numpy image to the disk 49 | 50 | Parameters: 51 | image_numpy (numpy array) -- input numpy array 52 | image_path (str) -- the path of the image 53 | """ 54 | image_pil = Image.fromarray(image_numpy) 55 | 56 | image_pil = image_pil.convert('I;16') 57 | 58 | # image_pil = Image.fromarray(image_numpy) 59 | # h, w, _ = image_numpy.shape 60 | # 61 | # if aspect_ratio > 1.0: 62 | # image_pil = image_pil.resize((h, int(w * aspect_ratio)), Image.BICUBIC) 63 | # if aspect_ratio < 1.0: 64 | # image_pil = image_pil.resize((int(h / aspect_ratio), w), Image.BICUBIC) 65 | 66 | image_pil.save(image_path) 67 | 68 | 69 | def print_numpy(x, val=True, shp=False): 70 | """Print the mean, min, max, median, std, and size of a numpy array 71 | 72 | Parameters: 73 | val (bool) -- if print the values of the numpy array 74 | shp (bool) -- if print the shape of the numpy array 75 | """ 76 | x = 
x.astype(np.float64) 77 | if shp: 78 | print('shape,', x.shape) 79 | if val: 80 | x = x.flatten() 81 | print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % ( 82 | np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x))) 83 | 84 | 85 | def mkdirs(paths): 86 | """create empty directories if they don't exist 87 | 88 | Parameters: 89 | paths (str list) -- a list of directory paths 90 | """ 91 | if isinstance(paths, list) and not isinstance(paths, str): 92 | for path in paths: 93 | mkdir(path) 94 | else: 95 | mkdir(paths) 96 | 97 | 98 | def mkdir(path): 99 | """create a single empty directory if it didn't exist 100 | 101 | Parameters: 102 | path (str) -- a single directory path 103 | """ 104 | if not os.path.exists(path): 105 | os.makedirs(path) 106 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Requirements for running in standalone mode 2 | # First, install the corect version of PyTorch! 3 | # PyTorch Compute Platform must match the configuration of the hardware. 4 | 5 | # pip install -r requirements.txt 6 | torch 7 | gradio>=3.38.0,<4.0 # User UI 8 | timm~=0.9.2 # For midas 9 | matplotlib 10 | trimesh # For creating simple meshes 11 | numba>=0.57.0 # Speeding up CPU stereoimage generation 12 | vispy>=0.13.0 13 | rembg>=2.0.50 # Remove background 14 | moviepy>=1.0.2,<2.0 15 | transforms3d>=0.4.1 16 | imageio>=2.4.1,<3.0 17 | imageio-ffmpeg 18 | networkx>=2.5 19 | diffusers>=0.20.1 # For Marigold 20 | pyqt5; sys_platform == 'windows' 21 | pyqt6; sys_platform != 'windows' 22 | PyOpenGL>=3.1.7; sys_platform == 'darwin' 23 | https://github.com/huchenlei/Depth-Anything/releases/download/v1.0.0/depth_anything-2024.1.22.0-py2.py3-none-any.whl 24 | -------------------------------------------------------------------------------- /scripts/depthmap.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | import gradio as gr 3 | from modules import shared 4 | import modules.scripts as scripts 5 | from PIL import Image 6 | 7 | from src import backbone 8 | from src import common_ui 9 | from src.core import core_generation_funnel 10 | from src.gradio_args_transport import GradioComponentBundle 11 | from src.misc import * 12 | 13 | 14 | class Script(scripts.Script): 15 | def title(self): 16 | return SCRIPT_NAME 17 | 18 | def show(self, is_img2img): 19 | return True 20 | 21 | def ui(self, is_img2img): 22 | gr.HTML() # Work around a Gradio bug 23 | with gr.Column(variant='panel'): 24 | gr.HTML() # Work around a Gradio bug 25 | ret = common_ui.main_ui_panel(False) 26 | ret += ret.enkey_tail() 27 | return ret.enkey_body() 28 | 29 | # run from script in txt2img or img2img 30 | def run(self, p, *inputs): 31 | from modules import processing 32 | from modules.processing import create_infotext 33 | 34 | inputs = GradioComponentBundle.enkey_to_dict(inputs) 35 | 36 | # sd process 37 | processed = processing.process_images(p) 38 | processed.sampler = p.sampler # for create_infotext 39 | processed.tiling = p.tiling # for create_infotext 40 | 41 | inputimages = [] 42 | for count in range(0, len(processed.images)): 43 | # skip first grid image 44 | if count == 0 and len(processed.images) > 1 and shared.opts.return_grid: 45 | continue 46 | inputimages.append(processed.images[count]) 47 | 48 | gen_obj = core_generation_funnel(p.outpath_samples, inputimages, None, None, inputs, backbone.gather_ops()) 49 | 50 
| for input_i, type, result in gen_obj: 51 | if not isinstance(result, Image.Image): 52 | continue 53 | 54 | # get generation parameters 55 | # TODO: could reuse 56 | if hasattr(processed, 'all_prompts') and shared.opts.enable_pnginfo: 57 | info = create_infotext( 58 | p, processed.all_prompts, processed.all_seeds, processed.all_subseeds, "", 0, input_i) 59 | else: 60 | info = None 61 | 62 | processed.images.append(result) 63 | if inputs["save_outputs"]: 64 | try: 65 | suffix = "" if type == "depth" else f"{type}" 66 | backbone.save_image(result, path=p.outpath_samples, basename="", seed=processed.all_seeds[input_i], 67 | prompt=processed.all_prompts[input_i], extension=shared.opts.samples_format, 68 | info=info, 69 | p=processed, 70 | suffix=suffix) 71 | except Exception as e: 72 | if not ('image has wrong mode' in str(e) or 'I;16' in str(e)): 73 | raise e 74 | print('Catched exception: image has wrong mode!') 75 | traceback.print_exc() 76 | return processed 77 | 78 | 79 | # TODO: some of them may be put into the main ui pane 80 | # TODO: allow in standalone mode 81 | def on_ui_settings(): 82 | section = ('depthmap-script', "Depthmap extension") 83 | 84 | def add_option(name, default_value, description, name_prefix='depthmap_script'): 85 | shared.opts.add_option(f"{name_prefix}_{name}", shared.OptionInfo(default_value, description, section=section)) 86 | 87 | add_option('keepmodels', False, "Do not unload depth and pix2pix models.") 88 | 89 | add_option('boost_rmax', 1600, "Maximum wholesize for boost (Rmax)") 90 | add_option('marigold_ensembles', 5, "How many ensembles to use for Marigold") 91 | add_option('marigold_steps', 10, "How many denoising steps to use for Marigold") 92 | 93 | add_option('save_ply', False, "Save additional PLY file with 3D inpainted mesh.") 94 | add_option('show_3d', True, "Enable showing 3D Meshes in output tab. (Experimental)") 95 | add_option('show_3d_inpaint', True, "Also show 3D Inpainted Mesh in 3D Mesh output tab. (Experimental)") 96 | add_option('mesh_maxsize', 2048, "Max size for generating simple mesh.") 97 | 98 | add_option('gen_heatmap_from_ui', False, "Show an option to generate HeatMap in the UI") 99 | add_option('extra_stereomodes', False, "Enable more possible outputs for stereoimage generation") 100 | 101 | 102 | from modules import script_callbacks 103 | script_callbacks.on_ui_settings(on_ui_settings) 104 | script_callbacks.on_ui_tabs(lambda: [(common_ui.on_ui_tabs(), "Depth", "depthmap_interface")]) 105 | -------------------------------------------------------------------------------- /src/common_constants.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | 4 | class GenerationOptions(enum.Enum): 5 | """This Enum provides the options that are used in the usual generation 6 | (that is, consumed by the core_generation_funnel). 7 | Please use this to avoid typos. 
Also, this enum provides default values for these options.""" 8 | def __new__(cls, *args, **kwds): 9 | value = len(cls.__members__) + 1 10 | obj = object.__new__(cls) 11 | obj._value_ = value 12 | return obj 13 | 14 | def __init__(self, default_value=None, *args): 15 | """Saves default value as a member (called "df") of a member of this enum""" 16 | self.df = default_value 17 | 18 | COMPUTE_DEVICE = "GPU" 19 | MODEL_TYPE = "Depth Anything v2 Base" # Will become enum element 20 | BOOST = False 21 | NET_SIZE_MATCH = False 22 | NET_WIDTH = 448 23 | NET_HEIGHT = 448 24 | TILING_MODE = False 25 | 26 | DO_OUTPUT_DEPTH = True 27 | OUTPUT_DEPTH_INVERT = False 28 | OUTPUT_DEPTH_COMBINE = False 29 | OUTPUT_DEPTH_COMBINE_AXIS = "Horizontal" # Format (str) is subject to change 30 | DO_OUTPUT_DEPTH_PREDICTION = False # Hidden, do not use, subject to change 31 | 32 | CLIPDEPTH = False 33 | CLIPDEPTH_MODE = "Range" 34 | CLIPDEPTH_FAR = 0.0 35 | CLIPDEPTH_NEAR = 1.0 36 | 37 | GEN_STEREO = False 38 | STEREO_MODES = ["left-right", "red-cyan-anaglyph"] 39 | STEREO_DIVERGENCE = 2.5 40 | STEREO_SEPARATION = 0.0 41 | STEREO_FILL_ALGO = "polylines_sharp" 42 | STEREO_OFFSET_EXPONENT = 1.0 43 | STEREO_BALANCE = 0.0 44 | 45 | GEN_NORMALMAP = False 46 | NORMALMAP_PRE_BLUR = False 47 | NORMALMAP_PRE_BLUR_KERNEL = 3 48 | NORMALMAP_SOBEL = True 49 | NORMALMAP_SOBEL_KERNEL = 3 50 | NORMALMAP_POST_BLUR = False 51 | NORMALMAP_POST_BLUR_KERNEL = 3 52 | NORMALMAP_INVERT = False 53 | 54 | GEN_HEATMAP = False 55 | 56 | GEN_SIMPLE_MESH = False 57 | SIMPLE_MESH_OCCLUDE = True 58 | SIMPLE_MESH_SPHERICAL = False 59 | 60 | GEN_INPAINTED_MESH = False 61 | GEN_INPAINTED_MESH_DEMOS = False 62 | 63 | GEN_REMBG = False 64 | SAVE_BACKGROUND_REMOVAL_MASKS = False # Legacy, will be reworked 65 | PRE_DEPTH_BACKGROUND_REMOVAL = False # Legacy, will be reworked 66 | REMBG_MODEL = "u2net" 67 | -------------------------------------------------------------------------------- /src/gradio_args_transport.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | class GradioComponentBundle: 4 | """Allows easier transportation of massive ammount of named gradio inputs. 5 | Allows adding visibility rules quicker.""" 6 | def __init__(self): 7 | self.internal = {} 8 | self.internal_ignored = {} 9 | 10 | def _raw_assignment(self, key, value, ignored=False): 11 | assert key not in self.internal, f"Already bundled component with name {key}." 12 | assert key not in self.internal_ignored, f"Already bundled component with name {key}." 
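# A sketch of the intended flow, mirroring the usage in scripts/depthmap.py (the option name below is a placeholder):
#   bundle = GradioComponentBundle()
#   bundle += (GenerationOptions.SOME_OPTION, gr.Checkbox(label="Some option"))  # named input, packed into the bundle
#   bundle += bundle.enkey_tail()                          # must be added last
#   gradio_inputs = bundle.enkey_body()                    # list handed to the gradio event handler
#   values = GradioComponentBundle.enkey_to_dict(inputs)   # inside the handler: name -> value dict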
13 | if not ignored: 14 | self.internal[key] = value 15 | else: 16 | self.internal_ignored[key] = value 17 | 18 | def _append_el(self, thing, ignored=False): 19 | if isinstance(thing, tuple) and len(thing) == 2 and isinstance(thing[1], gr.blocks.Block): 20 | name = thing[0] if isinstance(thing[0], str) else thing[0].name.lower() # .name is for Enums 21 | if hasattr(thing[0], 'df') and thing[0].df is not None: 22 | thing[1].value = thing[0].df 23 | self._raw_assignment(name, thing[1], ignored) 24 | elif isinstance(thing, gr.components.Component) and thing.elem_id is not None: 25 | self._raw_assignment(thing.elem_id, thing, ignored) 26 | else: 27 | raise Exception(f"This object cannot be bundled, {str(thing)}") 28 | 29 | def __iadd__(self, els): 30 | """Add an input element that will be packed into a bundle.""" 31 | self._append_el(els, ignored=False) 32 | return self 33 | 34 | def __isub__(self, els): 35 | """Add an element that will not be packed into a bundle, but will be accessible.""" 36 | self._append_el(els, ignored=True) 37 | return self 38 | 39 | def __ior__(self, thing): 40 | """Add an extra bundle into your bundle, so you can have more bundled items in your bundle.""" 41 | assert isinstance(thing, GradioComponentBundle), "Use += or -= for bundling elements" 42 | for key in list(thing.internal.keys()): 43 | self._raw_assignment(key, thing[key], False) 44 | for key in list(thing.internal_ignored.keys()): 45 | self._raw_assignment(key, thing[key], True) 46 | return self 47 | 48 | def __getitem__(self, key): 49 | """Return the gradio component stored under the given key""" 50 | if hasattr(key, 'name'): 51 | key = key.name.lower() # for enum elements 52 | if key in self.internal_ignored: 53 | return self.internal_ignored[key] 54 | return self.internal[key] 55 | 56 | def __contains__(self, key): 57 | if hasattr(key, 'name'): 58 | key = key.name.lower() # for enum elements 59 | return key in self.internal_ignored or key in self.internal 60 | 61 | def enkey_tail(self): 62 | """Must be the last element of the bundle for unbundling to work""" 63 | keys = sorted(list(self.internal.keys())) 64 | head = gr.HTML(elem_id="zzz_depthmap_enkey", value="\u222F" + "\u222F".join(keys), visible=False) 65 | return head 66 | 67 | def enkey_body(self): 68 | """This is what should be passed into the function that is called by gradio""" 69 | return [self.internal[x] for x in sorted(list(self.internal.keys()))] 70 | 71 | def add_rule(self, first, rule, second): 72 | first = self[first] if first in self else first 73 | second = self[second] if second in self else second 74 | if rule == 'visible-if-not': 75 | second.change(fn=lambda v: first.update(visible=not v), inputs=[second], outputs=[first]) 76 | elif rule == 'visible-if': 77 | second.change(fn=lambda v: first.update(visible=v), inputs=[second], outputs=[first]) 78 | else: 79 | raise Exception(f'Unknown rule type {rule}') 80 | 81 | @staticmethod 82 | def enkey_to_dict(inp): 83 | """Unbundle: get a dictionary with the values after they are sent by gradio to the function.
84 | Enkey format: a bunch of Gradio components, 85 | then a Gradio component whose value is a concatenation of the names of the previous Gradio objects""" 86 | assert inp[-1].startswith("\u222F") 87 | ret = {} 88 | names = inp[-1].split("\u222F")[1:] 89 | assert len(names) == len(inp) - 1 90 | for i, name in enumerate(names): 91 | ret[name] = inp[i] 92 | return ret 93 | -------------------------------------------------------------------------------- /src/misc.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import pathlib 4 | import builtins 5 | 6 | def get_commit_hash(): 7 | try: 8 | file_path = pathlib.Path(__file__).parent 9 | return subprocess.check_output( 10 | [os.environ.get("GIT", "git"), "rev-parse", "HEAD"], 11 | cwd=file_path, shell=False, stderr=subprocess.DEVNULL, encoding='utf8').strip()[0:8] 12 | except Exception: 13 | return "" 14 | 15 | 16 | REPOSITORY_NAME = "stable-diffusion-webui-depthmap-script" 17 | SCRIPT_NAME = "DepthMap" 18 | SCRIPT_VERSION = "v0.4.8" 19 | SCRIPT_FULL_NAME = f"{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})" 20 | 21 | 22 | # # Returns SHA256 hash of a file 23 | # import hashlib 24 | # def sha256sum(filename): 25 | # with open(filename, 'rb', buffering=0) as f: 26 | # return hashlib.file_digest(f, 'sha256').hexdigest() 27 | def ensure_file_downloaded(filename, url, sha256_hash_prefix=None): 28 | import torch 29 | # Do not check the hash every time - it is somewhat time-consuming 30 | if os.path.exists(filename): 31 | return 32 | 33 | if type(url) is not list: 34 | url = [url] 35 | for cur_url in url: 36 | try: 37 | print("Downloading", cur_url, "to", filename) 38 | torch.hub.download_url_to_file(cur_url, filename, sha256_hash_prefix) 39 | if os.path.exists(filename): 40 | return # The correct model was downloaded, no need to try more 41 | except: 42 | pass 43 | raise RuntimeError(f'Download failed. ' 44 | f'Try again later or manually download the file from {url} to {filename}.') 45 | -------------------------------------------------------------------------------- /src/normalmap_generation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from PIL import Image 4 | 5 | def create_normalmap(depthmap, 6 | pre_blur = None, sobel_gradient = 3, post_blur = None, 7 | invert=False): 8 | """Generates normalmaps. 9 | :param depthmap: depthmap that will be used to generate normalmap 10 | :param pre_blur: apply gaussian blur before taking gradient, -1 for disable, otherwise kernel size 11 | :param sobel_gradient: use Sobel gradient, None for regular gradient, otherwise kernel size 12 | :param post_blur: apply gaussian blur after taking gradient, -1 for disable, otherwise kernel size 13 | :param invert: depthmap will be inverted before calculating normalmap 14 | """ 15 | # https://stackoverflow.com/questions/53350391/surface-normal-calculation-from-depth-map-in-python 16 | # TODO: Tiling can be improved (gradients could be matched).
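# The code below treats the depth map as a height field h(x, y): it takes per-pixel gradients (Sobel or np.gradient),
# stacks them into (dh/dx, -dh/dy, 1), normalizes each vector to unit length, and finally rescales the components
# from [-1, 1] to [0, 255] for export.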
17 | # TODO: Implement bilateral filtering (16 bit deflickering) 18 | 19 | # We invert by default, maybe there is a negative sign hiding somewhere 20 | normalmap = depthmap if invert else depthmap * (-1.0) 21 | normalmap = normalmap / 256.0 22 | # pre blur (only blurs z-axis) 23 | if pre_blur is not None and pre_blur > 0: 24 | normalmap = cv2.GaussianBlur(normalmap, (pre_blur, pre_blur), pre_blur) 25 | 26 | # take gradients 27 | if sobel_gradient is not None and sobel_gradient > 0: 28 | zx = cv2.Sobel(np.float64(normalmap), cv2.CV_64F, 1, 0, ksize=sobel_gradient) 29 | zy = cv2.Sobel(np.float64(normalmap), cv2.CV_64F, 0, 1, ksize=sobel_gradient) 30 | else: 31 | zy, zx = np.gradient(normalmap) 32 | 33 | # combine and normalize gradients 34 | normal = np.dstack((zx, -zy, np.ones_like(normalmap))) 35 | # every pixel of a normal map is a normal vector, it should be a unit vector 36 | n = np.linalg.norm(normal, axis=2) 37 | normal[:, :, 0] /= n 38 | normal[:, :, 1] /= n 39 | normal[:, :, 2] /= n 40 | 41 | # TODO: this probably is not a good way to do it 42 | if post_blur is not None and post_blur > 0: 43 | normal = cv2.GaussianBlur(normal, (post_blur, post_blur), post_blur) 44 | # Normalize every vector again 45 | n = np.linalg.norm(normal, axis=2) 46 | normal[:, :, 0] /= n 47 | normal[:, :, 1] /= n 48 | normal[:, :, 2] /= n 49 | 50 | # offset and rescale values to be in 0-255, so we can export them 51 | normal += 1 52 | normal /= 2 53 | normal = np.clip(normal * 256, 0, 256 - 0.1) # Clipping from above is needed to avoid overflowing 54 | normal = normal.astype(np.uint8) 55 | 56 | return Image.fromarray(normal) 57 | --------------------------------------------------------------------------------
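A minimal usage sketch for create_normalmap above (the synthetic depth array, the import path and the output file name are assumptions made for illustration):

import numpy as np
from src.normalmap_generation import create_normalmap

depth = (np.random.rand(480, 640) * (2 ** 16 - 1)).astype(np.float32)  # placeholder depth map in 16-bit range
normal_img = create_normalmap(depth, pre_blur=None, sobel_gradient=3, post_blur=None, invert=False)  # returns a PIL Image
normal_img.save("normalmap.png")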