├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── assets ├── FAQ.md ├── cow.gif ├── header.jpeg ├── multi.gif ├── single.gif └── smpl.gif ├── configs ├── multi.yml ├── paper.yml ├── single.yml └── style.yml ├── loop.py ├── main.py ├── primitives ├── plane.mtl ├── plane.obj ├── sphere.mtl ├── sphere.obj ├── spot.mtl └── spot.obj ├── requirements.txt └── utils ├── camera.py ├── helpers.py ├── limit_subdivide.py ├── resize_right.py └── video.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | .vscode 3 | *.pth 4 | *.pt 5 | output/* 6 | *.sh 7 | 8 | # Others 9 | .DS_Store 10 | .vscode 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | share/python-wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | 115 | # pdm 116 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 117 | #pdm.lock 118 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 119 | # in version control. 120 | # https://pdm.fming.dev/#use-with-ide 121 | .pdm.toml 122 | 123 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 171 | #.idea/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "loop_limitation"] 2 | path = loop_limitation 3 | url = https://github.com/tianhaoxie/loop_limitation 4 | [submodule "DALLE2-pytorch"] 5 | path = DALLE2-pytorch 6 | url = https://github.com/lucidrains/DALLE2-pytorch.git 7 | [submodule "nvdiffmodeling"] 8 | path = nvdiffmodeling 9 | url = https://github.com/NVlabs/nvdiffmodeling 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Nasir Mohammad Khalid 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## CLIP-Mesh 2 | 3 | Official implementation of [**CLIP-Mesh: Generating textured meshes from text using pretrained image-text models**](https://www.nasir.lol/clipmesh)
4 | 5 | [Nasir Mohammad Khalid](https://www.nasir.lol/), 6 | [Tianhao Xie](https://www.linkedin.com/in/tianhao-xie-440b20186/), 7 | [Eugene Belilovsky](http://eugenium.github.io/), 8 | [Tiberiu Popa](https://users.encs.concordia.ca/~stpopa/index.html)
9 | _[SIGGRAPH ASIA 2022]() | [arXiv](https://arxiv.org/abs/2203.13333) | [Project page](https://www.nasir.lol/clipmesh)_ 10 | 11 | ![CLIP-Mesh header image](./assets/header.jpeg) 12 | 13 | ## Quickstart 14 | 15 | Notebooks are currently unavailable - working on restoring them: 16 | 17 | ||| 18 | |:-----------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:| 19 | | Text to Mesh| Multi Mesh Generation| 20 | 21 | ||| 22 | |:-----------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:| 23 | | Stylizing a Mesh| Apply CLIP-Mesh to Human Models| 24 | 25 | ## Setup 26 | 27 | Clone this repository recursively to get all submodules - use submodule update to get downstream submodules 28 | 29 | ``` 30 | git clone --recurse-submodules https://github.com/NasirKhalid24/CLIP-Mesh.git 31 | cd CLIP-Mesh 32 | git submodule update --init --recursive 33 | ``` 34 | 35 | Set up the Conda environment and install packages 36 | 37 | ``` 38 | conda create -n clip-mesh python=3.7 39 | conda activate clip-mesh 40 | conda install pytorch==1.11.0 torchvision==0.12.0 cudatoolkit=10.2 -c pytorch 41 | 42 | pip install -r requirements.txt 43 | ``` 44 | 45 | Install the loop subdivision code and DALLE-2 (note that DALLE-2 is pinned to an earlier commit, so an existing install may not work) 46 | 47 | ``` 48 | 49 | cd loop_limitation 50 | pip install . 51 | cd .. 52 | 53 | 54 | cd DALLE2-pytorch 55 | pip install . 56 | cd .. 57 | 58 | 59 | mkdir weights 60 | wget https://huggingface.co/spaces/NasirKhalid24/Dalle2-Diffusion-Prior/resolve/main/larger-model.pth -O ./weights/model.pth 61 | ``` 62 | 63 | ## Usage 64 | 65 | This repo comes with some configs that are passed to ```main.py``` using the ```--config``` flag 66 | 67 | Any of the config parameters can be overridden by passing them as arguments to ```main.py```, so you can have a base .yml file with all your parameters and just update the text prompt to generate something new 68 | 69 | An example would be using the given config file for single mesh generation, ```single.yml``` 70 | 71 | ``` 72 | # Use all parameters in file 73 | python main.py --config configs/single.yml 74 | 75 | # Use all parameters in file but change text prompt 76 | python main.py --config configs/single.yml --text_prompt "a hamburger" 77 | 78 | # Use all parameters in file but change text prompt, batch, texture resolution 79 | python main.py \ 80 | --config configs/single.yml \ 81 | --text_prompt "a hamburger" \ 82 | --batch_size 5 \ 83 | --texture_resolution 1024 84 | ```
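If you prefer to drive the optimization from Python rather than the command line, the sketch below mirrors what ```main.py``` does: load a base config with ```yaml.safe_load```, override a few keys, and hand the dict to ```loop```. It skips the argument parsing, config validation and seeding that ```main.py``` performs, so treat it as an illustrative starting point rather than a supported entry point - the ```run``` helper and the override values are made up for this example, and any key from the files under ```configs/``` can be passed the same way.

```python
# Hypothetical helper (not part of the repo): drive the optimization loop directly.
import yaml

from loop import loop


def run(config_path, **overrides):
    with open(config_path, "r") as stream:
        cfg = yaml.safe_load(stream)   # base parameters from the YAML config
    cfg.update(overrides)              # e.g. a new text prompt or batch size
    return loop(cfg)                   # loop() returns the output directory path


if __name__ == "__main__":
    run("configs/single.yml", text_prompt="a hamburger", batch_size=5)
```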
85 | 86 | ## Tips, Tricks, FAQs etc 87 | 88 | I recommend checking out the [following document](./assets/FAQ.md) as it could answer any doubts that come up (will be updated regularly) - if you still have questions reach out [@Nymarius_](https://twitter.com/Nymarius_) or open an issue -------------------------------------------------------------------------------- /assets/FAQ.md: -------------------------------------------------------------------------------- 1 | # Tips, tricks and FAQ 2 | 3 | ## Do I need a powerful GPU? 4 | 5 | Not really - this code was written to work on typical gaming GPUs. If you are having memory issues, try reducing the following parameters (listed in order of memory consumption): 6 | 7 | - train_res 8 | - batch_size 9 | - texture_resolution 10 | 11 | If you are having issues with the first step of the program, where it cannot load and use the diffusion prior, try removing the prior altogether by setting the ```prior_path``` in your config file to an empty value while also reducing the parameters above 12 | 13 | Note that doing all this may reduce the quality of results 14 | 15 | ## Prompt Engineering 16 | 17 | Prompt engineering was not explored at all, so there's not much I can share here - if you do find prompts that improve results please submit a pull request and add them here 18 | 19 | ## The texture is quite noisy 20 | 21 | Try increasing ```train_res``` or set the range of ```dist_min``` and ```dist_max``` to a lower value - additionally you could also reduce ```texture_resolution``` and increase the blur parameters 22 | 23 | ## The generated shape is flat on some sides 24 | 25 | Try increasing the ```batch_size```, increase the range of ```dist_min``` and ```dist_max``` and ensure that all ```aug_``` parameters are set to true 26 | 27 | ## I added a custom mesh to generate with and the program just crashes 28 | 29 | This could be due to three reasons that I know of: 30 | 31 | - The mesh is non-manifold, in which case the limit subdivision does not work - try to remesh the shape 32 | - There is a vertex in the mesh whose valence is outside the range of [3, 50] - hence the limit subdivision does not work 33 | - There is a huge number of vertices - the spheres used in the paper have about 600 vertices 34 | 35 | ## How can I set up a custom scene to generate in 36 | 37 | I recommend setting up your scene in Blender and importing ```primitives/sphere.obj``` into the scene - repositioning and resizing it as required. Then save the newly positioned/sized sphere as a new .obj file and save the rest of your scene meshes as .obj files (ensure the textures are baked and UV unwrapped) 38 | 39 | To generate with this scene your config file would have all the parameters as is (you may need to change camera params) and then towards the end your mesh parameters would look as follows 40 | 41 | ```yaml 42 | .... 43 | 44 | 45 | # Mesh Parameters 46 | 47 | ## Add meshes to the scene here 48 | meshes: 49 | - path/to/exported/sphere.obj 50 | - path/to/exported/scene_mesh_1.obj 51 | - path/to/exported/scene_mesh_2.obj 52 | .... 53 | 54 | ## Unit scale the meshes? No need as it was done in Blender 55 | unit: 56 | - false 57 | - false 58 | - false 59 | .... 60 | 61 | ## Sphere is first and it will be optimized, the rest are just constant scene objects 62 | train_mesh_idx: 63 | - [verts, texture, normal, true] 64 | - [] 65 | - [] 66 | .... 67 | 68 | ## No scaling as it was done in Blender 69 | scales: 70 | - 1.0 71 | - 1.0 72 | - 1.0 73 | .... 74 | 75 | ## No positioning as it was done in Blender 76 | offsets: 77 | - [0.0, 0.0, 0.0] 78 | - [0.0, 0.0, 0.0] 79 | - [0.0, 0.0, 0.0] 80 |
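## (Illustrative addition, not from the original FAQ) The "may need to change camera
## params" note above usually means widening the camera ranges defined earlier in the
## same config so the whole Blender scene stays in frame - these keys exist in every
## provided config and the exact values depend on your scene, e.g.:
# dist_min: 6.0
# dist_max: 14.0
# fov_min: 30.0
# fov_max: 60.0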
81 | 82 | ``` 83 | 84 | You could retexture a mesh in your scene by just setting its corresponding ```train_mesh_idx``` to ```[texture, normal, true]``` 85 | 86 | ## I cannot generate ____ / the results are not as good as ____ 87 | 88 | Text to 3D is much newer than Text to Image generation and therefore the results are not up to par - additionally, while most text to image models rely on a huge set of Image-Text data, there is no 3D-Text data available at the same scale 89 | 90 | Complex 3D objects with massive variation are tough to generate as CLIP embeddings are limited in the information they hold - it is recommended to generate a single object at a time; even something like a car is tough to generate, but a tyre is not 91 | 92 | You could also try using other text to 3D generation/stylization techniques such as [Dreamfields](https://ajayj.com/dreamfields), [CLIPForge](https://github.com/AutodeskAILab/Clip-Forge), [Text2Mesh](https://github.com/threedle/text2mesh) -------------------------------------------------------------------------------- /assets/cow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/cow.gif -------------------------------------------------------------------------------- /assets/header.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/header.jpeg -------------------------------------------------------------------------------- /assets/multi.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/multi.gif -------------------------------------------------------------------------------- /assets/single.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/single.gif -------------------------------------------------------------------------------- /assets/smpl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/smpl.gif -------------------------------------------------------------------------------- /configs/multi.yml: -------------------------------------------------------------------------------- 1 | # Config to generate multi mesh results fast 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: A cactus on the sand in the desert # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior
(code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 2000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 15 # How many images of shape are rendered at one epoch 34 | train_res: 356 # Resolution of render before scaling to 224x224 35 | resize_method: cubic # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 512 # Resolution of texture maps (ex: 512x512) 38 | channels: 4 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.90 # Initial alpha channel value if channels == 4 40 | kernel_size: 7 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 5.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 12.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 90.0 # Maximum elevation angle 60 | azim_min: 0.0 # Minimum azimuth angle 61 | azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 10.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/sphere.obj 79 | - primitives/plane.obj 80 | 81 | ## Unit scale the meshes? 82 | unit: 83 | - true 84 | - false 85 | 86 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 
87 | train_mesh_idx: 88 | - [verts, texture, normal, true] 89 | - [texture, normal, true] 90 | 91 | ## Scale mesh size by some value 92 | scales: 93 | - 0.5 94 | - 3.0 95 | 96 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 97 | offsets: 98 | - [0.0, 0.0, 0.0] 99 | - [0.0, -0.5, 0.0] -------------------------------------------------------------------------------- /configs/paper.yml: -------------------------------------------------------------------------------- 1 | # This config was used for figures in the paper 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: Pyramid of Giza # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior (code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 5000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 25 # How many images of shape are rendered at one epoch 34 | train_res: 512 # Resolution of render before scaling to 224x224 35 | resize_method: lanczos2 # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 512 # Resolution of texture maps (ex: 512x512) 38 | channels: 4 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.85 # Initial alpha channel value if channels == 4 40 | kernel_size: 5 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 5.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 12.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 90.0 # Maximum elevation 
angle 60 | azim_min: 0.0 # Minimum azimuth angle 61 | azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 10.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/sphere.obj 79 | 80 | ## Unit scale the meshes? 81 | unit: 82 | - true 83 | 84 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 85 | train_mesh_idx: 86 | - [verts, texture, normal, true] 87 | 88 | ## Scale mesh size by some value 89 | scales: 90 | - 1.0 91 | 92 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 93 | offsets: 94 | - [0.0, 0.0, 0.0] -------------------------------------------------------------------------------- /configs/single.yml: -------------------------------------------------------------------------------- 1 | # Config to generate single meshes faster than paper config 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: A wooden brown table # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior (code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 2000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 25 # How many images of shape are rendered at one epoch 34 | train_res: 356 # Resolution of render before scaling to 224x224 35 | resize_method: cubic # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 512 # Resolution of texture maps (ex: 512x512) 38 | channels: 4 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.85 # Initial alpha channel value if channels == 4 40 | kernel_size: 7 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 
0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 5.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 8.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 60.0 # Maximum elevation angle 60 | azim_min: -360.0 # Minimum azimuth angle 61 | azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 8.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/sphere.obj 79 | 80 | ## Unit scale the meshes? 81 | unit: 82 | - true 83 | 84 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 
85 | train_mesh_idx: 86 | - [verts, texture, normal, true] 87 | 88 | ## Scale mesh size by some value 89 | scales: 90 | - 1.0 91 | 92 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 93 | offsets: 94 | - [0.0, 0.0, 0.0] -------------------------------------------------------------------------------- /configs/style.yml: -------------------------------------------------------------------------------- 1 | # Config to generate single meshes faster than paper config 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: A brown and white cow # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior (code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 2000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 25 # How many images of shape are rendered at one epoch 34 | train_res: 356 # Resolution of render before scaling to 224x224 35 | resize_method: cubic # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 1024 # Resolution of texture maps (ex: 512x512) 38 | channels: 3 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.95 # Initial alpha channel value if channels == 4 40 | kernel_size: 7 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 3.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 5.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 90.0 # Maximum elevation angle 60 | azim_min: -360.0 # Minimum azimuth angle 61 
| azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 8.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/spot.obj 79 | 80 | ## Unit scale the meshes? 81 | unit: 82 | - true 83 | 84 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 85 | train_mesh_idx: 86 | - [texture, normal, true] 87 | 88 | ## Scale mesh size by some value 89 | scales: 90 | - 1.0 91 | 92 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 93 | offsets: 94 | - [0.0, 0.0, 0.0] -------------------------------------------------------------------------------- /loop.py: -------------------------------------------------------------------------------- 1 | # Main optimization loop, takes in dictionary config 2 | # and performs optimization as highlighted in paper 3 | 4 | import os 5 | import clip 6 | import yaml 7 | import torch 8 | import kornia 9 | import torchvision 10 | 11 | import numpy as np 12 | import nvdiffrast.torch as dr 13 | import matplotlib.pyplot as plt 14 | 15 | from tqdm import tqdm 16 | from datetime import datetime 17 | from dalle2_pytorch import DiffusionPrior, DiffusionPriorNetwork 18 | 19 | from PIL import Image 20 | from utils.video import Video 21 | from utils.limit_subdivide import LimitSubdivide 22 | from utils.helpers import cosine_avg, create_scene 23 | from utils.camera import CameraBatch, get_camera_params 24 | from utils.resize_right import resize, cubic, linear, lanczos2, lanczos3 25 | 26 | from nvdiffmodeling.src import obj 27 | from nvdiffmodeling.src import util 28 | from nvdiffmodeling.src import mesh 29 | from nvdiffmodeling.src import render 30 | from nvdiffmodeling.src import texture 31 | from nvdiffmodeling.src import regularizer 32 | 33 | def loop(cfg): 34 | 35 | # Set unique output path 36 | now = datetime.now() 37 | cfg["path"] = os.path.join( 38 | cfg["output_path"], 39 | now.strftime("%m-%d-%Y_%H-%M-%S") + cfg["text_prompt"] 40 | ) 41 | 42 | cfg['path'] = cfg['path'].replace(" ", "_") 43 | os.makedirs(cfg['path']) 44 | 45 | with open(os.path.join(cfg["path"], "config.yml"), 'w') as outfile: 46 | yaml.dump(cfg, outfile, default_flow_style=False) 47 | 48 | print("Result directory '%s' created" % cfg["path"]) 49 | 50 | # Get CUDA device 51 | device = torch.device("cuda:" + cfg["gpu"]) 52 | torch.cuda.set_device(device) 53 | 54 | # Initialize CLIP model 55 | model, _ = clip.load(cfg["clip_model"], device=device) 56 | 57 | clip_mean = torch.tensor([0.48154660, 0.45782750, 0.40821073], device=device) 58 | clip_std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=device) 59 | 60 | # Initialize Video 61 | video = Video(cfg["path"]) 62 | 63 | # Intialize GL Context 64 | glctx = dr.RasterizeGLContext() 65 | 66 | # Get text embedding 67 | print("Text is %s" % cfg["text_prompt"]) 68 | 69 | texts_embeds = clip.tokenize([cfg["text_prompt"]]).to(device) 70 | with torch.no_grad(): 71 | texts_embeds = model.encode_text(texts_embeds).detach() 72 | 
texts_embeds = texts_embeds / texts_embeds.norm(dim=1, keepdim=True) 73 | 74 | # Setup Prior model & get image prior (text embed -> image embed) 75 | if cfg["prior_path"] is not None: 76 | 77 | state_dict = torch.load(cfg["prior_path"], map_location=device)["model"] 78 | 79 | prior_network = DiffusionPriorNetwork( 80 | dim=cfg["diffusion_prior_network_dim"], 81 | depth=cfg["diffusion_prior_network_depth"], 82 | dim_head=cfg["diffusion_prior_network_dim_head"], 83 | heads=cfg["diffusion_prior_network_heads"], 84 | normformer=cfg["diffusion_prior_network_normformer"] 85 | ).to(device) 86 | 87 | diffusion_prior = DiffusionPrior( 88 | net=prior_network, 89 | clip=None, 90 | image_embed_dim=cfg["diffusion_prior_embed_dim"], 91 | timesteps=cfg["diffusion_prior_timesteps"], 92 | cond_drop_prob=cfg["diffusion_prior_cond_drop_prob"], 93 | loss_type=cfg["diffusion_prior_loss_type"], 94 | condition_on_text_encodings=cfg["diffusion_prior_condition_on_text_encodings"] 95 | ).to(device) 96 | 97 | diffusion_prior.load_state_dict(state_dict, strict=True) 98 | 99 | text_cond = dict(text_embed = texts_embeds) 100 | prior_embeds = diffusion_prior.p_sample_loop((1, 512), text_cond = text_cond) 101 | 102 | prior_embeds = prior_embeds.detach().clone().to(device) 103 | 104 | del prior_network, diffusion_prior, state_dict 105 | torch.cuda.empty_cache() 106 | 107 | # Load all meshes and setup training parameters 108 | meshes = [] # store Mesh objects 109 | subdiv = [] # store per mesh limit subdivison 110 | train_params = [] # store all trainable paramters 111 | vert_train = False 112 | 113 | for idx, m in enumerate(cfg["meshes"]): # Loop over each mesh path 114 | 115 | load_mesh = obj.load_obj(m) 116 | 117 | if cfg["unit"][idx]: # If mesh is to be unit sized 118 | load_mesh = mesh.unit_size(load_mesh) 119 | 120 | # Scale vertices by factors provided and then offset by offsets provided 121 | v_pos = torch.tensor(cfg["scales"][idx]).to(load_mesh.v_pos.device) * load_mesh.v_pos.clone().detach() 122 | v_pos = torch.tensor(cfg["offsets"][idx]).to(v_pos.device) + v_pos.clone().detach() 123 | 124 | # Final mesh after all adjustments 125 | load_mesh = mesh.Mesh(v_pos, base=load_mesh) 126 | 127 | # If true is in train_mesh_idx[mesh_idx] then we initialize 128 | # all textures else we start with textures already on mesh 129 | if True in cfg["train_mesh_idx"][idx]: 130 | 131 | # vertices 132 | vertices = load_mesh.v_pos.clone().detach().requires_grad_(True) 133 | 134 | # faces 135 | faces = load_mesh.t_pos_idx.clone().detach() 136 | 137 | # texture map 138 | texture_map = texture.create_trainable(np.random.uniform(size=[cfg["texture_resolution"]]*2 + [cfg["channels"]], low=0.0, high=1.0), [cfg["texture_resolution"]]*2, True) 139 | 140 | # normal map 141 | normal_map = texture.create_trainable(np.array([0, 0, 1]), [cfg["texture_resolution"]]*2, True) 142 | 143 | # specular map 144 | specular_map = texture.create_trainable(np.array([0, 0, 0]), [cfg["texture_resolution"]]*2, True) 145 | 146 | else: 147 | 148 | # vertices 149 | vertices = load_mesh.v_pos.clone().detach().requires_grad_(True) 150 | 151 | # faces 152 | faces = load_mesh.t_pos_idx.clone().detach() 153 | 154 | # get existing texture and specular maps 155 | kd_ = load_mesh.material['kd'].data.permute(0, 3, 1, 2) 156 | ks_ = load_mesh.material['ks'].data.permute(0, 3, 1, 2) 157 | 158 | # if there is a normal map load it or initial a plain one 159 | try: 160 | nrml_ = load_mesh.material['normal'].data.permute(0, 3, 1, 2) 161 | except: 162 | nrml_ = torch.zeros( (1, 3, 
cfg["texture_resolution"], cfg["texture_resolution"]) ).to(device) 163 | nrml_[:, 2, :, :] = 1.0 164 | 165 | # convert all texture maps to trainable tensors 166 | texture_map = texture.create_trainable( resize(kd_, out_shape=(cfg["texture_resolution"], cfg["texture_resolution"])).permute(0, 2, 3, 1), [cfg["texture_resolution"]]*2, True) 167 | specular_map = texture.create_trainable( resize(ks_, out_shape=(cfg["texture_resolution"], cfg["texture_resolution"])).permute(0, 2, 3, 1), [cfg["texture_resolution"]]*2, True) 168 | normal_map = texture.create_trainable( resize(nrml_, out_shape=(cfg["texture_resolution"], cfg["texture_resolution"])).permute(0, 2, 3, 1), [cfg["texture_resolution"]]*2, True) 169 | 170 | # Training parameters 171 | if "verts" in cfg["train_mesh_idx"][idx]: 172 | train_params += [vertices] 173 | vert_train = True 174 | if "texture" in cfg["train_mesh_idx"][idx]: 175 | train_params += texture_map.getMips() 176 | if "normal" in cfg["train_mesh_idx"][idx]: 177 | train_params += normal_map.getMips() 178 | if "specular" in cfg["train_mesh_idx"][idx]: 179 | train_params += specular_map.getMips() 180 | 181 | # Create final mesh with all textures 182 | load_mesh = mesh.Mesh( 183 | vertices, 184 | faces, 185 | material={ 186 | 'bsdf': cfg['bsdf'], 187 | 'kd': texture_map, 188 | 'ks': specular_map, 189 | 'normal': normal_map, 190 | }, 191 | base=load_mesh # Get UVs from original loaded mesh 192 | ) 193 | meshes.append( load_mesh ) 194 | 195 | # Create limit subdivision class for mesh 196 | if "verts" in cfg["train_mesh_idx"][idx]: 197 | subdiv.append( LimitSubdivide( 198 | load_mesh.v_pos.clone().detach(), 199 | load_mesh.t_pos_idx.clone().detach(), 200 | ) ) 201 | else: 202 | subdiv.append( None ) 203 | 204 | # Optimizer and Scheduler 205 | optimizer = torch.optim.Adam(train_params, lr=cfg["lr"]) 206 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: max(0.0, 10**(-x*0.0002))) 207 | 208 | # Dataset to get random camera parameters 209 | cams_data = CameraBatch( 210 | cfg["train_res"], 211 | [cfg["dist_min"], cfg["dist_max"]], 212 | [cfg["azim_min"], cfg["azim_max"]], 213 | [cfg["elev_alpha"], cfg["elev_beta"], cfg["elev_max"]], 214 | [cfg["fov_min"], cfg["fov_max"]], 215 | cfg["aug_loc"], 216 | cfg["aug_light"], 217 | cfg["aug_bkg"], 218 | cfg["batch_size"] 219 | ) 220 | 221 | cams = torch.utils.data.DataLoader( 222 | cams_data, 223 | cfg["batch_size"], 224 | num_workers=0, 225 | pin_memory=True 226 | ) 227 | 228 | # Optimization Loop 229 | rot_ang = 0.0 230 | t_loop = tqdm(range(cfg["epochs"]), leave=False) 231 | 232 | for it in t_loop: 233 | 234 | render_meshes = [] # store meshes with texture that will be rendered 235 | render_meshes_notex = [] # store meshes without texture that will be rendered 236 | 237 | lapl_funcs = [] # store laplacian for each mesh 238 | 239 | # For each mesh initialized 240 | for i, m in enumerate(meshes): 241 | 242 | # Limit subdivide vertices if needed 243 | if subdiv[i] != None: 244 | 245 | n_vert = subdiv[i].get_limit( 246 | m.v_pos.to('cpu').double() 247 | ).to(device) 248 | 249 | else: 250 | 251 | n_vert = m.v_pos 252 | 253 | # Low pass filter for textures 254 | ready_texture = texture.Texture2D( 255 | kornia.filters.gaussian_blur2d( 256 | m.material['kd'].data.permute(0, 3, 1, 2), 257 | kernel_size=(cfg["kernel_size"], cfg["kernel_size"]), 258 | sigma=(cfg["blur_sigma"], cfg["blur_sigma"]), 259 | ).permute(0, 2, 3, 1).contiguous() 260 | ) 261 | 262 | ready_specular = texture.Texture2D( 263 | 
kornia.filters.gaussian_blur2d( 264 | m.material['ks'].data.permute(0, 3, 1, 2), 265 | kernel_size=(cfg["kernel_size"], cfg["kernel_size"]), 266 | sigma=(cfg["blur_sigma"], cfg["blur_sigma"]), 267 | ).permute(0, 2, 3, 1).contiguous() 268 | ) 269 | 270 | ready_normal = texture.Texture2D( 271 | kornia.filters.gaussian_blur2d( 272 | m.material['normal'].data.permute(0, 3, 1, 2), 273 | kernel_size=(cfg["kernel_size"], cfg["kernel_size"]), 274 | sigma=(cfg["blur_sigma"], cfg["blur_sigma"]), 275 | ).permute(0, 2, 3, 1).contiguous() 276 | ) 277 | 278 | # Final mesh with vertices and textures 279 | load_mesh = mesh.Mesh( 280 | n_vert, 281 | m.t_pos_idx, 282 | material={ 283 | 'bsdf': cfg['bsdf'], 284 | 'kd': ready_texture, 285 | 'ks': ready_specular, 286 | 'normal': ready_normal, 287 | }, 288 | base=m # gets uvs etc from here 289 | ) 290 | 291 | if it < cfg["epochs"] * cfg["shape_imgs_frac"] and vert_train: 292 | 293 | # Initialize the no texture mesh 294 | kd_notex = torch.full_like( ready_texture.data, 0.5) 295 | 296 | if kd_notex.shape[-1] == 4: 297 | kd_notex[:, :, :, 3] = 1.0 298 | 299 | load_mesh_notex = mesh.Mesh( 300 | n_vert, 301 | m.t_pos_idx, 302 | material={ 303 | 'bsdf': cfg['bsdf'], 304 | 'kd': kd_notex, 305 | 'ks': ready_specular, 306 | 'normal': ready_normal, 307 | }, 308 | base=m # gets uvs etc from here 309 | ) 310 | 311 | render_meshes_notex.append(load_mesh_notex.eval()) 312 | 313 | 314 | render_meshes.append(load_mesh.eval()) 315 | 316 | if subdiv[i] != None: 317 | lapl_funcs.append(regularizer.laplace_regularizer_const(m)) 318 | else: 319 | lapl_funcs.append(None) 320 | 321 | # Create a scene with the textures and another without textures 322 | complete_scene = create_scene(render_meshes, sz=cfg["texture_resolution"]) 323 | complete_scene = mesh.auto_normals(complete_scene) 324 | complete_scene = mesh.compute_tangents(complete_scene) 325 | 326 | if it < cfg["epochs"] * cfg["shape_imgs_frac"] and vert_train: 327 | complete_scene_notex = create_scene(render_meshes_notex, sz=cfg["texture_resolution"]) 328 | complete_scene_notex = mesh.auto_normals(complete_scene_notex) 329 | complete_scene_notex = mesh.compute_tangents(complete_scene_notex) 330 | 331 | # Logging 332 | if it % cfg["log_interval"] == 0: 333 | 334 | with torch.no_grad(): 335 | 336 | params = get_camera_params( 337 | cfg["log_elev"], 338 | rot_ang, 339 | cfg["log_dist"], 340 | cfg["log_res"], 341 | cfg["log_fov"] 342 | ) 343 | 344 | rot_ang += 1 345 | 346 | log_image = render.render_mesh( 347 | glctx, 348 | complete_scene.eval(params), 349 | params['mvp'], 350 | params['campos'], 351 | params['lightpos'], 352 | cfg["log_light_power"], 353 | cfg["log_res"], 354 | num_layers=cfg["layers"], 355 | background=torch.ones(1, cfg["log_res"], cfg["log_res"], 3).to(device) 356 | ) 357 | 358 | log_image = video.ready_image(log_image) 359 | 360 | 361 | # Render scene for training 362 | params_camera = next(iter(cams)) 363 | 364 | for key in params_camera: 365 | params_camera[key] = params_camera[key].to(device) 366 | 367 | # Render with and without texture to enable shape growth 368 | if it < cfg["epochs"] * cfg["shape_imgs_frac"] and vert_train: 369 | 370 | with_tex = cfg["batch_size"] // 2 371 | 372 | with_tex_params = { 373 | 'mvp': params_camera['mvp'][:with_tex], 374 | 'lightpos': params_camera['lightpos'][:with_tex], 375 | 'campos': params_camera['campos'][:with_tex], 376 | 'resolution': [cfg["train_res"], cfg["train_res"]] 377 | } 378 | 379 | no_tex_params = { 380 | 'mvp': params_camera['mvp'][with_tex:], 381 | 
'lightpos': params_camera['lightpos'][with_tex:], 382 | 'campos': params_camera['campos'][with_tex:], 383 | 'resolution': [cfg["train_res"], cfg["train_res"]] 384 | } 385 | 386 | with_tex_train_render = render.render_mesh( 387 | glctx, 388 | complete_scene.eval(with_tex_params), 389 | with_tex_params["mvp"], 390 | with_tex_params["campos"], 391 | with_tex_params["lightpos"], 392 | cfg["light_power"], 393 | cfg["train_res"], 394 | spp=1, # no upscale here / render at any resolution then use resize_right to downscale 395 | num_layers=cfg["layers"], 396 | msaa=False, 397 | background=params_camera["bkgs"][:with_tex], 398 | ).permute(0, 3, 1, 2) # switch to B, C, H, W 399 | 400 | no_tex_train_render = render.render_mesh( 401 | glctx, 402 | complete_scene_notex.eval(no_tex_params), 403 | no_tex_params["mvp"], 404 | no_tex_params["campos"], 405 | no_tex_params["lightpos"], 406 | cfg["light_power"], 407 | cfg["train_res"], 408 | spp=1, # no upscale here / render at any resolution then use resize_right to downscale 409 | num_layers=1, 410 | msaa=False, 411 | background=params_camera["bkgs"][with_tex:], 412 | ).permute(0, 3, 1, 2) # switch to B, C, H, W 413 | 414 | train_render = torch.cat([ 415 | with_tex_train_render, 416 | no_tex_train_render 417 | ]) 418 | 419 | # Render with only textured meshes 420 | else: 421 | 422 | params = { 423 | 'mvp': params_camera['mvp'], 424 | 'lightpos': params_camera['lightpos'], 425 | 'campos': params_camera['campos'], 426 | 'resolution': [cfg["train_res"], cfg["train_res"]] 427 | } 428 | 429 | train_render = render.render_mesh( 430 | glctx, 431 | complete_scene.eval(params), 432 | params["mvp"], 433 | params["campos"], 434 | params["lightpos"], 435 | cfg["light_power"], 436 | cfg["train_res"], 437 | spp=1, # no upscale here / render at any resolution then use resize_right to downscale 438 | num_layers=cfg["layers"], 439 | msaa=False, 440 | background=params_camera["bkgs"], 441 | ).permute(0, 3, 1, 2) # switch to B, C, H, W 442 | 443 | # resize to CLIP input size: cubic, linear, lanczos2, lanczos3 444 | if cfg["resize_method"] == "cubic": 445 | 446 | train_render = resize( 447 | train_render, 448 | out_shape=(224, 224), # resize to clip 449 | interp_method=cubic 450 | ) 451 | 452 | elif cfg["resize_method"] == "linear": 453 | 454 | train_render = resize( 455 | train_render, 456 | out_shape=(224, 224), # resize to clip 457 | interp_method=linear 458 | ) 459 | 460 | elif cfg["resize_method"] == "lanczos2": 461 | 462 | train_render = resize( 463 | train_render, 464 | out_shape=(224, 224), # resize to clip 465 | interp_method=lanczos2 466 | ) 467 | elif cfg["resize_method"] == "lanczos3": 468 | 469 | train_render = resize( 470 | train_render, 471 | out_shape=(224, 224), # resize to clip 472 | interp_method=lanczos3 473 | ) 474 | 475 | # Log renders 476 | if it % cfg["log_interval_im"] == 0: 477 | 478 | s_log = train_render[torch.randint(low=0, high=cfg["batch_size"], size=(5 if cfg["batch_size"] > 5 else cfg["batch_size"], )) , :, :, :] 479 | 480 | # Source code of save_image 481 | s_log = torchvision.utils.make_grid(s_log) 482 | 483 | # Add 0.5 after unnormalizing to [0, 255] to round to nearest integer 484 | ndarr = s_log.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() 485 | im = Image.fromarray(ndarr) 486 | 487 | if cfg["colab"]: 488 | plt.figure() 489 | plt.imshow(ndarr) 490 | plt.show() 491 | 492 | im.save(os.path.join(cfg["path"], 'epoch_%d.png' % it)) 493 | 494 | # Convert image to image embeddings 495 | image_embeds = 
model.encode_image( 496 | (train_render - clip_mean[None, :, None, None]) / clip_std[None, :, None, None] 497 | ) 498 | image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True) 499 | 500 | # Get loss between text embeds and image embeds 501 | clip_loss = cosine_avg(image_embeds, texts_embeds) 502 | 503 | # Get loss between image prior embedding and image embeds 504 | if cfg["prior_path"] is not None: 505 | prior_loss = cosine_avg(image_embeds, prior_embeds) 506 | 507 | # Evaluate laplacian for each mesh in scene to be deformed 508 | lapls = [] 509 | lapls_l = 0 510 | for fn_l in lapl_funcs: 511 | if fn_l is not None: 512 | lapls.append(fn_l.eval(params)) 513 | 514 | # Laplace loss weighting 515 | if it == 0: 516 | laplacian_weight = cfg["laplacian_weight"] 517 | laplacian_min = cfg["laplacian_min"] 518 | else: 519 | laplacian_weight = (laplacian_weight - laplacian_min) * 10**(-it*0.000001) + laplacian_min 520 | 521 | for lap_l in lapls: 522 | lapls_l += (laplacian_weight * lap_l) 523 | 524 | # Get total loss and backprop 525 | if cfg["prior_path"] is not None: 526 | total_loss = (cfg["clip_weight"] * clip_loss) + (cfg["diff_loss_weight"] * prior_loss) + lapls_l 527 | else: 528 | total_loss = (cfg["clip_weight"] * clip_loss) + lapls_l 529 | 530 | optimizer.zero_grad() 531 | total_loss.backward() 532 | optimizer.step() 533 | scheduler.step() 534 | 535 | normal_map.clamp_(min=-1, max=1) 536 | specular_map.clamp_(min=0, max=1) 537 | texture_map.clamp_(min=0, max=1) 538 | 539 | t_loop.set_description("CLIP Loss = %.6f" % clip_loss.item() ) 540 | 541 | video.close() 542 | 543 | for idx, m in enumerate(render_meshes): 544 | out_path = os.path.join( cfg["path"], "meshes", "mesh_%d" % idx ) 545 | os.makedirs(out_path) 546 | 547 | obj.write_obj( 548 | out_path, 549 | m 550 | ) 551 | 552 | return cfg["path"] 553 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # This file reads a .yml config and if any command line arguments 2 | # are passed it overrides the config with them. It then validates 3 | # the config file, sets seed and sends config to the main opt 4 | # loop where all the magic happens. 
The loop returns the final 5 | # mesh file for saving to disk 6 | 7 | import os 8 | import yaml 9 | import torch 10 | import random 11 | import argparse 12 | import numpy as np 13 | 14 | from loop import loop 15 | 16 | def main(): 17 | 18 | # Command Line Arguments 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--config', help='Path to config file', type=str, required=True) 21 | 22 | # Basic 23 | parser.add_argument('--output_path', help='Where to store output files', type=str, default=argparse.SUPPRESS) 24 | parser.add_argument('--gpu', help='GPU index', type=str, default=argparse.SUPPRESS) 25 | parser.add_argument('--seed', help='Seed for reproducibility', type=int, default=argparse.SUPPRESS) 26 | 27 | # CLIP Related 28 | parser.add_argument('--text_prompt', help='Text prompt for mesh generation', type=str, default=argparse.SUPPRESS) 29 | parser.add_argument('--clip_model', help='CLIP Model size', type=str, default=argparse.SUPPRESS) 30 | 31 | # Text-Image Prior Related 32 | parser.add_argument('--prior_path', help='Path to weights for the prior network, not used if left blank', type=str, default=argparse.SUPPRESS) 33 | 34 | ## Parameters for diffusion prior network (code by lucidrains) 35 | parser.add_argument('--diffusion_prior_network_dim', help='Diffusion Prior Network - Dimension', type=int, default=argparse.SUPPRESS) 36 | parser.add_argument('--diffusion_prior_network_depth', help='Diffusion Prior Network - Depth', type=int, default=argparse.SUPPRESS) 37 | parser.add_argument('--diffusion_prior_network_dim_head', help='Diffusion Prior Network - Head Dimension', type=int, default=argparse.SUPPRESS) 38 | parser.add_argument('--diffusion_prior_network_heads', help='Diffusion Prior Network - # of Heads', type=int, default=argparse.SUPPRESS) 39 | parser.add_argument('--diffusion_prior_network_normformer', help='Diffusion Prior Network - Normformer?', type=bool, default=argparse.SUPPRESS) 40 | 41 | ## Parameters for diffusion prior (code by lucidrains) 42 | parser.add_argument('--diffusion_prior_embed_dim', help='Diffusion Prior Network - Embedding Dimension', type=int, default=argparse.SUPPRESS) 43 | parser.add_argument('--diffusion_prior_timesteps', help='Diffusion Prior Network - Timesteps', type=int, default=argparse.SUPPRESS) 44 | parser.add_argument('--diffusion_prior_cond_drop_prob', help='Diffusion Prior Network - Conditional Drop Probability', type=float, default=argparse.SUPPRESS) 45 | parser.add_argument('--diffusion_prior_loss_type', help='Diffusion Prior Network - Loss Type', type=str, default=argparse.SUPPRESS) 46 | parser.add_argument('--diffusion_prior_condition_on_text_encodings', help='Diffusion Prior Network - Condition Prior on Text Encodings?', type=bool, default=argparse.SUPPRESS) 47 | 48 | # Parameters 49 | parser.add_argument('--epochs', help='Number of optimization steps', type=int, default=argparse.SUPPRESS) 50 | parser.add_argument('--lr', help='Maximum learning rate', type=float, default=argparse.SUPPRESS) 51 | parser.add_argument('--batch_size', help='Number of images rendered at the same time', type=int, default=argparse.SUPPRESS) 52 | parser.add_argument('--train_res', help='Resolution of render before downscaling to CLIP size', type=int, default=argparse.SUPPRESS) 53 | parser.add_argument('--resize_method', help='Image downsampling/upsampling method', type=str, default=argparse.SUPPRESS, choices=["cubic", "linear", "lanczos2", "lanczos3"]) 54 | parser.add_argument('--bsdf', help='Render technique', type=str, default=argparse.SUPPRESS, 
choices=["diffuse", "pbr"]) 55 | parser.add_argument('--texture_resolution', help='Resolution of texture maps (ex: 512 -> 512x512)', type=int, default=argparse.SUPPRESS) 56 | parser.add_argument('--channels', help='Texture map image channels (4 for alpha, 3 for RGB only)', type=int, default=argparse.SUPPRESS, choices=[3, 4]) 57 | parser.add_argument('--init_c', help='Initial alpha channel value if channels == 4', type=float, default=argparse.SUPPRESS) 58 | parser.add_argument('--kernel_size', help='Kernel size for gaussian blurring of textures to reduce artifacts', type=int, default=argparse.SUPPRESS) 59 | parser.add_argument('--blur_sigma', help='Variance of gaussian kernel for blurring of textures', type=int, default=argparse.SUPPRESS) 60 | parser.add_argument('--shape_imgs_frac', help='What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done', type=float, default=argparse.SUPPRESS) 61 | parser.add_argument('--aug_light', help='Augment the direction of light around the camera', type=bool, default=argparse.SUPPRESS) 62 | parser.add_argument('--aug_bkg', help='Augment the background', type=bool, default=argparse.SUPPRESS) 63 | parser.add_argument('--diff_loss_weight', help='Weight of Diffusion prior loss', type=float, default=argparse.SUPPRESS) 64 | parser.add_argument('--clip_weight', help='Weight of CLIP Text loss', type=float, default=argparse.SUPPRESS) 65 | parser.add_argument('--laplacian_weight', help='Initial uniform laplacian weight', type=float, default=argparse.SUPPRESS) 66 | parser.add_argument('--laplacian_min', help='Minimum uniform laplacian weight (set to 2% of max usually)', type=float, default=argparse.SUPPRESS) 67 | 68 | # Camera Parameters 69 | parser.add_argument('--fov_min', help='Minimum camera field of view angle during renders', type=float, default=argparse.SUPPRESS) 70 | parser.add_argument('--fov_max', help='Maximum camera field of view angle during renders', type=float, default=argparse.SUPPRESS) 71 | parser.add_argument('--dist_min', help='Minimum distance of camera from mesh during renders', type=float, default=argparse.SUPPRESS) 72 | parser.add_argument('--dist_max', help='Maximum distance of camera from mesh during renders', type=float, default=argparse.SUPPRESS) 73 | parser.add_argument('--light_power', help='Light intensity', type=float, default=argparse.SUPPRESS) 74 | parser.add_argument('--elev_alpha', help='Alpha parameter for Beta distribution for elevation sampling', type=float, default=argparse.SUPPRESS) 75 | parser.add_argument('--elev_beta', help='Beta parameter for Beta distribution for elevation sampling', type=float, default=argparse.SUPPRESS) 76 | parser.add_argument('--elev_max', help='Maximum elevation angle in degree', type=float, default=argparse.SUPPRESS) 77 | parser.add_argument('--azim_min', help='Minimum azimuth angle in degree', type=float, default=argparse.SUPPRESS) 78 | parser.add_argument('--azim_max', help='Maximum azimuth angle in degree', type=float, default=argparse.SUPPRESS) 79 | parser.add_argument('--aug_loc', help='Offset mesh from center of image?', type=bool, default=argparse.SUPPRESS) 80 | 81 | # Logging Parameters 82 | parser.add_argument('--log_interval', help='Interval for logging, every X epochs', type=int, default=argparse.SUPPRESS) 83 | parser.add_argument('--log_interval_im', help='Interval for logging renders image, every X epochs', type=int, default=argparse.SUPPRESS) 84 | parser.add_argument('--log_elev', help='Logging elevation angle', 
type=float, default=argparse.SUPPRESS) 85 | parser.add_argument('--log_fov', help='Logging field of view', type=float, default=argparse.SUPPRESS) 86 | parser.add_argument('--log_dist', help='Logging distance from object', type=float, default=argparse.SUPPRESS) 87 | parser.add_argument('--log_res', help='Logging render resolution', type=int, default=argparse.SUPPRESS) 88 | parser.add_argument('--log_light_power', help='Light intensity for logging', type=float, default=argparse.SUPPRESS) 89 | 90 | # Mesh Parameters 91 | parser.add_argument('--meshes', help="Path to all meshes in scene", nargs='+', default=argparse.SUPPRESS, type=str) 92 | parser.add_argument('--unit', help="Should mesh be unit scaled? True/False for each mesh in meshes", nargs='+', default=argparse.SUPPRESS, type=bool) 93 | parser.add_argument('--train_mesh_idx', help="What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true/false for limit subdivide) ?", nargs='+', action='append', default=argparse.SUPPRESS) 94 | parser.add_argument('--scales', help="Scale mesh size by some value", nargs='+', default=argparse.SUPPRESS, type=float) 95 | parser.add_argument('--offsets', help="After scaling (x, y, z) offset vertices", nargs='+', action='append', type=float, default=argparse.SUPPRESS) 96 | 97 | args = vars(parser.parse_args()) 98 | 99 | # Check if config passed - if so then parse it 100 | if args['config'] is not None: 101 | with open(args['config'], "r") as stream: 102 | try: 103 | cfg = yaml.safe_load(stream) 104 | except yaml.YAMLError as exc: 105 | print(exc) 106 | else: 107 | raise ValueError("No config passed!") 108 | 109 | # Override YAML with CL args 110 | for key in args: 111 | cfg[key] = args[key] 112 | 113 | # Config validation 114 | lists = ["meshes", "unit", "train_mesh_idx", "scales", "offsets", "prior_path"] 115 | for item in parser._actions[1:]: 116 | if item.type != type(cfg[ item.dest ]) and item.dest not in lists: 117 | raise ValueError("%s is not of type %s" % (item.dest, item.type) ) 118 | 119 | if not( len(cfg["meshes"]) == len(cfg["unit"]) == len(cfg["train_mesh_idx"]) == len(cfg["scales"]) == len(cfg["offsets"])): 120 | raise ValueError("Unit, train_mesh_idx, scales and offsets are not specified for each mesh OR there is an extra item in some list. 
Ensure all are the same length") 121 | 122 | print(yaml.dump(cfg, default_flow_style=False)) 123 | 124 | # Set seed 125 | random.seed(cfg["seed"]) 126 | os.environ['PYTHONHASHSEED'] = str(cfg["seed"]) 127 | np.random.seed(cfg["seed"]) 128 | torch.manual_seed(cfg["seed"]) 129 | torch.cuda.manual_seed(cfg["seed"]) 130 | torch.backends.cudnn.deterministic = True 131 | 132 | loop(cfg) 133 | 134 | if __name__ == '__main__': 135 | main() -------------------------------------------------------------------------------- /primitives/plane.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 1 3 | 4 | newmtl None 5 | Ns 500 6 | Ka 0.8 0.8 0.8 7 | Kd 0.8 0.8 0.8 8 | Ks 0.8 0.8 0.8 9 | d 1 10 | illum 2 11 | -------------------------------------------------------------------------------- /primitives/plane.obj: -------------------------------------------------------------------------------- 1 | # Blender v3.2.2 OBJ File: '' 2 | # www.blender.org 3 | mtllib plane.mtl 4 | o Plane 5 | v -1.000000 0.000000 1.000000 6 | v 1.000000 0.000000 1.000000 7 | v -1.000000 0.000000 -1.000000 8 | v 1.000000 0.000000 -1.000000 9 | vt 0.000000 0.000000 10 | vt 1.000000 0.000000 11 | vt 1.000000 1.000000 12 | vt 0.000000 1.000000 13 | vn 0.0000 1.0000 0.0000 14 | g Plane_Plane_None 15 | usemtl None 16 | s off 17 | f 1/1/1 2/2/1 4/3/1 3/4/1 18 | -------------------------------------------------------------------------------- /primitives/sphere.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 1 3 | 4 | newmtl None 5 | Ns 500 6 | Ka 0.8 0.8 0.8 7 | Kd 0.8 0.8 0.8 8 | Ks 0.8 0.8 0.8 9 | d 1 10 | illum 2 11 | -------------------------------------------------------------------------------- /primitives/spot.mtl: -------------------------------------------------------------------------------- 1 | 2 | # Blender MTL File: 'None' 3 | # Material Count: 1 4 | 5 | newmtl Default_OBJ 6 | Ns 250.000000 7 | Ka 1.000000 1.000000 1.000000 8 | Kd 0.800000 0.800000 0.800000 9 | Ks 0.500000 0.500000 0.500000 10 | Ke 0.000000 0.000000 0.000000 11 | Ni 1.450000 12 | d 1.000000 13 | illum 2 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | clip @ git+https://github.com/openai/CLIP.git@b46f5ac7587d2e1862f8b7b1573179d80dcdd620 2 | imageio 3 | cython 4 | imageio-ffmpeg 5 | kornia 6 | numpy 7 | nvdiffrast @ git+https://github.com/NVlabs/nvdiffrast.git@78528e683210dfaa1be57e3c65aa37d3b36c6644 8 | Pillow 9 | PyGLM 10 | resize-right 11 | scipy 12 | smplx 13 | torch==1.11.0 14 | torchvision==0.12.0 15 | tqdm 16 | Ninja 17 | pyyaml 18 | matplotlib>=3.3.0 -------------------------------------------------------------------------------- /utils/camera.py: -------------------------------------------------------------------------------- 1 | import glm 2 | import torch 3 | import random 4 | 5 | import numpy as np 6 | import torchvision.transforms as transforms 7 | 8 | from .resize_right import resize 9 | 10 | blurs = [ 11 | transforms.Compose([ 12 | transforms.GaussianBlur(11, sigma=(5, 5)) 13 | ]), 14 | transforms.Compose([ 15 | transforms.GaussianBlur(11, sigma=(2, 2)) 16 | ]), 17 | transforms.Compose([ 18 | transforms.GaussianBlur(5, sigma=(5, 5)) 19 | ]), 20 | transforms.Compose([ 21 | transforms.GaussianBlur(5, sigma=(2, 2)) 22 | ]), 23 | ] 24 | 
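# Note on the blur transforms above: get_random_bg() below picks one of them at random to
# soften its procedural backgrounds (blurred noise, a blurred checkerboard, or a flat colour).
# A minimal usage sketch, assuming a 224x224 CLIP-sized render (the resolution is an
# illustrative assumption, not fixed by this file):
#
#   bg = get_random_bg(224, 224)   # torch.Tensor of shape (1, 224, 224, 3), values in [0, 1]
#
# CameraBatch.__getitem__ further down returns this tensor (squeezed to HxWx3) as 'bkgs'
# whenever background augmentation (aug_bkg) is enabled.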
25 | def get_random_bg(h, w): 26 | 27 | p = torch.rand(1) 28 | 29 | if p > 0.66666: 30 | background = blurs[random.randint(0, 3)]( torch.rand((1, 3, h, w)) ).permute(0, 2, 3, 1) 31 | elif p > 0.333333: 32 | size = random.randint(5, 10) 33 | background = torch.vstack([ 34 | torch.full( (1, size, size), torch.rand(1).item() / 2), 35 | torch.full( (1, size, size), torch.rand(1).item() / 2 ), 36 | torch.full( (1, size, size), torch.rand(1).item() / 2 ), 37 | ]).unsqueeze(0) 38 | 39 | second = torch.rand(3) 40 | 41 | background[:, 0, ::2, ::2] = second[0] 42 | background[:, 1, ::2, ::2] = second[1] 43 | background[:, 2, ::2, ::2] = second[2] 44 | 45 | background[:, 0, 1::2, 1::2] = second[0] 46 | background[:, 1, 1::2, 1::2] = second[1] 47 | background[:, 2, 1::2, 1::2] = second[2] 48 | 49 | background = blurs[random.randint(0, 3)]( resize(background, out_shape=(h, w)) ) 50 | 51 | background = background.permute(0, 2, 3, 1) 52 | 53 | else: 54 | background = torch.vstack([ 55 | torch.full( (1, h, w), torch.rand(1).item()), 56 | torch.full( (1, h, w), torch.rand(1).item()), 57 | torch.full( (1, h, w), torch.rand(1).item()), 58 | ]).unsqueeze(0).permute(0, 2, 3, 1) 59 | 60 | return background 61 | 62 | def cosine_sample(N : np.ndarray) -> np.ndarray: 63 | """ 64 | #---------------------------------------------------------------------------- 65 | # Cosine sample around a vector N 66 | #---------------------------------------------------------------------------- 67 | 68 | Copied from nvdiffmodelling 69 | 70 | """ 71 | # construct local frame 72 | N = N/np.linalg.norm(N) 73 | 74 | dx0 = np.array([0, N[2], -N[1]]) 75 | dx1 = np.array([-N[2], 0, N[0]]) 76 | 77 | dx = dx0 if np.dot(dx0,dx0) > np.dot(dx1,dx1) else dx1 78 | dx = dx/np.linalg.norm(dx) 79 | dy = np.cross(N,dx) 80 | dy = dy/np.linalg.norm(dy) 81 | 82 | # cosine sampling in local frame 83 | phi = 2.0*np.pi*np.random.uniform() 84 | s = np.random.uniform() 85 | costheta = np.sqrt(s) 86 | sintheta = np.sqrt(1.0 - s) 87 | 88 | # cartesian vector in local space 89 | x = np.cos(phi)*sintheta 90 | y = np.sin(phi)*sintheta 91 | z = costheta 92 | 93 | # local to world 94 | return dx*x + dy*y + N*z 95 | 96 | def persp_proj(fov_x=45, ar=1, near=1.0, far=50.0): 97 | """ 98 | From https://github.com/rgl-epfl/large-steps-pytorch by @bathal1 (Baptiste Nicolet) 99 | 100 | Build a perspective projection matrix. 101 | Parameters 102 | ---------- 103 | fov_x : float 104 | Horizontal field of view (in degrees). 105 | ar : float 106 | Aspect ratio (w/h). 107 | near : float 108 | Depth of the near plane relative to the camera. 109 | far : float 110 | Depth of the far plane relative to the camera. 
111 | """ 112 | fov_rad = np.deg2rad(fov_x) 113 | 114 | tanhalffov = np.tan( (fov_rad / 2) ) 115 | max_y = tanhalffov * near 116 | min_y = -max_y 117 | max_x = max_y * ar 118 | min_x = -max_x 119 | 120 | z_sign = -1.0 121 | proj_mat = np.array([[0, 0, 0, 0], 122 | [0, 0, 0, 0], 123 | [0, 0, 0, 0], 124 | [0, 0, 0, 0]]) 125 | 126 | proj_mat[0, 0] = 2.0 * near / (max_x - min_x) 127 | proj_mat[1, 1] = 2.0 * near / (max_y - min_y) 128 | proj_mat[0, 2] = (max_x + min_x) / (max_x - min_x) 129 | proj_mat[1, 2] = (max_y + min_y) / (max_y - min_y) 130 | proj_mat[3, 2] = z_sign 131 | 132 | proj_mat[2, 2] = z_sign * far / (far - near) 133 | proj_mat[2, 3] = -(far * near) / (far - near) 134 | 135 | return proj_mat 136 | 137 | def get_camera_params(elev_angle, azim_angle, distance, resolution, fov=60, look_at=[0, 0, 0], up=[0, -1, 0]): 138 | 139 | elev = np.radians( elev_angle ) 140 | azim = np.radians( azim_angle ) 141 | 142 | # Generate random view 143 | cam_z = distance * np.cos(elev) * np.sin(azim) 144 | cam_y = distance * np.sin(elev) 145 | cam_x = distance * np.cos(elev) * np.cos(azim) 146 | 147 | modl = glm.mat4() 148 | view = glm.lookAt( 149 | glm.vec3(cam_x, cam_y, cam_z), 150 | glm.vec3(look_at[0], look_at[1], look_at[2]), 151 | glm.vec3(up[0], up[1], up[2]), 152 | ) 153 | 154 | a_mv = view * modl 155 | a_mv = np.array(a_mv.to_list()).T 156 | proj_mtx = persp_proj(fov) 157 | 158 | a_mvp = np.matmul(proj_mtx, a_mv).astype(np.float32)[None, ...] 159 | 160 | a_lightpos = np.linalg.inv(a_mv)[None, :3, 3] 161 | a_campos = a_lightpos 162 | 163 | return { 164 | 'mvp' : a_mvp, 165 | 'lightpos' : a_lightpos, 166 | 'campos' : a_campos, 167 | 'resolution' : [resolution, resolution], 168 | } 169 | 170 | # Returns a batch of camera parameters 171 | class CameraBatch(torch.utils.data.Dataset): 172 | def __init__( 173 | self, 174 | image_resolution, 175 | distances, 176 | azimuths, 177 | elevation_params, 178 | fovs, 179 | aug_loc, 180 | aug_light, 181 | aug_bkg, 182 | bs, 183 | look_at=[0, 0, 0], up=[0, -1, 0] 184 | ): 185 | 186 | self.res = image_resolution 187 | 188 | self.dist_min = distances[0] 189 | self.dist_max = distances[1] 190 | 191 | self.azim_min = azimuths[0] 192 | self.azim_max = azimuths[1] 193 | 194 | self.fov_min = fovs[0] 195 | self.fov_max = fovs[1] 196 | 197 | self.elev_alpha = elevation_params[0] 198 | self.elev_beta = elevation_params[1] 199 | self.elev_max = elevation_params[2] 200 | 201 | self.aug_loc = aug_loc 202 | self.aug_light = aug_light 203 | self.aug_bkg = aug_bkg 204 | 205 | self.look_at = look_at 206 | self.up = up 207 | 208 | self.batch_size = bs 209 | 210 | def __len__(self): 211 | return self.batch_size 212 | 213 | def __getitem__(self, index): 214 | 215 | elev = np.radians( np.random.beta( self.elev_alpha, self.elev_beta ) * self.elev_max ) 216 | azim = np.radians( np.random.uniform( self.azim_min, self.azim_max+1.0 ) ) 217 | dist = np.random.uniform( self.dist_min, self.dist_max ) 218 | fov = np.random.uniform( self.fov_min, self.fov_max ) 219 | 220 | proj_mtx = persp_proj(fov) 221 | 222 | # Generate random view 223 | cam_z = dist * np.cos(elev) * np.sin(azim) 224 | cam_y = dist * np.sin(elev) 225 | cam_x = dist * np.cos(elev) * np.cos(azim) 226 | 227 | if self.aug_loc: 228 | 229 | # Random offset 230 | limit = self.dist_min // 2 231 | rand_x = np.random.uniform( -limit, limit ) 232 | rand_y = np.random.uniform( -limit, limit ) 233 | 234 | modl = glm.translate(glm.mat4(), glm.vec3(rand_x, rand_y, 0)) 235 | 236 | else: 237 | 238 | modl = glm.mat4() 239 | 240 | view = 
glm.lookAt( 241 | glm.vec3(cam_x, cam_y, cam_z), 242 | glm.vec3(self.look_at[0], self.look_at[1], self.look_at[2]), 243 | glm.vec3(self.up[0], self.up[1], self.up[2]), 244 | ) 245 | 246 | r_mv = view * modl 247 | r_mv = np.array(r_mv.to_list()).T 248 | 249 | mvp = np.matmul(proj_mtx, r_mv).astype(np.float32) 250 | campos = np.linalg.inv(r_mv)[:3, 3] 251 | 252 | if self.aug_light: 253 | lightpos = cosine_sample(campos)*dist 254 | else: 255 | lightpos = campos*dist 256 | 257 | if self.aug_bkg: 258 | bkgs = get_random_bg(self.res, self.res).squeeze(0) 259 | else: 260 | bkgs = torch.ones(self.res, self.res, 3) 261 | 262 | return { 263 | 'mvp': torch.from_numpy( mvp ).float(), 264 | 'lightpos': torch.from_numpy( lightpos ).float(), 265 | 'campos': torch.from_numpy( campos ).float(), 266 | 'bkgs': bkgs 267 | } -------------------------------------------------------------------------------- /utils/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various helper functions 3 | 4 | create_scene -> combines multiple nvdiffmodeling meshes in to a single mesh with mega texture 5 | """ 6 | import sys 7 | import torch 8 | 9 | from math import ceil 10 | 11 | sys.path.append("../nvdiffmodeling") 12 | 13 | import nvdiffmodeling.src.mesh as mesh 14 | import nvdiffmodeling.src.texture as texture 15 | 16 | cosine_sim = torch.nn.CosineSimilarity() 17 | 18 | def cosine_sum(features, targets): 19 | return -cosine_sim(features, targets).sum() 20 | 21 | def cosine_avg(features, targets): 22 | return -cosine_sim(features, targets).mean() 23 | 24 | def _merge_attr_idx(a, b, a_idx, b_idx, scale_a=1.0, scale_b=1.0, add_a=0.0, add_b=0.0): 25 | if a is None and b is None: 26 | return None, None 27 | elif a is not None and b is None: 28 | return (a*scale_a)+add_a, a_idx 29 | elif a is None and b is not None: 30 | return (b*scale_b)+add_b, b_idx 31 | else: 32 | return torch.cat(((a*scale_a)+add_a, (b*scale_b)+add_b), dim=0), torch.cat((a_idx, b_idx + a.shape[0]), dim=0) 33 | 34 | def create_scene(meshes, sz=1024): 35 | 36 | # Need to comment and fix code 37 | 38 | scene = mesh.Mesh() 39 | 40 | tot = len(meshes) if len(meshes) % 2 == 0 else len(meshes)+1 41 | 42 | nx = 2 43 | ny = ceil(tot / 2) if ceil(tot / 2) % 2 == 0 else ceil(tot / 2) + 1 44 | 45 | w = int(sz*ny) 46 | h = int(sz*nx) 47 | 48 | dev = meshes[0].v_tex.device 49 | 50 | kd_atlas = torch.ones ( (1, w, h, 4) ).to(dev) 51 | ks_atlas = torch.zeros( (1, w, h, 3) ).to(dev) 52 | kn_atlas = torch.ones ( (1, w, h, 3) ).to(dev) 53 | 54 | for i, m in enumerate(meshes): 55 | v_pos, t_pos_idx = _merge_attr_idx(scene.v_pos, m.v_pos, scene.t_pos_idx, m.t_pos_idx) 56 | v_nrm, t_nrm_idx = _merge_attr_idx(scene.v_nrm, m.v_nrm, scene.t_nrm_idx, m.t_nrm_idx) 57 | v_tng, t_tng_idx = _merge_attr_idx(scene.v_tng, m.v_tng, scene.t_tng_idx, m.t_tng_idx) 58 | 59 | pos_x = i % nx 60 | pos_y = int(i / ny) 61 | 62 | sc_x = 1./nx 63 | sc_y = 1./ny 64 | 65 | v_tex, t_tex_idx = _merge_attr_idx( 66 | scene.v_tex, 67 | m.v_tex, 68 | scene.t_tex_idx, 69 | m.t_tex_idx, 70 | scale_a=1., 71 | scale_b=torch.tensor([sc_x, sc_y]).to(dev), 72 | add_a=0., 73 | add_b=torch.tensor([sc_x*pos_x, sc_y*pos_y]).to(dev) 74 | ) 75 | 76 | kd_atlas[:, pos_y*sz:(pos_y*sz)+sz, pos_x*sz:(pos_x*sz)+sz, :m.material['kd'].data.shape[-1]] = m.material['kd'].data 77 | ks_atlas[:, pos_y*sz:(pos_y*sz)+sz, pos_x*sz:(pos_x*sz)+sz, :m.material['ks'].data.shape[-1]] = m.material['ks'].data 78 | kn_atlas[:, pos_y*sz:(pos_y*sz)+sz, pos_x*sz:(pos_x*sz)+sz, 
:m.material['normal'].data.shape[-1]] = m.material['normal'].data 79 | 80 | scene = mesh.Mesh( 81 | v_pos=v_pos, 82 | t_pos_idx=t_pos_idx, 83 | v_nrm=v_nrm, 84 | t_nrm_idx=t_nrm_idx, 85 | v_tng=v_tng, 86 | t_tng_idx=t_tng_idx, 87 | v_tex=v_tex, 88 | t_tex_idx=t_tex_idx, 89 | base=scene 90 | ) 91 | 92 | scene = mesh.Mesh( 93 | material={ 94 | 'bsdf': 'diffuse', 95 | 'kd': texture.Texture2D( 96 | kd_atlas 97 | ), 98 | 'ks': texture.Texture2D( 99 | ks_atlas 100 | ), 101 | 'normal': texture.Texture2D( 102 | kn_atlas 103 | ), 104 | }, 105 | base=scene # gets uvs etc from here 106 | ) 107 | 108 | return scene -------------------------------------------------------------------------------- /utils/limit_subdivide.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loop Limit Subdvide helper class 3 | """ 4 | import torch 5 | import loop_limitation 6 | 7 | class limitation_evaluate(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input, loop_obj): 10 | limitation = loop_obj.compute_limitation(input) 11 | jacobian = loop_obj.get_J() 12 | ctx.in1 = jacobian 13 | return limitation 14 | 15 | @staticmethod 16 | def backward(ctx, grad_output): 17 | grad = ctx.in1.T 18 | out = torch.matmul(grad,grad_output) 19 | return out, None 20 | 21 | 22 | class LimitSubdivide(): 23 | def __init__(self, vertices, faces) -> None: 24 | self.loop_limit = loop_limitation.loop_limitation() 25 | self.loop_limit.init_J(vertices.to('cpu').double(), faces.to('cpu').int()) 26 | self.compute_limit = limitation_evaluate.apply 27 | 28 | def get_limit(self, vertices): 29 | new_verts = self.compute_limit(vertices.to('cpu').double(), self.loop_limit) 30 | return new_verts -------------------------------------------------------------------------------- /utils/resize_right.py: -------------------------------------------------------------------------------- 1 | """ 2 | Resize_Right from Assaf Shocher 3 | https://github.com/assafshocher/ResizeRight 4 | """ 5 | from math import pi 6 | 7 | try: 8 | import torch 9 | except ImportError: 10 | torch = None 11 | 12 | try: 13 | import numpy 14 | except ImportError: 15 | numpy = None 16 | 17 | if numpy is None and torch is None: 18 | raise ImportError("Must have either Numpy or PyTorch but both not found") 19 | 20 | 21 | def set_framework_dependencies(x): 22 | if type(x) is numpy.ndarray: 23 | to_dtype = lambda a: a 24 | fw = numpy 25 | else: 26 | to_dtype = lambda a: a.to(x.dtype) 27 | fw = torch 28 | eps = fw.finfo(fw.float32).eps 29 | return fw, to_dtype, eps 30 | 31 | 32 | def support_sz(sz): 33 | def wrapper(f): 34 | f.support_sz = sz 35 | return f 36 | return wrapper 37 | 38 | 39 | @support_sz(4) 40 | def cubic(x): 41 | fw, to_dtype, eps = set_framework_dependencies(x) 42 | absx = fw.abs(x) 43 | absx2 = absx ** 2 44 | absx3 = absx ** 3 45 | return ((1.5 * absx3 - 2.5 * absx2 + 1.) * to_dtype(absx <= 1.) + 46 | (-0.5 * absx3 + 2.5 * absx2 - 4. * absx + 2.) * 47 | to_dtype((1. 
< absx) & (absx <= 2.))) 48 | 49 | 50 | @support_sz(4) 51 | def lanczos2(x): 52 | fw, to_dtype, eps = set_framework_dependencies(x) 53 | return (((fw.sin(pi * x) * fw.sin(pi * x / 2) + eps) / 54 | ((pi**2 * x**2 / 2) + eps)) * to_dtype(abs(x) < 2)) 55 | 56 | 57 | @support_sz(6) 58 | def lanczos3(x): 59 | fw, to_dtype, eps = set_framework_dependencies(x) 60 | return (((fw.sin(pi * x) * fw.sin(pi * x / 3) + eps) / 61 | ((pi**2 * x**2 / 3) + eps)) * to_dtype(abs(x) < 3)) 62 | 63 | 64 | @support_sz(2) 65 | def linear(x): 66 | fw, to_dtype, eps = set_framework_dependencies(x) 67 | return ((x + 1) * to_dtype((-1 <= x) & (x < 0)) + (1 - x) * 68 | to_dtype((0 <= x) & (x <= 1))) 69 | 70 | 71 | @support_sz(1) 72 | def box(x): 73 | fw, to_dtype, eps = set_framework_dependencies(x) 74 | return to_dtype((-1 <= x) & (x < 0)) + to_dtype((0 <= x) & (x <= 1)) 75 | 76 | from typing import Tuple 77 | import warnings 78 | from math import ceil 79 | from fractions import Fraction 80 | 81 | 82 | class NoneClass: 83 | pass 84 | 85 | 86 | try: 87 | import torch 88 | from torch import nn 89 | nnModuleWrapped = nn.Module 90 | except ImportError: 91 | warnings.warn('No PyTorch found, will work only with Numpy') 92 | torch = None 93 | nnModuleWrapped = NoneClass 94 | 95 | try: 96 | import numpy 97 | except ImportError: 98 | warnings.warn('No Numpy found, will work only with PyTorch') 99 | numpy = None 100 | 101 | 102 | if numpy is None and torch is None: 103 | raise ImportError("Must have either Numpy or PyTorch but both not found") 104 | 105 | 106 | def resize(input, scale_factors=None, out_shape=None, 107 | interp_method=cubic, support_sz=None, 108 | antialiasing=True, by_convs=False, scale_tolerance=None, 109 | max_numerator=10, pad_mode='constant'): 110 | # get properties of the input tensor 111 | in_shape, n_dims = input.shape, input.ndim 112 | 113 | # fw stands for framework that can be either numpy or torch, 114 | # determined by the input type 115 | fw = numpy if type(input) is numpy.ndarray else torch 116 | eps = fw.finfo(fw.float32).eps 117 | device = input.device if fw is torch else None 118 | 119 | # set missing scale factors or output shapem one according to another, 120 | # scream if both missing. this is also where all the defults policies 121 | # take place. also handling the by_convs attribute carefully. 122 | scale_factors, out_shape, by_convs = set_scale_and_out_sz(in_shape, 123 | out_shape, 124 | scale_factors, 125 | by_convs, 126 | scale_tolerance, 127 | max_numerator, 128 | eps, fw) 129 | 130 | # sort indices of dimensions according to scale of each dimension. 131 | # since we are going dim by dim this is efficient 132 | sorted_filtered_dims_and_scales = [(dim, scale_factors[dim], by_convs[dim], 133 | in_shape[dim], out_shape[dim]) 134 | for dim in sorted(range(n_dims), 135 | key=lambda ind: scale_factors[ind]) 136 | if scale_factors[dim] != 1.] 
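    # Worked example (illustrative, not from the original library): resizing a torch tensor
    # of shape (1, 3, 512, 512) with out_shape=(224, 224) expands to
    #     scale_factors = [1, 1, 224/512, 224/512]   and   by_convs = [False] * 4,
    # so the comprehension above keeps only the two spatial dimensions:
    #     [(2, 0.4375, False, 512, 224), (3, 0.4375, False, 512, 224)]
    # and the loop below then resizes the height dimension first, followed by the width.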
137 | 138 | # unless support size is specified by the user, it is an attribute 139 | # of the interpolation method 140 | if support_sz is None: 141 | support_sz = interp_method.support_sz 142 | 143 | # output begins identical to input and changes with each iteration 144 | output = input 145 | 146 | # iterate over dims 147 | for (dim, scale_factor, dim_by_convs, in_sz, out_sz 148 | ) in sorted_filtered_dims_and_scales: 149 | # STEP 1- PROJECTED GRID: The non-integer locations of the projection 150 | # of output pixel locations to the input tensor 151 | projected_grid = get_projected_grid(in_sz, out_sz, 152 | scale_factor, fw, dim_by_convs, 153 | device) 154 | 155 | # STEP 1.5: ANTIALIASING- If antialiasing is taking place, we modify 156 | # the window size and the interpolation method (see inside function) 157 | cur_interp_method, cur_support_sz = apply_antialiasing_if_needed( 158 | interp_method, 159 | support_sz, 160 | scale_factor, 161 | antialiasing) 162 | 163 | # STEP 2- FIELDS OF VIEW: for each output pixels, map the input pixels 164 | # that influence it. Also calculate needed padding and update grid 165 | # accoedingly 166 | field_of_view = get_field_of_view(projected_grid, cur_support_sz, fw, 167 | eps, device) 168 | 169 | # STEP 2.5- CALCULATE PAD AND UPDATE: according to the field of view, 170 | # the input should be padded to handle the boundaries, coordinates 171 | # should be updated. actual padding only occurs when weights are 172 | # aplied (step 4). if using by_convs for this dim, then we need to 173 | # calc right and left boundaries for each filter instead. 174 | pad_sz, projected_grid, field_of_view = calc_pad_sz(in_sz, out_sz, 175 | field_of_view, 176 | projected_grid, 177 | scale_factor, 178 | dim_by_convs, fw, 179 | device) 180 | 181 | # STEP 3- CALCULATE WEIGHTS: Match a set of weights to the pixels in 182 | # the field of view for each output pixel 183 | weights = get_weights(cur_interp_method, projected_grid, field_of_view) 184 | 185 | # STEP 4- APPLY WEIGHTS: Each output pixel is calculated by multiplying 186 | # its set of weights with the pixel values in its field of view. 187 | # We now multiply the fields of view with their matching weights. 188 | # We do this by tensor multiplication and broadcasting. 189 | # if by_convs is true for this dim, then we do this action by 190 | # convolutions. this is equivalent but faster. 191 | if not dim_by_convs: 192 | output = apply_weights(output, field_of_view, weights, dim, n_dims, 193 | pad_sz, pad_mode, fw) 194 | else: 195 | output = apply_convs(output, scale_factor, in_sz, out_sz, weights, 196 | dim, pad_sz, pad_mode, fw) 197 | return output 198 | 199 | 200 | def get_projected_grid(in_sz, out_sz, scale_factor, fw, by_convs, device=None): 201 | # we start by having the ouput coordinates which are just integer locations 202 | # in the special case when usin by_convs, we only need two cycles of grid 203 | # points. the first and last. 204 | grid_sz = out_sz if not by_convs else scale_factor.numerator 205 | out_coordinates = fw_arange(grid_sz, fw, device) 206 | 207 | # This is projecting the ouput pixel locations in 1d to the input tensor, 208 | # as non-integer locations. 209 | # the following fomrula is derived in the paper 210 | # "From Discrete to Continuous Convolutions" by Shocher et al. 
211 | return (out_coordinates / float(scale_factor) + 212 | (in_sz - 1) / 2 - (out_sz - 1) / (2 * float(scale_factor))) 213 | 214 | 215 | def get_field_of_view(projected_grid, cur_support_sz, fw, eps, device): 216 | # for each output pixel, map which input pixels influence it, in 1d. 217 | # we start by calculating the leftmost neighbor, using half of the window 218 | # size (eps is for when boundary is exact int) 219 | left_boundaries = fw_ceil(projected_grid - cur_support_sz / 2 - eps, fw) 220 | 221 | # then we simply take all the pixel centers in the field by counting 222 | # window size pixels from the left boundary 223 | ordinal_numbers = fw_arange(ceil(cur_support_sz - eps), fw, device) 224 | return left_boundaries[:, None] + ordinal_numbers 225 | 226 | 227 | def calc_pad_sz(in_sz, out_sz, field_of_view, projected_grid, scale_factor, 228 | dim_by_convs, fw, device): 229 | if not dim_by_convs: 230 | # determine padding according to neighbor coords out of bound. 231 | # this is a generalized notion of padding, when pad<0 it means crop 232 | pad_sz = [-field_of_view[0, 0].item(), 233 | field_of_view[-1, -1].item() - in_sz + 1] 234 | 235 | # since input image will be changed by padding, coordinates of both 236 | # field_of_view and projected_grid need to be updated 237 | field_of_view += pad_sz[0] 238 | projected_grid += pad_sz[0] 239 | 240 | else: 241 | # only used for by_convs, to calc the boundaries of each filter the 242 | # number of distinct convolutions is the numerator of the scale factor 243 | num_convs, stride = scale_factor.numerator, scale_factor.denominator 244 | 245 | # calculate left and right boundaries for each conv. left can also be 246 | # negative right can be bigger than in_sz. such cases imply padding if 247 | # needed. however if# both are in-bounds, it means we need to crop, 248 | # practically apply the conv only on part of the image. 249 | left_pads = -field_of_view[:, 0] 250 | 251 | # next calc is tricky, explanation by rows: 252 | # 1) counting output pixels between the first position of each filter 253 | # to the right boundary of the input 254 | # 2) dividing it by number of filters to count how many 'jumps' 255 | # each filter does 256 | # 3) multiplying by the stride gives us the distance over the input 257 | # coords done by all these jumps for each filter 258 | # 4) to this distance we add the right boundary of the filter when 259 | # placed in its leftmost position. so now we get the right boundary 260 | # of that filter in input coord. 261 | # 5) the padding size needed is obtained by subtracting the rightmost 262 | # input coordinate. if the result is positive padding is needed. if 263 | # negative then negative padding means shaving off pixel columns. 264 | right_pads = (((out_sz - fw_arange(num_convs, fw, device) - 1) # (1) 265 | // num_convs) # (2) 266 | * stride # (3) 267 | + field_of_view[:, -1] # (4) 268 | - in_sz + 1) # (5) 269 | 270 | # in the by_convs case pad_sz is a list of left-right pairs. 
one per 271 | # each filter 272 | 273 | pad_sz = list(zip(left_pads, right_pads)) 274 | 275 | return pad_sz, projected_grid, field_of_view 276 | 277 | 278 | def get_weights(interp_method, projected_grid, field_of_view): 279 | # the set of weights per each output pixels is the result of the chosen 280 | # interpolation method applied to the distances between projected grid 281 | # locations and the pixel-centers in the field of view (distances are 282 | # directed, can be positive or negative) 283 | weights = interp_method(projected_grid[:, None] - field_of_view) 284 | 285 | # we now carefully normalize the weights to sum to 1 per each output pixel 286 | sum_weights = weights.sum(1, keepdims=True) 287 | sum_weights[sum_weights == 0] = 1 288 | return weights / sum_weights 289 | 290 | 291 | def apply_weights(input, field_of_view, weights, dim, n_dims, pad_sz, pad_mode, 292 | fw): 293 | # for this operation we assume the resized dim is the first one. 294 | # so we transpose and will transpose back after multiplying 295 | tmp_input = fw_swapaxes(input, dim, 0, fw) 296 | 297 | # apply padding 298 | tmp_input = fw_pad(tmp_input, fw, pad_sz, pad_mode) 299 | 300 | # field_of_view is a tensor of order 2: for each output (1d location 301 | # along cur dim)- a list of 1d neighbors locations. 302 | # note that this whole operations is applied to each dim separately, 303 | # this is why it is all in 1d. 304 | # neighbors = tmp_input[field_of_view] is a tensor of order image_dims+1: 305 | # for each output pixel (this time indicated in all dims), these are the 306 | # values of the neighbors in the 1d field of view. note that we only 307 | # consider neighbors along the current dim, but such set exists for every 308 | # multi-dim location, hence the final tensor order is image_dims+1. 309 | neighbors = tmp_input[field_of_view] 310 | 311 | # weights is an order 2 tensor: for each output location along 1d- a list 312 | # of weights matching the field of view. we augment it with ones, for 313 | # broadcasting, so that when multiplies some tensor the weights affect 314 | # only its first dim. 315 | tmp_weights = fw.reshape(weights, (*weights.shape, * [1] * (n_dims - 1))) 316 | 317 | # now we simply multiply the weights with the neighbors, and then sum 318 | # along the field of view, to get a single value per out pixel 319 | tmp_output = (neighbors * tmp_weights).sum(1) 320 | 321 | # we transpose back the resized dim to its original position 322 | return fw_swapaxes(tmp_output, 0, dim, fw) 323 | 324 | 325 | def apply_convs(input, scale_factor, in_sz, out_sz, weights, dim, pad_sz, 326 | pad_mode, fw): 327 | # for this operations we assume the resized dim is the last one. 328 | # so we transpose and will transpose back after multiplying 329 | input = fw_swapaxes(input, dim, -1, fw) 330 | 331 | # the stride for all convs is the denominator of the scale factor 332 | stride, num_convs = scale_factor.denominator, scale_factor.numerator 333 | 334 | # prepare an empty tensor for the output 335 | tmp_out_shape = list(input.shape) 336 | tmp_out_shape[-1] = out_sz 337 | tmp_output = fw_empty(tuple(tmp_out_shape), fw, input.device) 338 | 339 | # iterate over the conv operations. we have as many as the numerator 340 | # of the scale-factor. for each we need boundaries and a filter. 
341 | for conv_ind, (pad_sz, filt) in enumerate(zip(pad_sz, weights)): 342 | # apply padding (we pad last dim, padding can be negative) 343 | pad_dim = input.ndim - 1 344 | tmp_input = fw_pad(input, fw, pad_sz, pad_mode, dim=pad_dim) 345 | 346 | # apply convolution over last dim. store in the output tensor with 347 | # positional strides so that when the loop is comlete conv results are 348 | # interwind 349 | tmp_output[..., conv_ind::num_convs] = fw_conv(tmp_input, filt, stride) 350 | 351 | return fw_swapaxes(tmp_output, -1, dim, fw) 352 | 353 | 354 | def set_scale_and_out_sz(in_shape, out_shape, scale_factors, by_convs, 355 | scale_tolerance, max_numerator, eps, fw): 356 | # eventually we must have both scale-factors and out-sizes for all in/out 357 | # dims. however, we support many possible partial arguments 358 | if scale_factors is None and out_shape is None: 359 | raise ValueError("either scale_factors or out_shape should be " 360 | "provided") 361 | if out_shape is not None: 362 | # if out_shape has less dims than in_shape, we defaultly resize the 363 | # first dims for numpy and last dims for torch 364 | out_shape = (list(out_shape) + list(in_shape[len(out_shape):]) 365 | if fw is numpy 366 | else list(in_shape[:-len(out_shape)]) + list(out_shape)) 367 | if scale_factors is None: 368 | # if no scale given, we calculate it as the out to in ratio 369 | # (not recomended) 370 | scale_factors = [out_sz / in_sz for out_sz, in_sz 371 | in zip(out_shape, in_shape)] 372 | if scale_factors is not None: 373 | # by default, if a single number is given as scale, we assume resizing 374 | # two dims (most common are images with 2 spatial dims) 375 | scale_factors = (scale_factors 376 | if isinstance(scale_factors, (list, tuple)) 377 | else [scale_factors, scale_factors]) 378 | # if less scale_factors than in_shape dims, we defaultly resize the 379 | # first dims for numpy and last dims for torch 380 | scale_factors = (list(scale_factors) + [1] * 381 | (len(in_shape) - len(scale_factors)) if fw is numpy 382 | else [1] * (len(in_shape) - len(scale_factors)) + 383 | list(scale_factors)) 384 | if out_shape is None: 385 | # when no out_shape given, it is calculated by multiplying the 386 | # scale by the in_shape (not recomended) 387 | out_shape = [ceil(scale_factor * in_sz) 388 | for scale_factor, in_sz in 389 | zip(scale_factors, in_shape)] 390 | # next part intentionally after out_shape determined for stability 391 | # we fix by_convs to be a list of truth values in case it is not 392 | if not isinstance(by_convs, (list, tuple)): 393 | by_convs = [by_convs] * len(out_shape) 394 | 395 | # next loop fixes the scale for each dim to be either frac or float. 396 | # this is determined by by_convs and by tolerance for scale accuracy. 397 | for ind, (sf, dim_by_convs) in enumerate(zip(scale_factors, by_convs)): 398 | # first we fractionaize 399 | if dim_by_convs: 400 | frac = Fraction(1/sf).limit_denominator(max_numerator) 401 | frac = Fraction(numerator=frac.denominator, denominator=frac.numerator) 402 | 403 | # if accuracy is within tolerance scale will be frac. 
if not, then 404 | # it will be float and the by_convs attr will be set false for 405 | # this dim 406 | if scale_tolerance is None: 407 | scale_tolerance = eps 408 | if dim_by_convs and abs(frac - sf) < scale_tolerance: 409 | scale_factors[ind] = frac 410 | else: 411 | scale_factors[ind] = float(sf) 412 | by_convs[ind] = False 413 | 414 | return scale_factors, out_shape, by_convs 415 | 416 | 417 | def apply_antialiasing_if_needed(interp_method, support_sz, scale_factor, 418 | antialiasing): 419 | # antialiasing is "stretching" the field of view according to the scale 420 | # factor (only for downscaling). this is low-pass filtering. this 421 | # requires modifying both the interpolation (stretching the 1d 422 | # function and multiplying by the scale-factor) and the window size. 423 | scale_factor = float(scale_factor) 424 | if scale_factor >= 1.0 or not antialiasing: 425 | return interp_method, support_sz 426 | cur_interp_method = (lambda arg: scale_factor * 427 | interp_method(scale_factor * arg)) 428 | cur_support_sz = support_sz / scale_factor 429 | return cur_interp_method, cur_support_sz 430 | 431 | 432 | def fw_ceil(x, fw): 433 | if fw is numpy: 434 | return fw.int_(fw.ceil(x)) 435 | else: 436 | return x.ceil().long() 437 | 438 | 439 | def fw_floor(x, fw): 440 | if fw is numpy: 441 | return fw.int_(fw.floor(x)) 442 | else: 443 | return x.floor().long() 444 | 445 | 446 | def fw_cat(x, fw): 447 | if fw is numpy: 448 | return fw.concatenate(x) 449 | else: 450 | return fw.cat(x) 451 | 452 | 453 | def fw_swapaxes(x, ax_1, ax_2, fw): 454 | if fw is numpy: 455 | return fw.swapaxes(x, ax_1, ax_2) 456 | else: 457 | return x.transpose(ax_1, ax_2) 458 | 459 | 460 | def fw_pad(x, fw, pad_sz, pad_mode, dim=0): 461 | if pad_sz == (0, 0): 462 | return x 463 | if fw is numpy: 464 | pad_vec = [(0, 0)] * x.ndim 465 | pad_vec[dim] = pad_sz 466 | return fw.pad(x, pad_width=pad_vec, mode=pad_mode) 467 | else: 468 | if x.ndim < 3: 469 | x = x[None, None, ...] 470 | 471 | pad_vec = [0] * ((x.ndim - 2) * 2) 472 | pad_vec[0:2] = pad_sz 473 | return fw.nn.functional.pad(x.transpose(dim, -1), pad=pad_vec, 474 | mode=pad_mode).transpose(dim, -1) 475 | 476 | 477 | def fw_conv(input, filter, stride): 478 | # we want to apply 1d conv to any nd array. the way to do it is to reshape 479 | # the input to a 4D tensor. first two dims are singeletons, 3rd dim stores 480 | # all the spatial dims that we are not convolving along now. then we can 481 | # apply conv2d with a 1xK filter. This convolves the same way all the other 482 | # dims stored in the 3d dim. like depthwise conv over these. 
483 | # TODO: numpy support 484 | reshaped_input = input.reshape(1, 1, -1, input.shape[-1]) 485 | reshaped_output = torch.nn.functional.conv2d(reshaped_input, 486 | filter.view(1, 1, 1, -1), 487 | stride=(1, stride)) 488 | return reshaped_output.reshape(*input.shape[:-1], -1) 489 | 490 | 491 | def fw_arange(upper_bound, fw, device): 492 | if fw is numpy: 493 | return fw.arange(upper_bound) 494 | else: 495 | return fw.arange(upper_bound, device=device) 496 | 497 | 498 | def fw_empty(shape, fw, device): 499 | if fw is numpy: 500 | return fw.empty(shape) 501 | else: 502 | return fw.empty(size=(*shape,), device=device) -------------------------------------------------------------------------------- /utils/video.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper class to create and add images to video 3 | """ 4 | import imageio 5 | import numpy as np 6 | 7 | class Video(): 8 | def __init__(self, path, name='video_log.mp4', mode='I', fps=30, codec='libx264', bitrate='16M') -> None: 9 | 10 | if path[-1] != "/": 11 | path += "/" 12 | 13 | self.writer = imageio.get_writer(path+name, mode=mode, fps=fps, codec=codec, bitrate=bitrate) 14 | 15 | def ready_image(self, image, write_video=True): 16 | # assuming channels last - as renderer returns it 17 | if len(image.shape) == 4: 18 | image = image.squeeze(0)[..., :3].detach().cpu().numpy() 19 | else: 20 | image = image[..., :3].detach().cpu().numpy() 21 | 22 | image = np.clip(np.rint(image*255.0), 0, 255).astype(np.uint8) 23 | 24 | if write_video: 25 | self.writer.append_data(image) 26 | 27 | return image 28 | 29 | def close(self): 30 | self.writer.close() --------------------------------------------------------------------------------
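Taken together, the utilities above handle camera sampling (utils/camera.py), scene assembly (utils/helpers.py), differentiable image resizing (utils/resize_right.py) and video logging (utils/video.py), all of which are driven by loop.py. The following is a minimal, hypothetical sketch of how the camera and video helpers can be exercised on their own, assuming it is run from the repository root; the resolutions, camera ranges and the commented-out render() call are illustrative assumptions and not part of the repository.

import torch

from utils.camera import CameraBatch, get_camera_params
from utils.video import Video

# Randomised training cameras; the ranges mirror the fov/dist/elev/azim options parsed in
# main.py (the concrete values here are illustrative only).
cams = CameraBatch(
    image_resolution=224,
    distances=[2.5, 3.5],
    azimuths=[0.0, 360.0],
    elevation_params=[1.0, 5.0, 30.0],
    fovs=[30.0, 60.0],
    aug_loc=True,
    aug_light=True,
    aug_bkg=True,
    bs=4,
)
batch = next(iter(torch.utils.data.DataLoader(cams, batch_size=4)))
# batch['mvp'] has shape (4, 4, 4); batch['bkgs'] has shape (4, 224, 224, 3)

# Fixed logging view, analogous to the log_elev/log_fov/log_dist/log_res options in main.py
log_cam = get_camera_params(elev_angle=30.0, azim_angle=45.0, distance=3.0, resolution=512)

video = Video(".", name="video_log.mp4", fps=30)
frame = torch.ones(1, 512, 512, 3)   # stand-in for a rendered image in (B, H, W, C) layout
# frame = render(scene, log_cam)     # hypothetical: the real rendering happens inside loop.py
video.ready_image(frame)
video.close()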