├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── assets ├── FAQ.md ├── cow.gif ├── header.jpeg ├── multi.gif ├── single.gif └── smpl.gif ├── configs ├── multi.yml ├── paper.yml ├── single.yml └── style.yml ├── loop.py ├── main.py ├── primitives ├── plane.mtl ├── plane.obj ├── sphere.mtl ├── sphere.obj ├── spot.mtl └── spot.obj ├── requirements.txt └── utils ├── camera.py ├── helpers.py ├── limit_subdivide.py ├── resize_right.py └── video.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | .vscode 3 | *.pth 4 | *.pt 5 | output/* 6 | *.sh 7 | 8 | # Others 9 | .DS_Store 10 | .vscode 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | share/python-wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | 115 | # pdm 116 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 117 | #pdm.lock 118 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 119 | # in version control. 120 | # https://pdm.fming.dev/#use-with-ide 121 | .pdm.toml 122 | 123 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 171 | #.idea/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "loop_limitation"] 2 | path = loop_limitation 3 | url = https://github.com/tianhaoxie/loop_limitation 4 | [submodule "DALLE2-pytorch"] 5 | path = DALLE2-pytorch 6 | url = https://github.com/lucidrains/DALLE2-pytorch.git 7 | [submodule "nvdiffmodeling"] 8 | path = nvdiffmodeling 9 | url = https://github.com/NVlabs/nvdiffmodeling 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Nasir Mohammad Khalid 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## CLIP-Mesh 2 | 3 | Official implementation of [**CLIP-Mesh: Generating textured meshes from text using pretrained image-text models**](https://www.nasir.lol/clipmesh)
4 | 5 | [Nasir Mohammad Khalid](https://www.nasir.lol/), 6 | [Tianhao Xie](https://www.linkedin.com/in/tianhao-xie-440b20186/), 7 | [Eugene Belilovsky](http://eugenium.github.io/), 8 | [Tiberiu Popa](https://users.encs.concordia.ca/~stpopa/index.html)
9 | _[SIGGRAPH ASIA 2022]() | [arXiv](https://arxiv.org/abs/2203.13333) | [Project page](https://www.nasir.lol/clipmesh)_ 10 | 11 | ![CLIP-Mesh header image](./assets/header.jpeg) 12 | 13 | ## Quickstart 14 | 15 | Notebooks are currently unavailable - working on restoring them: 16 | 17 | ||| 18 | |:-----------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:| 19 | | Text to Mesh| Multi Mesh Generation| 20 | 21 | ||| 22 | |:-----------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:| 23 | | Stylizing a Mesh| Apply CLIP-Mesh to Human Models| 24 | 25 | ## Setup 26 | 27 | Clone this repository recursively to get all submodules - use submodule update to get downstream submodules 28 | 29 | ``` 30 | git clone --recurse-submodules https://github.com/NasirKhalid24/CLIP-Mesh.git 31 | cd CLIP-Mesh 32 | git submodule update --init --recursive 33 | ``` 34 | 35 | Set up the Conda environment and install packages 36 | 37 | ``` 38 | conda create -n clip-mesh python=3.7 39 | conda activate clip-mesh 40 | conda install pytorch==1.11.0 torchvision==0.12.0 cudatoolkit=10.2 -c pytorch 41 | 42 | pip install -r requirements.txt 43 | ``` 44 | 45 | Install the loop subdivision code and DALLE-2 (note that DALLE-2 is pinned to an earlier commit, so an existing install may not work) 46 | 47 | ``` 48 | 49 | cd loop_limitation 50 | pip install . 51 | cd .. 52 | 53 | 54 | cd DALLE2-pytorch 55 | pip install . 56 | cd .. 57 | 58 | 59 | mkdir weights 60 | wget https://huggingface.co/spaces/NasirKhalid24/Dalle2-Diffusion-Prior/resolve/main/larger-model.pth -O ./weights/model.pth 61 | ``` 62 | 63 | ## Usage 64 | 65 | This repo comes with some configs that are passed to ```main.py``` using the ```--config``` flag 66 | 67 | Any of the config parameters can be overridden by passing them as arguments to ```main.py```, so you can have a base .yml file with all your parameters and just update the text prompt to generate something new 68 | 69 | An example would be using the given config file for single mesh generation, ```single.yml``` 70 | 71 | ``` 72 | # Use all parameters in file 73 | python main.py --config configs/single.yml 74 | 75 | # Use all parameters in file but change text prompt 76 | python main.py --config configs/single.yml --text_prompt "a hamburger" 77 | 78 | # Use all parameters in file but change text prompt, batch, texture resolution 79 | python main.py \ 80 | --config configs/single.yml \ 81 | --text_prompt "a hamburger" \ 82 | --batch_size 5 \ 83 | --texture_resolution 1024 84 | ```
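If you prefer to drive the optimization from Python rather than the command line, the sketch below mirrors what ```main.py``` does: load a base config with ```yaml.safe_load```, override a few keys, and hand the dict to ```loop```. It skips the argument parsing, config validation and seeding that ```main.py``` performs, so treat it as an illustrative starting point rather than a supported entry point - the ```run``` helper and the override values are made up for this example, and any key from the files under ```configs/``` can be passed the same way.

```python
# Hypothetical helper (not part of the repo): drive the optimization loop directly.
import yaml

from loop import loop


def run(config_path, **overrides):
    with open(config_path, "r") as stream:
        cfg = yaml.safe_load(stream)   # base parameters from the YAML config
    cfg.update(overrides)              # e.g. a new text prompt or batch size
    return loop(cfg)                   # loop() returns the output directory path


if __name__ == "__main__":
    run("configs/single.yml", text_prompt="a hamburger", batch_size=5)
```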
85 | 86 | ## Tips, Tricks, FAQs etc 87 | 88 | I recommend checking out the [following document](./assets/FAQ.md) as it could answer any doubts that come up (will be updated regularly) - if you still have questions reach out [@Nymarius_](https://twitter.com/Nymarius_) or open an issue -------------------------------------------------------------------------------- /assets/FAQ.md: -------------------------------------------------------------------------------- 1 | # Tips, tricks and FAQ 2 | 3 | ## Do I need a powerful GPU? 4 | 5 | Not really - this code was written to work on typical gaming GPUs. If you are having memory issues, try reducing the following parameters (listed in order of memory consumption): 6 | 7 | - train_res 8 | - batch_size 9 | - texture_resolution 10 | 11 | If you are having issues with the first step of the program, where it cannot load and use the diffusion prior, try removing the prior altogether by setting the ```prior_path``` in your config file to an empty value while also reducing the parameters above 12 | 13 | Note that doing all this may reduce the quality of results 14 | 15 | ## Prompt Engineering 16 | 17 | Prompt engineering was not explored at all, so there's not much I can share here - if you do find prompts that improve results please submit a pull request and add them here 18 | 19 | ## The texture is quite noisy 20 | 21 | Try increasing ```train_res``` or set the range of ```dist_min``` and ```dist_max``` to a lower value - additionally you could also reduce ```texture_resolution``` and increase the blur parameters 22 | 23 | ## The generated shape is flat on some sides 24 | 25 | Try increasing the ```batch_size```, increase the range of ```dist_min``` and ```dist_max``` and ensure that all ```aug_``` parameters are set to true 26 | 27 | ## I added a custom mesh to generate with and the program just crashes 28 | 29 | This could be due to three reasons that I know of: 30 | 31 | - The mesh is non-manifold, in which case the limit subdivision does not work - try to remesh the shape 32 | - There is a vertex in the mesh whose valence is outside the range of [3, 50] - hence the limit subdivision does not work 33 | - There is a huge number of vertices - the spheres used in the paper have about 600 vertices 34 | 35 | ## How can I set up a custom scene to generate in 36 | 37 | I recommend setting up your scene in Blender and importing ```primitives/sphere.obj``` into the scene - repositioning and resizing it as required. Then save the newly positioned/sized sphere as a new .obj file and save the rest of your scene meshes as .obj files (ensure the textures are baked and UV unwrapped) 38 | 39 | To generate with this scene your config file would have all the parameters as is (you may need to change camera params) and then towards the end your mesh parameters would look as follows 40 | 41 | ```yaml 42 | .... 43 | 44 | 45 | # Mesh Parameters 46 | 47 | ## Add meshes to the scene here 48 | meshes: 49 | - path/to/exported/sphere.obj 50 | - path/to/exported/scene_mesh_1.obj 51 | - path/to/exported/scene_mesh_2.obj 52 | .... 53 | 54 | ## Unit scale the meshes? No need as it was done in Blender 55 | unit: 56 | - false 57 | - false 58 | - false 59 | .... 60 | 61 | ## Sphere is first and it will be optimized, the rest are just constant scene objects 62 | train_mesh_idx: 63 | - [verts, texture, normal, true] 64 | - [] 65 | - [] 66 | .... 67 | 68 | ## No scaling as it was done in Blender 69 | scales: 70 | - 1.0 71 | - 1.0 72 | - 1.0 73 | .... 74 | 75 | ## No positioning as it was done in Blender 76 | offsets: 77 | - [0.0, 0.0, 0.0] 78 | - [0.0, 0.0, 0.0] 79 | - [0.0, 0.0, 0.0] 80 |
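## (Illustrative addition, not from the original FAQ) The "may need to change camera
## params" note above usually means widening the camera ranges defined earlier in the
## same config so the whole Blender scene stays in frame - these keys exist in every
## provided config and the exact values depend on your scene, e.g.:
# dist_min: 6.0
# dist_max: 14.0
# fov_min: 30.0
# fov_max: 60.0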
81 | 82 | ``` 83 | 84 | You could retexture a mesh in your scene by just setting its corresponding ```train_mesh_idx``` to ```[texture, normal, true]``` 85 | 86 | ## I cannot generate ____ / the results are not as good as ____ 87 | 88 | Text to 3D is much newer than Text to Image generation and therefore the results are not up to par - additionally, while most text to image models rely on a huge set of Image-Text data, there is no 3D-Text data available at the same scale 89 | 90 | Complex 3D objects with massive variation are tough to generate as CLIP embeddings are limited in the information they hold - it is recommended to generate a single object at a time; even something like a car is tough to generate, but a tyre is not 91 | 92 | You could also try using other text to 3D generation/stylization techniques such as [Dreamfields](https://ajayj.com/dreamfields), [CLIPForge](https://github.com/AutodeskAILab/Clip-Forge), [Text2Mesh](https://github.com/threedle/text2mesh) -------------------------------------------------------------------------------- /assets/cow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/cow.gif -------------------------------------------------------------------------------- /assets/header.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/header.jpeg -------------------------------------------------------------------------------- /assets/multi.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/multi.gif -------------------------------------------------------------------------------- /assets/single.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/single.gif -------------------------------------------------------------------------------- /assets/smpl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NasirKhalid24/CLIP-Mesh/e588c12d3c24ce21d9d3452f1e5b83c892be92f4/assets/smpl.gif -------------------------------------------------------------------------------- /configs/multi.yml: -------------------------------------------------------------------------------- 1 | # Config to generate multi mesh results fast 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: A cactus on the sand in the desert # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior
(code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 2000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 15 # How many images of shape are rendered at one epoch 34 | train_res: 356 # Resolution of render before scaling to 224x224 35 | resize_method: cubic # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 512 # Resolution of texture maps (ex: 512x512) 38 | channels: 4 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.90 # Initial alpha channel value if channels == 4 40 | kernel_size: 7 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 5.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 12.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 90.0 # Maximum elevation angle 60 | azim_min: 0.0 # Minimum azimuth angle 61 | azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 10.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/sphere.obj 79 | - primitives/plane.obj 80 | 81 | ## Unit scale the meshes? 82 | unit: 83 | - true 84 | - false 85 | 86 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 
87 | train_mesh_idx: 88 | - [verts, texture, normal, true] 89 | - [texture, normal, true] 90 | 91 | ## Scale mesh size by some value 92 | scales: 93 | - 0.5 94 | - 3.0 95 | 96 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 97 | offsets: 98 | - [0.0, 0.0, 0.0] 99 | - [0.0, -0.5, 0.0] -------------------------------------------------------------------------------- /configs/paper.yml: -------------------------------------------------------------------------------- 1 | # This config was used for figures in the paper 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: Pyramid of Giza # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior (code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 5000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 25 # How many images of shape are rendered at one epoch 34 | train_res: 512 # Resolution of render before scaling to 224x224 35 | resize_method: lanczos2 # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 512 # Resolution of texture maps (ex: 512x512) 38 | channels: 4 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.85 # Initial alpha channel value if channels == 4 40 | kernel_size: 5 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 5.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 12.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 90.0 # Maximum elevation 
angle 60 | azim_min: 0.0 # Minimum azimuth angle 61 | azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 10.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/sphere.obj 79 | 80 | ## Unit scale the meshes? 81 | unit: 82 | - true 83 | 84 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 85 | train_mesh_idx: 86 | - [verts, texture, normal, true] 87 | 88 | ## Scale mesh size by some value 89 | scales: 90 | - 1.0 91 | 92 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 93 | offsets: 94 | - [0.0, 0.0, 0.0] -------------------------------------------------------------------------------- /configs/single.yml: -------------------------------------------------------------------------------- 1 | # Config to generate single meshes faster than paper config 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: A wooden brown table # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior (code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 2000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 25 # How many images of shape are rendered at one epoch 34 | train_res: 356 # Resolution of render before scaling to 224x224 35 | resize_method: cubic # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 512 # Resolution of texture maps (ex: 512x512) 38 | channels: 4 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.85 # Initial alpha channel value if channels == 4 40 | kernel_size: 7 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 
0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 5.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 8.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 60.0 # Maximum elevation angle 60 | azim_min: -360.0 # Minimum azimuth angle 61 | azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 8.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/sphere.obj 79 | 80 | ## Unit scale the meshes? 81 | unit: 82 | - true 83 | 84 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 
85 | train_mesh_idx: 86 | - [verts, texture, normal, true] 87 | 88 | ## Scale mesh size by some value 89 | scales: 90 | - 1.0 91 | 92 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 93 | offsets: 94 | - [0.0, 0.0, 0.0] -------------------------------------------------------------------------------- /configs/style.yml: -------------------------------------------------------------------------------- 1 | # Config to generate single meshes faster than paper config 2 | 3 | # Basic 4 | output_path: "./output/" # Where to save outputs 5 | gpu: '0' # Which GPU to use 6 | seed: 99 # Seed for reproducibility 7 | 8 | # CLIP Related 9 | text_prompt: A brown and white cow # CLIP text prompt 10 | clip_model: ViT-B/32 # Which CLIP Model to use see available_models on OPENAI Clip repo 11 | 12 | # Text-Image Prior Related 13 | prior_path: weights/model.pth # Path to weights for the prior network, not used if prior_path empty 14 | # prior_path: # Leave empty like this to use only text prompt 15 | 16 | ## Parameters for diffusion prior network (code by lucidrains) 17 | diffusion_prior_network_dim: 512 18 | diffusion_prior_network_depth: 12 19 | diffusion_prior_network_dim_head: 64 20 | diffusion_prior_network_heads: 12 21 | diffusion_prior_network_normformer: false 22 | 23 | ## Parameters for diffusion prior (code by lucidrains) 24 | diffusion_prior_embed_dim: 512 25 | diffusion_prior_timesteps: 1000 26 | diffusion_prior_cond_drop_prob: 0.1 27 | diffusion_prior_loss_type: l2 28 | diffusion_prior_condition_on_text_encodings: false 29 | 30 | # Parameters 31 | epochs: 2000 # Number of optimization steps 32 | lr: 0.01 # Maximum learning rate 33 | batch_size: 25 # How many images of shape are rendered at one epoch 34 | train_res: 356 # Resolution of render before scaling to 224x224 35 | resize_method: cubic # Method for resizing from cubic, linear, lanczos2, lanczos3 36 | bsdf: diffuse # diffuse or pbr (diffuse recommended) 37 | texture_resolution: 1024 # Resolution of texture maps (ex: 512x512) 38 | channels: 3 # Texture map image channels (4 for alpha, 3 for RGB only) 39 | init_c: 0.95 # Initial alpha channel value if channels == 4 40 | kernel_size: 7 # Kernel size for gaussian blurring of textures to reduce artifacts 41 | blur_sigma: 3 # Variance of gaussian kernel for blurring of textures 42 | shape_imgs_frac: 0.5 # What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done 43 | aug_light: true # Augment the direction of light around the camera 44 | aug_bkg: true # Augment the background 45 | diff_loss_weight: 0.33 # Weight of Diffusion prior loss 46 | clip_weight: 1.0 # Weight of CLIP Text loss 47 | laplacian_weight: 30.0 # Initial uniform laplacian weight 48 | laplacian_min: 0.6 # Minimum uniform laplacian weight (set to 2% of max usually) 49 | layers: 2 # Number of layers to peel back for transparency 50 | 51 | # Camera Parameters 52 | fov_min: 30.0 # Minimum camera field of view angle during renders 53 | fov_max: 90.0 # Maximum camera field of view angle during renders 54 | dist_min: 3.0 # Minimum distance of camera from mesh during renders 55 | dist_max: 5.0 # Maximum distance of camera from mesh during renders 56 | light_power: 5.0 # Light intensity 57 | elev_alpha: 1.0 # Alpha parameter for Beta distribution for elevation sampling 58 | elev_beta: 5.0 # Beta parameter for Beta distribution for elevation sampling 59 | elev_max: 90.0 # Maximum elevation angle 60 | azim_min: -360.0 # Minimum azimuth angle 61 
| azim_max: 360.0 # Maximum azimuth angle 62 | aug_loc: true # Offset mesh from center of image? 63 | 64 | # Logging Parameters 65 | log_interval: 5 # Interval for logging 66 | log_interval_im: 250 # Image logging interval 67 | log_elev: 30.0 # Logging elevation angle 68 | log_fov: 60.0 # Logging field of view 69 | log_dist: 8.0 # Logging distance from object 70 | log_res: 512 # Logging render resolution 71 | log_light_power: 3.0 # Light intensity for logging 72 | colab: false # Print logging image (only for Google Colab) 73 | 74 | # Mesh Parameters 75 | 76 | ## Add meshes to the scene here 77 | meshes: 78 | - primitives/spot.obj 79 | 80 | ## Unit scale the meshes? 81 | unit: 82 | - true 83 | 84 | ## What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true for random texture; false for using defined texture) ? 85 | train_mesh_idx: 86 | - [texture, normal, true] 87 | 88 | ## Scale mesh size by some value 89 | scales: 90 | - 1.0 91 | 92 | ## After scaling (x, y, z) offset vertices (note that axis are +x, -y, +z) 93 | offsets: 94 | - [0.0, 0.0, 0.0] -------------------------------------------------------------------------------- /loop.py: -------------------------------------------------------------------------------- 1 | # Main optimization loop, takes in dictionary config 2 | # and performs optimization as highlighted in paper 3 | 4 | import os 5 | import clip 6 | import yaml 7 | import torch 8 | import kornia 9 | import torchvision 10 | 11 | import numpy as np 12 | import nvdiffrast.torch as dr 13 | import matplotlib.pyplot as plt 14 | 15 | from tqdm import tqdm 16 | from datetime import datetime 17 | from dalle2_pytorch import DiffusionPrior, DiffusionPriorNetwork 18 | 19 | from PIL import Image 20 | from utils.video import Video 21 | from utils.limit_subdivide import LimitSubdivide 22 | from utils.helpers import cosine_avg, create_scene 23 | from utils.camera import CameraBatch, get_camera_params 24 | from utils.resize_right import resize, cubic, linear, lanczos2, lanczos3 25 | 26 | from nvdiffmodeling.src import obj 27 | from nvdiffmodeling.src import util 28 | from nvdiffmodeling.src import mesh 29 | from nvdiffmodeling.src import render 30 | from nvdiffmodeling.src import texture 31 | from nvdiffmodeling.src import regularizer 32 | 33 | def loop(cfg): 34 | 35 | # Set unique output path 36 | now = datetime.now() 37 | cfg["path"] = os.path.join( 38 | cfg["output_path"], 39 | now.strftime("%m-%d-%Y_%H-%M-%S") + cfg["text_prompt"] 40 | ) 41 | 42 | cfg['path'] = cfg['path'].replace(" ", "_") 43 | os.makedirs(cfg['path']) 44 | 45 | with open(os.path.join(cfg["path"], "config.yml"), 'w') as outfile: 46 | yaml.dump(cfg, outfile, default_flow_style=False) 47 | 48 | print("Result directory '%s' created" % cfg["path"]) 49 | 50 | # Get CUDA device 51 | device = torch.device("cuda:" + cfg["gpu"]) 52 | torch.cuda.set_device(device) 53 | 54 | # Initialize CLIP model 55 | model, _ = clip.load(cfg["clip_model"], device=device) 56 | 57 | clip_mean = torch.tensor([0.48154660, 0.45782750, 0.40821073], device=device) 58 | clip_std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=device) 59 | 60 | # Initialize Video 61 | video = Video(cfg["path"]) 62 | 63 | # Intialize GL Context 64 | glctx = dr.RasterizeGLContext() 65 | 66 | # Get text embedding 67 | print("Text is %s" % cfg["text_prompt"]) 68 | 69 | texts_embeds = clip.tokenize([cfg["text_prompt"]]).to(device) 70 | with torch.no_grad(): 71 | texts_embeds = model.encode_text(texts_embeds).detach() 72 | 
texts_embeds = texts_embeds / texts_embeds.norm(dim=1, keepdim=True) 73 | 74 | # Setup Prior model & get image prior (text embed -> image embed) 75 | if cfg["prior_path"] is not None: 76 | 77 | state_dict = torch.load(cfg["prior_path"], map_location=device)["model"] 78 | 79 | prior_network = DiffusionPriorNetwork( 80 | dim=cfg["diffusion_prior_network_dim"], 81 | depth=cfg["diffusion_prior_network_depth"], 82 | dim_head=cfg["diffusion_prior_network_dim_head"], 83 | heads=cfg["diffusion_prior_network_heads"], 84 | normformer=cfg["diffusion_prior_network_normformer"] 85 | ).to(device) 86 | 87 | diffusion_prior = DiffusionPrior( 88 | net=prior_network, 89 | clip=None, 90 | image_embed_dim=cfg["diffusion_prior_embed_dim"], 91 | timesteps=cfg["diffusion_prior_timesteps"], 92 | cond_drop_prob=cfg["diffusion_prior_cond_drop_prob"], 93 | loss_type=cfg["diffusion_prior_loss_type"], 94 | condition_on_text_encodings=cfg["diffusion_prior_condition_on_text_encodings"] 95 | ).to(device) 96 | 97 | diffusion_prior.load_state_dict(state_dict, strict=True) 98 | 99 | text_cond = dict(text_embed = texts_embeds) 100 | prior_embeds = diffusion_prior.p_sample_loop((1, 512), text_cond = text_cond) 101 | 102 | prior_embeds = prior_embeds.detach().clone().to(device) 103 | 104 | del prior_network, diffusion_prior, state_dict 105 | torch.cuda.empty_cache() 106 | 107 | # Load all meshes and setup training parameters 108 | meshes = [] # store Mesh objects 109 | subdiv = [] # store per mesh limit subdivison 110 | train_params = [] # store all trainable paramters 111 | vert_train = False 112 | 113 | for idx, m in enumerate(cfg["meshes"]): # Loop over each mesh path 114 | 115 | load_mesh = obj.load_obj(m) 116 | 117 | if cfg["unit"][idx]: # If mesh is to be unit sized 118 | load_mesh = mesh.unit_size(load_mesh) 119 | 120 | # Scale vertices by factors provided and then offset by offsets provided 121 | v_pos = torch.tensor(cfg["scales"][idx]).to(load_mesh.v_pos.device) * load_mesh.v_pos.clone().detach() 122 | v_pos = torch.tensor(cfg["offsets"][idx]).to(v_pos.device) + v_pos.clone().detach() 123 | 124 | # Final mesh after all adjustments 125 | load_mesh = mesh.Mesh(v_pos, base=load_mesh) 126 | 127 | # If true is in train_mesh_idx[mesh_idx] then we initialize 128 | # all textures else we start with textures already on mesh 129 | if True in cfg["train_mesh_idx"][idx]: 130 | 131 | # vertices 132 | vertices = load_mesh.v_pos.clone().detach().requires_grad_(True) 133 | 134 | # faces 135 | faces = load_mesh.t_pos_idx.clone().detach() 136 | 137 | # texture map 138 | texture_map = texture.create_trainable(np.random.uniform(size=[cfg["texture_resolution"]]*2 + [cfg["channels"]], low=0.0, high=1.0), [cfg["texture_resolution"]]*2, True) 139 | 140 | # normal map 141 | normal_map = texture.create_trainable(np.array([0, 0, 1]), [cfg["texture_resolution"]]*2, True) 142 | 143 | # specular map 144 | specular_map = texture.create_trainable(np.array([0, 0, 0]), [cfg["texture_resolution"]]*2, True) 145 | 146 | else: 147 | 148 | # vertices 149 | vertices = load_mesh.v_pos.clone().detach().requires_grad_(True) 150 | 151 | # faces 152 | faces = load_mesh.t_pos_idx.clone().detach() 153 | 154 | # get existing texture and specular maps 155 | kd_ = load_mesh.material['kd'].data.permute(0, 3, 1, 2) 156 | ks_ = load_mesh.material['ks'].data.permute(0, 3, 1, 2) 157 | 158 | # if there is a normal map load it or initial a plain one 159 | try: 160 | nrml_ = load_mesh.material['normal'].data.permute(0, 3, 1, 2) 161 | except: 162 | nrml_ = torch.zeros( (1, 3, 
cfg["texture_resolution"], cfg["texture_resolution"]) ).to(device) 163 | nrml_[:, 2, :, :] = 1.0 164 | 165 | # convert all texture maps to trainable tensors 166 | texture_map = texture.create_trainable( resize(kd_, out_shape=(cfg["texture_resolution"], cfg["texture_resolution"])).permute(0, 2, 3, 1), [cfg["texture_resolution"]]*2, True) 167 | specular_map = texture.create_trainable( resize(ks_, out_shape=(cfg["texture_resolution"], cfg["texture_resolution"])).permute(0, 2, 3, 1), [cfg["texture_resolution"]]*2, True) 168 | normal_map = texture.create_trainable( resize(nrml_, out_shape=(cfg["texture_resolution"], cfg["texture_resolution"])).permute(0, 2, 3, 1), [cfg["texture_resolution"]]*2, True) 169 | 170 | # Training parameters 171 | if "verts" in cfg["train_mesh_idx"][idx]: 172 | train_params += [vertices] 173 | vert_train = True 174 | if "texture" in cfg["train_mesh_idx"][idx]: 175 | train_params += texture_map.getMips() 176 | if "normal" in cfg["train_mesh_idx"][idx]: 177 | train_params += normal_map.getMips() 178 | if "specular" in cfg["train_mesh_idx"][idx]: 179 | train_params += specular_map.getMips() 180 | 181 | # Create final mesh with all textures 182 | load_mesh = mesh.Mesh( 183 | vertices, 184 | faces, 185 | material={ 186 | 'bsdf': cfg['bsdf'], 187 | 'kd': texture_map, 188 | 'ks': specular_map, 189 | 'normal': normal_map, 190 | }, 191 | base=load_mesh # Get UVs from original loaded mesh 192 | ) 193 | meshes.append( load_mesh ) 194 | 195 | # Create limit subdivision class for mesh 196 | if "verts" in cfg["train_mesh_idx"][idx]: 197 | subdiv.append( LimitSubdivide( 198 | load_mesh.v_pos.clone().detach(), 199 | load_mesh.t_pos_idx.clone().detach(), 200 | ) ) 201 | else: 202 | subdiv.append( None ) 203 | 204 | # Optimizer and Scheduler 205 | optimizer = torch.optim.Adam(train_params, lr=cfg["lr"]) 206 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: max(0.0, 10**(-x*0.0002))) 207 | 208 | # Dataset to get random camera parameters 209 | cams_data = CameraBatch( 210 | cfg["train_res"], 211 | [cfg["dist_min"], cfg["dist_max"]], 212 | [cfg["azim_min"], cfg["azim_max"]], 213 | [cfg["elev_alpha"], cfg["elev_beta"], cfg["elev_max"]], 214 | [cfg["fov_min"], cfg["fov_max"]], 215 | cfg["aug_loc"], 216 | cfg["aug_light"], 217 | cfg["aug_bkg"], 218 | cfg["batch_size"] 219 | ) 220 | 221 | cams = torch.utils.data.DataLoader( 222 | cams_data, 223 | cfg["batch_size"], 224 | num_workers=0, 225 | pin_memory=True 226 | ) 227 | 228 | # Optimization Loop 229 | rot_ang = 0.0 230 | t_loop = tqdm(range(cfg["epochs"]), leave=False) 231 | 232 | for it in t_loop: 233 | 234 | render_meshes = [] # store meshes with texture that will be rendered 235 | render_meshes_notex = [] # store meshes without texture that will be rendered 236 | 237 | lapl_funcs = [] # store laplacian for each mesh 238 | 239 | # For each mesh initialized 240 | for i, m in enumerate(meshes): 241 | 242 | # Limit subdivide vertices if needed 243 | if subdiv[i] != None: 244 | 245 | n_vert = subdiv[i].get_limit( 246 | m.v_pos.to('cpu').double() 247 | ).to(device) 248 | 249 | else: 250 | 251 | n_vert = m.v_pos 252 | 253 | # Low pass filter for textures 254 | ready_texture = texture.Texture2D( 255 | kornia.filters.gaussian_blur2d( 256 | m.material['kd'].data.permute(0, 3, 1, 2), 257 | kernel_size=(cfg["kernel_size"], cfg["kernel_size"]), 258 | sigma=(cfg["blur_sigma"], cfg["blur_sigma"]), 259 | ).permute(0, 2, 3, 1).contiguous() 260 | ) 261 | 262 | ready_specular = texture.Texture2D( 263 | 
kornia.filters.gaussian_blur2d( 264 | m.material['ks'].data.permute(0, 3, 1, 2), 265 | kernel_size=(cfg["kernel_size"], cfg["kernel_size"]), 266 | sigma=(cfg["blur_sigma"], cfg["blur_sigma"]), 267 | ).permute(0, 2, 3, 1).contiguous() 268 | ) 269 | 270 | ready_normal = texture.Texture2D( 271 | kornia.filters.gaussian_blur2d( 272 | m.material['normal'].data.permute(0, 3, 1, 2), 273 | kernel_size=(cfg["kernel_size"], cfg["kernel_size"]), 274 | sigma=(cfg["blur_sigma"], cfg["blur_sigma"]), 275 | ).permute(0, 2, 3, 1).contiguous() 276 | ) 277 | 278 | # Final mesh with vertices and textures 279 | load_mesh = mesh.Mesh( 280 | n_vert, 281 | m.t_pos_idx, 282 | material={ 283 | 'bsdf': cfg['bsdf'], 284 | 'kd': ready_texture, 285 | 'ks': ready_specular, 286 | 'normal': ready_normal, 287 | }, 288 | base=m # gets uvs etc from here 289 | ) 290 | 291 | if it < cfg["epochs"] * cfg["shape_imgs_frac"] and vert_train: 292 | 293 | # Initialize the no texture mesh 294 | kd_notex = torch.full_like( ready_texture.data, 0.5) 295 | 296 | if kd_notex.shape[-1] == 4: 297 | kd_notex[:, :, :, 3] = 1.0 298 | 299 | load_mesh_notex = mesh.Mesh( 300 | n_vert, 301 | m.t_pos_idx, 302 | material={ 303 | 'bsdf': cfg['bsdf'], 304 | 'kd': kd_notex, 305 | 'ks': ready_specular, 306 | 'normal': ready_normal, 307 | }, 308 | base=m # gets uvs etc from here 309 | ) 310 | 311 | render_meshes_notex.append(load_mesh_notex.eval()) 312 | 313 | 314 | render_meshes.append(load_mesh.eval()) 315 | 316 | if subdiv[i] != None: 317 | lapl_funcs.append(regularizer.laplace_regularizer_const(m)) 318 | else: 319 | lapl_funcs.append(None) 320 | 321 | # Create a scene with the textures and another without textures 322 | complete_scene = create_scene(render_meshes, sz=cfg["texture_resolution"]) 323 | complete_scene = mesh.auto_normals(complete_scene) 324 | complete_scene = mesh.compute_tangents(complete_scene) 325 | 326 | if it < cfg["epochs"] * cfg["shape_imgs_frac"] and vert_train: 327 | complete_scene_notex = create_scene(render_meshes_notex, sz=cfg["texture_resolution"]) 328 | complete_scene_notex = mesh.auto_normals(complete_scene_notex) 329 | complete_scene_notex = mesh.compute_tangents(complete_scene_notex) 330 | 331 | # Logging 332 | if it % cfg["log_interval"] == 0: 333 | 334 | with torch.no_grad(): 335 | 336 | params = get_camera_params( 337 | cfg["log_elev"], 338 | rot_ang, 339 | cfg["log_dist"], 340 | cfg["log_res"], 341 | cfg["log_fov"] 342 | ) 343 | 344 | rot_ang += 1 345 | 346 | log_image = render.render_mesh( 347 | glctx, 348 | complete_scene.eval(params), 349 | params['mvp'], 350 | params['campos'], 351 | params['lightpos'], 352 | cfg["log_light_power"], 353 | cfg["log_res"], 354 | num_layers=cfg["layers"], 355 | background=torch.ones(1, cfg["log_res"], cfg["log_res"], 3).to(device) 356 | ) 357 | 358 | log_image = video.ready_image(log_image) 359 | 360 | 361 | # Render scene for training 362 | params_camera = next(iter(cams)) 363 | 364 | for key in params_camera: 365 | params_camera[key] = params_camera[key].to(device) 366 | 367 | # Render with and without texture to enable shape growth 368 | if it < cfg["epochs"] * cfg["shape_imgs_frac"] and vert_train: 369 | 370 | with_tex = cfg["batch_size"] // 2 371 | 372 | with_tex_params = { 373 | 'mvp': params_camera['mvp'][:with_tex], 374 | 'lightpos': params_camera['lightpos'][:with_tex], 375 | 'campos': params_camera['campos'][:with_tex], 376 | 'resolution': [cfg["train_res"], cfg["train_res"]] 377 | } 378 | 379 | no_tex_params = { 380 | 'mvp': params_camera['mvp'][with_tex:], 381 | 
'lightpos': params_camera['lightpos'][with_tex:], 382 | 'campos': params_camera['campos'][with_tex:], 383 | 'resolution': [cfg["train_res"], cfg["train_res"]] 384 | } 385 | 386 | with_tex_train_render = render.render_mesh( 387 | glctx, 388 | complete_scene.eval(with_tex_params), 389 | with_tex_params["mvp"], 390 | with_tex_params["campos"], 391 | with_tex_params["lightpos"], 392 | cfg["light_power"], 393 | cfg["train_res"], 394 | spp=1, # no upscale here / render at any resolution then use resize_right to downscale 395 | num_layers=cfg["layers"], 396 | msaa=False, 397 | background=params_camera["bkgs"][:with_tex], 398 | ).permute(0, 3, 1, 2) # switch to B, C, H, W 399 | 400 | no_tex_train_render = render.render_mesh( 401 | glctx, 402 | complete_scene_notex.eval(no_tex_params), 403 | no_tex_params["mvp"], 404 | no_tex_params["campos"], 405 | no_tex_params["lightpos"], 406 | cfg["light_power"], 407 | cfg["train_res"], 408 | spp=1, # no upscale here / render at any resolution then use resize_right to downscale 409 | num_layers=1, 410 | msaa=False, 411 | background=params_camera["bkgs"][with_tex:], 412 | ).permute(0, 3, 1, 2) # switch to B, C, H, W 413 | 414 | train_render = torch.cat([ 415 | with_tex_train_render, 416 | no_tex_train_render 417 | ]) 418 | 419 | # Render with only textured meshes 420 | else: 421 | 422 | params = { 423 | 'mvp': params_camera['mvp'], 424 | 'lightpos': params_camera['lightpos'], 425 | 'campos': params_camera['campos'], 426 | 'resolution': [cfg["train_res"], cfg["train_res"]] 427 | } 428 | 429 | train_render = render.render_mesh( 430 | glctx, 431 | complete_scene.eval(params), 432 | params["mvp"], 433 | params["campos"], 434 | params["lightpos"], 435 | cfg["light_power"], 436 | cfg["train_res"], 437 | spp=1, # no upscale here / render at any resolution then use resize_right to downscale 438 | num_layers=cfg["layers"], 439 | msaa=False, 440 | background=params_camera["bkgs"], 441 | ).permute(0, 3, 1, 2) # switch to B, C, H, W 442 | 443 | # resize to CLIP input size: cubic, linear, lanczos2, lanczos3 444 | if cfg["resize_method"] == "cubic": 445 | 446 | train_render = resize( 447 | train_render, 448 | out_shape=(224, 224), # resize to clip 449 | interp_method=cubic 450 | ) 451 | 452 | elif cfg["resize_method"] == "linear": 453 | 454 | train_render = resize( 455 | train_render, 456 | out_shape=(224, 224), # resize to clip 457 | interp_method=linear 458 | ) 459 | 460 | elif cfg["resize_method"] == "lanczos2": 461 | 462 | train_render = resize( 463 | train_render, 464 | out_shape=(224, 224), # resize to clip 465 | interp_method=lanczos2 466 | ) 467 | elif cfg["resize_method"] == "lanczos3": 468 | 469 | train_render = resize( 470 | train_render, 471 | out_shape=(224, 224), # resize to clip 472 | interp_method=lanczos3 473 | ) 474 | 475 | # Log renders 476 | if it % cfg["log_interval_im"] == 0: 477 | 478 | s_log = train_render[torch.randint(low=0, high=cfg["batch_size"], size=(5 if cfg["batch_size"] > 5 else cfg["batch_size"], )) , :, :, :] 479 | 480 | # Source code of save_image 481 | s_log = torchvision.utils.make_grid(s_log) 482 | 483 | # Add 0.5 after unnormalizing to [0, 255] to round to nearest integer 484 | ndarr = s_log.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() 485 | im = Image.fromarray(ndarr) 486 | 487 | if cfg["colab"]: 488 | plt.figure() 489 | plt.imshow(ndarr) 490 | plt.show() 491 | 492 | im.save(os.path.join(cfg["path"], 'epoch_%d.png' % it)) 493 | 494 | # Convert image to image embeddings 495 | image_embeds = 
model.encode_image( 496 | (train_render - clip_mean[None, :, None, None]) / clip_std[None, :, None, None] 497 | ) 498 | image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True) 499 | 500 | # Get loss between text embeds and image embeds 501 | clip_loss = cosine_avg(image_embeds, texts_embeds) 502 | 503 | # Get loss between image prior embedding and image embeds 504 | if cfg["prior_path"] is not None: 505 | prior_loss = cosine_avg(image_embeds, prior_embeds) 506 | 507 | # Evaluate laplacian for each mesh in scene to be deformed 508 | lapls = [] 509 | lapls_l = 0 510 | for fn_l in lapl_funcs: 511 | if fn_l is not None: 512 | lapls.append(fn_l.eval(params)) 513 | 514 | # Laplace loss weighting 515 | if it == 0: 516 | laplacian_weight = cfg["laplacian_weight"] 517 | laplacian_min = cfg["laplacian_min"] 518 | else: 519 | laplacian_weight = (laplacian_weight - laplacian_min) * 10**(-it*0.000001) + laplacian_min 520 | 521 | for lap_l in lapls: 522 | lapls_l += (laplacian_weight * lap_l) 523 | 524 | # Get total loss and backprop 525 | if cfg["prior_path"] is not None: 526 | total_loss = (cfg["clip_weight"] * clip_loss) + (cfg["diff_loss_weight"] * prior_loss) + lapls_l 527 | else: 528 | total_loss = (cfg["clip_weight"] * clip_loss) + lapls_l 529 | 530 | optimizer.zero_grad() 531 | total_loss.backward() 532 | optimizer.step() 533 | scheduler.step() 534 | 535 | normal_map.clamp_(min=-1, max=1) 536 | specular_map.clamp_(min=0, max=1) 537 | texture_map.clamp_(min=0, max=1) 538 | 539 | t_loop.set_description("CLIP Loss = %.6f" % clip_loss.item() ) 540 | 541 | video.close() 542 | 543 | for idx, m in enumerate(render_meshes): 544 | out_path = os.path.join( cfg["path"], "meshes", "mesh_%d" % idx ) 545 | os.makedirs(out_path) 546 | 547 | obj.write_obj( 548 | out_path, 549 | m 550 | ) 551 | 552 | return cfg["path"] 553 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # This file reads a .yml config and if any command line arguments 2 | # are passed it overrides the config with them. It then validates 3 | # the config file, sets seed and sends config to the main opt 4 | # loop where all the magic happens. 
The loop returns the final 5 | # mesh file for saving to disk 6 | 7 | import os 8 | import yaml 9 | import torch 10 | import random 11 | import argparse 12 | import numpy as np 13 | 14 | from loop import loop 15 | 16 | def main(): 17 | 18 | # Command Line Arguments 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--config', help='Path to config file', type=str, required=True) 21 | 22 | # Basic 23 | parser.add_argument('--output_path', help='Where to store output files', type=str, default=argparse.SUPPRESS) 24 | parser.add_argument('--gpu', help='GPU index', type=str, default=argparse.SUPPRESS) 25 | parser.add_argument('--seed', help='Seed for reproducibility', type=int, default=argparse.SUPPRESS) 26 | 27 | # CLIP Related 28 | parser.add_argument('--text_prompt', help='Text prompt for mesh generation', type=str, default=argparse.SUPPRESS) 29 | parser.add_argument('--clip_model', help='CLIP Model size', type=str, default=argparse.SUPPRESS) 30 | 31 | # Text-Image Prior Related 32 | parser.add_argument('--prior_path', help='Path to weights for the prior network, not used if left blank', type=str, default=argparse.SUPPRESS) 33 | 34 | ## Parameters for diffusion prior network (code by lucidrains) 35 | parser.add_argument('--diffusion_prior_network_dim', help='Diffusion Prior Network - Dimension', type=int, default=argparse.SUPPRESS) 36 | parser.add_argument('--diffusion_prior_network_depth', help='Diffusion Prior Network - Depth', type=int, default=argparse.SUPPRESS) 37 | parser.add_argument('--diffusion_prior_network_dim_head', help='Diffusion Prior Network - Head Dimension', type=int, default=argparse.SUPPRESS) 38 | parser.add_argument('--diffusion_prior_network_heads', help='Diffusion Prior Network - # of Heads', type=int, default=argparse.SUPPRESS) 39 | parser.add_argument('--diffusion_prior_network_normformer', help='Diffusion Prior Network - Normformer?', type=bool, default=argparse.SUPPRESS) 40 | 41 | ## Parameters for diffusion prior (code by lucidrains) 42 | parser.add_argument('--diffusion_prior_embed_dim', help='Diffusion Prior Network - Embedding Dimension', type=int, default=argparse.SUPPRESS) 43 | parser.add_argument('--diffusion_prior_timesteps', help='Diffusion Prior Network - Timesteps', type=int, default=argparse.SUPPRESS) 44 | parser.add_argument('--diffusion_prior_cond_drop_prob', help='Diffusion Prior Network - Conditional Drop Probability', type=float, default=argparse.SUPPRESS) 45 | parser.add_argument('--diffusion_prior_loss_type', help='Diffusion Prior Network - Loss Type', type=str, default=argparse.SUPPRESS) 46 | parser.add_argument('--diffusion_prior_condition_on_text_encodings', help='Diffusion Prior Network - Condition Prior on Text Encodings?', type=bool, default=argparse.SUPPRESS) 47 | 48 | # Parameters 49 | parser.add_argument('--epochs', help='Number of optimization steps', type=int, default=argparse.SUPPRESS) 50 | parser.add_argument('--lr', help='Maximum learning rate', type=float, default=argparse.SUPPRESS) 51 | parser.add_argument('--batch_size', help='Number of images rendered at the same time', type=int, default=argparse.SUPPRESS) 52 | parser.add_argument('--train_res', help='Resolution of render before downscaling to CLIP size', type=int, default=argparse.SUPPRESS) 53 | parser.add_argument('--resize_method', help='Image downsampling/upsampling method', type=str, default=argparse.SUPPRESS, choices=["cubic", "linear", "lanczos2", "lanczos3"]) 54 | parser.add_argument('--bsdf', help='Render technique', type=str, default=argparse.SUPPRESS, 
choices=["diffuse", "pbr"]) 55 | parser.add_argument('--texture_resolution', help='Resolution of texture maps (ex: 512 -> 512x512)', type=int, default=argparse.SUPPRESS) 56 | parser.add_argument('--channels', help='Texture map image channels (4 for alpha, 3 for RGB only)', type=int, default=argparse.SUPPRESS, choices=[3, 4]) 57 | parser.add_argument('--init_c', help='Initial alpha channel value if channels == 4', type=float, default=argparse.SUPPRESS) 58 | parser.add_argument('--kernel_size', help='Kernel size for gaussian blurring of textures to reduce artifacts', type=int, default=argparse.SUPPRESS) 59 | parser.add_argument('--blur_sigma', help='Variance of gaussian kernel for blurring of textures', type=int, default=argparse.SUPPRESS) 60 | parser.add_argument('--shape_imgs_frac', help='What % of epochs should the renders include plain shape renders as well as textures - after which only textured renders are done', type=float, default=argparse.SUPPRESS) 61 | parser.add_argument('--aug_light', help='Augment the direction of light around the camera', type=bool, default=argparse.SUPPRESS) 62 | parser.add_argument('--aug_bkg', help='Augment the background', type=bool, default=argparse.SUPPRESS) 63 | parser.add_argument('--diff_loss_weight', help='Weight of Diffusion prior loss', type=float, default=argparse.SUPPRESS) 64 | parser.add_argument('--clip_weight', help='Weight of CLIP Text loss', type=float, default=argparse.SUPPRESS) 65 | parser.add_argument('--laplacian_weight', help='Initial uniform laplacian weight', type=float, default=argparse.SUPPRESS) 66 | parser.add_argument('--laplacian_min', help='Minimum uniform laplacian weight (set to 2% of max usually)', type=float, default=argparse.SUPPRESS) 67 | 68 | # Camera Parameters 69 | parser.add_argument('--fov_min', help='Minimum camera field of view angle during renders', type=float, default=argparse.SUPPRESS) 70 | parser.add_argument('--fov_max', help='Maximum camera field of view angle during renders', type=float, default=argparse.SUPPRESS) 71 | parser.add_argument('--dist_min', help='Minimum distance of camera from mesh during renders', type=float, default=argparse.SUPPRESS) 72 | parser.add_argument('--dist_max', help='Maximum distance of camera from mesh during renders', type=float, default=argparse.SUPPRESS) 73 | parser.add_argument('--light_power', help='Light intensity', type=float, default=argparse.SUPPRESS) 74 | parser.add_argument('--elev_alpha', help='Alpha parameter for Beta distribution for elevation sampling', type=float, default=argparse.SUPPRESS) 75 | parser.add_argument('--elev_beta', help='Beta parameter for Beta distribution for elevation sampling', type=float, default=argparse.SUPPRESS) 76 | parser.add_argument('--elev_max', help='Maximum elevation angle in degree', type=float, default=argparse.SUPPRESS) 77 | parser.add_argument('--azim_min', help='Minimum azimuth angle in degree', type=float, default=argparse.SUPPRESS) 78 | parser.add_argument('--azim_max', help='Maximum azimuth angle in degree', type=float, default=argparse.SUPPRESS) 79 | parser.add_argument('--aug_loc', help='Offset mesh from center of image?', type=bool, default=argparse.SUPPRESS) 80 | 81 | # Logging Parameters 82 | parser.add_argument('--log_interval', help='Interval for logging, every X epochs', type=int, default=argparse.SUPPRESS) 83 | parser.add_argument('--log_interval_im', help='Interval for logging renders image, every X epochs', type=int, default=argparse.SUPPRESS) 84 | parser.add_argument('--log_elev', help='Logging elevation angle', 
type=float, default=argparse.SUPPRESS) 85 | parser.add_argument('--log_fov', help='Logging field of view', type=float, default=argparse.SUPPRESS) 86 | parser.add_argument('--log_dist', help='Logging distance from object', type=float, default=argparse.SUPPRESS) 87 | parser.add_argument('--log_res', help='Logging render resolution', type=int, default=argparse.SUPPRESS) 88 | parser.add_argument('--log_light_power', help='Light intensity for logging', type=float, default=argparse.SUPPRESS) 89 | 90 | # Mesh Parameters 91 | parser.add_argument('--meshes', help="Path to all meshes in scene", nargs='+', default=argparse.SUPPRESS, type=str) 92 | parser.add_argument('--unit', help="Should mesh be unit scaled? True/False for each mesh in meshes", nargs='+', default=argparse.SUPPRESS, type=bool) 93 | parser.add_argument('--train_mesh_idx', help="What parameters to optimize for each mesh or none at all (vertices, texture map, normal map, true/false for limit subdivide) ?", nargs='+', action='append', default=argparse.SUPPRESS) 94 | parser.add_argument('--scales', help="Scale mesh size by some value", nargs='+', default=argparse.SUPPRESS, type=float) 95 | parser.add_argument('--offsets', help="After scaling (x, y, z) offset vertices", nargs='+', action='append', type=float, default=argparse.SUPPRESS) 96 | 97 | args = vars(parser.parse_args()) 98 | 99 | # Check if config passed - if so then parse it 100 | if args['config'] is not None: 101 | with open(args['config'], "r") as stream: 102 | try: 103 | cfg = yaml.safe_load(stream) 104 | except yaml.YAMLError as exc: 105 | print(exc) 106 | else: 107 | raise ValueError("No config passed!") 108 | 109 | # Override YAML with CL args 110 | for key in args: 111 | cfg[key] = args[key] 112 | 113 | # Config validation 114 | lists = ["meshes", "unit", "train_mesh_idx", "scales", "offsets", "prior_path"] 115 | for item in parser._actions[1:]: 116 | if item.type != type(cfg[ item.dest ]) and item.dest not in lists: 117 | raise ValueError("%s is not of type %s" % (item.dest, item.type) ) 118 | 119 | if not( len(cfg["meshes"]) == len(cfg["unit"]) == len(cfg["train_mesh_idx"]) == len(cfg["scales"]) == len(cfg["offsets"])): 120 | raise ValueError("Unit, train_mesh_idx, scales and offsets are not specified for each mesh OR there is an extra item in some list. 
Ensure all are the same length") 121 | 122 | print(yaml.dump(cfg, default_flow_style=False)) 123 | 124 | # Set seed 125 | random.seed(cfg["seed"]) 126 | os.environ['PYTHONHASHSEED'] = str(cfg["seed"]) 127 | np.random.seed(cfg["seed"]) 128 | torch.manual_seed(cfg["seed"]) 129 | torch.cuda.manual_seed(cfg["seed"]) 130 | torch.backends.cudnn.deterministic = True 131 | 132 | loop(cfg) 133 | 134 | if __name__ == '__main__': 135 | main() -------------------------------------------------------------------------------- /primitives/plane.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 1 3 | 4 | newmtl None 5 | Ns 500 6 | Ka 0.8 0.8 0.8 7 | Kd 0.8 0.8 0.8 8 | Ks 0.8 0.8 0.8 9 | d 1 10 | illum 2 11 | -------------------------------------------------------------------------------- /primitives/plane.obj: -------------------------------------------------------------------------------- 1 | # Blender v3.2.2 OBJ File: '' 2 | # www.blender.org 3 | mtllib plane.mtl 4 | o Plane 5 | v -1.000000 0.000000 1.000000 6 | v 1.000000 0.000000 1.000000 7 | v -1.000000 0.000000 -1.000000 8 | v 1.000000 0.000000 -1.000000 9 | vt 0.000000 0.000000 10 | vt 1.000000 0.000000 11 | vt 1.000000 1.000000 12 | vt 0.000000 1.000000 13 | vn 0.0000 1.0000 0.0000 14 | g Plane_Plane_None 15 | usemtl None 16 | s off 17 | f 1/1/1 2/2/1 4/3/1 3/4/1 18 | -------------------------------------------------------------------------------- /primitives/sphere.mtl: -------------------------------------------------------------------------------- 1 | # Blender MTL File: 'None' 2 | # Material Count: 1 3 | 4 | newmtl None 5 | Ns 500 6 | Ka 0.8 0.8 0.8 7 | Kd 0.8 0.8 0.8 8 | Ks 0.8 0.8 0.8 9 | d 1 10 | illum 2 11 | -------------------------------------------------------------------------------- /primitives/spot.mtl: -------------------------------------------------------------------------------- 1 | 2 | # Blender MTL File: 'None' 3 | # Material Count: 1 4 | 5 | newmtl Default_OBJ 6 | Ns 250.000000 7 | Ka 1.000000 1.000000 1.000000 8 | Kd 0.800000 0.800000 0.800000 9 | Ks 0.500000 0.500000 0.500000 10 | Ke 0.000000 0.000000 0.000000 11 | Ni 1.450000 12 | d 1.000000 13 | illum 2 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | clip @ git+https://github.com/openai/CLIP.git@b46f5ac7587d2e1862f8b7b1573179d80dcdd620 2 | imageio 3 | cython 4 | imageio-ffmpeg 5 | kornia 6 | numpy 7 | nvdiffrast @ git+https://github.com/NVlabs/nvdiffrast.git@78528e683210dfaa1be57e3c65aa37d3b36c6644 8 | Pillow 9 | PyGLM 10 | resize-right 11 | scipy 12 | smplx 13 | torch==1.11.0 14 | torchvision==0.12.0 15 | tqdm 16 | Ninja 17 | pyyaml 18 | matplotlib>=3.3.0 -------------------------------------------------------------------------------- /utils/camera.py: -------------------------------------------------------------------------------- 1 | import glm 2 | import torch 3 | import random 4 | 5 | import numpy as np 6 | import torchvision.transforms as transforms 7 | 8 | from .resize_right import resize 9 | 10 | blurs = [ 11 | transforms.Compose([ 12 | transforms.GaussianBlur(11, sigma=(5, 5)) 13 | ]), 14 | transforms.Compose([ 15 | transforms.GaussianBlur(11, sigma=(2, 2)) 16 | ]), 17 | transforms.Compose([ 18 | transforms.GaussianBlur(5, sigma=(5, 5)) 19 | ]), 20 | transforms.Compose([ 21 | transforms.GaussianBlur(5, sigma=(2, 2)) 22 | ]), 23 | ] 24 | 
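# Note on the blur transforms above: get_random_bg() below picks one of them at random to
# soften its procedural backgrounds (blurred noise, a blurred checkerboard, or a flat colour).
# A minimal usage sketch, assuming a 224x224 CLIP-sized render (the resolution is an
# illustrative assumption, not fixed by this file):
#
#   bg = get_random_bg(224, 224)   # torch.Tensor of shape (1, 224, 224, 3), values in [0, 1]
#
# CameraBatch.__getitem__ further down returns this tensor (squeezed to HxWx3) as 'bkgs'
# whenever background augmentation (aug_bkg) is enabled.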
25 | def get_random_bg(h, w): 26 | 27 | p = torch.rand(1) 28 | 29 | if p > 0.66666: 30 | background = blurs[random.randint(0, 3)]( torch.rand((1, 3, h, w)) ).permute(0, 2, 3, 1) 31 | elif p > 0.333333: 32 | size = random.randint(5, 10) 33 | background = torch.vstack([ 34 | torch.full( (1, size, size), torch.rand(1).item() / 2), 35 | torch.full( (1, size, size), torch.rand(1).item() / 2 ), 36 | torch.full( (1, size, size), torch.rand(1).item() / 2 ), 37 | ]).unsqueeze(0) 38 | 39 | second = torch.rand(3) 40 | 41 | background[:, 0, ::2, ::2] = second[0] 42 | background[:, 1, ::2, ::2] = second[1] 43 | background[:, 2, ::2, ::2] = second[2] 44 | 45 | background[:, 0, 1::2, 1::2] = second[0] 46 | background[:, 1, 1::2, 1::2] = second[1] 47 | background[:, 2, 1::2, 1::2] = second[2] 48 | 49 | background = blurs[random.randint(0, 3)]( resize(background, out_shape=(h, w)) ) 50 | 51 | background = background.permute(0, 2, 3, 1) 52 | 53 | else: 54 | background = torch.vstack([ 55 | torch.full( (1, h, w), torch.rand(1).item()), 56 | torch.full( (1, h, w), torch.rand(1).item()), 57 | torch.full( (1, h, w), torch.rand(1).item()), 58 | ]).unsqueeze(0).permute(0, 2, 3, 1) 59 | 60 | return background 61 | 62 | def cosine_sample(N : np.ndarray) -> np.ndarray: 63 | """ 64 | #---------------------------------------------------------------------------- 65 | # Cosine sample around a vector N 66 | #---------------------------------------------------------------------------- 67 | 68 | Copied from nvdiffmodelling 69 | 70 | """ 71 | # construct local frame 72 | N = N/np.linalg.norm(N) 73 | 74 | dx0 = np.array([0, N[2], -N[1]]) 75 | dx1 = np.array([-N[2], 0, N[0]]) 76 | 77 | dx = dx0 if np.dot(dx0,dx0) > np.dot(dx1,dx1) else dx1 78 | dx = dx/np.linalg.norm(dx) 79 | dy = np.cross(N,dx) 80 | dy = dy/np.linalg.norm(dy) 81 | 82 | # cosine sampling in local frame 83 | phi = 2.0*np.pi*np.random.uniform() 84 | s = np.random.uniform() 85 | costheta = np.sqrt(s) 86 | sintheta = np.sqrt(1.0 - s) 87 | 88 | # cartesian vector in local space 89 | x = np.cos(phi)*sintheta 90 | y = np.sin(phi)*sintheta 91 | z = costheta 92 | 93 | # local to world 94 | return dx*x + dy*y + N*z 95 | 96 | def persp_proj(fov_x=45, ar=1, near=1.0, far=50.0): 97 | """ 98 | From https://github.com/rgl-epfl/large-steps-pytorch by @bathal1 (Baptiste Nicolet) 99 | 100 | Build a perspective projection matrix. 101 | Parameters 102 | ---------- 103 | fov_x : float 104 | Horizontal field of view (in degrees). 105 | ar : float 106 | Aspect ratio (w/h). 107 | near : float 108 | Depth of the near plane relative to the camera. 109 | far : float 110 | Depth of the far plane relative to the camera. 
111 | """ 112 | fov_rad = np.deg2rad(fov_x) 113 | 114 | tanhalffov = np.tan( (fov_rad / 2) ) 115 | max_y = tanhalffov * near 116 | min_y = -max_y 117 | max_x = max_y * ar 118 | min_x = -max_x 119 | 120 | z_sign = -1.0 121 | proj_mat = np.array([[0, 0, 0, 0], 122 | [0, 0, 0, 0], 123 | [0, 0, 0, 0], 124 | [0, 0, 0, 0]]) 125 | 126 | proj_mat[0, 0] = 2.0 * near / (max_x - min_x) 127 | proj_mat[1, 1] = 2.0 * near / (max_y - min_y) 128 | proj_mat[0, 2] = (max_x + min_x) / (max_x - min_x) 129 | proj_mat[1, 2] = (max_y + min_y) / (max_y - min_y) 130 | proj_mat[3, 2] = z_sign 131 | 132 | proj_mat[2, 2] = z_sign * far / (far - near) 133 | proj_mat[2, 3] = -(far * near) / (far - near) 134 | 135 | return proj_mat 136 | 137 | def get_camera_params(elev_angle, azim_angle, distance, resolution, fov=60, look_at=[0, 0, 0], up=[0, -1, 0]): 138 | 139 | elev = np.radians( elev_angle ) 140 | azim = np.radians( azim_angle ) 141 | 142 | # Generate random view 143 | cam_z = distance * np.cos(elev) * np.sin(azim) 144 | cam_y = distance * np.sin(elev) 145 | cam_x = distance * np.cos(elev) * np.cos(azim) 146 | 147 | modl = glm.mat4() 148 | view = glm.lookAt( 149 | glm.vec3(cam_x, cam_y, cam_z), 150 | glm.vec3(look_at[0], look_at[1], look_at[2]), 151 | glm.vec3(up[0], up[1], up[2]), 152 | ) 153 | 154 | a_mv = view * modl 155 | a_mv = np.array(a_mv.to_list()).T 156 | proj_mtx = persp_proj(fov) 157 | 158 | a_mvp = np.matmul(proj_mtx, a_mv).astype(np.float32)[None, ...] 159 | 160 | a_lightpos = np.linalg.inv(a_mv)[None, :3, 3] 161 | a_campos = a_lightpos 162 | 163 | return { 164 | 'mvp' : a_mvp, 165 | 'lightpos' : a_lightpos, 166 | 'campos' : a_campos, 167 | 'resolution' : [resolution, resolution], 168 | } 169 | 170 | # Returns a batch of camera parameters 171 | class CameraBatch(torch.utils.data.Dataset): 172 | def __init__( 173 | self, 174 | image_resolution, 175 | distances, 176 | azimuths, 177 | elevation_params, 178 | fovs, 179 | aug_loc, 180 | aug_light, 181 | aug_bkg, 182 | bs, 183 | look_at=[0, 0, 0], up=[0, -1, 0] 184 | ): 185 | 186 | self.res = image_resolution 187 | 188 | self.dist_min = distances[0] 189 | self.dist_max = distances[1] 190 | 191 | self.azim_min = azimuths[0] 192 | self.azim_max = azimuths[1] 193 | 194 | self.fov_min = fovs[0] 195 | self.fov_max = fovs[1] 196 | 197 | self.elev_alpha = elevation_params[0] 198 | self.elev_beta = elevation_params[1] 199 | self.elev_max = elevation_params[2] 200 | 201 | self.aug_loc = aug_loc 202 | self.aug_light = aug_light 203 | self.aug_bkg = aug_bkg 204 | 205 | self.look_at = look_at 206 | self.up = up 207 | 208 | self.batch_size = bs 209 | 210 | def __len__(self): 211 | return self.batch_size 212 | 213 | def __getitem__(self, index): 214 | 215 | elev = np.radians( np.random.beta( self.elev_alpha, self.elev_beta ) * self.elev_max ) 216 | azim = np.radians( np.random.uniform( self.azim_min, self.azim_max+1.0 ) ) 217 | dist = np.random.uniform( self.dist_min, self.dist_max ) 218 | fov = np.random.uniform( self.fov_min, self.fov_max ) 219 | 220 | proj_mtx = persp_proj(fov) 221 | 222 | # Generate random view 223 | cam_z = dist * np.cos(elev) * np.sin(azim) 224 | cam_y = dist * np.sin(elev) 225 | cam_x = dist * np.cos(elev) * np.cos(azim) 226 | 227 | if self.aug_loc: 228 | 229 | # Random offset 230 | limit = self.dist_min // 2 231 | rand_x = np.random.uniform( -limit, limit ) 232 | rand_y = np.random.uniform( -limit, limit ) 233 | 234 | modl = glm.translate(glm.mat4(), glm.vec3(rand_x, rand_y, 0)) 235 | 236 | else: 237 | 238 | modl = glm.mat4() 239 | 240 | view = 
glm.lookAt( 241 | glm.vec3(cam_x, cam_y, cam_z), 242 | glm.vec3(self.look_at[0], self.look_at[1], self.look_at[2]), 243 | glm.vec3(self.up[0], self.up[1], self.up[2]), 244 | ) 245 | 246 | r_mv = view * modl 247 | r_mv = np.array(r_mv.to_list()).T 248 | 249 | mvp = np.matmul(proj_mtx, r_mv).astype(np.float32) 250 | campos = np.linalg.inv(r_mv)[:3, 3] 251 | 252 | if self.aug_light: 253 | lightpos = cosine_sample(campos)*dist 254 | else: 255 | lightpos = campos*dist 256 | 257 | if self.aug_bkg: 258 | bkgs = get_random_bg(self.res, self.res).squeeze(0) 259 | else: 260 | bkgs = torch.ones(self.res, self.res, 3) 261 | 262 | return { 263 | 'mvp': torch.from_numpy( mvp ).float(), 264 | 'lightpos': torch.from_numpy( lightpos ).float(), 265 | 'campos': torch.from_numpy( campos ).float(), 266 | 'bkgs': bkgs 267 | } -------------------------------------------------------------------------------- /utils/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various helper functions 3 | 4 | create_scene -> combines multiple nvdiffmodeling meshes in to a single mesh with mega texture 5 | """ 6 | import sys 7 | import torch 8 | 9 | from math import ceil 10 | 11 | sys.path.append("../nvdiffmodeling") 12 | 13 | import nvdiffmodeling.src.mesh as mesh 14 | import nvdiffmodeling.src.texture as texture 15 | 16 | cosine_sim = torch.nn.CosineSimilarity() 17 | 18 | def cosine_sum(features, targets): 19 | return -cosine_sim(features, targets).sum() 20 | 21 | def cosine_avg(features, targets): 22 | return -cosine_sim(features, targets).mean() 23 | 24 | def _merge_attr_idx(a, b, a_idx, b_idx, scale_a=1.0, scale_b=1.0, add_a=0.0, add_b=0.0): 25 | if a is None and b is None: 26 | return None, None 27 | elif a is not None and b is None: 28 | return (a*scale_a)+add_a, a_idx 29 | elif a is None and b is not None: 30 | return (b*scale_b)+add_b, b_idx 31 | else: 32 | return torch.cat(((a*scale_a)+add_a, (b*scale_b)+add_b), dim=0), torch.cat((a_idx, b_idx + a.shape[0]), dim=0) 33 | 34 | def create_scene(meshes, sz=1024): 35 | 36 | # Need to comment and fix code 37 | 38 | scene = mesh.Mesh() 39 | 40 | tot = len(meshes) if len(meshes) % 2 == 0 else len(meshes)+1 41 | 42 | nx = 2 43 | ny = ceil(tot / 2) if ceil(tot / 2) % 2 == 0 else ceil(tot / 2) + 1 44 | 45 | w = int(sz*ny) 46 | h = int(sz*nx) 47 | 48 | dev = meshes[0].v_tex.device 49 | 50 | kd_atlas = torch.ones ( (1, w, h, 4) ).to(dev) 51 | ks_atlas = torch.zeros( (1, w, h, 3) ).to(dev) 52 | kn_atlas = torch.ones ( (1, w, h, 3) ).to(dev) 53 | 54 | for i, m in enumerate(meshes): 55 | v_pos, t_pos_idx = _merge_attr_idx(scene.v_pos, m.v_pos, scene.t_pos_idx, m.t_pos_idx) 56 | v_nrm, t_nrm_idx = _merge_attr_idx(scene.v_nrm, m.v_nrm, scene.t_nrm_idx, m.t_nrm_idx) 57 | v_tng, t_tng_idx = _merge_attr_idx(scene.v_tng, m.v_tng, scene.t_tng_idx, m.t_tng_idx) 58 | 59 | pos_x = i % nx 60 | pos_y = int(i / ny) 61 | 62 | sc_x = 1./nx 63 | sc_y = 1./ny 64 | 65 | v_tex, t_tex_idx = _merge_attr_idx( 66 | scene.v_tex, 67 | m.v_tex, 68 | scene.t_tex_idx, 69 | m.t_tex_idx, 70 | scale_a=1., 71 | scale_b=torch.tensor([sc_x, sc_y]).to(dev), 72 | add_a=0., 73 | add_b=torch.tensor([sc_x*pos_x, sc_y*pos_y]).to(dev) 74 | ) 75 | 76 | kd_atlas[:, pos_y*sz:(pos_y*sz)+sz, pos_x*sz:(pos_x*sz)+sz, :m.material['kd'].data.shape[-1]] = m.material['kd'].data 77 | ks_atlas[:, pos_y*sz:(pos_y*sz)+sz, pos_x*sz:(pos_x*sz)+sz, :m.material['ks'].data.shape[-1]] = m.material['ks'].data 78 | kn_atlas[:, pos_y*sz:(pos_y*sz)+sz, pos_x*sz:(pos_x*sz)+sz, 
:m.material['normal'].data.shape[-1]] = m.material['normal'].data 79 | 80 | scene = mesh.Mesh( 81 | v_pos=v_pos, 82 | t_pos_idx=t_pos_idx, 83 | v_nrm=v_nrm, 84 | t_nrm_idx=t_nrm_idx, 85 | v_tng=v_tng, 86 | t_tng_idx=t_tng_idx, 87 | v_tex=v_tex, 88 | t_tex_idx=t_tex_idx, 89 | base=scene 90 | ) 91 | 92 | scene = mesh.Mesh( 93 | material={ 94 | 'bsdf': 'diffuse', 95 | 'kd': texture.Texture2D( 96 | kd_atlas 97 | ), 98 | 'ks': texture.Texture2D( 99 | ks_atlas 100 | ), 101 | 'normal': texture.Texture2D( 102 | kn_atlas 103 | ), 104 | }, 105 | base=scene # gets uvs etc from here 106 | ) 107 | 108 | return scene -------------------------------------------------------------------------------- /utils/limit_subdivide.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loop Limit Subdvide helper class 3 | """ 4 | import torch 5 | import loop_limitation 6 | 7 | class limitation_evaluate(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, input, loop_obj): 10 | limitation = loop_obj.compute_limitation(input) 11 | jacobian = loop_obj.get_J() 12 | ctx.in1 = jacobian 13 | return limitation 14 | 15 | @staticmethod 16 | def backward(ctx, grad_output): 17 | grad = ctx.in1.T 18 | out = torch.matmul(grad,grad_output) 19 | return out, None 20 | 21 | 22 | class LimitSubdivide(): 23 | def __init__(self, vertices, faces) -> None: 24 | self.loop_limit = loop_limitation.loop_limitation() 25 | self.loop_limit.init_J(vertices.to('cpu').double(), faces.to('cpu').int()) 26 | self.compute_limit = limitation_evaluate.apply 27 | 28 | def get_limit(self, vertices): 29 | new_verts = self.compute_limit(vertices.to('cpu').double(), self.loop_limit) 30 | return new_verts -------------------------------------------------------------------------------- /utils/resize_right.py: -------------------------------------------------------------------------------- 1 | """ 2 | Resize_Right from Assaf Shocher 3 | https://github.com/assafshocher/ResizeRight 4 | """ 5 | from math import pi 6 | 7 | try: 8 | import torch 9 | except ImportError: 10 | torch = None 11 | 12 | try: 13 | import numpy 14 | except ImportError: 15 | numpy = None 16 | 17 | if numpy is None and torch is None: 18 | raise ImportError("Must have either Numpy or PyTorch but both not found") 19 | 20 | 21 | def set_framework_dependencies(x): 22 | if type(x) is numpy.ndarray: 23 | to_dtype = lambda a: a 24 | fw = numpy 25 | else: 26 | to_dtype = lambda a: a.to(x.dtype) 27 | fw = torch 28 | eps = fw.finfo(fw.float32).eps 29 | return fw, to_dtype, eps 30 | 31 | 32 | def support_sz(sz): 33 | def wrapper(f): 34 | f.support_sz = sz 35 | return f 36 | return wrapper 37 | 38 | 39 | @support_sz(4) 40 | def cubic(x): 41 | fw, to_dtype, eps = set_framework_dependencies(x) 42 | absx = fw.abs(x) 43 | absx2 = absx ** 2 44 | absx3 = absx ** 3 45 | return ((1.5 * absx3 - 2.5 * absx2 + 1.) * to_dtype(absx <= 1.) + 46 | (-0.5 * absx3 + 2.5 * absx2 - 4. * absx + 2.) * 47 | to_dtype((1. 
< absx) & (absx <= 2.))) 48 | 49 | 50 | @support_sz(4) 51 | def lanczos2(x): 52 | fw, to_dtype, eps = set_framework_dependencies(x) 53 | return (((fw.sin(pi * x) * fw.sin(pi * x / 2) + eps) / 54 | ((pi**2 * x**2 / 2) + eps)) * to_dtype(abs(x) < 2)) 55 | 56 | 57 | @support_sz(6) 58 | def lanczos3(x): 59 | fw, to_dtype, eps = set_framework_dependencies(x) 60 | return (((fw.sin(pi * x) * fw.sin(pi * x / 3) + eps) / 61 | ((pi**2 * x**2 / 3) + eps)) * to_dtype(abs(x) < 3)) 62 | 63 | 64 | @support_sz(2) 65 | def linear(x): 66 | fw, to_dtype, eps = set_framework_dependencies(x) 67 | return ((x + 1) * to_dtype((-1 <= x) & (x < 0)) + (1 - x) * 68 | to_dtype((0 <= x) & (x <= 1))) 69 | 70 | 71 | @support_sz(1) 72 | def box(x): 73 | fw, to_dtype, eps = set_framework_dependencies(x) 74 | return to_dtype((-1 <= x) & (x < 0)) + to_dtype((0 <= x) & (x <= 1)) 75 | 76 | from typing import Tuple 77 | import warnings 78 | from math import ceil 79 | from fractions import Fraction 80 | 81 | 82 | class NoneClass: 83 | pass 84 | 85 | 86 | try: 87 | import torch 88 | from torch import nn 89 | nnModuleWrapped = nn.Module 90 | except ImportError: 91 | warnings.warn('No PyTorch found, will work only with Numpy') 92 | torch = None 93 | nnModuleWrapped = NoneClass 94 | 95 | try: 96 | import numpy 97 | except ImportError: 98 | warnings.warn('No Numpy found, will work only with PyTorch') 99 | numpy = None 100 | 101 | 102 | if numpy is None and torch is None: 103 | raise ImportError("Must have either Numpy or PyTorch but both not found") 104 | 105 | 106 | def resize(input, scale_factors=None, out_shape=None, 107 | interp_method=cubic, support_sz=None, 108 | antialiasing=True, by_convs=False, scale_tolerance=None, 109 | max_numerator=10, pad_mode='constant'): 110 | # get properties of the input tensor 111 | in_shape, n_dims = input.shape, input.ndim 112 | 113 | # fw stands for framework that can be either numpy or torch, 114 | # determined by the input type 115 | fw = numpy if type(input) is numpy.ndarray else torch 116 | eps = fw.finfo(fw.float32).eps 117 | device = input.device if fw is torch else None 118 | 119 | # set missing scale factors or output shapem one according to another, 120 | # scream if both missing. this is also where all the defults policies 121 | # take place. also handling the by_convs attribute carefully. 122 | scale_factors, out_shape, by_convs = set_scale_and_out_sz(in_shape, 123 | out_shape, 124 | scale_factors, 125 | by_convs, 126 | scale_tolerance, 127 | max_numerator, 128 | eps, fw) 129 | 130 | # sort indices of dimensions according to scale of each dimension. 131 | # since we are going dim by dim this is efficient 132 | sorted_filtered_dims_and_scales = [(dim, scale_factors[dim], by_convs[dim], 133 | in_shape[dim], out_shape[dim]) 134 | for dim in sorted(range(n_dims), 135 | key=lambda ind: scale_factors[ind]) 136 | if scale_factors[dim] != 1.] 
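    # Worked example (illustrative, not from the original library): resizing a torch tensor
    # of shape (1, 3, 512, 512) with out_shape=(224, 224) expands to
    #     scale_factors = [1, 1, 224/512, 224/512]   and   by_convs = [False] * 4,
    # so the comprehension above keeps only the two spatial dimensions:
    #     [(2, 0.4375, False, 512, 224), (3, 0.4375, False, 512, 224)]
    # and the loop below then resizes the height dimension first, followed by the width.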
137 | 138 | # unless support size is specified by the user, it is an attribute 139 | # of the interpolation method 140 | if support_sz is None: 141 | support_sz = interp_method.support_sz 142 | 143 | # output begins identical to input and changes with each iteration 144 | output = input 145 | 146 | # iterate over dims 147 | for (dim, scale_factor, dim_by_convs, in_sz, out_sz 148 | ) in sorted_filtered_dims_and_scales: 149 | # STEP 1- PROJECTED GRID: The non-integer locations of the projection 150 | # of output pixel locations to the input tensor 151 | projected_grid = get_projected_grid(in_sz, out_sz, 152 | scale_factor, fw, dim_by_convs, 153 | device) 154 | 155 | # STEP 1.5: ANTIALIASING- If antialiasing is taking place, we modify 156 | # the window size and the interpolation method (see inside function) 157 | cur_interp_method, cur_support_sz = apply_antialiasing_if_needed( 158 | interp_method, 159 | support_sz, 160 | scale_factor, 161 | antialiasing) 162 | 163 | # STEP 2- FIELDS OF VIEW: for each output pixels, map the input pixels 164 | # that influence it. Also calculate needed padding and update grid 165 | # accoedingly 166 | field_of_view = get_field_of_view(projected_grid, cur_support_sz, fw, 167 | eps, device) 168 | 169 | # STEP 2.5- CALCULATE PAD AND UPDATE: according to the field of view, 170 | # the input should be padded to handle the boundaries, coordinates 171 | # should be updated. actual padding only occurs when weights are 172 | # aplied (step 4). if using by_convs for this dim, then we need to 173 | # calc right and left boundaries for each filter instead. 174 | pad_sz, projected_grid, field_of_view = calc_pad_sz(in_sz, out_sz, 175 | field_of_view, 176 | projected_grid, 177 | scale_factor, 178 | dim_by_convs, fw, 179 | device) 180 | 181 | # STEP 3- CALCULATE WEIGHTS: Match a set of weights to the pixels in 182 | # the field of view for each output pixel 183 | weights = get_weights(cur_interp_method, projected_grid, field_of_view) 184 | 185 | # STEP 4- APPLY WEIGHTS: Each output pixel is calculated by multiplying 186 | # its set of weights with the pixel values in its field of view. 187 | # We now multiply the fields of view with their matching weights. 188 | # We do this by tensor multiplication and broadcasting. 189 | # if by_convs is true for this dim, then we do this action by 190 | # convolutions. this is equivalent but faster. 191 | if not dim_by_convs: 192 | output = apply_weights(output, field_of_view, weights, dim, n_dims, 193 | pad_sz, pad_mode, fw) 194 | else: 195 | output = apply_convs(output, scale_factor, in_sz, out_sz, weights, 196 | dim, pad_sz, pad_mode, fw) 197 | return output 198 | 199 | 200 | def get_projected_grid(in_sz, out_sz, scale_factor, fw, by_convs, device=None): 201 | # we start by having the ouput coordinates which are just integer locations 202 | # in the special case when usin by_convs, we only need two cycles of grid 203 | # points. the first and last. 204 | grid_sz = out_sz if not by_convs else scale_factor.numerator 205 | out_coordinates = fw_arange(grid_sz, fw, device) 206 | 207 | # This is projecting the ouput pixel locations in 1d to the input tensor, 208 | # as non-integer locations. 209 | # the following fomrula is derived in the paper 210 | # "From Discrete to Continuous Convolutions" by Shocher et al. 
211 | return (out_coordinates / float(scale_factor) + 212 | (in_sz - 1) / 2 - (out_sz - 1) / (2 * float(scale_factor))) 213 | 214 | 215 | def get_field_of_view(projected_grid, cur_support_sz, fw, eps, device): 216 | # for each output pixel, map which input pixels influence it, in 1d. 217 | # we start by calculating the leftmost neighbor, using half of the window 218 | # size (eps is for when boundary is exact int) 219 | left_boundaries = fw_ceil(projected_grid - cur_support_sz / 2 - eps, fw) 220 | 221 | # then we simply take all the pixel centers in the field by counting 222 | # window size pixels from the left boundary 223 | ordinal_numbers = fw_arange(ceil(cur_support_sz - eps), fw, device) 224 | return left_boundaries[:, None] + ordinal_numbers 225 | 226 | 227 | def calc_pad_sz(in_sz, out_sz, field_of_view, projected_grid, scale_factor, 228 | dim_by_convs, fw, device): 229 | if not dim_by_convs: 230 | # determine padding according to neighbor coords out of bound. 231 | # this is a generalized notion of padding, when pad<0 it means crop 232 | pad_sz = [-field_of_view[0, 0].item(), 233 | field_of_view[-1, -1].item() - in_sz + 1] 234 | 235 | # since input image will be changed by padding, coordinates of both 236 | # field_of_view and projected_grid need to be updated 237 | field_of_view += pad_sz[0] 238 | projected_grid += pad_sz[0] 239 | 240 | else: 241 | # only used for by_convs, to calc the boundaries of each filter the 242 | # number of distinct convolutions is the numerator of the scale factor 243 | num_convs, stride = scale_factor.numerator, scale_factor.denominator 244 | 245 | # calculate left and right boundaries for each conv. left can also be 246 | # negative right can be bigger than in_sz. such cases imply padding if 247 | # needed. however if# both are in-bounds, it means we need to crop, 248 | # practically apply the conv only on part of the image. 249 | left_pads = -field_of_view[:, 0] 250 | 251 | # next calc is tricky, explanation by rows: 252 | # 1) counting output pixels between the first position of each filter 253 | # to the right boundary of the input 254 | # 2) dividing it by number of filters to count how many 'jumps' 255 | # each filter does 256 | # 3) multiplying by the stride gives us the distance over the input 257 | # coords done by all these jumps for each filter 258 | # 4) to this distance we add the right boundary of the filter when 259 | # placed in its leftmost position. so now we get the right boundary 260 | # of that filter in input coord. 261 | # 5) the padding size needed is obtained by subtracting the rightmost 262 | # input coordinate. if the result is positive padding is needed. if 263 | # negative then negative padding means shaving off pixel columns. 264 | right_pads = (((out_sz - fw_arange(num_convs, fw, device) - 1) # (1) 265 | // num_convs) # (2) 266 | * stride # (3) 267 | + field_of_view[:, -1] # (4) 268 | - in_sz + 1) # (5) 269 | 270 | # in the by_convs case pad_sz is a list of left-right pairs. 
one per 271 | # each filter 272 | 273 | pad_sz = list(zip(left_pads, right_pads)) 274 | 275 | return pad_sz, projected_grid, field_of_view 276 | 277 | 278 | def get_weights(interp_method, projected_grid, field_of_view): 279 | # the set of weights per each output pixels is the result of the chosen 280 | # interpolation method applied to the distances between projected grid 281 | # locations and the pixel-centers in the field of view (distances are 282 | # directed, can be positive or negative) 283 | weights = interp_method(projected_grid[:, None] - field_of_view) 284 | 285 | # we now carefully normalize the weights to sum to 1 per each output pixel 286 | sum_weights = weights.sum(1, keepdims=True) 287 | sum_weights[sum_weights == 0] = 1 288 | return weights / sum_weights 289 | 290 | 291 | def apply_weights(input, field_of_view, weights, dim, n_dims, pad_sz, pad_mode, 292 | fw): 293 | # for this operation we assume the resized dim is the first one. 294 | # so we transpose and will transpose back after multiplying 295 | tmp_input = fw_swapaxes(input, dim, 0, fw) 296 | 297 | # apply padding 298 | tmp_input = fw_pad(tmp_input, fw, pad_sz, pad_mode) 299 | 300 | # field_of_view is a tensor of order 2: for each output (1d location 301 | # along cur dim)- a list of 1d neighbors locations. 302 | # note that this whole operations is applied to each dim separately, 303 | # this is why it is all in 1d. 304 | # neighbors = tmp_input[field_of_view] is a tensor of order image_dims+1: 305 | # for each output pixel (this time indicated in all dims), these are the 306 | # values of the neighbors in the 1d field of view. note that we only 307 | # consider neighbors along the current dim, but such set exists for every 308 | # multi-dim location, hence the final tensor order is image_dims+1. 309 | neighbors = tmp_input[field_of_view] 310 | 311 | # weights is an order 2 tensor: for each output location along 1d- a list 312 | # of weights matching the field of view. we augment it with ones, for 313 | # broadcasting, so that when multiplies some tensor the weights affect 314 | # only its first dim. 315 | tmp_weights = fw.reshape(weights, (*weights.shape, * [1] * (n_dims - 1))) 316 | 317 | # now we simply multiply the weights with the neighbors, and then sum 318 | # along the field of view, to get a single value per out pixel 319 | tmp_output = (neighbors * tmp_weights).sum(1) 320 | 321 | # we transpose back the resized dim to its original position 322 | return fw_swapaxes(tmp_output, 0, dim, fw) 323 | 324 | 325 | def apply_convs(input, scale_factor, in_sz, out_sz, weights, dim, pad_sz, 326 | pad_mode, fw): 327 | # for this operations we assume the resized dim is the last one. 328 | # so we transpose and will transpose back after multiplying 329 | input = fw_swapaxes(input, dim, -1, fw) 330 | 331 | # the stride for all convs is the denominator of the scale factor 332 | stride, num_convs = scale_factor.denominator, scale_factor.numerator 333 | 334 | # prepare an empty tensor for the output 335 | tmp_out_shape = list(input.shape) 336 | tmp_out_shape[-1] = out_sz 337 | tmp_output = fw_empty(tuple(tmp_out_shape), fw, input.device) 338 | 339 | # iterate over the conv operations. we have as many as the numerator 340 | # of the scale-factor. for each we need boundaries and a filter. 
341 | for conv_ind, (pad_sz, filt) in enumerate(zip(pad_sz, weights)): 342 | # apply padding (we pad last dim, padding can be negative) 343 | pad_dim = input.ndim - 1 344 | tmp_input = fw_pad(input, fw, pad_sz, pad_mode, dim=pad_dim) 345 | 346 | # apply convolution over last dim. store in the output tensor with 347 | # positional strides so that when the loop is comlete conv results are 348 | # interwind 349 | tmp_output[..., conv_ind::num_convs] = fw_conv(tmp_input, filt, stride) 350 | 351 | return fw_swapaxes(tmp_output, -1, dim, fw) 352 | 353 | 354 | def set_scale_and_out_sz(in_shape, out_shape, scale_factors, by_convs, 355 | scale_tolerance, max_numerator, eps, fw): 356 | # eventually we must have both scale-factors and out-sizes for all in/out 357 | # dims. however, we support many possible partial arguments 358 | if scale_factors is None and out_shape is None: 359 | raise ValueError("either scale_factors or out_shape should be " 360 | "provided") 361 | if out_shape is not None: 362 | # if out_shape has less dims than in_shape, we defaultly resize the 363 | # first dims for numpy and last dims for torch 364 | out_shape = (list(out_shape) + list(in_shape[len(out_shape):]) 365 | if fw is numpy 366 | else list(in_shape[:-len(out_shape)]) + list(out_shape)) 367 | if scale_factors is None: 368 | # if no scale given, we calculate it as the out to in ratio 369 | # (not recomended) 370 | scale_factors = [out_sz / in_sz for out_sz, in_sz 371 | in zip(out_shape, in_shape)] 372 | if scale_factors is not None: 373 | # by default, if a single number is given as scale, we assume resizing 374 | # two dims (most common are images with 2 spatial dims) 375 | scale_factors = (scale_factors 376 | if isinstance(scale_factors, (list, tuple)) 377 | else [scale_factors, scale_factors]) 378 | # if less scale_factors than in_shape dims, we defaultly resize the 379 | # first dims for numpy and last dims for torch 380 | scale_factors = (list(scale_factors) + [1] * 381 | (len(in_shape) - len(scale_factors)) if fw is numpy 382 | else [1] * (len(in_shape) - len(scale_factors)) + 383 | list(scale_factors)) 384 | if out_shape is None: 385 | # when no out_shape given, it is calculated by multiplying the 386 | # scale by the in_shape (not recomended) 387 | out_shape = [ceil(scale_factor * in_sz) 388 | for scale_factor, in_sz in 389 | zip(scale_factors, in_shape)] 390 | # next part intentionally after out_shape determined for stability 391 | # we fix by_convs to be a list of truth values in case it is not 392 | if not isinstance(by_convs, (list, tuple)): 393 | by_convs = [by_convs] * len(out_shape) 394 | 395 | # next loop fixes the scale for each dim to be either frac or float. 396 | # this is determined by by_convs and by tolerance for scale accuracy. 397 | for ind, (sf, dim_by_convs) in enumerate(zip(scale_factors, by_convs)): 398 | # first we fractionaize 399 | if dim_by_convs: 400 | frac = Fraction(1/sf).limit_denominator(max_numerator) 401 | frac = Fraction(numerator=frac.denominator, denominator=frac.numerator) 402 | 403 | # if accuracy is within tolerance scale will be frac. 
if not, then 404 | # it will be float and the by_convs attr will be set false for 405 | # this dim 406 | if scale_tolerance is None: 407 | scale_tolerance = eps 408 | if dim_by_convs and abs(frac - sf) < scale_tolerance: 409 | scale_factors[ind] = frac 410 | else: 411 | scale_factors[ind] = float(sf) 412 | by_convs[ind] = False 413 | 414 | return scale_factors, out_shape, by_convs 415 | 416 | 417 | def apply_antialiasing_if_needed(interp_method, support_sz, scale_factor, 418 | antialiasing): 419 | # antialiasing is "stretching" the field of view according to the scale 420 | # factor (only for downscaling). this is low-pass filtering. this 421 | # requires modifying both the interpolation (stretching the 1d 422 | # function and multiplying by the scale-factor) and the window size. 423 | scale_factor = float(scale_factor) 424 | if scale_factor >= 1.0 or not antialiasing: 425 | return interp_method, support_sz 426 | cur_interp_method = (lambda arg: scale_factor * 427 | interp_method(scale_factor * arg)) 428 | cur_support_sz = support_sz / scale_factor 429 | return cur_interp_method, cur_support_sz 430 | 431 | 432 | def fw_ceil(x, fw): 433 | if fw is numpy: 434 | return fw.int_(fw.ceil(x)) 435 | else: 436 | return x.ceil().long() 437 | 438 | 439 | def fw_floor(x, fw): 440 | if fw is numpy: 441 | return fw.int_(fw.floor(x)) 442 | else: 443 | return x.floor().long() 444 | 445 | 446 | def fw_cat(x, fw): 447 | if fw is numpy: 448 | return fw.concatenate(x) 449 | else: 450 | return fw.cat(x) 451 | 452 | 453 | def fw_swapaxes(x, ax_1, ax_2, fw): 454 | if fw is numpy: 455 | return fw.swapaxes(x, ax_1, ax_2) 456 | else: 457 | return x.transpose(ax_1, ax_2) 458 | 459 | 460 | def fw_pad(x, fw, pad_sz, pad_mode, dim=0): 461 | if pad_sz == (0, 0): 462 | return x 463 | if fw is numpy: 464 | pad_vec = [(0, 0)] * x.ndim 465 | pad_vec[dim] = pad_sz 466 | return fw.pad(x, pad_width=pad_vec, mode=pad_mode) 467 | else: 468 | if x.ndim < 3: 469 | x = x[None, None, ...] 470 | 471 | pad_vec = [0] * ((x.ndim - 2) * 2) 472 | pad_vec[0:2] = pad_sz 473 | return fw.nn.functional.pad(x.transpose(dim, -1), pad=pad_vec, 474 | mode=pad_mode).transpose(dim, -1) 475 | 476 | 477 | def fw_conv(input, filter, stride): 478 | # we want to apply 1d conv to any nd array. the way to do it is to reshape 479 | # the input to a 4D tensor. first two dims are singeletons, 3rd dim stores 480 | # all the spatial dims that we are not convolving along now. then we can 481 | # apply conv2d with a 1xK filter. This convolves the same way all the other 482 | # dims stored in the 3d dim. like depthwise conv over these. 
483 | # TODO: numpy support 484 | reshaped_input = input.reshape(1, 1, -1, input.shape[-1]) 485 | reshaped_output = torch.nn.functional.conv2d(reshaped_input, 486 | filter.view(1, 1, 1, -1), 487 | stride=(1, stride)) 488 | return reshaped_output.reshape(*input.shape[:-1], -1) 489 | 490 | 491 | def fw_arange(upper_bound, fw, device): 492 | if fw is numpy: 493 | return fw.arange(upper_bound) 494 | else: 495 | return fw.arange(upper_bound, device=device) 496 | 497 | 498 | def fw_empty(shape, fw, device): 499 | if fw is numpy: 500 | return fw.empty(shape) 501 | else: 502 | return fw.empty(size=(*shape,), device=device) -------------------------------------------------------------------------------- /utils/video.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper class to create and add images to video 3 | """ 4 | import imageio 5 | import numpy as np 6 | 7 | class Video(): 8 | def __init__(self, path, name='video_log.mp4', mode='I', fps=30, codec='libx264', bitrate='16M') -> None: 9 | 10 | if path[-1] != "/": 11 | path += "/" 12 | 13 | self.writer = imageio.get_writer(path+name, mode=mode, fps=fps, codec=codec, bitrate=bitrate) 14 | 15 | def ready_image(self, image, write_video=True): 16 | # assuming channels last - as renderer returns it 17 | if len(image.shape) == 4: 18 | image = image.squeeze(0)[..., :3].detach().cpu().numpy() 19 | else: 20 | image = image[..., :3].detach().cpu().numpy() 21 | 22 | image = np.clip(np.rint(image*255.0), 0, 255).astype(np.uint8) 23 | 24 | if write_video: 25 | self.writer.append_data(image) 26 | 27 | return image 28 | 29 | def close(self): 30 | self.writer.close() --------------------------------------------------------------------------------
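Taken together, the utilities above handle camera sampling (utils/camera.py), scene assembly (utils/helpers.py), differentiable image resizing (utils/resize_right.py) and video logging (utils/video.py), all of which are driven by loop.py. The following is a minimal, hypothetical sketch of how the camera and video helpers can be exercised on their own, assuming it is run from the repository root; the resolutions, camera ranges and the commented-out render() call are illustrative assumptions and not part of the repository.

import torch

from utils.camera import CameraBatch, get_camera_params
from utils.video import Video

# Randomised training cameras; the ranges mirror the fov/dist/elev/azim options parsed in
# main.py (the concrete values here are illustrative only).
cams = CameraBatch(
    image_resolution=224,
    distances=[2.5, 3.5],
    azimuths=[0.0, 360.0],
    elevation_params=[1.0, 5.0, 30.0],
    fovs=[30.0, 60.0],
    aug_loc=True,
    aug_light=True,
    aug_bkg=True,
    bs=4,
)
batch = next(iter(torch.utils.data.DataLoader(cams, batch_size=4)))
# batch['mvp'] has shape (4, 4, 4); batch['bkgs'] has shape (4, 224, 224, 3)

# Fixed logging view, analogous to the log_elev/log_fov/log_dist/log_res options in main.py
log_cam = get_camera_params(elev_angle=30.0, azim_angle=45.0, distance=3.0, resolution=512)

video = Video(".", name="video_log.mp4", fps=30)
frame = torch.ones(1, 512, 512, 3)   # stand-in for a rendered image in (B, H, W, C) layout
# frame = render(scene, log_cam)     # hypothetical: the real rendering happens inside loop.py
video.ready_image(frame)
video.close()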