├── .gitignore ├── Dockerfile ├── Makefile ├── README.md ├── example ├── cube.py ├── depth.png ├── normals.png ├── nvdiffrast_compare.ipynb ├── smpl_tmp.pkl └── timing.py ├── minimal_pytorch_rasterizer ├── __init__.py ├── assert_utils.py ├── camera.py ├── cuda │ ├── rasterizer.cpp │ └── rasterizer_kernel.cu ├── rasterizer.py └── utils.py ├── requirements.txt ├── setup.py └── setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | minimal_pytorch_rasterizer.egg-info/* 2 | dist/* 3 | build/* 4 | 5 | .ipynb_checkpoints 6 | */.ipynb_checkpoints/* 7 | 8 | .idea 9 | */.idea/* 10 | 11 | __pycache__ 12 | */__pycache__/* 13 | 14 | .vscode 15 | */.vscode/* 16 | 17 | example/tmp_u.png 18 | example/tmp_v.png 19 | 20 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cudagl:10.2-devel-ubuntu18.04 2 | 3 | ENV TZ=Europe/Moscow 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | RUN apt-get update && apt-get install -y \ 7 | build-essential \ 8 | rsync \ 9 | curl \ 10 | wget \ 11 | htop \ 12 | git \ 13 | openssh-server \ 14 | nano \ 15 | cmake \ 16 | unzip \ 17 | zip \ 18 | python-opencv \ 19 | vim \ 20 | ffmpeg \ 21 | tmux \ 22 | freeglut3-dev 23 | 24 | # cudnn 25 | RUN apt-get update && apt-get install -y --no-install-recommends \ 26 | libcudnn7=7.6.5.32-1+cuda10.2 \ 27 | libcudnn7-dev=7.6.5.32-1+cuda10.2 \ 28 | && apt-mark hold libcudnn7 && \ 29 | rm -rf /var/lib/apt/lists/* 30 | 31 | # nvdiffrast setup 32 | RUN apt-get update && apt-get install -y \ 33 | pkg-config \ 34 | libglvnd0 \ 35 | libgl1 \ 36 | libglx0 \ 37 | libegl1 \ 38 | libgles2 \ 39 | libglvnd-dev \ 40 | libgl1-mesa-dev \ 41 | libegl1-mesa-dev \ 42 | libgles2-mesa-dev 43 | ENV NVIDIA_VISIBLE_DEVICES all 44 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics 45 | ENV PYOPENGL_PLATFORM egl 46 | ENV LD_LIBRARY_PATH /usr/lib64:$LD_LIBRARY_PATH 47 | # nvdiffrast python package is installed from requirements.txt then 48 | 49 | 50 | RUN echo '{"file_format_version": "1.0.0", "ICD": {"library_path": "libEGL_nvidia.so.0"}}' | \ 51 | tee /usr/share/glvnd/egl_vendor.d/10_nvidia.json 52 | 53 | ## glew installation from source 54 | RUN curl -L https://downloads.sourceforge.net/project/glew/glew/2.1.0/glew-2.1.0.tgz > /tmp/glew-2.1.0.tgz 55 | RUN mkdir -p /tmp && \ 56 | cd /tmp && tar zxf /tmp/glew-2.1.0.tgz && cd glew-2.1.0 && \ 57 | SYSTEM=linux-egl make && \ 58 | SYSTEM=linux-egl make install && \ 59 | rm -rf /tmp/glew-2.1.0.zip /tmp/glew-2.1.0 60 | 61 | 62 | # fixuid 63 | ARG USERNAME=docker 64 | RUN apt-get update && apt-get install -y sudo curl && \ 65 | addgroup --gid 1000 $USERNAME && \ 66 | adduser --uid 1000 --gid 1000 --disabled-password --gecos '' $USERNAME && \ 67 | adduser $USERNAME sudo && \ 68 | echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ 69 | USER=$USERNAME && \ 70 | GROUP=$USERNAME && \ 71 | curl -SsL https://github.com/boxboat/fixuid/releases/download/v0.4/fixuid-0.4-linux-amd64.tar.gz | tar -C /usr/local/bin -xzf - && \ 72 | chown root:root /usr/local/bin/fixuid && \ 73 | chmod 4755 /usr/local/bin/fixuid && \ 74 | mkdir -p /etc/fixuid && \ 75 | printf "user: $USER\ngroup: $GROUP\n" > /etc/fixuid/config.yml 76 | 77 | 78 | # conda 79 | RUN wget --quiet https://repo.continuum.io/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh -O ~/miniconda.sh && \ 80 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 81 | rm ~/miniconda.sh && \ 82 | /opt/conda/bin/conda clean -tipsy && \ 83 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 84 | echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 85 | # this version of miniconda's /opt/conda/bin provides pip = pip3 = pip3.7, python = python3 = python3.7 86 | ENV PATH /opt/conda/bin:$PATH 87 | ENV PYTHONDONTWRITEBYTECODE=1 88 | ENV PYTHONUNBUFFERED=1 89 | RUN pip install --upgrade pip 90 | 91 | 92 | # python pkgs 93 | RUN conda install pytorch=1.7.1 torchvision=0.8.2 cudatoolkit=10.2 -c pytorch 94 | COPY requirements.txt /opt/requirements.txt 95 | RUN pip --no-cache-dir install -r /opt/requirements.txt 96 | 97 | COPY ./ /opt/minimal_pytorch_rasterizer 98 | RUN cd /opt/minimal_pytorch_rasterizer && ./setup.sh 99 | 100 | USER $USERNAME:$USERNAME 101 | ENTRYPOINT ["fixuid", "-q"] 102 | CMD ["fixuid", "-q", "bash"] 103 | WORKDIR /src 104 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NAME?=minimal_pytorch_rasterizer 2 | COMMAND?=bash 3 | VOLUMES?=-v /home/renat:/home/renat 4 | 5 | GPUS?=all 6 | ifeq ($(GPUS),none) 7 | GPUS_OPTION= 8 | else 9 | GPUS_OPTION=--gpus=$(GPUS) 10 | endif 11 | 12 | .PHONY: all 13 | all: stop build run 14 | 15 | .PHONY: build 16 | build: 17 | docker build -t $(NAME) . 18 | 19 | .PHONY: stop 20 | stop: 21 | -docker stop $(NAME) 22 | -docker rm $(NAME) 23 | 24 | .PHONY: run 25 | run: 26 | docker run --rm -dit \ 27 | $(GPUS_OPTION) \ 28 | --net=host \ 29 | --ipc=host \ 30 | -u $(shell id -u ${USER}):$(shell id -g ${USER}) \ 31 | -v $(shell pwd)/example:/example \ 32 | --name=$(NAME) \ 33 | -w /example \ 34 | $(VOLUMES) \ 35 | $(NAME) \ 36 | $(COMMAND) 37 | docker attach $(NAME) 38 | 39 | .PHONY: attach 40 | attach: 41 | docker attach $(NAME) 42 | 43 | .PHONY: logs 44 | logs: 45 | docker logs -f $(NAME) 46 | 47 | .PHONY: exec 48 | exec: 49 | docker exec -it $(NAME) $(COMMAND) 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | **minimal_pytorch_rasterizer** is a CUDA non-differentiable mesh rasterization library for pytorch tensors with python bindings. 3 | 4 | It projects mesh to image using pinhole camera model. Vertices could have any number of features (channels). Library also estimates normals for mesh visualization. 5 | 6 | A mesh with 6890 vertices and 13776 faces is rasterized on 1000x1000 image in less than 1ms on 2080ti GPU. Check timings [here](./example/timing.py). 7 | 8 | The results are consistent with [nvdiffrast](https://github.com/NVlabs/nvdiffrast) output. Check comparison [here](./example/nvdiffrast_compare.ipynb). 9 | 10 | ## Example 11 | 12 | [Visualize z buffer and normals of cube](./example/cube.py): 13 | ```python 14 | import minimal_pytorch_rasterizer as mpr 15 | import torch 16 | import cv2 17 | import numpy as np 18 | 19 | 20 | dtype = torch.float32 21 | device = torch.device('cuda:0') 22 | 23 | vertices = torch.tensor([ 24 | [0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0], 25 | [0, 1, 1], [1, 1, 1], [1, 0, 1], [0, 0, 1] 26 | ], dtype=dtype, device=device) 27 | 28 | faces = torch.tensor([ 29 | [0, 2, 1], [0, 3, 2], [2, 3, 4], [2, 4, 5], [1, 2, 5], [1, 5, 6], 30 | [0, 7, 4], [0, 4, 3], [5, 4, 7], [5, 7, 6], [0, 6, 7], [0, 1, 6] 31 | ], dtype=torch.int32, device=device) 32 | 33 | R = torch.tensor(cv2.Rodrigues(np.array([0.5, 0.8, 0.2]))[0], dtype=dtype, device=device) 34 | t = torch.tensor([-0.5, -0.5, 1.3], dtype=dtype, device=device) 35 | vertices = vertices @ R.T + t 36 | 37 | pinhole2d = mpr.Pinhole2D( 38 | fx=250, fy=200, 39 | cx=160, cy=120, 40 | w=320, h=240, 41 | ) 42 | 43 | z_buffer = mpr.project_mesh( 44 | vertices=vertices, 45 | faces=faces, 46 | vertice_values=vertices[:, [2]], # take z coordinate as values 47 | pinhole=pinhole2d 48 | ) 49 | vis_z_buffer_cpu = mpr.vis_z_buffer(z_buffer) 50 | cv2.imwrite('./depth.png', vis_z_buffer_cpu) 51 | 52 | 53 | coords, normals = mpr.estimate_normals( 54 | vertices=vertices, 55 | faces=faces, 56 | pinhole=pinhole2d 57 | ) 58 | vis_normals_cpu = mpr.vis_normals(coords, normals) 59 | cv2.imwrite('./normals.png', vis_normals_cpu) 60 | ``` 61 | Will produce: 62 | 63 | ![result](./example/depth.png) 64 | ![result](./example/normals.png) 65 | 66 | ## Installation 67 | - `pip intall .` or `./setup.sh` 68 | - To build for custom cuda arches set env variable: `export TORCH_CUDA_ARCH_LIST="Pascal Turing"`. This env variable is used [here](https://github.com/pytorch/pytorch/blob/5710374e4e335c6761d2b8b937a2b54a5577cb5d/torch/utils/cpp_extension.py#L1298). 69 | - Possible intallation errors: 70 | - `packedpacked_accessor32` in error msgs means you have pytorch version < 1.3 71 | - Errors caused by pytorch internal header files could mean that you have pytorch cuda version (provided by cudatoolkit) and nvcc cuda version mismatch 72 | - Docker environment to run comparison [notebook](./example/nvdiffrast_compare.ipynb) is provided: 73 | - Install [Docker](https://www.docker.com/), [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), set `nvidia` your default runtime for `docker` 74 | - Build docker image: `make build` 75 | - Enter docker container: `make run` 76 | - Run jupyter 77 | - Tested till pytorch 1.8 78 | - `torch.float32` or `torch.float64` dtypes are supported, `torch.float16` is not 79 | -------------------------------------------------------------------------------- /example/cube.py: -------------------------------------------------------------------------------- 1 | import minimal_pytorch_rasterizer as mpr 2 | import torch 3 | import cv2 4 | import numpy as np 5 | 6 | 7 | dtype = torch.float32 8 | device = torch.device('cuda:0') 9 | 10 | vertices = torch.tensor([ 11 | [0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0], 12 | [0, 1, 1], [1, 1, 1], [1, 0, 1], [0, 0, 1] 13 | ], dtype=dtype, device=device) 14 | 15 | faces = torch.tensor([ 16 | [0, 2, 1], [0, 3, 2], [2, 3, 4], [2, 4, 5], [1, 2, 5], [1, 5, 6], 17 | [0, 7, 4], [0, 4, 3], [5, 4, 7], [5, 7, 6], [0, 6, 7], [0, 1, 6] 18 | ], dtype=torch.int32, device=device) 19 | 20 | R = torch.tensor(cv2.Rodrigues(np.array([0.5, 0.8, 0.2]))[0], dtype=dtype, device=device) 21 | t = torch.tensor([-0.5, -0.5, 1.3], dtype=dtype, device=device) 22 | vertices = vertices @ R.T + t 23 | 24 | pinhole2d = mpr.Pinhole2D( 25 | fx=250, fy=200, 26 | cx=160, cy=120, 27 | w=320, h=240, 28 | ) 29 | 30 | z_buffer = mpr.project_mesh( 31 | vertices=vertices, 32 | faces=faces, 33 | vertice_values=vertices[:, [2]], # take z coordinate as values 34 | pinhole=pinhole2d 35 | ) 36 | vis_z_buffer_cpu = mpr.vis_z_buffer(z_buffer) 37 | cv2.imwrite('./depth.png', vis_z_buffer_cpu) 38 | 39 | 40 | coords, normals = mpr.estimate_normals( 41 | vertices=vertices, 42 | faces=faces, 43 | pinhole=pinhole2d 44 | ) 45 | vis_normals_cpu = mpr.vis_normals(coords, normals) 46 | cv2.imwrite('./normals.png', vis_normals_cpu) 47 | -------------------------------------------------------------------------------- /example/depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmbashirov/minimal_pytorch_rasterizer/de2e1eb9d563e5dbb7b40525829c90ac04f74ef6/example/depth.png -------------------------------------------------------------------------------- /example/normals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmbashirov/minimal_pytorch_rasterizer/de2e1eb9d563e5dbb7b40525829c90ac04f74ef6/example/normals.png -------------------------------------------------------------------------------- /example/nvdiffrast_compare.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "genetic-disorder", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import minimal_pytorch_rasterizer as mpr\n", 11 | "import numpy as np\n", 12 | "import torch\n", 13 | "from torch import nn\n", 14 | "import pickle, json\n", 15 | "import cv2\n", 16 | "from copy import deepcopy\n", 17 | "import math\n", 18 | "import nvdiffrast.torch as dr\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "marine-burden", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "dtype = torch.float32\n", 30 | "device = torch.device('cuda:0')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "unnecessary-marker", 36 | "metadata": {}, 37 | "source": [ 38 | "# Set cube object" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "rotary-active", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "def get_object(dtype, device):\n", 49 | " vertices = torch.tensor([\n", 50 | " [0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0],\n", 51 | " [0, 1, 1], [1, 1, 1], [1, 0, 1], [0, 0, 1]\n", 52 | " ], dtype=dtype, device=device)\n", 53 | "\n", 54 | " faces = torch.tensor([\n", 55 | " [0, 2, 1], [0, 3, 2], [2, 3, 4], [2, 4, 5], [1, 2, 5], [1, 5, 6],\n", 56 | " [0, 7, 4], [0, 4, 3], [5, 4, 7], [5, 7, 6], [0, 6, 7], [0, 1, 6]\n", 57 | " ], dtype=torch.int32, device=device)\n", 58 | "\n", 59 | " R = torch.tensor(cv2.Rodrigues(np.array([0.5, 0.8, 0.2]))[0], dtype=dtype, device=device)\n", 60 | " t = torch.tensor([-0.5, -0.5, 1.3], dtype=dtype, device=device)\n", 61 | " vertices = vertices @ R.T + t\n", 62 | " \n", 63 | " return vertices, faces" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "nutritional-missouri", 69 | "metadata": {}, 70 | "source": [ 71 | "# Set pinhole cam params" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "adaptive-disease", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "K = np.array([\n", 82 | " [250, 0, 160],\n", 83 | " [0, 200, 120],\n", 84 | " [0, 0, 1]\n", 85 | "], dtype=np.float32)\n", 86 | "w = 320\n", 87 | "h = 240\n", 88 | "\n", 89 | "near = 0.01\n", 90 | "far = 100\n", 91 | "\n", 92 | "fx, fy = K[0, 0], K[1, 1]\n", 93 | "cx, cy = K[0, 2], K[1, 2]" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "competitive-france", 99 | "metadata": {}, 100 | "source": [ 101 | "# Get nvdiffrast result" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "id": "varied-incentive", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "glctx = dr.RasterizeGLContext()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "id": "simplified-veteran", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "vertices, faces = get_object(dtype, device)\n", 122 | "\n", 123 | "matrix_ndc = torch.tensor([\n", 124 | " [2 * fx / w, 0.0, (w - 2 * cx) / w, 0],\n", 125 | " [0, 2 * fy / h, (h - 2 * cy) / h, 0],\n", 126 | " [0, 0, (far + near) / (near - far), 2 * far * near / (near - far)],\n", 127 | " [0, 0, -1, 0]\n", 128 | "], device=device, dtype=dtype)\n", 129 | "\n", 130 | "verts_3d_homo = torch.cat([vertices, torch.ones(len(vertices), 1, device=device)], dim=-1)\n", 131 | "verts_3d_homo[:, 2] *= -1 # invert z-axis\n", 132 | "\n", 133 | "verts_ndc = torch.matmul(verts_3d_homo, matrix_ndc.transpose(0, 1))\n", 134 | "\n", 135 | "verts_ndc = torch.unsqueeze(verts_ndc, dim=0)\n", 136 | "\n", 137 | "rast, rast_db = dr.rasterize(glctx, verts_ndc, faces, resolution=[h, w])\n", 138 | "\n", 139 | "render, render_da = dr.interpolate(vertices[:, [2]], rast, faces, rast_db=rast_db, diff_attrs='all')\n", 140 | "\n", 141 | "render_cpu = render[0, ..., 0].detach().cpu().numpy()\n", 142 | "render_cpu[np.isinf(render_cpu)] = 0\n", 143 | "\n", 144 | "nvdiffrast_depth = render_cpu\n", 145 | "nvdiffrast_barycentric_weights = rast[0, ..., [0, 1]].detach().cpu().numpy()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "monthly-samba", 151 | "metadata": {}, 152 | "source": [ 153 | "# Get minimal_pytorch_rasterizer result" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "id": "necessary-bullet", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "vertices, faces = get_object(dtype, device)\n", 164 | "\n", 165 | "pinhole2d = mpr.Pinhole2D(\n", 166 | " fx=fx, fy=fy,\n", 167 | " cx=cx, cy=cy,\n", 168 | " w=w, h=h,\n", 169 | ")\n", 170 | "\n", 171 | "z_buffer = mpr.project_mesh(\n", 172 | " vertices=vertices,\n", 173 | " faces=faces,\n", 174 | " vertice_values=vertices[:, [2]], # take z coordinate as values\n", 175 | " pinhole=pinhole2d\n", 176 | ")\n", 177 | "\n", 178 | "minimal_pytorch_rasterizer_depth = z_buffer[..., 0].detach().cpu().numpy()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "impaired-acceptance", 184 | "metadata": {}, 185 | "source": [ 186 | "# Compare depth" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 8, 192 | "id": "bridal-advocacy", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "image/png": "\n", 198 | "text/plain": [ 199 | "
" 200 | ] 201 | }, 202 | "metadata": { 203 | "needs_background": "light" 204 | }, 205 | "output_type": "display_data" 206 | } 207 | ], 208 | "source": [ 209 | "fig, axs = plt.subplots(1, 3, figsize=(15, 5))\n", 210 | "axs[0].imshow(nvdiffrast_depth, vmin=0, vmax=2)\n", 211 | "axs[0].set_title('nvdiffrast')\n", 212 | "axs[1].imshow(minimal_pytorch_rasterizer_depth, vmin=0, vmax=2)\n", 213 | "axs[1].set_title('minimal_pytorch_rasterizer')\n", 214 | "axs[2].imshow(np.abs(nvdiffrast_depth - minimal_pytorch_rasterizer_depth), vmin=0, vmax=.01)\n", 215 | "axs[2].set_title('diff, vmax=0.01')\n", 216 | "plt.show()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "id": "square-nutrition", 222 | "metadata": {}, 223 | "source": [ 224 | "# Compare barycentric coordinates" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "figured-liverpool", 230 | "metadata": {}, 231 | "source": [ 232 | "minimal_pytorch_rasterizer does not prodive an explicit barycentric coordinates output.\n", 233 | "\n", 234 | "Below is an imitation of what happens in rasterizer_kernel.cu" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "id": "above-token", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "def cpu_render(\n", 245 | " vertices, faces,\n", 246 | " fx, fy, cx, cy, h, w\n", 247 | "):\n", 248 | " \n", 249 | " result = np.ones((h, w), dtype=np.float32) * 1e6\n", 250 | " \n", 251 | " weights = np.zeros((h, w, 3), dtype=np.float32)\n", 252 | " \n", 253 | " n = len(vertices)\n", 254 | " m = len(faces)\n", 255 | " \n", 256 | " xs = np.zeros(n, dtype=np.float32)\n", 257 | " ys = np.zeros(n, dtype=np.float32)\n", 258 | " zs = np.zeros(n, dtype=np.float32)\n", 259 | " for i in range(n):\n", 260 | " z = vertices[i][2]\n", 261 | " z_inv = 1. / z;\n", 262 | " xs[i] = vertices[i][0] * z_inv * fx + cx;\n", 263 | " xs[i] = 2 * xs[i] / w - 1;\n", 264 | " ys[i] = vertices[i][1] * z_inv * fy + cy;\n", 265 | " ys[i] = 2 * ys[i] / h - 1;\n", 266 | " zs[i] = z;\n", 267 | " \n", 268 | " face_ndc = np.zeros(9, dtype=np.float32)\n", 269 | " face_ndc_inv = np.zeros(9, dtype=np.float32)\n", 270 | " \n", 271 | " for face_i in range(m):\n", 272 | " ai = faces[face_i][0];\n", 273 | " bi = faces[face_i][1];\n", 274 | " ci = faces[face_i][2];\n", 275 | " \n", 276 | " face_ndc[0] = xs[ai]; face_ndc[1] = ys[ai]; face_ndc[2] = zs[ai];\n", 277 | " face_ndc[3] = xs[bi]; face_ndc[4] = ys[bi]; face_ndc[5] = zs[bi];\n", 278 | " face_ndc[6] = xs[ci]; face_ndc[7] = ys[ci]; face_ndc[8] = zs[ci];\n", 279 | " \n", 280 | " face_ndc_inv[0] = face_ndc[4] - face_ndc[7];\n", 281 | " face_ndc_inv[1] = face_ndc[6] - face_ndc[3];\n", 282 | " face_ndc_inv[2] = face_ndc[3] * face_ndc[7] - face_ndc[6] * face_ndc[4];\n", 283 | " face_ndc_inv[3] = face_ndc[7] - face_ndc[1];\n", 284 | " face_ndc_inv[4] = face_ndc[0] - face_ndc[6];\n", 285 | " face_ndc_inv[5] = face_ndc[6] * face_ndc[1] - face_ndc[0] * face_ndc[7];\n", 286 | " face_ndc_inv[6] = face_ndc[1] - face_ndc[4];\n", 287 | " face_ndc_inv[7] = face_ndc[3] - face_ndc[0];\n", 288 | " face_ndc_inv[8] = face_ndc[0] * face_ndc[4] - face_ndc[3] * face_ndc[1];\n", 289 | "\n", 290 | " denom = (\n", 291 | " face_ndc[6] * (face_ndc[1] - face_ndc[4]) +\n", 292 | " face_ndc[0] * (face_ndc[4] - face_ndc[7]) +\n", 293 | " face_ndc[3] * (face_ndc[7] - face_ndc[1])\n", 294 | " );\n", 295 | " \n", 296 | " face_ndc_inv /= denom;\n", 297 | " \n", 298 | " min_x = min(min(face_ndc[0], face_ndc[3]), face_ndc[6]);\n", 299 | " min_x = (min_x + 1) / 2 * w; # convert from [-1, 1] to [0, W]\n", 300 | " min_xi = int(math.floor(min_x));\n", 301 | " min_xi = min(max(min_xi, 0), w - 1);\n", 302 | " max_x = max(max(face_ndc[0], face_ndc[3]), face_ndc[6]);\n", 303 | " max_x = (max_x + 1) / 2 * w;\n", 304 | " max_xi = int(math.ceil(max_x));\n", 305 | " max_xi = min(max(max_xi, 0), w - 1);\n", 306 | "\n", 307 | " min_y = min(min(face_ndc[1], face_ndc[4]), face_ndc[7]);\n", 308 | " min_y = (min_y + 1) / 2 * h;\n", 309 | " min_yi = int(math.floor(min_y));\n", 310 | " min_yi = min(max(min_yi, 0), h - 1);\n", 311 | " max_y = max(max(face_ndc[1], face_ndc[4]), face_ndc[7]);\n", 312 | " max_y = (max_y + 1) / 2 * h;\n", 313 | " max_yi = int(math.ceil(max_y));\n", 314 | " max_yi = min(max(max_yi, 0), h - 1);\n", 315 | " \n", 316 | " top, bottom = min_yi, max_yi;\n", 317 | " left, right = min_xi, max_xi;\n", 318 | " \n", 319 | " def calc_bary(x, y):\n", 320 | " if (((y - face_ndc[1]) * (face_ndc[3] - face_ndc[0]) > (x - face_ndc[0]) * (face_ndc[4] - face_ndc[1])) or\n", 321 | " ((y - face_ndc[4]) * (face_ndc[6] - face_ndc[3]) > (x - face_ndc[3]) * (face_ndc[7] - face_ndc[4])) or\n", 322 | " ((y - face_ndc[7]) * (face_ndc[0] - face_ndc[6]) > (x - face_ndc[6]) * (face_ndc[1] - face_ndc[7]))):\n", 323 | " return None\n", 324 | " \n", 325 | " wa = face_ndc_inv[0] * x + face_ndc_inv[1] * y + face_ndc_inv[2];\n", 326 | " wb = face_ndc_inv[3] * x + face_ndc_inv[4] * y + face_ndc_inv[5];\n", 327 | " wc = face_ndc_inv[6] * x + face_ndc_inv[7] * y + face_ndc_inv[8];\n", 328 | " \n", 329 | " wsum = wa + wb + wc;\n", 330 | " wa /= wsum; wb /= wsum; wc /= wsum;\n", 331 | " \n", 332 | " wa /= face_ndc[2];\n", 333 | " wb /= face_ndc[5];\n", 334 | " wc /= face_ndc[8];\n", 335 | " \n", 336 | " wsum = wa + wb + wc;\n", 337 | " wa /= wsum; wb /= wsum; wc /= wsum;\n", 338 | " \n", 339 | " return wa, wb, wc \n", 340 | " \n", 341 | " \n", 342 | " for i in range(top, bottom + 1):\n", 343 | " for j in range(left, right + 1):\n", 344 | " x = (j + 0.5) / w * 2 - 1;\n", 345 | " y = (i + 0.5) / h * 2 - 1;\n", 346 | "\n", 347 | " bary = calc_bary(x, y)\n", 348 | " if bary is None:\n", 349 | " continue\n", 350 | " \n", 351 | " wa, wb, wc = bary\n", 352 | " \n", 353 | " face_z = wa * face_ndc[2] + wb * face_ndc[5] + wc * face_ndc[8];\n", 354 | " \n", 355 | " if face_z < result[i][j]:\n", 356 | " result[i][j] = face_z\n", 357 | " weights[i, j, 0] = wa\n", 358 | " weights[i, j, 1] = wb\n", 359 | " weights[i, j, 2] = wc\n", 360 | " \n", 361 | " return result, weights" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 10, 367 | "id": "informative-pasta", 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "image/png": "\n", 373 | "text/plain": [ 374 | "
" 375 | ] 376 | }, 377 | "metadata": { 378 | "needs_background": "light" 379 | }, 380 | "output_type": "display_data" 381 | } 382 | ], 383 | "source": [ 384 | "vertices, faces = get_object(dtype, device)\n", 385 | "\n", 386 | "cpu_depth, cpu_weights = cpu_render(\n", 387 | " vertices.detach().cpu().numpy(), faces.detach().cpu().numpy(), \n", 388 | " fx=fx, fy=fy,\n", 389 | " cx=cx, cy=cy,\n", 390 | " w=w, h=h\n", 391 | ")\n", 392 | "cpu_depth[cpu_depth > 1e3] = 0\n", 393 | "\n", 394 | "plt.imshow(cpu_depth)\n", 395 | "plt.title('depth cpu')\n", 396 | "plt.show()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 11, 402 | "id": "literary-james", 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "data": { 407 | "image/png": "\n", 408 | "text/plain": [ 409 | "
" 410 | ] 411 | }, 412 | "metadata": { 413 | "needs_background": "light" 414 | }, 415 | "output_type": "display_data" 416 | }, 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "\n", 422 | "\n", 423 | "\n" 424 | ] 425 | }, 426 | { 427 | "data": { 428 | "image/png": "\n", 429 | "text/plain": [ 430 | "
" 431 | ] 432 | }, 433 | "metadata": { 434 | "needs_background": "light" 435 | }, 436 | "output_type": "display_data" 437 | }, 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "\n", 443 | "\n", 444 | "\n" 445 | ] 446 | } 447 | ], 448 | "source": [ 449 | "for i in range(2):\n", 450 | " gt = nvdiffrast_barycentric_weights[..., i]\n", 451 | " pred = cpu_weights[..., i]\n", 452 | " diff = np.abs(pred - gt)\n", 453 | " \n", 454 | " fig, axs = plt.subplots(1, 3, figsize=(15, 5))\n", 455 | " axs[0].imshow(pred, vmin=0, vmax=1)\n", 456 | " axs[0].title.set_text('pred')\n", 457 | " axs[1].imshow(gt, vmin=0, vmax=1)\n", 458 | " axs[1].title.set_text('gt')\n", 459 | " axs[2].imshow(diff, vmin=0, vmax=.2)\n", 460 | " axs[2].title.set_text('diff, vmax=0.2')\n", 461 | " fig.suptitle(f'Barycentric weight {i}', fontsize=16)\n", 462 | " plt.tight_layout()\n", 463 | " plt.show()\n", 464 | " \n", 465 | " print('\\n' * 2)" 466 | ] 467 | } 468 | ], 469 | "metadata": { 470 | "kernelspec": { 471 | "display_name": "Python 3", 472 | "language": "python", 473 | "name": "python3" 474 | }, 475 | "language_info": { 476 | "codemirror_mode": { 477 | "name": "ipython", 478 | "version": 3 479 | }, 480 | "file_extension": ".py", 481 | "mimetype": "text/x-python", 482 | "name": "python", 483 | "nbconvert_exporter": "python", 484 | "pygments_lexer": "ipython3", 485 | "version": "3.7.7" 486 | } 487 | }, 488 | "nbformat": 4, 489 | "nbformat_minor": 5 490 | } 491 | -------------------------------------------------------------------------------- /example/smpl_tmp.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rmbashirov/minimal_pytorch_rasterizer/de2e1eb9d563e5dbb7b40525829c90ac04f74ef6/example/smpl_tmp.pkl -------------------------------------------------------------------------------- /example/timing.py: -------------------------------------------------------------------------------- 1 | import minimal_pytorch_rasterizer as mpr 2 | 3 | import torch 4 | 5 | from skimage import io 6 | import numpy as np 7 | import argparse 8 | import time 9 | import pickle 10 | from collections import defaultdict 11 | 12 | 13 | def save_t(t_hist, t_names, ts): 14 | assert len(ts) - 1 == len(t_names) 15 | ts = np.array(ts) 16 | ts = ts[1:] - ts[:-1] 17 | for i, t_name in enumerate(t_names): 18 | t_hist[t_name].append(ts[i]) 19 | 20 | 21 | def print_t(t_hist, t_names, skip_fraq=0.1): 22 | skip_counts = [] 23 | max_t_names_len = max(map(len, t_names)) + 5 24 | for t_name in t_names: 25 | dts = t_hist[t_name] 26 | skip_count = int(len(dts) * skip_fraq) 27 | skip_counts.append(skip_count) 28 | dts = dts[skip_count:] 29 | mean_str = f'{np.mean(dts) * 1000:.2f}'.rjust(6) 30 | std_str = f'{np.std(dts) * 1000:.2f}'.ljust(6) 31 | print(f'\t{t_name}: '.ljust(max_t_names_len + 1) + f'{mean_str} +- {std_str}ms') 32 | # print(f'skip_counts: {skip_counts}') 33 | 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--repeat', type=int, default=1000) 38 | parser.add_argument('--double', action='store_true') 39 | parsed_args = parser.parse_args() 40 | return parsed_args 41 | 42 | 43 | def calculate_timings_0(vertices_cpu, uv_cpu, faces_cpu, pinhole, repeat, dtype=torch.float32): 44 | t_hist = defaultdict(list) 45 | t_names = [ 46 | 'cpu->gpu_transfer', 47 | 'project_mesh', 48 | 'gpu->cpu_transfer', 49 | ] 50 | 51 | for it in range(repeat): 52 | t0 = time.time() 53 | vertices = torch.tensor(vertices_cpu, dtype=dtype, device='cuda:0') 54 | vertice_values = torch.tensor(uv_cpu, dtype=dtype, device='cuda:0') 55 | faces = torch.tensor(faces_cpu, dtype=torch.int32, device='cuda:0') 56 | 57 | t1 = time.time() 58 | projected = mpr.project_mesh(vertices, faces, vertice_values, pinhole) 59 | 60 | t2 = time.time() 61 | if it == 0: 62 | img = (projected * 255).cpu().numpy().round().clip(0, 255).astype(np.uint8) 63 | else: 64 | img = projected[:5, :5, :].cpu().numpy() 65 | 66 | t3 = time.time() 67 | 68 | if it > 0: 69 | save_t(t_hist, t_names, [t0, t1, t2, t3]) 70 | 71 | if it == 0: 72 | io.imsave('./tmp_u.png', img[:, :, 0]) 73 | io.imsave('./tmp_v.png', img[:, :, 1]) 74 | 75 | if repeat > 1: 76 | print('method 1 timings:') 77 | print_t(t_hist, t_names) 78 | 79 | 80 | def calculate_timings_1(vertices_cpu, uv_cpu, faces_cpu, pinhole, repeat, dtype=torch.float32): 81 | for it in range(repeat + 1): 82 | if it == 0: 83 | vertices = torch.tensor(vertices_cpu, dtype=dtype, device='cuda:0') 84 | vertice_values = torch.tensor(uv_cpu, dtype=dtype, device='cuda:0') 85 | faces = torch.tensor(faces_cpu, dtype=torch.int32, device='cuda:0') 86 | elif it == 1: 87 | start = time.time() 88 | 89 | projected = mpr.project_mesh(vertices, faces, vertice_values, pinhole) 90 | torch.cuda.synchronize() 91 | 92 | if it == 0: 93 | img = (projected * 255).cpu().numpy().round().clip(0, 255).astype(np.uint8) 94 | 95 | io.imsave('./tmp_u.png', img[:, :, 0]) 96 | io.imsave('./tmp_v.png', img[:, :, 1]) 97 | 98 | torch.cuda.synchronize() 99 | end = time.time() 100 | timing = (end - start) / (repeat - 1) * 1000 101 | print(f'method 2 timings:\n\t{timing:.3f}ms') 102 | 103 | 104 | def test_project_mesh(repeat, double=False): 105 | with open('smpl_tmp.pkl', 'rb') as f: 106 | mesh_data = pickle.load(f) 107 | vertices_cpu = mesh_data['vertices'] 108 | faces_cpu = mesh_data['faces'] 109 | uv_cpu = mesh_data['uv'] 110 | 111 | pinhole = mpr.Pinhole2D( 112 | fx=500, fy=500, 113 | cx=500, cy=500, 114 | h=1000, w=1000 115 | ) 116 | 117 | R = np.array([ 118 | [1., 0., 0.], 119 | [0., -1., 0.], 120 | [0., 0., -1.] 121 | ]) 122 | t = np.array([[0., -0.3, 1.2]]) 123 | vertices_cpu = vertices_cpu @ R.T + t 124 | 125 | dtype = torch.float64 if double else torch.float32 126 | calculate_timings_0(vertices_cpu, uv_cpu, faces_cpu, pinhole, repeat, dtype=dtype) 127 | if repeat > 1: 128 | calculate_timings_1(vertices_cpu, uv_cpu, faces_cpu, pinhole, repeat, dtype=dtype) 129 | 130 | 131 | def main(): 132 | parsed_args = parse_args() 133 | test_project_mesh(repeat=parsed_args.repeat, double=parsed_args.double) 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /minimal_pytorch_rasterizer/__init__.py: -------------------------------------------------------------------------------- 1 | from minimal_pytorch_rasterizer.camera import Pinhole2D 2 | from minimal_pytorch_rasterizer.rasterizer import project_mesh, estimate_normals 3 | from minimal_pytorch_rasterizer.utils import vis_normals, vis_z_buffer 4 | 5 | __version__ = '0.5' 6 | name = 'minimal_pytorch_rasterizer' 7 | -------------------------------------------------------------------------------- /minimal_pytorch_rasterizer/assert_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def is_cuda_tensor(t): 4 | assert torch.is_tensor(t) 5 | assert t.is_cuda 6 | 7 | def check_shape_len(t, n): 8 | assert len(t.shape) == n 9 | -------------------------------------------------------------------------------- /minimal_pytorch_rasterizer/camera.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class Pinhole2D: 6 | def __init__(self, K=None, fx=None, fy=None, cx=None, cy=None, h=0, w=0): 7 | if K is not None: 8 | assert fx is None and fy is None and cx is None and cy is None 9 | self.fx = K[0, 0] 10 | self.fy = K[1, 1] 11 | self.cx = K[0, 2] 12 | self.cy = K[1, 2] 13 | else: 14 | assert \ 15 | fx is not None and fy is not None and \ 16 | cx is not None and cy is not None 17 | self.fx = fx 18 | self.fy = fy 19 | self.cx = cx 20 | self.cy = cy 21 | self.h = h 22 | self.w = w 23 | 24 | def __str__(self): 25 | result = f'fx: {self.fx}, fy: {self.fy}, cx: {self.cx}, cy: {self.cy}' 26 | if self.h > 0 and self.w > 0: 27 | result += f', h: {self.h}, w: {self.w}' 28 | return result 29 | 30 | def get_K(self): 31 | return np.array([ 32 | [self.fx, 0, self.cx], 33 | [0, self.fy, self.cy], 34 | [0, 0, 1] 35 | ]) 36 | 37 | def project_ndc(self, vertices, eps=1e-9): 38 | """ 39 | vertices: torch.Tensor of shape (N, 3), 3 stands for xyz 40 | """ 41 | assert isinstance(vertices, torch.Tensor) 42 | assert len(vertices.shape) == 2 43 | assert vertices.shape[1] == 3 44 | K = torch.tensor(self.get_K(), 45 | device=vertices.device, dtype=vertices.dtype) 46 | 47 | # apply intrinsics 48 | vertices_ndc = vertices @ K.transpose(0, 1) 49 | 50 | # divide xy by z, leave z unchanged 51 | vertices_ndc[:, [0, 1]] /= vertices_ndc[:, [2]] + eps 52 | 53 | # convert x from [0, w) to [-1, 1] range 54 | # convert y from [0, h) to [-1, 1] range 55 | wh = torch.tensor( 56 | [self.w, self.h], 57 | device=vertices.device, dtype=vertices.dtype 58 | ).unsqueeze(0) 59 | vertices_ndc[:, [0, 1]] = 2 * vertices_ndc[:, [0, 1]] / wh - 1 60 | 61 | return vertices_ndc 62 | -------------------------------------------------------------------------------- /minimal_pytorch_rasterizer/cuda/rasterizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // CUDA forward declarations 5 | 6 | std::vector estimate_normals_cuda( 7 | const torch::Tensor& vertices_ndc, 8 | const torch::Tensor& faces, 9 | const torch::Tensor& vertices, 10 | const torch::Tensor& vertices_filter, 11 | int h, int w 12 | ); 13 | 14 | 15 | torch::Tensor project_mesh_cuda( 16 | const torch::Tensor& vertices_ndc, 17 | const torch::Tensor& faces, 18 | const torch::Tensor& vertice_values, 19 | const torch::Tensor& vertices_filter, 20 | int h, int w 21 | ); 22 | 23 | // C++ interface 24 | 25 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 26 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 27 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 28 | 29 | void check_equal_dtype(const torch::Tensor& a, const torch::Tensor& b) { 30 | TORCH_CHECK( 31 | a.dtype() == b.dtype(), 32 | "expected equal dtype, got ", a.dtype(), " != ", b.dtype() 33 | ); 34 | } 35 | 36 | void check_equal_gpuid(const torch::Tensor& a, const torch::Tensor& b) { 37 | TORCH_CHECK( 38 | a.device().index() == b.device().index(), 39 | "expected equal gpu id, got ", a.device().index(), " != ", b.device().index() 40 | ); 41 | } 42 | 43 | std::vector estimate_normals( 44 | const torch::Tensor& vertices_ndc, 45 | const torch::Tensor& faces, 46 | const torch::Tensor& vertices, 47 | const torch::Tensor& vertices_filter, 48 | int h, int w 49 | ) { 50 | TORCH_CHECK(h > 0, "h expected to be > 0"); 51 | TORCH_CHECK(w > 0, "w expected to be > 0"); 52 | CHECK_INPUT(vertices_ndc); 53 | CHECK_INPUT(faces); 54 | CHECK_INPUT(vertices_filter); 55 | return estimate_normals_cuda( 56 | vertices_ndc, faces, vertices, vertices_filter, 57 | h, w 58 | ); 59 | } 60 | 61 | torch::Tensor project_mesh( 62 | const torch::Tensor& vertices_ndc, 63 | const torch::Tensor& faces, 64 | const torch::Tensor& vertice_values, 65 | const torch::Tensor& vertices_filter, 66 | int h, int w 67 | ) { 68 | TORCH_CHECK(h > 0, "h expected to be > 0"); 69 | TORCH_CHECK(w > 0, "w expected to be > 0"); 70 | CHECK_INPUT(vertices_ndc); 71 | CHECK_INPUT(faces); 72 | CHECK_INPUT(vertice_values); 73 | CHECK_INPUT(vertices_filter); 74 | return project_mesh_cuda( 75 | vertices_ndc, faces, vertice_values, vertices_filter, 76 | h, w 77 | ); 78 | } 79 | 80 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 81 | m.def("estimate_normals", &estimate_normals, "estimate_normals (CUDA)"); 82 | m.def("project_mesh", &project_mesh, "project_mesh (CUDA)"); 83 | } -------------------------------------------------------------------------------- /minimal_pytorch_rasterizer/cuda/rasterizer_kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | There are 2 ways to rasterize triangles that came to mind: 4 | 1) iterate over all pixels (they define CUDA grid), for selected pixel feed all triangles to 1 CUDA block 5 | 2) iterate over all triangels (they define CUDA grid), for selected triangle feed pixels that are bounded by selected triangle to 1 CUDA block 6 | 7 | 2nd way is implemented here 8 | */ 9 | 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define BLOCK_SIZE 512 20 | #define BLOCK_SIZE_2D_X 32 21 | #define BLOCK_SIZE_2D_Y 16 22 | #define BLOCK_SIZE_3D_X 32 23 | #define BLOCK_SIZE_3D_Y 8 24 | #define BLOCK_SIZE_3D_Z 4 25 | 26 | // vertices coords: 27 | // vertices[:, 0]: x 28 | // vertices[:, 1]: y 29 | // vertices[:, 2]: z 30 | 31 | // 2d tensor axis: 32 | // 0: yi 33 | // 1: xi 34 | 35 | // 3d tensor axis: 36 | // 0: zi 37 | // 1: yi 38 | // 2: xi 39 | 40 | template 41 | __device__ __forceinline__ scalar_t atomicMinFloat(scalar_t * addr, scalar_t value) { 42 | scalar_t old; 43 | old = (value >= 0) ? __int_as_float(atomicMin((int *)addr, __float_as_int(value))) : 44 | __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value))); 45 | return old; 46 | } 47 | 48 | __device__ double atomicMin_double(double* address, double val) 49 | { 50 | unsigned long long int* address_as_ull = (unsigned long long int*) address; 51 | unsigned long long int old = *address_as_ull, assumed; 52 | do { 53 | assumed = old; 54 | old = atomicCAS(address_as_ull, assumed, 55 | __double_as_longlong(fmin(val, __longlong_as_double(assumed)))); 56 | } while (assumed != old); 57 | return __longlong_as_double(old); 58 | } 59 | 60 | // kernel utils 61 | 62 | template 63 | __device__ int lower_bound(const scalar_t* values, const scalar_t value, const int N) { 64 | int left = 0; 65 | int right = N; 66 | int mid; 67 | while (right - left > 1) { 68 | mid = (left + right) / 2; 69 | if (values[mid] < value) { 70 | left = mid; 71 | } else { 72 | right = mid; 73 | } 74 | } 75 | return right; 76 | } 77 | 78 | // kernels 79 | 80 | template 81 | __global__ void rasterize_cuda_kernel( 82 | const torch::PackedTensorAccessor32 vertices_ndc, 83 | const torch::PackedTensorAccessor32 faces, 84 | const torch::PackedTensorAccessor32 vertices_filter, 85 | torch::PackedTensorAccessor32 depth, 86 | scalar_t* global_face_ndc_inv, 87 | int* global_is_bad_face 88 | ) { 89 | const int face_indx = blockIdx.x; 90 | const int H = depth.size(0); 91 | const int W = depth.size(1); 92 | 93 | scalar_t min_x, max_x, min_y, max_y; 94 | scalar_t denom; 95 | 96 | __shared__ int vertices_per_thread_x, vertices_per_thread_y; 97 | __shared__ int ai, bi, ci; 98 | __shared__ bool is_bad_face; 99 | __shared__ int min_xi, max_xi, min_yi, max_yi; 100 | __shared__ scalar_t face_ndc[9]; 101 | __shared__ scalar_t face_ndc_inv[9]; 102 | const scalar_t eps = 1e-5; 103 | 104 | if (threadIdx.x == 0 && threadIdx.y == 0) { 105 | ai = faces[face_indx][0]; 106 | bi = faces[face_indx][1]; 107 | ci = faces[face_indx][2]; 108 | 109 | if (vertices_filter[ai] == 0 || vertices_filter[bi] == 0 || vertices_filter[ci] == 0) { 110 | is_bad_face = true; 111 | global_is_bad_face[face_indx] = 1; 112 | return; 113 | } 114 | 115 | face_ndc[0] = vertices_ndc[ai][0]; face_ndc[1] = vertices_ndc[ai][1]; face_ndc[2] = vertices_ndc[ai][2]; 116 | face_ndc[3] = vertices_ndc[bi][0]; face_ndc[4] = vertices_ndc[bi][1]; face_ndc[5] = vertices_ndc[bi][2]; 117 | face_ndc[6] = vertices_ndc[ci][0]; face_ndc[7] = vertices_ndc[ci][1]; face_ndc[8] = vertices_ndc[ci][2]; 118 | 119 | // negative vertex 120 | is_bad_face = false; 121 | if (face_ndc[2] < eps or face_ndc[5] < eps or face_ndc[8] < eps) { 122 | is_bad_face = true; 123 | global_is_bad_face[face_indx] = 1; 124 | return; 125 | } 126 | 127 | face_ndc_inv[0] = face_ndc[4] - face_ndc[7]; 128 | face_ndc_inv[1] = face_ndc[6] - face_ndc[3]; 129 | face_ndc_inv[2] = face_ndc[3] * face_ndc[7] - face_ndc[6] * face_ndc[4]; 130 | face_ndc_inv[3] = face_ndc[7] - face_ndc[1]; 131 | face_ndc_inv[4] = face_ndc[0] - face_ndc[6]; 132 | face_ndc_inv[5] = face_ndc[6] * face_ndc[1] - face_ndc[0] * face_ndc[7]; 133 | face_ndc_inv[6] = face_ndc[1] - face_ndc[4]; 134 | face_ndc_inv[7] = face_ndc[3] - face_ndc[0]; 135 | face_ndc_inv[8] = face_ndc[0] * face_ndc[4] - face_ndc[3] * face_ndc[1]; 136 | 137 | denom = ( 138 | face_ndc[6] * (face_ndc[1] - face_ndc[4]) + 139 | face_ndc[0] * (face_ndc[4] - face_ndc[7]) + 140 | face_ndc[3] * (face_ndc[7] - face_ndc[1]) 141 | ); 142 | 143 | // if (abs(denom) < eps) { 144 | // is_bad_face = true; 145 | // global_is_bad_face[face_indx] = 1; 146 | // return; 147 | // } 148 | 149 | for (int i = 0; i < 9; ++i) { 150 | face_ndc_inv[i] /= denom; 151 | } 152 | 153 | for (int i = 0; i < 9; ++i) { 154 | global_face_ndc_inv[9 * face_indx + i] = face_ndc_inv[i]; 155 | } 156 | 157 | global_is_bad_face[face_indx] = 0; 158 | 159 | min_x = min(min(face_ndc[0], face_ndc[3]), face_ndc[6]); 160 | min_x = (min_x + 1) / 2 * W; // convert from ndc to img coordinates 161 | min_xi = static_cast(floorf(static_cast(min_x))); 162 | min_xi = min(max(min_xi, 0), W - 1); 163 | max_x = max(max(face_ndc[0], face_ndc[3]), face_ndc[6]); 164 | max_x = (max_x + 1) / 2 * W; 165 | max_xi = static_cast(ceilf(static_cast(max_x))); 166 | max_xi = min(max(max_xi, 0), W - 1); 167 | 168 | min_y = min(min(face_ndc[1], face_ndc[4]), face_ndc[7]); 169 | min_y = (min_y + 1) / 2 * H; 170 | min_yi = static_cast(floorf(static_cast(min_y))); 171 | min_yi = min(max(min_yi, 0), H - 1); 172 | max_y = max(max(face_ndc[1], face_ndc[4]), face_ndc[7]); 173 | max_y = (max_y + 1) / 2 * H; 174 | max_yi = static_cast(ceilf(static_cast(max_y))); 175 | max_yi = min(max(max_yi, 0), H - 1); 176 | 177 | vertices_per_thread_x = (max_xi - min_xi) / blockDim.x + 1; 178 | vertices_per_thread_y = (max_yi - min_yi) / blockDim.y + 1; 179 | } 180 | __syncthreads(); 181 | if (is_bad_face) { 182 | return; 183 | } 184 | 185 | const int left = min_xi + vertices_per_thread_x * threadIdx.x; 186 | const int right = min(left + vertices_per_thread_x, max_xi); 187 | 188 | const int top = min_yi + vertices_per_thread_y * threadIdx.y; 189 | const int bottom = min(top + vertices_per_thread_y, max_yi); 190 | 191 | scalar_t x, y, face_z, wa, wb, wc, wsum; 192 | for (int i = top; i <= bottom; i++) { 193 | for (int j = left; j <= right; j++) { 194 | x = 2 * ((scalar_t)j + 0.5) / W - 1; 195 | y = 2 * ((scalar_t)i + 0.5) / H - 1; 196 | 197 | // check pixel is inside the face 198 | if (((y - face_ndc[1]) * (face_ndc[3] - face_ndc[0]) > (x - face_ndc[0]) * (face_ndc[4] - face_ndc[1])) || 199 | ((y - face_ndc[4]) * (face_ndc[6] - face_ndc[3]) > (x - face_ndc[3]) * (face_ndc[7] - face_ndc[4])) || 200 | ((y - face_ndc[7]) * (face_ndc[0] - face_ndc[6]) > (x - face_ndc[6]) * (face_ndc[1] - face_ndc[7]))) { 201 | continue; 202 | } 203 | 204 | wa = face_ndc_inv[0] * x + face_ndc_inv[1] * y + face_ndc_inv[2]; 205 | wb = face_ndc_inv[3] * x + face_ndc_inv[4] * y + face_ndc_inv[5]; 206 | wc = face_ndc_inv[6] * x + face_ndc_inv[7] * y + face_ndc_inv[8]; 207 | wsum = wa + wb + wc; 208 | wa /= wsum; wb /= wsum; wc /= wsum; 209 | 210 | wa /= face_ndc[2]; 211 | wb /= face_ndc[5]; 212 | wc /= face_ndc[8]; 213 | wsum = wa + wb + wc; 214 | wa /= wsum; wb /= wsum; wc /= wsum; 215 | 216 | face_z = wa * face_ndc[2] + wb * face_ndc[5] + wc * face_ndc[8]; 217 | 218 | if (sizeof(scalar_t) == sizeof(double)) { 219 | atomicMin_double((double*)&depth[i][j], (double)face_z); 220 | } else { 221 | atomicMinFloat(&depth[i][j], face_z); 222 | } 223 | } 224 | } 225 | } 226 | 227 | 228 | template 229 | __global__ void interpolate_cuda_kernel( 230 | const torch::PackedTensorAccessor32 vertices_ndc, 231 | const torch::PackedTensorAccessor32 faces, 232 | const torch::PackedTensorAccessor32 depth, 233 | const scalar_t* global_face_ndc_inv, 234 | const int* global_is_bad_face, 235 | const torch::PackedTensorAccessor32 vertice_values, 236 | torch::PackedTensorAccessor32 result 237 | ) { 238 | const int face_indx = blockIdx.x; 239 | 240 | if (global_is_bad_face[face_indx]) { 241 | return; 242 | } 243 | 244 | const int H = depth.size(0); 245 | const int W = depth.size(1); 246 | const int C = vertice_values.size(1); 247 | const scalar_t eps = 1e-5; 248 | 249 | scalar_t min_x, max_x, min_y, max_y; 250 | __shared__ int vertices_per_thread_x, vertices_per_thread_y; 251 | __shared__ int ai, bi, ci; 252 | __shared__ scalar_t face_ndc[9]; 253 | __shared__ scalar_t face_ndc_inv[9]; 254 | __shared__ int min_xi, max_xi, min_yi, max_yi; 255 | 256 | if (threadIdx.x == 0 && threadIdx.y == 0) { 257 | ai = faces[face_indx][0]; 258 | bi = faces[face_indx][1]; 259 | ci = faces[face_indx][2]; 260 | 261 | face_ndc[0] = vertices_ndc[ai][0]; face_ndc[1] = vertices_ndc[ai][1]; face_ndc[2] = vertices_ndc[ai][2]; 262 | face_ndc[3] = vertices_ndc[bi][0]; face_ndc[4] = vertices_ndc[bi][1]; face_ndc[5] = vertices_ndc[bi][2]; 263 | face_ndc[6] = vertices_ndc[ci][0]; face_ndc[7] = vertices_ndc[ci][1]; face_ndc[8] = vertices_ndc[ci][2]; 264 | 265 | for (int i = 0; i < 9; ++i) { 266 | face_ndc_inv[i] = global_face_ndc_inv[9 * face_indx + i]; 267 | } 268 | 269 | min_x = min(min(face_ndc[0], face_ndc[3]), face_ndc[6]); 270 | min_x = (min_x + 1) / 2 * W; // convert from ndc to img coordinates 271 | min_xi = static_cast(floorf(static_cast(min_x))); 272 | min_xi = min(max(min_xi, 0), W - 1); 273 | max_x = max(max(face_ndc[0], face_ndc[3]), face_ndc[6]); 274 | max_x = (max_x + 1) / 2 * W; 275 | max_xi = static_cast(ceilf(static_cast(max_x))); 276 | max_xi = min(max(max_xi, 0), W - 1); 277 | 278 | min_y = min(min(face_ndc[1], face_ndc[4]), face_ndc[7]); 279 | min_y = (min_y + 1) / 2 * H; 280 | min_yi = static_cast(floorf(static_cast(min_y))); 281 | min_yi = min(max(min_yi, 0), H - 1); 282 | max_y = max(max(face_ndc[1], face_ndc[4]), face_ndc[7]); 283 | max_y = (max_y + 1) / 2 * H; 284 | max_yi = static_cast(ceilf(static_cast(max_y))); 285 | max_yi = min(max(max_yi, 0), H - 1); 286 | 287 | vertices_per_thread_x = (max_xi - min_xi) / blockDim.x + 1; 288 | vertices_per_thread_y = (max_yi - min_yi) / blockDim.y + 1; 289 | } 290 | __syncthreads(); 291 | 292 | const int left = min_xi + vertices_per_thread_x * threadIdx.x; 293 | const int right = min(left + vertices_per_thread_x, max_xi); 294 | 295 | const int top = min_yi + vertices_per_thread_y * threadIdx.y; 296 | const int bottom = min(top + vertices_per_thread_y, max_yi); 297 | 298 | scalar_t x, y, face_z, wa, wb, wc, wsum; 299 | for (int i = top; i <= bottom; i++) { 300 | for (int j = left; j <= right; j++) { 301 | x = 2 * ((scalar_t)j + 0.5) / W - 1; 302 | y = 2 * ((scalar_t)i + 0.5) / H - 1; 303 | 304 | // check pixel is inside the face 305 | if (((y - face_ndc[1]) * (face_ndc[3] - face_ndc[0]) > (x - face_ndc[0]) * (face_ndc[4] - face_ndc[1])) || 306 | ((y - face_ndc[4]) * (face_ndc[6] - face_ndc[3]) > (x - face_ndc[3]) * (face_ndc[7] - face_ndc[4])) || 307 | ((y - face_ndc[7]) * (face_ndc[0] - face_ndc[6]) > (x - face_ndc[6]) * (face_ndc[1] - face_ndc[7]))) { 308 | continue; 309 | } 310 | 311 | wa = face_ndc_inv[0] * x + face_ndc_inv[1] * y + face_ndc_inv[2]; 312 | wb = face_ndc_inv[3] * x + face_ndc_inv[4] * y + face_ndc_inv[5]; 313 | wc = face_ndc_inv[6] * x + face_ndc_inv[7] * y + face_ndc_inv[8]; 314 | wsum = wa + wb + wc; 315 | wa /= wsum; wb /= wsum; wc /= wsum; 316 | 317 | wa /= face_ndc[2]; 318 | wb /= face_ndc[5]; 319 | wc /= face_ndc[8]; 320 | wsum = wa + wb + wc; 321 | wa /= wsum; wb /= wsum; wc /= wsum; 322 | 323 | face_z = wa * face_ndc[2] + wb * face_ndc[5] + wc * face_ndc[8]; 324 | 325 | if (face_z - eps < depth[i][j]) { 326 | for (int c = 0; c < C; c++) { 327 | result[i][j][c] = wa * vertice_values[ai][c] + wb * vertice_values[bi][c] + wc * vertice_values[ci][c]; 328 | } 329 | } 330 | } 331 | } 332 | } 333 | 334 | 335 | template 336 | __global__ void estimate_normals_cuda_kernel( 337 | const torch::PackedTensorAccessor32 vertices_ndc, 338 | const torch::PackedTensorAccessor32 faces, 339 | const torch::PackedTensorAccessor32 depth, 340 | const scalar_t* global_face_ndc_inv, 341 | const int* global_is_bad_face, 342 | const torch::PackedTensorAccessor32 vertices, 343 | torch::PackedTensorAccessor32 coords, 344 | torch::PackedTensorAccessor32 normals 345 | ) { 346 | const int face_indx = blockIdx.x; 347 | 348 | if (global_is_bad_face[face_indx]) { 349 | return; 350 | } 351 | 352 | const int H = depth.size(0); 353 | const int W = depth.size(1); 354 | const scalar_t eps = 1e-5; 355 | 356 | scalar_t min_x, max_x, min_y, max_y; 357 | scalar_t v1x, v1y, v1z, v2x, v2y, v2z, nlen; 358 | __shared__ int vertices_per_thread_x, vertices_per_thread_y; 359 | __shared__ int ai, bi, ci; 360 | __shared__ scalar_t face[9]; 361 | __shared__ scalar_t face_ndc[9]; 362 | __shared__ scalar_t face_ndc_inv[9]; 363 | __shared__ int min_xi, max_xi, min_yi, max_yi; 364 | __shared__ scalar_t nx, ny, nz; 365 | 366 | if (threadIdx.x == 0 && threadIdx.y == 0) { 367 | ai = faces[face_indx][0]; 368 | bi = faces[face_indx][1]; 369 | ci = faces[face_indx][2]; 370 | 371 | face[0] = vertices[ai][0]; face[1] = vertices[ai][1]; face[2] = vertices[ai][2]; 372 | face[3] = vertices[bi][0]; face[4] = vertices[bi][1]; face[5] = vertices[bi][2]; 373 | face[6] = vertices[ci][0]; face[7] = vertices[ci][1]; face[8] = vertices[ci][2]; 374 | 375 | v1x = face[3] - face[0]; v2x = face[6] - face[0]; 376 | v1y = face[4] - face[1]; v2y = face[7] - face[1]; 377 | v1z = face[5] - face[2]; v2z = face[8] - face[2]; 378 | 379 | nx = v1y * v2z - v1z * v2y; 380 | ny = v1z * v2x - v1x * v2z; 381 | nz = v1x * v2y - v1y * v2x; 382 | nlen = nx * nx + ny * ny + nz * nz; 383 | nlen = (scalar_t)sqrt((float)nlen); 384 | nx /= nlen; 385 | ny /= nlen; 386 | nz /= nlen; 387 | 388 | face_ndc[0] = vertices_ndc[ai][0]; face_ndc[1] = vertices_ndc[ai][1]; face_ndc[2] = vertices_ndc[ai][2]; 389 | face_ndc[3] = vertices_ndc[bi][0]; face_ndc[4] = vertices_ndc[bi][1]; face_ndc[5] = vertices_ndc[bi][2]; 390 | face_ndc[6] = vertices_ndc[ci][0]; face_ndc[7] = vertices_ndc[ci][1]; face_ndc[8] = vertices_ndc[ci][2]; 391 | 392 | for (int i = 0; i < 9; ++i) { 393 | face_ndc_inv[i] = global_face_ndc_inv[9 * face_indx + i]; 394 | } 395 | 396 | min_x = min(min(face_ndc[0], face_ndc[3]), face_ndc[6]); 397 | min_x = (min_x + 1) / 2 * W; // convert from ndc to img coordinates 398 | min_xi = static_cast(floorf(static_cast(min_x))); 399 | min_xi = min(max(min_xi, 0), W - 1); 400 | max_x = max(max(face_ndc[0], face_ndc[3]), face_ndc[6]); 401 | max_x = (max_x + 1) / 2 * W; 402 | max_xi = static_cast(ceilf(static_cast(max_x))); 403 | max_xi = min(max(max_xi, 0), W - 1); 404 | 405 | min_y = min(min(face_ndc[1], face_ndc[4]), face_ndc[7]); 406 | min_y = (min_y + 1) / 2 * H; 407 | min_yi = static_cast(floorf(static_cast(min_y))); 408 | min_yi = min(max(min_yi, 0), H - 1); 409 | max_y = max(max(face_ndc[1], face_ndc[4]), face_ndc[7]); 410 | max_y = (max_y + 1) / 2 * H; 411 | max_yi = static_cast(ceilf(static_cast(max_y))); 412 | max_yi = min(max(max_yi, 0), H - 1); 413 | 414 | vertices_per_thread_x = (max_xi - min_xi) / blockDim.x + 1; 415 | vertices_per_thread_y = (max_yi - min_yi) / blockDim.y + 1; 416 | } 417 | __syncthreads(); 418 | 419 | const int left = min_xi + vertices_per_thread_x * threadIdx.x; 420 | const int right = min(left + vertices_per_thread_x, max_xi); 421 | 422 | const int top = min_yi + vertices_per_thread_y * threadIdx.y; 423 | const int bottom = min(top + vertices_per_thread_y, max_yi); 424 | 425 | scalar_t x, y, face_z, wa, wb, wc, wsum; 426 | for (int i = top; i <= bottom; i++) { 427 | for (int j = left; j <= right; j++) { 428 | x = 2 * ((scalar_t)j + 0.5) / W - 1; 429 | y = 2 * ((scalar_t)i + 0.5) / H - 1; 430 | 431 | // check pixel is inside the face 432 | if (((y - face_ndc[1]) * (face_ndc[3] - face_ndc[0]) > (x - face_ndc[0]) * (face_ndc[4] - face_ndc[1])) || 433 | ((y - face_ndc[4]) * (face_ndc[6] - face_ndc[3]) > (x - face_ndc[3]) * (face_ndc[7] - face_ndc[4])) || 434 | ((y - face_ndc[7]) * (face_ndc[0] - face_ndc[6]) > (x - face_ndc[6]) * (face_ndc[1] - face_ndc[7]))) { 435 | continue; 436 | } 437 | 438 | wa = face_ndc_inv[0] * x + face_ndc_inv[1] * y + face_ndc_inv[2]; 439 | wb = face_ndc_inv[3] * x + face_ndc_inv[4] * y + face_ndc_inv[5]; 440 | wc = face_ndc_inv[6] * x + face_ndc_inv[7] * y + face_ndc_inv[8]; 441 | wsum = wa + wb + wc; 442 | wa /= wsum; wb /= wsum; wc /= wsum; 443 | 444 | wa /= face_ndc[2]; 445 | wb /= face_ndc[5]; 446 | wc /= face_ndc[8]; 447 | wsum = wa + wb + wc; 448 | wa /= wsum; wb /= wsum; wc /= wsum; 449 | 450 | face_z = wa * face_ndc[2] + wb * face_ndc[5] + wc * face_ndc[8]; 451 | 452 | if (face_z - eps < depth[i][j]) { 453 | coords[i][j][0] = wa * face[0] + wb * face[3] + wc * face[6]; 454 | coords[i][j][1] = wa * face[1] + wb * face[4] + wc * face[7]; 455 | coords[i][j][2] = wa * face[2] + wb * face[5] + wc * face[8]; 456 | 457 | normals[i][j][0] = nx; 458 | normals[i][j][1] = ny; 459 | normals[i][j][2] = nz; 460 | } 461 | } 462 | } 463 | } 464 | 465 | // cpp defined functions 466 | 467 | torch::Tensor project_mesh_cuda( 468 | const torch::Tensor& vertices_ndc, 469 | const torch::Tensor& faces, 470 | const torch::Tensor& vertice_values, 471 | const torch::Tensor& vertices_filter, 472 | int H, int W 473 | ) { 474 | const int N = vertices_ndc.size(0); 475 | const int C = vertice_values.size(1); 476 | const int M = faces.size(0); 477 | 478 | const int gpuid = vertices_ndc.device().index(); 479 | AT_CUDA_CHECK(cudaSetDevice(gpuid)); 480 | auto options = torch::dtype(vertices_ndc.scalar_type()).device(torch::kCUDA, gpuid); 481 | 482 | const dim3 dimGrid(M); 483 | const dim3 dimBlock(4, 4); 484 | 485 | auto depth = torch::ones({H, W}, options) * 1e10; 486 | auto result = torch::zeros({H, W, C}, options); 487 | 488 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(vertices_ndc.scalar_type(), "project_mesh_cuda_kernel", [&] { 489 | scalar_t* global_face_ndc_inv; 490 | cudaMalloc(&global_face_ndc_inv, M * 9 * sizeof(scalar_t)); 491 | int* global_is_bad_face; 492 | cudaMalloc(&global_is_bad_face, M * sizeof(int)); 493 | rasterize_cuda_kernel<<>>( 494 | vertices_ndc.packed_accessor32(), 495 | faces.packed_accessor32(), 496 | vertices_filter.packed_accessor32(), 497 | depth.packed_accessor32(), 498 | global_face_ndc_inv, 499 | global_is_bad_face 500 | ); 501 | AT_CUDA_CHECK(cudaGetLastError()); 502 | 503 | interpolate_cuda_kernel<<>>( 504 | vertices_ndc.packed_accessor32(), 505 | faces.packed_accessor32(), 506 | depth.packed_accessor32(), 507 | global_face_ndc_inv, 508 | global_is_bad_face, 509 | vertice_values.packed_accessor32(), 510 | result.packed_accessor32() 511 | ); 512 | AT_CUDA_CHECK(cudaGetLastError()); 513 | 514 | cudaFree(global_face_ndc_inv); 515 | cudaFree(global_is_bad_face); 516 | AT_CUDA_CHECK(cudaGetLastError()); 517 | }); 518 | 519 | return result; 520 | } 521 | 522 | 523 | std::vector estimate_normals_cuda( 524 | const torch::Tensor& vertices_ndc, 525 | const torch::Tensor& faces, 526 | const torch::Tensor& vertices, 527 | const torch::Tensor& vertices_filter, 528 | int H, int W 529 | ) { 530 | const int N = vertices_ndc.size(0); 531 | const int M = faces.size(0); 532 | 533 | const int gpuid = vertices_ndc.device().index(); 534 | AT_CUDA_CHECK(cudaSetDevice(gpuid)); 535 | auto options = torch::dtype(vertices_ndc.scalar_type()).device(torch::kCUDA, gpuid); 536 | 537 | const dim3 dimGrid(M); 538 | const dim3 dimBlock(4, 4); 539 | 540 | auto depth = torch::ones({H, W}, options) * 1e10; 541 | auto coords = torch::zeros({H, W, 3}, options); 542 | auto normals = torch::zeros({H, W, 3}, options); 543 | 544 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(vertices_ndc.scalar_type(), "project_mesh_cuda_kernel", [&] { 545 | scalar_t* global_face_ndc_inv; 546 | cudaMalloc(&global_face_ndc_inv, M * 9 * sizeof(scalar_t)); 547 | int* global_is_bad_face; 548 | cudaMalloc(&global_is_bad_face, M * sizeof(int)); 549 | rasterize_cuda_kernel<<>>( 550 | vertices_ndc.packed_accessor32(), 551 | faces.packed_accessor32(), 552 | vertices_filter.packed_accessor32(), 553 | depth.packed_accessor32(), 554 | global_face_ndc_inv, 555 | global_is_bad_face 556 | ); 557 | AT_CUDA_CHECK(cudaGetLastError()); 558 | 559 | estimate_normals_cuda_kernel<<>>( 560 | vertices_ndc.packed_accessor32(), 561 | faces.packed_accessor32(), 562 | depth.packed_accessor32(), 563 | global_face_ndc_inv, 564 | global_is_bad_face, 565 | vertices.packed_accessor32(), 566 | coords.packed_accessor32(), 567 | normals.packed_accessor32() 568 | ); 569 | AT_CUDA_CHECK(cudaGetLastError()); 570 | 571 | cudaFree(global_face_ndc_inv); 572 | cudaFree(global_is_bad_face); 573 | AT_CUDA_CHECK(cudaGetLastError()); 574 | }); 575 | 576 | return {coords, normals}; 577 | } 578 | -------------------------------------------------------------------------------- /minimal_pytorch_rasterizer/rasterizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from minimal_pytorch_rasterizer.cuda.rasterizer import estimate_normals as estimate_normals_cuda 3 | from minimal_pytorch_rasterizer.cuda.rasterizer import project_mesh as project_mesh_cuda 4 | from minimal_pytorch_rasterizer import assert_utils 5 | 6 | 7 | def estimate_normals(vertices, faces, pinhole, vertices_filter=None): 8 | if vertices_filter is None: 9 | assert_utils.is_cuda_tensor(vertices) 10 | assert_utils.check_shape_len(vertices, 2) 11 | n = vertices.shape[0] 12 | vertices_filter = torch.ones((n), dtype=torch.uint8, device=vertices.device) 13 | vertices = vertices.contiguous() 14 | vertices_ndc = pinhole.project_ndc(vertices) 15 | coords, normals = estimate_normals_cuda( 16 | vertices_ndc, faces, vertices, vertices_filter, 17 | pinhole.h, pinhole.w 18 | ) 19 | return coords, normals 20 | 21 | 22 | def project_mesh(vertices, faces, vertice_values, pinhole, vertices_filter=None): 23 | if vertices_filter is None: 24 | assert_utils.is_cuda_tensor(vertices) 25 | assert_utils.check_shape_len(vertices, 2) 26 | n = vertices.shape[0] 27 | vertices_filter = torch.ones((n), dtype=torch.uint8, device=vertices.device) 28 | vertices = vertices.contiguous() 29 | vertices_ndc = pinhole.project_ndc(vertices) 30 | return project_mesh_cuda( 31 | vertices_ndc, faces, vertice_values, vertices_filter, 32 | pinhole.h, pinhole.w 33 | ) 34 | -------------------------------------------------------------------------------- /minimal_pytorch_rasterizer/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def vis_z_buffer(z, percentile=1, vis_pad=0.2): 5 | z = z[:, :, 0] 6 | mask = z > 1e-5 7 | if torch.sum(mask) == 0: 8 | z[...] = 0 9 | else: 10 | vmin = torch.quantile(z[mask], percentile / 100) 11 | vmax = torch.quantile(z[mask], 1 - percentile / 100) 12 | pad = (vmax - vmin) * vis_pad 13 | vmin_padded = vmin - pad 14 | vmax_padded = vmax + pad 15 | z[mask] = vmin + vmax - z[mask] 16 | z = (z - vmin_padded) / (vmax_padded - vmin_padded) 17 | z = torch.clip(torch.round(z * 255), 0, 255) 18 | z_cpu = z.to(dtype=torch.uint8).detach().cpu().numpy() 19 | return z_cpu 20 | 21 | 22 | def vis_normals(coords, normals, vis_pad=0.2): 23 | mask = coords[:, :, 2] > 0 24 | coords_masked = -coords[mask] 25 | normals_masked = normals[mask] 26 | 27 | coords_len = torch.sqrt(torch.sum(coords_masked ** 2, dim=1)) 28 | 29 | dot = torch.sum(coords_masked * normals_masked, dim=1) / coords_len 30 | 31 | h, w = normals.shape[:2] 32 | vis = torch.zeros((h, w), dtype=coords.dtype, device=coords.device) 33 | vis[mask] = torch.clamp(dot, 0, 1) * (1 - 2 * vis_pad) + vis_pad 34 | 35 | vis_cpu = (vis * 255).to(dtype=torch.uint8).detach().cpu().numpy() 36 | 37 | return vis_cpu 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipykernel==5.2.0 2 | jupyter==1.0.0 3 | matplotlib==3.2.1 4 | munch==2.5.0 5 | nbconvert==5.6.1 6 | numpy==1.19.2 7 | opencv-python==4.2.0.32 8 | pandas==1.0.3 9 | Pillow==7.0.0 10 | plotly==4.9.0 11 | scikit-image==0.16.2 12 | scikit-learn==0.22.2.post1 13 | scipy==1.4.1 14 | tqdm==4.46.0 15 | numba==0.51.2 16 | einops==0.3.0 17 | tensorboard==1.8.0 18 | tensorboardX==1.2 19 | pyyaml 20 | 21 | git+https://github.com/karfly/nvdiffrast_compute-capability_6.0 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | 5 | 6 | ext_modules = [ 7 | CUDAExtension( 8 | 'minimal_pytorch_rasterizer.cuda.rasterizer', [ 9 | 'minimal_pytorch_rasterizer/cuda/rasterizer.cpp', 10 | 'minimal_pytorch_rasterizer/cuda/rasterizer_kernel.cu', 11 | ]) 12 | ] 13 | 14 | setup( 15 | version='0.5', 16 | description='cuda accelerated point cloud utils', 17 | author='Renat Bashirov', 18 | author_email='rmbashirov@gmail.com', 19 | install_requires=["torch>=1.3"], 20 | packages=['minimal_pytorch_rasterizer', 'minimal_pytorch_rasterizer.cuda'], 21 | name='minimal_pytorch_rasterizer', 22 | ext_modules=ext_modules, 23 | cmdclass={'build_ext': BuildExtension} 24 | ) 25 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | pip uninstall -y minimal_pytorch_rasterizer 4 | pip --no-cache-dir install . 5 | 6 | 7 | --------------------------------------------------------------------------------