├── src
│   ├── extern.cpp
│   ├── fileutils.h
│   ├── textures.h
│   ├── math.h
│   ├── device.h
│   ├── shaders
│   │   ├── depthreduce.comp.glsl
│   │   ├── shadowfill.comp.glsl
│   │   ├── tasksubmit.comp.glsl
│   │   ├── clustersubmit.comp.glsl
│   │   ├── shadowblur.comp.glsl
│   │   ├── mesh.vert.glsl
│   │   ├── final.comp.glsl
│   │   ├── mesh.h
│   │   ├── math.h
│   │   ├── mesh.frag.glsl
│   │   ├── shadow.comp.glsl
│   │   ├── clustercull.comp.glsl
│   │   ├── meshlet.task.glsl
│   │   ├── drawcull.comp.glsl
│   │   ├── meshlet.mesh.glsl
│   │   └── debugtext.comp.glsl
│   ├── swapchain.h
│   ├── common.h
│   ├── scenert.h
│   ├── config.h
│   ├── fileutils.cpp
│   ├── resources.h
│   ├── scene.h
│   ├── shaders.h
│   ├── swapchain.cpp
│   ├── textures.cpp
│   ├── scenecache.cpp
│   ├── resources.cpp
│   ├── device.cpp
│   ├── shaders.cpp
│   ├── scene.cpp
│   └── scenert.cpp
├── .gitignore
├── .clang-format
├── .gitmodules
├── .github
│   └── workflows
│       └── build.yml
├── LICENSE.md
├── CMakeLists.txt
└── README.md

/src/extern.cpp:
--------------------------------------------------------------------------------
1 | #define FAST_OBJ_IMPLEMENTATION
2 | #include "fast_obj.h"
3 | 
4 | #define CGLTF_IMPLEMENTATION
5 | #include "cgltf.h"
6 | 
--------------------------------------------------------------------------------
/src/fileutils.h:
--------------------------------------------------------------------------------
1 | #include <stddef.h>
2 | 
3 | void* mmapFile(const char* path, size_t* outSize);
4 | void unmapFile(void* data, size_t size);
5 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .cache/
2 | spirv/*.spv
3 | 
4 | CMakeFiles/
5 | CMakeCache.txt
6 | build.ninja
7 | .ninja_*
8 | compile_commands.json
9 | *.cmake
10 | 
--------------------------------------------------------------------------------
/src/textures.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | struct Image;
4 | struct Buffer;
5 | 
6 | bool loadImage(Image& image, VkDevice device, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties, const Buffer& scratch, const char* path);
7 | 
--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | Standard: Cpp11
2 | Cpp11BracedListStyle: false
3 | UseTab: ForIndentation
4 | TabWidth: 4
5 | IndentWidth: 4
6 | AccessModifierOffset: -4
7 | BreakBeforeBraces: Allman
8 | IndentCaseLabels: false
9 | ColumnLimit: 0
10 | PointerAlignment: Left
11 | BreakConstructorInitializersBeforeComma: true
12 | NamespaceIndentation: None
13 | AlignEscapedNewlines: DontAlign
14 | AlignAfterOpenBracket: DontAlign
15 | SortIncludes: false
16 | 
--------------------------------------------------------------------------------
/src/math.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <glm/vec2.hpp>
4 | #include <glm/vec3.hpp>
5 | #include <glm/vec4.hpp>
6 | #include <glm/mat2x2.hpp>
7 | #include <glm/mat3x3.hpp>
8 | #include <glm/mat4x4.hpp>
9 | #include <glm/geometric.hpp>
10 | #include <glm/trigonometric.hpp>
11 | #include <glm/gtc/quaternion.hpp>
12 | 
13 | using glm::mat2;
14 | using glm::mat3;
15 | using glm::mat4;
16 | using glm::quat;
17 | using glm::vec2;
18 | using glm::vec3;
19 | using glm::vec4;
20 | 
--------------------------------------------------------------------------------
/src/device.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | bool isInstanceExtensionSupported(const char* name);
4 | 
5 | VkInstance createInstance();
6 | 
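// Illustrative bootstrap order for the declarations in this header (a sketch,
// not code from this repository; error handling and volk loader init omitted):
//
//   VkInstance instance = createInstance();
//   VkDebugUtilsMessengerEXT debugCallback = registerDebugCallback(instance);
//
//   VkPhysicalDevice physicalDevices[16];
//   uint32_t physicalDeviceCount = COUNTOF(physicalDevices);
//   VK_CHECK(vkEnumeratePhysicalDevices(instance, &physicalDeviceCount, physicalDevices));
//
//   VkPhysicalDevice physicalDevice = pickPhysicalDevice(physicalDevices, physicalDeviceCount);
//   uint32_t familyIndex = getGraphicsFamilyIndex(physicalDevice);
//   VkDevice device = createDevice(instance, physicalDevice, familyIndex,
//       /*meshShadingSupported=*/ true, /*raytracingSupported=*/ true, /*clusterrtSupported=*/ false);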
VkDebugUtilsMessengerEXT registerDebugCallback(VkInstance instance); 7 | 8 | uint32_t getGraphicsFamilyIndex(VkPhysicalDevice physicalDevice); 9 | VkPhysicalDevice pickPhysicalDevice(VkPhysicalDevice* physicalDevices, uint32_t physicalDeviceCount); 10 | 11 | VkDevice createDevice(VkInstance instance, VkPhysicalDevice physicalDevice, uint32_t familyIndex, bool meshShadingSupported, bool raytracingSupported, bool clusterrtSupported); 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extern/glfw"] 2 | path = extern/glfw 3 | url = https://github.com/glfw/glfw 4 | [submodule "extern/volk"] 5 | path = extern/volk 6 | url = https://github.com/zeux/volk.git 7 | [submodule "extern/meshoptimizer"] 8 | path = extern/meshoptimizer 9 | url = https://github.com/zeux/meshoptimizer.git 10 | [submodule "extern/glm"] 11 | path = extern/glm 12 | url = https://github.com/g-truc/glm.git 13 | [submodule "extern/fast_obj"] 14 | path = extern/fast_obj 15 | url = https://github.com/thisistherk/fast_obj 16 | [submodule "extern/cgltf"] 17 | path = extern/cgltf 18 | url = https://github.com/jkuhlmann/cgltf 19 | -------------------------------------------------------------------------------- /src/shaders/depthreduce.comp.glsl: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; 4 | 5 | layout(binding = 0, r32f) uniform writeonly image2D outImage; 6 | layout(binding = 1) uniform sampler2D inImage; 7 | 8 | layout(push_constant) uniform block 9 | { 10 | vec2 imageSize; 11 | }; 12 | 13 | void main() 14 | { 15 | uvec2 pos = gl_GlobalInvocationID.xy; 16 | 17 | // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad 18 | float depth = texture(inImage, (vec2(pos) + vec2(0.5)) / imageSize).x; 19 | 20 | imageStore(outImage, ivec2(pos), vec4(depth)); 21 | } 22 | -------------------------------------------------------------------------------- /src/swapchain.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | struct Swapchain 4 | { 5 | VkSwapchainKHR swapchain; 6 | 7 | std::vector images; 8 | 9 | uint32_t width, height; 10 | uint32_t imageCount; 11 | }; 12 | 13 | typedef struct GLFWwindow GLFWwindow; 14 | 15 | const char** getSwapchainExtensions(uint32_t* count); 16 | 17 | VkSurfaceKHR createSurface(VkInstance instance, GLFWwindow* window); 18 | VkFormat getSwapchainFormat(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface); 19 | void createSwapchain(Swapchain& result, VkPhysicalDevice physicalDevice, VkDevice device, VkSurfaceKHR surface, uint32_t familyIndex, GLFWwindow* window, VkFormat format, VkSwapchainKHR oldSwapchain = 0); 20 | void destroySwapchain(VkDevice device, const Swapchain& swapchain); 21 | 22 | enum SwapchainStatus 23 | { 24 | Swapchain_Ready, 25 | Swapchain_Resized, 26 | Swapchain_NotReady, 27 | }; 28 | 29 | SwapchainStatus updateSwapchain(Swapchain& result, VkPhysicalDevice physicalDevice, VkDevice device, VkSurfaceKHR surface, uint32_t familyIndex, GLFWwindow* window, VkFormat format); 30 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - "*.md" 7 
| pull_request: 8 | 9 | jobs: 10 | build: 11 | name: ubuntu 12 | runs-on: ubuntu-24.04 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: true 17 | - uses: actions/checkout@v4 18 | with: 19 | repository: KhronosGroup/Vulkan-Headers 20 | ref: main 21 | path: Vulkan-Headers 22 | - name: move sdk 23 | run: mv Vulkan-Headers ~/Vulkan-Headers 24 | - name: install deps 25 | run: | 26 | sudo apt update 27 | sudo apt install -y spirv-headers glslang-tools ninja-build 28 | sudo apt install -y xorg-dev libwayland-dev libxkbcommon-dev wayland-protocols 29 | - name: cmake 30 | run: cmake . -G Ninja -D VULKAN_HEADERS_INSTALL_DIR=~/Vulkan-Headers 31 | - name: build 32 | run: ninja 33 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Arseny Kapoulkine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <assert.h>
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | 
7 | #include <vector>
8 | 
9 | #include <volk.h>
10 | 
11 | #define VK_CHECK(call) \
12 |     do \
13 |     { \
14 |         VkResult result_ = call; \
15 |         assert(result_ == VK_SUCCESS); \
16 |     } while (0)
17 | 
18 | #define VK_CHECK_FORCE(call) \
19 |     do \
20 |     { \
21 |         VkResult result_ = call; \
22 |         if (result_ != VK_SUCCESS) \
23 |         { \
24 |             fprintf(stderr, "%s:%d: %s failed with error %d\n", __FILE__, __LINE__, #call, result_); \
25 |             abort(); \
26 |         } \
27 |     } while (0)
28 | 
29 | #define VK_CHECK_SWAPCHAIN(call) \
30 |     do \
31 |     { \
32 |         VkResult result_ = call; \
33 |         assert(result_ == VK_SUCCESS || result_ == VK_SUBOPTIMAL_KHR || result_ == VK_ERROR_OUT_OF_DATE_KHR); \
34 |     } while (0)
35 | 
36 | #define VK_CHECK_QUERY(call) \
37 |     do \
38 |     { \
39 |         VkResult result_ = call; \
40 |         assert(result_ == VK_SUCCESS || result_ == VK_NOT_READY); \
41 |     } while (0)
42 | 
43 | template <typename T, size_t Size>
44 | char (*countof_helper(T (&_Array)[Size]))[Size];
45 | 
46 | #define COUNTOF(array) (sizeof(*countof_helper(array)) + 0)
47 | 
--------------------------------------------------------------------------------
/src/shaders/shadowfill.comp.glsl:
--------------------------------------------------------------------------------
1 | #version 460
2 | 
3 | #extension GL_GOOGLE_include_directive: require
4 | 
5 | layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
6 | 
7 | layout(push_constant) uniform block
8 | {
9 |     vec2 imageSize;
10 |     int checkerboard;
11 | };
12 | 
13 | layout(binding = 0, r8) uniform image2D shadowImage;
14 | layout(binding = 1) uniform sampler2D depthImage;
15 | 
16 | void main()
17 | {
18 |     ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
19 | 
20 |     // checkerboard (opposite)
21 |     pos.x *= 2;
22 |     pos.x += ~(pos.y ^ checkerboard) & 1;
23 | 
24 |     float depth = texelFetch(depthImage, pos, 0).r;
25 | 
26 |     vec4 depths = vec4(
27 |         texelFetch(depthImage, pos + ivec2(-1, 0), 0).r,
28 |         texelFetch(depthImage, pos + ivec2(+1, 0), 0).r,
29 |         texelFetch(depthImage, pos + ivec2(0, -1), 0).r,
30 |         texelFetch(depthImage, pos + ivec2(0, +1), 0).r
31 |     );
32 | 
33 |     vec4 shadows = vec4(
34 |         imageLoad(shadowImage, pos + ivec2(-1, 0)).r,
35 |         imageLoad(shadowImage, pos + ivec2(+1, 0)).r,
36 |         imageLoad(shadowImage, pos + ivec2(0, -1)).r,
37 |         imageLoad(shadowImage, pos + ivec2(0, +1)).r
38 |     );
39 | 
40 |     vec4 weights = exp2(-abs(depths / depth - 1) * 20);
41 | 
42 |     float shadow = dot(weights, shadows) / (dot(weights, vec4(1)) + 1e-2);
43 | 
44 |     imageStore(shadowImage, pos, vec4(shadow, 0, 0, 0));
45 | }
46 | 
--------------------------------------------------------------------------------
/src/shaders/tasksubmit.comp.glsl:
--------------------------------------------------------------------------------
1 | #version 450
2 | 
3 | #extension GL_EXT_shader_16bit_storage: require
4 | #extension GL_EXT_shader_8bit_storage: require
5 | 
6 | #extension GL_GOOGLE_include_directive: require
7 | 
8 | #extension GL_EXT_null_initializer: require
9 | 
10 | #include "mesh.h"
11 | 
12 | layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
13 | 
14 | layout(binding = 0) buffer CommandCount
15 | {
16 |     uint commandCount;
17 |     uint groupCountX;
18 |     uint groupCountY;
19 |     uint groupCountZ;
20 | };
21 | 
22 | layout(binding = 1) writeonly buffer TaskCommands
23 | {
24 |     MeshTaskCommand taskCommands[];
25 | };
26 | 
27 | 
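// Worked example (illustrative numbers): with commandCount == 150000, main() below
// writes groupCountX = min((150000 + 63) / 64, 65535) = 2344, so the indirect task
// dispatch is 2344 x 64 x 1 = 150016 workgroups; boundary = 150016, and the 16
// entries in [150000, 150016) get zero-initialized dummy commands.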
void main()
28 | {
29 |     uint tid = gl_LocalInvocationID.x;
30 |     uint count = min(commandCount, TASK_WGLIMIT);
31 | 
32 |     // represent command count as X*64*1; X has a max of 65535 (per EXT_mesh_shader limits), so this allows us to reach ~4M commands
33 |     // note that EXT_mesh_shader doesn't guarantee support for >4M commands anyway, but 4M commands ~= 16B triangles which is surely enough
34 |     if (tid == 0)
35 |     {
36 |         groupCountX = min((count + 63) / 64, 65535);
37 |         groupCountY = 64;
38 |         groupCountZ = 1;
39 |     }
40 | 
41 |     // the above may result in reading command data that was never written; as such, pad the excess entries with dummy commands (up to 63)
42 |     uint boundary = (count + 63) & ~63;
43 |     MeshTaskCommand dummy = {};
44 | 
45 |     if (count + tid < boundary)
46 |         taskCommands[count + tid] = dummy;
47 | }
48 | 
--------------------------------------------------------------------------------
/src/shaders/clustersubmit.comp.glsl:
--------------------------------------------------------------------------------
1 | #version 450
2 | 
3 | #extension GL_EXT_shader_16bit_storage: require
4 | #extension GL_EXT_shader_8bit_storage: require
5 | 
6 | #extension GL_GOOGLE_include_directive: require
7 | 
8 | #include "mesh.h"
9 | 
10 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
11 | 
12 | layout(binding = 0) buffer ClusterCount
13 | {
14 |     uint clusterCount;
15 |     uint groupCountX;
16 |     uint groupCountY;
17 |     uint groupCountZ;
18 | };
19 | 
20 | layout(binding = 1) writeonly buffer ClusterIndices
21 | {
22 |     uint clusterIndices[];
23 | };
24 | 
25 | void main()
26 | {
27 |     uint tid = gl_LocalInvocationID.x;
28 |     uint count = min(clusterCount, CLUSTER_LIMIT);
29 | 
30 |     // represent cluster count as 16*Y*16; Y has a max of 65535 (per EXT_mesh_shader limits), so this allows us to reach ~16M clusters
31 |     // the reason for an odd layout like this is that normally we'd use a 2D 256*Y layout (to maximize locality of access), but that is slower than Y*256 on 7900
32 |     // however, Y*256 is really slow on integrated RDNA2; 16*Y*16 seems to provide a reasonable balance between the two
33 |     if (tid == 0)
34 |     {
35 |         groupCountX = CLUSTER_TILE;
36 |         groupCountY = min((count + 255) / 256, 65535);
37 |         groupCountZ = 256 / CLUSTER_TILE;
38 |     }
39 | 
40 |     // the above may result in reading cluster indices that were never written; as such, pad the excess entries with dummy indices (up to 255)
41 |     uint boundary = (count + 255) & ~255;
42 | 
43 |     if (count + tid < boundary)
44 |         clusterIndices[count + tid] = ~0;
45 | }
46 | 
--------------------------------------------------------------------------------
/src/scenert.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | struct Buffer;
4 | 
5 | struct Mesh;
6 | struct MeshDraw;
7 | struct Meshlet;
8 | 
9 | void buildBLAS(VkDevice device, const std::vector<Mesh>& meshes, const Buffer& vb, const Buffer& ib, std::vector<VkAccelerationStructureKHR>& blas, std::vector<VkDeviceSize>& compactedSizes, Buffer& blasBuffer, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties);
10 | void compactBLAS(VkDevice device, std::vector<VkAccelerationStructureKHR>& blas, const std::vector<VkDeviceSize>& compactedSizes, Buffer& blasBuffer, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties);
11 | 
12 | void buildCBLAS(VkDevice device, const std::vector<Mesh>& meshes, const std::vector<Meshlet>& meshlets, const Buffer& vxb, const Buffer& mdb, std::vector<VkAccelerationStructureKHR>& blas, Buffer&
blasBuffer, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties); 13 | 14 | void fillInstanceRT(VkAccelerationStructureInstanceKHR& instance, const MeshDraw& draw, uint32_t instanceIndex, VkDeviceAddress blas); 15 | 16 | VkAccelerationStructureKHR createTLAS(VkDevice device, Buffer& tlasBuffer, Buffer& scratchBuffer, const Buffer& instanceBuffer, uint32_t primitiveCount, const VkPhysicalDeviceMemoryProperties& memoryProperties); 17 | 18 | void buildTLAS(VkDevice device, VkCommandBuffer commandBuffer, VkAccelerationStructureKHR tlas, const Buffer& tlasBuffer, const Buffer& scratchBuffer, const Buffer& instanceBuffer, uint32_t primitiveCount, VkBuildAccelerationStructureModeKHR mode); 19 | -------------------------------------------------------------------------------- /src/shaders/shadowblur.comp.glsl: -------------------------------------------------------------------------------- 1 | #version 460 2 | 3 | #define BLUR 1 4 | 5 | #extension GL_GOOGLE_include_directive: require 6 | 7 | #include "math.h" 8 | 9 | layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; 10 | 11 | layout(push_constant) uniform block 12 | { 13 | vec2 imageSize; 14 | float direction; 15 | float znear; 16 | }; 17 | 18 | layout(binding = 0) uniform writeonly image2D outImage; 19 | 20 | layout(binding = 1) uniform sampler2D shadowImage; 21 | layout(binding = 2) uniform sampler2D depthImage; 22 | 23 | void main() 24 | { 25 | uvec2 pos = gl_GlobalInvocationID.xy; 26 | 27 | #if BLUR 28 | float shadow = texelFetch(shadowImage, ivec2(pos), 0).r; 29 | float accumw = 1; 30 | 31 | float depth = znear / texelFetch(depthImage, ivec2(pos), 0).r; 32 | 33 | ivec2 offsetMask = -ivec2(direction, 1 - direction); 34 | 35 | const int KERNEL = 10; 36 | 37 | for (int sign = -1; sign <= 1; sign += 2) 38 | { 39 | ivec2 uvnext = ivec2(pos) + (ivec2(sign) & offsetMask); 40 | float dnext = znear / texelFetch(depthImage, uvnext, 0).r; 41 | float dgrad = abs(depth - dnext) < 0.1 ? dnext - depth : 0; 42 | 43 | for (int i = 1; i <= KERNEL; ++i) 44 | { 45 | ivec2 uvoff = ivec2(pos) + (ivec2(i * sign) & offsetMask); 46 | 47 | float gw = exp2(-i * i / 50); 48 | float dv = znear / texelFetch(depthImage, uvoff, 0).r; 49 | float dw = exp2(-abs(dv - (depth + dgrad * i)) * 100); 50 | float fw = gw * dw; 51 | 52 | shadow += texelFetch(shadowImage, uvoff, 0).r * fw; 53 | accumw += fw; 54 | } 55 | } 56 | 57 | shadow /= accumw; 58 | #else 59 | float shadow = texelFetch(shadowImage, ivec2(pos), 0).r; 60 | #endif 61 | 62 | imageStore(outImage, ivec2(pos), vec4(shadow, 0, 0, 0)); 63 | } 64 | -------------------------------------------------------------------------------- /src/config.h: -------------------------------------------------------------------------------- 1 | // Workgroup size for task shader; each task shader thread produces up to one meshlet 2 | #define TASK_WGSIZE 64 3 | 4 | // Workgroup size for mesh shader; mesh shader workgroup processes the entire meshlet in parallel 5 | #define MESH_WGSIZE 64 6 | 7 | // Should we do meshlet frustum, occlusion and backface culling in task shader? 8 | #define TASK_CULL 1 9 | 10 | // Should we do triangle frustum and backface culling in mesh shader? 
11 | #define MESH_CULL 0 12 | 13 | // Maximum number of vertices and triangles in a meshlet 14 | #define MESH_MAXVTX 64 15 | #define MESH_MAXTRI 96 16 | 17 | // Meshlet build configuration for raster/RT 18 | #define MESHLET_CONE_WEIGHT 0.25f 19 | #define MESHLET_FILL_WEIGHT 0.5f 20 | 21 | // Number of clusters along X dimension in a 3D tiled dispatch (must be a divisor of 256) 22 | #define CLUSTER_TILE 16 23 | 24 | // Maximum number of total task shader workgroups; 4M workgroups ~= 256M meshlets ~= 16B triangles if TASK_WGSIZE=64 and MESH_MAXTRI=64 25 | #define TASK_WGLIMIT (1 << 22) 26 | 27 | // Maximum number of total visible clusters; 16M meshlets ~= 64MB buffer with cluster indices 28 | #define CLUSTER_LIMIT (1 << 24) 29 | 30 | // Maximum number of frames in flight 31 | #define MAX_FRAMES 2 32 | 33 | // Minimum number of images in flight 34 | #define MIN_IMAGES 3 35 | 36 | // Should we enable vertical sync during presentation? Worth setting to 0 when doing perf profiling to avoid GPU downclock during idle 37 | #define CONFIG_VSYNC 1 38 | 39 | // Should we enable validation layers in release? (they are always enabled in debug) 40 | #define CONFIG_RELVAL 0 41 | 42 | // Should we enable synchronization validation? Worth running with 1 occasionally to check correctness. 43 | #define CONFIG_SYNCVAL 0 44 | 45 | // Maximum number of texture descriptors in the pool 46 | #define DESCRIPTOR_LIMIT 65536 47 | -------------------------------------------------------------------------------- /src/shaders/mesh.vert.glsl: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage: require 4 | #extension GL_EXT_shader_8bit_storage: require 5 | 6 | #extension GL_GOOGLE_include_directive: require 7 | 8 | #extension GL_ARB_shader_draw_parameters: require 9 | 10 | #include "mesh.h" 11 | #include "math.h" 12 | 13 | layout(push_constant) uniform block 14 | { 15 | Globals globals; 16 | }; 17 | 18 | layout(binding = 0) readonly buffer DrawCommands 19 | { 20 | MeshDrawCommand drawCommands[]; 21 | }; 22 | 23 | layout(binding = 1) readonly buffer Draws 24 | { 25 | MeshDraw draws[]; 26 | }; 27 | 28 | layout(binding = 2) readonly buffer Vertices 29 | { 30 | Vertex vertices[]; 31 | }; 32 | 33 | layout(location = 0) out flat uint out_drawId; 34 | layout(location = 1) out vec2 out_uv; 35 | layout(location = 2) out vec3 out_normal; 36 | layout(location = 3) out vec4 out_tangent; 37 | layout(location = 4) out vec3 out_wpos; 38 | 39 | void main() 40 | { 41 | uint drawId = drawCommands[gl_DrawIDARB].drawId; 42 | MeshDraw meshDraw = draws[drawId]; 43 | 44 | uint vi = gl_VertexIndex; 45 | vec3 position = vec3(vertices[vi].vx, vertices[vi].vy, vertices[vi].vz); 46 | vec2 texcoord = vec2(vertices[vi].tu, vertices[vi].tv); 47 | 48 | vec3 normal; 49 | vec4 tangent; 50 | unpackTBN(vertices[vi].np, uint(vertices[vi].tp), normal, tangent); 51 | 52 | normal = rotateQuat(normal, meshDraw.orientation); 53 | tangent.xyz = rotateQuat(tangent.xyz, meshDraw.orientation); 54 | 55 | vec3 wpos = rotateQuat(position, meshDraw.orientation) * meshDraw.scale + meshDraw.position; 56 | 57 | gl_Position = globals.projection * (globals.cullData.view * vec4(wpos, 1)); 58 | out_drawId = drawId; 59 | out_uv = texcoord; 60 | out_normal = normal; 61 | out_tangent = tangent; 62 | out_wpos = wpos; 63 | } 64 | -------------------------------------------------------------------------------- /src/fileutils.cpp: 
-------------------------------------------------------------------------------- 1 | #include "fileutils.h" 2 | 3 | #include 4 | #include 5 | 6 | #ifdef _WIN32 7 | #define NOMINMAX 8 | #define WIN32_LEAN_AND_MEAN 9 | #include 10 | #else 11 | #include 12 | #include 13 | #include 14 | #include 15 | #endif 16 | 17 | #ifdef _WIN32 18 | void* mmapFile(const char* path, size_t* outSize) 19 | { 20 | *outSize = 0; 21 | 22 | HANDLE file = CreateFileA(path, GENERIC_READ, 23 | FILE_SHARE_READ, // allow others to read 24 | NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN, NULL); 25 | if (file == INVALID_HANDLE_VALUE) 26 | return NULL; 27 | 28 | LARGE_INTEGER liSize; 29 | if (!GetFileSizeEx(file, &liSize) || liSize.QuadPart == 0 || liSize.QuadPart > SIZE_MAX) 30 | { 31 | CloseHandle(file); 32 | return NULL; 33 | } 34 | 35 | HANDLE map = CreateFileMappingW(file, NULL, PAGE_READONLY, 0, 0, NULL); 36 | if (!map) 37 | { 38 | CloseHandle(file); 39 | return NULL; 40 | } 41 | 42 | void* view = MapViewOfFile(map, FILE_MAP_READ, 0, 0, 0); 43 | 44 | // safe to close mapping handle & file handle after mapping the view; the kernel keeps track of the association via the mapped pointer 45 | CloseHandle(map); 46 | CloseHandle(file); 47 | 48 | if (!view) 49 | return NULL; 50 | 51 | *outSize = size_t(liSize.QuadPart); 52 | return view; 53 | } 54 | 55 | void unmapFile(void* data, size_t size) 56 | { 57 | (void)size; 58 | 59 | BOOL ok = UnmapViewOfFile(data); 60 | assert(ok); 61 | (void)ok; 62 | } 63 | #else 64 | void* mmapFile(const char* path, size_t* outSize) 65 | { 66 | *outSize = 0; 67 | 68 | int fd = open(path, O_RDONLY); 69 | if (fd == -1) 70 | return NULL; 71 | 72 | struct stat sb; 73 | if (fstat(fd, &sb) == -1 || sb.st_size == 0 || sb.st_size > SIZE_MAX) 74 | { 75 | close(fd); 76 | return NULL; 77 | } 78 | 79 | void* mapped = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 80 | if (mapped == MAP_FAILED) 81 | { 82 | close(fd); 83 | return NULL; 84 | } 85 | 86 | close(fd); 87 | 88 | *outSize = sb.st_size; 89 | return mapped; 90 | } 91 | 92 | void unmapFile(void* data, size_t size) 93 | { 94 | int rc = munmap(data, size); 95 | assert(rc == 0); 96 | (void)rc; 97 | } 98 | #endif 99 | -------------------------------------------------------------------------------- /src/shaders/final.comp.glsl: -------------------------------------------------------------------------------- 1 | #version 460 2 | 3 | #extension GL_GOOGLE_include_directive: require 4 | 5 | #include "math.h" 6 | 7 | layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; 8 | 9 | struct ShadeData 10 | { 11 | vec3 cameraPosition; 12 | vec3 sunDirection; 13 | int shadowsEnabled; 14 | 15 | mat4 inverseViewProjection; 16 | 17 | vec2 imageSize; 18 | }; 19 | 20 | layout(push_constant) uniform block 21 | { 22 | ShadeData shadeData; 23 | }; 24 | 25 | layout(binding = 0) uniform writeonly image2D outImage; 26 | 27 | layout(binding = 1) uniform sampler2D gbufferImage0; 28 | layout(binding = 2) uniform sampler2D gbufferImage1; 29 | layout(binding = 3) uniform sampler2D depthImage; 30 | 31 | layout(binding = 4) uniform sampler2D shadowImage; 32 | 33 | void main() 34 | { 35 | uvec2 pos = gl_GlobalInvocationID.xy; 36 | vec2 uv = (vec2(pos) + 0.5) / shadeData.imageSize; 37 | 38 | vec4 gbuffer0 = texture(gbufferImage0, uv); 39 | vec4 gbuffer1 = texture(gbufferImage1, uv); 40 | float depth = texture(depthImage, uv).r; 41 | 42 | vec3 albedo = fromsrgb(gbuffer0.rgb); 43 | vec3 emissive = albedo * (exp2(gbuffer0.a * 5) - 1); 44 | vec3 
normal = decodeOct(gbuffer1.rg * 2 - 1);
45 | 
46 |     float ndotl = max(dot(normal, shadeData.sunDirection), 0.0);
47 | 
48 |     vec4 clip = vec4(uv.x * 2 - 1, 1 - uv.y * 2, depth, 1);
49 |     vec4 wposh = shadeData.inverseViewProjection * clip;
50 |     vec3 wpos = wposh.xyz / wposh.w;
51 | 
52 |     vec3 view = normalize(shadeData.cameraPosition - wpos);
53 |     vec3 halfv = normalize(view + shadeData.sunDirection);
54 |     float ndoth = max(dot(normal, halfv), 0.0);
55 |     float gloss = gbuffer1.b;
56 | 
57 |     // TODO: this is not the BRDF we want
58 |     float specular = pow(ndoth, mix(1, 64, gloss)) * gloss;
59 | 
60 |     float shadow = 1;
61 |     if (shadeData.shadowsEnabled == 1)
62 |         shadow = texture(shadowImage, uv).r;
63 | 
64 |     float ambient = 0.07;
65 |     float shadowAmbient = 0.05;
66 |     float sunIntensity = 2.5;
67 | 
68 |     vec3 outputColor = albedo.rgb * (ndotl * min(shadow + shadowAmbient, 1.0) * sunIntensity + ambient) + vec3(specular * shadow) * sunIntensity + emissive;
69 | 
70 |     float deband = gradientNoise(vec2(pos)) * 2 - 1;
71 |     imageStore(outImage, ivec2(pos), vec4(tonemap(outputColor) + deband * (0.5 / 255), 1.0));
72 | }
73 | 
--------------------------------------------------------------------------------
/src/shaders/mesh.h:
--------------------------------------------------------------------------------
1 | #include "../config.h"
2 | 
3 | struct Vertex
4 | {
5 |     float16_t vx, vy, vz;
6 |     uint16_t tp; // packed tangent: 8-8 octahedral
7 |     uint np; // packed normal: 10-10-10-2 vector + bitangent sign
8 |     float16_t tu, tv;
9 | };
10 | 
11 | struct Meshlet
12 | {
13 |     // vec3 center gives Meshlet 16-byte alignment here, which must match the C++ Meshlet struct declared with alignas(16)
14 |     vec3 center;
15 |     float radius;
16 |     int8_t cone_axis[3];
17 |     int8_t cone_cutoff;
18 | 
19 |     uint dataOffset;
20 |     uint baseVertex;
21 |     uint8_t vertexCount;
22 |     uint8_t triangleCount;
23 |     uint8_t shortRefs;
24 | };
25 | 
26 | struct CullData
27 | {
28 |     mat4 view;
29 | 
30 |     float P00, P11, znear, zfar; // symmetric projection parameters
31 |     float frustum[4]; // data for left/right/top/bottom frustum planes
32 |     float lodTarget; // lod target error at z=1
33 |     float pyramidWidth, pyramidHeight; // depth pyramid size in texels
34 | 
35 |     uint drawCount;
36 | 
37 |     int cullingEnabled;
38 |     int lodEnabled;
39 |     int occlusionEnabled;
40 |     int clusterOcclusionEnabled;
41 |     int clusterBackfaceEnabled;
42 | 
43 |     uint postPass;
44 | };
45 | 
46 | struct Globals
47 | {
48 |     mat4 projection;
49 |     CullData cullData;
50 |     float screenWidth, screenHeight;
51 | };
52 | 
53 | struct MeshLod
54 | {
55 |     uint indexOffset;
56 |     uint indexCount;
57 |     uint meshletOffset;
58 |     uint meshletCount;
59 |     float error;
60 | };
61 | 
62 | struct Mesh
63 | {
64 |     vec3 center;
65 |     float radius;
66 | 
67 |     uint vertexOffset;
68 |     uint vertexCount;
69 | 
70 |     uint lodCount;
71 |     MeshLod lods[8];
72 | };
73 | 
74 | struct Material
75 | {
76 |     uint albedoTexture;
77 |     uint normalTexture;
78 |     uint specularTexture;
79 |     uint emissiveTexture;
80 | 
81 |     vec4 diffuseFactor;
82 |     vec4 specularFactor;
83 |     vec3 emissiveFactor;
84 | };
85 | 
86 | struct MeshDraw
87 | {
88 |     vec3 position;
89 |     float scale;
90 |     vec4 orientation;
91 | 
92 |     uint meshIndex;
93 |     uint meshletVisibilityOffset;
94 |     uint postPass;
95 |     uint materialIndex;
96 | };
97 | 
98 | struct MeshDrawCommand
99 | {
100 |     uint drawId;
101 | 
102 |     // VkDrawIndexedIndirectCommand
103 |     uint indexCount;
104 |     uint instanceCount;
105 |     uint firstIndex;
106 |     uint vertexOffset;
107 |     uint firstInstance;
108 | };
109 | 
110 | struct
MeshTaskCommand 111 | { 112 | uint drawId; 113 | uint taskOffset; 114 | uint taskCount; 115 | uint lateDrawVisibility; 116 | uint meshletVisibilityOffset; 117 | }; 118 | 119 | struct MeshTaskPayload 120 | { 121 | uint clusterIndices[TASK_WGSIZE]; 122 | }; 123 | -------------------------------------------------------------------------------- /src/resources.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | struct Buffer 4 | { 5 | VkBuffer buffer; 6 | VkDeviceMemory memory; 7 | void* data; 8 | size_t size; 9 | }; 10 | 11 | struct Image 12 | { 13 | VkImage image; 14 | VkImageView imageView; 15 | VkDeviceMemory memory; 16 | }; 17 | 18 | VkImageMemoryBarrier2 imageBarrier(VkImage image, VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, VkImageLayout oldLayout, VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask, VkImageLayout newLayout, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, uint32_t baseMipLevel = 0, uint32_t levelCount = VK_REMAINING_MIP_LEVELS); 19 | VkBufferMemoryBarrier2 bufferBarrier(VkBuffer buffer, VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask); 20 | 21 | void pipelineBarrier(VkCommandBuffer commandBuffer, VkDependencyFlags dependencyFlags, size_t bufferBarrierCount, const VkBufferMemoryBarrier2* bufferBarriers, size_t imageBarrierCount, const VkImageMemoryBarrier2* imageBarriers); 22 | 23 | void invalidateBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stageMask, std::initializer_list colorImages, std::initializer_list depthImages = {}); 24 | 25 | void stageBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask); 26 | void stageBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 srcStageMask, VkPipelineStageFlags2 dstStageMask); 27 | void stageBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stageMask); 28 | 29 | void createBuffer(Buffer& result, VkDevice device, const VkPhysicalDeviceMemoryProperties& memoryProperties, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags memoryFlags); 30 | void uploadBuffer(VkDevice device, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const Buffer& buffer, const Buffer& scratch, const void* data, size_t size); 31 | void destroyBuffer(const Buffer& buffer, VkDevice device); 32 | 33 | VkDeviceAddress getBufferAddress(const Buffer& buffer, VkDevice device); 34 | 35 | VkImageView createImageView(VkDevice device, VkImage image, VkFormat format, uint32_t mipLevel, uint32_t levelCount); 36 | 37 | void createImage(Image& result, VkDevice device, const VkPhysicalDeviceMemoryProperties& memoryProperties, uint32_t width, uint32_t height, uint32_t mipLevels, VkFormat format, VkImageUsageFlags usage); 38 | void destroyImage(const Image& image, VkDevice device); 39 | 40 | uint32_t getImageMipLevels(uint32_t width, uint32_t height); 41 | 42 | VkSampler createSampler(VkDevice device, VkFilter filter, VkSamplerMipmapMode mipmapMode, VkSamplerAddressMode addressMode, VkSamplerReductionModeEXT reductionMode = VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT); 43 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | 3 | 
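# Typical configure/build invocation (matches .github/workflows/build.yml; the
# Vulkan-Headers path is an example):
#   cmake . -G Ninja -D VULKAN_HEADERS_INSTALL_DIR=~/Vulkan-Headers
#   ninja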
project(niagara)
4 | 
5 | option(SHADER_DEBUG "Embed debug information into SPIR-V shaders" OFF)
6 | 
7 | file(GLOB_RECURSE GLSL_SOURCE_FILES "src/shaders/*.glsl")
8 | file(GLOB_RECURSE GLSL_HEADER_FILES "src/shaders/*.h" "src/config.h")
9 | file(GLOB_RECURSE CPP_SOURCE_FILES "src/*.h" "src/*.cpp")
10 | 
11 | add_executable(niagara
12 |     ${CPP_SOURCE_FILES}
13 |     ${GLSL_SOURCE_FILES}
14 |     ${GLSL_HEADER_FILES})
15 | 
16 | set_target_properties(niagara PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)
17 | 
18 | target_compile_definitions(niagara PRIVATE GLFW_INCLUDE_NONE GLM_FORCE_XYZW_ONLY GLM_FORCE_QUAT_DATA_XYZW GLM_FORCE_QUAT_CTOR_XYZW)
19 | target_include_directories(niagara PRIVATE extern/fast_obj extern/cgltf extern/glm)
20 | 
21 | if(WIN32)
22 |     target_compile_definitions(niagara PRIVATE GLFW_EXPOSE_NATIVE_WIN32)
23 |     target_compile_definitions(niagara PRIVATE WIN32_LEAN_AND_MEAN NOMINMAX)
24 |     set(VOLK_STATIC_DEFINES VK_USE_PLATFORM_WIN32_KHR)
25 | endif()
26 | 
27 | if(UNIX)
28 |     set_source_files_properties(src/extern.cpp PROPERTIES COMPILE_FLAGS $<IF:$<CONFIG:Debug>,,-Os>)
29 | endif()
30 | 
31 | find_package(glfw3 QUIET)
32 | if(NOT glfw3_FOUND)
33 |     message("glfw3 not found, building from source")
34 |     set(GLFW_BUILD_DOCS OFF CACHE BOOL "" FORCE)
35 |     set(GLFW_BUILD_TESTS OFF CACHE BOOL "" FORCE)
36 |     set(GLFW_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
37 |     add_subdirectory(extern/glfw)
38 | endif()
39 | 
40 | add_subdirectory(extern/volk)
41 | add_subdirectory(extern/meshoptimizer)
42 | 
43 | target_link_libraries(niagara
44 |     PRIVATE
45 |     glfw
46 |     volk
47 |     meshoptimizer)
48 | 
49 | if(UNIX)
50 |     if(DEFINED ENV{VULKAN_SDK})
51 |         set(GLSL_VALIDATOR "$ENV{VULKAN_SDK}/bin/glslangValidator")
52 |     else()
53 |         set(GLSL_VALIDATOR "glslangValidator")
54 |     endif()
55 | elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "AMD64")
56 |     set(GLSL_VALIDATOR "$ENV{VULKAN_SDK}/Bin/glslangValidator.exe")
57 | else()
58 |     set(GLSL_VALIDATOR "$ENV{VULKAN_SDK}/Bin32/glslangValidator.exe")
59 | endif()
60 | 
61 | set(GLSL_FLAGS --target-env vulkan1.3)
62 | 
63 | if (SHADER_DEBUG)
64 |     list(APPEND GLSL_FLAGS -gVS)
65 | endif()
66 | 
67 | # Thanks to: https://gist.github.com/evilactually/a0d191701cb48f157b05be7f74d79396
68 | set(SPIRV_OUTPUT_DIR "${PROJECT_BINARY_DIR}/spirv/")
69 | foreach(GLSL ${GLSL_SOURCE_FILES})
70 |     get_filename_component(STEM ${GLSL} NAME_WLE)
71 |     set(SPIRV "${SPIRV_OUTPUT_DIR}${STEM}.spv")
72 |     add_custom_command(
73 |         OUTPUT ${SPIRV}
74 |         COMMAND ${CMAKE_COMMAND} -E make_directory "${SPIRV_OUTPUT_DIR}"
75 |         COMMAND ${GLSL_VALIDATOR} -V ${GLSL_FLAGS} --quiet ${GLSL} -o ${SPIRV}
76 |         DEPENDS ${GLSL} ${GLSL_HEADER_FILES})
77 |     list(APPEND SPIRV_BINARY_FILES ${SPIRV})
78 | endforeach()
79 | 
80 | add_custom_target(compile_shaders DEPENDS ${SPIRV_BINARY_FILES})
81 | add_dependencies(niagara compile_shaders)
82 | 
--------------------------------------------------------------------------------
/src/shaders/math.h:
--------------------------------------------------------------------------------
1 | // 2D Polyhedral Bounds of a Clipped, Perspective-Projected 3D Sphere. Michael Mara, Morgan McGuire.
2013 2 | bool projectSphere(vec3 c, float r, float znear, float P00, float P11, out vec4 aabb) 3 | { 4 | if (c.z < r + znear) 5 | return false; 6 | 7 | vec3 cr = c * r; 8 | float czr2 = c.z * c.z - r * r; 9 | 10 | float vx = sqrt(c.x * c.x + czr2); 11 | float minx = (vx * c.x - cr.z) / (vx * c.z + cr.x); 12 | float maxx = (vx * c.x + cr.z) / (vx * c.z - cr.x); 13 | 14 | float vy = sqrt(c.y * c.y + czr2); 15 | float miny = (vy * c.y - cr.z) / (vy * c.z + cr.y); 16 | float maxy = (vy * c.y + cr.z) / (vy * c.z - cr.y); 17 | 18 | aabb = vec4(minx * P00, miny * P11, maxx * P00, maxy * P11); 19 | aabb = aabb.xwzy * vec4(0.5f, -0.5f, 0.5f, -0.5f) + vec4(0.5f); // clip space -> uv space 20 | 21 | return true; 22 | } 23 | 24 | bool coneCull(vec3 center, float radius, vec3 cone_axis, float cone_cutoff, vec3 camera_position) 25 | { 26 | return dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius; 27 | } 28 | 29 | vec3 rotateQuat(vec3 v, vec4 q) 30 | { 31 | return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v); 32 | } 33 | 34 | // A Survey of Efficient Representations for Independent Unit Vectors 35 | vec2 encodeOct(vec3 v) 36 | { 37 | vec2 p = v.xy * (1.0 / (abs(v.x) + abs(v.y) + abs(v.z))); 38 | vec2 s = vec2((v.x >= 0.0) ? +1.0 : -1.0, (v.y >= 0.0) ? +1.0 : -1.0); 39 | vec2 r = (v.z <= 0.0) ? ((1.0 - abs(p.yx)) * s) : p; 40 | return r; 41 | } 42 | 43 | vec3 decodeOct(vec2 e) 44 | { 45 | // https://x.com/Stubbesaurus/status/937994790553227264 46 | vec3 v = vec3(e.xy, 1.0 - abs(e.x) - abs(e.y)); 47 | float t = max(-v.z, 0); 48 | v.xy += vec2(v.x >= 0 ? -t : t, v.y >= 0 ? -t : t); 49 | return normalize(v); 50 | } 51 | 52 | vec3 tosrgb(vec3 c) 53 | { 54 | return pow(c.xyz, vec3(1.0 / 2.2)); 55 | } 56 | 57 | vec4 tosrgb(vec4 c) 58 | { 59 | return vec4(pow(c.xyz, vec3(1.0 / 2.2)), c.w); 60 | } 61 | 62 | vec3 fromsrgb(vec3 c) 63 | { 64 | return pow(c.xyz, vec3(2.2)); 65 | } 66 | 67 | vec4 fromsrgb(vec4 c) 68 | { 69 | return vec4(pow(c.xyz, vec3(2.2)), c.w); 70 | } 71 | 72 | // Optimized filmic operator by Jim Hejl and Richard Burgess-Dawson 73 | // http://filmicworlds.com/blog/filmic-tonemapping-operators/ 74 | vec3 tonemap(vec3 c) 75 | { 76 | vec3 x = max(vec3(0), c - 0.004); 77 | return (x * (6.2 * x + .5)) / (x * (6.2 * x + 1.7) + 0.06); 78 | } 79 | 80 | // Gradient noise from Jorge Jimenez's presentation: 81 | // http://www.iryoku.com/next-generation-post-processing-in-call-of-duty-advanced-warfare 82 | float gradientNoise(vec2 uv) 83 | { 84 | return fract(52.9829189 * fract(dot(uv, vec2(0.06711056, 0.00583715)))); 85 | } 86 | 87 | void unpackTBN(uint np, uint tp, out vec3 normal, out vec4 tangent) 88 | { 89 | normal = ((ivec3(np) >> ivec3(0, 10, 20)) & ivec3(1023)) / 511.0 - 1.0; 90 | tangent.xyz = decodeOct(((ivec2(tp) >> ivec2(0, 8)) & ivec2(255)) / 127.0 - 1.0); 91 | tangent.w = (np & (1 << 30)) != 0 ? 
-1.0 : 1.0; 92 | } 93 | -------------------------------------------------------------------------------- /src/shaders/mesh.frag.glsl: -------------------------------------------------------------------------------- 1 | #version 460 2 | 3 | #extension GL_EXT_shader_16bit_storage: require 4 | #extension GL_EXT_shader_8bit_storage: require 5 | #extension GL_GOOGLE_include_directive: require 6 | #extension GL_EXT_nonuniform_qualifier: require 7 | 8 | #include "mesh.h" 9 | #include "math.h" 10 | 11 | layout (constant_id = 2) const int POST = 0; 12 | 13 | #define DEBUG 0 14 | 15 | layout(push_constant) uniform block 16 | { 17 | Globals globals; 18 | }; 19 | 20 | layout(binding = 1) readonly buffer Draws 21 | { 22 | MeshDraw draws[]; 23 | }; 24 | 25 | layout(location = 0) out vec4 gbuffer[2]; 26 | 27 | layout(location = 0) in flat uint drawId; 28 | layout(location = 1) in vec2 uv; 29 | layout(location = 2) in vec3 normal; 30 | layout(location = 3) in vec4 tangent; 31 | layout(location = 4) in vec3 wpos; 32 | 33 | layout(binding = 7) uniform sampler textureSampler; 34 | 35 | layout(binding = 8) readonly buffer Materials 36 | { 37 | Material materials[]; 38 | }; 39 | 40 | layout(binding = 0, set = 1) uniform texture2D textures[]; 41 | 42 | #define SAMP(id) sampler2D(textures[nonuniformEXT(id)], textureSampler) 43 | 44 | uint hash(uint a) 45 | { 46 | a = (a+0x7ed55d16) + (a<<12); 47 | a = (a^0xc761c23c) ^ (a>>19); 48 | a = (a+0x165667b1) + (a<<5); 49 | a = (a+0xd3a2646c) ^ (a<<9); 50 | a = (a+0xfd7046c5) + (a<<3); 51 | a = (a^0xb55a4f09) ^ (a>>16); 52 | return a; 53 | } 54 | 55 | void main() 56 | { 57 | MeshDraw meshDraw = draws[drawId]; 58 | Material material = materials[meshDraw.materialIndex]; 59 | 60 | float deband = gradientNoise(gl_FragCoord.xy) * 2 - 1; 61 | 62 | vec4 albedo = material.diffuseFactor; 63 | if (material.albedoTexture > 0) 64 | albedo *= fromsrgb(texture(SAMP(material.albedoTexture), uv)); 65 | 66 | vec3 nmap = vec3(0, 0, 1); 67 | if (material.normalTexture > 0) 68 | nmap = texture(SAMP(material.normalTexture), uv).rgb * 2 - 1; 69 | 70 | vec4 specgloss = material.specularFactor; 71 | if (material.specularTexture > 0) 72 | specgloss *= fromsrgb(texture(SAMP(material.specularTexture), uv)); 73 | 74 | vec3 emissive = material.emissiveFactor; 75 | if (material.emissiveTexture > 0) 76 | emissive *= fromsrgb(texture(SAMP(material.emissiveTexture), uv).rgb); 77 | 78 | vec3 bitangent = cross(normal, tangent.xyz) * tangent.w; 79 | 80 | vec3 nrm = normalize(nmap.r * tangent.xyz + nmap.g * bitangent + nmap.b * normal); 81 | 82 | float emissivef = dot(emissive, vec3(0.3, 0.6, 0.1)) / (dot(albedo.rgb, vec3(0.3, 0.6, 0.1)) + 1e-3); 83 | 84 | // TODO: reconstruct metalness from specular texture 85 | gbuffer[0] = vec4(tosrgb(albedo).rgb, log2(1 + emissivef) / 5); 86 | gbuffer[1] = vec4(encodeOct(nrm) * 0.5 + 0.5 + deband * (0.5 / 1023), specgloss.a, 0.0); 87 | 88 | if (POST > 0 && albedo.a < 0.5) 89 | discard; 90 | 91 | #if DEBUG 92 | uint mhash = hash(drawId); 93 | gbuffer[0] = vec4(float(mhash & 255), float((mhash >> 8) & 255), float((mhash >> 16) & 255), 255) / 255.0; 94 | #endif 95 | } 96 | -------------------------------------------------------------------------------- /src/scene.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "math.h" 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | struct alignas(16) Meshlet 11 | { 12 | vec3 center; 13 | float radius; 14 | int8_t cone_axis[3]; 15 | int8_t cone_cutoff; 16 | 17 | 
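// note: this layout is mirrored by the GLSL Meshlet struct in shaders/mesh.h
// (where the trailing padding byte below is implicit); keep the two in sync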
uint32_t dataOffset; // dataOffset..dataOffset+vertexCount-1 stores vertex indices, we store indices packed in 4b units after that
18 |     uint32_t baseVertex;
19 |     uint8_t vertexCount;
20 |     uint8_t triangleCount;
21 |     uint8_t shortRefs;
22 |     uint8_t padding;
23 | };
24 | 
25 | struct alignas(16) Material
26 | {
27 |     int albedoTexture;
28 |     int normalTexture;
29 |     int specularTexture;
30 |     int emissiveTexture;
31 | 
32 |     vec4 diffuseFactor;
33 |     vec4 specularFactor;
34 |     vec3 emissiveFactor;
35 | };
36 | 
37 | struct alignas(16) MeshDraw
38 | {
39 |     vec3 position;
40 |     float scale;
41 |     quat orientation;
42 | 
43 |     uint32_t meshIndex;
44 |     uint32_t meshletVisibilityOffset;
45 |     uint32_t postPass;
46 |     uint32_t materialIndex;
47 | };
48 | 
49 | struct Vertex
50 | {
51 |     uint16_t vx, vy, vz;
52 |     uint16_t tp; // packed tangent: 8-8 octahedral
53 |     uint32_t np; // packed normal: 10-10-10-2 vector + bitangent sign
54 |     uint16_t tu, tv;
55 | };
56 | 
57 | struct MeshLod
58 | {
59 |     uint32_t indexOffset;
60 |     uint32_t indexCount;
61 |     uint32_t meshletOffset;
62 |     uint32_t meshletCount;
63 |     float error;
64 | };
65 | 
66 | struct alignas(16) Mesh
67 | {
68 |     vec3 center;
69 |     float radius;
70 | 
71 |     uint32_t vertexOffset;
72 |     uint32_t vertexCount;
73 | 
74 |     uint32_t lodCount;
75 |     MeshLod lods[8];
76 | };
77 | 
78 | struct Geometry
79 | {
80 |     // TODO: remove these vectors - they are just scratch copies that waste space
81 |     std::vector<Vertex> vertices;
82 |     std::vector<uint32_t> indices;
83 |     std::vector<Meshlet> meshlets;
84 |     std::vector<uint32_t> meshletdata;
85 |     std::vector<uint16_t> meshletvtx0; // 4 position components per vertex referenced by meshlets in lod 0, packed tightly
86 |     std::vector<Mesh> meshes;
87 | };
88 | 
89 | struct Camera
90 | {
91 |     vec3 position;
92 |     quat orientation;
93 |     float fovY;
94 |     float znear;
95 | };
96 | 
97 | struct Keyframe
98 | {
99 |     vec3 translation;
100 |     float scale;
101 |     quat rotation;
102 | };
103 | 
104 | struct Animation
105 | {
106 |     uint32_t drawIndex;
107 | 
108 |     float startTime;
109 |     float period;
110 |     std::vector<Keyframe> keyframes;
111 | };
112 | 
113 | bool loadMesh(Geometry& geometry, const char* path, bool buildMeshlets, bool fast = false, bool clrt = false);
114 | bool loadScene(Geometry& geometry, std::vector<Material>& materials, std::vector<MeshDraw>& draws, std::vector<std::string>& texturePaths, std::vector<Animation>& animations, Camera& camera, vec3& sunDirection, const char* path, bool buildMeshlets, bool fast = false, bool clrt = false);
115 | 
116 | bool saveSceneCache(const char* path, const Geometry& geometry, const std::vector<Material>& materials, const std::vector<MeshDraw>& draws, const std::vector<std::string>& texturePaths, const Camera& camera, const vec3& sunDirection, bool clrtMode, bool compressed, bool verbose);
117 | bool loadSceneCache(const char* path, Geometry& geometry, std::vector<Material>& materials, std::vector<MeshDraw>& draws, std::vector<std::string>& texturePaths, Camera& camera, vec3& sunDirection, bool clrtMode);
118 | 
--------------------------------------------------------------------------------
/src/shaders.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include 
4 | 
5 | struct Shader
6 | {
7 |     std::string name;
8 | 
9 |     std::vector<uint32_t> spirv;
10 |     VkShaderStageFlagBits stage;
11 | 
12 |     VkDescriptorType resourceTypes[32];
13 |     uint32_t resourceMask;
14 | 
15 |     uint32_t localSizeX;
16 |     uint32_t localSizeY;
17 |     uint32_t localSizeZ;
18 | 
19 |     bool usesPushConstants;
20 |     bool usesDescriptorArray;
21 | };
22 | 
23 | struct ShaderSet
24 | {
25 |     std::vector<Shader> shaders;
26 | 
27 |     const Shader& operator[](const char* name) const;
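    // usage sketch (names illustrative): const Shader& cull = shaderSet["drawcull.comp"];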
28 | }; 29 | 30 | struct Program 31 | { 32 | VkPipelineBindPoint bindPoint; 33 | VkPipelineLayout layout; 34 | VkDescriptorSetLayout setLayout; 35 | VkDescriptorUpdateTemplate updateTemplate; 36 | 37 | VkShaderStageFlags pushConstantStages; 38 | uint32_t pushConstantSize; 39 | uint32_t pushDescriptorCount; 40 | 41 | uint32_t localSizeX; 42 | uint32_t localSizeY; 43 | uint32_t localSizeZ; 44 | 45 | const Shader* shaders[8]; 46 | size_t shaderCount; 47 | }; 48 | 49 | bool loadShader(Shader& shader, const char* path); 50 | bool loadShader(Shader& shader, const char* base, const char* path); 51 | bool loadShaders(ShaderSet& shaders, const char* base, const char* path); 52 | 53 | using Shaders = std::initializer_list; 54 | using Constants = std::initializer_list; 55 | 56 | VkPipeline createGraphicsPipeline(VkDevice device, VkPipelineCache pipelineCache, const VkPipelineRenderingCreateInfo& renderingInfo, const Program& program, Constants constants = {}); 57 | VkPipeline createComputePipeline(VkDevice device, VkPipelineCache pipelineCache, const Program& program, Constants constants = {}); 58 | 59 | Program createProgram(VkDevice device, VkPipelineBindPoint bindPoint, Shaders shaders, size_t pushConstantSize, VkDescriptorSetLayout arrayLayout = nullptr); 60 | void destroyProgram(VkDevice device, const Program& program); 61 | 62 | VkDescriptorSetLayout createDescriptorArrayLayout(VkDevice device); 63 | std::pair createDescriptorArray(VkDevice device, VkDescriptorSetLayout layout, uint32_t descriptorCount); 64 | 65 | inline uint32_t getGroupCount(uint32_t threadCount, uint32_t localSize) 66 | { 67 | return (threadCount + localSize - 1) / localSize; 68 | } 69 | 70 | struct DescriptorInfo 71 | { 72 | union 73 | { 74 | VkDescriptorImageInfo image; 75 | VkDescriptorBufferInfo buffer; 76 | VkAccelerationStructureKHR accelerationStructure; 77 | }; 78 | 79 | DescriptorInfo() 80 | { 81 | } 82 | 83 | DescriptorInfo(VkAccelerationStructureKHR structure) 84 | { 85 | accelerationStructure = structure; 86 | } 87 | 88 | DescriptorInfo(VkImageView imageView, VkImageLayout imageLayout = VK_IMAGE_LAYOUT_GENERAL) 89 | { 90 | image.sampler = VK_NULL_HANDLE; 91 | image.imageView = imageView; 92 | image.imageLayout = imageLayout; 93 | } 94 | 95 | DescriptorInfo(VkSampler sampler) 96 | { 97 | image.sampler = sampler; 98 | image.imageView = VK_NULL_HANDLE; 99 | image.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; 100 | } 101 | 102 | DescriptorInfo(VkSampler sampler, VkImageView imageView, VkImageLayout imageLayout = VK_IMAGE_LAYOUT_GENERAL) 103 | { 104 | image.sampler = sampler; 105 | image.imageView = imageView; 106 | image.imageLayout = imageLayout; 107 | } 108 | 109 | DescriptorInfo(VkBuffer buffer_, VkDeviceSize offset, VkDeviceSize range) 110 | { 111 | buffer.buffer = buffer_; 112 | buffer.offset = offset; 113 | buffer.range = range; 114 | } 115 | 116 | DescriptorInfo(VkBuffer buffer_) 117 | { 118 | buffer.buffer = buffer_; 119 | buffer.offset = 0; 120 | buffer.range = VK_WHOLE_SIZE; 121 | } 122 | }; 123 | -------------------------------------------------------------------------------- /src/shaders/shadow.comp.glsl: -------------------------------------------------------------------------------- 1 | #version 460 2 | 3 | #extension GL_EXT_ray_query: require 4 | #extension GL_EXT_shader_16bit_storage: require 5 | #extension GL_EXT_shader_8bit_storage: require 6 | #extension GL_EXT_nonuniform_qualifier: require 7 | 8 | #extension GL_GOOGLE_include_directive: require 9 | 10 | #include "math.h" 11 | #include "mesh.h" 12 | 13 
| layout (constant_id = 0) const int QUALITY = 0; 14 | 15 | layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; 16 | 17 | struct ShadowData 18 | { 19 | vec3 sunDirection; 20 | float sunJitter; 21 | 22 | mat4 inverseViewProjection; 23 | 24 | vec2 imageSize; 25 | int checkerboard; 26 | }; 27 | 28 | layout(push_constant) uniform block 29 | { 30 | ShadowData shadowData; 31 | }; 32 | 33 | layout(binding = 0) uniform writeonly image2D outImage; 34 | 35 | layout(binding = 1) uniform sampler2D depthImage; 36 | layout(binding = 2) uniform accelerationStructureEXT tlas; 37 | 38 | layout(binding = 3) readonly buffer Draws 39 | { 40 | MeshDraw draws[]; 41 | }; 42 | 43 | layout(binding = 4) readonly buffer Meshes 44 | { 45 | Mesh meshes[]; 46 | }; 47 | 48 | layout(binding = 5) readonly buffer Materials 49 | { 50 | Material materials[]; 51 | }; 52 | 53 | layout(binding = 6) readonly buffer Vertices 54 | { 55 | Vertex vertices[]; 56 | }; 57 | 58 | layout(binding = 7) readonly buffer Indices 59 | { 60 | uint indices[]; 61 | }; 62 | 63 | layout(binding = 8) uniform sampler textureSampler; 64 | 65 | layout(binding = 0, set = 1) uniform texture2D textures[]; 66 | 67 | #define SAMP(id) sampler2D(textures[nonuniformEXT(id)], textureSampler) 68 | 69 | bool shadowTrace(vec3 wpos, vec3 dir, uint rayflags) 70 | { 71 | rayQueryEXT rq; 72 | rayQueryInitializeEXT(rq, tlas, rayflags, 0xff, wpos, 1e-2, dir, 1e3); 73 | rayQueryProceedEXT(rq); 74 | return rayQueryGetIntersectionTypeEXT(rq, true) != gl_RayQueryCommittedIntersectionNoneEXT; 75 | } 76 | 77 | bool shadowTraceTransparent(vec3 wpos, vec3 dir, uint rayflags) 78 | { 79 | rayQueryEXT rq; 80 | rayQueryInitializeEXT(rq, tlas, rayflags, 0xff, wpos, 1e-2, dir, 1e3); 81 | while (rayQueryProceedEXT(rq)) 82 | { 83 | int objid = rayQueryGetIntersectionInstanceIdEXT(rq, false); 84 | int triid = rayQueryGetIntersectionPrimitiveIndexEXT(rq, false); 85 | vec2 bary = rayQueryGetIntersectionBarycentricsEXT(rq, false); 86 | 87 | MeshDraw draw = draws[objid]; 88 | Material material = materials[draw.materialIndex]; 89 | Mesh mesh = meshes[draw.meshIndex]; 90 | 91 | uint vertexOffset = mesh.vertexOffset; 92 | uint indexOffset = mesh.lods[0].indexOffset; 93 | 94 | // TODO: It might be worth repacking some of this data for RT to reduce indirections 95 | // However, attempting to do this gained us zero performance back, so maybe not? 
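// note: passing 'false' to the rayQueryGetIntersection* calls above reads the
// candidate intersection being evaluated by rayQueryProceedEXT; confirming it
// below commits the hit, which terminates traversal due to TerminateOnFirstHit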
96 | uint tria = indices[indexOffset + triid * 3 + 0]; 97 | uint trib = indices[indexOffset + triid * 3 + 1]; 98 | uint tric = indices[indexOffset + triid * 3 + 2]; 99 | 100 | vec2 uva = vec2(vertices[vertexOffset + tria].tu, vertices[vertexOffset + tria].tv); 101 | vec2 uvb = vec2(vertices[vertexOffset + trib].tu, vertices[vertexOffset + trib].tv); 102 | vec2 uvc = vec2(vertices[vertexOffset + tric].tu, vertices[vertexOffset + tric].tv); 103 | 104 | vec2 uv = uva * (1 - bary.x - bary.y) + uvb * bary.x + uvc * bary.y; 105 | 106 | float alpha = 1.0; 107 | if (material.albedoTexture > 0) 108 | alpha = textureLod(SAMP(material.albedoTexture), uv, 0).a; 109 | 110 | if (alpha >= 0.5) 111 | rayQueryConfirmIntersectionEXT(rq); 112 | } 113 | return rayQueryGetIntersectionTypeEXT(rq, true) != gl_RayQueryCommittedIntersectionNoneEXT; 114 | } 115 | 116 | void main() 117 | { 118 | ivec2 pos = ivec2(gl_GlobalInvocationID.xy); 119 | 120 | if (shadowData.checkerboard > 0) 121 | { 122 | // checkerboard 123 | pos.x *= 2; 124 | pos.x += (pos.y ^ shadowData.checkerboard) & 1; 125 | } 126 | 127 | vec2 uv = (vec2(pos) + 0.5) / shadowData.imageSize; 128 | float depth = texture(depthImage, uv).r; 129 | 130 | vec4 clip = vec4(uv.x * 2 - 1, 1 - uv.y * 2, depth, 1); 131 | vec4 wposh = shadowData.inverseViewProjection * clip; 132 | vec3 wpos = wposh.xyz / wposh.w; 133 | 134 | vec3 dir = shadowData.sunDirection; 135 | 136 | // TODO: a lot more tuning required here 137 | // TODO: this should actually be doing cone sampling, not random XZ offsets 138 | float dir0 = gradientNoise(vec2(pos.xy)); 139 | float dir1 = gradientNoise(vec2(pos.yx)); 140 | dir.x += (dir0 * 2 - 1) * shadowData.sunJitter; 141 | dir.z += (dir1 * 2 - 1) * shadowData.sunJitter; 142 | dir = normalize(dir); 143 | 144 | // On AMDVLK + RDNA3, two shadow traces are faster in practice than one; however, on NV and radv one trace is noticeably faster 145 | bool shadowhit = QUALITY == 0 146 | ? shadowTrace(wpos, dir, gl_RayFlagsTerminateOnFirstHitEXT | gl_RayFlagsCullNoOpaqueEXT) 147 | : shadowTraceTransparent(wpos, dir, gl_RayFlagsTerminateOnFirstHitEXT); 148 | 149 | float shadow = shadowhit ? 
0.0 : 1.0; 150 | 151 | imageStore(outImage, pos, vec4(shadow, 0, 0, 0)); 152 | } 153 | -------------------------------------------------------------------------------- /src/shaders/clustercull.comp.glsl: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage: require 4 | #extension GL_EXT_shader_8bit_storage: require 5 | 6 | #extension GL_GOOGLE_include_directive: require 7 | 8 | #include "mesh.h" 9 | #include "math.h" 10 | 11 | layout (constant_id = 0) const bool LATE = false; 12 | 13 | #define CULL TASK_CULL 14 | 15 | layout(local_size_x = TASK_WGSIZE, local_size_y = 1, local_size_z = 1) in; 16 | 17 | layout(push_constant) uniform block 18 | { 19 | CullData cullData; 20 | }; 21 | 22 | layout(binding = 0) readonly buffer TaskCommands 23 | { 24 | MeshTaskCommand taskCommands[]; 25 | }; 26 | 27 | layout(binding = 1) readonly buffer Draws 28 | { 29 | MeshDraw draws[]; 30 | }; 31 | 32 | layout(binding = 2) readonly buffer Meshlets 33 | { 34 | Meshlet meshlets[]; 35 | }; 36 | 37 | layout(binding = 3) buffer MeshletVisibility 38 | { 39 | uint meshletVisibility[]; 40 | }; 41 | 42 | layout(binding = 4) uniform sampler2D depthPyramid; 43 | 44 | layout(binding = 5) writeonly buffer ClusterIndices 45 | { 46 | uint clusterIndices[]; 47 | }; 48 | 49 | layout(binding = 6) buffer ClusterCount 50 | { 51 | uint clusterCount; 52 | }; 53 | 54 | void main() 55 | { 56 | // we convert 2D index to 1D index using a fixed *64 factor, see tasksubmit.comp.glsl 57 | uint commandId = gl_WorkGroupID.x * 64 + gl_WorkGroupID.y; 58 | MeshTaskCommand command = taskCommands[commandId]; 59 | uint drawId = command.drawId; 60 | MeshDraw meshDraw = draws[drawId]; 61 | 62 | uint lateDrawVisibility = command.lateDrawVisibility; 63 | uint taskCount = command.taskCount; 64 | 65 | uint mgi = gl_LocalInvocationID.x; 66 | uint mi = mgi + command.taskOffset; 67 | uint mvi = mgi + command.meshletVisibilityOffset; 68 | 69 | #if CULL 70 | vec3 center = rotateQuat(meshlets[mi].center, meshDraw.orientation) * meshDraw.scale + meshDraw.position; 71 | center = (cullData.view * vec4(center, 1)).xyz; 72 | 73 | float radius = meshlets[mi].radius * meshDraw.scale; 74 | 75 | vec3 cone_axis = rotateQuat(vec3(int(meshlets[mi].cone_axis[0]) / 127.0, int(meshlets[mi].cone_axis[1]) / 127.0, int(meshlets[mi].cone_axis[2]) / 127.0), meshDraw.orientation); 76 | cone_axis = mat3(cullData.view) * cone_axis; 77 | float cone_cutoff = int(meshlets[mi].cone_cutoff) / 127.0; 78 | 79 | bool valid = mgi < taskCount; 80 | bool visible = valid; 81 | bool skip = false; 82 | 83 | if (cullData.clusterOcclusionEnabled == 1 && cullData.postPass == 0) 84 | { 85 | uint meshletVisibilityBit = meshletVisibility[mvi >> 5] & (1u << (mvi & 31)); 86 | 87 | // in early pass, we have to *only* render clusters that were visible last frame, to build a reasonable depth pyramid out of visible triangles 88 | if (!LATE && meshletVisibilityBit == 0) 89 | visible = false; 90 | 91 | // in late pass, we have to process objects visible last frame again (after rendering them in early pass) 92 | // in early pass, per above test, we render previously visible clusters 93 | // in late pass, we must invert the above test to *not* render previously visible clusters of previously visible objects because they were rendered in early pass. 
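// decision summary for the occlusion protocol above (applies when
// clusterOcclusionEnabled == 1 and postPass == 0):
//   early pass: render only clusters whose visibility bit was set last frame
//   late pass:  skip clusters already rendered in early pass (bit set and the
//               parent draw was visible); everything else runs the tests below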
94 | if (LATE && lateDrawVisibility == 1 && meshletVisibilityBit != 0) 95 | skip = true; 96 | } 97 | 98 | // backface cone culling 99 | visible = visible && (cullData.clusterBackfaceEnabled == 0 || !coneCull(center, radius, cone_axis, cone_cutoff, vec3(0, 0, 0))); 100 | // the left/top/right/bottom plane culling utilizes frustum symmetry to cull against two planes at the same time 101 | visible = visible && center.z * cullData.frustum[1] - abs(center.x) * cullData.frustum[0] > -radius; 102 | visible = visible && center.z * cullData.frustum[3] - abs(center.y) * cullData.frustum[2] > -radius; 103 | // the near/far plane culling uses camera space Z directly 104 | // note: because we use an infinite projection matrix, this may cull meshlets that belong to a mesh that straddles the "far" plane; we could optionally remove the far check to be conservative 105 | visible = visible && center.z + radius > cullData.znear && center.z - radius < cullData.zfar; 106 | 107 | if (LATE && cullData.clusterOcclusionEnabled == 1 && visible) 108 | { 109 | vec4 aabb; 110 | if (projectSphere(center, radius, cullData.znear, cullData.P00, cullData.P11, aabb)) 111 | { 112 | float width = (aabb.z - aabb.x) * cullData.pyramidWidth; 113 | float height = (aabb.w - aabb.y) * cullData.pyramidHeight; 114 | 115 | float level = floor(log2(max(width, height))); 116 | 117 | // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad 118 | float depth = textureLod(depthPyramid, (aabb.xy + aabb.zw) * 0.5, level).x; 119 | float depthSphere = cullData.znear / (center.z - radius); 120 | 121 | visible = visible && depthSphere > depth; 122 | } 123 | } 124 | 125 | if (LATE && cullData.clusterOcclusionEnabled == 1 && valid) 126 | { 127 | if (visible) 128 | atomicOr(meshletVisibility[mvi >> 5], 1u << (mvi & 31)); 129 | else 130 | atomicAnd(meshletVisibility[mvi >> 5], ~(1u << (mvi & 31))); 131 | } 132 | 133 | if (visible && !skip) 134 | { 135 | uint index = atomicAdd(clusterCount, 1); // TODO: potentially slow global atomic 136 | 137 | if (index < CLUSTER_LIMIT) 138 | clusterIndices[index] = commandId | (mgi << 24); 139 | } 140 | #else 141 | if (mgi < taskCount) 142 | { 143 | uint index = atomicAdd(clusterCount, 1); // TODO: potentially slow global atomic 144 | 145 | if (index < CLUSTER_LIMIT) 146 | clusterIndices[index] = commandId | (mgi << 24); 147 | } 148 | #endif 149 | } 150 | -------------------------------------------------------------------------------- /src/shaders/meshlet.task.glsl: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage: require 4 | #extension GL_EXT_shader_8bit_storage: require 5 | #extension GL_EXT_mesh_shader: require 6 | 7 | #extension GL_GOOGLE_include_directive: require 8 | 9 | #include "mesh.h" 10 | #include "math.h" 11 | 12 | layout (constant_id = 0) const bool LATE = false; 13 | 14 | #define CULL TASK_CULL 15 | 16 | layout(local_size_x = TASK_WGSIZE, local_size_y = 1, local_size_z = 1) in; 17 | 18 | layout(push_constant) uniform block 19 | { 20 | Globals globals; 21 | }; 22 | 23 | layout(binding = 0) readonly buffer TaskCommands 24 | { 25 | MeshTaskCommand taskCommands[]; 26 | }; 27 | 28 | layout(binding = 1) readonly buffer Draws 29 | { 30 | MeshDraw draws[]; 31 | }; 32 | 33 | layout(binding = 2) readonly buffer Meshlets 34 | { 35 | Meshlet meshlets[]; 36 | }; 37 | 38 | layout(binding = 5) buffer MeshletVisibility 39 | { 40 | uint meshletVisibility[]; 41 | }; 42 | 43 | 
layout(binding = 6) uniform sampler2D depthPyramid; 44 | 45 | taskPayloadSharedEXT MeshTaskPayload payload; 46 | 47 | #if CULL 48 | shared int sharedCount; 49 | #endif 50 | 51 | void main() 52 | { 53 | // we convert 2D index to 1D index using a fixed *64 factor, see tasksubmit.comp.glsl 54 | uint commandId = gl_WorkGroupID.x * 64 + gl_WorkGroupID.y; 55 | MeshTaskCommand command = taskCommands[commandId]; 56 | uint drawId = command.drawId; 57 | MeshDraw meshDraw = draws[drawId]; 58 | 59 | uint lateDrawVisibility = command.lateDrawVisibility; 60 | uint taskCount = command.taskCount; 61 | 62 | uint mgi = gl_LocalInvocationID.x; 63 | uint mi = mgi + command.taskOffset; 64 | uint mvi = mgi + command.meshletVisibilityOffset; 65 | 66 | #if CULL 67 | sharedCount = 0; 68 | barrier(); // for sharedCount 69 | 70 | CullData cullData = globals.cullData; 71 | 72 | vec3 center = rotateQuat(meshlets[mi].center, meshDraw.orientation) * meshDraw.scale + meshDraw.position; 73 | center = (cullData.view * vec4(center, 1)).xyz; 74 | 75 | float radius = meshlets[mi].radius * meshDraw.scale; 76 | vec3 cone_axis = rotateQuat(vec3(int(meshlets[mi].cone_axis[0]) / 127.0, int(meshlets[mi].cone_axis[1]) / 127.0, int(meshlets[mi].cone_axis[2]) / 127.0), meshDraw.orientation); 77 | cone_axis = mat3(cullData.view) * cone_axis; 78 | 79 | float cone_cutoff = int(meshlets[mi].cone_cutoff) / 127.0; 80 | 81 | bool valid = mgi < taskCount; 82 | bool visible = valid; 83 | bool skip = false; 84 | 85 | if (cullData.clusterOcclusionEnabled == 1 && cullData.postPass == 0) 86 | { 87 | uint meshletVisibilityBit = meshletVisibility[mvi >> 5] & (1u << (mvi & 31)); 88 | 89 | // in early pass, we have to *only* render clusters that were visible last frame, to build a reasonable depth pyramid out of visible triangles 90 | if (!LATE && meshletVisibilityBit == 0) 91 | visible = false; 92 | 93 | // in late pass, we have to process objects visible last frame again (after rendering them in early pass) 94 | // in early pass, per above test, we render previously visible clusters 95 | // in late pass, we must invert the above test to *not* render previously visible clusters of previously visible objects because they were rendered in early pass. 
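	// [editor's note, not part of the original shader] meshletVisibility is a bitfield with one bit per
	// meshlet, packed 32 bits to a uint: mvi >> 5 selects the 32-bit word and mvi & 31 the bit inside it.
	// Worked example: mvi = 70 -> word 70 >> 5 = 2, bit 70 & 31 = 6, so the lookup at the top of this
	// block reads meshletVisibility[2] & (1u << 6), and the atomicOr/atomicAnd updates below flip that
	// same bit.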
96 | if (LATE && lateDrawVisibility == 1 && meshletVisibilityBit != 0) 97 | skip = true; 98 | } 99 | 100 | // backface cone culling 101 | visible = visible && (cullData.clusterBackfaceEnabled == 0 || !coneCull(center, radius, cone_axis, cone_cutoff, vec3(0, 0, 0))); 102 | // the left/top/right/bottom plane culling utilizes frustum symmetry to cull against two planes at the same time 103 | visible = visible && center.z * cullData.frustum[1] - abs(center.x) * cullData.frustum[0] > -radius; 104 | visible = visible && center.z * cullData.frustum[3] - abs(center.y) * cullData.frustum[2] > -radius; 105 | // the near/far plane culling uses camera space Z directly 106 | // note: because we use an infinite projection matrix, this may cull meshlets that belong to a mesh that straddles the "far" plane; we could optionally remove the far check to be conservative 107 | visible = visible && center.z + radius > cullData.znear && center.z - radius < cullData.zfar; 108 | 109 | if (LATE && cullData.clusterOcclusionEnabled == 1 && visible) 110 | { 111 | vec4 aabb; 112 | if (projectSphere(center, radius, cullData.znear, cullData.P00, cullData.P11, aabb)) 113 | { 114 | float width = (aabb.z - aabb.x) * cullData.pyramidWidth; 115 | float height = (aabb.w - aabb.y) * cullData.pyramidHeight; 116 | 117 | float level = floor(log2(max(width, height))); 118 | 119 | // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad 120 | float depth = textureLod(depthPyramid, (aabb.xy + aabb.zw) * 0.5, level).x; 121 | float depthSphere = cullData.znear / (center.z - radius); 122 | 123 | visible = visible && depthSphere > depth; 124 | } 125 | } 126 | 127 | if (LATE && cullData.clusterOcclusionEnabled == 1 && valid) 128 | { 129 | if (visible) 130 | atomicOr(meshletVisibility[mvi >> 5], 1u << (mvi & 31)); 131 | else 132 | atomicAnd(meshletVisibility[mvi >> 5], ~(1u << (mvi & 31))); 133 | } 134 | 135 | if (visible && !skip) 136 | { 137 | uint index = atomicAdd(sharedCount, 1); 138 | 139 | payload.clusterIndices[index] = commandId | (mgi << 24); 140 | } 141 | 142 | barrier(); // for sharedCount 143 | EmitMeshTasksEXT(sharedCount, 1, 1); 144 | #else 145 | payload.clusterIndices[gl_LocalInvocationID.x] = commandId | (mgi << 24); 146 | 147 | EmitMeshTasksEXT(taskCount, 1, 1); 148 | #endif 149 | } 150 | -------------------------------------------------------------------------------- /src/shaders/drawcull.comp.glsl: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage: require 4 | #extension GL_EXT_shader_8bit_storage: require 5 | 6 | #extension GL_GOOGLE_include_directive: require 7 | 8 | #include "mesh.h" 9 | #include "math.h" 10 | 11 | layout (constant_id = 0) const bool LATE = false; 12 | layout (constant_id = 1) const bool TASK = false; 13 | 14 | layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; 15 | 16 | layout(push_constant) uniform block 17 | { 18 | CullData cullData; 19 | }; 20 | 21 | layout(binding = 0) readonly buffer Draws 22 | { 23 | MeshDraw draws[]; 24 | }; 25 | 26 | layout(binding = 1) readonly buffer Meshes 27 | { 28 | Mesh meshes[]; 29 | }; 30 | 31 | layout(binding = 2) writeonly buffer DrawCommands 32 | { 33 | MeshDrawCommand drawCommands[]; 34 | }; 35 | 36 | layout(binding = 2) writeonly buffer TaskCommands 37 | { 38 | MeshTaskCommand taskCommands[]; 39 | }; 40 | 41 | layout(binding = 3) buffer CommandCount 42 | { 43 | uint commandCount; 44 | }; 45 | 46 | layout(binding = 
4) buffer DrawVisibility 47 | { 48 | uint drawVisibility[]; 49 | }; 50 | 51 | layout(binding = 5) uniform sampler2D depthPyramid; 52 | 53 | void main() 54 | { 55 | uint di = gl_GlobalInvocationID.x; 56 | 57 | if (di >= cullData.drawCount) 58 | return; 59 | 60 | MeshDraw drawData = draws[di]; 61 | 62 | if (drawData.postPass != cullData.postPass) 63 | return; 64 | 65 | // TODO: when occlusion culling is off, can we make sure everything is processed with LATE=false? 66 | if (!LATE && drawVisibility[di] == 0) 67 | return; 68 | 69 | uint meshIndex = drawData.meshIndex; 70 | Mesh mesh = meshes[meshIndex]; 71 | 72 | vec3 center = rotateQuat(mesh.center, drawData.orientation) * drawData.scale + drawData.position; 73 | center = (cullData.view * vec4(center, 1)).xyz; 74 | float radius = mesh.radius * drawData.scale; 75 | 76 | bool visible = true; 77 | // the left/top/right/bottom plane culling utilizes frustum symmetry to cull against two planes at the same time 78 | visible = visible && center.z * cullData.frustum[1] - abs(center.x) * cullData.frustum[0] > -radius; 79 | visible = visible && center.z * cullData.frustum[3] - abs(center.y) * cullData.frustum[2] > -radius; 80 | // the near/far plane culling uses camera space Z directly 81 | visible = visible && center.z + radius > cullData.znear && center.z - radius < cullData.zfar; 82 | 83 | visible = visible || cullData.cullingEnabled == 0; 84 | 85 | if (LATE && visible && cullData.occlusionEnabled == 1) 86 | { 87 | vec4 aabb; 88 | if (projectSphere(center, radius, cullData.znear, cullData.P00, cullData.P11, aabb)) 89 | { 90 | float width = (aabb.z - aabb.x) * cullData.pyramidWidth; 91 | float height = (aabb.w - aabb.y) * cullData.pyramidHeight; 92 | 93 | // Because we only consider 2x2 pixels, we need to make sure we are sampling from a mip that reduces the rectangle to 1x1 texel or smaller. 94 | // Due to the rectangle being arbitrarily offset, a 1x1 rectangle may cover 2x2 texel area. Using floor() here would require sampling 4 corners 95 | // of AABB (using bilinear fetch), which is a little slower. 96 | float level = ceil(log2(max(width, height))); 97 | 98 | // Sampler is set up to do min reduction, so this computes the minimum depth of a 2x2 texel quad 99 | float depth = textureLod(depthPyramid, (aabb.xy + aabb.zw) * 0.5, level).x; 100 | float depthSphere = cullData.znear / (center.z - radius); 101 | 102 | visible = visible && depthSphere > depth; 103 | } 104 | } 105 | 106 | // when meshlet occlusion culling is enabled, we actually *do* need to append the draw command if vis[]==1 in LATE pass, 107 | // so that we can correctly render now-visible previously-invisible meshlets. 
we also pass drawvis[] along to task shader 108 | // so that it can *reject* clusters that we *did* draw in the first pass 109 | if (visible && (!LATE || (cullData.clusterOcclusionEnabled == 1 && TASK_CULL == 1) || drawVisibility[di] == 0 || cullData.postPass != 0)) 110 | { 111 | uint lodIndex = 0; 112 | 113 | if (cullData.lodEnabled == 1) 114 | { 115 | float distance = max(length(center) - radius, 0); 116 | float threshold = distance * cullData.lodTarget / drawData.scale; 117 | 118 | for (uint i = 1; i < mesh.lodCount; ++i) 119 | if (mesh.lods[i].error < threshold) 120 | lodIndex = i; 121 | } 122 | 123 | MeshLod lod = meshes[meshIndex].lods[lodIndex]; 124 | 125 | if (TASK) 126 | { 127 | uint taskGroups = (lod.meshletCount + TASK_WGSIZE - 1) / TASK_WGSIZE; 128 | uint dci = atomicAdd(commandCount, taskGroups); 129 | 130 | uint lateDrawVisibility = drawVisibility[di]; 131 | uint meshletVisibilityOffset = drawData.meshletVisibilityOffset; 132 | 133 | // drop draw calls on overflow; this limits us to ~4M visible draws or ~32B visible triangles, whichever is larger 134 | if (dci + taskGroups <= TASK_WGLIMIT) 135 | { 136 | for (uint i = 0; i < taskGroups; ++i) 137 | { 138 | taskCommands[dci + i].drawId = di; 139 | taskCommands[dci + i].taskOffset = lod.meshletOffset + i * TASK_WGSIZE; 140 | taskCommands[dci + i].taskCount = min(TASK_WGSIZE, lod.meshletCount - i * TASK_WGSIZE); 141 | taskCommands[dci + i].lateDrawVisibility = lateDrawVisibility; 142 | taskCommands[dci + i].meshletVisibilityOffset = meshletVisibilityOffset + i * TASK_WGSIZE; 143 | } 144 | } 145 | } 146 | else 147 | { 148 | uint dci = atomicAdd(commandCount, 1); 149 | 150 | drawCommands[dci].drawId = di; 151 | drawCommands[dci].indexCount = lod.indexCount; 152 | drawCommands[dci].instanceCount = 1; 153 | drawCommands[dci].firstIndex = lod.indexOffset; 154 | drawCommands[dci].vertexOffset = mesh.vertexOffset; 155 | drawCommands[dci].firstInstance = 0; 156 | } 157 | } 158 | 159 | if (LATE) 160 | drawVisibility[di] = visible ? 
1 : 0; 161 | } 162 | -------------------------------------------------------------------------------- /src/shaders/meshlet.mesh.glsl: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage: require 4 | #extension GL_EXT_shader_8bit_storage: require 5 | #extension GL_EXT_mesh_shader: require 6 | 7 | #extension GL_GOOGLE_include_directive: require 8 | 9 | #include "mesh.h" 10 | #include "math.h" 11 | 12 | layout (constant_id = 1) const bool TASK = false; 13 | 14 | #define DEBUG 0 15 | #define CULL MESH_CULL 16 | 17 | layout(local_size_x = MESH_WGSIZE, local_size_y = 1, local_size_z = 1) in; 18 | layout(triangles, max_vertices = MESH_MAXVTX, max_primitives = MESH_MAXTRI) out; 19 | 20 | layout(push_constant) uniform block 21 | { 22 | Globals globals; 23 | }; 24 | 25 | layout(binding = 0) readonly buffer TaskCommands 26 | { 27 | MeshTaskCommand taskCommands[]; 28 | }; 29 | 30 | layout(binding = 1) readonly buffer Draws 31 | { 32 | MeshDraw draws[]; 33 | }; 34 | 35 | layout(binding = 2) readonly buffer Meshlets 36 | { 37 | Meshlet meshlets[]; 38 | }; 39 | 40 | layout(binding = 3) readonly buffer MeshletData 41 | { 42 | uint meshletData[]; 43 | }; 44 | 45 | layout(binding = 3) readonly buffer MeshletData16 46 | { 47 | uint16_t meshletData16[]; 48 | }; 49 | 50 | layout(binding = 3) readonly buffer MeshletData8 51 | { 52 | uint8_t meshletData8[]; 53 | }; 54 | 55 | layout(binding = 4) readonly buffer Vertices 56 | { 57 | Vertex vertices[]; 58 | }; 59 | 60 | layout(binding = 5) readonly buffer ClusterIndices 61 | { 62 | uint clusterIndices[]; 63 | }; 64 | 65 | layout(location = 0) out flat uint out_drawId[]; 66 | layout(location = 1) out vec2 out_uv[]; 67 | layout(location = 2) out vec3 out_normal[]; 68 | layout(location = 3) out vec4 out_tangent[]; 69 | layout(location = 4) out vec3 out_wpos[]; 70 | 71 | // only usable with task shader (TASK=true) 72 | taskPayloadSharedEXT MeshTaskPayload payload; 73 | 74 | uint hash(uint a) 75 | { 76 | a = (a+0x7ed55d16) + (a<<12); 77 | a = (a^0xc761c23c) ^ (a>>19); 78 | a = (a+0x165667b1) + (a<<5); 79 | a = (a+0xd3a2646c) ^ (a<<9); 80 | a = (a+0xfd7046c5) + (a<<3); 81 | a = (a^0xb55a4f09) ^ (a>>16); 82 | return a; 83 | } 84 | 85 | #if CULL 86 | shared vec3 vertexClip[MESH_MAXVTX]; 87 | #endif 88 | 89 | void main() 90 | { 91 | uint ti = gl_LocalInvocationIndex; 92 | 93 | // we convert 3D index to 1D index using a fixed *256 factor, see clustersubmit.comp.glsl 94 | uint ci = TASK ? payload.clusterIndices[gl_WorkGroupID.x] : clusterIndices[gl_WorkGroupID.x + gl_WorkGroupID.y * 256 + gl_WorkGroupID.z * CLUSTER_TILE]; 95 | 96 | if (ci == ~0) 97 | { 98 | SetMeshOutputsEXT(0, 0); 99 | return; 100 | } 101 | 102 | MeshTaskCommand command = taskCommands[ci & 0xffffff]; 103 | uint mi = command.taskOffset + (ci >> 24); 104 | 105 | MeshDraw meshDraw = draws[command.drawId]; 106 | 107 | uint vertexCount = uint(meshlets[mi].vertexCount); 108 | uint triangleCount = uint(meshlets[mi].triangleCount); 109 | 110 | SetMeshOutputsEXT(vertexCount, triangleCount); 111 | 112 | uint dataOffset = meshlets[mi].dataOffset; 113 | uint baseVertex = meshlets[mi].baseVertex; 114 | bool shortRefs = uint(meshlets[mi].shortRefs) == 1; 115 | uint vertexOffset = dataOffset; 116 | uint indexOffset = dataOffset + (shortRefs ? 
(vertexCount + 1) / 2 : vertexCount); 117 | 118 | #if DEBUG 119 | uint mhash = hash(mi); 120 | vec3 mcolor = vec3(float(mhash & 255), float((mhash >> 8) & 255), float((mhash >> 16) & 255)) / 255.0; 121 | #endif 122 | 123 | vec2 screen = vec2(globals.screenWidth, globals.screenHeight); 124 | 125 | for (uint i = ti; i < vertexCount; ) 126 | { 127 | uint vi = shortRefs ? uint(meshletData16[vertexOffset * 2 + i]) + baseVertex : meshletData[vertexOffset + i] + baseVertex; 128 | 129 | vec3 position = vec3(vertices[vi].vx, vertices[vi].vy, vertices[vi].vz); 130 | vec2 texcoord = vec2(vertices[vi].tu, vertices[vi].tv); 131 | 132 | vec3 normal; 133 | vec4 tangent; 134 | unpackTBN(vertices[vi].np, uint(vertices[vi].tp), normal, tangent); 135 | 136 | normal = rotateQuat(normal, meshDraw.orientation); 137 | tangent.xyz = rotateQuat(tangent.xyz, meshDraw.orientation); 138 | 139 | vec3 wpos = rotateQuat(position, meshDraw.orientation) * meshDraw.scale + meshDraw.position; 140 | vec4 clip = globals.projection * (globals.cullData.view * vec4(wpos, 1)); 141 | 142 | gl_MeshVerticesEXT[i].gl_Position = clip; 143 | out_drawId[i] = command.drawId; 144 | out_uv[i] = texcoord; 145 | out_normal[i] = normal; 146 | out_tangent[i] = tangent; 147 | out_wpos[i] = wpos; 148 | 149 | #if CULL 150 | vertexClip[i] = vec3((clip.xy / clip.w * 0.5 + vec2(0.5)) * screen, clip.w); 151 | #endif 152 | 153 | #if DEBUG 154 | out_normal[i] = mcolor; 155 | #endif 156 | 157 | #if MESH_MAXVTX <= MESH_WGSIZE 158 | break; 159 | #else 160 | i += MESH_WGSIZE; 161 | #endif 162 | } 163 | 164 | #if CULL 165 | barrier(); 166 | #endif 167 | 168 | for (uint i = ti; i < triangleCount; ) 169 | { 170 | uint offset = indexOffset * 4 + i * 3; 171 | uint a = uint(meshletData8[offset]), b = uint(meshletData8[offset + 1]), c = uint(meshletData8[offset + 2]); 172 | 173 | gl_PrimitiveTriangleIndicesEXT[i] = uvec3(a, b, c); 174 | 175 | #if CULL 176 | bool culled = false; 177 | 178 | vec2 pa = vertexClip[a].xy, pb = vertexClip[b].xy, pc = vertexClip[c].xy; 179 | 180 | // backface culling + zero-area culling 181 | vec2 eb = pb - pa; 182 | vec2 ec = pc - pa; 183 | 184 | culled = culled || (eb.x * ec.y <= eb.y * ec.x); 185 | 186 | // small primitive culling 187 | vec2 bmin = min(pa, min(pb, pc)); 188 | vec2 bmax = max(pa, max(pb, pc)); 189 | float sbprec = 1.0 / 256.0; // note: this can be set to 1/2^subpixelPrecisionBits 190 | 191 | // note: this is slightly imprecise (doesn't fully match hw behavior and is both too loose and too strict) 192 | culled = culled || (round(bmin.x - sbprec) == round(bmax.x) || round(bmin.y) == round(bmax.y + sbprec)); 193 | 194 | // the computations above are only valid if all vertices are in front of perspective plane 195 | culled = culled && (vertexClip[a].z > 0 && vertexClip[b].z > 0 && vertexClip[c].z > 0); 196 | 197 | gl_MeshPrimitivesEXT[i].gl_CullPrimitiveEXT = culled; 198 | #endif 199 | 200 | #if MESH_MAXTRI <= MESH_WGSIZE 201 | break; 202 | #else 203 | i += MESH_WGSIZE; 204 | #endif 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /src/swapchain.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "swapchain.h" 3 | 4 | #include "config.h" 5 | 6 | #include <algorithm> 7 | 8 | #ifdef VK_USE_PLATFORM_WIN32_KHR 9 | #include <windows.h> 10 | #endif 11 | 12 | #include <GLFW/glfw3.h> 13 | 14 | #define VSYNC CONFIG_VSYNC 15 | 16 | const char** getSwapchainExtensions(uint32_t* count) 17 | { 18 | #ifdef VK_USE_PLATFORM_WIN32_KHR 19 | static const
char* extensions[] = { VK_KHR_SURFACE_EXTENSION_NAME, VK_KHR_WIN32_SURFACE_EXTENSION_NAME }; 20 | *count = sizeof(extensions) / sizeof(extensions[0]); 21 | return extensions; 22 | #else 23 | return glfwGetRequiredInstanceExtensions(count); 24 | #endif 25 | } 26 | 27 | VkSurfaceKHR createSurface(VkInstance instance, GLFWwindow* window) 28 | { 29 | VkSurfaceKHR surface = 0; 30 | 31 | #ifdef VK_USE_PLATFORM_WIN32_KHR 32 | // Note: GLFW has a helper glfwCreateWindowSurface but we're going to do this the hard way to demonstrate the platform-specific surface creation 33 | assert(glfwGetPlatform() == GLFW_PLATFORM_WIN32); 34 | VkWin32SurfaceCreateInfoKHR createInfo = { VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR }; 35 | createInfo.hinstance = GetModuleHandle(0); 36 | createInfo.hwnd = glfwGetWin32Window(window); 37 | VK_CHECK(vkCreateWin32SurfaceKHR(instance, &createInfo, 0, &surface)); 38 | #else 39 | VK_CHECK(glfwCreateWindowSurface(instance, window, 0, &surface)); 40 | #endif 41 | 42 | return surface; 43 | } 44 | 45 | VkFormat getSwapchainFormat(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface) 46 | { 47 | uint32_t formatCount = 0; 48 | VK_CHECK(vkGetPhysicalDeviceSurfaceFormatsKHR(physicalDevice, surface, &formatCount, 0)); 49 | assert(formatCount > 0); 50 | 51 | std::vector<VkSurfaceFormatKHR> formats(formatCount); 52 | VK_CHECK(vkGetPhysicalDeviceSurfaceFormatsKHR(physicalDevice, surface, &formatCount, formats.data())); 53 | 54 | if (formatCount == 1 && formats[0].format == VK_FORMAT_UNDEFINED) 55 | return VK_FORMAT_R8G8B8A8_UNORM; 56 | 57 | for (uint32_t i = 0; i < formatCount; ++i) 58 | if (formats[i].format == VK_FORMAT_R8G8B8A8_UNORM || formats[i].format == VK_FORMAT_B8G8R8A8_UNORM) 59 | return formats[i].format; 60 | 61 | return formats[0].format; 62 | } 63 | 64 | static VkPresentModeKHR getPresentMode(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface) 65 | { 66 | if (VSYNC) 67 | return VK_PRESENT_MODE_FIFO_KHR; // guaranteed to be available 68 | 69 | uint32_t presentModeCount = 0; 70 | VK_CHECK(vkGetPhysicalDeviceSurfacePresentModesKHR(physicalDevice, surface, &presentModeCount, 0)); 71 | assert(presentModeCount > 0); 72 | 73 | std::vector<VkPresentModeKHR> presentModes(presentModeCount); 74 | VK_CHECK(vkGetPhysicalDeviceSurfacePresentModesKHR(physicalDevice, surface, &presentModeCount, presentModes.data())); 75 | 76 | for (VkPresentModeKHR mode : presentModes) 77 | { 78 | if (mode == VK_PRESENT_MODE_MAILBOX_KHR) 79 | return mode; 80 | if (mode == VK_PRESENT_MODE_IMMEDIATE_KHR) 81 | return mode; 82 | } 83 | 84 | // fall back to fifo 85 | return VK_PRESENT_MODE_FIFO_KHR; 86 | } 87 | 88 | void createSwapchain(Swapchain& result, VkPhysicalDevice physicalDevice, VkDevice device, VkSurfaceKHR surface, uint32_t familyIndex, GLFWwindow* window, VkFormat format, VkSwapchainKHR oldSwapchain) 89 | { 90 | VkSurfaceCapabilitiesKHR surfaceCaps; 91 | VK_CHECK(vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, surface, &surfaceCaps)); 92 | 93 | int width = 0, height = 0; 94 | glfwGetFramebufferSize(window, &width, &height); 95 | 96 | VkPresentModeKHR presentMode = getPresentMode(physicalDevice, surface); 97 | 98 | VkCompositeAlphaFlagBitsKHR surfaceComposite = 99 | (surfaceCaps.supportedCompositeAlpha & VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR) 100 | ? VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR 101 | : (surfaceCaps.supportedCompositeAlpha & VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR) 102 | ? VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR 103 | : (surfaceCaps.supportedCompositeAlpha & VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR) 104 | ?
VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR 105 | : VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR; 106 | 107 | VkSwapchainCreateInfoKHR createInfo = { VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR }; 108 | createInfo.surface = surface; 109 | createInfo.minImageCount = std::max(unsigned(MIN_IMAGES), surfaceCaps.minImageCount); 110 | createInfo.imageFormat = format; 111 | createInfo.imageColorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR; 112 | createInfo.imageExtent.width = width; 113 | createInfo.imageExtent.height = height; 114 | createInfo.imageArrayLayers = 1; 115 | createInfo.imageUsage = VK_IMAGE_USAGE_STORAGE_BIT; 116 | createInfo.queueFamilyIndexCount = 1; 117 | createInfo.pQueueFamilyIndices = &familyIndex; 118 | createInfo.preTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; 119 | createInfo.compositeAlpha = surfaceComposite; 120 | createInfo.presentMode = presentMode; 121 | createInfo.oldSwapchain = oldSwapchain; 122 | 123 | VkSwapchainKHR swapchain = 0; 124 | VK_CHECK(vkCreateSwapchainKHR(device, &createInfo, 0, &swapchain)); 125 | 126 | uint32_t imageCount = 0; 127 | VK_CHECK(vkGetSwapchainImagesKHR(device, swapchain, &imageCount, 0)); 128 | 129 | std::vector<VkImage> images(imageCount); 130 | VK_CHECK(vkGetSwapchainImagesKHR(device, swapchain, &imageCount, images.data())); 131 | 132 | result.swapchain = swapchain; 133 | result.images = images; 134 | result.width = width; 135 | result.height = height; 136 | result.imageCount = imageCount; 137 | } 138 | 139 | void destroySwapchain(VkDevice device, const Swapchain& swapchain) 140 | { 141 | vkDestroySwapchainKHR(device, swapchain.swapchain, 0); 142 | } 143 | 144 | SwapchainStatus updateSwapchain(Swapchain& result, VkPhysicalDevice physicalDevice, VkDevice device, VkSurfaceKHR surface, uint32_t familyIndex, GLFWwindow* window, VkFormat format) 145 | { 146 | int width = 0, height = 0; 147 | glfwGetFramebufferSize(window, &width, &height); 148 | 149 | if (width == 0 || height == 0) 150 | return Swapchain_NotReady; 151 | 152 | if (result.width == width && result.height == height) 153 | return Swapchain_Ready; 154 | 155 | Swapchain old = result; 156 | 157 | createSwapchain(result, physicalDevice, device, surface, familyIndex, window, format, old.swapchain); 158 | 159 | VK_CHECK(vkDeviceWaitIdle(device)); 160 | 161 | destroySwapchain(device, old); 162 | 163 | return Swapchain_Resized; 164 | } 165 | -------------------------------------------------------------------------------- /src/shaders/debugtext.comp.glsl: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | // font data courtesy of MicroProfile (https://github.com/zeux/microprofile/blob/master/microprofiledraw.h) 4 | // glyphs are 5x8 pixels, but we shade one more line for drop shadows 5 | layout(local_size_x = 5, local_size_y = 9, local_size_z = 1) in; 6 | 7 | struct TextData 8 | { 9 | ivec2 offset; 10 | int scale; 11 | uint color; 12 | 13 | uint data[112/4]; 14 | }; 15 | 16 | layout(push_constant) uniform block 17 | { 18 | TextData text; 19 | }; 20 | 21 | layout(binding = 0) uniform writeonly image2D outImage; 22 | 23 | // glyph U offset (0-1023) for each ASCII character (uint16), packed into uint 24 | const uint g_MicroProfileFontDescription[256*2/4] = 25 | { 26 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 27 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 28 | 0x0ce0201,0x2090211,0x2190221,0x2290231,0x2390241,0x2490251,0x2590261,0x2690271, 29 |
0x1b101b9,0x1c101c9,0x1d101d9,0x1e101e9,0x1f101f9,0x2790281,0x2890291,0x29902a1, 30 | 0x2a90001,0x0090011,0x0190021,0x0290031,0x0390041,0x0490051,0x0590061,0x0690071, 31 | 0x0790081,0x0890091,0x09900a1,0x0a900b1,0x0b900c1,0x0c902b1,0x2b902c1,0x2c902d1, 32 | 0x0ce00d9,0x0e100e9,0x0f100f9,0x1010109,0x1110119,0x1210129,0x1310139,0x1410149, 33 | 0x1510159,0x1610169,0x1710179,0x1810189,0x1910199,0x1a102d9,0x2e102e9,0x2f100ce, 34 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 35 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 36 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 37 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 38 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 39 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 40 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 41 | 0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce,0x0ce00ce, 42 | }; 43 | 44 | // a 1024x9 texture, packed in raster order into bits: 45 | // each byte encodes a pixel in each bit 46 | // offsets into this data come from font description above 47 | const uint g_MicroProfileFont[1024*9/32] = 48 | { 49 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 50 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 51 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 52 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 53 | 0x10783878,0x7c7c3c44,0x38044440,0x44443878,0x3878387c,0x44444444,0x447c0000,0x40000400, 54 | 0x18004010,0x08403000,0x00000000,0x00001000,0x00000000,0x00003810,0x387c087c,0x1c7c3838, 55 | 0x10282810,0x00201008,0x10100000,0x00000000,0x00040020,0x38387000,0x1c10001c,0x10703000, 56 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 57 | 0x28444444,0x40404044,0x10044840,0x6c444444,0x44444410,0x44444444,0x44040000,0x40000400, 58 | 0x24004000,0x00401000,0x00000000,0x00001000,0x00000000,0x00004430,0x44041840,0x20044444, 59 | 0x1028283c,0x44501010,0x08541000,0x00000400,0x00080010,0x44444040,0x04280030,0x10185800, 60 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 61 | 0x44444044,0x40404044,0x10045040,0x54644444,0x44444010,0x44444428,0x28080038,0x783c3c38, 62 | 0x20387830,0x1844106c,0x7838783c,0x5c3c3c44,0x44444444,0x7c004c10,0x04082878,0x40084444, 63 | 0x10007c50,0x08500020,0x04381000,0x00000810,0x10107c08,0x08544020,0x04440030,0x10180000, 64 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 65 | 0x44784044,0x7878407c,0x10046040,0x54544478,0x44783810,0x44445410,0x10100004,0x44404444, 66 | 0x78444410,0x08481054,0x44444444,0x60401044,0x44442844,0x08005410,0x18184804,0x7810383c, 67 | 0x10002838,0x10200020,0x04107c00,0x7c001000,0x00200004,0x105c4010,0x04000060,0x100c0000, 68 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 69 | 0x7c444044,0x40404c44,0x10045040,0x444c4440,0x54500410,0x44445428,0x1020003c,0x4440447c, 70 | 0x20444410,0x08701054,0x44444444,0x40381044,0x44541044,0x10006410,0x20047c04,0x44204404, 71 | 0x10007c14,0x20540020,0x04381010,0x00002010,0x10107c08,0x10584008,0x04000030,0x10180000, 72 | 
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 73 | 0x44444444,0x40404444,0x10444840,0x44444440,0x48484410,0x44286c44,0x10400044,0x44404440, 74 | 0x203c4410,0x08481054,0x44444444,0x4004124c,0x2854283c,0x20004410,0x40440844,0x44204408, 75 | 0x00002878,0x44480010,0x08541010,0x00004000,0x10080010,0x00404004,0x04000030,0x10180000, 76 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 77 | 0x44783878,0x7c403c44,0x3838447c,0x44443840,0x34443810,0x38104444,0x107c003c,0x783c3c3c, 78 | 0x20044438,0x48443844,0x4438783c,0x40780c34,0x106c4404,0x7c003838,0x7c380838,0x38203870, 79 | 0x10002810,0x00340008,0x10100020,0x00100000,0x20040020,0x103c7000,0x1c007c1c,0x10700000, 80 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 81 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 82 | 0x00380000,0x30000000,0x00004004,0x00000000,0x00000038,0x00000000,0x00000000,0x00000000, 83 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 84 | 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000, 85 | }; 86 | 87 | void main() 88 | { 89 | ivec2 margin = ivec2(1, 2); 90 | ivec2 size = ivec2(gl_WorkGroupSize.xy) + margin; 91 | ivec2 pos = ivec2(gl_WorkGroupID.xy) * size + ivec2(gl_LocalInvocationID.xy); 92 | 93 | uint char = bitfieldExtract(text.data[gl_WorkGroupID.x / 4], int(gl_WorkGroupID.x % 4) * 8, 8); 94 | uint offset = bitfieldExtract(g_MicroProfileFontDescription[char / 2], 16 - int(char % 2) * 16, 16); 95 | 96 | uint u = offset + gl_LocalInvocationID.x; 97 | uint v = gl_LocalInvocationID.y; 98 | uint texoff = u + 1024 * v; 99 | 100 | // sample the texture twice to get the drop shadow effect 101 | uint texbit0 = bitfieldExtract(g_MicroProfileFont[texoff / 32], 31 - int(texoff % 32), 1); 102 | uint texbit1 = v == 8 ? 0 : bitfieldExtract(g_MicroProfileFont[(texoff + 1024) / 32], 31 - int(texoff % 32), 1); 103 | 104 | if ((texbit0 | texbit1) == 0) 105 | return; 106 | 107 | vec3 color = texbit1 == 1 ? 
unpackUnorm4x8(text.color).bgr : vec3(0.0); 108 | 109 | for (int y = 0; y < text.scale; ++y) 110 | for (int x = 0; x < text.scale; ++x) 111 | imageStore(outImage, (text.offset * size + pos) * text.scale + ivec2(x, y), vec4(color, 1.0)); 112 | } 113 | -------------------------------------------------------------------------------- /src/textures.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "textures.h" 3 | 4 | #include "resources.h" 5 | 6 | #include <memory> 7 | 8 | #include <stdio.h> 9 | 10 | struct DDS_PIXELFORMAT 11 | { 12 | unsigned int dwSize; 13 | unsigned int dwFlags; 14 | unsigned int dwFourCC; 15 | unsigned int dwRGBBitCount; 16 | unsigned int dwRBitMask; 17 | unsigned int dwGBitMask; 18 | unsigned int dwBBitMask; 19 | unsigned int dwABitMask; 20 | }; 21 | 22 | struct DDS_HEADER 23 | { 24 | unsigned int dwSize; 25 | unsigned int dwFlags; 26 | unsigned int dwHeight; 27 | unsigned int dwWidth; 28 | unsigned int dwPitchOrLinearSize; 29 | unsigned int dwDepth; 30 | unsigned int dwMipMapCount; 31 | unsigned int dwReserved1[11]; 32 | DDS_PIXELFORMAT ddspf; 33 | unsigned int dwCaps; 34 | unsigned int dwCaps2; 35 | unsigned int dwCaps3; 36 | unsigned int dwCaps4; 37 | unsigned int dwReserved2; 38 | }; 39 | 40 | struct DDS_HEADER_DXT10 41 | { 42 | unsigned int dxgiFormat; 43 | unsigned int resourceDimension; 44 | unsigned int miscFlag; 45 | unsigned int arraySize; 46 | unsigned int miscFlags2; 47 | }; 48 | 49 | const unsigned int DDSCAPS2_CUBEMAP = 0x200; 50 | const unsigned int DDSCAPS2_VOLUME = 0x200000; 51 | 52 | const unsigned int DDS_DIMENSION_TEXTURE2D = 3; 53 | 54 | enum DXGI_FORMAT 55 | { 56 | DXGI_FORMAT_BC1_UNORM = 71, 57 | DXGI_FORMAT_BC1_UNORM_SRGB = 72, 58 | DXGI_FORMAT_BC2_UNORM = 74, 59 | DXGI_FORMAT_BC2_UNORM_SRGB = 75, 60 | DXGI_FORMAT_BC3_UNORM = 77, 61 | DXGI_FORMAT_BC3_UNORM_SRGB = 78, 62 | DXGI_FORMAT_BC4_UNORM = 80, 63 | DXGI_FORMAT_BC4_SNORM = 81, 64 | DXGI_FORMAT_BC5_UNORM = 83, 65 | DXGI_FORMAT_BC5_SNORM = 84, 66 | DXGI_FORMAT_BC6H_UF16 = 95, 67 | DXGI_FORMAT_BC6H_SF16 = 96, 68 | DXGI_FORMAT_BC7_UNORM = 98, 69 | DXGI_FORMAT_BC7_UNORM_SRGB = 99, 70 | }; 71 | 72 | static unsigned int fourCC(const char (&str)[5]) 73 | { 74 | return (unsigned(str[0]) << 0) | (unsigned(str[1]) << 8) | (unsigned(str[2]) << 16) | (unsigned(str[3]) << 24); 75 | } 76 | 77 | static VkFormat getFormat(const DDS_HEADER& header, const DDS_HEADER_DXT10& header10) 78 | { 79 | if (header.ddspf.dwFourCC == fourCC("DXT1")) 80 | return VK_FORMAT_BC1_RGBA_UNORM_BLOCK; 81 | if (header.ddspf.dwFourCC == fourCC("DXT3")) 82 | return VK_FORMAT_BC2_UNORM_BLOCK; 83 | if (header.ddspf.dwFourCC == fourCC("DXT5")) 84 | return VK_FORMAT_BC3_UNORM_BLOCK; 85 | if (header.ddspf.dwFourCC == fourCC("ATI1")) 86 | return VK_FORMAT_BC4_UNORM_BLOCK; 87 | if (header.ddspf.dwFourCC == fourCC("ATI2")) 88 | return VK_FORMAT_BC5_UNORM_BLOCK; 89 | 90 | if (header.ddspf.dwFourCC == fourCC("DX10")) 91 | { 92 | switch (header10.dxgiFormat) 93 | { 94 | case DXGI_FORMAT_BC1_UNORM: 95 | case DXGI_FORMAT_BC1_UNORM_SRGB: 96 | return VK_FORMAT_BC1_RGBA_UNORM_BLOCK; 97 | case DXGI_FORMAT_BC2_UNORM: 98 | case DXGI_FORMAT_BC2_UNORM_SRGB: 99 | return VK_FORMAT_BC2_UNORM_BLOCK; 100 | case DXGI_FORMAT_BC3_UNORM: 101 | case DXGI_FORMAT_BC3_UNORM_SRGB: 102 | return VK_FORMAT_BC3_UNORM_BLOCK; 103 | case DXGI_FORMAT_BC4_UNORM: 104 | return VK_FORMAT_BC4_UNORM_BLOCK; 105 | case DXGI_FORMAT_BC4_SNORM: 106 | return VK_FORMAT_BC4_SNORM_BLOCK; 107 | case DXGI_FORMAT_BC5_UNORM: 108 | return
VK_FORMAT_BC5_UNORM_BLOCK; 109 | case DXGI_FORMAT_BC5_SNORM: 110 | return VK_FORMAT_BC5_SNORM_BLOCK; 111 | case DXGI_FORMAT_BC6H_UF16: 112 | return VK_FORMAT_BC6H_UFLOAT_BLOCK; 113 | case DXGI_FORMAT_BC6H_SF16: 114 | return VK_FORMAT_BC6H_SFLOAT_BLOCK; 115 | case DXGI_FORMAT_BC7_UNORM: 116 | case DXGI_FORMAT_BC7_UNORM_SRGB: 117 | return VK_FORMAT_BC7_UNORM_BLOCK; 118 | } 119 | } 120 | 121 | return VK_FORMAT_UNDEFINED; 122 | } 123 | 124 | static size_t getImageSizeBC(unsigned int width, unsigned int height, unsigned int levels, unsigned int blockSize) 125 | { 126 | size_t result = 0; 127 | 128 | for (unsigned int i = 0; i < levels; ++i) 129 | { 130 | result += ((width + 3) / 4) * ((height + 3) / 4) * blockSize; 131 | 132 | width = width > 1 ? width / 2 : 1; 133 | height = height > 1 ? height / 2 : 1; 134 | } 135 | 136 | return result; 137 | } 138 | 139 | bool loadImage(Image& image, VkDevice device, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties, const Buffer& scratch, const char* path) 140 | { 141 | FILE* file = fopen(path, "rb"); 142 | if (!file) 143 | return false; 144 | 145 | std::unique_ptr<FILE, int (*)(FILE*)> filePtr(file, fclose); 146 | 147 | unsigned int magic = 0; 148 | if (fread(&magic, sizeof(magic), 1, file) != 1 || magic != fourCC("DDS ")) 149 | return false; 150 | 151 | DDS_HEADER header = {}; 152 | if (fread(&header, sizeof(header), 1, file) != 1) 153 | return false; 154 | 155 | DDS_HEADER_DXT10 header10 = {}; 156 | if (header.ddspf.dwFourCC == fourCC("DX10") && fread(&header10, sizeof(header10), 1, file) != 1) 157 | return false; 158 | 159 | if (header.dwSize != sizeof(header) || header.ddspf.dwSize != sizeof(header.ddspf)) 160 | return false; 161 | 162 | if (header.dwCaps2 & (DDSCAPS2_CUBEMAP | DDSCAPS2_VOLUME)) 163 | return false; 164 | 165 | if (header.ddspf.dwFourCC == fourCC("DX10") && header10.resourceDimension != DDS_DIMENSION_TEXTURE2D) 166 | return false; 167 | 168 | VkFormat format = getFormat(header, header10); 169 | if (format == VK_FORMAT_UNDEFINED) 170 | return false; 171 | 172 | unsigned int blockSize = 173 | (format == VK_FORMAT_BC1_RGBA_UNORM_BLOCK || format == VK_FORMAT_BC4_SNORM_BLOCK || format == VK_FORMAT_BC4_UNORM_BLOCK) ?
8 : 16; 174 | size_t imageSize = getImageSizeBC(header.dwWidth, header.dwHeight, header.dwMipMapCount, blockSize); 175 | 176 | if (scratch.size < imageSize) 177 | return false; 178 | 179 | VkImageUsageFlags usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; 180 | createImage(image, device, memoryProperties, header.dwWidth, header.dwHeight, header.dwMipMapCount, format, usage); 181 | 182 | size_t readSize = fread(scratch.data, 1, imageSize, file); 183 | if (readSize != imageSize) 184 | return false; 185 | 186 | if (fgetc(file) != -1) 187 | return false; 188 | 189 | filePtr.reset(); 190 | file = nullptr; 191 | 192 | VK_CHECK(vkResetCommandPool(device, commandPool, 0)); 193 | 194 | VkCommandBufferBeginInfo beginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; 195 | beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; 196 | 197 | VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); 198 | 199 | VkImageMemoryBarrier2 preBarrier = imageBarrier(image.image, 200 | 0, 0, VK_IMAGE_LAYOUT_UNDEFINED, 201 | VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL); 202 | pipelineBarrier(commandBuffer, 0, 0, nullptr, 1, &preBarrier); 203 | 204 | size_t bufferOffset = 0; 205 | unsigned int mipWidth = header.dwWidth, mipHeight = header.dwHeight; 206 | 207 | for (unsigned int i = 0; i < header.dwMipMapCount; ++i) 208 | { 209 | VkBufferImageCopy region = { 210 | bufferOffset, 211 | 0, 212 | 0, 213 | { VK_IMAGE_ASPECT_COLOR_BIT, i, 0, 1 }, 214 | { 0, 0, 0 }, 215 | { mipWidth, mipHeight, 1 }, 216 | }; 217 | vkCmdCopyBufferToImage(commandBuffer, scratch.buffer, image.image, VK_IMAGE_LAYOUT_GENERAL, 1, &region); 218 | 219 | bufferOffset += ((mipWidth + 3) / 4) * ((mipHeight + 3) / 4) * blockSize; 220 | 221 | mipWidth = mipWidth > 1 ? mipWidth / 2 : 1; 222 | mipHeight = mipHeight > 1 ? mipHeight / 2 : 1; 223 | } 224 | 225 | assert(bufferOffset == imageSize); 226 | 227 | stageBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT); 228 | 229 | VK_CHECK(vkEndCommandBuffer(commandBuffer)); 230 | 231 | VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; 232 | submitInfo.commandBufferCount = 1; 233 | submitInfo.pCommandBuffers = &commandBuffer; 234 | 235 | VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE)); 236 | 237 | VK_CHECK(vkDeviceWaitIdle(device)); 238 | 239 | return true; 240 | } 241 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Niagara 2 | 3 | This is a Vulkan renderer that is written on stream from scratch, without using any Vulkan-specific third-party code. We do use third-party libraries, but none of them are Vulkan-specific. 4 | 5 | The goal is to experiment with a few modern Vulkan rendering techniques, such as GPU culling & scene submission, cone culling, automatic occlusion culling, task/mesh shading, and whatever else we want to experiment with. 6 | The code is written on stream. 7 | 8 | ![image](https://github.com/user-attachments/assets/b102622e-fbe7-4e9c-b575-e4d4533eadfe) 9 | 10 | # Requirements 11 | 12 | The renderer was originally written using Visual Studio and targeted Windows desktops with modern Vulkan drivers. Since then the development platform has switched to Linux, but you can still build and run it on Windows via CMake.
13 | 14 | # Building 15 | 16 | To build and run the project, clone this repository using the --recursive flag: 17 | 18 | git clone https://github.com/zeux/niagara --recursive 19 | 20 | Make sure you have the Vulkan 1.4 SDK and drivers installed; open the niagara folder in Visual Studio (as a CMake project) and build it. On Linux, you can use CMake with your build generator of choice. 21 | 22 | To run the program, the command line should contain paths to .obj files or a .gltf scene; you can use kitten.obj from the data/ folder for testing. 23 | 24 | To use the Amazon Lumberyard Bistro scene, clone https://github.com/zeux/niagara_bistro and specify the path to bistro.gltf instead. 25 | 26 | # Stream 27 | 28 | The development of this project is streamed on YouTube, on Saturdays at 11 AM PST with a somewhat irregular schedule. 29 | 30 | Playlist: https://www.youtube.com/playlist?list=PL0JVLUVCkk-l7CWCn3-cdftR0oajugYvd 31 | 32 | 1. Setting up instance/device and filling the screen with a solid color: https://youtu.be/BR2my8OE1Sc 33 | 2. Rendering a triangle on screen: https://youtu.be/5eS3gsL_P-c 34 | 3. Cleaning up validation errors and implementing swapchain resize: https://youtu.be/_VU-G5rglnA 35 | 4. Rendering a mesh using shader storage buffers and int8: https://youtu.be/nKCzD5iK71M 36 | 5. Rendering a mesh using NVidia RTX mesh shading pipeline: https://youtu.be/gbeOKMjmQ-g 37 | 6. Optimizing GPU time by using device-local memory and parallelizing mesh shader: https://youtu.be/ayKoqK3kQ9c 38 | 7. Using descriptor update templates and parsing SPIRV to extract reflection data: https://youtu.be/3Py4GlWAicY 39 | 8. Cluster cone culling using task shaders and subgroup ops: https://youtu.be/KckRq7Rm3Mw 40 | 9. Tuning mesh shading pipeline for performance: https://youtu.be/snZkA4D_qjU 41 | 10. Depth buffer, perspective projection, 3D transforms and multi draw indirect: https://youtu.be/y4WOsAaXLh0 42 | 11. Multiple meshes and GPU frustum culling: https://youtu.be/NGGzk4Fi2iU 43 | 12. Draw call compaction using KHR_draw_indirect_count and LOD support: https://youtu.be/IYRgDcnJJ2I 44 | 13. Depth pyramid construction and extending SPIRV reflection parser: https://youtu.be/YCteLdYdZWQ 45 | 14. Automatic occlusion culling: https://youtu.be/Fj1E1A4CPCM 46 | 15. Vulkan 1.2 and GPU buffer pointers: https://youtu.be/78tVIA6nRQg 47 | 16. Upgrading to Vulkan 1.3: https://youtu.be/Ka30T6BMdhI 48 | 17. Implementing triangle culling: https://youtu.be/JKTfAgv3Vlo 49 | 18. Meshlet occlusion culling: https://youtu.be/5sBpo5wKmEM 50 | 19. Optimizing culling: https://youtu.be/1Tj6bZvZMts 51 | 20. Task command submission: https://youtu.be/eYvGruGHhUE 52 | 21. Cluster compute culling: https://youtu.be/zROUBE5pLuI 53 | 22. Loading glTF scenes: https://youtu.be/9OF6k57orXo 54 | 23. Bindless textures: https://youtu.be/n9nqSEyXMeA 55 | 24. Tracing rays: https://youtu.be/N1OVfBEcyb8 56 | 25. Tracing rays faster: https://youtu.be/U7TGQsjT16E 57 | 26. Materials and shadows: https://youtu.be/iZTUjRntMbM 58 | 27. Transparent shadows: https://youtu.be/233jxF7irmE 59 | 28. Moving objects: https://youtu.be/TcuUz1ib35c 60 | 29. Performance nsights: https://youtu.be/qlxrRyRdOcY 61 | 30. Simplifying synchronization: https://youtu.be/0rqWe1M2HiE 62 | 31. Cooking geometry: https://youtu.be/d04h0sZPwFU 63 | 64 | # Issues 65 | 66 | During the streams we find various bugs in parts of the Vulkan stack and report them; bugs marked with ✔️ have been fixed.
67 | 68 | * ✔️ vkAcquireNextImageKHR crashes in validation layers if vkGetSwapchainImagesKHR hasn't been called \ 69 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/358 70 | 71 | * ✔️ vkGetPhysicalDeviceSurfaceFormatsKHR doesn't fill format count correctly \ 72 | https://software.intel.com/en-us/forums/graphics-driver-bug-reporting/topic/797666 73 | 74 | * ✔️ Fix NonWritable check when vertexPipelineStoresAndAtomics not enabled \ 75 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/73 76 | 77 | * ✔️ Implicit int8->float cast adds Int8 capability to the shader without asking for GL_KHX_shader_explicit_arithmetic_types \ 78 | https://github.com/KhronosGroup/glslang/issues/1525 79 | 80 | * ⁉ vkCreateSwapchainKHR crashes in Intel drivers when display is plugged into a dedicated GPU \ 81 | https://software.intel.com/en-us/forums/graphics-driver-bug-reporting/topic/797756 82 | 83 | * ✔️ Reading uint8_t from storage buffers adds (unnecessarily) UniformAndStorageBuffer8BitAccess capability \ 84 | https://github.com/KhronosGroup/glslang/issues/1539 85 | 86 | * ✔️ Binding a buffer with VK_BUFFER_USAGE_VERTEX_BUFFER_BIT as a storage buffer using push descriptors doesn't produce validation errors \ 87 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/413 88 | 89 | * ✔️ Fragment shader with perprimitiveNV doesn't have OpExtension SPV_NV_mesh_shader \ 90 | https://github.com/KhronosGroup/glslang/issues/1541 91 | 92 | * ✔️ GL_NV_mesh_shader spec typo for per-primitive fragment shader inputs \ 93 | https://github.com/KhronosGroup/GLSL/issues/31 94 | 95 | * ✔️ Push descriptors generate false positive DescriptorSetNotBound errors \ 96 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/341 97 | 98 | * ✔️ vkCmdDrawIndexedIndirect doesn't issue an error when the buffer wasn't created with VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT \ 99 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/450 100 | 101 | * ✔️ vkCmdDrawMeshTasksIndirectNV doesn't trigger an error when multiDrawIndirect feature is disabled \ 102 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/451 103 | 104 | * ✔️ vkCmdDrawIndexedIndirect is rendering fewer than drawCount draw calls on Intel \ 105 | Reproduce using https://github.com/zeux/niagara/commit/fda3d8743c933fb3a533fed560a6671402d6693b 106 | 107 | * ✔️ vkCmdDrawIndexedIndirectCountKHR is causing a GPU crash on Intel \ 108 | Reproduce using https://github.com/zeux/niagara/commit/c22c2c56d06249835a474e370ea3218463721f42 109 | 110 | * ✔️ Crash during Vulkan replay in push descriptor replay \ 111 | https://github.com/baldurk/renderdoc/issues/1182 112 | 113 | * ✔️ NVidia GTX 10xx series GPUs cause VK_ERROR_DEVICE_LOST when drawCount is 1'000'000 \ 114 | Reproduce using https://github.com/zeux/niagara/commit/8d69552aede9c429765c8c8afd6687d3f3e53475 115 | 116 | * ✔️ AMD drivers 18.11.2 on Windows don't handle specialization constants correctly \ 117 | Using specialization constant LATE in drawcull.comp.glsl leads to no objects being rendered on screen after the first frame 118 | 119 | * ✔️ During validation of pipelines with SPIRV 1.4/1.5 and specialization constants, optimizer isn't configured to use Vulkan 1.2 \ 120 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/1512 121 | 122 | * ✔️ Crash when calling vkCmdDrawIndexedIndirectCount loaded through GIPA \ 123 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/1513 124 | 125 | * ✔️ SHADER_MODULE_STATE::has_specialization_constants is 
not initialized \ 126 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/1530 127 | 128 | * ✔️ Missing validation: push descriptor updates don't trigger image layout mismatch errors \ 129 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/1862 130 | 131 | * ✔️ A valid interface block in mesh/task shader is considered invalid \ 132 | https://github.com/KhronosGroup/SPIRV-Tools/issues/3653 133 | 134 | * ✔️ Usage of any fields of gl_MeshPrimitivesEXT is enabling capability FragmentShadingRateKHR even if gl_PrimitiveShadingRateEXT is not used \ 135 | https://github.com/KhronosGroup/glslang/issues/3103 136 | 137 | * ✔️ Incomplete mip data is encoded for non-power-of-two textures \ 138 | https://github.com/wolfpld/etcpak/pull/43 139 | 140 | * ✔️ radv should use pointer flags on RDNA3 during BVH traversal \ 141 | https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32417 142 | 143 | * radv: VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR is not supported for TLAS \ 144 | https://gitlab.freedesktop.org/mesa/mesa/-/issues/12346 145 | 146 | * ✔️ Missing synchronization validation for ray tracing acceleration updates & uses \ 147 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/9076 148 | 149 | * ✔️ anv: Mesh shaders with two OpSetMeshOutputsEXT instructions are not supported \ 150 | https://gitlab.freedesktop.org/mesa/mesa/-/issues/12388 151 | 152 | * vkCmdBuildClusterAccelerationStructureIndirectNV(): pCommandInfos->srcInfosCount may be zero \ 153 | https://github.com/KhronosGroup/Vulkan-Docs/issues/2606 154 | 155 | * ✔️ vkCmdBuildClusterAccelerationStructureIndirectNV(): pCommandInfos->srcInfosArray.stride may be zero \ 156 | https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/10975 157 | -------------------------------------------------------------------------------- /src/scenecache.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "scene.h" 3 | #include "config.h" 4 | 5 | #include "fileutils.h" 6 | 7 | #include "meshoptimizer.h" 8 | 9 | #include <stdio.h> 10 | #include <string.h> 11 | 12 | const uint32_t kSceneCacheMagic = 0x434E4353; // 'SCNC' 13 | const uint32_t kSceneCacheVersion = 2; 14 | 15 | struct SceneHeader 16 | { 17 | uint32_t magic; 18 | uint32_t version; 19 | 20 | uint32_t meshletMaxVertices; 21 | uint32_t meshletMaxTriangles; 22 | 23 | bool clrtMode; 24 | bool compressed; 25 | 26 | uint32_t compressedVertexBytes; 27 | uint32_t compressedIndexBytes; 28 | uint32_t compressedMeshletVtx0Bytes; 29 | 30 | uint32_t vertexCount; 31 | uint32_t indexCount; 32 | uint32_t meshletCount; 33 | uint32_t meshletdataCount; 34 | uint32_t meshletvtx0Count; 35 | uint32_t meshCount; 36 | 37 | uint32_t materialCount; 38 | uint32_t drawCount; 39 | uint32_t texturePathCount; 40 | 41 | Camera camera; 42 | vec3 sunDirection; 43 | }; 44 | 45 | static size_t writeVertexCompressed(const void* vertices, size_t stride, size_t count, FILE* file, int level = 2) 46 | { 47 | size_t bound = meshopt_encodeVertexBufferBound(count, stride); 48 | std::vector<unsigned char> buf(bound); 49 | buf.resize(meshopt_encodeVertexBufferLevel(buf.data(), buf.size(), vertices, count, stride, level)); 50 | 51 | fwrite(buf.data(), 1, buf.size(), file); 52 | return buf.size(); 53 | } 54 | 55 | static size_t writeIndexCompressed(const uint32_t* indices, size_t count, FILE* file) 56 | { 57 | size_t bound = meshopt_encodeIndexBufferBound(count, ~0u); // TODO: vertex_count could be optional somehow 58 | std::vector<unsigned char> buf(bound); 59 |
buf.resize(meshopt_encodeIndexBuffer(buf.data(), buf.size(), indices, count)); 60 | 61 | fwrite(buf.data(), 1, buf.size(), file); 62 | return buf.size(); 63 | } 64 | 65 | bool saveSceneCache(const char* path, const Geometry& geometry, const std::vector<Material>& materials, const std::vector<MeshDraw>& draws, const std::vector<std::string>& texturePaths, const Camera& camera, const vec3& sunDirection, bool clrtMode, bool compressed, bool verbose) 66 | { 67 | FILE* file = fopen(path, "wb"); 68 | if (!file) 69 | return false; 70 | 71 | SceneHeader header; 72 | memset(&header, 0, sizeof(header)); 73 | 74 | header.magic = kSceneCacheMagic; 75 | header.version = kSceneCacheVersion; 76 | 77 | header.meshletMaxVertices = MESH_MAXVTX; 78 | header.meshletMaxTriangles = MESH_MAXTRI; 79 | header.clrtMode = clrtMode; 80 | header.compressed = compressed; 81 | 82 | header.vertexCount = geometry.vertices.size(); 83 | header.indexCount = geometry.indices.size(); 84 | header.meshletCount = geometry.meshlets.size(); 85 | header.meshletdataCount = geometry.meshletdata.size(); 86 | header.meshletvtx0Count = geometry.meshletvtx0.size(); 87 | header.meshCount = geometry.meshes.size(); 88 | header.materialCount = materials.size(); 89 | header.drawCount = draws.size(); 90 | header.texturePathCount = texturePaths.size(); 91 | 92 | header.camera = camera; 93 | header.sunDirection = sunDirection; 94 | 95 | fwrite(&header, sizeof(header), 1, file); 96 | 97 | if (compressed) 98 | header.compressedVertexBytes = writeVertexCompressed(geometry.vertices.data(), sizeof(Vertex), geometry.vertices.size(), file); 99 | else 100 | fwrite(geometry.vertices.data(), sizeof(Vertex), geometry.vertices.size(), file); 101 | 102 | if (compressed) 103 | header.compressedIndexBytes = writeIndexCompressed(geometry.indices.data(), geometry.indices.size(), file); 104 | else 105 | fwrite(geometry.indices.data(), sizeof(uint32_t), geometry.indices.size(), file); 106 | 107 | fwrite(geometry.meshlets.data(), sizeof(Meshlet), geometry.meshlets.size(), file); 108 | fwrite(geometry.meshletdata.data(), sizeof(uint32_t), geometry.meshletdata.size(), file); 109 | 110 | if (compressed) 111 | header.compressedMeshletVtx0Bytes = writeVertexCompressed(geometry.meshletvtx0.data(), sizeof(uint16_t) * 4, geometry.meshletvtx0.size() / 4, file); 112 | else 113 | fwrite(geometry.meshletvtx0.data(), sizeof(uint16_t), geometry.meshletvtx0.size(), file); 114 | 115 | fwrite(geometry.meshes.data(), sizeof(Mesh), geometry.meshes.size(), file); 116 | fwrite(materials.data(), sizeof(Material), materials.size(), file); 117 | fwrite(draws.data(), sizeof(MeshDraw), draws.size(), file); 118 | 119 | for (const std::string& path : texturePaths) 120 | { 121 | char buf[128] = {}; 122 | strncpy(buf, path.c_str(), sizeof(buf) - 1); 123 | fwrite(buf, sizeof(buf), 1, file); 124 | } 125 | 126 | // fixup final header 127 | fseek(file, 0, SEEK_SET); 128 | fwrite(&header, sizeof(header), 1, file); 129 | 130 | fclose(file); 131 | 132 | if (verbose) 133 | { 134 | printf("Scene cache saved to %s\n", path); 135 | 136 | if (compressed) 137 | printf("Vertex data: %.2f MB (%.2f MB compressed)\n", double(geometry.vertices.size() * sizeof(Vertex)) / 1e6, double(header.compressedVertexBytes) / 1e6); 138 | else 139 | printf("Vertex data: %.2f MB\n", double(geometry.vertices.size() * sizeof(Vertex)) / 1e6); 140 | 141 | if (compressed) 142 | printf("Index data: %.2f MB (%.2f MB compressed)\n", double(geometry.indices.size() * sizeof(uint32_t)) / 1e6, double(header.compressedIndexBytes) / 1e6); 143 | else 144 | printf("Index
data: %.2f MB\n", double(geometry.indices.size() * sizeof(uint32_t)) / 1e6); 145 | 146 | printf("Meshlet data: %.2f MB\n", double(geometry.meshlets.size() * sizeof(Meshlet) + geometry.meshletdata.size() * sizeof(uint32_t)) / 1e6); 147 | 148 | if (compressed) 149 | printf("Meshlet RT data: %.2f MB (%.2f MB compressed)\n", double(geometry.meshletvtx0.size() * sizeof(uint16_t)) / 1e6, double(header.compressedMeshletVtx0Bytes) / 1e6); 150 | else 151 | printf("Meshlet RT data: %.2f MB\n", double(geometry.meshletvtx0.size() * sizeof(uint16_t)) / 1e6); 152 | } 153 | 154 | return true; 155 | } 156 | 157 | static void read(void* data, size_t size, size_t count, void* fileMemory, size_t& fileOffset) 158 | { 159 | memcpy(data, (char*)fileMemory + fileOffset, size * count); 160 | fileOffset += size * count; 161 | } 162 | 163 | static void readVertexCompressed(void* data, size_t size, size_t count, size_t compressedBytes, void* fileMemory, size_t& fileOffset) 164 | { 165 | meshopt_decodeVertexBuffer(data, count, size, (unsigned char*)fileMemory + fileOffset, compressedBytes); 166 | fileOffset += compressedBytes; 167 | } 168 | 169 | static void readIndexCompressed(unsigned int* data, size_t count, size_t compressedBytes, void* fileMemory, size_t& fileOffset) 170 | { 171 | meshopt_decodeIndexBuffer(data, count, (unsigned char*)fileMemory + fileOffset, compressedBytes); 172 | fileOffset += compressedBytes; 173 | } 174 | 175 | bool loadSceneCache(const char* path, Geometry& geometry, std::vector<Material>& materials, std::vector<MeshDraw>& draws, std::vector<std::string>& texturePaths, Camera& camera, vec3& sunDirection, bool clrtMode) 176 | { 177 | size_t fileSize; 178 | void* file = mmapFile(path, &fileSize); 179 | if (!file || fileSize < sizeof(SceneHeader)) 180 | return false; 181 | 182 | SceneHeader header = {}; 183 | memcpy(&header, file, sizeof(header)); 184 | 185 | if (header.magic != kSceneCacheMagic || header.version != kSceneCacheVersion || 186 | header.meshletMaxVertices != MESH_MAXVTX || header.meshletMaxTriangles != MESH_MAXTRI || 187 | header.clrtMode != clrtMode) 188 | { 189 | unmapFile(file, fileSize); 190 | return false; 191 | } 192 | 193 | size_t fileOffset = sizeof(header); 194 | 195 | geometry.vertices.resize(header.vertexCount); 196 | geometry.indices.resize(header.indexCount); 197 | geometry.meshlets.resize(header.meshletCount); 198 | geometry.meshletdata.resize(header.meshletdataCount); 199 | geometry.meshletvtx0.resize(header.meshletvtx0Count); 200 | geometry.meshes.resize(header.meshCount); 201 | materials.resize(header.materialCount); 202 | draws.resize(header.drawCount); 203 | texturePaths.resize(header.texturePathCount); 204 | 205 | if (header.compressed) 206 | readVertexCompressed(geometry.vertices.data(), sizeof(Vertex), geometry.vertices.size(), header.compressedVertexBytes, file, fileOffset); 207 | else 208 | read(geometry.vertices.data(), sizeof(Vertex), geometry.vertices.size(), file, fileOffset); 209 | 210 | if (header.compressed) 211 | readIndexCompressed(geometry.indices.data(), geometry.indices.size(), header.compressedIndexBytes, file, fileOffset); 212 | else 213 | read(geometry.indices.data(), sizeof(uint32_t), geometry.indices.size(), file, fileOffset); 214 | 215 | read(geometry.meshlets.data(), sizeof(Meshlet), geometry.meshlets.size(), file, fileOffset); 216 | read(geometry.meshletdata.data(), sizeof(uint32_t), geometry.meshletdata.size(), file, fileOffset); 217 | 218 | if (header.compressed) 219 | readVertexCompressed(geometry.meshletvtx0.data(), sizeof(uint16_t) * 4, 
geometry.meshletvtx0.size() / 4, header.compressedMeshletVtx0Bytes, file, fileOffset); 220 | else 221 | read(geometry.meshletvtx0.data(), sizeof(uint16_t), geometry.meshletvtx0.size(), file, fileOffset); 222 | 223 | read(geometry.meshes.data(), sizeof(Mesh), geometry.meshes.size(), file, fileOffset); 224 | read(materials.data(), sizeof(Material), materials.size(), file, fileOffset); 225 | read(draws.data(), sizeof(MeshDraw), draws.size(), file, fileOffset); 226 | 227 | for (std::string& path : texturePaths) 228 | { 229 | char buf[128] = {}; 230 | read(buf, sizeof(buf), 1, file, fileOffset); 231 | buf[sizeof(buf) - 1] = 0; 232 | 233 | path = buf; 234 | } 235 | 236 | unmapFile(file, fileSize); 237 | 238 | camera = header.camera; 239 | sunDirection = header.sunDirection; 240 | 241 | return true; 242 | } 243 | -------------------------------------------------------------------------------- /src/resources.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "resources.h" 3 | 4 | #include <string.h> 5 | 6 | VkImageMemoryBarrier2 imageBarrier(VkImage image, VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, VkImageLayout oldLayout, VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask, VkImageLayout newLayout, VkImageAspectFlags aspectMask, uint32_t baseMipLevel, uint32_t levelCount) 7 | { 8 | VkImageMemoryBarrier2 result = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2 }; 9 | 10 | result.srcStageMask = srcStageMask; 11 | result.srcAccessMask = srcAccessMask; 12 | result.dstStageMask = dstStageMask; 13 | result.dstAccessMask = dstAccessMask; 14 | result.oldLayout = oldLayout; 15 | result.newLayout = newLayout; 16 | result.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; 17 | result.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; 18 | result.image = image; 19 | result.subresourceRange.aspectMask = aspectMask; 20 | result.subresourceRange.baseMipLevel = baseMipLevel; 21 | result.subresourceRange.levelCount = levelCount; 22 | result.subresourceRange.layerCount = VK_REMAINING_ARRAY_LAYERS; 23 | 24 | return result; 25 | } 26 | 27 | VkBufferMemoryBarrier2 bufferBarrier(VkBuffer buffer, VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask) 28 | { 29 | VkBufferMemoryBarrier2 result = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2 }; 30 | 31 | result.srcStageMask = srcStageMask; 32 | result.srcAccessMask = srcAccessMask; 33 | result.dstStageMask = dstStageMask; 34 | result.dstAccessMask = dstAccessMask; 35 | result.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; 36 | result.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; 37 | result.buffer = buffer; 38 | result.offset = 0; 39 | result.size = VK_WHOLE_SIZE; 40 | 41 | return result; 42 | } 43 | 44 | void pipelineBarrier(VkCommandBuffer commandBuffer, VkDependencyFlags dependencyFlags, size_t bufferBarrierCount, const VkBufferMemoryBarrier2* bufferBarriers, size_t imageBarrierCount, const VkImageMemoryBarrier2* imageBarriers) 45 | { 46 | VkDependencyInfo dependencyInfo = { VK_STRUCTURE_TYPE_DEPENDENCY_INFO }; 47 | dependencyInfo.dependencyFlags = dependencyFlags; 48 | dependencyInfo.bufferMemoryBarrierCount = unsigned(bufferBarrierCount); 49 | dependencyInfo.pBufferMemoryBarriers = bufferBarriers; 50 | dependencyInfo.imageMemoryBarrierCount = unsigned(imageBarrierCount); 51 | dependencyInfo.pImageMemoryBarriers = imageBarriers; 52 | 53 | vkCmdPipelineBarrier2(commandBuffer, &dependencyInfo); 54 | } 55 | 56 | 
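// Usage sketch (illustrative, not part of this file): the helpers above compose into a
// single vkCmdPipelineBarrier2 submission. For example, transitioning an image from a
// transfer write to a compute-shader read could look like this; the variable names here
// are hypothetical:
//
//   VkImageMemoryBarrier2 toRead = imageBarrier(image,
//       VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
//       VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_READ_BIT, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
//       VK_IMAGE_ASPECT_COLOR_BIT, 0, VK_REMAINING_MIP_LEVELS);
//   pipelineBarrier(commandBuffer, 0, 0, nullptr, 1, &toRead);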
void invalidateBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stageMask, std::initializer_list<VkImage> colorImages, std::initializer_list<VkImage> depthImages) 57 | { 58 | VkImageMemoryBarrier2 imageBarriers[32]; 59 | assert(colorImages.size() + depthImages.size() <= sizeof(imageBarriers) / sizeof(imageBarriers[0])); 60 | 61 | VkAccessFlags2 accessFlags = VK_ACCESS_2_MEMORY_READ_BIT | VK_ACCESS_2_MEMORY_WRITE_BIT; 62 | 63 | size_t imageBarrierCount = 0; 64 | for (VkImage image : colorImages) 65 | imageBarriers[imageBarrierCount++] = imageBarrier(image, stageMask, 0, VK_IMAGE_LAYOUT_UNDEFINED, stageMask, accessFlags, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT); 66 | for (VkImage image : depthImages) 67 | imageBarriers[imageBarrierCount++] = imageBarrier(image, stageMask, 0, VK_IMAGE_LAYOUT_UNDEFINED, stageMask, accessFlags, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_DEPTH_BIT); 68 | 69 | VkDependencyInfo dependencyInfo = { VK_STRUCTURE_TYPE_DEPENDENCY_INFO }; 70 | dependencyInfo.imageMemoryBarrierCount = unsigned(imageBarrierCount); 71 | dependencyInfo.pImageMemoryBarriers = imageBarriers; 72 | 73 | vkCmdPipelineBarrier2(commandBuffer, &dependencyInfo); 74 | } 75 | 76 | void stageBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask, VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask) 77 | { 78 | VkMemoryBarrier2 memoryBarrier = { VK_STRUCTURE_TYPE_MEMORY_BARRIER_2 }; 79 | memoryBarrier.srcStageMask = srcStageMask; 80 | memoryBarrier.srcAccessMask = srcAccessMask; 81 | memoryBarrier.dstStageMask = dstStageMask; 82 | memoryBarrier.dstAccessMask = dstAccessMask; 83 | 84 | VkDependencyInfo dependencyInfo = { VK_STRUCTURE_TYPE_DEPENDENCY_INFO }; 85 | dependencyInfo.memoryBarrierCount = 1; 86 | dependencyInfo.pMemoryBarriers = &memoryBarrier; 87 | 88 | vkCmdPipelineBarrier2(commandBuffer, &dependencyInfo); 89 | } 90 | 91 | void stageBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 srcStageMask, VkPipelineStageFlags2 dstStageMask) 92 | { 93 | VkAccessFlags2 accessFlags = VK_ACCESS_2_MEMORY_READ_BIT | VK_ACCESS_2_MEMORY_WRITE_BIT; 94 | stageBarrier(commandBuffer, srcStageMask, accessFlags, dstStageMask, accessFlags); 95 | } 96 | 97 | void stageBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stageMask) 98 | { 99 | stageBarrier(commandBuffer, stageMask, stageMask); 100 | } 101 | 102 | static uint32_t selectMemoryType(const VkPhysicalDeviceMemoryProperties& memoryProperties, uint32_t memoryTypeBits, VkMemoryPropertyFlags flags) 103 | { 104 | for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; ++i) 105 | if ((memoryTypeBits & (1 << i)) != 0 && (memoryProperties.memoryTypes[i].propertyFlags & flags) == flags) 106 | return i; 107 | 108 | assert(!"No compatible memory type found"); 109 | return ~0u; 110 | } 111 | 112 | void createBuffer(Buffer& result, VkDevice device, const VkPhysicalDeviceMemoryProperties& memoryProperties, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags memoryFlags) 113 | { 114 | VkBufferCreateInfo createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; 115 | createInfo.size = size; 116 | createInfo.usage = usage; 117 | 118 | VkBuffer buffer = 0; 119 | VK_CHECK(vkCreateBuffer(device, &createInfo, 0, &buffer)); 120 | 121 | VkMemoryRequirements memoryRequirements; 122 | vkGetBufferMemoryRequirements(device, buffer, &memoryRequirements); 123 | 124 | uint32_t memoryTypeIndex = selectMemoryType(memoryProperties, memoryRequirements.memoryTypeBits, memoryFlags); 125 | 
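// selectMemoryType scans memoryProperties.memoryTypes for the first index that is both
// allowed by the memoryTypeBits mask reported by vkGetBufferMemoryRequirements and carries
// every requested property flag. E.g. if memoryTypeBits is 0b0110, only types 1 and 2 are
// candidates, and the first of them whose propertyFlags contain memoryFlags wins.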
assert(memoryTypeIndex != ~0u); 126 | 127 | VkMemoryAllocateInfo allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; 128 | allocateInfo.allocationSize = memoryRequirements.size; 129 | allocateInfo.memoryTypeIndex = memoryTypeIndex; 130 | 131 | VkMemoryAllocateFlagsInfo flagInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO }; 132 | 133 | if (usage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT) 134 | { 135 | allocateInfo.pNext = &flagInfo; 136 | flagInfo.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; 137 | flagInfo.deviceMask = 1; 138 | } 139 | 140 | VkDeviceMemory memory = 0; 141 | VK_CHECK(vkAllocateMemory(device, &allocateInfo, 0, &memory)); 142 | 143 | VK_CHECK(vkBindBufferMemory(device, buffer, memory, 0)); 144 | 145 | void* data = 0; 146 | if (memoryFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) 147 | VK_CHECK(vkMapMemory(device, memory, 0, size, 0, &data)); 148 | 149 | result.buffer = buffer; 150 | result.memory = memory; 151 | result.data = data; 152 | result.size = size; 153 | } 154 | 155 | void uploadBuffer(VkDevice device, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const Buffer& buffer, const Buffer& scratch, const void* data, size_t size) 156 | { 157 | // TODO: this function is submitting a command buffer and waiting for device idle for each buffer upload; this is obviously suboptimal and we'd need to batch this later 158 | assert(size > 0); 159 | assert(scratch.data); 160 | assert(scratch.size >= size); 161 | memcpy(scratch.data, data, size); 162 | 163 | VK_CHECK(vkResetCommandPool(device, commandPool, 0)); 164 | 165 | VkCommandBufferBeginInfo beginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; 166 | beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; 167 | 168 | VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); 169 | 170 | VkBufferCopy region = { 0, 0, VkDeviceSize(size) }; 171 | vkCmdCopyBuffer(commandBuffer, scratch.buffer, buffer.buffer, 1, &region); 172 | 173 | VK_CHECK(vkEndCommandBuffer(commandBuffer)); 174 | 175 | VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; 176 | submitInfo.commandBufferCount = 1; 177 | submitInfo.pCommandBuffers = &commandBuffer; 178 | 179 | VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE)); 180 | 181 | VK_CHECK(vkDeviceWaitIdle(device)); 182 | } 183 | 184 | void destroyBuffer(const Buffer& buffer, VkDevice device) 185 | { 186 | vkDestroyBuffer(device, buffer.buffer, 0); 187 | vkFreeMemory(device, buffer.memory, 0); 188 | } 189 | 190 | VkDeviceAddress getBufferAddress(const Buffer& buffer, VkDevice device) 191 | { 192 | VkBufferDeviceAddressInfo info = { VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO }; 193 | info.buffer = buffer.buffer; 194 | 195 | VkDeviceAddress address = vkGetBufferDeviceAddress(device, &info); 196 | assert(address != 0); 197 | 198 | return address; 199 | } 200 | 201 | VkImageView createImageView(VkDevice device, VkImage image, VkFormat format, uint32_t mipLevel, uint32_t levelCount) 202 | { 203 | VkImageAspectFlags aspectMask = (format == VK_FORMAT_D32_SFLOAT) ? 
VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; 204 | 205 | VkImageViewCreateInfo createInfo = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO }; 206 | createInfo.image = image; 207 | createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; 208 | createInfo.format = format; 209 | createInfo.subresourceRange.aspectMask = aspectMask; 210 | createInfo.subresourceRange.baseMipLevel = mipLevel; 211 | createInfo.subresourceRange.levelCount = levelCount; 212 | createInfo.subresourceRange.layerCount = 1; 213 | 214 | VkImageView view = 0; 215 | VK_CHECK(vkCreateImageView(device, &createInfo, 0, &view)); 216 | 217 | return view; 218 | } 219 | 220 | void createImage(Image& result, VkDevice device, const VkPhysicalDeviceMemoryProperties& memoryProperties, uint32_t width, uint32_t height, uint32_t mipLevels, VkFormat format, VkImageUsageFlags usage) 221 | { 222 | VkImageCreateInfo createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; 223 | 224 | createInfo.imageType = VK_IMAGE_TYPE_2D; 225 | createInfo.format = format; 226 | createInfo.extent = { width, height, 1 }; 227 | createInfo.mipLevels = mipLevels; 228 | createInfo.arrayLayers = 1; 229 | createInfo.samples = VK_SAMPLE_COUNT_1_BIT; 230 | createInfo.tiling = VK_IMAGE_TILING_OPTIMAL; 231 | createInfo.usage = usage; 232 | createInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; 233 | 234 | VkImage image = 0; 235 | VK_CHECK(vkCreateImage(device, &createInfo, 0, &image)); 236 | 237 | VkMemoryRequirements memoryRequirements; 238 | vkGetImageMemoryRequirements(device, image, &memoryRequirements); 239 | 240 | uint32_t memoryTypeIndex = selectMemoryType(memoryProperties, memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 241 | assert(memoryTypeIndex != ~0u); 242 | 243 | VkMemoryAllocateInfo allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; 244 | allocateInfo.allocationSize = memoryRequirements.size; 245 | allocateInfo.memoryTypeIndex = memoryTypeIndex; 246 | 247 | VkDeviceMemory memory = 0; 248 | VK_CHECK(vkAllocateMemory(device, &allocateInfo, 0, &memory)); 249 | 250 | VK_CHECK(vkBindImageMemory(device, image, memory, 0)); 251 | 252 | result.image = image; 253 | result.imageView = createImageView(device, image, format, 0, mipLevels); 254 | result.memory = memory; 255 | } 256 | 257 | void destroyImage(const Image& image, VkDevice device) 258 | { 259 | vkDestroyImageView(device, image.imageView, 0); 260 | vkDestroyImage(device, image.image, 0); 261 | vkFreeMemory(device, image.memory, 0); 262 | } 263 | 264 | uint32_t getImageMipLevels(uint32_t width, uint32_t height) 265 | { 266 | uint32_t result = 1; 267 | 268 | while (width > 1 || height > 1) 269 | { 270 | result++; 271 | width /= 2; 272 | height /= 2; 273 | } 274 | 275 | return result; 276 | } 277 | 278 | VkSampler createSampler(VkDevice device, VkFilter filter, VkSamplerMipmapMode mipmapMode, VkSamplerAddressMode addressMode, VkSamplerReductionModeEXT reductionMode) 279 | { 280 | VkSamplerCreateInfo createInfo = { VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO }; 281 | 282 | createInfo.magFilter = filter; 283 | createInfo.minFilter = filter; 284 | createInfo.mipmapMode = mipmapMode; 285 | createInfo.addressModeU = addressMode; 286 | createInfo.addressModeV = addressMode; 287 | createInfo.addressModeW = addressMode; 288 | createInfo.minLod = 0; 289 | createInfo.maxLod = 16.f; 290 | createInfo.anisotropyEnable = mipmapMode == VK_SAMPLER_MIPMAP_MODE_LINEAR; 291 | createInfo.maxAnisotropy = mipmapMode == VK_SAMPLER_MIPMAP_MODE_LINEAR ? 
4.f : 1.f; 292 | 293 | VkSamplerReductionModeCreateInfoEXT createInfoReduction = { VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT }; 294 | 295 | if (reductionMode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT) 296 | { 297 | createInfoReduction.reductionMode = reductionMode; 298 | 299 | createInfo.pNext = &createInfoReduction; 300 | } 301 | 302 | VkSampler sampler = 0; 303 | VK_CHECK(vkCreateSampler(device, &createInfo, 0, &sampler)); 304 | return sampler; 305 | } 306 | -------------------------------------------------------------------------------- /src/device.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "device.h" 3 | 4 | #include "config.h" 5 | #include "swapchain.h" 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | 11 | // Validation is enabled by default in Debug 12 | #ifndef NDEBUG 13 | #define KHR_VALIDATION 1 14 | #else 15 | #define KHR_VALIDATION CONFIG_RELVAL 16 | #endif 17 | 18 | // Synchronization validation is disabled by default in Debug since it's rather slow 19 | #define SYNC_VALIDATION CONFIG_SYNCVAL 20 | 21 | // We have a strict requirement for latest Vulkan version to be available 22 | #define API_VERSION VK_API_VERSION_1_4 23 | 24 | #ifdef _WIN32 25 | #include <windows.h> 26 | #endif 27 | 28 | static bool isLayerSupported(const char* name) 29 | { 30 | uint32_t propertyCount = 0; 31 | VK_CHECK(vkEnumerateInstanceLayerProperties(&propertyCount, 0)); 32 | 33 | std::vector<VkLayerProperties> properties(propertyCount); 34 | VK_CHECK(vkEnumerateInstanceLayerProperties(&propertyCount, properties.data())); 35 | 36 | for (uint32_t i = 0; i < propertyCount; ++i) 37 | if (strcmp(name, properties[i].layerName) == 0) 38 | return true; 39 | 40 | return false; 41 | } 42 | 43 | bool isInstanceExtensionSupported(const char* name) 44 | { 45 | uint32_t propertyCount = 0; 46 | VK_CHECK(vkEnumerateInstanceExtensionProperties(NULL, &propertyCount, 0)); 47 | 48 | std::vector<VkExtensionProperties> properties(propertyCount); 49 | VK_CHECK(vkEnumerateInstanceExtensionProperties(NULL, &propertyCount, properties.data())); 50 | 51 | for (uint32_t i = 0; i < propertyCount; ++i) 52 | if (strcmp(name, properties[i].extensionName) == 0) 53 | return true; 54 | 55 | return false; 56 | } 57 | 58 | VkInstance createInstance() 59 | { 60 | if (volkGetInstanceVersion() < API_VERSION) 61 | { 62 | fprintf(stderr, "ERROR: Vulkan 1.%d instance not found\n", VK_VERSION_MINOR(API_VERSION)); 63 | return 0; 64 | } 65 | 66 | VkApplicationInfo appInfo = { VK_STRUCTURE_TYPE_APPLICATION_INFO }; 67 | appInfo.apiVersion = API_VERSION; 68 | 69 | VkInstanceCreateInfo createInfo = { VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO }; 70 | createInfo.pApplicationInfo = &appInfo; 71 | 72 | #if KHR_VALIDATION || SYNC_VALIDATION 73 | const char* debugLayers[] = { 74 | "VK_LAYER_KHRONOS_validation", 75 | }; 76 | 77 | if (isLayerSupported("VK_LAYER_KHRONOS_validation")) 78 | { 79 | createInfo.ppEnabledLayerNames = debugLayers; 80 | createInfo.enabledLayerCount = sizeof(debugLayers) / sizeof(debugLayers[0]); 81 | printf("Enabled Vulkan validation layers (sync validation %s)\n", SYNC_VALIDATION ? 
"enabled" : "disabled"); 82 | } 83 | else 84 | { 85 | printf("Warning: Vulkan debug layers are not available\n"); 86 | } 87 | 88 | #if SYNC_VALIDATION 89 | VkValidationFeatureEnableEXT enabledValidationFeatures[] = { 90 | VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT, 91 | }; 92 | 93 | VkValidationFeaturesEXT validationFeatures = { VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT }; 94 | validationFeatures.enabledValidationFeatureCount = sizeof(enabledValidationFeatures) / sizeof(enabledValidationFeatures[0]); 95 | validationFeatures.pEnabledValidationFeatures = enabledValidationFeatures; 96 | 97 | createInfo.pNext = &validationFeatures; 98 | #endif 99 | #endif 100 | 101 | std::vector extensions; 102 | 103 | // Query Vulkan instance extensions required by GLFW for creating Vulkan surfaces for GLFW windows. 104 | uint32_t swapchainExtensionCount; 105 | if (const char** swapchainExtensions = getSwapchainExtensions(&swapchainExtensionCount)) 106 | extensions.insert(extensions.end(), swapchainExtensions, swapchainExtensions + swapchainExtensionCount); 107 | 108 | if (isInstanceExtensionSupported(VK_EXT_DEBUG_UTILS_EXTENSION_NAME)) 109 | extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); 110 | 111 | createInfo.ppEnabledExtensionNames = extensions.data(); 112 | createInfo.enabledExtensionCount = extensions.size(); 113 | 114 | #ifdef VK_USE_PLATFORM_METAL_EXT 115 | createInfo.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; 116 | #endif 117 | 118 | VkInstance instance = 0; 119 | VK_CHECK(vkCreateInstance(&createInfo, 0, &instance)); 120 | 121 | return instance; 122 | } 123 | 124 | static VkBool32 VKAPI_CALL debugUtilsCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, VkDebugUtilsMessageTypeFlagsEXT types, const VkDebugUtilsMessengerCallbackDataEXT* callbackData, void* userData) 125 | { 126 | if (severity < VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) 127 | return VK_FALSE; 128 | 129 | // Works around https://github.com/KhronosGroup/Vulkan-Docs/issues/2606 130 | if (strstr(callbackData->pMessage, "vkCmdBuildClusterAccelerationStructureIndirectNV(): pCommandInfos->srcInfosCount is zero")) 131 | return VK_FALSE; 132 | 133 | const char* type = (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) ? 
"ERROR" : "WARNING"; 134 | 135 | char message[4096]; 136 | snprintf(message, COUNTOF(message), "%s: %s\n", type, callbackData->pMessage); 137 | 138 | printf("%s", message); 139 | 140 | #ifdef _WIN32 141 | OutputDebugStringA(message); 142 | #endif 143 | 144 | if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) 145 | assert(!"Validation error encountered!"); 146 | 147 | return VK_FALSE; 148 | } 149 | 150 | VkDebugUtilsMessengerEXT registerDebugCallback(VkInstance instance) 151 | { 152 | if (!vkCreateDebugUtilsMessengerEXT) 153 | return nullptr; 154 | 155 | VkDebugUtilsMessengerCreateInfoEXT createInfo = { VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT }; 156 | createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; 157 | createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; 158 | createInfo.pfnUserCallback = debugUtilsCallback; 159 | 160 | VkDebugUtilsMessengerEXT messenger = 0; 161 | VK_CHECK(vkCreateDebugUtilsMessengerEXT(instance, &createInfo, 0, &messenger)); 162 | 163 | return messenger; 164 | } 165 | 166 | uint32_t getGraphicsFamilyIndex(VkPhysicalDevice physicalDevice) 167 | { 168 | uint32_t queueCount = 0; 169 | vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueCount, 0); 170 | 171 | std::vector queues(queueCount); 172 | vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueCount, queues.data()); 173 | 174 | for (uint32_t i = 0; i < queueCount; ++i) 175 | if (queues[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) 176 | return i; 177 | 178 | return VK_QUEUE_FAMILY_IGNORED; 179 | } 180 | 181 | static bool supportsPresentation(VkPhysicalDevice physicalDevice, uint32_t familyIndex) 182 | { 183 | #if defined(VK_USE_PLATFORM_WIN32_KHR) 184 | return !!vkGetPhysicalDeviceWin32PresentationSupportKHR(physicalDevice, familyIndex); 185 | #else 186 | return true; 187 | #endif 188 | } 189 | 190 | VkPhysicalDevice pickPhysicalDevice(VkPhysicalDevice* physicalDevices, uint32_t physicalDeviceCount) 191 | { 192 | VkPhysicalDevice preferred = 0; 193 | VkPhysicalDevice fallback = 0; 194 | 195 | const char* ngpu = getenv("NGPU"); 196 | 197 | for (uint32_t i = 0; i < physicalDeviceCount; ++i) 198 | { 199 | VkPhysicalDeviceProperties props; 200 | vkGetPhysicalDeviceProperties(physicalDevices[i], &props); 201 | 202 | if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU) 203 | continue; 204 | 205 | printf("GPU%d: %s (Vulkan 1.%d)\n", i, props.deviceName, VK_VERSION_MINOR(props.apiVersion)); 206 | 207 | uint32_t familyIndex = getGraphicsFamilyIndex(physicalDevices[i]); 208 | if (familyIndex == VK_QUEUE_FAMILY_IGNORED) 209 | continue; 210 | 211 | if (!supportsPresentation(physicalDevices[i], familyIndex)) 212 | continue; 213 | 214 | if (props.apiVersion < API_VERSION) 215 | continue; 216 | 217 | if (ngpu && atoi(ngpu) == i) 218 | { 219 | preferred = physicalDevices[i]; 220 | } 221 | 222 | if (!preferred && props.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) 223 | { 224 | preferred = physicalDevices[i]; 225 | } 226 | 227 | if (!fallback) 228 | { 229 | fallback = physicalDevices[i]; 230 | } 231 | } 232 | 233 | VkPhysicalDevice result = preferred ? 
preferred : fallback; 234 | 235 | if (result) 236 | { 237 | VkPhysicalDeviceProperties props; 238 | vkGetPhysicalDeviceProperties(result, &props); 239 | 240 | printf("Selected GPU %s\n", props.deviceName); 241 | } 242 | else 243 | { 244 | fprintf(stderr, "ERROR: No compatible GPU found\n"); 245 | } 246 | 247 | return result; 248 | } 249 | 250 | VkDevice createDevice(VkInstance instance, VkPhysicalDevice physicalDevice, uint32_t familyIndex, bool meshShadingSupported, bool raytracingSupported, bool clusterrtSupported) 251 | { 252 | float queuePriorities[] = { 1.0f }; 253 | 254 | VkDeviceQueueCreateInfo queueInfo = { VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO }; 255 | queueInfo.queueFamilyIndex = familyIndex; 256 | queueInfo.queueCount = 1; 257 | queueInfo.pQueuePriorities = queuePriorities; 258 | 259 | std::vector<const char*> extensions = { 260 | VK_KHR_SWAPCHAIN_EXTENSION_NAME, 261 | }; 262 | 263 | if (meshShadingSupported) 264 | extensions.push_back(VK_EXT_MESH_SHADER_EXTENSION_NAME); 265 | 266 | if (raytracingSupported) 267 | { 268 | extensions.push_back(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME); 269 | extensions.push_back(VK_KHR_RAY_QUERY_EXTENSION_NAME); 270 | extensions.push_back(VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME); 271 | } 272 | 273 | #ifdef VK_NV_cluster_acceleration_structure 274 | if (clusterrtSupported) 275 | extensions.push_back(VK_NV_CLUSTER_ACCELERATION_STRUCTURE_EXTENSION_NAME); 276 | #endif 277 | 278 | VkPhysicalDeviceFeatures2 features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2 }; 279 | features.features.multiDrawIndirect = true; 280 | features.features.pipelineStatisticsQuery = true; 281 | features.features.shaderInt16 = true; 282 | features.features.shaderInt64 = true; 283 | features.features.samplerAnisotropy = true; 284 | 285 | VkPhysicalDeviceVulkan11Features features11 = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES }; 286 | features11.storageBuffer16BitAccess = true; 287 | features11.shaderDrawParameters = true; 288 | 289 | VkPhysicalDeviceVulkan12Features features12 = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES }; 290 | features12.drawIndirectCount = true; 291 | features12.storageBuffer8BitAccess = true; 292 | features12.uniformAndStorageBuffer8BitAccess = true; 293 | features12.shaderFloat16 = true; 294 | features12.shaderInt8 = true; 295 | features12.samplerFilterMinmax = true; 296 | features12.scalarBlockLayout = true; 297 | 298 | if (raytracingSupported) 299 | features12.bufferDeviceAddress = true; 300 | 301 | features12.descriptorIndexing = true; 302 | features12.shaderSampledImageArrayNonUniformIndexing = true; 303 | features12.descriptorBindingSampledImageUpdateAfterBind = true; 304 | features12.descriptorBindingUpdateUnusedWhilePending = true; 305 | features12.descriptorBindingPartiallyBound = true; 306 | features12.descriptorBindingVariableDescriptorCount = true; 307 | features12.runtimeDescriptorArray = true; 308 | 309 | VkPhysicalDeviceVulkan13Features features13 = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES }; 310 | features13.dynamicRendering = true; 311 | features13.synchronization2 = true; 312 | features13.maintenance4 = true; 313 | features13.shaderDemoteToHelperInvocation = true; // required for discard; under new glslang rules 314 | 315 | VkPhysicalDeviceVulkan14Features features14 = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_4_FEATURES }; 316 | features14.maintenance5 = true; 317 | features14.maintenance6 = true; 318 | features14.pushDescriptor = true; 319 | 320 | // This will only be used if 
meshShadingSupported=true (see below) 321 | VkPhysicalDeviceMeshShaderFeaturesEXT featuresMesh = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_EXT }; 322 | featuresMesh.taskShader = true; 323 | featuresMesh.meshShader = true; 324 | 325 | // This will only be used if raytracingSupported=true (see below) 326 | VkPhysicalDeviceRayQueryFeaturesKHR featuresRayQueries = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR }; 327 | featuresRayQueries.rayQuery = true; 328 | 329 | // This will only be used if raytracingSupported=true (see below) 330 | VkPhysicalDeviceAccelerationStructureFeaturesKHR featuresAccelerationStructure = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR }; 331 | featuresAccelerationStructure.accelerationStructure = true; 332 | 333 | // This will only be used if clusterrtSupported=true (see below) 334 | VkPhysicalDeviceClusterAccelerationStructureFeaturesNV featuresClusterAcceleration = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CLUSTER_ACCELERATION_STRUCTURE_FEATURES_NV }; 335 | featuresClusterAcceleration.clusterAccelerationStructure = true; 336 | 337 | VkDeviceCreateInfo createInfo = { VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO }; 338 | createInfo.queueCreateInfoCount = 1; 339 | createInfo.pQueueCreateInfos = &queueInfo; 340 | 341 | createInfo.ppEnabledExtensionNames = extensions.data(); 342 | createInfo.enabledExtensionCount = uint32_t(extensions.size()); 343 | 344 | createInfo.pNext = &features; 345 | features.pNext = &features11; 346 | features11.pNext = &features12; 347 | features12.pNext = &features13; 348 | features13.pNext = &features14; 349 | 350 | void** ppNext = &features14.pNext; 351 | 352 | if (meshShadingSupported) 353 | { 354 | *ppNext = &featuresMesh; 355 | ppNext = &featuresMesh.pNext; 356 | } 357 | 358 | if (raytracingSupported) 359 | { 360 | *ppNext = &featuresRayQueries; 361 | ppNext = &featuresRayQueries.pNext; 362 | 363 | *ppNext = &featuresAccelerationStructure; 364 | ppNext = &featuresAccelerationStructure.pNext; 365 | } 366 | 367 | if (clusterrtSupported) 368 | { 369 | *ppNext = &featuresClusterAcceleration; 370 | ppNext = &featuresClusterAcceleration.pNext; 371 | } 372 | 373 | VkDevice device = 0; 374 | VK_CHECK(vkCreateDevice(physicalDevice, &createInfo, 0, &device)); 375 | 376 | return device; 377 | } 378 | -------------------------------------------------------------------------------- /src/shaders.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "shaders.h" 3 | #include "config.h" 4 | 5 | #include <stdio.h> 6 | #include <string.h> 7 | 8 | #ifdef _WIN32 9 | #include <io.h> 10 | #else 11 | #include <dirent.h> 12 | #endif 13 | 14 | #include <string> 15 | #include <vector> 16 | 17 | #if defined __APPLE__ 18 | #include 19 | #elif defined __linux__ 20 | #include 21 | #elif VK_HEADER_VERSION >= 135 22 | #include 23 | #else 24 | #include 25 | #endif 26 | 27 | // https://www.khronos.org/registry/spir-v/specs/1.0/SPIRV.pdf 28 | struct Id 29 | { 30 | uint32_t opcode; 31 | uint32_t typeId; 32 | uint32_t storageClass; 33 | uint32_t binding; 34 | uint32_t set; 35 | uint32_t constant; 36 | }; 37 | 38 | static VkShaderStageFlagBits getShaderStage(SpvExecutionModel executionModel) 39 | { 40 | switch (executionModel) 41 | { 42 | case SpvExecutionModelVertex: 43 | return VK_SHADER_STAGE_VERTEX_BIT; 44 | case SpvExecutionModelGeometry: 45 | return VK_SHADER_STAGE_GEOMETRY_BIT; 46 | case SpvExecutionModelFragment: 47 | return VK_SHADER_STAGE_FRAGMENT_BIT; 48 | case SpvExecutionModelGLCompute: 49 | return 
VK_SHADER_STAGE_COMPUTE_BIT; 50 | case SpvExecutionModelTaskEXT: 51 | return VK_SHADER_STAGE_TASK_BIT_EXT; 52 | case SpvExecutionModelMeshEXT: 53 | return VK_SHADER_STAGE_MESH_BIT_EXT; 54 | 55 | default: 56 | assert(!"Unsupported execution model"); 57 | return VkShaderStageFlagBits(0); 58 | } 59 | } 60 | 61 | static VkDescriptorType getDescriptorType(SpvOp op) 62 | { 63 | switch (op) 64 | { 65 | case SpvOpTypeStruct: 66 | return VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; 67 | case SpvOpTypeImage: 68 | return VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; 69 | case SpvOpTypeSampler: 70 | return VK_DESCRIPTOR_TYPE_SAMPLER; 71 | case SpvOpTypeSampledImage: 72 | return VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; 73 | case SpvOpTypeAccelerationStructureKHR: 74 | return VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR; 75 | default: 76 | assert(!"Unknown resource type"); 77 | return VkDescriptorType(0); 78 | } 79 | } 80 | 81 | static void parseShader(Shader& shader, const uint32_t* code, uint32_t codeSize) 82 | { 83 | assert(code[0] == SpvMagicNumber); 84 | 85 | uint32_t idBound = code[3]; 86 | 87 | std::vector<Id> ids(idBound); 88 | 89 | int localSizeIdX = -1; 90 | int localSizeIdY = -1; 91 | int localSizeIdZ = -1; 92 | 93 | const uint32_t* insn = code + 5; 94 | 95 | while (insn != code + codeSize) 96 | { 97 | uint16_t opcode = uint16_t(insn[0]); 98 | uint16_t wordCount = uint16_t(insn[0] >> 16); 99 | 100 | switch (opcode) 101 | { 102 | case SpvOpEntryPoint: 103 | { 104 | assert(wordCount >= 2); 105 | shader.stage = getShaderStage(SpvExecutionModel(insn[1])); 106 | } 107 | break; 108 | case SpvOpExecutionMode: 109 | { 110 | assert(wordCount >= 3); 111 | uint32_t mode = insn[2]; 112 | 113 | switch (mode) 114 | { 115 | case SpvExecutionModeLocalSize: 116 | assert(wordCount == 6); 117 | shader.localSizeX = insn[3]; 118 | shader.localSizeY = insn[4]; 119 | shader.localSizeZ = insn[5]; 120 | break; 121 | } 122 | } 123 | break; 124 | case SpvOpExecutionModeId: 125 | { 126 | assert(wordCount >= 3); 127 | uint32_t mode = insn[2]; 128 | 129 | switch (mode) 130 | { 131 | case SpvExecutionModeLocalSizeId: 132 | assert(wordCount == 6); 133 | localSizeIdX = int(insn[3]); 134 | localSizeIdY = int(insn[4]); 135 | localSizeIdZ = int(insn[5]); 136 | break; 137 | } 138 | } 139 | break; 140 | case SpvOpDecorate: 141 | { 142 | assert(wordCount >= 3); 143 | 144 | uint32_t id = insn[1]; 145 | assert(id < idBound); 146 | 147 | switch (insn[2]) 148 | { 149 | case SpvDecorationDescriptorSet: 150 | assert(wordCount == 4); 151 | ids[id].set = insn[3]; 152 | break; 153 | case SpvDecorationBinding: 154 | assert(wordCount == 4); 155 | ids[id].binding = insn[3]; 156 | break; 157 | } 158 | } 159 | break; 160 | case SpvOpTypeStruct: 161 | case SpvOpTypeImage: 162 | case SpvOpTypeSampler: 163 | case SpvOpTypeSampledImage: 164 | case SpvOpTypeAccelerationStructureKHR: 165 | { 166 | assert(wordCount >= 2); 167 | 168 | uint32_t id = insn[1]; 169 | assert(id < idBound); 170 | 171 | assert(ids[id].opcode == 0); 172 | ids[id].opcode = opcode; 173 | } 174 | break; 175 | case SpvOpTypePointer: 176 | { 177 | assert(wordCount == 4); 178 | 179 | uint32_t id = insn[1]; 180 | assert(id < idBound); 181 | 182 | assert(ids[id].opcode == 0); 183 | ids[id].opcode = opcode; 184 | ids[id].typeId = insn[3]; 185 | ids[id].storageClass = insn[2]; 186 | } 187 | break; 188 | case SpvOpConstant: 189 | { 190 | assert(wordCount >= 4); // we currently only correctly handle 32-bit integer constants 191 | 192 | uint32_t id = insn[2]; 193 | assert(id < idBound); 194 | 195 | 
assert(ids[id].opcode == 0); 196 | ids[id].opcode = opcode; 197 | ids[id].typeId = insn[1]; 198 | ids[id].constant = insn[3]; // note: this is the value, not the id of the constant 199 | } 200 | break; 201 | case SpvOpVariable: 202 | { 203 | assert(wordCount >= 4); 204 | 205 | uint32_t id = insn[2]; 206 | assert(id < idBound); 207 | 208 | assert(ids[id].opcode == 0); 209 | ids[id].opcode = opcode; 210 | ids[id].typeId = insn[1]; 211 | ids[id].storageClass = insn[3]; 212 | } 213 | break; 214 | } 215 | 216 | assert(insn + wordCount <= code + codeSize); 217 | insn += wordCount; 218 | } 219 | 220 | for (auto& id : ids) 221 | { 222 | // set 0 is reserved for push descriptors 223 | if (id.opcode == SpvOpVariable && (id.storageClass == SpvStorageClassUniform || id.storageClass == SpvStorageClassUniformConstant || id.storageClass == SpvStorageClassStorageBuffer) && id.set == 0) 224 | { 225 | assert(id.binding < 32); 226 | assert(ids[id.typeId].opcode == SpvOpTypePointer); 227 | 228 | uint32_t typeKind = ids[ids[id.typeId].typeId].opcode; 229 | VkDescriptorType resourceType = getDescriptorType(SpvOp(typeKind)); 230 | 231 | assert((shader.resourceMask & (1 << id.binding)) == 0 || shader.resourceTypes[id.binding] == resourceType); 232 | 233 | shader.resourceTypes[id.binding] = resourceType; 234 | shader.resourceMask |= 1 << id.binding; 235 | } 236 | 237 | if (id.opcode == SpvOpVariable && id.storageClass == SpvStorageClassUniformConstant && id.set == 1) 238 | { 239 | shader.usesDescriptorArray = true; 240 | } 241 | 242 | if (id.opcode == SpvOpVariable && id.storageClass == SpvStorageClassPushConstant) 243 | { 244 | shader.usesPushConstants = true; 245 | } 246 | } 247 | 248 | if (shader.stage == VK_SHADER_STAGE_COMPUTE_BIT) 249 | { 250 | if (localSizeIdX >= 0) 251 | { 252 | assert(ids[localSizeIdX].opcode == SpvOpConstant); 253 | shader.localSizeX = ids[localSizeIdX].constant; 254 | } 255 | 256 | if (localSizeIdY >= 0) 257 | { 258 | assert(ids[localSizeIdY].opcode == SpvOpConstant); 259 | shader.localSizeY = ids[localSizeIdY].constant; 260 | } 261 | 262 | if (localSizeIdZ >= 0) 263 | { 264 | assert(ids[localSizeIdZ].opcode == SpvOpConstant); 265 | shader.localSizeZ = ids[localSizeIdZ].constant; 266 | } 267 | 268 | assert(shader.localSizeX && shader.localSizeY && shader.localSizeZ); 269 | } 270 | } 271 | 272 | static uint32_t gatherResources(Shaders shaders, VkDescriptorType (&resourceTypes)[32]) 273 | { 274 | uint32_t resourceMask = 0; 275 | 276 | for (const Shader* shader : shaders) 277 | { 278 | for (uint32_t i = 0; i < 32; ++i) 279 | { 280 | if (shader->resourceMask & (1 << i)) 281 | { 282 | if (resourceMask & (1 << i)) 283 | { 284 | assert(resourceTypes[i] == shader->resourceTypes[i]); 285 | } 286 | else 287 | { 288 | resourceTypes[i] = shader->resourceTypes[i]; 289 | resourceMask |= 1 << i; 290 | } 291 | } 292 | } 293 | } 294 | 295 | return resourceMask; 296 | } 297 | 298 | static VkDescriptorSetLayout createSetLayout(VkDevice device, Shaders shaders) 299 | { 300 | std::vector<VkDescriptorSetLayoutBinding> setBindings; 301 | 302 | VkDescriptorType resourceTypes[32] = {}; 303 | uint32_t resourceMask = gatherResources(shaders, resourceTypes); 304 | 305 | for (uint32_t i = 0; i < 32; ++i) 306 | if (resourceMask & (1 << i)) 307 | { 308 | VkDescriptorSetLayoutBinding binding = {}; 309 | binding.binding = i; 310 | binding.descriptorType = resourceTypes[i]; 311 | binding.descriptorCount = 1; 312 | 313 | binding.stageFlags = 0; 314 | for (const Shader* shader : shaders) 315 | if (shader->resourceMask & (1 << i)) 316 | 
binding.stageFlags |= shader->stage; 317 | 318 | setBindings.push_back(binding); 319 | } 320 | 321 | VkDescriptorSetLayoutCreateInfo setCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; 322 | setCreateInfo.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT; 323 | setCreateInfo.bindingCount = uint32_t(setBindings.size()); 324 | setCreateInfo.pBindings = setBindings.data(); 325 | 326 | VkDescriptorSetLayout setLayout = 0; 327 | VK_CHECK(vkCreateDescriptorSetLayout(device, &setCreateInfo, 0, &setLayout)); 328 | 329 | return setLayout; 330 | } 331 | 332 | static VkPipelineLayout createPipelineLayout(VkDevice device, VkDescriptorSetLayout setLayout, VkDescriptorSetLayout arrayLayout, VkShaderStageFlags pushConstantStages, size_t pushConstantSize) 333 | { 334 | VkDescriptorSetLayout layouts[2] = { setLayout, arrayLayout }; 335 | 336 | VkPipelineLayoutCreateInfo createInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; 337 | createInfo.setLayoutCount = arrayLayout ? 2 : 1; 338 | createInfo.pSetLayouts = layouts; 339 | 340 | VkPushConstantRange pushConstantRange = {}; 341 | 342 | if (pushConstantSize) 343 | { 344 | pushConstantRange.stageFlags = pushConstantStages; 345 | pushConstantRange.size = uint32_t(pushConstantSize); 346 | 347 | createInfo.pushConstantRangeCount = 1; 348 | createInfo.pPushConstantRanges = &pushConstantRange; 349 | } 350 | 351 | VkPipelineLayout layout = 0; 352 | VK_CHECK(vkCreatePipelineLayout(device, &createInfo, 0, &layout)); 353 | 354 | return layout; 355 | } 356 | 357 | static VkDescriptorUpdateTemplate createUpdateTemplate(VkDevice device, VkPipelineBindPoint bindPoint, VkPipelineLayout layout, Shaders shaders, uint32_t* pushDescriptorCount) 358 | { 359 | std::vector<VkDescriptorUpdateTemplateEntry> entries; 360 | 361 | VkDescriptorType resourceTypes[32] = {}; 362 | uint32_t resourceMask = gatherResources(shaders, resourceTypes); 363 | 364 | for (uint32_t i = 0; i < 32; ++i) 365 | if (resourceMask & (1 << i)) 366 | { 367 | VkDescriptorUpdateTemplateEntry entry = {}; 368 | entry.dstBinding = i; 369 | entry.dstArrayElement = 0; 370 | entry.descriptorCount = 1; 371 | entry.descriptorType = resourceTypes[i]; 372 | entry.offset = sizeof(DescriptorInfo) * i; 373 | entry.stride = sizeof(DescriptorInfo); 374 | 375 | entries.push_back(entry); 376 | } 377 | 378 | *pushDescriptorCount = uint32_t(entries.size()); 379 | 380 | VkDescriptorUpdateTemplateCreateInfo createInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO }; 381 | 382 | createInfo.descriptorUpdateEntryCount = uint32_t(entries.size()); 383 | createInfo.pDescriptorUpdateEntries = entries.data(); 384 | 385 | createInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS; 386 | createInfo.pipelineBindPoint = bindPoint; 387 | createInfo.pipelineLayout = layout; 388 | 389 | VkDescriptorUpdateTemplate updateTemplate = 0; 390 | VK_CHECK(vkCreateDescriptorUpdateTemplate(device, &createInfo, 0, &updateTemplate)); 391 | 392 | return updateTemplate; 393 | } 394 | 395 | bool loadShader(Shader& shader, const char* path) 396 | { 397 | FILE* file = fopen(path, "rb"); 398 | if (!file) 399 | return false; 400 | 401 | fseek(file, 0, SEEK_END); 402 | long length = ftell(file); 403 | assert(length >= 0); 404 | fseek(file, 0, SEEK_SET); 405 | 406 | std::vector<char> spirv(length); 407 | 408 | size_t rc = fread(spirv.data(), 1, length, file); 409 | assert(rc == size_t(length)); 410 | fclose(file); 411 | 412 | assert(length % 4 == 0); 413 | parseShader(shader, reinterpret_cast<uint32_t*>(spirv.data()), length / 4); 414 | 415 | 
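// The raw SPIR-V blob is kept on the Shader: pipeline creation below chains a
// VkShaderModuleCreateInfo directly into each stage's pNext (see createGraphicsPipeline /
// createComputePipeline), so no standalone VkShaderModule object is ever created;
// this inline form is possible because the device enables maintenance5.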
shader.spirv = spirv; 416 | 417 | return true; 418 | } 419 | 420 | bool loadShader(Shader& shader, const char* base, const char* path) 421 | { 422 | std::string spath = base; 423 | std::string::size_type pos = spath.find_last_of("/\\"); 424 | if (pos == std::string::npos) 425 | spath = ""; 426 | else 427 | spath = spath.substr(0, pos + 1); 428 | spath += path; 429 | 430 | return loadShader(shader, spath.c_str()); 431 | } 432 | 433 | bool loadShaders(ShaderSet& shaders, const char* base, const char* path) 434 | { 435 | std::string spath = base; 436 | std::string::size_type pos = spath.find_last_of("/\\"); 437 | if (pos == std::string::npos) 438 | spath = ""; 439 | else 440 | spath = spath.substr(0, pos + 1); 441 | spath += path; 442 | 443 | #ifdef _WIN32 444 | _finddata_t finddata; 445 | intptr_t fh = _findfirst((spath + "/*.spv").c_str(), &finddata); 446 | if (fh == -1) 447 | return false; 448 | 449 | do 450 | { 451 | const char* ext = strrchr(finddata.name, '.'); 452 | if (!ext) 453 | continue; 454 | 455 | std::string fpath = spath + '/' + finddata.name; 456 | Shader shader = {}; 457 | if (!loadShader(shader, fpath.c_str())) 458 | { 459 | fprintf(stderr, "Warning: %s is not a valid SPIRV module\n", finddata.name); 460 | continue; 461 | } 462 | 463 | shader.name = std::string(finddata.name, ext - finddata.name); 464 | shaders.shaders.push_back(shader); 465 | } while (_findnext(fh, &finddata) == 0); 466 | 467 | _findclose(fh); 468 | #else 469 | DIR* dir = opendir(spath.c_str()); 470 | if (!dir) 471 | return false; 472 | 473 | while (dirent* de = readdir(dir)) 474 | { 475 | const char* ext = strstr(de->d_name, ".spv"); 476 | if (!ext || strcmp(ext, ".spv") != 0) 477 | continue; 478 | 479 | std::string fpath = spath + '/' + de->d_name; 480 | Shader shader = {}; 481 | if (!loadShader(shader, fpath.c_str())) 482 | { 483 | fprintf(stderr, "Warning: %s is not a valid SPIRV module\n", de->d_name); 484 | continue; 485 | } 486 | 487 | shader.name = std::string(de->d_name, ext - de->d_name); 488 | shaders.shaders.push_back(shader); 489 | } 490 | 491 | closedir(dir); 492 | #endif 493 | 494 | printf("Loaded %d shaders from %s\n", int(shaders.shaders.size()), spath.c_str()); 495 | return true; 496 | } 497 | 498 | const Shader& ShaderSet::operator[](const char* name) const 499 | { 500 | for (const Shader& shader : shaders) 501 | if (shader.name == name) 502 | return shader; 503 | 504 | fprintf(stderr, "Error: shader %s could not be loaded\n", name); 505 | abort(); 506 | } 507 | 508 | static VkSpecializationInfo fillSpecializationInfo(std::vector<VkSpecializationMapEntry>& entries, const Constants& constants) 509 | { 510 | for (size_t i = 0; i < constants.size(); ++i) 511 | entries.push_back({ uint32_t(i), uint32_t(i * 4), 4 }); 512 | 513 | VkSpecializationInfo result = {}; 514 | result.mapEntryCount = uint32_t(entries.size()); 515 | result.pMapEntries = entries.data(); 516 | result.dataSize = constants.size() * sizeof(int); 517 | result.pData = constants.begin(); 518 | 519 | return result; 520 | } 521 | 522 | VkPipeline createGraphicsPipeline(VkDevice device, VkPipelineCache pipelineCache, const VkPipelineRenderingCreateInfo& renderingInfo, const Program& program, Constants constants) 523 | { 524 | std::vector<VkSpecializationMapEntry> specializationEntries; 525 | VkSpecializationInfo specializationInfo = fillSpecializationInfo(specializationEntries, constants); 526 | 527 | VkGraphicsPipelineCreateInfo createInfo = { VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO }; 528 | 529 | std::vector<VkPipelineShaderStageCreateInfo> stages(program.shaderCount); 530 | std::vector<VkShaderModuleCreateInfo> 
modules(program.shaderCount); 531 | for (size_t i = 0; i < program.shaderCount; ++i) 532 | { 533 | const Shader* shader = program.shaders[i]; 534 | 535 | VkShaderModuleCreateInfo& module = modules[i]; 536 | module.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; 537 | module.codeSize = shader->spirv.size(); // note: this needs to be a number of bytes! 538 | module.pCode = reinterpret_cast<const uint32_t*>(shader->spirv.data()); 539 | 540 | VkPipelineShaderStageCreateInfo& stage = stages[i]; 541 | stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; 542 | stage.stage = shader->stage; 543 | stage.pName = "main"; 544 | stage.pSpecializationInfo = &specializationInfo; 545 | stage.pNext = &module; 546 | } 547 | 548 | createInfo.stageCount = uint32_t(stages.size()); 549 | createInfo.pStages = stages.data(); 550 | 551 | VkPipelineVertexInputStateCreateInfo vertexInput = { VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO }; 552 | createInfo.pVertexInputState = &vertexInput; 553 | 554 | VkPipelineInputAssemblyStateCreateInfo inputAssembly = { VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO }; 555 | inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; 556 | createInfo.pInputAssemblyState = &inputAssembly; 557 | 558 | VkPipelineViewportStateCreateInfo viewportState = { VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO }; 559 | viewportState.viewportCount = 1; 560 | viewportState.scissorCount = 1; 561 | createInfo.pViewportState = &viewportState; 562 | 563 | VkPipelineRasterizationStateCreateInfo rasterizationState = { VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO }; 564 | rasterizationState.lineWidth = 1.f; 565 | rasterizationState.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; 566 | rasterizationState.cullMode = VK_CULL_MODE_BACK_BIT; 567 | rasterizationState.depthBiasEnable = true; 568 | createInfo.pRasterizationState = &rasterizationState; 569 | 570 | VkPipelineMultisampleStateCreateInfo multisampleState = { VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO }; 571 | multisampleState.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; 572 | createInfo.pMultisampleState = &multisampleState; 573 | 574 | VkPipelineDepthStencilStateCreateInfo depthStencilState = { VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO }; 575 | depthStencilState.depthTestEnable = true; 576 | depthStencilState.depthWriteEnable = true; 577 | depthStencilState.depthCompareOp = VK_COMPARE_OP_GREATER; 578 | createInfo.pDepthStencilState = &depthStencilState; 579 | 580 | VkPipelineColorBlendAttachmentState colorAttachmentStates[8] = {}; 581 | assert(renderingInfo.colorAttachmentCount <= COUNTOF(colorAttachmentStates)); 582 | for (uint32_t i = 0; i < renderingInfo.colorAttachmentCount; ++i) 583 | colorAttachmentStates[i].colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; 584 | 585 | VkPipelineColorBlendStateCreateInfo colorBlendState = { VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO }; 586 | colorBlendState.attachmentCount = renderingInfo.colorAttachmentCount; 587 | colorBlendState.pAttachments = colorAttachmentStates; 588 | createInfo.pColorBlendState = &colorBlendState; 589 | 590 | VkDynamicState dynamicStates[] = { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, VK_DYNAMIC_STATE_CULL_MODE, VK_DYNAMIC_STATE_DEPTH_BIAS }; 591 | 592 | VkPipelineDynamicStateCreateInfo dynamicState = { VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO }; 593 | dynamicState.dynamicStateCount = 
sizeof(dynamicStates) / sizeof(dynamicStates[0]); 594 | dynamicState.pDynamicStates = dynamicStates; 595 | createInfo.pDynamicState = &dynamicState; 596 | 597 | createInfo.layout = program.layout; 598 | createInfo.pNext = &renderingInfo; 599 | 600 | VkPipeline pipeline = 0; 601 | VK_CHECK(vkCreateGraphicsPipelines(device, pipelineCache, 1, &createInfo, 0, &pipeline)); 602 | 603 | if (vkSetDebugUtilsObjectNameEXT) 604 | { 605 | std::string name; 606 | 607 | for (size_t i = 0; i < program.shaderCount; ++i) 608 | { 609 | const Shader* shader = program.shaders[i]; 610 | 611 | name += shader->name; 612 | if (i + 1 < program.shaderCount) 613 | name += " / "; 614 | } 615 | 616 | VkDebugUtilsObjectNameInfoEXT nameInfo = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT }; 617 | nameInfo.objectType = VK_OBJECT_TYPE_PIPELINE; 618 | nameInfo.objectHandle = uint64_t(pipeline); 619 | nameInfo.pObjectName = name.c_str(); 620 | vkSetDebugUtilsObjectNameEXT(device, &nameInfo); 621 | } 622 | 623 | return pipeline; 624 | } 625 | 626 | VkPipeline createComputePipeline(VkDevice device, VkPipelineCache pipelineCache, const Program& program, Constants constants) 627 | { 628 | assert(program.shaderCount == 1); 629 | const Shader& shader = *program.shaders[0]; 630 | 631 | assert(shader.stage == VK_SHADER_STAGE_COMPUTE_BIT); 632 | 633 | VkComputePipelineCreateInfo createInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; 634 | 635 | std::vector<VkSpecializationMapEntry> specializationEntries; 636 | VkSpecializationInfo specializationInfo = fillSpecializationInfo(specializationEntries, constants); 637 | 638 | VkShaderModuleCreateInfo module = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO }; 639 | module.codeSize = shader.spirv.size(); // note: this needs to be a number of bytes! 640 | module.pCode = reinterpret_cast<const uint32_t*>(shader.spirv.data()); 641 | 642 | VkPipelineShaderStageCreateInfo stage = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; 643 | stage.stage = shader.stage; 644 | stage.pName = "main"; 645 | stage.pSpecializationInfo = &specializationInfo; 646 | stage.pNext = &module; 647 | 648 | createInfo.stage = stage; 649 | createInfo.layout = program.layout; 650 | 651 | VkPipeline pipeline = 0; 652 | VK_CHECK(vkCreateComputePipelines(device, pipelineCache, 1, &createInfo, 0, &pipeline)); 653 | 654 | if (vkSetDebugUtilsObjectNameEXT) 655 | { 656 | VkDebugUtilsObjectNameInfoEXT nameInfo = { VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT }; 657 | nameInfo.objectType = VK_OBJECT_TYPE_PIPELINE; 658 | nameInfo.objectHandle = uint64_t(pipeline); 659 | nameInfo.pObjectName = shader.name.c_str(); 660 | vkSetDebugUtilsObjectNameEXT(device, &nameInfo); 661 | } 662 | 663 | return pipeline; 664 | } 665 | 666 | Program createProgram(VkDevice device, VkPipelineBindPoint bindPoint, Shaders shaders, size_t pushConstantSize, VkDescriptorSetLayout arrayLayout) 667 | { 668 | VkShaderStageFlags pushConstantStages = 0; 669 | for (const Shader* shader : shaders) 670 | if (shader->usesPushConstants) 671 | pushConstantStages |= shader->stage; 672 | 673 | bool usesDescriptorArray = false; 674 | for (const Shader* shader : shaders) 675 | usesDescriptorArray |= shader->usesDescriptorArray; 676 | 677 | assert(!usesDescriptorArray || arrayLayout); 678 | 679 | Program program = {}; 680 | 681 | program.bindPoint = bindPoint; 682 | 683 | program.setLayout = createSetLayout(device, shaders); 684 | assert(program.setLayout); 685 | 686 | program.layout = createPipelineLayout(device, program.setLayout, arrayLayout, pushConstantStages, pushConstantSize); 
687 | assert(program.layout); 688 | 689 | program.updateTemplate = createUpdateTemplate(device, bindPoint, program.layout, shaders, &program.pushDescriptorCount); 690 | assert(program.updateTemplate); 691 | 692 | program.pushConstantStages = pushConstantStages; 693 | program.pushConstantSize = uint32_t(pushConstantSize); 694 | 695 | const Shader* shader = shaders.size() == 1 ? *shaders.begin() : nullptr; 696 | 697 | if (shader && shader->stage == VK_SHADER_STAGE_COMPUTE_BIT) 698 | { 699 | program.localSizeX = shader->localSizeX; 700 | program.localSizeY = shader->localSizeY; 701 | program.localSizeZ = shader->localSizeZ; 702 | } 703 | 704 | memset(program.shaders, 0, sizeof(program.shaders)); 705 | program.shaderCount = 0; 706 | 707 | for (const Shader* shader : shaders) 708 | program.shaders[program.shaderCount++] = shader; 709 | 710 | return program; 711 | } 712 | 713 | void destroyProgram(VkDevice device, const Program& program) 714 | { 715 | vkDestroyDescriptorUpdateTemplate(device, program.updateTemplate, 0); 716 | vkDestroyPipelineLayout(device, program.layout, 0); 717 | vkDestroyDescriptorSetLayout(device, program.setLayout, 0); 718 | } 719 | 720 | VkDescriptorSetLayout createDescriptorArrayLayout(VkDevice device) 721 | { 722 | VkShaderStageFlags stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT; 723 | VkDescriptorSetLayoutBinding setBinding = { 0, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, DESCRIPTOR_LIMIT, stageFlags, nullptr }; 724 | 725 | VkDescriptorBindingFlags bindingFlags = VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT | VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT | VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT; 726 | VkDescriptorSetLayoutBindingFlagsCreateInfo setBindingFlags = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO }; 727 | setBindingFlags.bindingCount = 1; 728 | setBindingFlags.pBindingFlags = &bindingFlags; 729 | 730 | VkDescriptorSetLayoutCreateInfo setCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; 731 | setCreateInfo.pNext = &setBindingFlags; 732 | setCreateInfo.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT; 733 | setCreateInfo.bindingCount = 1; 734 | setCreateInfo.pBindings = &setBinding; 735 | 736 | VkDescriptorSetLayout setLayout = 0; 737 | VK_CHECK(vkCreateDescriptorSetLayout(device, &setCreateInfo, 0, &setLayout)); 738 | 739 | return setLayout; 740 | } 741 | 742 | std::pair<VkDescriptorPool, VkDescriptorSet> createDescriptorArray(VkDevice device, VkDescriptorSetLayout layout, uint32_t descriptorCount) 743 | { 744 | VkDescriptorPoolSize poolSize = { VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, descriptorCount }; 745 | VkDescriptorPoolCreateInfo poolInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; 746 | poolInfo.flags = VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT; 747 | poolInfo.maxSets = 1; 748 | poolInfo.poolSizeCount = 1; 749 | poolInfo.pPoolSizes = &poolSize; 750 | 751 | VkDescriptorPool pool = nullptr; 752 | VK_CHECK(vkCreateDescriptorPool(device, &poolInfo, 0, &pool)); 753 | 754 | VkDescriptorSetVariableDescriptorCountAllocateInfo setAllocateCountInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO }; 755 | setAllocateCountInfo.descriptorSetCount = 1; 756 | setAllocateCountInfo.pDescriptorCounts = &descriptorCount; 757 | 758 | VkDescriptorSetAllocateInfo setAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO }; 759 | setAllocateInfo.pNext = &setAllocateCountInfo; 760 | setAllocateInfo.descriptorPool = pool; 761 | setAllocateInfo.descriptorSetCount = 1; 762 | 
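// Because the layout binding was created with VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT
// (see createDescriptorArrayLayout above), the actual size of the sampled-image array is chosen
// here at allocation time via VkDescriptorSetVariableDescriptorCountAllocateInfo rather than
// being fixed by the layout.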
setAllocateInfo.pSetLayouts = &layout; 763 | 764 | VkDescriptorSet set = 0; 765 | VK_CHECK(vkAllocateDescriptorSets(device, &setAllocateInfo, &set)); 766 | 767 | return std::make_pair(pool, set); 768 | } 769 | -------------------------------------------------------------------------------- /src/scene.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "scene.h" 3 | 4 | #include "config.h" 5 | 6 | #include <algorithm> 7 | #include <memory> 8 | #include <string.h> 9 | 10 | #include <meshoptimizer.h> 11 | 12 | #include <fast_obj.h> 13 | #include <cgltf.h> 14 | #include <time.h> 15 | 16 | static void appendMeshlet(Geometry& result, const meshopt_Meshlet& meshlet, const std::vector<vec3>& vertices, const std::vector<unsigned int>& meshlet_vertices, const std::vector<unsigned char>& meshlet_triangles, uint32_t baseVertex, bool lod0) 17 | { 18 | size_t dataOffset = result.meshletdata.size(); 19 | 20 | unsigned int minVertex = ~0u, maxVertex = 0; 21 | for (unsigned int i = 0; i < meshlet.vertex_count; ++i) 22 | { 23 | minVertex = std::min(meshlet_vertices[meshlet.vertex_offset + i], minVertex); 24 | maxVertex = std::max(meshlet_vertices[meshlet.vertex_offset + i], maxVertex); 25 | } 26 | 27 | bool shortRefs = maxVertex - minVertex < (1 << 16); 28 | 29 | for (unsigned int i = 0; i < meshlet.vertex_count; ++i) 30 | { 31 | unsigned int ref = meshlet_vertices[meshlet.vertex_offset + i] - minVertex; 32 | if (shortRefs && i % 2) 33 | result.meshletdata.back() |= ref << 16; 34 | else 35 | result.meshletdata.push_back(ref); 36 | } 37 | 38 | const unsigned int* indexGroups = reinterpret_cast<const unsigned int*>(&meshlet_triangles[0] + meshlet.triangle_offset); 39 | unsigned int indexGroupCount = (meshlet.triangle_count * 3 + 3) / 4; 40 | 41 | for (unsigned int i = 0; i < indexGroupCount; ++i) 42 | result.meshletdata.push_back(indexGroups[i]); 43 | 44 | if (lod0) 45 | { 46 | for (unsigned int i = 0; i < meshlet.vertex_count; ++i) 47 | { 48 | unsigned int vtx = meshlet_vertices[meshlet.vertex_offset + i]; 49 | 50 | unsigned short hx = meshopt_quantizeHalf(vertices[vtx].x); 51 | unsigned short hy = meshopt_quantizeHalf(vertices[vtx].y); 52 | unsigned short hz = meshopt_quantizeHalf(vertices[vtx].z); 53 | 54 | result.meshletvtx0.push_back(hx); 55 | result.meshletvtx0.push_back(hy); 56 | result.meshletvtx0.push_back(hz); 57 | result.meshletvtx0.push_back(0); 58 | } 59 | } 60 | 61 | meshopt_Bounds bounds = meshopt_computeMeshletBounds(&meshlet_vertices[meshlet.vertex_offset], &meshlet_triangles[meshlet.triangle_offset], meshlet.triangle_count, &vertices[0].x, vertices.size(), sizeof(vec3)); 62 | 63 | Meshlet m = {}; 64 | m.dataOffset = uint32_t(dataOffset); 65 | m.baseVertex = baseVertex + minVertex; 66 | m.triangleCount = meshlet.triangle_count; 67 | m.vertexCount = meshlet.vertex_count; 68 | m.shortRefs = shortRefs; 69 | 70 | m.center = vec3(bounds.center[0], bounds.center[1], bounds.center[2]); 71 | m.radius = bounds.radius; 72 | m.cone_axis[0] = bounds.cone_axis_s8[0]; 73 | m.cone_axis[1] = bounds.cone_axis_s8[1]; 74 | m.cone_axis[2] = bounds.cone_axis_s8[2]; 75 | m.cone_cutoff = bounds.cone_cutoff_s8; 76 | 77 | result.meshlets.push_back(m); 78 | } 79 | 80 | static size_t appendMeshlets(Geometry& result, const std::vector<vec3>& vertices, std::vector<uint32_t>& indices, uint32_t baseVertex, bool lod0, bool fast, bool clrt) 81 | { 82 | const size_t max_vertices = MESH_MAXVTX; 83 | const size_t min_triangles = MESH_MAXTRI / 4; 84 | const size_t max_triangles = MESH_MAXTRI; 85 | const float cone_weight = MESHLET_CONE_WEIGHT; 86 | const float fill_weight = MESHLET_FILL_WEIGHT; 87 | 88 |
std::vector<meshopt_Meshlet> meshlets(meshopt_buildMeshletsBound(indices.size(), max_vertices, min_triangles)); // min_triangles yields a conservative (larger) bound since builds below may emit meshlets smaller than max_triangles 89 | std::vector<unsigned int> meshlet_vertices(meshlets.size() * max_vertices); 90 | std::vector<unsigned char> meshlet_triangles(meshlets.size() * max_triangles * 3); 91 | 92 | if (fast) 93 | meshlets.resize(meshopt_buildMeshletsScan(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(), indices.data(), indices.size(), vertices.size(), max_vertices, max_triangles)); 94 | else if (clrt && lod0) // only use spatial algo for lod0 as this is the only lod that is used for raytracing 95 | meshlets.resize(meshopt_buildMeshletsSpatial(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(), indices.data(), indices.size(), &vertices[0].x, vertices.size(), sizeof(vec3), max_vertices, min_triangles, max_triangles, fill_weight)); 96 | else 97 | meshlets.resize(meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(), indices.data(), indices.size(), &vertices[0].x, vertices.size(), sizeof(vec3), max_vertices, max_triangles, cone_weight)); 98 | 99 | for (auto& meshlet : meshlets) 100 | { 101 | meshopt_optimizeMeshlet(&meshlet_vertices[meshlet.vertex_offset], &meshlet_triangles[meshlet.triangle_offset], meshlet.triangle_count, meshlet.vertex_count); 102 | 103 | appendMeshlet(result, meshlet, vertices, meshlet_vertices, meshlet_triangles, baseVertex, lod0); 104 | } 105 | 106 | return meshlets.size(); 107 | } 108 | 109 | static bool loadObj(std::vector<Vertex>& vertices, const char* path) 110 | { 111 | fastObjMesh* obj = fast_obj_read(path); 112 | if (!obj) 113 | return false; 114 | 115 | size_t index_count = 0; 116 | 117 | for (unsigned int i = 0; i < obj->face_count; ++i) 118 | index_count += 3 * (obj->face_vertices[i] - 2); 119 | 120 | vertices.resize(index_count); 121 | 122 | size_t vertex_offset = 0; 123 | size_t index_offset = 0; 124 | 125 | for (unsigned int i = 0; i < obj->face_count; ++i) 126 | { 127 | for (unsigned int j = 0; j < obj->face_vertices[i]; ++j) 128 | { 129 | fastObjIndex gi = obj->indices[index_offset + j]; 130 | 131 | // triangulate polygon on the fly; offset-3 is always the first polygon vertex 132 | if (j >= 3) 133 | { 134 | vertices[vertex_offset + 0] = vertices[vertex_offset - 3]; 135 | vertices[vertex_offset + 1] = vertices[vertex_offset - 1]; 136 | vertex_offset += 2; 137 | } 138 | 139 | Vertex& v = vertices[vertex_offset++]; 140 | 141 | v.vx = meshopt_quantizeHalf(obj->positions[gi.p * 3 + 0]); 142 | v.vy = meshopt_quantizeHalf(obj->positions[gi.p * 3 + 1]); 143 | v.vz = meshopt_quantizeHalf(obj->positions[gi.p * 3 + 2]); 144 | v.tp = 0; 145 | v.np = (meshopt_quantizeSnorm(obj->normals[gi.n * 3 + 0], 10) + 511) | 146 | (meshopt_quantizeSnorm(obj->normals[gi.n * 3 + 1], 10) + 511) << 10 | 147 | (meshopt_quantizeSnorm(obj->normals[gi.n * 3 + 2], 10) + 511) << 20; 148 | v.tu = meshopt_quantizeHalf(obj->texcoords[gi.t * 2 + 0]); 149 | v.tv = meshopt_quantizeHalf(obj->texcoords[gi.t * 2 + 1]); 150 | } 151 | 152 | index_offset += obj->face_vertices[i]; 153 | } 154 | 155 | assert(vertex_offset == index_count); 156 | 157 | fast_obj_destroy(obj); 158 | 159 | return true; 160 | } 161 | 162 | static void appendMesh(Geometry& result, std::vector<Vertex>& vertices, std::vector<uint32_t>& indices, bool buildMeshlets, bool fast, bool clrt) 163 | { 164 | std::vector<unsigned int> remap(vertices.size()); 165 | size_t uniqueVertices = meshopt_generateVertexRemap(remap.data(), indices.data(), indices.size(), vertices.data(), vertices.size(), sizeof(Vertex)); 166 | 167 | meshopt_remapVertexBuffer(vertices.data(),
vertices.data(), vertices.size(), sizeof(Vertex), remap.data()); 168 | meshopt_remapIndexBuffer(indices.data(), indices.data(), indices.size(), remap.data()); 169 | 170 | vertices.resize(uniqueVertices); 171 | 172 | if (fast) 173 | meshopt_optimizeVertexCacheFifo(indices.data(), indices.data(), indices.size(), vertices.size(), 16); 174 | else 175 | meshopt_optimizeVertexCache(indices.data(), indices.data(), indices.size(), vertices.size()); 176 | 177 | meshopt_optimizeVertexFetch(vertices.data(), indices.data(), indices.size(), vertices.data(), vertices.size(), sizeof(Vertex)); 178 | 179 | Mesh mesh = {}; 180 | 181 | mesh.vertexOffset = uint32_t(result.vertices.size()); 182 | mesh.vertexCount = uint32_t(vertices.size()); 183 | 184 | result.vertices.insert(result.vertices.end(), vertices.begin(), vertices.end()); 185 | 186 | std::vector<vec3> positions(vertices.size()); 187 | for (size_t i = 0; i < vertices.size(); ++i) 188 | { 189 | Vertex& v = vertices[i]; 190 | positions[i] = vec3(meshopt_dequantizeHalf(v.vx), meshopt_dequantizeHalf(v.vy), meshopt_dequantizeHalf(v.vz)); 191 | } 192 | 193 | std::vector<vec3> normals(vertices.size()); 194 | for (size_t i = 0; i < vertices.size(); ++i) 195 | { 196 | Vertex& v = vertices[i]; 197 | normals[i] = vec3((v.np & 1023) / 511.f - 1.f, ((v.np >> 10) & 1023) / 511.f - 1.f, ((v.np >> 20) & 1023) / 511.f - 1.f); 198 | } 199 | 200 | vec3 center = vec3(0); 201 | 202 | for (auto& v : positions) 203 | center += v; 204 | 205 | center /= float(vertices.size()); 206 | 207 | float radius = 0; 208 | 209 | for (auto& v : positions) 210 | radius = std::max(radius, distance(center, v)); 211 | 212 | mesh.center = center; 213 | mesh.radius = radius; 214 | 215 | float lodScale = meshopt_simplifyScale(&positions[0].x, vertices.size(), sizeof(vec3)); 216 | 217 | std::vector<uint32_t> lodIndices = indices; 218 | float lodError = 0.f; 219 | 220 | float normalWeights[3] = { 1.f, 1.f, 1.f }; 221 | 222 | while (mesh.lodCount < COUNTOF(mesh.lods)) 223 | { 224 | MeshLod& lod = mesh.lods[mesh.lodCount++]; 225 | 226 | lod.indexOffset = uint32_t(result.indices.size()); 227 | lod.indexCount = uint32_t(lodIndices.size()); 228 | 229 | result.indices.insert(result.indices.end(), lodIndices.begin(), lodIndices.end()); 230 | 231 | lod.meshletOffset = uint32_t(result.meshlets.size()); 232 | lod.meshletCount = buildMeshlets ?
uint32_t(appendMeshlets(result, positions, lodIndices, mesh.vertexOffset, &lod == mesh.lods, fast, clrt)) : 0; 233 | 234 | lod.error = lodError * lodScale; 235 | 236 | if (mesh.lodCount < COUNTOF(mesh.lods)) 237 | { 238 | // note: we're using the same value for all LODs; if this changes, we need to remove/change 85% exit criteria below 239 | const float maxError = 1e-1f; 240 | const unsigned int options = meshopt_SimplifySparse; 241 | 242 | size_t nextIndicesTarget = (size_t(double(lodIndices.size()) * 0.6) / 3) * 3; 243 | float nextError = 0.f; 244 | size_t nextIndices = meshopt_simplifyWithAttributes(lodIndices.data(), lodIndices.data(), lodIndices.size(), &positions[0].x, vertices.size(), sizeof(vec3), &normals[0].x, sizeof(vec3), normalWeights, 3, NULL, nextIndicesTarget, maxError, options, &nextError); 245 | assert(nextIndices <= lodIndices.size()); 246 | 247 | // we've reached the error bound 248 | if (nextIndices == lodIndices.size() || nextIndices == 0) 249 | break; 250 | 251 | // while we could keep this LOD, it's too close to the last one (and it can't go below that due to constant error bound above) 252 | if (nextIndices >= size_t(double(lodIndices.size()) * 0.85)) 253 | break; 254 | 255 | lodIndices.resize(nextIndices); 256 | lodError = std::max(lodError * 1.5f, nextError); // important! since we start from last LOD, we need to accumulate the error 257 | 258 | if (fast) 259 | meshopt_optimizeVertexCacheFifo(lodIndices.data(), lodIndices.data(), lodIndices.size(), vertices.size(), 16); 260 | else 261 | meshopt_optimizeVertexCache(lodIndices.data(), lodIndices.data(), lodIndices.size(), vertices.size()); 262 | } 263 | } 264 | 265 | result.meshes.push_back(mesh); 266 | } 267 | 268 | bool loadMesh(Geometry& geometry, const char* path, bool buildMeshlets, bool fast, bool clrt) 269 | { 270 | std::vector<Vertex> vertices; 271 | if (!loadObj(vertices, path)) 272 | return false; 273 | 274 | std::vector<uint32_t> indices(vertices.size()); 275 | for (size_t i = 0; i < indices.size(); ++i) 276 | indices[i] = uint32_t(i); 277 | 278 | appendMesh(geometry, vertices, indices, buildMeshlets, fast, clrt); 279 | return true; 280 | } 281 | 282 | static void decomposeTransform(float translation[3], float rotation[4], float scale[3], const float* transform) 283 | { 284 | float m[4][4] = {}; 285 | memcpy(m, transform, 16 * sizeof(float)); 286 | 287 | // extract translation from last row 288 | translation[0] = m[3][0]; 289 | translation[1] = m[3][1]; 290 | translation[2] = m[3][2]; 291 | 292 | // compute determinant to determine handedness 293 | float det = 294 | m[0][0] * (m[1][1] * m[2][2] - m[2][1] * m[1][2]) - 295 | m[0][1] * (m[1][0] * m[2][2] - m[1][2] * m[2][0]) + 296 | m[0][2] * (m[1][0] * m[2][1] - m[1][1] * m[2][0]); 297 | 298 | float sign = (det < 0.f) ? -1.f : 1.f; 299 | 300 | // recover scale from axis lengths 301 | scale[0] = sqrtf(m[0][0] * m[0][0] + m[0][1] * m[0][1] + m[0][2] * m[0][2]) * sign; 302 | scale[1] = sqrtf(m[1][0] * m[1][0] + m[1][1] * m[1][1] + m[1][2] * m[1][2]) * sign; 303 | scale[2] = sqrtf(m[2][0] * m[2][0] + m[2][1] * m[2][1] + m[2][2] * m[2][2]) * sign; 304 | 305 | // normalize axes to get a pure rotation matrix 306 | float rsx = (scale[0] == 0.f) ? 0.f : 1.f / scale[0]; 307 | float rsy = (scale[1] == 0.f) ? 0.f : 1.f / scale[1]; 308 | float rsz = (scale[2] == 0.f) ?
0.f : 1.f / scale[2]; 309 | 310 | float r00 = m[0][0] * rsx, r10 = m[1][0] * rsy, r20 = m[2][0] * rsz; 311 | float r01 = m[0][1] * rsx, r11 = m[1][1] * rsy, r21 = m[2][1] * rsz; 312 | float r02 = m[0][2] * rsx, r12 = m[1][2] * rsy, r22 = m[2][2] * rsz; 313 | 314 | // "branchless" version of Mike Day's matrix to quaternion conversion 315 | int qc = r22 < 0 ? (r00 > r11 ? 0 : 1) : (r00 < -r11 ? 2 : 3); 316 | float qs1 = qc & 2 ? -1.f : 1.f; 317 | float qs2 = qc & 1 ? -1.f : 1.f; 318 | float qs3 = (qc - 1) & 2 ? -1.f : 1.f; 319 | 320 | float qt = 1.f - qs3 * r00 - qs2 * r11 - qs1 * r22; 321 | float qs = 0.5f / sqrtf(qt); 322 | 323 | rotation[qc ^ 0] = qs * qt; 324 | rotation[qc ^ 1] = qs * (r01 + qs1 * r10); 325 | rotation[qc ^ 2] = qs * (r20 + qs2 * r02); 326 | rotation[qc ^ 3] = qs * (r12 + qs3 * r21); 327 | } 328 | 329 | static void loadVertices(std::vector<Vertex>& vertices, const cgltf_primitive& prim) 330 | { 331 | size_t vertexCount = vertices.size(); 332 | std::vector<float> scratch(vertexCount * 4); 333 | 334 | if (const cgltf_accessor* pos = cgltf_find_accessor(&prim, cgltf_attribute_type_position, 0)) 335 | { 336 | assert(cgltf_num_components(pos->type) == 3); 337 | cgltf_accessor_unpack_floats(pos, scratch.data(), vertexCount * 3); 338 | 339 | for (size_t j = 0; j < vertexCount; ++j) 340 | { 341 | vertices[j].vx = meshopt_quantizeHalf(scratch[j * 3 + 0]); 342 | vertices[j].vy = meshopt_quantizeHalf(scratch[j * 3 + 1]); 343 | vertices[j].vz = meshopt_quantizeHalf(scratch[j * 3 + 2]); 344 | } 345 | } 346 | 347 | if (const cgltf_accessor* nrm = cgltf_find_accessor(&prim, cgltf_attribute_type_normal, 0)) 348 | { 349 | assert(cgltf_num_components(nrm->type) == 3); 350 | cgltf_accessor_unpack_floats(nrm, scratch.data(), vertexCount * 3); 351 | 352 | for (size_t j = 0; j < vertexCount; ++j) 353 | { 354 | float nx = scratch[j * 3 + 0], ny = scratch[j * 3 + 1], nz = scratch[j * 3 + 2]; 355 | 356 | vertices[j].np = (meshopt_quantizeSnorm(nx, 10) + 511) | 357 | (meshopt_quantizeSnorm(ny, 10) + 511) << 10 | 358 | (meshopt_quantizeSnorm(nz, 10) + 511) << 20; 359 | } 360 | } 361 | 362 | if (const cgltf_accessor* tan = cgltf_find_accessor(&prim, cgltf_attribute_type_tangent, 0)) 363 | { 364 | assert(cgltf_num_components(tan->type) == 4); 365 | cgltf_accessor_unpack_floats(tan, scratch.data(), vertexCount * 4); 366 | 367 | for (size_t j = 0; j < vertexCount; ++j) 368 | { 369 | float tx = scratch[j * 4 + 0], ty = scratch[j * 4 + 1], tz = scratch[j * 4 + 2]; 370 | float tsum = fabsf(tx) + fabsf(ty) + fabsf(tz); 371 | float tu = tz >= 0 ? tx / tsum : (1 - fabsf(ty / tsum)) * (tx >= 0 ? 1 : -1); 372 | float tv = tz >= 0 ? ty / tsum : (1 - fabsf(tx / tsum)) * (ty >= 0 ? 1 : -1); 373 | 374 | vertices[j].tp = (meshopt_quantizeSnorm(tu, 8) + 127) | (meshopt_quantizeSnorm(tv, 8) + 127) << 8; 375 | vertices[j].np |= (scratch[j * 4 + 3] >= 0 ?
0 : 1) << 30; 376 | } 377 | } 378 | 379 | if (const cgltf_accessor* tex = cgltf_find_accessor(&prim, cgltf_attribute_type_texcoord, 0)) 380 | { 381 | assert(cgltf_num_components(tex->type) == 2); 382 | cgltf_accessor_unpack_floats(tex, scratch.data(), vertexCount * 2); 383 | 384 | for (size_t j = 0; j < vertexCount; ++j) 385 | { 386 | vertices[j].tu = meshopt_quantizeHalf(scratch[j * 2 + 0]); 387 | vertices[j].tv = meshopt_quantizeHalf(scratch[j * 2 + 1]); 388 | } 389 | } 390 | } 391 | 392 | bool loadScene(Geometry& geometry, std::vector<Material>& materials, std::vector<MeshDraw>& draws, std::vector<std::string>& texturePaths, std::vector<Animation>& animations, Camera& camera, vec3& sunDirection, const char* path, bool buildMeshlets, bool fast, bool clrt) 393 | { 394 | clock_t timer = clock(); 395 | 396 | cgltf_options options = {}; 397 | cgltf_data* data = NULL; 398 | cgltf_result res = cgltf_parse_file(&options, path, &data); 399 | if (res != cgltf_result_success) 400 | return false; 401 | 402 | std::unique_ptr<cgltf_data, void (*)(cgltf_data*)> dataPtr(data, &cgltf_free); 403 | 404 | res = cgltf_load_buffers(&options, data, path); 405 | if (res != cgltf_result_success) 406 | return false; 407 | 408 | res = cgltf_validate(data); 409 | if (res != cgltf_result_success) 410 | return false; 411 | 412 | std::vector<std::pair<unsigned int, unsigned int>> primitives; 413 | std::vector<cgltf_material*> primitiveMaterials; 414 | 415 | size_t firstMeshOffset = geometry.meshes.size(); 416 | 417 | for (size_t i = 0; i < data->meshes_count; ++i) 418 | { 419 | const cgltf_mesh& mesh = data->meshes[i]; 420 | 421 | size_t meshOffset = geometry.meshes.size(); 422 | 423 | for (size_t pi = 0; pi < mesh.primitives_count; ++pi) 424 | { 425 | const cgltf_primitive& prim = mesh.primitives[pi]; 426 | if (prim.type != cgltf_primitive_type_triangles || !prim.indices) 427 | continue; 428 | 429 | std::vector<Vertex> vertices(prim.attributes[0].data->count); 430 | loadVertices(vertices, prim); 431 | 432 | std::vector<uint32_t> indices(prim.indices->count); 433 | cgltf_accessor_unpack_indices(prim.indices, indices.data(), 4, indices.size()); 434 | 435 | appendMesh(geometry, vertices, indices, buildMeshlets, fast, clrt); 436 | primitiveMaterials.push_back(prim.material); 437 | } 438 | 439 | primitives.push_back(std::make_pair(unsigned(meshOffset), unsigned(geometry.meshes.size() - meshOffset))); 440 | } 441 | 442 | assert(primitiveMaterials.size() + firstMeshOffset == geometry.meshes.size()); 443 | 444 | std::vector<int> nodeDraws(data->nodes_count, -1); // for animations 445 | 446 | size_t materialOffset = materials.size(); 447 | assert(materialOffset > 0); // index 0 = dummy material 448 | 449 | for (size_t i = 0; i < data->nodes_count; ++i) 450 | { 451 | const cgltf_node* node = &data->nodes[i]; 452 | 453 | if (node->mesh) 454 | { 455 | float matrix[16]; 456 | cgltf_node_transform_world(node, matrix); 457 | 458 | float translation[3]; 459 | float rotation[4]; 460 | float scale[3]; 461 | decomposeTransform(translation, rotation, scale, matrix); 462 | 463 | // TODO: better warnings for non-uniform or negative scale 464 | 465 | std::pair<unsigned int, unsigned int> range = primitives[cgltf_mesh_index(data, node->mesh)]; 466 | 467 | for (unsigned int j = 0; j < range.second; ++j) 468 | { 469 | MeshDraw draw = {}; 470 | draw.position = vec3(translation[0], translation[1], translation[2]); 471 | draw.scale = std::max(scale[0], std::max(scale[1], scale[2])); 472 | draw.orientation = quat(rotation[0], rotation[1], rotation[2], rotation[3]); 473 | draw.meshIndex = range.first + j; 474 | 475 | cgltf_material* material = primitiveMaterials[range.first + j - firstMeshOffset]; 476 | 477 |
draw.materialIndex = material ? materialOffset + int(cgltf_material_index(data, material)) : 0; 478 | 479 | if (material && material->alpha_mode != cgltf_alpha_mode_opaque) 480 | draw.postPass = 1; 481 | 482 | if (material && material->has_transmission) 483 | draw.postPass = 2; 484 | 485 | nodeDraws[i] = int(draws.size()); 486 | 487 | draws.push_back(draw); 488 | } 489 | } 490 | 491 | if (node->camera) 492 | { 493 | float matrix[16]; 494 | cgltf_node_transform_world(node, matrix); 495 | 496 | float translation[3]; 497 | float rotation[4]; 498 | float scale[3]; 499 | decomposeTransform(translation, rotation, scale, matrix); 500 | 501 | assert(node->camera->type == cgltf_camera_type_perspective); 502 | 503 | camera.position = vec3(translation[0], translation[1], translation[2]); 504 | camera.orientation = quat(rotation[0], rotation[1], rotation[2], rotation[3]); 505 | camera.fovY = node->camera->data.perspective.yfov; 506 | } 507 | 508 | if (node->light && node->light->type == cgltf_light_type_directional) 509 | { 510 | float matrix[16]; 511 | cgltf_node_transform_world(node, matrix); 512 | 513 | sunDirection = vec3(matrix[8], matrix[9], matrix[10]); 514 | } 515 | } 516 | 517 | int textureOffset = 1 + int(texturePaths.size()); 518 | 519 | for (size_t i = 0; i < data->materials_count; ++i) 520 | { 521 | cgltf_material* material = &data->materials[i]; 522 | Material mat = {}; 523 | 524 | mat.diffuseFactor = vec4(1); 525 | 526 | if (material->has_pbr_specular_glossiness) 527 | { 528 | if (material->pbr_specular_glossiness.diffuse_texture.texture) 529 | mat.albedoTexture = textureOffset + int(cgltf_texture_index(data, material->pbr_specular_glossiness.diffuse_texture.texture)); 530 | 531 | mat.diffuseFactor = vec4(material->pbr_specular_glossiness.diffuse_factor[0], material->pbr_specular_glossiness.diffuse_factor[1], material->pbr_specular_glossiness.diffuse_factor[2], material->pbr_specular_glossiness.diffuse_factor[3]); 532 | 533 | if (material->pbr_specular_glossiness.specular_glossiness_texture.texture) 534 | mat.specularTexture = textureOffset + int(cgltf_texture_index(data, material->pbr_specular_glossiness.specular_glossiness_texture.texture)); 535 | 536 | mat.specularFactor = vec4(material->pbr_specular_glossiness.specular_factor[0], material->pbr_specular_glossiness.specular_factor[1], material->pbr_specular_glossiness.specular_factor[2], material->pbr_specular_glossiness.glossiness_factor); 537 | } 538 | else if (material->has_pbr_metallic_roughness) 539 | { 540 | if (material->pbr_metallic_roughness.base_color_texture.texture) 541 | mat.albedoTexture = textureOffset + int(cgltf_texture_index(data, material->pbr_metallic_roughness.base_color_texture.texture)); 542 | 543 | mat.diffuseFactor = vec4(material->pbr_metallic_roughness.base_color_factor[0], material->pbr_metallic_roughness.base_color_factor[1], material->pbr_metallic_roughness.base_color_factor[2], material->pbr_metallic_roughness.base_color_factor[3]); 544 | 545 | if (material->pbr_metallic_roughness.metallic_roughness_texture.texture) 546 | mat.specularTexture = textureOffset + int(cgltf_texture_index(data, material->pbr_metallic_roughness.metallic_roughness_texture.texture)); 547 | 548 | mat.specularFactor = vec4(1, 1, 1, 1 - material->pbr_metallic_roughness.roughness_factor); 549 | } 550 | 551 | if (material->normal_texture.texture) 552 | mat.normalTexture = textureOffset + int(cgltf_texture_index(data, material->normal_texture.texture)); 553 | 554 | if (material->emissive_texture.texture) 555 | mat.emissiveTexture = 
textureOffset + int(cgltf_texture_index(data, material->emissive_texture.texture)); 556 | 557 | mat.emissiveFactor = vec3(material->emissive_factor[0], material->emissive_factor[1], material->emissive_factor[2]); 558 | 559 | materials.push_back(mat); 560 | } 561 | 562 | for (size_t i = 0; i < data->textures_count; ++i) 563 | { 564 | cgltf_texture* texture = &data->textures[i]; 565 | assert(texture->image); 566 | 567 | cgltf_image* image = texture->image; 568 | assert(image->uri); 569 | 570 | std::string ipath = path; 571 | std::string::size_type pos = ipath.find_last_of("/\\"); 572 | if (pos == std::string::npos) 573 | ipath = ""; 574 | else 575 | ipath = ipath.substr(0, pos + 1); 576 | 577 | std::string uri = image->uri; 578 | uri.resize(cgltf_decode_uri(&uri[0])); 579 | 580 | std::string::size_type dot = uri.find_last_of('.'); 581 | if (dot != std::string::npos) 582 | uri.replace(dot, uri.size() - dot, ".dds"); 583 | 584 | texturePaths.push_back(ipath + uri); 585 | } 586 | 587 | std::vector<cgltf_animation_sampler*> samplersT(data->nodes_count); 588 | std::vector<cgltf_animation_sampler*> samplersR(data->nodes_count); 589 | std::vector<cgltf_animation_sampler*> samplersS(data->nodes_count); 590 | 591 | for (size_t i = 0; i < data->animations_count; ++i) 592 | { 593 | cgltf_animation* anim = &data->animations[i]; 594 | 595 | for (size_t j = 0; j < anim->channels_count; ++j) 596 | { 597 | cgltf_animation_channel* channel = &anim->channels[j]; 598 | cgltf_animation_sampler* sampler = channel->sampler; 599 | 600 | if (!channel->target_node) 601 | continue; 602 | 603 | if (channel->target_path == cgltf_animation_path_type_translation) 604 | samplersT[cgltf_node_index(data, channel->target_node)] = sampler; 605 | else if (channel->target_path == cgltf_animation_path_type_rotation) 606 | samplersR[cgltf_node_index(data, channel->target_node)] = sampler; 607 | else if (channel->target_path == cgltf_animation_path_type_scale) 608 | samplersS[cgltf_node_index(data, channel->target_node)] = sampler; 609 | } 610 | } 611 | 612 | for (size_t i = 0; i < data->nodes_count; ++i) 613 | { 614 | if (!samplersR[i] && !samplersT[i] && !samplersS[i]) 615 | continue; 616 | 617 | if (nodeDraws[i] == -1) 618 | { 619 | fprintf(stderr, "Warning: skipping animation for node %d without draw\n", int(i)); 620 | continue; 621 | } 622 | 623 | cgltf_accessor* input = 0; 624 | if (samplersT[i]) 625 | input = samplersT[i]->input; 626 | else if (samplersR[i]) 627 | input = samplersR[i]->input; 628 | else if (samplersS[i]) 629 | input = samplersS[i]->input; 630 | 631 | if ((samplersT[i] && samplersT[i]->input->count != input->count) || 632 | (samplersR[i] && samplersR[i]->input->count != input->count) || 633 | (samplersS[i] && samplersS[i]->input->count != input->count)) 634 | { 635 | fprintf(stderr, "Warning: skipping animation for node %d due to mismatched sampler counts\n", int(i)); 636 | continue; 637 | } 638 | 639 | if ((samplersT[i] && samplersT[i]->interpolation != cgltf_interpolation_type_linear) || 640 | (samplersR[i] && samplersR[i]->interpolation != cgltf_interpolation_type_linear) || 641 | (samplersS[i] && samplersS[i]->interpolation != cgltf_interpolation_type_linear)) 642 | { 643 | fprintf(stderr, "Warning: skipping animation for node %d due to non-linear interpolation\n", int(i)); 644 | continue; 645 | } 646 | 647 | if (input->count < 2) 648 | { 649 | fprintf(stderr, "Warning: skipping animation for node %d with %d keyframes\n", int(i), int(input->count)); 650 | continue; 651 | } 652 | 653 | std::vector<float> times(input->count); 654 | cgltf_accessor_unpack_floats(input, times.data(),
times.size()); 655 | 656 | Animation animation = {}; 657 | animation.drawIndex = nodeDraws[i]; 658 | animation.startTime = times[0]; 659 | animation.period = times[1] - times[0]; // assumes uniformly spaced keyframes 660 | 661 | std::vector<float> valuesR, valuesT, valuesS; 662 | 663 | if (samplersT[i]) 664 | { 665 | valuesT.resize(samplersT[i]->output->count * 3); 666 | cgltf_accessor_unpack_floats(samplersT[i]->output, valuesT.data(), valuesT.size()); 667 | } 668 | 669 | if (samplersR[i]) 670 | { 671 | valuesR.resize(samplersR[i]->output->count * 4); 672 | cgltf_accessor_unpack_floats(samplersR[i]->output, valuesR.data(), valuesR.size()); 673 | } 674 | 675 | if (samplersS[i]) 676 | { 677 | valuesS.resize(samplersS[i]->output->count * 3); 678 | cgltf_accessor_unpack_floats(samplersS[i]->output, valuesS.data(), valuesS.size()); 679 | } 680 | 681 | cgltf_node nodeCopy = data->nodes[i]; 682 | 683 | for (size_t j = 0; j < input->count; ++j) 684 | { 685 | if (samplersT[i]) 686 | memcpy(nodeCopy.translation, &valuesT[j * 3], 3 * sizeof(float)); 687 | 688 | if (samplersR[i]) 689 | memcpy(nodeCopy.rotation, &valuesR[j * 4], 4 * sizeof(float)); 690 | 691 | if (samplersS[i]) 692 | memcpy(nodeCopy.scale, &valuesS[j * 3], 3 * sizeof(float)); 693 | 694 | float matrix[16]; 695 | cgltf_node_transform_world(&nodeCopy, matrix); 696 | 697 | float translation[3]; 698 | float rotation[4]; 699 | float scale[3]; 700 | decomposeTransform(translation, rotation, scale, matrix); 701 | 702 | Keyframe kf = {}; 703 | kf.translation = vec3(translation[0], translation[1], translation[2]); 704 | kf.rotation = quat(rotation[0], rotation[1], rotation[2], rotation[3]); 705 | kf.scale = std::max(scale[0], std::max(scale[1], scale[2])); 706 | 707 | animation.keyframes.push_back(kf); 708 | } 709 | 710 | animations.push_back(std::move(animation)); 711 | } 712 | 713 | printf("Loaded %s: %d meshes, %d draws, %d animations, %d vertices in %.2f sec\n", 714 | path, int(geometry.meshes.size()), int(draws.size()), int(animations.size()), int(geometry.vertices.size()), 715 | double(clock() - timer) / CLOCKS_PER_SEC); 716 | 717 | if (buildMeshlets) 718 | { 719 | unsigned int meshletVtxs = 0, meshletTris = 0; 720 | 721 | for (Meshlet& meshlet : geometry.meshlets) 722 | { 723 | meshletVtxs += meshlet.vertexCount; 724 | meshletTris += meshlet.triangleCount; 725 | } 726 | 727 | printf("Meshlets: %d meshlets, %d triangles, %d vertex refs\n", int(geometry.meshlets.size()), int(meshletTris), int(meshletVtxs)); 728 | } 729 | 730 | return true; 731 | } 732 | -------------------------------------------------------------------------------- /src/scenert.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #include "scenert.h" 4 | #include "scene.h" 5 | #include "resources.h" 6 | 7 | #include "config.h" 8 | 9 | #include <string.h> 10 | 11 | const VkBuildAccelerationStructureFlagsKHR kBuildBLAS = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR; 12 | const VkBuildAccelerationStructureFlagsKHR kBuildCLAS = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR; 13 | const VkBuildAccelerationStructureFlagsKHR kBuildTLAS = VK_BUILD_ACCELERATION_STRUCTURE_PREFER_FAST_TRACE_BIT_KHR; 14 | 15 | void buildBLAS(VkDevice device, const std::vector<Mesh>& meshes, const Buffer& vb, const Buffer& ib, std::vector<VkAccelerationStructureKHR>& blas, std::vector<VkDeviceSize>& compactedSizes, Buffer& blasBuffer, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties) 16 | { 17 | std::vector<uint32_t>
primitiveCounts(meshes.size()); 18 | std::vector<VkAccelerationStructureGeometryKHR> geometries(meshes.size()); 19 | std::vector<VkAccelerationStructureBuildGeometryInfoKHR> buildInfos(meshes.size()); 20 | 21 | const size_t kAlignment = 256; // required by spec for acceleration structures, could be smaller for scratch but it's a small waste 22 | const size_t kDefaultScratch = 32 * 1024 * 1024; // 32 MB scratch by default 23 | 24 | size_t totalAccelerationSize = 0; 25 | size_t totalPrimitiveCount = 0; 26 | size_t maxScratchSize = 0; 27 | 28 | std::vector<size_t> accelerationOffsets(meshes.size()); 29 | std::vector<size_t> accelerationSizes(meshes.size()); 30 | std::vector<size_t> scratchSizes(meshes.size()); 31 | 32 | VkDeviceAddress vbAddress = getBufferAddress(vb, device); 33 | VkDeviceAddress ibAddress = getBufferAddress(ib, device); 34 | 35 | for (size_t i = 0; i < meshes.size(); ++i) 36 | { 37 | const Mesh& mesh = meshes[i]; 38 | VkAccelerationStructureGeometryKHR& geo = geometries[i]; 39 | VkAccelerationStructureBuildGeometryInfoKHR& buildInfo = buildInfos[i]; 40 | 41 | unsigned int lodIndex = 0; 42 | 43 | primitiveCounts[i] = mesh.lods[lodIndex].indexCount / 3; 44 | 45 | geo.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR; 46 | geo.geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR; 47 | 48 | static_assert(offsetof(Vertex, vz) == offsetof(Vertex, vx) + sizeof(uint16_t) * 2, "Vertex layout mismatch"); 49 | 50 | geo.geometry.triangles.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_TRIANGLES_DATA_KHR; 51 | geo.geometry.triangles.vertexFormat = VK_FORMAT_R16G16B16A16_SFLOAT; 52 | geo.geometry.triangles.vertexData.deviceAddress = vbAddress + mesh.vertexOffset * sizeof(Vertex); 53 | geo.geometry.triangles.vertexStride = sizeof(Vertex); 54 | geo.geometry.triangles.maxVertex = mesh.vertexCount - 1; 55 | geo.geometry.triangles.indexType = VK_INDEX_TYPE_UINT32; 56 | geo.geometry.triangles.indexData.deviceAddress = ibAddress + mesh.lods[lodIndex].indexOffset * sizeof(uint32_t); 57 | 58 | buildInfo.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR; 59 | buildInfo.type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR; 60 | buildInfo.flags = kBuildBLAS | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR; 61 | buildInfo.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR; 62 | buildInfo.geometryCount = 1; 63 | buildInfo.pGeometries = &geo; 64 | 65 | VkAccelerationStructureBuildSizesInfoKHR sizeInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR }; 66 | vkGetAccelerationStructureBuildSizesKHR(device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &buildInfo, &primitiveCounts[i], &sizeInfo); 67 | 68 | accelerationOffsets[i] = totalAccelerationSize; 69 | accelerationSizes[i] = sizeInfo.accelerationStructureSize; 70 | scratchSizes[i] = sizeInfo.buildScratchSize; 71 | 72 | totalAccelerationSize = (totalAccelerationSize + sizeInfo.accelerationStructureSize + kAlignment - 1) & ~(kAlignment - 1); 73 | totalPrimitiveCount += primitiveCounts[i]; 74 | maxScratchSize = std::max(maxScratchSize, size_t(sizeInfo.buildScratchSize)); 75 | } 76 | 77 | createBuffer(blasBuffer, device, memoryProperties, totalAccelerationSize, VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 78 | 79 | Buffer scratchBuffer; 80 | createBuffer(scratchBuffer, device, memoryProperties, std::max(kDefaultScratch, maxScratchSize), VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 81 | 82 | printf("BLAS
accelerationStructureSize: %.2f MB, scratchSize: %.2f MB (max %.2f MB), %.3fM triangles\n", double(totalAccelerationSize) / 1e6, double(scratchBuffer.size) / 1e6, double(maxScratchSize) / 1e6, double(totalPrimitiveCount) / 1e6); 83 | 84 | VkDeviceAddress scratchAddress = getBufferAddress(scratchBuffer, device); 85 | 86 | blas.resize(meshes.size()); 87 | 88 | std::vector<VkAccelerationStructureBuildRangeInfoKHR> buildRanges(meshes.size()); 89 | std::vector<const VkAccelerationStructureBuildRangeInfoKHR*> buildRangePtrs(meshes.size()); 90 | 91 | for (size_t i = 0; i < meshes.size(); ++i) 92 | { 93 | VkAccelerationStructureCreateInfoKHR accelerationInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR }; 94 | accelerationInfo.buffer = blasBuffer.buffer; 95 | accelerationInfo.offset = accelerationOffsets[i]; 96 | accelerationInfo.size = accelerationSizes[i]; 97 | accelerationInfo.type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR; 98 | 99 | VK_CHECK(vkCreateAccelerationStructureKHR(device, &accelerationInfo, nullptr, &blas[i])); 100 | } 101 | 102 | VkQueryPoolCreateInfo createInfo = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO }; 103 | createInfo.queryType = VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR; 104 | createInfo.queryCount = blas.size(); 105 | 106 | VkQueryPool queryPool = 0; 107 | VK_CHECK(vkCreateQueryPool(device, &createInfo, 0, &queryPool)); 108 | 109 | VK_CHECK(vkResetCommandPool(device, commandPool, 0)); 110 | 111 | VkCommandBufferBeginInfo beginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; 112 | beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; 113 | 114 | VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); 115 | 116 | for (size_t start = 0; start < meshes.size();) 117 | { 118 | size_t scratchOffset = 0; 119 | 120 | // aggregate the range that fits into allocated scratch 121 | size_t i = start; 122 | while (i < meshes.size() && scratchOffset + scratchSizes[i] <= scratchBuffer.size) 123 | { 124 | buildInfos[i].scratchData.deviceAddress = scratchAddress + scratchOffset; 125 | buildInfos[i].dstAccelerationStructure = blas[i]; 126 | buildRanges[i].primitiveCount = primitiveCounts[i]; 127 | buildRangePtrs[i] = &buildRanges[i]; 128 | 129 | scratchOffset = (scratchOffset + scratchSizes[i] + kAlignment - 1) & ~(kAlignment - 1); 130 | ++i; 131 | } 132 | assert(i > start); // guaranteed as scratchBuffer.size >= maxScratchSize 133 | 134 | vkCmdBuildAccelerationStructuresKHR(commandBuffer, uint32_t(i - start), &buildInfos[start], &buildRangePtrs[start]); 135 | start = i; 136 | 137 | stageBarrier(commandBuffer, VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR); 138 | } 139 | 140 | vkCmdResetQueryPool(commandBuffer, queryPool, 0, blas.size()); 141 | vkCmdWriteAccelerationStructuresPropertiesKHR(commandBuffer, blas.size(), blas.data(), VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR, queryPool, 0); 142 | 143 | VK_CHECK(vkEndCommandBuffer(commandBuffer)); 144 | 145 | VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; 146 | submitInfo.commandBufferCount = 1; 147 | submitInfo.pCommandBuffers = &commandBuffer; 148 | 149 | VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE)); 150 | VK_CHECK(vkDeviceWaitIdle(device)); 151 | 152 | compactedSizes.resize(blas.size()); 153 | VK_CHECK(vkGetQueryPoolResults(device, queryPool, 0, blas.size(), blas.size() * sizeof(VkDeviceSize), compactedSizes.data(), sizeof(VkDeviceSize), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT)); 154 | 155 | vkDestroyQueryPool(device, queryPool, 0); 156 | 157 | destroyBuffer(scratchBuffer, device); 158 | } 159 | 160 | void
compactBLAS(VkDevice device, std::vector<VkAccelerationStructureKHR>& blas, const std::vector<VkDeviceSize>& compactedSizes, Buffer& blasBuffer, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties) 161 | { 162 | const size_t kAlignment = 256; // required by spec for acceleration structures 163 | 164 | VK_CHECK(vkResetCommandPool(device, commandPool, 0)); 165 | 166 | size_t totalCompactedSize = 0; 167 | std::vector<size_t> compactedOffsets(blas.size()); 168 | 169 | for (size_t i = 0; i < blas.size(); ++i) 170 | { 171 | compactedOffsets[i] = totalCompactedSize; 172 | totalCompactedSize = (totalCompactedSize + compactedSizes[i] + kAlignment - 1) & ~(kAlignment - 1); 173 | } 174 | 175 | printf("BLAS compacted accelerationStructureSize: %.2f MB\n", double(totalCompactedSize) / 1e6); 176 | 177 | Buffer compactedBuffer; 178 | createBuffer(compactedBuffer, device, memoryProperties, totalCompactedSize, VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 179 | 180 | std::vector<VkAccelerationStructureKHR> compactedBlas(blas.size()); 181 | 182 | for (size_t i = 0; i < blas.size(); ++i) 183 | { 184 | VkAccelerationStructureCreateInfoKHR accelerationInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR }; 185 | accelerationInfo.buffer = compactedBuffer.buffer; 186 | accelerationInfo.offset = compactedOffsets[i]; 187 | accelerationInfo.size = compactedSizes[i]; 188 | accelerationInfo.type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR; 189 | 190 | VK_CHECK(vkCreateAccelerationStructureKHR(device, &accelerationInfo, nullptr, &compactedBlas[i])); 191 | } 192 | 193 | VK_CHECK(vkResetCommandPool(device, commandPool, 0)); 194 | 195 | VkCommandBufferBeginInfo beginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; 196 | beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; 197 | 198 | VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); 199 | 200 | for (size_t i = 0; i < blas.size(); ++i) 201 | { 202 | VkCopyAccelerationStructureInfoKHR copyInfo = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR }; 203 | copyInfo.src = blas[i]; 204 | copyInfo.dst = compactedBlas[i]; 205 | copyInfo.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR; 206 | 207 | vkCmdCopyAccelerationStructureKHR(commandBuffer, &copyInfo); 208 | } 209 | 210 | VK_CHECK(vkEndCommandBuffer(commandBuffer)); 211 | 212 | VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; 213 | submitInfo.commandBufferCount = 1; 214 | submitInfo.pCommandBuffers = &commandBuffer; 215 | 216 | VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE)); 217 | VK_CHECK(vkDeviceWaitIdle(device)); 218 | 219 | for (size_t i = 0; i < blas.size(); ++i) 220 | { 221 | vkDestroyAccelerationStructureKHR(device, blas[i], nullptr); 222 | blas[i] = compactedBlas[i]; 223 | } 224 | 225 | destroyBuffer(blasBuffer, device); 226 | blasBuffer = compactedBuffer; 227 | } 228 | 229 | void buildCBLAS(VkDevice device, const std::vector<Mesh>& meshes, const std::vector<Meshlet>& meshlets, const Buffer& vxb, const Buffer& mdb, std::vector<VkAccelerationStructureKHR>& blas, Buffer& blasBuffer, VkCommandPool commandPool, VkCommandBuffer commandBuffer, VkQueue queue, const VkPhysicalDeviceMemoryProperties& memoryProperties) 230 | { 231 | #ifdef VK_NV_cluster_acceleration_structure 232 | const size_t kAlignment = 256; // required by spec for acceleration structures 233 | const size_t kClusterAlignment = 128; // required by spec for cluster acceleration structures 234 | 235 |
VkClusterAccelerationStructureTriangleClusterInputNV clusterSizes = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_TRIANGLE_CLUSTER_INPUT_NV }; 236 | clusterSizes.vertexFormat = VK_FORMAT_R16G16B16A16_SFLOAT; 237 | clusterSizes.maxGeometryIndexValue = 0; 238 | clusterSizes.maxClusterUniqueGeometryCount = 1; 239 | clusterSizes.maxClusterTriangleCount = MESH_MAXTRI; 240 | clusterSizes.maxClusterVertexCount = MESH_MAXVTX; 241 | clusterSizes.minPositionTruncateBitCount = 0; 242 | 243 | VkClusterAccelerationStructureInputInfoNV clusterInfo = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_INPUT_INFO_NV }; 244 | clusterInfo.maxAccelerationStructureCount = 0; 245 | clusterInfo.flags = kBuildCLAS; 246 | clusterInfo.opType = VK_CLUSTER_ACCELERATION_STRUCTURE_OP_TYPE_BUILD_TRIANGLE_CLUSTER_NV; 247 | clusterInfo.opMode = VK_CLUSTER_ACCELERATION_STRUCTURE_OP_MODE_IMPLICIT_DESTINATIONS_NV; 248 | clusterInfo.opInput.pTriangleClusters = &clusterSizes; 249 | 250 | size_t maxClustersPerMesh = 0; 251 | 252 | for (const Mesh& mesh : meshes) 253 | { 254 | clusterSizes.maxTotalTriangleCount += mesh.lods[0].indexCount / 3; 255 | clusterInfo.maxAccelerationStructureCount += mesh.lods[0].meshletCount; 256 | maxClustersPerMesh = std::max(maxClustersPerMesh, size_t(mesh.lods[0].meshletCount)); 257 | 258 | for (size_t mi = 0; mi < mesh.lods[0].meshletCount; ++mi) 259 | { 260 | const Meshlet& ml = meshlets[mesh.lods[0].meshletOffset + mi]; 261 | clusterSizes.maxTotalVertexCount += ml.vertexCount; 262 | } 263 | } 264 | 265 | VkClusterAccelerationStructureClustersBottomLevelInputNV accelSizes = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_CLUSTERS_BOTTOM_LEVEL_INPUT_NV }; 266 | accelSizes.maxTotalClusterCount = clusterInfo.maxAccelerationStructureCount; 267 | accelSizes.maxClusterCountPerAccelerationStructure = maxClustersPerMesh; 268 | 269 | VkClusterAccelerationStructureMoveObjectsInputNV moveSizes = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_MOVE_OBJECTS_INPUT_NV }; 270 | moveSizes.type = VK_CLUSTER_ACCELERATION_STRUCTURE_TYPE_TRIANGLE_CLUSTER_NV; 271 | moveSizes.noMoveOverlap = true; 272 | 273 | VkClusterAccelerationStructureInputInfoNV accelInfo = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_INPUT_INFO_NV }; 274 | accelInfo.maxAccelerationStructureCount = meshes.size(); 275 | accelInfo.flags = kBuildBLAS; 276 | accelInfo.opType = VK_CLUSTER_ACCELERATION_STRUCTURE_OP_TYPE_BUILD_CLUSTERS_BOTTOM_LEVEL_NV; 277 | accelInfo.opMode = VK_CLUSTER_ACCELERATION_STRUCTURE_OP_MODE_IMPLICIT_DESTINATIONS_NV; 278 | accelInfo.opInput.pClustersBottomLevel = &accelSizes; 279 | 280 | VkClusterAccelerationStructureInputInfoNV moveInfo = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_INPUT_INFO_NV }; 281 | moveInfo.maxAccelerationStructureCount = clusterInfo.maxAccelerationStructureCount; 282 | moveInfo.opType = VK_CLUSTER_ACCELERATION_STRUCTURE_OP_TYPE_MOVE_OBJECTS_NV; 283 | moveInfo.opMode = VK_CLUSTER_ACCELERATION_STRUCTURE_OP_MODE_IMPLICIT_DESTINATIONS_NV; 284 | moveInfo.opInput.pMoveObjects = &moveSizes; 285 | 286 | VkAccelerationStructureBuildSizesInfoKHR csizeInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR }; 287 | vkGetClusterAccelerationStructureBuildSizesNV(device, &clusterInfo, &csizeInfo); 288 | 289 | VkAccelerationStructureBuildSizesInfoKHR bsizeInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR }; 290 | vkGetClusterAccelerationStructureBuildSizesNV(device, &accelInfo, &bsizeInfo); 291 | 292 | moveSizes.maxMovedBytes = 
csizeInfo.accelerationStructureSize; 293 | 294 | VkAccelerationStructureBuildSizesInfoKHR msizeInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR }; 295 | vkGetClusterAccelerationStructureBuildSizesNV(device, &moveInfo, &msizeInfo); 296 | 297 | printf("CLAS accelerationStructureSize: %.2f MB, scratchSize: %.2f MB, compaction scratchSize: %.2f MB\n", double(csizeInfo.accelerationStructureSize) / 1e6, double(csizeInfo.buildScratchSize) / 1e6, double(msizeInfo.updateScratchSize) / 1e6); 298 | printf("CBLAS accelerationStructureSize: %.2f MB, scratchSize: %.2f MB\n", double(bsizeInfo.accelerationStructureSize) / 1e6, double(bsizeInfo.buildScratchSize) / 1e6); 299 | 300 | Buffer clasBuffer; 301 | createBuffer(clasBuffer, device, memoryProperties, csizeInfo.accelerationStructureSize, VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 302 | 303 | Buffer scratchBuffer; 304 | createBuffer(scratchBuffer, device, memoryProperties, std::max(std::max(bsizeInfo.buildScratchSize, csizeInfo.buildScratchSize), msizeInfo.updateScratchSize), VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 305 | 306 | Buffer infosBuffer; 307 | createBuffer(infosBuffer, device, memoryProperties, std::max(clusterInfo.maxAccelerationStructureCount * sizeof(VkClusterAccelerationStructureBuildTriangleClusterInfoNV), accelInfo.maxAccelerationStructureCount * sizeof(VkClusterAccelerationStructureBuildClustersBottomLevelInfoNV)), VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); 308 | 309 | VkDeviceAddress mdbAddress = getBufferAddress(mdb, device); 310 | VkDeviceAddress vxbAddress = getBufferAddress(vxb, device); 311 | 312 | VkClusterAccelerationStructureBuildTriangleClusterInfoNV* clusterData = static_cast<VkClusterAccelerationStructureBuildTriangleClusterInfoNV*>(infosBuffer.data); 313 | size_t vxbOffset = 0; 314 | 315 | for (const Mesh& mesh : meshes) 316 | { 317 | for (size_t mi = 0; mi < mesh.lods[0].meshletCount; ++mi) 318 | { 319 | const Meshlet& ml = meshlets[mesh.lods[0].meshletOffset + mi]; 320 | 321 | VkClusterAccelerationStructureBuildTriangleClusterInfoNV cluster = {}; 322 | cluster.clusterID = uint32_t(mi); 323 | cluster.triangleCount = ml.triangleCount; 324 | cluster.vertexCount = ml.vertexCount; 325 | cluster.positionTruncateBitCount = 0; 326 | cluster.indexType = VK_CLUSTER_ACCELERATION_STRUCTURE_INDEX_FORMAT_8BIT_NV; 327 | cluster.vertexBufferStride = sizeof(uint16_t) * 4; 328 | cluster.indexBuffer = mdbAddress + (ml.dataOffset + (ml.shortRefs ? (ml.vertexCount + 1) / 2 : ml.vertexCount)) * sizeof(uint32_t); 329 | cluster.vertexBuffer = vxbAddress + vxbOffset; 330 | 331 | memcpy(clusterData, &cluster, sizeof(VkClusterAccelerationStructureBuildTriangleClusterInfoNV)); 332 | clusterData++; 333 | vxbOffset += ml.vertexCount * (sizeof(uint16_t) * 4); 334 | } 335 | } 336 | 337 | Buffer rangeBuffer; 338 | // todo host vis -> device local?
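// rangeBuffer layout, derived from the offsets used below (C = cluster count, B = CBLAS count; 8-byte entries):
//   [0, C*8)            CLAS addresses written by the implicit cluster build
//   [C*8, C*16)         CLAS build sizes, later overwritten by the moved (compacted) CLAS addresses
//   [C*16, C*16 + B*8)  CBLAS addresses
//   [C*16 + B*8, ...)   CBLAS sizes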
339 | // todo merge with infos and suballocate more cleanly 340 | createBuffer(rangeBuffer, device, memoryProperties, (clusterInfo.maxAccelerationStructureCount + accelInfo.maxAccelerationStructureCount) * 16, VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR | VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); 341 | 342 | VkClusterAccelerationStructureCommandsInfoNV clusterBuild = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_COMMANDS_INFO_NV }; 343 | clusterBuild.input = clusterInfo; 344 | clusterBuild.dstImplicitData = getBufferAddress(clasBuffer, device); 345 | clusterBuild.scratchData = getBufferAddress(scratchBuffer, device); 346 | clusterBuild.dstAddressesArray.deviceAddress = getBufferAddress(rangeBuffer, device); 347 | clusterBuild.dstAddressesArray.size = clusterInfo.maxAccelerationStructureCount * 8; 348 | clusterBuild.dstAddressesArray.stride = 8; 349 | clusterBuild.dstSizesArray.deviceAddress = getBufferAddress(rangeBuffer, device) + clusterInfo.maxAccelerationStructureCount * 8; 350 | clusterBuild.dstSizesArray.size = clusterInfo.maxAccelerationStructureCount * 8; 351 | clusterBuild.dstSizesArray.stride = 8; 352 | clusterBuild.srcInfosArray.deviceAddress = getBufferAddress(infosBuffer, device); 353 | clusterBuild.srcInfosArray.size = clusterInfo.maxAccelerationStructureCount * sizeof(VkClusterAccelerationStructureBuildTriangleClusterInfoNV); 354 | clusterBuild.srcInfosArray.stride = sizeof(VkClusterAccelerationStructureBuildTriangleClusterInfoNV); // TODO: redundant, validation layers bug 355 | 356 | VK_CHECK(vkResetCommandPool(device, commandPool, 0)); 357 | 358 | VkCommandBufferBeginInfo beginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; 359 | beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; 360 | 361 | VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); 362 | 363 | vkCmdBuildClusterAccelerationStructureIndirectNV(commandBuffer, &clusterBuild); 364 | 365 | VK_CHECK(vkEndCommandBuffer(commandBuffer)); 366 | 367 | VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; 368 | submitInfo.commandBufferCount = 1; 369 | submitInfo.pCommandBuffers = &commandBuffer; 370 | 371 | VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE)); 372 | VK_CHECK(vkDeviceWaitIdle(device)); 373 | 374 | size_t compactTotalSize = 0; 375 | for (size_t i = 0; i < clusterInfo.maxAccelerationStructureCount; ++i) 376 | { 377 | uint32_t size = ((uint32_t*)rangeBuffer.data)[clusterInfo.maxAccelerationStructureCount * 2 + i * 2]; 378 | 379 | compactTotalSize += (size + kClusterAlignment - 1) & ~(kClusterAlignment - 1); 380 | } 381 | 382 | // align subsequent acceleration structures 383 | compactTotalSize = (compactTotalSize + kAlignment - 1) & ~(kAlignment - 1); 384 | 385 | printf("CLAS compacted accelerationStructureSize: %.2f MB\n", double(compactTotalSize) / 1e6); 386 | printf("CLAS+CBLAS accelerationStructureSize: %.2f MB\n", double(compactTotalSize + bsizeInfo.accelerationStructureSize) / 1e6); 387 | 388 | createBuffer(blasBuffer, device, memoryProperties, compactTotalSize + bsizeInfo.accelerationStructureSize, VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 389 | 390 | // TODO: we are not actually querying size required for compaction, so scratch could be insufficient 391 | VkClusterAccelerationStructureCommandsInfoNV 
clusterMove = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_COMMANDS_INFO_NV }; 392 | clusterMove.input = moveInfo; 393 | clusterMove.dstImplicitData = getBufferAddress(blasBuffer, device); 394 | clusterMove.scratchData = getBufferAddress(scratchBuffer, device); 395 | clusterMove.dstAddressesArray.deviceAddress = getBufferAddress(rangeBuffer, device) + clusterInfo.maxAccelerationStructureCount * 8; 396 | clusterMove.dstAddressesArray.size = moveInfo.maxAccelerationStructureCount * 8; 397 | clusterMove.dstAddressesArray.stride = 8; 398 | clusterMove.srcInfosArray.deviceAddress = getBufferAddress(rangeBuffer, device); 399 | clusterMove.srcInfosArray.size = moveInfo.maxAccelerationStructureCount * 8; 400 | clusterMove.srcInfosArray.stride = 8; // TODO: redundant, probably a driver bug 401 | 402 | printf("max cluster count %d, total cluster count %d, total blas count %d\n", int(maxClustersPerMesh), int(clusterInfo.maxAccelerationStructureCount), int(accelInfo.maxAccelerationStructureCount)); 403 | 404 | VkClusterAccelerationStructureBuildClustersBottomLevelInfoNV* accelData = static_cast<VkClusterAccelerationStructureBuildClustersBottomLevelInfoNV*>(infosBuffer.data); 405 | size_t accelOffset = 0; 406 | 407 | for (const Mesh& mesh : meshes) 408 | { 409 | VkClusterAccelerationStructureBuildClustersBottomLevelInfoNV accel = {}; 410 | accel.clusterReferencesCount = uint32_t(mesh.lods[0].meshletCount); 411 | accel.clusterReferencesStride = 8; 412 | accel.clusterReferences = getBufferAddress(rangeBuffer, device) + clusterInfo.maxAccelerationStructureCount * 8 + accelOffset * 8; 413 | 414 | memcpy(accelData, &accel, sizeof(VkClusterAccelerationStructureBuildClustersBottomLevelInfoNV)); 415 | accelData++; 416 | accelOffset += mesh.lods[0].meshletCount; 417 | } 418 | 419 | VkClusterAccelerationStructureCommandsInfoNV accelBuild = { VK_STRUCTURE_TYPE_CLUSTER_ACCELERATION_STRUCTURE_COMMANDS_INFO_NV }; 420 | accelBuild.input = accelInfo; 421 | accelBuild.dstImplicitData = getBufferAddress(blasBuffer, device) + compactTotalSize; 422 | accelBuild.scratchData = getBufferAddress(scratchBuffer, device); 423 | accelBuild.dstAddressesArray.deviceAddress = getBufferAddress(rangeBuffer, device) + clusterInfo.maxAccelerationStructureCount * 16; 424 | accelBuild.dstAddressesArray.size = accelInfo.maxAccelerationStructureCount * 8; 425 | accelBuild.dstAddressesArray.stride = 8; 426 | accelBuild.dstSizesArray.deviceAddress = getBufferAddress(rangeBuffer, device) + clusterInfo.maxAccelerationStructureCount * 16 + accelInfo.maxAccelerationStructureCount * 8; 427 | accelBuild.dstSizesArray.size = accelInfo.maxAccelerationStructureCount * 8; 428 | accelBuild.dstSizesArray.stride = 8; 429 | accelBuild.srcInfosArray.deviceAddress = getBufferAddress(infosBuffer, device); 430 | accelBuild.srcInfosArray.size = accelInfo.maxAccelerationStructureCount * sizeof(VkClusterAccelerationStructureBuildClustersBottomLevelInfoNV); 431 | accelBuild.srcInfosArray.stride = sizeof(VkClusterAccelerationStructureBuildClustersBottomLevelInfoNV); // TODO: redundant, validation layers bug 432 | 433 | VK_CHECK(vkResetCommandPool(device, commandPool, 0)); 434 | VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); 435 | 436 | vkCmdBuildClusterAccelerationStructureIndirectNV(commandBuffer, &clusterMove); 437 | 438 | stageBarrier(commandBuffer, VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR); 439 | 440 | vkCmdBuildClusterAccelerationStructureIndirectNV(commandBuffer, &accelBuild); 441 | 442 | VK_CHECK(vkEndCommandBuffer(commandBuffer)); 443 | VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo,
VK_NULL_HANDLE)); 444 | VK_CHECK(vkDeviceWaitIdle(device)); 445 | 446 | VkDeviceAddress blasAddress = getBufferAddress(blasBuffer, device); 447 | uint32_t* rangeAccel = (uint32_t*)rangeBuffer.data + clusterInfo.maxAccelerationStructureCount * 4; 448 | 449 | blas.resize(meshes.size()); 450 | 451 | for (size_t i = 0; i < accelInfo.maxAccelerationStructureCount; ++i) 452 | { 453 | VkAccelerationStructureCreateInfoKHR accelerationInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR }; 454 | accelerationInfo.buffer = blasBuffer.buffer; 455 | accelerationInfo.offset = ((uint64_t*)rangeAccel)[i] - blasAddress; 456 | accelerationInfo.size = rangeAccel[accelInfo.maxAccelerationStructureCount * 2 + i * 2]; 457 | accelerationInfo.type = VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR; 458 | 459 | VK_CHECK(vkCreateAccelerationStructureKHR(device, &accelerationInfo, nullptr, &blas[i])); 460 | } 461 | 462 | destroyBuffer(scratchBuffer, device); 463 | destroyBuffer(infosBuffer, device); 464 | destroyBuffer(rangeBuffer, device); 465 | destroyBuffer(clasBuffer, device); 466 | #else 467 | VK_CHECK(VK_ERROR_FEATURE_NOT_PRESENT); 468 | #endif 469 | } 470 | 471 | void fillInstanceRT(VkAccelerationStructureInstanceKHR& instance, const MeshDraw& draw, uint32_t instanceIndex, VkDeviceAddress blas) 472 | { 473 | mat3 xform = transpose(glm::mat3_cast(draw.orientation)) * draw.scale; 474 | 475 | memcpy(instance.transform.matrix[0], &xform[0], sizeof(float) * 3); 476 | memcpy(instance.transform.matrix[1], &xform[1], sizeof(float) * 3); 477 | memcpy(instance.transform.matrix[2], &xform[2], sizeof(float) * 3); 478 | instance.transform.matrix[0][3] = draw.position.x; 479 | instance.transform.matrix[1][3] = draw.position.y; 480 | instance.transform.matrix[2][3] = draw.position.z; 481 | instance.instanceCustomIndex = instanceIndex; 482 | instance.mask = 1 << draw.postPass; 483 | instance.flags = draw.postPass ? VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR : VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR; 484 | instance.accelerationStructureReference = draw.postPass <= 1 ? 
blas : 0; 485 | } 486 | 487 | VkAccelerationStructureKHR createTLAS(VkDevice device, Buffer& tlasBuffer, Buffer& scratchBuffer, const Buffer& instanceBuffer, uint32_t primitiveCount, const VkPhysicalDeviceMemoryProperties& memoryProperties) 488 | { 489 | VkAccelerationStructureGeometryKHR geometry = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR }; 490 | geometry.geometryType = VK_GEOMETRY_TYPE_INSTANCES_KHR; 491 | geometry.geometry.instances.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_INSTANCES_DATA_KHR; 492 | geometry.geometry.instances.data.deviceAddress = getBufferAddress(instanceBuffer, device); 493 | 494 | VkAccelerationStructureBuildGeometryInfoKHR buildInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR }; 495 | buildInfo.type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR; 496 | buildInfo.flags = kBuildTLAS | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR; 497 | buildInfo.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_BUILD_KHR; 498 | buildInfo.geometryCount = 1; 499 | buildInfo.pGeometries = &geometry; 500 | 501 | VkAccelerationStructureBuildSizesInfoKHR sizeInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR }; 502 | vkGetAccelerationStructureBuildSizesKHR(device, VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR, &buildInfo, &primitiveCount, &sizeInfo); 503 | 504 | printf("TLAS accelerationStructureSize: %.2f MB, scratchSize: %.2f MB, updateScratch: %.2f MB\n", double(sizeInfo.accelerationStructureSize) / 1e6, double(sizeInfo.buildScratchSize) / 1e6, double(sizeInfo.updateScratchSize) / 1e6); 505 | 506 | createBuffer(tlasBuffer, device, memoryProperties, sizeInfo.accelerationStructureSize, VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_STORAGE_BIT_KHR, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 507 | 508 | createBuffer(scratchBuffer, device, memoryProperties, std::max(sizeInfo.buildScratchSize, sizeInfo.updateScratchSize), VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); 509 | 510 | VkAccelerationStructureCreateInfoKHR accelerationInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_CREATE_INFO_KHR }; 511 | accelerationInfo.buffer = tlasBuffer.buffer; 512 | accelerationInfo.size = sizeInfo.accelerationStructureSize; 513 | accelerationInfo.type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR; 514 | 515 | VkAccelerationStructureKHR tlas = nullptr; 516 | VK_CHECK(vkCreateAccelerationStructureKHR(device, &accelerationInfo, nullptr, &tlas)); 517 | 518 | return tlas; 519 | } 520 | 521 | void buildTLAS(VkDevice device, VkCommandBuffer commandBuffer, VkAccelerationStructureKHR tlas, const Buffer& tlasBuffer, const Buffer& scratchBuffer, const Buffer& instanceBuffer, uint32_t primitiveCount, VkBuildAccelerationStructureModeKHR mode) 522 | { 523 | VkAccelerationStructureGeometryKHR geometry = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR }; 524 | geometry.geometryType = VK_GEOMETRY_TYPE_INSTANCES_KHR; 525 | geometry.geometry.instances.sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_INSTANCES_DATA_KHR; 526 | geometry.geometry.instances.data.deviceAddress = getBufferAddress(instanceBuffer, device); 527 | 528 | VkAccelerationStructureBuildGeometryInfoKHR buildInfo = { VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_GEOMETRY_INFO_KHR }; 529 | buildInfo.type = VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR; 530 | buildInfo.flags = kBuildTLAS | VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR; 531 | buildInfo.mode = mode; 532 | buildInfo.geometryCount = 1; 
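// note: same instance geometry and flags as in createTLAS; the spec requires these to match so that VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR can refit the TLAS in place (src == dst below)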
533 | buildInfo.pGeometries = &geometry; 534 | 535 | buildInfo.srcAccelerationStructure = tlas; 536 | buildInfo.dstAccelerationStructure = tlas; 537 | buildInfo.scratchData.deviceAddress = getBufferAddress(scratchBuffer, device); 538 | 539 | VkAccelerationStructureBuildRangeInfoKHR buildRange = {}; 540 | buildRange.primitiveCount = primitiveCount; 541 | const VkAccelerationStructureBuildRangeInfoKHR* buildRangePtr = &buildRange; 542 | 543 | vkCmdBuildAccelerationStructuresKHR(commandBuffer, 1, &buildInfo, &buildRangePtr); 544 | 545 | stageBarrier(commandBuffer, VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR, VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); 546 | } 547 | --------------------------------------------------------------------------------