├── .gitignore ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── demo.jpg ├── examples ├── basic_bvh2.rs ├── basic_cwbvh.rs ├── cornell_box_cwbvh.rs └── demoscene.rs ├── src ├── aabb.rs ├── bvh2 │ ├── builder.rs │ ├── leaf_collapser.rs │ ├── mod.rs │ └── reinsertion.rs ├── cwbvh │ ├── builder.rs │ ├── bvh2_to_cwbvh.rs │ ├── mod.rs │ ├── node.rs │ ├── simd.rs │ └── traverse_macro.rs ├── heapstack.rs ├── lib.rs ├── ploc │ ├── mod.rs │ └── morton.rs ├── ray.rs ├── rt_triangle.rs ├── splits.rs ├── test_util.rs └── triangle.rs └── tests └── mod.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .vscode 3 | Cargo.lock 4 | *_rend.png -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "obvhs" 3 | version = "0.2.0" 4 | edition = "2021" 5 | description = "BVH Construction and Traversal Library" 6 | homepage = "https://github.com/DGriffin91/obvhs" 7 | repository = "https://github.com/DGriffin91/obvhs" 8 | readme = "README.md" 9 | license = "MIT OR Apache-2.0" 10 | keywords = ["bvh", "sah", "aabb", "cwbvh", "ploc"] 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | 14 | [dependencies] 15 | glam = { version = "0.29", features = ["bytemuck"] } 16 | half = "2.3.1" 17 | bytemuck = "1.15" 18 | rdst = { version = "0.20.14", default-features = false } 19 | rayon = { version = "1.9.0", optional = true } 20 | 21 | # Noop unless one of the profile-with features below is also used 22 | profiling = { version = "1.0", optional = true } 23 | 24 | [dev-dependencies] 25 | image = "0.24" 26 | 27 | [features] 28 | #default = [] 29 | parallel = ["dep:rayon", "rdst/multi-threaded"] 30 | timeit = [] 31 | 32 | profile = ["dep:profiling"] 33 | profile-with-puffin = ["profiling/profile-with-puffin"] 34 | 
profile-with-optick = ["profiling/profile-with-optick"] 35 | profile-with-superluminal = ["profiling/profile-with-superluminal"] 36 | profile-with-tracing = ["profiling/profile-with-tracing"] 37 | profile-with-tracy = ["profiling/profile-with-tracy"] 38 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OBVHS - BVH Construction and Traversal Library 2 | 3 | ![License](https://img.shields.io/badge/license-MIT%2FApache-blue.svg) [![Crates.io](https://img.shields.io/crates/v/obvhs.svg)](https://crates.io/crates/obvhs) 4 | [![Docs](https://docs.rs/obvhs/badge.svg)](https://docs.rs/obvhs/latest/obvhs/) 5 | 6 | - [PLOC](https://meistdan.github.io/publications/ploc/paper.pdf) BVH2 builder with [Parallel Reinsertion](https://meistdan.github.io/publications/prbvh/paper.pdf) and spatial pre-splits. 7 | - [CWBVH](https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf) An eight-way compressed wide BVH8 builder. Each BVH Node is compressed so that it takes up only 80 bytes per node. 8 | - CPU traversal for both BVH2 and CWBVH (SIMD traversal, intersecting 4 nodes at a time) 9 | - For GPU traversal example, see the [Tray Racing](https://github.com/DGriffin91/tray_racing) benchmark 10 | 11 | ![demo](demo.jpg) 12 | [*demoscene example*](https://github.com/DGriffin91/obvhs/blob/main/examples/demoscene.rs) 13 | 14 | OBVHS optionally uses [rayon](https://github.com/rayon-rs/rayon) to parallelize building. Many parts of the building process are parallelized, but single threaded building speed has initially been the priority so there is still quite a bit of room for improvement in parallel building performance. 15 | 16 | # Benchmarks 17 | See [Tray Racing](https://github.com/DGriffin91/tray_racing). 18 | 19 | # Acknowledgments 20 | - [Tomasz Stachowiak](https://github.com/h3r2tic) for the initial rust/embree CWBVH builder, HLSL traversal, and numerous discussions along the way. 21 | - Jan Van Bergen for their [wonderful CUDA path tracer that implements CWBVH](https://github.com/jan-van-bergen/GPU-Raytracer).
22 | - Arsène Pérard-Gayot for their [series of articles on BVHs](https://madmann91.github.io/) and [BVH library](https://github.com/madmann91/bvh). 23 | - H. Ylitie et al. for [Efficient Incoherent Ray Traversal on GPUs Through 24 | Compressed Wide BVHs](https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf). 25 | - D. Meister et al. for [Parallel Locally-Ordered Clustering for Bounding Volume Hierarchy Construction](https://meistdan.github.io/publications/ploc/paper.pdf), [Parallel Reinsertion for Bounding Volume Hierarchy Optimization](https://meistdan.github.io/publications/prbvh/paper.pdf), and [Performance Comparison of Bounding Volume Hierarchies for GPU Ray Tracing](https://jcgt.org/published/0011/04/01/paper.pdf). 26 | -------------------------------------------------------------------------------- /demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DGriffin91/obvhs/63e11fdbb9de52375695c8020bf61910e2825986/demo.jpg -------------------------------------------------------------------------------- /examples/basic_bvh2.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use glam::*; 4 | use obvhs::{ 5 | bvh2::builder::build_bvh2_from_tris, 6 | ray::{Ray, RayHit}, 7 | test_util::geometry::{icosphere, PLANE}, 8 | triangle::Triangle, 9 | BvhBuildParams, 10 | }; 11 | 12 | fn main() { 13 | // Build a scene with an icosphere and a plane 14 | // BVH primitives do not need to be triangles, the BVH builder is only concerned with AABBs. 15 | // (With the exception of optional precise triangle aabb splitting) 16 | let mut tris: Vec = Vec::new(); 17 | tris.extend(icosphere(1)); 18 | tris.extend(PLANE); 19 | 20 | // Build the BVH. 21 | // build_bvh_from_tris is just a helper that can build from BvhBuildParams and the 22 | // respective presets. 
Feel free to copy the contents of build_bvh_from_tris or build_bvh. 23 | // They are very straightforward. If you don't want to use Triangles as the primitive, use 24 | // build_bvh instead. build_cwbvh_from_tris just adds support for splitting tris. 25 | let bvh = build_bvh2_from_tris( 26 | &tris, 27 | BvhBuildParams::medium_build(), 28 | &mut Duration::default(), 29 | ); 30 | 31 | // Create a new ray 32 | let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0)); 33 | 34 | // Traverse the BVH, finding the closest hit. 35 | let mut ray_hit = RayHit::none(); 36 | if bvh.ray_traverse(ray, &mut ray_hit, |ray, id| { 37 | // Use primitive_indices to look up the original primitive id. 38 | // (Could reorder tris per bvh.primitive_indices to avoid this lookup, see cornell_box_cwbvh example) 39 | tris[bvh.primitive_indices[id] as usize].intersect(ray) 40 | }) { 41 | println!( 42 | "Hit Triangle {}", 43 | bvh.primitive_indices[ray_hit.primitive_id as usize] 44 | ); 45 | println!("Distance to hit: {}", ray_hit.t); 46 | } else { 47 | println!("Miss"); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/basic_cwbvh.rs: -------------------------------------------------------------------------------- 1 | use glam::*; 2 | use obvhs::{ 3 | cwbvh::builder::build_cwbvh_from_tris, 4 | ray::{Ray, RayHit}, 5 | test_util::geometry::{icosphere, PLANE}, 6 | triangle::Triangle, 7 | BvhBuildParams, 8 | }; 9 | use std::time::Duration; 10 | 11 | fn main() { 12 | // Build a scene with an icosphere and a plane 13 | // BVH primitives do not need to be triangles, the BVH builder is only concerned with AABBs. 14 | // (With the exception of optional precise triangle aabb splitting) 15 | let mut tris: Vec = Vec::new(); 16 | tris.extend(icosphere(1)); 17 | tris.extend(PLANE); 18 | 19 | // Build the BVH. 20 | // build_cwbvh_from_tris is just a helper that can build from BvhBuildParams and the 21 | // respective presets. 
Feel free to copy the contents of build_cwbvh_from_tris or 22 | // build_cwbvh. They are very straightforward. If you don't want to use Triangles as the 23 | // primitive, use build_cwbvh instead. build_cwbvh_from_tris just adds support for 24 | // splitting tris. 25 | let bvh = build_cwbvh_from_tris( 26 | &tris, 27 | BvhBuildParams::medium_build(), 28 | &mut Duration::default(), 29 | ); 30 | 31 | // Create a new ray 32 | let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0)); 33 | 34 | // Traverse the BVH, finding the closest hit. 35 | let mut ray_hit = RayHit::none(); 36 | if bvh.ray_traverse(ray, &mut ray_hit, |ray, id| { 37 | // Use primitive_indices to look up the original primitive id. 38 | // (Could reorder tris per bvh.primitive_indices to avoid this lookup, see 39 | // cornell_box_cwbvh example) 40 | tris[bvh.primitive_indices[id] as usize].intersect(ray) 41 | }) { 42 | println!( 43 | "Hit Triangle {}", 44 | bvh.primitive_indices[ray_hit.primitive_id as usize] 45 | ); 46 | println!("Distance to hit: {}", ray_hit.t); 47 | } else { 48 | println!("Miss"); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /examples/cornell_box_cwbvh.rs: -------------------------------------------------------------------------------- 1 | use std::{f32::consts::PI, time::Duration}; 2 | 3 | use glam::*; 4 | use image::{ImageBuffer, Rgba}; 5 | use obvhs::{ 6 | cwbvh::builder::build_cwbvh_from_tris, 7 | ray::{Ray, RayHit}, 8 | test_util::geometry::{CUBE, PLANE}, 9 | triangle::Triangle, 10 | BvhBuildParams, Transformable, 11 | }; 12 | 13 | // Generate triangles for cornell box 14 | fn generate_cornell_box() -> Vec { 15 | let floor = PLANE; 16 | let mut box1 = CUBE; 17 | let mut box2 = box1.clone(); 18 | let mut ceiling = floor.clone(); 19 | let mut wall1 = floor.clone(); 20 | let mut wall2 = floor.clone(); 21 | let mut wall3 = floor.clone(); 22 | box1.transform(&Mat4::from_scale_rotation_translation( 23 | 
Vec3::splat(0.3), 24 | Quat::from_rotation_y(-17.5f32.to_radians()), 25 | vec3(0.33, 0.3, 0.37), 26 | )); 27 | box2.transform(&Mat4::from_scale_rotation_translation( 28 | vec3(0.3, 0.6, 0.3), 29 | Quat::from_rotation_y(17.5f32.to_radians()), 30 | vec3(-0.33, 0.6, -0.29), 31 | )); 32 | ceiling.transform(&Mat4::from_translation(Vec3::Y * 2.0)); 33 | wall1.transform(&Mat4::from_rotation_translation( 34 | Quat::from_rotation_x(PI * 0.5), 35 | vec3(0.0, 1.0, -1.0), 36 | )); 37 | wall2.transform(&Mat4::from_rotation_translation( 38 | Quat::from_rotation_z(-PI * 0.5), 39 | vec3(-1.0, 1.0, 0.0), 40 | )); 41 | wall3.transform(&Mat4::from_rotation_translation( 42 | Quat::from_rotation_z(-PI * 0.5), 43 | vec3(1.0, 1.0, 0.0), 44 | )); 45 | let mut tris = Vec::new(); 46 | tris.extend(floor); 47 | tris.extend(box1); 48 | tris.extend(box2); 49 | tris.extend(ceiling); 50 | tris.extend(wall1); 51 | tris.extend(wall2); 52 | tris.extend(wall3); 53 | tris 54 | } 55 | 56 | fn main() { 57 | let tris = generate_cornell_box(); 58 | // Build cwbvh (Change this to build_bvh2_from_tris to try with Bvh2) 59 | let bvh = build_cwbvh_from_tris( 60 | &tris, 61 | BvhBuildParams::medium_build(), 62 | &mut Duration::default(), 63 | ); 64 | 65 | // The reason for this mapping below is that if multiple primitives are contained in a cwbvh node, they need to have their indices layed out contiguously. 66 | // If we want to avoid this indirection during traversal there are two options: 67 | // 1. Layout the primitives in the order of the cwbvh's indices mapping so that this can index directly into the primitive list. 68 | // 2. Only allow one primitive per node and write back the original mapping to the bvh node list. 
69 | let bvh_tris = bvh 70 | .primitive_indices 71 | .iter() 72 | .map(|i| tris[*i as usize]) 73 | .collect::>(); 74 | 75 | // Setup render target and camera 76 | let width = 1280; 77 | let height = 720; 78 | let target_size = Vec2::new(width as f32, height as f32); 79 | let fov = 90.0f32; 80 | let eye = vec3a(0.0, 1.0, 2.1); 81 | let look_at = vec3(0.0, 1.0, 0.0); 82 | 83 | // Compute camera projection & view matrices 84 | let aspect_ratio = target_size.x / target_size.y; 85 | let proj_inv = 86 | Mat4::perspective_infinite_reverse_rh(fov.to_radians(), aspect_ratio, 0.01).inverse(); 87 | let view_inv = Mat4::look_at_rh(eye.into(), look_at, Vec3::Y).inverse(); 88 | 89 | // Init image buffer 90 | let mut img: ImageBuffer, Vec> = ImageBuffer::new(width, height); 91 | let pixels = img.as_mut(); 92 | 93 | // For each pixel trace ray into scene and write normal as color to image buffer 94 | pixels.chunks_mut(4).enumerate().for_each(|(i, chunk)| { 95 | let frag_coord = uvec2(i as u32 % width, i as u32 / width); 96 | let mut screen_uv = frag_coord.as_vec2() / target_size; 97 | screen_uv.y = 1.0 - screen_uv.y; 98 | let ndc = screen_uv * 2.0 - Vec2::ONE; 99 | let clip_pos = vec4(ndc.x, ndc.y, 1.0, 1.0); 100 | 101 | let mut vs_pos = proj_inv * clip_pos; 102 | vs_pos /= vs_pos.w; 103 | let direction = (Vec3A::from((view_inv * vs_pos).xyz()) - eye).normalize(); 104 | let ray = Ray::new(eye, direction, 0.0, f32::MAX); 105 | 106 | let mut hit = RayHit::none(); 107 | if bvh.ray_traverse(ray, &mut hit, |ray, id| bvh_tris[id].intersect(ray)) { 108 | let mut normal = bvh_tris[hit.primitive_id as usize].compute_normal(); 109 | normal *= normal.dot(-ray.direction).signum(); // Double sided 110 | let c = (normal * 255.0).as_uvec3(); 111 | chunk.copy_from_slice(&[c.x as u8, c.y as u8, c.z as u8, 255]); 112 | } 113 | }); 114 | 115 | img.save("basic_cornell_box_rend.png") 116 | .expect("Failed to save image"); 117 | } 118 | 
-------------------------------------------------------------------------------- /examples/demoscene.rs: -------------------------------------------------------------------------------- 1 | // For fun, not pbr 2 | // Run with `--release --features parallel` unless you like waiting around for a very long time. 3 | use glam::*; 4 | use image::{ImageBuffer, Rgba}; 5 | use obvhs::{ 6 | cwbvh::builder::build_cwbvh_from_tris, 7 | ray::{Ray, RayHit}, 8 | rt_triangle::RtTriangle, 9 | test_util::{ 10 | geometry::demoscene, 11 | sampling::{ 12 | build_orthonormal_basis, cosine_sample_hemisphere, hash_noise, 13 | somewhat_boring_display_transform, uniform_sample_cone, uniform_sample_sphere, 14 | }, 15 | }, 16 | timeit, BvhBuildParams, 17 | }; 18 | use std::{io::Write, time::Duration}; 19 | pub const SUN_ANGULAR_DIAMETER: f32 = 0.00933; 20 | 21 | #[cfg(feature = "parallel")] 22 | use rayon::iter::{IntoParallelIterator, ParallelIterator}; 23 | use sky::Sky; 24 | 25 | fn main() { 26 | let total_aa_samples = 64; 27 | let resolution = 2560; 28 | let seed = 57; 29 | 30 | timeit!["generate height map", 31 | let tris = demoscene(resolution as usize, seed * 10); 32 | ]; 33 | println!("{} triangles, {} AA samples", tris.len(), total_aa_samples); 34 | timeit!["generate bvh", 35 | let bvh = build_cwbvh_from_tris(&tris, BvhBuildParams::very_fast_build(), &mut Duration::default()); 36 | ]; 37 | 38 | let bvh_tris = bvh 39 | .primitive_indices 40 | .iter() 41 | .map(|i| (&tris[*i as usize]).into()) 42 | .collect::>(); 43 | 44 | let intersection_fn = |ray: &Ray, id: usize| bvh_tris[id].intersect(ray); 45 | 46 | // Setup render target and camera 47 | let width = resolution; 48 | let height = ((resolution as f32) * 0.3711) as u32; 49 | let target_size = Vec2::new(width as f32, height as f32); 50 | let fov = 17.0f32; 51 | let eye = vec3a(0.0, 0.0, 1.35); 52 | let look_at = eye + vec3a(0.0, 0.16, -1.0); 53 | let sun_direction = vec3a(0.35, -0.1, 0.19).normalize(); 54 | let sky = 
Sky::red_sunset(-sun_direction); 55 | let sky_bg = Sky::red_sunset(-vec3a(0.35, -0.1, 0.5).normalize()); // To extend the sun glow a bit in the BG 56 | let nee = 1.0 - SUN_ANGULAR_DIAMETER.cos(); 57 | let material_color = vec3a(0.61, 0.59, 0.52).powf(2.2); 58 | let exposure = -3.6; 59 | 60 | // Compute camera projection & view matrices 61 | let aspect_ratio = target_size.x / target_size.y; 62 | let proj_inv = 63 | Mat4::perspective_infinite_reverse_rh(fov.to_radians(), aspect_ratio, 0.01).inverse(); 64 | let view = Mat4::look_at_rh(eye.into(), look_at.into(), Vec3::Y); 65 | let view_inv = view.inverse(); 66 | 67 | let mut fragments = vec![Vec3A::ZERO; (width * height) as usize]; 68 | 69 | println!("|{}|", " ".repeat(total_aa_samples as usize)); 70 | print!(" "); 71 | timeit![ 72 | "render", 73 | for aa_sample in 0..total_aa_samples { 74 | print!("."); // Print progress 75 | std::io::stdout().flush().unwrap(); 76 | let new_fragments: Vec; 77 | #[cfg(feature = "parallel")] 78 | let iter = (0..width * height).into_par_iter(); 79 | #[cfg(not(feature = "parallel"))] 80 | let iter = (0..width * height).into_iter(); 81 | new_fragments = iter 82 | .map(|i| { 83 | let frag_coord = uvec2(i as u32 % width, i as u32 / width); 84 | let misc_grain_noise = hash_noise(frag_coord, aa_sample + 12345); 85 | let aa = vec2( 86 | hash_noise(frag_coord, aa_sample), 87 | hash_noise(frag_coord, aa_sample + 512), 88 | ) * 0.5 89 | - 0.25; 90 | let mut screen_uv = (frag_coord.as_vec2() + aa) / target_size; 91 | screen_uv.y = 1.0 - screen_uv.y; 92 | let ndc = screen_uv * 2.0 - Vec2::ONE; 93 | let clip_pos = vec4(ndc.x, ndc.y, 1.0, 1.0); 94 | 95 | let mut vs = proj_inv * clip_pos; 96 | vs /= vs.w; 97 | let direction = (Vec3A::from((view_inv * vs).xyz()) - eye).normalize(); 98 | 99 | let fuzz = vec3a( 100 | hash_noise(frag_coord, aa_sample), 101 | hash_noise(frag_coord, aa_sample + 512), 102 | hash_noise(frag_coord, aa_sample + 1024), 103 | ); 104 | let fuzzy_cube_of_sensor = eye + (fuzz * 2.0 
- 1.0) * 0.002; 105 | 106 | let focal_distance = 2.4; 107 | let focal_point = eye + direction * focal_distance; 108 | let cam_dir = (focal_point - fuzzy_cube_of_sensor).normalize_or_zero(); 109 | let ray = Ray::new_inf(fuzzy_cube_of_sensor, cam_dir); 110 | 111 | let mut color = Vec3A::ZERO; 112 | 113 | let fog_dir = uniform_sample_sphere(vec2( 114 | hash_noise(frag_coord, aa_sample + 2048), 115 | hash_noise(frag_coord, aa_sample + 3840), 116 | )); 117 | let mut hit = RayHit::none(); 118 | let fogc = sky.render(fog_dir).min(Vec3A::splat(100.0)); 119 | let skyc = sky.render(ray.direction); 120 | let sunc = sky.render(-sun_direction); 121 | let mut state = bvh.new_ray_traversal(ray); 122 | while bvh.ray_traverse_dynamic(&mut state, &mut hit, intersection_fn) {} 123 | if hit.t < f32::MAX { 124 | let mut normal = bvh_tris[hit.primitive_id as usize].compute_normal(); 125 | normal *= normal.dot(-ray.direction).signum(); // Double sided 126 | 127 | let hit_p = ray.origin + ray.direction * hit.t - ray.direction * 0.01; 128 | 129 | let tangent_to_world = build_orthonormal_basis(normal); 130 | let mut ao_ray_dir = cosine_sample_hemisphere(vec2( 131 | hash_noise(frag_coord, aa_sample), 132 | hash_noise(frag_coord, aa_sample + 1024), 133 | )); 134 | ao_ray_dir = (tangent_to_world * ao_ray_dir).normalize(); 135 | 136 | let diff_ray = Ray::new_inf(hit_p, ao_ray_dir); 137 | let mut diff_hit = RayHit::none(); 138 | state.reinit(diff_ray); 139 | while bvh.ray_traverse_dynamic(&mut state, &mut diff_hit, intersection_fn) {} 140 | if diff_hit.t < f32::MAX { 141 | let mut diff_hit_normal = 142 | bvh_tris[diff_hit.primitive_id as usize].compute_normal(); 143 | diff_hit_normal *= diff_hit_normal.dot(-ray.direction).signum(); // Double sided 144 | 145 | // Silly 1st bounce sun shadow ray 146 | let ao_hit_p = hit_p + diff_ray.direction * diff_hit.t - diff_ray.direction * 0.01; 147 | let sun_ray = Ray::new_inf(ao_hit_p, -sun_direction); 148 | let mut sun_hit = RayHit::none(); 149 | // anyhit 
150 | 151 | state.reinit(sun_ray); 152 | if !bvh.ray_traverse_dynamic(&mut state, &mut sun_hit, intersection_fn) { 153 | // xD 154 | color += material_color * material_color * nee * sunc * 4.0; 155 | } 156 | } else { 157 | let fresnel = (1.0 - normal.dot(-cam_dir)).powf(8.0).max(0.0); 158 | let skyc = sky 159 | .render(diff_ray.direction) 160 | // Sun results in fireflies. Clamp to avoid randomly sampling super high values. 161 | .min(Vec3A::splat(100.0)); 162 | color += material_color * (fresnel * skyc * 0.5 + skyc); 163 | } 164 | 165 | // Sun shadow ray 166 | let sun_rnd = vec2( 167 | hash_noise(frag_coord, aa_sample + 10000), 168 | hash_noise(frag_coord, aa_sample + 20000), 169 | ); 170 | let sun_basis = build_orthonormal_basis(sun_direction); 171 | let sun_dir = (sun_basis 172 | * uniform_sample_cone(sun_rnd, (SUN_ANGULAR_DIAMETER * 0.5).cos())) 173 | .normalize_or_zero(); 174 | 175 | let mut sun_hit = RayHit::none(); 176 | let sun_ray = Ray::new_inf(hit_p, -sun_dir); 177 | 178 | state.reinit(sun_ray); // anyhit 179 | if !bvh.ray_traverse_dynamic(&mut state, &mut sun_hit, intersection_fn) { 180 | color += material_color 181 | * nee 182 | * normal.dot(-sun_dir).max(0.00001) 183 | * sunc 184 | * 10.0 185 | * misc_grain_noise; 186 | } 187 | 188 | // Fog shadow ray 189 | let fog_t = hit.t * hash_noise(frag_coord, aa_sample + 54321); 190 | let fog_p = ray.origin + ray.direction * fog_t; 191 | let sun_ray = Ray::new_inf(fog_p, -sun_direction); 192 | let mut sun_hit = RayHit::none(); 193 | 194 | state.reinit(sun_ray); // anyhit 195 | if !bvh.ray_traverse_dynamic(&mut state, &mut sun_hit, intersection_fn) { 196 | color += nee * sunc * fog_t * 0.2; 197 | } 198 | state.reinit(Ray::new_inf(fog_p, fog_dir)); // anyhit 199 | if !bvh.ray_traverse_dynamic(&mut state, &mut RayHit::none(), intersection_fn) { 200 | color += fog_t * 0.2 * fogc; 201 | } 202 | } else { 203 | let sky_bgc = sky_bg.render(ray.direction) * 0.4 + skyc * 0.6; 204 | color += sky_bgc * 0.4 + sky_bgc * 
misc_grain_noise * 0.6; 205 | color += 0.2 * fogc; 206 | } 207 | color 208 | }) 209 | .collect::>(); 210 | new_fragments 211 | .iter() 212 | .zip(fragments.iter_mut()) 213 | .for_each(|(new, col)| *col += *new); 214 | } 215 | println!(""); 216 | ]; 217 | 218 | let mut img: ImageBuffer, Vec> = ImageBuffer::new(width, height); 219 | let pixels = img.as_mut(); 220 | pixels.chunks_mut(4).enumerate().for_each(|(i, chunk)| { 221 | let mut col = (fragments[i] / total_aa_samples as f32).max(Vec3A::ZERO); 222 | col *= Vec3A::splat(2.0).powf(exposure); 223 | col = somewhat_boring_display_transform(col); 224 | col = col.powf(1.7); // contrast 225 | let luma = Vec3A::splat(col.dot(vec3a(0.2126, 0.7152, 0.0722))); 226 | col = luma * -0.1 + col * 1.1; // saturation 227 | let c = (col.clamp(Vec3A::ZERO, Vec3A::ONE) * 255.0).as_uvec3(); 228 | chunk.copy_from_slice(&[c.x as u8, c.y as u8, c.z as u8, 255]); 229 | }); 230 | 231 | img.save(format!("demoscene_{}_rend.png", seed)) 232 | .expect("Failed to save image"); 233 | } 234 | 235 | mod sky { 236 | use std::f32::consts::PI; 237 | 238 | use glam::{vec3a, Vec3A}; 239 | 240 | use obvhs::test_util::sampling::smoothstep; 241 | 242 | use crate::SUN_ANGULAR_DIAMETER; 243 | 244 | // Based on https://github.com/Tw1ddle/Sky-Shader/ 245 | pub struct Sky { 246 | pub depolarization_factor: f32, 247 | pub mie_coefficient: f32, 248 | pub mie_directional_g: f32, 249 | pub mie_k_coefficient: Vec3A, 250 | pub mie_v: f32, 251 | pub mie_zenith_length: f32, 252 | pub num_molecules: f32, 253 | pub primaries: Vec3A, 254 | pub rayleigh: f32, 255 | pub rayleigh_zenith_length: f32, 256 | pub refractive_index: f32, 257 | pub sun_angular_diameter: f32, 258 | pub sun_intensity_factor: f32, 259 | pub sun_intensity_falloff_steepness: f32, 260 | pub turbidity: f32, 261 | pub sun_position: Vec3A, 262 | } 263 | 264 | impl Sky { 265 | pub fn red_sunset(sun_position: Vec3A) -> Sky { 266 | Sky { 267 | depolarization_factor: 0.02, 268 | mie_coefficient: 0.005, 269 | 
                mie_directional_g: 0.82,
                mie_k_coefficient: vec3a(0.686, 0.678, 0.666),
                mie_v: 3.936,
                mie_zenith_length: 34000.0,
                num_molecules: 2.542e25,
                primaries: vec3a(6.8e-7f32, 5.5e-7f32, 4.5e-7f32),
                rayleigh: 2.28,
                rayleigh_zenith_length: 8400.0,
                refractive_index: 1.00029,
                sun_angular_diameter: SUN_ANGULAR_DIAMETER,
                sun_intensity_factor: 1000.0,
                sun_intensity_falloff_steepness: 1.1,
                turbidity: 4.7,
                sun_position,
            }
        }

        /// Evaluates the sky radiance for the view direction `dir`.
        ///
        /// Combines Rayleigh and Mie in-scattering with an analytic sun disk,
        /// then applies this demo's stylized tone adjustments at the end.
        pub fn render(&self, dir: Vec3A) -> Vec3A {
            // Fade the sky out as the sun drops below the horizon.
            let sunfade = 1.0 - (1.0 - (self.sun_position.y / 450000.0).exp()).clamp(0.0, 1.0);
            let rayleigh_coefficient = self.rayleigh - (1.0 * (1.0 - sunfade));
            let beta_r = self.total_rayleigh(self.primaries) * rayleigh_coefficient;

            let beta_m = self.total_mie(self.primaries) * self.mie_coefficient;

            // Optical path lengths through the atmosphere for this zenith angle.
            let zenith_angle = (0.0f32.max(Vec3A::Y.dot(dir))).acos();
            let denom =
                zenith_angle.cos() + 0.15 * (93.885 - ((zenith_angle * 180.0) / PI)).powf(-1.253);
            let s_r = self.rayleigh_zenith_length / denom;
            let s_m = self.mie_zenith_length / denom;

            // Combined extinction factor along the path.
            let fex = (-(beta_r * s_r + beta_m * s_m)).exp();

            let sun_direction = self.sun_position.normalize();
            let cos_theta = dir.dot(sun_direction);
            let beta_r_theta = beta_r * Self::rayleigh_phase(cos_theta * 0.5 + 0.5);
            let beta_m_theta =
                beta_m * Self::henyey_greenstein_phase(cos_theta, self.mie_directional_g);

            // In-scattered light, blended toward the extinct term near the horizon.
            let sun_e = self.sun_intensity(sun_direction.dot(Vec3A::Y));
            let mut lin =
                (sun_e * ((beta_r_theta + beta_m_theta) / (beta_r + beta_m)) * (1.0 - fex))
                    .powf(1.5);
            lin *= Vec3A::splat(1.0).lerp(
                (sun_e * ((beta_r_theta + beta_m_theta) / (beta_r + beta_m)) * fex).powf(0.5),
                (1.0 - Vec3A::Y.dot(sun_direction))
                    .powf(5.0)
                    .clamp(0.0, 1.0),
            );

            let sun_angular_diameter_cos = (self.sun_angular_diameter).cos();
            // NOTE: both smoothstep edges are equal, giving a hard-edged sun disk
            // (the softened edge offset is intentionally commented out below).
            let sundisk = smoothstep(
                sun_angular_diameter_cos,
                sun_angular_diameter_cos, // + 0.00002
                cos_theta,
            );
            let mut l0 = Vec3A::splat(0.1) * fex;
            l0 += sun_e * 19000.0 * fex * sundisk;
            // Stylized grading: falloff toward the lower hemisphere plus a gamma tweak.
            let mut color = (lin + l0) * 0.04;
            let low_falloff = (Vec3A::Y.dot(dir) + 0.4).powf(5.0).max(0.0);
            color = (color * 0.1).powf(3.0) * low_falloff;
            color.powf(1.0 / (1.2 + (1.2 * sunfade))) * 0.5
        }

        /// Total Rayleigh scattering coefficient for the given wavelengths.
        fn total_rayleigh(&self, lambda: Vec3A) -> Vec3A {
            (8.0 * PI.powi(3)
                * (self.refractive_index.powi(2) - 1.0).powi(2)
                * (6.0 + 3.0 * self.depolarization_factor))
                / (3.0
                    * self.num_molecules
                    * lambda.powf(4.0)
                    * (6.0 - 7.0 * self.depolarization_factor))
        }

        /// Total Mie scattering coefficient, driven by turbidity.
        fn total_mie(&self, lambda: Vec3A) -> Vec3A {
            let c = 0.2 * self.turbidity * 10e-18;
            0.434 * c * PI * (2.0 * PI / lambda).powf(self.mie_v - 2.0) * self.mie_k_coefficient
        }

        /// Rayleigh phase function.
        fn rayleigh_phase(cos_theta: f32) -> f32 {
            (3.0 / (16.0 * PI)) * (1.0 + cos_theta.powi(2))
        }

        /// Henyey-Greenstein phase function with asymmetry parameter `g`.
        fn henyey_greenstein_phase(cos_theta: f32, g: f32) -> f32 {
            (1.0 / (4.0 * PI))
                * ((1.0 - g.powi(2)) / (1.0 - 2.0 * g * cos_theta + g.powi(2)).powf(1.5))
        }

        /// Sun intensity as a function of elevation, cut off past the horizon.
        fn sun_intensity(&self, zenith_angle_cos: f32) -> f32 {
            let cutoff_angle = PI / 1.95;
            self.sun_intensity_factor
                * 0.0f32.max(
                    1.0 - (-((cutoff_angle - zenith_angle_cos.acos()).exp()
                        / self.sun_intensity_falloff_steepness)),
                )
        }
    }
}
--------------------------------------------------------------------------------
/src/aabb.rs:
--------------------------------------------------------------------------------
//! An Axis-Aligned Bounding Box (AABB) represented by its minimum and maximum points.
2 | 3 | use std::ops::BitAnd; 4 | 5 | use bytemuck::{Pod, Zeroable}; 6 | use glam::Vec3A; 7 | 8 | use crate::{ray::Ray, Boundable}; 9 | 10 | /// An Axis-Aligned Bounding Box (AABB) represented by its minimum and maximum points. 11 | #[derive(Default, Clone, Copy, Debug, PartialEq)] 12 | #[repr(C)] 13 | pub struct Aabb { 14 | pub min: Vec3A, 15 | pub max: Vec3A, 16 | } 17 | 18 | unsafe impl Pod for Aabb {} 19 | unsafe impl Zeroable for Aabb {} 20 | 21 | impl Aabb { 22 | /// An invalid (empty) AABB with min set to the maximum possible value 23 | /// and max set to the minimum possible value. 24 | pub const INVALID: Self = Self { 25 | min: Vec3A::splat(f32::MAX), 26 | max: Vec3A::splat(f32::MIN), 27 | }; 28 | 29 | /// An infinite AABB with min set to negative infinity 30 | /// and max set to positive infinity. 31 | pub const LARGEST: Self = Self { 32 | min: Vec3A::splat(-f32::MAX), 33 | max: Vec3A::splat(f32::MAX), 34 | }; 35 | 36 | /// An infinite AABB with min set to negative infinity 37 | /// and max set to positive infinity. 38 | pub const INFINITY: Self = Self { 39 | min: Vec3A::splat(-f32::INFINITY), 40 | max: Vec3A::splat(f32::INFINITY), 41 | }; 42 | 43 | /// Creates a new AABB with the given minimum and maximum points. 44 | #[inline] 45 | pub fn new(min: Vec3A, max: Vec3A) -> Self { 46 | Self { min, max } 47 | } 48 | 49 | /// Creates a new AABB with both min and max set to the given point. 50 | #[inline] 51 | pub fn from_point(point: Vec3A) -> Self { 52 | Self { 53 | min: point, 54 | max: point, 55 | } 56 | } 57 | 58 | /// Creates an AABB that bounds the given set of points. 59 | #[inline] 60 | pub fn from_points(points: &[Vec3A]) -> Self { 61 | let mut points = points.iter(); 62 | let mut aabb = Aabb::from_point(*points.next().unwrap()); 63 | for point in points { 64 | aabb.extend(*point); 65 | } 66 | aabb 67 | } 68 | 69 | /// Checks if the AABB contains the given point. 
70 | #[inline] 71 | pub fn contains_point(&self, point: Vec3A) -> bool { 72 | (point.cmpge(self.min).bitand(point.cmple(self.max))).all() 73 | } 74 | 75 | /// Extends the AABB to include the given point. 76 | #[inline] 77 | pub fn extend(&mut self, point: Vec3A) -> &mut Self { 78 | *self = self.union(&Self::from_point(point)); 79 | self 80 | } 81 | 82 | /// Returns the union of this AABB and another AABB. 83 | #[inline] 84 | #[must_use] 85 | pub fn union(&self, other: &Self) -> Self { 86 | Aabb { 87 | min: self.min.min(other.min), 88 | max: self.max.max(other.max), 89 | } 90 | } 91 | 92 | /// Returns the intersection of this AABB and another AABB. 93 | /// 94 | /// The intersection of two AABBs is the overlapping region that is 95 | /// common to both AABBs. If the AABBs do not overlap, the resulting 96 | /// AABB will have min and max values that do not form a valid box 97 | /// (min will not be less than max). 98 | #[inline] 99 | pub fn intersection(&self, other: &Self) -> Self { 100 | Aabb { 101 | min: self.min.max(other.min), 102 | max: self.max.min(other.max), 103 | } 104 | } 105 | 106 | /// Returns the diagonal vector of the AABB. 107 | #[inline] 108 | pub fn diagonal(&self) -> Vec3A { 109 | self.max - self.min 110 | } 111 | 112 | /// Returns the center point of the AABB. 113 | #[inline] 114 | pub fn center(&self) -> Vec3A { 115 | (self.max + self.min) * 0.5 116 | } 117 | 118 | /// Returns the center coordinate of the AABB along a specific axis. 119 | #[inline] 120 | pub fn center_axis(&self, axis: usize) -> f32 { 121 | (self.max[axis] + self.min[axis]) * 0.5 122 | } 123 | 124 | /// Returns the index of the largest axis of the AABB. 125 | #[inline] 126 | pub fn largest_axis(&self) -> usize { 127 | let d = self.diagonal(); 128 | if d.x < d.y { 129 | if d.y < d.z { 130 | 2 131 | } else { 132 | 1 133 | } 134 | } else if d.x < d.z { 135 | 2 136 | } else { 137 | 0 138 | } 139 | } 140 | 141 | /// Returns the index of the smallest axis of the AABB. 
142 | #[inline] 143 | pub fn smallest_axis(&self) -> usize { 144 | let d = self.diagonal(); 145 | if d.x > d.y { 146 | if d.y > d.z { 147 | 2 148 | } else { 149 | 1 150 | } 151 | } else if d.x > d.z { 152 | 2 153 | } else { 154 | 0 155 | } 156 | } 157 | 158 | /// Returns half the surface area of the AABB. 159 | #[inline] 160 | pub fn half_area(&self) -> f32 { 161 | let d = self.diagonal(); 162 | (d.x + d.y) * d.z + d.x * d.y 163 | } 164 | 165 | /// Returns the surface area of the AABB. 166 | #[inline] 167 | pub fn surface_area(&self) -> f32 { 168 | let d = self.diagonal(); 169 | 2.0 * d.dot(d) 170 | } 171 | 172 | /// Returns an empty AABB. 173 | #[inline] 174 | pub fn empty() -> Self { 175 | Self { 176 | min: Vec3A::new(f32::MAX, f32::MAX, f32::MAX), 177 | max: Vec3A::new(f32::MIN, f32::MIN, f32::MIN), 178 | } 179 | } 180 | 181 | /// Checks if the AABB is valid (i.e., min <= max on all axes). 182 | pub fn valid(&self) -> bool { 183 | self.min.cmple(self.max).all() 184 | } 185 | 186 | /// Checks if this AABB intersects with another AABB. 187 | #[inline] 188 | pub fn intersect_aabb(&self, other: &Aabb) -> bool { 189 | (self.min.cmpgt(other.max) | self.max.cmplt(other.min)).bitmask() == 0 190 | } 191 | 192 | /// Checks if this AABB intersects with a ray and returns the distance to the intersection point. 193 | /// Returns `f32::MAX` if there is no intersection. 
194 | #[inline] 195 | pub fn intersect_ray(&self, ray: &Ray) -> f32 { 196 | let t1 = (self.min - ray.origin) * ray.inv_direction; 197 | let t2 = (self.max - ray.origin) * ray.inv_direction; 198 | 199 | let tmin = t1.min(t2); 200 | let tmax = t1.max(t2); 201 | 202 | let tmin_n = tmin.x.max(tmin.y.max(tmin.z)); 203 | let tmax_n = tmax.x.min(tmax.y.min(tmax.z)); 204 | 205 | if tmax_n >= tmin_n && tmax_n >= 0.0 { 206 | tmin_n 207 | } else { 208 | f32::INFINITY 209 | } 210 | } 211 | } 212 | 213 | impl Boundable for Aabb { 214 | #[inline] 215 | fn aabb(&self) -> Aabb { 216 | *self 217 | } 218 | } 219 | 220 | #[cfg(test)] 221 | mod tests { 222 | use super::*; 223 | use glam::Vec3A; 224 | 225 | #[test] 226 | fn test_from_point() { 227 | let point = Vec3A::ONE; 228 | let aabb = Aabb::from_point(point); 229 | assert_eq!(aabb.min, point); 230 | assert_eq!(aabb.max, point); 231 | } 232 | 233 | #[test] 234 | fn test_from_points() { 235 | let points = vec![Vec3A::ZERO, Vec3A::ONE, Vec3A::splat(2.0)]; 236 | let aabb = Aabb::from_points(&points); 237 | assert_eq!(aabb.min, Vec3A::ZERO); 238 | assert_eq!(aabb.max, Vec3A::splat(2.0)); 239 | } 240 | 241 | #[test] 242 | fn test_contains_point() { 243 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 244 | assert!(aabb.contains_point(Vec3A::splat(0.5))); 245 | assert!(!aabb.contains_point(Vec3A::splat(1.5))); 246 | } 247 | 248 | #[test] 249 | fn test_extend() { 250 | let mut aabb = Aabb::from_point(Vec3A::ZERO); 251 | aabb.extend(Vec3A::ONE); 252 | assert_eq!(aabb.min, Vec3A::ZERO); 253 | assert_eq!(aabb.max, Vec3A::ONE); 254 | } 255 | 256 | #[test] 257 | fn test_union() { 258 | let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 259 | let aabb2 = Aabb::new(Vec3A::splat(0.5), Vec3A::splat(1.5)); 260 | let union = aabb1.union(&aabb2); 261 | assert_eq!(union.min, Vec3A::ZERO); 262 | assert_eq!(union.max, Vec3A::splat(1.5)); 263 | } 264 | 265 | #[test] 266 | fn test_intersection() { 267 | let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 268 | 
let aabb2 = Aabb::new(Vec3A::splat(0.5), Vec3A::splat(1.5)); 269 | let intersection = aabb1.intersection(&aabb2); 270 | assert_eq!(intersection.min, Vec3A::splat(0.5)); 271 | assert_eq!(intersection.max, Vec3A::ONE); 272 | assert!(intersection.valid()); 273 | } 274 | 275 | #[test] 276 | fn test_intersection_no_overlap() { 277 | let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 278 | let aabb2 = Aabb::new(Vec3A::splat(2.0), Vec3A::splat(3.0)); 279 | let intersection = aabb1.intersection(&aabb2); 280 | assert_eq!(intersection.min, Vec3A::splat(2.0)); 281 | assert_eq!(intersection.max, Vec3A::ONE); 282 | assert!(!intersection.valid()); 283 | } 284 | 285 | #[test] 286 | fn test_diagonal() { 287 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 288 | assert_eq!(aabb.diagonal(), Vec3A::ONE); 289 | } 290 | 291 | #[test] 292 | fn test_center() { 293 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 294 | assert_eq!(aabb.center(), Vec3A::splat(0.5)); 295 | } 296 | 297 | #[test] 298 | fn test_center_axis() { 299 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 300 | assert_eq!(aabb.center_axis(0), 0.5); 301 | assert_eq!(aabb.center_axis(1), 0.5); 302 | assert_eq!(aabb.center_axis(2), 0.5); 303 | } 304 | 305 | #[test] 306 | fn test_largest_axis() { 307 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::new(1.0, 2.0, 3.0)); 308 | assert_eq!(aabb.largest_axis(), 2); 309 | } 310 | 311 | #[test] 312 | fn test_smallest_axis() { 313 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::new(1.0, 2.0, 3.0)); 314 | assert_eq!(aabb.smallest_axis(), 0); 315 | } 316 | 317 | #[test] 318 | fn test_half_area() { 319 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 320 | assert_eq!(aabb.half_area(), 3.0); 321 | } 322 | 323 | #[test] 324 | fn test_surface_area() { 325 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 326 | assert_eq!(aabb.surface_area(), 6.0); 327 | } 328 | 329 | #[test] 330 | fn test_empty() { 331 | let aabb = Aabb::empty(); 332 | assert_eq!(aabb.min, Vec3A::new(f32::MAX, f32::MAX, f32::MAX)); 333 
| assert_eq!(aabb.max, Vec3A::new(f32::MIN, f32::MIN, f32::MIN)); 334 | } 335 | 336 | #[test] 337 | fn test_valid() { 338 | let valid_aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 339 | assert!(valid_aabb.valid()); 340 | 341 | let invalid_aabb = Aabb::new(Vec3A::splat(2.0), Vec3A::splat(1.0)); 342 | assert!(!invalid_aabb.valid()); 343 | } 344 | 345 | #[test] 346 | fn test_intersect_aabb() { 347 | let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 348 | let aabb2 = Aabb::new(Vec3A::splat(0.5), Vec3A::splat(1.5)); 349 | assert!(aabb1.intersect_aabb(&aabb2)); 350 | let aabb3 = Aabb::new(Vec3A::splat(1.5), Vec3A::splat(2.5)); 351 | assert!(!aabb1.intersect_aabb(&aabb3)); 352 | } 353 | 354 | #[test] 355 | fn test_intersect_ray() { 356 | let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE); 357 | let ray = Ray::new(Vec3A::splat(-1.0), Vec3A::ONE, 0.0, f32::MAX); 358 | assert_eq!(aabb.intersect_ray(&ray), 1.0); 359 | let ray_no_intersect = Ray::new(Vec3A::splat(2.0), Vec3A::ONE, 0.0, f32::MAX); 360 | assert_eq!(aabb.intersect_ray(&ray_no_intersect), f32::INFINITY); 361 | } 362 | } 363 | -------------------------------------------------------------------------------- /src/bvh2/builder.rs: -------------------------------------------------------------------------------- 1 | use std::time::{Duration, Instant}; 2 | 3 | use crate::{ 4 | aabb::Aabb, splits::split_aabbs_preset, triangle::Triangle, Boundable, BvhBuildParams, 5 | }; 6 | 7 | use super::{leaf_collapser::collapse, reinsertion::ReinsertionOptimizer, Bvh2}; 8 | 9 | /// Build a bvh2 from the given list of Triangles. 10 | /// Just a helper function / example, feel free to reimplement for your specific use case. 11 | /// 12 | /// # Arguments 13 | /// * `triangles` - A list of Triangles. 14 | /// * `config` - Parameters for configuring the BVH building. 15 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB 16 | /// generation or debug validation. 
This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing) 17 | pub fn build_bvh2_from_tris( 18 | triangles: &[Triangle], 19 | config: BvhBuildParams, 20 | core_build_time: &mut Duration, 21 | ) -> Bvh2 { 22 | let mut aabbs = Vec::with_capacity(triangles.len()); 23 | let mut indices = Vec::with_capacity(triangles.len()); 24 | let mut largest_half_area = 0.0; 25 | let mut avg_area = 0.0; 26 | 27 | for (i, tri) in triangles.iter().enumerate() { 28 | let a = tri.v0; 29 | let b = tri.v1; 30 | let c = tri.v2; 31 | let mut aabb = Aabb::empty(); 32 | aabb.extend(a).extend(b).extend(c); 33 | let half_area = aabb.half_area(); 34 | largest_half_area = half_area.max(largest_half_area); 35 | avg_area += half_area; 36 | aabbs.push(aabb); 37 | indices.push(i as u32); 38 | } 39 | avg_area /= triangles.len() as f32; 40 | 41 | let start_time = Instant::now(); 42 | 43 | if config.pre_split { 44 | split_aabbs_preset( 45 | &mut aabbs, 46 | &mut indices, 47 | triangles, 48 | avg_area, 49 | largest_half_area, 50 | ); 51 | } 52 | 53 | let mut bvh2 = config.ploc_search_distance.build( 54 | &aabbs, 55 | indices, 56 | config.sort_precision, 57 | config.search_depth_threshold, 58 | ); 59 | ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None); 60 | collapse( 61 | &mut bvh2, 62 | config.max_prims_per_leaf, 63 | config.collapse_traversal_cost, 64 | ); 65 | ReinsertionOptimizer::run( 66 | &mut bvh2, 67 | config.reinsertion_batch_ratio * config.post_collapse_reinsertion_batch_ratio_multiplier, 68 | None, 69 | ); 70 | 71 | *core_build_time += start_time.elapsed(); 72 | 73 | #[cfg(debug_assertions)] 74 | { 75 | bvh2.validate(triangles, false, config.pre_split); 76 | } 77 | 78 | bvh2 79 | } 80 | 81 | /// Build a bvh2 from the given list of Boundable primitives. 82 | /// `pre_split` in BvhBuildParams is ignored in this case. 83 | /// Just a helper function / example, feel free to reimplement for your specific use case. 
84 | /// 85 | /// # Arguments 86 | /// * `primitives` - A list of Primitives that implement Boundable. 87 | /// * `config` - Parameters for configuring the BVH building. 88 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB 89 | /// generation or debug validation. This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing) 90 | // TODO: we could optionally do imprecise basic Aabb splits. 91 | pub fn build_bvh2( 92 | primitives: &[T], 93 | config: BvhBuildParams, 94 | core_build_time: &mut Duration, 95 | ) -> Bvh2 { 96 | let mut aabbs = Vec::with_capacity(primitives.len()); 97 | let mut indices = Vec::with_capacity(primitives.len()); 98 | 99 | for (i, primitive) in primitives.iter().enumerate() { 100 | indices.push(i as u32); 101 | aabbs.push(primitive.aabb()); 102 | } 103 | 104 | let start_time = Instant::now(); 105 | 106 | let mut bvh2 = config.ploc_search_distance.build( 107 | &aabbs, 108 | indices, 109 | config.sort_precision, 110 | config.search_depth_threshold, 111 | ); 112 | ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None); 113 | collapse( 114 | &mut bvh2, 115 | config.max_prims_per_leaf, 116 | config.collapse_traversal_cost, 117 | ); 118 | ReinsertionOptimizer::run( 119 | &mut bvh2, 120 | config.reinsertion_batch_ratio * config.post_collapse_reinsertion_batch_ratio_multiplier, 121 | None, 122 | ); 123 | 124 | *core_build_time += start_time.elapsed(); 125 | 126 | #[cfg(debug_assertions)] 127 | { 128 | bvh2.validate(primitives, false, config.pre_split); 129 | } 130 | 131 | bvh2 132 | } 133 | -------------------------------------------------------------------------------- /src/bvh2/leaf_collapser.rs: -------------------------------------------------------------------------------- 1 | // Based on https://github.com/madmann91/bvh/blob/2fd0db62022993963a7343669275647cb073e19a/include/bvh/leaf_collapser.hpp 2 | #[cfg(feature = "parallel")] 3 | use 
rayon::iter::{IntoParallelIterator, ParallelIterator}; 4 | #[cfg(feature = "parallel")] 5 | use std::sync::atomic::{AtomicU32, Ordering}; 6 | 7 | use crate::bvh2::{Bvh2, Bvh2Node}; 8 | 9 | /// Collapses leaves of the BVH according to the SAH. This optimization 10 | /// is only helpful for bottom-up builders, as top-down builders already 11 | /// have a termination criterion that prevents leaf creation when the SAH 12 | /// cost does not improve. 13 | pub fn collapse(bvh: &mut Bvh2, max_prims: u32, traversal_cost: f32) { 14 | crate::scope!("collapse"); 15 | 16 | if max_prims <= 1 { 17 | return; 18 | } 19 | 20 | if bvh.nodes.is_empty() || bvh.nodes[0].is_leaf() { 21 | return; 22 | } 23 | 24 | let nodes_qty = bvh.nodes.len(); 25 | 26 | let parents = bvh.compute_parents(); 27 | 28 | let mut indices_copy = Vec::new(); 29 | let mut nodes_copy = Vec::new(); 30 | 31 | let mut node_counts: Vec = 32 | (0..nodes_qty).map(|_| SometimesAtomicU32::new(1)).collect(); 33 | let mut prim_counts: Vec = 34 | (0..nodes_qty).map(|_| SometimesAtomicU32::new(0)).collect(); 35 | 36 | let node_count; 37 | 38 | // Bottom-up traversal to collapse leaves 39 | // TODO need to figure out if parallel version can have data races, if so: 40 | // maybe record commands in parallel, include a index, and execute them sequentially 41 | // also reference original impl 42 | bottom_up_traverse(bvh, &parents, |leaf, i| { 43 | if leaf { 44 | prim_counts[i].set(bvh.nodes[i].prim_count); 45 | } else { 46 | let node = &bvh.nodes[i]; 47 | debug_assert!(!node.is_leaf()); 48 | let first_child = node.first_index as usize; 49 | 50 | let left_count = prim_counts[first_child].get(); 51 | let right_count = prim_counts[first_child + 1].get(); 52 | let total_count = left_count + right_count; 53 | 54 | // Compute the cost of collapsing this node when both children are leaves 55 | if left_count > 0 && right_count > 0 && total_count <= max_prims { 56 | let left = bvh.nodes[first_child]; 57 | let right = 
bvh.nodes[first_child + 1]; 58 | let collapse_cost = node.aabb.half_area() * (total_count as f32 - traversal_cost); 59 | let base_cost = left.aabb.half_area() * left_count as f32 60 | + right.aabb.half_area() * right_count as f32; 61 | let both_have_same_prim = 62 | (left.first_index == right.first_index) && total_count == 2; 63 | 64 | // Collapse them if cost of the collapsed node is lower, or both children contain the same primitive (as a result of splits) 65 | if collapse_cost <= base_cost || both_have_same_prim { 66 | //if both_have_same_prim { 1 } else { total_count }; // TODO, Reduce total count (was showing artifacts) 67 | prim_counts[i].set(total_count); 68 | prim_counts[first_child].set(0); 69 | prim_counts[first_child + 1].set(0); 70 | node_counts[first_child].set(0); 71 | node_counts[first_child + 1].set(0); 72 | } 73 | } 74 | } 75 | }); 76 | 77 | // Prefix sums computed sequentially (TODO: parallelize) 78 | let mut sum = 0; 79 | node_counts.iter_mut().for_each(|count| { 80 | sum += count.get(); 81 | count.set(sum); 82 | }); 83 | 84 | sum = 0; 85 | prim_counts.iter_mut().for_each(|count| { 86 | sum += count.get(); 87 | count.set(sum); 88 | }); 89 | 90 | { 91 | node_count = node_counts[bvh.nodes.len() - 1].get(); 92 | if prim_counts[0].get() > 0 { 93 | // This means the root node has become a leaf. 94 | // We avoid copying the data and just swap the old prim array with the new one. 
95 | bvh.nodes[0].first_index = 0; 96 | bvh.nodes[0].prim_count = prim_counts[0].get(); 97 | std::mem::swap(&mut bvh.primitive_indices, &mut indices_copy); 98 | std::mem::swap(&mut bvh.nodes, &mut nodes_copy); 99 | } else { 100 | nodes_copy = vec![Default::default(); node_count as usize]; 101 | indices_copy = 102 | vec![Default::default(); prim_counts[bvh.nodes.len() - 1].get() as usize]; 103 | nodes_copy[0] = bvh.nodes[0]; 104 | nodes_copy[0].first_index = node_counts[nodes_copy[0].first_index as usize - 1].get(); 105 | } 106 | } 107 | 108 | // TODO Parallelize: 109 | { 110 | for i in 1..bvh.nodes.len() { 111 | let node_id = node_counts[i - 1].get() as usize; 112 | if node_id == node_counts[i].get() as usize { 113 | continue; 114 | } 115 | 116 | nodes_copy[node_id] = bvh.nodes[i]; 117 | let mut first_prim = prim_counts[i - 1].get(); 118 | if first_prim != prim_counts[i].get() { 119 | nodes_copy[node_id].prim_count = prim_counts[i].get() - first_prim; 120 | nodes_copy[node_id].first_index = first_prim; 121 | 122 | // Top-down traversal to store the prims contained in this subtree. 
123 | 124 | if true { 125 | let mut j = i; 126 | loop { 127 | let node = bvh.nodes[j]; 128 | if node.is_leaf() { 129 | for n in 0..node.prim_count { 130 | indices_copy[(first_prim + n) as usize] = 131 | bvh.primitive_indices[(node.first_index + n) as usize]; 132 | } 133 | 134 | first_prim += node.prim_count; 135 | while !Bvh2Node::is_left_sibling(j) && j != i { 136 | j = parents[j] as usize; 137 | } 138 | if j == i { 139 | break; 140 | } 141 | j = Bvh2Node::get_sibling_id(j); 142 | } else { 143 | j = node.first_index as usize; 144 | } 145 | } 146 | } else { 147 | // ------------------------- 148 | // Alternate method (slower) 149 | // ------------------------- 150 | let mut stack = Vec::new(); 151 | stack.push(i); 152 | while let Some(current_node_index) = stack.pop() { 153 | let node = &bvh.nodes[current_node_index]; 154 | 155 | if node.is_leaf() { 156 | for n in 0..node.prim_count { 157 | indices_copy[(first_prim + n) as usize] = 158 | bvh.primitive_indices[(node.first_index + n) as usize]; 159 | } 160 | first_prim += node.prim_count; 161 | } else { 162 | stack.push(node.first_index as usize); 163 | stack.push((node.first_index + 1) as usize); 164 | } 165 | } 166 | // ------------------------- 167 | } 168 | } else { 169 | let first_child = &mut nodes_copy[node_id].first_index; 170 | *first_child = node_counts[*first_child as usize - 1].get(); 171 | } 172 | } 173 | } 174 | 175 | std::mem::swap(&mut bvh.nodes, &mut nodes_copy); 176 | std::mem::swap(&mut bvh.primitive_indices, &mut indices_copy); 177 | } 178 | 179 | // Based on https://github.com/madmann91/bvh/blob/2fd0db62022993963a7343669275647cb073e19a/include/bvh/bottom_up_algorithm.hpp 180 | #[cfg(not(feature = "parallel"))] 181 | fn bottom_up_traverse( 182 | bvh: &Bvh2, 183 | parents: &[u32], 184 | mut process_node: F, // True is for leaf 185 | ) where 186 | F: FnMut(bool, usize), 187 | { 188 | // Special case if the BVH is just a leaf 189 | if bvh.nodes.len() == 1 { 190 | process_node(true, 0); 191 | return; 
192 | } 193 | 194 | // Iterate through all nodes starting from 1, since node 0 is assumed to be the root 195 | (1..bvh.nodes.len()).for_each(|i| { 196 | // Only process leaves 197 | if bvh.nodes[i].is_leaf() { 198 | process_node(true, i); 199 | 200 | // Process inner nodes on the path from that leaf up to the root 201 | let mut j = i; 202 | while j != 0 { 203 | j = parents[j] as usize; 204 | 205 | process_node(false, j); 206 | } 207 | } 208 | }); 209 | } 210 | 211 | #[cfg(feature = "parallel")] 212 | fn bottom_up_traverse( 213 | bvh: &Bvh2, 214 | parents: &[u32], 215 | process_node: F, // True is for leaf 216 | ) where 217 | F: Fn(bool, usize) + Sync + Send, 218 | { 219 | // Special case if the BVH is just a leaf 220 | if bvh.nodes.len() == 1 { 221 | process_node(true, 0); 222 | return; 223 | } 224 | 225 | // Iterate through all nodes starting from 1, since node 0 is assumed to be the root 226 | (1..bvh.nodes.len()).into_par_iter().for_each(|i| { 227 | // Only process leaves 228 | if bvh.nodes[i].is_leaf() { 229 | process_node(true, i); 230 | 231 | // Process inner nodes on the path from that leaf up to the root 232 | let mut j = i as usize; 233 | while j != 0 { 234 | j = parents[j] as usize; 235 | 236 | process_node(false, j); 237 | } 238 | } 239 | }); 240 | } 241 | 242 | pub struct SometimesAtomicU32 { 243 | #[cfg(feature = "parallel")] 244 | pub value: AtomicU32, 245 | #[cfg(not(feature = "parallel"))] 246 | pub value: u32, 247 | } 248 | 249 | impl SometimesAtomicU32 { 250 | #[inline] 251 | pub fn new(value: u32) -> SometimesAtomicU32 { 252 | #[cfg(feature = "parallel")] 253 | { 254 | SometimesAtomicU32 { 255 | value: AtomicU32::new(value), 256 | } 257 | } 258 | #[cfg(not(feature = "parallel"))] 259 | { 260 | SometimesAtomicU32 { value } 261 | } 262 | } 263 | 264 | #[inline] 265 | #[cfg(feature = "parallel")] 266 | pub fn set(&self, value: u32) { 267 | self.value.store(value, Ordering::SeqCst); 268 | #[cfg(not(feature = "parallel"))] 269 | { 270 | self.value = 
value; 271 | } 272 | } 273 | 274 | #[inline] 275 | #[cfg(not(feature = "parallel"))] 276 | pub fn set(&mut self, value: u32) { 277 | self.value = value; 278 | } 279 | 280 | #[inline] 281 | pub fn get(&self) -> u32 { 282 | #[cfg(feature = "parallel")] 283 | { 284 | self.value.load(Ordering::SeqCst) 285 | } 286 | #[cfg(not(feature = "parallel"))] 287 | { 288 | self.value 289 | } 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /src/bvh2/reinsertion.rs: -------------------------------------------------------------------------------- 1 | // Reinsertion optimizer based on "Parallel Reinsertion for Bounding Volume Hierarchy Optimization", by D. Meister and J. Bittner: 2 | // https://meistdan.github.io/publications/prbvh/paper.pdf 3 | // https://jcgt.org/published/0011/04/01/paper.pdf 4 | // Reference: https://github.com/madmann91/bvh/blob/3490634ae822e5081e41f09498fcce03bc1419e3/src/bvh/v2/reinsertion_optimizer.h 5 | 6 | // Note: Most asserts exist to try to elide bounds checks 7 | 8 | #[cfg(feature = "parallel")] 9 | use rayon::iter::{IntoParallelIterator, ParallelIterator}; 10 | use rdst::{RadixKey, RadixSort}; 11 | 12 | use crate::{ 13 | bvh2::{Bvh2, Bvh2Node}, 14 | heapstack::HeapStack, 15 | }; 16 | 17 | /// Restructures the BVH, optimizing node locations within the BVH hierarchy per SAH cost. 18 | pub struct ReinsertionOptimizer<'a> { 19 | candidates: Vec, 20 | reinsertions: Vec, 21 | touched: Vec, 22 | parents: Vec, 23 | bvh: &'a mut Bvh2, 24 | batch_size_ratio: f32, 25 | } 26 | 27 | impl ReinsertionOptimizer<'_> { 28 | /// Restructures the BVH, optimizing node locations within the BVH hierarchy per SAH cost. 29 | /// batch_size_ratio: Fraction of the number of nodes to optimize per iteration. 30 | /// ratio_sequence: A sequence of ratios to preform reinsertion at. These are as a 31 | /// proportion of the batch_size_ratio. 
    /// If None, the following sequence is used:
    /// (1..32).step_by(2).map(|n| 1.0 / n as f32) or
    /// 1/1, 1/3, 1/5, 1/7, 1/9, 1/11, 1/13, 1/15, 1/17, 1/19, 1/21, 1/23, 1/25, 1/27, 1/29, 1/31
    pub fn run(bvh: &mut Bvh2, batch_size_ratio: f32, ratio_sequence: Option<Vec<f32>>) {
        crate::scope!("reinsertion_optimize");

        // Nothing to optimize: empty BVH, a root that is already a leaf, or a
        // non-positive ratio (reinsertion disabled).
        if bvh.nodes.is_empty() || bvh.nodes[0].is_leaf() || batch_size_ratio <= 0.0 {
            return;
        }
        #[cfg(feature = "parallel")]
        let parents = bvh.compute_parents_parallel();
        #[cfg(not(feature = "parallel"))]
        let parents = bvh.compute_parents();

        // Upper bound on candidates/reinsertions per pass; used to preallocate.
        let cap = (bvh.nodes.len() as f32 * batch_size_ratio.min(1.0)).ceil() as usize;

        ReinsertionOptimizer {
            candidates: Vec::with_capacity(cap),
            reinsertions: Vec::with_capacity(cap),
            touched: vec![false; bvh.nodes.len()],
            parents,
            bvh,
            batch_size_ratio,
        }
        .optimize_impl(ratio_sequence);
    }

    /// Runs one reinsertion pass per entry in `ratio_sequence`.
    pub fn optimize_impl(&mut self, ratio_sequence: Option<Vec<f32>>) {
        // This initially preforms reinsertion at the specified ratio, then at progressively smaller ratios,
        // focusing more reinsertion time at the top of the bvh. The original method would perform reinsertion
        // for a fixed ratio a fixed number of times.
        let ratio_sequence = ratio_sequence.unwrap_or(
            (1..32)
                .step_by(2)
                .map(|n| 1.0 / n as f32)
                .collect::<Vec<f32>>(),
        );

        let mut reinsertion_stack = HeapStack::<(f32, u32)>::new_with_capacity(256); // Can't put in Self because of borrows
        ratio_sequence.iter().for_each(|ratio| {
            // At least one node is always processed per pass.
            let batch_size =
                (((self.bvh.nodes.len() as f32 * self.batch_size_ratio) * ratio) as usize).max(1);
            let node_count = self.bvh.nodes.len().min(batch_size + 1);
            self.find_candidates(node_count);
            self.optimize_candidates(&mut reinsertion_stack, node_count - 1);
        });
    }

    /// Find potential candidates for reinsertion
    fn find_candidates(&mut self, node_count: usize) {
        // This method just takes the first node_count*2 nodes in the bvh and sorts them by their half area
        // This seemed to find candidates much faster while resulting in similar bvh traversal performance vs the original method
        // https://github.com/madmann91/bvh/blob/3490634ae822e5081e41f09498fcce03bc1419e3/src/bvh/v2/reinsertion_optimizer.h#L88
        // Taking the first node_count * 2 seemed to work nearly as well as sorting all the nodes, but builds much faster.
        self.candidates.clear();
        // skip(1): the root (node 0) can never be reinserted.
        self.bvh
            .nodes
            .iter()
            .take(node_count * 2)
            .enumerate()
            .skip(1)
            .for_each(|(i, node)| {
                self.candidates.push(Candidate {
                    cost: node.aabb.half_area(),
                    node_id: i as u32,
                });
            });
        self.candidates.radix_sort_unstable();
    }

    /// Finds the best reinsertion for each candidate, then applies them in order
    /// of decreasing SAH benefit, skipping any whose affected nodes were already
    /// touched by an earlier (better) reinsertion this pass.
    #[allow(unused_variables)]
    fn optimize_candidates(&mut self, reinsertion_stack: &mut HeapStack<(f32, u32)>, count: usize) {
        self.reinsertions.clear();
        self.touched.fill(false);

        #[cfg(feature = "parallel")]
        {
            let mut reinsertions_map = (0..count)
                .into_par_iter()
                .map(|i| {
                    // TODO figure out a way to create a limited number of these just once and reuse from the rayon
                    let mut stack = HeapStack::<(f32, u32)>::new_with_capacity(256);
                    self.find_reinsertion(&mut stack, self.candidates[i].node_id as usize)
                })
                .collect::<Vec<_>>();
            reinsertions_map.drain(..).for_each(|r| {
                // Only keep reinsertions that actually reduce surface area.
                if r.area_diff > 0.0 {
                    self.reinsertions.push(r)
                }
            });
        }
        #[cfg(not(feature = "parallel"))]
        {
            assert!(count <= self.candidates.len());
            (0..count).for_each(|i| {
                let r =
                    self.find_reinsertion(reinsertion_stack, self.candidates[i].node_id as usize);
                if r.area_diff > 0.0 {
                    self.reinsertions.push(r)
                }
            });
        }

        // Apply the most beneficial reinsertions first.
        self.reinsertions
            .sort_unstable_by(|a, b| b.area_diff.partial_cmp(&a.area_diff).unwrap());

        assert!(self.reinsertions.len() <= self.touched.len());
        (0..self.reinsertions.len()).for_each(|i| {
            let reinsertion = &self.reinsertions[i];
            let conflicts = self.get_conflicts(reinsertion.from, reinsertion.to);

            // Skip if any node involved was already modified this pass.
            if conflicts.iter().any(|&i| self.touched[i]) {
                return;
            }

            conflicts.iter().for_each(|&conflict| {
                self.touched[conflict] = true;
            });

            self.reinsert_node(reinsertion.from as usize, reinsertion.to as usize);
        });
    }

    /// Searches the tree for the destination that yields the largest surface-area
    /// decrease when `node_id` is reinserted there. Returns a default (no-op)
    /// `Reinsertion` if no profitable destination is found.
    fn find_reinsertion(&self, stack: &mut HeapStack<(f32, u32)>, node_id: usize) -> Reinsertion {
        debug_assert_ne!(node_id, 0);
        // Try to elide bounds checks
        assert!(node_id < self.bvh.nodes.len());
        assert!(node_id < self.parents.len());

        /*
         * Here is an example that explains how the cost of a reinsertion is computed. For the
         * reinsertion from A to C, in the figure below, we need to remove P1, replace it by B,
         * and create a node that holds A and C and place it where C was.
         *
         *                  R
         *                 / \
         *                Pn  Q1
         *               / \
         *             ... ...
         *             / \
         *            P1  C
         *           / \
         *          A   B
         *
         * The resulting area *decrease* is (SA(x) means the surface area of x):
         *
         *     SA(P1) +                                : P1 was removed
         *     SA(P2) - SA(B) +                        : P2 now only contains B
         *     SA(P3) - SA(B U sibling(P2)) +          : Same but for P3
         *     ... +
         *     SA(Pn) - SA(B U sibling(P2) U ... U sibling(P(n - 1)) + : Same but for Pn
         *     0 +                                     : R does not change
         *     SA(Q1) - SA(Q1 U A) +                   : Q1 now contains A
         *     SA(Q2) - SA(Q2 U A) +                   : Q2 now contains A
         *     ...
         *     +
         *     -SA(A U C)                              : For the parent of A and C
         */
        let mut best_reinsertion = Reinsertion {
            from: node_id as u32,
            to: 0,
            area_diff: 0.0,
        };
        let node_area = self.bvh.nodes[node_id].aabb.half_area();
        let parent_area = self.bvh.nodes[self.parents[node_id] as usize]
            .aabb
            .half_area();
        let mut area_diff = parent_area;
        let mut sibling_id = Bvh2Node::get_sibling_id(node_id);
        let mut pivot_bbox = self.bvh.nodes[sibling_id].aabb;
        let parent_id = self.parents[node_id] as usize;
        let mut pivot_id = parent_id;
        let aabb = self.bvh.nodes[node_id].aabb;
        stack.clear();
        // Walk up toward the root; at each pivot, do a best-first descent into
        // the sibling subtree looking for a better destination.
        loop {
            stack.push((area_diff, sibling_id as u32));
            while !stack.is_empty() {
                let (top_area_diff, top_sibling_id) = stack.pop_fast();
                // Prune: even a zero-area merge here cannot beat the current best.
                if top_area_diff - node_area <= best_reinsertion.area_diff {
                    continue;
                }

                let dst_node = &self.bvh.nodes[*top_sibling_id as usize];
                let merged_area = dst_node.aabb.union(&aabb).half_area();
                let reinsert_area = top_area_diff - merged_area;
                if reinsert_area > best_reinsertion.area_diff {
                    best_reinsertion.to = *top_sibling_id;
                    best_reinsertion.area_diff = reinsert_area;
                }

                if !dst_node.is_leaf() {
                    let child_area = reinsert_area + dst_node.aabb.half_area();
                    stack.push((child_area, dst_node.first_index));
                    stack.push((child_area, dst_node.first_index + 1));
                }
            }

            if pivot_id != parent_id {
                // Accumulate the area decrease contributed by shrinking this ancestor.
                pivot_bbox = pivot_bbox.union(&self.bvh.nodes[sibling_id].aabb);
                area_diff += self.bvh.nodes[pivot_id].aabb.half_area() - pivot_bbox.half_area();
            }

            if pivot_id == 0 {
                break;
            }

            sibling_id = Bvh2Node::get_sibling_id(pivot_id);
            pivot_id = self.parents[pivot_id] as usize;
        }

        // Moving next to our own sibling or parent is a no-op; discard it.
        if best_reinsertion.to == Bvh2Node::get_sibling_id32(best_reinsertion.from)
            || best_reinsertion.to == self.parents[best_reinsertion.from as usize]
        {
            best_reinsertion = Reinsertion::default();
        }

        best_reinsertion
    }

    /// Performs the structural move of node `from` next to node `to`, fixing up
    /// parent links and refitting the AABBs along both affected paths.
    fn reinsert_node(&mut self, from: usize, to: usize) {
        let sibling_id = Bvh2Node::get_sibling_id(from);
        let parent_id = self.parents[from] as usize;
        let sibling_node = self.bvh.nodes[sibling_id];
        let dst_node = self.bvh.nodes[to];

        // `to` becomes the new parent of {from, old contents of to}.
        self.bvh.nodes[to].make_inner(Bvh2Node::get_left_sibling_id(from) as u32);
        self.bvh.nodes[sibling_id] = dst_node;
        self.bvh.nodes[parent_id] = sibling_node;

        // Re-point the children of the two relocated nodes at their new parents.
        if !self.bvh.nodes[sibling_id].is_leaf() {
            self.parents[self.bvh.nodes[sibling_id].first_index as usize] = sibling_id as u32;
            self.parents[self.bvh.nodes[sibling_id].first_index as usize + 1] = sibling_id as u32;
        }
        if !self.bvh.nodes[parent_id].is_leaf() {
            self.parents[self.bvh.nodes[parent_id].first_index as usize] = parent_id as u32;
            self.parents[self.bvh.nodes[parent_id].first_index as usize + 1] = parent_id as u32;
        }

        self.parents[sibling_id] = to as u32;
        self.parents[from] = to as u32;
        self.bvh.refit_from_fast(to, &self.parents);
        self.bvh.refit_from_fast(parent_id, &self.parents);
    }

    /// The set of nodes whose topology/AABBs are altered by moving `from` to `to`.
    /// Two reinsertions sharing any of these cannot both be applied in one pass.
    #[inline(always)]
    fn get_conflicts(&self, from: u32, to: u32) -> [usize; 5] {
        [
            to as usize,
            from as usize,
            Bvh2Node::get_sibling_id(from as usize),
            self.parents[to as usize] as usize,
            self.parents[from as usize] as usize,
        ]
    }
}

/// A planned move of node `from` to become a sibling of node `to`,
/// with the resulting surface-area decrease (`area_diff` > 0 is profitable).
#[derive(Default, Clone, Copy)]
struct Reinsertion {
    from: u32,
    to: u32,
    area_diff: f32,
}

/// A node considered for reinsertion, keyed by its AABB half-area.
#[derive(Clone, Copy, Debug)]
struct Candidate {
    node_id: u32,
    cost: f32,
}

impl RadixKey for Candidate {
    const LEVELS: usize = 4;

    #[inline]
    fn get_level(&self, level: usize) -> u8 {
        // Negate so the radix sort orders candidates by descending half-area
        // (largest nodes are the most promising reinsertion candidates).
        (-self.cost).get_level(level)
    }
}
(-self.cost).get_level(level) 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /src/cwbvh/builder.rs: -------------------------------------------------------------------------------- 1 | use std::time::{Duration, Instant}; 2 | 3 | use crate::{ 4 | aabb::Aabb, 5 | bvh2::reinsertion::ReinsertionOptimizer, 6 | cwbvh::{bvh2_to_cwbvh::bvh2_to_cwbvh, CwBvh}, 7 | splits::split_aabbs_preset, 8 | triangle::Triangle, 9 | Boundable, BvhBuildParams, 10 | }; 11 | 12 | /// Build a cwbvh from the given list of Triangles. 13 | /// Just a helper function / example, feel free to reimplement for your specific use case. 14 | /// 15 | /// # Arguments 16 | /// * `triangles` - A list of Triangles. 17 | /// * `config` - Parameters for configuring the BVH building. 18 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB 19 | /// generation or debug validation. This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing) 20 | pub fn build_cwbvh_from_tris( 21 | triangles: &[Triangle], 22 | config: BvhBuildParams, 23 | core_build_time: &mut Duration, 24 | ) -> CwBvh { 25 | let mut aabbs = Vec::with_capacity(triangles.len()); 26 | let mut indices = Vec::with_capacity(triangles.len()); 27 | let mut largest_half_area = 0.0; 28 | let mut avg_half_area = 0.0; 29 | 30 | for (i, tri) in triangles.iter().enumerate() { 31 | let a = tri.v0; 32 | let b = tri.v1; 33 | let c = tri.v2; 34 | let mut aabb = Aabb::empty(); 35 | aabb.extend(a).extend(b).extend(c); 36 | let half_area = aabb.half_area(); 37 | largest_half_area = half_area.max(largest_half_area); 38 | avg_half_area += half_area; 39 | aabbs.push(aabb); 40 | indices.push(i as u32); 41 | } 42 | avg_half_area /= triangles.len() as f32; 43 | 44 | let start_time = Instant::now(); 45 | 46 | if config.pre_split { 47 | split_aabbs_preset( 48 | &mut aabbs, 49 | &mut indices, 50 | triangles, 51 | avg_half_area, 52 | 
largest_half_area, 53 | ); 54 | } 55 | 56 | let mut bvh2 = config.ploc_search_distance.build( 57 | &aabbs, 58 | indices, 59 | config.sort_precision, 60 | config.search_depth_threshold, 61 | ); 62 | ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None); 63 | let cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, false); 64 | 65 | *core_build_time += start_time.elapsed(); 66 | 67 | #[cfg(debug_assertions)] 68 | { 69 | bvh2.validate(triangles, false, config.pre_split); 70 | cwbvh.validate(triangles, config.pre_split, false); 71 | } 72 | 73 | cwbvh 74 | } 75 | 76 | /// Build a cwbvh from the given list of Boundable primitives. 77 | /// `pre_split` in BvhBuildParams is ignored in this case. 78 | /// Just a helper function / example, feel free to reimplement for your specific use case. 79 | /// 80 | /// # Arguments 81 | /// * `primitives` - A list of Primitives that implement Boundable. 82 | /// * `config` - Parameters for configuring the BVH building. 83 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB 84 | /// generation or debug validation. This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing) 85 | // TODO: we could optionally do imprecise basic Aabb splits. 
86 | pub fn build_cwbvh( 87 | primitives: &[T], 88 | config: BvhBuildParams, 89 | core_build_time: &mut Duration, 90 | ) -> CwBvh { 91 | let mut aabbs = Vec::with_capacity(primitives.len()); 92 | let mut indices = Vec::with_capacity(primitives.len()); 93 | 94 | for (i, primitive) in primitives.iter().enumerate() { 95 | indices.push(i as u32); 96 | aabbs.push(primitive.aabb()); 97 | } 98 | 99 | let start_time = Instant::now(); 100 | 101 | let mut bvh2 = config.ploc_search_distance.build( 102 | &aabbs, 103 | indices, 104 | config.sort_precision, 105 | config.search_depth_threshold, 106 | ); 107 | ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None); 108 | let cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, false); 109 | 110 | #[cfg(debug_assertions)] 111 | { 112 | bvh2.validate(&aabbs, false, config.pre_split); 113 | cwbvh.validate(&aabbs, config.pre_split, false); 114 | } 115 | 116 | *core_build_time += start_time.elapsed(); 117 | 118 | cwbvh 119 | } 120 | -------------------------------------------------------------------------------- /src/cwbvh/bvh2_to_cwbvh.rs: -------------------------------------------------------------------------------- 1 | // Uses cost / merging from cwbvh paper 2 | 3 | use glam::{vec3a, UVec3, Vec3A}; 4 | 5 | use crate::{ 6 | aabb::Aabb, 7 | bvh2::Bvh2, 8 | cwbvh::{CwBvh, CwBvhNode, BRANCHING, DENOM}, 9 | PerComponent, VecExt, 10 | }; 11 | 12 | use super::DIRECTIONS; 13 | 14 | /// Convert a bvh2 to CwBvh 15 | pub struct Bvh2Converter<'a> { 16 | pub bvh2: &'a Bvh2, 17 | pub nodes: Vec, 18 | pub primitive_indices: Vec, 19 | pub decisions: Vec, 20 | pub order_children_during_build: bool, 21 | pub include_exact_node_aabbs: bool, 22 | pub exact_node_aabbs: Option>, 23 | direction_lut: [Vec3A; 8], 24 | } 25 | 26 | const INVALID: u8 = u8::MAX; 27 | const INVALID32: u32 = u32::MAX; 28 | const INVALID_USIZE: usize = INVALID32 as usize; 29 | 30 | const PRIM_COST: f32 = 0.3; 31 | 32 | impl<'a> 
impl<'a> Bvh2Converter<'a> {
    /// Initialize the Bvh2 to CwBvh converter.
    pub fn new(bvh2: &'a Bvh2, order_children: bool, include_exact_node_aabbs: bool) -> Self {
        let capacity = bvh2.primitive_indices.len();

        let mut nodes = Vec::with_capacity(capacity);
        // Reserve slot 0 for the root CwBvh node.
        nodes.push(Default::default());

        // One direction vector per octant: each bit of the index s flips the
        // sign of one axis (x = bit 2, y = bit 1, z = bit 0).
        let mut direction_lut = [Vec3A::ZERO; DIRECTIONS];
        direction_lut
            .iter_mut()
            .enumerate()
            .for_each(|(s, direction)| {
                *direction = vec3a(
                    if (s & 0b100) != 0 { -1.0 } else { 1.0 },
                    if (s & 0b010) != 0 { -1.0 } else { 1.0 },
                    if (s & 0b001) != 0 { -1.0 } else { 1.0 },
                );
            });

        Self {
            bvh2,
            nodes,
            primitive_indices: Vec::with_capacity(capacity),
            // 7 decision-table slots per bvh2 node (see calculate_cost_impl).
            decisions: vec![Decision::default(); bvh2.nodes.len() * 7],
            order_children_during_build: order_children,
            direction_lut,
            include_exact_node_aabbs,
            exact_node_aabbs: if include_exact_node_aabbs {
                Some(vec![Aabb::empty(); bvh2.nodes.len()])
            } else {
                None
            },
        }
    }

    /// Convert the bvh2 to CwBvh
    pub fn convert_to_cwbvh(&mut self) {
        crate::scope!("convert_to_cwbvh");
        debug_assert_eq!(std::mem::size_of::<CwBvhNode>(), 80);
        self.convert_to_cwbvh_impl(0, 0);
    }

    /// Recursively emits the CwBvh node at `node_index_bvh8` from the bvh2
    /// subtree rooted at `node_index_bvh2`, quantizing child AABBs into the
    /// parent's 8-bit grid.
    pub fn convert_to_cwbvh_impl(&mut self, node_index_bvh8: usize, node_index_bvh2: usize) {
        let mut node = self.nodes[node_index_bvh8];
        let aabb = self.bvh2.nodes[node_index_bvh2].aabb;
        if let Some(exact_node_aabbs) = &mut self.exact_node_aabbs {
            exact_node_aabbs[node_index_bvh8] = aabb;
        }

        // Grid origin: the node AABB min.
        let node_p = aabb.min;
        node.p = node_p.into();

        // Per-axis power-of-two quantization scale, rounded up so the full
        // extent fits in the 8-bit child coordinates.
        let e = ((aabb.max - aabb.min).max(Vec3A::splat(1e-20)) * DENOM)
            .log2()
            .ceil()
            .exp2();
        debug_assert!(e.cmpgt(Vec3A::ZERO).all(), "aabb: {:?} e: {}", aabb, e);

        let rcp_e = 1.0 / e;
        // Store only the 8 exponent bits of each scale (mantissa is zero by
        // construction since e is an exact power of two).
        let e: UVec3 = e.per_comp(|c: f32| {
            let bits = c.to_bits();
            // Only the exponent bits can be non-zero
            debug_assert_eq!(bits & 0b10000000011111111111111111111111, 0);
            bits >> 23
        });
        node.e = [e.x as u8, e.y as u8, e.z as u8];

        let children = &mut [INVALID32; 8];

        let child_count = &mut 0;
        // Collect up to 8 bvh2 nodes for this wide node per the decision table.
        self.get_children(node_index_bvh2, children, child_count, 0);

        if self.order_children_during_build {
            self.order_children(node_index_bvh2, children, *child_count as usize);
        }

        node.imask = 0;

        node.primitive_base_idx = self.primitive_indices.len() as u32;
        node.child_base_idx = self.nodes.len() as u32;

        let mut num_internal_nodes = 0;
        let mut num_primitives = 0_u32;

        for (i, child_index) in children.iter().enumerate() {
            if *child_index == INVALID32 {
                continue; // Empty slot
            };

            let child_aabb = self.bvh2.nodes[*child_index as usize].aabb;

            // const PAD: f32 = 1e-20;
            // Use to force non-zero volumes.
            const PAD: f32 = 0.0;

            // Quantize child bounds conservatively (floor min, ceil max).
            let mut child_min = ((child_aabb.min - node_p - PAD) * rcp_e).floor();
            let mut child_max = ((child_aabb.max - node_p + PAD) * rcp_e).ceil();

            child_min = child_min.clamp(Vec3A::ZERO, Vec3A::splat(255.0));
            child_max = child_max.clamp(Vec3A::ZERO, Vec3A::splat(255.0));

            debug_assert!((child_min.cmple(child_max)).all());

            node.child_min_x[i] = child_min.x as u8;
            node.child_min_y[i] = child_min.y as u8;
            node.child_min_z[i] = child_min.z as u8;
            node.child_max_x[i] = child_max.x as u8;
            node.child_max_y[i] = child_max.y as u8;
            node.child_max_z[i] = child_max.z as u8;

            match self.decisions[(child_index * 7) as usize].kind {
                DecisionKind::LEAF => {
                    let primitive_count = self.count_primitives(*child_index as usize, self.bvh2);
                    debug_assert!(primitive_count > 0 && primitive_count <= 3);

                    // Three highest bits contain unary representation of primitive count

                    node.child_meta[i] = num_primitives as u8
                        | match primitive_count {
                            1 => 0b0010_0000,
                            2 => 0b0110_0000,
                            3 => 0b1110_0000,
                            _ => panic!("Incorrect leaf primitive count: {}", primitive_count),
                        };

                    num_primitives += primitive_count;
                    // At most 8 children * 3 primitives per wide node.
                    debug_assert!(num_primitives <= 24);
                }
                DecisionKind::INTERNAL => {
                    node.imask |= 1u8 << i;

                    // Internal child meta: high bits 001, low 5 bits = slot + 24.
                    node.child_meta[i] = (24 + i as u8) | 0b0010_0000;

                    num_internal_nodes += 1;
                }
                DecisionKind::DISTRIBUTE => unreachable!(),
            }
        }

        // Allocate contiguous slots for the internal children, then recurse.
        self.nodes
            .resize(self.nodes.len() + num_internal_nodes, Default::default());
        self.nodes[node_index_bvh8] = node;

        debug_assert!(node.child_base_idx as usize + num_internal_nodes == self.nodes.len());
        debug_assert!(
            node.primitive_base_idx + num_primitives == self.primitive_indices.len() as u32
        );

        // Recurse on Internal Nodes
        let mut offset = 0;
        for (i, child_index) in children.iter().enumerate() {
            if *child_index != INVALID32 && (node.imask & (1 << i)) != 0 {
                self.convert_to_cwbvh_impl(
                    (node.child_base_idx + offset) as usize,
                    *child_index as usize,
                );
                offset += 1;
            }
        }
        //self.nodes[node_index_bvh8] = node;
    }

    // Recursively count primitives in subtree of the given Node
    // Simultaneously fills the indices buffer of the BVH8
    fn count_primitives(&mut self, node_index: usize, bvh2: &Bvh2) -> u32 {
        let node = bvh2.nodes[node_index];

        if node.is_leaf() {
            // Converter requires a bvh2 with exactly 1 primitive per leaf.
            debug_assert!(node.prim_count == 1);

            self.primitive_indices
                .push(bvh2.primitive_indices[node.first_index as usize]);

            return node.prim_count;
        }

        self.count_primitives(node.first_index as usize, bvh2)
            + self.count_primitives((node.first_index + 1) as usize, bvh2)
    }

    /// Fill cost table for bvh2 -> bvh8 conversion
    pub fn calculate_cost(&mut self, max_prims_per_leaf: u32) {
        crate::scope!("calculate_cost");
        self.calculate_cost_impl(0, max_prims_per_leaf, 0);
    }

    // Based on https://github.com/jan-van-bergen/GPU-Raytracer/blob/6559ae2241c8fdea0ddaec959fe1a47ec9b3ab0d/Src/BVH/Converters/BVH8Converter.cpp#L24
    //
    // Dynamic program over 7 slots per bvh2 node (slot i = "this subtree
    // contributes i+1 children to its wide parent"). Returns the number of
    // primitives in the subtree rooted at `node_index`.
    pub fn calculate_cost_impl(
        &mut self,
        node_index: usize,
        max_prims_per_leaf: u32,
        _current_depth: i32,
    ) -> u32 {
        let node = &self.bvh2.nodes[node_index];
        let half_area = node.aabb.half_area();
        let first_index = node.first_index;
        let prim_count = node.prim_count;

        // Flattened decision-table bases for this node and its two children.
        let node_dec_idx = node_index * 7;
        let first_index_7 = (first_index * 7) as usize;
        let next_index_7 = ((first_index + 1) * 7) as usize;

        let num_primitives;

        // TODO possibly merge as much as possible past a specified depth
        // let depth_cost = if current_depth > 15 { 1.0 } else { 1.0 };

        //if is_leaf()
        if prim_count != 0 {
            num_primitives = prim_count;
            if num_primitives != 1 {
                panic!(
                    "ERROR: BVH8 Builder expects BVH with leaf Nodes containing only 1 primitive!\n"
                );
            }

            // SAH cost
            let cost_leaf = half_area * (num_primitives as f32) * PRIM_COST;

            for i in 0..7 {
                let decision = &mut self.decisions[node_dec_idx + i];
                decision.kind = DecisionKind::LEAF;
                decision.cost = cost_leaf;
            }
        } else {
            num_primitives = self.calculate_cost_impl(
                first_index as usize,
                max_prims_per_leaf,
                _current_depth + 1,
            ) + self.calculate_cost_impl(
                (first_index + 1) as usize,
                max_prims_per_leaf,
                _current_depth + 1,
            );

            // Separate case: i=0 (i=1 in the paper)
            {
                let cost_leaf = if num_primitives <= max_prims_per_leaf {
                    (num_primitives as f32) * half_area * PRIM_COST
                } else {
                    f32::INFINITY
                };

                let mut cost_distribute = f32::INFINITY;

                let mut distribute_left = INVALID;
                let mut distribute_right = INVALID;

                // Best way to split the 7 available child slots between the
                // left (k+1 slots) and right (7-k slots) subtrees.
                for k in 0..7 {
                    let c = self.decisions[first_index_7 + k].cost
                        + self.decisions[next_index_7 + 6 - k].cost;

                    if c < cost_distribute {
                        cost_distribute = c;

                        distribute_left = k as u8;
                        distribute_right = 6 - k as u8;
                    }
                }

                let cost_internal = cost_distribute + half_area;

                // Slot 0 chooses: collapse subtree into a leaf, or keep an
                // internal wide node here.
                let decision = &mut self.decisions[node_dec_idx];
                if cost_leaf < cost_internal {
                    decision.kind = DecisionKind::LEAF;
                    decision.cost = cost_leaf;
                } else {
                    decision.kind = DecisionKind::INTERNAL;
                    decision.cost = cost_internal;
                }

                decision.distribute_left = distribute_left;
                decision.distribute_right = distribute_right;
            }

            // In the paper i=2..7
            let mut node_i;
            for i in 1..7 {
                node_i = node_dec_idx + i;
                // With i+1 slots available, keeping only i slots is always an option.
                let mut cost_distribute = self.decisions[node_i - 1].cost;

                let mut distribute_left = INVALID;
                let mut distribute_right = INVALID;

                for k in 0..i {
                    let c = self.decisions[first_index_7 + k].cost
                        + self.decisions[next_index_7 + i - k - 1].cost;

                    if c < cost_distribute {
                        cost_distribute = c;

                        let k_u8 = k as u8;
                        distribute_left = k_u8;
                        distribute_right = i as u8 - k_u8 - 1;
                    }
                }

                let decision = &mut self.decisions[node_i];
                decision.cost = cost_distribute;

                if distribute_left != INVALID {
                    decision.kind = DecisionKind::DISTRIBUTE;
                    decision.distribute_left = distribute_left;
                    decision.distribute_right = distribute_right;
                } else {
                    // No split beat the (i-1)-slot solution; reuse it.
                    self.decisions[node_i] = self.decisions[node_i - 1];
                }
            }
        }

        num_primitives
    }
    /// Collects the bvh2 nodes that become the (up to 8) children of the wide
    /// node rooted at `node_index`, following the DISTRIBUTE decisions computed
    /// by `calculate_cost`. `i` is the decision slot to expand at this node.
    pub fn get_children(
        &mut self,
        node_index: usize,
        children: &mut [u32; 8],
        child_count: &mut u32,
        i: usize,
    ) {
        let node = &self.bvh2.nodes[node_index];

        if node.is_leaf() {
            children[*child_count as usize] = node_index as u32;
            *child_count += 1;
            return;
        }

        let decision = &self.decisions[node_index * 7 + i];
        let distribute_left = decision.distribute_left;
        let distribute_right = decision.distribute_right;

        debug_assert!(distribute_left < 7);
        debug_assert!(distribute_right < 7);

        // Recurse on left child if it needs to distribute
        if self.decisions[(node.first_index * 7 + distribute_left as u32) as usize].kind
            == DecisionKind::DISTRIBUTE
        {
            self.get_children(
                node.first_index as usize,
                children,
                child_count,
                distribute_left as usize,
            );
        } else {
            children[*child_count as usize] = node.first_index;
            *child_count += 1;
        }

        // Recurse on right child if it needs to distribute
        if self.decisions[((node.first_index + 1) * 7 + distribute_right as u32) as usize].kind
            == DecisionKind::DISTRIBUTE
        {
            self.get_children(
                (node.first_index + 1) as usize,
                children,
                child_count,
                distribute_right as usize,
            );
        } else {
            children[*child_count as usize] = node.first_index + 1;
            *child_count += 1;
        }
    }

    // Based on https://github.com/jan-van-bergen/GPU-Raytracer/blob/6559ae2241c8fdea0ddaec959fe1a47ec9b3ab0d/Src/BVH/Converters/BVH8Converter.cpp#L148
    //
    // Assigns each child to one of the 8 octant slots so that traversal order
    // approximates front-to-back for rays in each octant direction.
    pub fn order_children(
        &mut self,
        node_index: usize,
        children: &mut [u32; 8],
        child_count: usize,
    ) {
        let node = &self.bvh2.nodes[node_index];
        let p = node.aabb.center();

        let mut cost = [[f32::MAX; DIRECTIONS]; BRANCHING];

        assert!(child_count <= BRANCHING);
        assert!(cost.len() >= child_count);
        // Fill cost table
        // TODO parallel: check to see if this is faster w/ par_iter
        for s in 0..DIRECTIONS {
            let d = self.direction_lut[s];
            for c in 0..child_count {
                let v = self.bvh2.nodes[children[c] as usize].aabb.center() - p;
                // SAFETY: c < child_count <= BRANCHING and s < DIRECTIONS,
                // both within the bounds of `cost` (asserted above).
                let cost_slot = unsafe { cost.get_unchecked_mut(c).get_unchecked_mut(s) };
                *cost_slot = d.dot(v); // No benefit from normalizing
            }
        }

        let mut assignment = [INVALID_USIZE; BRANCHING];
        let mut slot_filled = [false; DIRECTIONS];

        // The paper suggests the auction method, but greedy is almost as good.
        loop {
            let mut min_cost = f32::MAX;

            let mut min_slot = INVALID_USIZE;
            let mut min_index = INVALID_USIZE;

            // Find cheapest unfilled slot of any unassigned child
            for c in 0..child_count {
                if assignment[c] == INVALID_USIZE {
                    for (s, &slot_filled) in slot_filled.iter().enumerate() {
                        // SAFETY: c < child_count <= BRANCHING and s < DIRECTIONS.
                        let cost = unsafe { *cost.get_unchecked(c).get_unchecked(s) };
                        if !slot_filled && cost < min_cost {
                            min_cost = cost;

                            min_slot = s;
                            min_index = c;
                        }
                    }
                }
            }

            // All children assigned.
            if min_slot == INVALID_USIZE {
                break;
            }

            slot_filled[min_slot] = true;
            assignment[min_index] = min_slot;
        }

        // Scatter the children into their assigned octant slots.
        let original_order = std::mem::replace(children, [INVALID32; 8]);

        assert!(assignment.len() >= child_count); // Allow compiler to skip bounds check
        assert!(original_order.len() >= child_count); // Allow compiler to skip bounds check
        for i in 0..child_count {
            debug_assert!(assignment[i] != INVALID_USIZE);
            debug_assert!(original_order[i] != INVALID32);
            children[assignment[i]] = original_order[i];
        }
    }
}

/// How a bvh2 subtree is realized in the wide BVH (see `calculate_cost_impl`).
#[derive(Copy, Clone, PartialEq, Default)]
pub enum DecisionKind {
    LEAF,
    INTERNAL,
    #[default]
    DISTRIBUTE,
}

/// One slot of the per-node cost table used by the bvh2 -> bvh8 conversion.
#[derive(Copy, Clone, Default)]
pub struct Decision {
    pub cost: f32,
    pub kind: DecisionKind,
    // How many child slots go to the left/right bvh2 child when distributing.
    pub distribute_left: u8,
    pub distribute_right: u8,
}
/// Convert the given bvh2 to cwbvh
/// # Arguments
/// * `bvh2` - Source BVH
/// * `max_prims_per_leaf` - 1..=3 The maximum number of primitives per leaf.
///   (NOTE(review): doc previously said 0..=3, but the converter asserts
///   1..=3 per leaf and the helper builders clamp to that range.)
/// * `order_children` - Reorder each wide node's children into octant slots during conversion.
/// * `include_exact_node_aabbs` - Additionally store the unquantized per-node AABBs.
pub fn bvh2_to_cwbvh(
    bvh2: &Bvh2,
    max_prims_per_leaf: u32,
    order_children: bool,
    include_exact_node_aabbs: bool,
) -> CwBvh {
    if bvh2.nodes.is_empty() {
        return CwBvh::default();
    }
    let mut converter = Bvh2Converter::new(bvh2, order_children, include_exact_node_aabbs);
    converter.calculate_cost(max_prims_per_leaf);
    converter.convert_to_cwbvh();

    CwBvh {
        nodes: converter.nodes,
        primitive_indices: converter.primitive_indices,
        total_aabb: bvh2.nodes[0].aabb,
        exact_node_aabbs: converter.exact_node_aabbs,
    }
}

use std::{
    fmt::{self, Formatter},
    mem::transmute,
};

use crate::{aabb::Aabb, ray::Ray};
use bytemuck::{Pod, Zeroable};
use glam::{vec3a, Vec3, Vec3A};
use std::fmt::Debug;

use super::NQ_SCALE;

/// A Compressed Wide BVH8 Node. repr(C), Pod, 80 bytes.
// https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf
#[derive(Clone, Copy, Default, PartialEq)]
#[repr(C)]
pub struct CwBvhNode {
    /// Min point of node AABB
    pub p: Vec3,

    /// Exponent of child bounding box compression
    /// Max point of node AABB could be calculated ex: `p.x + bitcast(e[0] << 23) * 255.0`
    pub e: [u8; 3],

    /// Bitmask indicating which children are internal nodes. 1 for internal, 0 for leaf
    pub imask: u8,

    /// Index of first child into `Vec<CwBvhNode>`
    pub child_base_idx: u32,

    /// Index of first primitive into primitive_indices `Vec<u32>`
    pub primitive_base_idx: u32,

    /// Meta data for each child
    /// Empty child slot: The field is set to 00000000
    ///
    /// For leafs nodes: the low 5 bits store the primitive offset [0..24) from primitive_base_idx. And the high
    /// 3 bits store the number of primitives in that leaf in a unary encoding.
    /// A child leaf with 2 primitives with the first primitive starting at primitive_base_idx would be 0b01100000
    /// A child leaf with 3 primitives with the first primitive starting at primitive_base_idx + 2 would be 0b11100010
    /// A child leaf with 1 primitive with the first primitive starting at primitive_base_idx + 1 would be 0b00100001
    ///
    /// For internal nodes: The high 3 bits are set to 001 while the low 5 bits store the child slot index plus 24
    /// i.e., the values range [24..32)
    pub child_meta: [u8; 8],

    // Note: deviation from the paper: the min&max are interleaved here.
    /// Axis planes for each child.
    /// The plane position could be calculated, for example, with `p.x + bitcast(e[0] << 23) * child_min_x[0]`
    /// But in the actual intersection implementation the ray is transformed instead.
    pub child_min_x: [u8; 8],
    pub child_max_x: [u8; 8],
    pub child_min_y: [u8; 8],
    pub child_max_y: [u8; 8],
    pub child_min_z: [u8; 8],
    pub child_max_z: [u8; 8],
}

impl Debug for CwBvhNode {
    // Renders imask/child_meta as binary strings for readable dumps.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.debug_struct("CwBvhNode")
            .field("p", &self.p)
            .field("e", &self.e)
            .field("imask", &format!("{:#010b}", &self.imask))
            .field("child_base_idx", &self.child_base_idx)
            .field("primitive_base_idx", &self.primitive_base_idx)
            .field(
                "child_meta",
                &self
                    .child_meta
                    .iter()
                    .map(|c| format!("{:#010b}", c))
                    .collect::<Vec<_>>(),
            )
            .field("child_min_x", &self.child_min_x)
            .field("child_max_x", &self.child_max_x)
            .field("child_min_y", &self.child_min_y)
            .field("child_max_y", &self.child_max_y)
            .field("child_min_z", &self.child_min_z)
            .field("child_max_z", &self.child_max_z)
            .finish()
    }
}

// SAFETY: repr(C) struct composed entirely of Pod field types, no padding
// (3 + 1 u8s align the Vec3; total size asserted to be 80 bytes elsewhere).
unsafe impl Pod for CwBvhNode {}
unsafe impl Zeroable for CwBvhNode {}

pub(crate) const EPSILON: f32 = 0.0001;

impl CwBvhNode {
    /// Intersects all 8 children against `ray`, returning the traversal hit mask.
    /// Dispatches to the SIMD path when SSE2 is available.
    #[inline(always)]
    pub fn intersect_ray(&self, ray: &Ray, oct_inv4: u32) -> u32 {
        #[cfg(all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "sse2"
        ))]
        {
            self.intersect_ray_simd(ray, oct_inv4)
        }

        #[cfg(not(all(
            any(target_arch = "x86", target_arch = "x86_64"),
            target_feature = "sse2"
        )))]
        {
            self.intersect_ray_basic(ray, oct_inv4)
        }
    }

    /// Intersects only one child at a time with the given ray. Limited simd usage on platforms that support it. Exists for reference & compatibility.
    /// Traversal times with CwBvhNode::intersect_ray_simd take less than half the time vs intersect_ray_basic.
    #[inline(always)]
    pub fn intersect_ray_basic(&self, ray: &Ray, oct_inv4: u32) -> u32 {
        // Fold the node's quantization grid (origin `p`, per-axis scale from `e`) into
        // the ray, so the u8 child box planes can be slab-tested directly.
        let adjusted_ray_dir_inv = self.compute_extent() * ray.inv_direction;
        let adjusted_ray_origin = (Vec3A::from(self.p) - ray.origin) * ray.inv_direction;

        let mut hit_mask = 0;

        // Per-axis direction sign: selects which quantized plane is "near" vs "far".
        let rdx = ray.direction.x < 0.0;
        let rdy = ray.direction.y < 0.0;
        let rdz = ray.direction.z < 0.0;

        let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);

        for child in 0..8 {
            let q_lo_x = self.child_min_x[child];
            let q_lo_y = self.child_min_y[child];
            let q_lo_z = self.child_min_z[child];

            let q_hi_x = self.child_max_x[child];
            let q_hi_y = self.child_max_y[child];
            let q_hi_z = self.child_max_z[child];

            // Swap min/max planes per axis depending on ray direction sign.
            let x_min = if rdx { q_hi_x } else { q_lo_x };
            let x_max = if rdx { q_lo_x } else { q_hi_x };
            let y_min = if rdy { q_hi_y } else { q_lo_y };
            let y_max = if rdy { q_lo_y } else { q_hi_y };
            let z_min = if rdz { q_hi_z } else { q_lo_z };
            let z_max = if rdz { q_lo_z } else { q_hi_z };

            let mut tmin3 = vec3a(x_min as f32, y_min as f32, z_min as f32);
            let mut tmax3 = vec3a(x_max as f32, y_max as f32, z_max as f32);

            // Account for grid origin and scale
            tmin3 = tmin3 * adjusted_ray_dir_inv + adjusted_ray_origin;
            tmax3 = tmax3 * adjusted_ray_dir_inv + adjusted_ray_origin;

            // Slab test: entry is the max of per-axis entries, exit the min of exits.
            let tmin = tmin3.x.max(tmin3.y).max(tmin3.z).max(EPSILON); // TODO: should this use ray.tmin instead of EPSILON?
            let tmax = tmax3.x.min(tmax3.y).min(tmax3.z).min(ray.tmax);

            let intersected = tmin <= tmax;
            if intersected {
                let child_bits = extract_byte64(child_bits8, child);
                let bit_index = extract_byte64(bit_index8, child);
                hit_mask |= child_bits << bit_index;
            }
        }

        hit_mask
    }

    /// Intersects the given world-space `aabb` against all 8 children.
    /// Returns a hit mask in the same format as `intersect_ray`.
    #[inline(always)]
    pub fn intersect_aabb(&self, aabb: &Aabb, oct_inv4: u32) -> u32 {
        let extent_rcp = 1.0 / self.compute_extent();
        let p = Vec3A::from(self.p);

        // Transform the query aabb into the node's local (quantized grid) space
        let adjusted_aabb = Aabb::new((aabb.min - p) * extent_rcp, (aabb.max - p) * extent_rcp);

        let mut hit_mask = 0;

        let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);

        for child in 0..8 {
            if self.local_child_aabb(child).intersect_aabb(&adjusted_aabb) {
                let child_bits = extract_byte64(child_bits8, child);
                let bit_index = extract_byte64(bit_index8, child);
                hit_mask |= child_bits << bit_index;
            }
        }

        hit_mask
    }

    /// Tests which children contain the given world-space `point`.
    /// Returns a hit mask in the same format as `intersect_ray`.
    #[inline(always)]
    pub fn contains_point(&self, point: &Vec3A, oct_inv4: u32) -> u32 {
        let extent_rcp = 1.0 / self.compute_extent();
        let p = Vec3A::from(self.p);

        // Transform the query point into the node's local (quantized grid) space
        let adjusted_point = (*point - p) * extent_rcp;

        let mut hit_mask = 0;

        let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);

        for child in 0..8 {
            if self.local_child_aabb(child).contains_point(adjusted_point) {
                let child_bits = extract_byte64(child_bits8, child);
                let bit_index = extract_byte64(bit_index8, child);
                hit_mask |= child_bits << bit_index;
            }
        }

        hit_mask
    }

    // TODO intersect frustum
    // https://github.com/zeux/niagara/blob/bf90aa8c78e352d3b753b35553a3bcc8c65ef7a0/src/shaders/drawcull.comp.glsl#L71
    // https://iquilezles.org/articles/frustumcorrect/

    /// Precomputes, for all 8 child slots at once (one byte per slot), the bits to OR
    /// into the traversal hit mask (`child_bits8`) and the shift amount for those bits
    /// (`bit_index8`), decoded from `child_meta`.
    #[inline(always)]
    pub fn get_child_and_index_bits(&self, oct_inv4: u32) -> (u64, u64) {
        // Replicate the 4-byte octant mask into all 8 bytes.
        let mut oct_inv8 = oct_inv4 as u64;
        oct_inv8 |= oct_inv8 << 32;
        // SAFETY: [u8; 8] and u64 have identical size/alignment requirements for
        // transmute, and every bit pattern is a valid u64.
        let meta8 = unsafe { transmute::<[u8; 8], u64>(self.child_meta) };

        // (meta8 & (meta8 << 1)) takes advantage of the offset indexing for inner nodes [24..32)
        // [0b00011000..=0b00011111). For leaf nodes [0..24) these two bits (0b00011000) are never both set.
        let inner_mask = 0b0001000000010000000100000001000000010000000100000001000000010000;
        let is_inner8 = (meta8 & (meta8 << 1)) & inner_mask;

        // Expand the per-byte inner flag to a full-byte mask:
        // 00010000 >> 4: 00000001, then 00000001 * 0xff: 11111111
        let inner_mask8 = (is_inner8 >> 4) * 0xffu64;

        // Each byte of bit_index8 contains the traversal priority, biased by 24, for internal nodes, and
        // the triangle offset for leaf nodes. The bit index will later be used to shift the child bits.
        let index_mask = 0b0001111100011111000111110001111100011111000111110001111100011111;
        let bit_index8 = (meta8 ^ (oct_inv8 & inner_mask8)) & index_mask;

        // For internal nodes child_bits8 will just be 1 in each byte, so that bit will then be shifted into the high
        // byte of the node hit_mask (see CwBvhNode::intersect_ray). For leaf nodes it will have the unary encoded
        // leaf primitive count and that will be shifted into the lower 24 bits of node hit_mask.
        let child_mask = 0b0000011100000111000001110000011100000111000001110000011100000111;
        let child_bits8 = (meta8 >> 5) & child_mask;
        (child_bits8, bit_index8)
    }

    /// Get local child aabb position relative to the parent (still in quantized units).
    #[inline(always)]
    pub fn local_child_aabb(&self, child: usize) -> Aabb {
        Aabb::new(
            vec3a(
                self.child_min_x[child] as f32,
                self.child_min_y[child] as f32,
                self.child_min_z[child] as f32,
            ),
            vec3a(
                self.child_max_x[child] as f32,
                self.child_max_y[child] as f32,
                self.child_max_z[child] as f32,
            ),
        )
    }

    /// Get the child aabb dequantized into the parent's coordinate space
    /// (scaled by the extent and offset by the node origin `p`).
    #[inline(always)]
    pub fn child_aabb(&self, child: usize) -> Aabb {
        let e = self.compute_extent();
        let p: Vec3A = self.p.into();
        let mut local_aabb = self.local_child_aabb(child);
        local_aabb.min = local_aabb.min * e + p;
        local_aabb.max = local_aabb.max * e + p;
        local_aabb
    }

    /// The aabb of this node's full quantization grid.
    #[inline(always)]
    pub fn aabb(&self) -> Aabb {
        let e = self.compute_extent();
        let p: Vec3A = self.p.into();
        Aabb::new(p, p + e * NQ_SCALE)
    }

    /// Convert stored extent exponent into float vector.
    /// Shifting the u8 exponent into the f32 exponent field yields a power of two per axis.
    #[inline(always)]
    pub fn compute_extent(&self) -> Vec3A {
        vec3a(
            f32::from_bits((self.e[0] as u32) << 23),
            f32::from_bits((self.e[1] as u32) << 23),
            f32::from_bits((self.e[2] as u32) << 23),
        )
    }

    /// Whether the given child slot is a leaf (its `imask` bit is clear).
    // If the child is empty this will also return true. If needed also use CwBvh::is_child_empty().
    #[inline(always)]
    pub fn is_leaf(&self, child: usize) -> bool {
        (self.imask & (1 << child)) == 0
    }

    /// Whether the given child slot is unused (`child_meta` byte is all zeros).
    #[inline(always)]
    pub fn is_child_empty(&self, child: usize) -> bool {
        self.child_meta[child] == 0
    }

    /// Returns the primitive starting index and primitive count for the given child.
    #[inline(always)]
    pub fn child_primitives(&self, child: usize) -> (u32, u32) {
        let child_meta = self.child_meta[child];
        // Low 5 bits: offset from primitive_base_idx. High 3 bits: unary primitive
        // count (one set bit per primitive), so count_ones() is the count.
        let starting_index = self.primitive_base_idx + (self.child_meta[child] & 0b11111) as u32;
        let primitive_count = (child_meta & 0b11100000).count_ones();
        (starting_index, primitive_count)
    }

    /// Returns the node index of the given child.
    #[inline(always)]
    pub fn child_node_index(&self, child: usize) -> u32 {
        let child_meta = self.child_meta[child];
        // Internal-node meta stores (slot index + 24) in its low 5 bits.
        let slot_index = (child_meta & 0b11111) as usize - 24;
        // Internal children are packed contiguously from child_base_idx; count how many
        // internal children occupy slots before this one.
        let relative_index = (self.imask as u32 & !(0xffffffffu32 << slot_index)).count_ones();
        self.child_base_idx + relative_index
    }
}

/// Extracts byte `b` (0 = least significant) of `x`.
#[inline(always)]
pub fn extract_byte(x: u32, b: u32) -> u32 {
    (x >> (b * 8)) & 0xFFu32
}

/// Extracts byte `b` (0 = least significant) of `x`.
#[inline(always)]
pub fn extract_byte64(x: u64, b: usize) -> u32 {
    ((x >> (b * 8)) as u32) & 0xFFu32
}
--------------------------------------------------------------------------------
/src/cwbvh/simd.rs:
--------------------------------------------------------------------------------
use glam::*;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::{
    cwbvh::{
        node::{extract_byte64, EPSILON},
        CwBvhNode,
    },
    ray::Ray,
};

impl CwBvhNode {
    /// SSE2 counterpart of `CwBvhNode::intersect_ray_basic`: slab-tests 4 of the 8
    /// child AABBs per iteration. Returns the same hit mask format.
    ///
    /// NOTE(review): the intrinsics below assume SSE2 is available; the dispatcher in
    /// `intersect_ray` only selects this path under `target_feature = "sse2"` — confirm
    /// no other call sites bypass that gate.
    #[inline(always)]
    pub fn intersect_ray_simd(&self, ray: &Ray, oct_inv4: u32) -> u32 {
        // Fold the node's quantization grid into the ray (same as the basic path).
        let adj_ray_dir_inv = self.compute_extent() * ray.inv_direction;
        let adj_ray_origin = (Vec3A::from(self.p) - ray.origin) * ray.inv_direction;
        let mut hit_mask = 0u32;
        unsafe {
            // Broadcast the adjusted ray into per-axis 4-lane registers.
            let adj_ray_dir_inv_x = _mm_set1_ps(adj_ray_dir_inv.x);
            let adj_ray_dir_inv_y = _mm_set1_ps(adj_ray_dir_inv.y);
            let adj_ray_dir_inv_z = _mm_set1_ps(adj_ray_dir_inv.z);

            let adj_ray_orig_x = _mm_set1_ps(adj_ray_origin.x);
            let adj_ray_orig_y = _mm_set1_ps(adj_ray_origin.y);
            let adj_ray_orig_z = _mm_set1_ps(adj_ray_origin.z);

            // Per-axis direction sign: selects near/far quantized planes.
            let rdx = ray.direction.x < 0.0;
            let rdy = ray.direction.y < 0.0;
            let rdz = ray.direction.z < 0.0;

            let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);

            /// Widens 4 consecutive quantized u8 planes (half `i` of `v`) into an f32 lane register.
            #[inline(always)]
            fn get_q(v: &[u8; 8], i: usize) -> __m128 {
                // get_q is the most expensive part of intersect_simd
                // Tried version with _mm_cvtepu8_epi32 and _mm_cvtepi32_ps, it was a lot slower.
                // Tried transmuting v into a u64 and bit shifting, it was a lot slower.
                // SAFETY: callers only pass i in {0, 1}, so i * 4 + 3 <= 7 is in bounds of [u8; 8].
                unsafe {
                    _mm_set_ps(
                        *v.get_unchecked(i * 4 + 3) as f32,
                        *v.get_unchecked(i * 4 + 2) as f32,
                        *v.get_unchecked(i * 4 + 1) as f32,
                        *v.get_unchecked(i * 4) as f32,
                    )
                }
            }

            // Intersect 4 aabbs at a time:
            for i in 0..2 {
                // It's possible to select hi/lo outside the loop with child_min_x, etc... but that seems quite a bit slower
                // using _mm_blendv_ps or similar instead of `if rdx`, etc... is slower

                // Interleaving x, y, z like this is slightly faster than loading all at once. Tried using _mm_prefetch without luck
                let q_lo_x = get_q(&self.child_min_x, i);
                let q_hi_x = get_q(&self.child_max_x, i);
                let x_min = if rdx { q_hi_x } else { q_lo_x };
                let x_max = if rdx { q_lo_x } else { q_hi_x };
                // Tried using _mm_fmadd_ps, it was a lot slower
                let tmin_x = _mm_add_ps(_mm_mul_ps(x_min, adj_ray_dir_inv_x), adj_ray_orig_x);
                let tmax_x = _mm_add_ps(_mm_mul_ps(x_max, adj_ray_dir_inv_x), adj_ray_orig_x);

                let q_lo_y = get_q(&self.child_min_y, i);
                let q_hi_y = get_q(&self.child_max_y, i);
                let y_min = if rdy { q_hi_y } else { q_lo_y };
                let y_max = if rdy { q_lo_y } else { q_hi_y };
                let tmin_y = _mm_add_ps(_mm_mul_ps(y_min, adj_ray_dir_inv_y), adj_ray_orig_y);
                let tmax_y = _mm_add_ps(_mm_mul_ps(y_max, adj_ray_dir_inv_y), adj_ray_orig_y);

                let q_lo_z = get_q(&self.child_min_z, i);
                let q_hi_z = get_q(&self.child_max_z, i);
                let z_min = if rdz { q_hi_z } else { q_lo_z };
                let z_max = if rdz { q_lo_z } else { q_hi_z };
                let tmin_z = _mm_add_ps(_mm_mul_ps(z_min, adj_ray_dir_inv_z), adj_ray_orig_z);
                let tmax_z = _mm_add_ps(_mm_mul_ps(z_max, adj_ray_dir_inv_z), adj_ray_orig_z);

                // Tried using _mm_fmadd_ps, it was a lot slower
                // Compute intersection (slab test across the 4 lanes)
                let tmin = _mm_max_ps(tmin_x, _mm_max_ps(tmin_y, tmin_z));
                let tmax = _mm_min_ps(tmax_x, _mm_min_ps(tmax_y, tmax_z));
                let tmin = _mm_max_ps(tmin, _mm_set1_ps(EPSILON)); // TODO: should this use ray.tmin instead of EPSILON?
                let tmax = _mm_min_ps(tmax, _mm_set1_ps(ray.tmax));

                let intersected = _mm_cmple_ps(tmin, tmax);
                // One bit per lane: which of these 4 children were hit.
                let mask = _mm_movemask_ps(intersected);

                for j in 0..4 {
                    let offset = i * 4 + j;
                    if (mask & (1 << j)) != 0 {
                        let child_bits = extract_byte64(child_bits8, offset);
                        let bit_index = extract_byte64(bit_index8, offset);
                        hit_mask |= child_bits << bit_index;
                    }
                }
            }
        }
        hit_mask
    }
}
--------------------------------------------------------------------------------
/src/cwbvh/traverse_macro.rs:
--------------------------------------------------------------------------------
/// Traverse a CwBvh with custom node and primitive intersections.
/// I really didn't want to use a macro but it seems like everything else using closures/yielding is slower given
/// both generic node and primitive traversal.
///
/// # Parameters
/// - `$cwbvh`: `&CwBvh` The CwBvh to be traversed.
/// - `$node`: `&CwBvhNode` The current node in the BVH that is being traversed.
/// - `$state`: `Traversal` Mutable traversal state.
/// - `$node_intersection`: An expression that is executed for each node intersection during traversal.
/// It should test for intersection against the current `node`, making use of `state.oct_inv4` u32.
/// It should return a u32 `hitmask` of the node children hitmask corresponding to which nodes were intersected.
/// - `$primitive_intersection`: A code block that is executed for each primitive intersection.
/// It should read the current `state.primitive_id` u32. This is the index into the primitive indices for the
/// current primitive to be tested. Optionally use `break` to halt traversal.
///
/// # Example: Closest hit ray traversal
/// ```
/// use obvhs::{
///     cwbvh::{builder::build_cwbvh_from_tris, node::CwBvhNode},
///     ray::{Ray, RayHit},
///     test_util::geometry::{icosphere, PLANE},
///     triangle::Triangle,
///     BvhBuildParams,
///     traverse,
/// };
/// use glam::*;
/// use std::time::Duration;
///
/// let mut tris: Vec<Triangle> = Vec::new();
/// tris.extend(icosphere(1));
/// tris.extend(PLANE);
///
/// let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0));
///
/// let bvh = build_cwbvh_from_tris(&tris, BvhBuildParams::medium_build(), &mut Duration::default());
/// let mut hit = RayHit::none();
/// let mut traverse_ray = ray.clone();
/// let mut state = bvh.new_traversal(ray.direction);
/// let mut node;
/// traverse!(bvh, node, state,
///     // Node intersection:
///     node.intersect_ray(&traverse_ray, state.oct_inv4),
///     // Primitive intersection:
///     {
///         let t = tris[bvh.primitive_indices[state.primitive_id as usize] as usize].intersect(&traverse_ray);
///         if t < traverse_ray.tmax {
///             hit.primitive_id = state.primitive_id;
///             hit.t = t;
///             traverse_ray.tmax = t;
///         }
///     }
/// );
///
/// let did_hit = hit.t < ray.tmax;
/// assert!(did_hit);
/// assert!(bvh.primitive_indices[hit.primitive_id as usize] == 62);
/// ```
#[macro_export]
macro_rules! traverse {
    ($cwbvh:expr, $node:expr, $state:expr, $node_intersection:expr, $primitive_intersection:expr) => {{
        loop {
            // While the primitive group is not empty
            while $state.primitive_group.y != 0 {
                let local_primitive_index = $crate::cwbvh::firstbithigh($state.primitive_group.y);

                // Remove primitive from current_group
                $state.primitive_group.y &= !(1u32 << local_primitive_index);

                $state.primitive_id = $state.primitive_group.x + local_primitive_index;
                $primitive_intersection
            }
            $state.primitive_group = UVec2::ZERO;

            // If there's remaining nodes in the current group to check
            // (the high 8 bits of current_group.y hold the internal-node hits)
            if $state.current_group.y & 0xff000000 != 0 {
                let hits_imask = $state.current_group.y;

                let child_index_offset = $crate::cwbvh::firstbithigh(hits_imask);
                let child_index_base = $state.current_group.x;

                // Remove node from current_group
                $state.current_group.y &= !(1u32 << child_index_offset);

                // If the node group is not yet empty, push it on the stack
                if $state.current_group.y & 0xff000000 != 0 {
                    $state.stack.push($state.current_group);
                }

                // Undo the octant-based slot reordering, then count preceding internal
                // children to get this child's packed index (see CwBvhNode::child_node_index).
                let slot_index = (child_index_offset - 24) ^ ($state.oct_inv4 & 0xff);
                let relative_index = (hits_imask & !(0xffffffffu32 << slot_index)).count_ones();

                let child_node_index = child_index_base + relative_index;

                $node = &$cwbvh.nodes[child_node_index as usize];

                $state.hitmask = $node_intersection;

                $state.current_group.x = $node.child_base_idx;
                $state.primitive_group.x = $node.primitive_base_idx;

                // High 8 bits: hit internal children; low 24 bits: hit leaf primitives.
                $state.current_group.y = (&$state.hitmask & 0xff000000u32) | ($node.imask as u32);
                $state.primitive_group.y = &$state.hitmask & 0x00ffffffu32;
            } else {
                // Below is only needed when using triangle postponing, which would only be helpful on the
                // GPU (it helps reduce thread divergence). Also, this isn't compatible with traversal yielding.
                // $state.primitive_group = $state.current_group;
                $state.current_group = UVec2::ZERO;
            }

            // If there's no remaining nodes in the current group to check, pop it off the stack.
            if $state.primitive_group.y == 0 && ($state.current_group.y & 0xff000000) == 0 {
                // If the stack is empty, end traversal.
                if $state.stack.is_empty() {
                    $state.current_group.y = 0;
                    break;
                }

                $state.current_group = $state.stack.pop_fast();
            }
        }
    }};
}
--------------------------------------------------------------------------------
/src/heapstack.rs:
--------------------------------------------------------------------------------
//! A stack data structure implemented on the heap with adjustable capacity.

/// A stack data structure implemented on the heap with adjustable capacity.
///
/// This structure allows pushing and popping elements and will never automatically
/// allocate or deallocate. The only functions that will result in allocation are
/// `HeapStack::new_with_capacity` and `HeapStack::reserve`.
///
/// The elements must implement the `Clone` and `Default` traits.
// NOTE(review): `#[derive(Default)]` produces a stack with capacity 0, which conflicts
// with the non-zero-capacity assumption documented on `pop_fast` — confirm that
// defaulted stacks always get `reserve` called before use.
#[derive(Default)]
pub struct HeapStack<T: Clone + Default> {
    // Backing storage; data.len() is the stack's fixed capacity.
    data: Vec<T>,
    // Number of live elements; also the next slot written by `push`.
    index: usize,
}

impl<T: Clone + Default> HeapStack<T> {
    /// Creates a new `HeapStack` with the specified initial capacity.
    ///
    /// # Arguments
    /// * `cap` - The initial capacity of the stack. Must be greater than zero.
    ///
    /// # Returns
    /// A `HeapStack` with pre-allocated space for `cap` elements.
    ///
    /// # Panics
    /// This function will panic if `cap` is zero.
    #[inline(always)]
    pub fn new_with_capacity(cap: usize) -> Self {
        assert!(cap > 0);
        HeapStack {
            data: vec![Default::default(); cap],
            index: 0,
        }
    }

    /// Pushes a value onto the stack.
    ///
    /// # Arguments
    /// * `v` - The value to be pushed onto the stack.
    ///
    /// # Panics
    /// This function will panic if the stack is full.
    #[inline(always)]
    pub fn push(&mut self, v: T) {
        if self.index < self.data.len() {
            // SAFETY: index < data.len() was checked on the line above.
            *unsafe { self.data.get_unchecked_mut(self.index) } = v;
            self.index += 1;
        } else {
            panic!("Index out of bounds: the HeapStack is full (length: {}) and cannot accommodate more elements", self.data.len());
        }
    }

    /// Pops a value from the stack.
    ///
    /// # Returns
    /// `Some(&T)` referencing the popped slot if the stack is not empty, otherwise `None`.
    #[inline(always)]
    pub fn pop(&mut self) -> Option<&T> {
        if self.index > 0 {
            // index > 0 here, so this cannot underflow; saturating_sub is belt-and-braces.
            self.index = self.index.saturating_sub(1);
            Some(&self.data[self.index])
        } else {
            None
        }
    }

    /// Pops a value from the stack without checking bounds.
    ///
    /// This function is safe to call because a `HeapStack` cannot have a capacity of zero.
    /// However, if the stack is empty when this function is called, it will access what was previously
    /// the first value in the stack, which may not be the expected behavior.
    ///
    /// NOTE(review): a `HeapStack` created via `Default` *does* have capacity zero, so
    /// calling this on a defaulted, never-reserved stack would read out of bounds —
    /// confirm all call sites construct via `new_with_capacity`/`reserve`.
    ///
    /// # Returns
    /// The value at the top of the stack.
    #[inline(always)]
    pub fn pop_fast(&mut self) -> &T {
        self.index = self.index.saturating_sub(1);
        // SAFETY: relies on capacity > 0 (see doc comment above); index is < data.len()
        // whenever elements have been pushed.
        let v = unsafe { self.data.get_unchecked(self.index) };
        v
    }

    /// Returns the number of elements in the stack.
    ///
    /// # Returns
    /// The length of the stack.
    #[inline(always)]
    pub fn len(&self) -> usize {
        self.index
    }

    /// Returns true if the stack is empty.
    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        self.index == 0
    }

    /// Clears the stack, removing all elements.
    ///
    /// This operation does not deallocate the stack's capacity.
    #[inline(always)]
    pub fn clear(&mut self) {
        self.index = 0;
    }

    /// Reserves capacity for at least `cap` elements.
    ///
    /// # Arguments
    /// * `cap` - The desired capacity.
    ///
    /// If the new capacity is smaller than the current capacity, this function does nothing.
    #[inline(always)]
    pub fn reserve(&mut self, cap: usize) {
        if cap < self.data.len() {
            return;
        }
        self.data.resize(cap, Default::default());
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new_with_capacity() {
        let stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
        assert_eq!(stack.len(), 0);
        assert_eq!(stack.data.len(), 10);
    }

    #[test]
    fn test_push_and_pop() {
        let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
        stack.push(1);
        stack.push(2);
        stack.push(3);

        assert_eq!(stack.len(), 3);
        assert_eq!(stack.pop(), Some(&3));
        assert_eq!(stack.pop(), Some(&2));
        assert_eq!(stack.pop(), Some(&1));
        assert_eq!(stack.pop(), None);
    }

    #[test]
    #[should_panic(expected = "Index out of bounds: the HeapStack is full")]
    fn test_push_panic() {
        let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(2);
        stack.push(1);
        stack.push(2);
        stack.push(3); // This should panic
    }

    #[test]
    fn test_pop_fast() {
        let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
        stack.push(1);
        stack.push(2);
        stack.push(3);

        assert_eq!(*stack.pop_fast(), 3);
        assert_eq!(*stack.pop_fast(), 2);
        assert_eq!(*stack.pop_fast(), 1);
    }

    #[test]
    fn test_clear() {
        let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
        stack.push(1);
        stack.push(2);
        stack.push(3);

        stack.clear();
        assert_eq!(stack.len(), 0);
        assert_eq!(stack.pop(), None);
    }

    #[test]
    fn test_reserve() {
        let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(5);
        assert_eq!(stack.data.len(), 5);

        stack.reserve(10);
        assert_eq!(stack.data.len(), 10);

        stack.reserve(3); // Should not shrink the capacity
        assert_eq!(stack.data.len(), 10);
    }
}
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//! # BVH Construction and Traversal Library
//!
//! - [PLOC](https://meistdan.github.io/publications/ploc/paper.pdf) BVH2 builder with [Parallel Reinsertion](https://meistdan.github.io/publications/prbvh/paper.pdf) and spatial pre-splits.
//! - [CWBVH](https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf) An eight-way compressed wide BVH8 builder. Each BVH Node is compressed so that it takes up only 80 bytes per node.
//! - CPU traversal for both BVH2 and CWBVH (SIMD traversal, intersecting 4 nodes at a time)
//! - For GPU traversal example, see the [Tray Racing](https://github.com/DGriffin91/tray_racing) benchmark
//!
//! OBVHS optionally uses [rayon](https://github.com/rayon-rs/rayon) to parallelize building. Many parts of the building process are parallelized, but single threaded building speed has initially been the priority so there is still quite a bit of room for improvement in parallel building performance.
//!
//! ## Example
//!
//! ```
//! use glam::*;
//! use obvhs::{
//!     cwbvh::builder::build_cwbvh_from_tris,
//!     ray::{Ray, RayHit},
//!     test_util::geometry::{icosphere, PLANE},
//!     triangle::Triangle,
//!     BvhBuildParams,
//! };
//! use std::time::Duration;
//!
//! fn main() {
//!     // Build a scene with an icosphere and a plane
//!     // BVH primitives do not need to be triangles, the BVH builder is only concerned with AABBs.
//!
//!     // (With the exception of optional precise triangle aabb splitting)
//!     let mut tris: Vec<Triangle> = Vec::new();
//!     tris.extend(icosphere(1));
//!     tris.extend(PLANE);
//!
//!     // Build the BVH.
//!     // build_cwbvh_from_tris is just a helper that can build from BvhBuildParams and the
//!     // respective presets. Feel free to copy the contents of build_cwbvh_from_tris or
//!     // build_cwbvh. They are very straightforward. If you don't want to use Triangles as the
//!     // primitive, use build_cwbvh instead. build_cwbvh_from_tris just adds support for
//!     // splitting tris.
//!     let bvh = build_cwbvh_from_tris(
//!         &tris,
//!         BvhBuildParams::medium_build(),
//!         &mut Duration::default(),
//!     );
//!
//!     // Create a new ray
//!     let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0));
//!
//!     // Traverse the BVH, finding the closest hit.
//!     let mut ray_hit = RayHit::none();
//!     if bvh.ray_traverse(ray, &mut ray_hit, |ray, id| {
//!         // Use primitive_indices to look up the original primitive id.
//!         // (Could reorder tris per bvh.primitive_indices to avoid this lookup, see
//!         // cornell_box_cwbvh example)
//!         tris[bvh.primitive_indices[id] as usize].intersect(ray)
//!     }) {
//!         println!(
//!             "Hit Triangle {}",
//!             bvh.primitive_indices[ray_hit.primitive_id as usize]
//!         );
//!         println!("Distance to hit: {}", ray_hit.t);
//!     } else {
//!         println!("Miss");
//!     }
//! }
//! ```

use std::time::Duration;

use aabb::Aabb;
use glam::Mat4;
use ploc::{PlocSearchDistance, SortPrecision};
use triangle::Triangle;

pub mod aabb;
pub mod bvh2;
pub mod cwbvh;
pub mod heapstack;
pub mod ploc;
pub mod ray;
pub mod rt_triangle;
pub mod splits;
pub mod test_util;
pub mod triangle;

/// A trait for types that can be bounded by an axis-aligned bounding box (AABB). Used in Bvh2/CwBvh validation.
pub trait Boundable {
    /// Returns the AABB enclosing this value.
    fn aabb(&self) -> Aabb;
}

/// A trait for types that can have a matrix transform applied. Primarily for testing/examples.
pub trait Transformable {
    /// Applies `matrix` to this value in place.
    fn transform(&mut self, matrix: &Mat4);
}

/// Apply a function to each component of a type.
#[doc(hidden)]
pub trait PerComponent<C1, C2, Output> {
    fn per_comp(self, f: impl Fn(C1) -> C2) -> Output;
}

// Blanket impl for any 3-component type convertible to/from a 3-array.
impl<Input, C1, C2, Output> PerComponent<C1, C2, Output> for Input
where
    Input: Into<[C1; 3]>,
    Output: From<[C2; 3]>,
{
    /// Applies a function to each component of the input.
    fn per_comp(self, f: impl Fn(C1) -> C2) -> Output {
        let [x, y, z] = self.into();
        Output::from([f(x), f(y), f(z)])
    }
}

#[doc(hidden)]
pub trait VecExt {
    /// Computes the base 2 logarithm of each component of the vector.
    fn log2(self) -> Self;
    /// Computes the base 2 exponential of each component of the vector.
    fn exp2(self) -> Self;
}

impl VecExt for glam::Vec3 {
    /// Computes the base 2 logarithm of each component of the `Vec3` vector.
    fn log2(self) -> Self {
        self.per_comp(f32::log2)
    }

    /// Computes the base 2 exponential of each component of the `Vec3` vector.
    fn exp2(self) -> Self {
        self.per_comp(f32::exp2)
    }
}

impl VecExt for glam::Vec3A {
    /// Computes the base 2 logarithm of each component of the `Vec3A` vector.
    fn log2(self) -> Self {
        self.per_comp(f32::log2)
    }

    /// Computes the base 2 exponential of each component of the `Vec3A` vector.
    fn exp2(self) -> Self {
        self.per_comp(f32::exp2)
    }
}

/// A macro to measure and print the execution time of a block of code.
///
/// # Arguments
/// * `$label` - A string label to identify the code block being timed.
/// * `$($code:tt)*` - The code block whose execution time is to be measured.
///
/// # Usage
/// ```rust
/// use obvhs::timeit;
/// timeit!["example",
///     // code to measure
/// ];
/// ```
///
/// # Note
/// The macro purposefully doesn't include a scope so variables don't need to
/// be passed out of it. This allows it to be trivially added to existing code.
///
/// This macro only measures time when the `timeit` feature is enabled.
#[macro_export]
#[doc(hidden)]
macro_rules! timeit {
    [$label:expr, $($code:tt)*] => {
        #[cfg(feature = "timeit")]
        let timeit_start = std::time::Instant::now();
        $($code)*
        #[cfg(feature = "timeit")]
        println!("{:>8} {}", format!("{}", $crate::PrettyDuration(timeit_start.elapsed())), $label);
    };
}

/// A wrapper struct for `std::time::Duration` to provide pretty-printing of durations.
#[doc(hidden)]
pub struct PrettyDuration(pub Duration);

impl std::fmt::Display for PrettyDuration {
    /// Durations are formatted as follows:
    /// - If the duration is greater than or equal to 1 second, it is formatted in seconds (s).
    /// - If the duration is greater than or equal to 1 millisecond but less than 1 second, it is formatted in milliseconds (ms).
    /// - If the duration is less than 1 millisecond, it is formatted in microseconds (µs).
    ///
    /// In the case of seconds & milliseconds, the duration is always printed with a precision of two decimal places.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let duration = self.0;
        if duration.as_secs() > 0 {
            let seconds =
                duration.as_secs() as f64 + f64::from(duration.subsec_nanos()) / 1_000_000_000.0;
            write!(f, "{:.2}s ", seconds)
        } else if duration.subsec_millis() > 0 {
            let milliseconds =
                duration.as_millis() as f64 + f64::from(duration.subsec_micros() % 1_000) / 1_000.0;
            write!(f, "{:.2}ms", milliseconds)
        } else {
            let microseconds = duration.as_micros();
            write!(f, "{}µs", microseconds)
        }
    }
}

/// Add profile scope. Nesting the macro allows us to make the profiling crate optional.
#[doc(hidden)]
#[macro_export]
macro_rules! scope {
    [$label:expr] => {
        #[cfg(feature = "profile")]
        profiling::scope!($label);
    };
}

/// General build parameters for Bvh2 & CwBvh
pub struct BvhBuildParams {
    /// Split large tris into multiple AABBs
    pub pre_split: bool,
    /// In ploc, the number of nodes before and after the current one that are evaluated for pairing. 1 has a
    /// fast path in building and still results in decent quality BVHs esp. when paired with a bit of reinsertion.
    pub ploc_search_distance: PlocSearchDistance,
    /// Below this depth a search distance of 1 will be used for ploc.
    pub search_depth_threshold: usize,
    /// Typically 0..1: ratio of nodes considered as candidates for reinsertion. Above 1 to evaluate the whole set
    /// multiple times. A little goes a long way. Try 0.01 or even 0.001 before disabling for build performance.
    pub reinsertion_batch_ratio: f32,
    /// For Bvh2 only, a second pass of reinsertion after collapse. Since collapse reduces the node count,
    /// this reinsertion pass will be faster. 0 to disable. Relative to the initial reinsertion_batch_ratio.
    pub post_collapse_reinsertion_batch_ratio_multiplier: f32,
    /// Bits used for ploc radix sort.
    pub sort_precision: SortPrecision,
    /// Min 1 (CwBvh will clamp to max 3)
    pub max_prims_per_leaf: u32,
    /// Multiplier for traversal cost calculation during Bvh2 collapse (Does not affect CwBvh). A higher value will
    /// result in more primitives per leaf.
    pub collapse_traversal_cost: f32,
}

impl BvhBuildParams {
    /// Fastest preset: minimum PLOC search distance, no reinsertion, 1 primitive per leaf.
    pub fn fastest_build() -> Self {
        BvhBuildParams {
            pre_split: false,
            ploc_search_distance: PlocSearchDistance::Minimum,
            search_depth_threshold: 0,
            reinsertion_batch_ratio: 0.0,
            post_collapse_reinsertion_batch_ratio_multiplier: 0.0,
            sort_precision: SortPrecision::U64,
            max_prims_per_leaf: 1,
            collapse_traversal_cost: 1.0,
        }
    }
    /// Very fast preset: adds a small (1%) reinsertion pass and larger leaves.
    pub fn very_fast_build() -> Self {
        BvhBuildParams {
            pre_split: false,
            ploc_search_distance: PlocSearchDistance::Minimum,
            search_depth_threshold: 0,
            reinsertion_batch_ratio: 0.01,
            post_collapse_reinsertion_batch_ratio_multiplier: 0.0,
            sort_precision: SortPrecision::U64,
            max_prims_per_leaf: 8,
            collapse_traversal_cost: 3.0,
        }
    }
    /// Fast preset: low PLOC search distance and 2% reinsertion.
    pub fn fast_build() -> Self {
        BvhBuildParams {
            pre_split: false,
            ploc_search_distance: PlocSearchDistance::Low,
            search_depth_threshold: 2,
            reinsertion_batch_ratio: 0.02,
            post_collapse_reinsertion_batch_ratio_multiplier: 0.0,
            sort_precision: SortPrecision::U64,
            max_prims_per_leaf: 8,
            collapse_traversal_cost: 3.0,
        }
    }
    /// Tries to
be around the same build time as embree but with faster traversal 273 | pub fn medium_build() -> Self { 274 | BvhBuildParams { 275 | pre_split: false, 276 | ploc_search_distance: PlocSearchDistance::Medium, 277 | search_depth_threshold: 3, 278 | reinsertion_batch_ratio: 0.05, 279 | post_collapse_reinsertion_batch_ratio_multiplier: 2.0, 280 | sort_precision: SortPrecision::U64, 281 | max_prims_per_leaf: 8, 282 | collapse_traversal_cost: 3.0, 283 | } 284 | } 285 | pub fn slow_build() -> Self { 286 | BvhBuildParams { 287 | pre_split: true, 288 | ploc_search_distance: PlocSearchDistance::High, 289 | search_depth_threshold: 2, 290 | reinsertion_batch_ratio: 0.2, 291 | post_collapse_reinsertion_batch_ratio_multiplier: 2.0, 292 | sort_precision: SortPrecision::U128, 293 | max_prims_per_leaf: 8, 294 | collapse_traversal_cost: 3.0, 295 | } 296 | } 297 | pub fn very_slow_build() -> Self { 298 | BvhBuildParams { 299 | pre_split: true, 300 | ploc_search_distance: PlocSearchDistance::Medium, 301 | search_depth_threshold: 1, 302 | reinsertion_batch_ratio: 1.0, 303 | post_collapse_reinsertion_batch_ratio_multiplier: 1.0, 304 | sort_precision: SortPrecision::U128, 305 | max_prims_per_leaf: 8, 306 | collapse_traversal_cost: 3.0, 307 | } 308 | } 309 | } 310 | -------------------------------------------------------------------------------- /src/ploc/mod.rs: -------------------------------------------------------------------------------- 1 | //! PLOC (Parallel, Locally Ordered Clustering) BVH 2 Builder. 

pub mod morton;

// References for the PLOC algorithm this module implements:
// https://madmann91.github.io/2021/05/05/ploc-revisited.html
// https://github.com/meistdan/ploc/
// https://meistdan.github.io/publications/ploc/paper.pdf
// https://github.com/madmann91/bvh/blob/v1/include/bvh/locally_ordered_clustering_builder.hpp

use glam::DVec3;
use rdst::{RadixKey, RadixSort};

#[cfg(feature = "parallel")]
use rayon::iter::{
    IndexedParallelIterator, IntoParallelIterator, IntoParallelRefIterator,
    IntoParallelRefMutIterator, ParallelIterator,
};

use crate::ploc::morton::{morton_encode_u128_unorm, morton_encode_u64_unorm};
use crate::{
    aabb::Aabb,
    bvh2::{Bvh2, Bvh2Node},
};

impl PlocSearchDistance {
    /// Builds a `Bvh2` with PLOC using this search distance.
    ///
    /// # Arguments
    /// * `aabbs` - A list of bounding boxes. Should correspond to the number and order of primitives.
    /// * `indices` - The list indices used to index into the list of primitives. This allows for
    ///   flexibility in which primitives are included in the bvh and in what order they are referenced.
    ///   Often this would just be equivalent to: (0..aabbs.len() as u32).collect::<Vec<u32>>()
    /// * `sort_precision` - Bits used for ploc radix sort. More bits results in a more accurate but slower sort.
    /// * `search_depth_threshold` - Below this depth a search distance of 1 will be used. Set to 0 to bypass and
    ///   just use PlocSearchDistance. When trying to optimize build time it can be beneficial to limit the search
    ///   distance for the first few passes as that is when the largest number of primitives are being considered.
    ///   Thus pairs are initially found more quickly since it doesn't need to search as far, and they are also
    ///   found more often, since the nodes need to both agree to become paired. This also seems to occasionally
    ///   result in an overall better bvh structure.
    pub fn build(
        &self,
        aabbs: &[Aabb],
        indices: Vec<u32>,
        sort_precision: SortPrecision,
        search_depth_threshold: usize,
    ) -> Bvh2 {
        // Dispatch to a monomorphized build_ploc so the search window size is a compile time constant.
        match self {
            PlocSearchDistance::Minimum => {
                build_ploc::<1>(aabbs, indices, sort_precision, search_depth_threshold)
            }
            PlocSearchDistance::VeryLow => {
                build_ploc::<2>(aabbs, indices, sort_precision, search_depth_threshold)
            }
            PlocSearchDistance::Low => {
                build_ploc::<6>(aabbs, indices, sort_precision, search_depth_threshold)
            }
            PlocSearchDistance::Medium => {
                build_ploc::<14>(aabbs, indices, sort_precision, search_depth_threshold)
            }
            PlocSearchDistance::High => {
                build_ploc::<24>(aabbs, indices, sort_precision, search_depth_threshold)
            }
            PlocSearchDistance::VeryHigh => {
                build_ploc::<32>(aabbs, indices, sort_precision, search_depth_threshold)
            }
        }
    }
}

/// # Arguments
/// * `aabbs` - A list of bounding boxes. Should correspond to the number and order of primitives.
/// * `indices` - The list indices used to index into the list of primitives.
/// * `sort_precision` - Bits used for ploc radix sort. More bits results in a more accurate but slower sort.
/// * `search_depth_threshold` - Below this depth a search distance of 1 will be used. Set to 0 to bypass and
///   just use SEARCH_DISTANCE.
///
/// SEARCH_DISTANCE should be <= 32
pub fn build_ploc<const SEARCH_DISTANCE: usize>(
    aabbs: &[Aabb],
    indices: Vec<u32>,
    sort_precision: SortPrecision,
    search_depth_threshold: usize,
) -> Bvh2 {
    crate::scope!("build_ploc");

    let prim_count = aabbs.len();

    if prim_count == 0 {
        return Bvh2::default();
    }

    // Create one leaf node per primitive reference and accumulate the total bounds,
    // which are later used to normalize leaf centers into 0..1 for morton encoding.
    let mut init_leafs = Vec::with_capacity(prim_count);
    let mut total_aabb = Aabb::empty();

    for (i, prim_index) in indices.iter().enumerate() {
        let aabb = aabbs[i];
        debug_assert!(!aabb.min.is_nan());
        debug_assert!(!aabb.max.is_nan());
        total_aabb.extend(aabb.min);
        total_aabb.extend(aabb.max);
        init_leafs.push(Bvh2Node {
            aabb,
            prim_count: 1,
            first_index: *prim_index,
        });
    }

    let nodes = build_ploc_from_leafs::<SEARCH_DISTANCE>(
        init_leafs,
        total_aabb,
        sort_precision,
        search_depth_threshold,
    );

    Bvh2 {
        nodes,
        primitive_indices: indices,
        ..Default::default()
    }
}

/// Builds the Bvh2 node array from pre-made leaf nodes using PLOC.
/// `total_aabb` must bound all the leaf aabbs: it defines the 0..1 morton space.
pub fn build_ploc_from_leafs<const SEARCH_DISTANCE: usize>(
    mut current_nodes: Vec<Bvh2Node>,
    total_aabb: Aabb,
    sort_precision: SortPrecision,
    search_depth_threshold: usize,
) -> Vec<Bvh2Node> {
    crate::scope!("build_ploc_from_leafs");

    let prim_count = current_nodes.len();

    // Merge nodes until there is only one left
    let nodes_count = (2 * prim_count as i64 - 1).max(0) as usize;

    let scale = 1.0 / total_aabb.diagonal().as_dvec3();
    let offset = -total_aabb.min.as_dvec3() * scale;

    // Sort primitives according to their morton code
    sort_precision.sort_nodes(&mut current_nodes, scale, offset);

    let mut nodes = vec![Bvh2Node::default(); nodes_count];

    // Finished nodes are written from the back of `nodes` toward the front;
    // the root ends up at index 0 when the loop terminates.
    let mut insert_index = nodes_count;
    let mut next_nodes = Vec::with_capacity(prim_count);
    // merge[] stores relative offsets, so the search window must fit in an i8.
    assert!(i8::MAX as usize > SEARCH_DISTANCE);
    let mut merge: Vec<i8> = vec![0; prim_count];

    #[cfg(not(feature = "parallel"))]
    let mut cache = SearchCache::<SEARCH_DISTANCE>::default();

    let mut depth: usize = 0;
    while current_nodes.len() > 1 {
        if SEARCH_DISTANCE == 1 || depth < search_depth_threshold {
            // Fast path: only neighbors are considered, so a single sweep comparing each
            // node against the next can fill merge[] with -1 (pair left) or 1 (pair right).
            // TODO try making build_ploc_from_leafs_one that embeds this logic into
            // the main `while index < merge.len() {` loop (may not be faster, tbd)
            let mut last_cost = f32::MAX;
            let count = current_nodes.len() - 1;
            assert!(count < merge.len()); // Try to elide bounds check
            (0..count).for_each(|i| {
                let cost = current_nodes[i]
                    .aabb
                    .union(&current_nodes[i + 1].aabb)
                    .half_area();
                merge[i] = if last_cost < cost { -1 } else { 1 };
                last_cost = cost;
            });
            merge[current_nodes.len() - 1] = -1;
        } else {
            // General path: each node searches SEARCH_DISTANCE neighbors in both directions
            // for the cheapest pairing (parallel when the feature is enabled).
            #[cfg(feature = "parallel")]
            let iter = merge.par_iter_mut();
            #[cfg(not(feature = "parallel"))]
            let iter = merge.iter_mut();
            iter.enumerate()
                .take(current_nodes.len())
                .for_each(|(index, best)| {
                    #[cfg(feature = "parallel")]
                    {
                        *best = find_best_node_basic(index, &current_nodes, SEARCH_DISTANCE);
                    }
                    #[cfg(not(feature = "parallel"))]
                    {
                        *best = cache.find_best_node(index, &current_nodes);
                    }
                });
        };

        let mut index = 0;
        while index < current_nodes.len() {
            let index_offset = merge[index] as i64;
            let best_index = (index as i64 + index_offset) as usize;
            // The two nodes should be merged if they agree on their respective merge indices.
            if best_index as i64 + merge[best_index] as i64 != index as i64 {
                // If not, the current node should be kept for the next iteration
                next_nodes.push(current_nodes[index]);
                index += 1;
                continue;
            }

            // Since we only need to merge once, we only merge if the first index is less than the second.
            if best_index > index {
                index += 1;
                continue;
            }

            debug_assert_ne!(best_index, index);

            let left = current_nodes[index];
            let right = current_nodes[best_index];

            // Reserve space in the target array for the two children
            debug_assert!(insert_index >= 2);
            insert_index -= 2;

            // Create the parent node and place it in the array for the next iteration
            next_nodes.push(Bvh2Node {
                aabb: left.aabb.union(&right.aabb),
                prim_count: 0,
                first_index: insert_index as u32,
            });

            // Out of bounds error here could indicate NaN present in input aabb. Try running in debug mode.
            nodes[insert_index] = left;
            nodes[insert_index + 1] = right;

            if SEARCH_DISTANCE == 1 && index_offset == 1 {
                // If the search distance is only 1, and the next index was merged with this one,
                // we can skip the next index.
                // The code for this with the while loop seemed to also be slightly faster than:
                // for (index, best_index) in merge.iter().enumerate() {
                // even in the other cases. For some reason...
                index += 2;
            } else {
                index += 1;
            }
        }

        // Swap buffers: the surviving/merged nodes become the working set, reuse the other allocation.
        (next_nodes, current_nodes) = (current_nodes, next_nodes);
        next_nodes.clear();
        depth += 1;
    }

    // Place the root. saturating_sub handles the single-leaf case where no merge happened.
    insert_index = insert_index.saturating_sub(1);
    nodes[insert_index] = current_nodes[0];
    nodes
}

/// Scans up to `search_distance` nodes on either side of `index` and returns the
/// offset (relative to `index`) of the neighbor with the cheapest union half-area.
#[cfg(feature = "parallel")]
fn find_best_node_basic(index: usize, nodes: &[Bvh2Node], search_distance: usize) -> i8 {
    let mut best_node = index;
    let mut best_cost = f32::INFINITY;

    let begin = index - search_distance.min(index);
    let end = (index + search_distance + 1).min(nodes.len());

    let our_aabb = nodes[index].aabb;
    for other in begin..end {
        if other == index {
            continue;
        }
        let cost = our_aabb.union(&nodes[other].aabb).half_area();
        if cost < best_cost {
            best_node = other;
            best_cost = cost;
        }
    }

    (best_node as i64 - index as i64) as i8
}

/// In PLOC, the number of nodes before and after the current one that are evaluated for pairing.
/// Minimum (1) has a fast path in building and still results in decent quality BVHs especially
/// when paired with a bit of reinsertion.
#[derive(Default, Clone, Copy)]
pub enum PlocSearchDistance {
    /// 1
    Minimum,
    /// 2
    VeryLow,
    /// 6
    Low,
    #[default]
    /// 14
    Medium,
    /// 24
    High,
    /// 32
    VeryHigh,
}

impl From<u32> for PlocSearchDistance {
    /// Maps the exact numeric search distances back to their variants.
    /// Panics on any value other than 1, 2, 6, 14, 24, 32.
    fn from(value: u32) -> Self {
        match value {
            1 => PlocSearchDistance::Minimum,
            2 => PlocSearchDistance::VeryLow,
            6 => PlocSearchDistance::Low,
            14 => PlocSearchDistance::Medium,
            24 => PlocSearchDistance::High,
            32 => PlocSearchDistance::VeryHigh,
            _ => panic!("Invalid value for PlocSearchDistance: {}", value),
        }
    }
}

// Cache of pairing costs between nearby nodes, indexed modulo SEARCH_DISTANCE so the
// storage is a fixed-size ring over the sliding search window.
// Tried using a Vec it was ~30% slower with a search distance of 14.
// Tried making the Vec flat, used get_unchecked, etc... (without those it was ~80% slower)
pub struct SearchCache<const SEARCH_DISTANCE: usize>([[f32; SEARCH_DISTANCE]; SEARCH_DISTANCE]);

impl<const SEARCH_DISTANCE: usize> Default for SearchCache<SEARCH_DISTANCE> {
    fn default() -> Self {
        SearchCache([[0.0; SEARCH_DISTANCE]; SEARCH_DISTANCE])
    }
}

impl<const SEARCH_DISTANCE: usize> SearchCache<SEARCH_DISTANCE> {
    // Reads the cost computed when `other` (which is < index) looked forward at `index`.
    #[inline]
    #[cfg(not(feature = "parallel"))]
    fn back(&self, index: usize, other: usize) -> f32 {
        // Note: the compiler removes the bounds check due to the % SEARCH_DISTANCE
        self.0[other % SEARCH_DISTANCE][index % SEARCH_DISTANCE]
    }

    // Slot for the cost of pairing `index` with a node ahead of it (`other` > index).
    #[inline]
    #[cfg(not(feature = "parallel"))]
    fn front(&mut self, index: usize, other: usize) -> &mut f32 {
        &mut self.0[index % SEARCH_DISTANCE][other % SEARCH_DISTANCE]
    }

    #[cfg(not(feature = "parallel"))]
    fn find_best_node(&mut self, index: usize, nodes: &[Bvh2Node]) -> i8 {
        let mut best_node = index;
        let mut best_cost = f32::INFINITY;

        let begin = index - SEARCH_DISTANCE.min(index);
        let end = (index + SEARCH_DISTANCE + 1).min(nodes.len());

        // Costs against earlier nodes were already computed (and cached) when those
        // nodes looked forward, so only cache reads are needed here.
        for other in begin..index {
            let area = self.back(index, other);
            if area < best_cost {
                best_node = other;
                best_cost = area;
            }
        }

        // Forward costs are computed fresh and written into the cache for later indices.
        let our_aabb = nodes[index].aabb;
        ((index + 1)..end).for_each(|other| {
            let cost = our_aabb.union(&nodes[other].aabb).half_area();
            *self.front(index, other) = cost;
            if cost < best_cost {
                best_node = other;
                best_cost = cost;
            }
        });

        (best_node as i64 - index as i64) as i8
    }
}

// ---------------------
// --- SORTING NODES ---
// ---------------------

// TODO find a not terrible way to make this less repetitive

#[derive(Debug, Copy, Clone)]
pub enum SortPrecision {
    U128,
    U64,
}

impl SortPrecision {
    /// Sorts nodes by the morton code of their aabb center (normalized by scale/offset),
    /// at the precision selected by this variant.
    fn sort_nodes(&self, current_nodes: &mut Vec<Bvh2Node>, scale: DVec3, offset: DVec3) {
        match self {
            SortPrecision::U128 => sort_nodes_m128(current_nodes, scale, offset),
            SortPrecision::U64 => sort_nodes_m64(current_nodes, scale, offset),
        }
    }
}

// Pairs a node's original index with its 128 bit morton key for the radix sort.
#[derive(Clone, Copy)]
struct Morton128 {
    index: usize,
    code: u128,
}

impl RadixKey for Morton128 {
    const LEVELS: usize = 16;

    #[inline(always)]
    fn get_level(&self, level: usize) -> u8 {
        self.code.get_level(level)
    }
}

// Pairs a node's original index with its 64 bit morton key for the radix sort.
#[derive(Clone, Copy)]
struct Morton64 {
    index: usize,
    code: u64,
}

impl RadixKey for Morton64 {
    const LEVELS: usize = 8;

    #[inline(always)]
    fn get_level(&self, level: usize) -> u8 {
        self.code.get_level(level)
    }
}

fn sort_nodes_m128(current_nodes: &mut Vec<Bvh2Node>, scale: DVec3, offset: DVec3) {
    crate::scope!("sort_nodes_m128");

    #[cfg(feature = "parallel")]
    let iter = current_nodes.par_iter();
    #[cfg(not(feature = "parallel"))]
    let iter = current_nodes.iter();

    // Normalize each node center into 0..1 and encode it as a 128 bit morton key.
    let mut indexed_mortons: Vec<Morton128> = iter
        .enumerate()
        .map(|(index, leaf)| {
            let center = leaf.aabb.center().as_dvec3() * scale + offset;
            Morton128 {
                index,
                code: morton_encode_u128_unorm(center),
            }
        })
        .collect();

    indexed_mortons.radix_sort_unstable();

    #[cfg(feature = "parallel")]
    let iter = indexed_mortons.into_par_iter();
    #[cfg(not(feature = "parallel"))]
    let iter = indexed_mortons.iter();

    // Gather the nodes into morton order.
    *current_nodes = iter.map(|m| current_nodes[m.index]).collect();
}

fn sort_nodes_m64(current_nodes: &mut Vec<Bvh2Node>, scale: DVec3, offset: DVec3) {
    crate::scope!("sort_nodes_m64");

    #[cfg(feature = "parallel")]
    let iter = current_nodes.par_iter();
    #[cfg(not(feature = "parallel"))]
    let iter = current_nodes.iter();

    // Normalize each node center into 0..1 and encode it as a 64 bit morton key.
    let mut indexed_mortons: Vec<Morton64> = iter
        .enumerate()
        .map(|(index, leaf)| {
            let center = leaf.aabb.center().as_dvec3() * scale + offset;
            Morton64 {
                index,
                code: morton_encode_u64_unorm(center),
            }
        })
        .collect();

    indexed_mortons.radix_sort_unstable();

    #[cfg(feature = "parallel")]
    let iter = indexed_mortons.into_par_iter();
    #[cfg(not(feature = "parallel"))]
    let iter = indexed_mortons.iter();

    // Gather the nodes into morton order.
    *current_nodes = iter.map(|m| current_nodes[m.index]).collect();
}
-------------------------------------------------------------------------------- /src/ploc/morton.rs: --------------------------------------------------------------------------------
// http://www.graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
// TODO evaluate Extended Morton Codes for High Performance Bounding Volume Hierarchy Construction:
// https://www.dcgi.fel.cvut.cz/projects/emc/emc2017.pdf
// https://www.highperformancegraphics.org/wp-content/uploads/2017/Papers-Session3/HPG207_ExtendedMortonCodes.pdf

//--------------------------------------------------- 7 | // --- 10 bit resolution per channel morton curve --- 8 | //--------------------------------------------------- 9 | 10 | use glam::DVec3; 11 | 12 | #[inline] 13 | pub fn split_by_3_u32(a: u16) -> u32 { 14 | let mut x = a as u32 & 0x3ff; // we only look at the first 10 bits 15 | x = (x | x << 16) & 0x30000ff; 16 | x = (x | x << 8) & 0x300f00f; 17 | x = (x | x << 4) & 0x30c30c3; 18 | x = (x | x << 2) & 0x9249249; 19 | x 20 | } 21 | 22 | #[inline] 23 | /// Encode x,y,z position into a u64 morton value. 24 | /// Input should be 0..=2u16.pow(10) (or 1u16 << 10) 25 | /// (only included for reference, this isn't reasonably accurate for most BVHs) 26 | pub fn morton_encode_u32(x: u16, y: u16, z: u16) -> u32 { 27 | split_by_3_u32(x) | split_by_3_u32(y) << 1 | split_by_3_u32(z) << 2 28 | } 29 | 30 | //--------------------------------------------------- 31 | // --- 21 bit resolution per channel morton curve --- 32 | //--------------------------------------------------- 33 | 34 | #[inline] 35 | pub fn split_by_3_u64(a: u32) -> u64 { 36 | let mut x = a as u64 & 0x1fffff; // we only look at the first 21 bits 37 | x = (x | x << 32) & 0x1f00000000ffff; 38 | x = (x | x << 16) & 0x1f0000ff0000ff; 39 | x = (x | x << 8) & 0x100f00f00f00f00f; 40 | x = (x | x << 4) & 0x10c30c30c30c30c3; 41 | x = (x | x << 2) & 0x1249249249249249; 42 | x 43 | } 44 | 45 | #[inline] 46 | /// Encode x,y,z position into a u64 morton value. 47 | /// Input should be 0..=2u32.pow(21) (or 1u32 << 21) 48 | pub fn morton_encode_u64(x: u32, y: u32, z: u32) -> u64 { 49 | split_by_3_u64(x) | split_by_3_u64(y) << 1 | split_by_3_u64(z) << 2 50 | } 51 | 52 | #[inline] 53 | /// Encode a DVec3 position into a u128 morton value. 
54 | /// Input should be 0.0..=1.0 55 | pub fn morton_encode_u64_unorm(p: DVec3) -> u64 { 56 | let p = p * (1 << 21) as f64; 57 | morton_encode_u64(p.x as u32, p.y as u32, p.z as u32) 58 | } 59 | 60 | //--------------------------------------------------- 61 | // --- 42 bit resolution per channel morton curve --- 62 | //--------------------------------------------------- 63 | 64 | #[inline] 65 | pub fn split_by_3_u128(a: u64) -> u128 { 66 | let mut x = a as u128 & 0x3ffffffffff; // we only look at the first 42 bits 67 | x = (x | x << 64) & 0x3ff0000000000000000ffffffff; 68 | x = (x | x << 32) & 0x3ff00000000ffff00000000ffff; 69 | x = (x | x << 16) & 0x30000ff0000ff0000ff0000ff0000ff; 70 | x = (x | x << 8) & 0x300f00f00f00f00f00f00f00f00f00f; 71 | x = (x | x << 4) & 0x30c30c30c30c30c30c30c30c30c30c3; 72 | x = (x | x << 2) & 0x9249249249249249249249249249249; 73 | x 74 | } 75 | 76 | #[inline] 77 | /// Encode x,y,z position into a u128 morton value. 78 | /// Input should be 0..=2u64.pow(42) (or 1u64 << 42) 79 | pub fn morton_encode_u128(x: u64, y: u64, z: u64) -> u128 { 80 | split_by_3_u128(x) | split_by_3_u128(y) << 1 | split_by_3_u128(z) << 2 81 | } 82 | 83 | #[inline] 84 | /// Encode a DVec3 position into a u128 morton value. 85 | /// Input should be 0.0..=1.0 86 | pub fn morton_encode_u128_unorm(p: DVec3) -> u128 { 87 | let p = p * (1u64 << 42) as f64; 88 | morton_encode_u128(p.x as u64, p.y as u64, p.z as u64) 89 | } 90 | -------------------------------------------------------------------------------- /src/ray.rs: -------------------------------------------------------------------------------- 1 | //! A ray in 3D space. 2 | 3 | use glam::{vec3a, Vec3A}; 4 | 5 | /// Computes the inverse of `x` avoiding division by zero. 6 | pub fn safe_inverse(x: f32) -> f32 { 7 | if x.abs() <= f32::EPSILON { 8 | x.signum() / f32::EPSILON 9 | } else { 10 | 1.0 / x 11 | } 12 | } 13 | 14 | /// A struct representing a ray in 3D space. 
#[derive(Clone, Copy, Debug)]
#[repr(C)]
pub struct Ray {
    /// The starting point of the ray.
    pub origin: Vec3A,
    /// The direction vector of the ray.
    pub direction: Vec3A,
    /// The inverse of the direction vector components.
    /// Used to avoid division in ray/aabb tests. Seems to improve performance in
    /// some cases on the cpu, but not the gpu in some others.
    pub inv_direction: Vec3A,
    /// The minimum `t` (distance) value for intersection tests.
    pub tmin: f32,
    /// The maximum `t` (distance) value for intersection tests.
    pub tmax: f32,
}

impl Ray {
    /// Creates a new `Ray` with the given origin, direction, and `t` (distance) range.
    /// The per-component safe inverse of the direction is precomputed here.
    pub fn new(origin: Vec3A, direction: Vec3A, min: f32, max: f32) -> Self {
        let ray = Ray {
            origin,
            direction,
            inv_direction: vec3a(
                safe_inverse(direction.x),
                safe_inverse(direction.y),
                safe_inverse(direction.z),
            ),
            tmin: min,
            tmax: max,
        };

        debug_assert!(ray.inv_direction.is_finite());
        debug_assert!(ray.direction.is_finite());
        debug_assert!(origin.is_finite());

        ray
    }

    /// Creates a new infinite `Ray` with the given origin, direction.
    pub fn new_inf(origin: Vec3A, direction: Vec3A) -> Self {
        Self::new(origin, direction, 0.0, f32::INFINITY)
    }
}

/// A struct representing a hit record in ray tracing.
/// A `Hit` record contains the IDs of the primitive, geometry and instance that
/// were hit, as well as the `t` (distance) value at which the hit occurred.
#[derive(Clone, Copy, Debug)]
#[repr(C)]
pub struct RayHit {
    pub primitive_id: u32,
    pub geometry_id: u32,
    pub instance_id: u32,
    pub t: f32,
}

/// Sentinel id used by `RayHit::none` for all id fields.
pub const INVALID_ID: u32 = u32::MAX;

impl RayHit {
    /// Creates a new `RayHit` instance representing no hit.
    pub fn none() -> Self {
        Self {
            primitive_id: INVALID_ID,
            geometry_id: INVALID_ID,
            instance_id: INVALID_ID,
            t: f32::INFINITY,
        }
    }
}
-------------------------------------------------------------------------------- /src/rt_triangle.rs: --------------------------------------------------------------------------------
//! Triangle types optimized for ray intersection performance.

use bytemuck::{Pod, Zeroable};
use glam::*;

use half::f16;

use crate::{aabb::Aabb, ray::Ray, triangle::Triangle, Boundable};

#[derive(Clone, Copy, Default, PartialEq)]
#[repr(C)]
/// A compressed 3D triangle optimized for GPU ray intersection performance.
pub struct RtCompressedTriangle {
    /// Base vertex
    pub v0: [f32; 3],
    /// Edges 1 & 2 encoded as IEEE 754 f16 `v1 - v0, v2 - v0`
    pub e1_e2: [u16; 6],
}

// SAFETY: repr(C) struct of plain f32/u16 arrays; 12 + 12 bytes with align 4
// leaves no padding, every bit pattern is valid.
unsafe impl Pod for RtCompressedTriangle {}
unsafe impl Zeroable for RtCompressedTriangle {}

impl From<&Triangle> for RtCompressedTriangle {
    #[inline(always)]
    fn from(tri: &Triangle) -> Self {
        RtCompressedTriangle::new(tri.v0, tri.v1, tri.v2)
    }
}

impl RtCompressedTriangle {
    /// Builds the compressed form: full-precision base vertex plus f16 edge vectors.
    #[inline(always)]
    pub fn new(v0: Vec3A, v1: Vec3A, v2: Vec3A) -> Self {
        let e1 = v1 - v0;
        let e2 = v2 - v0;

        Self {
            v0: [v0.x, v0.y, v0.z],
            // Interleaved as [e1.x, e2.x, e1.y, e2.y, e1.z, e2.z].
            e1_e2: [
                f16::from_f32(e1.x).to_bits(),
                f16::from_f32(e2.x).to_bits(),
                f16::from_f32(e1.y).to_bits(),
                f16::from_f32(e2.y).to_bits(),
                f16::from_f32(e1.z).to_bits(),
                f16::from_f32(e2.z).to_bits(),
            ],
        }
    }

    /// Reconstructs the (lossy) vertices from the base vertex and decoded edges.
    #[inline(always)]
    pub fn vertices(&self) -> [Vec3A; 3] {
        let (v0, e1, e2) = self.unpack();
        let v1 = v0 + e1;
        let v2 = v0 + e2;
        [v0, v1, v2]
    }

    #[inline(always)]
    pub fn aabb(&self) -> Aabb {
        Aabb::from_points(&self.vertices())
    }

    #[inline(always)]
    /// Computes the unit geometric normal from the decoded edges (zero if degenerate).
    pub fn compute_normal(&self) -> Vec3A {
        let (_v0, e1, e2) = self.unpack();
        ((e1).cross(e2)).normalize_or_zero()
    }

    /// Find the distance (t) of the intersection of the `Ray` and this Triangle.
    /// Returns f32::INFINITY for miss.
    #[inline(always)]
    pub fn intersect(&self, ray: &Ray) -> f32 {
        // TODO not very water tight from the back side in some contexts (tris with edges at 0,0,0 show 1px gap)
        // Find out if this is typical of Möller
        // Based on Fast Minimum Storage Ray Triangle Intersection by T. Möller and B. Trumbore
        // https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html

        let (v0, e1, e2) = self.unpack();
        let ng = (-e1).cross(e2);

        let cull_backface = false;

        let c = v0 - ray.origin;
        let r = ray.direction.cross(c);
        let inv_det = 1.0 / ng.dot(ray.direction);

        // Barycentric coordinates; the hit is inside the triangle iff u, v, w are all >= 0.
        let u = r.dot(e2) * inv_det;
        let v = r.dot(-e1) * inv_det;
        let w = 1.0 - u - v;

        // Original:
        //let hit = u >= 0.0 && v >= 0.0 && w >= 0.0;
        //let valid = if cull_backface {
        //    inv_det > 0.0 && hit
        //} else {
        //    inv_det != 0.0 && hit
        //};

        // Branchless sign test: OR the bit patterns and check the sign bit (0x8000_0000).
        // Note: differs in that if v == -0.0, for example will cause valid to be false
        let hit = u.to_bits() | v.to_bits() | w.to_bits();
        let valid = if cull_backface {
            (inv_det.to_bits() | hit) & 0x8000_0000 == 0
        } else {
            inv_det != 0.0 && hit & 0x8000_0000 == 0
        };

        if valid {
            let t = ng.dot(c) * inv_det;
            if t >= ray.tmin && t <= ray.tmax {
                return t;
            }
        }

        f32::INFINITY
    }

    /// Decodes the f16 edge vectors. Returns `(v0, e1, e2)` where `e1 = v1 - v0`, `e2 = v2 - v0`.
    pub fn unpack(&self) -> (Vec3A, Vec3A, Vec3A) {
        let v0: Vec3A = self.v0.into();
        let e1x = f16::from_bits(self.e1_e2[0]).to_f32();
        let e2x = f16::from_bits(self.e1_e2[1]).to_f32();
        let e1y = f16::from_bits(self.e1_e2[2]).to_f32();
        let e2y = f16::from_bits(self.e1_e2[3]).to_f32();
        let e1z = f16::from_bits(self.e1_e2[4]).to_f32();
        let e2z = f16::from_bits(self.e1_e2[5]).to_f32();
        let e1 = Vec3A::new(e1x, e1y, e1z);
        let e2 = Vec3A::new(e2x, e2y, e2z);
        (v0, e1, e2)
    }

    /// Computes the (u, v) barycentric coordinates of the ray/plane intersection.
    #[inline(always)]
    pub fn compute_barycentric(&self, ray: &Ray) -> Vec2 {
        let (v0, e1, e2) = self.unpack();
        let ng = (-e1).cross(e2);
        let r = ray.direction.cross(v0 - ray.origin);
        vec2(r.dot(e2), r.dot(-e1)) / ng.dot(ray.direction)
    }
}

impl Boundable for RtCompressedTriangle {
    fn aabb(&self) -> Aabb {
        self.aabb()
    }
}

#[derive(Clone, Copy, Default, PartialEq)]
/// A 3D triangle optimized for CPU ray intersection performance.
pub struct RtTriangle {
    /// Base vertex
    pub v0: Vec3A,
    /// Edge 1 `v0 - v1`
    pub e1: Vec3A,
    /// Edge 2 `v2 - v0`
    pub e2: Vec3A,
    /// Geometric normal `e1.cross(e2)`.
    /// Optimized for intersection.
    /// Needs to be inverted for typical normal.
    pub ng: Vec3A,
}

impl From<&Triangle> for RtTriangle {
    #[inline(always)]
    fn from(tri: &Triangle) -> Self {
        RtTriangle::new(tri.v0, tri.v1, tri.v2)
    }
}

// Uses layout from https://github.com/madmann91/bvh/blob/master/src/bvh/v2/tri.h#L36
// to optimize for intersection.
// On the CPU this is a bit faster than e1 = v1 - v0; e2 = v2 - v0;
impl RtTriangle {
    /// Builds the precomputed form. Note `e1 = v0 - v1` (negated edge) per the layout above.
    #[inline(always)]
    pub fn new(v0: Vec3A, v1: Vec3A, v2: Vec3A) -> Self {
        let e1 = v0 - v1;
        let e2 = v2 - v0;
        Self {
            v0,
            e1,
            e2,
            ng: e1.cross(e2),
        }
    }

    // Reconstructs the vertices (v1 = v0 - e1 because e1 is stored negated).
    #[inline(always)]
    fn vertices(&self) -> [Vec3A; 3] {
        [self.v0, self.v0 - self.e1, self.v0 + self.e2]
    }

    #[inline(always)]
    pub fn aabb(&self) -> Aabb {
        Aabb::from_points(&self.vertices())
    }

    /// Unit normal; `ng` is stored negated for intersection, so flip it here.
    #[inline(always)]
    pub fn compute_normal(&self) -> Vec3A {
        -self.ng.normalize_or_zero()
    }

    /// Find the distance (t) of the intersection of the `Ray` and this Triangle.
    /// Returns f32::INFINITY for miss.
    #[inline(always)]
    pub fn intersect(&self, ray: &Ray) -> f32 {
        // TODO not very water tight from the back side in some contexts (tris with edges at 0,0,0 show 1px gap)
        // Find out if this is typical of Möller
        // Based on Fast Minimum Storage Ray Triangle Intersection by T. Möller and B. Trumbore
        // https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html
        let cull_backface = false;

        let c = self.v0 - ray.origin;
        let r = ray.direction.cross(c);
        let inv_det = 1.0 / self.ng.dot(ray.direction);

        // Barycentric coordinates; inside the triangle iff u, v, w are all >= 0.
        let u = r.dot(self.e2) * inv_det;
        let v = r.dot(self.e1) * inv_det;
        let w = 1.0 - u - v;

        // Original:
        //let hit = u >= 0.0 && v >= 0.0 && w >= 0.0;
        //let valid = if cull_backface {
        //    inv_det > 0.0 && hit
        //} else {
        //    inv_det != 0.0 && hit
        //};

        // Branchless sign test: OR the bit patterns and check the sign bit (0x8000_0000).
        // Note: differs in that if v == -0.0, for example will cause valid to be false
        let hit = u.to_bits() | v.to_bits() | w.to_bits();
        let valid = if cull_backface {
            (inv_det.to_bits() | hit) & 0x8000_0000 == 0
        } else {
            inv_det != 0.0 && hit & 0x8000_0000 == 0
        };

        if valid {
            let t = self.ng.dot(c) * inv_det;
            if t >= ray.tmin && t <= ray.tmax {
                return t;
            }
        }

        f32::INFINITY
    }

    /// Möller intersection in the style of embree's intersector (division deferred:
    /// edge tests are done against `abs_den` instead of normalizing u/v first).
    // https://github.com/RenderKit/embree/blob/0c236df6f31a8e9c8a48803dada333e9ea0029a6/kernels/geometry/triangle_intersector_moeller.h#L9
    #[cfg(all(
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "sse2"
    ))]
    pub fn intersect_embree(&self, ray: &Ray) -> f32 {
        // Not watertight from the front side? Looks similar to what intersect() above looks like from the back side.

        // This uses the orientation from https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html

        let cull_backface = false;

        // Calculate denominator
        let o = ray.origin;
        let d = ray.direction;
        let c = self.v0 - o;
        let r = c.cross(d);
        let den = (-self.ng).dot(d);
        let abs_den = den.abs();

        // Extracts just the sign bit of x as an f32 bit pattern.
        fn signmsk(x: f32) -> f32 {
            #[cfg(target_arch = "x86")]
            use std::arch::x86::*;
            #[cfg(target_arch = "x86_64")]
            use std::arch::x86_64::*;
            // SAFETY: SSE/SSE2 intrinsics only; availability is guaranteed by the
            // target_feature = "sse2" cfg on the enclosing function.
            unsafe {
                let mask = _mm_set1_ps(-0.0);
                let x_vec = _mm_set_ss(x);
                let sign_bit = _mm_and_ps(x_vec, mask);
                _mm_cvtss_f32(sign_bit)
                //_mm_cvtss_f32(_mm_and_ps(
                //    _mm_set_ss(x),
                //    _mm_castsi128_ps(_mm_set1_epi32(-2147483648i32)),
                //))
            }
        }

        let sgn_den = signmsk(den).to_bits();

        // Perform edge tests. XOR with sgn_den flips u/v signs when the denominator is negative.
        let u = f32::from_bits(r.dot(self.e2).to_bits() ^ sgn_den);
        let v = f32::from_bits(r.dot(self.e1).to_bits() ^ sgn_den);
        // TODO simd uv?

        // Perform backface culling
        // Original:
        //let valid = if cull_backface {
        //    den < 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den
        //} else {
        //    den != 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den
        //};

        let w = abs_den - u - v;
        let valid = if cull_backface {
            ((-den).to_bits() | u.to_bits() | v.to_bits() | (abs_den - u - v).to_bits())
                & 0x8000_0000
                == 0
        } else {
            den != 0.0 && ((u.to_bits() | v.to_bits() | w.to_bits()) & 0x8000_0000) == 0
        };

        if !valid {
            return f32::INFINITY;
        }

        // Perform depth test. t here is still scaled by abs_den, so compare against scaled tmin/tmax.
        let t = f32::from_bits((-self.ng).dot(c).to_bits() ^ sgn_den);

        if abs_den * ray.tmin < t && t <= abs_den * ray.tmax {
            return t;
        }

        f32::INFINITY
    }

    /// Computes the (u, v) barycentric coordinates of the ray/plane intersection.
    #[inline(always)]
    pub fn compute_barycentric(&self, ray: &Ray) -> Vec2 {
        let r = ray.direction.cross(self.v0 - ray.origin);
        vec2(r.dot(self.e2), r.dot(self.e1)) / self.ng.dot(ray.direction)
    }
}

impl Boundable for RtTriangle {
    fn aabb(&self) -> Aabb {
        self.aabb()
    }
}
-------------------------------------------------------------------------------- /src/splits.rs: --------------------------------------------------------------------------------
//! Split large triangles into multiple smaller Aabbs.

use glam::Vec3A;

use crate::{aabb::Aabb, triangle::Triangle};

/// Splits large triangles into multiple smaller Aabbs. Fits the new aabbs tightly to the triangle.
/// Note: This will result in more aabbs than triangles. The indices Vec will grow with the
/// added Aabb's with the respective mapping back to the initial list of triangles.
10 | /// # Arguments 11 | /// * `avg_half_area` - The average half area of the Triangles 12 | /// * `largest_half_area` - The largest half area of the Triangles 13 | /// This is tuned to try to create splits conservatively enough that it generally 14 | /// wont result in lower traversal performance across a variety of scenes. 15 | /// (Naive splitting can result in lower traversal performance in some scenes) 16 | pub fn split_aabbs_preset( 17 | aabbs: &mut Vec, 18 | indices: &mut Vec, 19 | triangles: &[Triangle], 20 | avg_half_area: f32, 21 | largest_half_area: f32, 22 | ) { 23 | split_aabbs_precise( 24 | aabbs, 25 | indices, 26 | triangles, 27 | avg_half_area * 3.0, 28 | (avg_half_area * 4.0).max(avg_half_area * 0.9 + largest_half_area * 0.1), 29 | 1.8, 30 | 1.6, 31 | 12, 32 | 12, 33 | ); 34 | } 35 | 36 | /// Splits large triangles into multiple smaller Aabbs. Fits the new aabbs tightly to the triangle. 37 | /// Note: This will result in more aabbs than triangles. The indices Vec will have grow with the 38 | /// added Aabb's with the respective mapping back to the initial list of triangles. 39 | /// # Arguments 40 | /// * `area_thresh_low` - Triangles with aabb half areas below this will not be considered for splitting. 41 | /// * `area_thresh_high` - If the low split factor condition is not met then area_thresh_high > old_cost 42 | /// must be met in addition to best_cost * split_factor_high < old_cost in order for the split to occur 43 | /// * `split_factor_low` - If the resulting smallest aabb half area (best_cost) multiplied by this factor is 44 | /// lower than the original cost the best split will be used (best_cost * split_factor_low < old_cost) 45 | /// (area_thresh_high > old_cost && best_cost * split_factor_high < old_cost) 46 | /// * `max_iterations` - Number of times to evaluate the entire set of aabbs/triangles (including the newly added splits) 47 | /// * `split_tests` - Number of places try splitting the triangle at. 
48 | #[allow(clippy::too_many_arguments)] 49 | pub fn split_aabbs_precise( 50 | aabbs: &mut Vec, 51 | indices: &mut Vec, 52 | triangles: &[Triangle], 53 | area_thresh_low: f32, 54 | area_thresh_high: f32, 55 | split_factor_low: f32, 56 | split_factor_high: f32, 57 | max_iterations: u32, 58 | split_tests: u32, 59 | ) { 60 | crate::scope!("split_aabbs_precise"); 61 | 62 | let mut candidates = Vec::new(); 63 | 64 | for (i, aabb) in aabbs.iter().enumerate() { 65 | if aabb.half_area() > area_thresh_low { 66 | candidates.push(i) 67 | } 68 | } 69 | 70 | let mut old_candidates_len = candidates.len(); 71 | for _ in 0..max_iterations { 72 | for i in 0..candidates.len() { 73 | let aabb = &mut aabbs[candidates[i]]; 74 | let index = indices[candidates[i]]; 75 | let axis: usize = aabb.largest_axis(); 76 | 77 | let tri = triangles[index as usize]; 78 | 79 | let mut best_cost = f32::MAX; 80 | let mut left = *aabb; 81 | let mut right = *aabb; 82 | 83 | // TODO optimization: create multiple splits simultaneously 84 | for i in 1..split_tests { 85 | let n = i as f32 / split_tests as f32; 86 | let pos = aabb.min[axis] * n + aabb.max[axis] * (1.0 - n); 87 | 88 | let mut tmp_left = *aabb; 89 | let mut tmp_right = *aabb; 90 | 91 | tmp_left.max[axis] = pos; 92 | tmp_right.min[axis] = pos; 93 | let verts = [tri.v0, tri.v1, tri.v2, tri.v0]; 94 | let (t_left, t_right) = split_triangle(axis as u32, pos, verts); 95 | tmp_left = t_left.intersection(&tmp_left); 96 | tmp_right = t_right.intersection(&tmp_right); 97 | let area = tmp_left.half_area() + tmp_right.half_area(); 98 | if area < best_cost { 99 | best_cost = area; 100 | left = tmp_left; 101 | right = tmp_right; 102 | } 103 | } 104 | 105 | let old_cost = aabb.half_area(); 106 | 107 | if (area_thresh_high > old_cost && best_cost * split_factor_high < old_cost) 108 | || best_cost * split_factor_low < old_cost 109 | { 110 | *aabb = left; 111 | candidates.push(aabbs.len()); 112 | aabbs.push(right); 113 | indices.push(index); 114 | } 115 | } 116 
| if old_candidates_len == candidates.len() { 117 | break; 118 | } else { 119 | candidates.retain(|c| aabbs[*c].half_area() > area_thresh_low); 120 | old_candidates_len = candidates.len(); 121 | } 122 | } 123 | } 124 | 125 | /// Based on , 126 | /// but with the "current bounds" moved out. 127 | pub fn split_triangle(dim: u32, pos: f32, v: [Vec3A; 4]) -> (Aabb, Aabb) { 128 | let mut left = Aabb::INVALID; 129 | let mut right = Aabb::INVALID; 130 | 131 | // Clip triangle to left and right box by processing all edges 132 | for i in 0..3 { 133 | let v0 = v[i]; 134 | let v1 = v[i + 1]; 135 | let v0d = v0[dim as usize]; 136 | let v1d = v1[dim as usize]; 137 | 138 | if v0d <= pos { 139 | // This point is on left side 140 | left.extend(v0); 141 | } 142 | if v0d >= pos { 143 | // This point is on right side 144 | right.extend(v0); 145 | } 146 | 147 | // The edge crosses the splitting location 148 | if (v0d < pos && pos < v1d) || (v1d < pos && pos < v0d) { 149 | debug_assert!((v1d - v0d) != 0.0); 150 | let inv_length = 1.0 / (v1d - v0d); 151 | let c = Vec3A::mul_add(Vec3A::splat((pos - v0d) * inv_length), v1 - v0, v0); 152 | left.extend(c); 153 | right.extend(c); 154 | } 155 | } 156 | 157 | (left, right) 158 | } 159 | -------------------------------------------------------------------------------- /src/test_util.rs: -------------------------------------------------------------------------------- 1 | //! Meshes, generators, sampling functions, etc.. for basic testing & examples. 
2 | 3 | pub mod sampling { 4 | use std::f32::consts::TAU; 5 | 6 | use glam::*; 7 | 8 | #[inline(always)] 9 | pub fn uhash(a: u32, b: u32) -> u32 { 10 | let mut x = (a.overflowing_mul(1597334673).0) ^ (b.overflowing_mul(3812015801).0); 11 | // from https://nullprogram.com/blog/2018/07/31/ 12 | x = x ^ (x >> 16); 13 | x = x.overflowing_mul(0x7feb352d).0; 14 | x = x ^ (x >> 15); 15 | x = x.overflowing_mul(0x846ca68b).0; 16 | x = x ^ (x >> 16); 17 | x 18 | } 19 | 20 | #[inline(always)] 21 | pub fn unormf(n: u32) -> f32 { 22 | n as f32 * (1.0 / 0xffffffffu32 as f32) 23 | } 24 | 25 | #[inline(always)] 26 | pub fn hash_noise(coord: UVec2, frame: u32) -> f32 { 27 | let urnd = uhash(coord.x, (coord.y << 11) + frame); 28 | unormf(urnd) 29 | } 30 | 31 | // https://jcgt.org/published/0006/01/01/paper.pdf 32 | #[inline(always)] 33 | pub fn build_orthonormal_basis(n: Vec3A) -> Mat3 { 34 | let sign = n.z.signum(); 35 | let a = -1.0 / (sign + n.z); 36 | let b = n.x * n.y * a; 37 | 38 | mat3( 39 | vec3(1.0 + sign * n.x * n.x * a, sign * b, -sign * n.x), 40 | vec3(b, sign + n.y * n.y * a, -n.y), 41 | n.into(), 42 | ) 43 | } 44 | 45 | #[inline(always)] 46 | pub fn cosine_sample_hemisphere(urand: Vec2) -> Vec3A { 47 | let r = urand.x.sqrt(); 48 | let theta = urand.y * TAU; 49 | vec3a( 50 | r * theta.cos(), 51 | r * theta.sin(), 52 | 0.0f32.max(1.0 - urand.x).sqrt(), 53 | ) 54 | } 55 | 56 | #[inline(always)] 57 | pub fn uniform_sample_sphere(urand: Vec2) -> Vec3A { 58 | let z = 1.0 - 2.0 * urand.x; 59 | let r = (1.0 - z * z).sqrt(); 60 | let theta = urand.y * TAU; 61 | vec3a(r * theta.cos(), r * theta.sin(), z) 62 | } 63 | 64 | #[inline(always)] 65 | pub fn uniform_sample_cone(urand: Vec2, cos_theta_max: f32) -> Vec3A { 66 | let cos_theta = (1.0 - urand.x) + urand.x * cos_theta_max; 67 | let sin_theta = (1.0 - cos_theta * cos_theta).clamp(0.0, 1.0).sqrt(); 68 | let phi: f32 = urand.y * TAU; 69 | vec3a(sin_theta * phi.cos(), sin_theta * phi.sin(), cos_theta) 70 | } 71 | 72 | 
#[inline(always)] 73 | pub fn smoothstep(e0: f32, e1: f32, x: f32) -> f32 { 74 | let t = ((x - e0) / (e1 - e0)).clamp(0.0, 1.0); 75 | t * t * (3.0 - 2.0 * t) 76 | } 77 | 78 | #[inline(always)] 79 | fn cubic(v0: f32, v1: f32, v2: f32, v3: f32, x: f32) -> f32 { 80 | let p = (v3 - v2) - (v0 - v1); 81 | let q = (v0 - v1) - p; 82 | let r = v2 - v0; 83 | let s = v1; 84 | p * x.powi(3) + q * x.powi(2) + r * x + s 85 | } 86 | 87 | #[inline(always)] 88 | pub fn bicubic_noise(coord: Vec2, seed: u32) -> f32 { 89 | let ix = coord.x.floor() as u32; 90 | let iy = coord.y.floor() as u32; 91 | let fx = coord.x - ix as f32; 92 | let fy = coord.y - iy as f32; 93 | fn cubic_col(ix: u32, iy: u32, j: u32, seed: u32, fx: f32) -> f32 { 94 | cubic( 95 | hash_noise(uvec2(ix, iy + j), seed), 96 | hash_noise(uvec2(ix + 1, iy + j), seed), 97 | hash_noise(uvec2(ix + 2, iy + j), seed), 98 | hash_noise(uvec2(ix + 3, iy + j), seed), 99 | fx, 100 | ) 101 | } 102 | cubic( 103 | cubic_col(ix, iy, 0, seed, fx), 104 | cubic_col(ix, iy, 1, seed, fx), 105 | cubic_col(ix, iy, 2, seed, fx), 106 | cubic_col(ix, iy, 3, seed, fx), 107 | fy, 108 | ) 109 | } 110 | 111 | // By Tomasz Stachowiak 112 | pub fn somewhat_boring_display_transform(col: Vec3A) -> Vec3A { 113 | fn rgb_to_ycbcr(col: Vec3A) -> Vec3A { 114 | Mat3A { 115 | x_axis: vec3a(0.2126, -0.1146, 0.5), 116 | y_axis: vec3a(0.7152, -0.3854, -0.4542), 117 | z_axis: vec3a(0.0722, 0.5, -0.0458), 118 | } * col 119 | } 120 | 121 | fn tonemap_curve(v: f32) -> f32 { 122 | 1.0 - (-v).exp() 123 | } 124 | 125 | fn tonemap_curve3(v: Vec3A) -> Vec3A { 126 | 1.0 - (-v).exp() 127 | } 128 | 129 | fn tonemapping_luminance(col: Vec3A) -> f32 { 130 | col.dot(vec3a(0.2126, 0.7152, 0.0722)) 131 | } 132 | 133 | let mut col = col; 134 | let ycbcr = rgb_to_ycbcr(col); 135 | 136 | let bt = tonemap_curve(ycbcr.yz().length() * 2.4); 137 | let mut desat = (bt - 0.7) * 0.8; 138 | desat *= desat; 139 | 140 | let desat_col = col.lerp(ycbcr.xxx(), desat); 141 | 142 | let tm_luma = 
tonemap_curve(ycbcr.x); 143 | let tm0 = col * tm_luma / tonemapping_luminance(col).max(1e-5); 144 | let final_mult = 0.97; 145 | let tm1 = tonemap_curve3(desat_col); 146 | 147 | col = tm0.lerp(tm1, bt * bt); 148 | 149 | col * final_mult 150 | } 151 | } 152 | 153 | pub mod geometry { 154 | use crate::{test_util::sampling::bicubic_noise, Triangle}; 155 | use glam::*; 156 | 157 | #[inline(always)] 158 | const fn vec(a: f32, b: f32, c: f32) -> Vec3A { 159 | Vec3A::new(a, b, c) 160 | } 161 | #[inline(always)] 162 | const fn tri(v0: Vec3A, v1: Vec3A, v2: Vec3A) -> Triangle { 163 | Triangle { v0, v1, v2 } 164 | } 165 | 166 | /// Cube triangle mesh with side length of 2 centered at 0,0,0 167 | pub const CUBE: [Triangle; 12] = [ 168 | tri(vec(-1., 1., -1.), vec(1., 1., 1.), vec(1., 1., -1.)), 169 | tri(vec(1., 1., 1.), vec(-1., -1., 1.), vec(1., -1., 1.)), 170 | tri(vec(-1., 1., 1.), vec(-1., -1., -1.), vec(-1., -1., 1.)), 171 | tri(vec(1., -1., -1.), vec(-1., -1., 1.), vec(-1., -1., -1.)), 172 | tri(vec(1., 1., -1.), vec(1., -1., 1.), vec(1., -1., -1.)), 173 | tri(vec(-1., 1., -1.), vec(1., -1., -1.), vec(-1., -1., -1.)), 174 | tri(vec(-1., 1., -1.), vec(-1., 1., 1.), vec(1., 1., 1.)), 175 | tri(vec(1., 1., 1.), vec(-1., 1., 1.), vec(-1., -1., 1.)), 176 | tri(vec(-1., 1., 1.), vec(-1., 1., -1.), vec(-1., -1., -1.)), 177 | tri(vec(1., -1., -1.), vec(1., -1., 1.), vec(-1., -1., 1.)), 178 | tri(vec(1., 1., -1.), vec(1., 1., 1.), vec(1., -1., 1.)), 179 | tri(vec(-1., 1., -1.), vec(1., 1., -1.), vec(1., -1., -1.)), 180 | ]; 181 | 182 | /// Plane triangle mesh with side length of 2 centered at 0,0,0 183 | pub const PLANE: [Triangle; 2] = [ 184 | tri(vec(1., 0., 1.), vec(-1., 0., -1.), vec(-1., 0., 1.)), 185 | tri(vec(1., 0., 1.), vec(1., 0., -1.), vec(-1., 0., -1.)), 186 | ]; 187 | 188 | /// Generate icosphere mesh with radius of 2 189 | pub fn icosphere(subdivisions: u32) -> Vec { 190 | let phi = (1.0 + 5.0_f32.sqrt()) / 2.0; // golden ratio 191 | let (a, b, c, d, e) = (1.0, 
-1.0, 0.0, phi, -phi); 192 | 193 | #[rustfmt::skip] 194 | let mut p = [vec(b,d,c),vec(a,d,c),vec(b,e,c),vec(a,e,c),vec(c,b,d),vec(c,a,d),vec(c,b,e),vec(c,a,e),vec(d,c,b),vec(d,c,a),vec(e,c,b),vec(e,c,a)]; 195 | p.iter_mut().for_each(|v| *v = v.normalize()); 196 | 197 | let mut tris = vec![ 198 | tri(p[0], p[11], p[5]), 199 | tri(p[0], p[5], p[1]), 200 | tri(p[0], p[1], p[7]), 201 | tri(p[0], p[7], p[10]), 202 | tri(p[0], p[10], p[11]), 203 | tri(p[1], p[5], p[9]), 204 | tri(p[5], p[11], p[4]), 205 | tri(p[11], p[10], p[2]), 206 | tri(p[10], p[7], p[6]), 207 | tri(p[7], p[1], p[8]), 208 | tri(p[3], p[9], p[4]), 209 | tri(p[3], p[4], p[2]), 210 | tri(p[3], p[2], p[6]), 211 | tri(p[3], p[6], p[8]), 212 | tri(p[3], p[8], p[9]), 213 | tri(p[4], p[9], p[5]), 214 | tri(p[2], p[4], p[11]), 215 | tri(p[6], p[2], p[10]), 216 | tri(p[8], p[6], p[7]), 217 | tri(p[9], p[8], p[1]), 218 | ]; 219 | 220 | (0..subdivisions).for_each(|_| { 221 | let mut new_tris = Vec::new(); 222 | tris.iter().for_each(|t| { 223 | let mid01 = ((t.v0 + t.v1) * 0.5).normalize(); 224 | let mid12 = ((t.v1 + t.v2) * 0.5).normalize(); 225 | let mid20 = ((t.v2 + t.v0) * 0.5).normalize(); 226 | new_tris.push(tri(t.v0, mid01, mid20)); 227 | new_tris.push(tri(t.v1, mid12, mid01)); 228 | new_tris.push(tri(t.v2, mid20, mid12)); 229 | new_tris.push(tri(mid01, mid12, mid20)); 230 | }); 231 | tris = new_tris; 232 | }); 233 | 234 | tris 235 | } 236 | 237 | /// Convert height map to triangles with 2x2x2 size given -1.0..=1.0 output from height_map: F 238 | pub fn height_to_triangles( 239 | height_map: F, 240 | x_resolution: usize, 241 | z_resolution: usize, 242 | ) -> Vec 243 | where 244 | F: Fn(usize, usize) -> f32, 245 | { 246 | let mut triangles = Vec::new(); 247 | 248 | // Iterate over each cell in the grid 249 | for z in 0..z_resolution { 250 | for x in 0..x_resolution { 251 | // Calculate normalized positions 252 | let fx = (x as f32 / x_resolution as f32) * 2.0 - 1.0; 253 | let fz = (z as f32 / z_resolution as 
f32) * 2.0 - 1.0; 254 | let fx2 = ((x + 1) as f32 / x_resolution as f32) * 2.0 - 1.0; 255 | let fz2 = ((z + 1) as f32 / z_resolution as f32) * 2.0 - 1.0; 256 | 257 | // Create vertices for each corner of the cell 258 | let v00 = vec(fx, height_map(x, z), fz); 259 | let v10 = vec(fx2, height_map(x + 1, z), fz); 260 | let v01 = vec(fx, height_map(x, z + 1), fz2); 261 | let v11 = vec(fx2, height_map(x + 1, z + 1), fz2); 262 | 263 | // Create two triangles for this cell 264 | triangles.push(tri(v00, v01, v10)); 265 | triangles.push(tri(v10, v01, v11)); 266 | } 267 | } 268 | 269 | triangles 270 | } 271 | 272 | /// terrain_res 1024 or greater recommended 273 | pub fn demoscene(terrain_res: usize, seed: u32) -> Vec { 274 | let height_map = |x: usize, y: usize| -> f32 { 275 | let coord = vec2(x as f32, y as f32) / terrain_res as f32; 276 | let (mut cs, mut ns) = (1.579, 0.579); 277 | (1..17) 278 | .map(|i| { 279 | (cs, ns) = (cs * 1.579, ns * -0.579); 280 | bicubic_noise(coord * cs, seed + i) * ns 281 | }) 282 | .sum::() 283 | * (1.0 - coord.y).powf(0.579) 284 | + (1.0 - coord.y).powf(1.579) * 0.579 285 | }; 286 | height_to_triangles(height_map, terrain_res, terrain_res) 287 | } 288 | } 289 | -------------------------------------------------------------------------------- /src/triangle.rs: -------------------------------------------------------------------------------- 1 | //! Triangle representation in 3D space. 2 | 3 | use bytemuck::{Pod, Zeroable}; 4 | use glam::{vec2, Mat4, Vec2, Vec3A}; 5 | 6 | use crate::{aabb::Aabb, ray::Ray, Boundable, Transformable}; 7 | 8 | #[derive(Clone, Copy, Default, Debug)] 9 | pub struct Triangle { 10 | pub v0: Vec3A, 11 | pub v1: Vec3A, 12 | pub v2: Vec3A, 13 | } 14 | 15 | unsafe impl Pod for Triangle {} 16 | unsafe impl Zeroable for Triangle {} 17 | 18 | impl Triangle { 19 | /// Compute the normal of the triangle geometry. 
20 | #[inline(always)] 21 | pub fn compute_normal(&self) -> Vec3A { 22 | let e1 = self.v1 - self.v0; 23 | let e2 = self.v2 - self.v0; 24 | e1.cross(e2).normalize_or_zero() 25 | } 26 | 27 | /// Compute the bounding box of the triangle. 28 | #[inline(always)] 29 | pub fn aabb(&self) -> Aabb { 30 | Aabb::from_points(&[self.v0, self.v1, self.v2]) 31 | } 32 | 33 | /// Find the distance (t) of the intersection of the `Ray` and this Triangle. 34 | /// Returns f32::INFINITY for miss. 35 | #[inline(always)] 36 | pub fn intersect(&self, ray: &Ray) -> f32 { 37 | // TODO not very water tight from the back side in some contexts (tris with edges at 0,0,0 show 1px gap) 38 | // Find out if this is typical of Möller 39 | // Based on Fast Minimum Storage Ray Triangle Intersection by T. Möller and B. Trumbore 40 | // https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html 41 | let cull_backface = false; 42 | let e1 = self.v0 - self.v1; 43 | let e2 = self.v2 - self.v0; 44 | let n = e1.cross(e2); 45 | 46 | let c = self.v0 - ray.origin; 47 | let r = ray.direction.cross(c); 48 | let inv_det = 1.0 / n.dot(ray.direction); 49 | 50 | let u = r.dot(e2) * inv_det; 51 | let v = r.dot(e1) * inv_det; 52 | let w = 1.0 - u - v; 53 | 54 | //let hit = u >= 0.0 && v >= 0.0 && w >= 0.0; 55 | //let valid = if cull_backface { 56 | // inv_det > 0.0 && hit 57 | //} else { 58 | // inv_det != 0.0 && hit 59 | //}; 60 | 61 | // Note: differs in that if v == -0.0, for example will cause valid to be false 62 | let hit = u.to_bits() | v.to_bits() | w.to_bits(); 63 | let valid = if cull_backface { 64 | (inv_det.to_bits() | hit) & 0x8000_0000 == 0 65 | } else { 66 | inv_det != 0.0 && hit & 0x8000_0000 == 0 67 | }; 68 | 69 | if valid { 70 | let t = n.dot(c) * inv_det; 71 | if t >= ray.tmin && t <= ray.tmax { 72 | return t; 73 | } 74 | } 75 | 76 | f32::INFINITY 77 | } 78 | 79 | // 
https://github.com/RenderKit/embree/blob/0c236df6f31a8e9c8a48803dada333e9ea0029a6/kernels/geometry/triangle_intersector_moeller.h#L9 80 | #[cfg(all( 81 | any(target_arch = "x86", target_arch = "x86_64"), 82 | target_feature = "sse2" 83 | ))] 84 | pub fn intersect_embree(&self, ray: &Ray) -> f32 { 85 | // Not watertight from the front side? Looks similar to what above looks like from the back side. 86 | 87 | // This uses the orientation from https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html 88 | 89 | let cull_backface = false; 90 | let v0 = self.v0; 91 | let e1 = self.v0 - self.v1; 92 | let e2 = self.v2 - self.v0; 93 | let ng = e1.cross(e2); 94 | 95 | // Calculate denominator 96 | let o = ray.origin; 97 | let d = ray.direction; 98 | let c = v0 - o; 99 | let r = c.cross(d); 100 | let den = (-ng).dot(d); 101 | let abs_den = den.abs(); 102 | 103 | fn signmsk(x: f32) -> f32 { 104 | #[cfg(target_arch = "x86")] 105 | use std::arch::x86::*; 106 | #[cfg(target_arch = "x86_64")] 107 | use std::arch::x86_64::*; 108 | unsafe { 109 | let mask = _mm_set1_ps(-0.0); 110 | let x_vec = _mm_set_ss(x); 111 | let sign_bit = _mm_and_ps(x_vec, mask); 112 | _mm_cvtss_f32(sign_bit) 113 | //_mm_cvtss_f32(_mm_and_ps( 114 | // _mm_set_ss(x), 115 | // _mm_castsi128_ps(_mm_set1_epi32(-2147483648i32)), 116 | //)) 117 | } 118 | } 119 | 120 | let sgn_den = signmsk(den).to_bits(); 121 | 122 | // Perform edge tests 123 | let u = f32::from_bits(r.dot(e2).to_bits() ^ sgn_den); 124 | let v = f32::from_bits(r.dot(e1).to_bits() ^ sgn_den); 125 | // TODO simd uv? 
126 | 127 | // Perform backface culling 128 | // OG 129 | //let valid = if cull_backface { 130 | // den < 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den 131 | //} else { 132 | // den != 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den 133 | //}; 134 | 135 | let w = abs_den - u - v; 136 | let valid = if cull_backface { 137 | ((-den).to_bits() | u.to_bits() | v.to_bits() | (abs_den - u - v).to_bits()) 138 | & 0x8000_0000 139 | == 0 140 | } else { 141 | den != 0.0 && ((u.to_bits() | v.to_bits() | w.to_bits()) & 0x8000_0000) == 0 142 | }; 143 | 144 | if !valid { 145 | return f32::INFINITY; 146 | } 147 | 148 | // Perform depth test 149 | let t = f32::from_bits((-ng).dot(c).to_bits() ^ sgn_den); 150 | 151 | if abs_den * ray.tmin < t && t <= abs_den * ray.tmax { 152 | return t; 153 | } 154 | 155 | f32::INFINITY 156 | } 157 | 158 | #[inline(always)] 159 | pub fn compute_barycentric(&self, ray: &Ray) -> Vec2 { 160 | let e1 = self.v0 - self.v1; 161 | let e2 = self.v2 - self.v0; 162 | let ng = e1.cross(e2).normalize_or_zero(); 163 | let r = ray.direction.cross(self.v0 - ray.origin); 164 | vec2(r.dot(e2), r.dot(e1)) / ng.dot(ray.direction) 165 | } 166 | } 167 | 168 | impl Boundable for Triangle { 169 | fn aabb(&self) -> Aabb { 170 | self.aabb() 171 | } 172 | } 173 | 174 | impl Transformable for &mut Triangle { 175 | fn transform(&mut self, matrix: &Mat4) { 176 | self.v0 = matrix.transform_point3a(self.v0); 177 | self.v1 = matrix.transform_point3a(self.v1); 178 | self.v2 = matrix.transform_point3a(self.v2); 179 | } 180 | } 181 | 182 | impl Transformable for T 183 | where 184 | T: AsMut<[Triangle]>, 185 | { 186 | fn transform(&mut self, matrix: &Mat4) { 187 | self.as_mut().iter_mut().for_each(|mut triangle| { 188 | triangle.transform(matrix); 189 | }); 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /tests/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests 
{ 3 | 4 | use std::time::Duration; 5 | 6 | use glam::*; 7 | use obvhs::{ 8 | aabb::Aabb, 9 | bvh2::builder::{build_bvh2, build_bvh2_from_tris}, 10 | cwbvh::{ 11 | builder::{build_cwbvh, build_cwbvh_from_tris}, 12 | bvh2_to_cwbvh::bvh2_to_cwbvh, 13 | }, 14 | ray::{Ray, RayHit}, 15 | test_util::{ 16 | geometry::{demoscene, height_to_triangles, icosphere}, 17 | sampling::{hash_noise, uniform_sample_sphere}, 18 | }, 19 | traverse, 20 | triangle::Triangle, 21 | BvhBuildParams, 22 | }; 23 | 24 | #[test] 25 | pub fn build_bvh2_with_empty_aabb() { 26 | let bvh = build_bvh2( 27 | &[Aabb::empty()], 28 | BvhBuildParams::medium_build(), 29 | &mut Duration::default(), 30 | ); 31 | let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z); 32 | assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY)); 33 | } 34 | 35 | #[test] 36 | pub fn build_cwbvh_with_empty_aabb() { 37 | let bvh = build_cwbvh( 38 | &[Aabb::empty()], 39 | BvhBuildParams::medium_build(), 40 | &mut Duration::default(), 41 | ); 42 | let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z); 43 | assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY)); 44 | } 45 | 46 | #[test] 47 | pub fn build_bvh2_with_nothing() { 48 | let aabbs: Vec = Vec::new(); 49 | let bvh = build_bvh2( 50 | &aabbs, 51 | BvhBuildParams::medium_build(), 52 | &mut Duration::default(), 53 | ); 54 | let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z); 55 | assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY)); 56 | } 57 | 58 | #[test] 59 | pub fn build_cwbvh_with_nothing() { 60 | let aabbs: Vec = Vec::new(); 61 | let bvh = build_cwbvh( 62 | &aabbs, 63 | BvhBuildParams::medium_build(), 64 | &mut Duration::default(), 65 | ); 66 | let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z); 67 | assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY)); 68 | } 69 | 70 | #[test] 71 | pub fn check_flat_subdivided_plane_normals() { 72 | let tris = height_to_triangles(|_x: usize, _y: usize| -> f32 { 0.0 }, 4, 
4); 73 | let mut hit_count = 0; 74 | eval_render( 75 | |_x: u32, _y: u32, hit: RayHit| { 76 | let n = tris[hit.primitive_id as usize].compute_normal(); 77 | if n == Vec3A::Y { 78 | hit_count += 1 79 | } 80 | }, 81 | &tris, 82 | 256, 83 | 256, 84 | 90.0f32.to_radians(), 85 | vec3a(0.0, 0.9, 0.0), 86 | vec3a(0.0, 0.0, 0.0), 87 | Vec3A::X, 88 | ); 89 | assert_eq!(hit_count, 256 * 256) 90 | } 91 | 92 | pub fn eval_render( 93 | mut eval: F, 94 | tris: &[Triangle], 95 | width: u32, 96 | height: u32, 97 | fov: f32, 98 | eye: Vec3A, 99 | look_at: Vec3A, 100 | up: Vec3A, 101 | ) where 102 | F: FnMut(u32, u32, RayHit), 103 | { 104 | let cwbvh = build_cwbvh_from_tris( 105 | tris, 106 | BvhBuildParams::medium_build(), 107 | &mut Duration::default(), 108 | ); 109 | 110 | let bvh_tris = cwbvh 111 | .primitive_indices 112 | .iter() 113 | .map(|i| tris[*i as usize]) 114 | .collect::>(); 115 | 116 | let target_size = Vec2::new(width as f32, height as f32); 117 | 118 | // Compute camera projection & view matrices 119 | let aspect_ratio = target_size.x / target_size.y; 120 | let proj_inv = Mat4::perspective_infinite_reverse_rh(fov, aspect_ratio, 0.01).inverse(); 121 | let view_inv = Mat4::look_at_rh(eye.into(), look_at.into(), up.into()).inverse(); 122 | 123 | for x in 0..width { 124 | for y in 0..height { 125 | let frag_coord = uvec2(x, y); 126 | let mut screen_uv = frag_coord.as_vec2() / target_size; 127 | screen_uv.y = 1.0 - screen_uv.y; 128 | let ndc = screen_uv * 2.0 - Vec2::ONE; 129 | let clip_pos = vec4(ndc.x, ndc.y, 1.0, 1.0); 130 | 131 | let mut vs = proj_inv * clip_pos; 132 | vs /= vs.w; 133 | let direction = (Vec3A::from((view_inv * vs).xyz()) - eye).normalize(); 134 | let ray = Ray::new(eye, direction, 0.0, f32::MAX); 135 | 136 | let mut hit = RayHit::none(); 137 | if cwbvh.ray_traverse(ray, &mut hit, |ray, id| bvh_tris[id].intersect(ray)) { 138 | eval(x, y, hit); 139 | } 140 | } 141 | } 142 | } 143 | 144 | #[test] 145 | pub fn traverse_aabb() { 146 | let tris = 
demoscene(201, 0); 147 | let aabb = Aabb::new(vec3a(0.511, -1.0, 0.511), vec3a(0.611, 1.0, 0.611)); 148 | 149 | let mut refrence_intersect_sum = 0usize; 150 | let mut refrence_count = 0; 151 | for (primitive_id, tri) in tris.iter().enumerate() { 152 | if aabb.intersect_aabb(&tri.aabb()) { 153 | refrence_intersect_sum = refrence_intersect_sum.wrapping_add(primitive_id); 154 | refrence_count += 1; 155 | } 156 | } 157 | 158 | // Bvh2 159 | let bvh2 = build_bvh2_from_tris( 160 | &tris, 161 | BvhBuildParams::fast_build(), 162 | &mut Duration::default(), 163 | ); 164 | let mut intersect_sum = 0usize; 165 | let mut intersect_count = 0; 166 | bvh2.validate(&tris, false, false); 167 | bvh2.aabb_traverse(aabb, |bvh, id| { 168 | let node = &bvh.nodes[id as usize]; 169 | for i in 0..node.prim_count { 170 | let primitive_id = bvh.primitive_indices[(node.first_index + i) as usize] as usize; 171 | let tri = tris[primitive_id]; 172 | if aabb.intersect_aabb(&tri.aabb()) { 173 | intersect_count += 1; 174 | intersect_sum = intersect_sum.wrapping_add(primitive_id); 175 | } 176 | } 177 | true 178 | }); 179 | assert_eq!(refrence_count, intersect_count); 180 | assert_eq!(refrence_intersect_sum, intersect_sum); 181 | 182 | // CwBvh 183 | let cwbvh = build_cwbvh_from_tris( 184 | &tris, 185 | BvhBuildParams::fast_build(), 186 | &mut Duration::default(), 187 | ); 188 | let mut cw_intersect_count = 0; 189 | let mut cw_intersect_sum = 0usize; 190 | cwbvh.validate(&tris, false, false); 191 | 192 | let mut state = cwbvh.new_traversal(Vec3A::ZERO); 193 | let mut node; 194 | traverse!( 195 | cwbvh, 196 | node, 197 | state, 198 | node.intersect_aabb(&aabb, state.oct_inv4), 199 | { 200 | let primitive_id = cwbvh.primitive_indices[state.primitive_id as usize] as usize; 201 | let tri = tris[primitive_id]; 202 | if aabb.intersect_aabb(&tri.aabb()) { 203 | cw_intersect_count += 1; 204 | cw_intersect_sum = cw_intersect_sum.wrapping_add(primitive_id); 205 | } 206 | } 207 | ); 208 | 209 | 
assert_eq!(refrence_count, cw_intersect_count); 210 | assert_eq!(refrence_intersect_sum, cw_intersect_sum); 211 | } 212 | 213 | #[test] 214 | pub fn traverse_point() { 215 | let tris = icosphere(0); 216 | 217 | // TODO Bvh2 218 | 219 | // CwBvh 220 | let cwbvh = build_cwbvh_from_tris( 221 | &tris, 222 | BvhBuildParams::fast_build(), 223 | &mut Duration::default(), 224 | ); 225 | cwbvh.validate(&tris, false, false); 226 | 227 | for i in 0..512 { 228 | let point = 229 | uniform_sample_sphere(vec2(hash_noise(uvec2(0, 0), i), hash_noise(uvec2(0, 1), i))); 230 | 231 | let mut refrence_intersect_sum = 0usize; 232 | let mut refrence_count = 0; 233 | for (primitive_id, tri) in tris.iter().enumerate() { 234 | if tri.aabb().contains_point(point) { 235 | refrence_intersect_sum = refrence_intersect_sum.wrapping_add(primitive_id); 236 | refrence_count += 1; 237 | } 238 | } 239 | 240 | let mut cw_intersect_count = 0; 241 | let mut cw_intersect_sum = 0usize; 242 | let mut state = cwbvh.new_traversal(Vec3A::ZERO); 243 | let mut node; 244 | traverse!( 245 | cwbvh, 246 | node, 247 | state, 248 | node.contains_point(&point, state.oct_inv4), 249 | { 250 | let primitive_id = 251 | cwbvh.primitive_indices[state.primitive_id as usize] as usize; 252 | let tri = tris[primitive_id]; 253 | if tri.aabb().contains_point(point) { 254 | cw_intersect_count += 1; 255 | cw_intersect_sum = cw_intersect_sum.wrapping_add(primitive_id); 256 | } 257 | } 258 | ); 259 | 260 | assert_eq!(refrence_count, cw_intersect_count); 261 | assert_eq!(refrence_intersect_sum, cw_intersect_sum); 262 | } 263 | } 264 | 265 | #[test] 266 | pub fn compute_parents_cwbvh() { 267 | let tris = demoscene(100, 0); 268 | let cwbvh = build_cwbvh_from_tris( 269 | &tris, 270 | BvhBuildParams::fast_build(), 271 | &mut Duration::default(), 272 | ); 273 | cwbvh.validate(&tris, false, false); 274 | let parents = cwbvh.compute_parents(); 275 | for (child, parent) in parents.iter().enumerate().skip(1) { 276 | let node = 
cwbvh.nodes[*parent as usize]; 277 | let mut found_child = false; 278 | for ch in 0..8 { 279 | if !node.is_leaf(ch) { 280 | let child_index = node.child_node_index(ch); 281 | if child_index as usize == child { 282 | found_child = true; 283 | break; 284 | } 285 | } 286 | } 287 | assert!(found_child, "child{}, parent{}", child, parent); 288 | } 289 | } 290 | 291 | #[test] 292 | pub fn order_children_cwbvh() { 293 | let tris = demoscene(100, 0); 294 | let triangles: &[Triangle] = &tris; 295 | let mut aabbs = Vec::with_capacity(triangles.len()); 296 | 297 | let config = BvhBuildParams::very_fast_build(); 298 | let mut indices = Vec::with_capacity(triangles.len()); 299 | for (i, tri) in triangles.iter().enumerate() { 300 | let a = tri.v0; 301 | let b = tri.v1; 302 | let c = tri.v2; 303 | let mut aabb = Aabb::empty(); 304 | aabb.extend(a).extend(b).extend(c); 305 | aabbs.push(aabb); 306 | indices.push(i as u32); 307 | } 308 | 309 | let bvh2 = config.ploc_search_distance.build( 310 | &aabbs, 311 | indices, 312 | config.sort_precision, 313 | config.search_depth_threshold, 314 | ); 315 | let mut cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, false); 316 | 317 | cwbvh.validate(&tris, false, false); 318 | for node in 0..cwbvh.nodes.len() { 319 | cwbvh.order_node_children(&aabbs, node, false); 320 | } 321 | cwbvh.validate(&tris, false, false); 322 | cwbvh.order_children(&aabbs, false); 323 | cwbvh.validate(&tris, false, false); 324 | } 325 | 326 | #[test] 327 | pub fn exact_aabbs_cwbvh() { 328 | let tris = demoscene(100, 0); 329 | let triangles: &[Triangle] = &tris; 330 | let mut aabbs = Vec::with_capacity(triangles.len()); 331 | 332 | let config = BvhBuildParams::very_fast_build(); 333 | let mut indices = Vec::with_capacity(triangles.len()); 334 | for (i, tri) in triangles.iter().enumerate() { 335 | let a = tri.v0; 336 | let b = tri.v1; 337 | let c = tri.v2; 338 | let mut aabb = Aabb::empty(); 339 | aabb.extend(a).extend(b).extend(c); 340 | 
aabbs.push(aabb); 341 | indices.push(i as u32); 342 | } 343 | 344 | let bvh2 = config.ploc_search_distance.build( 345 | &aabbs, 346 | indices, 347 | config.sort_precision, 348 | config.search_depth_threshold, 349 | ); 350 | let mut cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, true); 351 | 352 | if let Some(exact_node_aabbs) = &cwbvh.exact_node_aabbs { 353 | for node in &cwbvh.nodes { 354 | for ch in 0..8 { 355 | if !node.is_leaf(ch) { 356 | let child_node_index = node.child_node_index(ch) as usize; 357 | let compressed_aabb = node.child_aabb(ch); 358 | let child_node_self_compressed_aabb = cwbvh.nodes[child_node_index].aabb(); 359 | let exact_aabb = &exact_node_aabbs[child_node_index]; 360 | 361 | assert!(exact_aabb.min.cmpge((compressed_aabb.min).into()).all()); 362 | assert!(exact_aabb.max.cmple((compressed_aabb.max).into()).all()); 363 | assert!(exact_aabb 364 | .min 365 | .cmpge((child_node_self_compressed_aabb.min).into()) 366 | .all()); 367 | assert!(exact_aabb 368 | .max 369 | .cmple((child_node_self_compressed_aabb.max).into()) 370 | .all()); 371 | } 372 | } 373 | } 374 | } 375 | 376 | cwbvh.order_children(&aabbs, false); 377 | cwbvh.validate(&tris, false, false); 378 | cwbvh.order_children(&aabbs, false); 379 | cwbvh.validate(&tris, false, false); 380 | } 381 | } 382 | --------------------------------------------------------------------------------