├── .gitignore
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── demo.jpg
├── examples
    ├── basic_bvh2.rs
    ├── basic_cwbvh.rs
    ├── cornell_box_cwbvh.rs
    └── demoscene.rs
├── src
    ├── aabb.rs
    ├── bvh2
    │   ├── builder.rs
    │   ├── leaf_collapser.rs
    │   ├── mod.rs
    │   └── reinsertion.rs
    ├── cwbvh
    │   ├── builder.rs
    │   ├── bvh2_to_cwbvh.rs
    │   ├── mod.rs
    │   ├── node.rs
    │   ├── simd.rs
    │   └── traverse_macro.rs
    ├── heapstack.rs
    ├── lib.rs
    ├── ploc
    │   ├── mod.rs
    │   └── morton.rs
    ├── ray.rs
    ├── rt_triangle.rs
    ├── splits.rs
    ├── test_util.rs
    └── triangle.rs
└── tests
    └── mod.rs


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | .vscode
3 | Cargo.lock
4 | *_rend.png


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "obvhs"
 3 | version = "0.2.0"
 4 | edition = "2021"
 5 | description = "BVH Construction and Traversal Library"
 6 | homepage = "https://github.com/DGriffin91/obvhs"
 7 | repository = "https://github.com/DGriffin91/obvhs"
 8 | readme = "README.md"
 9 | license = "MIT OR Apache-2.0"
10 | keywords = ["bvh", "sah", "aabb", "cwbvh", "ploc"]
11 | 
12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
13 | 
14 | [dependencies]
15 | glam = { version = "0.29", features = ["bytemuck"] }
16 | half = "2.3.1"
17 | bytemuck = "1.15"
18 | rdst = { version = "0.20.14", default-features = false }
19 | rayon = { version = "1.9.0", optional = true }
20 | 
21 | # Noop unless one of the profile-with features below is also used
22 | profiling = { version = "1.0", optional = true }
23 | 
24 | [dev-dependencies]
25 | image = "0.24"
26 | 
27 | [features]
28 | #default = []
29 | parallel = ["dep:rayon", "rdst/multi-threaded"]
30 | timeit = []
31 | 
32 | profile = ["dep:profiling"]
33 | profile-with-puffin = ["profiling/profile-with-puffin"]
34 | profile-with-optick = ["profiling/profile-with-optick"]
35 | profile-with-superluminal = ["profiling/profile-with-superluminal"]
36 | profile-with-tracing = ["profiling/profile-with-tracing"]
37 | profile-with-tracy = ["profiling/profile-with-tracy"]
38 | 


--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # OBVHS - BVH Construction and Traversal Library
 2 | 
 3 | ![License](https://img.shields.io/badge/license-MIT%2FApache-blue.svg) [![Crates.io](https://img.shields.io/crates/v/obvhs.svg)](https://crates.io/crates/obvhs)
 4 | [![Docs](https://docs.rs/obvhs/badge.svg)](https://docs.rs/obvhs/latest/obvhs/)
 5 | 
 6 | - [PLOC](https://meistdan.github.io/publications/ploc/paper.pdf) BVH2 builder with [Parallel Reinsertion](https://meistdan.github.io/publications/prbvh/paper.pdf) and spatial pre-splits.
 7 | - [CWBVH](https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf) An eight-way compressed wide BVH8 builder. Each BVH Node is compressed so that it takes up only 80 bytes per node.
 8 | - CPU traversal for both BVH2 and CWBVH (SIMD traversal, intersecting 4 nodes at a time)
 9 | - For GPU traversal example, see the [Tray Racing](https://github.com/DGriffin91/tray_racing) benchmark
10 | 
11 | ![demo](demo.jpg)
12 | [*demoscene example*](https://github.com/DGriffin91/obvhs/blob/main/examples/demoscene.rs)
13 | 
14 | OBVHS optionally uses [rayon](https://github.com/rayon-rs/rayon) to parallelize building. Many parts of the building process are parallelized, but single threaded building speed has initally been the priority so there is still quite a bit of room for improvement in parallel building performance.
15 | 
16 | # Benchmarks
17 | See [Tray Racing](https://github.com/DGriffin91/tray_racing).
18 | 
19 | # Acknowledgments
20 | - [Tomasz Stachowiak](https://github.com/h3r2tic) for the initial rust/embree CWBVH builder, HLSL traversal, and numerous discussions along the way.
21 | - Jan Van Bergen for their [wonderful CUDA path tracer that implements CWBVH](https://github.com/jan-van-bergen/GPU-Raytracer).
22 | - Arsène Pérard-Gayot for their [series of articles on BVHs](https://madmann91.github.io/) and [BVH library](https://github.com/madmann91/bvh).
23 | - H. Ylitie et al. for [Efficient Incoherent Ray Traversal on GPUs Through
24 | Compressed Wide BVHs](https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf).
25 | - D. Meister et al. for [Parallel Locally-Ordered Clustering for Bounding Volume Hierarchy Construction](https://meistdan.github.io/publications/ploc/paper.pdf), [Parallel Reinsertion for Bounding Volume Hierarchy Optimization](https://meistdan.github.io/publications/prbvh/paper.pdf), and [Performance Comparison of Bounding Volume Hierarchies for GPU Ray Tracing](https://jcgt.org/published/0011/04/01/paper.pdf).
26 | 


--------------------------------------------------------------------------------
/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DGriffin91/obvhs/63e11fdbb9de52375695c8020bf61910e2825986/demo.jpg


--------------------------------------------------------------------------------
/examples/basic_bvh2.rs:
--------------------------------------------------------------------------------
 1 | use std::time::Duration;
 2 | 
 3 | use glam::*;
 4 | use obvhs::{
 5 |     bvh2::builder::build_bvh2_from_tris,
 6 |     ray::{Ray, RayHit},
 7 |     test_util::geometry::{icosphere, PLANE},
 8 |     triangle::Triangle,
 9 |     BvhBuildParams,
10 | };
11 | 
12 | fn main() {
13 |     // Build a scene with an icosphere and a plane
14 |     // BVH primitives do not need to be triangles, the BVH builder is only concerned with AABBs.
15 |     // (With the exception of optional precise triangle aabb splitting)
16 |     let mut tris: Vec<Triangle> = Vec::new();
17 |     tris.extend(icosphere(1));
18 |     tris.extend(PLANE);
19 | 
20 |     // Build the BVH.
21 |     // build_bvh_from_tris is just a helper that can build from BvhBuildParams and the
22 |     // respective presets. Feel free to copy the contents of build_bvh_from_tris or build_bvh.
23 |     // They are very straightforward. If you don't want to use Triangles as the primitive, use
24 |     // build_bvh instead. build_cwbvh_from_tris just adds support for splitting tris.
25 |     let bvh = build_bvh2_from_tris(
26 |         &tris,
27 |         BvhBuildParams::medium_build(),
28 |         &mut Duration::default(),
29 |     );
30 | 
31 |     // Create a new ray
32 |     let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0));
33 | 
34 |     // Traverse the BVH, finding the closest hit.
35 |     let mut ray_hit = RayHit::none();
36 |     if bvh.ray_traverse(ray, &mut ray_hit, |ray, id| {
37 |         // Use primitive_indices to look up the original primitive id.
38 |         // (Could reorder tris per bvh.primitive_indices to avoid this lookup, see cornell_box_cwbvh example)
39 |         tris[bvh.primitive_indices[id] as usize].intersect(ray)
40 |     }) {
41 |         println!(
42 |             "Hit Triangle {}",
43 |             bvh.primitive_indices[ray_hit.primitive_id as usize]
44 |         );
45 |         println!("Distance to hit: {}", ray_hit.t);
46 |     } else {
47 |         println!("Miss");
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/examples/basic_cwbvh.rs:
--------------------------------------------------------------------------------
 1 | use glam::*;
 2 | use obvhs::{
 3 |     cwbvh::builder::build_cwbvh_from_tris,
 4 |     ray::{Ray, RayHit},
 5 |     test_util::geometry::{icosphere, PLANE},
 6 |     triangle::Triangle,
 7 |     BvhBuildParams,
 8 | };
 9 | use std::time::Duration;
10 | 
11 | fn main() {
12 |     // Build a scene with an icosphere and a plane
13 |     // BVH primitives do not need to be triangles, the BVH builder is only concerned with AABBs.
14 |     // (With the exception of optional precise triangle aabb splitting)
15 |     let mut tris: Vec<Triangle> = Vec::new();
16 |     tris.extend(icosphere(1));
17 |     tris.extend(PLANE);
18 | 
19 |     // Build the BVH.
20 |     // build_cwbvh_from_tris is just a helper that can build from BvhBuildParams and the
21 |     // respective presets. Feel free to copy the contents of build_cwbvh_from_tris or
22 |     // build_cwbvh. They are very straightforward. If you don't want to use Triangles as the
23 |     // primitive, use  build_cwbvh instead. build_cwbvh_from_tris just adds support for
24 |     // splitting tris.
25 |     let bvh = build_cwbvh_from_tris(
26 |         &tris,
27 |         BvhBuildParams::medium_build(),
28 |         &mut Duration::default(),
29 |     );
30 | 
31 |     // Create a new ray
32 |     let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0));
33 | 
34 |     // Traverse the BVH, finding the closest hit.
35 |     let mut ray_hit = RayHit::none();
36 |     if bvh.ray_traverse(ray, &mut ray_hit, |ray, id| {
37 |         // Use primitive_indices to look up the original primitive id.
38 |         // (Could reorder tris per bvh.primitive_indices to avoid this lookup, see
39 |         // cornell_box_cwbvh example)
40 |         tris[bvh.primitive_indices[id] as usize].intersect(ray)
41 |     }) {
42 |         println!(
43 |             "Hit Triangle {}",
44 |             bvh.primitive_indices[ray_hit.primitive_id as usize]
45 |         );
46 |         println!("Distance to hit: {}", ray_hit.t);
47 |     } else {
48 |         println!("Miss");
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/examples/cornell_box_cwbvh.rs:
--------------------------------------------------------------------------------
  1 | use std::{f32::consts::PI, time::Duration};
  2 | 
  3 | use glam::*;
  4 | use image::{ImageBuffer, Rgba};
  5 | use obvhs::{
  6 |     cwbvh::builder::build_cwbvh_from_tris,
  7 |     ray::{Ray, RayHit},
  8 |     test_util::geometry::{CUBE, PLANE},
  9 |     triangle::Triangle,
 10 |     BvhBuildParams, Transformable,
 11 | };
 12 | 
 13 | // Generate triangles for cornell box
 14 | fn generate_cornell_box() -> Vec<Triangle> {
 15 |     let floor = PLANE;
 16 |     let mut box1 = CUBE;
 17 |     let mut box2 = box1.clone();
 18 |     let mut ceiling = floor.clone();
 19 |     let mut wall1 = floor.clone();
 20 |     let mut wall2 = floor.clone();
 21 |     let mut wall3 = floor.clone();
 22 |     box1.transform(&Mat4::from_scale_rotation_translation(
 23 |         Vec3::splat(0.3),
 24 |         Quat::from_rotation_y(-17.5f32.to_radians()),
 25 |         vec3(0.33, 0.3, 0.37),
 26 |     ));
 27 |     box2.transform(&Mat4::from_scale_rotation_translation(
 28 |         vec3(0.3, 0.6, 0.3),
 29 |         Quat::from_rotation_y(17.5f32.to_radians()),
 30 |         vec3(-0.33, 0.6, -0.29),
 31 |     ));
 32 |     ceiling.transform(&Mat4::from_translation(Vec3::Y * 2.0));
 33 |     wall1.transform(&Mat4::from_rotation_translation(
 34 |         Quat::from_rotation_x(PI * 0.5),
 35 |         vec3(0.0, 1.0, -1.0),
 36 |     ));
 37 |     wall2.transform(&Mat4::from_rotation_translation(
 38 |         Quat::from_rotation_z(-PI * 0.5),
 39 |         vec3(-1.0, 1.0, 0.0),
 40 |     ));
 41 |     wall3.transform(&Mat4::from_rotation_translation(
 42 |         Quat::from_rotation_z(-PI * 0.5),
 43 |         vec3(1.0, 1.0, 0.0),
 44 |     ));
 45 |     let mut tris = Vec::new();
 46 |     tris.extend(floor);
 47 |     tris.extend(box1);
 48 |     tris.extend(box2);
 49 |     tris.extend(ceiling);
 50 |     tris.extend(wall1);
 51 |     tris.extend(wall2);
 52 |     tris.extend(wall3);
 53 |     tris
 54 | }
 55 | 
 56 | fn main() {
 57 |     let tris = generate_cornell_box();
 58 |     // Build cwbvh (Change this to build_bvh2_from_tris to try with Bvh2)
 59 |     let bvh = build_cwbvh_from_tris(
 60 |         &tris,
 61 |         BvhBuildParams::medium_build(),
 62 |         &mut Duration::default(),
 63 |     );
 64 | 
 65 |     // The reason for this mapping below is that if multiple primitives are contained in a cwbvh node, they need to have their indices layed out contiguously.
 66 |     // If we want to avoid this indirection during traversal there are two options:
 67 |     // 1. Layout the primitives in the order of the cwbvh's indices mapping so that this can index directly into the primitive list.
 68 |     // 2. Only allow one primitive per node and write back the original mapping to the bvh node list.
 69 |     let bvh_tris = bvh
 70 |         .primitive_indices
 71 |         .iter()
 72 |         .map(|i| tris[*i as usize])
 73 |         .collect::<Vec<Triangle>>();
 74 | 
 75 |     // Setup render target and camera
 76 |     let width = 1280;
 77 |     let height = 720;
 78 |     let target_size = Vec2::new(width as f32, height as f32);
 79 |     let fov = 90.0f32;
 80 |     let eye = vec3a(0.0, 1.0, 2.1);
 81 |     let look_at = vec3(0.0, 1.0, 0.0);
 82 | 
 83 |     // Compute camera projection & view matrices
 84 |     let aspect_ratio = target_size.x / target_size.y;
 85 |     let proj_inv =
 86 |         Mat4::perspective_infinite_reverse_rh(fov.to_radians(), aspect_ratio, 0.01).inverse();
 87 |     let view_inv = Mat4::look_at_rh(eye.into(), look_at, Vec3::Y).inverse();
 88 | 
 89 |     // Init image buffer
 90 |     let mut img: ImageBuffer<Rgba<u8>, Vec<u8>> = ImageBuffer::new(width, height);
 91 |     let pixels = img.as_mut();
 92 | 
 93 |     // For each pixel trace ray into scene and write normal as color to image buffer
 94 |     pixels.chunks_mut(4).enumerate().for_each(|(i, chunk)| {
 95 |         let frag_coord = uvec2(i as u32 % width, i as u32 / width);
 96 |         let mut screen_uv = frag_coord.as_vec2() / target_size;
 97 |         screen_uv.y = 1.0 - screen_uv.y;
 98 |         let ndc = screen_uv * 2.0 - Vec2::ONE;
 99 |         let clip_pos = vec4(ndc.x, ndc.y, 1.0, 1.0);
100 | 
101 |         let mut vs_pos = proj_inv * clip_pos;
102 |         vs_pos /= vs_pos.w;
103 |         let direction = (Vec3A::from((view_inv * vs_pos).xyz()) - eye).normalize();
104 |         let ray = Ray::new(eye, direction, 0.0, f32::MAX);
105 | 
106 |         let mut hit = RayHit::none();
107 |         if bvh.ray_traverse(ray, &mut hit, |ray, id| bvh_tris[id].intersect(ray)) {
108 |             let mut normal = bvh_tris[hit.primitive_id as usize].compute_normal();
109 |             normal *= normal.dot(-ray.direction).signum(); // Double sided
110 |             let c = (normal * 255.0).as_uvec3();
111 |             chunk.copy_from_slice(&[c.x as u8, c.y as u8, c.z as u8, 255]);
112 |         }
113 |     });
114 | 
115 |     img.save("basic_cornell_box_rend.png")
116 |         .expect("Failed to save image");
117 | }
118 | 


--------------------------------------------------------------------------------
/examples/demoscene.rs:
--------------------------------------------------------------------------------
  1 | // For fun, not pbr
  2 | // Run with `--release --features parallel` unless you like waiting around for a very long time.
  3 | use glam::*;
  4 | use image::{ImageBuffer, Rgba};
  5 | use obvhs::{
  6 |     cwbvh::builder::build_cwbvh_from_tris,
  7 |     ray::{Ray, RayHit},
  8 |     rt_triangle::RtTriangle,
  9 |     test_util::{
 10 |         geometry::demoscene,
 11 |         sampling::{
 12 |             build_orthonormal_basis, cosine_sample_hemisphere, hash_noise,
 13 |             somewhat_boring_display_transform, uniform_sample_cone, uniform_sample_sphere,
 14 |         },
 15 |     },
 16 |     timeit, BvhBuildParams,
 17 | };
 18 | use std::{io::Write, time::Duration};
 19 | pub const SUN_ANGULAR_DIAMETER: f32 = 0.00933;
 20 | 
 21 | #[cfg(feature = "parallel")]
 22 | use rayon::iter::{IntoParallelIterator, ParallelIterator};
 23 | use sky::Sky;
 24 | 
 25 | fn main() {
 26 |     let total_aa_samples = 64;
 27 |     let resolution = 2560;
 28 |     let seed = 57;
 29 | 
 30 |     timeit!["generate height map",
 31 |     let tris = demoscene(resolution as usize, seed * 10);
 32 |     ];
 33 |     println!("{} triangles, {} AA samples", tris.len(), total_aa_samples);
 34 |     timeit!["generate bvh",
 35 |     let bvh = build_cwbvh_from_tris(&tris, BvhBuildParams::very_fast_build(), &mut Duration::default());
 36 |     ];
 37 | 
 38 |     let bvh_tris = bvh
 39 |         .primitive_indices
 40 |         .iter()
 41 |         .map(|i| (&tris[*i as usize]).into())
 42 |         .collect::<Vec<RtTriangle>>();
 43 | 
 44 |     let intersection_fn = |ray: &Ray, id: usize| bvh_tris[id].intersect(ray);
 45 | 
 46 |     // Setup render target and camera
 47 |     let width = resolution;
 48 |     let height = ((resolution as f32) * 0.3711) as u32;
 49 |     let target_size = Vec2::new(width as f32, height as f32);
 50 |     let fov = 17.0f32;
 51 |     let eye = vec3a(0.0, 0.0, 1.35);
 52 |     let look_at = eye + vec3a(0.0, 0.16, -1.0);
 53 |     let sun_direction = vec3a(0.35, -0.1, 0.19).normalize();
 54 |     let sky = Sky::red_sunset(-sun_direction);
 55 |     let sky_bg = Sky::red_sunset(-vec3a(0.35, -0.1, 0.5).normalize()); // To extend the sun glow a bit in the BG
 56 |     let nee = 1.0 - SUN_ANGULAR_DIAMETER.cos();
 57 |     let material_color = vec3a(0.61, 0.59, 0.52).powf(2.2);
 58 |     let exposure = -3.6;
 59 | 
 60 |     // Compute camera projection & view matrices
 61 |     let aspect_ratio = target_size.x / target_size.y;
 62 |     let proj_inv =
 63 |         Mat4::perspective_infinite_reverse_rh(fov.to_radians(), aspect_ratio, 0.01).inverse();
 64 |     let view = Mat4::look_at_rh(eye.into(), look_at.into(), Vec3::Y);
 65 |     let view_inv = view.inverse();
 66 | 
 67 |     let mut fragments = vec![Vec3A::ZERO; (width * height) as usize];
 68 | 
 69 |     println!("|{}|", " ".repeat(total_aa_samples as usize));
 70 |     print!(" ");
 71 |     timeit![
 72 |         "render",
 73 |         for aa_sample in 0..total_aa_samples {
 74 |             print!("."); // Print progress
 75 |             std::io::stdout().flush().unwrap();
 76 |             let new_fragments: Vec<Vec3A>;
 77 |             #[cfg(feature = "parallel")]
 78 |             let iter = (0..width * height).into_par_iter();
 79 |             #[cfg(not(feature = "parallel"))]
 80 |             let iter = (0..width * height).into_iter();
 81 |             new_fragments = iter
 82 |                 .map(|i| {
 83 |                     let frag_coord = uvec2(i as u32 % width, i as u32 / width);
 84 |                     let misc_grain_noise = hash_noise(frag_coord, aa_sample + 12345);
 85 |                     let aa = vec2(
 86 |                         hash_noise(frag_coord, aa_sample),
 87 |                         hash_noise(frag_coord, aa_sample + 512),
 88 |                     ) * 0.5
 89 |                         - 0.25;
 90 |                     let mut screen_uv = (frag_coord.as_vec2() + aa) / target_size;
 91 |                     screen_uv.y = 1.0 - screen_uv.y;
 92 |                     let ndc = screen_uv * 2.0 - Vec2::ONE;
 93 |                     let clip_pos = vec4(ndc.x, ndc.y, 1.0, 1.0);
 94 | 
 95 |                     let mut vs = proj_inv * clip_pos;
 96 |                     vs /= vs.w;
 97 |                     let direction = (Vec3A::from((view_inv * vs).xyz()) - eye).normalize();
 98 | 
 99 |                     let fuzz = vec3a(
100 |                         hash_noise(frag_coord, aa_sample),
101 |                         hash_noise(frag_coord, aa_sample + 512),
102 |                         hash_noise(frag_coord, aa_sample + 1024),
103 |                     );
104 |                     let fuzzy_cube_of_sensor = eye + (fuzz * 2.0 - 1.0) * 0.002;
105 | 
106 |                     let focal_distance = 2.4;
107 |                     let focal_point = eye + direction * focal_distance;
108 |                     let cam_dir = (focal_point - fuzzy_cube_of_sensor).normalize_or_zero();
109 |                     let ray = Ray::new_inf(fuzzy_cube_of_sensor, cam_dir);
110 | 
111 |                     let mut color = Vec3A::ZERO;
112 | 
113 |                     let fog_dir = uniform_sample_sphere(vec2(
114 |                         hash_noise(frag_coord, aa_sample + 2048),
115 |                         hash_noise(frag_coord, aa_sample + 3840),
116 |                     ));
117 |                     let mut hit = RayHit::none();
118 |                     let fogc = sky.render(fog_dir).min(Vec3A::splat(100.0));
119 |                     let skyc = sky.render(ray.direction);
120 |                     let sunc = sky.render(-sun_direction);
121 |                     let mut state = bvh.new_ray_traversal(ray);
122 |                     while bvh.ray_traverse_dynamic(&mut state, &mut hit, intersection_fn) {}
123 |                     if hit.t < f32::MAX {
124 |                         let mut normal = bvh_tris[hit.primitive_id as usize].compute_normal();
125 |                         normal *= normal.dot(-ray.direction).signum(); // Double sided
126 | 
127 |                         let hit_p = ray.origin + ray.direction * hit.t - ray.direction * 0.01;
128 | 
129 |                         let tangent_to_world = build_orthonormal_basis(normal);
130 |                         let mut ao_ray_dir = cosine_sample_hemisphere(vec2(
131 |                             hash_noise(frag_coord, aa_sample),
132 |                             hash_noise(frag_coord, aa_sample + 1024),
133 |                         ));
134 |                         ao_ray_dir = (tangent_to_world * ao_ray_dir).normalize();
135 | 
136 |                         let diff_ray = Ray::new_inf(hit_p, ao_ray_dir);
137 |                         let mut diff_hit = RayHit::none();
138 |                         state.reinit(diff_ray);
139 |                         while bvh.ray_traverse_dynamic(&mut state, &mut diff_hit, intersection_fn) {}
140 |                         if diff_hit.t < f32::MAX {
141 |                             let mut diff_hit_normal =
142 |                                 bvh_tris[diff_hit.primitive_id as usize].compute_normal();
143 |                             diff_hit_normal *= diff_hit_normal.dot(-ray.direction).signum(); // Double sided
144 | 
145 |                             // Silly 1st bounce sun shadow ray
146 |                             let ao_hit_p = hit_p + diff_ray.direction * diff_hit.t - diff_ray.direction * 0.01;
147 |                             let sun_ray = Ray::new_inf(ao_hit_p, -sun_direction);
148 |                             let mut sun_hit = RayHit::none();
149 |                             // anyhit
150 | 
151 |                             state.reinit(sun_ray);
152 |                             if !bvh.ray_traverse_dynamic(&mut state, &mut sun_hit, intersection_fn) {
153 |                                 // xD
154 |                                 color += material_color * material_color * nee * sunc * 4.0;
155 |                             }
156 |                         } else {
157 |                             let fresnel = (1.0 - normal.dot(-cam_dir)).powf(8.0).max(0.0);
158 |                             let skyc = sky
159 |                                 .render(diff_ray.direction)
160 |                                 // Sun results in fireflies. Clamp to avoid randomly sampling super high values.
161 |                                 .min(Vec3A::splat(100.0));
162 |                             color += material_color * (fresnel * skyc * 0.5 + skyc);
163 |                         }
164 | 
165 |                         // Sun shadow ray
166 |                         let sun_rnd = vec2(
167 |                             hash_noise(frag_coord, aa_sample + 10000),
168 |                             hash_noise(frag_coord, aa_sample + 20000),
169 |                         );
170 |                         let sun_basis = build_orthonormal_basis(sun_direction);
171 |                         let sun_dir = (sun_basis
172 |                             * uniform_sample_cone(sun_rnd, (SUN_ANGULAR_DIAMETER * 0.5).cos()))
173 |                         .normalize_or_zero();
174 | 
175 |                         let mut sun_hit = RayHit::none();
176 |                         let sun_ray = Ray::new_inf(hit_p, -sun_dir);
177 | 
178 |                         state.reinit(sun_ray); // anyhit
179 |                         if !bvh.ray_traverse_dynamic(&mut state, &mut sun_hit, intersection_fn) {
180 |                             color += material_color
181 |                                 * nee
182 |                                 * normal.dot(-sun_dir).max(0.00001)
183 |                                 * sunc
184 |                                 * 10.0
185 |                                 * misc_grain_noise;
186 |                         }
187 | 
188 |                         // Fog shadow ray
189 |                         let fog_t = hit.t * hash_noise(frag_coord, aa_sample + 54321);
190 |                         let fog_p = ray.origin + ray.direction * fog_t;
191 |                         let sun_ray = Ray::new_inf(fog_p, -sun_direction);
192 |                         let mut sun_hit = RayHit::none();
193 | 
194 |                         state.reinit(sun_ray); // anyhit
195 |                         if !bvh.ray_traverse_dynamic(&mut state, &mut sun_hit, intersection_fn) {
196 |                             color += nee * sunc * fog_t * 0.2;
197 |                         }
198 |                         state.reinit(Ray::new_inf(fog_p, fog_dir)); // anyhit
199 |                         if !bvh.ray_traverse_dynamic(&mut state, &mut RayHit::none(), intersection_fn) {
200 |                             color += fog_t * 0.2 * fogc;
201 |                         }
202 |                     } else {
203 |                         let sky_bgc = sky_bg.render(ray.direction) * 0.4 + skyc * 0.6;
204 |                         color += sky_bgc * 0.4 + sky_bgc * misc_grain_noise * 0.6;
205 |                         color += 0.2 * fogc;
206 |                     }
207 |                     color
208 |                 })
209 |                 .collect::<Vec<_>>();
210 |             new_fragments
211 |                 .iter()
212 |                 .zip(fragments.iter_mut())
213 |                 .for_each(|(new, col)| *col += *new);
214 |         }
215 |         println!("");
216 |     ];
217 | 
218 |     let mut img: ImageBuffer<Rgba<u8>, Vec<u8>> = ImageBuffer::new(width, height);
219 |     let pixels = img.as_mut();
220 |     pixels.chunks_mut(4).enumerate().for_each(|(i, chunk)| {
221 |         let mut col = (fragments[i] / total_aa_samples as f32).max(Vec3A::ZERO);
222 |         col *= Vec3A::splat(2.0).powf(exposure);
223 |         col = somewhat_boring_display_transform(col);
224 |         col = col.powf(1.7); // contrast
225 |         let luma = Vec3A::splat(col.dot(vec3a(0.2126, 0.7152, 0.0722)));
226 |         col = luma * -0.1 + col * 1.1; // saturation
227 |         let c = (col.clamp(Vec3A::ZERO, Vec3A::ONE) * 255.0).as_uvec3();
228 |         chunk.copy_from_slice(&[c.x as u8, c.y as u8, c.z as u8, 255]);
229 |     });
230 | 
231 |     img.save(format!("demoscene_{}_rend.png", seed))
232 |         .expect("Failed to save image");
233 | }
234 | 
235 | mod sky {
236 |     use std::f32::consts::PI;
237 | 
238 |     use glam::{vec3a, Vec3A};
239 | 
240 |     use obvhs::test_util::sampling::smoothstep;
241 | 
242 |     use crate::SUN_ANGULAR_DIAMETER;
243 | 
244 |     // Based on https://github.com/Tw1ddle/Sky-Shader/
245 |     pub struct Sky {
246 |         pub depolarization_factor: f32,
247 |         pub mie_coefficient: f32,
248 |         pub mie_directional_g: f32,
249 |         pub mie_k_coefficient: Vec3A,
250 |         pub mie_v: f32,
251 |         pub mie_zenith_length: f32,
252 |         pub num_molecules: f32,
253 |         pub primaries: Vec3A,
254 |         pub rayleigh: f32,
255 |         pub rayleigh_zenith_length: f32,
256 |         pub refractive_index: f32,
257 |         pub sun_angular_diameter: f32,
258 |         pub sun_intensity_factor: f32,
259 |         pub sun_intensity_falloff_steepness: f32,
260 |         pub turbidity: f32,
261 |         pub sun_position: Vec3A,
262 |     }
263 | 
264 |     impl Sky {
265 |         pub fn red_sunset(sun_position: Vec3A) -> Sky {
266 |             Sky {
267 |                 depolarization_factor: 0.02,
268 |                 mie_coefficient: 0.005,
269 |                 mie_directional_g: 0.82,
270 |                 mie_k_coefficient: vec3a(0.686, 0.678, 0.666),
271 |                 mie_v: 3.936,
272 |                 mie_zenith_length: 34000.0,
273 |                 num_molecules: 2.542e25,
274 |                 primaries: vec3a(6.8e-7f32, 5.5e-7f32, 4.5e-7f32),
275 |                 rayleigh: 2.28,
276 |                 rayleigh_zenith_length: 8400.0,
277 |                 refractive_index: 1.00029,
278 |                 sun_angular_diameter: SUN_ANGULAR_DIAMETER,
279 |                 sun_intensity_factor: 1000.0,
280 |                 sun_intensity_falloff_steepness: 1.1,
281 |                 turbidity: 4.7,
282 |                 sun_position,
283 |             }
284 |         }
285 | 
286 |         pub fn render(&self, dir: Vec3A) -> Vec3A {
287 |             let sunfade = 1.0 - (1.0 - (self.sun_position.y / 450000.0).exp()).clamp(0.0, 1.0);
288 |             let rayleigh_coefficient = self.rayleigh - (1.0 * (1.0 - sunfade));
289 |             let beta_r = self.total_rayleigh(self.primaries) * rayleigh_coefficient;
290 | 
291 |             let beta_m = self.total_mie(self.primaries) * self.mie_coefficient;
292 | 
293 |             let zenith_angle = (0.0f32.max(Vec3A::Y.dot(dir))).acos();
294 |             let denom =
295 |                 zenith_angle.cos() + 0.15 * (93.885 - ((zenith_angle * 180.0) / PI)).powf(-1.253);
296 |             let s_r = self.rayleigh_zenith_length / denom;
297 |             let s_m = self.mie_zenith_length / denom;
298 | 
299 |             let fex = (-(beta_r * s_r + beta_m * s_m)).exp();
300 | 
301 |             let sun_direction = self.sun_position.normalize();
302 |             let cos_theta = dir.dot(sun_direction);
303 |             let beta_r_theta = beta_r * Self::rayleigh_phase(cos_theta * 0.5 + 0.5);
304 |             let beta_m_theta =
305 |                 beta_m * Self::henyey_greenstein_phase(cos_theta, self.mie_directional_g);
306 | 
307 |             let sun_e = self.sun_intensity(sun_direction.dot(Vec3A::Y));
308 |             let mut lin =
309 |                 (sun_e * ((beta_r_theta + beta_m_theta) / (beta_r + beta_m)) * (1.0 - fex))
310 |                     .powf(1.5);
311 |             lin *= Vec3A::splat(1.0).lerp(
312 |                 (sun_e * ((beta_r_theta + beta_m_theta) / (beta_r + beta_m)) * fex).powf(0.5),
313 |                 (1.0 - Vec3A::Y.dot(sun_direction))
314 |                     .powf(5.0)
315 |                     .clamp(0.0, 1.0),
316 |             );
317 | 
318 |             let sun_angular_diameter_cos = (self.sun_angular_diameter).cos();
319 |             let sundisk = smoothstep(
320 |                 sun_angular_diameter_cos,
321 |                 sun_angular_diameter_cos, // + 0.00002
322 |                 cos_theta,
323 |             );
324 |             let mut l0 = Vec3A::splat(0.1) * fex;
325 |             l0 += sun_e * 19000.0 * fex * sundisk;
326 |             let mut color = (lin + l0) * 0.04;
327 |             let low_falloff = (Vec3A::Y.dot(dir) + 0.4).powf(5.0).max(0.0);
328 |             color = (color * 0.1).powf(3.0) * low_falloff;
329 |             color.powf(1.0 / (1.2 + (1.2 * sunfade))) * 0.5
330 |         }
331 | 
332 |         fn total_rayleigh(&self, lambda: Vec3A) -> Vec3A {
333 |             (8.0 * PI.powi(3)
334 |                 * (self.refractive_index.powi(2) - 1.0).powi(2)
335 |                 * (6.0 + 3.0 * self.depolarization_factor))
336 |                 / (3.0
337 |                     * self.num_molecules
338 |                     * lambda.powf(4.0)
339 |                     * (6.0 - 7.0 * self.depolarization_factor))
340 |         }
341 | 
342 |         fn total_mie(&self, lambda: Vec3A) -> Vec3A {
343 |             let c = 0.2 * self.turbidity * 10e-18;
344 |             0.434 * c * PI * (2.0 * PI / lambda).powf(self.mie_v - 2.0) * self.mie_k_coefficient
345 |         }
346 | 
347 |         fn rayleigh_phase(cos_theta: f32) -> f32 {
348 |             (3.0 / (16.0 * PI)) * (1.0 + cos_theta.powi(2))
349 |         }
350 | 
351 |         fn henyey_greenstein_phase(cos_theta: f32, g: f32) -> f32 {
352 |             (1.0 / (4.0 * PI))
353 |                 * ((1.0 - g.powi(2)) / (1.0 - 2.0 * g * cos_theta + g.powi(2)).powf(1.5))
354 |         }
355 | 
356 |         fn sun_intensity(&self, zenith_angle_cos: f32) -> f32 {
357 |             let cutoff_angle = PI / 1.95;
358 |             self.sun_intensity_factor
359 |                 * 0.0f32.max(
360 |                     1.0 - (-((cutoff_angle - zenith_angle_cos.acos()).exp()
361 |                         / self.sun_intensity_falloff_steepness)),
362 |                 )
363 |         }
364 |     }
365 | }
366 | 


--------------------------------------------------------------------------------
/src/aabb.rs:
--------------------------------------------------------------------------------
  1 | //! An Axis-Aligned Bounding Box (AABB) represented by its minimum and maximum points.
  2 | 
  3 | use std::ops::BitAnd;
  4 | 
  5 | use bytemuck::{Pod, Zeroable};
  6 | use glam::Vec3A;
  7 | 
  8 | use crate::{ray::Ray, Boundable};
  9 | 
 10 | /// An Axis-Aligned Bounding Box (AABB) represented by its minimum and maximum points.
 11 | #[derive(Default, Clone, Copy, Debug, PartialEq)]
 12 | #[repr(C)]
 13 | pub struct Aabb {
 14 |     pub min: Vec3A,
 15 |     pub max: Vec3A,
 16 | }
 17 | 
 18 | unsafe impl Pod for Aabb {}
 19 | unsafe impl Zeroable for Aabb {}
 20 | 
 21 | impl Aabb {
 22 |     /// An invalid (empty) AABB with min set to the maximum possible value
 23 |     /// and max set to the minimum possible value.
 24 |     pub const INVALID: Self = Self {
 25 |         min: Vec3A::splat(f32::MAX),
 26 |         max: Vec3A::splat(f32::MIN),
 27 |     };
 28 | 
 29 |     /// An infinite AABB with min set to negative infinity
 30 |     /// and max set to positive infinity.
 31 |     pub const LARGEST: Self = Self {
 32 |         min: Vec3A::splat(-f32::MAX),
 33 |         max: Vec3A::splat(f32::MAX),
 34 |     };
 35 | 
 36 |     /// An infinite AABB with min set to negative infinity
 37 |     /// and max set to positive infinity.
 38 |     pub const INFINITY: Self = Self {
 39 |         min: Vec3A::splat(-f32::INFINITY),
 40 |         max: Vec3A::splat(f32::INFINITY),
 41 |     };
 42 | 
 43 |     /// Creates a new AABB with the given minimum and maximum points.
 44 |     #[inline]
 45 |     pub fn new(min: Vec3A, max: Vec3A) -> Self {
 46 |         Self { min, max }
 47 |     }
 48 | 
 49 |     /// Creates a new AABB with both min and max set to the given point.
 50 |     #[inline]
 51 |     pub fn from_point(point: Vec3A) -> Self {
 52 |         Self {
 53 |             min: point,
 54 |             max: point,
 55 |         }
 56 |     }
 57 | 
 58 |     /// Creates an AABB that bounds the given set of points.
 59 |     #[inline]
 60 |     pub fn from_points(points: &[Vec3A]) -> Self {
 61 |         let mut points = points.iter();
 62 |         let mut aabb = Aabb::from_point(*points.next().unwrap());
 63 |         for point in points {
 64 |             aabb.extend(*point);
 65 |         }
 66 |         aabb
 67 |     }
 68 | 
 69 |     /// Checks if the AABB contains the given point.
 70 |     #[inline]
 71 |     pub fn contains_point(&self, point: Vec3A) -> bool {
 72 |         (point.cmpge(self.min).bitand(point.cmple(self.max))).all()
 73 |     }
 74 | 
 75 |     /// Extends the AABB to include the given point.
 76 |     #[inline]
 77 |     pub fn extend(&mut self, point: Vec3A) -> &mut Self {
 78 |         *self = self.union(&Self::from_point(point));
 79 |         self
 80 |     }
 81 | 
 82 |     /// Returns the union of this AABB and another AABB.
 83 |     #[inline]
 84 |     #[must_use]
 85 |     pub fn union(&self, other: &Self) -> Self {
 86 |         Aabb {
 87 |             min: self.min.min(other.min),
 88 |             max: self.max.max(other.max),
 89 |         }
 90 |     }
 91 | 
 92 |     /// Returns the intersection of this AABB and another AABB.
 93 |     ///
 94 |     /// The intersection of two AABBs is the overlapping region that is
 95 |     /// common to both AABBs. If the AABBs do not overlap, the resulting
 96 |     /// AABB will have min and max values that do not form a valid box
 97 |     /// (min will not be less than max).
 98 |     #[inline]
 99 |     pub fn intersection(&self, other: &Self) -> Self {
100 |         Aabb {
101 |             min: self.min.max(other.min),
102 |             max: self.max.min(other.max),
103 |         }
104 |     }
105 | 
106 |     /// Returns the diagonal vector of the AABB.
107 |     #[inline]
108 |     pub fn diagonal(&self) -> Vec3A {
109 |         self.max - self.min
110 |     }
111 | 
112 |     /// Returns the center point of the AABB.
113 |     #[inline]
114 |     pub fn center(&self) -> Vec3A {
115 |         (self.max + self.min) * 0.5
116 |     }
117 | 
118 |     /// Returns the center coordinate of the AABB along a specific axis.
119 |     #[inline]
120 |     pub fn center_axis(&self, axis: usize) -> f32 {
121 |         (self.max[axis] + self.min[axis]) * 0.5
122 |     }
123 | 
124 |     /// Returns the index of the largest axis of the AABB.
125 |     #[inline]
126 |     pub fn largest_axis(&self) -> usize {
127 |         let d = self.diagonal();
128 |         if d.x < d.y {
129 |             if d.y < d.z {
130 |                 2
131 |             } else {
132 |                 1
133 |             }
134 |         } else if d.x < d.z {
135 |             2
136 |         } else {
137 |             0
138 |         }
139 |     }
140 | 
141 |     /// Returns the index of the smallest axis of the AABB.
142 |     #[inline]
143 |     pub fn smallest_axis(&self) -> usize {
144 |         let d = self.diagonal();
145 |         if d.x > d.y {
146 |             if d.y > d.z {
147 |                 2
148 |             } else {
149 |                 1
150 |             }
151 |         } else if d.x > d.z {
152 |             2
153 |         } else {
154 |             0
155 |         }
156 |     }
157 | 
158 |     /// Returns half the surface area of the AABB.
159 |     #[inline]
160 |     pub fn half_area(&self) -> f32 {
161 |         let d = self.diagonal();
162 |         (d.x + d.y) * d.z + d.x * d.y
163 |     }
164 | 
165 |     /// Returns the surface area of the AABB.
166 |     #[inline]
167 |     pub fn surface_area(&self) -> f32 {
168 |         let d = self.diagonal();
169 |         2.0 * d.dot(d)
170 |     }
171 | 
172 |     /// Returns an empty AABB.
173 |     #[inline]
174 |     pub fn empty() -> Self {
175 |         Self {
176 |             min: Vec3A::new(f32::MAX, f32::MAX, f32::MAX),
177 |             max: Vec3A::new(f32::MIN, f32::MIN, f32::MIN),
178 |         }
179 |     }
180 | 
181 |     /// Checks if the AABB is valid (i.e., min <= max on all axes).
182 |     pub fn valid(&self) -> bool {
183 |         self.min.cmple(self.max).all()
184 |     }
185 | 
186 |     /// Checks if this AABB intersects with another AABB.
187 |     #[inline]
188 |     pub fn intersect_aabb(&self, other: &Aabb) -> bool {
189 |         (self.min.cmpgt(other.max) | self.max.cmplt(other.min)).bitmask() == 0
190 |     }
191 | 
192 |     /// Checks if this AABB intersects with a ray and returns the distance to the intersection point.
193 |     /// Returns `f32::MAX` if there is no intersection.
194 |     #[inline]
195 |     pub fn intersect_ray(&self, ray: &Ray) -> f32 {
196 |         let t1 = (self.min - ray.origin) * ray.inv_direction;
197 |         let t2 = (self.max - ray.origin) * ray.inv_direction;
198 | 
199 |         let tmin = t1.min(t2);
200 |         let tmax = t1.max(t2);
201 | 
202 |         let tmin_n = tmin.x.max(tmin.y.max(tmin.z));
203 |         let tmax_n = tmax.x.min(tmax.y.min(tmax.z));
204 | 
205 |         if tmax_n >= tmin_n && tmax_n >= 0.0 {
206 |             tmin_n
207 |         } else {
208 |             f32::INFINITY
209 |         }
210 |     }
211 | }
212 | 
213 | impl Boundable for Aabb {
214 |     #[inline]
215 |     fn aabb(&self) -> Aabb {
216 |         *self
217 |     }
218 | }
219 | 
220 | #[cfg(test)]
221 | mod tests {
222 |     use super::*;
223 |     use glam::Vec3A;
224 | 
225 |     #[test]
226 |     fn test_from_point() {
227 |         let point = Vec3A::ONE;
228 |         let aabb = Aabb::from_point(point);
229 |         assert_eq!(aabb.min, point);
230 |         assert_eq!(aabb.max, point);
231 |     }
232 | 
233 |     #[test]
234 |     fn test_from_points() {
235 |         let points = vec![Vec3A::ZERO, Vec3A::ONE, Vec3A::splat(2.0)];
236 |         let aabb = Aabb::from_points(&points);
237 |         assert_eq!(aabb.min, Vec3A::ZERO);
238 |         assert_eq!(aabb.max, Vec3A::splat(2.0));
239 |     }
240 | 
241 |     #[test]
242 |     fn test_contains_point() {
243 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
244 |         assert!(aabb.contains_point(Vec3A::splat(0.5)));
245 |         assert!(!aabb.contains_point(Vec3A::splat(1.5)));
246 |     }
247 | 
248 |     #[test]
249 |     fn test_extend() {
250 |         let mut aabb = Aabb::from_point(Vec3A::ZERO);
251 |         aabb.extend(Vec3A::ONE);
252 |         assert_eq!(aabb.min, Vec3A::ZERO);
253 |         assert_eq!(aabb.max, Vec3A::ONE);
254 |     }
255 | 
256 |     #[test]
257 |     fn test_union() {
258 |         let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
259 |         let aabb2 = Aabb::new(Vec3A::splat(0.5), Vec3A::splat(1.5));
260 |         let union = aabb1.union(&aabb2);
261 |         assert_eq!(union.min, Vec3A::ZERO);
262 |         assert_eq!(union.max, Vec3A::splat(1.5));
263 |     }
264 | 
265 |     #[test]
266 |     fn test_intersection() {
267 |         let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
268 |         let aabb2 = Aabb::new(Vec3A::splat(0.5), Vec3A::splat(1.5));
269 |         let intersection = aabb1.intersection(&aabb2);
270 |         assert_eq!(intersection.min, Vec3A::splat(0.5));
271 |         assert_eq!(intersection.max, Vec3A::ONE);
272 |         assert!(intersection.valid());
273 |     }
274 | 
275 |     #[test]
276 |     fn test_intersection_no_overlap() {
277 |         let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
278 |         let aabb2 = Aabb::new(Vec3A::splat(2.0), Vec3A::splat(3.0));
279 |         let intersection = aabb1.intersection(&aabb2);
280 |         assert_eq!(intersection.min, Vec3A::splat(2.0));
281 |         assert_eq!(intersection.max, Vec3A::ONE);
282 |         assert!(!intersection.valid());
283 |     }
284 | 
285 |     #[test]
286 |     fn test_diagonal() {
287 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
288 |         assert_eq!(aabb.diagonal(), Vec3A::ONE);
289 |     }
290 | 
291 |     #[test]
292 |     fn test_center() {
293 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
294 |         assert_eq!(aabb.center(), Vec3A::splat(0.5));
295 |     }
296 | 
297 |     #[test]
298 |     fn test_center_axis() {
299 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
300 |         assert_eq!(aabb.center_axis(0), 0.5);
301 |         assert_eq!(aabb.center_axis(1), 0.5);
302 |         assert_eq!(aabb.center_axis(2), 0.5);
303 |     }
304 | 
305 |     #[test]
306 |     fn test_largest_axis() {
307 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::new(1.0, 2.0, 3.0));
308 |         assert_eq!(aabb.largest_axis(), 2);
309 |     }
310 | 
311 |     #[test]
312 |     fn test_smallest_axis() {
313 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::new(1.0, 2.0, 3.0));
314 |         assert_eq!(aabb.smallest_axis(), 0);
315 |     }
316 | 
317 |     #[test]
318 |     fn test_half_area() {
319 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
320 |         assert_eq!(aabb.half_area(), 3.0);
321 |     }
322 | 
323 |     #[test]
324 |     fn test_surface_area() {
325 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
326 |         assert_eq!(aabb.surface_area(), 6.0);
327 |     }
328 | 
329 |     #[test]
330 |     fn test_empty() {
331 |         let aabb = Aabb::empty();
332 |         assert_eq!(aabb.min, Vec3A::new(f32::MAX, f32::MAX, f32::MAX));
333 |         assert_eq!(aabb.max, Vec3A::new(f32::MIN, f32::MIN, f32::MIN));
334 |     }
335 | 
336 |     #[test]
337 |     fn test_valid() {
338 |         let valid_aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
339 |         assert!(valid_aabb.valid());
340 | 
341 |         let invalid_aabb = Aabb::new(Vec3A::splat(2.0), Vec3A::splat(1.0));
342 |         assert!(!invalid_aabb.valid());
343 |     }
344 | 
345 |     #[test]
346 |     fn test_intersect_aabb() {
347 |         let aabb1 = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
348 |         let aabb2 = Aabb::new(Vec3A::splat(0.5), Vec3A::splat(1.5));
349 |         assert!(aabb1.intersect_aabb(&aabb2));
350 |         let aabb3 = Aabb::new(Vec3A::splat(1.5), Vec3A::splat(2.5));
351 |         assert!(!aabb1.intersect_aabb(&aabb3));
352 |     }
353 | 
354 |     #[test]
355 |     fn test_intersect_ray() {
356 |         let aabb = Aabb::new(Vec3A::ZERO, Vec3A::ONE);
357 |         let ray = Ray::new(Vec3A::splat(-1.0), Vec3A::ONE, 0.0, f32::MAX);
358 |         assert_eq!(aabb.intersect_ray(&ray), 1.0);
359 |         let ray_no_intersect = Ray::new(Vec3A::splat(2.0), Vec3A::ONE, 0.0, f32::MAX);
360 |         assert_eq!(aabb.intersect_ray(&ray_no_intersect), f32::INFINITY);
361 |     }
362 | }
363 | 


--------------------------------------------------------------------------------
/src/bvh2/builder.rs:
--------------------------------------------------------------------------------
  1 | use std::time::{Duration, Instant};
  2 | 
  3 | use crate::{
  4 |     aabb::Aabb, splits::split_aabbs_preset, triangle::Triangle, Boundable, BvhBuildParams,
  5 | };
  6 | 
  7 | use super::{leaf_collapser::collapse, reinsertion::ReinsertionOptimizer, Bvh2};
  8 | 
  9 | /// Build a bvh2 from the given list of Triangles.
 10 | /// Just a helper function / example, feel free to reimplement for your specific use case.
 11 | ///
 12 | /// # Arguments
 13 | /// * `triangles` - A list of Triangles.
 14 | /// * `config` - Parameters for configuring the BVH building.
 15 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB
 16 | /// generation or debug validation. This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing)
 17 | pub fn build_bvh2_from_tris(
 18 |     triangles: &[Triangle],
 19 |     config: BvhBuildParams,
 20 |     core_build_time: &mut Duration,
 21 | ) -> Bvh2 {
 22 |     let mut aabbs = Vec::with_capacity(triangles.len());
 23 |     let mut indices = Vec::with_capacity(triangles.len());
 24 |     let mut largest_half_area = 0.0;
 25 |     let mut avg_area = 0.0;
 26 | 
 27 |     for (i, tri) in triangles.iter().enumerate() {
 28 |         let a = tri.v0;
 29 |         let b = tri.v1;
 30 |         let c = tri.v2;
 31 |         let mut aabb = Aabb::empty();
 32 |         aabb.extend(a).extend(b).extend(c);
 33 |         let half_area = aabb.half_area();
 34 |         largest_half_area = half_area.max(largest_half_area);
 35 |         avg_area += half_area;
 36 |         aabbs.push(aabb);
 37 |         indices.push(i as u32);
 38 |     }
 39 |     avg_area /= triangles.len() as f32;
 40 | 
 41 |     let start_time = Instant::now();
 42 | 
 43 |     if config.pre_split {
 44 |         split_aabbs_preset(
 45 |             &mut aabbs,
 46 |             &mut indices,
 47 |             triangles,
 48 |             avg_area,
 49 |             largest_half_area,
 50 |         );
 51 |     }
 52 | 
 53 |     let mut bvh2 = config.ploc_search_distance.build(
 54 |         &aabbs,
 55 |         indices,
 56 |         config.sort_precision,
 57 |         config.search_depth_threshold,
 58 |     );
 59 |     ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None);
 60 |     collapse(
 61 |         &mut bvh2,
 62 |         config.max_prims_per_leaf,
 63 |         config.collapse_traversal_cost,
 64 |     );
 65 |     ReinsertionOptimizer::run(
 66 |         &mut bvh2,
 67 |         config.reinsertion_batch_ratio * config.post_collapse_reinsertion_batch_ratio_multiplier,
 68 |         None,
 69 |     );
 70 | 
 71 |     *core_build_time += start_time.elapsed();
 72 | 
 73 |     #[cfg(debug_assertions)]
 74 |     {
 75 |         bvh2.validate(triangles, false, config.pre_split);
 76 |     }
 77 | 
 78 |     bvh2
 79 | }
 80 | 
 81 | /// Build a bvh2 from the given list of Boundable primitives.
 82 | /// `pre_split` in BvhBuildParams is ignored in this case.
 83 | /// Just a helper function / example, feel free to reimplement for your specific use case.
 84 | ///
 85 | /// # Arguments
 86 | /// * `primitives` - A list of Primitives that implement Boundable.
 87 | /// * `config` - Parameters for configuring the BVH building.
 88 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB
 89 | /// generation or debug validation. This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing)
 90 | // TODO: we could optionally do imprecise basic Aabb splits.
 91 | pub fn build_bvh2<T: Boundable>(
 92 |     primitives: &[T],
 93 |     config: BvhBuildParams,
 94 |     core_build_time: &mut Duration,
 95 | ) -> Bvh2 {
 96 |     let mut aabbs = Vec::with_capacity(primitives.len());
 97 |     let mut indices = Vec::with_capacity(primitives.len());
 98 | 
 99 |     for (i, primitive) in primitives.iter().enumerate() {
100 |         indices.push(i as u32);
101 |         aabbs.push(primitive.aabb());
102 |     }
103 | 
104 |     let start_time = Instant::now();
105 | 
106 |     let mut bvh2 = config.ploc_search_distance.build(
107 |         &aabbs,
108 |         indices,
109 |         config.sort_precision,
110 |         config.search_depth_threshold,
111 |     );
112 |     ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None);
113 |     collapse(
114 |         &mut bvh2,
115 |         config.max_prims_per_leaf,
116 |         config.collapse_traversal_cost,
117 |     );
118 |     ReinsertionOptimizer::run(
119 |         &mut bvh2,
120 |         config.reinsertion_batch_ratio * config.post_collapse_reinsertion_batch_ratio_multiplier,
121 |         None,
122 |     );
123 | 
124 |     *core_build_time += start_time.elapsed();
125 | 
126 |     #[cfg(debug_assertions)]
127 |     {
128 |         bvh2.validate(primitives, false, config.pre_split);
129 |     }
130 | 
131 |     bvh2
132 | }
133 | 


--------------------------------------------------------------------------------
/src/bvh2/leaf_collapser.rs:
--------------------------------------------------------------------------------
  1 | // Based on https://github.com/madmann91/bvh/blob/2fd0db62022993963a7343669275647cb073e19a/include/bvh/leaf_collapser.hpp
  2 | #[cfg(feature = "parallel")]
  3 | use rayon::iter::{IntoParallelIterator, ParallelIterator};
  4 | #[cfg(feature = "parallel")]
  5 | use std::sync::atomic::{AtomicU32, Ordering};
  6 | 
  7 | use crate::bvh2::{Bvh2, Bvh2Node};
  8 | 
  9 | /// Collapses leaves of the BVH according to the SAH. This optimization
 10 | /// is only helpful for bottom-up builders, as top-down builders already
 11 | /// have a termination criterion that prevents leaf creation when the SAH
 12 | /// cost does not improve.
 13 | pub fn collapse(bvh: &mut Bvh2, max_prims: u32, traversal_cost: f32) {
 14 |     crate::scope!("collapse");
 15 | 
 16 |     if max_prims <= 1 {
 17 |         return;
 18 |     }
 19 | 
 20 |     if bvh.nodes.is_empty() || bvh.nodes[0].is_leaf() {
 21 |         return;
 22 |     }
 23 | 
 24 |     let nodes_qty = bvh.nodes.len();
 25 | 
 26 |     let parents = bvh.compute_parents();
 27 | 
 28 |     let mut indices_copy = Vec::new();
 29 |     let mut nodes_copy = Vec::new();
 30 | 
 31 |     let mut node_counts: Vec<SometimesAtomicU32> =
 32 |         (0..nodes_qty).map(|_| SometimesAtomicU32::new(1)).collect();
 33 |     let mut prim_counts: Vec<SometimesAtomicU32> =
 34 |         (0..nodes_qty).map(|_| SometimesAtomicU32::new(0)).collect();
 35 | 
 36 |     let node_count;
 37 | 
 38 |     // Bottom-up traversal to collapse leaves
 39 |     // TODO need to figure out if parallel version can have data races, if so:
 40 |     // maybe record commands in parallel, include a index, and execute them sequentially
 41 |     // also reference original impl
 42 |     bottom_up_traverse(bvh, &parents, |leaf, i| {
 43 |         if leaf {
 44 |             prim_counts[i].set(bvh.nodes[i].prim_count);
 45 |         } else {
 46 |             let node = &bvh.nodes[i];
 47 |             debug_assert!(!node.is_leaf());
 48 |             let first_child = node.first_index as usize;
 49 | 
 50 |             let left_count = prim_counts[first_child].get();
 51 |             let right_count = prim_counts[first_child + 1].get();
 52 |             let total_count = left_count + right_count;
 53 | 
 54 |             // Compute the cost of collapsing this node when both children are leaves
 55 |             if left_count > 0 && right_count > 0 && total_count <= max_prims {
 56 |                 let left = bvh.nodes[first_child];
 57 |                 let right = bvh.nodes[first_child + 1];
 58 |                 let collapse_cost = node.aabb.half_area() * (total_count as f32 - traversal_cost);
 59 |                 let base_cost = left.aabb.half_area() * left_count as f32
 60 |                     + right.aabb.half_area() * right_count as f32;
 61 |                 let both_have_same_prim =
 62 |                     (left.first_index == right.first_index) && total_count == 2;
 63 | 
 64 |                 // Collapse them if cost of the collapsed node is lower, or both children contain the same primitive (as a result of splits)
 65 |                 if collapse_cost <= base_cost || both_have_same_prim {
 66 |                     //if both_have_same_prim { 1 } else { total_count }; // TODO, Reduce total count (was showing artifacts)
 67 |                     prim_counts[i].set(total_count);
 68 |                     prim_counts[first_child].set(0);
 69 |                     prim_counts[first_child + 1].set(0);
 70 |                     node_counts[first_child].set(0);
 71 |                     node_counts[first_child + 1].set(0);
 72 |                 }
 73 |             }
 74 |         }
 75 |     });
 76 | 
 77 |     // Prefix sums computed sequentially (TODO: parallelize)
 78 |     let mut sum = 0;
 79 |     node_counts.iter_mut().for_each(|count| {
 80 |         sum += count.get();
 81 |         count.set(sum);
 82 |     });
 83 | 
 84 |     sum = 0;
 85 |     prim_counts.iter_mut().for_each(|count| {
 86 |         sum += count.get();
 87 |         count.set(sum);
 88 |     });
 89 | 
 90 |     {
 91 |         node_count = node_counts[bvh.nodes.len() - 1].get();
 92 |         if prim_counts[0].get() > 0 {
 93 |             // This means the root node has become a leaf.
 94 |             // We avoid copying the data and just swap the old prim array with the new one.
 95 |             bvh.nodes[0].first_index = 0;
 96 |             bvh.nodes[0].prim_count = prim_counts[0].get();
 97 |             std::mem::swap(&mut bvh.primitive_indices, &mut indices_copy);
 98 |             std::mem::swap(&mut bvh.nodes, &mut nodes_copy);
 99 |         } else {
100 |             nodes_copy = vec![Default::default(); node_count as usize];
101 |             indices_copy =
102 |                 vec![Default::default(); prim_counts[bvh.nodes.len() - 1].get() as usize];
103 |             nodes_copy[0] = bvh.nodes[0];
104 |             nodes_copy[0].first_index = node_counts[nodes_copy[0].first_index as usize - 1].get();
105 |         }
106 |     }
107 | 
108 |     // TODO Parallelize:
109 |     {
110 |         for i in 1..bvh.nodes.len() {
111 |             let node_id = node_counts[i - 1].get() as usize;
112 |             if node_id == node_counts[i].get() as usize {
113 |                 continue;
114 |             }
115 | 
116 |             nodes_copy[node_id] = bvh.nodes[i];
117 |             let mut first_prim = prim_counts[i - 1].get();
118 |             if first_prim != prim_counts[i].get() {
119 |                 nodes_copy[node_id].prim_count = prim_counts[i].get() - first_prim;
120 |                 nodes_copy[node_id].first_index = first_prim;
121 | 
122 |                 // Top-down traversal to store the prims contained in this subtree.
123 | 
124 |                 if true {
125 |                     let mut j = i;
126 |                     loop {
127 |                         let node = bvh.nodes[j];
128 |                         if node.is_leaf() {
129 |                             for n in 0..node.prim_count {
130 |                                 indices_copy[(first_prim + n) as usize] =
131 |                                     bvh.primitive_indices[(node.first_index + n) as usize];
132 |                             }
133 | 
134 |                             first_prim += node.prim_count;
135 |                             while !Bvh2Node::is_left_sibling(j) && j != i {
136 |                                 j = parents[j] as usize;
137 |                             }
138 |                             if j == i {
139 |                                 break;
140 |                             }
141 |                             j = Bvh2Node::get_sibling_id(j);
142 |                         } else {
143 |                             j = node.first_index as usize;
144 |                         }
145 |                     }
146 |                 } else {
147 |                     // -------------------------
148 |                     // Alternate method (slower)
149 |                     // -------------------------
150 |                     let mut stack = Vec::new();
151 |                     stack.push(i);
152 |                     while let Some(current_node_index) = stack.pop() {
153 |                         let node = &bvh.nodes[current_node_index];
154 | 
155 |                         if node.is_leaf() {
156 |                             for n in 0..node.prim_count {
157 |                                 indices_copy[(first_prim + n) as usize] =
158 |                                     bvh.primitive_indices[(node.first_index + n) as usize];
159 |                             }
160 |                             first_prim += node.prim_count;
161 |                         } else {
162 |                             stack.push(node.first_index as usize);
163 |                             stack.push((node.first_index + 1) as usize);
164 |                         }
165 |                     }
166 |                     // -------------------------
167 |                 }
168 |             } else {
169 |                 let first_child = &mut nodes_copy[node_id].first_index;
170 |                 *first_child = node_counts[*first_child as usize - 1].get();
171 |             }
172 |         }
173 |     }
174 | 
175 |     std::mem::swap(&mut bvh.nodes, &mut nodes_copy);
176 |     std::mem::swap(&mut bvh.primitive_indices, &mut indices_copy);
177 | }
178 | 
179 | // Based on https://github.com/madmann91/bvh/blob/2fd0db62022993963a7343669275647cb073e19a/include/bvh/bottom_up_algorithm.hpp
180 | #[cfg(not(feature = "parallel"))]
181 | fn bottom_up_traverse<F>(
182 |     bvh: &Bvh2,
183 |     parents: &[u32],
184 |     mut process_node: F, // True is for leaf
185 | ) where
186 |     F: FnMut(bool, usize),
187 | {
188 |     // Special case if the BVH is just a leaf
189 |     if bvh.nodes.len() == 1 {
190 |         process_node(true, 0);
191 |         return;
192 |     }
193 | 
194 |     // Iterate through all nodes starting from 1, since node 0 is assumed to be the root
195 |     (1..bvh.nodes.len()).for_each(|i| {
196 |         // Only process leaves
197 |         if bvh.nodes[i].is_leaf() {
198 |             process_node(true, i);
199 | 
200 |             // Process inner nodes on the path from that leaf up to the root
201 |             let mut j = i;
202 |             while j != 0 {
203 |                 j = parents[j] as usize;
204 | 
205 |                 process_node(false, j);
206 |             }
207 |         }
208 |     });
209 | }
210 | 
211 | #[cfg(feature = "parallel")]
212 | fn bottom_up_traverse<F>(
213 |     bvh: &Bvh2,
214 |     parents: &[u32],
215 |     process_node: F, // True is for leaf
216 | ) where
217 |     F: Fn(bool, usize) + Sync + Send,
218 | {
219 |     // Special case if the BVH is just a leaf
220 |     if bvh.nodes.len() == 1 {
221 |         process_node(true, 0);
222 |         return;
223 |     }
224 | 
225 |     // Iterate through all nodes starting from 1, since node 0 is assumed to be the root
226 |     (1..bvh.nodes.len()).into_par_iter().for_each(|i| {
227 |         // Only process leaves
228 |         if bvh.nodes[i].is_leaf() {
229 |             process_node(true, i);
230 | 
231 |             // Process inner nodes on the path from that leaf up to the root
232 |             let mut j = i as usize;
233 |             while j != 0 {
234 |                 j = parents[j] as usize;
235 | 
236 |                 process_node(false, j);
237 |             }
238 |         }
239 |     });
240 | }
241 | 
242 | pub struct SometimesAtomicU32 {
243 |     #[cfg(feature = "parallel")]
244 |     pub value: AtomicU32,
245 |     #[cfg(not(feature = "parallel"))]
246 |     pub value: u32,
247 | }
248 | 
249 | impl SometimesAtomicU32 {
250 |     #[inline]
251 |     pub fn new(value: u32) -> SometimesAtomicU32 {
252 |         #[cfg(feature = "parallel")]
253 |         {
254 |             SometimesAtomicU32 {
255 |                 value: AtomicU32::new(value),
256 |             }
257 |         }
258 |         #[cfg(not(feature = "parallel"))]
259 |         {
260 |             SometimesAtomicU32 { value }
261 |         }
262 |     }
263 | 
264 |     #[inline]
265 |     #[cfg(feature = "parallel")]
266 |     pub fn set(&self, value: u32) {
267 |         self.value.store(value, Ordering::SeqCst);
268 |         #[cfg(not(feature = "parallel"))]
269 |         {
270 |             self.value = value;
271 |         }
272 |     }
273 | 
274 |     #[inline]
275 |     #[cfg(not(feature = "parallel"))]
276 |     pub fn set(&mut self, value: u32) {
277 |         self.value = value;
278 |     }
279 | 
280 |     #[inline]
281 |     pub fn get(&self) -> u32 {
282 |         #[cfg(feature = "parallel")]
283 |         {
284 |             self.value.load(Ordering::SeqCst)
285 |         }
286 |         #[cfg(not(feature = "parallel"))]
287 |         {
288 |             self.value
289 |         }
290 |     }
291 | }
292 | 


--------------------------------------------------------------------------------
/src/bvh2/reinsertion.rs:
--------------------------------------------------------------------------------
  1 | // Reinsertion optimizer based on "Parallel Reinsertion for Bounding Volume Hierarchy Optimization", by D. Meister and J. Bittner:
  2 | // https://meistdan.github.io/publications/prbvh/paper.pdf
  3 | // https://jcgt.org/published/0011/04/01/paper.pdf
  4 | // Reference: https://github.com/madmann91/bvh/blob/3490634ae822e5081e41f09498fcce03bc1419e3/src/bvh/v2/reinsertion_optimizer.h
  5 | 
  6 | // Note: Most asserts exist to try to elide bounds checks
  7 | 
  8 | #[cfg(feature = "parallel")]
  9 | use rayon::iter::{IntoParallelIterator, ParallelIterator};
 10 | use rdst::{RadixKey, RadixSort};
 11 | 
 12 | use crate::{
 13 |     bvh2::{Bvh2, Bvh2Node},
 14 |     heapstack::HeapStack,
 15 | };
 16 | 
 17 | /// Restructures the BVH, optimizing node locations within the BVH hierarchy per SAH cost.
 18 | pub struct ReinsertionOptimizer<'a> {
 19 |     candidates: Vec<Candidate>,
 20 |     reinsertions: Vec<Reinsertion>,
 21 |     touched: Vec<bool>,
 22 |     parents: Vec<u32>,
 23 |     bvh: &'a mut Bvh2,
 24 |     batch_size_ratio: f32,
 25 | }
 26 | 
 27 | impl ReinsertionOptimizer<'_> {
 28 |     /// Restructures the BVH, optimizing node locations within the BVH hierarchy per SAH cost.
 29 |     /// batch_size_ratio: Fraction of the number of nodes to optimize per iteration.
 30 |     /// ratio_sequence: A sequence of ratios to preform reinsertion at. These are as a
 31 |     /// proportion of the batch_size_ratio. If None, the following sequence is used:
 32 |     /// (1..32).step_by(2).map(|n| 1.0 / n as f32) or
 33 |     /// 1/1, 1/3, 1/5, 1/7, 1/9, 1/11, 1/13, 1/15, 1/17, 1/19, 1/21, 1/23, 1/25, 1/27, 1/29, 1/31
 34 |     pub fn run(bvh: &mut Bvh2, batch_size_ratio: f32, ratio_sequence: Option<Vec<f32>>) {
 35 |         crate::scope!("reinsertion_optimize");
 36 | 
 37 |         if bvh.nodes.is_empty() || bvh.nodes[0].is_leaf() || batch_size_ratio <= 0.0 {
 38 |             return;
 39 |         }
 40 |         #[cfg(feature = "parallel")]
 41 |         let parents = bvh.compute_parents_parallel();
 42 |         #[cfg(not(feature = "parallel"))]
 43 |         let parents = bvh.compute_parents();
 44 | 
 45 |         let cap = (bvh.nodes.len() as f32 * batch_size_ratio.min(1.0)).ceil() as usize;
 46 | 
 47 |         ReinsertionOptimizer {
 48 |             candidates: Vec::with_capacity(cap),
 49 |             reinsertions: Vec::with_capacity(cap),
 50 |             touched: vec![false; bvh.nodes.len()],
 51 |             parents,
 52 |             bvh,
 53 |             batch_size_ratio,
 54 |         }
 55 |         .optimize_impl(ratio_sequence);
 56 |     }
 57 | 
 58 |     pub fn optimize_impl(&mut self, ratio_sequence: Option<Vec<f32>>) {
 59 |         // This initially preforms reinsertion at the specified ratio, then at progressively smaller ratios,
 60 |         // focusing more reinsertion time at the top of the bvh. The original method would perform reinsertion
 61 |         // for a fixed ratio a fixed number of times.
 62 |         let ratio_sequence = ratio_sequence.unwrap_or(
 63 |             (1..32)
 64 |                 .step_by(2)
 65 |                 .map(|n| 1.0 / n as f32)
 66 |                 .collect::<Vec<_>>(),
 67 |         );
 68 | 
 69 |         let mut reinsertion_stack = HeapStack::<(f32, u32)>::new_with_capacity(256); // Can't put in Self because of borrows
 70 |         ratio_sequence.iter().for_each(|ratio| {
 71 |             let batch_size =
 72 |                 (((self.bvh.nodes.len() as f32 * self.batch_size_ratio) * ratio) as usize).max(1);
 73 |             let node_count = self.bvh.nodes.len().min(batch_size + 1);
 74 |             self.find_candidates(node_count);
 75 |             self.optimize_candidates(&mut reinsertion_stack, node_count - 1);
 76 |         });
 77 |     }
 78 | 
 79 |     /// Find potential candidates for reinsertion
 80 |     fn find_candidates(&mut self, node_count: usize) {
 81 |         // This method just takes the first node_count*2 nodes in the bvh and sorts them by their half area
 82 |         // This seemed to find candidates much faster while resulting in similar bvh traversal performance vs the original method
 83 |         // https://github.com/madmann91/bvh/blob/3490634ae822e5081e41f09498fcce03bc1419e3/src/bvh/v2/reinsertion_optimizer.h#L88
 84 |         // Taking the first node_count * 2 seemed to work nearly as well as sorting all the nodes, but builds much faster.
 85 |         self.candidates.clear();
 86 |         self.bvh
 87 |             .nodes
 88 |             .iter()
 89 |             .take(node_count * 2)
 90 |             .enumerate()
 91 |             .skip(1)
 92 |             .for_each(|(i, node)| {
 93 |                 self.candidates.push(Candidate {
 94 |                     cost: node.aabb.half_area(),
 95 |                     node_id: i as u32,
 96 |                 });
 97 |             });
 98 |         self.candidates.radix_sort_unstable();
 99 |     }
100 | 
101 |     #[allow(unused_variables)]
102 |     fn optimize_candidates(&mut self, reinsertion_stack: &mut HeapStack<(f32, u32)>, count: usize) {
103 |         self.reinsertions.clear();
104 |         self.touched.fill(false);
105 | 
106 |         #[cfg(feature = "parallel")]
107 |         {
108 |             let mut reinsertions_map = (0..count)
109 |                 .into_par_iter()
110 |                 .map(|i| {
111 |                     // TODO figure out a way to create a limited number of these just once and reuse from the rayon
112 |                     let mut stack = HeapStack::<(f32, u32)>::new_with_capacity(256);
113 |                     self.find_reinsertion(&mut stack, self.candidates[i].node_id as usize)
114 |                 })
115 |                 .collect::<Vec<_>>();
116 |             reinsertions_map.drain(..).for_each(|r| {
117 |                 if r.area_diff > 0.0 {
118 |                     self.reinsertions.push(r)
119 |                 }
120 |             });
121 |         }
122 |         #[cfg(not(feature = "parallel"))]
123 |         {
124 |             assert!(count <= self.candidates.len());
125 |             (0..count).for_each(|i| {
126 |                 let r =
127 |                     self.find_reinsertion(reinsertion_stack, self.candidates[i].node_id as usize);
128 |                 if r.area_diff > 0.0 {
129 |                     self.reinsertions.push(r)
130 |                 }
131 |             });
132 |         }
133 | 
134 |         self.reinsertions
135 |             .sort_unstable_by(|a, b| b.area_diff.partial_cmp(&a.area_diff).unwrap());
136 | 
137 |         assert!(self.reinsertions.len() <= self.touched.len());
138 |         (0..self.reinsertions.len()).for_each(|i| {
139 |             let reinsertion = &self.reinsertions[i];
140 |             let conflicts = self.get_conflicts(reinsertion.from, reinsertion.to);
141 | 
142 |             if conflicts.iter().any(|&i| self.touched[i]) {
143 |                 return;
144 |             }
145 | 
146 |             conflicts.iter().for_each(|&conflict| {
147 |                 self.touched[conflict] = true;
148 |             });
149 | 
150 |             self.reinsert_node(reinsertion.from as usize, reinsertion.to as usize);
151 |         });
152 |     }
153 | 
154 |     fn find_reinsertion(&self, stack: &mut HeapStack<(f32, u32)>, node_id: usize) -> Reinsertion {
155 |         debug_assert_ne!(node_id, 0);
156 |         // Try to elide bounds checks
157 |         assert!(node_id < self.bvh.nodes.len());
158 |         assert!(node_id < self.parents.len());
159 | 
160 |         /*
161 |          * Here is an example that explains how the cost of a reinsertion is computed. For the
162 |          * reinsertion from A to C, in the figure below, we need to remove P1, replace it by B,
163 |          * and create a node that holds A and C and place it where C was.
164 |          *
165 |          *             R
166 |          *            / \
167 |          *          Pn   Q1
168 |          *          /     \
169 |          *        ...     ...
170 |          *        /         \
171 |          *       P1          C
172 |          *      / \
173 |          *     A   B
174 |          *
175 |          * The resulting area *decrease* is (SA(x) means the surface area of x):
176 |          *
177 |          *     SA(P1) +                                                : P1 was removed
178 |          *     SA(P2) - SA(B) +                                        : P2 now only contains B
179 |          *     SA(P3) - SA(B U sibling(P2)) +                          : Same but for P3
180 |          *     ... +
181 |          *     SA(Pn) - SA(B U sibling(P2) U ... U sibling(P(n - 1)) + : Same but for Pn
182 |          *     0 +                                                     : R does not change
183 |          *     SA(Q1) - SA(Q1 U A) +                                   : Q1 now contains A
184 |          *     SA(Q2) - SA(Q2 U A) +                                   : Q2 now contains A
185 |          *     ... +
186 |          *     -SA(A U C)                                              : For the parent of A and C
187 |          */
188 |         let mut best_reinsertion = Reinsertion {
189 |             from: node_id as u32,
190 |             to: 0,
191 |             area_diff: 0.0,
192 |         };
193 |         let node_area = self.bvh.nodes[node_id].aabb.half_area();
194 |         let parent_area = self.bvh.nodes[self.parents[node_id] as usize]
195 |             .aabb
196 |             .half_area();
197 |         let mut area_diff = parent_area;
198 |         let mut sibling_id = Bvh2Node::get_sibling_id(node_id);
199 |         let mut pivot_bbox = self.bvh.nodes[sibling_id].aabb;
200 |         let parent_id = self.parents[node_id] as usize;
201 |         let mut pivot_id = parent_id;
202 |         let aabb = self.bvh.nodes[node_id].aabb;
203 |         stack.clear();
204 |         loop {
205 |             stack.push((area_diff, sibling_id as u32));
206 |             while !stack.is_empty() {
207 |                 let (top_area_diff, top_sibling_id) = stack.pop_fast();
208 |                 if top_area_diff - node_area <= best_reinsertion.area_diff {
209 |                     continue;
210 |                 }
211 | 
212 |                 let dst_node = &self.bvh.nodes[*top_sibling_id as usize];
213 |                 let merged_area = dst_node.aabb.union(&aabb).half_area();
214 |                 let reinsert_area = top_area_diff - merged_area;
215 |                 if reinsert_area > best_reinsertion.area_diff {
216 |                     best_reinsertion.to = *top_sibling_id;
217 |                     best_reinsertion.area_diff = reinsert_area;
218 |                 }
219 | 
220 |                 if !dst_node.is_leaf() {
221 |                     let child_area = reinsert_area + dst_node.aabb.half_area();
222 |                     stack.push((child_area, dst_node.first_index));
223 |                     stack.push((child_area, dst_node.first_index + 1));
224 |                 }
225 |             }
226 | 
227 |             if pivot_id != parent_id {
228 |                 pivot_bbox = pivot_bbox.union(&self.bvh.nodes[sibling_id].aabb);
229 |                 area_diff += self.bvh.nodes[pivot_id].aabb.half_area() - pivot_bbox.half_area();
230 |             }
231 | 
232 |             if pivot_id == 0 {
233 |                 break;
234 |             }
235 | 
236 |             sibling_id = Bvh2Node::get_sibling_id(pivot_id);
237 |             pivot_id = self.parents[pivot_id] as usize;
238 |         }
239 | 
240 |         if best_reinsertion.to == Bvh2Node::get_sibling_id32(best_reinsertion.from)
241 |             || best_reinsertion.to == self.parents[best_reinsertion.from as usize]
242 |         {
243 |             best_reinsertion = Reinsertion::default();
244 |         }
245 | 
246 |         best_reinsertion
247 |     }
248 | 
249 |     fn reinsert_node(&mut self, from: usize, to: usize) {
250 |         let sibling_id = Bvh2Node::get_sibling_id(from);
251 |         let parent_id = self.parents[from] as usize;
252 |         let sibling_node = self.bvh.nodes[sibling_id];
253 |         let dst_node = self.bvh.nodes[to];
254 | 
255 |         self.bvh.nodes[to].make_inner(Bvh2Node::get_left_sibling_id(from) as u32);
256 |         self.bvh.nodes[sibling_id] = dst_node;
257 |         self.bvh.nodes[parent_id] = sibling_node;
258 | 
259 |         if !self.bvh.nodes[sibling_id].is_leaf() {
260 |             self.parents[self.bvh.nodes[sibling_id].first_index as usize] = sibling_id as u32;
261 |             self.parents[self.bvh.nodes[sibling_id].first_index as usize + 1] = sibling_id as u32;
262 |         }
263 |         if !self.bvh.nodes[parent_id].is_leaf() {
264 |             self.parents[self.bvh.nodes[parent_id].first_index as usize] = parent_id as u32;
265 |             self.parents[self.bvh.nodes[parent_id].first_index as usize + 1] = parent_id as u32;
266 |         }
267 | 
268 |         self.parents[sibling_id] = to as u32;
269 |         self.parents[from] = to as u32;
270 |         self.bvh.refit_from_fast(to, &self.parents);
271 |         self.bvh.refit_from_fast(parent_id, &self.parents);
272 |     }
273 | 
274 |     #[inline(always)]
275 |     fn get_conflicts(&self, from: u32, to: u32) -> [usize; 5] {
276 |         [
277 |             to as usize,
278 |             from as usize,
279 |             Bvh2Node::get_sibling_id(from as usize),
280 |             self.parents[to as usize] as usize,
281 |             self.parents[from as usize] as usize,
282 |         ]
283 |     }
284 | }
285 | 
286 | #[derive(Default, Clone, Copy)]
287 | struct Reinsertion {
288 |     from: u32,
289 |     to: u32,
290 |     area_diff: f32,
291 | }
292 | 
293 | #[derive(Clone, Copy, Debug)]
294 | struct Candidate {
295 |     node_id: u32,
296 |     cost: f32,
297 | }
298 | 
299 | impl RadixKey for Candidate {
300 |     const LEVELS: usize = 4;
301 | 
302 |     #[inline]
303 |     fn get_level(&self, level: usize) -> u8 {
304 |         (-self.cost).get_level(level)
305 |     }
306 | }
307 | 


--------------------------------------------------------------------------------
/src/cwbvh/builder.rs:
--------------------------------------------------------------------------------
  1 | use std::time::{Duration, Instant};
  2 | 
  3 | use crate::{
  4 |     aabb::Aabb,
  5 |     bvh2::reinsertion::ReinsertionOptimizer,
  6 |     cwbvh::{bvh2_to_cwbvh::bvh2_to_cwbvh, CwBvh},
  7 |     splits::split_aabbs_preset,
  8 |     triangle::Triangle,
  9 |     Boundable, BvhBuildParams,
 10 | };
 11 | 
 12 | /// Build a cwbvh from the given list of Triangles.
 13 | /// Just a helper function / example, feel free to reimplement for your specific use case.
 14 | ///
 15 | /// # Arguments
 16 | /// * `triangles` - A list of Triangles.
 17 | /// * `config` - Parameters for configuring the BVH building.
 18 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB
 19 | /// generation or debug validation. This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing)
 20 | pub fn build_cwbvh_from_tris(
 21 |     triangles: &[Triangle],
 22 |     config: BvhBuildParams,
 23 |     core_build_time: &mut Duration,
 24 | ) -> CwBvh {
 25 |     let mut aabbs = Vec::with_capacity(triangles.len());
 26 |     let mut indices = Vec::with_capacity(triangles.len());
 27 |     let mut largest_half_area = 0.0;
 28 |     let mut avg_half_area = 0.0;
 29 | 
 30 |     for (i, tri) in triangles.iter().enumerate() {
 31 |         let a = tri.v0;
 32 |         let b = tri.v1;
 33 |         let c = tri.v2;
 34 |         let mut aabb = Aabb::empty();
 35 |         aabb.extend(a).extend(b).extend(c);
 36 |         let half_area = aabb.half_area();
 37 |         largest_half_area = half_area.max(largest_half_area);
 38 |         avg_half_area += half_area;
 39 |         aabbs.push(aabb);
 40 |         indices.push(i as u32);
 41 |     }
 42 |     avg_half_area /= triangles.len() as f32;
 43 | 
 44 |     let start_time = Instant::now();
 45 | 
 46 |     if config.pre_split {
 47 |         split_aabbs_preset(
 48 |             &mut aabbs,
 49 |             &mut indices,
 50 |             triangles,
 51 |             avg_half_area,
 52 |             largest_half_area,
 53 |         );
 54 |     }
 55 | 
 56 |     let mut bvh2 = config.ploc_search_distance.build(
 57 |         &aabbs,
 58 |         indices,
 59 |         config.sort_precision,
 60 |         config.search_depth_threshold,
 61 |     );
 62 |     ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None);
 63 |     let cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, false);
 64 | 
 65 |     *core_build_time += start_time.elapsed();
 66 | 
 67 |     #[cfg(debug_assertions)]
 68 |     {
 69 |         bvh2.validate(triangles, false, config.pre_split);
 70 |         cwbvh.validate(triangles, config.pre_split, false);
 71 |     }
 72 | 
 73 |     cwbvh
 74 | }
 75 | 
 76 | /// Build a cwbvh from the given list of Boundable primitives.
 77 | /// `pre_split` in BvhBuildParams is ignored in this case.
 78 | /// Just a helper function / example, feel free to reimplement for your specific use case.
 79 | ///
 80 | /// # Arguments
 81 | /// * `primitives` - A list of Primitives that implement Boundable.
 82 | /// * `config` - Parameters for configuring the BVH building.
 83 | /// * `core_build_time` - The core BVH build time. Does not include things like initial AABB
 84 | /// generation or debug validation. This is mostly just here to simplify profiling in [tray_racing](https://github.com/DGriffin91/tray_racing)
 85 | // TODO: we could optionally do imprecise basic Aabb splits.
 86 | pub fn build_cwbvh<T: Boundable>(
 87 |     primitives: &[T],
 88 |     config: BvhBuildParams,
 89 |     core_build_time: &mut Duration,
 90 | ) -> CwBvh {
 91 |     let mut aabbs = Vec::with_capacity(primitives.len());
 92 |     let mut indices = Vec::with_capacity(primitives.len());
 93 | 
 94 |     for (i, primitive) in primitives.iter().enumerate() {
 95 |         indices.push(i as u32);
 96 |         aabbs.push(primitive.aabb());
 97 |     }
 98 | 
 99 |     let start_time = Instant::now();
100 | 
101 |     let mut bvh2 = config.ploc_search_distance.build(
102 |         &aabbs,
103 |         indices,
104 |         config.sort_precision,
105 |         config.search_depth_threshold,
106 |     );
107 |     ReinsertionOptimizer::run(&mut bvh2, config.reinsertion_batch_ratio, None);
108 |     let cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, false);
109 | 
110 |     #[cfg(debug_assertions)]
111 |     {
112 |         bvh2.validate(&aabbs, false, config.pre_split);
113 |         cwbvh.validate(&aabbs, config.pre_split, false);
114 |     }
115 | 
116 |     *core_build_time += start_time.elapsed();
117 | 
118 |     cwbvh
119 | }
120 | 


--------------------------------------------------------------------------------
/src/cwbvh/bvh2_to_cwbvh.rs:
--------------------------------------------------------------------------------
  1 | // Uses cost / merging from cwbvh paper
  2 | 
  3 | use glam::{vec3a, UVec3, Vec3A};
  4 | 
  5 | use crate::{
  6 |     aabb::Aabb,
  7 |     bvh2::Bvh2,
  8 |     cwbvh::{CwBvh, CwBvhNode, BRANCHING, DENOM},
  9 |     PerComponent, VecExt,
 10 | };
 11 | 
 12 | use super::DIRECTIONS;
 13 | 
 14 | /// Convert a bvh2 to CwBvh
 15 | pub struct Bvh2Converter<'a> {
 16 |     pub bvh2: &'a Bvh2,
 17 |     pub nodes: Vec<CwBvhNode>,
 18 |     pub primitive_indices: Vec<u32>,
 19 |     pub decisions: Vec<Decision>,
 20 |     pub order_children_during_build: bool,
 21 |     pub include_exact_node_aabbs: bool,
 22 |     pub exact_node_aabbs: Option<Vec<Aabb>>,
 23 |     direction_lut: [Vec3A; 8],
 24 | }
 25 | 
 26 | const INVALID: u8 = u8::MAX;
 27 | const INVALID32: u32 = u32::MAX;
 28 | const INVALID_USIZE: usize = INVALID32 as usize;
 29 | 
 30 | const PRIM_COST: f32 = 0.3;
 31 | 
 32 | impl<'a> Bvh2Converter<'a> {
 33 |     /// Initialize the Bvh2 to CwBvh converter.
 34 |     pub fn new(bvh2: &'a Bvh2, order_children: bool, include_exact_node_aabbs: bool) -> Self {
 35 |         let capacity = bvh2.primitive_indices.len();
 36 | 
 37 |         let mut nodes = Vec::with_capacity(capacity);
 38 |         nodes.push(Default::default());
 39 | 
 40 |         let mut direction_lut = [Vec3A::ZERO; DIRECTIONS];
 41 |         direction_lut
 42 |             .iter_mut()
 43 |             .enumerate()
 44 |             .for_each(|(s, direction)| {
 45 |                 *direction = vec3a(
 46 |                     if (s & 0b100) != 0 { -1.0 } else { 1.0 },
 47 |                     if (s & 0b010) != 0 { -1.0 } else { 1.0 },
 48 |                     if (s & 0b001) != 0 { -1.0 } else { 1.0 },
 49 |                 );
 50 |             });
 51 | 
 52 |         Self {
 53 |             bvh2,
 54 |             nodes,
 55 |             primitive_indices: Vec::with_capacity(capacity),
 56 |             decisions: vec![Decision::default(); bvh2.nodes.len() * 7],
 57 |             order_children_during_build: order_children,
 58 |             direction_lut,
 59 |             include_exact_node_aabbs,
 60 |             exact_node_aabbs: if include_exact_node_aabbs {
 61 |                 Some(vec![Aabb::empty(); bvh2.nodes.len()])
 62 |             } else {
 63 |                 None
 64 |             },
 65 |         }
 66 |     }
 67 | 
 68 |     /// Convert the bvh2 to CwBvh
 69 |     pub fn convert_to_cwbvh(&mut self) {
 70 |         crate::scope!("convert_to_cwbvh");
 71 |         debug_assert_eq!(std::mem::size_of::<CwBvhNode>(), 80);
 72 |         self.convert_to_cwbvh_impl(0, 0);
 73 |     }
 74 | 
 75 |     pub fn convert_to_cwbvh_impl(&mut self, node_index_bvh8: usize, node_index_bvh2: usize) {
 76 |         let mut node = self.nodes[node_index_bvh8];
 77 |         let aabb = self.bvh2.nodes[node_index_bvh2].aabb;
 78 |         if let Some(exact_node_aabbs) = &mut self.exact_node_aabbs {
 79 |             exact_node_aabbs[node_index_bvh8] = aabb;
 80 |         }
 81 | 
 82 |         let node_p = aabb.min;
 83 |         node.p = node_p.into();
 84 | 
 85 |         let e = ((aabb.max - aabb.min).max(Vec3A::splat(1e-20)) * DENOM)
 86 |             .log2()
 87 |             .ceil()
 88 |             .exp2();
 89 |         debug_assert!(e.cmpgt(Vec3A::ZERO).all(), "aabb: {:?} e: {}", aabb, e);
 90 | 
 91 |         let rcp_e = 1.0 / e;
 92 |         let e: UVec3 = e.per_comp(|c: f32| {
 93 |             let bits = c.to_bits();
 94 |             // Only the exponent bits can be non-zero
 95 |             debug_assert_eq!(bits & 0b10000000011111111111111111111111, 0);
 96 |             bits >> 23
 97 |         });
 98 |         node.e = [e.x as u8, e.y as u8, e.z as u8];
 99 | 
100 |         let children = &mut [INVALID32; 8];
101 | 
102 |         let child_count = &mut 0;
103 |         self.get_children(node_index_bvh2, children, child_count, 0);
104 | 
105 |         if self.order_children_during_build {
106 |             self.order_children(node_index_bvh2, children, *child_count as usize);
107 |         }
108 | 
109 |         node.imask = 0;
110 | 
111 |         node.primitive_base_idx = self.primitive_indices.len() as u32;
112 |         node.child_base_idx = self.nodes.len() as u32;
113 | 
114 |         let mut num_internal_nodes = 0;
115 |         let mut num_primitives = 0_u32;
116 | 
117 |         for (i, child_index) in children.iter().enumerate() {
118 |             if *child_index == INVALID32 {
119 |                 continue; // Empty slot
120 |             };
121 | 
122 |             let child_aabb = self.bvh2.nodes[*child_index as usize].aabb;
123 | 
124 |             // const PAD: f32 = 1e-20;
125 |             // Use to force non-zero volumes.
126 |             const PAD: f32 = 0.0;
127 | 
128 |             let mut child_min = ((child_aabb.min - node_p - PAD) * rcp_e).floor();
129 |             let mut child_max = ((child_aabb.max - node_p + PAD) * rcp_e).ceil();
130 | 
131 |             child_min = child_min.clamp(Vec3A::ZERO, Vec3A::splat(255.0));
132 |             child_max = child_max.clamp(Vec3A::ZERO, Vec3A::splat(255.0));
133 | 
134 |             debug_assert!((child_min.cmple(child_max)).all());
135 | 
136 |             node.child_min_x[i] = child_min.x as u8;
137 |             node.child_min_y[i] = child_min.y as u8;
138 |             node.child_min_z[i] = child_min.z as u8;
139 |             node.child_max_x[i] = child_max.x as u8;
140 |             node.child_max_y[i] = child_max.y as u8;
141 |             node.child_max_z[i] = child_max.z as u8;
142 | 
143 |             match self.decisions[(child_index * 7) as usize].kind {
144 |                 DecisionKind::LEAF => {
145 |                     let primitive_count = self.count_primitives(*child_index as usize, self.bvh2);
146 |                     debug_assert!(primitive_count > 0 && primitive_count <= 3);
147 | 
148 |                     // Three highest bits contain unary representation of primitive count
149 | 
150 |                     node.child_meta[i] = num_primitives as u8
151 |                         | match primitive_count {
152 |                             1 => 0b0010_0000,
153 |                             2 => 0b0110_0000,
154 |                             3 => 0b1110_0000,
155 |                             _ => panic!("Incorrect leaf primitive count: {}", primitive_count),
156 |                         };
157 | 
158 |                     num_primitives += primitive_count;
159 |                     debug_assert!(num_primitives <= 24);
160 |                 }
161 |                 DecisionKind::INTERNAL => {
162 |                     node.imask |= 1u8 << i;
163 | 
164 |                     node.child_meta[i] = (24 + i as u8) | 0b0010_0000;
165 | 
166 |                     num_internal_nodes += 1;
167 |                 }
168 |                 DecisionKind::DISTRIBUTE => unreachable!(),
169 |             }
170 |         }
171 | 
172 |         self.nodes
173 |             .resize(self.nodes.len() + num_internal_nodes, Default::default());
174 |         self.nodes[node_index_bvh8] = node;
175 | 
176 |         debug_assert!(node.child_base_idx as usize + num_internal_nodes == self.nodes.len());
177 |         debug_assert!(
178 |             node.primitive_base_idx + num_primitives == self.primitive_indices.len() as u32
179 |         );
180 | 
181 |         // Recurse on Internal Nodes
182 |         let mut offset = 0;
183 |         for (i, child_index) in children.iter().enumerate() {
184 |             if *child_index != INVALID32 && (node.imask & (1 << i)) != 0 {
185 |                 self.convert_to_cwbvh_impl(
186 |                     (node.child_base_idx + offset) as usize,
187 |                     *child_index as usize,
188 |                 );
189 |                 offset += 1;
190 |             }
191 |         }
192 |         //self.nodes[node_index_bvh8] = node;
193 |     }
194 | 
195 |     // Recursively count primitives in subtree of the given Node
196 |     // Simultaneously fills the indices buffer of the BVH8
197 |     fn count_primitives(&mut self, node_index: usize, bvh2: &Bvh2) -> u32 {
198 |         let node = bvh2.nodes[node_index];
199 | 
200 |         if node.is_leaf() {
201 |             debug_assert!(node.prim_count == 1);
202 | 
203 |             self.primitive_indices
204 |                 .push(bvh2.primitive_indices[node.first_index as usize]);
205 | 
206 |             return node.prim_count;
207 |         }
208 | 
209 |         self.count_primitives(node.first_index as usize, bvh2)
210 |             + self.count_primitives((node.first_index + 1) as usize, bvh2)
211 |     }
212 | 
213 |     /// Fill cost table for bvh2 -> bvh8 conversion
214 |     pub fn calculate_cost(&mut self, max_prims_per_leaf: u32) {
215 |         crate::scope!("calculate_cost");
216 |         self.calculate_cost_impl(0, max_prims_per_leaf, 0);
217 |     }
218 | 
219 |     // Based on https://github.com/jan-van-bergen/GPU-Raytracer/blob/6559ae2241c8fdea0ddaec959fe1a47ec9b3ab0d/Src/BVH/Converters/BVH8Converter.cpp#L24
220 |     pub fn calculate_cost_impl(
221 |         &mut self,
222 |         node_index: usize,
223 |         max_prims_per_leaf: u32,
224 |         _current_depth: i32,
225 |     ) -> u32 {
226 |         let node = &self.bvh2.nodes[node_index];
227 |         let half_area = node.aabb.half_area();
228 |         let first_index = node.first_index;
229 |         let prim_count = node.prim_count;
230 | 
231 |         let node_dec_idx = node_index * 7;
232 |         let first_index_7 = (first_index * 7) as usize;
233 |         let next_index_7 = ((first_index + 1) * 7) as usize;
234 | 
235 |         let num_primitives;
236 | 
237 |         // TODO possibly merge as much as possible past a specified depth
238 |         // let depth_cost = if current_depth > 15 { 1.0 } else { 1.0 };
239 | 
240 |         //if is_leaf()
241 |         if prim_count != 0 {
242 |             num_primitives = prim_count;
243 |             if num_primitives != 1 {
244 |                 panic!(
245 |                     "ERROR: BVH8 Builder expects BVH with leaf Nodes containing only 1 primitive!\n"
246 |                 );
247 |             }
248 | 
249 |             // SAH cost
250 |             let cost_leaf = half_area * (num_primitives as f32) * PRIM_COST;
251 | 
252 |             for i in 0..7 {
253 |                 let decision = &mut self.decisions[node_dec_idx + i];
254 |                 decision.kind = DecisionKind::LEAF;
255 |                 decision.cost = cost_leaf;
256 |             }
257 |         } else {
258 |             num_primitives = self.calculate_cost_impl(
259 |                 first_index as usize,
260 |                 max_prims_per_leaf,
261 |                 _current_depth + 1,
262 |             ) + self.calculate_cost_impl(
263 |                 (first_index + 1) as usize,
264 |                 max_prims_per_leaf,
265 |                 _current_depth + 1,
266 |             );
267 | 
268 |             // Separate case: i=0 (i=1 in the paper)
269 |             {
270 |                 let cost_leaf = if num_primitives <= max_prims_per_leaf {
271 |                     (num_primitives as f32) * half_area * PRIM_COST
272 |                 } else {
273 |                     f32::INFINITY
274 |                 };
275 | 
276 |                 let mut cost_distribute = f32::INFINITY;
277 | 
278 |                 let mut distribute_left = INVALID;
279 |                 let mut distribute_right = INVALID;
280 | 
281 |                 for k in 0..7 {
282 |                     let c = self.decisions[first_index_7 + k].cost
283 |                         + self.decisions[next_index_7 + 6 - k].cost;
284 | 
285 |                     if c < cost_distribute {
286 |                         cost_distribute = c;
287 | 
288 |                         distribute_left = k as u8;
289 |                         distribute_right = 6 - k as u8;
290 |                     }
291 |                 }
292 | 
293 |                 let cost_internal = cost_distribute + half_area;
294 | 
295 |                 let decision = &mut self.decisions[node_dec_idx];
296 |                 if cost_leaf < cost_internal {
297 |                     decision.kind = DecisionKind::LEAF;
298 |                     decision.cost = cost_leaf;
299 |                 } else {
300 |                     decision.kind = DecisionKind::INTERNAL;
301 |                     decision.cost = cost_internal;
302 |                 }
303 | 
304 |                 decision.distribute_left = distribute_left;
305 |                 decision.distribute_right = distribute_right;
306 |             }
307 | 
308 |             // In the paper i=2..7
309 |             let mut node_i;
310 |             for i in 1..7 {
311 |                 node_i = node_dec_idx + i;
312 |                 let mut cost_distribute = self.decisions[node_i - 1].cost;
313 | 
314 |                 let mut distribute_left = INVALID;
315 |                 let mut distribute_right = INVALID;
316 | 
317 |                 for k in 0..i {
318 |                     let c = self.decisions[first_index_7 + k].cost
319 |                         + self.decisions[next_index_7 + i - k - 1].cost;
320 | 
321 |                     if c < cost_distribute {
322 |                         cost_distribute = c;
323 | 
324 |                         let k_u8 = k as u8;
325 |                         distribute_left = k_u8;
326 |                         distribute_right = i as u8 - k_u8 - 1;
327 |                     }
328 |                 }
329 | 
330 |                 let decision = &mut self.decisions[node_i];
331 |                 decision.cost = cost_distribute;
332 | 
333 |                 if distribute_left != INVALID {
334 |                     decision.kind = DecisionKind::DISTRIBUTE;
335 |                     decision.distribute_left = distribute_left;
336 |                     decision.distribute_right = distribute_right;
337 |                 } else {
338 |                     self.decisions[node_i] = self.decisions[node_i - 1];
339 |                 }
340 |             }
341 |         }
342 | 
343 |         num_primitives
344 |     }
345 | 
346 |     pub fn get_children(
347 |         &mut self,
348 |         node_index: usize,
349 |         children: &mut [u32; 8],
350 |         child_count: &mut u32,
351 |         i: usize,
352 |     ) {
353 |         let node = &self.bvh2.nodes[node_index];
354 | 
355 |         if node.is_leaf() {
356 |             children[*child_count as usize] = node_index as u32;
357 |             *child_count += 1;
358 |             return;
359 |         }
360 | 
361 |         let decision = &self.decisions[node_index * 7 + i];
362 |         let distribute_left = decision.distribute_left;
363 |         let distribute_right = decision.distribute_right;
364 | 
365 |         debug_assert!(distribute_left < 7);
366 |         debug_assert!(distribute_right < 7);
367 | 
368 |         // Recurse on left child if it needs to distribute
369 |         if self.decisions[(node.first_index * 7 + distribute_left as u32) as usize].kind
370 |             == DecisionKind::DISTRIBUTE
371 |         {
372 |             self.get_children(
373 |                 node.first_index as usize,
374 |                 children,
375 |                 child_count,
376 |                 distribute_left as usize,
377 |             );
378 |         } else {
379 |             children[*child_count as usize] = node.first_index;
380 |             *child_count += 1;
381 |         }
382 | 
383 |         // Recurse on right child if it needs to distribute
384 |         if self.decisions[((node.first_index + 1) * 7 + distribute_right as u32) as usize].kind
385 |             == DecisionKind::DISTRIBUTE
386 |         {
387 |             self.get_children(
388 |                 (node.first_index + 1) as usize,
389 |                 children,
390 |                 child_count,
391 |                 distribute_right as usize,
392 |             );
393 |         } else {
394 |             children[*child_count as usize] = node.first_index + 1;
395 |             *child_count += 1;
396 |         }
397 |     }
398 | 
399 |     // Based on https://github.com/jan-van-bergen/GPU-Raytracer/blob/6559ae2241c8fdea0ddaec959fe1a47ec9b3ab0d/Src/BVH/Converters/BVH8Converter.cpp#L148
400 |     pub fn order_children(
401 |         &mut self,
402 |         node_index: usize,
403 |         children: &mut [u32; 8],
404 |         child_count: usize,
405 |     ) {
406 |         let node = &self.bvh2.nodes[node_index];
407 |         let p = node.aabb.center();
408 | 
409 |         let mut cost = [[f32::MAX; DIRECTIONS]; BRANCHING];
410 | 
411 |         assert!(child_count <= BRANCHING);
412 |         assert!(cost.len() >= child_count);
413 |         // Fill cost table
414 |         // TODO parallel: check to see if this is faster w/ par_iter
415 |         for s in 0..DIRECTIONS {
416 |             let d = self.direction_lut[s];
417 |             for c in 0..child_count {
418 |                 let v = self.bvh2.nodes[children[c] as usize].aabb.center() - p;
419 |                 let cost_slot = unsafe { cost.get_unchecked_mut(c).get_unchecked_mut(s) };
420 |                 *cost_slot = d.dot(v); // No benefit from normalizing
421 |             }
422 |         }
423 | 
424 |         let mut assignment = [INVALID_USIZE; BRANCHING];
425 |         let mut slot_filled = [false; DIRECTIONS];
426 | 
427 |         // The paper suggests the auction method, but greedy is almost as good.
428 |         loop {
429 |             let mut min_cost = f32::MAX;
430 | 
431 |             let mut min_slot = INVALID_USIZE;
432 |             let mut min_index = INVALID_USIZE;
433 | 
434 |             // Find cheapest unfilled slot of any unassigned child
435 |             for c in 0..child_count {
436 |                 if assignment[c] == INVALID_USIZE {
437 |                     for (s, &slot_filled) in slot_filled.iter().enumerate() {
438 |                         let cost = unsafe { *cost.get_unchecked(c).get_unchecked(s) };
439 |                         if !slot_filled && cost < min_cost {
440 |                             min_cost = cost;
441 | 
442 |                             min_slot = s;
443 |                             min_index = c;
444 |                         }
445 |                     }
446 |                 }
447 |             }
448 | 
449 |             if min_slot == INVALID_USIZE {
450 |                 break;
451 |             }
452 | 
453 |             slot_filled[min_slot] = true;
454 |             assignment[min_index] = min_slot;
455 |         }
456 | 
457 |         let original_order = std::mem::replace(children, [INVALID32; 8]);
458 | 
459 |         assert!(assignment.len() >= child_count); // Allow compiler to skip bounds check
460 |         assert!(original_order.len() >= child_count); // Allow compiler to skip bounds check
461 |         for i in 0..child_count {
462 |             debug_assert!(assignment[i] != INVALID_USIZE);
463 |             debug_assert!(original_order[i] != INVALID32);
464 |             children[assignment[i]] = original_order[i];
465 |         }
466 |     }
467 | }
468 | 
469 | #[derive(Copy, Clone, PartialEq, Default)]
470 | pub enum DecisionKind {
471 |     LEAF,
472 |     INTERNAL,
473 |     #[default]
474 |     DISTRIBUTE,
475 | }
476 | 
477 | #[derive(Copy, Clone, Default)]
478 | pub struct Decision {
479 |     pub cost: f32,
480 |     pub kind: DecisionKind,
481 |     pub distribute_left: u8,
482 |     pub distribute_right: u8,
483 | }
484 | 
485 | /// Convert the given bvh2 to cwbvh
486 | /// # Arguments
487 | /// * `bvh2` - Source BVH
488 | /// * `max_prims_per_leaf` - 0..=3 The maximum number of primitives per leaf.
489 | pub fn bvh2_to_cwbvh(
490 |     bvh2: &Bvh2,
491 |     max_prims_per_leaf: u32,
492 |     order_children: bool,
493 |     include_exact_node_aabbs: bool,
494 | ) -> CwBvh {
495 |     if bvh2.nodes.is_empty() {
496 |         return CwBvh::default();
497 |     }
498 |     let mut converter = Bvh2Converter::new(bvh2, order_children, include_exact_node_aabbs);
499 |     converter.calculate_cost(max_prims_per_leaf);
500 |     converter.convert_to_cwbvh();
501 | 
502 |     CwBvh {
503 |         nodes: converter.nodes,
504 |         primitive_indices: converter.primitive_indices,
505 |         total_aabb: bvh2.nodes[0].aabb,
506 |         exact_node_aabbs: converter.exact_node_aabbs,
507 |     }
508 | }
509 | 


--------------------------------------------------------------------------------
/src/cwbvh/node.rs:
--------------------------------------------------------------------------------
  1 | use std::{
  2 |     fmt::{self, Formatter},
  3 |     mem::transmute,
  4 | };
  5 | 
  6 | use crate::{aabb::Aabb, ray::Ray};
  7 | use bytemuck::{Pod, Zeroable};
  8 | use glam::{vec3a, Vec3, Vec3A};
  9 | use std::fmt::Debug;
 10 | 
 11 | use super::NQ_SCALE;
 12 | 
 13 | /// A Compressed Wide BVH8 Node. repr(C), Pod, 80 bytes.
 14 | // https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf
 15 | #[derive(Clone, Copy, Default, PartialEq)]
 16 | #[repr(C)]
 17 | pub struct CwBvhNode {
 18 |     /// Min point of node AABB
 19 |     pub p: Vec3,
 20 | 
 21 |     /// Exponent of child bounding box compression
 22 |     /// Max point of node AABB could be calculated ex: `p.x + bitcast<f32>(e[0] << 23) * 255.0`
 23 |     pub e: [u8; 3],
 24 | 
 25 |     /// Bitmask indicating which children are internal nodes. 1 for internal, 0 for leaf
 26 |     pub imask: u8,
 27 | 
 28 |     /// Index of first child into `Vec<CwBvhNode>`
 29 |     pub child_base_idx: u32,
 30 | 
 31 |     /// Index of first primitive into primitive_indices `Vec<u32>`
 32 |     pub primitive_base_idx: u32,
 33 | 
 34 |     /// Meta data for each child
 35 |     /// Empty child slot: The field is set to 00000000
 36 |     ///
 37 |     /// For leafs nodes: the low 5 bits store the primitive offset [0..24) from primitive_base_idx. And the high
 38 |     /// 3 bits store the number of primitives in that leaf in a unary encoding.
 39 |     /// A child leaf with 2 primitives with the first primitive starting at primitive_base_idx would be 0b01100000
 40 |     /// A child leaf with 3 primitives with the first primitive starting at primitive_base_idx + 2 would be 0b11100010
 41 |     /// A child leaf with 1 primitive with the first primitive starting at primitive_base_idx + 1 would be 0b00100001
 42 |     ///
 43 |     /// For internal nodes: The high 3 bits are set to 001 while the low 5 bits store the child slot index plus 24
 44 |     /// i.e., the values range [24..32)
 45 |     pub child_meta: [u8; 8],
 46 | 
 47 |     // Note: deviation from the paper: the min&max are interleaved here.
 48 |     /// Axis planes for each child.
 49 |     /// The plane position could be calculated, for example, with `p.x + bitcast<f32>(e[0] << 23) * child_min_x[0]`
 50 |     /// But in the actual intersection implementation the ray is transformed instead.
 51 |     pub child_min_x: [u8; 8],
 52 |     pub child_max_x: [u8; 8],
 53 |     pub child_min_y: [u8; 8],
 54 |     pub child_max_y: [u8; 8],
 55 |     pub child_min_z: [u8; 8],
 56 |     pub child_max_z: [u8; 8],
 57 | }
 58 | 
 59 | impl Debug for CwBvhNode {
 60 |     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
 61 |         f.debug_struct("CwBvhNode")
 62 |             .field("p", &self.p)
 63 |             .field("e", &self.e)
 64 |             .field("imask", &format!("{:#010b}", &self.imask))
 65 |             .field("child_base_idx", &self.child_base_idx)
 66 |             .field("primitive_base_idx", &self.primitive_base_idx)
 67 |             .field(
 68 |                 "child_meta",
 69 |                 &self
 70 |                     .child_meta
 71 |                     .iter()
 72 |                     .map(|c| format!("{:#010b}", c))
 73 |                     .collect::<Vec<_>>(),
 74 |             )
 75 |             .field("child_min_x", &self.child_min_x)
 76 |             .field("child_max_x", &self.child_max_x)
 77 |             .field("child_min_y", &self.child_min_y)
 78 |             .field("child_max_y", &self.child_max_y)
 79 |             .field("child_min_z", &self.child_min_z)
 80 |             .field("child_max_z", &self.child_max_z)
 81 |             .finish()
 82 |     }
 83 | }
 84 | 
 85 | unsafe impl Pod for CwBvhNode {}
 86 | unsafe impl Zeroable for CwBvhNode {}
 87 | 
 88 | pub(crate) const EPSILON: f32 = 0.0001;
 89 | 
 90 | impl CwBvhNode {
 91 |     #[inline(always)]
 92 |     pub fn intersect_ray(&self, ray: &Ray, oct_inv4: u32) -> u32 {
 93 |         #[cfg(all(
 94 |             any(target_arch = "x86", target_arch = "x86_64"),
 95 |             target_feature = "sse2"
 96 |         ))]
 97 |         {
 98 |             self.intersect_ray_simd(ray, oct_inv4)
 99 |         }
100 | 
101 |         #[cfg(not(all(
102 |             any(target_arch = "x86", target_arch = "x86_64"),
103 |             target_feature = "sse2"
104 |         )))]
105 |         {
106 |             self.intersect_ray_basic(ray, oct_inv4)
107 |         }
108 |     }
109 | 
110 |     /// Intersects only one child at a time with the given ray. Limited simd usage on platforms that support it. Exists for reference & compatibility.
111 |     /// Traversal times with CwBvhNode::intersect_ray_simd take less than half the time vs intersect_ray_basic.
112 |     #[inline(always)]
113 |     pub fn intersect_ray_basic(&self, ray: &Ray, oct_inv4: u32) -> u32 {
114 |         let adjusted_ray_dir_inv = self.compute_extent() * ray.inv_direction;
115 |         let adjusted_ray_origin = (Vec3A::from(self.p) - ray.origin) * ray.inv_direction;
116 | 
117 |         let mut hit_mask = 0;
118 | 
119 |         let rdx = ray.direction.x < 0.0;
120 |         let rdy = ray.direction.y < 0.0;
121 |         let rdz = ray.direction.z < 0.0;
122 | 
123 |         let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);
124 | 
125 |         for child in 0..8 {
126 |             let q_lo_x = self.child_min_x[child];
127 |             let q_lo_y = self.child_min_y[child];
128 |             let q_lo_z = self.child_min_z[child];
129 | 
130 |             let q_hi_x = self.child_max_x[child];
131 |             let q_hi_y = self.child_max_y[child];
132 |             let q_hi_z = self.child_max_z[child];
133 | 
134 |             let x_min = if rdx { q_hi_x } else { q_lo_x };
135 |             let x_max = if rdx { q_lo_x } else { q_hi_x };
136 |             let y_min = if rdy { q_hi_y } else { q_lo_y };
137 |             let y_max = if rdy { q_lo_y } else { q_hi_y };
138 |             let z_min = if rdz { q_hi_z } else { q_lo_z };
139 |             let z_max = if rdz { q_lo_z } else { q_hi_z };
140 | 
141 |             let mut tmin3 = vec3a(x_min as f32, y_min as f32, z_min as f32);
142 |             let mut tmax3 = vec3a(x_max as f32, y_max as f32, z_max as f32);
143 | 
144 |             // Account for grid origin and scale
145 |             tmin3 = tmin3 * adjusted_ray_dir_inv + adjusted_ray_origin;
146 |             tmax3 = tmax3 * adjusted_ray_dir_inv + adjusted_ray_origin;
147 | 
148 |             let tmin = tmin3.x.max(tmin3.y).max(tmin3.z).max(EPSILON); //ray.tmin?
149 |             let tmax = tmax3.x.min(tmax3.y).min(tmax3.z).min(ray.tmax);
150 | 
151 |             let intersected = tmin <= tmax;
152 |             if intersected {
153 |                 let child_bits = extract_byte64(child_bits8, child);
154 |                 let bit_index = extract_byte64(bit_index8, child);
155 |                 hit_mask |= child_bits << bit_index;
156 |             }
157 |         }
158 | 
159 |         hit_mask
160 |     }
161 | 
162 |     #[inline(always)]
163 |     pub fn intersect_aabb(&self, aabb: &Aabb, oct_inv4: u32) -> u32 {
164 |         let extent_rcp = 1.0 / self.compute_extent();
165 |         let p = Vec3A::from(self.p);
166 | 
167 |         // Transform the query aabb into the node's local space
168 |         let adjusted_aabb = Aabb::new((aabb.min - p) * extent_rcp, (aabb.max - p) * extent_rcp);
169 | 
170 |         let mut hit_mask = 0;
171 | 
172 |         let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);
173 | 
174 |         for child in 0..8 {
175 |             if self.local_child_aabb(child).intersect_aabb(&adjusted_aabb) {
176 |                 let child_bits = extract_byte64(child_bits8, child);
177 |                 let bit_index = extract_byte64(bit_index8, child);
178 |                 hit_mask |= child_bits << bit_index;
179 |             }
180 |         }
181 | 
182 |         hit_mask
183 |     }
184 | 
185 |     #[inline(always)]
186 |     pub fn contains_point(&self, point: &Vec3A, oct_inv4: u32) -> u32 {
187 |         let extent_rcp = 1.0 / self.compute_extent();
188 |         let p = Vec3A::from(self.p);
189 | 
190 |         // Transform the query point into the node's local space
191 |         let adjusted_point = (*point - p) * extent_rcp;
192 | 
193 |         let mut hit_mask = 0;
194 | 
195 |         let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);
196 | 
197 |         for child in 0..8 {
198 |             if self.local_child_aabb(child).contains_point(adjusted_point) {
199 |                 let child_bits = extract_byte64(child_bits8, child);
200 |                 let bit_index = extract_byte64(bit_index8, child);
201 |                 hit_mask |= child_bits << bit_index;
202 |             }
203 |         }
204 | 
205 |         hit_mask
206 |     }
207 | 
208 |     // TODO intersect frustum
209 |     // https://github.com/zeux/niagara/blob/bf90aa8c78e352d3b753b35553a3bcc8c65ef7a0/src/shaders/drawcull.comp.glsl#L71
210 |     // https://iquilezles.org/articles/frustumcorrect/
211 | 
212 |     #[inline(always)]
213 |     pub fn get_child_and_index_bits(&self, oct_inv4: u32) -> (u64, u64) {
214 |         let mut oct_inv8 = oct_inv4 as u64;
215 |         oct_inv8 |= oct_inv8 << 32;
216 |         let meta8 = unsafe { transmute::<[u8; 8], u64>(self.child_meta) };
217 | 
218 |         // (meta8 & (meta8 << 1)) takes advantage of the offset indexing for inner nodes [24..32)
219 |         // [0b00011000..=0b00011111). For leaf nodes [0..24) these two bits (0b00011000) are never both set.
220 |         let inner_mask = 0b0001000000010000000100000001000000010000000100000001000000010000;
221 |         let is_inner8 = (meta8 & (meta8 << 1)) & inner_mask;
222 | 
223 |         // 00010000 >> 4: 00000001, then 00000001 * 0xff: 11111111
224 |         let inner_mask8 = (is_inner8 >> 4) * 0xffu64;
225 | 
226 |         // Each byte of bit_index8 contains the traversal priority, biased by 24, for internal nodes, and
227 |         // the triangle offset for leaf nodes. The bit index will later be used to shift the child bits.
228 |         let index_mask = 0b0001111100011111000111110001111100011111000111110001111100011111;
229 |         let bit_index8 = (meta8 ^ (oct_inv8 & inner_mask8)) & index_mask;
230 | 
231 |         // For internal nodes child_bits8 will just be 1 in each byte, so that bit will then be shifted into the high
232 |         // byte of the node hit_mask (see CwBvhNode::intersect_ray). For leaf nodes it will have the unary encoded
233 |         // leaf primitive count and that will be shifted into the lower 24 bits of node hit_mask.
234 |         let child_mask = 0b0000011100000111000001110000011100000111000001110000011100000111;
235 |         let child_bits8 = (meta8 >> 5) & child_mask;
236 |         (child_bits8, bit_index8)
237 |     }
238 | 
239 |     /// Get local child aabb position relative to the parent
240 |     #[inline(always)]
241 |     pub fn local_child_aabb(&self, child: usize) -> Aabb {
242 |         Aabb::new(
243 |             vec3a(
244 |                 self.child_min_x[child] as f32,
245 |                 self.child_min_y[child] as f32,
246 |                 self.child_min_z[child] as f32,
247 |             ),
248 |             vec3a(
249 |                 self.child_max_x[child] as f32,
250 |                 self.child_max_y[child] as f32,
251 |                 self.child_max_z[child] as f32,
252 |             ),
253 |         )
254 |     }
255 | 
256 |     #[inline(always)]
257 |     pub fn child_aabb(&self, child: usize) -> Aabb {
258 |         let e = self.compute_extent();
259 |         let p: Vec3A = self.p.into();
260 |         let mut local_aabb = self.local_child_aabb(child);
261 |         local_aabb.min = local_aabb.min * e + p;
262 |         local_aabb.max = local_aabb.max * e + p;
263 |         local_aabb
264 |     }
265 | 
266 |     #[inline(always)]
267 |     pub fn aabb(&self) -> Aabb {
268 |         let e = self.compute_extent();
269 |         let p: Vec3A = self.p.into();
270 |         Aabb::new(p, p + e * NQ_SCALE)
271 |     }
272 | 
273 |     /// Convert stored extent exponent into float vector
274 |     #[inline(always)]
275 |     pub fn compute_extent(&self) -> Vec3A {
276 |         vec3a(
277 |             f32::from_bits((self.e[0] as u32) << 23),
278 |             f32::from_bits((self.e[1] as u32) << 23),
279 |             f32::from_bits((self.e[2] as u32) << 23),
280 |         )
281 |     }
282 | 
283 |     // If the child is empty this will also return true. If needed also use CwBvh::is_child_empty().
284 |     #[inline(always)]
285 |     pub fn is_leaf(&self, child: usize) -> bool {
286 |         (self.imask & (1 << child)) == 0
287 |     }
288 | 
289 |     #[inline(always)]
290 |     pub fn is_child_empty(&self, child: usize) -> bool {
291 |         self.child_meta[child] == 0
292 |     }
293 | 
294 |     /// Returns the primitive starting index and primitive count for the given child.
295 |     #[inline(always)]
296 |     pub fn child_primitives(&self, child: usize) -> (u32, u32) {
297 |         let child_meta = self.child_meta[child];
298 |         let starting_index = self.primitive_base_idx + (self.child_meta[child] & 0b11111) as u32;
299 |         let primitive_count = (child_meta & 0b11100000).count_ones();
300 |         (starting_index, primitive_count)
301 |     }
302 | 
303 |     /// Returns the node index of the given child.
304 |     #[inline(always)]
305 |     pub fn child_node_index(&self, child: usize) -> u32 {
306 |         let child_meta = self.child_meta[child];
307 |         let slot_index = (child_meta & 0b11111) as usize - 24;
308 |         let relative_index = (self.imask as u32 & !(0xffffffffu32 << slot_index)).count_ones();
309 |         self.child_base_idx + relative_index
310 |     }
311 | }
312 | 
313 | #[inline(always)]
314 | pub fn extract_byte(x: u32, b: u32) -> u32 {
315 |     (x >> (b * 8)) & 0xFFu32
316 | }
317 | 
318 | #[inline(always)]
319 | pub fn extract_byte64(x: u64, b: usize) -> u32 {
320 |     ((x >> (b * 8)) as u32) & 0xFFu32
321 | }
322 | 


--------------------------------------------------------------------------------
/src/cwbvh/simd.rs:
--------------------------------------------------------------------------------
  1 | use glam::*;
  2 | #[cfg(target_arch = "x86")]
  3 | use std::arch::x86::*;
  4 | #[cfg(target_arch = "x86_64")]
  5 | use std::arch::x86_64::*;
  6 | 
  7 | use crate::{
  8 |     cwbvh::{
  9 |         node::{extract_byte64, EPSILON},
 10 |         CwBvhNode,
 11 |     },
 12 |     ray::Ray,
 13 | };
 14 | 
 15 | impl CwBvhNode {
 16 |     #[inline(always)]
 17 |     pub fn intersect_ray_simd(&self, ray: &Ray, oct_inv4: u32) -> u32 {
 18 |         let adj_ray_dir_inv = self.compute_extent() * ray.inv_direction;
 19 |         let adj_ray_origin = (Vec3A::from(self.p) - ray.origin) * ray.inv_direction;
 20 |         let mut hit_mask = 0u32;
 21 |         unsafe {
 22 |             let adj_ray_dir_inv_x = _mm_set1_ps(adj_ray_dir_inv.x);
 23 |             let adj_ray_dir_inv_y = _mm_set1_ps(adj_ray_dir_inv.y);
 24 |             let adj_ray_dir_inv_z = _mm_set1_ps(adj_ray_dir_inv.z);
 25 | 
 26 |             let adj_ray_orig_x = _mm_set1_ps(adj_ray_origin.x);
 27 |             let adj_ray_orig_y = _mm_set1_ps(adj_ray_origin.y);
 28 |             let adj_ray_orig_z = _mm_set1_ps(adj_ray_origin.z);
 29 | 
 30 |             let rdx = ray.direction.x < 0.0;
 31 |             let rdy = ray.direction.y < 0.0;
 32 |             let rdz = ray.direction.z < 0.0;
 33 | 
 34 |             let (child_bits8, bit_index8) = self.get_child_and_index_bits(oct_inv4);
 35 | 
 36 |             #[inline(always)]
 37 |             fn get_q(v: &[u8; 8], i: usize) -> __m128 {
 38 |                 // get_q is the most expensive part of intersect_simd
 39 |                 // Tried version with _mm_cvtepu8_epi32 and _mm_cvtepi32_ps, it was a lot slower.
 40 |                 // Tried transmuting v into a u64 and bit shifting, it was a lot slower.
 41 |                 unsafe {
 42 |                     _mm_set_ps(
 43 |                         *v.get_unchecked(i * 4 + 3) as f32,
 44 |                         *v.get_unchecked(i * 4 + 2) as f32,
 45 |                         *v.get_unchecked(i * 4 + 1) as f32,
 46 |                         *v.get_unchecked(i * 4) as f32,
 47 |                     )
 48 |                 }
 49 |             }
 50 | 
 51 |             // Intersect 4 aabbs at a time:
 52 |             for i in 0..2 {
 53 |                 // It's possible to select hi/lo outside the loop with child_min_x, etc... but that seems quite a bit slower
 54 |                 // using _mm_blendv_ps or similar instead of `if rdx`, etc... is slower
 55 | 
 56 |                 // Interleaving x, y, z like this is slightly faster than loading all at once. Tried using _mm_prefetch without luck
 57 |                 let q_lo_x = get_q(&self.child_min_x, i);
 58 |                 let q_hi_x = get_q(&self.child_max_x, i);
 59 |                 let x_min = if rdx { q_hi_x } else { q_lo_x };
 60 |                 let x_max = if rdx { q_lo_x } else { q_hi_x };
 61 |                 // Tried using _mm_fmadd_ps, it was a lot slower
 62 |                 let tmin_x = _mm_add_ps(_mm_mul_ps(x_min, adj_ray_dir_inv_x), adj_ray_orig_x);
 63 |                 let tmax_x = _mm_add_ps(_mm_mul_ps(x_max, adj_ray_dir_inv_x), adj_ray_orig_x);
 64 | 
 65 |                 let q_lo_y = get_q(&self.child_min_y, i);
 66 |                 let q_hi_y = get_q(&self.child_max_y, i);
 67 |                 let y_min = if rdy { q_hi_y } else { q_lo_y };
 68 |                 let y_max = if rdy { q_lo_y } else { q_hi_y };
 69 |                 let tmin_y = _mm_add_ps(_mm_mul_ps(y_min, adj_ray_dir_inv_y), adj_ray_orig_y);
 70 |                 let tmax_y = _mm_add_ps(_mm_mul_ps(y_max, adj_ray_dir_inv_y), adj_ray_orig_y);
 71 | 
 72 |                 let q_lo_z = get_q(&self.child_min_z, i);
 73 |                 let q_hi_z = get_q(&self.child_max_z, i);
 74 |                 let z_min = if rdz { q_hi_z } else { q_lo_z };
 75 |                 let z_max = if rdz { q_lo_z } else { q_hi_z };
 76 |                 let tmin_z = _mm_add_ps(_mm_mul_ps(z_min, adj_ray_dir_inv_z), adj_ray_orig_z);
 77 |                 let tmax_z = _mm_add_ps(_mm_mul_ps(z_max, adj_ray_dir_inv_z), adj_ray_orig_z);
 78 | 
 79 |                 // Tried using _mm_fmadd_ps, it was a lot slower
 80 |                 // Compute intersection
 81 |                 let tmin = _mm_max_ps(tmin_x, _mm_max_ps(tmin_y, tmin_z));
 82 |                 let tmax = _mm_min_ps(tmax_x, _mm_min_ps(tmax_y, tmax_z));
 83 |                 let tmin = _mm_max_ps(tmin, _mm_set1_ps(EPSILON)); //ray.tmin?
 84 |                 let tmax = _mm_min_ps(tmax, _mm_set1_ps(ray.tmax));
 85 | 
 86 |                 let intersected = _mm_cmple_ps(tmin, tmax);
 87 |                 let mask = _mm_movemask_ps(intersected);
 88 | 
 89 |                 for j in 0..4 {
 90 |                     let offset = i * 4 + j;
 91 |                     if (mask & (1 << j)) != 0 {
 92 |                         let child_bits = extract_byte64(child_bits8, offset);
 93 |                         let bit_index = extract_byte64(bit_index8, offset);
 94 |                         hit_mask |= child_bits << bit_index;
 95 |                     }
 96 |                 }
 97 |             }
 98 |         }
 99 |         hit_mask
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/cwbvh/traverse_macro.rs:
--------------------------------------------------------------------------------
  1 | /// Traverse a CwBvh with custom node and primitive intersections.
  2 | /// I really didn't want to use a macro but it seems like everything else using closures/yielding is slower given
  3 | /// both generic node and primitive traversal.
  4 | ///
  5 | /// # Parameters
  6 | /// - `$cwbvh`: `&CwBvh` The CwBvh to be traversed.
  7 | /// - `$node`: `&CwBvhNode` The current node in the BVH that is being traversed.
  8 | /// - `$state`: `Traversal` Mutable traversal state.
  9 | /// - `$node_intersection`: An expression that is executed for each node intersection during traversal.
 10 | ///     It should test for intersection against the current `node`, making use of `state.oct_inv4` u32.
 11 | ///     It should return a u32 `hitmask` of the node children hitmask corresponding to which nodes were intersected.
 12 | /// - `$primitive_intersection`: A code block that is executed for each primitive intersection.
 13 | ///     It should read the current `state.primitive_id` u32. This is the index into the primitive indices for the
 14 | ///     current primitive to be tested. Optionally use `break` to halt traversal.
 15 | ///
 16 | /// # Example: Closest hit ray traversal
 17 | /// ```
 18 | /// use obvhs::{
 19 | ///     cwbvh::{builder::build_cwbvh_from_tris, node::CwBvhNode},
 20 | ///     ray::{Ray, RayHit},
 21 | ///     test_util::geometry::{icosphere, PLANE},
 22 | ///     triangle::Triangle,
 23 | ///     BvhBuildParams,
 24 | ///     traverse,
 25 | /// };
 26 | /// use glam::*;
 27 | /// use std::time::Duration;
 28 | ///
 29 | /// let mut tris: Vec<Triangle> = Vec::new();
 30 | /// tris.extend(icosphere(1));
 31 | /// tris.extend(PLANE);
 32 | ///
 33 | /// let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0));
 34 | ///
 35 | /// let bvh = build_cwbvh_from_tris(&tris, BvhBuildParams::medium_build(), &mut Duration::default());
 36 | /// let mut hit = RayHit::none();
 37 | /// let mut traverse_ray = ray.clone();
 38 | /// let mut state = bvh.new_traversal(ray.direction);
 39 | /// let mut node;
 40 | /// traverse!(bvh, node, state,
 41 | ///     // Node intersection:
 42 | ///     node.intersect_ray(&traverse_ray, state.oct_inv4),
 43 | ///     // Primitive intersection:
 44 | ///     {
 45 | ///         let t = tris[bvh.primitive_indices[state.primitive_id as usize] as usize].intersect(&traverse_ray);
 46 | ///         if t < traverse_ray.tmax {
 47 | ///             hit.primitive_id = state.primitive_id;
 48 | ///             hit.t = t;
 49 | ///             traverse_ray.tmax = t;
 50 | ///         }
 51 | ///     }
 52 | /// );
 53 | ///
 54 | /// let did_hit = hit.t < ray.tmax;
 55 | /// assert!(did_hit);
 56 | /// assert!(bvh.primitive_indices[hit.primitive_id as usize] == 62);
 57 | /// ```
 58 | #[macro_export]
 59 | macro_rules! traverse {
 60 |     ($cwbvh:expr, $node:expr, $state:expr, $node_intersection:expr, $primitive_intersection:expr) => {{
 61 |         loop {
 62 |             // While the primitive group is not empty
 63 |             while $state.primitive_group.y != 0 {
 64 |                 let local_primitive_index = $crate::cwbvh::firstbithigh($state.primitive_group.y);
 65 | 
 66 |                 // Remove primitive from current_group
 67 |                 $state.primitive_group.y &= !(1u32 << local_primitive_index);
 68 | 
 69 |                 $state.primitive_id = $state.primitive_group.x + local_primitive_index;
 70 |                 $primitive_intersection
 71 |             }
 72 |             $state.primitive_group = UVec2::ZERO;
 73 | 
 74 |             // If there's remaining nodes in the current group to check
 75 |             if $state.current_group.y & 0xff000000 != 0 {
 76 |                 let hits_imask = $state.current_group.y;
 77 | 
 78 |                 let child_index_offset = $crate::cwbvh::firstbithigh(hits_imask);
 79 |                 let child_index_base = $state.current_group.x;
 80 | 
 81 |                 // Remove node from current_group
 82 |                 $state.current_group.y &= !(1u32 << child_index_offset);
 83 | 
 84 |                 // If the node group is not yet empty, push it on the stack
 85 |                 if $state.current_group.y & 0xff000000 != 0 {
 86 |                     $state.stack.push($state.current_group);
 87 |                 }
 88 | 
 89 |                 let slot_index = (child_index_offset - 24) ^ ($state.oct_inv4 & 0xff);
 90 |                 let relative_index = (hits_imask & !(0xffffffffu32 << slot_index)).count_ones();
 91 | 
 92 |                 let child_node_index = child_index_base + relative_index;
 93 | 
 94 |                 $node = &$cwbvh.nodes[child_node_index as usize];
 95 | 
 96 |                 $state.hitmask = $node_intersection;
 97 | 
 98 |                 $state.current_group.x = $node.child_base_idx;
 99 |                 $state.primitive_group.x = $node.primitive_base_idx;
100 | 
101 |                 $state.current_group.y = (&$state.hitmask & 0xff000000u32) | ($node.imask as u32);
102 |                 $state.primitive_group.y = &$state.hitmask & 0x00ffffffu32;
103 |             } else {
104 |                 // Below is only needed when using triangle postponing, which would only be helpful on the
105 |                 // GPU (it helps reduce thread divergence). Also, this isn't compatible with traversal yeilding.
106 |                 // $state.primitive_group = $state.current_group;
107 |                 $state.current_group = UVec2::ZERO;
108 |             }
109 | 
110 |             // If there's no remaining nodes in the current group to check, pop it off the stack.
111 |             if $state.primitive_group.y == 0 && ($state.current_group.y & 0xff000000) == 0 {
112 |                 // If the stack is empty, end traversal.
113 |                 if $state.stack.is_empty() {
114 |                     $state.current_group.y = 0;
115 |                     break;
116 |                 }
117 | 
118 |                 $state.current_group = $state.stack.pop_fast();
119 |             }
120 |         }
121 |     }};
122 | }
123 | 


--------------------------------------------------------------------------------
/src/heapstack.rs:
--------------------------------------------------------------------------------
  1 | //! A stack data structure implemented on the heap with adjustable capacity.
  2 | 
  3 | /// A stack data structure implemented on the heap with adjustable capacity.
  4 | ///
  5 | /// This structure allows pushing and popping elements and will never automatically
  6 | /// allocate or deallocate. The only functions that will result in allocation are
  7 | /// `HeapStack::new_with_capacity` and `HeapStack::reserve`.
  8 | ///
  9 | /// The elements must implement the `Clone` and `Default` traits.
 10 | #[derive(Default)]
 11 | pub struct HeapStack<T: Clone + Default> {
 12 |     data: Vec<T>,
 13 |     index: usize,
 14 | }
 15 | 
 16 | impl<T: Clone + Default> HeapStack<T> {
 17 |     /// Creates a new `HeapStack` with the specified initial capacity.
 18 |     ///
 19 |     /// # Arguments
 20 |     /// * `cap` - The initial capacity of the stack. Must be greater than zero.
 21 |     ///
 22 |     /// # Returns
 23 |     /// A `HeapStack` with pre-allocated space for `cap` elements.
 24 |     ///
 25 |     /// # Panics
 26 |     /// This function will panic if `cap` is zero.
 27 |     #[inline(always)]
 28 |     pub fn new_with_capacity(cap: usize) -> Self {
 29 |         assert!(cap > 0);
 30 |         HeapStack {
 31 |             data: vec![Default::default(); cap],
 32 |             index: 0,
 33 |         }
 34 |     }
 35 | 
 36 |     /// Pushes a value onto the stack.
 37 |     ///
 38 |     /// # Arguments
 39 |     /// * `v` - The value to be pushed onto the stack.
 40 |     ///
 41 |     /// # Panics
 42 |     /// This function will panic if the stack is full.
 43 |     #[inline(always)]
 44 |     pub fn push(&mut self, v: T) {
 45 |         if self.index < self.data.len() {
 46 |             *unsafe { self.data.get_unchecked_mut(self.index) } = v;
 47 |             self.index += 1;
 48 |         } else {
 49 |             panic!("Index out of bounds: the HeapStack is full (length: {}) and cannot accommodate more elements", self.data.len());
 50 |         }
 51 |     }
 52 | 
 53 |     /// Pops a value from the stack.
 54 |     ///
 55 |     /// # Returns
 56 |     /// `Some(T)` if the stack is not empty, otherwise `None`.
 57 |     #[inline(always)]
 58 |     pub fn pop(&mut self) -> Option<&T> {
 59 |         if self.index > 0 {
 60 |             self.index = self.index.saturating_sub(1);
 61 |             Some(&self.data[self.index])
 62 |         } else {
 63 |             None
 64 |         }
 65 |     }
 66 | 
 67 |     /// Pops a value from the stack without checking bounds.
 68 |     ///
 69 |     /// This function is safe to call because a `HeapStack` cannot have a capacity of zero.
 70 |     /// However, if the stack is empty when this function is called, it will access what was previously
 71 |     /// the first value in the stack, which may not be the expected behavior.
 72 |     ///
 73 |     /// # Returns
 74 |     /// The value at the top of the stack.
 75 |     #[inline(always)]
 76 |     pub fn pop_fast(&mut self) -> &T {
 77 |         self.index = self.index.saturating_sub(1);
 78 |         let v = unsafe { self.data.get_unchecked(self.index) };
 79 |         v
 80 |     }
 81 | 
 82 |     /// Returns the number of elements in the stack.
 83 |     ///
 84 |     /// # Returns
 85 |     /// The length of the stack.
 86 |     #[inline(always)]
 87 |     pub fn len(&self) -> usize {
 88 |         self.index
 89 |     }
 90 | 
 91 |     /// Returns true if the stack is empty.
 92 |     #[inline(always)]
 93 |     pub fn is_empty(&self) -> bool {
 94 |         self.index == 0
 95 |     }
 96 | 
 97 |     /// Clears the stack, removing all elements.
 98 |     ///
 99 |     /// This operation does not deallocate the stack's capacity.
100 |     #[inline(always)]
101 |     pub fn clear(&mut self) {
102 |         self.index = 0;
103 |     }
104 | 
105 |     /// Reserves capacity for at least `cap` elements.
106 |     ///
107 |     /// # Arguments
108 |     /// * `cap` - The desired capacity.
109 |     /// If the new capacity is smaller than the current capacity, this function does nothing.
110 |     #[inline(always)]
111 |     pub fn reserve(&mut self, cap: usize) {
112 |         if cap < self.data.len() {
113 |             return;
114 |         }
115 |         self.data.resize(cap, Default::default());
116 |     }
117 | }
118 | 
119 | #[cfg(test)]
120 | mod tests {
121 |     use super::*;
122 | 
123 |     #[test]
124 |     fn test_new_with_capacity() {
125 |         let stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
126 |         assert_eq!(stack.len(), 0);
127 |         assert_eq!(stack.data.len(), 10);
128 |     }
129 | 
130 |     #[test]
131 |     fn test_push_and_pop() {
132 |         let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
133 |         stack.push(1);
134 |         stack.push(2);
135 |         stack.push(3);
136 | 
137 |         assert_eq!(stack.len(), 3);
138 |         assert_eq!(stack.pop(), Some(&3));
139 |         assert_eq!(stack.pop(), Some(&2));
140 |         assert_eq!(stack.pop(), Some(&1));
141 |         assert_eq!(stack.pop(), None);
142 |     }
143 | 
144 |     #[test]
145 |     #[should_panic(expected = "Index out of bounds: the HeapStack is full")]
146 |     fn test_push_panic() {
147 |         let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(2);
148 |         stack.push(1);
149 |         stack.push(2);
150 |         stack.push(3); // This should panic
151 |     }
152 | 
153 |     #[test]
154 |     fn test_pop_fast() {
155 |         let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
156 |         stack.push(1);
157 |         stack.push(2);
158 |         stack.push(3);
159 | 
160 |         assert_eq!(*stack.pop_fast(), 3);
161 |         assert_eq!(*stack.pop_fast(), 2);
162 |         assert_eq!(*stack.pop_fast(), 1);
163 |     }
164 | 
165 |     #[test]
166 |     fn test_clear() {
167 |         let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(10);
168 |         stack.push(1);
169 |         stack.push(2);
170 |         stack.push(3);
171 | 
172 |         stack.clear();
173 |         assert_eq!(stack.len(), 0);
174 |         assert_eq!(stack.pop(), None);
175 |     }
176 | 
177 |     #[test]
178 |     fn test_reserve() {
179 |         let mut stack: HeapStack<i32> = HeapStack::new_with_capacity(5);
180 |         assert_eq!(stack.data.len(), 5);
181 | 
182 |         stack.reserve(10);
183 |         assert_eq!(stack.data.len(), 10);
184 | 
185 |         stack.reserve(3); // Should not shrink the capacity
186 |         assert_eq!(stack.data.len(), 10);
187 |     }
188 | }
189 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! # BVH Construction and Traversal Library
  2 | //!
  3 | //! - [PLOC](https://meistdan.github.io/publications/ploc/paper.pdf) BVH2 builder with [Parallel Reinsertion](https://meistdan.github.io/publications/prbvh/paper.pdf) and spatial pre-splits.
  4 | //! - [CWBVH](https://research.nvidia.com/sites/default/files/publications/ylitie2017hpg-paper.pdf) An eight-way compressed wide BVH8 builder. Each BVH Node is compressed so that it takes up only 80 bytes per node.
  5 | //! - CPU traversal for both BVH2 and CWBVH (SIMD traversal, intersecting 4 nodes at a time)
  6 | //! - For GPU traversal example, see the [Tray Racing](https://github.com/DGriffin91/tray_racing) benchmark
  7 | //!
  8 | //! OBVHS optionally uses [rayon](https://github.com/rayon-rs/rayon) to parallelize building. Many parts of the building process are parallelized, but single threaded building speed has initally been the priority so there is still quite a bit of room for improvement in parallel building performance.
  9 | //!
 10 | //! ## Example
 11 | //!
 12 | //! ```
 13 | //! use glam::*;
 14 | //! use obvhs::{
 15 | //!     cwbvh::builder::build_cwbvh_from_tris,
 16 | //!     ray::{Ray, RayHit},
 17 | //!     test_util::geometry::{icosphere, PLANE},
 18 | //!     triangle::Triangle,
 19 | //!     BvhBuildParams,
 20 | //! };
 21 | //! use std::time::Duration;
 22 | //!
 23 | //! fn main() {
 24 | //!     // Build a scene with an icosphere and a plane
 25 | //!     // BVH primitives do not need to be triangles, the BVH builder is only concerned with AABBs.
 26 | //!     // (With the exception of optional precise triangle aabb splitting)
 27 | //!     let mut tris: Vec<Triangle> = Vec::new();
 28 | //!     tris.extend(icosphere(1));
 29 | //!     tris.extend(PLANE);
 30 | //!
 31 | //!     // Build the BVH.
 32 | //!     // build_cwbvh_from_tris is just a helper that can build from BvhBuildParams and the
 33 | //!     // respective presets. Feel free to copy the contents of build_cwbvh_from_tris or
 34 | //!     // build_cwbvh. They are very straightforward. If you don't want to use Triangles as the
 35 | //!     // primitive, use  build_cwbvh instead. build_cwbvh_from_tris just adds support for
 36 | //!     // splitting tris.
 37 | //!     let bvh = build_cwbvh_from_tris(
 38 | //!         &tris,
 39 | //!         BvhBuildParams::medium_build(),
 40 | //!         &mut Duration::default(),
 41 | //!     );
 42 | //!
 43 | //!     // Create a new ray
 44 | //!     let ray = Ray::new_inf(vec3a(0.1, 0.1, 4.0), vec3a(0.0, 0.0, -1.0));
 45 | //!
 46 | //!     // Traverse the BVH, finding the closest hit.
 47 | //!     let mut ray_hit = RayHit::none();
 48 | //!     if bvh.ray_traverse(ray, &mut ray_hit, |ray, id| {
 49 | //!         // Use primitive_indices to look up the original primitive id.
 50 | //!         // (Could reorder tris per bvh.primitive_indices to avoid this lookup, see
 51 | //!         // cornell_box_cwbvh example)
 52 | //!         tris[bvh.primitive_indices[id] as usize].intersect(ray)
 53 | //!     }) {
 54 | //!         println!(
 55 | //!             "Hit Triangle {}",
 56 | //!             bvh.primitive_indices[ray_hit.primitive_id as usize]
 57 | //!         );
 58 | //!         println!("Distance to hit: {}", ray_hit.t);
 59 | //!     } else {
 60 | //!         println!("Miss");
 61 | //!     }
 62 | //! }
 63 | //! ```
 64 | 
 65 | use std::time::Duration;
 66 | 
 67 | use aabb::Aabb;
 68 | use glam::Mat4;
 69 | use ploc::{PlocSearchDistance, SortPrecision};
 70 | use triangle::Triangle;
 71 | 
 72 | pub mod aabb;
 73 | pub mod bvh2;
 74 | pub mod cwbvh;
 75 | pub mod heapstack;
 76 | pub mod ploc;
 77 | pub mod ray;
 78 | pub mod rt_triangle;
 79 | pub mod splits;
 80 | pub mod test_util;
 81 | pub mod triangle;
 82 | 
 83 | /// A trait for types that can be bounded by an axis-aligned bounding box (AABB). Used in Bvh2/CwBvh validation.
 84 | pub trait Boundable {
 85 |     fn aabb(&self) -> Aabb;
 86 | }
 87 | 
 88 | /// A trait for types that can have a matrix transform applied. Primarily for testing/examples.
 89 | pub trait Transformable {
 90 |     fn transform(&mut self, matrix: &Mat4);
 91 | }
 92 | 
 93 | /// Apply a function to each component of a type.
 94 | #[doc(hidden)]
 95 | pub trait PerComponent<C1, C2 = C1, Output = Self> {
 96 |     fn per_comp(self, f: impl Fn(C1) -> C2) -> Output;
 97 | }
 98 | 
 99 | impl<Input, C1, C2, Output> PerComponent<C1, C2, Output> for Input
100 | where
101 |     Input: Into<[C1; 3]>,
102 |     Output: From<[C2; 3]>,
103 | {
104 |     /// Applies a function to each component of the input.
105 |     fn per_comp(self, f: impl Fn(C1) -> C2) -> Output {
106 |         let [x, y, z] = self.into();
107 |         Output::from([f(x), f(y), f(z)])
108 |     }
109 | }
110 | 
111 | #[doc(hidden)]
112 | pub trait VecExt {
113 |     /// Computes the base 2 logarithm of each component of the vector.
114 |     fn log2(self) -> Self;
115 |     /// Computes the base 2 exponential of each component of the vector.
116 |     fn exp2(self) -> Self;
117 | }
118 | 
119 | impl VecExt for glam::Vec3 {
120 |     /// Computes the base 2 logarithm of each component of the `Vec3` vector.
121 |     fn log2(self) -> Self {
122 |         self.per_comp(f32::log2)
123 |     }
124 | 
125 |     /// Computes the base 2 exponential of each component of the `Vec3` vector.
126 |     fn exp2(self) -> Self {
127 |         self.per_comp(f32::exp2)
128 |     }
129 | }
130 | 
131 | impl VecExt for glam::Vec3A {
132 |     /// Computes the base 2 logarithm of each component of the `Vec3A` vector.
133 |     fn log2(self) -> Self {
134 |         self.per_comp(f32::log2)
135 |     }
136 | 
137 |     /// Computes the base 2 exponential of each component of the `Vec3A` vector.
138 |     fn exp2(self) -> Self {
139 |         self.per_comp(f32::exp2)
140 |     }
141 | }
142 | 
143 | /// A macro to measure and print the execution time of a block of code.
144 | ///
145 | /// # Arguments
146 | /// * `$label` - A string label to identify the code block being timed.
147 | /// * `$($code:tt)*` - The code block whose execution time is to be measured.
148 | ///
149 | /// # Usage
150 | /// ```rust
151 | /// use obvhs::timeit;
152 | /// timeit!["example",
153 | ///     // code to measure
154 | /// ];
155 | /// ```
156 | ///
157 | /// # Note
158 | /// The macro purposefully doesn't include a scope so variables don't need to
159 | /// be passed out of it. This allows it to be trivially added to existing code.
160 | ///
161 | /// This macro only measures time when the `timeit` feature is enabled.
162 | #[macro_export]
163 | #[doc(hidden)]
164 | macro_rules! timeit {
165 |     [$label:expr, $($code:tt)*] => {
166 |         #[cfg(feature = "timeit")]
167 |         let timeit_start = std::time::Instant::now();
168 |         $($code)*
169 |         #[cfg(feature = "timeit")]
170 |         println!("{:>8} {}", format!("{}", $crate::PrettyDuration(timeit_start.elapsed())), $label);
171 |     };
172 | }
173 | 
174 | /// A wrapper struct for `std::time::Duration` to provide pretty-printing of durations.
175 | #[doc(hidden)]
176 | pub struct PrettyDuration(pub Duration);
177 | 
178 | impl std::fmt::Display for PrettyDuration {
179 |     /// Durations are formatted as follows:
180 |     /// - If the duration is greater than or equal to 1 second, it is formatted in seconds (s).
181 |     /// - If the duration is greater than or equal to 1 millisecond but less than 1 second, it is formatted in milliseconds (ms).
182 |     /// - If the duration is less than 1 millisecond, it is formatted in microseconds (µs).
183 |     /// In the case of seconds & milliseconds, the duration is always printed with a precision of two decimal places.
184 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
185 |         let duration = self.0;
186 |         if duration.as_secs() > 0 {
187 |             let seconds =
188 |                 duration.as_secs() as f64 + f64::from(duration.subsec_nanos()) / 1_000_000_000.0;
189 |             write!(f, "{:.2}s ", seconds)
190 |         } else if duration.subsec_millis() > 0 {
191 |             let milliseconds =
192 |                 duration.as_millis() as f64 + f64::from(duration.subsec_micros() % 1_000) / 1_000.0;
193 |             write!(f, "{:.2}ms", milliseconds)
194 |         } else {
195 |             let microseconds = duration.as_micros();
196 |             write!(f, "{}µs", microseconds)
197 |         }
198 |     }
199 | }
200 | 
201 | /// Add profile scope. Nesting the macro allows us to make the profiling crate optional.
202 | #[doc(hidden)]
203 | #[macro_export]
204 | macro_rules! scope {
205 |     [$label:expr] => {
206 |         #[cfg(feature = "profile")]
207 |         profiling::scope!($label);
208 |     };
209 | }
210 | 
211 | /// General build parameters for Bvh2 & CwBvh
212 | pub struct BvhBuildParams {
213 |     /// Split large tris into multiple AABBs
214 |     pub pre_split: bool,
215 |     /// In ploc, the number of nodes before and after the current one that are evaluated for pairing. 1 has a
216 |     /// fast path in building and still results in decent quality BVHs esp. when paired with a bit of reinsertion.
217 |     pub ploc_search_distance: PlocSearchDistance,
218 |     /// Below this depth a search distance of 1 will be used for ploc.
219 |     pub search_depth_threshold: usize,
220 |     /// Typically 0..1: ratio of nodes considered as candidates for reinsertion. Above 1 to evaluate the whole set
221 |     /// multiple times. A little goes a long way. Try 0.01 or even 0.001 before disabling for build performance.
222 |     pub reinsertion_batch_ratio: f32,
223 |     /// For Bvh2 only, a second pass of reinsertion after collapse. Since collapse reduces the node count,
224 |     /// this reinsertion pass will be faster. 0 to disable. Relative to the initial reinsertion_batch_ratio.
225 |     pub post_collapse_reinsertion_batch_ratio_multiplier: f32,
226 |     /// Bits used for ploc radix sort.
227 |     pub sort_precision: SortPrecision,
228 |     /// Min 1 (CwBvh will clamp to max 3)
229 |     pub max_prims_per_leaf: u32,
230 |     /// Multiplier for traversal cost calculation during Bvh2 collapse (Does not affect CwBvh). A higher value will
231 |     /// result in more primitives per leaf.
232 |     pub collapse_traversal_cost: f32,
233 | }
234 | 
235 | impl BvhBuildParams {
236 |     pub fn fastest_build() -> Self {
237 |         BvhBuildParams {
238 |             pre_split: false,
239 |             ploc_search_distance: PlocSearchDistance::Minimum,
240 |             search_depth_threshold: 0,
241 |             reinsertion_batch_ratio: 0.0,
242 |             post_collapse_reinsertion_batch_ratio_multiplier: 0.0,
243 |             sort_precision: SortPrecision::U64,
244 |             max_prims_per_leaf: 1,
245 |             collapse_traversal_cost: 1.0,
246 |         }
247 |     }
248 |     pub fn very_fast_build() -> Self {
249 |         BvhBuildParams {
250 |             pre_split: false,
251 |             ploc_search_distance: PlocSearchDistance::Minimum,
252 |             search_depth_threshold: 0,
253 |             reinsertion_batch_ratio: 0.01,
254 |             post_collapse_reinsertion_batch_ratio_multiplier: 0.0,
255 |             sort_precision: SortPrecision::U64,
256 |             max_prims_per_leaf: 8,
257 |             collapse_traversal_cost: 3.0,
258 |         }
259 |     }
260 |     pub fn fast_build() -> Self {
261 |         BvhBuildParams {
262 |             pre_split: false,
263 |             ploc_search_distance: PlocSearchDistance::Low,
264 |             search_depth_threshold: 2,
265 |             reinsertion_batch_ratio: 0.02,
266 |             post_collapse_reinsertion_batch_ratio_multiplier: 0.0,
267 |             sort_precision: SortPrecision::U64,
268 |             max_prims_per_leaf: 8,
269 |             collapse_traversal_cost: 3.0,
270 |         }
271 |     }
272 |     /// Tries to be around the same build time as embree but with faster traversal
273 |     pub fn medium_build() -> Self {
274 |         BvhBuildParams {
275 |             pre_split: false,
276 |             ploc_search_distance: PlocSearchDistance::Medium,
277 |             search_depth_threshold: 3,
278 |             reinsertion_batch_ratio: 0.05,
279 |             post_collapse_reinsertion_batch_ratio_multiplier: 2.0,
280 |             sort_precision: SortPrecision::U64,
281 |             max_prims_per_leaf: 8,
282 |             collapse_traversal_cost: 3.0,
283 |         }
284 |     }
285 |     pub fn slow_build() -> Self {
286 |         BvhBuildParams {
287 |             pre_split: true,
288 |             ploc_search_distance: PlocSearchDistance::High,
289 |             search_depth_threshold: 2,
290 |             reinsertion_batch_ratio: 0.2,
291 |             post_collapse_reinsertion_batch_ratio_multiplier: 2.0,
292 |             sort_precision: SortPrecision::U128,
293 |             max_prims_per_leaf: 8,
294 |             collapse_traversal_cost: 3.0,
295 |         }
296 |     }
297 |     pub fn very_slow_build() -> Self {
298 |         BvhBuildParams {
299 |             pre_split: true,
300 |             ploc_search_distance: PlocSearchDistance::Medium,
301 |             search_depth_threshold: 1,
302 |             reinsertion_batch_ratio: 1.0,
303 |             post_collapse_reinsertion_batch_ratio_multiplier: 1.0,
304 |             sort_precision: SortPrecision::U128,
305 |             max_prims_per_leaf: 8,
306 |             collapse_traversal_cost: 3.0,
307 |         }
308 |     }
309 | }
310 | 


--------------------------------------------------------------------------------
/src/ploc/mod.rs:
--------------------------------------------------------------------------------
  1 | //! PLOC (Parallel, Locally Ordered Clustering) BVH 2 Builder.
  2 | 
  3 | pub mod morton;
  4 | 
  5 | // https://madmann91.github.io/2021/05/05/ploc-revisited.html
  6 | // https://github.com/meistdan/ploc/
  7 | // https://meistdan.github.io/publications/ploc/paper.pdf
  8 | // https://github.com/madmann91/bvh/blob/v1/include/bvh/locally_ordered_clustering_builder.hpp
  9 | 
 10 | use glam::DVec3;
 11 | use rdst::{RadixKey, RadixSort};
 12 | 
 13 | #[cfg(feature = "parallel")]
 14 | use rayon::iter::{
 15 |     IndexedParallelIterator, IntoParallelIterator, IntoParallelRefIterator,
 16 |     IntoParallelRefMutIterator, ParallelIterator,
 17 | };
 18 | 
 19 | use crate::ploc::morton::{morton_encode_u128_unorm, morton_encode_u64_unorm};
 20 | use crate::{
 21 |     aabb::Aabb,
 22 |     bvh2::{Bvh2, Bvh2Node},
 23 | };
 24 | 
 25 | impl PlocSearchDistance {
 26 |     /// # Arguments
 27 |     /// * `aabbs` - A list of bounding boxes. Should correspond to the number and order of primitives.
 28 |     /// * `indices` - The list indices used to index into the list of primitives. This allows for
 29 |     /// flexibility in which primitives are included in the bvh and in what order they are referenced.
 30 |     /// Often this would just be equivalent to: (0..aabbs.len() as u32).collect::<Vec<_>>()
 31 |     /// * `sort_precision` - Bits used for ploc radix sort. More bits results in a more accurate but slower sort.
 32 |     /// * `search_depth_threshold` - Below this depth a search distance of 1 will be used. Set to 0 to bypass and
 33 |     /// just use PlocSearchDistance. When trying to optimize build time it can be beneficial to limit the search
 34 |     /// distance for the first few passes as that is when the largest number of primitives are being considered.
 35 |     /// This pairs are initially found more quickly since it doesn't need to search as far, and they are also
 36 |     /// found more often, since the nodes need to both agree to become paired. This also seems to occasionally
 37 |     /// result in an overall better bvh structure.
 38 |     pub fn build(
 39 |         &self,
 40 |         aabbs: &[Aabb],
 41 |         indices: Vec<u32>,
 42 |         sort_precision: SortPrecision,
 43 |         search_depth_threshold: usize,
 44 |     ) -> Bvh2 {
 45 |         match self {
 46 |             PlocSearchDistance::Minimum => {
 47 |                 build_ploc::<1>(aabbs, indices, sort_precision, search_depth_threshold)
 48 |             }
 49 |             PlocSearchDistance::VeryLow => {
 50 |                 build_ploc::<2>(aabbs, indices, sort_precision, search_depth_threshold)
 51 |             }
 52 |             PlocSearchDistance::Low => {
 53 |                 build_ploc::<6>(aabbs, indices, sort_precision, search_depth_threshold)
 54 |             }
 55 |             PlocSearchDistance::Medium => {
 56 |                 build_ploc::<14>(aabbs, indices, sort_precision, search_depth_threshold)
 57 |             }
 58 |             PlocSearchDistance::High => {
 59 |                 build_ploc::<24>(aabbs, indices, sort_precision, search_depth_threshold)
 60 |             }
 61 |             PlocSearchDistance::VeryHigh => {
 62 |                 build_ploc::<32>(aabbs, indices, sort_precision, search_depth_threshold)
 63 |             }
 64 |         }
 65 |     }
 66 | }
 67 | 
 68 | /// # Arguments
 69 | /// * `aabbs` - A list of bounding boxes. Should correspond to the number and order of primitives.
 70 | /// * `search_depth_threshold` - Below this depth a search distance of 1 will be used
 71 | /// * `sort_precision` - Bits used for ploc radix sort. More bits results in a more accurate but slower sort.
 72 | /// * `search_depth_threshold` - Below this depth a search distance of 1 will be used. Set to 0 to bypass and
 73 | /// just use SEARCH_DISTANCE.
 74 | ///
 75 | /// SEARCH_DISTANCE should be <= 32
 76 | pub fn build_ploc<const SEARCH_DISTANCE: usize>(
 77 |     aabbs: &[Aabb],
 78 |     indices: Vec<u32>,
 79 |     sort_precision: SortPrecision,
 80 |     search_depth_threshold: usize,
 81 | ) -> Bvh2 {
 82 |     crate::scope!("build_ploc");
 83 | 
 84 |     let prim_count = aabbs.len();
 85 | 
 86 |     if prim_count == 0 {
 87 |         return Bvh2::default();
 88 |     }
 89 | 
 90 |     let mut init_leafs = Vec::with_capacity(prim_count);
 91 |     let mut total_aabb = Aabb::empty();
 92 | 
 93 |     for (i, prim_index) in indices.iter().enumerate() {
 94 |         let aabb = aabbs[i];
 95 |         debug_assert!(!aabb.min.is_nan());
 96 |         debug_assert!(!aabb.max.is_nan());
 97 |         total_aabb.extend(aabb.min);
 98 |         total_aabb.extend(aabb.max);
 99 |         init_leafs.push(Bvh2Node {
100 |             aabb,
101 |             prim_count: 1,
102 |             first_index: *prim_index,
103 |         });
104 |     }
105 | 
106 |     let nodes = build_ploc_from_leafs::<SEARCH_DISTANCE>(
107 |         init_leafs,
108 |         total_aabb,
109 |         sort_precision,
110 |         search_depth_threshold,
111 |     );
112 | 
113 |     Bvh2 {
114 |         nodes,
115 |         primitive_indices: indices,
116 |         ..Default::default()
117 |     }
118 | }
119 | 
120 | pub fn build_ploc_from_leafs<const SEARCH_DISTANCE: usize>(
121 |     mut current_nodes: Vec<Bvh2Node>,
122 |     total_aabb: Aabb,
123 |     sort_precision: SortPrecision,
124 |     search_depth_threshold: usize,
125 | ) -> Vec<Bvh2Node> {
126 |     crate::scope!("build_ploc_from_leafs");
127 | 
128 |     let prim_count = current_nodes.len();
129 | 
130 |     // Merge nodes until there is only one left
131 |     let nodes_count = (2 * prim_count as i64 - 1).max(0) as usize;
132 | 
133 |     let scale = 1.0 / total_aabb.diagonal().as_dvec3();
134 |     let offset = -total_aabb.min.as_dvec3() * scale;
135 | 
136 |     // Sort primitives according to their morton code
137 |     sort_precision.sort_nodes(&mut current_nodes, scale, offset);
138 | 
139 |     let mut nodes = vec![Bvh2Node::default(); nodes_count];
140 | 
141 |     let mut insert_index = nodes_count;
142 |     let mut next_nodes = Vec::with_capacity(prim_count);
143 |     assert!(i8::MAX as usize > SEARCH_DISTANCE);
144 |     let mut merge: Vec<i8> = vec![0; prim_count];
145 | 
146 |     #[cfg(not(feature = "parallel"))]
147 |     let mut cache = SearchCache::<SEARCH_DISTANCE>::default();
148 | 
149 |     let mut depth: usize = 0;
150 |     while current_nodes.len() > 1 {
151 |         if SEARCH_DISTANCE == 1 || depth < search_depth_threshold {
152 |             // TODO try making build_ploc_from_leafs_one that embeds this logic into
153 |             // the main `while index < merge.len() {` loop  (may not be faster, tbd)
154 |             let mut last_cost = f32::MAX;
155 |             let count = current_nodes.len() - 1;
156 |             assert!(count < merge.len()); // Try to elide bounds check
157 |             (0..count).for_each(|i| {
158 |                 let cost = current_nodes[i]
159 |                     .aabb
160 |                     .union(&current_nodes[i + 1].aabb)
161 |                     .half_area();
162 |                 merge[i] = if last_cost < cost { -1 } else { 1 };
163 |                 last_cost = cost;
164 |             });
165 |             merge[current_nodes.len() - 1] = -1;
166 |         } else {
167 |             #[cfg(feature = "parallel")]
168 |             let iter = merge.par_iter_mut();
169 |             #[cfg(not(feature = "parallel"))]
170 |             let iter = merge.iter_mut();
171 |             iter.enumerate()
172 |                 .take(current_nodes.len())
173 |                 .for_each(|(index, best)| {
174 |                     #[cfg(feature = "parallel")]
175 |                     {
176 |                         *best = find_best_node_basic(index, &current_nodes, SEARCH_DISTANCE);
177 |                     }
178 |                     #[cfg(not(feature = "parallel"))]
179 |                     {
180 |                         *best = cache.find_best_node(index, &current_nodes);
181 |                     }
182 |                 });
183 |         };
184 | 
185 |         let mut index = 0;
186 |         while index < current_nodes.len() {
187 |             let index_offset = merge[index] as i64;
188 |             let best_index = (index as i64 + index_offset) as usize;
189 |             // The two nodes should be merged if they agree on their respective merge indices.
190 |             if best_index as i64 + merge[best_index] as i64 != index as i64 {
191 |                 // If not, the current node should be kept for the next iteration
192 |                 next_nodes.push(current_nodes[index]);
193 |                 index += 1;
194 |                 continue;
195 |             }
196 | 
197 |             // Since we only need to merge once, we only merge if the first index is less than the second.
198 |             if best_index > index {
199 |                 index += 1;
200 |                 continue;
201 |             }
202 | 
203 |             debug_assert_ne!(best_index, index);
204 | 
205 |             let left = current_nodes[index];
206 |             let right = current_nodes[best_index];
207 | 
208 |             // Reserve space in the target array for the two children
209 |             debug_assert!(insert_index >= 2);
210 |             insert_index -= 2;
211 | 
212 |             // Create the parent node and place it in the array for the next iteration
213 |             next_nodes.push(Bvh2Node {
214 |                 aabb: left.aabb.union(&right.aabb),
215 |                 prim_count: 0,
216 |                 first_index: insert_index as u32,
217 |             });
218 | 
219 |             // Out of bounds here error here could indicate NaN present in input aabb. Try running in debug mode.
220 |             nodes[insert_index] = left;
221 |             nodes[insert_index + 1] = right;
222 | 
223 |             if SEARCH_DISTANCE == 1 && index_offset == 1 {
224 |                 // If the search distance is only 1, and the next index was merged with this one,
225 |                 // we can skip the next index.
226 |                 // The code for this with the while loop seemed to also be slightly faster than:
227 |                 //     for (index, best_index) in merge.iter().enumerate() {
228 |                 // even in the other cases. For some reason...
229 |                 index += 2;
230 |             } else {
231 |                 index += 1;
232 |             }
233 |         }
234 | 
235 |         (next_nodes, current_nodes) = (current_nodes, next_nodes);
236 |         next_nodes.clear();
237 |         depth += 1;
238 |     }
239 | 
240 |     insert_index = insert_index.saturating_sub(1);
241 |     nodes[insert_index] = current_nodes[0];
242 |     nodes
243 | }
244 | 
245 | #[cfg(feature = "parallel")]
246 | fn find_best_node_basic(index: usize, nodes: &[Bvh2Node], search_distance: usize) -> i8 {
247 |     let mut best_node = index;
248 |     let mut best_cost = f32::INFINITY;
249 | 
250 |     let begin = index - search_distance.min(index);
251 |     let end = (index + search_distance + 1).min(nodes.len());
252 | 
253 |     let our_aabb = nodes[index].aabb;
254 |     for other in begin..end {
255 |         if other == index {
256 |             continue;
257 |         }
258 |         let cost = our_aabb.union(&nodes[other].aabb).half_area();
259 |         if cost < best_cost {
260 |             best_node = other;
261 |             best_cost = cost;
262 |         }
263 |     }
264 | 
265 |     (best_node as i64 - index as i64) as i8
266 | }
267 | 
268 | /// In PLOC, the number of nodes before and after the current one that are evaluated for pairing.
269 | /// Minimum (1) has a fast path in building and still results in decent quality BVHs especially
270 | /// when paired with a bit of reinsertion.
271 | #[derive(Default, Clone, Copy)]
272 | pub enum PlocSearchDistance {
273 |     /// 1
274 |     Minimum,
275 |     /// 2
276 |     VeryLow,
277 |     /// 6
278 |     Low,
279 |     #[default]
280 |     /// 14
281 |     Medium,
282 |     /// 24
283 |     High,
284 |     /// 32
285 |     VeryHigh,
286 | }
287 | 
288 | impl From<u32> for PlocSearchDistance {
289 |     fn from(value: u32) -> Self {
290 |         match value {
291 |             1 => PlocSearchDistance::Minimum,
292 |             2 => PlocSearchDistance::VeryLow,
293 |             6 => PlocSearchDistance::Low,
294 |             14 => PlocSearchDistance::Medium,
295 |             24 => PlocSearchDistance::High,
296 |             32 => PlocSearchDistance::VeryHigh,
297 |             _ => panic!("Invalid value for PlocSearchDistance: {}", value),
298 |         }
299 |     }
300 | }
301 | 
302 | // Tried using a Vec it was ~30% slower with a search distance of 14.
303 | // Tried making the Vec flat, used get_unchecked, etc... (without those it was ~80% slower)
304 | pub struct SearchCache<const SEARCH_DISTANCE: usize>([[f32; SEARCH_DISTANCE]; SEARCH_DISTANCE]);
305 | 
306 | impl<const SEARCH_DISTANCE: usize> Default for SearchCache<SEARCH_DISTANCE> {
307 |     fn default() -> Self {
308 |         SearchCache([[0.0; SEARCH_DISTANCE]; SEARCH_DISTANCE])
309 |     }
310 | }
311 | 
312 | impl<const SEARCH_DISTANCE: usize> SearchCache<SEARCH_DISTANCE> {
313 |     #[inline]
314 |     #[cfg(not(feature = "parallel"))]
315 |     fn back(&self, index: usize, other: usize) -> f32 {
316 |         // Note: the compiler removes the bounds check due to the % SEARCH_DISTANCE
317 |         self.0[other % SEARCH_DISTANCE][index % SEARCH_DISTANCE]
318 |     }
319 | 
320 |     #[inline]
321 |     #[cfg(not(feature = "parallel"))]
322 |     fn front(&mut self, index: usize, other: usize) -> &mut f32 {
323 |         &mut self.0[index % SEARCH_DISTANCE][other % SEARCH_DISTANCE]
324 |     }
325 | 
326 |     #[cfg(not(feature = "parallel"))]
327 |     fn find_best_node(&mut self, index: usize, nodes: &[Bvh2Node]) -> i8 {
328 |         let mut best_node = index;
329 |         let mut best_cost = f32::INFINITY;
330 | 
331 |         let begin = index - SEARCH_DISTANCE.min(index);
332 |         let end = (index + SEARCH_DISTANCE + 1).min(nodes.len());
333 | 
334 |         for other in begin..index {
335 |             let area = self.back(index, other);
336 |             if area < best_cost {
337 |                 best_node = other;
338 |                 best_cost = area;
339 |             }
340 |         }
341 | 
342 |         let our_aabb = nodes[index].aabb;
343 |         ((index + 1)..end).for_each(|other| {
344 |             let cost = our_aabb.union(&nodes[other].aabb).half_area();
345 |             *self.front(index, other) = cost;
346 |             if cost < best_cost {
347 |                 best_node = other;
348 |                 best_cost = cost;
349 |             }
350 |         });
351 | 
352 |         (best_node as i64 - index as i64) as i8
353 |     }
354 | }
355 | 
356 | // ---------------------
357 | // --- SORTING NODES ---
358 | // ---------------------
359 | 
360 | // TODO find a not terrible way to make this less repetitive
361 | 
362 | #[derive(Debug, Copy, Clone)]
363 | pub enum SortPrecision {
364 |     U128,
365 |     U64,
366 | }
367 | 
368 | impl SortPrecision {
369 |     fn sort_nodes(&self, current_nodes: &mut Vec<Bvh2Node>, scale: DVec3, offset: DVec3) {
370 |         match self {
371 |             SortPrecision::U128 => sort_nodes_m128(current_nodes, scale, offset),
372 |             SortPrecision::U64 => sort_nodes_m64(current_nodes, scale, offset),
373 |         }
374 |     }
375 | }
376 | 
377 | #[derive(Clone, Copy)]
378 | struct Morton128 {
379 |     index: usize,
380 |     code: u128,
381 | }
382 | 
383 | impl RadixKey for Morton128 {
384 |     const LEVELS: usize = 16;
385 | 
386 |     #[inline(always)]
387 |     fn get_level(&self, level: usize) -> u8 {
388 |         self.code.get_level(level)
389 |     }
390 | }
391 | 
392 | #[derive(Clone, Copy)]
393 | struct Morton64 {
394 |     index: usize,
395 |     code: u64,
396 | }
397 | 
398 | impl RadixKey for Morton64 {
399 |     const LEVELS: usize = 8;
400 | 
401 |     #[inline(always)]
402 |     fn get_level(&self, level: usize) -> u8 {
403 |         self.code.get_level(level)
404 |     }
405 | }
406 | 
407 | fn sort_nodes_m128(current_nodes: &mut Vec<Bvh2Node>, scale: DVec3, offset: DVec3) {
408 |     crate::scope!("sort_nodes_m128");
409 | 
410 |     #[cfg(feature = "parallel")]
411 |     let iter = current_nodes.par_iter();
412 |     #[cfg(not(feature = "parallel"))]
413 |     let iter = current_nodes.iter();
414 | 
415 |     let mut indexed_mortons: Vec<Morton128> = iter
416 |         .enumerate()
417 |         .map(|(index, leaf)| {
418 |             let center = leaf.aabb.center().as_dvec3() * scale + offset;
419 |             Morton128 {
420 |                 index,
421 |                 code: morton_encode_u128_unorm(center),
422 |             }
423 |         })
424 |         .collect();
425 | 
426 |     indexed_mortons.radix_sort_unstable();
427 | 
428 |     #[cfg(feature = "parallel")]
429 |     let iter = indexed_mortons.into_par_iter();
430 |     #[cfg(not(feature = "parallel"))]
431 |     let iter = indexed_mortons.iter();
432 | 
433 |     *current_nodes = iter.map(|m| current_nodes[m.index]).collect();
434 | }
435 | 
436 | fn sort_nodes_m64(current_nodes: &mut Vec<Bvh2Node>, scale: DVec3, offset: DVec3) {
437 |     crate::scope!("sort_nodes_m64");
438 | 
439 |     #[cfg(feature = "parallel")]
440 |     let iter = current_nodes.par_iter();
441 |     #[cfg(not(feature = "parallel"))]
442 |     let iter = current_nodes.iter();
443 | 
444 |     let mut indexed_mortons: Vec<Morton64> = iter
445 |         .enumerate()
446 |         .map(|(index, leaf)| {
447 |             let center = leaf.aabb.center().as_dvec3() * scale + offset;
448 |             Morton64 {
449 |                 index,
450 |                 code: morton_encode_u64_unorm(center),
451 |             }
452 |         })
453 |         .collect();
454 | 
455 |     indexed_mortons.radix_sort_unstable();
456 | 
457 |     #[cfg(feature = "parallel")]
458 |     let iter = indexed_mortons.into_par_iter();
459 |     #[cfg(not(feature = "parallel"))]
460 |     let iter = indexed_mortons.iter();
461 | 
462 |     *current_nodes = iter.map(|m| current_nodes[m.index]).collect();
463 | }
464 | 


--------------------------------------------------------------------------------
/src/ploc/morton.rs:
--------------------------------------------------------------------------------
 1 | // http://www.graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
 2 | // TODO evaluate Extended Morton Codes for High Performance Bounding Volume Hierarchy Construction:
 3 | // https://www.dcgi.fel.cvut.cz/projects/emc/emc2017.pdf
 4 | // https://www.highperformancegraphics.org/wp-content/uploads/2017/Papers-Session3/HPG207_ExtendedMortonCodes.pdf
 5 | 
 6 | //---------------------------------------------------
 7 | // --- 10 bit resolution per channel morton curve ---
 8 | //---------------------------------------------------
 9 | 
10 | use glam::DVec3;
11 | 
12 | #[inline]
13 | pub fn split_by_3_u32(a: u16) -> u32 {
14 |     let mut x = a as u32 & 0x3ff; // we only look at the first 10 bits
15 |     x = (x | x << 16) & 0x30000ff;
16 |     x = (x | x << 8) & 0x300f00f;
17 |     x = (x | x << 4) & 0x30c30c3;
18 |     x = (x | x << 2) & 0x9249249;
19 |     x
20 | }
21 | 
22 | #[inline]
23 | /// Encode x,y,z position into a u64 morton value.
24 | /// Input should be 0..=2u16.pow(10) (or 1u16 << 10)
25 | /// (only included for reference, this isn't reasonably accurate for most BVHs)
26 | pub fn morton_encode_u32(x: u16, y: u16, z: u16) -> u32 {
27 |     split_by_3_u32(x) | split_by_3_u32(y) << 1 | split_by_3_u32(z) << 2
28 | }
29 | 
30 | //---------------------------------------------------
31 | // --- 21 bit resolution per channel morton curve ---
32 | //---------------------------------------------------
33 | 
34 | #[inline]
35 | pub fn split_by_3_u64(a: u32) -> u64 {
36 |     let mut x = a as u64 & 0x1fffff; // we only look at the first 21 bits
37 |     x = (x | x << 32) & 0x1f00000000ffff;
38 |     x = (x | x << 16) & 0x1f0000ff0000ff;
39 |     x = (x | x << 8) & 0x100f00f00f00f00f;
40 |     x = (x | x << 4) & 0x10c30c30c30c30c3;
41 |     x = (x | x << 2) & 0x1249249249249249;
42 |     x
43 | }
44 | 
45 | #[inline]
46 | /// Encode x,y,z position into a u64 morton value.
47 | /// Input should be 0..=2u32.pow(21) (or 1u32 << 21)
48 | pub fn morton_encode_u64(x: u32, y: u32, z: u32) -> u64 {
49 |     split_by_3_u64(x) | split_by_3_u64(y) << 1 | split_by_3_u64(z) << 2
50 | }
51 | 
52 | #[inline]
53 | /// Encode a DVec3 position into a u128 morton value.
54 | /// Input should be 0.0..=1.0
55 | pub fn morton_encode_u64_unorm(p: DVec3) -> u64 {
56 |     let p = p * (1 << 21) as f64;
57 |     morton_encode_u64(p.x as u32, p.y as u32, p.z as u32)
58 | }
59 | 
60 | //---------------------------------------------------
61 | // --- 42 bit resolution per channel morton curve ---
62 | //---------------------------------------------------
63 | 
64 | #[inline]
65 | pub fn split_by_3_u128(a: u64) -> u128 {
66 |     let mut x = a as u128 & 0x3ffffffffff; // we only look at the first 42 bits
67 |     x = (x | x << 64) & 0x3ff0000000000000000ffffffff;
68 |     x = (x | x << 32) & 0x3ff00000000ffff00000000ffff;
69 |     x = (x | x << 16) & 0x30000ff0000ff0000ff0000ff0000ff;
70 |     x = (x | x << 8) & 0x300f00f00f00f00f00f00f00f00f00f;
71 |     x = (x | x << 4) & 0x30c30c30c30c30c30c30c30c30c30c3;
72 |     x = (x | x << 2) & 0x9249249249249249249249249249249;
73 |     x
74 | }
75 | 
76 | #[inline]
77 | /// Encode x,y,z position into a u128 morton value.
78 | /// Input should be 0..=2u64.pow(42) (or 1u64 << 42)
79 | pub fn morton_encode_u128(x: u64, y: u64, z: u64) -> u128 {
80 |     split_by_3_u128(x) | split_by_3_u128(y) << 1 | split_by_3_u128(z) << 2
81 | }
82 | 
83 | #[inline]
84 | /// Encode a DVec3 position into a u128 morton value.
85 | /// Input should be 0.0..=1.0
86 | pub fn morton_encode_u128_unorm(p: DVec3) -> u128 {
87 |     let p = p * (1u64 << 42) as f64;
88 |     morton_encode_u128(p.x as u64, p.y as u64, p.z as u64)
89 | }
90 | 


--------------------------------------------------------------------------------
/src/ray.rs:
--------------------------------------------------------------------------------
 1 | //! A ray in 3D space.
 2 | 
 3 | use glam::{vec3a, Vec3A};
 4 | 
 5 | /// Computes the inverse of `x` avoiding division by zero.
 6 | pub fn safe_inverse(x: f32) -> f32 {
 7 |     if x.abs() <= f32::EPSILON {
 8 |         x.signum() / f32::EPSILON
 9 |     } else {
10 |         1.0 / x
11 |     }
12 | }
13 | 
14 | /// A struct representing a ray in 3D space.
15 | #[derive(Clone, Copy, Debug)]
16 | #[repr(C)]
17 | pub struct Ray {
18 |     /// The starting point of the ray.
19 |     pub origin: Vec3A,
20 |     /// The direction vector of the ray.
21 |     pub direction: Vec3A,
22 |     /// The inverse of the direction vector components.
23 |     /// Used to avoid division in ray/aabb tests. Seems to improve performance in
24 |     /// some cases on the cpu, but not the gpu in some others.
25 |     pub inv_direction: Vec3A,
26 |     /// The minimum `t` (distance) value for intersection tests.
27 |     pub tmin: f32,
28 |     /// The maximum `t` (distance) value for intersection tests.
29 |     pub tmax: f32,
30 | }
31 | 
32 | impl Ray {
33 |     /// Creates a new `Ray` with the given origin, direction, and `t` (distance) range.
34 |     pub fn new(origin: Vec3A, direction: Vec3A, min: f32, max: f32) -> Self {
35 |         let ray = Ray {
36 |             origin,
37 |             direction,
38 |             inv_direction: vec3a(
39 |                 safe_inverse(direction.x),
40 |                 safe_inverse(direction.y),
41 |                 safe_inverse(direction.z),
42 |             ),
43 |             tmin: min,
44 |             tmax: max,
45 |         };
46 | 
47 |         debug_assert!(ray.inv_direction.is_finite());
48 |         debug_assert!(ray.direction.is_finite());
49 |         debug_assert!(origin.is_finite());
50 | 
51 |         ray
52 |     }
53 | 
54 |     /// Creates a new infinite `Ray` with the given origin, direction.
55 |     pub fn new_inf(origin: Vec3A, direction: Vec3A) -> Self {
56 |         Self::new(origin, direction, 0.0, f32::INFINITY)
57 |     }
58 | }
59 | 
60 | /// A struct representing a hit record in ray tracing.
61 | /// A `Hit` record contains the IDs of the primitive, geometry and instance that
62 | /// were hit, as well as the `t` (distance) value at which the hit occurred.
63 | #[derive(Clone, Copy, Debug)]
64 | #[repr(C)]
65 | pub struct RayHit {
66 |     pub primitive_id: u32,
67 |     pub geometry_id: u32,
68 |     pub instance_id: u32,
69 |     pub t: f32,
70 | }
71 | 
72 | pub const INVALID_ID: u32 = u32::MAX;
73 | 
74 | impl RayHit {
75 |     /// Creates a new `RayHit` instance representing no hit.
76 |     pub fn none() -> Self {
77 |         Self {
78 |             primitive_id: INVALID_ID,
79 |             geometry_id: INVALID_ID,
80 |             instance_id: INVALID_ID,
81 |             t: f32::INFINITY,
82 |         }
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/rt_triangle.rs:
--------------------------------------------------------------------------------
  1 | //! Triangle types optimized for ray intersection performance.
  2 | 
  3 | use bytemuck::{Pod, Zeroable};
  4 | use glam::*;
  5 | 
  6 | use half::f16;
  7 | 
  8 | use crate::{aabb::Aabb, ray::Ray, triangle::Triangle, Boundable};
  9 | 
 10 | #[derive(Clone, Copy, Default, PartialEq)]
 11 | #[repr(C)]
 12 | /// A compressed 3D triangle optimized for GPU ray intersection performance.
 13 | pub struct RtCompressedTriangle {
 14 |     /// Base vertex
 15 |     pub v0: [f32; 3],
 16 |     /// Edges 1 & 2 encoded as IEEE 754 f16 `v1 - v0, v2 - v0`
 17 |     pub e1_e2: [u16; 6],
 18 | }
 19 | 
 20 | unsafe impl Pod for RtCompressedTriangle {}
 21 | unsafe impl Zeroable for RtCompressedTriangle {}
 22 | 
 23 | impl From<&Triangle> for RtCompressedTriangle {
 24 |     #[inline(always)]
 25 |     fn from(tri: &Triangle) -> Self {
 26 |         RtCompressedTriangle::new(tri.v0, tri.v1, tri.v2)
 27 |     }
 28 | }
 29 | 
 30 | impl RtCompressedTriangle {
 31 |     #[inline(always)]
 32 |     pub fn new(v0: Vec3A, v1: Vec3A, v2: Vec3A) -> Self {
 33 |         let e1 = v1 - v0;
 34 |         let e2 = v2 - v0;
 35 | 
 36 |         Self {
 37 |             v0: [v0.x, v0.y, v0.z],
 38 |             e1_e2: [
 39 |                 f16::from_f32(e1.x).to_bits(),
 40 |                 f16::from_f32(e2.x).to_bits(),
 41 |                 f16::from_f32(e1.y).to_bits(),
 42 |                 f16::from_f32(e2.y).to_bits(),
 43 |                 f16::from_f32(e1.z).to_bits(),
 44 |                 f16::from_f32(e2.z).to_bits(),
 45 |             ],
 46 |         }
 47 |     }
 48 | 
 49 |     #[inline(always)]
 50 |     pub fn vertices(&self) -> [Vec3A; 3] {
 51 |         let (v0, e1, e2) = self.unpack();
 52 |         let v1 = v0 + e1;
 53 |         let v2 = v0 + e2;
 54 |         [v0, v1, v2]
 55 |     }
 56 | 
 57 |     #[inline(always)]
 58 |     pub fn aabb(&self) -> Aabb {
 59 |         Aabb::from_points(&self.vertices())
 60 |     }
 61 | 
 62 |     #[inline(always)]
 63 |     pub fn compute_normal(&self) -> Vec3A {
 64 |         let (_v0, e1, e2) = self.unpack();
 65 |         ((e1).cross(e2)).normalize_or_zero()
 66 |     }
 67 | 
 68 |     /// Find the distance (t) of the intersection of the `Ray` and this Triangle.
 69 |     /// Returns f32::INFINITY for miss.
 70 |     #[inline(always)]
 71 |     pub fn intersect(&self, ray: &Ray) -> f32 {
 72 |         // TODO not very water tight from the back side in some contexts (tris with edges at 0,0,0 show 1px gap)
 73 |         // Find out if this is typical of Möller
 74 |         // Based on Fast Minimum Storage Ray Triangle Intersection by T. Möller and B. Trumbore
 75 |         // https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html
 76 | 
 77 |         let (v0, e1, e2) = self.unpack();
 78 |         let ng = (-e1).cross(e2);
 79 | 
 80 |         let cull_backface = false;
 81 | 
 82 |         let c = v0 - ray.origin;
 83 |         let r = ray.direction.cross(c);
 84 |         let inv_det = 1.0 / ng.dot(ray.direction);
 85 | 
 86 |         let u = r.dot(e2) * inv_det;
 87 |         let v = r.dot(-e1) * inv_det;
 88 |         let w = 1.0 - u - v;
 89 | 
 90 |         // Original:
 91 |         //let hit = u >= 0.0 && v >= 0.0 && w >= 0.0;
 92 |         //let valid = if cull_backface {
 93 |         //    inv_det > 0.0 && hit
 94 |         //} else {
 95 |         //    inv_det != 0.0 && hit
 96 |         //};
 97 | 
 98 |         // Note: differs in that if v == -0.0, for example will cause valid to be false
 99 |         let hit = u.to_bits() | v.to_bits() | w.to_bits();
100 |         let valid = if cull_backface {
101 |             (inv_det.to_bits() | hit) & 0x8000_0000 == 0
102 |         } else {
103 |             inv_det != 0.0 && hit & 0x8000_0000 == 0
104 |         };
105 | 
106 |         if valid {
107 |             let t = ng.dot(c) * inv_det;
108 |             if t >= ray.tmin && t <= ray.tmax {
109 |                 return t;
110 |             }
111 |         }
112 | 
113 |         f32::INFINITY
114 |     }
115 | 
116 |     pub fn unpack(&self) -> (Vec3A, Vec3A, Vec3A) {
117 |         let v0: Vec3A = self.v0.into();
118 |         let e1x = f16::from_bits(self.e1_e2[0]).to_f32();
119 |         let e2x = f16::from_bits(self.e1_e2[1]).to_f32();
120 |         let e1y = f16::from_bits(self.e1_e2[2]).to_f32();
121 |         let e2y = f16::from_bits(self.e1_e2[3]).to_f32();
122 |         let e1z = f16::from_bits(self.e1_e2[4]).to_f32();
123 |         let e2z = f16::from_bits(self.e1_e2[5]).to_f32();
124 |         let e1 = Vec3A::new(e1x, e1y, e1z);
125 |         let e2 = Vec3A::new(e2x, e2y, e2z);
126 |         (v0, e1, e2)
127 |     }
128 | 
129 |     #[inline(always)]
130 |     pub fn compute_barycentric(&self, ray: &Ray) -> Vec2 {
131 |         let (v0, e1, e2) = self.unpack();
132 |         let ng = (-e1).cross(e2);
133 |         let r = ray.direction.cross(v0 - ray.origin);
134 |         vec2(r.dot(e2), r.dot(-e1)) / ng.dot(ray.direction)
135 |     }
136 | }
137 | 
138 | impl Boundable for RtCompressedTriangle {
139 |     fn aabb(&self) -> Aabb {
140 |         self.aabb()
141 |     }
142 | }
143 | 
144 | #[derive(Clone, Copy, Default, PartialEq)]
145 | /// A 3D triangle optimized for CPU ray intersection performance.
146 | pub struct RtTriangle {
147 |     /// Base vertex
148 |     pub v0: Vec3A,
149 |     /// Edge 1 `v0 - v1`
150 |     pub e1: Vec3A,
151 |     /// Edge 2 `v2 - v0`
152 |     pub e2: Vec3A,
153 |     /// Geometric normal `e1.cross(e2)`.
154 |     /// Optimized for intersection.
155 |     /// Needs to be inverted for typical normal.
156 |     pub ng: Vec3A,
157 | }
158 | 
159 | impl From<&Triangle> for RtTriangle {
160 |     #[inline(always)]
161 |     fn from(tri: &Triangle) -> Self {
162 |         RtTriangle::new(tri.v0, tri.v1, tri.v2)
163 |     }
164 | }
165 | 
166 | // Uses layout from https://github.com/madmann91/bvh/blob/master/src/bvh/v2/tri.h#L36
167 | // to optimize for intersection. On the CPU this is a bit faster than e1 = v1 - v0; e2 = v2 - v0;
168 | impl RtTriangle {
169 |     #[inline(always)]
170 |     pub fn new(v0: Vec3A, v1: Vec3A, v2: Vec3A) -> Self {
171 |         let e1 = v0 - v1;
172 |         let e2 = v2 - v0;
173 |         Self {
174 |             v0,
175 |             e1,
176 |             e2,
177 |             ng: e1.cross(e2),
178 |         }
179 |     }
180 | 
181 |     #[inline(always)]
182 |     fn vertices(&self) -> [Vec3A; 3] {
183 |         [self.v0, self.v0 - self.e1, self.v0 + self.e2]
184 |     }
185 | 
186 |     #[inline(always)]
187 |     pub fn aabb(&self) -> Aabb {
188 |         Aabb::from_points(&self.vertices())
189 |     }
190 | 
191 |     #[inline(always)]
192 |     pub fn compute_normal(&self) -> Vec3A {
193 |         -self.ng.normalize_or_zero()
194 |     }
195 | 
196 |     /// Find the distance (t) of the intersection of the `Ray` and this Triangle.
197 |     /// Returns f32::INFINITY for miss.
198 |     #[inline(always)]
199 |     pub fn intersect(&self, ray: &Ray) -> f32 {
200 |         // TODO not very water tight from the back side in some contexts (tris with edges at 0,0,0 show 1px gap)
201 |         // Find out if this is typical of Möller
202 |         // Based on Fast Minimum Storage Ray Triangle Intersection by T. Möller and B. Trumbore
203 |         // https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html
204 |         let cull_backface = false;
205 | 
206 |         let c = self.v0 - ray.origin;
207 |         let r = ray.direction.cross(c);
208 |         let inv_det = 1.0 / self.ng.dot(ray.direction);
209 | 
210 |         let u = r.dot(self.e2) * inv_det;
211 |         let v = r.dot(self.e1) * inv_det;
212 |         let w = 1.0 - u - v;
213 | 
214 |         // Original:
215 |         //let hit = u >= 0.0 && v >= 0.0 && w >= 0.0;
216 |         //let valid = if cull_backface {
217 |         //    inv_det > 0.0 && hit
218 |         //} else {
219 |         //    inv_det != 0.0 && hit
220 |         //};
221 | 
222 |         // Note: differs in that if v == -0.0, for example will cause valid to be false
223 |         let hit = u.to_bits() | v.to_bits() | w.to_bits();
224 |         let valid = if cull_backface {
225 |             (inv_det.to_bits() | hit) & 0x8000_0000 == 0
226 |         } else {
227 |             inv_det != 0.0 && hit & 0x8000_0000 == 0
228 |         };
229 | 
230 |         if valid {
231 |             let t = self.ng.dot(c) * inv_det;
232 |             if t >= ray.tmin && t <= ray.tmax {
233 |                 return t;
234 |             }
235 |         }
236 | 
237 |         f32::INFINITY
238 |     }
239 | 
240 |     // https://github.com/RenderKit/embree/blob/0c236df6f31a8e9c8a48803dada333e9ea0029a6/kernels/geometry/triangle_intersector_moeller.h#L9
241 |     #[cfg(all(
242 |         any(target_arch = "x86", target_arch = "x86_64"),
243 |         target_feature = "sse2"
244 |     ))]
245 |     pub fn intersect_embree(&self, ray: &Ray) -> f32 {
246 |         // Not watertight from the front side? Looks similar to what intersect() above looks like from the back side.
247 | 
248 |         // This uses the orientation from https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html
249 | 
250 |         let cull_backface = false;
251 | 
252 |         // Calculate denominator
253 |         let o = ray.origin;
254 |         let d = ray.direction;
255 |         let c = self.v0 - o;
256 |         let r = c.cross(d);
257 |         let den = (-self.ng).dot(d);
258 |         let abs_den = den.abs();
259 | 
260 |         fn signmsk(x: f32) -> f32 {
261 |             #[cfg(target_arch = "x86")]
262 |             use std::arch::x86::*;
263 |             #[cfg(target_arch = "x86_64")]
264 |             use std::arch::x86_64::*;
265 |             unsafe {
266 |                 let mask = _mm_set1_ps(-0.0);
267 |                 let x_vec = _mm_set_ss(x);
268 |                 let sign_bit = _mm_and_ps(x_vec, mask);
269 |                 _mm_cvtss_f32(sign_bit)
270 |                 //_mm_cvtss_f32(_mm_and_ps(
271 |                 //    _mm_set_ss(x),
272 |                 //    _mm_castsi128_ps(_mm_set1_epi32(-2147483648i32)),
273 |                 //))
274 |             }
275 |         }
276 | 
277 |         let sgn_den = signmsk(den).to_bits();
278 | 
279 |         // Perform edge tests
280 |         let u = f32::from_bits(r.dot(self.e2).to_bits() ^ sgn_den);
281 |         let v = f32::from_bits(r.dot(self.e1).to_bits() ^ sgn_den);
282 |         // TODO simd uv?
283 | 
284 |         // Perform backface culling
285 |         // Original:
286 |         //let valid = if cull_backface {
287 |         //    den < 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den
288 |         //} else {
289 |         //    den != 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den
290 |         //};
291 | 
292 |         let w = abs_den - u - v;
293 |         let valid = if cull_backface {
294 |             ((-den).to_bits() | u.to_bits() | v.to_bits() | (abs_den - u - v).to_bits())
295 |                 & 0x8000_0000
296 |                 == 0
297 |         } else {
298 |             den != 0.0 && ((u.to_bits() | v.to_bits() | w.to_bits()) & 0x8000_0000) == 0
299 |         };
300 | 
301 |         if !valid {
302 |             return f32::INFINITY;
303 |         }
304 | 
305 |         // Perform depth test
306 |         let t = f32::from_bits((-self.ng).dot(c).to_bits() ^ sgn_den);
307 | 
308 |         if abs_den * ray.tmin < t && t <= abs_den * ray.tmax {
309 |             return t;
310 |         }
311 | 
312 |         f32::INFINITY
313 |     }
314 | 
315 |     #[inline(always)]
316 |     pub fn compute_barycentric(&self, ray: &Ray) -> Vec2 {
317 |         let r = ray.direction.cross(self.v0 - ray.origin);
318 |         vec2(r.dot(self.e2), r.dot(self.e1)) / self.ng.dot(ray.direction)
319 |     }
320 | }
321 | 
322 | impl Boundable for RtTriangle {
323 |     fn aabb(&self) -> Aabb {
324 |         self.aabb()
325 |     }
326 | }
327 | 


--------------------------------------------------------------------------------
/src/splits.rs:
--------------------------------------------------------------------------------
  1 | //! Split large triangles into multiple smaller Aabbs.
  2 | 
  3 | use glam::Vec3A;
  4 | 
  5 | use crate::{aabb::Aabb, triangle::Triangle};
  6 | 
  7 | /// Splits large triangles into multiple smaller Aabbs. Fits the new aabbs tightly to the triangle.
  8 | /// Note: This will result in more aabbs than triangles. The indices Vec will have grow with the
  9 | /// added Aabb's with the respective mapping back to the initial list of triangles.
 10 | /// # Arguments
 11 | /// * `avg_half_area` - The average half area of the Triangles
 12 | /// * `largest_half_area` - The largest half area of the Triangles
 13 | /// This is tuned to try to create splits conservatively enough that it generally
 14 | /// wont result in lower traversal performance across a variety of scenes.
 15 | /// (Naive splitting can result in lower traversal performance in some scenes)
 16 | pub fn split_aabbs_preset(
 17 |     aabbs: &mut Vec<Aabb>,
 18 |     indices: &mut Vec<u32>,
 19 |     triangles: &[Triangle],
 20 |     avg_half_area: f32,
 21 |     largest_half_area: f32,
 22 | ) {
 23 |     split_aabbs_precise(
 24 |         aabbs,
 25 |         indices,
 26 |         triangles,
 27 |         avg_half_area * 3.0,
 28 |         (avg_half_area * 4.0).max(avg_half_area * 0.9 + largest_half_area * 0.1),
 29 |         1.8,
 30 |         1.6,
 31 |         12,
 32 |         12,
 33 |     );
 34 | }
 35 | 
 36 | /// Splits large triangles into multiple smaller Aabbs. Fits the new aabbs tightly to the triangle.
 37 | /// Note: This will result in more aabbs than triangles. The indices Vec will have grow with the
 38 | /// added Aabb's with the respective mapping back to the initial list of triangles.
 39 | /// # Arguments
 40 | /// * `area_thresh_low` - Triangles with aabb half areas below this will not be considered for splitting.
 41 | /// * `area_thresh_high` - If the low split factor condition is not met then area_thresh_high > old_cost
 42 | /// must be met in addition to best_cost * split_factor_high < old_cost in order for the split to occur
 43 | /// * `split_factor_low` - If the resulting smallest aabb half area (best_cost) multiplied by this factor is
 44 | /// lower than the original cost the best split will be used (best_cost * split_factor_low < old_cost)
 45 | /// (area_thresh_high > old_cost && best_cost * split_factor_high < old_cost)
 46 | /// * `max_iterations` - Number of times to evaluate the entire set of aabbs/triangles (including the newly added splits)
 47 | /// * `split_tests` - Number of places try splitting the triangle at.
 48 | #[allow(clippy::too_many_arguments)]
 49 | pub fn split_aabbs_precise(
 50 |     aabbs: &mut Vec<Aabb>,
 51 |     indices: &mut Vec<u32>,
 52 |     triangles: &[Triangle],
 53 |     area_thresh_low: f32,
 54 |     area_thresh_high: f32,
 55 |     split_factor_low: f32,
 56 |     split_factor_high: f32,
 57 |     max_iterations: u32,
 58 |     split_tests: u32,
 59 | ) {
 60 |     crate::scope!("split_aabbs_precise");
 61 | 
 62 |     let mut candidates = Vec::new();
 63 | 
 64 |     for (i, aabb) in aabbs.iter().enumerate() {
 65 |         if aabb.half_area() > area_thresh_low {
 66 |             candidates.push(i)
 67 |         }
 68 |     }
 69 | 
 70 |     let mut old_candidates_len = candidates.len();
 71 |     for _ in 0..max_iterations {
 72 |         for i in 0..candidates.len() {
 73 |             let aabb = &mut aabbs[candidates[i]];
 74 |             let index = indices[candidates[i]];
 75 |             let axis: usize = aabb.largest_axis();
 76 | 
 77 |             let tri = triangles[index as usize];
 78 | 
 79 |             let mut best_cost = f32::MAX;
 80 |             let mut left = *aabb;
 81 |             let mut right = *aabb;
 82 | 
 83 |             // TODO optimization: create multiple splits simultaneously
 84 |             for i in 1..split_tests {
 85 |                 let n = i as f32 / split_tests as f32;
 86 |                 let pos = aabb.min[axis] * n + aabb.max[axis] * (1.0 - n);
 87 | 
 88 |                 let mut tmp_left = *aabb;
 89 |                 let mut tmp_right = *aabb;
 90 | 
 91 |                 tmp_left.max[axis] = pos;
 92 |                 tmp_right.min[axis] = pos;
 93 |                 let verts = [tri.v0, tri.v1, tri.v2, tri.v0];
 94 |                 let (t_left, t_right) = split_triangle(axis as u32, pos, verts);
 95 |                 tmp_left = t_left.intersection(&tmp_left);
 96 |                 tmp_right = t_right.intersection(&tmp_right);
 97 |                 let area = tmp_left.half_area() + tmp_right.half_area();
 98 |                 if area < best_cost {
 99 |                     best_cost = area;
100 |                     left = tmp_left;
101 |                     right = tmp_right;
102 |                 }
103 |             }
104 | 
105 |             let old_cost = aabb.half_area();
106 | 
107 |             if (area_thresh_high > old_cost && best_cost * split_factor_high < old_cost)
108 |                 || best_cost * split_factor_low < old_cost
109 |             {
110 |                 *aabb = left;
111 |                 candidates.push(aabbs.len());
112 |                 aabbs.push(right);
113 |                 indices.push(index);
114 |             }
115 |         }
116 |         if old_candidates_len == candidates.len() {
117 |             break;
118 |         } else {
119 |             candidates.retain(|c| aabbs[*c].half_area() > area_thresh_low);
120 |             old_candidates_len = candidates.len();
121 |         }
122 |     }
123 | }
124 | 
125 | /// Based on <https://github.com/embree/embree/blob/be0accfd0b246e2b03355b8ee7710a22c1b49240/kernels/builders/splitter.h#L17C1-L49C6>,
126 | /// but with the "current bounds" moved out.
127 | pub fn split_triangle(dim: u32, pos: f32, v: [Vec3A; 4]) -> (Aabb, Aabb) {
128 |     let mut left = Aabb::INVALID;
129 |     let mut right = Aabb::INVALID;
130 | 
131 |     // Clip triangle to left and right box by processing all edges
132 |     for i in 0..3 {
133 |         let v0 = v[i];
134 |         let v1 = v[i + 1];
135 |         let v0d = v0[dim as usize];
136 |         let v1d = v1[dim as usize];
137 | 
138 |         if v0d <= pos {
139 |             // This point is on left side
140 |             left.extend(v0);
141 |         }
142 |         if v0d >= pos {
143 |             // This point is on right side
144 |             right.extend(v0);
145 |         }
146 | 
147 |         // The edge crosses the splitting location
148 |         if (v0d < pos && pos < v1d) || (v1d < pos && pos < v0d) {
149 |             debug_assert!((v1d - v0d) != 0.0);
150 |             let inv_length = 1.0 / (v1d - v0d);
151 |             let c = Vec3A::mul_add(Vec3A::splat((pos - v0d) * inv_length), v1 - v0, v0);
152 |             left.extend(c);
153 |             right.extend(c);
154 |         }
155 |     }
156 | 
157 |     (left, right)
158 | }
159 | 


--------------------------------------------------------------------------------
/src/test_util.rs:
--------------------------------------------------------------------------------
  1 | //! Meshes, generators, sampling functions, etc.. for basic testing & examples.
  2 | 
  3 | pub mod sampling {
  4 |     use std::f32::consts::TAU;
  5 | 
  6 |     use glam::*;
  7 | 
  8 |     #[inline(always)]
  9 |     pub fn uhash(a: u32, b: u32) -> u32 {
 10 |         let mut x = (a.overflowing_mul(1597334673).0) ^ (b.overflowing_mul(3812015801).0);
 11 |         // from https://nullprogram.com/blog/2018/07/31/
 12 |         x = x ^ (x >> 16);
 13 |         x = x.overflowing_mul(0x7feb352d).0;
 14 |         x = x ^ (x >> 15);
 15 |         x = x.overflowing_mul(0x846ca68b).0;
 16 |         x = x ^ (x >> 16);
 17 |         x
 18 |     }
 19 | 
 20 |     #[inline(always)]
 21 |     pub fn unormf(n: u32) -> f32 {
 22 |         n as f32 * (1.0 / 0xffffffffu32 as f32)
 23 |     }
 24 | 
 25 |     #[inline(always)]
 26 |     pub fn hash_noise(coord: UVec2, frame: u32) -> f32 {
 27 |         let urnd = uhash(coord.x, (coord.y << 11) + frame);
 28 |         unormf(urnd)
 29 |     }
 30 | 
 31 |     // https://jcgt.org/published/0006/01/01/paper.pdf
 32 |     #[inline(always)]
 33 |     pub fn build_orthonormal_basis(n: Vec3A) -> Mat3 {
 34 |         let sign = n.z.signum();
 35 |         let a = -1.0 / (sign + n.z);
 36 |         let b = n.x * n.y * a;
 37 | 
 38 |         mat3(
 39 |             vec3(1.0 + sign * n.x * n.x * a, sign * b, -sign * n.x),
 40 |             vec3(b, sign + n.y * n.y * a, -n.y),
 41 |             n.into(),
 42 |         )
 43 |     }
 44 | 
 45 |     #[inline(always)]
 46 |     pub fn cosine_sample_hemisphere(urand: Vec2) -> Vec3A {
 47 |         let r = urand.x.sqrt();
 48 |         let theta = urand.y * TAU;
 49 |         vec3a(
 50 |             r * theta.cos(),
 51 |             r * theta.sin(),
 52 |             0.0f32.max(1.0 - urand.x).sqrt(),
 53 |         )
 54 |     }
 55 | 
 56 |     #[inline(always)]
 57 |     pub fn uniform_sample_sphere(urand: Vec2) -> Vec3A {
 58 |         let z = 1.0 - 2.0 * urand.x;
 59 |         let r = (1.0 - z * z).sqrt();
 60 |         let theta = urand.y * TAU;
 61 |         vec3a(r * theta.cos(), r * theta.sin(), z)
 62 |     }
 63 | 
 64 |     #[inline(always)]
 65 |     pub fn uniform_sample_cone(urand: Vec2, cos_theta_max: f32) -> Vec3A {
 66 |         let cos_theta = (1.0 - urand.x) + urand.x * cos_theta_max;
 67 |         let sin_theta = (1.0 - cos_theta * cos_theta).clamp(0.0, 1.0).sqrt();
 68 |         let phi: f32 = urand.y * TAU;
 69 |         vec3a(sin_theta * phi.cos(), sin_theta * phi.sin(), cos_theta)
 70 |     }
 71 | 
 72 |     #[inline(always)]
 73 |     pub fn smoothstep(e0: f32, e1: f32, x: f32) -> f32 {
 74 |         let t = ((x - e0) / (e1 - e0)).clamp(0.0, 1.0);
 75 |         t * t * (3.0 - 2.0 * t)
 76 |     }
 77 | 
 78 |     #[inline(always)]
 79 |     fn cubic(v0: f32, v1: f32, v2: f32, v3: f32, x: f32) -> f32 {
 80 |         let p = (v3 - v2) - (v0 - v1);
 81 |         let q = (v0 - v1) - p;
 82 |         let r = v2 - v0;
 83 |         let s = v1;
 84 |         p * x.powi(3) + q * x.powi(2) + r * x + s
 85 |     }
 86 | 
 87 |     #[inline(always)]
 88 |     pub fn bicubic_noise(coord: Vec2, seed: u32) -> f32 {
 89 |         let ix = coord.x.floor() as u32;
 90 |         let iy = coord.y.floor() as u32;
 91 |         let fx = coord.x - ix as f32;
 92 |         let fy = coord.y - iy as f32;
 93 |         fn cubic_col(ix: u32, iy: u32, j: u32, seed: u32, fx: f32) -> f32 {
 94 |             cubic(
 95 |                 hash_noise(uvec2(ix, iy + j), seed),
 96 |                 hash_noise(uvec2(ix + 1, iy + j), seed),
 97 |                 hash_noise(uvec2(ix + 2, iy + j), seed),
 98 |                 hash_noise(uvec2(ix + 3, iy + j), seed),
 99 |                 fx,
100 |             )
101 |         }
102 |         cubic(
103 |             cubic_col(ix, iy, 0, seed, fx),
104 |             cubic_col(ix, iy, 1, seed, fx),
105 |             cubic_col(ix, iy, 2, seed, fx),
106 |             cubic_col(ix, iy, 3, seed, fx),
107 |             fy,
108 |         )
109 |     }
110 | 
111 |     // By Tomasz Stachowiak
112 |     pub fn somewhat_boring_display_transform(col: Vec3A) -> Vec3A {
113 |         fn rgb_to_ycbcr(col: Vec3A) -> Vec3A {
114 |             Mat3A {
115 |                 x_axis: vec3a(0.2126, -0.1146, 0.5),
116 |                 y_axis: vec3a(0.7152, -0.3854, -0.4542),
117 |                 z_axis: vec3a(0.0722, 0.5, -0.0458),
118 |             } * col
119 |         }
120 | 
121 |         fn tonemap_curve(v: f32) -> f32 {
122 |             1.0 - (-v).exp()
123 |         }
124 | 
125 |         fn tonemap_curve3(v: Vec3A) -> Vec3A {
126 |             1.0 - (-v).exp()
127 |         }
128 | 
129 |         fn tonemapping_luminance(col: Vec3A) -> f32 {
130 |             col.dot(vec3a(0.2126, 0.7152, 0.0722))
131 |         }
132 | 
133 |         let mut col = col;
134 |         let ycbcr = rgb_to_ycbcr(col);
135 | 
136 |         let bt = tonemap_curve(ycbcr.yz().length() * 2.4);
137 |         let mut desat = (bt - 0.7) * 0.8;
138 |         desat *= desat;
139 | 
140 |         let desat_col = col.lerp(ycbcr.xxx(), desat);
141 | 
142 |         let tm_luma = tonemap_curve(ycbcr.x);
143 |         let tm0 = col * tm_luma / tonemapping_luminance(col).max(1e-5);
144 |         let final_mult = 0.97;
145 |         let tm1 = tonemap_curve3(desat_col);
146 | 
147 |         col = tm0.lerp(tm1, bt * bt);
148 | 
149 |         col * final_mult
150 |     }
151 | }
152 | 
153 | pub mod geometry {
154 |     use crate::{test_util::sampling::bicubic_noise, Triangle};
155 |     use glam::*;
156 | 
157 |     #[inline(always)]
158 |     const fn vec(a: f32, b: f32, c: f32) -> Vec3A {
159 |         Vec3A::new(a, b, c)
160 |     }
161 |     #[inline(always)]
162 |     const fn tri(v0: Vec3A, v1: Vec3A, v2: Vec3A) -> Triangle {
163 |         Triangle { v0, v1, v2 }
164 |     }
165 | 
166 |     /// Cube triangle mesh with side length of 2 centered at 0,0,0
167 |     pub const CUBE: [Triangle; 12] = [
168 |         tri(vec(-1., 1., -1.), vec(1., 1., 1.), vec(1., 1., -1.)),
169 |         tri(vec(1., 1., 1.), vec(-1., -1., 1.), vec(1., -1., 1.)),
170 |         tri(vec(-1., 1., 1.), vec(-1., -1., -1.), vec(-1., -1., 1.)),
171 |         tri(vec(1., -1., -1.), vec(-1., -1., 1.), vec(-1., -1., -1.)),
172 |         tri(vec(1., 1., -1.), vec(1., -1., 1.), vec(1., -1., -1.)),
173 |         tri(vec(-1., 1., -1.), vec(1., -1., -1.), vec(-1., -1., -1.)),
174 |         tri(vec(-1., 1., -1.), vec(-1., 1., 1.), vec(1., 1., 1.)),
175 |         tri(vec(1., 1., 1.), vec(-1., 1., 1.), vec(-1., -1., 1.)),
176 |         tri(vec(-1., 1., 1.), vec(-1., 1., -1.), vec(-1., -1., -1.)),
177 |         tri(vec(1., -1., -1.), vec(1., -1., 1.), vec(-1., -1., 1.)),
178 |         tri(vec(1., 1., -1.), vec(1., 1., 1.), vec(1., -1., 1.)),
179 |         tri(vec(-1., 1., -1.), vec(1., 1., -1.), vec(1., -1., -1.)),
180 |     ];
181 | 
182 |     /// Plane triangle mesh with side length of 2 centered at 0,0,0
183 |     pub const PLANE: [Triangle; 2] = [
184 |         tri(vec(1., 0., 1.), vec(-1., 0., -1.), vec(-1., 0., 1.)),
185 |         tri(vec(1., 0., 1.), vec(1., 0., -1.), vec(-1., 0., -1.)),
186 |     ];
187 | 
188 |     /// Generate icosphere mesh with radius of 2
189 |     pub fn icosphere(subdivisions: u32) -> Vec<Triangle> {
190 |         let phi = (1.0 + 5.0_f32.sqrt()) / 2.0; // golden ratio
191 |         let (a, b, c, d, e) = (1.0, -1.0, 0.0, phi, -phi);
192 | 
193 |         #[rustfmt::skip]
194 |         let mut p = [vec(b,d,c),vec(a,d,c),vec(b,e,c),vec(a,e,c),vec(c,b,d),vec(c,a,d),vec(c,b,e),vec(c,a,e),vec(d,c,b),vec(d,c,a),vec(e,c,b),vec(e,c,a)];
195 |         p.iter_mut().for_each(|v| *v = v.normalize());
196 | 
197 |         let mut tris = vec![
198 |             tri(p[0], p[11], p[5]),
199 |             tri(p[0], p[5], p[1]),
200 |             tri(p[0], p[1], p[7]),
201 |             tri(p[0], p[7], p[10]),
202 |             tri(p[0], p[10], p[11]),
203 |             tri(p[1], p[5], p[9]),
204 |             tri(p[5], p[11], p[4]),
205 |             tri(p[11], p[10], p[2]),
206 |             tri(p[10], p[7], p[6]),
207 |             tri(p[7], p[1], p[8]),
208 |             tri(p[3], p[9], p[4]),
209 |             tri(p[3], p[4], p[2]),
210 |             tri(p[3], p[2], p[6]),
211 |             tri(p[3], p[6], p[8]),
212 |             tri(p[3], p[8], p[9]),
213 |             tri(p[4], p[9], p[5]),
214 |             tri(p[2], p[4], p[11]),
215 |             tri(p[6], p[2], p[10]),
216 |             tri(p[8], p[6], p[7]),
217 |             tri(p[9], p[8], p[1]),
218 |         ];
219 | 
220 |         (0..subdivisions).for_each(|_| {
221 |             let mut new_tris = Vec::new();
222 |             tris.iter().for_each(|t| {
223 |                 let mid01 = ((t.v0 + t.v1) * 0.5).normalize();
224 |                 let mid12 = ((t.v1 + t.v2) * 0.5).normalize();
225 |                 let mid20 = ((t.v2 + t.v0) * 0.5).normalize();
226 |                 new_tris.push(tri(t.v0, mid01, mid20));
227 |                 new_tris.push(tri(t.v1, mid12, mid01));
228 |                 new_tris.push(tri(t.v2, mid20, mid12));
229 |                 new_tris.push(tri(mid01, mid12, mid20));
230 |             });
231 |             tris = new_tris;
232 |         });
233 | 
234 |         tris
235 |     }
236 | 
237 |     /// Convert height map to triangles with 2x2x2 size given -1.0..=1.0 output from height_map: F
238 |     pub fn height_to_triangles<F>(
239 |         height_map: F,
240 |         x_resolution: usize,
241 |         z_resolution: usize,
242 |     ) -> Vec<Triangle>
243 |     where
244 |         F: Fn(usize, usize) -> f32,
245 |     {
246 |         let mut triangles = Vec::new();
247 | 
248 |         // Iterate over each cell in the grid
249 |         for z in 0..z_resolution {
250 |             for x in 0..x_resolution {
251 |                 // Calculate normalized positions
252 |                 let fx = (x as f32 / x_resolution as f32) * 2.0 - 1.0;
253 |                 let fz = (z as f32 / z_resolution as f32) * 2.0 - 1.0;
254 |                 let fx2 = ((x + 1) as f32 / x_resolution as f32) * 2.0 - 1.0;
255 |                 let fz2 = ((z + 1) as f32 / z_resolution as f32) * 2.0 - 1.0;
256 | 
257 |                 // Create vertices for each corner of the cell
258 |                 let v00 = vec(fx, height_map(x, z), fz);
259 |                 let v10 = vec(fx2, height_map(x + 1, z), fz);
260 |                 let v01 = vec(fx, height_map(x, z + 1), fz2);
261 |                 let v11 = vec(fx2, height_map(x + 1, z + 1), fz2);
262 | 
263 |                 // Create two triangles for this cell
264 |                 triangles.push(tri(v00, v01, v10));
265 |                 triangles.push(tri(v10, v01, v11));
266 |             }
267 |         }
268 | 
269 |         triangles
270 |     }
271 | 
272 |     /// terrain_res 1024 or greater recommended
273 |     pub fn demoscene(terrain_res: usize, seed: u32) -> Vec<Triangle> {
274 |         let height_map = |x: usize, y: usize| -> f32 {
275 |             let coord = vec2(x as f32, y as f32) / terrain_res as f32;
276 |             let (mut cs, mut ns) = (1.579, 0.579);
277 |             (1..17)
278 |                 .map(|i| {
279 |                     (cs, ns) = (cs * 1.579, ns * -0.579);
280 |                     bicubic_noise(coord * cs, seed + i) * ns
281 |                 })
282 |                 .sum::<f32>()
283 |                 * (1.0 - coord.y).powf(0.579)
284 |                 + (1.0 - coord.y).powf(1.579) * 0.579
285 |         };
286 |         height_to_triangles(height_map, terrain_res, terrain_res)
287 |     }
288 | }
289 | 


--------------------------------------------------------------------------------
/src/triangle.rs:
--------------------------------------------------------------------------------
  1 | //! Triangle representation in 3D space.
  2 | 
  3 | use bytemuck::{Pod, Zeroable};
  4 | use glam::{vec2, Mat4, Vec2, Vec3A};
  5 | 
  6 | use crate::{aabb::Aabb, ray::Ray, Boundable, Transformable};
  7 | 
  8 | #[derive(Clone, Copy, Default, Debug)]
  9 | pub struct Triangle {
 10 |     pub v0: Vec3A,
 11 |     pub v1: Vec3A,
 12 |     pub v2: Vec3A,
 13 | }
 14 | 
 15 | unsafe impl Pod for Triangle {}
 16 | unsafe impl Zeroable for Triangle {}
 17 | 
 18 | impl Triangle {
 19 |     /// Compute the normal of the triangle geometry.
 20 |     #[inline(always)]
 21 |     pub fn compute_normal(&self) -> Vec3A {
 22 |         let e1 = self.v1 - self.v0;
 23 |         let e2 = self.v2 - self.v0;
 24 |         e1.cross(e2).normalize_or_zero()
 25 |     }
 26 | 
 27 |     /// Compute the bounding box of the triangle.
 28 |     #[inline(always)]
 29 |     pub fn aabb(&self) -> Aabb {
 30 |         Aabb::from_points(&[self.v0, self.v1, self.v2])
 31 |     }
 32 | 
 33 |     /// Find the distance (t) of the intersection of the `Ray` and this Triangle.
 34 |     /// Returns f32::INFINITY for miss.
 35 |     #[inline(always)]
 36 |     pub fn intersect(&self, ray: &Ray) -> f32 {
 37 |         // TODO not very water tight from the back side in some contexts (tris with edges at 0,0,0 show 1px gap)
 38 |         // Find out if this is typical of Möller
 39 |         // Based on Fast Minimum Storage Ray Triangle Intersection by T. Möller and B. Trumbore
 40 |         // https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html
 41 |         let cull_backface = false;
 42 |         let e1 = self.v0 - self.v1;
 43 |         let e2 = self.v2 - self.v0;
 44 |         let n = e1.cross(e2);
 45 | 
 46 |         let c = self.v0 - ray.origin;
 47 |         let r = ray.direction.cross(c);
 48 |         let inv_det = 1.0 / n.dot(ray.direction);
 49 | 
 50 |         let u = r.dot(e2) * inv_det;
 51 |         let v = r.dot(e1) * inv_det;
 52 |         let w = 1.0 - u - v;
 53 | 
 54 |         //let hit = u >= 0.0 && v >= 0.0 && w >= 0.0;
 55 |         //let valid = if cull_backface {
 56 |         //    inv_det > 0.0 && hit
 57 |         //} else {
 58 |         //    inv_det != 0.0 && hit
 59 |         //};
 60 | 
 61 |         // Note: differs in that if v == -0.0, for example will cause valid to be false
 62 |         let hit = u.to_bits() | v.to_bits() | w.to_bits();
 63 |         let valid = if cull_backface {
 64 |             (inv_det.to_bits() | hit) & 0x8000_0000 == 0
 65 |         } else {
 66 |             inv_det != 0.0 && hit & 0x8000_0000 == 0
 67 |         };
 68 | 
 69 |         if valid {
 70 |             let t = n.dot(c) * inv_det;
 71 |             if t >= ray.tmin && t <= ray.tmax {
 72 |                 return t;
 73 |             }
 74 |         }
 75 | 
 76 |         f32::INFINITY
 77 |     }
 78 | 
 79 |     // https://github.com/RenderKit/embree/blob/0c236df6f31a8e9c8a48803dada333e9ea0029a6/kernels/geometry/triangle_intersector_moeller.h#L9
 80 |     #[cfg(all(
 81 |         any(target_arch = "x86", target_arch = "x86_64"),
 82 |         target_feature = "sse2"
 83 |     ))]
 84 |     pub fn intersect_embree(&self, ray: &Ray) -> f32 {
 85 |         // Not watertight from the front side? Looks similar to what above looks like from the back side.
 86 | 
 87 |         // This uses the orientation from https://madmann91.github.io/2021/04/29/an-introduction-to-bvhs.html
 88 | 
 89 |         let cull_backface = false;
 90 |         let v0 = self.v0;
 91 |         let e1 = self.v0 - self.v1;
 92 |         let e2 = self.v2 - self.v0;
 93 |         let ng = e1.cross(e2);
 94 | 
 95 |         // Calculate denominator
 96 |         let o = ray.origin;
 97 |         let d = ray.direction;
 98 |         let c = v0 - o;
 99 |         let r = c.cross(d);
100 |         let den = (-ng).dot(d);
101 |         let abs_den = den.abs();
102 | 
103 |         fn signmsk(x: f32) -> f32 {
104 |             #[cfg(target_arch = "x86")]
105 |             use std::arch::x86::*;
106 |             #[cfg(target_arch = "x86_64")]
107 |             use std::arch::x86_64::*;
108 |             unsafe {
109 |                 let mask = _mm_set1_ps(-0.0);
110 |                 let x_vec = _mm_set_ss(x);
111 |                 let sign_bit = _mm_and_ps(x_vec, mask);
112 |                 _mm_cvtss_f32(sign_bit)
113 |                 //_mm_cvtss_f32(_mm_and_ps(
114 |                 //    _mm_set_ss(x),
115 |                 //    _mm_castsi128_ps(_mm_set1_epi32(-2147483648i32)),
116 |                 //))
117 |             }
118 |         }
119 | 
120 |         let sgn_den = signmsk(den).to_bits();
121 | 
122 |         // Perform edge tests
123 |         let u = f32::from_bits(r.dot(e2).to_bits() ^ sgn_den);
124 |         let v = f32::from_bits(r.dot(e1).to_bits() ^ sgn_den);
125 |         // TODO simd uv?
126 | 
127 |         // Perform backface culling
128 |         // OG
129 |         //let valid = if cull_backface {
130 |         //    den < 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den
131 |         //} else {
132 |         //    den != 0.0 && u >= 0.0 && v >= 0.0 && u + v <= abs_den
133 |         //};
134 | 
135 |         let w = abs_den - u - v;
136 |         let valid = if cull_backface {
137 |             ((-den).to_bits() | u.to_bits() | v.to_bits() | (abs_den - u - v).to_bits())
138 |                 & 0x8000_0000
139 |                 == 0
140 |         } else {
141 |             den != 0.0 && ((u.to_bits() | v.to_bits() | w.to_bits()) & 0x8000_0000) == 0
142 |         };
143 | 
144 |         if !valid {
145 |             return f32::INFINITY;
146 |         }
147 | 
148 |         // Perform depth test
149 |         let t = f32::from_bits((-ng).dot(c).to_bits() ^ sgn_den);
150 | 
151 |         if abs_den * ray.tmin < t && t <= abs_den * ray.tmax {
152 |             return t;
153 |         }
154 | 
155 |         f32::INFINITY
156 |     }
157 | 
158 |     #[inline(always)]
159 |     pub fn compute_barycentric(&self, ray: &Ray) -> Vec2 {
160 |         let e1 = self.v0 - self.v1;
161 |         let e2 = self.v2 - self.v0;
162 |         let ng = e1.cross(e2).normalize_or_zero();
163 |         let r = ray.direction.cross(self.v0 - ray.origin);
164 |         vec2(r.dot(e2), r.dot(e1)) / ng.dot(ray.direction)
165 |     }
166 | }
167 | 
168 | impl Boundable for Triangle {
169 |     fn aabb(&self) -> Aabb {
170 |         self.aabb()
171 |     }
172 | }
173 | 
174 | impl Transformable for &mut Triangle {
175 |     fn transform(&mut self, matrix: &Mat4) {
176 |         self.v0 = matrix.transform_point3a(self.v0);
177 |         self.v1 = matrix.transform_point3a(self.v1);
178 |         self.v2 = matrix.transform_point3a(self.v2);
179 |     }
180 | }
181 | 
182 | impl<T> Transformable for T
183 | where
184 |     T: AsMut<[Triangle]>,
185 | {
186 |     fn transform(&mut self, matrix: &Mat4) {
187 |         self.as_mut().iter_mut().for_each(|mut triangle| {
188 |             triangle.transform(matrix);
189 |         });
190 |     }
191 | }
192 | 


--------------------------------------------------------------------------------
/tests/mod.rs:
--------------------------------------------------------------------------------
  1 | #[cfg(test)]
  2 | mod tests {
  3 | 
  4 |     use std::time::Duration;
  5 | 
  6 |     use glam::*;
  7 |     use obvhs::{
  8 |         aabb::Aabb,
  9 |         bvh2::builder::{build_bvh2, build_bvh2_from_tris},
 10 |         cwbvh::{
 11 |             builder::{build_cwbvh, build_cwbvh_from_tris},
 12 |             bvh2_to_cwbvh::bvh2_to_cwbvh,
 13 |         },
 14 |         ray::{Ray, RayHit},
 15 |         test_util::{
 16 |             geometry::{demoscene, height_to_triangles, icosphere},
 17 |             sampling::{hash_noise, uniform_sample_sphere},
 18 |         },
 19 |         traverse,
 20 |         triangle::Triangle,
 21 |         BvhBuildParams,
 22 |     };
 23 | 
 24 |     #[test]
 25 |     pub fn build_bvh2_with_empty_aabb() {
 26 |         let bvh = build_bvh2(
 27 |             &[Aabb::empty()],
 28 |             BvhBuildParams::medium_build(),
 29 |             &mut Duration::default(),
 30 |         );
 31 |         let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z);
 32 |         assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY));
 33 |     }
 34 | 
 35 |     #[test]
 36 |     pub fn build_cwbvh_with_empty_aabb() {
 37 |         let bvh = build_cwbvh(
 38 |             &[Aabb::empty()],
 39 |             BvhBuildParams::medium_build(),
 40 |             &mut Duration::default(),
 41 |         );
 42 |         let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z);
 43 |         assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY));
 44 |     }
 45 | 
 46 |     #[test]
 47 |     pub fn build_bvh2_with_nothing() {
 48 |         let aabbs: Vec<Aabb> = Vec::new();
 49 |         let bvh = build_bvh2(
 50 |             &aabbs,
 51 |             BvhBuildParams::medium_build(),
 52 |             &mut Duration::default(),
 53 |         );
 54 |         let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z);
 55 |         assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY));
 56 |     }
 57 | 
 58 |     #[test]
 59 |     pub fn build_cwbvh_with_nothing() {
 60 |         let aabbs: Vec<Aabb> = Vec::new();
 61 |         let bvh = build_cwbvh(
 62 |             &aabbs,
 63 |             BvhBuildParams::medium_build(),
 64 |             &mut Duration::default(),
 65 |         );
 66 |         let ray = Ray::new_inf(Vec3A::Z, -Vec3A::Z);
 67 |         assert!(!bvh.ray_traverse(ray, &mut RayHit::none(), |_ray, _id| f32::INFINITY));
 68 |     }
 69 | 
 70 |     #[test]
 71 |     pub fn check_flat_subdivided_plane_normals() {
 72 |         let tris = height_to_triangles(|_x: usize, _y: usize| -> f32 { 0.0 }, 4, 4);
 73 |         let mut hit_count = 0;
 74 |         eval_render(
 75 |             |_x: u32, _y: u32, hit: RayHit| {
 76 |                 let n = tris[hit.primitive_id as usize].compute_normal();
 77 |                 if n == Vec3A::Y {
 78 |                     hit_count += 1
 79 |                 }
 80 |             },
 81 |             &tris,
 82 |             256,
 83 |             256,
 84 |             90.0f32.to_radians(),
 85 |             vec3a(0.0, 0.9, 0.0),
 86 |             vec3a(0.0, 0.0, 0.0),
 87 |             Vec3A::X,
 88 |         );
 89 |         assert_eq!(hit_count, 256 * 256)
 90 |     }
 91 | 
 92 |     pub fn eval_render<F>(
 93 |         mut eval: F,
 94 |         tris: &[Triangle],
 95 |         width: u32,
 96 |         height: u32,
 97 |         fov: f32,
 98 |         eye: Vec3A,
 99 |         look_at: Vec3A,
100 |         up: Vec3A,
101 |     ) where
102 |         F: FnMut(u32, u32, RayHit),
103 |     {
104 |         let cwbvh = build_cwbvh_from_tris(
105 |             tris,
106 |             BvhBuildParams::medium_build(),
107 |             &mut Duration::default(),
108 |         );
109 | 
110 |         let bvh_tris = cwbvh
111 |             .primitive_indices
112 |             .iter()
113 |             .map(|i| tris[*i as usize])
114 |             .collect::<Vec<Triangle>>();
115 | 
116 |         let target_size = Vec2::new(width as f32, height as f32);
117 | 
118 |         // Compute camera projection & view matrices
119 |         let aspect_ratio = target_size.x / target_size.y;
120 |         let proj_inv = Mat4::perspective_infinite_reverse_rh(fov, aspect_ratio, 0.01).inverse();
121 |         let view_inv = Mat4::look_at_rh(eye.into(), look_at.into(), up.into()).inverse();
122 | 
123 |         for x in 0..width {
124 |             for y in 0..height {
125 |                 let frag_coord = uvec2(x, y);
126 |                 let mut screen_uv = frag_coord.as_vec2() / target_size;
127 |                 screen_uv.y = 1.0 - screen_uv.y;
128 |                 let ndc = screen_uv * 2.0 - Vec2::ONE;
129 |                 let clip_pos = vec4(ndc.x, ndc.y, 1.0, 1.0);
130 | 
131 |                 let mut vs = proj_inv * clip_pos;
132 |                 vs /= vs.w;
133 |                 let direction = (Vec3A::from((view_inv * vs).xyz()) - eye).normalize();
134 |                 let ray = Ray::new(eye, direction, 0.0, f32::MAX);
135 | 
136 |                 let mut hit = RayHit::none();
137 |                 if cwbvh.ray_traverse(ray, &mut hit, |ray, id| bvh_tris[id].intersect(ray)) {
138 |                     eval(x, y, hit);
139 |                 }
140 |             }
141 |         }
142 |     }
143 | 
144 |     #[test]
145 |     pub fn traverse_aabb() {
146 |         let tris = demoscene(201, 0);
147 |         let aabb = Aabb::new(vec3a(0.511, -1.0, 0.511), vec3a(0.611, 1.0, 0.611));
148 | 
149 |         let mut refrence_intersect_sum = 0usize;
150 |         let mut refrence_count = 0;
151 |         for (primitive_id, tri) in tris.iter().enumerate() {
152 |             if aabb.intersect_aabb(&tri.aabb()) {
153 |                 refrence_intersect_sum = refrence_intersect_sum.wrapping_add(primitive_id);
154 |                 refrence_count += 1;
155 |             }
156 |         }
157 | 
158 |         // Bvh2
159 |         let bvh2 = build_bvh2_from_tris(
160 |             &tris,
161 |             BvhBuildParams::fast_build(),
162 |             &mut Duration::default(),
163 |         );
164 |         let mut intersect_sum = 0usize;
165 |         let mut intersect_count = 0;
166 |         bvh2.validate(&tris, false, false);
167 |         bvh2.aabb_traverse(aabb, |bvh, id| {
168 |             let node = &bvh.nodes[id as usize];
169 |             for i in 0..node.prim_count {
170 |                 let primitive_id = bvh.primitive_indices[(node.first_index + i) as usize] as usize;
171 |                 let tri = tris[primitive_id];
172 |                 if aabb.intersect_aabb(&tri.aabb()) {
173 |                     intersect_count += 1;
174 |                     intersect_sum = intersect_sum.wrapping_add(primitive_id);
175 |                 }
176 |             }
177 |             true
178 |         });
179 |         assert_eq!(refrence_count, intersect_count);
180 |         assert_eq!(refrence_intersect_sum, intersect_sum);
181 | 
182 |         // CwBvh
183 |         let cwbvh = build_cwbvh_from_tris(
184 |             &tris,
185 |             BvhBuildParams::fast_build(),
186 |             &mut Duration::default(),
187 |         );
188 |         let mut cw_intersect_count = 0;
189 |         let mut cw_intersect_sum = 0usize;
190 |         cwbvh.validate(&tris, false, false);
191 | 
192 |         let mut state = cwbvh.new_traversal(Vec3A::ZERO);
193 |         let mut node;
194 |         traverse!(
195 |             cwbvh,
196 |             node,
197 |             state,
198 |             node.intersect_aabb(&aabb, state.oct_inv4),
199 |             {
200 |                 let primitive_id = cwbvh.primitive_indices[state.primitive_id as usize] as usize;
201 |                 let tri = tris[primitive_id];
202 |                 if aabb.intersect_aabb(&tri.aabb()) {
203 |                     cw_intersect_count += 1;
204 |                     cw_intersect_sum = cw_intersect_sum.wrapping_add(primitive_id);
205 |                 }
206 |             }
207 |         );
208 | 
209 |         assert_eq!(refrence_count, cw_intersect_count);
210 |         assert_eq!(refrence_intersect_sum, cw_intersect_sum);
211 |     }
212 | 
213 |     #[test]
214 |     pub fn traverse_point() {
215 |         let tris = icosphere(0);
216 | 
217 |         // TODO Bvh2
218 | 
219 |         // CwBvh
220 |         let cwbvh = build_cwbvh_from_tris(
221 |             &tris,
222 |             BvhBuildParams::fast_build(),
223 |             &mut Duration::default(),
224 |         );
225 |         cwbvh.validate(&tris, false, false);
226 | 
227 |         for i in 0..512 {
228 |             let point =
229 |                 uniform_sample_sphere(vec2(hash_noise(uvec2(0, 0), i), hash_noise(uvec2(0, 1), i)));
230 | 
231 |             let mut refrence_intersect_sum = 0usize;
232 |             let mut refrence_count = 0;
233 |             for (primitive_id, tri) in tris.iter().enumerate() {
234 |                 if tri.aabb().contains_point(point) {
235 |                     refrence_intersect_sum = refrence_intersect_sum.wrapping_add(primitive_id);
236 |                     refrence_count += 1;
237 |                 }
238 |             }
239 | 
240 |             let mut cw_intersect_count = 0;
241 |             let mut cw_intersect_sum = 0usize;
242 |             let mut state = cwbvh.new_traversal(Vec3A::ZERO);
243 |             let mut node;
244 |             traverse!(
245 |                 cwbvh,
246 |                 node,
247 |                 state,
248 |                 node.contains_point(&point, state.oct_inv4),
249 |                 {
250 |                     let primitive_id =
251 |                         cwbvh.primitive_indices[state.primitive_id as usize] as usize;
252 |                     let tri = tris[primitive_id];
253 |                     if tri.aabb().contains_point(point) {
254 |                         cw_intersect_count += 1;
255 |                         cw_intersect_sum = cw_intersect_sum.wrapping_add(primitive_id);
256 |                     }
257 |                 }
258 |             );
259 | 
260 |             assert_eq!(refrence_count, cw_intersect_count);
261 |             assert_eq!(refrence_intersect_sum, cw_intersect_sum);
262 |         }
263 |     }
264 | 
265 |     #[test]
266 |     pub fn compute_parents_cwbvh() {
267 |         let tris = demoscene(100, 0);
268 |         let cwbvh = build_cwbvh_from_tris(
269 |             &tris,
270 |             BvhBuildParams::fast_build(),
271 |             &mut Duration::default(),
272 |         );
273 |         cwbvh.validate(&tris, false, false);
274 |         let parents = cwbvh.compute_parents();
275 |         for (child, parent) in parents.iter().enumerate().skip(1) {
276 |             let node = cwbvh.nodes[*parent as usize];
277 |             let mut found_child = false;
278 |             for ch in 0..8 {
279 |                 if !node.is_leaf(ch) {
280 |                     let child_index = node.child_node_index(ch);
281 |                     if child_index as usize == child {
282 |                         found_child = true;
283 |                         break;
284 |                     }
285 |                 }
286 |             }
287 |             assert!(found_child, "child{}, parent{}", child, parent);
288 |         }
289 |     }
290 | 
291 |     #[test]
292 |     pub fn order_children_cwbvh() {
293 |         let tris = demoscene(100, 0);
294 |         let triangles: &[Triangle] = &tris;
295 |         let mut aabbs = Vec::with_capacity(triangles.len());
296 | 
297 |         let config = BvhBuildParams::very_fast_build();
298 |         let mut indices = Vec::with_capacity(triangles.len());
299 |         for (i, tri) in triangles.iter().enumerate() {
300 |             let a = tri.v0;
301 |             let b = tri.v1;
302 |             let c = tri.v2;
303 |             let mut aabb = Aabb::empty();
304 |             aabb.extend(a).extend(b).extend(c);
305 |             aabbs.push(aabb);
306 |             indices.push(i as u32);
307 |         }
308 | 
309 |         let bvh2 = config.ploc_search_distance.build(
310 |             &aabbs,
311 |             indices,
312 |             config.sort_precision,
313 |             config.search_depth_threshold,
314 |         );
315 |         let mut cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, false);
316 | 
317 |         cwbvh.validate(&tris, false, false);
318 |         for node in 0..cwbvh.nodes.len() {
319 |             cwbvh.order_node_children(&aabbs, node, false);
320 |         }
321 |         cwbvh.validate(&tris, false, false);
322 |         cwbvh.order_children(&aabbs, false);
323 |         cwbvh.validate(&tris, false, false);
324 |     }
325 | 
326 |     #[test]
327 |     pub fn exact_aabbs_cwbvh() {
328 |         let tris = demoscene(100, 0);
329 |         let triangles: &[Triangle] = &tris;
330 |         let mut aabbs = Vec::with_capacity(triangles.len());
331 | 
332 |         let config = BvhBuildParams::very_fast_build();
333 |         let mut indices = Vec::with_capacity(triangles.len());
334 |         for (i, tri) in triangles.iter().enumerate() {
335 |             let a = tri.v0;
336 |             let b = tri.v1;
337 |             let c = tri.v2;
338 |             let mut aabb = Aabb::empty();
339 |             aabb.extend(a).extend(b).extend(c);
340 |             aabbs.push(aabb);
341 |             indices.push(i as u32);
342 |         }
343 | 
344 |         let bvh2 = config.ploc_search_distance.build(
345 |             &aabbs,
346 |             indices,
347 |             config.sort_precision,
348 |             config.search_depth_threshold,
349 |         );
350 |         let mut cwbvh = bvh2_to_cwbvh(&bvh2, config.max_prims_per_leaf.clamp(1, 3), true, true);
351 | 
352 |         if let Some(exact_node_aabbs) = &cwbvh.exact_node_aabbs {
353 |             for node in &cwbvh.nodes {
354 |                 for ch in 0..8 {
355 |                     if !node.is_leaf(ch) {
356 |                         let child_node_index = node.child_node_index(ch) as usize;
357 |                         let compressed_aabb = node.child_aabb(ch);
358 |                         let child_node_self_compressed_aabb = cwbvh.nodes[child_node_index].aabb();
359 |                         let exact_aabb = &exact_node_aabbs[child_node_index];
360 | 
361 |                         assert!(exact_aabb.min.cmpge((compressed_aabb.min).into()).all());
362 |                         assert!(exact_aabb.max.cmple((compressed_aabb.max).into()).all());
363 |                         assert!(exact_aabb
364 |                             .min
365 |                             .cmpge((child_node_self_compressed_aabb.min).into())
366 |                             .all());
367 |                         assert!(exact_aabb
368 |                             .max
369 |                             .cmple((child_node_self_compressed_aabb.max).into())
370 |                             .all());
371 |                     }
372 |                 }
373 |             }
374 |         }
375 | 
376 |         cwbvh.order_children(&aabbs, false);
377 |         cwbvh.validate(&tris, false, false);
378 |         cwbvh.order_children(&aabbs, false);
379 |         cwbvh.validate(&tris, false, false);
380 |     }
381 | }
382 | 


--------------------------------------------------------------------------------