├── .gitignore ├── README.md ├── SoftwareRasterizer.sln └── SoftwareRasterizer ├── Castle ├── IndexBuffer.bin └── VertexBuffer.bin ├── Main.cpp ├── Occluder.cpp ├── Occluder.h ├── QuadDecomposition.cpp ├── QuadDecomposition.h ├── Rasterizer.cpp ├── Rasterizer.h ├── SoftwareRasterizer.vcxproj ├── SoftwareRasterizer.vcxproj.filters ├── Sponza ├── IndexBuffer.bin └── VertexBuffer.bin ├── SurfaceAreaHeuristic.cpp ├── SurfaceAreaHeuristic.h └── VectorMath.h /.gitignore: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio. 3 | ################################################################################ 4 | 5 | *.suo 6 | *.pdb 7 | *.ipdb 8 | *.iobj 9 | *.exe 10 | *.opendb 11 | *.db 12 | *.tlog 13 | *.obj 14 | *.log 15 | *.user 16 | *.idb 17 | *.ilk 18 | *.cfg 19 | *.vtss 20 | *.aux 21 | *.ipch 22 | *.options 23 | *.th 24 | *.sc 25 | *.trace 26 | *.vspx 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rasterizer 2 | 3 | This project is a state-of-the-art software occlusion culling system. 4 | 5 | It's similar in spirit to Intel's https://github.com/GameTechDev/OcclusionCulling, but uses completely different techniques and is 2-3 times faster in single-threaded AVX mode when rendering the full set of occluders (no minimum size). 6 | 7 | Check out http://threadlocalmutex.com/?p=144 and http://threadlocalmutex.com/?p=163 for some implementation details. 8 | 9 | Sample Data 10 | =========== 11 | 12 | The folder Sponza contains a copy of Crytek's public domain Sponza model. 13 | 14 | The folder Castle contains a copy of Intel's Occlusion Culling sample scene, redistributed here under the terms of the Intel Code Samples License. 15 | 16 | Controls are WASD and cursor keys for the camera. 17 | 18 | Requirements 19 | ============ 20 | - An AVX2 and FMA3 capable CPU (Haswell, Excavator or later) 21 | - Visual Studio 2017 or higher 22 | 23 | License 24 | ============ 25 | 26 | [!["Creative Commons Licence"](https://i.creativecommons.org/p/zero/1.0/88x31.png)](https://creativecommons.org/publicdomain/zero/1.0/) 27 | 28 | The code in this repository is licensed under [CC0 1.0 Universal (CC0 1.0) Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/). 
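Usage sketch
============

There is no API documentation beyond Main.cpp, so the following is a rough, hypothetical sketch of the intended flow, pieced together from the signatures in Occluder.h, QuadDecomposition.h and Rasterizer.cpp. `exampleFrame` and its parameters are made-up names, the 1280x720 resolution is arbitrary, and it assumes Rasterizer.h exposes the `rasterize<bool>` template publicly; see Main.cpp for the real setup, including vertex padding and grouping of quads into multiple occluders.

```cpp
#include <vector>

#include "Occluder.h"
#include "QuadDecomposition.h"
#include "Rasterizer.h"

// 'vertices'/'indices' describe the source triangle mesh; 'sceneMin'/'sceneMax' are the
// reference bounds that occluder vertices are quantized against.
void exampleFrame(const std::vector<__m128>& vertices, const std::vector<uint32_t>& indices,
                  __m128 sceneMin, __m128 sceneMax, const float* modelViewProjection)
{
    // Load time: merge triangles into quads (4 indices per primitive), then bake an occluder.
    std::vector<uint32_t> quadIndices = QuadDecomposition::decompose(indices, vertices);

    std::vector<__m128> quadVertices;
    for (uint32_t index : quadIndices)
        quadVertices.push_back(vertices[index]);

    // Occluder::bake asserts that the vertex count is a multiple of 16; padding is omitted here.
    auto occluder = Occluder::bake(quadVertices, sceneMin, sceneMax);

    // Per frame: set the camera, clear the depth buffer, then rasterize occluders that pass
    // the visibility query. Width and height must be multiples of 8.
    Rasterizer rasterizer(1280, 720);
    rasterizer.setModelViewProjection(modelViewProjection);
    rasterizer.clear();

    bool needsClipping = false;
    if (rasterizer.queryVisibility(occluder->m_boundsMin, occluder->m_boundsMax, needsClipping))
    {
        if (needsClipping)
        {
            rasterizer.rasterize<true>(*occluder);
        }
        else
        {
            rasterizer.rasterize<false>(*occluder);
        }
    }
}
```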
29 | -------------------------------------------------------------------------------- /SoftwareRasterizer.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25123.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SoftwareRasterizer", "SoftwareRasterizer\SoftwareRasterizer.vcxproj", "{7E7E35CC-2069-43BB-9056-49DFEF9EEE56}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Release|x64 = Release|x64 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {7E7E35CC-2069-43BB-9056-49DFEF9EEE56}.Debug|x64.ActiveCfg = Debug|x64 15 | {7E7E35CC-2069-43BB-9056-49DFEF9EEE56}.Debug|x64.Build.0 = Debug|x64 16 | {7E7E35CC-2069-43BB-9056-49DFEF9EEE56}.Release|x64.ActiveCfg = Release|x64 17 | {7E7E35CC-2069-43BB-9056-49DFEF9EEE56}.Release|x64.Build.0 = Release|x64 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /SoftwareRasterizer/Castle/IndexBuffer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rawrunprotected/rasterizer/50a1c132c24e85aaa7ef00a6337610ffc04c403e/SoftwareRasterizer/Castle/IndexBuffer.bin -------------------------------------------------------------------------------- /SoftwareRasterizer/Castle/VertexBuffer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rawrunprotected/rasterizer/50a1c132c24e85aaa7ef00a6337610ffc04c403e/SoftwareRasterizer/Castle/VertexBuffer.bin -------------------------------------------------------------------------------- /SoftwareRasterizer/Main.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rawrunprotected/rasterizer/50a1c132c24e85aaa7ef00a6337610ffc04c403e/SoftwareRasterizer/Main.cpp -------------------------------------------------------------------------------- /SoftwareRasterizer/Occluder.cpp: -------------------------------------------------------------------------------- 1 | #include "Occluder.h" 2 | 3 | #include "VectorMath.h" 4 | 5 | #include <cassert> 6 | 7 | std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax) 8 | { 9 | assert(vertices.size() % 16 == 0); 10 | 11 | // Simple k-means clustering by normal direction to improve backface culling efficiency 12 | std::vector<__m128> quadNormals; 13 | for (auto i = 0; i < vertices.size(); i += 4) 14 | { 15 | auto v0 = vertices[i + 0]; 16 | auto v1 = vertices[i + 1]; 17 | auto v2 = vertices[i + 2]; 18 | auto v3 = vertices[i + 3]; 19 | 20 | quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3)))); 21 | } 22 | 23 | std::vector<__m128> centroids; 24 | std::vector<uint32_t> centroidAssignment; 25 | centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f)); 26 | centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f)); 27 | centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f)); 28 | centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f)); 29 | centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f)); 30 | centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 
0.0f)); 31 | 32 | centroidAssignment.resize(vertices.size() / 4); 33 | 34 | bool anyChanged = true; 35 | for (int iter = 0; iter < 10 && anyChanged; ++iter) 36 | { 37 | anyChanged = false; 38 | 39 | for (auto j = 0; j < quadNormals.size(); ++j) 40 | { 41 | __m128 normal = quadNormals[j]; 42 | 43 | __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity()); 44 | uint32_t bestCentroid = 0; 45 | for (int k = 0; k < centroids.size(); ++k) 46 | { 47 | __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F); 48 | if (_mm_comige_ss(distance, bestDistance)) 49 | { 50 | bestDistance = distance; 51 | bestCentroid = k; 52 | } 53 | } 54 | 55 | if (centroidAssignment[j] != bestCentroid) 56 | { 57 | centroidAssignment[j] = bestCentroid; 58 | anyChanged = true; 59 | } 60 | } 61 | 62 | for (int k = 0; k < centroids.size(); ++k) 63 | { 64 | centroids[k] = _mm_setzero_ps(); 65 | } 66 | 67 | for (int j = 0; j < quadNormals.size(); ++j) 68 | { 69 | int k = centroidAssignment[j]; 70 | 71 | centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]); 72 | } 73 | 74 | for (int k = 0; k < centroids.size(); ++k) 75 | { 76 | centroids[k] = normalize(centroids[k]); 77 | } 78 | } 79 | 80 | std::vector<__m128> orderedVertices; 81 | for (uint32_t k = 0; k < centroids.size(); ++k) 82 | { 83 | for (int j = 0; j < vertices.size() / 4; ++j) 84 | { 85 | if (centroidAssignment[j] == k) 86 | { 87 | orderedVertices.push_back(vertices[4 * j + 0]); 88 | orderedVertices.push_back(vertices[4 * j + 1]); 89 | orderedVertices.push_back(vertices[4 * j + 2]); 90 | orderedVertices.push_back(vertices[4 * j + 3]); 91 | } 92 | } 93 | } 94 | 95 | auto occluder = std::make_unique<Occluder>(); 96 | 97 | __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin)); 98 | 99 | __m128 scalingX = _mm_set1_ps(2047.0f); 100 | __m128 scalingY = _mm_set1_ps(2047.0f); 101 | __m128 scalingZ = _mm_set1_ps(1023.0f); 102 | 103 | __m128 half = _mm_set1_ps(0.5f); 104 | 105 | occluder->m_packetCount = 0; 106 | occluder->m_vertexData = reinterpret_cast<__m256i*>(_aligned_malloc(orderedVertices.size() * 4, 32)); 107 | 108 | for (size_t i = 0; i < orderedVertices.size(); i += 32) 109 | { 110 | __m128i v[8]; 111 | 112 | for (auto j = 0; j < 4; ++j) 113 | { 114 | // Transform into [0,1] space relative to bounding box 115 | __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents); 116 | __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents); 117 | __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents); 118 | __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents); 119 | __m128 v4 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 16], refMin), invExtents); 120 | __m128 v5 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 20], refMin), invExtents); 121 | __m128 v6 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 24], refMin), invExtents); 122 | __m128 v7 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 28], refMin), invExtents); 123 | 124 | // Transpose into [xxxx][yyyy][zzzz][wwww] 125 | _MM_TRANSPOSE4_PS(v0, v1, v2, v3); 126 | _MM_TRANSPOSE4_PS(v4, v5, v6, v7); 127 | 128 | // Scale and truncate to int 129 | v0 = _mm_fmadd_ps(v0, scalingX, half); 130 | v1 = _mm_fmadd_ps(v1, scalingY, half); 131 | v2 = _mm_fmadd_ps(v2, scalingZ, half); 132 | 133 | v4 = _mm_fmadd_ps(v4, scalingX, half); 134 | v5 = _mm_fmadd_ps(v5, scalingY, half); 135 | v6 = _mm_fmadd_ps(v6, scalingZ, half); 136 | 137 | __m128i X0 = _mm_sub_epi32(_mm_cvttps_epi32(v0), 
_mm_set1_epi32(1024)); 138 | __m128i Y0 = _mm_cvttps_epi32(v1); 139 | __m128i Z0 = _mm_cvttps_epi32(v2); 140 | 141 | __m128i X1 = _mm_sub_epi32(_mm_cvttps_epi32(v4), _mm_set1_epi32(1024)); 142 | __m128i Y1 = _mm_cvttps_epi32(v5); 143 | __m128i Z1 = _mm_cvttps_epi32(v6); 144 | 145 | // Pack to 11/11/10 format 146 | __m128i XYZ0 = _mm_or_si128(_mm_slli_epi32(X0, 21), _mm_or_si128(_mm_slli_epi32(Y0, 10), Z0)); 147 | __m128i XYZ1 = _mm_or_si128(_mm_slli_epi32(X1, 21), _mm_or_si128(_mm_slli_epi32(Y1, 10), Z1)); 148 | 149 | v[2 * j + 0] = XYZ0; 150 | v[2 * j + 1] = XYZ1; 151 | } 152 | 153 | occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 0)); 154 | occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 2)); 155 | occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 4)); 156 | occluder->m_vertexData[occluder->m_packetCount++] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(v + 6)); 157 | } 158 | 159 | occluder->m_refMin = refMin; 160 | occluder->m_refMax = refMax; 161 | 162 | __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity()); 163 | __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity()); 164 | 165 | for (size_t i = 0; i < orderedVertices.size(); ++i) 166 | { 167 | min = _mm_min_ps(vertices[i], min); 168 | max = _mm_max_ps(vertices[i], max); 169 | } 170 | 171 | // Set W = 1 - this is expected by frustum culling code 172 | min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000); 173 | max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000); 174 | 175 | occluder->m_boundsMin = min; 176 | occluder->m_boundsMax = max; 177 | 178 | occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f)); 179 | 180 | return occluder; 181 | } 182 | 183 | -------------------------------------------------------------------------------- /SoftwareRasterizer/Occluder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <immintrin.h> 4 | #include <memory> 5 | #include <vector> 6 | 7 | struct Occluder 8 | { 9 | static std::unique_ptr<Occluder> bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax); 10 | 11 | __m128 m_center; 12 | 13 | __m128 m_refMin; 14 | __m128 m_refMax; 15 | 16 | __m128 m_boundsMin; 17 | __m128 m_boundsMax; 18 | 19 | __m256i* m_vertexData; 20 | uint32_t m_packetCount; 21 | }; 22 | 23 | 24 | -------------------------------------------------------------------------------- /SoftwareRasterizer/QuadDecomposition.cpp: -------------------------------------------------------------------------------- 1 | #include "QuadDecomposition.h" 2 | 3 | #include "VectorMath.h" 4 | 5 | #include <algorithm> 6 | #include <cassert> 7 | #include <cstdint> 8 | #include <functional> 9 | #include <queue> 10 | #include <tuple> 11 | #include <unordered_map> 12 | #include <vector> 13 | 14 | typedef int Vertex; 15 | 16 | typedef std::vector<Vertex> Path; 17 | 18 | struct Graph 19 | { 20 | size_t numVertices() const 21 | { 22 | return m_adjacencyList.size(); 23 | } 24 | 25 | std::vector<std::vector<Vertex>> m_adjacencyList; 26 | }; 27 | 28 | class Matching 29 | { 30 | public: 31 | Matching(const Graph& graph) : m_graph(graph), m_matchedVertex(graph.numVertices(), -1), m_bridges(graph.numVertices()), m_clearToken(0), m_tree(graph.numVertices()) 32 | { 33 | std::vector<Vertex> unmatchedVertices; 34 | 35 | // Start with a greedy maximal matching 36 | for (Vertex v = 0; v < m_graph.numVertices(); ++v) 37 | { 38 | if (m_matchedVertex[v] == -1) 39 | { 40 | bool found = false; 41 | for (auto w : m_graph.m_adjacencyList[v]) 42 | { 43 | if (m_matchedVertex[w] == -1) 44 | { 45 | match(v, w); 46 | found = 
true; 47 | break; 48 | } 49 | } 50 | 51 | if (!found) 52 | { 53 | unmatchedVertices.push_back(v); 54 | } 55 | } 56 | } 57 | 58 | std::vector<Vertex> path; 59 | for (auto v : unmatchedVertices) 60 | { 61 | if (m_matchedVertex[v] == -1) 62 | { 63 | if (findAugmentingPath(v, path)) 64 | { 65 | augment(path); 66 | path.clear(); 67 | } 68 | } 69 | } 70 | } 71 | 72 | Vertex getMatchedVertex(Vertex v) 73 | { 74 | return m_matchedVertex[v]; 75 | } 76 | 77 | private: 78 | void match(Vertex v, Vertex w) 79 | { 80 | m_matchedVertex[v] = w; 81 | m_matchedVertex[w] = v; 82 | } 83 | 84 | void augment(std::vector<Vertex>& path) 85 | { 86 | for (int i = 0; i < path.size(); i += 2) 87 | { 88 | match(path[i], path[i + 1]); 89 | } 90 | } 91 | 92 | bool findAugmentingPath(Vertex root, std::vector<Vertex> & path) 93 | { 94 | // Clear out the forest 95 | size_t numVertices = m_graph.numVertices(); 96 | 97 | m_clearToken++; 98 | 99 | // Start our tree root 100 | m_tree[root].m_depth = 0; 101 | m_tree[root].m_parent = -1; 102 | m_tree[root].m_clearToken = m_clearToken; 103 | m_tree[root].m_blossom = root; 104 | 105 | m_queue.push(root); 106 | 107 | while (!m_queue.empty()) 108 | { 109 | Vertex v = m_queue.front(); 110 | m_queue.pop(); 111 | 112 | for (auto w : m_graph.m_adjacencyList[v]) 113 | { 114 | if (examineEdge(root, v, w, path)) 115 | { 116 | while (!m_queue.empty()) 117 | { 118 | m_queue.pop(); 119 | } 120 | 121 | return true; 122 | } 123 | } 124 | } 125 | 126 | return false; 127 | } 128 | 129 | bool examineEdge(Vertex root, Vertex v, Vertex w, std::vector<Vertex> & path) 130 | { 131 | Vertex vBar = find(v); 132 | Vertex wBar = find(w); 133 | 134 | if (vBar != wBar) 135 | { 136 | if (m_tree[wBar].m_clearToken != m_clearToken) 137 | { 138 | if (m_matchedVertex[w] == -1) 139 | { 140 | buildAugmentingPath(root, v, w, path); 141 | return true; 142 | } 143 | else 144 | { 145 | extendTree(v, w); 146 | } 147 | } 148 | else if (m_tree[wBar].m_depth % 2 == 0) 149 | { 150 | shrinkBlossom(v, w); 151 | } 152 | } 153 | 154 | return false; 155 | } 156 | 157 | void buildAugmentingPath(Vertex root, Vertex v, Vertex w, std::vector<Vertex> & path) 158 | { 159 | path.push_back(w); 160 | findPath(v, root, path); 161 | } 162 | 163 | void extendTree(Vertex v, Vertex w) 164 | { 165 | Vertex u = m_matchedVertex[w]; 166 | 167 | Node& nodeV = m_tree[v]; 168 | Node& nodeW = m_tree[w]; 169 | Node& nodeU = m_tree[u]; 170 | 171 | nodeW.m_depth = nodeV.m_depth + 1 + (nodeV.m_depth & 1); // Must be odd, so we add either 1 or 2 172 | nodeW.m_parent = v; 173 | nodeW.m_clearToken = m_clearToken; 174 | nodeW.m_blossom = w; 175 | 176 | nodeU.m_depth = nodeW.m_depth + 1; 177 | nodeU.m_parent = w; 178 | nodeU.m_clearToken = m_clearToken; 179 | nodeU.m_blossom = u; 180 | 181 | m_queue.push(u); 182 | } 183 | 184 | void shrinkBlossom(Vertex v, Vertex w) 185 | { 186 | Vertex b = findCommonAncestor(v, w); 187 | 188 | shrinkPath(b, v, w); 189 | shrinkPath(b, w, v); 190 | } 191 | 192 | void shrinkPath(Vertex b, Vertex v, Vertex w) 193 | { 194 | Vertex u = find(v); 195 | 196 | while (u != b) 197 | { 198 | makeUnion(b, u); 199 | assert(u != -1); 200 | assert(m_matchedVertex[u] != -1); 201 | u = m_matchedVertex[u]; 202 | makeUnion(b, u); 203 | makeRepresentative(b); 204 | m_queue.push(u); 205 | m_bridges[u] = std::make_pair(v, w); 206 | u = find(m_tree[u].m_parent); 207 | } 208 | } 209 | 210 | Vertex findCommonAncestor(Vertex v, Vertex w) 211 | { 212 | while (w != v) 213 | { 214 | if (m_tree[v].m_depth > m_tree[w].m_depth) 215 | { 216 | v = m_tree[v].m_parent; 217 | } 218 | else 219 | { 220 | 
w = m_tree[w].m_parent; 221 | } 222 | } 223 | 224 | return find(v); 225 | } 226 | 227 | void findPath(Vertex s, Vertex t, Path & path) 228 | { 229 | if (s == t) 230 | { 231 | path.push_back(s); 232 | } 233 | else if (m_tree[s].m_depth % 2 == 0) 234 | { 235 | path.push_back(s); 236 | path.push_back(m_matchedVertex[s]); 237 | findPath(m_tree[m_matchedVertex[s]].m_parent, t, path); 238 | } 239 | else 240 | { 241 | Vertex v, w; 242 | std::tie(v, w) = m_bridges[s]; 243 | 244 | path.push_back(s); 245 | 246 | size_t offset = path.size(); 247 | findPath(v, m_matchedVertex[s], path); 248 | std::reverse(path.begin() + offset, path.end()); 249 | 250 | findPath(w, t, path); 251 | } 252 | } 253 | 254 | void makeUnion(int x, int y) 255 | { 256 | int xRoot = find(x); 257 | m_tree[xRoot].m_blossom = find(y); 258 | } 259 | 260 | void makeRepresentative(int x) 261 | { 262 | int xRoot = find(x); 263 | m_tree[xRoot].m_blossom = x; 264 | m_tree[x].m_blossom = x; 265 | } 266 | 267 | int find(int x) 268 | { 269 | if (m_tree[x].m_clearToken != m_clearToken) 270 | { 271 | return x; 272 | } 273 | 274 | if (x != m_tree[x].m_blossom) 275 | { 276 | // Path compression 277 | m_tree[x].m_blossom = find(m_tree[x].m_blossom); 278 | } 279 | 280 | return m_tree[x].m_blossom; 281 | } 282 | 283 | 284 | private: 285 | int m_clearToken; 286 | 287 | const Graph& m_graph; 288 | 289 | std::queue<Vertex> m_queue; 290 | std::vector<Vertex> m_matchedVertex; 291 | 292 | struct Node 293 | { 294 | Node() 295 | { 296 | m_clearToken = 0; 297 | } 298 | 299 | int m_depth; 300 | Vertex m_parent; 301 | Vertex m_blossom; 302 | 303 | int m_clearToken; 304 | }; 305 | 306 | std::vector<Node> m_tree; 307 | 308 | std::vector<std::pair<Vertex, Vertex>> m_bridges; 309 | }; 310 | 311 | 312 | namespace 313 | { 314 | struct PairHash 315 | { 316 | public: 317 | template <typename T, typename U> 318 | std::size_t operator()(const std::pair<T, U>& x) const 319 | { 320 | auto hashT = std::hash<T>{}(x.first); 321 | auto hashU = std::hash<U>{}(x.second); 322 | return hashT ^ (hashU + 0x9e3779b9 + (hashT << 6) + (hashT >> 2)); 323 | } 324 | }; 325 | 326 | bool canMergeTrianglesToQuad(__m128 v0, __m128 v1, __m128 v2, __m128 v3) 327 | { 328 | // Maximum distance of vertices from original plane in world space units 329 | float maximumDepthError = 0.5f; 330 | 331 | __m128 n0 = normalize(normal(v0, v1, v2)); 332 | __m128 n2 = normalize(normal(v2, v3, v0)); 333 | 334 | __m128 planeDistA = _mm_andnot_ps(_mm_set1_ps(-0.0f), _mm_dp_ps(n0, _mm_sub_ps(v1, v3), 0x7F)); 335 | __m128 planeDistB = _mm_andnot_ps(_mm_set1_ps(-0.0f), _mm_dp_ps(n2, _mm_sub_ps(v1, v3), 0x7F)); 336 | 337 | if (_mm_comigt_ss(planeDistA, _mm_set1_ps(maximumDepthError)) || _mm_comigt_ss(planeDistB, _mm_set1_ps(maximumDepthError))) 338 | { 339 | return false; 340 | } 341 | 342 | return true; 343 | } 344 | } 345 | 346 | std::vector<uint32_t> QuadDecomposition::decompose(const std::vector<uint32_t>& indices, const std::vector<__m128>& vertices) 347 | { 348 | std::vector<uint32_t> result; 349 | 350 | size_t triangleCount = indices.size() / 3; 351 | 352 | Graph candidateGraph; 353 | candidateGraph.m_adjacencyList.resize(triangleCount); 354 | 355 | std::unordered_map<std::pair<uint32_t, uint32_t>, std::vector<std::pair<uint32_t, uint32_t>>, PairHash> edgeToTriangle; 356 | 357 | for (uint32_t triangleIdx = 0; triangleIdx < triangleCount; ++triangleIdx) 358 | { 359 | uint32_t i[3]; 360 | i[0] = indices[3 * triangleIdx + 0]; 361 | i[1] = indices[3 * triangleIdx + 1]; 362 | i[2] = indices[3 * triangleIdx + 2]; 363 | 364 | edgeToTriangle[std::make_pair(i[0], i[1])].push_back(std::make_pair(triangleIdx, i[2])); 365 | edgeToTriangle[std::make_pair(i[1], 
i[2])].push_back(std::make_pair(triangleIdx, i[0])); 366 | edgeToTriangle[std::make_pair(i[2], i[0])].push_back(std::make_pair(triangleIdx, i[1])); 367 | 368 | for (int edgeIdx = 0; edgeIdx < 3; ++edgeIdx) 369 | { 370 | auto f = std::make_pair(i[(edgeIdx + 1) % 3], i[edgeIdx]); 371 | 372 | auto & neighbors = edgeToTriangle[f]; 373 | for (auto pair : neighbors) 374 | { 375 | uint32_t neighborTriangle = pair.first; 376 | uint32_t apex = pair.second; 377 | 378 | uint32_t quad[] = { i[edgeIdx], apex, i[(edgeIdx + 1) % 3], i[(edgeIdx + 2) % 3] }; 379 | 380 | if (canMergeTrianglesToQuad(vertices[quad[0]], vertices[quad[1]], vertices[quad[2]], vertices[quad[3]])) 381 | { 382 | candidateGraph.m_adjacencyList[triangleIdx].push_back(neighborTriangle); 383 | candidateGraph.m_adjacencyList[neighborTriangle].push_back(triangleIdx); 384 | } 385 | } 386 | } 387 | } 388 | 389 | 390 | uint32_t quadCount = 0; 391 | uint32_t trigleCount = 0; 392 | 393 | Matching matching(candidateGraph); 394 | 395 | for (uint32_t triangleIdx = 0; triangleIdx < triangleCount; ++triangleIdx) 396 | { 397 | int neighbor = matching.getMatchedVertex(triangleIdx); 398 | 399 | // No quad found 400 | if (neighbor == -1) 401 | { 402 | auto i0 = indices[3 * triangleIdx + 0]; 403 | auto i1 = indices[3 * triangleIdx + 1]; 404 | auto i2 = indices[3 * triangleIdx + 2]; 405 | 406 | result.push_back(i0); 407 | result.push_back(i2); 408 | result.push_back(i1); 409 | result.push_back(i0); 410 | } 411 | else if (triangleIdx < uint32_t(neighbor)) 412 | { 413 | uint32_t i[3]; 414 | i[0] = indices[3 * triangleIdx + 0]; 415 | i[1] = indices[3 * triangleIdx + 1]; 416 | i[2] = indices[3 * triangleIdx + 2]; 417 | 418 | // Find out which edge was matched 419 | for (uint32_t edgeIdx = 0; edgeIdx < 3; ++edgeIdx) 420 | { 421 | auto f = std::make_pair(i[(edgeIdx + 1) % 3], i[edgeIdx]); 422 | auto & neighbors = edgeToTriangle[f]; 423 | for (auto pair : neighbors) 424 | { 425 | if (pair.first == neighbor) 426 | { 427 | result.push_back(i[edgeIdx]); 428 | result.push_back(i[(edgeIdx + 2) % 3]); 429 | result.push_back(i[(edgeIdx + 1) % 3]); 430 | result.push_back(pair.second); 431 | 432 | quadCount++; 433 | 434 | goto nextTriangle; 435 | } 436 | } 437 | } 438 | } 439 | 440 | nextTriangle: 441 | continue; 442 | } 443 | 444 | return result; 445 | } 446 | -------------------------------------------------------------------------------- /SoftwareRasterizer/QuadDecomposition.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <immintrin.h> 4 | #include <vector> 5 | 6 | class QuadDecomposition 7 | { 8 | public: 9 | static std::vector<uint32_t> decompose(const std::vector<uint32_t>& indices, const std::vector<__m128>& vertices); 10 | }; 11 | 12 | 13 | -------------------------------------------------------------------------------- /SoftwareRasterizer/Rasterizer.cpp: -------------------------------------------------------------------------------- 1 | #include "Rasterizer.h" 2 | 3 | #include "Occluder.h" 4 | 5 | #include <algorithm> 6 | #include <cassert> 7 | #include <cmath> 8 | 9 | static constexpr float floatCompressionBias = 2.5237386e-29f; // 0xFFFF << 12 reinterpreted as float 10 | static constexpr float minEdgeOffset = -0.45f; 11 | static const float maxInvW = std::sqrt(std::numeric_limits<float>::max()); 12 | 13 | static constexpr int OFFSET_QUANTIZATION_BITS = 6; 14 | static constexpr int OFFSET_QUANTIZATION_FACTOR = 1 << OFFSET_QUANTIZATION_BITS; 15 | 16 | static constexpr int SLOPE_QUANTIZATION_BITS = 6; 17 | static constexpr int SLOPE_QUANTIZATION_FACTOR = 1 << 
SLOPE_QUANTIZATION_BITS; 18 | 19 | enum PrimitiveMode 20 | { 21 | Culled = 0, 22 | Triangle0, 23 | Triangle1, 24 | ConcaveRight, 25 | ConcaveLeft, 26 | ConcaveCenter, 27 | Convex 28 | }; 29 | 30 | static constexpr int modeTable[256] = 31 | { 32 | Convex, Triangle1, ConcaveLeft, Triangle1, Triangle0, Culled, Triangle0, Culled, 33 | ConcaveRight, Triangle1, Culled, Triangle1, Triangle0, Culled, Triangle0, Culled, 34 | Convex, Triangle1, ConcaveLeft, Triangle1, Triangle0, Culled, Triangle0, Culled, 35 | Culled, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 36 | Convex, Culled, ConcaveLeft, Triangle1, Triangle0, Culled, Triangle0, Culled, 37 | ConcaveRight, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Culled, Culled, 38 | Convex, Triangle1, ConcaveLeft, Culled, Triangle0, Culled, Triangle0, Culled, 39 | ConcaveRight, Triangle1, ConcaveCenter, Triangle1, Culled, Culled, Triangle0, Culled, 40 | Convex, Triangle1, Culled, Triangle1, Triangle0, Culled, Triangle0, Culled, 41 | ConcaveRight, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 42 | Culled, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 43 | ConcaveCenter, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 44 | Convex, Triangle1, ConcaveLeft, Triangle1, Triangle0, Culled, Culled, Culled, 45 | ConcaveRight, Culled, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 46 | Triangle1, Triangle1, Triangle1, Triangle1, Culled, Culled, Culled, Culled, 47 | Triangle1, Triangle1, Triangle1, Culled, Culled, Culled, Culled, Culled, 48 | Convex, Triangle1, ConcaveLeft, Triangle1, Culled, Culled, Triangle0, Culled, 49 | ConcaveRight, Triangle1, ConcaveCenter, Culled, Triangle0, Culled, Triangle0, Culled, 50 | Convex, Triangle1, ConcaveLeft, Triangle1, Triangle0, Culled, Culled, Culled, 51 | ConcaveRight, Culled, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 52 | Culled, Triangle1, ConcaveLeft, Triangle1, Triangle0, Culled, Triangle0, Culled, 53 | ConcaveRight, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 54 | ConcaveRight, Triangle1, Culled, Triangle1, Triangle0, Culled, Triangle0, Culled, 55 | ConcaveRight, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 56 | Convex, Triangle1, ConcaveLeft, Culled, Triangle0, Culled, Triangle0, Culled, 57 | ConcaveRight, Triangle1, ConcaveCenter, Triangle1, Culled, Culled, Triangle0, Culled, 58 | Triangle0, Culled, Triangle0, Culled, Triangle0, Culled, Triangle0, Culled, 59 | Triangle0, Culled, Triangle0, Culled, Triangle0, Culled, Culled, Culled, 60 | ConcaveLeft, Triangle1, ConcaveLeft, Triangle1, Triangle0, Culled, Triangle0, Culled, 61 | Culled, Triangle1, ConcaveCenter, Triangle1, Triangle0, Culled, Triangle0, Culled, 62 | Culled, Culled, Culled, Culled, Culled, Culled, Culled, Culled, 63 | Culled, Culled, Culled, Culled, Culled, Culled, Culled, Culled, 64 | }; 65 | 66 | Rasterizer::Rasterizer(uint32_t width, uint32_t height) : m_width(width), m_height(height), m_blocksX(width / 8), m_blocksY(height / 8) 67 | { 68 | assert(width % 8 == 0 && height % 8 == 0); 69 | 70 | m_depthBuffer.resize(width * height / 8); 71 | m_hiZ.resize(m_blocksX * m_blocksY + 8, 0); // Add some extra padding to support out-of-bounds reads 72 | 73 | precomputeRasterizationTable(); 74 | } 75 | 76 | void Rasterizer::setModelViewProjection(const float* matrix) 77 | { 78 | __m128 mat0 = _mm_loadu_ps(matrix + 0); 79 | __m128 mat1 = 
_mm_loadu_ps(matrix + 4); 80 | __m128 mat2 = _mm_loadu_ps(matrix + 8); 81 | __m128 mat3 = _mm_loadu_ps(matrix + 12); 82 | 83 | _MM_TRANSPOSE4_PS(mat0, mat1, mat2, mat3); 84 | 85 | // Store rows 86 | _mm_storeu_ps(m_modelViewProjectionRaw + 0, mat0); 87 | _mm_storeu_ps(m_modelViewProjectionRaw + 4, mat1); 88 | _mm_storeu_ps(m_modelViewProjectionRaw + 8, mat2); 89 | _mm_storeu_ps(m_modelViewProjectionRaw + 12, mat3); 90 | 91 | // Bake viewport transform into matrix and shift by half a block 92 | mat0 = _mm_mul_ps(_mm_add_ps(mat0, mat3), _mm_set1_ps(m_width * 0.5f - 4.0f)); 93 | mat1 = _mm_mul_ps(_mm_add_ps(mat1, mat3), _mm_set1_ps(m_height * 0.5f - 4.0f)); 94 | 95 | // Map depth from [-1, 1] to [bias, 0] 96 | mat2 = _mm_mul_ps(_mm_sub_ps(mat3, mat2), _mm_set1_ps(0.5f * floatCompressionBias)); 97 | 98 | _MM_TRANSPOSE4_PS(mat0, mat1, mat2, mat3); 99 | 100 | // Store prebaked cols 101 | _mm_storeu_ps(m_modelViewProjection + 0, mat0); 102 | _mm_storeu_ps(m_modelViewProjection + 4, mat1); 103 | _mm_storeu_ps(m_modelViewProjection + 8, mat2); 104 | _mm_storeu_ps(m_modelViewProjection + 12, mat3); 105 | } 106 | 107 | void Rasterizer::clear() 108 | { 109 | // Mark blocks as cleared by setting Hi Z to 1 (one unit separated from far plane). 110 | // This value is extremely unlikely to occur during normal rendering, so we don't 111 | // need to guard against a HiZ of 1 occurring naturally. This is different from a value of 0, 112 | // which will occur every time a block is partially covered for the first time. 113 | __m128i clearValue = _mm_set1_epi16(1); 114 | uint32_t count = static_cast<uint32_t>(m_hiZ.size()) / 8; 115 | __m128i* pHiZ = reinterpret_cast<__m128i*>(m_hiZ.data()); 116 | for (uint32_t offset = 0; offset < count; ++offset) 117 | { 118 | _mm_storeu_si128(pHiZ, clearValue); 119 | pHiZ++; 120 | } 121 | } 122 | 123 | bool Rasterizer::queryVisibility(__m128 boundsMin, __m128 boundsMax, bool& needsClipping) 124 | { 125 | // Frustum cull 126 | __m128 extents = _mm_sub_ps(boundsMax, boundsMin); 127 | __m128 center = _mm_add_ps(boundsMax, boundsMin); // Bounding box center times 2 - but since W = 2, the plane equations work out correctly 128 | __m128 minusZero = _mm_set1_ps(-0.0f); 129 | 130 | __m128 row0 = _mm_loadu_ps(m_modelViewProjectionRaw + 0); 131 | __m128 row1 = _mm_loadu_ps(m_modelViewProjectionRaw + 4); 132 | __m128 row2 = _mm_loadu_ps(m_modelViewProjectionRaw + 8); 133 | __m128 row3 = _mm_loadu_ps(m_modelViewProjectionRaw + 12); 134 | 135 | // Compute distance from each frustum plane 136 | __m128 plane0 = _mm_add_ps(row3, row0); 137 | __m128 offset0 = _mm_add_ps(center, _mm_xor_ps(extents, _mm_and_ps(plane0, minusZero))); 138 | __m128 dist0 = _mm_dp_ps(plane0, offset0, 0xff); 139 | 140 | __m128 plane1 = _mm_sub_ps(row3, row0); 141 | __m128 offset1 = _mm_add_ps(center, _mm_xor_ps(extents, _mm_and_ps(plane1, minusZero))); 142 | __m128 dist1 = _mm_dp_ps(plane1, offset1, 0xff); 143 | 144 | __m128 plane2 = _mm_add_ps(row3, row1); 145 | __m128 offset2 = _mm_add_ps(center, _mm_xor_ps(extents, _mm_and_ps(plane2, minusZero))); 146 | __m128 dist2 = _mm_dp_ps(plane2, offset2, 0xff); 147 | 148 | __m128 plane3 = _mm_sub_ps(row3, row1); 149 | __m128 offset3 = _mm_add_ps(center, _mm_xor_ps(extents, _mm_and_ps(plane3, minusZero))); 150 | __m128 dist3 = _mm_dp_ps(plane3, offset3, 0xff); 151 | 152 | __m128 plane4 = _mm_add_ps(row3, row2); 153 | __m128 offset4 = _mm_add_ps(center, _mm_xor_ps(extents, _mm_and_ps(plane4, minusZero))); 154 | __m128 dist4 = _mm_dp_ps(plane4, offset4, 0xff); 155 | 156 | __m128 plane5 = 
_mm_sub_ps(row3, row2); 157 | __m128 offset5 = _mm_add_ps(center, _mm_xor_ps(extents, _mm_and_ps(plane5, minusZero))); 158 | __m128 dist5 = _mm_dp_ps(plane5, offset5, 0xff); 159 | 160 | // Combine plane distance signs 161 | __m128 combined = _mm_or_ps(_mm_or_ps(_mm_or_ps(dist0, dist1), _mm_or_ps(dist2, dist3)), _mm_or_ps(dist4, dist5)); 162 | 163 | // Can't use _mm_testz_ps or _mm_comile_ss here because the ORs above created garbage in the non-sign bits 164 | if (_mm_movemask_ps(combined)) 165 | { 166 | return false; 167 | } 168 | 169 | // Load prebaked projection matrix 170 | __m128 col0 = _mm_loadu_ps(m_modelViewProjection + 0); 171 | __m128 col1 = _mm_loadu_ps(m_modelViewProjection + 4); 172 | __m128 col2 = _mm_loadu_ps(m_modelViewProjection + 8); 173 | __m128 col3 = _mm_loadu_ps(m_modelViewProjection + 12); 174 | 175 | // Transform edges 176 | __m128 egde0 = _mm_mul_ps(col0, _mm_broadcastss_ps(extents)); 177 | __m128 egde1 = _mm_mul_ps(col1, _mm_permute_ps(extents, _MM_SHUFFLE(1, 1, 1, 1))); 178 | __m128 egde2 = _mm_mul_ps(col2, _mm_permute_ps(extents, _MM_SHUFFLE(2, 2, 2, 2))); 179 | 180 | __m128 corners[8]; 181 | 182 | // Transform first corner 183 | corners[0] = 184 | _mm_fmadd_ps(col0, _mm_broadcastss_ps(boundsMin), 185 | _mm_fmadd_ps(col1, _mm_permute_ps(boundsMin, _MM_SHUFFLE(1, 1, 1, 1)), 186 | _mm_fmadd_ps(col2, _mm_permute_ps(boundsMin, _MM_SHUFFLE(2, 2, 2, 2)), 187 | col3))); 188 | 189 | // Transform remaining corners by adding edge vectors 190 | corners[1] = _mm_add_ps(corners[0], egde0); 191 | corners[2] = _mm_add_ps(corners[0], egde1); 192 | corners[4] = _mm_add_ps(corners[0], egde2); 193 | 194 | corners[3] = _mm_add_ps(corners[1], egde1); 195 | corners[5] = _mm_add_ps(corners[4], egde0); 196 | corners[6] = _mm_add_ps(corners[2], egde2); 197 | 198 | corners[7] = _mm_add_ps(corners[6], egde0); 199 | 200 | // Transpose into SoA 201 | _MM_TRANSPOSE4_PS(corners[0], corners[1], corners[2], corners[3]); 202 | _MM_TRANSPOSE4_PS(corners[4], corners[5], corners[6], corners[7]); 203 | 204 | // Even if all bounding box corners have W > 0 here, we may end up with some vertices with W < 0 due to floating point differences; so test with some epsilon if any W < 0. 
205 | __m128 maxExtent = _mm_max_ps(extents, _mm_permute_ps(extents, _MM_SHUFFLE(1, 0, 3, 2))); 206 | maxExtent = _mm_max_ps(maxExtent, _mm_permute_ps(maxExtent, _MM_SHUFFLE(2, 3, 0, 1))); 207 | __m128 nearPlaneEpsilon = _mm_mul_ps(maxExtent, _mm_set1_ps(0.001f)); 208 | __m128 closeToNearPlane = _mm_or_ps(_mm_cmplt_ps(corners[3], nearPlaneEpsilon), _mm_cmplt_ps(corners[7], nearPlaneEpsilon)); 209 | if (!_mm_testz_ps(closeToNearPlane, closeToNearPlane)) 210 | { 211 | needsClipping = true; 212 | return true; 213 | } 214 | 215 | needsClipping = false; 216 | 217 | // Perspective division 218 | corners[3] = _mm_rcp_ps(corners[3]); 219 | corners[0] = _mm_mul_ps(corners[0], corners[3]); 220 | corners[1] = _mm_mul_ps(corners[1], corners[3]); 221 | corners[2] = _mm_mul_ps(corners[2], corners[3]); 222 | 223 | corners[7] = _mm_rcp_ps(corners[7]); 224 | corners[4] = _mm_mul_ps(corners[4], corners[7]); 225 | corners[5] = _mm_mul_ps(corners[5], corners[7]); 226 | corners[6] = _mm_mul_ps(corners[6], corners[7]); 227 | 228 | // Vertical mins and maxes 229 | __m128 minsX = _mm_min_ps(corners[0], corners[4]); 230 | __m128 maxsX = _mm_max_ps(corners[0], corners[4]); 231 | 232 | __m128 minsY = _mm_min_ps(corners[1], corners[5]); 233 | __m128 maxsY = _mm_max_ps(corners[1], corners[5]); 234 | 235 | // Horizontal reduction, step 1 236 | __m128 minsXY = _mm_min_ps(_mm_unpacklo_ps(minsX, minsY), _mm_unpackhi_ps(minsX, minsY)); 237 | __m128 maxsXY = _mm_max_ps(_mm_unpacklo_ps(maxsX, maxsY), _mm_unpackhi_ps(maxsX, maxsY)); 238 | 239 | // Clamp bounds 240 | minsXY = _mm_max_ps(minsXY, _mm_setzero_ps()); 241 | maxsXY = _mm_min_ps(maxsXY, _mm_setr_ps(float(m_width - 1), float(m_height - 1), float(m_width - 1), float(m_height - 1))); 242 | 243 | // Negate maxes so we can round in the same direction 244 | maxsXY = _mm_xor_ps(maxsXY, minusZero); 245 | 246 | // Horizontal reduction, step 2 247 | __m128 boundsF = _mm_min_ps(_mm_unpacklo_ps(minsXY, maxsXY), _mm_unpackhi_ps(minsXY, maxsXY)); 248 | 249 | // Round towards -infinity and convert to int 250 | __m128i boundsI = _mm_cvttps_epi32(_mm_round_ps(boundsF, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); 251 | 252 | // Store as scalars 253 | int bounds[4]; 254 | _mm_storeu_si128(reinterpret_cast<__m128i*>(&bounds), boundsI); 255 | 256 | // Revert the sign change we did for the maxes 257 | bounds[1] = -bounds[1]; 258 | bounds[3] = -bounds[3]; 259 | 260 | // No intersection between quad and screen area 261 | if (bounds[0] >= bounds[1] || bounds[2] >= bounds[3]) 262 | { 263 | return false; 264 | } 265 | 266 | uint32_t minX = bounds[0]; 267 | uint32_t maxX = bounds[1]; 268 | uint32_t minY = bounds[2]; 269 | uint32_t maxY = bounds[3]; 270 | 271 | __m128i depth = packDepthPremultiplied(corners[2], corners[6]); 272 | 273 | uint16_t maxZ = uint16_t(0xFFFF ^ _mm_extract_epi16(_mm_minpos_epu16(_mm_xor_si128(depth, _mm_set1_epi16(-1))), 0)); 274 | 275 | if (!query2D(minX, maxX, minY, maxY, maxZ)) 276 | { 277 | return false; 278 | } 279 | 280 | return true; 281 | } 282 | 283 | bool Rasterizer::query2D(uint32_t minX, uint32_t maxX, uint32_t minY, uint32_t maxY, uint32_t maxZ) const 284 | { 285 | const uint16_t* pHiZBuffer = &*m_hiZ.begin(); 286 | const __m128i* pDepthBuffer = &*m_depthBuffer.begin(); 287 | 288 | uint32_t blockMinX = minX / 8; 289 | uint32_t blockMaxX = maxX / 8; 290 | 291 | uint32_t blockMinY = minY / 8; 292 | uint32_t blockMaxY = maxY / 8; 293 | 294 | __m128i maxZV = _mm_set1_epi16(uint16_t(maxZ)); 295 | 296 | // Pretest against Hi-Z 297 | for (uint32_t blockY = 
blockMinY; blockY <= blockMaxY; ++blockY) 298 | { 299 | uint32_t startY = std::max<int32_t>(minY - 8 * blockY, 0); 300 | uint32_t endY = std::min<int32_t>(maxY - 8 * blockY, 7); 301 | 302 | const uint16_t* pHiZ = pHiZBuffer + (blockY * m_blocksX + blockMinX); 303 | const __m128i* pBlockDepth = pDepthBuffer + 8 * (blockY * m_blocksX + blockMinX) + startY; 304 | 305 | bool interiorLine = (startY == 0) && (endY == 7); 306 | 307 | for (uint32_t blockX = blockMinX; blockX <= blockMaxX; ++blockX, ++pHiZ, pBlockDepth += 8) 308 | { 309 | // Skip this block if it fully occludes the query box 310 | if (maxZ <= *pHiZ) 311 | { 312 | continue; 313 | } 314 | 315 | uint32_t startX = std::max<int32_t>(minX - blockX * 8, 0); 316 | 317 | uint32_t endX = std::min<int32_t>(maxX - blockX * 8, 7); 318 | 319 | bool interiorBlock = interiorLine && (startX == 0) && (endX == 7); 320 | 321 | // No pixels are masked, so there exists one where maxZ > pixelZ, and the query region is visible 322 | if (interiorBlock) 323 | { 324 | return true; 325 | } 326 | 327 | uint16_t rowSelector = (0xFFFF << 2 * startX) & (0xFFFF >> 2 * (7 - endX)); 328 | 329 | const __m128i* pRowDepth = pBlockDepth; 330 | 331 | for (uint32_t y = startY; y <= endY; ++y) 332 | { 333 | __m128i rowDepth = *pRowDepth++; 334 | 335 | __m128i notVisible = _mm_cmpeq_epi16(_mm_min_epu16(rowDepth, maxZV), maxZV); 336 | 337 | uint32_t visiblePixelMask = ~_mm_movemask_epi8(notVisible); 338 | 339 | if ((rowSelector & visiblePixelMask) != 0) 340 | { 341 | return true; 342 | } 343 | } 344 | } 345 | } 346 | 347 | // Not visible 348 | return false; 349 | } 350 | 351 | void Rasterizer::readBackDepth(void* target) const 352 | { 353 | const float bias = 3.9623753e+28f; // 1.0f / floatCompressionBias 354 | 355 | for (uint32_t blockY = 0; blockY < m_blocksY; ++blockY) 356 | { 357 | for (uint32_t blockX = 0; blockX < m_blocksX; ++blockX) 358 | { 359 | if (m_hiZ[blockY * m_blocksX + blockX] == 1) 360 | { 361 | for (uint32_t y = 0; y < 8; ++y) 362 | { 363 | uint8_t* dest = (uint8_t*)target + 4 * (8 * blockX + m_width * (8 * blockY + y)); 364 | memset(dest, 0, 32); 365 | } 366 | continue; 367 | } 368 | 369 | const __m128i* source = &m_depthBuffer[8 * (blockY * m_blocksX + blockX)]; 370 | for (uint32_t y = 0; y < 8; ++y) 371 | { 372 | uint8_t* dest = (uint8_t*)target + 4 * (8 * blockX + m_width * (8 * blockY + y)); 373 | 374 | __m128i depthI = _mm_load_si128(source++); 375 | 376 | __m256i depthI256 = _mm256_slli_epi32(_mm256_cvtepu16_epi32(depthI), 12); 377 | __m256 depth = _mm256_mul_ps(_mm256_castsi256_ps(depthI256), _mm256_set1_ps(bias)); 378 | 379 | __m256 linDepth = _mm256_div_ps(_mm256_set1_ps(2 * 0.25f), _mm256_sub_ps(_mm256_set1_ps(0.25f + 1000.0f), _mm256_mul_ps(_mm256_sub_ps(_mm256_set1_ps(1.0f), depth), _mm256_set1_ps(1000.0f - 0.25f)))); 380 | 381 | float linDepthA[16]; 382 | _mm256_storeu_ps(linDepthA, linDepth); 383 | 384 | for (uint32_t x = 0; x < 8; ++x) 385 | { 386 | float l = linDepthA[x]; 387 | uint32_t d = static_cast<uint32_t>(100 * 256 * l); 388 | uint8_t v0 = uint8_t(d / 100); 389 | uint8_t v1 = d % 256; 390 | 391 | dest[4 * x + 0] = v0; 392 | dest[4 * x + 1] = v1; 393 | dest[4 * x + 2] = 0; 394 | dest[4 * x + 3] = 255; 395 | } 396 | } 397 | } 398 | } 399 | } 400 | 401 | __forceinline float Rasterizer::decompressFloat(uint16_t depth) 402 | { 403 | const float bias = 3.9623753e+28f; // 1.0f / floatCompressionBias 404 | 405 | union 406 | { 407 | uint32_t u; 408 | float f; 409 | }; 410 | 411 | u = uint32_t(depth) << 12; 412 | return f * bias; 413 | } 414 | 415 | __forceinline void 
Rasterizer::transpose256(__m256 A, __m256 B, __m256 C, __m256 D, __m128 out[8]) 416 | { 417 | __m256 _Tmp3, _Tmp2, _Tmp1, _Tmp0; 418 | _Tmp0 = _mm256_shuffle_ps(A, B, 0x44); 419 | _Tmp2 = _mm256_shuffle_ps(A, B, 0xEE); 420 | _Tmp1 = _mm256_shuffle_ps(C, D, 0x44); 421 | _Tmp3 = _mm256_shuffle_ps(C, D, 0xEE); 422 | 423 | A = _mm256_shuffle_ps(_Tmp0, _Tmp1, 0x88); 424 | B = _mm256_shuffle_ps(_Tmp0, _Tmp1, 0xDD); 425 | C = _mm256_shuffle_ps(_Tmp2, _Tmp3, 0x88); 426 | D = _mm256_shuffle_ps(_Tmp2, _Tmp3, 0xDD); 427 | 428 | _mm256_store_ps(reinterpret_cast<float*>(out + 0), A); 429 | _mm256_store_ps(reinterpret_cast<float*>(out + 2), B); 430 | _mm256_store_ps(reinterpret_cast<float*>(out + 4), C); 431 | _mm256_store_ps(reinterpret_cast<float*>(out + 6), D); 432 | } 433 | 434 | __forceinline void Rasterizer::transpose256i(__m256i A, __m256i B, __m256i C, __m256i D, __m128i out[8]) 435 | { 436 | __m256i _Tmp3, _Tmp2, _Tmp1, _Tmp0; 437 | _Tmp0 = _mm256_unpacklo_epi32(A, B); 438 | _Tmp1 = _mm256_unpacklo_epi32(C, D); 439 | _Tmp2 = _mm256_unpackhi_epi32(A, B); 440 | _Tmp3 = _mm256_unpackhi_epi32(C, D); 441 | A = _mm256_unpacklo_epi64(_Tmp0, _Tmp1); 442 | B = _mm256_unpackhi_epi64(_Tmp0, _Tmp1); 443 | C = _mm256_unpacklo_epi64(_Tmp2, _Tmp3); 444 | D = _mm256_unpackhi_epi64(_Tmp2, _Tmp3); 445 | 446 | _mm256_store_si256(reinterpret_cast<__m256i*>(out + 0), A); 447 | _mm256_store_si256(reinterpret_cast<__m256i*>(out + 2), B); 448 | _mm256_store_si256(reinterpret_cast<__m256i*>(out + 4), C); 449 | _mm256_store_si256(reinterpret_cast<__m256i*>(out + 6), D); 450 | } 451 | 452 | template<bool possiblyNearClipped> 453 | __forceinline void Rasterizer::normalizeEdge(__m256& nx, __m256& ny, __m256 edgeFlipMask) 454 | { 455 | __m256 minusZero = _mm256_set1_ps(-0.0f); 456 | __m256 invLen = _mm256_rcp_ps(_mm256_add_ps(_mm256_andnot_ps(minusZero, nx), _mm256_andnot_ps(minusZero, ny))); 457 | 458 | constexpr float maxOffset = -minEdgeOffset; 459 | __m256 mul = _mm256_set1_ps((OFFSET_QUANTIZATION_FACTOR - 1) / (maxOffset - minEdgeOffset)); 460 | if (possiblyNearClipped) 461 | { 462 | mul = _mm256_xor_ps(mul, edgeFlipMask); 463 | } 464 | 465 | invLen = _mm256_mul_ps(mul, invLen); 466 | nx = _mm256_mul_ps(nx, invLen); 467 | ny = _mm256_mul_ps(ny, invLen); 468 | } 469 | 470 | __forceinline __m128i Rasterizer::quantizeSlopeLookup(__m128 nx, __m128 ny) 471 | { 472 | __m128i yNeg = _mm_castps_si128(_mm_cmplt_ps(ny, _mm_setzero_ps())); 473 | 474 | // Remap [-1, 1] to [0, SLOPE_QUANTIZATION / 2] 475 | const float mul = (SLOPE_QUANTIZATION_FACTOR / 2 - 1) * 0.5f; 476 | const float add = mul + 0.5f; 477 | 478 | __m128i quantizedSlope = _mm_cvttps_epi32(_mm_fmadd_ps(nx, _mm_set1_ps(mul), _mm_set1_ps(add))); 479 | return _mm_slli_epi32(_mm_sub_epi32(_mm_slli_epi32(quantizedSlope, 1), yNeg), OFFSET_QUANTIZATION_BITS); 480 | } 481 | 482 | __forceinline __m256i Rasterizer::quantizeSlopeLookup(__m256 nx, __m256 ny) 483 | { 484 | __m256i yNeg = _mm256_castps_si256(_mm256_cmp_ps(ny, _mm256_setzero_ps(), _CMP_LE_OQ)); 485 | 486 | // Remap [-1, 1] to [0, SLOPE_QUANTIZATION / 2] 487 | constexpr float maxOffset = -minEdgeOffset; 488 | const float mul = (SLOPE_QUANTIZATION_FACTOR / 2 - 1) * 0.5f / ((OFFSET_QUANTIZATION_FACTOR - 1) / (maxOffset - minEdgeOffset)); 489 | const float add = (SLOPE_QUANTIZATION_FACTOR / 2 - 1) * 0.5f + 0.5f; 490 | 491 | __m256i quantizedSlope = _mm256_cvttps_epi32(_mm256_fmadd_ps(nx, _mm256_set1_ps(mul), _mm256_set1_ps(add))); 492 | return _mm256_slli_epi32(_mm256_sub_epi32(_mm256_slli_epi32(quantizedSlope, 1), yNeg), OFFSET_QUANTIZATION_BITS); 493 | } 494 | 495 | 496 | 
__forceinline uint32_t Rasterizer::quantizeOffsetLookup(float offset) 497 | { 498 | const float maxOffset = -minEdgeOffset; 499 | 500 | // Remap [minOffset, maxOffset] to [0, OFFSET_QUANTIZATION] 501 | const float mul = (OFFSET_QUANTIZATION_FACTOR - 1) / (maxOffset - minEdgeOffset); 502 | const float add = 0.5f - minEdgeOffset * mul; 503 | 504 | float lookup = offset * mul + add; 505 | return std::min(std::max(int32_t(lookup), 0), OFFSET_QUANTIZATION_FACTOR - 1); 506 | } 507 | 508 | __forceinline __m128i Rasterizer::packDepthPremultiplied(__m128 depthA, __m128 depthB) 509 | { 510 | return _mm_packus_epi32(_mm_srai_epi32(_mm_castps_si128(depthA), 12), _mm_srai_epi32(_mm_castps_si128(depthB), 12)); 511 | } 512 | 513 | __forceinline __m128i Rasterizer::packDepthPremultiplied(__m256 depth) 514 | { 515 | __m256i x = _mm256_srai_epi32(_mm256_castps_si256(depth), 12); 516 | return _mm_packus_epi32(_mm256_castsi256_si128(x), _mm256_extracti128_si256(x, 1)); 517 | } 518 | 519 | __forceinline __m256i Rasterizer::packDepthPremultiplied(__m256 depthA, __m256 depthB) 520 | { 521 | __m256i x1 = _mm256_srai_epi32(_mm256_castps_si256(depthA), 12); 522 | __m256i x2 = _mm256_srai_epi32(_mm256_castps_si256(depthB), 12); 523 | 524 | return _mm256_packus_epi32(x1, x2); 525 | } 526 | 527 | uint64_t Rasterizer::transposeMask(uint64_t mask) 528 | { 529 | #if 0 530 | uint64_t maskA = _pdep_u64(_pext_u64(mask, 0x5555555555555555ull), 0xF0F0F0F0F0F0F0F0ull); 531 | uint64_t maskB = _pdep_u64(_pext_u64(mask, 0xAAAAAAAAAAAAAAAAull), 0x0F0F0F0F0F0F0F0Full); 532 | #else 533 | uint64_t maskA = 0; 534 | uint64_t maskB = 0; 535 | for (uint32_t group = 0; group < 8; ++group) 536 | { 537 | for (uint32_t bit = 0; bit < 4; ++bit) 538 | { 539 | maskA |= ((mask >> (8 * group + 2 * bit + 0)) & 1) << (4 + group * 8 + bit); 540 | maskB |= ((mask >> (8 * group + 2 * bit + 1)) & 1) << (0 + group * 8 + bit); 541 | } 542 | } 543 | #endif 544 | return maskA | maskB; 545 | } 546 | 547 | void Rasterizer::precomputeRasterizationTable() 548 | { 549 | const uint32_t angularResolution = 2000; 550 | const uint32_t offsetResolution = 2000; 551 | 552 | m_precomputedRasterTables.resize(OFFSET_QUANTIZATION_FACTOR * SLOPE_QUANTIZATION_FACTOR, 0); 553 | 554 | for (uint32_t i = 0; i < angularResolution; ++i) 555 | { 556 | float angle = -0.1f + 6.4f * float(i) / (angularResolution - 1); 557 | 558 | float nx = std::cos(angle); 559 | float ny = std::sin(angle); 560 | float l = 1.0f / (std::abs(nx) + std::abs(ny)); 561 | 562 | nx *= l; 563 | ny *= l; 564 | 565 | uint32_t slopeLookup = _mm_extract_epi32(quantizeSlopeLookup(_mm_set1_ps(nx), _mm_set1_ps(ny)), 0); 566 | 567 | for (uint32_t j = 0; j < offsetResolution; ++j) 568 | { 569 | float offset = -0.6f + 1.2f * float(j) / (angularResolution - 1); 570 | 571 | uint32_t offsetLookup = quantizeOffsetLookup(offset); 572 | 573 | uint32_t lookup = slopeLookup | offsetLookup; 574 | 575 | uint64_t block = 0; 576 | 577 | for (auto x = 0; x < 8; ++x) 578 | { 579 | for (auto y = 0; y < 8; ++y) 580 | { 581 | float edgeDistance = offset + (x - 3.5f) / 8.0f * nx + (y - 3.5f) / 8.0f * ny; 582 | if (edgeDistance <= 0.0f) 583 | { 584 | uint32_t bitIndex = 8 * x + y; 585 | block |= uint64_t(1) << bitIndex; 586 | } 587 | } 588 | } 589 | 590 | m_precomputedRasterTables[lookup] |= transposeMask(block); 591 | } 592 | // For each slope, the first block should be all ones, the last all zeroes 593 | 594 | if (m_precomputedRasterTables[slopeLookup] != -1) 595 | { 596 | __debugbreak(); 597 | } 598 | 599 | if 
(m_precomputedRasterTables[slopeLookup + OFFSET_QUANTIZATION_FACTOR - 1] != 0) 600 | { 601 | __debugbreak(); 602 | } 603 | } 604 | } 605 | 606 | template<bool possiblyNearClipped> 607 | void Rasterizer::rasterize(const Occluder& occluder) 608 | { 609 | const __m256i* vertexData = occluder.m_vertexData; 610 | size_t packetCount = occluder.m_packetCount; 611 | 612 | __m256i maskY = _mm256_set1_epi32(2047 << 10); 613 | __m256i maskZ = _mm256_set1_epi32(1023); 614 | 615 | // Note that unaligned loads do not have a latency penalty on CPUs with SSE4 support 616 | __m128 mat0 = _mm_loadu_ps(m_modelViewProjection + 0); 617 | __m128 mat1 = _mm_loadu_ps(m_modelViewProjection + 4); 618 | __m128 mat2 = _mm_loadu_ps(m_modelViewProjection + 8); 619 | __m128 mat3 = _mm_loadu_ps(m_modelViewProjection + 12); 620 | 621 | __m128 boundsMin = occluder.m_refMin; 622 | __m128 boundsExtents = _mm_sub_ps(occluder.m_refMax, boundsMin); 623 | 624 | // Bake integer => bounding box transform into matrix 625 | mat3 = 626 | _mm_fmadd_ps(mat0, _mm_broadcastss_ps(boundsMin), 627 | _mm_fmadd_ps(mat1, _mm_permute_ps(boundsMin, _MM_SHUFFLE(1, 1, 1, 1)), 628 | _mm_fmadd_ps(mat2, _mm_permute_ps(boundsMin, _MM_SHUFFLE(2, 2, 2, 2)), 629 | mat3))); 630 | 631 | mat0 = _mm_mul_ps(mat0, _mm_mul_ps(_mm_broadcastss_ps(boundsExtents), _mm_set1_ps(1.0f / (2047ull << 21)))); 632 | mat1 = _mm_mul_ps(mat1, _mm_mul_ps(_mm_permute_ps(boundsExtents, _MM_SHUFFLE(1, 1, 1, 1)), _mm_set1_ps(1.0f / (2047 << 10)))); 633 | mat2 = _mm_mul_ps(mat2, _mm_mul_ps(_mm_permute_ps(boundsExtents, _MM_SHUFFLE(2, 2, 2, 2)), _mm_set1_ps(1.0f / 1023))); 634 | 635 | // Bias X coordinate back into positive range 636 | mat3 = _mm_fmadd_ps(mat0, _mm_set1_ps(1024ull << 21), mat3); 637 | 638 | // Skew projection to correct bleeding of Y and Z into X due to lack of masking 639 | mat1 = _mm_sub_ps(mat1, mat0); 640 | mat2 = _mm_sub_ps(mat2, mat0); 641 | 642 | _MM_TRANSPOSE4_PS(mat0, mat1, mat2, mat3); 643 | 644 | // Due to linear relationship between Z and W, it's cheaper to compute Z from W later in the pipeline than using the full projection matrix up front 645 | float c0, c1; 646 | { 647 | __m128 Za = _mm_permute_ps(mat2, _MM_SHUFFLE(3, 3, 3, 3)); 648 | __m128 Zb = _mm_dp_ps(mat2, _mm_setr_ps(1 << 21, 1 << 10, 1, 1), 0xFF); 649 | 650 | __m128 Wa = _mm_permute_ps(mat3, _MM_SHUFFLE(3, 3, 3, 3)); 651 | __m128 Wb = _mm_dp_ps(mat3, _mm_setr_ps(1 << 21, 1 << 10, 1, 1), 0xFF); 652 | 653 | _mm_store_ss(&c0, _mm_div_ps(_mm_sub_ps(Za, Zb), _mm_sub_ps(Wa, Wb))); 654 | _mm_store_ss(&c1, _mm_fnmadd_ps(_mm_div_ps(_mm_sub_ps(Za, Zb), _mm_sub_ps(Wa, Wb)), Wa, Za)); 655 | } 656 | 657 | for (uint32_t packetIdx = 0; packetIdx < packetCount; packetIdx += 4) 658 | { 659 | // Load data - only needed once per frame, so use streaming load 660 | __m256i I0 = _mm256_stream_load_si256(vertexData + packetIdx + 0); 661 | __m256i I1 = _mm256_stream_load_si256(vertexData + packetIdx + 1); 662 | __m256i I2 = _mm256_stream_load_si256(vertexData + packetIdx + 2); 663 | __m256i I3 = _mm256_stream_load_si256(vertexData + packetIdx + 3); 664 | 665 | // Vertex transformation - first W, then X & Y after camera plane culling, then Z after backface culling 666 | __m256 Xf0 = _mm256_cvtepi32_ps(I0); 667 | __m256 Xf1 = _mm256_cvtepi32_ps(I1); 668 | __m256 Xf2 = _mm256_cvtepi32_ps(I2); 669 | __m256 Xf3 = _mm256_cvtepi32_ps(I3); 670 | 671 | __m256 Yf0 = _mm256_cvtepi32_ps(_mm256_and_si256(I0, maskY)); 672 | __m256 Yf1 = _mm256_cvtepi32_ps(_mm256_and_si256(I1, maskY)); 673 | __m256 Yf2 = _mm256_cvtepi32_ps(_mm256_and_si256(I2, maskY)); 674 
| __m256 Yf3 = _mm256_cvtepi32_ps(_mm256_and_si256(I3, maskY)); 675 | 676 | __m256 Zf0 = _mm256_cvtepi32_ps(_mm256_and_si256(I0, maskZ)); 677 | __m256 Zf1 = _mm256_cvtepi32_ps(_mm256_and_si256(I1, maskZ)); 678 | __m256 Zf2 = _mm256_cvtepi32_ps(_mm256_and_si256(I2, maskZ)); 679 | __m256 Zf3 = _mm256_cvtepi32_ps(_mm256_and_si256(I3, maskZ)); 680 | 681 | __m256 mat00 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat0) + 0); 682 | __m256 mat01 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat0) + 1); 683 | __m256 mat02 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat0) + 2); 684 | __m256 mat03 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat0) + 3); 685 | 686 | __m256 X0 = _mm256_fmadd_ps(Xf0, mat00, _mm256_fmadd_ps(Yf0, mat01, _mm256_fmadd_ps(Zf0, mat02, mat03))); 687 | __m256 X1 = _mm256_fmadd_ps(Xf1, mat00, _mm256_fmadd_ps(Yf1, mat01, _mm256_fmadd_ps(Zf1, mat02, mat03))); 688 | __m256 X2 = _mm256_fmadd_ps(Xf2, mat00, _mm256_fmadd_ps(Yf2, mat01, _mm256_fmadd_ps(Zf2, mat02, mat03))); 689 | __m256 X3 = _mm256_fmadd_ps(Xf3, mat00, _mm256_fmadd_ps(Yf3, mat01, _mm256_fmadd_ps(Zf3, mat02, mat03))); 690 | 691 | __m256 mat10 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat1) + 0); 692 | __m256 mat11 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat1) + 1); 693 | __m256 mat12 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat1) + 2); 694 | __m256 mat13 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat1) + 3); 695 | 696 | __m256 Y0 = _mm256_fmadd_ps(Xf0, mat10, _mm256_fmadd_ps(Yf0, mat11, _mm256_fmadd_ps(Zf0, mat12, mat13))); 697 | __m256 Y1 = _mm256_fmadd_ps(Xf1, mat10, _mm256_fmadd_ps(Yf1, mat11, _mm256_fmadd_ps(Zf1, mat12, mat13))); 698 | __m256 Y2 = _mm256_fmadd_ps(Xf2, mat10, _mm256_fmadd_ps(Yf2, mat11, _mm256_fmadd_ps(Zf2, mat12, mat13))); 699 | __m256 Y3 = _mm256_fmadd_ps(Xf3, mat10, _mm256_fmadd_ps(Yf3, mat11, _mm256_fmadd_ps(Zf3, mat12, mat13))); 700 | 701 | __m256 mat30 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat3) + 0); 702 | __m256 mat31 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat3) + 1); 703 | __m256 mat32 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat3) + 2); 704 | __m256 mat33 = _mm256_broadcast_ss(reinterpret_cast<const float*>(&mat3) + 3); 705 | 706 | __m256 W0 = _mm256_fmadd_ps(Xf0, mat30, _mm256_fmadd_ps(Yf0, mat31, _mm256_fmadd_ps(Zf0, mat32, mat33))); 707 | __m256 W1 = _mm256_fmadd_ps(Xf1, mat30, _mm256_fmadd_ps(Yf1, mat31, _mm256_fmadd_ps(Zf1, mat32, mat33))); 708 | __m256 W2 = _mm256_fmadd_ps(Xf2, mat30, _mm256_fmadd_ps(Yf2, mat31, _mm256_fmadd_ps(Zf2, mat32, mat33))); 709 | __m256 W3 = _mm256_fmadd_ps(Xf3, mat30, _mm256_fmadd_ps(Yf3, mat31, _mm256_fmadd_ps(Zf3, mat32, mat33))); 710 | 711 | __m256 invW0, invW1, invW2, invW3; 712 | // Clamp W and invert 713 | if (possiblyNearClipped) 714 | { 715 | __m256 lowerBound = _mm256_set1_ps(-maxInvW); 716 | __m256 upperBound = _mm256_set1_ps(+maxInvW); 717 | invW0 = _mm256_min_ps(upperBound, _mm256_max_ps(lowerBound, _mm256_rcp_ps(W0))); 718 | invW1 = _mm256_min_ps(upperBound, _mm256_max_ps(lowerBound, _mm256_rcp_ps(W1))); 719 | invW2 = _mm256_min_ps(upperBound, _mm256_max_ps(lowerBound, _mm256_rcp_ps(W2))); 720 | invW3 = _mm256_min_ps(upperBound, _mm256_max_ps(lowerBound, _mm256_rcp_ps(W3))); 721 | } 722 | else 723 | { 724 | invW0 = _mm256_rcp_ps(W0); 725 | invW1 = _mm256_rcp_ps(W1); 726 | invW2 = _mm256_rcp_ps(W2); 727 | invW3 = _mm256_rcp_ps(W3); 728 | } 729 | 730 | // Round to integer coordinates to improve culling of zero-area triangles 731 | __m256 x0 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(X0, invW0), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 732 | __m256 
x1 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(X1, invW1), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 733 | __m256 x2 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(X2, invW2), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 734 | __m256 x3 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(X3, invW3), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 735 | 736 | __m256 y0 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(Y0, invW0), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 737 | __m256 y1 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(Y1, invW1), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 738 | __m256 y2 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(Y2, invW2), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 739 | __m256 y3 = _mm256_mul_ps(_mm256_round_ps(_mm256_mul_ps(Y3, invW3), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), _mm256_set1_ps(0.125f)); 740 | 741 | // Compute unnormalized edge directions 742 | __m256 edgeNormalsX0 = _mm256_sub_ps(y1, y0); 743 | __m256 edgeNormalsX1 = _mm256_sub_ps(y2, y1); 744 | __m256 edgeNormalsX2 = _mm256_sub_ps(y3, y2); 745 | __m256 edgeNormalsX3 = _mm256_sub_ps(y0, y3); 746 | 747 | __m256 edgeNormalsY0 = _mm256_sub_ps(x0, x1); 748 | __m256 edgeNormalsY1 = _mm256_sub_ps(x1, x2); 749 | __m256 edgeNormalsY2 = _mm256_sub_ps(x2, x3); 750 | __m256 edgeNormalsY3 = _mm256_sub_ps(x3, x0); 751 | 752 | __m256 area0 = _mm256_fmsub_ps(edgeNormalsX0, edgeNormalsY1, _mm256_mul_ps(edgeNormalsX1, edgeNormalsY0)); 753 | __m256 area1 = _mm256_fmsub_ps(edgeNormalsX1, edgeNormalsY2, _mm256_mul_ps(edgeNormalsX2, edgeNormalsY1)); 754 | __m256 area2 = _mm256_fmsub_ps(edgeNormalsX2, edgeNormalsY3, _mm256_mul_ps(edgeNormalsX3, edgeNormalsY2)); 755 | __m256 area3 = _mm256_sub_ps(_mm256_add_ps(area0, area2), area1); 756 | 757 | __m256 minusZero256 = _mm256_set1_ps(-0.0f); 758 | 759 | __m256 wSign0, wSign1, wSign2, wSign3; 760 | if (possiblyNearClipped) 761 | { 762 | wSign0 = _mm256_and_ps(invW0, minusZero256); 763 | wSign1 = _mm256_and_ps(invW1, minusZero256); 764 | wSign2 = _mm256_and_ps(invW2, minusZero256); 765 | wSign3 = _mm256_and_ps(invW3, minusZero256); 766 | } 767 | else 768 | { 769 | wSign0 = _mm256_setzero_ps(); 770 | wSign1 = _mm256_setzero_ps(); 771 | wSign2 = _mm256_setzero_ps(); 772 | wSign3 = _mm256_setzero_ps(); 773 | } 774 | 775 | // Compute signs of areas. We treat 0 as negative as this allows treating primitives with zero area as backfacing. 776 | __m256 areaSign0, areaSign1, areaSign2, areaSign3; 777 | if (possiblyNearClipped) 778 | { 779 | // Flip areas for each vertex with W < 0. This needs to be done before comparison against 0 rather than afterwards to make sure zero-area triangles are handled correctly. 
780 | areaSign0 = _mm256_cmp_ps(_mm256_xor_ps(_mm256_xor_ps(area0, wSign0), _mm256_xor_ps(wSign1, wSign2)), _mm256_setzero_ps(), _CMP_LE_OQ); 781 | areaSign1 = _mm256_and_ps(minusZero256, _mm256_cmp_ps(_mm256_xor_ps(_mm256_xor_ps(area1, wSign1), _mm256_xor_ps(wSign2, wSign3)), _mm256_setzero_ps(), _CMP_LE_OQ)); 782 | areaSign2 = _mm256_and_ps(minusZero256, _mm256_cmp_ps(_mm256_xor_ps(_mm256_xor_ps(area2, wSign0), _mm256_xor_ps(wSign2, wSign3)), _mm256_setzero_ps(), _CMP_LE_OQ)); 783 | areaSign3 = _mm256_and_ps(minusZero256, _mm256_cmp_ps(_mm256_xor_ps(_mm256_xor_ps(area3, wSign1), _mm256_xor_ps(wSign0, wSign3)), _mm256_setzero_ps(), _CMP_LE_OQ)); 784 | } 785 | else 786 | { 787 | areaSign0 = _mm256_cmp_ps(area0, _mm256_setzero_ps(), _CMP_LE_OQ); 788 | areaSign1 = _mm256_and_ps(minusZero256, _mm256_cmp_ps(area1, _mm256_setzero_ps(), _CMP_LE_OQ)); 789 | areaSign2 = _mm256_and_ps(minusZero256, _mm256_cmp_ps(area2, _mm256_setzero_ps(), _CMP_LE_OQ)); 790 | areaSign3 = _mm256_and_ps(minusZero256, _mm256_cmp_ps(area3, _mm256_setzero_ps(), _CMP_LE_OQ)); 791 | } 792 | 793 | __m256i config = _mm256_or_si256( 794 | _mm256_or_si256(_mm256_srli_epi32(_mm256_castps_si256(areaSign3), 28), _mm256_srli_epi32(_mm256_castps_si256(areaSign2), 29)), 795 | _mm256_or_si256(_mm256_srli_epi32(_mm256_castps_si256(areaSign1), 30), _mm256_srli_epi32(_mm256_castps_si256(areaSign0), 31))); 796 | 797 | if (possiblyNearClipped) 798 | { 799 | config = _mm256_or_si256(config, 800 | _mm256_or_si256( 801 | _mm256_or_si256(_mm256_srli_epi32(_mm256_castps_si256(wSign3), 24), _mm256_srli_epi32(_mm256_castps_si256(wSign2), 25)), 802 | _mm256_or_si256(_mm256_srli_epi32(_mm256_castps_si256(wSign1), 26), _mm256_srli_epi32(_mm256_castps_si256(wSign0), 27)))); 803 | } 804 | 805 | __m256i modes = _mm256_i32gather_epi32(modeTable, config, 4); 806 | if (_mm256_testz_si256(modes, modes)) 807 | { 808 | continue; 809 | } 810 | 811 | __m256i primitiveValid = _mm256_cmpgt_epi32(modes, _mm256_setzero_si256()); 812 | 813 | uint32_t primModes[8]; 814 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(primModes), modes); 815 | 816 | __m256 minFx, minFy, maxFx, maxFy; 817 | 818 | if (possiblyNearClipped) 819 | { 820 | // Clipless bounding box computation 821 | __m256 infP = _mm256_set1_ps(+10000.0f); 822 | __m256 infN = _mm256_set1_ps(-10000.0f); 823 | 824 | // Find interval of points with W > 0 825 | __m256 minPx0 = _mm256_blendv_ps(x0, infP, wSign0); 826 | __m256 minPx1 = _mm256_blendv_ps(x1, infP, wSign1); 827 | __m256 minPx2 = _mm256_blendv_ps(x2, infP, wSign2); 828 | __m256 minPx3 = _mm256_blendv_ps(x3, infP, wSign3); 829 | 830 | __m256 minPx = _mm256_min_ps( 831 | _mm256_min_ps(minPx0, minPx1), 832 | _mm256_min_ps(minPx2, minPx3)); 833 | 834 | __m256 minPy0 = _mm256_blendv_ps(y0, infP, wSign0); 835 | __m256 minPy1 = _mm256_blendv_ps(y1, infP, wSign1); 836 | __m256 minPy2 = _mm256_blendv_ps(y2, infP, wSign2); 837 | __m256 minPy3 = _mm256_blendv_ps(y3, infP, wSign3); 838 | 839 | __m256 minPy = _mm256_min_ps( 840 | _mm256_min_ps(minPy0, minPy1), 841 | _mm256_min_ps(minPy2, minPy3)); 842 | 843 | __m256 maxPx0 = _mm256_xor_ps(minPx0, wSign0); 844 | __m256 maxPx1 = _mm256_xor_ps(minPx1, wSign1); 845 | __m256 maxPx2 = _mm256_xor_ps(minPx2, wSign2); 846 | __m256 maxPx3 = _mm256_xor_ps(minPx3, wSign3); 847 | 848 | __m256 maxPx = _mm256_max_ps( 849 | _mm256_max_ps(maxPx0, maxPx1), 850 | _mm256_max_ps(maxPx2, maxPx3)); 851 | 852 | __m256 maxPy0 = _mm256_xor_ps(minPy0, wSign0); 853 | __m256 maxPy1 = _mm256_xor_ps(minPy1, wSign1); 854 | __m256 maxPy2 = 
_mm256_xor_ps(minPy2, wSign2); 855 | __m256 maxPy3 = _mm256_xor_ps(minPy3, wSign3); 856 | 857 | __m256 maxPy = _mm256_max_ps( 858 | _mm256_max_ps(maxPy0, maxPy1), 859 | _mm256_max_ps(maxPy2, maxPy3)); 860 | 861 | // Find interval of points with W < 0 862 | __m256 minNx0 = _mm256_blendv_ps(infP, x0, wSign0); 863 | __m256 minNx1 = _mm256_blendv_ps(infP, x1, wSign1); 864 | __m256 minNx2 = _mm256_blendv_ps(infP, x2, wSign2); 865 | __m256 minNx3 = _mm256_blendv_ps(infP, x3, wSign3); 866 | 867 | __m256 minNx = _mm256_min_ps( 868 | _mm256_min_ps(minNx0, minNx1), 869 | _mm256_min_ps(minNx2, minNx3)); 870 | 871 | __m256 minNy0 = _mm256_blendv_ps(infP, y0, wSign0); 872 | __m256 minNy1 = _mm256_blendv_ps(infP, y1, wSign1); 873 | __m256 minNy2 = _mm256_blendv_ps(infP, y2, wSign2); 874 | __m256 minNy3 = _mm256_blendv_ps(infP, y3, wSign3); 875 | 876 | __m256 minNy = _mm256_min_ps( 877 | _mm256_min_ps(minNy0, minNy1), 878 | _mm256_min_ps(minNy2, minNy3)); 879 | 880 | __m256 maxNx0 = _mm256_blendv_ps(infN, x0, wSign0); 881 | __m256 maxNx1 = _mm256_blendv_ps(infN, x1, wSign1); 882 | __m256 maxNx2 = _mm256_blendv_ps(infN, x2, wSign2); 883 | __m256 maxNx3 = _mm256_blendv_ps(infN, x3, wSign3); 884 | 885 | __m256 maxNx = _mm256_max_ps( 886 | _mm256_max_ps(maxNx0, maxNx1), 887 | _mm256_max_ps(maxNx2, maxNx3)); 888 | 889 | __m256 maxNy0 = _mm256_blendv_ps(infN, y0, wSign0); 890 | __m256 maxNy1 = _mm256_blendv_ps(infN, y1, wSign1); 891 | __m256 maxNy2 = _mm256_blendv_ps(infN, y2, wSign2); 892 | __m256 maxNy3 = _mm256_blendv_ps(infN, y3, wSign3); 893 | 894 | __m256 maxNy = _mm256_max_ps( 895 | _mm256_max_ps(maxNy0, maxNy1), 896 | _mm256_max_ps(maxNy2, maxNy3)); 897 | 898 | // Include interval bounds resp. infinity depending on ordering of intervals 899 | __m256 incAx = _mm256_blendv_ps(minPx, infN, _mm256_cmp_ps(maxNx, minPx, _CMP_GT_OQ)); 900 | __m256 incAy = _mm256_blendv_ps(minPy, infN, _mm256_cmp_ps(maxNy, minPy, _CMP_GT_OQ)); 901 | 902 | __m256 incBx = _mm256_blendv_ps(maxPx, infP, _mm256_cmp_ps(maxPx, minNx, _CMP_GT_OQ)); 903 | __m256 incBy = _mm256_blendv_ps(maxPy, infP, _mm256_cmp_ps(maxPy, minNy, _CMP_GT_OQ)); 904 | 905 | minFx = _mm256_min_ps(incAx, incBx); 906 | minFy = _mm256_min_ps(incAy, incBy); 907 | 908 | maxFx = _mm256_max_ps(incAx, incBx); 909 | maxFy = _mm256_max_ps(incAy, incBy); 910 | } 911 | else 912 | { 913 | // Standard bounding box inclusion 914 | minFx = _mm256_min_ps(_mm256_min_ps(x0, x1), _mm256_min_ps(x2, x3)); 915 | minFy = _mm256_min_ps(_mm256_min_ps(y0, y1), _mm256_min_ps(y2, y3)); 916 | 917 | maxFx = _mm256_max_ps(_mm256_max_ps(x0, x1), _mm256_max_ps(x2, x3)); 918 | maxFy = _mm256_max_ps(_mm256_max_ps(y0, y1), _mm256_max_ps(y2, y3)); 919 | } 920 | 921 | // Clamp and round 922 | __m256i minX, minY, maxX, maxY; 923 | minX = _mm256_max_epi32(_mm256_cvttps_epi32(_mm256_add_ps(minFx, _mm256_set1_ps(4.9999f / 8.0f))), _mm256_setzero_si256()); 924 | minY = _mm256_max_epi32(_mm256_cvttps_epi32(_mm256_add_ps(minFy, _mm256_set1_ps(4.9999f / 8.0f))), _mm256_setzero_si256()); 925 | maxX = _mm256_min_epi32(_mm256_cvttps_epi32(_mm256_add_ps(maxFx, _mm256_set1_ps(11.0f / 8.0f))), _mm256_set1_epi32(m_blocksX)); 926 | maxY = _mm256_min_epi32(_mm256_cvttps_epi32(_mm256_add_ps(maxFy, _mm256_set1_ps(11.0f / 8.0f))), _mm256_set1_epi32(m_blocksY)); 927 | 928 | // Check overlap between bounding box and frustum 929 | __m256i inFrustum = _mm256_and_si256(_mm256_cmpgt_epi32(maxX, minX), _mm256_cmpgt_epi32(maxY, minY)); 930 | primitiveValid = _mm256_and_si256(inFrustum, primitiveValid); 931 | 932 | if 
(_mm256_testz_si256(primitiveValid, primitiveValid)) 933 | { 934 | continue; 935 | } 936 | 937 | // Convert bounds from [min, max] to [min, range] 938 | __m256i rangeX = _mm256_sub_epi32(maxX, minX); 939 | __m256i rangeY = _mm256_sub_epi32(maxY, minY); 940 | 941 | // Compute Z from linear relation with 1/W 942 | __m256 z0, z1, z2, z3; 943 | __m256 C0 = _mm256_broadcast_ss(&c0); 944 | __m256 C1 = _mm256_broadcast_ss(&c1); 945 | z0 = _mm256_fmadd_ps(invW0, C1, C0); 946 | z1 = _mm256_fmadd_ps(invW1, C1, C0); 947 | z2 = _mm256_fmadd_ps(invW2, C1, C0); 948 | z3 = _mm256_fmadd_ps(invW3, C1, C0); 949 | 950 | __m256 maxZ = _mm256_max_ps(_mm256_max_ps(z0, z1), _mm256_max_ps(z2, z3)); 951 | 952 | // If any W < 0, assume maxZ = 1 (effectively disabling Hi-Z) 953 | if (possiblyNearClipped) 954 | { 955 | maxZ = _mm256_blendv_ps(maxZ, _mm256_set1_ps(1.0f), _mm256_or_ps(_mm256_or_ps(wSign0, wSign1), _mm256_or_ps(wSign2, wSign3))); 956 | } 957 | 958 | __m128i packedDepthBounds = packDepthPremultiplied(maxZ); 959 | 960 | uint16_t depthBounds[8]; 961 | _mm_storeu_si128(reinterpret_cast<__m128i*>(depthBounds), packedDepthBounds); 962 | 963 | // Compute screen space depth plane 964 | __m256 greaterArea = _mm256_cmp_ps(_mm256_andnot_ps(minusZero256, area0), _mm256_andnot_ps(minusZero256, area2), _CMP_LT_OQ); 965 | 966 | // Force triangle area to be picked in the relevant mode. 967 | __m256 modeTriangle0 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(modes, _mm256_set1_epi32(Triangle0))); 968 | __m256 modeTriangle1 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(modes, _mm256_set1_epi32(Triangle1))); 969 | greaterArea = _mm256_andnot_ps(modeTriangle0, _mm256_or_ps(modeTriangle1, greaterArea)); 970 | 971 | 972 | __m256 invArea; 973 | if (possiblyNearClipped) 974 | { 975 | // Do a precise division to reduce error in depth plane. Note that the area computed here 976 | // differs from the rasterized region if W < 0, so it can be very small for large covered screen regions.
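// Added commentary (illustrative only): _mm256_rcp_ps yields roughly 12 bits of relative precision,
// which is sufficient in the common well-conditioned case handled by the else-branch, whereas the
// near-clipped path below pays for a full IEEE divide because the blended area can become tiny even
// when the primitive covers a large screen region. In scalar terms the choice amounts to
//   invArea = needPrecise ? 1.0f / area : approximateReciprocal(area);
// where approximateReciprocal stands in for the hardware reciprocal estimate.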
977 | invArea = _mm256_div_ps(_mm256_set1_ps(1.0f), _mm256_blendv_ps(area0, area2, greaterArea)); 978 | } 979 | else 980 | { 981 | invArea = _mm256_rcp_ps(_mm256_blendv_ps(area0, area2, greaterArea)); 982 | } 983 | 984 | __m256 z12 = _mm256_sub_ps(z1, z2); 985 | __m256 z20 = _mm256_sub_ps(z2, z0); 986 | __m256 z30 = _mm256_sub_ps(z3, z0); 987 | 988 | 989 | __m256 edgeNormalsX4 = _mm256_sub_ps(y0, y2); 990 | __m256 edgeNormalsY4 = _mm256_sub_ps(x2, x0); 991 | 992 | __m256 depthPlane0, depthPlane1, depthPlane2; 993 | depthPlane1 = _mm256_mul_ps(invArea, _mm256_blendv_ps(_mm256_fmsub_ps(z20, edgeNormalsX1, _mm256_mul_ps(z12, edgeNormalsX4)), _mm256_fnmadd_ps(z20, edgeNormalsX3, _mm256_mul_ps(z30, edgeNormalsX4)), greaterArea)); 994 | depthPlane2 = _mm256_mul_ps(invArea, _mm256_blendv_ps(_mm256_fmsub_ps(z20, edgeNormalsY1, _mm256_mul_ps(z12, edgeNormalsY4)), _mm256_fnmadd_ps(z20, edgeNormalsY3, _mm256_mul_ps(z30, edgeNormalsY4)), greaterArea)); 995 | 996 | x0 = _mm256_sub_ps(x0, _mm256_cvtepi32_ps(minX)); 997 | y0 = _mm256_sub_ps(y0, _mm256_cvtepi32_ps(minY)); 998 | 999 | depthPlane0 = _mm256_fnmadd_ps(x0, depthPlane1, _mm256_fnmadd_ps(y0, depthPlane2, z0)); 1000 | 1001 | // If mode == Triangle0, replace edge 2 with edge 4; if mode == Triangle1, replace edge 0 with edge 4 1002 | edgeNormalsX2 = _mm256_blendv_ps(edgeNormalsX2, edgeNormalsX4, modeTriangle0); 1003 | edgeNormalsY2 = _mm256_blendv_ps(edgeNormalsY2, edgeNormalsY4, modeTriangle0); 1004 | edgeNormalsX0 = _mm256_blendv_ps(edgeNormalsX0, _mm256_xor_ps(minusZero256, edgeNormalsX4), modeTriangle1); 1005 | edgeNormalsY0 = _mm256_blendv_ps(edgeNormalsY0, _mm256_xor_ps(minusZero256, edgeNormalsY4), modeTriangle1); 1006 | 1007 | // Flip edges if W < 0 1008 | __m256 edgeFlipMask0, edgeFlipMask1, edgeFlipMask2, edgeFlipMask3; 1009 | if (possiblyNearClipped) 1010 | { 1011 | edgeFlipMask0 = _mm256_xor_ps(wSign0, _mm256_blendv_ps(wSign1, wSign2, modeTriangle1)); 1012 | edgeFlipMask1 = _mm256_xor_ps(wSign1, wSign2); 1013 | edgeFlipMask2 = _mm256_xor_ps(wSign2, _mm256_blendv_ps(wSign3, wSign0, modeTriangle0)); 1014 | edgeFlipMask3 = _mm256_xor_ps(wSign0, wSign3); 1015 | } 1016 | else 1017 | { 1018 | edgeFlipMask0 = _mm256_setzero_ps(); 1019 | edgeFlipMask1 = _mm256_setzero_ps(); 1020 | edgeFlipMask2 = _mm256_setzero_ps(); 1021 | edgeFlipMask3 = _mm256_setzero_ps(); 1022 | } 1023 | 1024 | // Normalize edge equations for lookup 1025 | normalizeEdge(edgeNormalsX0, edgeNormalsY0, edgeFlipMask0); 1026 | normalizeEdge(edgeNormalsX1, edgeNormalsY1, edgeFlipMask1); 1027 | normalizeEdge(edgeNormalsX2, edgeNormalsY2, edgeFlipMask2); 1028 | normalizeEdge(edgeNormalsX3, edgeNormalsY3, edgeFlipMask3); 1029 | 1030 | const float maxOffset = -minEdgeOffset; 1031 | __m256 add256 = _mm256_set1_ps(0.5f - minEdgeOffset * (OFFSET_QUANTIZATION_FACTOR - 1) / (maxOffset - minEdgeOffset)); 1032 | __m256 edgeOffsets0, edgeOffsets1, edgeOffsets2, edgeOffsets3; 1033 | 1034 | edgeOffsets0 = _mm256_fnmadd_ps(x0, edgeNormalsX0, _mm256_fnmadd_ps(y0, edgeNormalsY0, add256)); 1035 | edgeOffsets1 = _mm256_fnmadd_ps(x1, edgeNormalsX1, _mm256_fnmadd_ps(y1, edgeNormalsY1, add256)); 1036 | edgeOffsets2 = _mm256_fnmadd_ps(x2, edgeNormalsX2, _mm256_fnmadd_ps(y2, edgeNormalsY2, add256)); 1037 | edgeOffsets3 = _mm256_fnmadd_ps(x3, edgeNormalsX3, _mm256_fnmadd_ps(y3, edgeNormalsY3, add256)); 1038 | 1039 | edgeOffsets1 = _mm256_fmadd_ps(_mm256_cvtepi32_ps(minX), edgeNormalsX1, edgeOffsets1); 1040 | edgeOffsets2 = _mm256_fmadd_ps(_mm256_cvtepi32_ps(minX), edgeNormalsX2, edgeOffsets2); 1041 | 
edgeOffsets3 = _mm256_fmadd_ps(_mm256_cvtepi32_ps(minX), edgeNormalsX3, edgeOffsets3); 1042 | 1043 | edgeOffsets1 = _mm256_fmadd_ps(_mm256_cvtepi32_ps(minY), edgeNormalsY1, edgeOffsets1); 1044 | edgeOffsets2 = _mm256_fmadd_ps(_mm256_cvtepi32_ps(minY), edgeNormalsY2, edgeOffsets2); 1045 | edgeOffsets3 = _mm256_fmadd_ps(_mm256_cvtepi32_ps(minY), edgeNormalsY3, edgeOffsets3); 1046 | 1047 | // Quantize slopes 1048 | __m256i slopeLookups0, slopeLookups1, slopeLookups2, slopeLookups3; 1049 | slopeLookups0 = quantizeSlopeLookup(edgeNormalsX0, edgeNormalsY0); 1050 | slopeLookups1 = quantizeSlopeLookup(edgeNormalsX1, edgeNormalsY1); 1051 | slopeLookups2 = quantizeSlopeLookup(edgeNormalsX2, edgeNormalsY2); 1052 | slopeLookups3 = quantizeSlopeLookup(edgeNormalsX3, edgeNormalsY3); 1053 | 1054 | __m256i firstBlockIdx = _mm256_add_epi32(_mm256_mullo_epi16(minY, _mm256_set1_epi32(m_blocksX)), minX); 1055 | 1056 | uint32_t firstBlocks[8]; 1057 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(firstBlocks), firstBlockIdx); 1058 | 1059 | uint32_t rangesX[8]; 1060 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(rangesX), rangeX); 1061 | 1062 | uint32_t rangesY[8]; 1063 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(rangesY), rangeY); 1064 | 1065 | // Transpose into AoS 1066 | __m128 depthPlane[8]; 1067 | transpose256(depthPlane0, depthPlane1, depthPlane2, _mm256_setzero_ps(), depthPlane); 1068 | 1069 | __m128 edgeNormalsX[8]; 1070 | transpose256(edgeNormalsX0, edgeNormalsX1, edgeNormalsX2, edgeNormalsX3, edgeNormalsX); 1071 | 1072 | __m128 edgeNormalsY[8]; 1073 | transpose256(edgeNormalsY0, edgeNormalsY1, edgeNormalsY2, edgeNormalsY3, edgeNormalsY); 1074 | 1075 | __m128 edgeOffsets[8]; 1076 | transpose256(edgeOffsets0, edgeOffsets1, edgeOffsets2, edgeOffsets3, edgeOffsets); 1077 | 1078 | __m128i slopeLookups[8]; 1079 | transpose256i(slopeLookups0, slopeLookups1, slopeLookups2, slopeLookups3, slopeLookups); 1080 | 1081 | uint32_t validMask = _mm256_movemask_ps(_mm256_castsi256_ps(primitiveValid)); 1082 | 1083 | // Fetch data pointers since we'll manually strength-reduce memory arithmetic 1084 | const int64_t* pTable = &*m_precomputedRasterTables.begin(); 1085 | uint16_t* pHiZBuffer = &*m_hiZ.begin(); 1086 | __m128i* pDepthBuffer = &*m_depthBuffer.begin(); 1087 | 1088 | // Loop over set bits 1089 | unsigned long primitiveIdx; 1090 | while (_BitScanForward(&primitiveIdx, validMask)) 1091 | { 1092 | // Clear lowest set bit in mask 1093 | validMask &= validMask - 1; 1094 | 1095 | uint32_t primitiveIdxTransposed = ((primitiveIdx << 1) & 7) | (primitiveIdx >> 2); 1096 | 1097 | // Extract and prepare per-primitive data 1098 | uint16_t primitiveMaxZ = depthBounds[primitiveIdx]; 1099 | 1100 | __m256 depthDx = _mm256_broadcastss_ps(_mm_permute_ps(depthPlane[primitiveIdxTransposed], _MM_SHUFFLE(1, 1, 1, 1))); 1101 | __m256 depthDy = _mm256_broadcastss_ps(_mm_permute_ps(depthPlane[primitiveIdxTransposed], _MM_SHUFFLE(2, 2, 2, 2))); 1102 | 1103 | const float depthSamplePos = -0.5f + 1.0f / 16.0f; 1104 | __m256 lineDepth = 1105 | _mm256_fmadd_ps(depthDx, _mm256_setr_ps(depthSamplePos + 0.0f, depthSamplePos + 0.125f, depthSamplePos + 0.25f, depthSamplePos + 0.375f, depthSamplePos + 0.0f, depthSamplePos + 0.125f, depthSamplePos + 0.25f, depthSamplePos + 0.375f), 1106 | _mm256_fmadd_ps(depthDy, _mm256_setr_ps(depthSamplePos + 0.0f, depthSamplePos + 0.0f, depthSamplePos + 0.0f, depthSamplePos + 0.0f, depthSamplePos + 0.125f, depthSamplePos + 0.125f, depthSamplePos + 0.125f, depthSamplePos + 0.125f), 1107 | 
_mm256_broadcastss_ps(depthPlane[primitiveIdxTransposed]))); 1108 | 1109 | __m128i slopeLookup = slopeLookups[primitiveIdxTransposed]; 1110 | __m128 edgeNormalX = edgeNormalsX[primitiveIdxTransposed]; 1111 | __m128 edgeNormalY = edgeNormalsY[primitiveIdxTransposed]; 1112 | __m128 lineOffset = edgeOffsets[primitiveIdxTransposed]; 1113 | 1114 | const uint32_t blocksX = m_blocksX; 1115 | 1116 | const uint32_t firstBlock = firstBlocks[primitiveIdx]; 1117 | const uint32_t blockRangeX = rangesX[primitiveIdx]; 1118 | const uint32_t blockRangeY = rangesY[primitiveIdx]; 1119 | 1120 | uint16_t* pPrimitiveHiZ = pHiZBuffer + firstBlock; 1121 | __m256i* pPrimitiveOut = reinterpret_cast<__m256i*>(pDepthBuffer) + 4 * firstBlock; 1122 | 1123 | uint32_t primitiveMode = primModes[primitiveIdx]; 1124 | 1125 | for (uint32_t blockY = 0; 1126 | blockY < blockRangeY; 1127 | ++blockY, 1128 | pPrimitiveHiZ += blocksX, 1129 | pPrimitiveOut += 4 * blocksX, 1130 | lineDepth = _mm256_add_ps(lineDepth, depthDy), 1131 | lineOffset = _mm_add_ps(lineOffset, edgeNormalY)) 1132 | { 1133 | uint16_t* pBlockRowHiZ = pPrimitiveHiZ; 1134 | __m256i* out = pPrimitiveOut; 1135 | 1136 | __m128 offset = lineOffset; 1137 | __m256 depth = lineDepth; 1138 | 1139 | bool anyBlockHit = false; 1140 | for (uint32_t blockX = 0; 1141 | blockX < blockRangeX; 1142 | ++blockX, 1143 | pBlockRowHiZ += 1, 1144 | out += 4, 1145 | depth = _mm256_add_ps(depthDx, depth), 1146 | offset = _mm_add_ps(edgeNormalX, offset)) 1147 | { 1148 | uint16_t hiZ = *pBlockRowHiZ; 1149 | if (hiZ >= primitiveMaxZ) 1150 | { 1151 | continue; 1152 | } 1153 | 1154 | uint64_t blockMask; 1155 | if (primitiveMode == Convex) // 83-97% 1156 | { 1157 | // Simplified conservative test: combined block mask will be zero if any offset is outside of range 1158 | __m128 anyOffsetOutsideMask = _mm_cmpge_ps(offset, _mm_set1_ps(OFFSET_QUANTIZATION_FACTOR - 1)); 1159 | if (!_mm_testz_ps(anyOffsetOutsideMask, anyOffsetOutsideMask)) 1160 | { 1161 | if (anyBlockHit) 1162 | { 1163 | // Convexity implies we won't hit another block in this row and can skip to the next line. 1164 | break; 1165 | } 1166 | continue; 1167 | } 1168 | 1169 | anyBlockHit = true; 1170 | 1171 | __m128i offsetClamped = _mm_max_epi32(_mm_cvttps_epi32(offset), _mm_setzero_si128()); 1172 | 1173 | static uint64_t totalBlocks = 0; 1174 | static uint64_t containedBlocks = 0; 1175 | 1176 | __m128i lookup = _mm_or_si128(slopeLookup, offsetClamped); 1177 | 1178 | // Generate block mask 1179 | uint64_t A = pTable[uint32_t(_mm_cvtsi128_si32(lookup))]; 1180 | uint64_t B = pTable[uint32_t(_mm_extract_epi32(lookup, 1))]; 1181 | uint64_t C = pTable[uint32_t(_mm_extract_epi32(lookup, 2))]; 1182 | uint64_t D = pTable[uint32_t(_mm_extract_epi32(lookup, 3))]; 1183 | 1184 | blockMask = (A & B) & (C & D); 1185 | 1186 | // It is possible but very unlikely that blockMask == 0 if all A,B,C,D != 0 according to the conservative test above, so we skip the additional branch here. 
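// Added commentary (sketch, not from the original source): each of the four table fetches above
// returns a precomputed 64-bit coverage mask for this 8x8-pixel block, one bit per pixel, indexed by
// the edge's quantized slope and offset. For a convex quad the covered pixels are the intersection of
// the four half-planes, which is exactly what the bitwise AND computes:
//   uint64_t convexCoverage = maskEdge0 & maskEdge1 & maskEdge2 & maskEdge3;   // == (A & B) & (C & D)
// The triangle and concave modes handled in the else-branch below only change how A..D are combined.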
1187 | } 1188 | else 1189 | { 1190 | __m128i offsetClamped = _mm_min_epi32(_mm_max_epi32(_mm_cvttps_epi32(offset), _mm_setzero_si128()), _mm_set1_epi32(OFFSET_QUANTIZATION_FACTOR - 1)); 1191 | __m128i lookup = _mm_or_si128(slopeLookup, offsetClamped); 1192 | 1193 | // Generate block mask 1194 | uint64_t A = pTable[uint32_t(_mm_cvtsi128_si32(lookup))]; 1195 | uint64_t B = pTable[uint32_t(_mm_extract_epi32(lookup, 1))]; 1196 | uint64_t C = pTable[uint32_t(_mm_extract_epi32(lookup, 2))]; 1197 | uint64_t D = pTable[uint32_t(_mm_extract_epi32(lookup, 3))]; 1198 | 1199 | // Switch over primitive mode. MSVC compiles this as a "sub eax, 1; jz label;" ladder, so the mode enum is ordered by descending frequency of occurrence 1200 | // to optimize branch efficiency. By ensuring we have a default case that falls through to the last possible value (ConcaveLeft if not near clipped, 1201 | // ConcaveCenter otherwise) we avoid the last branch in the ladder. 1202 | switch (primitiveMode) 1203 | { 1204 | case Triangle0: // 2.3-11% 1205 | blockMask = A & B & C; 1206 | break; 1207 | 1208 | case Triangle1: // 0.1-4% 1209 | blockMask = A & C & D; 1210 | break; 1211 | 1212 | case ConcaveRight: // 0.01-0.9% 1213 | blockMask = (A | D) & (B & C); 1214 | break; 1215 | 1216 | default: 1217 | // Case ConcaveCenter can only occur if any W < 0 1218 | if (possiblyNearClipped) 1219 | { 1220 | // case ConcaveCenter: // < 1e-6% 1221 | blockMask = (A & B) | (C & D); 1222 | break; 1223 | } 1224 | else 1225 | { 1226 | // Fall-through 1227 | } 1228 | 1229 | case ConcaveLeft: // 0.01-0.6% 1230 | blockMask = (A & D) & (B | C); 1231 | break; 1232 | } 1233 | 1234 | // No pixels covered => skip block 1235 | if (!blockMask) 1236 | { 1237 | continue; 1238 | } 1239 | } 1240 | 1241 | // Generate depth values around block 1242 | __m256 depth0 = depth; 1243 | __m256 depth1 = _mm256_fmadd_ps(depthDx, _mm256_set1_ps(0.5f), depth0); 1244 | __m256 depth8 = _mm256_add_ps(depthDy, depth0); 1245 | __m256 depth9 = _mm256_add_ps(depthDy, depth1); 1246 | 1247 | // Pack depth 1248 | __m256i d0 = packDepthPremultiplied(depth0, depth1); 1249 | __m256i d4 = packDepthPremultiplied(depth8, depth9); 1250 | 1251 | // Interpolate remaining values in packed space 1252 | __m256i d2 = _mm256_avg_epu16(d0, d4); 1253 | __m256i d1 = _mm256_avg_epu16(d0, d2); 1254 | __m256i d3 = _mm256_avg_epu16(d2, d4); 1255 | 1256 | // Not all pixels covered - mask depth 1257 | if (blockMask != -1) 1258 | { 1259 | __m128i A = _mm_cvtsi64x_si128(blockMask); 1260 | __m128i B = _mm_slli_epi64(A, 4); 1261 | __m256i C = _mm256_inserti128_si256(_mm256_castsi128_si256(A), B, 1); 1262 | __m256i rowMask = _mm256_unpacklo_epi8(C, C); 1263 | 1264 | d0 = _mm256_blendv_epi8(_mm256_setzero_si256(), d0, _mm256_slli_epi16(rowMask, 3)); 1265 | d1 = _mm256_blendv_epi8(_mm256_setzero_si256(), d1, _mm256_slli_epi16(rowMask, 2)); 1266 | d2 = _mm256_blendv_epi8(_mm256_setzero_si256(), d2, _mm256_add_epi16(rowMask, rowMask)); 1267 | d3 = _mm256_blendv_epi8(_mm256_setzero_si256(), d3, rowMask); 1268 | } 1269 | 1270 | // Test fast clear flag 1271 | if (hiZ != 1) 1272 | { 1273 | // Merge depth values 1274 | d0 = _mm256_max_epu16(_mm256_load_si256(out + 0), d0); 1275 | d1 = _mm256_max_epu16(_mm256_load_si256(out + 1), d1); 1276 | d2 = _mm256_max_epu16(_mm256_load_si256(out + 2), d2); 1277 | d3 = _mm256_max_epu16(_mm256_load_si256(out + 3), d3); 1278 | } 1279 | 1280 | // Store back new depth 1281 | _mm256_store_si256(out + 0, d0); 1282 | _mm256_store_si256(out + 1, d1); 1283 | _mm256_store_si256(out + 2,
d2); 1284 | _mm256_store_si256(out + 3, d3); 1285 | 1286 | // Update HiZ 1287 | __m256i newMinZ = _mm256_min_epu16(_mm256_min_epu16(d0, d1), _mm256_min_epu16(d2, d3)); 1288 | __m128i newMinZ16 = _mm_minpos_epu16(_mm_min_epu16(_mm256_castsi256_si128(newMinZ), _mm256_extracti128_si256(newMinZ, 1))); 1289 | 1290 | *pBlockRowHiZ = uint16_t(uint32_t(_mm_cvtsi128_si32(newMinZ16))); 1291 | } 1292 | } 1293 | } 1294 | } 1295 | } 1296 | 1297 | // Force template instantiations 1298 | template void Rasterizer::rasterize(const Occluder& occluder); 1299 | template void Rasterizer::rasterize(const Occluder& occluder); -------------------------------------------------------------------------------- /SoftwareRasterizer/Rasterizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | struct Occluder; 9 | 10 | class Rasterizer 11 | { 12 | public: 13 | Rasterizer(uint32_t width, uint32_t height); 14 | 15 | void setModelViewProjection(const float* matrix); 16 | 17 | void clear(); 18 | 19 | template 20 | void rasterize(const Occluder& occluder); 21 | 22 | bool queryVisibility(__m128 boundsMin, __m128 boundsMax, bool& needsClipping); 23 | 24 | bool query2D(uint32_t minX, uint32_t maxX, uint32_t minY, uint32_t maxY, uint32_t maxZ) const; 25 | 26 | void readBackDepth(void* target) const; 27 | 28 | private: 29 | static float decompressFloat(uint16_t depth); 30 | 31 | static void transpose256(__m256 A, __m256 B, __m256 C, __m256 D, __m128 out[8]); 32 | static void transpose256i(__m256i A, __m256i B, __m256i C, __m256i D, __m128i out[8]); 33 | 34 | template 35 | static void normalizeEdge(__m256& nx, __m256& ny, __m256 edgeFlipMask); 36 | 37 | static __m128i quantizeSlopeLookup(__m128 nx, __m128 ny); 38 | static __m256i quantizeSlopeLookup(__m256 nx, __m256 ny); 39 | 40 | static uint32_t quantizeOffsetLookup(float offset); 41 | 42 | static __m128i packDepthPremultiplied(__m128 depthA, __m128 depthB); 43 | static __m256i packDepthPremultiplied(__m256 depthA, __m256 depthB); 44 | static __m128i packDepthPremultiplied(__m256 depth); 45 | 46 | static uint64_t transposeMask(uint64_t mask); 47 | 48 | void precomputeRasterizationTable(); 49 | 50 | float m_modelViewProjection[16]; 51 | float m_modelViewProjectionRaw[16]; 52 | 53 | std::vector m_precomputedRasterTables; 54 | std::vector<__m128i> m_depthBuffer; 55 | std::vector m_hiZ; 56 | 57 | uint32_t m_width; 58 | uint32_t m_height; 59 | uint32_t m_blocksX; 60 | uint32_t m_blocksY; 61 | }; 62 | -------------------------------------------------------------------------------- /SoftwareRasterizer/SoftwareRasterizer.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | x64 7 | 8 | 9 | Release 10 | x64 11 | 12 | 13 | 14 | {7E7E35CC-2069-43BB-9056-49DFEF9EEE56} 15 | Win32Proj 16 | SoftwareRasterizer 17 | 10.0.10586.0 18 | 19 | 20 | 21 | Application 22 | true 23 | v142 24 | Unicode 25 | 26 | 27 | Application 28 | false 29 | v142 30 | true 31 | Unicode 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | true 47 | 48 | 49 | false 50 | 51 | 52 | 53 | NotUsing 54 | Level3 55 | Disabled 56 | _DEBUG;_WINDOWS;%(PreprocessorDefinitions) 57 | true 58 | true 59 | AdvancedVectorExtensions2 60 | 61 | 62 | Windows 63 | true 64 | 65 | 66 | 67 | 68 | Level3 69 | NotUsing 70 | MaxSpeed 71 | true 72 | true 73 | NDEBUG;_WINDOWS;%(PreprocessorDefinitions) 74 | true 75 | true 76 | AdvancedVectorExtensions2 77 | 
false 78 | AnySuitable 79 | Speed 80 | 81 | 82 | Windows 83 | true 84 | true 85 | true 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /SoftwareRasterizer/SoftwareRasterizer.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | 14 | 15 | Header Files 16 | 17 | 18 | Header Files 19 | 20 | 21 | Header Files 22 | 23 | 24 | Header Files 25 | 26 | 27 | Header Files 28 | 29 | 30 | 31 | 32 | Source Files 33 | 34 | 35 | Source Files 36 | 37 | 38 | Source Files 39 | 40 | 41 | Source Files 42 | 43 | 44 | Source Files 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /SoftwareRasterizer/Sponza/IndexBuffer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rawrunprotected/rasterizer/50a1c132c24e85aaa7ef00a6337610ffc04c403e/SoftwareRasterizer/Sponza/IndexBuffer.bin -------------------------------------------------------------------------------- /SoftwareRasterizer/Sponza/VertexBuffer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rawrunprotected/rasterizer/50a1c132c24e85aaa7ef00a6337610ffc04c403e/SoftwareRasterizer/Sponza/VertexBuffer.bin -------------------------------------------------------------------------------- /SoftwareRasterizer/SurfaceAreaHeuristic.cpp: -------------------------------------------------------------------------------- 1 | #include "SurfaceAreaHeuristic.h" 2 | 3 | #include "VectorMath.h" 4 | 5 | #include 6 | #include 7 | 8 | namespace 9 | { 10 | uint32_t sahSplit(const std::vector& aabbsIn, uint32_t splitGranularity, uint32_t* indicesStart, uint32_t* indicesEnd) 11 | { 12 | uint32_t numIndices = uint32_t(indicesEnd - indicesStart); 13 | 14 | __m128 bestCost = _mm_set1_ps(std::numeric_limits::infinity()); 15 | 16 | int bestAxis = -1; 17 | int bestIndex = -1; 18 | 19 | for (int splitAxis = 0; splitAxis < 3; ++splitAxis) 20 | { 21 | // Sort along center position 22 | std::stable_sort(indicesStart, indicesEnd, [&](auto i0, auto i1) { 23 | return _mm_movemask_ps(_mm_cmplt_ps(aabbsIn[i0].getCenter(), aabbsIn[i1].getCenter())) & (1 << splitAxis); 24 | }); 25 | 26 | std::vector<__m128> areasFromLeft; 27 | areasFromLeft.resize(numIndices); 28 | 29 | std::vector<__m128> areasFromRight; 30 | areasFromRight.resize(numIndices); 31 | 32 | Aabb fromLeft; 33 | for (uint32_t i = 0; i < numIndices; ++i) 34 | { 35 | fromLeft.include(aabbsIn[indicesStart[i]]); 36 | areasFromLeft[i] = fromLeft.surfaceArea(); 37 | } 38 | 39 | Aabb fromRight; 40 | for (int i = numIndices - 1; i >= 0; --i) 41 | { 42 | fromRight.include(aabbsIn[indicesStart[i]]); 43 | areasFromRight[i] = fromRight.surfaceArea(); 44 | } 45 | 46 | for (uint32_t splitIndex = splitGranularity; splitIndex < numIndices - splitGranularity; splitIndex += splitGranularity) 47 | { 48 | int countLeft = static_cast(splitIndex); 49 | int countRight = static_cast(numIndices - splitIndex); 50 | 51 | __m128 areaLeft = areasFromLeft[splitIndex - 1]; 52 | __m128 areaRight = areasFromRight[splitIndex]; 53 | __m128 scaledAreaLeft = _mm_mul_ss(areaLeft, 
_mm_cvtsi32_ss(_mm_setzero_ps(), countLeft)); 54 | __m128 scaledAreaRight = _mm_mul_ss(areaRight, _mm_cvtsi32_ss(_mm_setzero_ps(), countRight)); 55 | 56 | __m128 cost = _mm_add_ss(scaledAreaLeft, scaledAreaRight); 57 | 58 | if (_mm_comilt_ss(cost, bestCost)) 59 | { 60 | bestCost = cost; 61 | bestAxis = splitAxis; 62 | bestIndex = splitIndex; 63 | } 64 | } 65 | } 66 | 67 | // Sort again according to best axis 68 | std::stable_sort(indicesStart, indicesEnd, [&](auto i0, auto i1) { 69 | return _mm_movemask_ps(_mm_cmplt_ps(aabbsIn[i0].getCenter(), aabbsIn[i1].getCenter())) & (1 << bestAxis); 70 | }); 71 | 72 | return bestIndex; 73 | } 74 | 75 | void generateBatchesRecursive(const std::vector& aabbsIn, uint32_t targetSize, uint32_t splitGranularity, uint32_t* indicesStart, uint32_t* indicesEnd, std::vector>& result) 76 | { 77 | auto splitIndex = sahSplit(aabbsIn, splitGranularity, indicesStart, indicesEnd); 78 | 79 | uint32_t* range[] = { indicesStart, indicesStart + splitIndex, indicesEnd }; 80 | 81 | for (int i = 0; i < 2; ++i) 82 | { 83 | auto batchSize = range[i + 1] - range[i]; 84 | if (batchSize < targetSize) 85 | { 86 | result.push_back({ range[i], range[i + 1] }); 87 | } 88 | else 89 | { 90 | generateBatchesRecursive(aabbsIn, targetSize, splitGranularity, range[i], range[i + 1], result); 91 | } 92 | } 93 | } 94 | } 95 | 96 | std::vector> SurfaceAreaHeuristic::generateBatches(const std::vector& aabbs, uint32_t targetSize, uint32_t splitGranularity) 97 | { 98 | std::vector indices(aabbs.size()); 99 | std::iota(begin(indices), end(indices), 0); 100 | 101 | std::vector> result; 102 | generateBatchesRecursive(aabbs, targetSize, splitGranularity, &indices[0], &indices[0] + indices.size(), result); 103 | return result; 104 | } 105 | -------------------------------------------------------------------------------- /SoftwareRasterizer/SurfaceAreaHeuristic.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | struct Aabb; 6 | 7 | class SurfaceAreaHeuristic 8 | { 9 | public: 10 | static std::vector> generateBatches(const std::vector& aabbs, uint32_t targetSize, uint32_t splitGranularity); 11 | }; 12 | 13 | -------------------------------------------------------------------------------- /SoftwareRasterizer/VectorMath.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | // Cross product 6 | inline __m128 cross(__m128 a, __m128 b) 7 | { 8 | __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); 9 | __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); 10 | __m128 c = _mm_sub_ps(_mm_mul_ps(a, b_yzx), _mm_mul_ps(a_yzx, b)); 11 | return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1)); 12 | } 13 | 14 | // Normal vector of triangle 15 | inline __m128 normal(__m128 v0, __m128 v1, __m128 v2) 16 | { 17 | return cross(_mm_sub_ps(v1, v0), _mm_sub_ps(v2, v0)); 18 | } 19 | 20 | inline __m128 normalize(__m128 v) 21 | { 22 | return _mm_mul_ps(v, _mm_rsqrt_ps(_mm_dp_ps(v, v, 0x7F))); 23 | } 24 | 25 | struct Aabb 26 | { 27 | Aabb() 28 | { 29 | m_min = _mm_set1_ps(+std::numeric_limits::infinity()); 30 | m_max = _mm_set1_ps(-std::numeric_limits::infinity()); 31 | } 32 | 33 | __m128 m_min, m_max; 34 | 35 | void include(const Aabb& aabb) 36 | { 37 | m_min = _mm_min_ps(m_min, aabb.m_min); 38 | m_max = _mm_max_ps(m_max, aabb.m_max); 39 | } 40 | 41 | void include(__m128 point) 42 | { 43 | m_min = _mm_min_ps(m_min, point); 44 | m_max = _mm_max_ps(m_max, point); 45 | } 
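// Added note: the two helpers below omit constant factors. getCenter() returns m_min + m_max
// (twice the actual center) and surfaceArea() returns x*y + y*z + z*x of the extents, i.e. half of
// the true surface area 2*(xy + yz + zx). Since SurfaceAreaHeuristic.cpp only ever compares these
// values against each other, the missing factors of 2 cancel and the cheaper forms suffice.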
46 | 47 | __m128 getCenter() const 48 | { 49 | return _mm_add_ps(m_min, m_max); 50 | } 51 | 52 | __m128 getExtents() const 53 | { 54 | return _mm_sub_ps(m_max, m_min); 55 | } 56 | 57 | __m128 surfaceArea() 58 | { 59 | __m128 extents = getExtents(); 60 | __m128 extents2 = _mm_shuffle_ps(extents, extents, _MM_SHUFFLE(3, 0, 2, 1)); 61 | return _mm_dp_ps(extents, extents2, 0x7F); 62 | } 63 | }; 64 | --------------------------------------------------------------------------------
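As a closing illustration, here is a minimal sketch of how the pieces above fit together for a single frame of occlusion culling. The type and method names (Rasterizer, Occluder, setModelViewProjection, clear, queryVisibility, rasterize) come from Rasterizer.h above; the rasterize<true>/rasterize<false> template syntax is inferred from the explicit instantiations at the end of Rasterizer.cpp, and the CullCandidate struct, the frame function and its comments are illustrative assumptions rather than code from this repository.

#include "Rasterizer.h"
#include "Occluder.h"

#include <immintrin.h>
#include <vector>

// Hypothetical caller-side wrapper; the repository's Main.cpp organizes this differently.
struct CullCandidate
{
    const Occluder* occluder; // baked occluder geometry (see Occluder::bake)
    __m128 boundsMin;         // world-space bounds used for the coarse visibility query
    __m128 boundsMax;
};

void rasterizeOccluders(Rasterizer& rasterizer, const float modelViewProjection[16],
                        const std::vector<CullCandidate>& candidates)
{
    rasterizer.setModelViewProjection(modelViewProjection);
    rasterizer.clear();

    for (const CullCandidate& candidate : candidates)
    {
        bool needsClipping = false;

        // Skip occluders whose bounds are already occluded; the query also reports whether
        // the occluder may cross the near plane.
        if (!rasterizer.queryVisibility(candidate.boundsMin, candidate.boundsMax, needsClipping))
        {
            continue;
        }

        if (needsClipping)
        {
            rasterizer.rasterize<true>(*candidate.occluder);  // possiblyNearClipped path
        }
        else
        {
            rasterizer.rasterize<false>(*candidate.occluder);
        }
    }
}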