├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── dist ├── BlellochScan.hpp ├── RadixSort.hpp └── Reduce.hpp ├── generate.py ├── glu ├── BlellochScan.hpp ├── RadixSort.hpp ├── Reduce.hpp ├── data_types.hpp ├── errors.hpp └── gl_utils.hpp ├── test ├── CMakeLists.txt ├── blelloch_scan_tests.cpp ├── generated │ ├── test_include_BlellochScan.cpp │ ├── test_include_RadixSort.cpp │ └── test_include_Reduce.cpp ├── main.cpp ├── radix_sort_tests.cpp ├── reduce_tests.cpp └── util │ ├── Random.hpp │ └── StopWatch.hpp └── third_party ├── glad ├── include │ ├── KHR │ │ └── khrplatform.h │ └── glad │ │ └── glad.h └── src │ └── glad.c └── renderdoc └── renderdoc_app.h /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | IndentWidth: 4 3 | ColumnLimit: 120 4 | BreakBeforeBraces: Allman 5 | PackConstructorInitializers: Never 6 | SpaceBeforeCtorInitializerColon: true 7 | BreakConstructorInitializers: AfterColon 8 | AllowShortFunctionsOnASingleLine: Inline 9 | BreakBeforeBinaryOperators: None 10 | AlignAfterOpenBracket: BlockIndent 11 | AllowShortLambdasOnASingleLine: All 12 | BinPackArguments: true 13 | BinPackParameters: false 14 | NamespaceIndentation: All 15 | ReferenceAlignment: Left 16 | PointerAlignment: Left 17 | AllowAllParametersOfDeclarationOnNextLine: true 18 | IndentAccessModifiers: false 19 | AccessModifierOffset: -4 20 | SpaceAfterCStyleCast: true 21 | SpaceAfterTemplateKeyword: false 22 | AlwaysBreakTemplateDeclarations: Yes 23 | --- 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # CLion 2 | cmake-build-* 3 | .idea 4 | *.xml 5 | 6 | # Python 7 | venv 8 | .venv 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/glfw"] 2 | path = third_party/glfw 3 | url = https://github.com/glfw/glfw 4 | [submodule "third_party/Catch2"] 5 | path = third_party/Catch2 6 | url = https://github.com/catchorg/Catch2 7 | [submodule "third_party/glm"] 8 | path = third_party/glm 9 | url = https://github.com/g-truc/glm 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.19) 2 | project(glu VERSION 2.0 DESCRIPTION "GL Utilities by loryruta") 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | 6 | # Catch2 7 | add_subdirectory(third_party/Catch2) 8 | 9 | # GLFW 10 | add_subdirectory(third_party/glfw) 11 | 12 | # glad 13 | add_library(glad STATIC third_party/glad/src/glad.c) 14 | target_include_directories(glad PUBLIC third_party/glad/include) 15 | 16 | # glm 17 | add_library(glm INTERFACE) 18 | target_include_directories(glm INTERFACE third_party/glm) 19 | 20 | # Renderdoc 21 | add_library(renderdoc INTERFACE) 22 | target_include_directories(renderdoc INTERFACE third_party/renderdoc) 23 | 24 | # 25 | add_library(glu INTERFACE) 26 | target_include_directories(glu INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 27 | 28 | # TODO optionally add test subdirectory (e.g. 
don't add if configuring in git submodule)
29 | add_subdirectory(test)
30 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Lorenzo Rutayisire
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # gl-radix-sort
 3 | 
 4 | Ready-to-use RadixSort and other parallel programming primitives, implemented with OpenGL 4.6.
 5 | 
 6 | Includes:
 7 | - Parallel Reduce
 8 | - Parallel BlellochScan
 9 | - Parallel RadixSort
10 | 
11 | These modules are grouped together under the name "GLU" (OpenGL Utilities).
12 | 
13 | ## Requirements
14 | 
15 | - C++17
16 | 
17 | ## How to include it
18 | 
19 | ### Copy-paste the utility file
20 | 
21 | - Look into the `dist/` directory
22 | - Take a utility (e.g. `dist/Reduce.hpp`)
23 | - Copy the file into your codebase
24 | - Include it where you need it
25 | - **Important: OpenGL 4.6 symbols must be defined prior to including the utility file!** E.g.:
26 | 
27 | ```cpp
28 | #include <glad/glad.h> // Must be placed beforehand!
29 | #include "Reduce.hpp"
30 | ```
31 | 
32 | ### git submodule + CMake
33 | 
34 | - Git submodule this project into your codebase
35 | - Add the CMake subdirectory and link against the `glu` target
36 | 
37 | ```cmake
38 | add_subdirectory(path/to/glu)
39 | 
40 | target_link_libraries(your_project PUBLIC glu)
41 | ```
42 | 
43 | ## How to use it
44 | 
45 | ### Reduce
46 | 
47 | ```cpp
48 | #include "Reduce.hpp"
49 | 
50 | using namespace glu;
51 | 
52 | size_t N;
53 | GLuint buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
54 | 
55 | Reduce reduce(DataType_Uint, ReduceOperator_Sum);
56 | reduce(buffer, N);
57 | ```
58 | 
59 | ### BlellochScan
60 | 
61 | ```cpp
62 | #include "BlellochScan.hpp"
63 | 
64 | using namespace glu;
65 | 
66 | size_t N; // Important: N must be a power of 2
67 | GLuint buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
68 | 
69 | BlellochScan blelloch_scan(DataType_Uint);
70 | blelloch_scan(buffer, N);
71 | ```
72 | 
73 | ### RadixSort
74 | 
75 | ```cpp
76 | #include "RadixSort.hpp"
77 | 
78 | using namespace glu;
79 | 
80 | size_t N;
81 | GLuint key_buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
82 | GLuint val_buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
83 | 
84 | RadixSort radix_sort;
85 | radix_sort(key_buffer, val_buffer, N);
86 | ```
87 | 
88 | Note: currently `val_buffer` is **required** and its type is `GLuint`. If you only have a keys array, you have to
89 | allocate a dummy values array!
90 | 
91 | ## Performance
92 | 
93 | - OS: Ubuntu 22.04
94 | - Processor: AMD Ryzen 7 3700X 8-Core Processor
95 | - Device: NVIDIA GeForce RTX 2060 SUPER/PCIe/SS
96 | - NVIDIA Driver Version: 545.23.08
97 | - Memory: 16 GB
98 | 
99 | ```
100 | Reduce; Num elements: 1024, Elapsed: 0.069 ms
101 | Reduce; Num elements: 16384, Elapsed: 0.012 ms
102 | Reduce; Num elements: 65536, Elapsed: 0.016 ms
103 | Reduce; Num elements: 131072, Elapsed: 0.020 ms
104 | Reduce; Num elements: 524288, Elapsed: 0.029 ms
105 | Reduce; Num elements: 1048576, Elapsed: 0.049 ms
106 | Reduce; Num elements: 16777216, Elapsed: 0.620 ms
107 | Reduce; Num elements: 67108864, Elapsed: 2.514 ms
108 | Reduce; Num elements: 134217728, Elapsed: 5.030 ms
109 | Reduce; Num elements: 268435456, Elapsed: 10.044 ms
110 | BlellochScan; Num elements: 1024, Elapsed: 1.108 ms
111 | BlellochScan; Num elements: 16384, Elapsed: 0.081 ms
112 | BlellochScan; Num elements: 65536, Elapsed: 0.101 ms
113 | BlellochScan; Num elements: 131072, Elapsed: 0.115 ms
114 | BlellochScan; Num elements: 524288, Elapsed: 0.168 ms
115 | BlellochScan; Num elements: 1048576, Elapsed: 0.360 ms
116 | BlellochScan; Num elements: 16777216, Elapsed: 4.368 ms
117 | BlellochScan; Num elements: 67108864, Elapsed: 18.407 ms
118 | BlellochScan; Num elements: 134217728, Elapsed: 37.167 ms
119 | BlellochScan; Num elements: 268435456, Elapsed: 86.493 ms
120 | Radix sort; Num elements: 1024, Elapsed: 0.663 ms
121 | Radix sort; Num elements: 16384, Elapsed: 1.004 ms
122 | Radix sort; Num elements: 65536, Elapsed: 1.761 ms
123 | Radix sort; Num elements: 131072, Elapsed: 3.074 ms
124 | Radix sort; Num elements: 524288, Elapsed: 10.633 ms
125 | Radix sort; Num elements: 1048576, Elapsed: 20.457 ms
126 | Radix sort; Num elements: 2097152, Elapsed: 39.688 ms
127 | Radix sort; Num elements: 4194304, Elapsed: 78.594 ms
128 | Radix sort; Num elements: 8388608, Elapsed: 0.156 s
129 | Radix sort; Num elements: 16777216, Elapsed: 0.311 s
130 | Radix sort; Num elements: 33554432, Elapsed: 0.626 s
131 | Radix sort; Num elements: 67108864, Elapsed: 
1.252 s 132 | Radix sort; Num elements: 134217728, Elapsed: 2.518 s 133 | Radix sort; Num elements: 268435456, Elapsed: 5.022 s 134 | ``` 135 | 136 | To run it on your machine: clone the repository, build `glu_test` and run: 137 | 138 | ``` 139 | ./glu_test [benchmark] 140 | ``` 141 | 142 | ## Useful resources 143 | - http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf 144 | - https://vgc.poly.edu/~csilva/papers/cgf.pdf 145 | - Nabla (radix sort implementation): https://github.com/Devsh-Graphics-Programming/Nabla 146 | - Udacity parallel programming guide: https://www.youtube.com/playlist?list=PLAwxTw4SYaPnFKojVQrmyOGFCqHTxfdv2 147 | -------------------------------------------------------------------------------- /dist/BlellochScan.hpp: -------------------------------------------------------------------------------- 1 | // This code was automatically generated; you're not supposed to edit it! 2 | 3 | #ifndef GLU_BLELLOCHSCAN_HPP 4 | #define GLU_BLELLOCHSCAN_HPP 5 | 6 | #include 7 | 8 | #ifndef GLU_REDUCE_HPP 9 | #define GLU_REDUCE_HPP 10 | 11 | #ifndef GLU_DATA_TYPES_HPP 12 | #define GLU_DATA_TYPES_HPP 13 | 14 | #ifndef GLU_ERRORS_HPP 15 | #define GLU_ERRORS_HPP 16 | 17 | #include 18 | #include 19 | 20 | // TODO mark if (!condition_) as unlikely 21 | #define GLU_CHECK_STATE(condition_, ...) \ 22 | { \ 23 | if (!(condition_)) \ 24 | { \ 25 | fprintf(stderr, __VA_ARGS__); \ 26 | exit(1); \ 27 | } \ 28 | } 29 | 30 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 31 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 32 | 33 | #endif 34 | 35 | 36 | 37 | namespace glu 38 | { 39 | enum DataType 40 | { 41 | DataType_Float = 0, 42 | DataType_Double, 43 | DataType_Int, 44 | DataType_Uint, 45 | DataType_Vec2, 46 | DataType_Vec4, 47 | DataType_DVec2, 48 | DataType_DVec4, 49 | DataType_UVec2, 50 | DataType_UVec4, 51 | DataType_IVec2, 52 | DataType_IVec4 53 | }; 54 | 55 | inline const char* to_glsl_type_str(DataType data_type) 56 | { 57 | // clang-format off 58 | if (data_type == DataType_Float) return "float"; 59 | else if (data_type == DataType_Double) return "double"; 60 | else if (data_type == DataType_Int) return "int"; 61 | else if (data_type == DataType_Uint) return "uint"; 62 | else if (data_type == DataType_Vec2) return "vec2"; 63 | else if (data_type == DataType_Vec4) return "vec4"; 64 | else if (data_type == DataType_DVec2) return "dvec2"; 65 | else if (data_type == DataType_DVec4) return "dvec4"; 66 | else if (data_type == DataType_UVec2) return "uvec2"; 67 | else if (data_type == DataType_UVec4) return "uvec4"; 68 | else if (data_type == DataType_IVec2) return "ivec2"; 69 | else if (data_type == DataType_IVec4) return "ivec4"; 70 | else 71 | { 72 | GLU_FAIL("Invalid data type: %d", data_type); 73 | } 74 | // clang-format on 75 | } 76 | 77 | } // namespace glu 78 | 79 | #endif // GLU_DATA_TYPES_HPP 80 | 81 | 82 | #ifndef GLU_GL_UTILS_HPP 83 | #define GLU_GL_UTILS_HPP 84 | 85 | #include 86 | #include 87 | #include 88 | #include 89 | 90 | #ifndef GLU_ERRORS_HPP 91 | #define GLU_ERRORS_HPP 92 | 93 | #include 94 | #include 95 | 96 | // TODO mark if (!condition_) as unlikely 97 | #define GLU_CHECK_STATE(condition_, ...) \ 98 | { \ 99 | if (!(condition_)) \ 100 | { \ 101 | fprintf(stderr, __VA_ARGS__); \ 102 | exit(1); \ 103 | } \ 104 | } 105 | 106 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 107 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 108 | 109 | #endif 110 | 111 | 112 | 113 | namespace glu 114 | { 115 | inline void 116 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 117 | { 118 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 119 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 120 | 121 | glCopyBufferSubData( 122 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 123 | ); 124 | } 125 | 126 | /// A RAII wrapper for GL shader. 127 | class Shader 128 | { 129 | private: 130 | GLuint m_handle; 131 | 132 | public: 133 | explicit Shader(GLenum type) : 134 | m_handle(glCreateShader(type)){}; 135 | Shader(const Shader&) = delete; 136 | 137 | Shader(Shader&& other) noexcept 138 | { 139 | m_handle = other.m_handle; 140 | other.m_handle = 0; 141 | } 142 | 143 | ~Shader() { glDeleteShader(m_handle); } 144 | 145 | [[nodiscard]] GLuint handle() const { return m_handle; } 146 | 147 | void source_from_str(const std::string& src_str) 148 | { 149 | const char* src_ptr = src_str.c_str(); 150 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 151 | } 152 | 153 | void source_from_file(const char* src_filepath) 154 | { 155 | FILE* file = fopen(src_filepath, "rt"); 156 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 157 | 158 | fseek(file, 0, SEEK_END); 159 | size_t file_size = ftell(file); 160 | fseek(file, 0, SEEK_SET); 161 | 162 | std::string src{}; 163 | src.resize(file_size); 164 | fread(src.data(), sizeof(char), file_size, file); 165 | source_from_str(src.c_str()); 166 | 167 | fclose(file); 168 | } 169 | 170 | std::string get_info_log() 171 | { 172 | GLint log_length = 0; 173 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 174 | 175 | std::vector log(log_length); 176 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 177 | return {log.begin(), log.end()}; 178 | } 179 | 180 | void compile() 181 | { 182 | glCompileShader(m_handle); 183 | 184 | GLint status; 185 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 186 | if (!status) 187 | { 188 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 189 | } 190 | } 191 | }; 192 | 193 | /// A RAII wrapper for GL program. 
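///
/// Typical use together with Shader, sketched below (illustrative; `src` is assumed to hold valid GLSL compute shader source):
///
///     Shader shader(GL_COMPUTE_SHADER);
///     shader.source_from_str(src);
///     shader.compile();
///
///     Program program;
///     program.attach_shader(shader);
///     program.link();
///     program.use();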
194 | class Program 195 | { 196 | private: 197 | GLuint m_handle; 198 | 199 | public: 200 | explicit Program() { m_handle = glCreateProgram(); }; 201 | Program(const Program&) = delete; 202 | 203 | Program(Program&& other) noexcept 204 | { 205 | m_handle = other.m_handle; 206 | other.m_handle = 0; 207 | } 208 | 209 | ~Program() { glDeleteProgram(m_handle); } 210 | 211 | [[nodiscard]] GLuint handle() const { return m_handle; } 212 | 213 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 214 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 215 | 216 | [[nodiscard]] std::string get_info_log() const 217 | { 218 | GLint log_length = 0; 219 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 220 | 221 | std::vector log(log_length); 222 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 223 | return {log.begin(), log.end()}; 224 | } 225 | 226 | void link() 227 | { 228 | GLint status; 229 | glLinkProgram(m_handle); 230 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 231 | if (!status) 232 | { 233 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 234 | } 235 | } 236 | 237 | void use() { glUseProgram(m_handle); } 238 | 239 | GLint get_uniform_location(const char* uniform_name) 240 | { 241 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 242 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 243 | return loc; 244 | } 245 | }; 246 | 247 | /// A RAII helper class for GL shader storage buffer. 248 | class ShaderStorageBuffer 249 | { 250 | private: 251 | GLuint m_handle = 0; 252 | size_t m_size = 0; 253 | 254 | public: 255 | explicit ShaderStorageBuffer(size_t initial_size = 0) 256 | { 257 | if (initial_size > 0) 258 | resize(initial_size, false); 259 | } 260 | 261 | explicit ShaderStorageBuffer(const void* data, size_t size) : 262 | m_size(size) 263 | { 264 | GLU_CHECK_ARGUMENT(data, ""); 265 | GLU_CHECK_ARGUMENT(size > 0, ""); 266 | 267 | glCreateBuffers(1, &m_handle); 268 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 269 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 270 | } 271 | 272 | template 273 | explicit ShaderStorageBuffer(const std::vector& data) : 274 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 275 | { 276 | } 277 | 278 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 279 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 280 | { 281 | m_handle = other.m_handle; 282 | m_size = other.m_size; 283 | other.m_handle = 0; 284 | } 285 | 286 | ~ShaderStorageBuffer() 287 | { 288 | if (m_handle) 289 | glDeleteBuffers(1, &m_handle); 290 | } 291 | 292 | [[nodiscard]] GLuint handle() const { return m_handle; } 293 | [[nodiscard]] size_t size() const { return m_size; } 294 | 295 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 
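///
/// For example (illustrative):
///
///     ShaderStorageBuffer buf(16); // 16-byte buffer, contents undefined
///     buf.resize(64, true);        // grow to 64 bytes; the original 16 bytes are copied into the new storage
///     buf.resize(32);              // shrink to 32 bytes; previous contents are discarded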
296 | void resize(size_t size, bool keep_data = false) 297 | { 298 | size_t old_size = m_size; 299 | GLuint old_handle = m_handle; 300 | 301 | if (old_size != size) 302 | { 303 | m_size = size; 304 | 305 | glCreateBuffers(1, &m_handle); 306 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 307 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 308 | 309 | if (keep_data) 310 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 311 | 312 | glDeleteBuffers(1, &old_handle); 313 | } 314 | } 315 | 316 | /// Clears the entire buffer with the given GLuint value (repeated). 317 | void clear(GLuint value) 318 | { 319 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 320 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 321 | } 322 | 323 | void write_data(const void* data, size_t size) 324 | { 325 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 326 | 327 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 328 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 329 | } 330 | 331 | template 332 | std::vector get_data() const 333 | { 334 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 335 | 336 | std::vector result(m_size / sizeof(T)); 337 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 338 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 339 | return result; 340 | } 341 | 342 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 343 | { 344 | if (size == 0) 345 | size = m_size; 346 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 347 | } 348 | }; 349 | 350 | /// Measures elapsed time on GPU for executing the given callback. 
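///
/// The returned value is in nanoseconds (GL_TIME_ELAPSED query). Usage sketch (illustrative; `num_workgroups` is assumed
/// to be computed by the caller):
///
///     uint64_t ns = measure_gl_elapsed_time([&] {
///         glDispatchCompute(num_workgroups, 1, 1);
///         glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
///     });
///     printf("Elapsed: %.3f ms\n", ns / 1e6);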
351 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 352 | { 353 | GLuint query; 354 | uint64_t elapsed_time{}; 355 | 356 | glGenQueries(1, &query); 357 | glBeginQuery(GL_TIME_ELAPSED, query); 358 | 359 | callback(); 360 | 361 | glEndQuery(GL_TIME_ELAPSED); 362 | 363 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 364 | glDeleteQueries(1, &query); 365 | 366 | return elapsed_time; 367 | } 368 | 369 | template 370 | IntegerT log32_floor(IntegerT n) 371 | { 372 | return (IntegerT) floor(double(log2(n)) / 5.0); 373 | } 374 | 375 | template 376 | IntegerT log32_ceil(IntegerT n) 377 | { 378 | return (IntegerT) ceil(double(log2(n)) / 5.0); 379 | } 380 | 381 | template 382 | IntegerT div_ceil(IntegerT n, IntegerT d) 383 | { 384 | return (IntegerT) ceil(double(n) / double(d)); 385 | } 386 | 387 | template 388 | bool is_power_of_2(T n) 389 | { 390 | return (n & (n - 1)) == 0; 391 | } 392 | 393 | template 394 | IntegerT next_power_of_2(IntegerT n) 395 | { 396 | n--; 397 | n |= n >> 1; 398 | n |= n >> 2; 399 | n |= n >> 4; 400 | n |= n >> 8; 401 | n |= n >> 16; 402 | n++; 403 | return n; 404 | } 405 | 406 | template 407 | void print_stl_container(Iterator begin, Iterator end) 408 | { 409 | size_t i = 0; 410 | for (; begin != end; begin++) 411 | { 412 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 413 | i++; 414 | } 415 | printf("\n"); 416 | } 417 | 418 | template 419 | void print_buffer(const ShaderStorageBuffer& buffer) 420 | { 421 | std::vector data = buffer.get_data(); 422 | print_stl_container(data.begin(), data.end()); 423 | } 424 | 425 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 426 | { 427 | std::vector data = buffer.get_data(); 428 | for (size_t i = 0; i < data.size(); i++) 429 | printf("(%zu) %08x, ", i, data[i]); 430 | printf("\n"); 431 | } 432 | } // namespace glu 433 | 434 | #endif // GLU_GL_UTILS_HPP 435 | 436 | 437 | 438 | namespace glu 439 | { 440 | namespace detail 441 | { 442 | inline const char* k_reduction_shader_src = R"( 443 | #extension GL_KHR_shader_subgroup_arithmetic : require 444 | 445 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 446 | 447 | layout(std430, binding = 0) buffer Buffer 448 | { 449 | DATA_TYPE data[]; 450 | }; 451 | 452 | layout(location = 0) uniform uint u_count; 453 | layout(location = 1) uniform uint u_depth; 454 | 455 | void main() 456 | { 457 | uint step = 1 << (5 * u_depth); 458 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 459 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 460 | if (i < u_count) 461 | { 462 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 463 | if (gl_SubgroupInvocationID == 0) 464 | { 465 | data[i] = r; 466 | } 467 | } 468 | } 469 | )"; 470 | } 471 | 472 | /// The operators that can be used for the reduction operation. 473 | enum ReduceOperator 474 | { 475 | ReduceOperator_Sum = 0, 476 | ReduceOperator_Mul, 477 | ReduceOperator_Min, 478 | ReduceOperator_Max 479 | }; 480 | 481 | /// A class that implements the reduction operation. 
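///
/// Usage sketch (illustrative; `buffer` is assumed to be a user-created SSBO holding `count` GLuint values, and a
/// GL 4.6 context with GL_KHR_shader_subgroup support is assumed to be current):
///
///     Reduce reduce(DataType_Uint, ReduceOperator_Sum);
///     reduce(buffer, count); // in-place: the reduced value ends up in the first element of the buffer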
482 | class Reduce 483 | { 484 | private: 485 | const DataType m_data_type; 486 | const ReduceOperator m_operator; 487 | const size_t m_num_threads; 488 | const size_t m_num_items; 489 | 490 | Program m_program; 491 | 492 | public: 493 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 494 | m_data_type(data_type), 495 | m_operator(operator_), 496 | m_num_threads(1024), 497 | m_num_items(4) 498 | { 499 | std::string shader_src = "#version 460\n\n"; 500 | 501 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 502 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 503 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 504 | 505 | if (m_operator == ReduceOperator_Sum) 506 | { 507 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 508 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 509 | } 510 | else if (m_operator == ReduceOperator_Mul) 511 | { 512 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 513 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 514 | } 515 | else if (m_operator == ReduceOperator_Min) 516 | { 517 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 518 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 519 | } 520 | else if (m_operator == ReduceOperator_Max) 521 | { 522 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 523 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 524 | } 525 | else 526 | { 527 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 528 | } 529 | 530 | shader_src += detail::k_reduction_shader_src; 531 | 532 | Shader shader(GL_COMPUTE_SHADER); 533 | shader.source_from_str(shader_src.c_str()); 534 | shader.compile(); 535 | 536 | m_program.attach_shader(shader); 537 | m_program.link(); 538 | } 539 | 540 | ~Reduce() = default; 541 | 542 | void operator()(GLuint buffer, size_t count) 543 | { 544 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 545 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 546 | 547 | m_program.use(); 548 | 549 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 550 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 551 | 552 | for (int depth = 0;; depth++) 553 | { 554 | int step = 1 << (5 * depth); 555 | if (step >= count) 556 | break; 557 | 558 | size_t level_count = count >> (5 * depth); 559 | 560 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 561 | 562 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 563 | glDispatchCompute(num_workgroups, 1, 1); 564 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 565 | } 566 | } 567 | }; 568 | } // namespace glu 569 | 570 | #endif // GLU_REDUCE_HPP 571 | 572 | 573 | #ifndef GLU_DATA_TYPES_HPP 574 | #define GLU_DATA_TYPES_HPP 575 | 576 | #ifndef GLU_ERRORS_HPP 577 | #define GLU_ERRORS_HPP 578 | 579 | #include 580 | #include 581 | 582 | // TODO mark if (!condition_) as unlikely 583 | #define GLU_CHECK_STATE(condition_, ...) \ 584 | { \ 585 | if (!(condition_)) \ 586 | { \ 587 | fprintf(stderr, __VA_ARGS__); \ 588 | exit(1); \ 589 | } \ 590 | } 591 | 592 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 593 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 594 | 595 | #endif 596 | 597 | 598 | 599 | namespace glu 600 | { 601 | enum DataType 602 | { 603 | DataType_Float = 0, 604 | DataType_Double, 605 | DataType_Int, 606 | DataType_Uint, 607 | DataType_Vec2, 608 | DataType_Vec4, 609 | DataType_DVec2, 610 | DataType_DVec4, 611 | DataType_UVec2, 612 | DataType_UVec4, 613 | DataType_IVec2, 614 | DataType_IVec4 615 | }; 616 | 617 | inline const char* to_glsl_type_str(DataType data_type) 618 | { 619 | // clang-format off 620 | if (data_type == DataType_Float) return "float"; 621 | else if (data_type == DataType_Double) return "double"; 622 | else if (data_type == DataType_Int) return "int"; 623 | else if (data_type == DataType_Uint) return "uint"; 624 | else if (data_type == DataType_Vec2) return "vec2"; 625 | else if (data_type == DataType_Vec4) return "vec4"; 626 | else if (data_type == DataType_DVec2) return "dvec2"; 627 | else if (data_type == DataType_DVec4) return "dvec4"; 628 | else if (data_type == DataType_UVec2) return "uvec2"; 629 | else if (data_type == DataType_UVec4) return "uvec4"; 630 | else if (data_type == DataType_IVec2) return "ivec2"; 631 | else if (data_type == DataType_IVec4) return "ivec4"; 632 | else 633 | { 634 | GLU_FAIL("Invalid data type: %d", data_type); 635 | } 636 | // clang-format on 637 | } 638 | 639 | } // namespace glu 640 | 641 | #endif // GLU_DATA_TYPES_HPP 642 | 643 | 644 | 645 | namespace glu 646 | { 647 | namespace detail 648 | { 649 | inline const char* k_upsweep_shader_src = R"( 650 | #extension GL_KHR_shader_subgroup_shuffle_relative : require 651 | 652 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 653 | 654 | layout(std430, binding = 0) buffer Buffer 655 | { 656 | DATA_TYPE data[]; 657 | }; 658 | 659 | layout(location = 0) uniform uint u_count; 660 | layout(location = 1) uniform uint u_step; 661 | 662 | void main() 663 | { 664 | uint partition_i = gl_WorkGroupID.y; 665 | uint thread_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 666 | uint i = partition_i * u_count + thread_i * u_step + u_step - 1; 667 | uint end_i = (partition_i + 1) * u_count; 668 | if (i < end_i) 669 | { 670 | DATA_TYPE lval = subgroupShuffleUp(data[i], 1); 671 | DATA_TYPE r = OPERATION(data[i], lval); 672 | if (i == end_i - 1) // Clear last 673 | { 674 | data[i] = IDENTITY; 675 | } 676 | else if (gl_SubgroupInvocationID % 2 == 1) 677 | { 678 | data[i] = r; 679 | } 680 | } 681 | } 682 | )"; 683 | 684 | inline const char* k_downsweep_shader_src = R"( 685 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 686 | 687 | layout(std430, binding = 0) buffer Buffer 688 | { 689 | DATA_TYPE data[]; 690 | }; 691 | 692 | layout(location = 0) uniform uint u_count; 693 | layout(location = 1) uniform uint u_step; 694 | 695 | void main() 696 | { 697 | uint partition_i = gl_WorkGroupID.y; 698 | uint i = partition_i * u_count + gl_GlobalInvocationID.x * (u_step << 1) + (u_step - 1); 699 | uint next_i = i + u_step; 700 | uint end_i = (partition_i + 1) * u_count; 701 | if (next_i < end_i) 702 | { 703 | DATA_TYPE tmp = data[i]; 704 | data[i] = data[next_i]; 705 | data[next_i] = data[next_i] + tmp; 706 | } 707 | else if (i < end_i) 708 | { 709 | data[i] = IDENTITY; 710 | } 711 | } 712 | )"; 713 | } // namespace detail 714 | 715 | /// A class that implements Blelloch scan algorithm (exclusive prefix sum). 
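///
/// Usage sketch (illustrative; `buffer` is assumed to be a user-created SSBO holding `count` GLuint values, with
/// `count` a power of 2):
///
///     BlellochScan scan(DataType_Uint);
///     scan(buffer, count); // in-place exclusive prefix sum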
716 | class BlellochScan 717 | { 718 | private: 719 | const DataType m_data_type; 720 | const size_t m_num_threads; 721 | const size_t m_num_items; 722 | 723 | Program m_upsweep_program; 724 | Program m_downsweep_program; 725 | 726 | public: 727 | explicit BlellochScan(DataType data_type) : 728 | m_data_type(data_type), 729 | m_num_threads(1024), 730 | m_num_items(4) 731 | { 732 | std::string shader_src = "#version 460\n\n"; 733 | 734 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 735 | shader_src += "#define OPERATION(a, b) (a + b)\n"; 736 | shader_src += "#define IDENTITY 0\n"; 737 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 738 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 739 | 740 | { // Upsweep program 741 | Shader upsweep_shader(GL_COMPUTE_SHADER); 742 | upsweep_shader.source_from_str((shader_src + detail::k_upsweep_shader_src).c_str()); 743 | upsweep_shader.compile(); 744 | 745 | m_upsweep_program.attach_shader(upsweep_shader); 746 | m_upsweep_program.link(); 747 | } 748 | 749 | { // Downsweep program 750 | Shader downsweep_program(GL_COMPUTE_SHADER); 751 | downsweep_program.source_from_str((shader_src + detail::k_downsweep_shader_src).c_str()); 752 | downsweep_program.compile(); 753 | 754 | m_downsweep_program.attach_shader(downsweep_program); 755 | m_downsweep_program.link(); 756 | } 757 | } 758 | 759 | ~BlellochScan() = default; 760 | 761 | /// Runs Blelloch exclusive scan on multiple partitions. 762 | /// 763 | /// @param buffer the input GLuint buffer 764 | /// @param count the number of GLuint in the buffer (must be a power of 2) 765 | /// @param num_partitions the number of partitions (must be adjacent) 766 | void operator()(GLuint buffer, size_t count, size_t num_partitions = 1) 767 | { 768 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 769 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 770 | GLU_CHECK_ARGUMENT(is_power_of_2(count), "Count must be a power of 2"); // TODO Remove this requirement 771 | GLU_CHECK_ARGUMENT(num_partitions >= 1, "Num of partitions must be >= 1"); 772 | 773 | upsweep(buffer, count, num_partitions); // Also clear last 774 | downsweep(buffer, count, num_partitions); 775 | } 776 | 777 | private: 778 | void upsweep(GLuint buffer, size_t count, size_t num_partitions) // Also clear last 779 | { 780 | m_upsweep_program.use(); 781 | 782 | glUniform1ui(m_upsweep_program.get_uniform_location("u_count"), count); 783 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 784 | 785 | int step = 1; 786 | int level_count = (int) count; 787 | while (true) 788 | { 789 | glUniform1ui(m_upsweep_program.get_uniform_location("u_step"), step); 790 | 791 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 792 | glDispatchCompute(num_workgroups, num_partitions, 1); 793 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 794 | 795 | step <<= 1; 796 | 797 | level_count >>= 1; 798 | 799 | if (level_count <= 1) 800 | break; 801 | } 802 | } 803 | 804 | void downsweep(GLuint buffer, size_t count, size_t num_partitions) 805 | { 806 | m_downsweep_program.use(); 807 | 808 | glUniform1ui(m_downsweep_program.get_uniform_location("u_count"), count); 809 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 810 | 811 | int step = next_power_of_2(int(count)) >> 1; 812 | size_t level_count = 1; 813 | while (true) 814 | { 815 | glUniform1ui(m_downsweep_program.get_uniform_location("u_step"), step); 816 | 817 | size_t 
num_workgroups = div_ceil(level_count, m_num_threads); 818 | glDispatchCompute(num_workgroups, num_partitions, 1); 819 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 820 | 821 | step >>= 1; 822 | level_count <<= 1; 823 | if (step == 0) 824 | break; 825 | } 826 | } 827 | }; 828 | } // namespace glu 829 | 830 | #endif // GLU_BLELLOCHSCAN_HPP 831 | -------------------------------------------------------------------------------- /dist/RadixSort.hpp: -------------------------------------------------------------------------------- 1 | // This code was automatically generated; you're not supposed to edit it! 2 | 3 | #ifndef GLU_RADIXSORT_HPP 4 | #define GLU_RADIXSORT_HPP 5 | 6 | #ifndef GLU_BLELLOCHSCAN_HPP 7 | #define GLU_BLELLOCHSCAN_HPP 8 | 9 | #include 10 | 11 | #ifndef GLU_REDUCE_HPP 12 | #define GLU_REDUCE_HPP 13 | 14 | #ifndef GLU_DATA_TYPES_HPP 15 | #define GLU_DATA_TYPES_HPP 16 | 17 | #ifndef GLU_ERRORS_HPP 18 | #define GLU_ERRORS_HPP 19 | 20 | #include 21 | #include 22 | 23 | // TODO mark if (!condition_) as unlikely 24 | #define GLU_CHECK_STATE(condition_, ...) \ 25 | { \ 26 | if (!(condition_)) \ 27 | { \ 28 | fprintf(stderr, __VA_ARGS__); \ 29 | exit(1); \ 30 | } \ 31 | } 32 | 33 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 34 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 35 | 36 | #endif 37 | 38 | 39 | 40 | namespace glu 41 | { 42 | enum DataType 43 | { 44 | DataType_Float = 0, 45 | DataType_Double, 46 | DataType_Int, 47 | DataType_Uint, 48 | DataType_Vec2, 49 | DataType_Vec4, 50 | DataType_DVec2, 51 | DataType_DVec4, 52 | DataType_UVec2, 53 | DataType_UVec4, 54 | DataType_IVec2, 55 | DataType_IVec4 56 | }; 57 | 58 | inline const char* to_glsl_type_str(DataType data_type) 59 | { 60 | // clang-format off 61 | if (data_type == DataType_Float) return "float"; 62 | else if (data_type == DataType_Double) return "double"; 63 | else if (data_type == DataType_Int) return "int"; 64 | else if (data_type == DataType_Uint) return "uint"; 65 | else if (data_type == DataType_Vec2) return "vec2"; 66 | else if (data_type == DataType_Vec4) return "vec4"; 67 | else if (data_type == DataType_DVec2) return "dvec2"; 68 | else if (data_type == DataType_DVec4) return "dvec4"; 69 | else if (data_type == DataType_UVec2) return "uvec2"; 70 | else if (data_type == DataType_UVec4) return "uvec4"; 71 | else if (data_type == DataType_IVec2) return "ivec2"; 72 | else if (data_type == DataType_IVec4) return "ivec4"; 73 | else 74 | { 75 | GLU_FAIL("Invalid data type: %d", data_type); 76 | } 77 | // clang-format on 78 | } 79 | 80 | } // namespace glu 81 | 82 | #endif // GLU_DATA_TYPES_HPP 83 | 84 | 85 | #ifndef GLU_GL_UTILS_HPP 86 | #define GLU_GL_UTILS_HPP 87 | 88 | #include 89 | #include 90 | #include 91 | #include 92 | 93 | #ifndef GLU_ERRORS_HPP 94 | #define GLU_ERRORS_HPP 95 | 96 | #include 97 | #include 98 | 99 | // TODO mark if (!condition_) as unlikely 100 | #define GLU_CHECK_STATE(condition_, ...) \ 101 | { \ 102 | if (!(condition_)) \ 103 | { \ 104 | fprintf(stderr, __VA_ARGS__); \ 105 | exit(1); \ 106 | } \ 107 | } 108 | 109 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 110 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 111 | 112 | #endif 113 | 114 | 115 | 116 | namespace glu 117 | { 118 | inline void 119 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 120 | { 121 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 122 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 123 | 124 | glCopyBufferSubData( 125 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 126 | ); 127 | } 128 | 129 | /// A RAII wrapper for GL shader. 130 | class Shader 131 | { 132 | private: 133 | GLuint m_handle; 134 | 135 | public: 136 | explicit Shader(GLenum type) : 137 | m_handle(glCreateShader(type)){}; 138 | Shader(const Shader&) = delete; 139 | 140 | Shader(Shader&& other) noexcept 141 | { 142 | m_handle = other.m_handle; 143 | other.m_handle = 0; 144 | } 145 | 146 | ~Shader() { glDeleteShader(m_handle); } 147 | 148 | [[nodiscard]] GLuint handle() const { return m_handle; } 149 | 150 | void source_from_str(const std::string& src_str) 151 | { 152 | const char* src_ptr = src_str.c_str(); 153 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 154 | } 155 | 156 | void source_from_file(const char* src_filepath) 157 | { 158 | FILE* file = fopen(src_filepath, "rt"); 159 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 160 | 161 | fseek(file, 0, SEEK_END); 162 | size_t file_size = ftell(file); 163 | fseek(file, 0, SEEK_SET); 164 | 165 | std::string src{}; 166 | src.resize(file_size); 167 | fread(src.data(), sizeof(char), file_size, file); 168 | source_from_str(src.c_str()); 169 | 170 | fclose(file); 171 | } 172 | 173 | std::string get_info_log() 174 | { 175 | GLint log_length = 0; 176 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 177 | 178 | std::vector log(log_length); 179 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 180 | return {log.begin(), log.end()}; 181 | } 182 | 183 | void compile() 184 | { 185 | glCompileShader(m_handle); 186 | 187 | GLint status; 188 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 189 | if (!status) 190 | { 191 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 192 | } 193 | } 194 | }; 195 | 196 | /// A RAII wrapper for GL program. 
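///
/// Once linked, a program is driven the same way this library drives its own kernels, e.g. (illustrative; `count`
/// and `num_workgroups` are assumed to be provided by the caller):
///
///     program.use();
///     glUniform1ui(program.get_uniform_location("u_count"), (GLuint) count);
///     glDispatchCompute((GLuint) num_workgroups, 1, 1);
///     glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);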
197 | class Program 198 | { 199 | private: 200 | GLuint m_handle; 201 | 202 | public: 203 | explicit Program() { m_handle = glCreateProgram(); }; 204 | Program(const Program&) = delete; 205 | 206 | Program(Program&& other) noexcept 207 | { 208 | m_handle = other.m_handle; 209 | other.m_handle = 0; 210 | } 211 | 212 | ~Program() { glDeleteProgram(m_handle); } 213 | 214 | [[nodiscard]] GLuint handle() const { return m_handle; } 215 | 216 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 217 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 218 | 219 | [[nodiscard]] std::string get_info_log() const 220 | { 221 | GLint log_length = 0; 222 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 223 | 224 | std::vector log(log_length); 225 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 226 | return {log.begin(), log.end()}; 227 | } 228 | 229 | void link() 230 | { 231 | GLint status; 232 | glLinkProgram(m_handle); 233 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 234 | if (!status) 235 | { 236 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 237 | } 238 | } 239 | 240 | void use() { glUseProgram(m_handle); } 241 | 242 | GLint get_uniform_location(const char* uniform_name) 243 | { 244 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 245 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 246 | return loc; 247 | } 248 | }; 249 | 250 | /// A RAII helper class for GL shader storage buffer. 251 | class ShaderStorageBuffer 252 | { 253 | private: 254 | GLuint m_handle = 0; 255 | size_t m_size = 0; 256 | 257 | public: 258 | explicit ShaderStorageBuffer(size_t initial_size = 0) 259 | { 260 | if (initial_size > 0) 261 | resize(initial_size, false); 262 | } 263 | 264 | explicit ShaderStorageBuffer(const void* data, size_t size) : 265 | m_size(size) 266 | { 267 | GLU_CHECK_ARGUMENT(data, ""); 268 | GLU_CHECK_ARGUMENT(size > 0, ""); 269 | 270 | glCreateBuffers(1, &m_handle); 271 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 272 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 273 | } 274 | 275 | template 276 | explicit ShaderStorageBuffer(const std::vector& data) : 277 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 278 | { 279 | } 280 | 281 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 282 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 283 | { 284 | m_handle = other.m_handle; 285 | m_size = other.m_size; 286 | other.m_handle = 0; 287 | } 288 | 289 | ~ShaderStorageBuffer() 290 | { 291 | if (m_handle) 292 | glDeleteBuffers(1, &m_handle); 293 | } 294 | 295 | [[nodiscard]] GLuint handle() const { return m_handle; } 296 | [[nodiscard]] size_t size() const { return m_size; } 297 | 298 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 
299 | void resize(size_t size, bool keep_data = false) 300 | { 301 | size_t old_size = m_size; 302 | GLuint old_handle = m_handle; 303 | 304 | if (old_size != size) 305 | { 306 | m_size = size; 307 | 308 | glCreateBuffers(1, &m_handle); 309 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 310 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 311 | 312 | if (keep_data) 313 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 314 | 315 | glDeleteBuffers(1, &old_handle); 316 | } 317 | } 318 | 319 | /// Clears the entire buffer with the given GLuint value (repeated). 320 | void clear(GLuint value) 321 | { 322 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 323 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 324 | } 325 | 326 | void write_data(const void* data, size_t size) 327 | { 328 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 329 | 330 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 331 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 332 | } 333 | 334 | template 335 | std::vector get_data() const 336 | { 337 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 338 | 339 | std::vector result(m_size / sizeof(T)); 340 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 341 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 342 | return result; 343 | } 344 | 345 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 346 | { 347 | if (size == 0) 348 | size = m_size; 349 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 350 | } 351 | }; 352 | 353 | /// Measures elapsed time on GPU for executing the given callback. 
354 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 355 | { 356 | GLuint query; 357 | uint64_t elapsed_time{}; 358 | 359 | glGenQueries(1, &query); 360 | glBeginQuery(GL_TIME_ELAPSED, query); 361 | 362 | callback(); 363 | 364 | glEndQuery(GL_TIME_ELAPSED); 365 | 366 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 367 | glDeleteQueries(1, &query); 368 | 369 | return elapsed_time; 370 | } 371 | 372 | template 373 | IntegerT log32_floor(IntegerT n) 374 | { 375 | return (IntegerT) floor(double(log2(n)) / 5.0); 376 | } 377 | 378 | template 379 | IntegerT log32_ceil(IntegerT n) 380 | { 381 | return (IntegerT) ceil(double(log2(n)) / 5.0); 382 | } 383 | 384 | template 385 | IntegerT div_ceil(IntegerT n, IntegerT d) 386 | { 387 | return (IntegerT) ceil(double(n) / double(d)); 388 | } 389 | 390 | template 391 | bool is_power_of_2(T n) 392 | { 393 | return (n & (n - 1)) == 0; 394 | } 395 | 396 | template 397 | IntegerT next_power_of_2(IntegerT n) 398 | { 399 | n--; 400 | n |= n >> 1; 401 | n |= n >> 2; 402 | n |= n >> 4; 403 | n |= n >> 8; 404 | n |= n >> 16; 405 | n++; 406 | return n; 407 | } 408 | 409 | template 410 | void print_stl_container(Iterator begin, Iterator end) 411 | { 412 | size_t i = 0; 413 | for (; begin != end; begin++) 414 | { 415 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 416 | i++; 417 | } 418 | printf("\n"); 419 | } 420 | 421 | template 422 | void print_buffer(const ShaderStorageBuffer& buffer) 423 | { 424 | std::vector data = buffer.get_data(); 425 | print_stl_container(data.begin(), data.end()); 426 | } 427 | 428 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 429 | { 430 | std::vector data = buffer.get_data(); 431 | for (size_t i = 0; i < data.size(); i++) 432 | printf("(%zu) %08x, ", i, data[i]); 433 | printf("\n"); 434 | } 435 | } // namespace glu 436 | 437 | #endif // GLU_GL_UTILS_HPP 438 | 439 | 440 | 441 | namespace glu 442 | { 443 | namespace detail 444 | { 445 | inline const char* k_reduction_shader_src = R"( 446 | #extension GL_KHR_shader_subgroup_arithmetic : require 447 | 448 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 449 | 450 | layout(std430, binding = 0) buffer Buffer 451 | { 452 | DATA_TYPE data[]; 453 | }; 454 | 455 | layout(location = 0) uniform uint u_count; 456 | layout(location = 1) uniform uint u_depth; 457 | 458 | void main() 459 | { 460 | uint step = 1 << (5 * u_depth); 461 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 462 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 463 | if (i < u_count) 464 | { 465 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 466 | if (gl_SubgroupInvocationID == 0) 467 | { 468 | data[i] = r; 469 | } 470 | } 471 | } 472 | )"; 473 | } 474 | 475 | /// The operators that can be used for the reduction operation. 476 | enum ReduceOperator 477 | { 478 | ReduceOperator_Sum = 0, 479 | ReduceOperator_Mul, 480 | ReduceOperator_Min, 481 | ReduceOperator_Max 482 | }; 483 | 484 | /// A class that implements the reduction operation. 
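///
/// The element type and operator are configurable; e.g. a float maximum, sketched below (illustrative; `float_buffer`
/// is assumed to be an SSBO of `count` GLfloat):
///
///     Reduce max_reduce(DataType_Float, ReduceOperator_Max);
///     max_reduce(float_buffer, count);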
485 | class Reduce 486 | { 487 | private: 488 | const DataType m_data_type; 489 | const ReduceOperator m_operator; 490 | const size_t m_num_threads; 491 | const size_t m_num_items; 492 | 493 | Program m_program; 494 | 495 | public: 496 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 497 | m_data_type(data_type), 498 | m_operator(operator_), 499 | m_num_threads(1024), 500 | m_num_items(4) 501 | { 502 | std::string shader_src = "#version 460\n\n"; 503 | 504 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 505 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 506 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 507 | 508 | if (m_operator == ReduceOperator_Sum) 509 | { 510 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 511 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 512 | } 513 | else if (m_operator == ReduceOperator_Mul) 514 | { 515 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 516 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 517 | } 518 | else if (m_operator == ReduceOperator_Min) 519 | { 520 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 521 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 522 | } 523 | else if (m_operator == ReduceOperator_Max) 524 | { 525 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 526 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 527 | } 528 | else 529 | { 530 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 531 | } 532 | 533 | shader_src += detail::k_reduction_shader_src; 534 | 535 | Shader shader(GL_COMPUTE_SHADER); 536 | shader.source_from_str(shader_src.c_str()); 537 | shader.compile(); 538 | 539 | m_program.attach_shader(shader); 540 | m_program.link(); 541 | } 542 | 543 | ~Reduce() = default; 544 | 545 | void operator()(GLuint buffer, size_t count) 546 | { 547 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 548 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 549 | 550 | m_program.use(); 551 | 552 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 553 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 554 | 555 | for (int depth = 0;; depth++) 556 | { 557 | int step = 1 << (5 * depth); 558 | if (step >= count) 559 | break; 560 | 561 | size_t level_count = count >> (5 * depth); 562 | 563 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 564 | 565 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 566 | glDispatchCompute(num_workgroups, 1, 1); 567 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 568 | } 569 | } 570 | }; 571 | } // namespace glu 572 | 573 | #endif // GLU_REDUCE_HPP 574 | 575 | 576 | #ifndef GLU_DATA_TYPES_HPP 577 | #define GLU_DATA_TYPES_HPP 578 | 579 | #ifndef GLU_ERRORS_HPP 580 | #define GLU_ERRORS_HPP 581 | 582 | #include 583 | #include 584 | 585 | // TODO mark if (!condition_) as unlikely 586 | #define GLU_CHECK_STATE(condition_, ...) \ 587 | { \ 588 | if (!(condition_)) \ 589 | { \ 590 | fprintf(stderr, __VA_ARGS__); \ 591 | exit(1); \ 592 | } \ 593 | } 594 | 595 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 596 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 597 | 598 | #endif 599 | 600 | 601 | 602 | namespace glu 603 | { 604 | enum DataType 605 | { 606 | DataType_Float = 0, 607 | DataType_Double, 608 | DataType_Int, 609 | DataType_Uint, 610 | DataType_Vec2, 611 | DataType_Vec4, 612 | DataType_DVec2, 613 | DataType_DVec4, 614 | DataType_UVec2, 615 | DataType_UVec4, 616 | DataType_IVec2, 617 | DataType_IVec4 618 | }; 619 | 620 | inline const char* to_glsl_type_str(DataType data_type) 621 | { 622 | // clang-format off 623 | if (data_type == DataType_Float) return "float"; 624 | else if (data_type == DataType_Double) return "double"; 625 | else if (data_type == DataType_Int) return "int"; 626 | else if (data_type == DataType_Uint) return "uint"; 627 | else if (data_type == DataType_Vec2) return "vec2"; 628 | else if (data_type == DataType_Vec4) return "vec4"; 629 | else if (data_type == DataType_DVec2) return "dvec2"; 630 | else if (data_type == DataType_DVec4) return "dvec4"; 631 | else if (data_type == DataType_UVec2) return "uvec2"; 632 | else if (data_type == DataType_UVec4) return "uvec4"; 633 | else if (data_type == DataType_IVec2) return "ivec2"; 634 | else if (data_type == DataType_IVec4) return "ivec4"; 635 | else 636 | { 637 | GLU_FAIL("Invalid data type: %d", data_type); 638 | } 639 | // clang-format on 640 | } 641 | 642 | } // namespace glu 643 | 644 | #endif // GLU_DATA_TYPES_HPP 645 | 646 | 647 | 648 | namespace glu 649 | { 650 | namespace detail 651 | { 652 | inline const char* k_upsweep_shader_src = R"( 653 | #extension GL_KHR_shader_subgroup_shuffle_relative : require 654 | 655 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 656 | 657 | layout(std430, binding = 0) buffer Buffer 658 | { 659 | DATA_TYPE data[]; 660 | }; 661 | 662 | layout(location = 0) uniform uint u_count; 663 | layout(location = 1) uniform uint u_step; 664 | 665 | void main() 666 | { 667 | uint partition_i = gl_WorkGroupID.y; 668 | uint thread_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 669 | uint i = partition_i * u_count + thread_i * u_step + u_step - 1; 670 | uint end_i = (partition_i + 1) * u_count; 671 | if (i < end_i) 672 | { 673 | DATA_TYPE lval = subgroupShuffleUp(data[i], 1); 674 | DATA_TYPE r = OPERATION(data[i], lval); 675 | if (i == end_i - 1) // Clear last 676 | { 677 | data[i] = IDENTITY; 678 | } 679 | else if (gl_SubgroupInvocationID % 2 == 1) 680 | { 681 | data[i] = r; 682 | } 683 | } 684 | } 685 | )"; 686 | 687 | inline const char* k_downsweep_shader_src = R"( 688 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 689 | 690 | layout(std430, binding = 0) buffer Buffer 691 | { 692 | DATA_TYPE data[]; 693 | }; 694 | 695 | layout(location = 0) uniform uint u_count; 696 | layout(location = 1) uniform uint u_step; 697 | 698 | void main() 699 | { 700 | uint partition_i = gl_WorkGroupID.y; 701 | uint i = partition_i * u_count + gl_GlobalInvocationID.x * (u_step << 1) + (u_step - 1); 702 | uint next_i = i + u_step; 703 | uint end_i = (partition_i + 1) * u_count; 704 | if (next_i < end_i) 705 | { 706 | DATA_TYPE tmp = data[i]; 707 | data[i] = data[next_i]; 708 | data[next_i] = data[next_i] + tmp; 709 | } 710 | else if (i < end_i) 711 | { 712 | data[i] = IDENTITY; 713 | } 714 | } 715 | )"; 716 | } // namespace detail 717 | 718 | /// A class that implements Blelloch scan algorithm (exclusive prefix sum). 
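///
/// The optional third argument scans several adjacent partitions independently in one call; sketched below
/// (illustrative; `buffer` is assumed to hold `num_partitions` back-to-back sub-arrays of `count` GLuint each):
///
///     BlellochScan scan(DataType_Uint);
///     scan(buffer, count, num_partitions); // one exclusive scan per partition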
719 | class BlellochScan 720 | { 721 | private: 722 | const DataType m_data_type; 723 | const size_t m_num_threads; 724 | const size_t m_num_items; 725 | 726 | Program m_upsweep_program; 727 | Program m_downsweep_program; 728 | 729 | public: 730 | explicit BlellochScan(DataType data_type) : 731 | m_data_type(data_type), 732 | m_num_threads(1024), 733 | m_num_items(4) 734 | { 735 | std::string shader_src = "#version 460\n\n"; 736 | 737 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 738 | shader_src += "#define OPERATION(a, b) (a + b)\n"; 739 | shader_src += "#define IDENTITY 0\n"; 740 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 741 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 742 | 743 | { // Upsweep program 744 | Shader upsweep_shader(GL_COMPUTE_SHADER); 745 | upsweep_shader.source_from_str((shader_src + detail::k_upsweep_shader_src).c_str()); 746 | upsweep_shader.compile(); 747 | 748 | m_upsweep_program.attach_shader(upsweep_shader); 749 | m_upsweep_program.link(); 750 | } 751 | 752 | { // Downsweep program 753 | Shader downsweep_program(GL_COMPUTE_SHADER); 754 | downsweep_program.source_from_str((shader_src + detail::k_downsweep_shader_src).c_str()); 755 | downsweep_program.compile(); 756 | 757 | m_downsweep_program.attach_shader(downsweep_program); 758 | m_downsweep_program.link(); 759 | } 760 | } 761 | 762 | ~BlellochScan() = default; 763 | 764 | /// Runs Blelloch exclusive scan on multiple partitions. 765 | /// 766 | /// @param buffer the input GLuint buffer 767 | /// @param count the number of GLuint in the buffer (must be a power of 2) 768 | /// @param num_partitions the number of partitions (must be adjacent) 769 | void operator()(GLuint buffer, size_t count, size_t num_partitions = 1) 770 | { 771 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 772 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 773 | GLU_CHECK_ARGUMENT(is_power_of_2(count), "Count must be a power of 2"); // TODO Remove this requirement 774 | GLU_CHECK_ARGUMENT(num_partitions >= 1, "Num of partitions must be >= 1"); 775 | 776 | upsweep(buffer, count, num_partitions); // Also clear last 777 | downsweep(buffer, count, num_partitions); 778 | } 779 | 780 | private: 781 | void upsweep(GLuint buffer, size_t count, size_t num_partitions) // Also clear last 782 | { 783 | m_upsweep_program.use(); 784 | 785 | glUniform1ui(m_upsweep_program.get_uniform_location("u_count"), count); 786 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 787 | 788 | int step = 1; 789 | int level_count = (int) count; 790 | while (true) 791 | { 792 | glUniform1ui(m_upsweep_program.get_uniform_location("u_step"), step); 793 | 794 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 795 | glDispatchCompute(num_workgroups, num_partitions, 1); 796 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 797 | 798 | step <<= 1; 799 | 800 | level_count >>= 1; 801 | 802 | if (level_count <= 1) 803 | break; 804 | } 805 | } 806 | 807 | void downsweep(GLuint buffer, size_t count, size_t num_partitions) 808 | { 809 | m_downsweep_program.use(); 810 | 811 | glUniform1ui(m_downsweep_program.get_uniform_location("u_count"), count); 812 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 813 | 814 | int step = next_power_of_2(int(count)) >> 1; 815 | size_t level_count = 1; 816 | while (true) 817 | { 818 | glUniform1ui(m_downsweep_program.get_uniform_location("u_step"), step); 819 | 820 | size_t 
num_workgroups = div_ceil(level_count, m_num_threads); 821 | glDispatchCompute(num_workgroups, num_partitions, 1); 822 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 823 | 824 | step >>= 1; 825 | level_count <<= 1; 826 | if (step == 0) 827 | break; 828 | } 829 | } 830 | }; 831 | } // namespace glu 832 | 833 | #endif // GLU_BLELLOCHSCAN_HPP 834 | 835 | 836 | #ifndef GLU_GL_UTILS_HPP 837 | #define GLU_GL_UTILS_HPP 838 | 839 | #include 840 | #include 841 | #include 842 | #include 843 | 844 | #ifndef GLU_ERRORS_HPP 845 | #define GLU_ERRORS_HPP 846 | 847 | #include 848 | #include 849 | 850 | // TODO mark if (!condition_) as unlikely 851 | #define GLU_CHECK_STATE(condition_, ...) \ 852 | { \ 853 | if (!(condition_)) \ 854 | { \ 855 | fprintf(stderr, __VA_ARGS__); \ 856 | exit(1); \ 857 | } \ 858 | } 859 | 860 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 861 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 862 | 863 | #endif 864 | 865 | 866 | 867 | namespace glu 868 | { 869 | inline void 870 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 871 | { 872 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 873 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 874 | 875 | glCopyBufferSubData( 876 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 877 | ); 878 | } 879 | 880 | /// A RAII wrapper for GL shader. 881 | class Shader 882 | { 883 | private: 884 | GLuint m_handle; 885 | 886 | public: 887 | explicit Shader(GLenum type) : 888 | m_handle(glCreateShader(type)){}; 889 | Shader(const Shader&) = delete; 890 | 891 | Shader(Shader&& other) noexcept 892 | { 893 | m_handle = other.m_handle; 894 | other.m_handle = 0; 895 | } 896 | 897 | ~Shader() { glDeleteShader(m_handle); } 898 | 899 | [[nodiscard]] GLuint handle() const { return m_handle; } 900 | 901 | void source_from_str(const std::string& src_str) 902 | { 903 | const char* src_ptr = src_str.c_str(); 904 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 905 | } 906 | 907 | void source_from_file(const char* src_filepath) 908 | { 909 | FILE* file = fopen(src_filepath, "rt"); 910 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 911 | 912 | fseek(file, 0, SEEK_END); 913 | size_t file_size = ftell(file); 914 | fseek(file, 0, SEEK_SET); 915 | 916 | std::string src{}; 917 | src.resize(file_size); 918 | fread(src.data(), sizeof(char), file_size, file); 919 | source_from_str(src.c_str()); 920 | 921 | fclose(file); 922 | } 923 | 924 | std::string get_info_log() 925 | { 926 | GLint log_length = 0; 927 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 928 | 929 | std::vector log(log_length); 930 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 931 | return {log.begin(), log.end()}; 932 | } 933 | 934 | void compile() 935 | { 936 | glCompileShader(m_handle); 937 | 938 | GLint status; 939 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 940 | if (!status) 941 | { 942 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 943 | } 944 | } 945 | }; 946 | 947 | /// A RAII wrapper for GL program. 
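/// Typical pattern for building a compute program with these wrappers (a minimal sketch;
/// `shader_src` stands for any complete compute-shader source string):
///
///     Shader shader(GL_COMPUTE_SHADER);
///     shader.source_from_str(shader_src);
///     shader.compile();                // fatal on failure, prints the info log
///
///     Program program;
///     program.attach_shader(shader);
///     program.link();                  // fatal on failure, prints the info log
///     program.use();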
948 | class Program 949 | { 950 | private: 951 | GLuint m_handle; 952 | 953 | public: 954 | explicit Program() { m_handle = glCreateProgram(); }; 955 | Program(const Program&) = delete; 956 | 957 | Program(Program&& other) noexcept 958 | { 959 | m_handle = other.m_handle; 960 | other.m_handle = 0; 961 | } 962 | 963 | ~Program() { glDeleteProgram(m_handle); } 964 | 965 | [[nodiscard]] GLuint handle() const { return m_handle; } 966 | 967 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 968 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 969 | 970 | [[nodiscard]] std::string get_info_log() const 971 | { 972 | GLint log_length = 0; 973 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 974 | 975 | std::vector log(log_length); 976 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 977 | return {log.begin(), log.end()}; 978 | } 979 | 980 | void link() 981 | { 982 | GLint status; 983 | glLinkProgram(m_handle); 984 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 985 | if (!status) 986 | { 987 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 988 | } 989 | } 990 | 991 | void use() { glUseProgram(m_handle); } 992 | 993 | GLint get_uniform_location(const char* uniform_name) 994 | { 995 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 996 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 997 | return loc; 998 | } 999 | }; 1000 | 1001 | /// A RAII helper class for GL shader storage buffer. 1002 | class ShaderStorageBuffer 1003 | { 1004 | private: 1005 | GLuint m_handle = 0; 1006 | size_t m_size = 0; 1007 | 1008 | public: 1009 | explicit ShaderStorageBuffer(size_t initial_size = 0) 1010 | { 1011 | if (initial_size > 0) 1012 | resize(initial_size, false); 1013 | } 1014 | 1015 | explicit ShaderStorageBuffer(const void* data, size_t size) : 1016 | m_size(size) 1017 | { 1018 | GLU_CHECK_ARGUMENT(data, ""); 1019 | GLU_CHECK_ARGUMENT(size > 0, ""); 1020 | 1021 | glCreateBuffers(1, &m_handle); 1022 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1023 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 1024 | } 1025 | 1026 | template 1027 | explicit ShaderStorageBuffer(const std::vector& data) : 1028 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 1029 | { 1030 | } 1031 | 1032 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 1033 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 1034 | { 1035 | m_handle = other.m_handle; 1036 | m_size = other.m_size; 1037 | other.m_handle = 0; 1038 | } 1039 | 1040 | ~ShaderStorageBuffer() 1041 | { 1042 | if (m_handle) 1043 | glDeleteBuffers(1, &m_handle); 1044 | } 1045 | 1046 | [[nodiscard]] GLuint handle() const { return m_handle; } 1047 | [[nodiscard]] size_t size() const { return m_size; } 1048 | 1049 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 
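/// Note: the storage is allocated with glBufferStorage (immutable storage), so resizing creates a
/// brand-new buffer object and deletes the old one; handle() changes and any previous bindings
/// must be re-established after a resize.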
1050 | void resize(size_t size, bool keep_data = false) 1051 | { 1052 | size_t old_size = m_size; 1053 | GLuint old_handle = m_handle; 1054 | 1055 | if (old_size != size) 1056 | { 1057 | m_size = size; 1058 | 1059 | glCreateBuffers(1, &m_handle); 1060 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1061 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 1062 | 1063 | if (keep_data) 1064 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 1065 | 1066 | glDeleteBuffers(1, &old_handle); 1067 | } 1068 | } 1069 | 1070 | /// Clears the entire buffer with the given GLuint value (repeated). 1071 | void clear(GLuint value) 1072 | { 1073 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1074 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 1075 | } 1076 | 1077 | void write_data(const void* data, size_t size) 1078 | { 1079 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 1080 | 1081 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1082 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 1083 | } 1084 | 1085 | template 1086 | std::vector get_data() const 1087 | { 1088 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 1089 | 1090 | std::vector result(m_size / sizeof(T)); 1091 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1092 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 1093 | return result; 1094 | } 1095 | 1096 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 1097 | { 1098 | if (size == 0) 1099 | size = m_size; 1100 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 1101 | } 1102 | }; 1103 | 1104 | /// Measures elapsed time on GPU for executing the given callback. 
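/// The elapsed time is returned in nanoseconds (GL_TIME_ELAPSED), and reading the query result
/// blocks until the enclosed GL commands have finished. A minimal sketch (`radix_sort`, `keys`,
/// `vals` and `count` are placeholders for your own objects):
///
///     uint64_t ns = measure_gl_elapsed_time([&] { radix_sort(keys, vals, count); });
///     printf("RadixSort took %.3f ms\n", ns / 1e6);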
1105 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 1106 | { 1107 | GLuint query; 1108 | uint64_t elapsed_time{}; 1109 | 1110 | glGenQueries(1, &query); 1111 | glBeginQuery(GL_TIME_ELAPSED, query); 1112 | 1113 | callback(); 1114 | 1115 | glEndQuery(GL_TIME_ELAPSED); 1116 | 1117 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 1118 | glDeleteQueries(1, &query); 1119 | 1120 | return elapsed_time; 1121 | } 1122 | 1123 | template 1124 | IntegerT log32_floor(IntegerT n) 1125 | { 1126 | return (IntegerT) floor(double(log2(n)) / 5.0); 1127 | } 1128 | 1129 | template 1130 | IntegerT log32_ceil(IntegerT n) 1131 | { 1132 | return (IntegerT) ceil(double(log2(n)) / 5.0); 1133 | } 1134 | 1135 | template 1136 | IntegerT div_ceil(IntegerT n, IntegerT d) 1137 | { 1138 | return (IntegerT) ceil(double(n) / double(d)); 1139 | } 1140 | 1141 | template 1142 | bool is_power_of_2(T n) 1143 | { 1144 | return (n & (n - 1)) == 0; 1145 | } 1146 | 1147 | template 1148 | IntegerT next_power_of_2(IntegerT n) 1149 | { 1150 | n--; 1151 | n |= n >> 1; 1152 | n |= n >> 2; 1153 | n |= n >> 4; 1154 | n |= n >> 8; 1155 | n |= n >> 16; 1156 | n++; 1157 | return n; 1158 | } 1159 | 1160 | template 1161 | void print_stl_container(Iterator begin, Iterator end) 1162 | { 1163 | size_t i = 0; 1164 | for (; begin != end; begin++) 1165 | { 1166 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 1167 | i++; 1168 | } 1169 | printf("\n"); 1170 | } 1171 | 1172 | template 1173 | void print_buffer(const ShaderStorageBuffer& buffer) 1174 | { 1175 | std::vector data = buffer.get_data(); 1176 | print_stl_container(data.begin(), data.end()); 1177 | } 1178 | 1179 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 1180 | { 1181 | std::vector data = buffer.get_data(); 1182 | for (size_t i = 0; i < data.size(); i++) 1183 | printf("(%zu) %08x, ", i, data[i]); 1184 | printf("\n"); 1185 | } 1186 | } // namespace glu 1187 | 1188 | #endif // GLU_GL_UTILS_HPP 1189 | 1190 | 1191 | 1192 | namespace glu 1193 | { 1194 | namespace detail 1195 | { 1196 | inline const char* k_radix_sort_counting_shader = R"( 1197 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 1198 | 1199 | layout(std430, binding = 0) readonly buffer KeyBuffer 1200 | { 1201 | uint b_key_buffer[]; 1202 | }; 1203 | 1204 | layout(std430, binding = 1) buffer BlockCountBuffer 1205 | { 1206 | uint b_block_count_buffer[]; // 16 * NUM_THREADS 1207 | }; 1208 | 1209 | layout(std430, binding = 2) buffer GlobalCountBuffer 1210 | { 1211 | uint b_global_count_buffer[]; 1212 | }; 1213 | 1214 | layout(location = 0) uniform uint u_count; 1215 | layout(location = 1) uniform uint u_radix_shift; 1216 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 1217 | 1218 | void main() 1219 | { 1220 | for (uint radix = 0; radix < 16; radix++) 1221 | { 1222 | b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] = 0; 1223 | } 1224 | 1225 | barrier(); 1226 | 1227 | uint i = gl_GlobalInvocationID.x; 1228 | if (i < u_count) 1229 | { 1230 | // Block-wide count on shared memory 1231 | uint radix = (b_key_buffer[i] >> u_radix_shift) & 0xf; 1232 | atomicAdd(b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x], 1); 1233 | } 1234 | 1235 | barrier(); 1236 | 1237 | if (gl_LocalInvocationIndex < 16) 1238 | { 1239 | uint block_count = b_block_count_buffer[gl_LocalInvocationIndex * u_num_blocks_power_of_2 + gl_WorkGroupID.x]; 1240 | atomicAdd(b_global_count_buffer[gl_LocalInvocationIndex], 
block_count); 1241 | } 1242 | } 1243 | )"; 1244 | 1245 | inline const char* k_radix_sort_reordering_shader = R"( 1246 | #extension GL_KHR_shader_subgroup_arithmetic : require 1247 | 1248 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 1249 | 1250 | layout(std430, binding = 0) readonly buffer SrcKeyBuffer 1251 | { 1252 | uint b_src_key_buffer[]; 1253 | }; 1254 | 1255 | layout(std430, binding = 1) readonly buffer SrcValBuffer 1256 | { 1257 | uint b_src_val_buffer[]; 1258 | }; 1259 | 1260 | layout(std430, binding = 2) writeonly buffer DstKeyBuffer 1261 | { 1262 | uint b_dst_key_buffer[]; 1263 | }; 1264 | 1265 | layout(std430, binding = 3) writeonly buffer DstValBuffer 1266 | { 1267 | uint b_dst_val_buffer[]; 1268 | }; 1269 | 1270 | layout(std430, binding = 4) readonly buffer BlockOffsetBuffer 1271 | { 1272 | uint b_block_offset_buffer[]; 1273 | }; 1274 | 1275 | layout(std430, binding = 5) readonly buffer GlobalCountBuffer 1276 | { 1277 | uint b_global_count_buffer[]; 1278 | }; 1279 | 1280 | layout(location = 0) uniform uint u_count; 1281 | layout(location = 1) uniform uint u_radix_shift; 1282 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 1283 | 1284 | shared uint s_global_offset_buffer[16]; 1285 | shared uint s_prefix_sum_buffer[NUM_THREADS]; 1286 | 1287 | void prefix_sum() // Block-wide prefix sum (Blelloch scan) 1288 | { 1289 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 1290 | 1291 | // Upsweep 1292 | for (uint step = 1; step < NUM_THREADS; step <<= 1) 1293 | { 1294 | if (thread_i % 2 == 1) 1295 | { 1296 | uint i = thread_i * step + (step - 1); 1297 | if (i < NUM_THREADS) 1298 | { 1299 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i] + s_prefix_sum_buffer[i - step]; 1300 | } 1301 | } 1302 | 1303 | barrier(); 1304 | } 1305 | 1306 | // Clear last 1307 | if (thread_i == NUM_THREADS - 1) s_prefix_sum_buffer[thread_i] = 0; 1308 | 1309 | barrier(); 1310 | 1311 | // Downsweep 1312 | uint step = NUM_THREADS >> 1; 1313 | for (; step > 0; step >>= 1) 1314 | { 1315 | uint i = thread_i * step + (step - 1); 1316 | if (i + step < NUM_THREADS && thread_i % 2 == 0) 1317 | { 1318 | uint tmp = s_prefix_sum_buffer[i]; 1319 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i + step]; 1320 | s_prefix_sum_buffer[i + step] = tmp + s_prefix_sum_buffer[i + step]; 1321 | } 1322 | 1323 | barrier(); 1324 | } 1325 | } 1326 | 1327 | void main() 1328 | { 1329 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 1330 | uint i = gl_WorkGroupID.x * NUM_THREADS + thread_i; 1331 | 1332 | // Prefix sum on global counts to obtain global offsets 1333 | if (gl_SubgroupID == 0 && gl_SubgroupInvocationID < 16) 1334 | { 1335 | uint v = subgroupExclusiveAdd(b_global_count_buffer[gl_SubgroupInvocationID]); 1336 | s_global_offset_buffer[gl_SubgroupInvocationID] = v; 1337 | } 1338 | 1339 | barrier(); 1340 | 1341 | // Reordering 1342 | for (uint radix = 0; radix < 16; radix++) 1343 | { 1344 | bool should_place = false; 1345 | if (i < u_count) 1346 | { 1347 | should_place = ((b_src_key_buffer[i] >> u_radix_shift) & 0xf) == radix; 1348 | } 1349 | 1350 | s_prefix_sum_buffer[thread_i] = should_place ? 
1 : 0; 1351 | 1352 | barrier(); 1353 | 1354 | // Prefix sum on local counts to obtain local offsets 1355 | prefix_sum(); 1356 | 1357 | if (should_place) 1358 | { 1359 | uint di = 1360 | s_global_offset_buffer[radix] + 1361 | b_block_offset_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] + 1362 | s_prefix_sum_buffer[thread_i]; 1363 | b_dst_key_buffer[di] = b_src_key_buffer[i]; 1364 | b_dst_val_buffer[di] = b_src_val_buffer[i]; 1365 | } 1366 | } 1367 | } 1368 | )"; 1369 | } // namespace detail 1370 | 1371 | class RadixSort 1372 | { 1373 | private: 1374 | Program m_count_program; 1375 | BlellochScan m_blelloch_scan; 1376 | Program m_reorder_program; 1377 | 1378 | /// A GLuint buffer of size 16 * NUM_THREADS that stores the counts of radixes per block. 1379 | ShaderStorageBuffer m_block_count_buffer; 1380 | 1381 | /// A GLuint buffer of size 16 that stores the global counts of radixes. 1382 | ShaderStorageBuffer m_global_count_buffer; 1383 | 1384 | ShaderStorageBuffer m_key_scratch_buffer; 1385 | ShaderStorageBuffer m_val_scratch_buffer; 1386 | 1387 | const size_t m_num_threads; 1388 | 1389 | public: 1390 | explicit RadixSort() : 1391 | m_blelloch_scan(DataType_Uint), 1392 | m_num_threads(1024) 1393 | { 1394 | GLU_CHECK_ARGUMENT(is_power_of_2(m_num_threads), "Num threads must be a power of 2"); 1395 | 1396 | m_global_count_buffer.resize(16 * sizeof(GLuint)); 1397 | 1398 | std::string shader_src = "#version 460\n\n"; 1399 | shader_src += "#define NUM_THREADS " + std::to_string(m_num_threads) + "\n"; 1400 | 1401 | { // Counting program 1402 | Shader shader(GL_COMPUTE_SHADER); 1403 | shader.source_from_str(shader_src + detail::k_radix_sort_counting_shader); 1404 | shader.compile(); 1405 | 1406 | m_count_program.attach_shader(shader.handle()); 1407 | m_count_program.link(); 1408 | } 1409 | 1410 | { // Reordering program 1411 | Shader shader(GL_COMPUTE_SHADER); 1412 | shader.source_from_str(shader_src + detail::k_radix_sort_reordering_shader); 1413 | shader.compile(); 1414 | 1415 | m_reorder_program.attach_shader(shader.handle()); 1416 | m_reorder_program.link(); 1417 | } 1418 | } 1419 | 1420 | ~RadixSort() = default; 1421 | 1422 | void prepare_internal_buffers(size_t count) 1423 | { 1424 | { // Prepare block count buffer 1425 | size_t required_size = required_block_count_buffer_size(count); 1426 | if (m_block_count_buffer.size() < required_size) 1427 | { 1428 | m_block_count_buffer.resize(required_size, false); 1429 | #ifdef GLU_VERBOSE // TODO Create a log utility 1430 | printf("[RadixSort] Block count buffer reallocated to: %zu\n", required_size); 1431 | #endif 1432 | } 1433 | } 1434 | 1435 | { // Prepare key scratch buffer 1436 | size_t required_size = required_key_scratch_buffer_size(count); 1437 | if (m_key_scratch_buffer.size() < required_size) 1438 | { 1439 | m_key_scratch_buffer.resize(required_size, false); 1440 | #ifdef GLU_VERBOSE 1441 | printf("[RadixSort] Key scratch buffer reallocated to: %zu\n", required_size); 1442 | #endif 1443 | } 1444 | } 1445 | 1446 | { // Prepare val scratch buffer 1447 | size_t required_size = required_val_scratch_buffer_size(count); 1448 | if (m_val_scratch_buffer.size() < required_size) 1449 | { 1450 | m_val_scratch_buffer.resize(required_size, false); 1451 | #ifdef GLU_VERBOSE 1452 | printf("[RadixSort] Val scratch buffer reallocated to: %zu\n", required_size); 1453 | #endif 1454 | } 1455 | } 1456 | } 1457 | 1458 | void operator()(GLuint key_buffer, GLuint val_buffer, size_t count, size_t num_steps = 0) 1459 | { 1460 | 
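// Sorts `count` (key, value) pairs by key, 4 bits per pass starting from the least-significant
// digit. With num_steps == 0 (the default) all 8 passes run, which fully sorts 32-bit keys; a
// smaller num_steps only orders the low 4 * num_steps bits. Each pass ping-pongs between the
// caller's buffers and the internal scratch buffers, so after an odd number of passes the most
// recent output resides in the scratch buffers rather than in key_buffer/val_buffer.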
GLU_CHECK_ARGUMENT(key_buffer, "Invalid key buffer"); 1461 | GLU_CHECK_ARGUMENT(val_buffer, "Invalid value buffer"); 1462 | 1463 | if (count <= 1) 1464 | return; // Hey, that's already sorted x) 1465 | 1466 | prepare_internal_buffers(count); 1467 | 1468 | size_t num_blocks = div_ceil(count, size_t(1024)); 1469 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 1470 | 1471 | GLuint key_buffers[]{key_buffer, m_key_scratch_buffer.handle()}; 1472 | GLuint val_buffers[]{val_buffer, m_val_scratch_buffer.handle()}; 1473 | 1474 | for (int step = 0; step < 8;) 1475 | { 1476 | // ---------------------------------------------------------------- Counting 1477 | 1478 | m_block_count_buffer.clear(0); 1479 | m_global_count_buffer.clear(0); 1480 | 1481 | m_count_program.use(); 1482 | 1483 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 1484 | m_block_count_buffer.bind(1); 1485 | m_global_count_buffer.bind(2); 1486 | 1487 | glUniform1ui(m_count_program.get_uniform_location("u_count"), count); 1488 | glUniform1ui(m_count_program.get_uniform_location("u_radix_shift"), step << 2); 1489 | glUniform1ui(m_count_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 1490 | 1491 | glDispatchCompute(num_blocks, 1, 1); 1492 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 1493 | 1494 | // ---------------------------------------------------------------- Prefix sum 1495 | 1496 | m_blelloch_scan(m_block_count_buffer.handle(), num_blocks_power_of_2, 16); 1497 | 1498 | // ---------------------------------------------------------------- Reordering 1499 | 1500 | m_reorder_program.use(); 1501 | 1502 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 1503 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, val_buffers[step % 2]); 1504 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, key_buffers[(step + 1) % 2]); 1505 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, val_buffers[(step + 1) % 2]); 1506 | m_block_count_buffer.bind(4); 1507 | m_global_count_buffer.bind(5); 1508 | 1509 | glUniform1ui(m_reorder_program.get_uniform_location("u_count"), count); 1510 | glUniform1ui(m_reorder_program.get_uniform_location("u_radix_shift"), step << 2); 1511 | glUniform1ui(m_reorder_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 1512 | 1513 | glDispatchCompute(num_blocks, 1, 1); 1514 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 1515 | 1516 | ++step; 1517 | if (step == num_steps || step == 8) break; 1518 | } 1519 | } 1520 | 1521 | private: 1522 | [[nodiscard]] static size_t required_block_count_buffer_size(size_t count) 1523 | { 1524 | size_t num_blocks = div_ceil(count, size_t(1024)); 1525 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 1526 | 1527 | return next_power_of_2(16 * num_blocks_power_of_2) * sizeof(GLuint); 1528 | } 1529 | 1530 | [[nodiscard]] static size_t required_key_scratch_buffer_size(size_t count) 1531 | { 1532 | return next_power_of_2(count) * sizeof(GLuint); 1533 | } 1534 | 1535 | [[nodiscard]] static size_t required_val_scratch_buffer_size(size_t count) 1536 | { 1537 | return next_power_of_2(count) * sizeof(GLuint); 1538 | } 1539 | }; 1540 | } // namespace glu 1541 | 1542 | #endif // GLU_RADIXSORT_HPP 1543 | -------------------------------------------------------------------------------- /dist/Reduce.hpp: -------------------------------------------------------------------------------- 1 | // This code was automatically generated; you're 
not supposed to edit it! 2 | 3 | #ifndef GLU_REDUCE_HPP 4 | #define GLU_REDUCE_HPP 5 | 6 | #ifndef GLU_DATA_TYPES_HPP 7 | #define GLU_DATA_TYPES_HPP 8 | 9 | #ifndef GLU_ERRORS_HPP 10 | #define GLU_ERRORS_HPP 11 | 12 | #include 13 | #include 14 | 15 | // TODO mark if (!condition_) as unlikely 16 | #define GLU_CHECK_STATE(condition_, ...) \ 17 | { \ 18 | if (!(condition_)) \ 19 | { \ 20 | fprintf(stderr, __VA_ARGS__); \ 21 | exit(1); \ 22 | } \ 23 | } 24 | 25 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 26 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 27 | 28 | #endif 29 | 30 | 31 | 32 | namespace glu 33 | { 34 | enum DataType 35 | { 36 | DataType_Float = 0, 37 | DataType_Double, 38 | DataType_Int, 39 | DataType_Uint, 40 | DataType_Vec2, 41 | DataType_Vec4, 42 | DataType_DVec2, 43 | DataType_DVec4, 44 | DataType_UVec2, 45 | DataType_UVec4, 46 | DataType_IVec2, 47 | DataType_IVec4 48 | }; 49 | 50 | inline const char* to_glsl_type_str(DataType data_type) 51 | { 52 | // clang-format off 53 | if (data_type == DataType_Float) return "float"; 54 | else if (data_type == DataType_Double) return "double"; 55 | else if (data_type == DataType_Int) return "int"; 56 | else if (data_type == DataType_Uint) return "uint"; 57 | else if (data_type == DataType_Vec2) return "vec2"; 58 | else if (data_type == DataType_Vec4) return "vec4"; 59 | else if (data_type == DataType_DVec2) return "dvec2"; 60 | else if (data_type == DataType_DVec4) return "dvec4"; 61 | else if (data_type == DataType_UVec2) return "uvec2"; 62 | else if (data_type == DataType_UVec4) return "uvec4"; 63 | else if (data_type == DataType_IVec2) return "ivec2"; 64 | else if (data_type == DataType_IVec4) return "ivec4"; 65 | else 66 | { 67 | GLU_FAIL("Invalid data type: %d", data_type); 68 | } 69 | // clang-format on 70 | } 71 | 72 | } // namespace glu 73 | 74 | #endif // GLU_DATA_TYPES_HPP 75 | 76 | 77 | #ifndef GLU_GL_UTILS_HPP 78 | #define GLU_GL_UTILS_HPP 79 | 80 | #include 81 | #include 82 | #include 83 | #include 84 | 85 | #ifndef GLU_ERRORS_HPP 86 | #define GLU_ERRORS_HPP 87 | 88 | #include 89 | #include 90 | 91 | // TODO mark if (!condition_) as unlikely 92 | #define GLU_CHECK_STATE(condition_, ...) \ 93 | { \ 94 | if (!(condition_)) \ 95 | { \ 96 | fprintf(stderr, __VA_ARGS__); \ 97 | exit(1); \ 98 | } \ 99 | } 100 | 101 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 102 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 103 | 104 | #endif 105 | 106 | 107 | 108 | namespace glu 109 | { 110 | inline void 111 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 112 | { 113 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 114 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 115 | 116 | glCopyBufferSubData( 117 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 118 | ); 119 | } 120 | 121 | /// A RAII wrapper for GL shader. 
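/// Note that failures are fatal: compile() checks GL_COMPILE_STATUS and, if compilation failed,
/// prints the shader info log to stderr and terminates the process via GLU_CHECK_STATE.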
122 | class Shader 123 | { 124 | private: 125 | GLuint m_handle; 126 | 127 | public: 128 | explicit Shader(GLenum type) : 129 | m_handle(glCreateShader(type)){}; 130 | Shader(const Shader&) = delete; 131 | 132 | Shader(Shader&& other) noexcept 133 | { 134 | m_handle = other.m_handle; 135 | other.m_handle = 0; 136 | } 137 | 138 | ~Shader() { glDeleteShader(m_handle); } 139 | 140 | [[nodiscard]] GLuint handle() const { return m_handle; } 141 | 142 | void source_from_str(const std::string& src_str) 143 | { 144 | const char* src_ptr = src_str.c_str(); 145 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 146 | } 147 | 148 | void source_from_file(const char* src_filepath) 149 | { 150 | FILE* file = fopen(src_filepath, "rt"); 151 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 152 | 153 | fseek(file, 0, SEEK_END); 154 | size_t file_size = ftell(file); 155 | fseek(file, 0, SEEK_SET); 156 | 157 | std::string src{}; 158 | src.resize(file_size); 159 | fread(src.data(), sizeof(char), file_size, file); 160 | source_from_str(src.c_str()); 161 | 162 | fclose(file); 163 | } 164 | 165 | std::string get_info_log() 166 | { 167 | GLint log_length = 0; 168 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 169 | 170 | std::vector log(log_length); 171 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 172 | return {log.begin(), log.end()}; 173 | } 174 | 175 | void compile() 176 | { 177 | glCompileShader(m_handle); 178 | 179 | GLint status; 180 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 181 | if (!status) 182 | { 183 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 184 | } 185 | } 186 | }; 187 | 188 | /// A RAII wrapper for GL program. 189 | class Program 190 | { 191 | private: 192 | GLuint m_handle; 193 | 194 | public: 195 | explicit Program() { m_handle = glCreateProgram(); }; 196 | Program(const Program&) = delete; 197 | 198 | Program(Program&& other) noexcept 199 | { 200 | m_handle = other.m_handle; 201 | other.m_handle = 0; 202 | } 203 | 204 | ~Program() { glDeleteProgram(m_handle); } 205 | 206 | [[nodiscard]] GLuint handle() const { return m_handle; } 207 | 208 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 209 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 210 | 211 | [[nodiscard]] std::string get_info_log() const 212 | { 213 | GLint log_length = 0; 214 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 215 | 216 | std::vector log(log_length); 217 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 218 | return {log.begin(), log.end()}; 219 | } 220 | 221 | void link() 222 | { 223 | GLint status; 224 | glLinkProgram(m_handle); 225 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 226 | if (!status) 227 | { 228 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 229 | } 230 | } 231 | 232 | void use() { glUseProgram(m_handle); } 233 | 234 | GLint get_uniform_location(const char* uniform_name) 235 | { 236 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 237 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 238 | return loc; 239 | } 240 | }; 241 | 242 | /// A RAII helper class for GL shader storage buffer. 
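/// A minimal usage sketch (the contents are illustrative):
///
///     std::vector<GLuint> values = {1, 2, 3, 4};
///     ShaderStorageBuffer buffer(values);              // immutable storage, GL_DYNAMIC_STORAGE_BIT
///     buffer.bind(0);                                  // bind the whole buffer to SSBO binding 0
///     std::vector<GLuint> readback = buffer.get_data<GLuint>();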
243 | class ShaderStorageBuffer 244 | { 245 | private: 246 | GLuint m_handle = 0; 247 | size_t m_size = 0; 248 | 249 | public: 250 | explicit ShaderStorageBuffer(size_t initial_size = 0) 251 | { 252 | if (initial_size > 0) 253 | resize(initial_size, false); 254 | } 255 | 256 | explicit ShaderStorageBuffer(const void* data, size_t size) : 257 | m_size(size) 258 | { 259 | GLU_CHECK_ARGUMENT(data, ""); 260 | GLU_CHECK_ARGUMENT(size > 0, ""); 261 | 262 | glCreateBuffers(1, &m_handle); 263 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 264 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 265 | } 266 | 267 | template 268 | explicit ShaderStorageBuffer(const std::vector& data) : 269 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 270 | { 271 | } 272 | 273 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 274 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 275 | { 276 | m_handle = other.m_handle; 277 | m_size = other.m_size; 278 | other.m_handle = 0; 279 | } 280 | 281 | ~ShaderStorageBuffer() 282 | { 283 | if (m_handle) 284 | glDeleteBuffers(1, &m_handle); 285 | } 286 | 287 | [[nodiscard]] GLuint handle() const { return m_handle; } 288 | [[nodiscard]] size_t size() const { return m_size; } 289 | 290 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 291 | void resize(size_t size, bool keep_data = false) 292 | { 293 | size_t old_size = m_size; 294 | GLuint old_handle = m_handle; 295 | 296 | if (old_size != size) 297 | { 298 | m_size = size; 299 | 300 | glCreateBuffers(1, &m_handle); 301 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 302 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 303 | 304 | if (keep_data) 305 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 306 | 307 | glDeleteBuffers(1, &old_handle); 308 | } 309 | } 310 | 311 | /// Clears the entire buffer with the given GLuint value (repeated). 312 | void clear(GLuint value) 313 | { 314 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 315 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 316 | } 317 | 318 | void write_data(const void* data, size_t size) 319 | { 320 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 321 | 322 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 323 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 324 | } 325 | 326 | template 327 | std::vector get_data() const 328 | { 329 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 330 | 331 | std::vector result(m_size / sizeof(T)); 332 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 333 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 334 | return result; 335 | } 336 | 337 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 338 | { 339 | if (size == 0) 340 | size = m_size; 341 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 342 | } 343 | }; 344 | 345 | /// Measures elapsed time on GPU for executing the given callback. 
346 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 347 | { 348 | GLuint query; 349 | uint64_t elapsed_time{}; 350 | 351 | glGenQueries(1, &query); 352 | glBeginQuery(GL_TIME_ELAPSED, query); 353 | 354 | callback(); 355 | 356 | glEndQuery(GL_TIME_ELAPSED); 357 | 358 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 359 | glDeleteQueries(1, &query); 360 | 361 | return elapsed_time; 362 | } 363 | 364 | template 365 | IntegerT log32_floor(IntegerT n) 366 | { 367 | return (IntegerT) floor(double(log2(n)) / 5.0); 368 | } 369 | 370 | template 371 | IntegerT log32_ceil(IntegerT n) 372 | { 373 | return (IntegerT) ceil(double(log2(n)) / 5.0); 374 | } 375 | 376 | template 377 | IntegerT div_ceil(IntegerT n, IntegerT d) 378 | { 379 | return (IntegerT) ceil(double(n) / double(d)); 380 | } 381 | 382 | template 383 | bool is_power_of_2(T n) 384 | { 385 | return (n & (n - 1)) == 0; 386 | } 387 | 388 | template 389 | IntegerT next_power_of_2(IntegerT n) 390 | { 391 | n--; 392 | n |= n >> 1; 393 | n |= n >> 2; 394 | n |= n >> 4; 395 | n |= n >> 8; 396 | n |= n >> 16; 397 | n++; 398 | return n; 399 | } 400 | 401 | template 402 | void print_stl_container(Iterator begin, Iterator end) 403 | { 404 | size_t i = 0; 405 | for (; begin != end; begin++) 406 | { 407 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 408 | i++; 409 | } 410 | printf("\n"); 411 | } 412 | 413 | template 414 | void print_buffer(const ShaderStorageBuffer& buffer) 415 | { 416 | std::vector data = buffer.get_data(); 417 | print_stl_container(data.begin(), data.end()); 418 | } 419 | 420 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 421 | { 422 | std::vector data = buffer.get_data(); 423 | for (size_t i = 0; i < data.size(); i++) 424 | printf("(%zu) %08x, ", i, data[i]); 425 | printf("\n"); 426 | } 427 | } // namespace glu 428 | 429 | #endif // GLU_GL_UTILS_HPP 430 | 431 | 432 | 433 | namespace glu 434 | { 435 | namespace detail 436 | { 437 | inline const char* k_reduction_shader_src = R"( 438 | #extension GL_KHR_shader_subgroup_arithmetic : require 439 | 440 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 441 | 442 | layout(std430, binding = 0) buffer Buffer 443 | { 444 | DATA_TYPE data[]; 445 | }; 446 | 447 | layout(location = 0) uniform uint u_count; 448 | layout(location = 1) uniform uint u_depth; 449 | 450 | void main() 451 | { 452 | uint step = 1 << (5 * u_depth); 453 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 454 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 455 | if (i < u_count) 456 | { 457 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 458 | if (gl_SubgroupInvocationID == 0) 459 | { 460 | data[i] = r; 461 | } 462 | } 463 | } 464 | )"; 465 | } 466 | 467 | /// The operators that can be used for the reduction operation. 468 | enum ReduceOperator 469 | { 470 | ReduceOperator_Sum = 0, 471 | ReduceOperator_Mul, 472 | ReduceOperator_Min, 473 | ReduceOperator_Max 474 | }; 475 | 476 | /// A class that implements the reduction operation. 
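/// A minimal usage sketch (the values are illustrative). The reduction is performed in place and
/// the reduced value ends up in the first element of the buffer:
///
///     std::vector<GLuint> data = {1, 2, 3, 4, 5};
///     ShaderStorageBuffer buffer(data);
///
///     Reduce reduce(DataType_Uint, ReduceOperator_Sum);
///     reduce(buffer.handle(), data.size());
///
///     GLuint sum = buffer.get_data<GLuint>()[0];   // 15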
477 | class Reduce 478 | { 479 | private: 480 | const DataType m_data_type; 481 | const ReduceOperator m_operator; 482 | const size_t m_num_threads; 483 | const size_t m_num_items; 484 | 485 | Program m_program; 486 | 487 | public: 488 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 489 | m_data_type(data_type), 490 | m_operator(operator_), 491 | m_num_threads(1024), 492 | m_num_items(4) 493 | { 494 | std::string shader_src = "#version 460\n\n"; 495 | 496 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 497 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 498 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 499 | 500 | if (m_operator == ReduceOperator_Sum) 501 | { 502 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 503 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 504 | } 505 | else if (m_operator == ReduceOperator_Mul) 506 | { 507 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 508 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 509 | } 510 | else if (m_operator == ReduceOperator_Min) 511 | { 512 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 513 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 514 | } 515 | else if (m_operator == ReduceOperator_Max) 516 | { 517 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 518 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 519 | } 520 | else 521 | { 522 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 523 | } 524 | 525 | shader_src += detail::k_reduction_shader_src; 526 | 527 | Shader shader(GL_COMPUTE_SHADER); 528 | shader.source_from_str(shader_src.c_str()); 529 | shader.compile(); 530 | 531 | m_program.attach_shader(shader); 532 | m_program.link(); 533 | } 534 | 535 | ~Reduce() = default; 536 | 537 | void operator()(GLuint buffer, size_t count) 538 | { 539 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 540 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 541 | 542 | m_program.use(); 543 | 544 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 545 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 546 | 547 | for (int depth = 0;; depth++) 548 | { 549 | int step = 1 << (5 * depth); 550 | if (step >= count) 551 | break; 552 | 553 | size_t level_count = count >> (5 * depth); 554 | 555 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 556 | 557 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 558 | glDispatchCompute(num_workgroups, 1, 1); 559 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 560 | } 561 | } 562 | }; 563 | } // namespace glu 564 | 565 | #endif // GLU_REDUCE_HPP 566 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | import re 2 | from os import path 3 | 4 | script_dir = path.dirname(path.realpath(__file__)) 5 | 6 | 7 | def generate_standalone_header_code(in_filepath: str) -> str: 8 | with open(in_filepath, "rt") as in_file: 9 | code = in_file.read() 10 | 11 | while True: 12 | matches = [match for match in re.finditer(r'^#include\s+"(\S+?)"', code, re.MULTILINE)] 13 | if len(matches) == 0: 14 | break 15 | 16 | match = matches[0] 17 | included_filepath = path.join(path.dirname(in_filepath), match.group(1)) 18 | included_code = 
generate_standalone_header_code(included_filepath) 19 | 20 | code = code[:match.span()[0]] + included_code + "\n" + code[match.span()[1]:] 21 | return code 22 | 23 | 24 | def generate_standalone_header(in_filepath: str, out_filepath: str): 25 | print("Generating %s from %s" % (out_filepath, in_filepath)) 26 | with open(out_filepath, "wt") as out_file: 27 | out_str = "// This code was automatically generated; you're not supposed to edit it!\n\n" 28 | out_str += generate_standalone_header_code(in_filepath) 29 | out_file.write(out_str) 30 | 31 | 32 | if __name__ == "__main__": 33 | def p(filename: str): 34 | return path.join(script_dir, "glu/%s" % filename), path.join(script_dir, "dist/%s" % filename) 35 | 36 | generate_standalone_header(*p("BlellochScan.hpp")) 37 | generate_standalone_header(*p("RadixSort.hpp")) 38 | generate_standalone_header(*p("Reduce.hpp")) 39 | -------------------------------------------------------------------------------- /glu/BlellochScan.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_BLELLOCHSCAN_HPP 2 | #define GLU_BLELLOCHSCAN_HPP 3 | 4 | #include 5 | 6 | #include "Reduce.hpp" 7 | #include "data_types.hpp" 8 | 9 | namespace glu 10 | { 11 | namespace detail 12 | { 13 | inline const char* k_upsweep_shader_src = R"( 14 | #extension GL_KHR_shader_subgroup_shuffle_relative : require 15 | 16 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 17 | 18 | layout(std430, binding = 0) buffer Buffer 19 | { 20 | DATA_TYPE data[]; 21 | }; 22 | 23 | layout(location = 0) uniform uint u_count; 24 | layout(location = 1) uniform uint u_step; 25 | 26 | void main() 27 | { 28 | uint partition_i = gl_WorkGroupID.y; 29 | uint thread_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 30 | uint i = partition_i * u_count + thread_i * u_step + u_step - 1; 31 | uint end_i = (partition_i + 1) * u_count; 32 | if (i < end_i) 33 | { 34 | DATA_TYPE lval = subgroupShuffleUp(data[i], 1); 35 | DATA_TYPE r = OPERATION(data[i], lval); 36 | if (i == end_i - 1) // Clear last 37 | { 38 | data[i] = IDENTITY; 39 | } 40 | else if (gl_SubgroupInvocationID % 2 == 1) 41 | { 42 | data[i] = r; 43 | } 44 | } 45 | } 46 | )"; 47 | 48 | inline const char* k_downsweep_shader_src = R"( 49 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 50 | 51 | layout(std430, binding = 0) buffer Buffer 52 | { 53 | DATA_TYPE data[]; 54 | }; 55 | 56 | layout(location = 0) uniform uint u_count; 57 | layout(location = 1) uniform uint u_step; 58 | 59 | void main() 60 | { 61 | uint partition_i = gl_WorkGroupID.y; 62 | uint i = partition_i * u_count + gl_GlobalInvocationID.x * (u_step << 1) + (u_step - 1); 63 | uint next_i = i + u_step; 64 | uint end_i = (partition_i + 1) * u_count; 65 | if (next_i < end_i) 66 | { 67 | DATA_TYPE tmp = data[i]; 68 | data[i] = data[next_i]; 69 | data[next_i] = data[next_i] + tmp; 70 | } 71 | else if (i < end_i) 72 | { 73 | data[i] = IDENTITY; 74 | } 75 | } 76 | )"; 77 | } // namespace detail 78 | 79 | /// A class that implements Blelloch scan algorithm (exclusive prefix sum). 
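/// A minimal usage sketch (assuming a current GL 4.6 context; the data is illustrative):
///
///     std::vector<GLuint> data = {3, 1, 7, 0, 4, 1, 6, 3};       // count must be a power of 2
///     ShaderStorageBuffer buffer(data);
///
///     BlellochScan scan(DataType_Uint);
///     scan(buffer.handle(), data.size());                        // in-place exclusive prefix sum
///
///     std::vector<GLuint> result = buffer.get_data<GLuint>();    // {0, 3, 4, 11, 11, 15, 16, 22}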
80 | class BlellochScan 81 | { 82 | private: 83 | const DataType m_data_type; 84 | const size_t m_num_threads; 85 | const size_t m_num_items; 86 | 87 | Program m_upsweep_program; 88 | Program m_downsweep_program; 89 | 90 | public: 91 | explicit BlellochScan(DataType data_type) : 92 | m_data_type(data_type), 93 | m_num_threads(1024), 94 | m_num_items(4) 95 | { 96 | std::string shader_src = "#version 460\n\n"; 97 | 98 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 99 | shader_src += "#define OPERATION(a, b) (a + b)\n"; 100 | shader_src += "#define IDENTITY 0\n"; 101 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 102 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 103 | 104 | { // Upsweep program 105 | Shader upsweep_shader(GL_COMPUTE_SHADER); 106 | upsweep_shader.source_from_str((shader_src + detail::k_upsweep_shader_src).c_str()); 107 | upsweep_shader.compile(); 108 | 109 | m_upsweep_program.attach_shader(upsweep_shader); 110 | m_upsweep_program.link(); 111 | } 112 | 113 | { // Downsweep program 114 | Shader downsweep_program(GL_COMPUTE_SHADER); 115 | downsweep_program.source_from_str((shader_src + detail::k_downsweep_shader_src).c_str()); 116 | downsweep_program.compile(); 117 | 118 | m_downsweep_program.attach_shader(downsweep_program); 119 | m_downsweep_program.link(); 120 | } 121 | } 122 | 123 | ~BlellochScan() = default; 124 | 125 | /// Runs Blelloch exclusive scan on multiple partitions. 126 | /// 127 | /// @param buffer the input GLuint buffer 128 | /// @param count the number of GLuint in the buffer (must be a power of 2) 129 | /// @param num_partitions the number of partitions (must be adjacent) 130 | void operator()(GLuint buffer, size_t count, size_t num_partitions = 1) 131 | { 132 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 133 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 134 | GLU_CHECK_ARGUMENT(is_power_of_2(count), "Count must be a power of 2"); // TODO Remove this requirement 135 | GLU_CHECK_ARGUMENT(num_partitions >= 1, "Num of partitions must be >= 1"); 136 | 137 | upsweep(buffer, count, num_partitions); // Also clear last 138 | downsweep(buffer, count, num_partitions); 139 | } 140 | 141 | private: 142 | void upsweep(GLuint buffer, size_t count, size_t num_partitions) // Also clear last 143 | { 144 | m_upsweep_program.use(); 145 | 146 | glUniform1ui(m_upsweep_program.get_uniform_location("u_count"), count); 147 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 148 | 149 | int step = 1; 150 | int level_count = (int) count; 151 | while (true) 152 | { 153 | glUniform1ui(m_upsweep_program.get_uniform_location("u_step"), step); 154 | 155 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 156 | glDispatchCompute(num_workgroups, num_partitions, 1); 157 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 158 | 159 | step <<= 1; 160 | 161 | level_count >>= 1; 162 | 163 | if (level_count <= 1) 164 | break; 165 | } 166 | } 167 | 168 | void downsweep(GLuint buffer, size_t count, size_t num_partitions) 169 | { 170 | m_downsweep_program.use(); 171 | 172 | glUniform1ui(m_downsweep_program.get_uniform_location("u_count"), count); 173 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 174 | 175 | int step = next_power_of_2(int(count)) >> 1; 176 | size_t level_count = 1; 177 | while (true) 178 | { 179 | glUniform1ui(m_downsweep_program.get_uniform_location("u_step"), step); 180 | 181 | size_t num_workgroups = 
div_ceil(level_count, m_num_threads); 182 | glDispatchCompute(num_workgroups, num_partitions, 1); 183 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 184 | 185 | step >>= 1; 186 | level_count <<= 1; 187 | if (step == 0) 188 | break; 189 | } 190 | } 191 | }; 192 | } // namespace glu 193 | 194 | #endif // GLU_BLELLOCHSCAN_HPP 195 | -------------------------------------------------------------------------------- /glu/RadixSort.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_RADIXSORT_HPP 2 | #define GLU_RADIXSORT_HPP 3 | 4 | #include "BlellochScan.hpp" 5 | #include "gl_utils.hpp" 6 | 7 | namespace glu 8 | { 9 | namespace detail 10 | { 11 | inline const char* k_radix_sort_counting_shader = R"( 12 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 13 | 14 | layout(std430, binding = 0) readonly buffer KeyBuffer 15 | { 16 | uint b_key_buffer[]; 17 | }; 18 | 19 | layout(std430, binding = 1) buffer BlockCountBuffer 20 | { 21 | uint b_block_count_buffer[]; // 16 * NUM_THREADS 22 | }; 23 | 24 | layout(std430, binding = 2) buffer GlobalCountBuffer 25 | { 26 | uint b_global_count_buffer[]; 27 | }; 28 | 29 | layout(location = 0) uniform uint u_count; 30 | layout(location = 1) uniform uint u_radix_shift; 31 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 32 | 33 | void main() 34 | { 35 | for (uint radix = 0; radix < 16; radix++) 36 | { 37 | b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] = 0; 38 | } 39 | 40 | barrier(); 41 | 42 | uint i = gl_GlobalInvocationID.x; 43 | if (i < u_count) 44 | { 45 | // Block-wide count on shared memory 46 | uint radix = (b_key_buffer[i] >> u_radix_shift) & 0xf; 47 | atomicAdd(b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x], 1); 48 | } 49 | 50 | barrier(); 51 | 52 | if (gl_LocalInvocationIndex < 16) 53 | { 54 | uint block_count = b_block_count_buffer[gl_LocalInvocationIndex * u_num_blocks_power_of_2 + gl_WorkGroupID.x]; 55 | atomicAdd(b_global_count_buffer[gl_LocalInvocationIndex], block_count); 56 | } 57 | } 58 | )"; 59 | 60 | inline const char* k_radix_sort_reordering_shader = R"( 61 | #extension GL_KHR_shader_subgroup_arithmetic : require 62 | 63 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 64 | 65 | layout(std430, binding = 0) readonly buffer SrcKeyBuffer 66 | { 67 | uint b_src_key_buffer[]; 68 | }; 69 | 70 | layout(std430, binding = 1) readonly buffer SrcValBuffer 71 | { 72 | uint b_src_val_buffer[]; 73 | }; 74 | 75 | layout(std430, binding = 2) writeonly buffer DstKeyBuffer 76 | { 77 | uint b_dst_key_buffer[]; 78 | }; 79 | 80 | layout(std430, binding = 3) writeonly buffer DstValBuffer 81 | { 82 | uint b_dst_val_buffer[]; 83 | }; 84 | 85 | layout(std430, binding = 4) readonly buffer BlockOffsetBuffer 86 | { 87 | uint b_block_offset_buffer[]; 88 | }; 89 | 90 | layout(std430, binding = 5) readonly buffer GlobalCountBuffer 91 | { 92 | uint b_global_count_buffer[]; 93 | }; 94 | 95 | layout(location = 0) uniform uint u_count; 96 | layout(location = 1) uniform uint u_radix_shift; 97 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 98 | 99 | shared uint s_global_offset_buffer[16]; 100 | shared uint s_prefix_sum_buffer[NUM_THREADS]; 101 | 102 | void prefix_sum() // Block-wide prefix sum (Blelloch scan) 103 | { 104 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 105 | 106 | // Upsweep 107 | for (uint step = 1; step < NUM_THREADS; step <<= 1) 108 | { 109 | if 
(thread_i % 2 == 1) 110 | { 111 | uint i = thread_i * step + (step - 1); 112 | if (i < NUM_THREADS) 113 | { 114 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i] + s_prefix_sum_buffer[i - step]; 115 | } 116 | } 117 | 118 | barrier(); 119 | } 120 | 121 | // Clear last 122 | if (thread_i == NUM_THREADS - 1) s_prefix_sum_buffer[thread_i] = 0; 123 | 124 | barrier(); 125 | 126 | // Downsweep 127 | uint step = NUM_THREADS >> 1; 128 | for (; step > 0; step >>= 1) 129 | { 130 | uint i = thread_i * step + (step - 1); 131 | if (i + step < NUM_THREADS && thread_i % 2 == 0) 132 | { 133 | uint tmp = s_prefix_sum_buffer[i]; 134 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i + step]; 135 | s_prefix_sum_buffer[i + step] = tmp + s_prefix_sum_buffer[i + step]; 136 | } 137 | 138 | barrier(); 139 | } 140 | } 141 | 142 | void main() 143 | { 144 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 145 | uint i = gl_WorkGroupID.x * NUM_THREADS + thread_i; 146 | 147 | // Prefix sum on global counts to obtain global offsets 148 | if (gl_SubgroupID == 0 && gl_SubgroupInvocationID < 16) 149 | { 150 | uint v = subgroupExclusiveAdd(b_global_count_buffer[gl_SubgroupInvocationID]); 151 | s_global_offset_buffer[gl_SubgroupInvocationID] = v; 152 | } 153 | 154 | barrier(); 155 | 156 | // Reordering 157 | for (uint radix = 0; radix < 16; radix++) 158 | { 159 | bool should_place = false; 160 | if (i < u_count) 161 | { 162 | should_place = ((b_src_key_buffer[i] >> u_radix_shift) & 0xf) == radix; 163 | } 164 | 165 | s_prefix_sum_buffer[thread_i] = should_place ? 1 : 0; 166 | 167 | barrier(); 168 | 169 | // Prefix sum on local counts to obtain local offsets 170 | prefix_sum(); 171 | 172 | if (should_place) 173 | { 174 | uint di = 175 | s_global_offset_buffer[radix] + 176 | b_block_offset_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] + 177 | s_prefix_sum_buffer[thread_i]; 178 | b_dst_key_buffer[di] = b_src_key_buffer[i]; 179 | b_dst_val_buffer[di] = b_src_val_buffer[i]; 180 | } 181 | } 182 | } 183 | )"; 184 | } // namespace detail 185 | 186 | class RadixSort 187 | { 188 | private: 189 | Program m_count_program; 190 | BlellochScan m_blelloch_scan; 191 | Program m_reorder_program; 192 | 193 | /// A GLuint buffer of size 16 * NUM_THREADS that stores the counts of radixes per block. 194 | ShaderStorageBuffer m_block_count_buffer; 195 | 196 | /// A GLuint buffer of size 16 that stores the global counts of radixes. 
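/// Keys are consumed one 4-bit digit at a time (see u_radix_shift and the `& 0xf` masks), hence
/// the 16 counters; a full 32-bit key therefore takes 8 counting/reordering passes.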
197 | ShaderStorageBuffer m_global_count_buffer; 198 | 199 | ShaderStorageBuffer m_key_scratch_buffer; 200 | ShaderStorageBuffer m_val_scratch_buffer; 201 | 202 | const size_t m_num_threads; 203 | 204 | public: 205 | explicit RadixSort() : 206 | m_blelloch_scan(DataType_Uint), 207 | m_num_threads(1024) 208 | { 209 | GLU_CHECK_ARGUMENT(is_power_of_2(m_num_threads), "Num threads must be a power of 2"); 210 | 211 | m_global_count_buffer.resize(16 * sizeof(GLuint)); 212 | 213 | std::string shader_src = "#version 460\n\n"; 214 | shader_src += "#define NUM_THREADS " + std::to_string(m_num_threads) + "\n"; 215 | 216 | { // Counting program 217 | Shader shader(GL_COMPUTE_SHADER); 218 | shader.source_from_str(shader_src + detail::k_radix_sort_counting_shader); 219 | shader.compile(); 220 | 221 | m_count_program.attach_shader(shader.handle()); 222 | m_count_program.link(); 223 | } 224 | 225 | { // Reordering program 226 | Shader shader(GL_COMPUTE_SHADER); 227 | shader.source_from_str(shader_src + detail::k_radix_sort_reordering_shader); 228 | shader.compile(); 229 | 230 | m_reorder_program.attach_shader(shader.handle()); 231 | m_reorder_program.link(); 232 | } 233 | } 234 | 235 | ~RadixSort() = default; 236 | 237 | void prepare_internal_buffers(size_t count) 238 | { 239 | { // Prepare block count buffer 240 | size_t required_size = required_block_count_buffer_size(count); 241 | if (m_block_count_buffer.size() < required_size) 242 | { 243 | m_block_count_buffer.resize(required_size, false); 244 | #ifdef GLU_VERBOSE // TODO Create a log utility 245 | printf("[RadixSort] Block count buffer reallocated to: %zu\n", required_size); 246 | #endif 247 | } 248 | } 249 | 250 | { // Prepare key scratch buffer 251 | size_t required_size = required_key_scratch_buffer_size(count); 252 | if (m_key_scratch_buffer.size() < required_size) 253 | { 254 | m_key_scratch_buffer.resize(required_size, false); 255 | #ifdef GLU_VERBOSE 256 | printf("[RadixSort] Key scratch buffer reallocated to: %zu\n", required_size); 257 | #endif 258 | } 259 | } 260 | 261 | { // Prepare val scratch buffer 262 | size_t required_size = required_val_scratch_buffer_size(count); 263 | if (m_val_scratch_buffer.size() < required_size) 264 | { 265 | m_val_scratch_buffer.resize(required_size, false); 266 | #ifdef GLU_VERBOSE 267 | printf("[RadixSort] Val scratch buffer reallocated to: %zu\n", required_size); 268 | #endif 269 | } 270 | } 271 | } 272 | 273 | void operator()(GLuint key_buffer, GLuint val_buffer, size_t count, size_t num_steps = 0) 274 | { 275 | GLU_CHECK_ARGUMENT(key_buffer, "Invalid key buffer"); 276 | GLU_CHECK_ARGUMENT(val_buffer, "Invalid value buffer"); 277 | 278 | if (count <= 1) 279 | return; // Hey, that's already sorted x) 280 | 281 | prepare_internal_buffers(count); 282 | 283 | size_t num_blocks = div_ceil(count, size_t(1024)); 284 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 285 | 286 | GLuint key_buffers[]{key_buffer, m_key_scratch_buffer.handle()}; 287 | GLuint val_buffers[]{val_buffer, m_val_scratch_buffer.handle()}; 288 | 289 | for (int step = 0; step < 8;) 290 | { 291 | // ---------------------------------------------------------------- Counting 292 | 293 | m_block_count_buffer.clear(0); 294 | m_global_count_buffer.clear(0); 295 | 296 | m_count_program.use(); 297 | 298 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 299 | m_block_count_buffer.bind(1); 300 | m_global_count_buffer.bind(2); 301 | 302 | 
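// Counting pass: u_radix_shift selects the 4-bit digit for this pass (step << 2), and
// u_num_blocks_power_of_2 is the per-digit row stride of the block-count matrix, padded to a
// power of two because BlellochScan requires a power-of-two partition size.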
glUniform1ui(m_count_program.get_uniform_location("u_count"), count); 303 | glUniform1ui(m_count_program.get_uniform_location("u_radix_shift"), step << 2); 304 | glUniform1ui(m_count_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 305 | 306 | glDispatchCompute(num_blocks, 1, 1); 307 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 308 | 309 | // ---------------------------------------------------------------- Prefix sum 310 | 311 | m_blelloch_scan(m_block_count_buffer.handle(), num_blocks_power_of_2, 16); 312 | 313 | // ---------------------------------------------------------------- Reordering 314 | 315 | m_reorder_program.use(); 316 | 317 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 318 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, val_buffers[step % 2]); 319 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, key_buffers[(step + 1) % 2]); 320 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, val_buffers[(step + 1) % 2]); 321 | m_block_count_buffer.bind(4); 322 | m_global_count_buffer.bind(5); 323 | 324 | glUniform1ui(m_reorder_program.get_uniform_location("u_count"), count); 325 | glUniform1ui(m_reorder_program.get_uniform_location("u_radix_shift"), step << 2); 326 | glUniform1ui(m_reorder_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 327 | 328 | glDispatchCompute(num_blocks, 1, 1); 329 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 330 | 331 | ++step; 332 | if (step == num_steps || step == 8) break; 333 | } 334 | } 335 | 336 | private: 337 | [[nodiscard]] static size_t required_block_count_buffer_size(size_t count) 338 | { 339 | size_t num_blocks = div_ceil(count, size_t(1024)); 340 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 341 | 342 | return next_power_of_2(16 * num_blocks_power_of_2) * sizeof(GLuint); 343 | } 344 | 345 | [[nodiscard]] static size_t required_key_scratch_buffer_size(size_t count) 346 | { 347 | return next_power_of_2(count) * sizeof(GLuint); 348 | } 349 | 350 | [[nodiscard]] static size_t required_val_scratch_buffer_size(size_t count) 351 | { 352 | return next_power_of_2(count) * sizeof(GLuint); 353 | } 354 | }; 355 | } // namespace glu 356 | 357 | #endif // GLU_RADIXSORT_HPP 358 | -------------------------------------------------------------------------------- /glu/Reduce.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_REDUCE_HPP 2 | #define GLU_REDUCE_HPP 3 | 4 | #include "data_types.hpp" 5 | #include "gl_utils.hpp" 6 | 7 | namespace glu 8 | { 9 | namespace detail 10 | { 11 | inline const char* k_reduction_shader_src = R"( 12 | #extension GL_KHR_shader_subgroup_arithmetic : require 13 | 14 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 15 | 16 | layout(std430, binding = 0) buffer Buffer 17 | { 18 | DATA_TYPE data[]; 19 | }; 20 | 21 | layout(location = 0) uniform uint u_count; 22 | layout(location = 1) uniform uint u_depth; 23 | 24 | void main() 25 | { 26 | uint step = 1 << (5 * u_depth); 27 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 28 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 29 | if (i < u_count) 30 | { 31 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 32 | if (gl_SubgroupInvocationID == 0) 33 | { 34 | data[i] = r; 35 | } 36 | } 37 | } 38 | )"; 39 | } 40 | 41 | /// The operators that can be used for the reduction operation. 
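/// Each operator maps to the corresponding KHR subgroup reduction in the generated shader
/// (subgroupAdd, subgroupMul, subgroupMin and subgroupMax respectively).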
42 | enum ReduceOperator 43 | { 44 | ReduceOperator_Sum = 0, 45 | ReduceOperator_Mul, 46 | ReduceOperator_Min, 47 | ReduceOperator_Max 48 | }; 49 | 50 | /// A class that implements the reduction operation. 51 | class Reduce 52 | { 53 | private: 54 | const DataType m_data_type; 55 | const ReduceOperator m_operator; 56 | const size_t m_num_threads; 57 | const size_t m_num_items; 58 | 59 | Program m_program; 60 | 61 | public: 62 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 63 | m_data_type(data_type), 64 | m_operator(operator_), 65 | m_num_threads(1024), 66 | m_num_items(4) 67 | { 68 | std::string shader_src = "#version 460\n\n"; 69 | 70 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 71 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 72 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 73 | 74 | if (m_operator == ReduceOperator_Sum) 75 | { 76 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 77 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 78 | } 79 | else if (m_operator == ReduceOperator_Mul) 80 | { 81 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 82 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 83 | } 84 | else if (m_operator == ReduceOperator_Min) 85 | { 86 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 87 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 88 | } 89 | else if (m_operator == ReduceOperator_Max) 90 | { 91 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 92 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 93 | } 94 | else 95 | { 96 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 97 | } 98 | 99 | shader_src += detail::k_reduction_shader_src; 100 | 101 | Shader shader(GL_COMPUTE_SHADER); 102 | shader.source_from_str(shader_src.c_str()); 103 | shader.compile(); 104 | 105 | m_program.attach_shader(shader); 106 | m_program.link(); 107 | } 108 | 109 | ~Reduce() = default; 110 | 111 | void operator()(GLuint buffer, size_t count) 112 | { 113 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 114 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 115 | 116 | m_program.use(); 117 | 118 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 119 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 120 | 121 | for (int depth = 0;; depth++) 122 | { 123 | int step = 1 << (5 * depth); 124 | if (step >= count) 125 | break; 126 | 127 | size_t level_count = count >> (5 * depth); 128 | 129 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 130 | 131 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 132 | glDispatchCompute(num_workgroups, 1, 1); 133 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 134 | } 135 | } 136 | }; 137 | } // namespace glu 138 | 139 | #endif // GLU_REDUCE_HPP 140 | -------------------------------------------------------------------------------- /glu/data_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_DATA_TYPES_HPP 2 | #define GLU_DATA_TYPES_HPP 3 | 4 | #include "errors.hpp" 5 | 6 | namespace glu 7 | { 8 | enum DataType 9 | { 10 | DataType_Float = 0, 11 | DataType_Double, 12 | DataType_Int, 13 | DataType_Uint, 14 | DataType_Vec2, 15 | DataType_Vec4, 16 | DataType_DVec2, 17 | DataType_DVec4, 18 | DataType_UVec2, 19 | DataType_UVec4, 20 | DataType_IVec2, 21 | DataType_IVec4 22 | 
}; 23 | 24 | inline const char* to_glsl_type_str(DataType data_type) 25 | { 26 | // clang-format off 27 | if (data_type == DataType_Float) return "float"; 28 | else if (data_type == DataType_Double) return "double"; 29 | else if (data_type == DataType_Int) return "int"; 30 | else if (data_type == DataType_Uint) return "uint"; 31 | else if (data_type == DataType_Vec2) return "vec2"; 32 | else if (data_type == DataType_Vec4) return "vec4"; 33 | else if (data_type == DataType_DVec2) return "dvec2"; 34 | else if (data_type == DataType_DVec4) return "dvec4"; 35 | else if (data_type == DataType_UVec2) return "uvec2"; 36 | else if (data_type == DataType_UVec4) return "uvec4"; 37 | else if (data_type == DataType_IVec2) return "ivec2"; 38 | else if (data_type == DataType_IVec4) return "ivec4"; 39 | else 40 | { 41 | GLU_FAIL("Invalid data type: %d", data_type); 42 | } 43 | // clang-format on 44 | } 45 | 46 | } // namespace glu 47 | 48 | #endif // GLU_DATA_TYPES_HPP 49 | -------------------------------------------------------------------------------- /glu/errors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_ERRORS_HPP 2 | #define GLU_ERRORS_HPP 3 | 4 | #include 5 | #include 6 | 7 | // TODO mark if (!condition_) as unlikely 8 | #define GLU_CHECK_STATE(condition_, ...) \ 9 | { \ 10 | if (!(condition_)) \ 11 | { \ 12 | fprintf(stderr, __VA_ARGS__); \ 13 | exit(1); \ 14 | } \ 15 | } 16 | 17 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 18 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /glu/gl_utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_GL_UTILS_HPP 2 | #define GLU_GL_UTILS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "errors.hpp" 10 | 11 | namespace glu 12 | { 13 | inline void 14 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 15 | { 16 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 17 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 18 | 19 | glCopyBufferSubData( 20 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 21 | ); 22 | } 23 | 24 | /// A RAII wrapper for GL shader. 
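/// The underlying shader object is deleted when the wrapper is destroyed; compile() prints the info log and exits on failure.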
25 | class Shader 26 | { 27 | private: 28 | GLuint m_handle; 29 | 30 | public: 31 | explicit Shader(GLenum type) : 32 | m_handle(glCreateShader(type)){}; 33 | Shader(const Shader&) = delete; 34 | 35 | Shader(Shader&& other) noexcept 36 | { 37 | m_handle = other.m_handle; 38 | other.m_handle = 0; 39 | } 40 | 41 | ~Shader() { glDeleteShader(m_handle); } 42 | 43 | [[nodiscard]] GLuint handle() const { return m_handle; } 44 | 45 | void source_from_str(const std::string& src_str) 46 | { 47 | const char* src_ptr = src_str.c_str(); 48 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 49 | } 50 | 51 | void source_from_file(const char* src_filepath) 52 | { 53 | FILE* file = fopen(src_filepath, "rt"); 54 | GLU_CHECK_STATE(file, "Failed to open shader file: %s", src_filepath); 55 | 56 | fseek(file, 0, SEEK_END); 57 | size_t file_size = ftell(file); 58 | fseek(file, 0, SEEK_SET); 59 | 60 | std::string src{}; 61 | src.resize(file_size); 62 | fread(src.data(), sizeof(char), file_size, file); 63 | source_from_str(src.c_str()); 64 | 65 | fclose(file); 66 | } 67 | 68 | std::string get_info_log() 69 | { 70 | GLint log_length = 0; 71 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 72 | 73 | std::vector<GLchar> log(log_length); 74 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 75 | return {log.begin(), log.end()}; 76 | } 77 | 78 | void compile() 79 | { 80 | glCompileShader(m_handle); 81 | 82 | GLint status; 83 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 84 | if (!status) 85 | { 86 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 87 | } 88 | } 89 | }; 90 | 91 | /// A RAII wrapper for GL program. 92 | class Program 93 | { 94 | private: 95 | GLuint m_handle; 96 | 97 | public: 98 | explicit Program() { m_handle = glCreateProgram(); }; 99 | Program(const Program&) = delete; 100 | 101 | Program(Program&& other) noexcept 102 | { 103 | m_handle = other.m_handle; 104 | other.m_handle = 0; 105 | } 106 | 107 | ~Program() { glDeleteProgram(m_handle); } 108 | 109 | [[nodiscard]] GLuint handle() const { return m_handle; } 110 | 111 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 112 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 113 | 114 | [[nodiscard]] std::string get_info_log() const 115 | { 116 | GLint log_length = 0; 117 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 118 | 119 | std::vector<GLchar> log(log_length); 120 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 121 | return {log.begin(), log.end()}; 122 | } 123 | 124 | void link() 125 | { 126 | GLint status; 127 | glLinkProgram(m_handle); 128 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 129 | if (!status) 130 | { 131 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 132 | } 133 | } 134 | 135 | void use() { glUseProgram(m_handle); } 136 | 137 | GLint get_uniform_location(const char* uniform_name) 138 | { 139 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 140 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 141 | return loc; 142 | } 143 | }; 144 | 145 | /// A RAII helper class for GL shader storage buffer. 
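/// The buffer uses immutable storage (glBufferStorage), so resize() allocates a new buffer object and optionally copies the old contents over.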
146 | class ShaderStorageBuffer 147 | { 148 | private: 149 | GLuint m_handle = 0; 150 | size_t m_size = 0; 151 | 152 | public: 153 | explicit ShaderStorageBuffer(size_t initial_size = 0) 154 | { 155 | if (initial_size > 0) 156 | resize(initial_size, false); 157 | } 158 | 159 | explicit ShaderStorageBuffer(const void* data, size_t size) : 160 | m_size(size) 161 | { 162 | GLU_CHECK_ARGUMENT(data, ""); 163 | GLU_CHECK_ARGUMENT(size > 0, ""); 164 | 165 | glCreateBuffers(1, &m_handle); 166 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 167 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 168 | } 169 | 170 | template 171 | explicit ShaderStorageBuffer(const std::vector& data) : 172 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 173 | { 174 | } 175 | 176 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 177 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 178 | { 179 | m_handle = other.m_handle; 180 | m_size = other.m_size; 181 | other.m_handle = 0; 182 | } 183 | 184 | ~ShaderStorageBuffer() 185 | { 186 | if (m_handle) 187 | glDeleteBuffers(1, &m_handle); 188 | } 189 | 190 | [[nodiscard]] GLuint handle() const { return m_handle; } 191 | [[nodiscard]] size_t size() const { return m_size; } 192 | 193 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 194 | void resize(size_t size, bool keep_data = false) 195 | { 196 | size_t old_size = m_size; 197 | GLuint old_handle = m_handle; 198 | 199 | if (old_size != size) 200 | { 201 | m_size = size; 202 | 203 | glCreateBuffers(1, &m_handle); 204 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 205 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 206 | 207 | if (keep_data) 208 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 209 | 210 | glDeleteBuffers(1, &old_handle); 211 | } 212 | } 213 | 214 | /// Clears the entire buffer with the given GLuint value (repeated). 215 | void clear(GLuint value) 216 | { 217 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 218 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 219 | } 220 | 221 | void write_data(const void* data, size_t size) 222 | { 223 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 224 | 225 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 226 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 227 | } 228 | 229 | template 230 | std::vector get_data() const 231 | { 232 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 233 | 234 | std::vector result(m_size / sizeof(T)); 235 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 236 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 237 | return result; 238 | } 239 | 240 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 241 | { 242 | if (size == 0) 243 | size = m_size; 244 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 245 | } 246 | }; 247 | 248 | /// Measures elapsed time on GPU for executing the given callback. 
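/// Uses a GL_TIME_ELAPSED query; the result is in nanoseconds and the call blocks until the timed GPU work has completed.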
249 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 250 | { 251 | GLuint query; 252 | uint64_t elapsed_time{}; 253 | 254 | glGenQueries(1, &query); 255 | glBeginQuery(GL_TIME_ELAPSED, query); 256 | 257 | callback(); 258 | 259 | glEndQuery(GL_TIME_ELAPSED); 260 | 261 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 262 | glDeleteQueries(1, &query); 263 | 264 | return elapsed_time; 265 | } 266 | 267 | template 268 | IntegerT log32_floor(IntegerT n) 269 | { 270 | return (IntegerT) floor(double(log2(n)) / 5.0); 271 | } 272 | 273 | template 274 | IntegerT log32_ceil(IntegerT n) 275 | { 276 | return (IntegerT) ceil(double(log2(n)) / 5.0); 277 | } 278 | 279 | template 280 | IntegerT div_ceil(IntegerT n, IntegerT d) 281 | { 282 | return (IntegerT) ceil(double(n) / double(d)); 283 | } 284 | 285 | template 286 | bool is_power_of_2(T n) 287 | { 288 | return (n & (n - 1)) == 0; 289 | } 290 | 291 | template 292 | IntegerT next_power_of_2(IntegerT n) 293 | { 294 | n--; 295 | n |= n >> 1; 296 | n |= n >> 2; 297 | n |= n >> 4; 298 | n |= n >> 8; 299 | n |= n >> 16; 300 | n++; 301 | return n; 302 | } 303 | 304 | template 305 | void print_stl_container(Iterator begin, Iterator end) 306 | { 307 | size_t i = 0; 308 | for (; begin != end; begin++) 309 | { 310 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 311 | i++; 312 | } 313 | printf("\n"); 314 | } 315 | 316 | template 317 | void print_buffer(const ShaderStorageBuffer& buffer) 318 | { 319 | std::vector data = buffer.get_data(); 320 | print_stl_container(data.begin(), data.end()); 321 | } 322 | 323 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 324 | { 325 | std::vector data = buffer.get_data(); 326 | for (size_t i = 0; i < data.size(); i++) 327 | printf("(%zu) %08x, ", i, data[i]); 328 | printf("\n"); 329 | } 330 | } // namespace glu 331 | 332 | #endif // GLU_GL_UTILS_HPP 333 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(glu_test 2 | main.cpp 3 | reduce_tests.cpp 4 | blelloch_scan_tests.cpp 5 | radix_sort_tests.cpp 6 | 7 | # These source files test the correct generation of the dist/* files 8 | generated/test_include_BlellochScan.cpp 9 | generated/test_include_RadixSort.cpp 10 | generated/test_include_Reduce.cpp 11 | ) 12 | 13 | target_link_libraries(glu_test PRIVATE glu) 14 | 15 | target_link_libraries(glu_test PRIVATE Catch2::Catch2) 16 | target_link_libraries(glu_test PRIVATE glad) 17 | target_link_libraries(glu_test PRIVATE glm) 18 | target_link_libraries(glu_test PRIVATE glfw) 19 | target_link_libraries(glu_test PRIVATE renderdoc) 20 | -------------------------------------------------------------------------------- /test/blelloch_scan_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "glu/BlellochScan.hpp" 7 | #include "util/Random.hpp" 8 | #include "util/StopWatch.hpp" 9 | 10 | using namespace glu; 11 | 12 | TEST_CASE("BlellochScan-simple", "[.]") 13 | { 14 | const std::vector data{1, 2, 3, 4, 5, 6, 7, 8}; 15 | 16 | ShaderStorageBuffer buffer(data); 17 | 18 | printf("Input:\n"); 19 | print_buffer(buffer); 20 | 21 | BlellochScan blelloch_scan(DataType_Uint); 22 | 23 | printf("Output:\n"); 24 | blelloch_scan(buffer.handle(), data.size()); 25 | print_buffer(buffer); 26 | } 27 | 28 | 
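// For reference, the exclusive scan of {1, 2, 3, 4, 5, 6, 7, 8} is {0, 1, 3, 6, 10, 15, 21, 28}; the checked tests below compare against std::exclusive_scan.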
TEST_CASE("BlellochScan-multiple-sizes") 29 | { 30 | const uint64_t k_seed = 123; 31 | const size_t k_num_elements = 32 | GENERATE(1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576); 33 | 34 | Random random(k_seed); 35 | 36 | std::vector data = random.sample_int_vector(k_num_elements, 0, 100); 37 | 38 | ShaderStorageBuffer buffer(data); 39 | 40 | BlellochScan blelloch_scan(DataType_Uint); 41 | blelloch_scan(buffer.handle(), data.size()); 42 | 43 | std::vector expected(k_num_elements); 44 | std::exclusive_scan(data.begin(), data.end(), expected.begin(), 0); 45 | REQUIRE(buffer.get_data() == expected); 46 | } 47 | 48 | TEST_CASE("BlellochScan-multiple-partitions") 49 | { 50 | const uint64_t k_seed = 123; 51 | const size_t k_num_elements = 1024; 52 | const size_t k_num_partitions = GENERATE(1, 32, 100, 1000); 53 | 54 | Random random(k_seed); 55 | 56 | // Generate a random buffer containing data for all partitions 57 | std::vector data = random.sample_int_vector(k_num_elements * k_num_partitions, 0, 100); 58 | 59 | ShaderStorageBuffer buffer(data); 60 | 61 | // Run blelloch scan on all partitions 62 | BlellochScan blelloch_scan(DataType_Uint); 63 | blelloch_scan(buffer.handle(), k_num_elements, k_num_partitions); 64 | 65 | // Get the result host-side 66 | std::vector result = buffer.get_data(); 67 | 68 | auto data_begin = data.begin(); 69 | auto result_begin = result.begin(); 70 | 71 | // Check that exclusive scan was run for every partition 72 | for (int partition = 0; partition < k_num_partitions; partition++) 73 | { 74 | std::vector expected_result(k_num_elements); 75 | std::exclusive_scan(data_begin, data_begin + k_num_elements, expected_result.begin(), 0); 76 | 77 | REQUIRE(std::memcmp(expected_result.data(), &(*result_begin), k_num_elements * sizeof(GLuint)) == 0); 78 | 79 | data_begin += k_num_elements; 80 | result_begin += k_num_elements; 81 | } 82 | } 83 | 84 | TEST_CASE("BlellochScan-benchmark", "[.][benchmark]") 85 | { 86 | const size_t k_num_elements = GENERATE( 87 | 1024, // 1KB 88 | 16384, // 16KB 89 | 65536, // 65KB 90 | 131072, // 131KB 91 | 524288, // 524KB 92 | 1048576, // 1MB 93 | 16777216, // 16MB 94 | 67108864, // 67MB 95 | 134217728, // 134MB 96 | 268435456 // 268MB 97 | ); 98 | 99 | std::vector data(k_num_elements); // Don't need to initialize the vector for benchmarking 100 | 101 | ShaderStorageBuffer buffer(data); 102 | 103 | BlellochScan blelloch_scan(DataType_Uint); 104 | 105 | uint64_t ns = measure_gl_elapsed_time([&]() { blelloch_scan(buffer.handle(), k_num_elements); }); 106 | 107 | printf("BlellochScan; Num elements: %zu, Elapsed: %s\n", k_num_elements, ns_to_human_string(ns).c_str()); 108 | } 109 | -------------------------------------------------------------------------------- /test/generated/test_include_BlellochScan.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "dist/BlellochScan.hpp" 3 | -------------------------------------------------------------------------------- /test/generated/test_include_RadixSort.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "dist/RadixSort.hpp" 3 | -------------------------------------------------------------------------------- /test/generated/test_include_Reduce.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "dist/Reduce.hpp" 3 | -------------------------------------------------------------------------------- 
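A minimal usage sketch, placed here for orientation before the test driver: it strings together the primitives shown above under the same assumptions the tests make (a current OpenGL 4.6 context with glad loaded, as set up in test/main.cpp). The function name `example` and the explicit `<GLuint>` template arguments are illustrative, not part of the library.

```cpp
#include <vector>

#include <glad/glad.h> // GL 4.6 symbols must be available before the GLU headers

#include "glu/RadixSort.hpp"
#include "glu/Reduce.hpp"

// Assumes a current OpenGL 4.6 context (e.g. the hidden GLFW window used by the tests).
void example()
{
    std::vector<GLuint> keys{5, 3, 9, 1, 7};
    std::vector<GLuint> vals{0, 1, 2, 3, 4}; // payload, reordered together with the keys

    glu::ShaderStorageBuffer key_buffer(keys);
    glu::ShaderStorageBuffer val_buffer(vals);

    // Sort key/value pairs by key; the sorted keys end up back in key_buffer.
    glu::RadixSort radix_sort;
    radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size());
    std::vector<GLuint> sorted_keys = key_buffer.get_data<GLuint>();

    // Reduce (sum) in place; the total is left in element 0.
    glu::Reduce reduce(glu::DataType_Uint, glu::ReduceOperator_Sum);
    reduce(key_buffer.handle(), keys.size());
    GLuint sum = key_buffer.get_data<GLuint>()[0];
}
```

Both utilities work on the caller's buffers; RadixSort allocates its internal scratch buffers on first use, or ahead of time via prepare_internal_buffers() as the benchmark does.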
/test/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // clang-format off 5 | #include 6 | #include 7 | // clang-format on 8 | 9 | #include 10 | 11 | void print_gl_debug() 12 | { 13 | GLint v1, v2, v3; 14 | 15 | printf("---------------------------------------------------------------- Device info\n"); 16 | 17 | printf("Device: %s\n", glGetString(GL_RENDERER)); 18 | printf("Vendor: %s\n", glGetString(GL_VENDOR)); 19 | printf("Version: %s\n", glGetString(GL_VERSION)); 20 | printf("GLSL version: %s\n", glGetString(GL_SHADING_LANGUAGE_VERSION)); 21 | 22 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, &v1); 23 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 1, &v2); 24 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 2, &v3); 25 | printf("GL_MAX_COMPUTE_WORK_GROUP_COUNT: (%d, %d, %d)\n", v1, v2, v3); 26 | 27 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &v1); 28 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &v2); 29 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 2, &v3); 30 | printf("GL_MAX_COMPUTE_WORK_GROUP_SIZE: (%d, %d, %d)\n", v1, v2, v3); 31 | 32 | glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &v1); 33 | printf("GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS: %d\n", v1); 34 | glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &v1); 35 | printf("GL_MAX_COMPUTE_SHARED_MEMORY_SIZE: %d\n", v1); 36 | glGetIntegerv(GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS, &v1); 37 | printf("GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS: %d\n", v1); 38 | glGetIntegerv(GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS, &v1); 39 | printf("GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS: %d\n", v1); 40 | 41 | glGetIntegerv(GL_WARP_SIZE_NV, &v1); 42 | printf("GL_WARP_SIZE_NV: %d\n", v1); 43 | 44 | glGetIntegerv(GL_SUBGROUP_SIZE_KHR, &v1); 45 | printf("GL_SUBGROUP_SIZE_KHR: %d\n", v1); 46 | 47 | printf("----------------------------------------------------------------\n"); 48 | } 49 | 50 | void GLAPIENTRY debug_message_callback( 51 | GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar* message, const void* userParam 52 | ) 53 | { 54 | if (GL_DEBUG_TYPE_ERROR == type && severity <= GL_DEBUG_SEVERITY_HIGH) 55 | { 56 | fprintf(stderr, "GL CALLBACK: type = 0x%x, severity = 0x%x, message = %s\n", type, severity, message); 57 | } 58 | } 59 | 60 | int main(int argc, char* argv[]) 61 | { 62 | setvbuf(stdout, nullptr, _IONBF, 0); 63 | 64 | if (glfwInit() == GLFW_FALSE) 65 | { 66 | fprintf(stderr, "Failed to initialize GLFW"); 67 | exit(1); 68 | } 69 | 70 | glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); 71 | glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); 72 | glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6); 73 | 74 | glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); 75 | 76 | GLFWwindow* window = glfwCreateWindow(500, 500, "GLU", nullptr, nullptr); 77 | if (window == nullptr) 78 | { 79 | fprintf(stderr, "Failed to create GLFW window"); 80 | exit(1); 81 | } 82 | 83 | glfwMakeContextCurrent(window); 84 | 85 | if (!gladLoadGLLoader((GLADloadproc) glfwGetProcAddress)) 86 | { 87 | fprintf(stderr, "Failed to load GL"); 88 | exit(1); 89 | } 90 | 91 | glEnable(GL_DEBUG_OUTPUT); 92 | glDebugMessageCallback(debug_message_callback, nullptr); 93 | 94 | print_gl_debug(); 95 | 96 | std::vector catch2_args; 97 | for (int i = 0; i < argc; i++) 98 | catch2_args.emplace_back(argv[i]); 99 | 100 | // Show the test and sections names in stdout 101 | catch2_args.emplace_back("--reporter console::out=%stdout"); 102 | 103 | int result = 
Catch::Session().run((int) catch2_args.size(), catch2_args.data()); 104 | 105 | glfwDestroyWindow(window); 106 | glfwTerminate(); 107 | 108 | return result; 109 | } 110 | -------------------------------------------------------------------------------- /test/radix_sort_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "glu/RadixSort.hpp" 13 | #include "util/Random.hpp" 14 | #include "util/StopWatch.hpp" 15 | 16 | using namespace glu; 17 | 18 | namespace 19 | { 20 | /// Builds an histogram counting the values in the given vector. 21 | template 22 | std::unordered_map build_value_histogram(const std::vector& vector) 23 | { 24 | std::unordered_map histogram; 25 | for (const T& entry : vector) 26 | { 27 | auto [iterator, inserted] = histogram.emplace(entry, 1); 28 | if (!inserted) 29 | iterator->second++; 30 | } 31 | return histogram; 32 | } 33 | 34 | /// Checks that vector1 is a permutation of vector2. 35 | template 36 | void check_permutation(const std::vector& vector1, const std::vector& vector2) 37 | { 38 | CHECK(vector1.size() == vector2.size()); 39 | 40 | std::unordered_map histogram1 = build_value_histogram(vector1); 41 | std::unordered_map histogram2 = build_value_histogram(vector2); 42 | CHECK(histogram1 == histogram2); 43 | } 44 | 45 | /// Checks whether the given vector is sorted. 46 | template 47 | void check_sorted(const std::vector& vector) 48 | { 49 | CAPTURE(vector); 50 | CHECK(std::is_sorted(vector.begin(), vector.end())); 51 | } 52 | } // namespace 53 | 54 | TEST_CASE("RadixSort-simple", "[.]") 55 | { 56 | const size_t k_num_elements = 10; 57 | 58 | const uint64_t k_seed = 1; 59 | Random random(k_seed); 60 | 61 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 62 | 63 | std::vector keys = random.sample_int_vector(k_num_elements, 0, UINT32_MAX); 64 | std::vector vals(k_num_elements); 65 | 66 | REQUIRE(keys.size() == vals.size()); 67 | 68 | ShaderStorageBuffer key_buffer(keys); 69 | ShaderStorageBuffer val_buffer(vals); 70 | 71 | printf("Input; Key buffer:\n"); 72 | print_buffer(key_buffer); 73 | print_buffer_hex(key_buffer); 74 | 75 | RadixSort radix_sort; 76 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 77 | 78 | printf("Output; Key buffer:\n"); 79 | print_buffer(key_buffer); 80 | print_buffer_hex(key_buffer); 81 | 82 | std::vector sorted_keys = key_buffer.get_data(); 83 | 84 | check_permutation(keys, sorted_keys); 85 | check_sorted(sorted_keys); 86 | } 87 | 88 | TEST_CASE("RadixSort-128-256-512-1024") 89 | { 90 | const size_t k_num_elements = GENERATE(128, 256, 512, 1024); 91 | 92 | const uint64_t k_seed = 1; 93 | Random random(k_seed); 94 | 95 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 96 | 97 | std::vector keys = random.sample_int_vector(k_num_elements, 0, UINT32_MAX); 98 | std::vector vals(k_num_elements); 99 | 100 | ShaderStorageBuffer key_buffer(keys); 101 | ShaderStorageBuffer val_buffer(vals); 102 | 103 | RadixSort radix_sort; 104 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 105 | 106 | std::vector sorted_keys = key_buffer.get_data(); 107 | 108 | check_permutation(keys, sorted_keys); 109 | check_sorted(sorted_keys); 110 | } 111 | 112 | TEST_CASE("RadixSort-2048") 113 | { 114 | const size_t k_num_elements = 2048; 115 | 116 | const uint64_t k_seed = 1; 117 | Random random(k_seed); 118 | 
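// Keys are sampled from [0, 10), so this case stresses sorting with many duplicate keys.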
119 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 120 | 121 | std::vector keys = random.sample_int_vector(k_num_elements, 0, 10); 122 | std::vector vals(k_num_elements); 123 | 124 | ShaderStorageBuffer key_buffer(keys); 125 | ShaderStorageBuffer val_buffer(vals); 126 | 127 | RadixSort radix_sort; 128 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 129 | 130 | std::vector sorted_keys = key_buffer.get_data(); 131 | 132 | check_permutation(keys, sorted_keys); 133 | check_sorted(sorted_keys); 134 | } 135 | 136 | TEST_CASE("RadixSort-multiple-sizes") 137 | { 138 | const size_t k_num_elements = GENERATE(10993, 14978, 16243, 18985, 23857, 27865, 33363, 41298, 45821, 47487); 139 | 140 | const uint64_t k_seed = 1; 141 | Random random(k_seed); 142 | 143 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 144 | 145 | std::vector keys = random.sample_int_vector(k_num_elements, 0, UINT32_MAX); 146 | std::vector vals(k_num_elements); 147 | 148 | ShaderStorageBuffer key_buffer(keys); 149 | ShaderStorageBuffer val_buffer(vals); 150 | 151 | RadixSort radix_sort; 152 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 153 | 154 | std::vector sorted_keys = key_buffer.get_data(); 155 | 156 | check_permutation(keys, sorted_keys); 157 | check_sorted(sorted_keys); 158 | } 159 | 160 | TEST_CASE("RadixSort-benchmark", "[.][benchmark]") 161 | { 162 | const size_t k_num_elements = GENERATE( 163 | 1024, // 1KB 164 | 16384, // 16KB 165 | 65536, // 65KB 166 | 131072, // 131KB 167 | 524288, // 524KB 168 | 1048576, // 1MB 169 | 2097152, // 2MB 170 | 4194304, // 4MB 171 | 8388608, // 8MB 172 | 16777216, // 16MB 173 | 33554432, // 32MB 174 | 67108864, // 64MB 175 | 134217728, // 128MB 176 | 268435456 // 268MB 177 | ); 178 | 179 | std::vector keys(k_num_elements); // Don't need to initialize the vector for benchmarking 180 | std::vector vals(k_num_elements); 181 | 182 | ShaderStorageBuffer key_buffer(keys); 183 | ShaderStorageBuffer val_buffer(vals); 184 | 185 | RadixSort radix_sort; 186 | 187 | radix_sort.prepare_internal_buffers(k_num_elements); 188 | 189 | uint64_t ns = 190 | measure_gl_elapsed_time([&]() { radix_sort(key_buffer.handle(), val_buffer.handle(), k_num_elements); }); 191 | 192 | printf("Radix sort; Num elements: %zu, Elapsed: %s\n", k_num_elements, ns_to_human_string(ns).c_str()); 193 | } 194 | -------------------------------------------------------------------------------- /test/reduce_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "glu/Reduce.hpp" 8 | #include "util/Random.hpp" 9 | #include "util/StopWatch.hpp" 10 | 11 | using namespace glu; 12 | using namespace Catch::Matchers; 13 | 14 | TEST_CASE("Reduce-simple-uint") 15 | { 16 | const uint32_t k_data[]{32, 35, 1, 3, 95, 10, 22, 24, 44, 37, 7, 80, 33, 54, 46, 23, 14, 84, 11, 67, 17 | 4, 58, 70, 61, 16, 36, 83, 9, 56, 99, 28, 98, 69, 21, 51, 34, 48, 91, 62, 19, 18 | 59, 79, 39, 92, 97, 78, 52, 40, 66, 47, 89, 88, 74, 49, 31, 20, 45, 13, 26, 72, 19 | 43, 30, 65, 94, 63, 8, 60, 15, 93, 86, 41, 75, 12, 73, 55, 90, 64, 96, 53, 1, 20 | 57, 71, 50, 42, 29, 2, 77, 25, 82, 18, 81, 85, 27, 5, 6, 68, 17, 38, 87, 76}; 21 | const size_t k_data_length = std::size(k_data); 22 | 23 | ShaderStorageBuffer buffer(k_data, k_data_length * sizeof(uint32_t)); 24 | 25 | SECTION("sum") 26 | { 27 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 28 | 
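// The reduction runs in place; the final value is left in element 0 of the buffer.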
reduce(buffer.handle(), k_data_length); 29 | CHECK(buffer.get_data()[0] == 4951); 30 | } 31 | 32 | SECTION("mul") 33 | { 34 | Reduce reduce(DataType_Uint, ReduceOperator_Mul); 35 | reduce(buffer.handle(), 5); 36 | CHECK(buffer.get_data()[0] == 319200); 37 | } 38 | 39 | SECTION("min") 40 | { 41 | Reduce reduce(DataType_Uint, ReduceOperator_Min); 42 | reduce(buffer.handle(), k_data_length); 43 | CHECK(buffer.get_data()[0] == 1); 44 | } 45 | 46 | SECTION("max") 47 | { 48 | Reduce reduce(DataType_Uint, ReduceOperator_Max); 49 | reduce(buffer.handle(), k_data_length); 50 | CHECK(buffer.get_data()[0] == 99); 51 | } 52 | } 53 | 54 | TEST_CASE("Reduce-all") 55 | { 56 | SECTION("uint") 57 | { 58 | const std::vector k_data{1, 11, 80, 73, 48, 40, 89, 36, 70, 57}; 59 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 60 | ShaderStorageBuffer buffer(k_data); 61 | reduce(buffer.handle(), k_data.size()); 62 | CHECK(buffer.get_data()[0] == 505); 63 | } 64 | 65 | SECTION("float") 66 | { 67 | const std::vector k_data{42.138f, 18.228f, -19.127f, 86.564f, 11.904f, 68 | 48.538f, 30.606f, 11.338f, -32.699f, -29.587f}; 69 | Reduce reduce(DataType_Float, ReduceOperator_Sum); 70 | ShaderStorageBuffer buffer(k_data); 71 | reduce(buffer.handle(), k_data.size()); 72 | CHECK_THAT(buffer.get_data()[0], WithinAbs(167.9f, 0.1f)); 73 | } 74 | 75 | SECTION("double") 76 | { 77 | const std::vector k_data{-6.20, -56.02, 49.42, 52.38, -23.81, -29.72, 95.46, 77.37, -85.00, 81.74}; 78 | Reduce reduce(DataType_Double, ReduceOperator_Sum); 79 | ShaderStorageBuffer buffer(k_data); 80 | reduce(buffer.handle(), k_data.size()); 81 | CHECK_THAT(buffer.get_data()[0], WithinAbs(155.6, 0.1)); 82 | } 83 | 84 | SECTION("vec2") 85 | { 86 | const std::vector k_data{{-77.08f, 19.54f}, {98.89f, -16.09f}, {10.53f, 91.17f}, {43.06f, -94.18f}, 87 | {-19.18f, 0.86f}, {-49.99f, -92.53f}, {-4.68f, 42.34f}, {2.79f, -4.26f}, 88 | {-17.49f, 43.99f}, {79.45f, -14.58f}}; 89 | Reduce reduce(DataType_Vec2, ReduceOperator_Sum); 90 | ShaderStorageBuffer buffer(k_data); 91 | reduce(buffer.handle(), k_data.size()); 92 | 93 | glm::vec2 sum = buffer.get_data()[0]; 94 | CHECK_THAT(sum.x, WithinAbs(66.29f, 0.1f)); 95 | CHECK_THAT(sum.y, WithinAbs(-23.75f, 0.1f)); 96 | } 97 | 98 | SECTION("vec4") 99 | { 100 | const std::vector k_data{{-17.04f, 1.79f, 82.67f, 39.72f}, {52.66f, 24.75f, -19.05f, 91.92f}, 101 | {19.15f, 44.93f, -52.13f, 18.85f}, {-84.25f, 69.53f, -11.43f, 33.17f}, 102 | {19.46f, -14.30f, -15.20f, -63.83f}, {-20.51f, -56.75f, -2.70f, 82.66f}, 103 | {3.86f, 55.48f, -12.37f, -11.02f}, {-30.62f, -67.54f, -29.89f, -77.30f}, 104 | {-21.55f, 50.46f, 39.34f, 81.08f}, {-56.40f, 84.61f, 90.26f, 13.35f}}; 105 | Reduce reduce(DataType_Vec4, ReduceOperator_Sum); 106 | ShaderStorageBuffer buffer(k_data); 107 | reduce(buffer.handle(), k_data.size()); 108 | 109 | glm::vec4 sum = buffer.get_data()[0]; 110 | CHECK_THAT(sum.x, WithinAbs(-135.24f, 0.1f)); 111 | CHECK_THAT(sum.y, WithinAbs(192.97f, 0.1f)); 112 | CHECK_THAT(sum.z, WithinAbs(69.49f, 0.1f)); 113 | CHECK_THAT(sum.w, WithinAbs(208.59f, 0.1f)); 114 | } 115 | 116 | SECTION("ivec2") 117 | { 118 | const std::vector k_data{{-38, -88}, {57, -34}, {61, 60}, {-90, 73}, {-23, -17}, 119 | {34, -79}, {-80, 53}, {24, -23}, {-88, 69}, {-83, -67}}; 120 | Reduce reduce(DataType_IVec2, ReduceOperator_Sum); 121 | ShaderStorageBuffer buffer(k_data); 122 | reduce(buffer.handle(), k_data.size()); 123 | 124 | glm::ivec2 sum = buffer.get_data()[0]; 125 | CHECK(sum.x == -226); 126 | CHECK(sum.y == -53); 127 | } 128 | 129 | 
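// Vector types are reduced component-wise by the same subgroup operation.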
SECTION("ivec4") 130 | { 131 | const std::vector k_data{{-95, 99, -30, 2}, {-69, 33, 78, 20}, {33, -43, -38, -26}, 132 | {69, -67, -17, -57}, {18, -23, -2, -53}, {88, -96, 40, -48}, 133 | {-93, -47, -91, 59}, {-89, 82, 10, 94}, {-15, 7, 41, 14}, 134 | {63, 53, -40, 53}}; 135 | Reduce reduce(DataType_IVec4, ReduceOperator_Sum); 136 | ShaderStorageBuffer buffer(k_data); 137 | reduce(buffer.handle(), k_data.size()); 138 | 139 | glm::ivec4 sum = buffer.get_data()[0]; 140 | CHECK(sum.x == -90); 141 | CHECK(sum.y == -2); 142 | CHECK(sum.z == -49); 143 | CHECK(sum.w == 58); 144 | } 145 | } 146 | 147 | TEST_CASE("Reduce-subgroup-fitting-size") 148 | { 149 | const size_t k_num_elements = GENERATE(32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072); 150 | 151 | const uint64_t k_seed = 1; 152 | Random random(k_seed); 153 | 154 | std::vector data = random.sample_int_vector(k_num_elements, 0, 100); 155 | GLuint sum = std::accumulate(data.begin(), data.end(), GLuint(0)); 156 | 157 | ShaderStorageBuffer buffer(data.data(), data.size() * sizeof(uint32_t)); 158 | 159 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 160 | reduce(buffer.handle(), data.size()); 161 | 162 | uint32_t calc_sum = buffer.get_data()[0]; 163 | CHECK(calc_sum == sum); 164 | } 165 | 166 | TEST_CASE("Reduce-subgroup-non-fitting-size") 167 | { 168 | const size_t k_num_elements = GENERATE(1, 31, 93, 201, 693, 2087, 7358, 88289, 345897, 6094798, 5238082, 10043898); 169 | 170 | const uint64_t k_seed = 1; 171 | Random random(k_seed); 172 | 173 | std::vector data = random.sample_int_vector(k_num_elements, 0, 100); 174 | GLuint sum = std::accumulate(data.begin(), data.end(), GLuint(0)); 175 | 176 | ShaderStorageBuffer buffer(data.data(), data.size() * sizeof(GLuint)); 177 | 178 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 179 | reduce(buffer.handle(), data.size()); 180 | 181 | GLuint calc_sum = buffer.get_data()[0]; 182 | CHECK(calc_sum == sum); 183 | } 184 | 185 | TEST_CASE("Reduce-benchmark", "[.][benchmark]") 186 | { 187 | const size_t k_num_elements = GENERATE( 188 | 1024, // 1KB 189 | 16384, // 16KB 190 | 65536, // 65KB 191 | 131072, // 131KB 192 | 524288, // 524KB 193 | 1048576, // 1MB 194 | 16777216, // 16MB 195 | 67108864, // 67MB 196 | 134217728, // 134MB 197 | 268435456 // 268MB 198 | ); 199 | 200 | std::vector data(k_num_elements); // Don't need to initialize the vector for benchmarking 201 | 202 | ShaderStorageBuffer buffer(data); 203 | 204 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 205 | 206 | uint64_t ns = measure_gl_elapsed_time([&]() { reduce(buffer.handle(), k_num_elements); }); 207 | 208 | printf("Reduce; Num elements: %zu, Elapsed: %s\n", k_num_elements, ns_to_human_string(ns).c_str()); 209 | } 210 | -------------------------------------------------------------------------------- /test/util/Random.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace glu 8 | { 9 | class Random 10 | { 11 | private: 12 | std::optional m_random_engine; 13 | 14 | public: 15 | explicit Random(uint64_t seed = 0) : 16 | m_random_engine(seed) 17 | { 18 | m_random_engine = 19 | seed != 0 ? 
std::make_optional(seed) : std::make_optional(); 20 | } 21 | 22 | ~Random() = default; 23 | 24 | template 25 | IntegerT sample_int(IntegerT min, IntegerT max) 26 | { 27 | GLU_CHECK_ARGUMENT(min < max, "Min must be strictly lower than Max"); 28 | return ((*m_random_engine)() % (max - min)) + min; 29 | } 30 | 31 | template 32 | std::vector sample_int_vector(size_t num_elements, IntegerT min, IntegerT max) 33 | { 34 | std::vector result(num_elements); 35 | for (size_t i = 0; i < num_elements; i++) 36 | result[i] = sample_int(min, max); 37 | return result; 38 | } 39 | }; 40 | } // namespace glu 41 | -------------------------------------------------------------------------------- /test/util/StopWatch.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace glu 9 | { 10 | /// Converts the given nanoseconds to a human readable string. 11 | inline std::string ns_to_human_string(uint64_t ns) 12 | { 13 | double ms = (double(ns) / 1000.0) / 1000.0; 14 | double s = ms / 1000.0; 15 | 16 | if (s >= 0.1) 17 | { 18 | std::stringstream stream; // header not available with g++-12 19 | stream << std::fixed << std::setprecision(3) << s; 20 | return stream.str() + " s"; 21 | } 22 | else if (ms >= 0.001) 23 | { 24 | std::stringstream stream; // header not available with g++-12 25 | stream << std::fixed << std::setprecision(3) << ms; 26 | return stream.str() + " ms"; 27 | } 28 | else 29 | { 30 | return std::to_string(ns) + " ns"; 31 | } 32 | } 33 | 34 | class StopWatch 35 | { 36 | using ClockT = std::chrono::system_clock; 37 | 38 | private: 39 | ClockT::time_point m_start; 40 | 41 | public: 42 | StopWatch() { reset(); }; 43 | ~StopWatch() = default; 44 | 45 | void reset() { m_start = std::chrono::system_clock::now(); } 46 | 47 | ClockT::duration elapsed_time() { return std::chrono::system_clock::now() - m_start; } 48 | 49 | uint64_t elapsed_millis() 50 | { 51 | return std::chrono::duration_cast(elapsed_time()).count(); 52 | } 53 | uint64_t elapsed_nanos() 54 | { 55 | return std::chrono::duration_cast(elapsed_time()).count(); 56 | } 57 | 58 | std::string elapsed_time_str() { return ns_to_human_string(elapsed_nanos()); } 59 | }; 60 | } // namespace glu 61 | -------------------------------------------------------------------------------- /third_party/glad/include/KHR/khrplatform.h: -------------------------------------------------------------------------------- 1 | #ifndef __khrplatform_h_ 2 | #define __khrplatform_h_ 3 | 4 | /* 5 | ** Copyright (c) 2008-2018 The Khronos Group Inc. 6 | ** 7 | ** Permission is hereby granted, free of charge, to any person obtaining a 8 | ** copy of this software and/or associated documentation files (the 9 | ** "Materials"), to deal in the Materials without restriction, including 10 | ** without limitation the rights to use, copy, modify, merge, publish, 11 | ** distribute, sublicense, and/or sell copies of the Materials, and to 12 | ** permit persons to whom the Materials are furnished to do so, subject to 13 | ** the following conditions: 14 | ** 15 | ** The above copyright notice and this permission notice shall be included 16 | ** in all copies or substantial portions of the Materials. 17 | ** 18 | ** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
21 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | ** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 25 | */ 26 | 27 | /* Khronos platform-specific types and definitions. 28 | * 29 | * The master copy of khrplatform.h is maintained in the Khronos EGL 30 | * Registry repository at https://github.com/KhronosGroup/EGL-Registry 31 | * The last semantic modification to khrplatform.h was at commit ID: 32 | * 67a3e0864c2d75ea5287b9f3d2eb74a745936692 33 | * 34 | * Adopters may modify this file to suit their platform. Adopters are 35 | * encouraged to submit platform specific modifications to the Khronos 36 | * group so that they can be included in future versions of this file. 37 | * Please submit changes by filing pull requests or issues on 38 | * the EGL Registry repository linked above. 39 | * 40 | * 41 | * See the Implementer's Guidelines for information about where this file 42 | * should be located on your system and for more details of its use: 43 | * http://www.khronos.org/registry/implementers_guide.pdf 44 | * 45 | * This file should be included as 46 | * #include 47 | * by Khronos client API header files that use its types and defines. 48 | * 49 | * The types in khrplatform.h should only be used to define API-specific types. 50 | * 51 | * Types defined in khrplatform.h: 52 | * khronos_int8_t signed 8 bit 53 | * khronos_uint8_t unsigned 8 bit 54 | * khronos_int16_t signed 16 bit 55 | * khronos_uint16_t unsigned 16 bit 56 | * khronos_int32_t signed 32 bit 57 | * khronos_uint32_t unsigned 32 bit 58 | * khronos_int64_t signed 64 bit 59 | * khronos_uint64_t unsigned 64 bit 60 | * khronos_intptr_t signed same number of bits as a pointer 61 | * khronos_uintptr_t unsigned same number of bits as a pointer 62 | * khronos_ssize_t signed size 63 | * khronos_usize_t unsigned size 64 | * khronos_float_t signed 32 bit floating point 65 | * khronos_time_ns_t unsigned 64 bit time in nanoseconds 66 | * khronos_utime_nanoseconds_t unsigned time interval or absolute time in 67 | * nanoseconds 68 | * khronos_stime_nanoseconds_t signed time interval in nanoseconds 69 | * khronos_boolean_enum_t enumerated boolean type. This should 70 | * only be used as a base type when a client API's boolean type is 71 | * an enum. Client APIs which use an integer or other type for 72 | * booleans cannot use this as the base type for their boolean. 73 | * 74 | * Tokens defined in khrplatform.h: 75 | * 76 | * KHRONOS_FALSE, KHRONOS_TRUE Enumerated boolean false/true values. 77 | * 78 | * KHRONOS_SUPPORT_INT64 is 1 if 64 bit integers are supported; otherwise 0. 79 | * KHRONOS_SUPPORT_FLOAT is 1 if floats are supported; otherwise 0. 80 | * 81 | * Calling convention macros defined in this file: 82 | * KHRONOS_APICALL 83 | * KHRONOS_APIENTRY 84 | * KHRONOS_APIATTRIBUTES 85 | * 86 | * These may be used in function prototypes as: 87 | * 88 | * KHRONOS_APICALL void KHRONOS_APIENTRY funcname( 89 | * int arg1, 90 | * int arg2) KHRONOS_APIATTRIBUTES; 91 | */ 92 | 93 | #if defined(__SCITECH_SNAP__) && !defined(KHRONOS_STATIC) 94 | # define KHRONOS_STATIC 1 95 | #endif 96 | 97 | /*------------------------------------------------------------------------- 98 | * Definition of KHRONOS_APICALL 99 | *------------------------------------------------------------------------- 100 | * This precedes the return type of the function in the function prototype. 
101 | */ 102 | #if defined(KHRONOS_STATIC) 103 | /* If the preprocessor constant KHRONOS_STATIC is defined, make the 104 | * header compatible with static linking. */ 105 | # define KHRONOS_APICALL 106 | #elif defined(_WIN32) 107 | # define KHRONOS_APICALL __declspec(dllimport) 108 | #elif defined (__SYMBIAN32__) 109 | # define KHRONOS_APICALL IMPORT_C 110 | #elif defined(__ANDROID__) 111 | # define KHRONOS_APICALL __attribute__((visibility("default"))) 112 | #else 113 | # define KHRONOS_APICALL 114 | #endif 115 | 116 | /*------------------------------------------------------------------------- 117 | * Definition of KHRONOS_APIENTRY 118 | *------------------------------------------------------------------------- 119 | * This follows the return type of the function and precedes the function 120 | * name in the function prototype. 121 | */ 122 | #if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__) 123 | /* Win32 but not WinCE */ 124 | # define KHRONOS_APIENTRY __stdcall 125 | #else 126 | # define KHRONOS_APIENTRY 127 | #endif 128 | 129 | /*------------------------------------------------------------------------- 130 | * Definition of KHRONOS_APIATTRIBUTES 131 | *------------------------------------------------------------------------- 132 | * This follows the closing parenthesis of the function prototype arguments. 133 | */ 134 | #if defined (__ARMCC_2__) 135 | #define KHRONOS_APIATTRIBUTES __softfp 136 | #else 137 | #define KHRONOS_APIATTRIBUTES 138 | #endif 139 | 140 | /*------------------------------------------------------------------------- 141 | * basic type definitions 142 | *-----------------------------------------------------------------------*/ 143 | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__GNUC__) || defined(__SCO__) || defined(__USLC__) 144 | 145 | 146 | /* 147 | * Using 148 | */ 149 | #include 150 | typedef int32_t khronos_int32_t; 151 | typedef uint32_t khronos_uint32_t; 152 | typedef int64_t khronos_int64_t; 153 | typedef uint64_t khronos_uint64_t; 154 | #define KHRONOS_SUPPORT_INT64 1 155 | #define KHRONOS_SUPPORT_FLOAT 1 156 | 157 | #elif defined(__VMS ) || defined(__sgi) 158 | 159 | /* 160 | * Using 161 | */ 162 | #include 163 | typedef int32_t khronos_int32_t; 164 | typedef uint32_t khronos_uint32_t; 165 | typedef int64_t khronos_int64_t; 166 | typedef uint64_t khronos_uint64_t; 167 | #define KHRONOS_SUPPORT_INT64 1 168 | #define KHRONOS_SUPPORT_FLOAT 1 169 | 170 | #elif defined(_WIN32) && !defined(__SCITECH_SNAP__) 171 | 172 | /* 173 | * Win32 174 | */ 175 | typedef __int32 khronos_int32_t; 176 | typedef unsigned __int32 khronos_uint32_t; 177 | typedef __int64 khronos_int64_t; 178 | typedef unsigned __int64 khronos_uint64_t; 179 | #define KHRONOS_SUPPORT_INT64 1 180 | #define KHRONOS_SUPPORT_FLOAT 1 181 | 182 | #elif defined(__sun__) || defined(__digital__) 183 | 184 | /* 185 | * Sun or Digital 186 | */ 187 | typedef int khronos_int32_t; 188 | typedef unsigned int khronos_uint32_t; 189 | #if defined(__arch64__) || defined(_LP64) 190 | typedef long int khronos_int64_t; 191 | typedef unsigned long int khronos_uint64_t; 192 | #else 193 | typedef long long int khronos_int64_t; 194 | typedef unsigned long long int khronos_uint64_t; 195 | #endif /* __arch64__ */ 196 | #define KHRONOS_SUPPORT_INT64 1 197 | #define KHRONOS_SUPPORT_FLOAT 1 198 | 199 | #elif 0 200 | 201 | /* 202 | * Hypothetical platform with no float or int64 support 203 | */ 204 | typedef int khronos_int32_t; 205 | typedef unsigned int khronos_uint32_t; 206 
| #define KHRONOS_SUPPORT_INT64 0 207 | #define KHRONOS_SUPPORT_FLOAT 0 208 | 209 | #else 210 | 211 | /* 212 | * Generic fallback 213 | */ 214 | #include 215 | typedef int32_t khronos_int32_t; 216 | typedef uint32_t khronos_uint32_t; 217 | typedef int64_t khronos_int64_t; 218 | typedef uint64_t khronos_uint64_t; 219 | #define KHRONOS_SUPPORT_INT64 1 220 | #define KHRONOS_SUPPORT_FLOAT 1 221 | 222 | #endif 223 | 224 | 225 | /* 226 | * Types that are (so far) the same on all platforms 227 | */ 228 | typedef signed char khronos_int8_t; 229 | typedef unsigned char khronos_uint8_t; 230 | typedef signed short int khronos_int16_t; 231 | typedef unsigned short int khronos_uint16_t; 232 | 233 | /* 234 | * Types that differ between LLP64 and LP64 architectures - in LLP64, 235 | * pointers are 64 bits, but 'long' is still 32 bits. Win64 appears 236 | * to be the only LLP64 architecture in current use. 237 | */ 238 | #ifdef _WIN64 239 | typedef signed long long int khronos_intptr_t; 240 | typedef unsigned long long int khronos_uintptr_t; 241 | typedef signed long long int khronos_ssize_t; 242 | typedef unsigned long long int khronos_usize_t; 243 | #else 244 | typedef signed long int khronos_intptr_t; 245 | typedef unsigned long int khronos_uintptr_t; 246 | typedef signed long int khronos_ssize_t; 247 | typedef unsigned long int khronos_usize_t; 248 | #endif 249 | 250 | #if KHRONOS_SUPPORT_FLOAT 251 | /* 252 | * Float type 253 | */ 254 | typedef float khronos_float_t; 255 | #endif 256 | 257 | #if KHRONOS_SUPPORT_INT64 258 | /* Time types 259 | * 260 | * These types can be used to represent a time interval in nanoseconds or 261 | * an absolute Unadjusted System Time. Unadjusted System Time is the number 262 | * of nanoseconds since some arbitrary system event (e.g. since the last 263 | * time the system booted). The Unadjusted System Time is an unsigned 264 | * 64 bit value that wraps back to 0 every 584 years. Time intervals 265 | * may be either signed or unsigned. 266 | */ 267 | typedef khronos_uint64_t khronos_utime_nanoseconds_t; 268 | typedef khronos_int64_t khronos_stime_nanoseconds_t; 269 | #endif 270 | 271 | /* 272 | * Dummy value used to pad enum types to 32 bits. 273 | */ 274 | #ifndef KHRONOS_MAX_ENUM 275 | #define KHRONOS_MAX_ENUM 0x7FFFFFFF 276 | #endif 277 | 278 | /* 279 | * Enumerated boolean type 280 | * 281 | * Values other than zero should be considered to be true. Therefore 282 | * comparisons should not be made against KHRONOS_TRUE. 
283 | */ 284 | typedef enum { 285 | KHRONOS_FALSE = 0, 286 | KHRONOS_TRUE = 1, 287 | KHRONOS_BOOLEAN_ENUM_FORCE_SIZE = KHRONOS_MAX_ENUM 288 | } khronos_boolean_enum_t; 289 | 290 | #endif /* __khrplatform_h_ */ 291 | -------------------------------------------------------------------------------- /third_party/renderdoc/renderdoc_app.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * The MIT License (MIT) 3 | * 4 | * Copyright (c) 2019-2021 Baldur Karlsson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | ******************************************************************************/ 24 | 25 | #pragma once 26 | 27 | ////////////////////////////////////////////////////////////////////////////////////////////////// 28 | // 29 | // Documentation for the API is available at https://renderdoc.org/docs/in_application_api.html 30 | // 31 | 32 | #if !defined(RENDERDOC_NO_STDINT) 33 | #include 34 | #endif 35 | 36 | #if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) 37 | #define RENDERDOC_CC __cdecl 38 | #elif defined(__linux__) 39 | #define RENDERDOC_CC 40 | #elif defined(__APPLE__) 41 | #define RENDERDOC_CC 42 | #else 43 | #error "Unknown platform" 44 | #endif 45 | 46 | #ifdef __cplusplus 47 | extern "C" { 48 | #endif 49 | 50 | ////////////////////////////////////////////////////////////////////////////////////////////////// 51 | // Constants not used directly in below API 52 | 53 | // This is a GUID/magic value used for when applications pass a path where shader debug 54 | // information can be found to match up with a stripped shader. 55 | // the define can be used like so: const GUID RENDERDOC_ShaderDebugMagicValue = 56 | // RENDERDOC_ShaderDebugMagicValue_value 57 | #define RENDERDOC_ShaderDebugMagicValue_struct \ 58 | { \ 59 | 0xeab25520, 0x6670, 0x4865, 0x84, 0x29, 0x6c, 0x8, 0x51, 0x54, 0x00, 0xff \ 60 | } 61 | 62 | // as an alternative when you want a byte array (assuming x86 endianness): 63 | #define RENDERDOC_ShaderDebugMagicValue_bytearray \ 64 | { \ 65 | 0x20, 0x55, 0xb2, 0xea, 0x70, 0x66, 0x65, 0x48, 0x84, 0x29, 0x6c, 0x8, 0x51, 0x54, 0x00, 0xff \ 66 | } 67 | 68 | // truncated version when only a uint64_t is available (e.g. 
Vulkan tags): 69 | #define RENDERDOC_ShaderDebugMagicValue_truncated 0x48656670eab25520ULL 70 | 71 | ////////////////////////////////////////////////////////////////////////////////////////////////// 72 | // RenderDoc capture options 73 | // 74 | 75 | typedef enum RENDERDOC_CaptureOption { 76 | // Allow the application to enable vsync 77 | // 78 | // Default - enabled 79 | // 80 | // 1 - The application can enable or disable vsync at will 81 | // 0 - vsync is force disabled 82 | eRENDERDOC_Option_AllowVSync = 0, 83 | 84 | // Allow the application to enable fullscreen 85 | // 86 | // Default - enabled 87 | // 88 | // 1 - The application can enable or disable fullscreen at will 89 | // 0 - fullscreen is force disabled 90 | eRENDERDOC_Option_AllowFullscreen = 1, 91 | 92 | // Record API debugging events and messages 93 | // 94 | // Default - disabled 95 | // 96 | // 1 - Enable built-in API debugging features and records the results into 97 | // the capture, which is matched up with events on replay 98 | // 0 - no API debugging is forcibly enabled 99 | eRENDERDOC_Option_APIValidation = 2, 100 | eRENDERDOC_Option_DebugDeviceMode = 2, // deprecated name of this enum 101 | 102 | // Capture CPU callstacks for API events 103 | // 104 | // Default - disabled 105 | // 106 | // 1 - Enables capturing of callstacks 107 | // 0 - no callstacks are captured 108 | eRENDERDOC_Option_CaptureCallstacks = 3, 109 | 110 | // When capturing CPU callstacks, only capture them from actions. 111 | // This option does nothing without the above option being enabled 112 | // 113 | // Default - disabled 114 | // 115 | // 1 - Only captures callstacks for actions. 116 | // Ignored if CaptureCallstacks is disabled 117 | // 0 - Callstacks, if enabled, are captured for every event. 118 | eRENDERDOC_Option_CaptureCallstacksOnlyDraws = 4, 119 | eRENDERDOC_Option_CaptureCallstacksOnlyActions = 4, 120 | 121 | // Specify a delay in seconds to wait for a debugger to attach, after 122 | // creating or injecting into a process, before continuing to allow it to run. 123 | // 124 | // 0 indicates no delay, and the process will run immediately after injection 125 | // 126 | // Default - 0 seconds 127 | // 128 | eRENDERDOC_Option_DelayForDebugger = 5, 129 | 130 | // Verify buffer access. This includes checking the memory returned by a Map() call to 131 | // detect any out-of-bounds modification, as well as initialising buffers with undefined contents 132 | // to a marker value to catch use of uninitialised memory. 133 | // 134 | // NOTE: This option is only valid for OpenGL and D3D11. Explicit APIs such as D3D12 and Vulkan do 135 | // not do the same kind of interception & checking and undefined contents are really undefined. 136 | // 137 | // Default - disabled 138 | // 139 | // 1 - Verify buffer access 140 | // 0 - No verification is performed, and overwriting bounds may cause crashes or corruption in 141 | // RenderDoc. 142 | eRENDERDOC_Option_VerifyBufferAccess = 6, 143 | 144 | // The old name for eRENDERDOC_Option_VerifyBufferAccess was eRENDERDOC_Option_VerifyMapWrites. 145 | // This option now controls the filling of uninitialised buffers with 0xdddddddd which was 146 | // previously always enabled 147 | eRENDERDOC_Option_VerifyMapWrites = eRENDERDOC_Option_VerifyBufferAccess, 148 | 149 | // Hooks any system API calls that create child processes, and injects 150 | // RenderDoc into them recursively with the same options. 
151 | // 152 | // Default - disabled 153 | // 154 | // 1 - Hooks into spawned child processes 155 | // 0 - Child processes are not hooked by RenderDoc 156 | eRENDERDOC_Option_HookIntoChildren = 7, 157 | 158 | // By default RenderDoc only includes resources in the final capture necessary 159 | // for that frame, this allows you to override that behaviour. 160 | // 161 | // Default - disabled 162 | // 163 | // 1 - all live resources at the time of capture are included in the capture 164 | // and available for inspection 165 | // 0 - only the resources referenced by the captured frame are included 166 | eRENDERDOC_Option_RefAllResources = 8, 167 | 168 | // **NOTE**: As of RenderDoc v1.1 this option has been deprecated. Setting or 169 | // getting it will be ignored, to allow compatibility with older versions. 170 | // In v1.1 the option acts as if it's always enabled. 171 | // 172 | // By default RenderDoc skips saving initial states for resources where the 173 | // previous contents don't appear to be used, assuming that writes before 174 | // reads indicate previous contents aren't used. 175 | // 176 | // Default - disabled 177 | // 178 | // 1 - initial contents at the start of each captured frame are saved, even if 179 | // they are later overwritten or cleared before being used. 180 | // 0 - unless a read is detected, initial contents will not be saved and will 181 | // appear as black or empty data. 182 | eRENDERDOC_Option_SaveAllInitials = 9, 183 | 184 | // In APIs that allow for the recording of command lists to be replayed later, 185 | // RenderDoc may choose to not capture command lists before a frame capture is 186 | // triggered, to reduce overheads. This means any command lists recorded once 187 | // and replayed many times will not be available and may cause a failure to 188 | // capture. 189 | // 190 | // NOTE: This is only true for APIs where multithreading is difficult or 191 | // discouraged. Newer APIs like Vulkan and D3D12 will ignore this option 192 | // and always capture all command lists since the API is heavily oriented 193 | // around it and the overheads have been reduced by API design. 194 | // 195 | // 1 - All command lists are captured from the start of the application 196 | // 0 - Command lists are only captured if their recording begins during 197 | // the period when a frame capture is in progress. 198 | eRENDERDOC_Option_CaptureAllCmdLists = 10, 199 | 200 | // Mute API debugging output when the API validation mode option is enabled 201 | // 202 | // Default - enabled 203 | // 204 | // 1 - Mute any API debug messages from being displayed or passed through 205 | // 0 - API debugging is displayed as normal 206 | eRENDERDOC_Option_DebugOutputMute = 11, 207 | 208 | // Option to allow vendor extensions to be used even when they may be 209 | // incompatible with RenderDoc and cause corrupted replays or crashes. 210 | // 211 | // Default - inactive 212 | // 213 | // No values are documented, this option should only be used when absolutely 214 | // necessary as directed by a RenderDoc developer. 215 | eRENDERDOC_Option_AllowUnsupportedVendorExtensions = 12, 216 | 217 | } RENDERDOC_CaptureOption; 218 | 219 | // Sets an option that controls how RenderDoc behaves on capture. 
220 | // 221 | // Returns 1 if the option and value are valid 222 | // Returns 0 if either is invalid and the option is unchanged 223 | typedef int(RENDERDOC_CC *pRENDERDOC_SetCaptureOptionU32)(RENDERDOC_CaptureOption opt, uint32_t val); 224 | typedef int(RENDERDOC_CC *pRENDERDOC_SetCaptureOptionF32)(RENDERDOC_CaptureOption opt, float val); 225 | 226 | // Gets the current value of an option as a uint32_t 227 | // 228 | // If the option is invalid, 0xffffffff is returned 229 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetCaptureOptionU32)(RENDERDOC_CaptureOption opt); 230 | 231 | // Gets the current value of an option as a float 232 | // 233 | // If the option is invalid, -FLT_MAX is returned 234 | typedef float(RENDERDOC_CC *pRENDERDOC_GetCaptureOptionF32)(RENDERDOC_CaptureOption opt); 235 | 236 | typedef enum RENDERDOC_InputButton { 237 | // '0' - '9' matches ASCII values 238 | eRENDERDOC_Key_0 = 0x30, 239 | eRENDERDOC_Key_1 = 0x31, 240 | eRENDERDOC_Key_2 = 0x32, 241 | eRENDERDOC_Key_3 = 0x33, 242 | eRENDERDOC_Key_4 = 0x34, 243 | eRENDERDOC_Key_5 = 0x35, 244 | eRENDERDOC_Key_6 = 0x36, 245 | eRENDERDOC_Key_7 = 0x37, 246 | eRENDERDOC_Key_8 = 0x38, 247 | eRENDERDOC_Key_9 = 0x39, 248 | 249 | // 'A' - 'Z' matches ASCII values 250 | eRENDERDOC_Key_A = 0x41, 251 | eRENDERDOC_Key_B = 0x42, 252 | eRENDERDOC_Key_C = 0x43, 253 | eRENDERDOC_Key_D = 0x44, 254 | eRENDERDOC_Key_E = 0x45, 255 | eRENDERDOC_Key_F = 0x46, 256 | eRENDERDOC_Key_G = 0x47, 257 | eRENDERDOC_Key_H = 0x48, 258 | eRENDERDOC_Key_I = 0x49, 259 | eRENDERDOC_Key_J = 0x4A, 260 | eRENDERDOC_Key_K = 0x4B, 261 | eRENDERDOC_Key_L = 0x4C, 262 | eRENDERDOC_Key_M = 0x4D, 263 | eRENDERDOC_Key_N = 0x4E, 264 | eRENDERDOC_Key_O = 0x4F, 265 | eRENDERDOC_Key_P = 0x50, 266 | eRENDERDOC_Key_Q = 0x51, 267 | eRENDERDOC_Key_R = 0x52, 268 | eRENDERDOC_Key_S = 0x53, 269 | eRENDERDOC_Key_T = 0x54, 270 | eRENDERDOC_Key_U = 0x55, 271 | eRENDERDOC_Key_V = 0x56, 272 | eRENDERDOC_Key_W = 0x57, 273 | eRENDERDOC_Key_X = 0x58, 274 | eRENDERDOC_Key_Y = 0x59, 275 | eRENDERDOC_Key_Z = 0x5A, 276 | 277 | // leave the rest of the ASCII range free 278 | // in case we want to use it later 279 | eRENDERDOC_Key_NonPrintable = 0x100, 280 | 281 | eRENDERDOC_Key_Divide, 282 | eRENDERDOC_Key_Multiply, 283 | eRENDERDOC_Key_Subtract, 284 | eRENDERDOC_Key_Plus, 285 | 286 | eRENDERDOC_Key_F1, 287 | eRENDERDOC_Key_F2, 288 | eRENDERDOC_Key_F3, 289 | eRENDERDOC_Key_F4, 290 | eRENDERDOC_Key_F5, 291 | eRENDERDOC_Key_F6, 292 | eRENDERDOC_Key_F7, 293 | eRENDERDOC_Key_F8, 294 | eRENDERDOC_Key_F9, 295 | eRENDERDOC_Key_F10, 296 | eRENDERDOC_Key_F11, 297 | eRENDERDOC_Key_F12, 298 | 299 | eRENDERDOC_Key_Home, 300 | eRENDERDOC_Key_End, 301 | eRENDERDOC_Key_Insert, 302 | eRENDERDOC_Key_Delete, 303 | eRENDERDOC_Key_PageUp, 304 | eRENDERDOC_Key_PageDn, 305 | 306 | eRENDERDOC_Key_Backspace, 307 | eRENDERDOC_Key_Tab, 308 | eRENDERDOC_Key_PrtScrn, 309 | eRENDERDOC_Key_Pause, 310 | 311 | eRENDERDOC_Key_Max, 312 | } RENDERDOC_InputButton; 313 | 314 | // Sets which key or keys can be used to toggle focus between multiple windows 315 | // 316 | // If keys is NULL or num is 0, toggle keys will be disabled 317 | typedef void(RENDERDOC_CC *pRENDERDOC_SetFocusToggleKeys)(RENDERDOC_InputButton *keys, int num); 318 | 319 | // Sets which key or keys can be used to capture the next frame 320 | // 321 | // If keys is NULL or num is 0, captures keys will be disabled 322 | typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureKeys)(RENDERDOC_InputButton *keys, int num); 323 | 324 | typedef enum RENDERDOC_OverlayBits { 325 
| // This single bit controls whether the overlay is enabled or disabled globally 326 | eRENDERDOC_Overlay_Enabled = 0x1, 327 | 328 | // Show the average framerate over several seconds as well as min/max 329 | eRENDERDOC_Overlay_FrameRate = 0x2, 330 | 331 | // Show the current frame number 332 | eRENDERDOC_Overlay_FrameNumber = 0x4, 333 | 334 | // Show a list of recent captures, and how many captures have been made 335 | eRENDERDOC_Overlay_CaptureList = 0x8, 336 | 337 | // Default values for the overlay mask 338 | eRENDERDOC_Overlay_Default = (eRENDERDOC_Overlay_Enabled | eRENDERDOC_Overlay_FrameRate | 339 | eRENDERDOC_Overlay_FrameNumber | eRENDERDOC_Overlay_CaptureList), 340 | 341 | // Enable all bits 342 | eRENDERDOC_Overlay_All = ~0U, 343 | 344 | // Disable all bits 345 | eRENDERDOC_Overlay_None = 0, 346 | } RENDERDOC_OverlayBits; 347 | 348 | // returns the overlay bits that have been set 349 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetOverlayBits)(); 350 | // sets the overlay bits with an and & or mask 351 | typedef void(RENDERDOC_CC *pRENDERDOC_MaskOverlayBits)(uint32_t And, uint32_t Or); 352 | 353 | // this function will attempt to remove RenderDoc's hooks in the application. 354 | // 355 | // Note: that this can only work correctly if done immediately after 356 | // the module is loaded, before any API work happens. RenderDoc will remove its 357 | // injected hooks and shut down. Behaviour is undefined if this is called 358 | // after any API functions have been called, and there is still no guarantee of 359 | // success. 360 | typedef void(RENDERDOC_CC *pRENDERDOC_RemoveHooks)(); 361 | 362 | // DEPRECATED: compatibility for code compiled against pre-1.4.1 headers. 363 | typedef pRENDERDOC_RemoveHooks pRENDERDOC_Shutdown; 364 | 365 | // This function will unload RenderDoc's crash handler. 366 | // 367 | // If you use your own crash handler and don't want RenderDoc's handler to 368 | // intercede, you can call this function to unload it and any unhandled 369 | // exceptions will pass to the next handler. 370 | typedef void(RENDERDOC_CC *pRENDERDOC_UnloadCrashHandler)(); 371 | 372 | // Sets the capture file path template 373 | // 374 | // pathtemplate is a UTF-8 string that gives a template for how captures will be named 375 | // and where they will be saved. 376 | // 377 | // Any extension is stripped off the path, and captures are saved in the directory 378 | // specified, and named with the filename and the frame number appended. If the 379 | // directory does not exist it will be created, including any parent directories. 380 | // 381 | // If pathtemplate is NULL, the template will remain unchanged 382 | // 383 | // Example: 384 | // 385 | // SetCaptureFilePathTemplate("my_captures/example"); 386 | // 387 | // Capture #1 -> my_captures/example_frame123.rdc 388 | // Capture #2 -> my_captures/example_frame456.rdc 389 | typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureFilePathTemplate)(const char *pathtemplate); 390 | 391 | // returns the current capture path template, see SetCaptureFileTemplate above, as a UTF-8 string 392 | typedef const char *(RENDERDOC_CC *pRENDERDOC_GetCaptureFilePathTemplate)(); 393 | 394 | // DEPRECATED: compatibility for code compiled against pre-1.1.2 headers. 
395 | typedef pRENDERDOC_SetCaptureFilePathTemplate pRENDERDOC_SetLogFilePathTemplate; 396 | typedef pRENDERDOC_GetCaptureFilePathTemplate pRENDERDOC_GetLogFilePathTemplate; 397 | 398 | // returns the number of captures that have been made 399 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetNumCaptures)(); 400 | 401 | // This function returns the details of a capture, by index. New captures are added 402 | // to the end of the list. 403 | // 404 | // filename will be filled with the absolute path to the capture file, as a UTF-8 string 405 | // pathlength will be written with the length in bytes of the filename string 406 | // timestamp will be written with the time of the capture, in seconds since the Unix epoch 407 | // 408 | // Any of the parameters can be NULL and they'll be skipped. 409 | // 410 | // The function will return 1 if the capture index is valid, or 0 if the index is invalid 411 | // If the index is invalid, the values will be unchanged 412 | // 413 | // Note: when captures are deleted in the UI they will remain in this list, so the 414 | // capture path may not exist anymore. 415 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetCapture)(uint32_t idx, char *filename, 416 | uint32_t *pathlength, uint64_t *timestamp); 417 | 418 | // Sets the comments associated with a capture file. These comments are displayed in the 419 | // UI program when opening. 420 | // 421 | // filePath should be a path to the capture file to add comments to. If set to NULL or "" 422 | // the most recent capture file created will be used instead. 423 | // comments should be a NULL-terminated UTF-8 string to add as comments. 424 | // 425 | // Any existing comments will be overwritten. 426 | typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureFileComments)(const char *filePath, 427 | const char *comments); 428 | 429 | // returns 1 if the RenderDoc UI is connected to this application, 0 otherwise 430 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_IsTargetControlConnected)(); 431 | 432 | // DEPRECATED: compatibility for code compiled against pre-1.1.1 headers. 433 | // This was renamed to IsTargetControlConnected in API 1.1.1; the old typedef is kept here for 434 | // backwards compatibility with old code. It is castable either way since it's ABI compatible 435 | // as the same function pointer type. 436 | typedef pRENDERDOC_IsTargetControlConnected pRENDERDOC_IsRemoteAccessConnected; 437 | 438 | // This function will launch the Replay UI associated with the RenderDoc library injected 439 | // into the running application. 440 | // 441 | // if connectTargetControl is 1, the Replay UI will be launched with a command line parameter 442 | // to connect to this application 443 | // cmdline is the rest of the command line, as a UTF-8 string. E.g. a capture file to open 444 | // if cmdline is NULL, the command line will be empty. 445 | // 446 | // returns the PID of the replay UI if successful, 0 if not successful. 447 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_LaunchReplayUI)(uint32_t connectTargetControl, 448 | const char *cmdline); 449 | 450 | // RenderDoc can return a higher version than requested if it's backwards compatible; 451 | // this function returns the actual version returned. If a parameter is NULL, it will be 452 | // ignored and the others will be filled out.
453 | typedef void(RENDERDOC_CC *pRENDERDOC_GetAPIVersion)(int *major, int *minor, int *patch); 454 | 455 | ////////////////////////////////////////////////////////////////////////// 456 | // Capturing functions 457 | // 458 | 459 | // A device pointer is a pointer to the API's root handle. 460 | // 461 | // This would be an ID3D11Device, HGLRC/GLXContext, ID3D12Device, etc 462 | typedef void *RENDERDOC_DevicePointer; 463 | 464 | // A window handle is the OS's native window handle 465 | // 466 | // This would be an HWND, GLXDrawable, etc 467 | typedef void *RENDERDOC_WindowHandle; 468 | 469 | // A helper macro for Vulkan, where the device handle cannot be used directly. 470 | // 471 | // Passing the VkInstance to this macro will return the RENDERDOC_DevicePointer to use. 472 | // 473 | // Specifically, the value needed is the dispatch table pointer, which sits as the first 474 | // pointer-sized object in the memory pointed to by the VkInstance. Thus we cast to a void** and 475 | // indirect once. 476 | #define RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(inst) (*((void **)(inst))) 477 | 478 | // This sets the RenderDoc in-app overlay in the API/window pair as 'active' and it will 479 | // respond to keypresses. Neither parameter can be NULL 480 | typedef void(RENDERDOC_CC *pRENDERDOC_SetActiveWindow)(RENDERDOC_DevicePointer device, 481 | RENDERDOC_WindowHandle wndHandle); 482 | 483 | // capture the next frame on whichever window and API is currently considered active 484 | typedef void(RENDERDOC_CC *pRENDERDOC_TriggerCapture)(); 485 | 486 | // capture the next N frames on whichever window and API is currently considered active 487 | typedef void(RENDERDOC_CC *pRENDERDOC_TriggerMultiFrameCapture)(uint32_t numFrames); 488 | 489 | // When choosing either a device pointer or a window handle to capture, you can pass NULL. 490 | // Passing NULL specifies a 'wildcard' match against anything. This allows you to specify 491 | // any API rendering to a specific window, or a specific API instance rendering to any window, 492 | // or in the simplest case of one window and one API, you can just pass NULL for both. 493 | // 494 | // In either case, if there are two or more possible matching (device,window) pairs it 495 | // is undefined which one will be captured. 496 | // 497 | // Note: for headless rendering you can pass NULL for the window handle and either specify 498 | // a device pointer or leave it NULL as above. 499 | 500 | // Immediately starts capturing API calls on the specified device pointer and window handle. 501 | // 502 | // If there is no matching thing to capture (e.g. no supported API has been initialised), 503 | // this will do nothing. 504 | // 505 | // The results are undefined (including crashes) if two captures are started overlapping, 506 | // even on separate devices and/or windows. 507 | typedef void(RENDERDOC_CC *pRENDERDOC_StartFrameCapture)(RENDERDOC_DevicePointer device, 508 | RENDERDOC_WindowHandle wndHandle); 509 | 510 | // Returns whether or not a frame capture is currently ongoing anywhere. 511 | // 512 | // This will return 1 if a capture is ongoing, and 0 if there is no capture running 513 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_IsFrameCapturing)(); 514 | 515 | // Ends capturing immediately. 516 | // 517 | // This will return 1 if the capture succeeded, and 0 if there was an error capturing.
518 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_EndFrameCapture)(RENDERDOC_DevicePointer device, 519 | RENDERDOC_WindowHandle wndHandle); 520 | 521 | // Ends capturing immediately and discard any data stored without saving to disk. 522 | // 523 | // This will return 1 if the capture was discarded, and 0 if there was an error or no capture 524 | // was in progress 525 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_DiscardFrameCapture)(RENDERDOC_DevicePointer device, 526 | RENDERDOC_WindowHandle wndHandle); 527 | 528 | ////////////////////////////////////////////////////////////////////////////////////////////////// 529 | // RenderDoc API versions 530 | // 531 | 532 | // RenderDoc uses semantic versioning (http://semver.org/). 533 | // 534 | // MAJOR version is incremented when incompatible API changes happen. 535 | // MINOR version is incremented when functionality is added in a backwards-compatible manner. 536 | // PATCH version is incremented when backwards-compatible bug fixes happen. 537 | // 538 | // Note that this means the API returned can be higher than the one you might have requested. 539 | // e.g. if you are running against a newer RenderDoc that supports 1.0.1, it will be returned 540 | // instead of 1.0.0. You can check this with the GetAPIVersion entry point 541 | typedef enum RENDERDOC_Version { 542 | eRENDERDOC_API_Version_1_0_0 = 10000, // RENDERDOC_API_1_0_0 = 1 00 00 543 | eRENDERDOC_API_Version_1_0_1 = 10001, // RENDERDOC_API_1_0_1 = 1 00 01 544 | eRENDERDOC_API_Version_1_0_2 = 10002, // RENDERDOC_API_1_0_2 = 1 00 02 545 | eRENDERDOC_API_Version_1_1_0 = 10100, // RENDERDOC_API_1_1_0 = 1 01 00 546 | eRENDERDOC_API_Version_1_1_1 = 10101, // RENDERDOC_API_1_1_1 = 1 01 01 547 | eRENDERDOC_API_Version_1_1_2 = 10102, // RENDERDOC_API_1_1_2 = 1 01 02 548 | eRENDERDOC_API_Version_1_2_0 = 10200, // RENDERDOC_API_1_2_0 = 1 02 00 549 | eRENDERDOC_API_Version_1_3_0 = 10300, // RENDERDOC_API_1_3_0 = 1 03 00 550 | eRENDERDOC_API_Version_1_4_0 = 10400, // RENDERDOC_API_1_4_0 = 1 04 00 551 | eRENDERDOC_API_Version_1_4_1 = 10401, // RENDERDOC_API_1_4_1 = 1 04 01 552 | eRENDERDOC_API_Version_1_4_2 = 10402, // RENDERDOC_API_1_4_2 = 1 04 02 553 | } RENDERDOC_Version; 554 | 555 | // API version changelog: 556 | // 557 | // 1.0.0 - initial release 558 | // 1.0.1 - Bugfix: IsFrameCapturing() was returning false for captures that were triggered 559 | // by keypress or TriggerCapture, instead of Start/EndFrameCapture. 560 | // 1.0.2 - Refactor: Renamed eRENDERDOC_Option_DebugDeviceMode to eRENDERDOC_Option_APIValidation 561 | // 1.1.0 - Add feature: TriggerMultiFrameCapture(). Backwards compatible with 1.0.x since the new 562 | // function pointer is added to the end of the struct, the original layout is identical 563 | // 1.1.1 - Refactor: Renamed remote access to target control (to better disambiguate from remote 564 | // replay/remote server concept in replay UI) 565 | // 1.1.2 - Refactor: Renamed "log file" in function names to just capture, to clarify that these 566 | // are captures and not debug logging files. This is the first API version in the v1.0 567 | // branch. 568 | // 1.2.0 - Added feature: SetCaptureFileComments() to add comments to a capture file that will be 569 | // displayed in the UI program on load. 570 | // 1.3.0 - Added feature: New capture option eRENDERDOC_Option_AllowUnsupportedVendorExtensions 571 | // which allows users to opt-in to allowing unsupported vendor extensions to function. 572 | // Should be used at the user's own risk. 
573 | // Refactor: Renamed eRENDERDOC_Option_VerifyMapWrites to 574 | // eRENDERDOC_Option_VerifyBufferAccess, which now also controls initialisation to 575 | // 0xdddddddd of uninitialised buffer contents. 576 | // 1.4.0 - Added feature: DiscardFrameCapture() to discard a frame capture in progress and stop 577 | // capturing without saving anything to disk. 578 | // 1.4.1 - Refactor: Renamed Shutdown to RemoveHooks to better clarify what is happening 579 | // 1.4.2 - Refactor: Renamed 'draws' to 'actions' in callstack capture option. 580 | 581 | typedef struct RENDERDOC_API_1_4_2 582 | { 583 | pRENDERDOC_GetAPIVersion GetAPIVersion; 584 | 585 | pRENDERDOC_SetCaptureOptionU32 SetCaptureOptionU32; 586 | pRENDERDOC_SetCaptureOptionF32 SetCaptureOptionF32; 587 | 588 | pRENDERDOC_GetCaptureOptionU32 GetCaptureOptionU32; 589 | pRENDERDOC_GetCaptureOptionF32 GetCaptureOptionF32; 590 | 591 | pRENDERDOC_SetFocusToggleKeys SetFocusToggleKeys; 592 | pRENDERDOC_SetCaptureKeys SetCaptureKeys; 593 | 594 | pRENDERDOC_GetOverlayBits GetOverlayBits; 595 | pRENDERDOC_MaskOverlayBits MaskOverlayBits; 596 | 597 | // Shutdown was renamed to RemoveHooks in 1.4.1. 598 | // These unions allow old code to continue compiling without changes 599 | union 600 | { 601 | pRENDERDOC_Shutdown Shutdown; 602 | pRENDERDOC_RemoveHooks RemoveHooks; 603 | }; 604 | pRENDERDOC_UnloadCrashHandler UnloadCrashHandler; 605 | 606 | // Get/SetLogFilePathTemplate was renamed to Get/SetCaptureFilePathTemplate in 1.1.2. 607 | // These unions allow old code to continue compiling without changes 608 | union 609 | { 610 | // deprecated name 611 | pRENDERDOC_SetLogFilePathTemplate SetLogFilePathTemplate; 612 | // current name 613 | pRENDERDOC_SetCaptureFilePathTemplate SetCaptureFilePathTemplate; 614 | }; 615 | union 616 | { 617 | // deprecated name 618 | pRENDERDOC_GetLogFilePathTemplate GetLogFilePathTemplate; 619 | // current name 620 | pRENDERDOC_GetCaptureFilePathTemplate GetCaptureFilePathTemplate; 621 | }; 622 | 623 | pRENDERDOC_GetNumCaptures GetNumCaptures; 624 | pRENDERDOC_GetCapture GetCapture; 625 | 626 | pRENDERDOC_TriggerCapture TriggerCapture; 627 | 628 | // IsRemoteAccessConnected was renamed to IsTargetControlConnected in 1.1.1.
629 | // This union allows old code to continue compiling without changes 630 | union 631 | { 632 | // deprecated name 633 | pRENDERDOC_IsRemoteAccessConnected IsRemoteAccessConnected; 634 | // current name 635 | pRENDERDOC_IsTargetControlConnected IsTargetControlConnected; 636 | }; 637 | pRENDERDOC_LaunchReplayUI LaunchReplayUI; 638 | 639 | pRENDERDOC_SetActiveWindow SetActiveWindow; 640 | 641 | pRENDERDOC_StartFrameCapture StartFrameCapture; 642 | pRENDERDOC_IsFrameCapturing IsFrameCapturing; 643 | pRENDERDOC_EndFrameCapture EndFrameCapture; 644 | 645 | // new function in 1.1.0 646 | pRENDERDOC_TriggerMultiFrameCapture TriggerMultiFrameCapture; 647 | 648 | // new function in 1.2.0 649 | pRENDERDOC_SetCaptureFileComments SetCaptureFileComments; 650 | 651 | // new function in 1.4.0 652 | pRENDERDOC_DiscardFrameCapture DiscardFrameCapture; 653 | } RENDERDOC_API_1_4_2; 654 | 655 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_0_0; 656 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_0_1; 657 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_0_2; 658 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_1_0; 659 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_1_1; 660 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_1_2; 661 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_2_0; 662 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_3_0; 663 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_4_0; 664 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_4_1; 665 | 666 | ////////////////////////////////////////////////////////////////////////////////////////////////// 667 | // RenderDoc API entry point 668 | // 669 | // This entry point can be obtained via GetProcAddress/dlsym if RenderDoc is available. 670 | // 671 | // The name is the same as the typedef - "RENDERDOC_GetAPI" 672 | // 673 | // This function is not thread safe, and should not be called on multiple threads at once. 674 | // Ideally, call this once as early as possible in your application's startup, before doing 675 | // any API work, since some configuration functionality etc has to be done also before 676 | // initialising any APIs. 677 | // 678 | // Parameters: 679 | // version is a single value from the RENDERDOC_Version above. 680 | // 681 | // outAPIPointers will be filled out with a pointer to the corresponding struct of function 682 | // pointers. 683 | // 684 | // Returns: 685 | // 1 - if the outAPIPointers has been filled with a pointer to the API struct requested 686 | // 0 - if the requested version is not supported or the arguments are invalid. 687 | // 688 | typedef int(RENDERDOC_CC *pRENDERDOC_GetAPI)(RENDERDOC_Version version, void **outAPIPointers); 689 | 690 | #ifdef __cplusplus 691 | } // extern "C" 692 | #endif --------------------------------------------------------------------------------
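
The `RENDERDOC_GetAPI` entry point documented above is the only symbol an application needs to look up at runtime; everything else is reached through the returned struct of function pointers. Below is a minimal usage sketch, not part of `renderdoc_app.h` nor of this repository's sources: the helper names `rdoc_load_api` / `run_captured_work` and the include path are made up for illustration, and it assumes RenderDoc has already injected its module into the process (`renderdoc.dll` on Windows, `librenderdoc.so` on Linux).

```cpp
// Sketch only: query the RenderDoc in-application API and wrap some GL work
// in an explicit frame capture, using the entry points declared above.
#include "renderdoc_app.h" // assumed to be on the include path

#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

static RENDERDOC_API_1_4_2* rdoc_api = nullptr;

// Look up RENDERDOC_GetAPI if the RenderDoc module is already loaded (i.e. the
// application was launched or injected by RenderDoc), then request API 1.4.2.
void rdoc_load_api()
{
#ifdef _WIN32
    if (HMODULE mod = GetModuleHandleA("renderdoc.dll"))
    {
        pRENDERDOC_GetAPI get_api = (pRENDERDOC_GetAPI) GetProcAddress(mod, "RENDERDOC_GetAPI");
        // Returns 1 and fills rdoc_api on success (see the documentation above).
        if (!get_api || get_api(eRENDERDOC_API_Version_1_4_2, (void**) &rdoc_api) != 1)
            rdoc_api = nullptr;
    }
#else
    if (void* mod = dlopen("librenderdoc.so", RTLD_NOW | RTLD_NOLOAD))
    {
        pRENDERDOC_GetAPI get_api = (pRENDERDOC_GetAPI) dlsym(mod, "RENDERDOC_GetAPI");
        if (!get_api || get_api(eRENDERDOC_API_Version_1_4_2, (void**) &rdoc_api) != 1)
            rdoc_api = nullptr;
    }
#endif
}

// Explicitly capture a block of API work. Passing NULL for device and window
// acts as a wildcard match, as described in the capturing section above.
void run_captured_work()
{
    if (rdoc_api) rdoc_api->StartFrameCapture(nullptr, nullptr);

    // ... issue the GL compute dispatches to inspect ...

    if (rdoc_api) rdoc_api->EndFrameCapture(nullptr, nullptr);
}
```

If no convenient capture boundary exists, `rdoc_api->TriggerCapture()` can be called instead to capture the next frame presented on the active window.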