├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── dist ├── BlellochScan.hpp ├── RadixSort.hpp └── Reduce.hpp ├── generate.py ├── glu ├── BlellochScan.hpp ├── RadixSort.hpp ├── Reduce.hpp ├── data_types.hpp ├── errors.hpp └── gl_utils.hpp ├── test ├── CMakeLists.txt ├── blelloch_scan_tests.cpp ├── generated │ ├── test_include_BlellochScan.cpp │ ├── test_include_RadixSort.cpp │ └── test_include_Reduce.cpp ├── main.cpp ├── radix_sort_tests.cpp ├── reduce_tests.cpp └── util │ ├── Random.hpp │ └── StopWatch.hpp └── third_party ├── glad ├── include │ ├── KHR │ │ └── khrplatform.h │ └── glad │ │ └── glad.h └── src │ └── glad.c └── renderdoc └── renderdoc_app.h /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | IndentWidth: 4 3 | ColumnLimit: 120 4 | BreakBeforeBraces: Allman 5 | PackConstructorInitializers: Never 6 | SpaceBeforeCtorInitializerColon: true 7 | BreakConstructorInitializers: AfterColon 8 | AllowShortFunctionsOnASingleLine: Inline 9 | BreakBeforeBinaryOperators: None 10 | AlignAfterOpenBracket: BlockIndent 11 | AllowShortLambdasOnASingleLine: All 12 | BinPackArguments: true 13 | BinPackParameters: false 14 | NamespaceIndentation: All 15 | ReferenceAlignment: Left 16 | PointerAlignment: Left 17 | AllowAllParametersOfDeclarationOnNextLine: true 18 | IndentAccessModifiers: false 19 | AccessModifierOffset: -4 20 | SpaceAfterCStyleCast: true 21 | SpaceAfterTemplateKeyword: false 22 | AlwaysBreakTemplateDeclarations: Yes 23 | --- 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # CLion 2 | cmake-build-* 3 | .idea 4 | *.xml 5 | 6 | # Python 7 | venv 8 | .venv 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/glfw"] 2 | path = third_party/glfw 3 | url = https://github.com/glfw/glfw 4 | [submodule "third_party/Catch2"] 5 | path = third_party/Catch2 6 | url = https://github.com/catchorg/Catch2 7 | [submodule "third_party/glm"] 8 | path = third_party/glm 9 | url = https://github.com/g-truc/glm 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.19) 2 | project(glu VERSION 2.0 DESCRIPTION "GL Utilities by loryruta") 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | 6 | # Catch2 7 | add_subdirectory(third_party/Catch2) 8 | 9 | # GLFW 10 | add_subdirectory(third_party/glfw) 11 | 12 | # glad 13 | add_library(glad STATIC third_party/glad/src/glad.c) 14 | target_include_directories(glad PUBLIC third_party/glad/include) 15 | 16 | # glm 17 | add_library(glm INTERFACE) 18 | target_include_directories(glm INTERFACE third_party/glm) 19 | 20 | # Renderdoc 21 | add_library(renderdoc INTERFACE) 22 | target_include_directories(renderdoc INTERFACE third_party/renderdoc) 23 | 24 | # 25 | add_library(glu INTERFACE) 26 | target_include_directories(glu INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) 27 | 28 | # TODO optionally add test subdirectory (e.g. 
don't add if configuring in git submodule)
29 | add_subdirectory(test)
30 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Lorenzo Rutayisire
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # gl-radix-sort
 3 | 
 4 | Ready-to-use RadixSort and other parallel programming primitives, implemented with OpenGL 4.6.
 5 | 
 6 | Includes:
 7 | - Parallel Reduce
 8 | - Parallel BlellochScan
 9 | - Parallel RadixSort
10 | 
11 | These modules are grouped together under the name "GLU" (OpenGL Utilities).
12 | 
13 | ## Requirements
14 | 
15 | - C++17
16 | 
17 | ## How to include it
18 | 
19 | ### Copy-paste the utility file
20 | 
21 | - Look into the `dist/` directory
22 | - Take a utility (e.g. `dist/Reduce.hpp`)
23 | - Copy the file into your codebase
24 | - Include it where you need it
25 | - **Important: OpenGL 4.6 symbols must be defined prior to including the utility file!** E.g.:
26 | 
27 | ```cpp
28 | #include <glad/glad.h> // Must be placed beforehand!
29 | #include "Reduce.hpp"
30 | ```
31 | 
32 | ### git submodule + CMake
33 | 
34 | - Git submodule this project into your codebase
35 | - Add the CMake subdirectory and link against the `glu` target
36 | 
37 | ```cmake
38 | add_subdirectory(path/to/glu)
39 | 
40 | target_link_libraries(your_project PUBLIC glu)
41 | ```
42 | 
43 | ## How to use it
44 | 
45 | ### Reduce
46 | 
47 | ```cpp
48 | #include "Reduce.hpp"
49 | 
50 | using namespace glu;
51 | 
52 | size_t N;
53 | GLuint buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
54 | 
55 | Reduce reduce(DataType_Uint, ReduceOperator_Sum);
56 | reduce(buffer, N);
57 | ```
58 | 
59 | ### BlellochScan
60 | 
61 | ```cpp
62 | #include "BlellochScan.hpp"
63 | 
64 | using namespace glu;
65 | 
66 | size_t N; // Important: N must be a power of 2
67 | GLuint buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
68 | 
69 | BlellochScan blelloch_scan(DataType_Uint);
70 | blelloch_scan(buffer, N);
71 | ```
72 | 
73 | ### RadixSort
74 | 
75 | ```cpp
76 | #include "RadixSort.hpp"
77 | 
78 | using namespace glu;
79 | 
80 | size_t N;
81 | GLuint key_buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
82 | GLuint val_buffer; // SSBO containing N GLuint (of size N * sizeof(GLuint))
83 | 
84 | RadixSort radix_sort;
85 | radix_sort(key_buffer, val_buffer, N);
86 | ```
87 | 
88 | Note: currently `val_buffer` is **required** and its type is `GLuint`. If you only have a keys array, you have to
89 | allocate a dummy values array!
90 | 
91 | ## Performance
92 | 
93 | - OS: Ubuntu 22.04
94 | - Processor: AMD Ryzen 7 3700X 8-Core Processor
95 | - Device: NVIDIA GeForce RTX 2060 SUPER/PCIe/SS
96 | - NVIDIA Driver Version: 545.23.08
97 | - Memory: 16 GB
98 | 
99 | ```
100 | Reduce; Num elements: 1024, Elapsed: 0.069 ms
101 | Reduce; Num elements: 16384, Elapsed: 0.012 ms
102 | Reduce; Num elements: 65536, Elapsed: 0.016 ms
103 | Reduce; Num elements: 131072, Elapsed: 0.020 ms
104 | Reduce; Num elements: 524288, Elapsed: 0.029 ms
105 | Reduce; Num elements: 1048576, Elapsed: 0.049 ms
106 | Reduce; Num elements: 16777216, Elapsed: 0.620 ms
107 | Reduce; Num elements: 67108864, Elapsed: 2.514 ms
108 | Reduce; Num elements: 134217728, Elapsed: 5.030 ms
109 | Reduce; Num elements: 268435456, Elapsed: 10.044 ms
110 | BlellochScan; Num elements: 1024, Elapsed: 1.108 ms
111 | BlellochScan; Num elements: 16384, Elapsed: 0.081 ms
112 | BlellochScan; Num elements: 65536, Elapsed: 0.101 ms
113 | BlellochScan; Num elements: 131072, Elapsed: 0.115 ms
114 | BlellochScan; Num elements: 524288, Elapsed: 0.168 ms
115 | BlellochScan; Num elements: 1048576, Elapsed: 0.360 ms
116 | BlellochScan; Num elements: 16777216, Elapsed: 4.368 ms
117 | BlellochScan; Num elements: 67108864, Elapsed: 18.407 ms
118 | BlellochScan; Num elements: 134217728, Elapsed: 37.167 ms
119 | BlellochScan; Num elements: 268435456, Elapsed: 86.493 ms
120 | Radix sort; Num elements: 1024, Elapsed: 0.663 ms
121 | Radix sort; Num elements: 16384, Elapsed: 1.004 ms
122 | Radix sort; Num elements: 65536, Elapsed: 1.761 ms
123 | Radix sort; Num elements: 131072, Elapsed: 3.074 ms
124 | Radix sort; Num elements: 524288, Elapsed: 10.633 ms
125 | Radix sort; Num elements: 1048576, Elapsed: 20.457 ms
126 | Radix sort; Num elements: 2097152, Elapsed: 39.688 ms
127 | Radix sort; Num elements: 4194304, Elapsed: 78.594 ms
128 | Radix sort; Num elements: 8388608, Elapsed: 0.156 s
129 | Radix sort; Num elements: 16777216, Elapsed: 0.311 s
130 | Radix sort; Num elements: 33554432, Elapsed: 0.626 s
131 | Radix sort; Num elements: 67108864, Elapsed: 
1.252 s 132 | Radix sort; Num elements: 134217728, Elapsed: 2.518 s 133 | Radix sort; Num elements: 268435456, Elapsed: 5.022 s 134 | ``` 135 | 136 | To run it on your machine: clone the repository, build `glu_test` and run: 137 | 138 | ``` 139 | ./glu_test [benchmark] 140 | ``` 141 | 142 | ## Useful resources 143 | - http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf 144 | - https://vgc.poly.edu/~csilva/papers/cgf.pdf 145 | - Nabla (radix sort implementation): https://github.com/Devsh-Graphics-Programming/Nabla 146 | - Udacity parallel programming guide: https://www.youtube.com/playlist?list=PLAwxTw4SYaPnFKojVQrmyOGFCqHTxfdv2 147 | -------------------------------------------------------------------------------- /dist/BlellochScan.hpp: -------------------------------------------------------------------------------- 1 | // This code was automatically generated; you're not supposed to edit it! 2 | 3 | #ifndef GLU_BLELLOCHSCAN_HPP 4 | #define GLU_BLELLOCHSCAN_HPP 5 | 6 | #include 7 | 8 | #ifndef GLU_REDUCE_HPP 9 | #define GLU_REDUCE_HPP 10 | 11 | #ifndef GLU_DATA_TYPES_HPP 12 | #define GLU_DATA_TYPES_HPP 13 | 14 | #ifndef GLU_ERRORS_HPP 15 | #define GLU_ERRORS_HPP 16 | 17 | #include 18 | #include 19 | 20 | // TODO mark if (!condition_) as unlikely 21 | #define GLU_CHECK_STATE(condition_, ...) \ 22 | { \ 23 | if (!(condition_)) \ 24 | { \ 25 | fprintf(stderr, __VA_ARGS__); \ 26 | exit(1); \ 27 | } \ 28 | } 29 | 30 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 31 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 32 | 33 | #endif 34 | 35 | 36 | 37 | namespace glu 38 | { 39 | enum DataType 40 | { 41 | DataType_Float = 0, 42 | DataType_Double, 43 | DataType_Int, 44 | DataType_Uint, 45 | DataType_Vec2, 46 | DataType_Vec4, 47 | DataType_DVec2, 48 | DataType_DVec4, 49 | DataType_UVec2, 50 | DataType_UVec4, 51 | DataType_IVec2, 52 | DataType_IVec4 53 | }; 54 | 55 | inline const char* to_glsl_type_str(DataType data_type) 56 | { 57 | // clang-format off 58 | if (data_type == DataType_Float) return "float"; 59 | else if (data_type == DataType_Double) return "double"; 60 | else if (data_type == DataType_Int) return "int"; 61 | else if (data_type == DataType_Uint) return "uint"; 62 | else if (data_type == DataType_Vec2) return "vec2"; 63 | else if (data_type == DataType_Vec4) return "vec4"; 64 | else if (data_type == DataType_DVec2) return "dvec2"; 65 | else if (data_type == DataType_DVec4) return "dvec4"; 66 | else if (data_type == DataType_UVec2) return "uvec2"; 67 | else if (data_type == DataType_UVec4) return "uvec4"; 68 | else if (data_type == DataType_IVec2) return "ivec2"; 69 | else if (data_type == DataType_IVec4) return "ivec4"; 70 | else 71 | { 72 | GLU_FAIL("Invalid data type: %d", data_type); 73 | } 74 | // clang-format on 75 | } 76 | 77 | } // namespace glu 78 | 79 | #endif // GLU_DATA_TYPES_HPP 80 | 81 | 82 | #ifndef GLU_GL_UTILS_HPP 83 | #define GLU_GL_UTILS_HPP 84 | 85 | #include 86 | #include 87 | #include 88 | #include 89 | 90 | #ifndef GLU_ERRORS_HPP 91 | #define GLU_ERRORS_HPP 92 | 93 | #include 94 | #include 95 | 96 | // TODO mark if (!condition_) as unlikely 97 | #define GLU_CHECK_STATE(condition_, ...) \ 98 | { \ 99 | if (!(condition_)) \ 100 | { \ 101 | fprintf(stderr, __VA_ARGS__); \ 102 | exit(1); \ 103 | } \ 104 | } 105 | 106 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 107 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 108 | 109 | #endif 110 | 111 | 112 | 113 | namespace glu 114 | { 115 | inline void 116 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 117 | { 118 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 119 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 120 | 121 | glCopyBufferSubData( 122 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 123 | ); 124 | } 125 | 126 | /// A RAII wrapper for GL shader. 127 | class Shader 128 | { 129 | private: 130 | GLuint m_handle; 131 | 132 | public: 133 | explicit Shader(GLenum type) : 134 | m_handle(glCreateShader(type)){}; 135 | Shader(const Shader&) = delete; 136 | 137 | Shader(Shader&& other) noexcept 138 | { 139 | m_handle = other.m_handle; 140 | other.m_handle = 0; 141 | } 142 | 143 | ~Shader() { glDeleteShader(m_handle); } 144 | 145 | [[nodiscard]] GLuint handle() const { return m_handle; } 146 | 147 | void source_from_str(const std::string& src_str) 148 | { 149 | const char* src_ptr = src_str.c_str(); 150 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 151 | } 152 | 153 | void source_from_file(const char* src_filepath) 154 | { 155 | FILE* file = fopen(src_filepath, "rt"); 156 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 157 | 158 | fseek(file, 0, SEEK_END); 159 | size_t file_size = ftell(file); 160 | fseek(file, 0, SEEK_SET); 161 | 162 | std::string src{}; 163 | src.resize(file_size); 164 | fread(src.data(), sizeof(char), file_size, file); 165 | source_from_str(src.c_str()); 166 | 167 | fclose(file); 168 | } 169 | 170 | std::string get_info_log() 171 | { 172 | GLint log_length = 0; 173 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 174 | 175 | std::vector log(log_length); 176 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 177 | return {log.begin(), log.end()}; 178 | } 179 | 180 | void compile() 181 | { 182 | glCompileShader(m_handle); 183 | 184 | GLint status; 185 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 186 | if (!status) 187 | { 188 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 189 | } 190 | } 191 | }; 192 | 193 | /// A RAII wrapper for GL program. 
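///
/// Typical use together with Shader, sketched below (illustrative; `src` is assumed to hold valid GLSL compute shader source):
///
///     Shader shader(GL_COMPUTE_SHADER);
///     shader.source_from_str(src);
///     shader.compile();
///
///     Program program;
///     program.attach_shader(shader);
///     program.link();
///     program.use();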
194 | class Program 195 | { 196 | private: 197 | GLuint m_handle; 198 | 199 | public: 200 | explicit Program() { m_handle = glCreateProgram(); }; 201 | Program(const Program&) = delete; 202 | 203 | Program(Program&& other) noexcept 204 | { 205 | m_handle = other.m_handle; 206 | other.m_handle = 0; 207 | } 208 | 209 | ~Program() { glDeleteProgram(m_handle); } 210 | 211 | [[nodiscard]] GLuint handle() const { return m_handle; } 212 | 213 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 214 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 215 | 216 | [[nodiscard]] std::string get_info_log() const 217 | { 218 | GLint log_length = 0; 219 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 220 | 221 | std::vector log(log_length); 222 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 223 | return {log.begin(), log.end()}; 224 | } 225 | 226 | void link() 227 | { 228 | GLint status; 229 | glLinkProgram(m_handle); 230 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 231 | if (!status) 232 | { 233 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 234 | } 235 | } 236 | 237 | void use() { glUseProgram(m_handle); } 238 | 239 | GLint get_uniform_location(const char* uniform_name) 240 | { 241 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 242 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 243 | return loc; 244 | } 245 | }; 246 | 247 | /// A RAII helper class for GL shader storage buffer. 248 | class ShaderStorageBuffer 249 | { 250 | private: 251 | GLuint m_handle = 0; 252 | size_t m_size = 0; 253 | 254 | public: 255 | explicit ShaderStorageBuffer(size_t initial_size = 0) 256 | { 257 | if (initial_size > 0) 258 | resize(initial_size, false); 259 | } 260 | 261 | explicit ShaderStorageBuffer(const void* data, size_t size) : 262 | m_size(size) 263 | { 264 | GLU_CHECK_ARGUMENT(data, ""); 265 | GLU_CHECK_ARGUMENT(size > 0, ""); 266 | 267 | glCreateBuffers(1, &m_handle); 268 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 269 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 270 | } 271 | 272 | template 273 | explicit ShaderStorageBuffer(const std::vector& data) : 274 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 275 | { 276 | } 277 | 278 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 279 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 280 | { 281 | m_handle = other.m_handle; 282 | m_size = other.m_size; 283 | other.m_handle = 0; 284 | } 285 | 286 | ~ShaderStorageBuffer() 287 | { 288 | if (m_handle) 289 | glDeleteBuffers(1, &m_handle); 290 | } 291 | 292 | [[nodiscard]] GLuint handle() const { return m_handle; } 293 | [[nodiscard]] size_t size() const { return m_size; } 294 | 295 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 
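///
/// For example (illustrative):
///
///     ShaderStorageBuffer buf(16); // 16-byte buffer, contents undefined
///     buf.resize(64, true);        // grow to 64 bytes; the original 16 bytes are copied into the new storage
///     buf.resize(32);              // shrink to 32 bytes; previous contents are discarded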
296 | void resize(size_t size, bool keep_data = false) 297 | { 298 | size_t old_size = m_size; 299 | GLuint old_handle = m_handle; 300 | 301 | if (old_size != size) 302 | { 303 | m_size = size; 304 | 305 | glCreateBuffers(1, &m_handle); 306 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 307 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 308 | 309 | if (keep_data) 310 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 311 | 312 | glDeleteBuffers(1, &old_handle); 313 | } 314 | } 315 | 316 | /// Clears the entire buffer with the given GLuint value (repeated). 317 | void clear(GLuint value) 318 | { 319 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 320 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 321 | } 322 | 323 | void write_data(const void* data, size_t size) 324 | { 325 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 326 | 327 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 328 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 329 | } 330 | 331 | template 332 | std::vector get_data() const 333 | { 334 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 335 | 336 | std::vector result(m_size / sizeof(T)); 337 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 338 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 339 | return result; 340 | } 341 | 342 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 343 | { 344 | if (size == 0) 345 | size = m_size; 346 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 347 | } 348 | }; 349 | 350 | /// Measures elapsed time on GPU for executing the given callback. 
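///
/// The returned value is in nanoseconds (GL_TIME_ELAPSED query). Usage sketch (illustrative; `num_workgroups` is assumed
/// to be computed by the caller):
///
///     uint64_t ns = measure_gl_elapsed_time([&] {
///         glDispatchCompute(num_workgroups, 1, 1);
///         glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
///     });
///     printf("Elapsed: %.3f ms\n", ns / 1e6);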
351 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 352 | { 353 | GLuint query; 354 | uint64_t elapsed_time{}; 355 | 356 | glGenQueries(1, &query); 357 | glBeginQuery(GL_TIME_ELAPSED, query); 358 | 359 | callback(); 360 | 361 | glEndQuery(GL_TIME_ELAPSED); 362 | 363 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 364 | glDeleteQueries(1, &query); 365 | 366 | return elapsed_time; 367 | } 368 | 369 | template 370 | IntegerT log32_floor(IntegerT n) 371 | { 372 | return (IntegerT) floor(double(log2(n)) / 5.0); 373 | } 374 | 375 | template 376 | IntegerT log32_ceil(IntegerT n) 377 | { 378 | return (IntegerT) ceil(double(log2(n)) / 5.0); 379 | } 380 | 381 | template 382 | IntegerT div_ceil(IntegerT n, IntegerT d) 383 | { 384 | return (IntegerT) ceil(double(n) / double(d)); 385 | } 386 | 387 | template 388 | bool is_power_of_2(T n) 389 | { 390 | return (n & (n - 1)) == 0; 391 | } 392 | 393 | template 394 | IntegerT next_power_of_2(IntegerT n) 395 | { 396 | n--; 397 | n |= n >> 1; 398 | n |= n >> 2; 399 | n |= n >> 4; 400 | n |= n >> 8; 401 | n |= n >> 16; 402 | n++; 403 | return n; 404 | } 405 | 406 | template 407 | void print_stl_container(Iterator begin, Iterator end) 408 | { 409 | size_t i = 0; 410 | for (; begin != end; begin++) 411 | { 412 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 413 | i++; 414 | } 415 | printf("\n"); 416 | } 417 | 418 | template 419 | void print_buffer(const ShaderStorageBuffer& buffer) 420 | { 421 | std::vector data = buffer.get_data(); 422 | print_stl_container(data.begin(), data.end()); 423 | } 424 | 425 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 426 | { 427 | std::vector data = buffer.get_data(); 428 | for (size_t i = 0; i < data.size(); i++) 429 | printf("(%zu) %08x, ", i, data[i]); 430 | printf("\n"); 431 | } 432 | } // namespace glu 433 | 434 | #endif // GLU_GL_UTILS_HPP 435 | 436 | 437 | 438 | namespace glu 439 | { 440 | namespace detail 441 | { 442 | inline const char* k_reduction_shader_src = R"( 443 | #extension GL_KHR_shader_subgroup_arithmetic : require 444 | 445 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 446 | 447 | layout(std430, binding = 0) buffer Buffer 448 | { 449 | DATA_TYPE data[]; 450 | }; 451 | 452 | layout(location = 0) uniform uint u_count; 453 | layout(location = 1) uniform uint u_depth; 454 | 455 | void main() 456 | { 457 | uint step = 1 << (5 * u_depth); 458 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 459 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 460 | if (i < u_count) 461 | { 462 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 463 | if (gl_SubgroupInvocationID == 0) 464 | { 465 | data[i] = r; 466 | } 467 | } 468 | } 469 | )"; 470 | } 471 | 472 | /// The operators that can be used for the reduction operation. 473 | enum ReduceOperator 474 | { 475 | ReduceOperator_Sum = 0, 476 | ReduceOperator_Mul, 477 | ReduceOperator_Min, 478 | ReduceOperator_Max 479 | }; 480 | 481 | /// A class that implements the reduction operation. 
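///
/// Usage sketch (illustrative; `buffer` is assumed to be a user-created SSBO holding `count` GLuint values, and a
/// GL 4.6 context with GL_KHR_shader_subgroup support is assumed to be current):
///
///     Reduce reduce(DataType_Uint, ReduceOperator_Sum);
///     reduce(buffer, count); // in-place: the reduced value ends up in the first element of the buffer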
482 | class Reduce 483 | { 484 | private: 485 | const DataType m_data_type; 486 | const ReduceOperator m_operator; 487 | const size_t m_num_threads; 488 | const size_t m_num_items; 489 | 490 | Program m_program; 491 | 492 | public: 493 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 494 | m_data_type(data_type), 495 | m_operator(operator_), 496 | m_num_threads(1024), 497 | m_num_items(4) 498 | { 499 | std::string shader_src = "#version 460\n\n"; 500 | 501 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 502 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 503 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 504 | 505 | if (m_operator == ReduceOperator_Sum) 506 | { 507 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 508 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 509 | } 510 | else if (m_operator == ReduceOperator_Mul) 511 | { 512 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 513 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 514 | } 515 | else if (m_operator == ReduceOperator_Min) 516 | { 517 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 518 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 519 | } 520 | else if (m_operator == ReduceOperator_Max) 521 | { 522 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 523 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 524 | } 525 | else 526 | { 527 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 528 | } 529 | 530 | shader_src += detail::k_reduction_shader_src; 531 | 532 | Shader shader(GL_COMPUTE_SHADER); 533 | shader.source_from_str(shader_src.c_str()); 534 | shader.compile(); 535 | 536 | m_program.attach_shader(shader); 537 | m_program.link(); 538 | } 539 | 540 | ~Reduce() = default; 541 | 542 | void operator()(GLuint buffer, size_t count) 543 | { 544 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 545 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 546 | 547 | m_program.use(); 548 | 549 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 550 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 551 | 552 | for (int depth = 0;; depth++) 553 | { 554 | int step = 1 << (5 * depth); 555 | if (step >= count) 556 | break; 557 | 558 | size_t level_count = count >> (5 * depth); 559 | 560 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 561 | 562 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 563 | glDispatchCompute(num_workgroups, 1, 1); 564 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 565 | } 566 | } 567 | }; 568 | } // namespace glu 569 | 570 | #endif // GLU_REDUCE_HPP 571 | 572 | 573 | #ifndef GLU_DATA_TYPES_HPP 574 | #define GLU_DATA_TYPES_HPP 575 | 576 | #ifndef GLU_ERRORS_HPP 577 | #define GLU_ERRORS_HPP 578 | 579 | #include 580 | #include 581 | 582 | // TODO mark if (!condition_) as unlikely 583 | #define GLU_CHECK_STATE(condition_, ...) \ 584 | { \ 585 | if (!(condition_)) \ 586 | { \ 587 | fprintf(stderr, __VA_ARGS__); \ 588 | exit(1); \ 589 | } \ 590 | } 591 | 592 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 593 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 594 | 595 | #endif 596 | 597 | 598 | 599 | namespace glu 600 | { 601 | enum DataType 602 | { 603 | DataType_Float = 0, 604 | DataType_Double, 605 | DataType_Int, 606 | DataType_Uint, 607 | DataType_Vec2, 608 | DataType_Vec4, 609 | DataType_DVec2, 610 | DataType_DVec4, 611 | DataType_UVec2, 612 | DataType_UVec4, 613 | DataType_IVec2, 614 | DataType_IVec4 615 | }; 616 | 617 | inline const char* to_glsl_type_str(DataType data_type) 618 | { 619 | // clang-format off 620 | if (data_type == DataType_Float) return "float"; 621 | else if (data_type == DataType_Double) return "double"; 622 | else if (data_type == DataType_Int) return "int"; 623 | else if (data_type == DataType_Uint) return "uint"; 624 | else if (data_type == DataType_Vec2) return "vec2"; 625 | else if (data_type == DataType_Vec4) return "vec4"; 626 | else if (data_type == DataType_DVec2) return "dvec2"; 627 | else if (data_type == DataType_DVec4) return "dvec4"; 628 | else if (data_type == DataType_UVec2) return "uvec2"; 629 | else if (data_type == DataType_UVec4) return "uvec4"; 630 | else if (data_type == DataType_IVec2) return "ivec2"; 631 | else if (data_type == DataType_IVec4) return "ivec4"; 632 | else 633 | { 634 | GLU_FAIL("Invalid data type: %d", data_type); 635 | } 636 | // clang-format on 637 | } 638 | 639 | } // namespace glu 640 | 641 | #endif // GLU_DATA_TYPES_HPP 642 | 643 | 644 | 645 | namespace glu 646 | { 647 | namespace detail 648 | { 649 | inline const char* k_upsweep_shader_src = R"( 650 | #extension GL_KHR_shader_subgroup_shuffle_relative : require 651 | 652 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 653 | 654 | layout(std430, binding = 0) buffer Buffer 655 | { 656 | DATA_TYPE data[]; 657 | }; 658 | 659 | layout(location = 0) uniform uint u_count; 660 | layout(location = 1) uniform uint u_step; 661 | 662 | void main() 663 | { 664 | uint partition_i = gl_WorkGroupID.y; 665 | uint thread_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 666 | uint i = partition_i * u_count + thread_i * u_step + u_step - 1; 667 | uint end_i = (partition_i + 1) * u_count; 668 | if (i < end_i) 669 | { 670 | DATA_TYPE lval = subgroupShuffleUp(data[i], 1); 671 | DATA_TYPE r = OPERATION(data[i], lval); 672 | if (i == end_i - 1) // Clear last 673 | { 674 | data[i] = IDENTITY; 675 | } 676 | else if (gl_SubgroupInvocationID % 2 == 1) 677 | { 678 | data[i] = r; 679 | } 680 | } 681 | } 682 | )"; 683 | 684 | inline const char* k_downsweep_shader_src = R"( 685 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 686 | 687 | layout(std430, binding = 0) buffer Buffer 688 | { 689 | DATA_TYPE data[]; 690 | }; 691 | 692 | layout(location = 0) uniform uint u_count; 693 | layout(location = 1) uniform uint u_step; 694 | 695 | void main() 696 | { 697 | uint partition_i = gl_WorkGroupID.y; 698 | uint i = partition_i * u_count + gl_GlobalInvocationID.x * (u_step << 1) + (u_step - 1); 699 | uint next_i = i + u_step; 700 | uint end_i = (partition_i + 1) * u_count; 701 | if (next_i < end_i) 702 | { 703 | DATA_TYPE tmp = data[i]; 704 | data[i] = data[next_i]; 705 | data[next_i] = data[next_i] + tmp; 706 | } 707 | else if (i < end_i) 708 | { 709 | data[i] = IDENTITY; 710 | } 711 | } 712 | )"; 713 | } // namespace detail 714 | 715 | /// A class that implements Blelloch scan algorithm (exclusive prefix sum). 
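///
/// Usage sketch (illustrative; `buffer` is assumed to be a user-created SSBO holding `count` GLuint values, with
/// `count` a power of 2):
///
///     BlellochScan scan(DataType_Uint);
///     scan(buffer, count); // in-place exclusive prefix sum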
716 | class BlellochScan 717 | { 718 | private: 719 | const DataType m_data_type; 720 | const size_t m_num_threads; 721 | const size_t m_num_items; 722 | 723 | Program m_upsweep_program; 724 | Program m_downsweep_program; 725 | 726 | public: 727 | explicit BlellochScan(DataType data_type) : 728 | m_data_type(data_type), 729 | m_num_threads(1024), 730 | m_num_items(4) 731 | { 732 | std::string shader_src = "#version 460\n\n"; 733 | 734 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 735 | shader_src += "#define OPERATION(a, b) (a + b)\n"; 736 | shader_src += "#define IDENTITY 0\n"; 737 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 738 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 739 | 740 | { // Upsweep program 741 | Shader upsweep_shader(GL_COMPUTE_SHADER); 742 | upsweep_shader.source_from_str((shader_src + detail::k_upsweep_shader_src).c_str()); 743 | upsweep_shader.compile(); 744 | 745 | m_upsweep_program.attach_shader(upsweep_shader); 746 | m_upsweep_program.link(); 747 | } 748 | 749 | { // Downsweep program 750 | Shader downsweep_program(GL_COMPUTE_SHADER); 751 | downsweep_program.source_from_str((shader_src + detail::k_downsweep_shader_src).c_str()); 752 | downsweep_program.compile(); 753 | 754 | m_downsweep_program.attach_shader(downsweep_program); 755 | m_downsweep_program.link(); 756 | } 757 | } 758 | 759 | ~BlellochScan() = default; 760 | 761 | /// Runs Blelloch exclusive scan on multiple partitions. 762 | /// 763 | /// @param buffer the input GLuint buffer 764 | /// @param count the number of GLuint in the buffer (must be a power of 2) 765 | /// @param num_partitions the number of partitions (must be adjacent) 766 | void operator()(GLuint buffer, size_t count, size_t num_partitions = 1) 767 | { 768 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 769 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 770 | GLU_CHECK_ARGUMENT(is_power_of_2(count), "Count must be a power of 2"); // TODO Remove this requirement 771 | GLU_CHECK_ARGUMENT(num_partitions >= 1, "Num of partitions must be >= 1"); 772 | 773 | upsweep(buffer, count, num_partitions); // Also clear last 774 | downsweep(buffer, count, num_partitions); 775 | } 776 | 777 | private: 778 | void upsweep(GLuint buffer, size_t count, size_t num_partitions) // Also clear last 779 | { 780 | m_upsweep_program.use(); 781 | 782 | glUniform1ui(m_upsweep_program.get_uniform_location("u_count"), count); 783 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 784 | 785 | int step = 1; 786 | int level_count = (int) count; 787 | while (true) 788 | { 789 | glUniform1ui(m_upsweep_program.get_uniform_location("u_step"), step); 790 | 791 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 792 | glDispatchCompute(num_workgroups, num_partitions, 1); 793 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 794 | 795 | step <<= 1; 796 | 797 | level_count >>= 1; 798 | 799 | if (level_count <= 1) 800 | break; 801 | } 802 | } 803 | 804 | void downsweep(GLuint buffer, size_t count, size_t num_partitions) 805 | { 806 | m_downsweep_program.use(); 807 | 808 | glUniform1ui(m_downsweep_program.get_uniform_location("u_count"), count); 809 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 810 | 811 | int step = next_power_of_2(int(count)) >> 1; 812 | size_t level_count = 1; 813 | while (true) 814 | { 815 | glUniform1ui(m_downsweep_program.get_uniform_location("u_step"), step); 816 | 817 | size_t 
num_workgroups = div_ceil(level_count, m_num_threads); 818 | glDispatchCompute(num_workgroups, num_partitions, 1); 819 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 820 | 821 | step >>= 1; 822 | level_count <<= 1; 823 | if (step == 0) 824 | break; 825 | } 826 | } 827 | }; 828 | } // namespace glu 829 | 830 | #endif // GLU_BLELLOCHSCAN_HPP 831 | -------------------------------------------------------------------------------- /dist/RadixSort.hpp: -------------------------------------------------------------------------------- 1 | // This code was automatically generated; you're not supposed to edit it! 2 | 3 | #ifndef GLU_RADIXSORT_HPP 4 | #define GLU_RADIXSORT_HPP 5 | 6 | #ifndef GLU_BLELLOCHSCAN_HPP 7 | #define GLU_BLELLOCHSCAN_HPP 8 | 9 | #include 10 | 11 | #ifndef GLU_REDUCE_HPP 12 | #define GLU_REDUCE_HPP 13 | 14 | #ifndef GLU_DATA_TYPES_HPP 15 | #define GLU_DATA_TYPES_HPP 16 | 17 | #ifndef GLU_ERRORS_HPP 18 | #define GLU_ERRORS_HPP 19 | 20 | #include 21 | #include 22 | 23 | // TODO mark if (!condition_) as unlikely 24 | #define GLU_CHECK_STATE(condition_, ...) \ 25 | { \ 26 | if (!(condition_)) \ 27 | { \ 28 | fprintf(stderr, __VA_ARGS__); \ 29 | exit(1); \ 30 | } \ 31 | } 32 | 33 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 34 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 35 | 36 | #endif 37 | 38 | 39 | 40 | namespace glu 41 | { 42 | enum DataType 43 | { 44 | DataType_Float = 0, 45 | DataType_Double, 46 | DataType_Int, 47 | DataType_Uint, 48 | DataType_Vec2, 49 | DataType_Vec4, 50 | DataType_DVec2, 51 | DataType_DVec4, 52 | DataType_UVec2, 53 | DataType_UVec4, 54 | DataType_IVec2, 55 | DataType_IVec4 56 | }; 57 | 58 | inline const char* to_glsl_type_str(DataType data_type) 59 | { 60 | // clang-format off 61 | if (data_type == DataType_Float) return "float"; 62 | else if (data_type == DataType_Double) return "double"; 63 | else if (data_type == DataType_Int) return "int"; 64 | else if (data_type == DataType_Uint) return "uint"; 65 | else if (data_type == DataType_Vec2) return "vec2"; 66 | else if (data_type == DataType_Vec4) return "vec4"; 67 | else if (data_type == DataType_DVec2) return "dvec2"; 68 | else if (data_type == DataType_DVec4) return "dvec4"; 69 | else if (data_type == DataType_UVec2) return "uvec2"; 70 | else if (data_type == DataType_UVec4) return "uvec4"; 71 | else if (data_type == DataType_IVec2) return "ivec2"; 72 | else if (data_type == DataType_IVec4) return "ivec4"; 73 | else 74 | { 75 | GLU_FAIL("Invalid data type: %d", data_type); 76 | } 77 | // clang-format on 78 | } 79 | 80 | } // namespace glu 81 | 82 | #endif // GLU_DATA_TYPES_HPP 83 | 84 | 85 | #ifndef GLU_GL_UTILS_HPP 86 | #define GLU_GL_UTILS_HPP 87 | 88 | #include 89 | #include 90 | #include 91 | #include 92 | 93 | #ifndef GLU_ERRORS_HPP 94 | #define GLU_ERRORS_HPP 95 | 96 | #include 97 | #include 98 | 99 | // TODO mark if (!condition_) as unlikely 100 | #define GLU_CHECK_STATE(condition_, ...) \ 101 | { \ 102 | if (!(condition_)) \ 103 | { \ 104 | fprintf(stderr, __VA_ARGS__); \ 105 | exit(1); \ 106 | } \ 107 | } 108 | 109 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 110 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 111 | 112 | #endif 113 | 114 | 115 | 116 | namespace glu 117 | { 118 | inline void 119 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 120 | { 121 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 122 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 123 | 124 | glCopyBufferSubData( 125 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 126 | ); 127 | } 128 | 129 | /// A RAII wrapper for GL shader. 130 | class Shader 131 | { 132 | private: 133 | GLuint m_handle; 134 | 135 | public: 136 | explicit Shader(GLenum type) : 137 | m_handle(glCreateShader(type)){}; 138 | Shader(const Shader&) = delete; 139 | 140 | Shader(Shader&& other) noexcept 141 | { 142 | m_handle = other.m_handle; 143 | other.m_handle = 0; 144 | } 145 | 146 | ~Shader() { glDeleteShader(m_handle); } 147 | 148 | [[nodiscard]] GLuint handle() const { return m_handle; } 149 | 150 | void source_from_str(const std::string& src_str) 151 | { 152 | const char* src_ptr = src_str.c_str(); 153 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 154 | } 155 | 156 | void source_from_file(const char* src_filepath) 157 | { 158 | FILE* file = fopen(src_filepath, "rt"); 159 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 160 | 161 | fseek(file, 0, SEEK_END); 162 | size_t file_size = ftell(file); 163 | fseek(file, 0, SEEK_SET); 164 | 165 | std::string src{}; 166 | src.resize(file_size); 167 | fread(src.data(), sizeof(char), file_size, file); 168 | source_from_str(src.c_str()); 169 | 170 | fclose(file); 171 | } 172 | 173 | std::string get_info_log() 174 | { 175 | GLint log_length = 0; 176 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 177 | 178 | std::vector log(log_length); 179 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 180 | return {log.begin(), log.end()}; 181 | } 182 | 183 | void compile() 184 | { 185 | glCompileShader(m_handle); 186 | 187 | GLint status; 188 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 189 | if (!status) 190 | { 191 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 192 | } 193 | } 194 | }; 195 | 196 | /// A RAII wrapper for GL program. 
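///
/// Once linked, a program is driven the same way this library drives its own kernels, e.g. (illustrative; `count`
/// and `num_workgroups` are assumed to be provided by the caller):
///
///     program.use();
///     glUniform1ui(program.get_uniform_location("u_count"), (GLuint) count);
///     glDispatchCompute((GLuint) num_workgroups, 1, 1);
///     glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);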
197 | class Program 198 | { 199 | private: 200 | GLuint m_handle; 201 | 202 | public: 203 | explicit Program() { m_handle = glCreateProgram(); }; 204 | Program(const Program&) = delete; 205 | 206 | Program(Program&& other) noexcept 207 | { 208 | m_handle = other.m_handle; 209 | other.m_handle = 0; 210 | } 211 | 212 | ~Program() { glDeleteProgram(m_handle); } 213 | 214 | [[nodiscard]] GLuint handle() const { return m_handle; } 215 | 216 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 217 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 218 | 219 | [[nodiscard]] std::string get_info_log() const 220 | { 221 | GLint log_length = 0; 222 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 223 | 224 | std::vector log(log_length); 225 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 226 | return {log.begin(), log.end()}; 227 | } 228 | 229 | void link() 230 | { 231 | GLint status; 232 | glLinkProgram(m_handle); 233 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 234 | if (!status) 235 | { 236 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 237 | } 238 | } 239 | 240 | void use() { glUseProgram(m_handle); } 241 | 242 | GLint get_uniform_location(const char* uniform_name) 243 | { 244 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 245 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 246 | return loc; 247 | } 248 | }; 249 | 250 | /// A RAII helper class for GL shader storage buffer. 251 | class ShaderStorageBuffer 252 | { 253 | private: 254 | GLuint m_handle = 0; 255 | size_t m_size = 0; 256 | 257 | public: 258 | explicit ShaderStorageBuffer(size_t initial_size = 0) 259 | { 260 | if (initial_size > 0) 261 | resize(initial_size, false); 262 | } 263 | 264 | explicit ShaderStorageBuffer(const void* data, size_t size) : 265 | m_size(size) 266 | { 267 | GLU_CHECK_ARGUMENT(data, ""); 268 | GLU_CHECK_ARGUMENT(size > 0, ""); 269 | 270 | glCreateBuffers(1, &m_handle); 271 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 272 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 273 | } 274 | 275 | template 276 | explicit ShaderStorageBuffer(const std::vector& data) : 277 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 278 | { 279 | } 280 | 281 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 282 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 283 | { 284 | m_handle = other.m_handle; 285 | m_size = other.m_size; 286 | other.m_handle = 0; 287 | } 288 | 289 | ~ShaderStorageBuffer() 290 | { 291 | if (m_handle) 292 | glDeleteBuffers(1, &m_handle); 293 | } 294 | 295 | [[nodiscard]] GLuint handle() const { return m_handle; } 296 | [[nodiscard]] size_t size() const { return m_size; } 297 | 298 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 
299 | void resize(size_t size, bool keep_data = false) 300 | { 301 | size_t old_size = m_size; 302 | GLuint old_handle = m_handle; 303 | 304 | if (old_size != size) 305 | { 306 | m_size = size; 307 | 308 | glCreateBuffers(1, &m_handle); 309 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 310 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 311 | 312 | if (keep_data) 313 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 314 | 315 | glDeleteBuffers(1, &old_handle); 316 | } 317 | } 318 | 319 | /// Clears the entire buffer with the given GLuint value (repeated). 320 | void clear(GLuint value) 321 | { 322 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 323 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 324 | } 325 | 326 | void write_data(const void* data, size_t size) 327 | { 328 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 329 | 330 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 331 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 332 | } 333 | 334 | template 335 | std::vector get_data() const 336 | { 337 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 338 | 339 | std::vector result(m_size / sizeof(T)); 340 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 341 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 342 | return result; 343 | } 344 | 345 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 346 | { 347 | if (size == 0) 348 | size = m_size; 349 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 350 | } 351 | }; 352 | 353 | /// Measures elapsed time on GPU for executing the given callback. 
354 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 355 | { 356 | GLuint query; 357 | uint64_t elapsed_time{}; 358 | 359 | glGenQueries(1, &query); 360 | glBeginQuery(GL_TIME_ELAPSED, query); 361 | 362 | callback(); 363 | 364 | glEndQuery(GL_TIME_ELAPSED); 365 | 366 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 367 | glDeleteQueries(1, &query); 368 | 369 | return elapsed_time; 370 | } 371 | 372 | template 373 | IntegerT log32_floor(IntegerT n) 374 | { 375 | return (IntegerT) floor(double(log2(n)) / 5.0); 376 | } 377 | 378 | template 379 | IntegerT log32_ceil(IntegerT n) 380 | { 381 | return (IntegerT) ceil(double(log2(n)) / 5.0); 382 | } 383 | 384 | template 385 | IntegerT div_ceil(IntegerT n, IntegerT d) 386 | { 387 | return (IntegerT) ceil(double(n) / double(d)); 388 | } 389 | 390 | template 391 | bool is_power_of_2(T n) 392 | { 393 | return (n & (n - 1)) == 0; 394 | } 395 | 396 | template 397 | IntegerT next_power_of_2(IntegerT n) 398 | { 399 | n--; 400 | n |= n >> 1; 401 | n |= n >> 2; 402 | n |= n >> 4; 403 | n |= n >> 8; 404 | n |= n >> 16; 405 | n++; 406 | return n; 407 | } 408 | 409 | template 410 | void print_stl_container(Iterator begin, Iterator end) 411 | { 412 | size_t i = 0; 413 | for (; begin != end; begin++) 414 | { 415 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 416 | i++; 417 | } 418 | printf("\n"); 419 | } 420 | 421 | template 422 | void print_buffer(const ShaderStorageBuffer& buffer) 423 | { 424 | std::vector data = buffer.get_data(); 425 | print_stl_container(data.begin(), data.end()); 426 | } 427 | 428 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 429 | { 430 | std::vector data = buffer.get_data(); 431 | for (size_t i = 0; i < data.size(); i++) 432 | printf("(%zu) %08x, ", i, data[i]); 433 | printf("\n"); 434 | } 435 | } // namespace glu 436 | 437 | #endif // GLU_GL_UTILS_HPP 438 | 439 | 440 | 441 | namespace glu 442 | { 443 | namespace detail 444 | { 445 | inline const char* k_reduction_shader_src = R"( 446 | #extension GL_KHR_shader_subgroup_arithmetic : require 447 | 448 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 449 | 450 | layout(std430, binding = 0) buffer Buffer 451 | { 452 | DATA_TYPE data[]; 453 | }; 454 | 455 | layout(location = 0) uniform uint u_count; 456 | layout(location = 1) uniform uint u_depth; 457 | 458 | void main() 459 | { 460 | uint step = 1 << (5 * u_depth); 461 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 462 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 463 | if (i < u_count) 464 | { 465 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 466 | if (gl_SubgroupInvocationID == 0) 467 | { 468 | data[i] = r; 469 | } 470 | } 471 | } 472 | )"; 473 | } 474 | 475 | /// The operators that can be used for the reduction operation. 476 | enum ReduceOperator 477 | { 478 | ReduceOperator_Sum = 0, 479 | ReduceOperator_Mul, 480 | ReduceOperator_Min, 481 | ReduceOperator_Max 482 | }; 483 | 484 | /// A class that implements the reduction operation. 
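///
/// The element type and operator are configurable; e.g. a float maximum, sketched below (illustrative; `float_buffer`
/// is assumed to be an SSBO of `count` GLfloat):
///
///     Reduce max_reduce(DataType_Float, ReduceOperator_Max);
///     max_reduce(float_buffer, count);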
485 | class Reduce 486 | { 487 | private: 488 | const DataType m_data_type; 489 | const ReduceOperator m_operator; 490 | const size_t m_num_threads; 491 | const size_t m_num_items; 492 | 493 | Program m_program; 494 | 495 | public: 496 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 497 | m_data_type(data_type), 498 | m_operator(operator_), 499 | m_num_threads(1024), 500 | m_num_items(4) 501 | { 502 | std::string shader_src = "#version 460\n\n"; 503 | 504 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 505 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 506 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 507 | 508 | if (m_operator == ReduceOperator_Sum) 509 | { 510 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 511 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 512 | } 513 | else if (m_operator == ReduceOperator_Mul) 514 | { 515 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 516 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 517 | } 518 | else if (m_operator == ReduceOperator_Min) 519 | { 520 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 521 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 522 | } 523 | else if (m_operator == ReduceOperator_Max) 524 | { 525 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 526 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 527 | } 528 | else 529 | { 530 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 531 | } 532 | 533 | shader_src += detail::k_reduction_shader_src; 534 | 535 | Shader shader(GL_COMPUTE_SHADER); 536 | shader.source_from_str(shader_src.c_str()); 537 | shader.compile(); 538 | 539 | m_program.attach_shader(shader); 540 | m_program.link(); 541 | } 542 | 543 | ~Reduce() = default; 544 | 545 | void operator()(GLuint buffer, size_t count) 546 | { 547 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 548 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 549 | 550 | m_program.use(); 551 | 552 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 553 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 554 | 555 | for (int depth = 0;; depth++) 556 | { 557 | int step = 1 << (5 * depth); 558 | if (step >= count) 559 | break; 560 | 561 | size_t level_count = count >> (5 * depth); 562 | 563 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 564 | 565 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 566 | glDispatchCompute(num_workgroups, 1, 1); 567 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 568 | } 569 | } 570 | }; 571 | } // namespace glu 572 | 573 | #endif // GLU_REDUCE_HPP 574 | 575 | 576 | #ifndef GLU_DATA_TYPES_HPP 577 | #define GLU_DATA_TYPES_HPP 578 | 579 | #ifndef GLU_ERRORS_HPP 580 | #define GLU_ERRORS_HPP 581 | 582 | #include 583 | #include 584 | 585 | // TODO mark if (!condition_) as unlikely 586 | #define GLU_CHECK_STATE(condition_, ...) \ 587 | { \ 588 | if (!(condition_)) \ 589 | { \ 590 | fprintf(stderr, __VA_ARGS__); \ 591 | exit(1); \ 592 | } \ 593 | } 594 | 595 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 596 | #define GLU_FAIL(...) 
GLU_CHECK_STATE(false, __VA_ARGS__) 597 | 598 | #endif 599 | 600 | 601 | 602 | namespace glu 603 | { 604 | enum DataType 605 | { 606 | DataType_Float = 0, 607 | DataType_Double, 608 | DataType_Int, 609 | DataType_Uint, 610 | DataType_Vec2, 611 | DataType_Vec4, 612 | DataType_DVec2, 613 | DataType_DVec4, 614 | DataType_UVec2, 615 | DataType_UVec4, 616 | DataType_IVec2, 617 | DataType_IVec4 618 | }; 619 | 620 | inline const char* to_glsl_type_str(DataType data_type) 621 | { 622 | // clang-format off 623 | if (data_type == DataType_Float) return "float"; 624 | else if (data_type == DataType_Double) return "double"; 625 | else if (data_type == DataType_Int) return "int"; 626 | else if (data_type == DataType_Uint) return "uint"; 627 | else if (data_type == DataType_Vec2) return "vec2"; 628 | else if (data_type == DataType_Vec4) return "vec4"; 629 | else if (data_type == DataType_DVec2) return "dvec2"; 630 | else if (data_type == DataType_DVec4) return "dvec4"; 631 | else if (data_type == DataType_UVec2) return "uvec2"; 632 | else if (data_type == DataType_UVec4) return "uvec4"; 633 | else if (data_type == DataType_IVec2) return "ivec2"; 634 | else if (data_type == DataType_IVec4) return "ivec4"; 635 | else 636 | { 637 | GLU_FAIL("Invalid data type: %d", data_type); 638 | } 639 | // clang-format on 640 | } 641 | 642 | } // namespace glu 643 | 644 | #endif // GLU_DATA_TYPES_HPP 645 | 646 | 647 | 648 | namespace glu 649 | { 650 | namespace detail 651 | { 652 | inline const char* k_upsweep_shader_src = R"( 653 | #extension GL_KHR_shader_subgroup_shuffle_relative : require 654 | 655 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 656 | 657 | layout(std430, binding = 0) buffer Buffer 658 | { 659 | DATA_TYPE data[]; 660 | }; 661 | 662 | layout(location = 0) uniform uint u_count; 663 | layout(location = 1) uniform uint u_step; 664 | 665 | void main() 666 | { 667 | uint partition_i = gl_WorkGroupID.y; 668 | uint thread_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 669 | uint i = partition_i * u_count + thread_i * u_step + u_step - 1; 670 | uint end_i = (partition_i + 1) * u_count; 671 | if (i < end_i) 672 | { 673 | DATA_TYPE lval = subgroupShuffleUp(data[i], 1); 674 | DATA_TYPE r = OPERATION(data[i], lval); 675 | if (i == end_i - 1) // Clear last 676 | { 677 | data[i] = IDENTITY; 678 | } 679 | else if (gl_SubgroupInvocationID % 2 == 1) 680 | { 681 | data[i] = r; 682 | } 683 | } 684 | } 685 | )"; 686 | 687 | inline const char* k_downsweep_shader_src = R"( 688 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 689 | 690 | layout(std430, binding = 0) buffer Buffer 691 | { 692 | DATA_TYPE data[]; 693 | }; 694 | 695 | layout(location = 0) uniform uint u_count; 696 | layout(location = 1) uniform uint u_step; 697 | 698 | void main() 699 | { 700 | uint partition_i = gl_WorkGroupID.y; 701 | uint i = partition_i * u_count + gl_GlobalInvocationID.x * (u_step << 1) + (u_step - 1); 702 | uint next_i = i + u_step; 703 | uint end_i = (partition_i + 1) * u_count; 704 | if (next_i < end_i) 705 | { 706 | DATA_TYPE tmp = data[i]; 707 | data[i] = data[next_i]; 708 | data[next_i] = data[next_i] + tmp; 709 | } 710 | else if (i < end_i) 711 | { 712 | data[i] = IDENTITY; 713 | } 714 | } 715 | )"; 716 | } // namespace detail 717 | 718 | /// A class that implements Blelloch scan algorithm (exclusive prefix sum). 
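///
/// The optional third argument scans several adjacent partitions independently in one call; sketched below
/// (illustrative; `buffer` is assumed to hold `num_partitions` back-to-back sub-arrays of `count` GLuint each):
///
///     BlellochScan scan(DataType_Uint);
///     scan(buffer, count, num_partitions); // one exclusive scan per partition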
719 | class BlellochScan 720 | { 721 | private: 722 | const DataType m_data_type; 723 | const size_t m_num_threads; 724 | const size_t m_num_items; 725 | 726 | Program m_upsweep_program; 727 | Program m_downsweep_program; 728 | 729 | public: 730 | explicit BlellochScan(DataType data_type) : 731 | m_data_type(data_type), 732 | m_num_threads(1024), 733 | m_num_items(4) 734 | { 735 | std::string shader_src = "#version 460\n\n"; 736 | 737 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 738 | shader_src += "#define OPERATION(a, b) (a + b)\n"; 739 | shader_src += "#define IDENTITY 0\n"; 740 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 741 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 742 | 743 | { // Upsweep program 744 | Shader upsweep_shader(GL_COMPUTE_SHADER); 745 | upsweep_shader.source_from_str((shader_src + detail::k_upsweep_shader_src).c_str()); 746 | upsweep_shader.compile(); 747 | 748 | m_upsweep_program.attach_shader(upsweep_shader); 749 | m_upsweep_program.link(); 750 | } 751 | 752 | { // Downsweep program 753 | Shader downsweep_program(GL_COMPUTE_SHADER); 754 | downsweep_program.source_from_str((shader_src + detail::k_downsweep_shader_src).c_str()); 755 | downsweep_program.compile(); 756 | 757 | m_downsweep_program.attach_shader(downsweep_program); 758 | m_downsweep_program.link(); 759 | } 760 | } 761 | 762 | ~BlellochScan() = default; 763 | 764 | /// Runs Blelloch exclusive scan on multiple partitions. 765 | /// 766 | /// @param buffer the input GLuint buffer 767 | /// @param count the number of GLuint in the buffer (must be a power of 2) 768 | /// @param num_partitions the number of partitions (must be adjacent) 769 | void operator()(GLuint buffer, size_t count, size_t num_partitions = 1) 770 | { 771 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 772 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 773 | GLU_CHECK_ARGUMENT(is_power_of_2(count), "Count must be a power of 2"); // TODO Remove this requirement 774 | GLU_CHECK_ARGUMENT(num_partitions >= 1, "Num of partitions must be >= 1"); 775 | 776 | upsweep(buffer, count, num_partitions); // Also clear last 777 | downsweep(buffer, count, num_partitions); 778 | } 779 | 780 | private: 781 | void upsweep(GLuint buffer, size_t count, size_t num_partitions) // Also clear last 782 | { 783 | m_upsweep_program.use(); 784 | 785 | glUniform1ui(m_upsweep_program.get_uniform_location("u_count"), count); 786 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 787 | 788 | int step = 1; 789 | int level_count = (int) count; 790 | while (true) 791 | { 792 | glUniform1ui(m_upsweep_program.get_uniform_location("u_step"), step); 793 | 794 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 795 | glDispatchCompute(num_workgroups, num_partitions, 1); 796 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 797 | 798 | step <<= 1; 799 | 800 | level_count >>= 1; 801 | 802 | if (level_count <= 1) 803 | break; 804 | } 805 | } 806 | 807 | void downsweep(GLuint buffer, size_t count, size_t num_partitions) 808 | { 809 | m_downsweep_program.use(); 810 | 811 | glUniform1ui(m_downsweep_program.get_uniform_location("u_count"), count); 812 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 813 | 814 | int step = next_power_of_2(int(count)) >> 1; 815 | size_t level_count = 1; 816 | while (true) 817 | { 818 | glUniform1ui(m_downsweep_program.get_uniform_location("u_step"), step); 819 | 820 | size_t 
num_workgroups = div_ceil(level_count, m_num_threads); 821 | glDispatchCompute(num_workgroups, num_partitions, 1); 822 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 823 | 824 | step >>= 1; 825 | level_count <<= 1; 826 | if (step == 0) 827 | break; 828 | } 829 | } 830 | }; 831 | } // namespace glu 832 | 833 | #endif // GLU_BLELLOCHSCAN_HPP 834 | 835 | 836 | #ifndef GLU_GL_UTILS_HPP 837 | #define GLU_GL_UTILS_HPP 838 | 839 | #include 840 | #include 841 | #include 842 | #include 843 | 844 | #ifndef GLU_ERRORS_HPP 845 | #define GLU_ERRORS_HPP 846 | 847 | #include 848 | #include 849 | 850 | // TODO mark if (!condition_) as unlikely 851 | #define GLU_CHECK_STATE(condition_, ...) \ 852 | { \ 853 | if (!(condition_)) \ 854 | { \ 855 | fprintf(stderr, __VA_ARGS__); \ 856 | exit(1); \ 857 | } \ 858 | } 859 | 860 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 861 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 862 | 863 | #endif 864 | 865 | 866 | 867 | namespace glu 868 | { 869 | inline void 870 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 871 | { 872 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 873 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 874 | 875 | glCopyBufferSubData( 876 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 877 | ); 878 | } 879 | 880 | /// A RAII wrapper for GL shader. 881 | class Shader 882 | { 883 | private: 884 | GLuint m_handle; 885 | 886 | public: 887 | explicit Shader(GLenum type) : 888 | m_handle(glCreateShader(type)){}; 889 | Shader(const Shader&) = delete; 890 | 891 | Shader(Shader&& other) noexcept 892 | { 893 | m_handle = other.m_handle; 894 | other.m_handle = 0; 895 | } 896 | 897 | ~Shader() { glDeleteShader(m_handle); } 898 | 899 | [[nodiscard]] GLuint handle() const { return m_handle; } 900 | 901 | void source_from_str(const std::string& src_str) 902 | { 903 | const char* src_ptr = src_str.c_str(); 904 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 905 | } 906 | 907 | void source_from_file(const char* src_filepath) 908 | { 909 | FILE* file = fopen(src_filepath, "rt"); 910 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 911 | 912 | fseek(file, 0, SEEK_END); 913 | size_t file_size = ftell(file); 914 | fseek(file, 0, SEEK_SET); 915 | 916 | std::string src{}; 917 | src.resize(file_size); 918 | fread(src.data(), sizeof(char), file_size, file); 919 | source_from_str(src.c_str()); 920 | 921 | fclose(file); 922 | } 923 | 924 | std::string get_info_log() 925 | { 926 | GLint log_length = 0; 927 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 928 | 929 | std::vector log(log_length); 930 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 931 | return {log.begin(), log.end()}; 932 | } 933 | 934 | void compile() 935 | { 936 | glCompileShader(m_handle); 937 | 938 | GLint status; 939 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 940 | if (!status) 941 | { 942 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 943 | } 944 | } 945 | }; 946 | 947 | /// A RAII wrapper for GL program. 
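/// Typical pattern for building a compute program with these wrappers (a minimal sketch;
/// `shader_src` stands for any complete compute-shader source string):
///
///     Shader shader(GL_COMPUTE_SHADER);
///     shader.source_from_str(shader_src);
///     shader.compile();                // fatal on failure, prints the info log
///
///     Program program;
///     program.attach_shader(shader);
///     program.link();                  // fatal on failure, prints the info log
///     program.use();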
948 | class Program 949 | { 950 | private: 951 | GLuint m_handle; 952 | 953 | public: 954 | explicit Program() { m_handle = glCreateProgram(); }; 955 | Program(const Program&) = delete; 956 | 957 | Program(Program&& other) noexcept 958 | { 959 | m_handle = other.m_handle; 960 | other.m_handle = 0; 961 | } 962 | 963 | ~Program() { glDeleteProgram(m_handle); } 964 | 965 | [[nodiscard]] GLuint handle() const { return m_handle; } 966 | 967 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 968 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 969 | 970 | [[nodiscard]] std::string get_info_log() const 971 | { 972 | GLint log_length = 0; 973 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 974 | 975 | std::vector log(log_length); 976 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 977 | return {log.begin(), log.end()}; 978 | } 979 | 980 | void link() 981 | { 982 | GLint status; 983 | glLinkProgram(m_handle); 984 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 985 | if (!status) 986 | { 987 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 988 | } 989 | } 990 | 991 | void use() { glUseProgram(m_handle); } 992 | 993 | GLint get_uniform_location(const char* uniform_name) 994 | { 995 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 996 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 997 | return loc; 998 | } 999 | }; 1000 | 1001 | /// A RAII helper class for GL shader storage buffer. 1002 | class ShaderStorageBuffer 1003 | { 1004 | private: 1005 | GLuint m_handle = 0; 1006 | size_t m_size = 0; 1007 | 1008 | public: 1009 | explicit ShaderStorageBuffer(size_t initial_size = 0) 1010 | { 1011 | if (initial_size > 0) 1012 | resize(initial_size, false); 1013 | } 1014 | 1015 | explicit ShaderStorageBuffer(const void* data, size_t size) : 1016 | m_size(size) 1017 | { 1018 | GLU_CHECK_ARGUMENT(data, ""); 1019 | GLU_CHECK_ARGUMENT(size > 0, ""); 1020 | 1021 | glCreateBuffers(1, &m_handle); 1022 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1023 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 1024 | } 1025 | 1026 | template 1027 | explicit ShaderStorageBuffer(const std::vector& data) : 1028 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 1029 | { 1030 | } 1031 | 1032 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 1033 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 1034 | { 1035 | m_handle = other.m_handle; 1036 | m_size = other.m_size; 1037 | other.m_handle = 0; 1038 | } 1039 | 1040 | ~ShaderStorageBuffer() 1041 | { 1042 | if (m_handle) 1043 | glDeleteBuffers(1, &m_handle); 1044 | } 1045 | 1046 | [[nodiscard]] GLuint handle() const { return m_handle; } 1047 | [[nodiscard]] size_t size() const { return m_size; } 1048 | 1049 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 
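/// Note: the storage is allocated with glBufferStorage (immutable storage), so resizing creates a
/// brand-new buffer object and deletes the old one; handle() changes and any previous bindings
/// must be re-established after a resize.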
1050 | void resize(size_t size, bool keep_data = false) 1051 | { 1052 | size_t old_size = m_size; 1053 | GLuint old_handle = m_handle; 1054 | 1055 | if (old_size != size) 1056 | { 1057 | m_size = size; 1058 | 1059 | glCreateBuffers(1, &m_handle); 1060 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1061 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 1062 | 1063 | if (keep_data) 1064 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 1065 | 1066 | glDeleteBuffers(1, &old_handle); 1067 | } 1068 | } 1069 | 1070 | /// Clears the entire buffer with the given GLuint value (repeated). 1071 | void clear(GLuint value) 1072 | { 1073 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1074 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 1075 | } 1076 | 1077 | void write_data(const void* data, size_t size) 1078 | { 1079 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 1080 | 1081 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1082 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 1083 | } 1084 | 1085 | template 1086 | std::vector get_data() const 1087 | { 1088 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 1089 | 1090 | std::vector result(m_size / sizeof(T)); 1091 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 1092 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 1093 | return result; 1094 | } 1095 | 1096 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 1097 | { 1098 | if (size == 0) 1099 | size = m_size; 1100 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 1101 | } 1102 | }; 1103 | 1104 | /// Measures elapsed time on GPU for executing the given callback. 
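/// The elapsed time is returned in nanoseconds (GL_TIME_ELAPSED), and reading the query result
/// blocks until the enclosed GL commands have finished. A minimal sketch (`radix_sort`, `keys`,
/// `vals` and `count` are placeholders for your own objects):
///
///     uint64_t ns = measure_gl_elapsed_time([&] { radix_sort(keys, vals, count); });
///     printf("RadixSort took %.3f ms\n", ns / 1e6);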
1105 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 1106 | { 1107 | GLuint query; 1108 | uint64_t elapsed_time{}; 1109 | 1110 | glGenQueries(1, &query); 1111 | glBeginQuery(GL_TIME_ELAPSED, query); 1112 | 1113 | callback(); 1114 | 1115 | glEndQuery(GL_TIME_ELAPSED); 1116 | 1117 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 1118 | glDeleteQueries(1, &query); 1119 | 1120 | return elapsed_time; 1121 | } 1122 | 1123 | template 1124 | IntegerT log32_floor(IntegerT n) 1125 | { 1126 | return (IntegerT) floor(double(log2(n)) / 5.0); 1127 | } 1128 | 1129 | template 1130 | IntegerT log32_ceil(IntegerT n) 1131 | { 1132 | return (IntegerT) ceil(double(log2(n)) / 5.0); 1133 | } 1134 | 1135 | template 1136 | IntegerT div_ceil(IntegerT n, IntegerT d) 1137 | { 1138 | return (IntegerT) ceil(double(n) / double(d)); 1139 | } 1140 | 1141 | template 1142 | bool is_power_of_2(T n) 1143 | { 1144 | return (n & (n - 1)) == 0; 1145 | } 1146 | 1147 | template 1148 | IntegerT next_power_of_2(IntegerT n) 1149 | { 1150 | n--; 1151 | n |= n >> 1; 1152 | n |= n >> 2; 1153 | n |= n >> 4; 1154 | n |= n >> 8; 1155 | n |= n >> 16; 1156 | n++; 1157 | return n; 1158 | } 1159 | 1160 | template 1161 | void print_stl_container(Iterator begin, Iterator end) 1162 | { 1163 | size_t i = 0; 1164 | for (; begin != end; begin++) 1165 | { 1166 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 1167 | i++; 1168 | } 1169 | printf("\n"); 1170 | } 1171 | 1172 | template 1173 | void print_buffer(const ShaderStorageBuffer& buffer) 1174 | { 1175 | std::vector data = buffer.get_data(); 1176 | print_stl_container(data.begin(), data.end()); 1177 | } 1178 | 1179 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 1180 | { 1181 | std::vector data = buffer.get_data(); 1182 | for (size_t i = 0; i < data.size(); i++) 1183 | printf("(%zu) %08x, ", i, data[i]); 1184 | printf("\n"); 1185 | } 1186 | } // namespace glu 1187 | 1188 | #endif // GLU_GL_UTILS_HPP 1189 | 1190 | 1191 | 1192 | namespace glu 1193 | { 1194 | namespace detail 1195 | { 1196 | inline const char* k_radix_sort_counting_shader = R"( 1197 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 1198 | 1199 | layout(std430, binding = 0) readonly buffer KeyBuffer 1200 | { 1201 | uint b_key_buffer[]; 1202 | }; 1203 | 1204 | layout(std430, binding = 1) buffer BlockCountBuffer 1205 | { 1206 | uint b_block_count_buffer[]; // 16 * NUM_THREADS 1207 | }; 1208 | 1209 | layout(std430, binding = 2) buffer GlobalCountBuffer 1210 | { 1211 | uint b_global_count_buffer[]; 1212 | }; 1213 | 1214 | layout(location = 0) uniform uint u_count; 1215 | layout(location = 1) uniform uint u_radix_shift; 1216 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 1217 | 1218 | void main() 1219 | { 1220 | for (uint radix = 0; radix < 16; radix++) 1221 | { 1222 | b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] = 0; 1223 | } 1224 | 1225 | barrier(); 1226 | 1227 | uint i = gl_GlobalInvocationID.x; 1228 | if (i < u_count) 1229 | { 1230 | // Block-wide count on shared memory 1231 | uint radix = (b_key_buffer[i] >> u_radix_shift) & 0xf; 1232 | atomicAdd(b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x], 1); 1233 | } 1234 | 1235 | barrier(); 1236 | 1237 | if (gl_LocalInvocationIndex < 16) 1238 | { 1239 | uint block_count = b_block_count_buffer[gl_LocalInvocationIndex * u_num_blocks_power_of_2 + gl_WorkGroupID.x]; 1240 | atomicAdd(b_global_count_buffer[gl_LocalInvocationIndex], 
block_count); 1241 | } 1242 | } 1243 | )"; 1244 | 1245 | inline const char* k_radix_sort_reordering_shader = R"( 1246 | #extension GL_KHR_shader_subgroup_arithmetic : require 1247 | 1248 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 1249 | 1250 | layout(std430, binding = 0) readonly buffer SrcKeyBuffer 1251 | { 1252 | uint b_src_key_buffer[]; 1253 | }; 1254 | 1255 | layout(std430, binding = 1) readonly buffer SrcValBuffer 1256 | { 1257 | uint b_src_val_buffer[]; 1258 | }; 1259 | 1260 | layout(std430, binding = 2) writeonly buffer DstKeyBuffer 1261 | { 1262 | uint b_dst_key_buffer[]; 1263 | }; 1264 | 1265 | layout(std430, binding = 3) writeonly buffer DstValBuffer 1266 | { 1267 | uint b_dst_val_buffer[]; 1268 | }; 1269 | 1270 | layout(std430, binding = 4) readonly buffer BlockOffsetBuffer 1271 | { 1272 | uint b_block_offset_buffer[]; 1273 | }; 1274 | 1275 | layout(std430, binding = 5) readonly buffer GlobalCountBuffer 1276 | { 1277 | uint b_global_count_buffer[]; 1278 | }; 1279 | 1280 | layout(location = 0) uniform uint u_count; 1281 | layout(location = 1) uniform uint u_radix_shift; 1282 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 1283 | 1284 | shared uint s_global_offset_buffer[16]; 1285 | shared uint s_prefix_sum_buffer[NUM_THREADS]; 1286 | 1287 | void prefix_sum() // Block-wide prefix sum (Blelloch scan) 1288 | { 1289 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 1290 | 1291 | // Upsweep 1292 | for (uint step = 1; step < NUM_THREADS; step <<= 1) 1293 | { 1294 | if (thread_i % 2 == 1) 1295 | { 1296 | uint i = thread_i * step + (step - 1); 1297 | if (i < NUM_THREADS) 1298 | { 1299 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i] + s_prefix_sum_buffer[i - step]; 1300 | } 1301 | } 1302 | 1303 | barrier(); 1304 | } 1305 | 1306 | // Clear last 1307 | if (thread_i == NUM_THREADS - 1) s_prefix_sum_buffer[thread_i] = 0; 1308 | 1309 | barrier(); 1310 | 1311 | // Downsweep 1312 | uint step = NUM_THREADS >> 1; 1313 | for (; step > 0; step >>= 1) 1314 | { 1315 | uint i = thread_i * step + (step - 1); 1316 | if (i + step < NUM_THREADS && thread_i % 2 == 0) 1317 | { 1318 | uint tmp = s_prefix_sum_buffer[i]; 1319 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i + step]; 1320 | s_prefix_sum_buffer[i + step] = tmp + s_prefix_sum_buffer[i + step]; 1321 | } 1322 | 1323 | barrier(); 1324 | } 1325 | } 1326 | 1327 | void main() 1328 | { 1329 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 1330 | uint i = gl_WorkGroupID.x * NUM_THREADS + thread_i; 1331 | 1332 | // Prefix sum on global counts to obtain global offsets 1333 | if (gl_SubgroupID == 0 && gl_SubgroupInvocationID < 16) 1334 | { 1335 | uint v = subgroupExclusiveAdd(b_global_count_buffer[gl_SubgroupInvocationID]); 1336 | s_global_offset_buffer[gl_SubgroupInvocationID] = v; 1337 | } 1338 | 1339 | barrier(); 1340 | 1341 | // Reordering 1342 | for (uint radix = 0; radix < 16; radix++) 1343 | { 1344 | bool should_place = false; 1345 | if (i < u_count) 1346 | { 1347 | should_place = ((b_src_key_buffer[i] >> u_radix_shift) & 0xf) == radix; 1348 | } 1349 | 1350 | s_prefix_sum_buffer[thread_i] = should_place ? 
1 : 0; 1351 | 1352 | barrier(); 1353 | 1354 | // Prefix sum on local counts to obtain local offsets 1355 | prefix_sum(); 1356 | 1357 | if (should_place) 1358 | { 1359 | uint di = 1360 | s_global_offset_buffer[radix] + 1361 | b_block_offset_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] + 1362 | s_prefix_sum_buffer[thread_i]; 1363 | b_dst_key_buffer[di] = b_src_key_buffer[i]; 1364 | b_dst_val_buffer[di] = b_src_val_buffer[i]; 1365 | } 1366 | } 1367 | } 1368 | )"; 1369 | } // namespace detail 1370 | 1371 | class RadixSort 1372 | { 1373 | private: 1374 | Program m_count_program; 1375 | BlellochScan m_blelloch_scan; 1376 | Program m_reorder_program; 1377 | 1378 | /// A GLuint buffer of size 16 * NUM_THREADS that stores the counts of radixes per block. 1379 | ShaderStorageBuffer m_block_count_buffer; 1380 | 1381 | /// A GLuint buffer of size 16 that stores the global counts of radixes. 1382 | ShaderStorageBuffer m_global_count_buffer; 1383 | 1384 | ShaderStorageBuffer m_key_scratch_buffer; 1385 | ShaderStorageBuffer m_val_scratch_buffer; 1386 | 1387 | const size_t m_num_threads; 1388 | 1389 | public: 1390 | explicit RadixSort() : 1391 | m_blelloch_scan(DataType_Uint), 1392 | m_num_threads(1024) 1393 | { 1394 | GLU_CHECK_ARGUMENT(is_power_of_2(m_num_threads), "Num threads must be a power of 2"); 1395 | 1396 | m_global_count_buffer.resize(16 * sizeof(GLuint)); 1397 | 1398 | std::string shader_src = "#version 460\n\n"; 1399 | shader_src += "#define NUM_THREADS " + std::to_string(m_num_threads) + "\n"; 1400 | 1401 | { // Counting program 1402 | Shader shader(GL_COMPUTE_SHADER); 1403 | shader.source_from_str(shader_src + detail::k_radix_sort_counting_shader); 1404 | shader.compile(); 1405 | 1406 | m_count_program.attach_shader(shader.handle()); 1407 | m_count_program.link(); 1408 | } 1409 | 1410 | { // Reordering program 1411 | Shader shader(GL_COMPUTE_SHADER); 1412 | shader.source_from_str(shader_src + detail::k_radix_sort_reordering_shader); 1413 | shader.compile(); 1414 | 1415 | m_reorder_program.attach_shader(shader.handle()); 1416 | m_reorder_program.link(); 1417 | } 1418 | } 1419 | 1420 | ~RadixSort() = default; 1421 | 1422 | void prepare_internal_buffers(size_t count) 1423 | { 1424 | { // Prepare block count buffer 1425 | size_t required_size = required_block_count_buffer_size(count); 1426 | if (m_block_count_buffer.size() < required_size) 1427 | { 1428 | m_block_count_buffer.resize(required_size, false); 1429 | #ifdef GLU_VERBOSE // TODO Create a log utility 1430 | printf("[RadixSort] Block count buffer reallocated to: %zu\n", required_size); 1431 | #endif 1432 | } 1433 | } 1434 | 1435 | { // Prepare key scratch buffer 1436 | size_t required_size = required_key_scratch_buffer_size(count); 1437 | if (m_key_scratch_buffer.size() < required_size) 1438 | { 1439 | m_key_scratch_buffer.resize(required_size, false); 1440 | #ifdef GLU_VERBOSE 1441 | printf("[RadixSort] Key scratch buffer reallocated to: %zu\n", required_size); 1442 | #endif 1443 | } 1444 | } 1445 | 1446 | { // Prepare val scratch buffer 1447 | size_t required_size = required_val_scratch_buffer_size(count); 1448 | if (m_val_scratch_buffer.size() < required_size) 1449 | { 1450 | m_val_scratch_buffer.resize(required_size, false); 1451 | #ifdef GLU_VERBOSE 1452 | printf("[RadixSort] Val scratch buffer reallocated to: %zu\n", required_size); 1453 | #endif 1454 | } 1455 | } 1456 | } 1457 | 1458 | void operator()(GLuint key_buffer, GLuint val_buffer, size_t count, size_t num_steps = 0) 1459 | { 1460 | 
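// Sorts `count` (key, value) pairs by key, 4 bits per pass starting from the least-significant
// digit. With num_steps == 0 (the default) all 8 passes run, which fully sorts 32-bit keys; a
// smaller num_steps only orders the low 4 * num_steps bits. Each pass ping-pongs between the
// caller's buffers and the internal scratch buffers, so after an odd number of passes the most
// recent output resides in the scratch buffers rather than in key_buffer/val_buffer.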
GLU_CHECK_ARGUMENT(key_buffer, "Invalid key buffer"); 1461 | GLU_CHECK_ARGUMENT(val_buffer, "Invalid value buffer"); 1462 | 1463 | if (count <= 1) 1464 | return; // Hey, that's already sorted x) 1465 | 1466 | prepare_internal_buffers(count); 1467 | 1468 | size_t num_blocks = div_ceil(count, size_t(1024)); 1469 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 1470 | 1471 | GLuint key_buffers[]{key_buffer, m_key_scratch_buffer.handle()}; 1472 | GLuint val_buffers[]{val_buffer, m_val_scratch_buffer.handle()}; 1473 | 1474 | for (int step = 0; step < 8;) 1475 | { 1476 | // ---------------------------------------------------------------- Counting 1477 | 1478 | m_block_count_buffer.clear(0); 1479 | m_global_count_buffer.clear(0); 1480 | 1481 | m_count_program.use(); 1482 | 1483 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 1484 | m_block_count_buffer.bind(1); 1485 | m_global_count_buffer.bind(2); 1486 | 1487 | glUniform1ui(m_count_program.get_uniform_location("u_count"), count); 1488 | glUniform1ui(m_count_program.get_uniform_location("u_radix_shift"), step << 2); 1489 | glUniform1ui(m_count_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 1490 | 1491 | glDispatchCompute(num_blocks, 1, 1); 1492 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 1493 | 1494 | // ---------------------------------------------------------------- Prefix sum 1495 | 1496 | m_blelloch_scan(m_block_count_buffer.handle(), num_blocks_power_of_2, 16); 1497 | 1498 | // ---------------------------------------------------------------- Reordering 1499 | 1500 | m_reorder_program.use(); 1501 | 1502 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 1503 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, val_buffers[step % 2]); 1504 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, key_buffers[(step + 1) % 2]); 1505 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, val_buffers[(step + 1) % 2]); 1506 | m_block_count_buffer.bind(4); 1507 | m_global_count_buffer.bind(5); 1508 | 1509 | glUniform1ui(m_reorder_program.get_uniform_location("u_count"), count); 1510 | glUniform1ui(m_reorder_program.get_uniform_location("u_radix_shift"), step << 2); 1511 | glUniform1ui(m_reorder_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 1512 | 1513 | glDispatchCompute(num_blocks, 1, 1); 1514 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 1515 | 1516 | ++step; 1517 | if (step == num_steps || step == 8) break; 1518 | } 1519 | } 1520 | 1521 | private: 1522 | [[nodiscard]] static size_t required_block_count_buffer_size(size_t count) 1523 | { 1524 | size_t num_blocks = div_ceil(count, size_t(1024)); 1525 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 1526 | 1527 | return next_power_of_2(16 * num_blocks_power_of_2) * sizeof(GLuint); 1528 | } 1529 | 1530 | [[nodiscard]] static size_t required_key_scratch_buffer_size(size_t count) 1531 | { 1532 | return next_power_of_2(count) * sizeof(GLuint); 1533 | } 1534 | 1535 | [[nodiscard]] static size_t required_val_scratch_buffer_size(size_t count) 1536 | { 1537 | return next_power_of_2(count) * sizeof(GLuint); 1538 | } 1539 | }; 1540 | } // namespace glu 1541 | 1542 | #endif // GLU_RADIXSORT_HPP 1543 | -------------------------------------------------------------------------------- /dist/Reduce.hpp: -------------------------------------------------------------------------------- 1 | // This code was automatically generated; you're 
not supposed to edit it! 2 | 3 | #ifndef GLU_REDUCE_HPP 4 | #define GLU_REDUCE_HPP 5 | 6 | #ifndef GLU_DATA_TYPES_HPP 7 | #define GLU_DATA_TYPES_HPP 8 | 9 | #ifndef GLU_ERRORS_HPP 10 | #define GLU_ERRORS_HPP 11 | 12 | #include 13 | #include 14 | 15 | // TODO mark if (!condition_) as unlikely 16 | #define GLU_CHECK_STATE(condition_, ...) \ 17 | { \ 18 | if (!(condition_)) \ 19 | { \ 20 | fprintf(stderr, __VA_ARGS__); \ 21 | exit(1); \ 22 | } \ 23 | } 24 | 25 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 26 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 27 | 28 | #endif 29 | 30 | 31 | 32 | namespace glu 33 | { 34 | enum DataType 35 | { 36 | DataType_Float = 0, 37 | DataType_Double, 38 | DataType_Int, 39 | DataType_Uint, 40 | DataType_Vec2, 41 | DataType_Vec4, 42 | DataType_DVec2, 43 | DataType_DVec4, 44 | DataType_UVec2, 45 | DataType_UVec4, 46 | DataType_IVec2, 47 | DataType_IVec4 48 | }; 49 | 50 | inline const char* to_glsl_type_str(DataType data_type) 51 | { 52 | // clang-format off 53 | if (data_type == DataType_Float) return "float"; 54 | else if (data_type == DataType_Double) return "double"; 55 | else if (data_type == DataType_Int) return "int"; 56 | else if (data_type == DataType_Uint) return "uint"; 57 | else if (data_type == DataType_Vec2) return "vec2"; 58 | else if (data_type == DataType_Vec4) return "vec4"; 59 | else if (data_type == DataType_DVec2) return "dvec2"; 60 | else if (data_type == DataType_DVec4) return "dvec4"; 61 | else if (data_type == DataType_UVec2) return "uvec2"; 62 | else if (data_type == DataType_UVec4) return "uvec4"; 63 | else if (data_type == DataType_IVec2) return "ivec2"; 64 | else if (data_type == DataType_IVec4) return "ivec4"; 65 | else 66 | { 67 | GLU_FAIL("Invalid data type: %d", data_type); 68 | } 69 | // clang-format on 70 | } 71 | 72 | } // namespace glu 73 | 74 | #endif // GLU_DATA_TYPES_HPP 75 | 76 | 77 | #ifndef GLU_GL_UTILS_HPP 78 | #define GLU_GL_UTILS_HPP 79 | 80 | #include 81 | #include 82 | #include 83 | #include 84 | 85 | #ifndef GLU_ERRORS_HPP 86 | #define GLU_ERRORS_HPP 87 | 88 | #include 89 | #include 90 | 91 | // TODO mark if (!condition_) as unlikely 92 | #define GLU_CHECK_STATE(condition_, ...) \ 93 | { \ 94 | if (!(condition_)) \ 95 | { \ 96 | fprintf(stderr, __VA_ARGS__); \ 97 | exit(1); \ 98 | } \ 99 | } 100 | 101 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 102 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 103 | 104 | #endif 105 | 106 | 107 | 108 | namespace glu 109 | { 110 | inline void 111 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 112 | { 113 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 114 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 115 | 116 | glCopyBufferSubData( 117 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 118 | ); 119 | } 120 | 121 | /// A RAII wrapper for GL shader. 
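/// Note that failures are fatal: compile() checks GL_COMPILE_STATUS and, if compilation failed,
/// prints the shader info log to stderr and terminates the process via GLU_CHECK_STATE.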
122 | class Shader 123 | { 124 | private: 125 | GLuint m_handle; 126 | 127 | public: 128 | explicit Shader(GLenum type) : 129 | m_handle(glCreateShader(type)){}; 130 | Shader(const Shader&) = delete; 131 | 132 | Shader(Shader&& other) noexcept 133 | { 134 | m_handle = other.m_handle; 135 | other.m_handle = 0; 136 | } 137 | 138 | ~Shader() { glDeleteShader(m_handle); } 139 | 140 | [[nodiscard]] GLuint handle() const { return m_handle; } 141 | 142 | void source_from_str(const std::string& src_str) 143 | { 144 | const char* src_ptr = src_str.c_str(); 145 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 146 | } 147 | 148 | void source_from_file(const char* src_filepath) 149 | { 150 | FILE* file = fopen(src_filepath, "rt"); 151 | GLU_CHECK_STATE(!file, "Failed to shader file: %s", src_filepath); 152 | 153 | fseek(file, 0, SEEK_END); 154 | size_t file_size = ftell(file); 155 | fseek(file, 0, SEEK_SET); 156 | 157 | std::string src{}; 158 | src.resize(file_size); 159 | fread(src.data(), sizeof(char), file_size, file); 160 | source_from_str(src.c_str()); 161 | 162 | fclose(file); 163 | } 164 | 165 | std::string get_info_log() 166 | { 167 | GLint log_length = 0; 168 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 169 | 170 | std::vector log(log_length); 171 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 172 | return {log.begin(), log.end()}; 173 | } 174 | 175 | void compile() 176 | { 177 | glCompileShader(m_handle); 178 | 179 | GLint status; 180 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 181 | if (!status) 182 | { 183 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 184 | } 185 | } 186 | }; 187 | 188 | /// A RAII wrapper for GL program. 189 | class Program 190 | { 191 | private: 192 | GLuint m_handle; 193 | 194 | public: 195 | explicit Program() { m_handle = glCreateProgram(); }; 196 | Program(const Program&) = delete; 197 | 198 | Program(Program&& other) noexcept 199 | { 200 | m_handle = other.m_handle; 201 | other.m_handle = 0; 202 | } 203 | 204 | ~Program() { glDeleteProgram(m_handle); } 205 | 206 | [[nodiscard]] GLuint handle() const { return m_handle; } 207 | 208 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 209 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 210 | 211 | [[nodiscard]] std::string get_info_log() const 212 | { 213 | GLint log_length = 0; 214 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 215 | 216 | std::vector log(log_length); 217 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 218 | return {log.begin(), log.end()}; 219 | } 220 | 221 | void link() 222 | { 223 | GLint status; 224 | glLinkProgram(m_handle); 225 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 226 | if (!status) 227 | { 228 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 229 | } 230 | } 231 | 232 | void use() { glUseProgram(m_handle); } 233 | 234 | GLint get_uniform_location(const char* uniform_name) 235 | { 236 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 237 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 238 | return loc; 239 | } 240 | }; 241 | 242 | /// A RAII helper class for GL shader storage buffer. 
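/// A minimal usage sketch (the contents are illustrative):
///
///     std::vector<GLuint> values = {1, 2, 3, 4};
///     ShaderStorageBuffer buffer(values);              // immutable storage, GL_DYNAMIC_STORAGE_BIT
///     buffer.bind(0);                                  // bind the whole buffer to SSBO binding 0
///     std::vector<GLuint> readback = buffer.get_data<GLuint>();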
243 | class ShaderStorageBuffer 244 | { 245 | private: 246 | GLuint m_handle = 0; 247 | size_t m_size = 0; 248 | 249 | public: 250 | explicit ShaderStorageBuffer(size_t initial_size = 0) 251 | { 252 | if (initial_size > 0) 253 | resize(initial_size, false); 254 | } 255 | 256 | explicit ShaderStorageBuffer(const void* data, size_t size) : 257 | m_size(size) 258 | { 259 | GLU_CHECK_ARGUMENT(data, ""); 260 | GLU_CHECK_ARGUMENT(size > 0, ""); 261 | 262 | glCreateBuffers(1, &m_handle); 263 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 264 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 265 | } 266 | 267 | template 268 | explicit ShaderStorageBuffer(const std::vector& data) : 269 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 270 | { 271 | } 272 | 273 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 274 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 275 | { 276 | m_handle = other.m_handle; 277 | m_size = other.m_size; 278 | other.m_handle = 0; 279 | } 280 | 281 | ~ShaderStorageBuffer() 282 | { 283 | if (m_handle) 284 | glDeleteBuffers(1, &m_handle); 285 | } 286 | 287 | [[nodiscard]] GLuint handle() const { return m_handle; } 288 | [[nodiscard]] size_t size() const { return m_size; } 289 | 290 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 291 | void resize(size_t size, bool keep_data = false) 292 | { 293 | size_t old_size = m_size; 294 | GLuint old_handle = m_handle; 295 | 296 | if (old_size != size) 297 | { 298 | m_size = size; 299 | 300 | glCreateBuffers(1, &m_handle); 301 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 302 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 303 | 304 | if (keep_data) 305 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 306 | 307 | glDeleteBuffers(1, &old_handle); 308 | } 309 | } 310 | 311 | /// Clears the entire buffer with the given GLuint value (repeated). 312 | void clear(GLuint value) 313 | { 314 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 315 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 316 | } 317 | 318 | void write_data(const void* data, size_t size) 319 | { 320 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 321 | 322 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 323 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 324 | } 325 | 326 | template 327 | std::vector get_data() const 328 | { 329 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 330 | 331 | std::vector result(m_size / sizeof(T)); 332 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 333 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 334 | return result; 335 | } 336 | 337 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 338 | { 339 | if (size == 0) 340 | size = m_size; 341 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 342 | } 343 | }; 344 | 345 | /// Measures elapsed time on GPU for executing the given callback. 
346 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 347 | { 348 | GLuint query; 349 | uint64_t elapsed_time{}; 350 | 351 | glGenQueries(1, &query); 352 | glBeginQuery(GL_TIME_ELAPSED, query); 353 | 354 | callback(); 355 | 356 | glEndQuery(GL_TIME_ELAPSED); 357 | 358 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 359 | glDeleteQueries(1, &query); 360 | 361 | return elapsed_time; 362 | } 363 | 364 | template 365 | IntegerT log32_floor(IntegerT n) 366 | { 367 | return (IntegerT) floor(double(log2(n)) / 5.0); 368 | } 369 | 370 | template 371 | IntegerT log32_ceil(IntegerT n) 372 | { 373 | return (IntegerT) ceil(double(log2(n)) / 5.0); 374 | } 375 | 376 | template 377 | IntegerT div_ceil(IntegerT n, IntegerT d) 378 | { 379 | return (IntegerT) ceil(double(n) / double(d)); 380 | } 381 | 382 | template 383 | bool is_power_of_2(T n) 384 | { 385 | return (n & (n - 1)) == 0; 386 | } 387 | 388 | template 389 | IntegerT next_power_of_2(IntegerT n) 390 | { 391 | n--; 392 | n |= n >> 1; 393 | n |= n >> 2; 394 | n |= n >> 4; 395 | n |= n >> 8; 396 | n |= n >> 16; 397 | n++; 398 | return n; 399 | } 400 | 401 | template 402 | void print_stl_container(Iterator begin, Iterator end) 403 | { 404 | size_t i = 0; 405 | for (; begin != end; begin++) 406 | { 407 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 408 | i++; 409 | } 410 | printf("\n"); 411 | } 412 | 413 | template 414 | void print_buffer(const ShaderStorageBuffer& buffer) 415 | { 416 | std::vector data = buffer.get_data(); 417 | print_stl_container(data.begin(), data.end()); 418 | } 419 | 420 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 421 | { 422 | std::vector data = buffer.get_data(); 423 | for (size_t i = 0; i < data.size(); i++) 424 | printf("(%zu) %08x, ", i, data[i]); 425 | printf("\n"); 426 | } 427 | } // namespace glu 428 | 429 | #endif // GLU_GL_UTILS_HPP 430 | 431 | 432 | 433 | namespace glu 434 | { 435 | namespace detail 436 | { 437 | inline const char* k_reduction_shader_src = R"( 438 | #extension GL_KHR_shader_subgroup_arithmetic : require 439 | 440 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 441 | 442 | layout(std430, binding = 0) buffer Buffer 443 | { 444 | DATA_TYPE data[]; 445 | }; 446 | 447 | layout(location = 0) uniform uint u_count; 448 | layout(location = 1) uniform uint u_depth; 449 | 450 | void main() 451 | { 452 | uint step = 1 << (5 * u_depth); 453 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 454 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 455 | if (i < u_count) 456 | { 457 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 458 | if (gl_SubgroupInvocationID == 0) 459 | { 460 | data[i] = r; 461 | } 462 | } 463 | } 464 | )"; 465 | } 466 | 467 | /// The operators that can be used for the reduction operation. 468 | enum ReduceOperator 469 | { 470 | ReduceOperator_Sum = 0, 471 | ReduceOperator_Mul, 472 | ReduceOperator_Min, 473 | ReduceOperator_Max 474 | }; 475 | 476 | /// A class that implements the reduction operation. 
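/// A minimal usage sketch (the values are illustrative). The reduction is performed in place and
/// the reduced value ends up in the first element of the buffer:
///
///     std::vector<GLuint> data = {1, 2, 3, 4, 5};
///     ShaderStorageBuffer buffer(data);
///
///     Reduce reduce(DataType_Uint, ReduceOperator_Sum);
///     reduce(buffer.handle(), data.size());
///
///     GLuint sum = buffer.get_data<GLuint>()[0];   // 15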
477 | class Reduce 478 | { 479 | private: 480 | const DataType m_data_type; 481 | const ReduceOperator m_operator; 482 | const size_t m_num_threads; 483 | const size_t m_num_items; 484 | 485 | Program m_program; 486 | 487 | public: 488 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 489 | m_data_type(data_type), 490 | m_operator(operator_), 491 | m_num_threads(1024), 492 | m_num_items(4) 493 | { 494 | std::string shader_src = "#version 460\n\n"; 495 | 496 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 497 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 498 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 499 | 500 | if (m_operator == ReduceOperator_Sum) 501 | { 502 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 503 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 504 | } 505 | else if (m_operator == ReduceOperator_Mul) 506 | { 507 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 508 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 509 | } 510 | else if (m_operator == ReduceOperator_Min) 511 | { 512 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 513 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 514 | } 515 | else if (m_operator == ReduceOperator_Max) 516 | { 517 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 518 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 519 | } 520 | else 521 | { 522 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 523 | } 524 | 525 | shader_src += detail::k_reduction_shader_src; 526 | 527 | Shader shader(GL_COMPUTE_SHADER); 528 | shader.source_from_str(shader_src.c_str()); 529 | shader.compile(); 530 | 531 | m_program.attach_shader(shader); 532 | m_program.link(); 533 | } 534 | 535 | ~Reduce() = default; 536 | 537 | void operator()(GLuint buffer, size_t count) 538 | { 539 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 540 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 541 | 542 | m_program.use(); 543 | 544 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 545 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 546 | 547 | for (int depth = 0;; depth++) 548 | { 549 | int step = 1 << (5 * depth); 550 | if (step >= count) 551 | break; 552 | 553 | size_t level_count = count >> (5 * depth); 554 | 555 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 556 | 557 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 558 | glDispatchCompute(num_workgroups, 1, 1); 559 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 560 | } 561 | } 562 | }; 563 | } // namespace glu 564 | 565 | #endif // GLU_REDUCE_HPP 566 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | import re 2 | from os import path 3 | 4 | script_dir = path.dirname(path.realpath(__file__)) 5 | 6 | 7 | def generate_standalone_header_code(in_filepath: str) -> str: 8 | with open(in_filepath, "rt") as in_file: 9 | code = in_file.read() 10 | 11 | while True: 12 | matches = [match for match in re.finditer(r'^#include\s+"(\S+?)"', code, re.MULTILINE)] 13 | if len(matches) == 0: 14 | break 15 | 16 | match = matches[0] 17 | included_filepath = path.join(path.dirname(in_filepath), match.group(1)) 18 | included_code = 
generate_standalone_header_code(included_filepath) 19 | 20 | code = code[:match.span()[0]] + included_code + "\n" + code[match.span()[1]:] 21 | return code 22 | 23 | 24 | def generate_standalone_header(in_filepath: str, out_filepath: str): 25 | print("Generating %s from %s" % (out_filepath, in_filepath)) 26 | with open(out_filepath, "wt") as out_file: 27 | out_str = "// This code was automatically generated; you're not supposed to edit it!\n\n" 28 | out_str += generate_standalone_header_code(in_filepath) 29 | out_file.write(out_str) 30 | 31 | 32 | if __name__ == "__main__": 33 | def p(filename: str): 34 | return path.join(script_dir, "glu/%s" % filename), path.join(script_dir, "dist/%s" % filename) 35 | 36 | generate_standalone_header(*p("BlellochScan.hpp")) 37 | generate_standalone_header(*p("RadixSort.hpp")) 38 | generate_standalone_header(*p("Reduce.hpp")) 39 | -------------------------------------------------------------------------------- /glu/BlellochScan.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_BLELLOCHSCAN_HPP 2 | #define GLU_BLELLOCHSCAN_HPP 3 | 4 | #include 5 | 6 | #include "Reduce.hpp" 7 | #include "data_types.hpp" 8 | 9 | namespace glu 10 | { 11 | namespace detail 12 | { 13 | inline const char* k_upsweep_shader_src = R"( 14 | #extension GL_KHR_shader_subgroup_shuffle_relative : require 15 | 16 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 17 | 18 | layout(std430, binding = 0) buffer Buffer 19 | { 20 | DATA_TYPE data[]; 21 | }; 22 | 23 | layout(location = 0) uniform uint u_count; 24 | layout(location = 1) uniform uint u_step; 25 | 26 | void main() 27 | { 28 | uint partition_i = gl_WorkGroupID.y; 29 | uint thread_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 30 | uint i = partition_i * u_count + thread_i * u_step + u_step - 1; 31 | uint end_i = (partition_i + 1) * u_count; 32 | if (i < end_i) 33 | { 34 | DATA_TYPE lval = subgroupShuffleUp(data[i], 1); 35 | DATA_TYPE r = OPERATION(data[i], lval); 36 | if (i == end_i - 1) // Clear last 37 | { 38 | data[i] = IDENTITY; 39 | } 40 | else if (gl_SubgroupInvocationID % 2 == 1) 41 | { 42 | data[i] = r; 43 | } 44 | } 45 | } 46 | )"; 47 | 48 | inline const char* k_downsweep_shader_src = R"( 49 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 50 | 51 | layout(std430, binding = 0) buffer Buffer 52 | { 53 | DATA_TYPE data[]; 54 | }; 55 | 56 | layout(location = 0) uniform uint u_count; 57 | layout(location = 1) uniform uint u_step; 58 | 59 | void main() 60 | { 61 | uint partition_i = gl_WorkGroupID.y; 62 | uint i = partition_i * u_count + gl_GlobalInvocationID.x * (u_step << 1) + (u_step - 1); 63 | uint next_i = i + u_step; 64 | uint end_i = (partition_i + 1) * u_count; 65 | if (next_i < end_i) 66 | { 67 | DATA_TYPE tmp = data[i]; 68 | data[i] = data[next_i]; 69 | data[next_i] = data[next_i] + tmp; 70 | } 71 | else if (i < end_i) 72 | { 73 | data[i] = IDENTITY; 74 | } 75 | } 76 | )"; 77 | } // namespace detail 78 | 79 | /// A class that implements Blelloch scan algorithm (exclusive prefix sum). 
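/// A minimal usage sketch (assuming a current GL 4.6 context; the data is illustrative):
///
///     std::vector<GLuint> data = {3, 1, 7, 0, 4, 1, 6, 3};       // count must be a power of 2
///     ShaderStorageBuffer buffer(data);
///
///     BlellochScan scan(DataType_Uint);
///     scan(buffer.handle(), data.size());                        // in-place exclusive prefix sum
///
///     std::vector<GLuint> result = buffer.get_data<GLuint>();    // {0, 3, 4, 11, 11, 15, 16, 22}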
80 | class BlellochScan 81 | { 82 | private: 83 | const DataType m_data_type; 84 | const size_t m_num_threads; 85 | const size_t m_num_items; 86 | 87 | Program m_upsweep_program; 88 | Program m_downsweep_program; 89 | 90 | public: 91 | explicit BlellochScan(DataType data_type) : 92 | m_data_type(data_type), 93 | m_num_threads(1024), 94 | m_num_items(4) 95 | { 96 | std::string shader_src = "#version 460\n\n"; 97 | 98 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 99 | shader_src += "#define OPERATION(a, b) (a + b)\n"; 100 | shader_src += "#define IDENTITY 0\n"; 101 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 102 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 103 | 104 | { // Upsweep program 105 | Shader upsweep_shader(GL_COMPUTE_SHADER); 106 | upsweep_shader.source_from_str((shader_src + detail::k_upsweep_shader_src).c_str()); 107 | upsweep_shader.compile(); 108 | 109 | m_upsweep_program.attach_shader(upsweep_shader); 110 | m_upsweep_program.link(); 111 | } 112 | 113 | { // Downsweep program 114 | Shader downsweep_program(GL_COMPUTE_SHADER); 115 | downsweep_program.source_from_str((shader_src + detail::k_downsweep_shader_src).c_str()); 116 | downsweep_program.compile(); 117 | 118 | m_downsweep_program.attach_shader(downsweep_program); 119 | m_downsweep_program.link(); 120 | } 121 | } 122 | 123 | ~BlellochScan() = default; 124 | 125 | /// Runs Blelloch exclusive scan on multiple partitions. 126 | /// 127 | /// @param buffer the input GLuint buffer 128 | /// @param count the number of GLuint in the buffer (must be a power of 2) 129 | /// @param num_partitions the number of partitions (must be adjacent) 130 | void operator()(GLuint buffer, size_t count, size_t num_partitions = 1) 131 | { 132 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 133 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 134 | GLU_CHECK_ARGUMENT(is_power_of_2(count), "Count must be a power of 2"); // TODO Remove this requirement 135 | GLU_CHECK_ARGUMENT(num_partitions >= 1, "Num of partitions must be >= 1"); 136 | 137 | upsweep(buffer, count, num_partitions); // Also clear last 138 | downsweep(buffer, count, num_partitions); 139 | } 140 | 141 | private: 142 | void upsweep(GLuint buffer, size_t count, size_t num_partitions) // Also clear last 143 | { 144 | m_upsweep_program.use(); 145 | 146 | glUniform1ui(m_upsweep_program.get_uniform_location("u_count"), count); 147 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 148 | 149 | int step = 1; 150 | int level_count = (int) count; 151 | while (true) 152 | { 153 | glUniform1ui(m_upsweep_program.get_uniform_location("u_step"), step); 154 | 155 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 156 | glDispatchCompute(num_workgroups, num_partitions, 1); 157 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 158 | 159 | step <<= 1; 160 | 161 | level_count >>= 1; 162 | 163 | if (level_count <= 1) 164 | break; 165 | } 166 | } 167 | 168 | void downsweep(GLuint buffer, size_t count, size_t num_partitions) 169 | { 170 | m_downsweep_program.use(); 171 | 172 | glUniform1ui(m_downsweep_program.get_uniform_location("u_count"), count); 173 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 174 | 175 | int step = next_power_of_2(int(count)) >> 1; 176 | size_t level_count = 1; 177 | while (true) 178 | { 179 | glUniform1ui(m_downsweep_program.get_uniform_location("u_step"), step); 180 | 181 | size_t num_workgroups = 
div_ceil(level_count, m_num_threads); 182 | glDispatchCompute(num_workgroups, num_partitions, 1); 183 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 184 | 185 | step >>= 1; 186 | level_count <<= 1; 187 | if (step == 0) 188 | break; 189 | } 190 | } 191 | }; 192 | } // namespace glu 193 | 194 | #endif // GLU_BLELLOCHSCAN_HPP 195 | -------------------------------------------------------------------------------- /glu/RadixSort.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_RADIXSORT_HPP 2 | #define GLU_RADIXSORT_HPP 3 | 4 | #include "BlellochScan.hpp" 5 | #include "gl_utils.hpp" 6 | 7 | namespace glu 8 | { 9 | namespace detail 10 | { 11 | inline const char* k_radix_sort_counting_shader = R"( 12 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 13 | 14 | layout(std430, binding = 0) readonly buffer KeyBuffer 15 | { 16 | uint b_key_buffer[]; 17 | }; 18 | 19 | layout(std430, binding = 1) buffer BlockCountBuffer 20 | { 21 | uint b_block_count_buffer[]; // 16 * NUM_THREADS 22 | }; 23 | 24 | layout(std430, binding = 2) buffer GlobalCountBuffer 25 | { 26 | uint b_global_count_buffer[]; 27 | }; 28 | 29 | layout(location = 0) uniform uint u_count; 30 | layout(location = 1) uniform uint u_radix_shift; 31 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 32 | 33 | void main() 34 | { 35 | for (uint radix = 0; radix < 16; radix++) 36 | { 37 | b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] = 0; 38 | } 39 | 40 | barrier(); 41 | 42 | uint i = gl_GlobalInvocationID.x; 43 | if (i < u_count) 44 | { 45 | // Block-wide count on shared memory 46 | uint radix = (b_key_buffer[i] >> u_radix_shift) & 0xf; 47 | atomicAdd(b_block_count_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x], 1); 48 | } 49 | 50 | barrier(); 51 | 52 | if (gl_LocalInvocationIndex < 16) 53 | { 54 | uint block_count = b_block_count_buffer[gl_LocalInvocationIndex * u_num_blocks_power_of_2 + gl_WorkGroupID.x]; 55 | atomicAdd(b_global_count_buffer[gl_LocalInvocationIndex], block_count); 56 | } 57 | } 58 | )"; 59 | 60 | inline const char* k_radix_sort_reordering_shader = R"( 61 | #extension GL_KHR_shader_subgroup_arithmetic : require 62 | 63 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 64 | 65 | layout(std430, binding = 0) readonly buffer SrcKeyBuffer 66 | { 67 | uint b_src_key_buffer[]; 68 | }; 69 | 70 | layout(std430, binding = 1) readonly buffer SrcValBuffer 71 | { 72 | uint b_src_val_buffer[]; 73 | }; 74 | 75 | layout(std430, binding = 2) writeonly buffer DstKeyBuffer 76 | { 77 | uint b_dst_key_buffer[]; 78 | }; 79 | 80 | layout(std430, binding = 3) writeonly buffer DstValBuffer 81 | { 82 | uint b_dst_val_buffer[]; 83 | }; 84 | 85 | layout(std430, binding = 4) readonly buffer BlockOffsetBuffer 86 | { 87 | uint b_block_offset_buffer[]; 88 | }; 89 | 90 | layout(std430, binding = 5) readonly buffer GlobalCountBuffer 91 | { 92 | uint b_global_count_buffer[]; 93 | }; 94 | 95 | layout(location = 0) uniform uint u_count; 96 | layout(location = 1) uniform uint u_radix_shift; 97 | layout(location = 2) uniform uint u_num_blocks_power_of_2; 98 | 99 | shared uint s_global_offset_buffer[16]; 100 | shared uint s_prefix_sum_buffer[NUM_THREADS]; 101 | 102 | void prefix_sum() // Block-wide prefix sum (Blelloch scan) 103 | { 104 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 105 | 106 | // Upsweep 107 | for (uint step = 1; step < NUM_THREADS; step <<= 1) 108 | { 109 | if 
(thread_i % 2 == 1) 110 | { 111 | uint i = thread_i * step + (step - 1); 112 | if (i < NUM_THREADS) 113 | { 114 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i] + s_prefix_sum_buffer[i - step]; 115 | } 116 | } 117 | 118 | barrier(); 119 | } 120 | 121 | // Clear last 122 | if (thread_i == NUM_THREADS - 1) s_prefix_sum_buffer[thread_i] = 0; 123 | 124 | barrier(); 125 | 126 | // Downsweep 127 | uint step = NUM_THREADS >> 1; 128 | for (; step > 0; step >>= 1) 129 | { 130 | uint i = thread_i * step + (step - 1); 131 | if (i + step < NUM_THREADS && thread_i % 2 == 0) 132 | { 133 | uint tmp = s_prefix_sum_buffer[i]; 134 | s_prefix_sum_buffer[i] = s_prefix_sum_buffer[i + step]; 135 | s_prefix_sum_buffer[i + step] = tmp + s_prefix_sum_buffer[i + step]; 136 | } 137 | 138 | barrier(); 139 | } 140 | } 141 | 142 | void main() 143 | { 144 | uint thread_i = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; 145 | uint i = gl_WorkGroupID.x * NUM_THREADS + thread_i; 146 | 147 | // Prefix sum on global counts to obtain global offsets 148 | if (gl_SubgroupID == 0 && gl_SubgroupInvocationID < 16) 149 | { 150 | uint v = subgroupExclusiveAdd(b_global_count_buffer[gl_SubgroupInvocationID]); 151 | s_global_offset_buffer[gl_SubgroupInvocationID] = v; 152 | } 153 | 154 | barrier(); 155 | 156 | // Reordering 157 | for (uint radix = 0; radix < 16; radix++) 158 | { 159 | bool should_place = false; 160 | if (i < u_count) 161 | { 162 | should_place = ((b_src_key_buffer[i] >> u_radix_shift) & 0xf) == radix; 163 | } 164 | 165 | s_prefix_sum_buffer[thread_i] = should_place ? 1 : 0; 166 | 167 | barrier(); 168 | 169 | // Prefix sum on local counts to obtain local offsets 170 | prefix_sum(); 171 | 172 | if (should_place) 173 | { 174 | uint di = 175 | s_global_offset_buffer[radix] + 176 | b_block_offset_buffer[radix * u_num_blocks_power_of_2 + gl_WorkGroupID.x] + 177 | s_prefix_sum_buffer[thread_i]; 178 | b_dst_key_buffer[di] = b_src_key_buffer[i]; 179 | b_dst_val_buffer[di] = b_src_val_buffer[i]; 180 | } 181 | } 182 | } 183 | )"; 184 | } // namespace detail 185 | 186 | class RadixSort 187 | { 188 | private: 189 | Program m_count_program; 190 | BlellochScan m_blelloch_scan; 191 | Program m_reorder_program; 192 | 193 | /// A GLuint buffer of size 16 * NUM_THREADS that stores the counts of radixes per block. 194 | ShaderStorageBuffer m_block_count_buffer; 195 | 196 | /// A GLuint buffer of size 16 that stores the global counts of radixes. 
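/// Keys are consumed one 4-bit digit at a time (see u_radix_shift and the `& 0xf` masks), hence
/// the 16 counters; a full 32-bit key therefore takes 8 counting/reordering passes.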
197 | ShaderStorageBuffer m_global_count_buffer; 198 | 199 | ShaderStorageBuffer m_key_scratch_buffer; 200 | ShaderStorageBuffer m_val_scratch_buffer; 201 | 202 | const size_t m_num_threads; 203 | 204 | public: 205 | explicit RadixSort() : 206 | m_blelloch_scan(DataType_Uint), 207 | m_num_threads(1024) 208 | { 209 | GLU_CHECK_ARGUMENT(is_power_of_2(m_num_threads), "Num threads must be a power of 2"); 210 | 211 | m_global_count_buffer.resize(16 * sizeof(GLuint)); 212 | 213 | std::string shader_src = "#version 460\n\n"; 214 | shader_src += "#define NUM_THREADS " + std::to_string(m_num_threads) + "\n"; 215 | 216 | { // Counting program 217 | Shader shader(GL_COMPUTE_SHADER); 218 | shader.source_from_str(shader_src + detail::k_radix_sort_counting_shader); 219 | shader.compile(); 220 | 221 | m_count_program.attach_shader(shader.handle()); 222 | m_count_program.link(); 223 | } 224 | 225 | { // Reordering program 226 | Shader shader(GL_COMPUTE_SHADER); 227 | shader.source_from_str(shader_src + detail::k_radix_sort_reordering_shader); 228 | shader.compile(); 229 | 230 | m_reorder_program.attach_shader(shader.handle()); 231 | m_reorder_program.link(); 232 | } 233 | } 234 | 235 | ~RadixSort() = default; 236 | 237 | void prepare_internal_buffers(size_t count) 238 | { 239 | { // Prepare block count buffer 240 | size_t required_size = required_block_count_buffer_size(count); 241 | if (m_block_count_buffer.size() < required_size) 242 | { 243 | m_block_count_buffer.resize(required_size, false); 244 | #ifdef GLU_VERBOSE // TODO Create a log utility 245 | printf("[RadixSort] Block count buffer reallocated to: %zu\n", required_size); 246 | #endif 247 | } 248 | } 249 | 250 | { // Prepare key scratch buffer 251 | size_t required_size = required_key_scratch_buffer_size(count); 252 | if (m_key_scratch_buffer.size() < required_size) 253 | { 254 | m_key_scratch_buffer.resize(required_size, false); 255 | #ifdef GLU_VERBOSE 256 | printf("[RadixSort] Key scratch buffer reallocated to: %zu\n", required_size); 257 | #endif 258 | } 259 | } 260 | 261 | { // Prepare val scratch buffer 262 | size_t required_size = required_val_scratch_buffer_size(count); 263 | if (m_val_scratch_buffer.size() < required_size) 264 | { 265 | m_val_scratch_buffer.resize(required_size, false); 266 | #ifdef GLU_VERBOSE 267 | printf("[RadixSort] Val scratch buffer reallocated to: %zu\n", required_size); 268 | #endif 269 | } 270 | } 271 | } 272 | 273 | void operator()(GLuint key_buffer, GLuint val_buffer, size_t count, size_t num_steps = 0) 274 | { 275 | GLU_CHECK_ARGUMENT(key_buffer, "Invalid key buffer"); 276 | GLU_CHECK_ARGUMENT(val_buffer, "Invalid value buffer"); 277 | 278 | if (count <= 1) 279 | return; // Hey, that's already sorted x) 280 | 281 | prepare_internal_buffers(count); 282 | 283 | size_t num_blocks = div_ceil(count, size_t(1024)); 284 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 285 | 286 | GLuint key_buffers[]{key_buffer, m_key_scratch_buffer.handle()}; 287 | GLuint val_buffers[]{val_buffer, m_val_scratch_buffer.handle()}; 288 | 289 | for (int step = 0; step < 8;) 290 | { 291 | // ---------------------------------------------------------------- Counting 292 | 293 | m_block_count_buffer.clear(0); 294 | m_global_count_buffer.clear(0); 295 | 296 | m_count_program.use(); 297 | 298 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 299 | m_block_count_buffer.bind(1); 300 | m_global_count_buffer.bind(2); 301 | 302 | 
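// Counting pass: u_radix_shift selects the 4-bit digit for this pass (step << 2), and
// u_num_blocks_power_of_2 is the per-digit row stride of the block-count matrix, padded to a
// power of two because BlellochScan requires a power-of-two partition size.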
glUniform1ui(m_count_program.get_uniform_location("u_count"), count); 303 | glUniform1ui(m_count_program.get_uniform_location("u_radix_shift"), step << 2); 304 | glUniform1ui(m_count_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 305 | 306 | glDispatchCompute(num_blocks, 1, 1); 307 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 308 | 309 | // ---------------------------------------------------------------- Prefix sum 310 | 311 | m_blelloch_scan(m_block_count_buffer.handle(), num_blocks_power_of_2, 16); 312 | 313 | // ---------------------------------------------------------------- Reordering 314 | 315 | m_reorder_program.use(); 316 | 317 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, key_buffers[step % 2]); 318 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, val_buffers[step % 2]); 319 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, key_buffers[(step + 1) % 2]); 320 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, val_buffers[(step + 1) % 2]); 321 | m_block_count_buffer.bind(4); 322 | m_global_count_buffer.bind(5); 323 | 324 | glUniform1ui(m_reorder_program.get_uniform_location("u_count"), count); 325 | glUniform1ui(m_reorder_program.get_uniform_location("u_radix_shift"), step << 2); 326 | glUniform1ui(m_reorder_program.get_uniform_location("u_num_blocks_power_of_2"), num_blocks_power_of_2); 327 | 328 | glDispatchCompute(num_blocks, 1, 1); 329 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 330 | 331 | ++step; 332 | if (step == num_steps || step == 8) break; 333 | } 334 | } 335 | 336 | private: 337 | [[nodiscard]] static size_t required_block_count_buffer_size(size_t count) 338 | { 339 | size_t num_blocks = div_ceil(count, size_t(1024)); 340 | size_t num_blocks_power_of_2 = next_power_of_2(num_blocks); // Required by BlellochScan 341 | 342 | return next_power_of_2(16 * num_blocks_power_of_2) * sizeof(GLuint); 343 | } 344 | 345 | [[nodiscard]] static size_t required_key_scratch_buffer_size(size_t count) 346 | { 347 | return next_power_of_2(count) * sizeof(GLuint); 348 | } 349 | 350 | [[nodiscard]] static size_t required_val_scratch_buffer_size(size_t count) 351 | { 352 | return next_power_of_2(count) * sizeof(GLuint); 353 | } 354 | }; 355 | } // namespace glu 356 | 357 | #endif // GLU_RADIXSORT_HPP 358 | -------------------------------------------------------------------------------- /glu/Reduce.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_REDUCE_HPP 2 | #define GLU_REDUCE_HPP 3 | 4 | #include "data_types.hpp" 5 | #include "gl_utils.hpp" 6 | 7 | namespace glu 8 | { 9 | namespace detail 10 | { 11 | inline const char* k_reduction_shader_src = R"( 12 | #extension GL_KHR_shader_subgroup_arithmetic : require 13 | 14 | layout(local_size_x = NUM_THREADS, local_size_y = 1, local_size_z = 1) in; 15 | 16 | layout(std430, binding = 0) buffer Buffer 17 | { 18 | DATA_TYPE data[]; 19 | }; 20 | 21 | layout(location = 0) uniform uint u_count; 22 | layout(location = 1) uniform uint u_depth; 23 | 24 | void main() 25 | { 26 | uint step = 1 << (5 * u_depth); 27 | uint subgroup_i = gl_WorkGroupID.x * NUM_THREADS + gl_SubgroupID * gl_SubgroupSize; 28 | uint i = (subgroup_i + gl_SubgroupInvocationID) * step; 29 | if (i < u_count) 30 | { 31 | DATA_TYPE r = SUBGROUP_OPERATION(data[i]); 32 | if (gl_SubgroupInvocationID == 0) 33 | { 34 | data[i] = r; 35 | } 36 | } 37 | } 38 | )"; 39 | } 40 | 41 | /// The operators that can be used for the reduction operation. 
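/// Each operator maps to the corresponding KHR subgroup reduction in the generated shader
/// (subgroupAdd, subgroupMul, subgroupMin and subgroupMax respectively).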
42 | enum ReduceOperator 43 | { 44 | ReduceOperator_Sum = 0, 45 | ReduceOperator_Mul, 46 | ReduceOperator_Min, 47 | ReduceOperator_Max 48 | }; 49 | 50 | /// A class that implements the reduction operation. 51 | class Reduce 52 | { 53 | private: 54 | const DataType m_data_type; 55 | const ReduceOperator m_operator; 56 | const size_t m_num_threads; 57 | const size_t m_num_items; 58 | 59 | Program m_program; 60 | 61 | public: 62 | explicit Reduce(DataType data_type, ReduceOperator operator_) : 63 | m_data_type(data_type), 64 | m_operator(operator_), 65 | m_num_threads(1024), 66 | m_num_items(4) 67 | { 68 | std::string shader_src = "#version 460\n\n"; 69 | 70 | shader_src += std::string("#define DATA_TYPE ") + to_glsl_type_str(m_data_type) + "\n"; 71 | shader_src += std::string("#define NUM_THREADS ") + std::to_string(m_num_threads) + "\n"; 72 | shader_src += std::string("#define NUM_ITEMS ") + std::to_string(m_num_items) + "\n"; 73 | 74 | if (m_operator == ReduceOperator_Sum) 75 | { 76 | shader_src += "#define OPERATOR(a, b) (a + b)\n"; 77 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupAdd(value)\n"; 78 | } 79 | else if (m_operator == ReduceOperator_Mul) 80 | { 81 | shader_src += "#define OPERATOR(a, b) (a * b)\n"; 82 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMul(value)\n"; 83 | } 84 | else if (m_operator == ReduceOperator_Min) 85 | { 86 | shader_src += "#define OPERATOR(a, b) (min(a, b))\n"; 87 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMin(value)\n"; 88 | } 89 | else if (m_operator == ReduceOperator_Max) 90 | { 91 | shader_src += "#define OPERATOR(a, b) (max(a, b))\n"; 92 | shader_src += "#define SUBGROUP_OPERATION(value) subgroupMax(value)\n"; 93 | } 94 | else 95 | { 96 | GLU_FAIL("Invalid reduction operator: %d", m_operator); 97 | } 98 | 99 | shader_src += detail::k_reduction_shader_src; 100 | 101 | Shader shader(GL_COMPUTE_SHADER); 102 | shader.source_from_str(shader_src.c_str()); 103 | shader.compile(); 104 | 105 | m_program.attach_shader(shader); 106 | m_program.link(); 107 | } 108 | 109 | ~Reduce() = default; 110 | 111 | void operator()(GLuint buffer, size_t count) 112 | { 113 | GLU_CHECK_ARGUMENT(buffer, "Invalid buffer"); 114 | GLU_CHECK_ARGUMENT(count > 0, "Count must be greater than zero"); 115 | 116 | m_program.use(); 117 | 118 | glUniform1ui(m_program.get_uniform_location("u_count"), count); 119 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, buffer); 120 | 121 | for (int depth = 0;; depth++) 122 | { 123 | int step = 1 << (5 * depth); 124 | if (step >= count) 125 | break; 126 | 127 | size_t level_count = count >> (5 * depth); 128 | 129 | glUniform1ui(m_program.get_uniform_location("u_depth"), depth); 130 | 131 | size_t num_workgroups = div_ceil(level_count, m_num_threads); 132 | glDispatchCompute(num_workgroups, 1, 1); 133 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 134 | } 135 | } 136 | }; 137 | } // namespace glu 138 | 139 | #endif // GLU_REDUCE_HPP 140 | -------------------------------------------------------------------------------- /glu/data_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_DATA_TYPES_HPP 2 | #define GLU_DATA_TYPES_HPP 3 | 4 | #include "errors.hpp" 5 | 6 | namespace glu 7 | { 8 | enum DataType 9 | { 10 | DataType_Float = 0, 11 | DataType_Double, 12 | DataType_Int, 13 | DataType_Uint, 14 | DataType_Vec2, 15 | DataType_Vec4, 16 | DataType_DVec2, 17 | DataType_DVec4, 18 | DataType_UVec2, 19 | DataType_UVec4, 20 | DataType_IVec2, 21 | DataType_IVec4 22 | 
}; 23 | 24 | inline const char* to_glsl_type_str(DataType data_type) 25 | { 26 | // clang-format off 27 | if (data_type == DataType_Float) return "float"; 28 | else if (data_type == DataType_Double) return "double"; 29 | else if (data_type == DataType_Int) return "int"; 30 | else if (data_type == DataType_Uint) return "uint"; 31 | else if (data_type == DataType_Vec2) return "vec2"; 32 | else if (data_type == DataType_Vec4) return "vec4"; 33 | else if (data_type == DataType_DVec2) return "dvec2"; 34 | else if (data_type == DataType_DVec4) return "dvec4"; 35 | else if (data_type == DataType_UVec2) return "uvec2"; 36 | else if (data_type == DataType_UVec4) return "uvec4"; 37 | else if (data_type == DataType_IVec2) return "ivec2"; 38 | else if (data_type == DataType_IVec4) return "ivec4"; 39 | else 40 | { 41 | GLU_FAIL("Invalid data type: %d", data_type); 42 | } 43 | // clang-format on 44 | } 45 | 46 | } // namespace glu 47 | 48 | #endif // GLU_DATA_TYPES_HPP 49 | -------------------------------------------------------------------------------- /glu/errors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_ERRORS_HPP 2 | #define GLU_ERRORS_HPP 3 | 4 | #include 5 | #include 6 | 7 | // TODO mark if (!condition_) as unlikely 8 | #define GLU_CHECK_STATE(condition_, ...) \ 9 | { \ 10 | if (!(condition_)) \ 11 | { \ 12 | fprintf(stderr, __VA_ARGS__); \ 13 | exit(1); \ 14 | } \ 15 | } 16 | 17 | #define GLU_CHECK_ARGUMENT(condition_, ...) GLU_CHECK_STATE(condition_, __VA_ARGS__) 18 | #define GLU_FAIL(...) GLU_CHECK_STATE(false, __VA_ARGS__) 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /glu/gl_utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GLU_GL_UTILS_HPP 2 | #define GLU_GL_UTILS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "errors.hpp" 10 | 11 | namespace glu 12 | { 13 | inline void 14 | copy_buffer(GLuint src_buffer, GLuint dst_buffer, size_t size, size_t src_offset = 0, size_t dst_offset = 0) 15 | { 16 | glBindBuffer(GL_COPY_READ_BUFFER, src_buffer); 17 | glBindBuffer(GL_COPY_WRITE_BUFFER, dst_buffer); 18 | 19 | glCopyBufferSubData( 20 | GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, (GLintptr) src_offset, (GLintptr) dst_offset, (GLsizeiptr) size 21 | ); 22 | } 23 | 24 | /// A RAII wrapper for GL shader. 
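/// The underlying shader object is deleted when the wrapper is destroyed; compile() prints the info log and exits on failure.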
25 | class Shader 26 | { 27 | private: 28 | GLuint m_handle; 29 | 30 | public: 31 | explicit Shader(GLenum type) : 32 | m_handle(glCreateShader(type)){}; 33 | Shader(const Shader&) = delete; 34 | 35 | Shader(Shader&& other) noexcept 36 | { 37 | m_handle = other.m_handle; 38 | other.m_handle = 0; 39 | } 40 | 41 | ~Shader() { glDeleteShader(m_handle); } 42 | 43 | [[nodiscard]] GLuint handle() const { return m_handle; } 44 | 45 | void source_from_str(const std::string& src_str) 46 | { 47 | const char* src_ptr = src_str.c_str(); 48 | glShaderSource(m_handle, 1, &src_ptr, nullptr); 49 | } 50 | 51 | void source_from_file(const char* src_filepath) 52 | { 53 | FILE* file = fopen(src_filepath, "rt"); 54 | GLU_CHECK_STATE(file, "Failed to open shader file: %s", src_filepath); 55 | 56 | fseek(file, 0, SEEK_END); 57 | size_t file_size = ftell(file); 58 | fseek(file, 0, SEEK_SET); 59 | 60 | std::string src{}; 61 | src.resize(file_size); 62 | fread(src.data(), sizeof(char), file_size, file); 63 | source_from_str(src.c_str()); 64 | 65 | fclose(file); 66 | } 67 | 68 | std::string get_info_log() 69 | { 70 | GLint log_length = 0; 71 | glGetShaderiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 72 | 73 | std::vector<GLchar> log(log_length); 74 | glGetShaderInfoLog(m_handle, log_length, nullptr, log.data()); 75 | return {log.begin(), log.end()}; 76 | } 77 | 78 | void compile() 79 | { 80 | glCompileShader(m_handle); 81 | 82 | GLint status; 83 | glGetShaderiv(m_handle, GL_COMPILE_STATUS, &status); 84 | if (!status) 85 | { 86 | GLU_CHECK_STATE(status, "Shader failed to compile: %s", get_info_log().c_str()); 87 | } 88 | } 89 | }; 90 | 91 | /// A RAII wrapper for GL program. 92 | class Program 93 | { 94 | private: 95 | GLuint m_handle; 96 | 97 | public: 98 | explicit Program() { m_handle = glCreateProgram(); }; 99 | Program(const Program&) = delete; 100 | 101 | Program(Program&& other) noexcept 102 | { 103 | m_handle = other.m_handle; 104 | other.m_handle = 0; 105 | } 106 | 107 | ~Program() { glDeleteProgram(m_handle); } 108 | 109 | [[nodiscard]] GLuint handle() const { return m_handle; } 110 | 111 | void attach_shader(GLuint shader_handle) { glAttachShader(m_handle, shader_handle); } 112 | void attach_shader(const Shader& shader) { glAttachShader(m_handle, shader.handle()); } 113 | 114 | [[nodiscard]] std::string get_info_log() const 115 | { 116 | GLint log_length = 0; 117 | glGetProgramiv(m_handle, GL_INFO_LOG_LENGTH, &log_length); 118 | 119 | std::vector<GLchar> log(log_length); 120 | glGetProgramInfoLog(m_handle, log_length, nullptr, log.data()); 121 | return {log.begin(), log.end()}; 122 | } 123 | 124 | void link() 125 | { 126 | GLint status; 127 | glLinkProgram(m_handle); 128 | glGetProgramiv(m_handle, GL_LINK_STATUS, &status); 129 | if (!status) 130 | { 131 | GLU_CHECK_STATE(status, "Program failed to link: %s", get_info_log().c_str()); 132 | } 133 | } 134 | 135 | void use() { glUseProgram(m_handle); } 136 | 137 | GLint get_uniform_location(const char* uniform_name) 138 | { 139 | GLint loc = glGetUniformLocation(m_handle, uniform_name); 140 | GLU_CHECK_STATE(loc >= 0, "Failed to get uniform location: %s", uniform_name); 141 | return loc; 142 | } 143 | }; 144 | 145 | /// A RAII helper class for GL shader storage buffer. 
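/// The buffer uses immutable storage (glBufferStorage), so resize() allocates a new buffer object and optionally copies the old contents over.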
146 | class ShaderStorageBuffer 147 | { 148 | private: 149 | GLuint m_handle = 0; 150 | size_t m_size = 0; 151 | 152 | public: 153 | explicit ShaderStorageBuffer(size_t initial_size = 0) 154 | { 155 | if (initial_size > 0) 156 | resize(initial_size, false); 157 | } 158 | 159 | explicit ShaderStorageBuffer(const void* data, size_t size) : 160 | m_size(size) 161 | { 162 | GLU_CHECK_ARGUMENT(data, ""); 163 | GLU_CHECK_ARGUMENT(size > 0, ""); 164 | 165 | glCreateBuffers(1, &m_handle); 166 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 167 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, data, GL_DYNAMIC_STORAGE_BIT); 168 | } 169 | 170 | template 171 | explicit ShaderStorageBuffer(const std::vector& data) : 172 | ShaderStorageBuffer(data.data(), data.size() * sizeof(T)) 173 | { 174 | } 175 | 176 | ShaderStorageBuffer(const ShaderStorageBuffer&) = delete; 177 | ShaderStorageBuffer(ShaderStorageBuffer&& other) noexcept 178 | { 179 | m_handle = other.m_handle; 180 | m_size = other.m_size; 181 | other.m_handle = 0; 182 | } 183 | 184 | ~ShaderStorageBuffer() 185 | { 186 | if (m_handle) 187 | glDeleteBuffers(1, &m_handle); 188 | } 189 | 190 | [[nodiscard]] GLuint handle() const { return m_handle; } 191 | [[nodiscard]] size_t size() const { return m_size; } 192 | 193 | /// Grows or shrinks the buffer. If keep_data, performs an additional copy to maintain the data. 194 | void resize(size_t size, bool keep_data = false) 195 | { 196 | size_t old_size = m_size; 197 | GLuint old_handle = m_handle; 198 | 199 | if (old_size != size) 200 | { 201 | m_size = size; 202 | 203 | glCreateBuffers(1, &m_handle); 204 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 205 | glBufferStorage(GL_SHADER_STORAGE_BUFFER, (GLsizeiptr) m_size, nullptr, GL_DYNAMIC_STORAGE_BIT); 206 | 207 | if (keep_data) 208 | copy_buffer(old_handle, m_handle, std::min(old_size, size)); 209 | 210 | glDeleteBuffers(1, &old_handle); 211 | } 212 | } 213 | 214 | /// Clears the entire buffer with the given GLuint value (repeated). 215 | void clear(GLuint value) 216 | { 217 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 218 | glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED, GL_UNSIGNED_INT, &value); 219 | } 220 | 221 | void write_data(const void* data, size_t size) 222 | { 223 | GLU_CHECK_ARGUMENT(size <= m_size, ""); 224 | 225 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 226 | glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, size, data); 227 | } 228 | 229 | template 230 | std::vector get_data() const 231 | { 232 | GLU_CHECK_ARGUMENT(m_size % sizeof(T) == 0, "Size %zu isn't a multiple of %zu", m_size, sizeof(T)); 233 | 234 | std::vector result(m_size / sizeof(T)); 235 | glBindBuffer(GL_SHADER_STORAGE_BUFFER, m_handle); 236 | glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, (GLsizeiptr) m_size, result.data()); 237 | return result; 238 | } 239 | 240 | void bind(GLuint index, size_t size = 0, size_t offset = 0) 241 | { 242 | if (size == 0) 243 | size = m_size; 244 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, m_handle, (GLintptr) offset, (GLsizeiptr) size); 245 | } 246 | }; 247 | 248 | /// Measures elapsed time on GPU for executing the given callback. 
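/// Uses a GL_TIME_ELAPSED query; the result is in nanoseconds and the call blocks until the timed GPU work has completed.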
249 | inline uint64_t measure_gl_elapsed_time(const std::function& callback) 250 | { 251 | GLuint query; 252 | uint64_t elapsed_time{}; 253 | 254 | glGenQueries(1, &query); 255 | glBeginQuery(GL_TIME_ELAPSED, query); 256 | 257 | callback(); 258 | 259 | glEndQuery(GL_TIME_ELAPSED); 260 | 261 | glGetQueryObjectui64v(query, GL_QUERY_RESULT, &elapsed_time); 262 | glDeleteQueries(1, &query); 263 | 264 | return elapsed_time; 265 | } 266 | 267 | template 268 | IntegerT log32_floor(IntegerT n) 269 | { 270 | return (IntegerT) floor(double(log2(n)) / 5.0); 271 | } 272 | 273 | template 274 | IntegerT log32_ceil(IntegerT n) 275 | { 276 | return (IntegerT) ceil(double(log2(n)) / 5.0); 277 | } 278 | 279 | template 280 | IntegerT div_ceil(IntegerT n, IntegerT d) 281 | { 282 | return (IntegerT) ceil(double(n) / double(d)); 283 | } 284 | 285 | template 286 | bool is_power_of_2(T n) 287 | { 288 | return (n & (n - 1)) == 0; 289 | } 290 | 291 | template 292 | IntegerT next_power_of_2(IntegerT n) 293 | { 294 | n--; 295 | n |= n >> 1; 296 | n |= n >> 2; 297 | n |= n >> 4; 298 | n |= n >> 8; 299 | n |= n >> 16; 300 | n++; 301 | return n; 302 | } 303 | 304 | template 305 | void print_stl_container(Iterator begin, Iterator end) 306 | { 307 | size_t i = 0; 308 | for (; begin != end; begin++) 309 | { 310 | printf("(%zu) %s, ", i, std::to_string(*begin).c_str()); 311 | i++; 312 | } 313 | printf("\n"); 314 | } 315 | 316 | template 317 | void print_buffer(const ShaderStorageBuffer& buffer) 318 | { 319 | std::vector data = buffer.get_data(); 320 | print_stl_container(data.begin(), data.end()); 321 | } 322 | 323 | inline void print_buffer_hex(const ShaderStorageBuffer& buffer) 324 | { 325 | std::vector data = buffer.get_data(); 326 | for (size_t i = 0; i < data.size(); i++) 327 | printf("(%zu) %08x, ", i, data[i]); 328 | printf("\n"); 329 | } 330 | } // namespace glu 331 | 332 | #endif // GLU_GL_UTILS_HPP 333 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(glu_test 2 | main.cpp 3 | reduce_tests.cpp 4 | blelloch_scan_tests.cpp 5 | radix_sort_tests.cpp 6 | 7 | # These source files test the correct generation of the dist/* files 8 | generated/test_include_BlellochScan.cpp 9 | generated/test_include_RadixSort.cpp 10 | generated/test_include_Reduce.cpp 11 | ) 12 | 13 | target_link_libraries(glu_test PRIVATE glu) 14 | 15 | target_link_libraries(glu_test PRIVATE Catch2::Catch2) 16 | target_link_libraries(glu_test PRIVATE glad) 17 | target_link_libraries(glu_test PRIVATE glm) 18 | target_link_libraries(glu_test PRIVATE glfw) 19 | target_link_libraries(glu_test PRIVATE renderdoc) 20 | -------------------------------------------------------------------------------- /test/blelloch_scan_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "glu/BlellochScan.hpp" 7 | #include "util/Random.hpp" 8 | #include "util/StopWatch.hpp" 9 | 10 | using namespace glu; 11 | 12 | TEST_CASE("BlellochScan-simple", "[.]") 13 | { 14 | const std::vector data{1, 2, 3, 4, 5, 6, 7, 8}; 15 | 16 | ShaderStorageBuffer buffer(data); 17 | 18 | printf("Input:\n"); 19 | print_buffer(buffer); 20 | 21 | BlellochScan blelloch_scan(DataType_Uint); 22 | 23 | printf("Output:\n"); 24 | blelloch_scan(buffer.handle(), data.size()); 25 | print_buffer(buffer); 26 | } 27 | 28 | 
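// For reference, the exclusive scan of {1, 2, 3, 4, 5, 6, 7, 8} is {0, 1, 3, 6, 10, 15, 21, 28}; the checked tests below compare against std::exclusive_scan.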
TEST_CASE("BlellochScan-multiple-sizes") 29 | { 30 | const uint64_t k_seed = 123; 31 | const size_t k_num_elements = 32 | GENERATE(1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576); 33 | 34 | Random random(k_seed); 35 | 36 | std::vector data = random.sample_int_vector(k_num_elements, 0, 100); 37 | 38 | ShaderStorageBuffer buffer(data); 39 | 40 | BlellochScan blelloch_scan(DataType_Uint); 41 | blelloch_scan(buffer.handle(), data.size()); 42 | 43 | std::vector expected(k_num_elements); 44 | std::exclusive_scan(data.begin(), data.end(), expected.begin(), 0); 45 | REQUIRE(buffer.get_data() == expected); 46 | } 47 | 48 | TEST_CASE("BlellochScan-multiple-partitions") 49 | { 50 | const uint64_t k_seed = 123; 51 | const size_t k_num_elements = 1024; 52 | const size_t k_num_partitions = GENERATE(1, 32, 100, 1000); 53 | 54 | Random random(k_seed); 55 | 56 | // Generate a random buffer containing data for all partitions 57 | std::vector data = random.sample_int_vector(k_num_elements * k_num_partitions, 0, 100); 58 | 59 | ShaderStorageBuffer buffer(data); 60 | 61 | // Run blelloch scan on all partitions 62 | BlellochScan blelloch_scan(DataType_Uint); 63 | blelloch_scan(buffer.handle(), k_num_elements, k_num_partitions); 64 | 65 | // Get the result host-side 66 | std::vector result = buffer.get_data(); 67 | 68 | auto data_begin = data.begin(); 69 | auto result_begin = result.begin(); 70 | 71 | // Check that exclusive scan was run for every partition 72 | for (int partition = 0; partition < k_num_partitions; partition++) 73 | { 74 | std::vector expected_result(k_num_elements); 75 | std::exclusive_scan(data_begin, data_begin + k_num_elements, expected_result.begin(), 0); 76 | 77 | REQUIRE(std::memcmp(expected_result.data(), &(*result_begin), k_num_elements * sizeof(GLuint)) == 0); 78 | 79 | data_begin += k_num_elements; 80 | result_begin += k_num_elements; 81 | } 82 | } 83 | 84 | TEST_CASE("BlellochScan-benchmark", "[.][benchmark]") 85 | { 86 | const size_t k_num_elements = GENERATE( 87 | 1024, // 1KB 88 | 16384, // 16KB 89 | 65536, // 65KB 90 | 131072, // 131KB 91 | 524288, // 524KB 92 | 1048576, // 1MB 93 | 16777216, // 16MB 94 | 67108864, // 67MB 95 | 134217728, // 134MB 96 | 268435456 // 268MB 97 | ); 98 | 99 | std::vector data(k_num_elements); // Don't need to initialize the vector for benchmarking 100 | 101 | ShaderStorageBuffer buffer(data); 102 | 103 | BlellochScan blelloch_scan(DataType_Uint); 104 | 105 | uint64_t ns = measure_gl_elapsed_time([&]() { blelloch_scan(buffer.handle(), k_num_elements); }); 106 | 107 | printf("BlellochScan; Num elements: %zu, Elapsed: %s\n", k_num_elements, ns_to_human_string(ns).c_str()); 108 | } 109 | -------------------------------------------------------------------------------- /test/generated/test_include_BlellochScan.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "dist/BlellochScan.hpp" 3 | -------------------------------------------------------------------------------- /test/generated/test_include_RadixSort.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "dist/RadixSort.hpp" 3 | -------------------------------------------------------------------------------- /test/generated/test_include_Reduce.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "dist/Reduce.hpp" 3 | -------------------------------------------------------------------------------- 
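A minimal usage sketch, placed here for orientation before the test driver: it strings together the primitives shown above under the same assumptions the tests make (a current OpenGL 4.6 context with glad loaded, as set up in test/main.cpp). The function name `example` and the explicit `<GLuint>` template arguments are illustrative, not part of the library.

```cpp
#include <vector>

#include <glad/glad.h> // GL 4.6 symbols must be available before the GLU headers

#include "glu/RadixSort.hpp"
#include "glu/Reduce.hpp"

// Assumes a current OpenGL 4.6 context (e.g. the hidden GLFW window used by the tests).
void example()
{
    std::vector<GLuint> keys{5, 3, 9, 1, 7};
    std::vector<GLuint> vals{0, 1, 2, 3, 4}; // payload, reordered together with the keys

    glu::ShaderStorageBuffer key_buffer(keys);
    glu::ShaderStorageBuffer val_buffer(vals);

    // Sort key/value pairs by key; the sorted keys end up back in key_buffer.
    glu::RadixSort radix_sort;
    radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size());
    std::vector<GLuint> sorted_keys = key_buffer.get_data<GLuint>();

    // Reduce (sum) in place; the total is left in element 0.
    glu::Reduce reduce(glu::DataType_Uint, glu::ReduceOperator_Sum);
    reduce(key_buffer.handle(), keys.size());
    GLuint sum = key_buffer.get_data<GLuint>()[0];
}
```

Both utilities work on the caller's buffers; RadixSort allocates its internal scratch buffers on first use, or ahead of time via prepare_internal_buffers() as the benchmark does.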
/test/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // clang-format off 5 | #include 6 | #include 7 | // clang-format on 8 | 9 | #include 10 | 11 | void print_gl_debug() 12 | { 13 | GLint v1, v2, v3; 14 | 15 | printf("---------------------------------------------------------------- Device info\n"); 16 | 17 | printf("Device: %s\n", glGetString(GL_RENDERER)); 18 | printf("Vendor: %s\n", glGetString(GL_VENDOR)); 19 | printf("Version: %s\n", glGetString(GL_VERSION)); 20 | printf("GLSL version: %s\n", glGetString(GL_SHADING_LANGUAGE_VERSION)); 21 | 22 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, &v1); 23 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 1, &v2); 24 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 2, &v3); 25 | printf("GL_MAX_COMPUTE_WORK_GROUP_COUNT: (%d, %d, %d)\n", v1, v2, v3); 26 | 27 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &v1); 28 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &v2); 29 | glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 2, &v3); 30 | printf("GL_MAX_COMPUTE_WORK_GROUP_SIZE: (%d, %d, %d)\n", v1, v2, v3); 31 | 32 | glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &v1); 33 | printf("GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS: %d\n", v1); 34 | glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &v1); 35 | printf("GL_MAX_COMPUTE_SHARED_MEMORY_SIZE: %d\n", v1); 36 | glGetIntegerv(GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS, &v1); 37 | printf("GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS: %d\n", v1); 38 | glGetIntegerv(GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS, &v1); 39 | printf("GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS: %d\n", v1); 40 | 41 | glGetIntegerv(GL_WARP_SIZE_NV, &v1); 42 | printf("GL_WARP_SIZE_NV: %d\n", v1); 43 | 44 | glGetIntegerv(GL_SUBGROUP_SIZE_KHR, &v1); 45 | printf("GL_SUBGROUP_SIZE_KHR: %d\n", v1); 46 | 47 | printf("----------------------------------------------------------------\n"); 48 | } 49 | 50 | void GLAPIENTRY debug_message_callback( 51 | GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar* message, const void* userParam 52 | ) 53 | { 54 | if (GL_DEBUG_TYPE_ERROR == type && severity <= GL_DEBUG_SEVERITY_HIGH) 55 | { 56 | fprintf(stderr, "GL CALLBACK: type = 0x%x, severity = 0x%x, message = %s\n", type, severity, message); 57 | } 58 | } 59 | 60 | int main(int argc, char* argv[]) 61 | { 62 | setvbuf(stdout, nullptr, _IONBF, 0); 63 | 64 | if (glfwInit() == GLFW_FALSE) 65 | { 66 | fprintf(stderr, "Failed to initialize GLFW"); 67 | exit(1); 68 | } 69 | 70 | glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); 71 | glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); 72 | glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6); 73 | 74 | glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); 75 | 76 | GLFWwindow* window = glfwCreateWindow(500, 500, "GLU", nullptr, nullptr); 77 | if (window == nullptr) 78 | { 79 | fprintf(stderr, "Failed to create GLFW window"); 80 | exit(1); 81 | } 82 | 83 | glfwMakeContextCurrent(window); 84 | 85 | if (!gladLoadGLLoader((GLADloadproc) glfwGetProcAddress)) 86 | { 87 | fprintf(stderr, "Failed to load GL"); 88 | exit(1); 89 | } 90 | 91 | glEnable(GL_DEBUG_OUTPUT); 92 | glDebugMessageCallback(debug_message_callback, nullptr); 93 | 94 | print_gl_debug(); 95 | 96 | std::vector catch2_args; 97 | for (int i = 0; i < argc; i++) 98 | catch2_args.emplace_back(argv[i]); 99 | 100 | // Show the test and sections names in stdout 101 | catch2_args.emplace_back("--reporter console::out=%stdout"); 102 | 103 | int result = 
Catch::Session().run((int) catch2_args.size(), catch2_args.data()); 104 | 105 | glfwDestroyWindow(window); 106 | glfwTerminate(); 107 | 108 | return result; 109 | } 110 | -------------------------------------------------------------------------------- /test/radix_sort_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "glu/RadixSort.hpp" 13 | #include "util/Random.hpp" 14 | #include "util/StopWatch.hpp" 15 | 16 | using namespace glu; 17 | 18 | namespace 19 | { 20 | /// Builds an histogram counting the values in the given vector. 21 | template 22 | std::unordered_map build_value_histogram(const std::vector& vector) 23 | { 24 | std::unordered_map histogram; 25 | for (const T& entry : vector) 26 | { 27 | auto [iterator, inserted] = histogram.emplace(entry, 1); 28 | if (!inserted) 29 | iterator->second++; 30 | } 31 | return histogram; 32 | } 33 | 34 | /// Checks that vector1 is a permutation of vector2. 35 | template 36 | void check_permutation(const std::vector& vector1, const std::vector& vector2) 37 | { 38 | CHECK(vector1.size() == vector2.size()); 39 | 40 | std::unordered_map histogram1 = build_value_histogram(vector1); 41 | std::unordered_map histogram2 = build_value_histogram(vector2); 42 | CHECK(histogram1 == histogram2); 43 | } 44 | 45 | /// Checks whether the given vector is sorted. 46 | template 47 | void check_sorted(const std::vector& vector) 48 | { 49 | CAPTURE(vector); 50 | CHECK(std::is_sorted(vector.begin(), vector.end())); 51 | } 52 | } // namespace 53 | 54 | TEST_CASE("RadixSort-simple", "[.]") 55 | { 56 | const size_t k_num_elements = 10; 57 | 58 | const uint64_t k_seed = 1; 59 | Random random(k_seed); 60 | 61 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 62 | 63 | std::vector keys = random.sample_int_vector(k_num_elements, 0, UINT32_MAX); 64 | std::vector vals(k_num_elements); 65 | 66 | REQUIRE(keys.size() == vals.size()); 67 | 68 | ShaderStorageBuffer key_buffer(keys); 69 | ShaderStorageBuffer val_buffer(vals); 70 | 71 | printf("Input; Key buffer:\n"); 72 | print_buffer(key_buffer); 73 | print_buffer_hex(key_buffer); 74 | 75 | RadixSort radix_sort; 76 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 77 | 78 | printf("Output; Key buffer:\n"); 79 | print_buffer(key_buffer); 80 | print_buffer_hex(key_buffer); 81 | 82 | std::vector sorted_keys = key_buffer.get_data(); 83 | 84 | check_permutation(keys, sorted_keys); 85 | check_sorted(sorted_keys); 86 | } 87 | 88 | TEST_CASE("RadixSort-128-256-512-1024") 89 | { 90 | const size_t k_num_elements = GENERATE(128, 256, 512, 1024); 91 | 92 | const uint64_t k_seed = 1; 93 | Random random(k_seed); 94 | 95 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 96 | 97 | std::vector keys = random.sample_int_vector(k_num_elements, 0, UINT32_MAX); 98 | std::vector vals(k_num_elements); 99 | 100 | ShaderStorageBuffer key_buffer(keys); 101 | ShaderStorageBuffer val_buffer(vals); 102 | 103 | RadixSort radix_sort; 104 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 105 | 106 | std::vector sorted_keys = key_buffer.get_data(); 107 | 108 | check_permutation(keys, sorted_keys); 109 | check_sorted(sorted_keys); 110 | } 111 | 112 | TEST_CASE("RadixSort-2048") 113 | { 114 | const size_t k_num_elements = 2048; 115 | 116 | const uint64_t k_seed = 1; 117 | Random random(k_seed); 118 | 
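// Keys are sampled from [0, 10), so this case stresses sorting with many duplicate keys.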
119 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 120 | 121 | std::vector keys = random.sample_int_vector(k_num_elements, 0, 10); 122 | std::vector vals(k_num_elements); 123 | 124 | ShaderStorageBuffer key_buffer(keys); 125 | ShaderStorageBuffer val_buffer(vals); 126 | 127 | RadixSort radix_sort; 128 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 129 | 130 | std::vector sorted_keys = key_buffer.get_data(); 131 | 132 | check_permutation(keys, sorted_keys); 133 | check_sorted(sorted_keys); 134 | } 135 | 136 | TEST_CASE("RadixSort-multiple-sizes") 137 | { 138 | const size_t k_num_elements = GENERATE(10993, 14978, 16243, 18985, 23857, 27865, 33363, 41298, 45821, 47487); 139 | 140 | const uint64_t k_seed = 1; 141 | Random random(k_seed); 142 | 143 | printf("Num elements: %zu; Seed: %" PRIu64 "\n", k_num_elements, k_seed); 144 | 145 | std::vector keys = random.sample_int_vector(k_num_elements, 0, UINT32_MAX); 146 | std::vector vals(k_num_elements); 147 | 148 | ShaderStorageBuffer key_buffer(keys); 149 | ShaderStorageBuffer val_buffer(vals); 150 | 151 | RadixSort radix_sort; 152 | radix_sort(key_buffer.handle(), val_buffer.handle(), keys.size()); 153 | 154 | std::vector sorted_keys = key_buffer.get_data(); 155 | 156 | check_permutation(keys, sorted_keys); 157 | check_sorted(sorted_keys); 158 | } 159 | 160 | TEST_CASE("RadixSort-benchmark", "[.][benchmark]") 161 | { 162 | const size_t k_num_elements = GENERATE( 163 | 1024, // 1KB 164 | 16384, // 16KB 165 | 65536, // 65KB 166 | 131072, // 131KB 167 | 524288, // 524KB 168 | 1048576, // 1MB 169 | 2097152, // 2MB 170 | 4194304, // 4MB 171 | 8388608, // 8MB 172 | 16777216, // 16MB 173 | 33554432, // 32MB 174 | 67108864, // 64MB 175 | 134217728, // 128MB 176 | 268435456 // 268MB 177 | ); 178 | 179 | std::vector keys(k_num_elements); // Don't need to initialize the vector for benchmarking 180 | std::vector vals(k_num_elements); 181 | 182 | ShaderStorageBuffer key_buffer(keys); 183 | ShaderStorageBuffer val_buffer(vals); 184 | 185 | RadixSort radix_sort; 186 | 187 | radix_sort.prepare_internal_buffers(k_num_elements); 188 | 189 | uint64_t ns = 190 | measure_gl_elapsed_time([&]() { radix_sort(key_buffer.handle(), val_buffer.handle(), k_num_elements); }); 191 | 192 | printf("Radix sort; Num elements: %zu, Elapsed: %s\n", k_num_elements, ns_to_human_string(ns).c_str()); 193 | } 194 | -------------------------------------------------------------------------------- /test/reduce_tests.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "glu/Reduce.hpp" 8 | #include "util/Random.hpp" 9 | #include "util/StopWatch.hpp" 10 | 11 | using namespace glu; 12 | using namespace Catch::Matchers; 13 | 14 | TEST_CASE("Reduce-simple-uint") 15 | { 16 | const uint32_t k_data[]{32, 35, 1, 3, 95, 10, 22, 24, 44, 37, 7, 80, 33, 54, 46, 23, 14, 84, 11, 67, 17 | 4, 58, 70, 61, 16, 36, 83, 9, 56, 99, 28, 98, 69, 21, 51, 34, 48, 91, 62, 19, 18 | 59, 79, 39, 92, 97, 78, 52, 40, 66, 47, 89, 88, 74, 49, 31, 20, 45, 13, 26, 72, 19 | 43, 30, 65, 94, 63, 8, 60, 15, 93, 86, 41, 75, 12, 73, 55, 90, 64, 96, 53, 1, 20 | 57, 71, 50, 42, 29, 2, 77, 25, 82, 18, 81, 85, 27, 5, 6, 68, 17, 38, 87, 76}; 21 | const size_t k_data_length = std::size(k_data); 22 | 23 | ShaderStorageBuffer buffer(k_data, k_data_length * sizeof(uint32_t)); 24 | 25 | SECTION("sum") 26 | { 27 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 28 | 
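// The reduction runs in place; the final value is left in element 0 of the buffer.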
reduce(buffer.handle(), k_data_length); 29 | CHECK(buffer.get_data()[0] == 4951); 30 | } 31 | 32 | SECTION("mul") 33 | { 34 | Reduce reduce(DataType_Uint, ReduceOperator_Mul); 35 | reduce(buffer.handle(), 5); 36 | CHECK(buffer.get_data()[0] == 319200); 37 | } 38 | 39 | SECTION("min") 40 | { 41 | Reduce reduce(DataType_Uint, ReduceOperator_Min); 42 | reduce(buffer.handle(), k_data_length); 43 | CHECK(buffer.get_data()[0] == 1); 44 | } 45 | 46 | SECTION("max") 47 | { 48 | Reduce reduce(DataType_Uint, ReduceOperator_Max); 49 | reduce(buffer.handle(), k_data_length); 50 | CHECK(buffer.get_data()[0] == 99); 51 | } 52 | } 53 | 54 | TEST_CASE("Reduce-all") 55 | { 56 | SECTION("uint") 57 | { 58 | const std::vector k_data{1, 11, 80, 73, 48, 40, 89, 36, 70, 57}; 59 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 60 | ShaderStorageBuffer buffer(k_data); 61 | reduce(buffer.handle(), k_data.size()); 62 | CHECK(buffer.get_data()[0] == 505); 63 | } 64 | 65 | SECTION("float") 66 | { 67 | const std::vector k_data{42.138f, 18.228f, -19.127f, 86.564f, 11.904f, 68 | 48.538f, 30.606f, 11.338f, -32.699f, -29.587f}; 69 | Reduce reduce(DataType_Float, ReduceOperator_Sum); 70 | ShaderStorageBuffer buffer(k_data); 71 | reduce(buffer.handle(), k_data.size()); 72 | CHECK_THAT(buffer.get_data()[0], WithinAbs(167.9f, 0.1f)); 73 | } 74 | 75 | SECTION("double") 76 | { 77 | const std::vector k_data{-6.20, -56.02, 49.42, 52.38, -23.81, -29.72, 95.46, 77.37, -85.00, 81.74}; 78 | Reduce reduce(DataType_Double, ReduceOperator_Sum); 79 | ShaderStorageBuffer buffer(k_data); 80 | reduce(buffer.handle(), k_data.size()); 81 | CHECK_THAT(buffer.get_data()[0], WithinAbs(155.6, 0.1)); 82 | } 83 | 84 | SECTION("vec2") 85 | { 86 | const std::vector k_data{{-77.08f, 19.54f}, {98.89f, -16.09f}, {10.53f, 91.17f}, {43.06f, -94.18f}, 87 | {-19.18f, 0.86f}, {-49.99f, -92.53f}, {-4.68f, 42.34f}, {2.79f, -4.26f}, 88 | {-17.49f, 43.99f}, {79.45f, -14.58f}}; 89 | Reduce reduce(DataType_Vec2, ReduceOperator_Sum); 90 | ShaderStorageBuffer buffer(k_data); 91 | reduce(buffer.handle(), k_data.size()); 92 | 93 | glm::vec2 sum = buffer.get_data()[0]; 94 | CHECK_THAT(sum.x, WithinAbs(66.29f, 0.1f)); 95 | CHECK_THAT(sum.y, WithinAbs(-23.75f, 0.1f)); 96 | } 97 | 98 | SECTION("vec4") 99 | { 100 | const std::vector k_data{{-17.04f, 1.79f, 82.67f, 39.72f}, {52.66f, 24.75f, -19.05f, 91.92f}, 101 | {19.15f, 44.93f, -52.13f, 18.85f}, {-84.25f, 69.53f, -11.43f, 33.17f}, 102 | {19.46f, -14.30f, -15.20f, -63.83f}, {-20.51f, -56.75f, -2.70f, 82.66f}, 103 | {3.86f, 55.48f, -12.37f, -11.02f}, {-30.62f, -67.54f, -29.89f, -77.30f}, 104 | {-21.55f, 50.46f, 39.34f, 81.08f}, {-56.40f, 84.61f, 90.26f, 13.35f}}; 105 | Reduce reduce(DataType_Vec4, ReduceOperator_Sum); 106 | ShaderStorageBuffer buffer(k_data); 107 | reduce(buffer.handle(), k_data.size()); 108 | 109 | glm::vec4 sum = buffer.get_data()[0]; 110 | CHECK_THAT(sum.x, WithinAbs(-135.24f, 0.1f)); 111 | CHECK_THAT(sum.y, WithinAbs(192.97f, 0.1f)); 112 | CHECK_THAT(sum.z, WithinAbs(69.49f, 0.1f)); 113 | CHECK_THAT(sum.w, WithinAbs(208.59f, 0.1f)); 114 | } 115 | 116 | SECTION("ivec2") 117 | { 118 | const std::vector k_data{{-38, -88}, {57, -34}, {61, 60}, {-90, 73}, {-23, -17}, 119 | {34, -79}, {-80, 53}, {24, -23}, {-88, 69}, {-83, -67}}; 120 | Reduce reduce(DataType_IVec2, ReduceOperator_Sum); 121 | ShaderStorageBuffer buffer(k_data); 122 | reduce(buffer.handle(), k_data.size()); 123 | 124 | glm::ivec2 sum = buffer.get_data()[0]; 125 | CHECK(sum.x == -226); 126 | CHECK(sum.y == -53); 127 | } 128 | 129 | 
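// Vector types are reduced component-wise by the same subgroup operation.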
SECTION("ivec4") 130 | { 131 | const std::vector k_data{{-95, 99, -30, 2}, {-69, 33, 78, 20}, {33, -43, -38, -26}, 132 | {69, -67, -17, -57}, {18, -23, -2, -53}, {88, -96, 40, -48}, 133 | {-93, -47, -91, 59}, {-89, 82, 10, 94}, {-15, 7, 41, 14}, 134 | {63, 53, -40, 53}}; 135 | Reduce reduce(DataType_IVec4, ReduceOperator_Sum); 136 | ShaderStorageBuffer buffer(k_data); 137 | reduce(buffer.handle(), k_data.size()); 138 | 139 | glm::ivec4 sum = buffer.get_data()[0]; 140 | CHECK(sum.x == -90); 141 | CHECK(sum.y == -2); 142 | CHECK(sum.z == -49); 143 | CHECK(sum.w == 58); 144 | } 145 | } 146 | 147 | TEST_CASE("Reduce-subgroup-fitting-size") 148 | { 149 | const size_t k_num_elements = GENERATE(32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072); 150 | 151 | const uint64_t k_seed = 1; 152 | Random random(k_seed); 153 | 154 | std::vector data = random.sample_int_vector(k_num_elements, 0, 100); 155 | GLuint sum = std::accumulate(data.begin(), data.end(), GLuint(0)); 156 | 157 | ShaderStorageBuffer buffer(data.data(), data.size() * sizeof(uint32_t)); 158 | 159 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 160 | reduce(buffer.handle(), data.size()); 161 | 162 | uint32_t calc_sum = buffer.get_data()[0]; 163 | CHECK(calc_sum == sum); 164 | } 165 | 166 | TEST_CASE("Reduce-subgroup-non-fitting-size") 167 | { 168 | const size_t k_num_elements = GENERATE(1, 31, 93, 201, 693, 2087, 7358, 88289, 345897, 6094798, 5238082, 10043898); 169 | 170 | const uint64_t k_seed = 1; 171 | Random random(k_seed); 172 | 173 | std::vector data = random.sample_int_vector(k_num_elements, 0, 100); 174 | GLuint sum = std::accumulate(data.begin(), data.end(), GLuint(0)); 175 | 176 | ShaderStorageBuffer buffer(data.data(), data.size() * sizeof(GLuint)); 177 | 178 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 179 | reduce(buffer.handle(), data.size()); 180 | 181 | GLuint calc_sum = buffer.get_data()[0]; 182 | CHECK(calc_sum == sum); 183 | } 184 | 185 | TEST_CASE("Reduce-benchmark", "[.][benchmark]") 186 | { 187 | const size_t k_num_elements = GENERATE( 188 | 1024, // 1KB 189 | 16384, // 16KB 190 | 65536, // 65KB 191 | 131072, // 131KB 192 | 524288, // 524KB 193 | 1048576, // 1MB 194 | 16777216, // 16MB 195 | 67108864, // 67MB 196 | 134217728, // 134MB 197 | 268435456 // 268MB 198 | ); 199 | 200 | std::vector data(k_num_elements); // Don't need to initialize the vector for benchmarking 201 | 202 | ShaderStorageBuffer buffer(data); 203 | 204 | Reduce reduce(DataType_Uint, ReduceOperator_Sum); 205 | 206 | uint64_t ns = measure_gl_elapsed_time([&]() { reduce(buffer.handle(), k_num_elements); }); 207 | 208 | printf("Reduce; Num elements: %zu, Elapsed: %s\n", k_num_elements, ns_to_human_string(ns).c_str()); 209 | } 210 | -------------------------------------------------------------------------------- /test/util/Random.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace glu 8 | { 9 | class Random 10 | { 11 | private: 12 | std::optional m_random_engine; 13 | 14 | public: 15 | explicit Random(uint64_t seed = 0) : 16 | m_random_engine(seed) 17 | { 18 | m_random_engine = 19 | seed != 0 ? 
std::make_optional(seed) : std::make_optional(); 20 | } 21 | 22 | ~Random() = default; 23 | 24 | template 25 | IntegerT sample_int(IntegerT min, IntegerT max) 26 | { 27 | GLU_CHECK_ARGUMENT(min < max, "Min must be strictly lower than Max"); 28 | return ((*m_random_engine)() % (max - min)) + min; 29 | } 30 | 31 | template 32 | std::vector sample_int_vector(size_t num_elements, IntegerT min, IntegerT max) 33 | { 34 | std::vector result(num_elements); 35 | for (size_t i = 0; i < num_elements; i++) 36 | result[i] = sample_int(min, max); 37 | return result; 38 | } 39 | }; 40 | } // namespace glu 41 | -------------------------------------------------------------------------------- /test/util/StopWatch.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace glu 9 | { 10 | /// Converts the given nanoseconds to a human readable string. 11 | inline std::string ns_to_human_string(uint64_t ns) 12 | { 13 | double ms = (double(ns) / 1000.0) / 1000.0; 14 | double s = ms / 1000.0; 15 | 16 | if (s >= 0.1) 17 | { 18 | std::stringstream stream; // header not available with g++-12 19 | stream << std::fixed << std::setprecision(3) << s; 20 | return stream.str() + " s"; 21 | } 22 | else if (ms >= 0.001) 23 | { 24 | std::stringstream stream; // header not available with g++-12 25 | stream << std::fixed << std::setprecision(3) << ms; 26 | return stream.str() + " ms"; 27 | } 28 | else 29 | { 30 | return std::to_string(ns) + " ns"; 31 | } 32 | } 33 | 34 | class StopWatch 35 | { 36 | using ClockT = std::chrono::system_clock; 37 | 38 | private: 39 | ClockT::time_point m_start; 40 | 41 | public: 42 | StopWatch() { reset(); }; 43 | ~StopWatch() = default; 44 | 45 | void reset() { m_start = std::chrono::system_clock::now(); } 46 | 47 | ClockT::duration elapsed_time() { return std::chrono::system_clock::now() - m_start; } 48 | 49 | uint64_t elapsed_millis() 50 | { 51 | return std::chrono::duration_cast(elapsed_time()).count(); 52 | } 53 | uint64_t elapsed_nanos() 54 | { 55 | return std::chrono::duration_cast(elapsed_time()).count(); 56 | } 57 | 58 | std::string elapsed_time_str() { return ns_to_human_string(elapsed_nanos()); } 59 | }; 60 | } // namespace glu 61 | -------------------------------------------------------------------------------- /third_party/glad/include/KHR/khrplatform.h: -------------------------------------------------------------------------------- 1 | #ifndef __khrplatform_h_ 2 | #define __khrplatform_h_ 3 | 4 | /* 5 | ** Copyright (c) 2008-2018 The Khronos Group Inc. 6 | ** 7 | ** Permission is hereby granted, free of charge, to any person obtaining a 8 | ** copy of this software and/or associated documentation files (the 9 | ** "Materials"), to deal in the Materials without restriction, including 10 | ** without limitation the rights to use, copy, modify, merge, publish, 11 | ** distribute, sublicense, and/or sell copies of the Materials, and to 12 | ** permit persons to whom the Materials are furnished to do so, subject to 13 | ** the following conditions: 14 | ** 15 | ** The above copyright notice and this permission notice shall be included 16 | ** in all copies or substantial portions of the Materials. 17 | ** 18 | ** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
21 | ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | ** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 25 | */ 26 | 27 | /* Khronos platform-specific types and definitions. 28 | * 29 | * The master copy of khrplatform.h is maintained in the Khronos EGL 30 | * Registry repository at https://github.com/KhronosGroup/EGL-Registry 31 | * The last semantic modification to khrplatform.h was at commit ID: 32 | * 67a3e0864c2d75ea5287b9f3d2eb74a745936692 33 | * 34 | * Adopters may modify this file to suit their platform. Adopters are 35 | * encouraged to submit platform specific modifications to the Khronos 36 | * group so that they can be included in future versions of this file. 37 | * Please submit changes by filing pull requests or issues on 38 | * the EGL Registry repository linked above. 39 | * 40 | * 41 | * See the Implementer's Guidelines for information about where this file 42 | * should be located on your system and for more details of its use: 43 | * http://www.khronos.org/registry/implementers_guide.pdf 44 | * 45 | * This file should be included as 46 | * #include 47 | * by Khronos client API header files that use its types and defines. 48 | * 49 | * The types in khrplatform.h should only be used to define API-specific types. 50 | * 51 | * Types defined in khrplatform.h: 52 | * khronos_int8_t signed 8 bit 53 | * khronos_uint8_t unsigned 8 bit 54 | * khronos_int16_t signed 16 bit 55 | * khronos_uint16_t unsigned 16 bit 56 | * khronos_int32_t signed 32 bit 57 | * khronos_uint32_t unsigned 32 bit 58 | * khronos_int64_t signed 64 bit 59 | * khronos_uint64_t unsigned 64 bit 60 | * khronos_intptr_t signed same number of bits as a pointer 61 | * khronos_uintptr_t unsigned same number of bits as a pointer 62 | * khronos_ssize_t signed size 63 | * khronos_usize_t unsigned size 64 | * khronos_float_t signed 32 bit floating point 65 | * khronos_time_ns_t unsigned 64 bit time in nanoseconds 66 | * khronos_utime_nanoseconds_t unsigned time interval or absolute time in 67 | * nanoseconds 68 | * khronos_stime_nanoseconds_t signed time interval in nanoseconds 69 | * khronos_boolean_enum_t enumerated boolean type. This should 70 | * only be used as a base type when a client API's boolean type is 71 | * an enum. Client APIs which use an integer or other type for 72 | * booleans cannot use this as the base type for their boolean. 73 | * 74 | * Tokens defined in khrplatform.h: 75 | * 76 | * KHRONOS_FALSE, KHRONOS_TRUE Enumerated boolean false/true values. 77 | * 78 | * KHRONOS_SUPPORT_INT64 is 1 if 64 bit integers are supported; otherwise 0. 79 | * KHRONOS_SUPPORT_FLOAT is 1 if floats are supported; otherwise 0. 80 | * 81 | * Calling convention macros defined in this file: 82 | * KHRONOS_APICALL 83 | * KHRONOS_APIENTRY 84 | * KHRONOS_APIATTRIBUTES 85 | * 86 | * These may be used in function prototypes as: 87 | * 88 | * KHRONOS_APICALL void KHRONOS_APIENTRY funcname( 89 | * int arg1, 90 | * int arg2) KHRONOS_APIATTRIBUTES; 91 | */ 92 | 93 | #if defined(__SCITECH_SNAP__) && !defined(KHRONOS_STATIC) 94 | # define KHRONOS_STATIC 1 95 | #endif 96 | 97 | /*------------------------------------------------------------------------- 98 | * Definition of KHRONOS_APICALL 99 | *------------------------------------------------------------------------- 100 | * This precedes the return type of the function in the function prototype. 
101 | */ 102 | #if defined(KHRONOS_STATIC) 103 | /* If the preprocessor constant KHRONOS_STATIC is defined, make the 104 | * header compatible with static linking. */ 105 | # define KHRONOS_APICALL 106 | #elif defined(_WIN32) 107 | # define KHRONOS_APICALL __declspec(dllimport) 108 | #elif defined (__SYMBIAN32__) 109 | # define KHRONOS_APICALL IMPORT_C 110 | #elif defined(__ANDROID__) 111 | # define KHRONOS_APICALL __attribute__((visibility("default"))) 112 | #else 113 | # define KHRONOS_APICALL 114 | #endif 115 | 116 | /*------------------------------------------------------------------------- 117 | * Definition of KHRONOS_APIENTRY 118 | *------------------------------------------------------------------------- 119 | * This follows the return type of the function and precedes the function 120 | * name in the function prototype. 121 | */ 122 | #if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__) 123 | /* Win32 but not WinCE */ 124 | # define KHRONOS_APIENTRY __stdcall 125 | #else 126 | # define KHRONOS_APIENTRY 127 | #endif 128 | 129 | /*------------------------------------------------------------------------- 130 | * Definition of KHRONOS_APIATTRIBUTES 131 | *------------------------------------------------------------------------- 132 | * This follows the closing parenthesis of the function prototype arguments. 133 | */ 134 | #if defined (__ARMCC_2__) 135 | #define KHRONOS_APIATTRIBUTES __softfp 136 | #else 137 | #define KHRONOS_APIATTRIBUTES 138 | #endif 139 | 140 | /*------------------------------------------------------------------------- 141 | * basic type definitions 142 | *-----------------------------------------------------------------------*/ 143 | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__GNUC__) || defined(__SCO__) || defined(__USLC__) 144 | 145 | 146 | /* 147 | * Using 148 | */ 149 | #include 150 | typedef int32_t khronos_int32_t; 151 | typedef uint32_t khronos_uint32_t; 152 | typedef int64_t khronos_int64_t; 153 | typedef uint64_t khronos_uint64_t; 154 | #define KHRONOS_SUPPORT_INT64 1 155 | #define KHRONOS_SUPPORT_FLOAT 1 156 | 157 | #elif defined(__VMS ) || defined(__sgi) 158 | 159 | /* 160 | * Using 161 | */ 162 | #include 163 | typedef int32_t khronos_int32_t; 164 | typedef uint32_t khronos_uint32_t; 165 | typedef int64_t khronos_int64_t; 166 | typedef uint64_t khronos_uint64_t; 167 | #define KHRONOS_SUPPORT_INT64 1 168 | #define KHRONOS_SUPPORT_FLOAT 1 169 | 170 | #elif defined(_WIN32) && !defined(__SCITECH_SNAP__) 171 | 172 | /* 173 | * Win32 174 | */ 175 | typedef __int32 khronos_int32_t; 176 | typedef unsigned __int32 khronos_uint32_t; 177 | typedef __int64 khronos_int64_t; 178 | typedef unsigned __int64 khronos_uint64_t; 179 | #define KHRONOS_SUPPORT_INT64 1 180 | #define KHRONOS_SUPPORT_FLOAT 1 181 | 182 | #elif defined(__sun__) || defined(__digital__) 183 | 184 | /* 185 | * Sun or Digital 186 | */ 187 | typedef int khronos_int32_t; 188 | typedef unsigned int khronos_uint32_t; 189 | #if defined(__arch64__) || defined(_LP64) 190 | typedef long int khronos_int64_t; 191 | typedef unsigned long int khronos_uint64_t; 192 | #else 193 | typedef long long int khronos_int64_t; 194 | typedef unsigned long long int khronos_uint64_t; 195 | #endif /* __arch64__ */ 196 | #define KHRONOS_SUPPORT_INT64 1 197 | #define KHRONOS_SUPPORT_FLOAT 1 198 | 199 | #elif 0 200 | 201 | /* 202 | * Hypothetical platform with no float or int64 support 203 | */ 204 | typedef int khronos_int32_t; 205 | typedef unsigned int khronos_uint32_t; 206 
| #define KHRONOS_SUPPORT_INT64 0 207 | #define KHRONOS_SUPPORT_FLOAT 0 208 | 209 | #else 210 | 211 | /* 212 | * Generic fallback 213 | */ 214 | #include 215 | typedef int32_t khronos_int32_t; 216 | typedef uint32_t khronos_uint32_t; 217 | typedef int64_t khronos_int64_t; 218 | typedef uint64_t khronos_uint64_t; 219 | #define KHRONOS_SUPPORT_INT64 1 220 | #define KHRONOS_SUPPORT_FLOAT 1 221 | 222 | #endif 223 | 224 | 225 | /* 226 | * Types that are (so far) the same on all platforms 227 | */ 228 | typedef signed char khronos_int8_t; 229 | typedef unsigned char khronos_uint8_t; 230 | typedef signed short int khronos_int16_t; 231 | typedef unsigned short int khronos_uint16_t; 232 | 233 | /* 234 | * Types that differ between LLP64 and LP64 architectures - in LLP64, 235 | * pointers are 64 bits, but 'long' is still 32 bits. Win64 appears 236 | * to be the only LLP64 architecture in current use. 237 | */ 238 | #ifdef _WIN64 239 | typedef signed long long int khronos_intptr_t; 240 | typedef unsigned long long int khronos_uintptr_t; 241 | typedef signed long long int khronos_ssize_t; 242 | typedef unsigned long long int khronos_usize_t; 243 | #else 244 | typedef signed long int khronos_intptr_t; 245 | typedef unsigned long int khronos_uintptr_t; 246 | typedef signed long int khronos_ssize_t; 247 | typedef unsigned long int khronos_usize_t; 248 | #endif 249 | 250 | #if KHRONOS_SUPPORT_FLOAT 251 | /* 252 | * Float type 253 | */ 254 | typedef float khronos_float_t; 255 | #endif 256 | 257 | #if KHRONOS_SUPPORT_INT64 258 | /* Time types 259 | * 260 | * These types can be used to represent a time interval in nanoseconds or 261 | * an absolute Unadjusted System Time. Unadjusted System Time is the number 262 | * of nanoseconds since some arbitrary system event (e.g. since the last 263 | * time the system booted). The Unadjusted System Time is an unsigned 264 | * 64 bit value that wraps back to 0 every 584 years. Time intervals 265 | * may be either signed or unsigned. 266 | */ 267 | typedef khronos_uint64_t khronos_utime_nanoseconds_t; 268 | typedef khronos_int64_t khronos_stime_nanoseconds_t; 269 | #endif 270 | 271 | /* 272 | * Dummy value used to pad enum types to 32 bits. 273 | */ 274 | #ifndef KHRONOS_MAX_ENUM 275 | #define KHRONOS_MAX_ENUM 0x7FFFFFFF 276 | #endif 277 | 278 | /* 279 | * Enumerated boolean type 280 | * 281 | * Values other than zero should be considered to be true. Therefore 282 | * comparisons should not be made against KHRONOS_TRUE. 
283 | */ 284 | typedef enum { 285 | KHRONOS_FALSE = 0, 286 | KHRONOS_TRUE = 1, 287 | KHRONOS_BOOLEAN_ENUM_FORCE_SIZE = KHRONOS_MAX_ENUM 288 | } khronos_boolean_enum_t; 289 | 290 | #endif /* __khrplatform_h_ */ 291 | -------------------------------------------------------------------------------- /third_party/renderdoc/renderdoc_app.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * The MIT License (MIT) 3 | * 4 | * Copyright (c) 2019-2021 Baldur Karlsson 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | * THE SOFTWARE. 23 | ******************************************************************************/ 24 | 25 | #pragma once 26 | 27 | ////////////////////////////////////////////////////////////////////////////////////////////////// 28 | // 29 | // Documentation for the API is available at https://renderdoc.org/docs/in_application_api.html 30 | // 31 | 32 | #if !defined(RENDERDOC_NO_STDINT) 33 | #include 34 | #endif 35 | 36 | #if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) 37 | #define RENDERDOC_CC __cdecl 38 | #elif defined(__linux__) 39 | #define RENDERDOC_CC 40 | #elif defined(__APPLE__) 41 | #define RENDERDOC_CC 42 | #else 43 | #error "Unknown platform" 44 | #endif 45 | 46 | #ifdef __cplusplus 47 | extern "C" { 48 | #endif 49 | 50 | ////////////////////////////////////////////////////////////////////////////////////////////////// 51 | // Constants not used directly in below API 52 | 53 | // This is a GUID/magic value used for when applications pass a path where shader debug 54 | // information can be found to match up with a stripped shader. 55 | // the define can be used like so: const GUID RENDERDOC_ShaderDebugMagicValue = 56 | // RENDERDOC_ShaderDebugMagicValue_value 57 | #define RENDERDOC_ShaderDebugMagicValue_struct \ 58 | { \ 59 | 0xeab25520, 0x6670, 0x4865, 0x84, 0x29, 0x6c, 0x8, 0x51, 0x54, 0x00, 0xff \ 60 | } 61 | 62 | // as an alternative when you want a byte array (assuming x86 endianness): 63 | #define RENDERDOC_ShaderDebugMagicValue_bytearray \ 64 | { \ 65 | 0x20, 0x55, 0xb2, 0xea, 0x70, 0x66, 0x65, 0x48, 0x84, 0x29, 0x6c, 0x8, 0x51, 0x54, 0x00, 0xff \ 66 | } 67 | 68 | // truncated version when only a uint64_t is available (e.g. 
Vulkan tags): 69 | #define RENDERDOC_ShaderDebugMagicValue_truncated 0x48656670eab25520ULL 70 | 71 | ////////////////////////////////////////////////////////////////////////////////////////////////// 72 | // RenderDoc capture options 73 | // 74 | 75 | typedef enum RENDERDOC_CaptureOption { 76 | // Allow the application to enable vsync 77 | // 78 | // Default - enabled 79 | // 80 | // 1 - The application can enable or disable vsync at will 81 | // 0 - vsync is force disabled 82 | eRENDERDOC_Option_AllowVSync = 0, 83 | 84 | // Allow the application to enable fullscreen 85 | // 86 | // Default - enabled 87 | // 88 | // 1 - The application can enable or disable fullscreen at will 89 | // 0 - fullscreen is force disabled 90 | eRENDERDOC_Option_AllowFullscreen = 1, 91 | 92 | // Record API debugging events and messages 93 | // 94 | // Default - disabled 95 | // 96 | // 1 - Enable built-in API debugging features and records the results into 97 | // the capture, which is matched up with events on replay 98 | // 0 - no API debugging is forcibly enabled 99 | eRENDERDOC_Option_APIValidation = 2, 100 | eRENDERDOC_Option_DebugDeviceMode = 2, // deprecated name of this enum 101 | 102 | // Capture CPU callstacks for API events 103 | // 104 | // Default - disabled 105 | // 106 | // 1 - Enables capturing of callstacks 107 | // 0 - no callstacks are captured 108 | eRENDERDOC_Option_CaptureCallstacks = 3, 109 | 110 | // When capturing CPU callstacks, only capture them from actions. 111 | // This option does nothing without the above option being enabled 112 | // 113 | // Default - disabled 114 | // 115 | // 1 - Only captures callstacks for actions. 116 | // Ignored if CaptureCallstacks is disabled 117 | // 0 - Callstacks, if enabled, are captured for every event. 118 | eRENDERDOC_Option_CaptureCallstacksOnlyDraws = 4, 119 | eRENDERDOC_Option_CaptureCallstacksOnlyActions = 4, 120 | 121 | // Specify a delay in seconds to wait for a debugger to attach, after 122 | // creating or injecting into a process, before continuing to allow it to run. 123 | // 124 | // 0 indicates no delay, and the process will run immediately after injection 125 | // 126 | // Default - 0 seconds 127 | // 128 | eRENDERDOC_Option_DelayForDebugger = 5, 129 | 130 | // Verify buffer access. This includes checking the memory returned by a Map() call to 131 | // detect any out-of-bounds modification, as well as initialising buffers with undefined contents 132 | // to a marker value to catch use of uninitialised memory. 133 | // 134 | // NOTE: This option is only valid for OpenGL and D3D11. Explicit APIs such as D3D12 and Vulkan do 135 | // not do the same kind of interception & checking and undefined contents are really undefined. 136 | // 137 | // Default - disabled 138 | // 139 | // 1 - Verify buffer access 140 | // 0 - No verification is performed, and overwriting bounds may cause crashes or corruption in 141 | // RenderDoc. 142 | eRENDERDOC_Option_VerifyBufferAccess = 6, 143 | 144 | // The old name for eRENDERDOC_Option_VerifyBufferAccess was eRENDERDOC_Option_VerifyMapWrites. 145 | // This option now controls the filling of uninitialised buffers with 0xdddddddd which was 146 | // previously always enabled 147 | eRENDERDOC_Option_VerifyMapWrites = eRENDERDOC_Option_VerifyBufferAccess, 148 | 149 | // Hooks any system API calls that create child processes, and injects 150 | // RenderDoc into them recursively with the same options. 
151 | // 152 | // Default - disabled 153 | // 154 | // 1 - Hooks into spawned child processes 155 | // 0 - Child processes are not hooked by RenderDoc 156 | eRENDERDOC_Option_HookIntoChildren = 7, 157 | 158 | // By default RenderDoc only includes resources in the final capture necessary 159 | // for that frame, this allows you to override that behaviour. 160 | // 161 | // Default - disabled 162 | // 163 | // 1 - all live resources at the time of capture are included in the capture 164 | // and available for inspection 165 | // 0 - only the resources referenced by the captured frame are included 166 | eRENDERDOC_Option_RefAllResources = 8, 167 | 168 | // **NOTE**: As of RenderDoc v1.1 this option has been deprecated. Setting or 169 | // getting it will be ignored, to allow compatibility with older versions. 170 | // In v1.1 the option acts as if it's always enabled. 171 | // 172 | // By default RenderDoc skips saving initial states for resources where the 173 | // previous contents don't appear to be used, assuming that writes before 174 | // reads indicate previous contents aren't used. 175 | // 176 | // Default - disabled 177 | // 178 | // 1 - initial contents at the start of each captured frame are saved, even if 179 | // they are later overwritten or cleared before being used. 180 | // 0 - unless a read is detected, initial contents will not be saved and will 181 | // appear as black or empty data. 182 | eRENDERDOC_Option_SaveAllInitials = 9, 183 | 184 | // In APIs that allow for the recording of command lists to be replayed later, 185 | // RenderDoc may choose to not capture command lists before a frame capture is 186 | // triggered, to reduce overheads. This means any command lists recorded once 187 | // and replayed many times will not be available and may cause a failure to 188 | // capture. 189 | // 190 | // NOTE: This is only true for APIs where multithreading is difficult or 191 | // discouraged. Newer APIs like Vulkan and D3D12 will ignore this option 192 | // and always capture all command lists since the API is heavily oriented 193 | // around it and the overheads have been reduced by API design. 194 | // 195 | // 1 - All command lists are captured from the start of the application 196 | // 0 - Command lists are only captured if their recording begins during 197 | // the period when a frame capture is in progress. 198 | eRENDERDOC_Option_CaptureAllCmdLists = 10, 199 | 200 | // Mute API debugging output when the API validation mode option is enabled 201 | // 202 | // Default - enabled 203 | // 204 | // 1 - Mute any API debug messages from being displayed or passed through 205 | // 0 - API debugging is displayed as normal 206 | eRENDERDOC_Option_DebugOutputMute = 11, 207 | 208 | // Option to allow vendor extensions to be used even when they may be 209 | // incompatible with RenderDoc and cause corrupted replays or crashes. 210 | // 211 | // Default - inactive 212 | // 213 | // No values are documented, this option should only be used when absolutely 214 | // necessary as directed by a RenderDoc developer. 215 | eRENDERDOC_Option_AllowUnsupportedVendorExtensions = 12, 216 | 217 | } RENDERDOC_CaptureOption; 218 | 219 | // Sets an option that controls how RenderDoc behaves on capture. 
220 | // 221 | // Returns 1 if the option and value are valid 222 | // Returns 0 if either is invalid and the option is unchanged 223 | typedef int(RENDERDOC_CC *pRENDERDOC_SetCaptureOptionU32)(RENDERDOC_CaptureOption opt, uint32_t val); 224 | typedef int(RENDERDOC_CC *pRENDERDOC_SetCaptureOptionF32)(RENDERDOC_CaptureOption opt, float val); 225 | 226 | // Gets the current value of an option as a uint32_t 227 | // 228 | // If the option is invalid, 0xffffffff is returned 229 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetCaptureOptionU32)(RENDERDOC_CaptureOption opt); 230 | 231 | // Gets the current value of an option as a float 232 | // 233 | // If the option is invalid, -FLT_MAX is returned 234 | typedef float(RENDERDOC_CC *pRENDERDOC_GetCaptureOptionF32)(RENDERDOC_CaptureOption opt); 235 | 236 | typedef enum RENDERDOC_InputButton { 237 | // '0' - '9' matches ASCII values 238 | eRENDERDOC_Key_0 = 0x30, 239 | eRENDERDOC_Key_1 = 0x31, 240 | eRENDERDOC_Key_2 = 0x32, 241 | eRENDERDOC_Key_3 = 0x33, 242 | eRENDERDOC_Key_4 = 0x34, 243 | eRENDERDOC_Key_5 = 0x35, 244 | eRENDERDOC_Key_6 = 0x36, 245 | eRENDERDOC_Key_7 = 0x37, 246 | eRENDERDOC_Key_8 = 0x38, 247 | eRENDERDOC_Key_9 = 0x39, 248 | 249 | // 'A' - 'Z' matches ASCII values 250 | eRENDERDOC_Key_A = 0x41, 251 | eRENDERDOC_Key_B = 0x42, 252 | eRENDERDOC_Key_C = 0x43, 253 | eRENDERDOC_Key_D = 0x44, 254 | eRENDERDOC_Key_E = 0x45, 255 | eRENDERDOC_Key_F = 0x46, 256 | eRENDERDOC_Key_G = 0x47, 257 | eRENDERDOC_Key_H = 0x48, 258 | eRENDERDOC_Key_I = 0x49, 259 | eRENDERDOC_Key_J = 0x4A, 260 | eRENDERDOC_Key_K = 0x4B, 261 | eRENDERDOC_Key_L = 0x4C, 262 | eRENDERDOC_Key_M = 0x4D, 263 | eRENDERDOC_Key_N = 0x4E, 264 | eRENDERDOC_Key_O = 0x4F, 265 | eRENDERDOC_Key_P = 0x50, 266 | eRENDERDOC_Key_Q = 0x51, 267 | eRENDERDOC_Key_R = 0x52, 268 | eRENDERDOC_Key_S = 0x53, 269 | eRENDERDOC_Key_T = 0x54, 270 | eRENDERDOC_Key_U = 0x55, 271 | eRENDERDOC_Key_V = 0x56, 272 | eRENDERDOC_Key_W = 0x57, 273 | eRENDERDOC_Key_X = 0x58, 274 | eRENDERDOC_Key_Y = 0x59, 275 | eRENDERDOC_Key_Z = 0x5A, 276 | 277 | // leave the rest of the ASCII range free 278 | // in case we want to use it later 279 | eRENDERDOC_Key_NonPrintable = 0x100, 280 | 281 | eRENDERDOC_Key_Divide, 282 | eRENDERDOC_Key_Multiply, 283 | eRENDERDOC_Key_Subtract, 284 | eRENDERDOC_Key_Plus, 285 | 286 | eRENDERDOC_Key_F1, 287 | eRENDERDOC_Key_F2, 288 | eRENDERDOC_Key_F3, 289 | eRENDERDOC_Key_F4, 290 | eRENDERDOC_Key_F5, 291 | eRENDERDOC_Key_F6, 292 | eRENDERDOC_Key_F7, 293 | eRENDERDOC_Key_F8, 294 | eRENDERDOC_Key_F9, 295 | eRENDERDOC_Key_F10, 296 | eRENDERDOC_Key_F11, 297 | eRENDERDOC_Key_F12, 298 | 299 | eRENDERDOC_Key_Home, 300 | eRENDERDOC_Key_End, 301 | eRENDERDOC_Key_Insert, 302 | eRENDERDOC_Key_Delete, 303 | eRENDERDOC_Key_PageUp, 304 | eRENDERDOC_Key_PageDn, 305 | 306 | eRENDERDOC_Key_Backspace, 307 | eRENDERDOC_Key_Tab, 308 | eRENDERDOC_Key_PrtScrn, 309 | eRENDERDOC_Key_Pause, 310 | 311 | eRENDERDOC_Key_Max, 312 | } RENDERDOC_InputButton; 313 | 314 | // Sets which key or keys can be used to toggle focus between multiple windows 315 | // 316 | // If keys is NULL or num is 0, toggle keys will be disabled 317 | typedef void(RENDERDOC_CC *pRENDERDOC_SetFocusToggleKeys)(RENDERDOC_InputButton *keys, int num); 318 | 319 | // Sets which key or keys can be used to capture the next frame 320 | // 321 | // If keys is NULL or num is 0, captures keys will be disabled 322 | typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureKeys)(RENDERDOC_InputButton *keys, int num); 323 | 324 | typedef enum RENDERDOC_OverlayBits { 325 
| // This single bit controls whether the overlay is enabled or disabled globally 326 | eRENDERDOC_Overlay_Enabled = 0x1, 327 | 328 | // Show the average framerate over several seconds as well as min/max 329 | eRENDERDOC_Overlay_FrameRate = 0x2, 330 | 331 | // Show the current frame number 332 | eRENDERDOC_Overlay_FrameNumber = 0x4, 333 | 334 | // Show a list of recent captures, and how many captures have been made 335 | eRENDERDOC_Overlay_CaptureList = 0x8, 336 | 337 | // Default values for the overlay mask 338 | eRENDERDOC_Overlay_Default = (eRENDERDOC_Overlay_Enabled | eRENDERDOC_Overlay_FrameRate | 339 | eRENDERDOC_Overlay_FrameNumber | eRENDERDOC_Overlay_CaptureList), 340 | 341 | // Enable all bits 342 | eRENDERDOC_Overlay_All = ~0U, 343 | 344 | // Disable all bits 345 | eRENDERDOC_Overlay_None = 0, 346 | } RENDERDOC_OverlayBits; 347 | 348 | // returns the overlay bits that have been set 349 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetOverlayBits)(); 350 | // sets the overlay bits with an and & or mask 351 | typedef void(RENDERDOC_CC *pRENDERDOC_MaskOverlayBits)(uint32_t And, uint32_t Or); 352 | 353 | // this function will attempt to remove RenderDoc's hooks in the application. 354 | // 355 | // Note: that this can only work correctly if done immediately after 356 | // the module is loaded, before any API work happens. RenderDoc will remove its 357 | // injected hooks and shut down. Behaviour is undefined if this is called 358 | // after any API functions have been called, and there is still no guarantee of 359 | // success. 360 | typedef void(RENDERDOC_CC *pRENDERDOC_RemoveHooks)(); 361 | 362 | // DEPRECATED: compatibility for code compiled against pre-1.4.1 headers. 363 | typedef pRENDERDOC_RemoveHooks pRENDERDOC_Shutdown; 364 | 365 | // This function will unload RenderDoc's crash handler. 366 | // 367 | // If you use your own crash handler and don't want RenderDoc's handler to 368 | // intercede, you can call this function to unload it and any unhandled 369 | // exceptions will pass to the next handler. 370 | typedef void(RENDERDOC_CC *pRENDERDOC_UnloadCrashHandler)(); 371 | 372 | // Sets the capture file path template 373 | // 374 | // pathtemplate is a UTF-8 string that gives a template for how captures will be named 375 | // and where they will be saved. 376 | // 377 | // Any extension is stripped off the path, and captures are saved in the directory 378 | // specified, and named with the filename and the frame number appended. If the 379 | // directory does not exist it will be created, including any parent directories. 380 | // 381 | // If pathtemplate is NULL, the template will remain unchanged 382 | // 383 | // Example: 384 | // 385 | // SetCaptureFilePathTemplate("my_captures/example"); 386 | // 387 | // Capture #1 -> my_captures/example_frame123.rdc 388 | // Capture #2 -> my_captures/example_frame456.rdc 389 | typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureFilePathTemplate)(const char *pathtemplate); 390 | 391 | // returns the current capture path template, see SetCaptureFileTemplate above, as a UTF-8 string 392 | typedef const char *(RENDERDOC_CC *pRENDERDOC_GetCaptureFilePathTemplate)(); 393 | 394 | // DEPRECATED: compatibility for code compiled against pre-1.1.2 headers. 
395 | typedef pRENDERDOC_SetCaptureFilePathTemplate pRENDERDOC_SetLogFilePathTemplate; 396 | typedef pRENDERDOC_GetCaptureFilePathTemplate pRENDERDOC_GetLogFilePathTemplate; 397 | 398 | // returns the number of captures that have been made 399 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetNumCaptures)(); 400 | 401 | // This function returns the details of a capture, by index. New captures are added 402 | // to the end of the list. 403 | // 404 | // filename will be filled with the absolute path to the capture file, as a UTF-8 string 405 | // pathlength will be written with the length in bytes of the filename string 406 | // timestamp will be written with the time of the capture, in seconds since the Unix epoch 407 | // 408 | // Any of the parameters can be NULL and they'll be skipped. 409 | // 410 | // The function will return 1 if the capture index is valid, or 0 if the index is invalid 411 | // If the index is invalid, the values will be unchanged 412 | // 413 | // Note: when captures are deleted in the UI they will remain in this list, so the 414 | // capture path may not exist anymore. 415 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetCapture)(uint32_t idx, char *filename, 416 | uint32_t *pathlength, uint64_t *timestamp); 417 | 418 | // Sets the comments associated with a capture file. These comments are displayed in the 419 | // UI program when opening. 420 | // 421 | // filePath should be a path to the capture file to add comments to. If set to NULL or "" 422 | // the most recent capture file created will be used instead. 423 | // comments should be a NULL-terminated UTF-8 string to add as comments. 424 | // 425 | // Any existing comments will be overwritten. 426 | typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureFileComments)(const char *filePath, 427 | const char *comments); 428 | 429 | // returns 1 if the RenderDoc UI is connected to this application, 0 otherwise 430 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_IsTargetControlConnected)(); 431 | 432 | // DEPRECATED: compatibility for code compiled against pre-1.1.1 headers. 433 | // This was renamed to IsTargetControlConnected in API 1.1.1; the old typedef is kept here for 434 | // backwards compatibility with old code. It is castable either way since it's ABI compatible 435 | // as the same function pointer type. 436 | typedef pRENDERDOC_IsTargetControlConnected pRENDERDOC_IsRemoteAccessConnected; 437 | 438 | // This function will launch the Replay UI associated with the RenderDoc library injected 439 | // into the running application. 440 | // 441 | // if connectTargetControl is 1, the Replay UI will be launched with a command line parameter 442 | // to connect to this application 443 | // cmdline is the rest of the command line, as a UTF-8 string. E.g. a capture file to open 444 | // if cmdline is NULL, the command line will be empty. 445 | // 446 | // returns the PID of the replay UI if successful, 0 if not successful. 447 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_LaunchReplayUI)(uint32_t connectTargetControl, 448 | const char *cmdline); 449 | 450 | // RenderDoc can return a higher version than requested if it's backwards compatible; 451 | // this function returns the actual version returned. If a parameter is NULL, it will be 452 | // ignored and the others will be filled out.
453 | typedef void(RENDERDOC_CC *pRENDERDOC_GetAPIVersion)(int *major, int *minor, int *patch); 454 | 455 | ////////////////////////////////////////////////////////////////////////// 456 | // Capturing functions 457 | // 458 | 459 | // A device pointer is a pointer to the API's root handle. 460 | // 461 | // This would be an ID3D11Device, HGLRC/GLXContext, ID3D12Device, etc 462 | typedef void *RENDERDOC_DevicePointer; 463 | 464 | // A window handle is the OS's native window handle 465 | // 466 | // This would be an HWND, GLXDrawable, etc 467 | typedef void *RENDERDOC_WindowHandle; 468 | 469 | // A helper macro for Vulkan, where the device handle cannot be used directly. 470 | // 471 | // Passing the VkInstance to this macro will return the RENDERDOC_DevicePointer to use. 472 | // 473 | // Specifically, the value needed is the dispatch table pointer, which sits as the first 474 | // pointer-sized object in the memory pointed to by the VkInstance. Thus we cast to a void** and 475 | // indirect once. 476 | #define RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(inst) (*((void **)(inst))) 477 | 478 | // This sets the RenderDoc in-app overlay in the API/window pair as 'active' and it will 479 | // respond to keypresses. Neither parameter can be NULL 480 | typedef void(RENDERDOC_CC *pRENDERDOC_SetActiveWindow)(RENDERDOC_DevicePointer device, 481 | RENDERDOC_WindowHandle wndHandle); 482 | 483 | // capture the next frame on whichever window and API is currently considered active 484 | typedef void(RENDERDOC_CC *pRENDERDOC_TriggerCapture)(); 485 | 486 | // capture the next N frames on whichever window and API is currently considered active 487 | typedef void(RENDERDOC_CC *pRENDERDOC_TriggerMultiFrameCapture)(uint32_t numFrames); 488 | 489 | // When choosing either a device pointer or a window handle to capture, you can pass NULL. 490 | // Passing NULL specifies a 'wildcard' match against anything. This allows you to specify 491 | // any API rendering to a specific window, or a specific API instance rendering to any window, 492 | // or in the simplest case of one window and one API, you can just pass NULL for both. 493 | // 494 | // In either case, if there are two or more possible matching (device,window) pairs it 495 | // is undefined which one will be captured. 496 | // 497 | // Note: for headless rendering you can pass NULL for the window handle and either specify 498 | // a device pointer or leave it NULL as above. 499 | 500 | // Immediately starts capturing API calls on the specified device pointer and window handle. 501 | // 502 | // If there is no matching thing to capture (e.g. no supported API has been initialised), 503 | // this will do nothing. 504 | // 505 | // The results are undefined (including crashes) if two captures are started overlapping, 506 | // even on separate devices and/or windows. 507 | typedef void(RENDERDOC_CC *pRENDERDOC_StartFrameCapture)(RENDERDOC_DevicePointer device, 508 | RENDERDOC_WindowHandle wndHandle); 509 | 510 | // Returns whether or not a frame capture is currently ongoing anywhere. 511 | // 512 | // This will return 1 if a capture is ongoing, and 0 if there is no capture running 513 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_IsFrameCapturing)(); 514 | 515 | // Ends capturing immediately. 516 | // 517 | // This will return 1 if the capture succeeded, and 0 if there was an error capturing.
518 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_EndFrameCapture)(RENDERDOC_DevicePointer device, 519 | RENDERDOC_WindowHandle wndHandle); 520 | 521 | // Ends capturing immediately and discard any data stored without saving to disk. 522 | // 523 | // This will return 1 if the capture was discarded, and 0 if there was an error or no capture 524 | // was in progress 525 | typedef uint32_t(RENDERDOC_CC *pRENDERDOC_DiscardFrameCapture)(RENDERDOC_DevicePointer device, 526 | RENDERDOC_WindowHandle wndHandle); 527 | 528 | ////////////////////////////////////////////////////////////////////////////////////////////////// 529 | // RenderDoc API versions 530 | // 531 | 532 | // RenderDoc uses semantic versioning (http://semver.org/). 533 | // 534 | // MAJOR version is incremented when incompatible API changes happen. 535 | // MINOR version is incremented when functionality is added in a backwards-compatible manner. 536 | // PATCH version is incremented when backwards-compatible bug fixes happen. 537 | // 538 | // Note that this means the API returned can be higher than the one you might have requested. 539 | // e.g. if you are running against a newer RenderDoc that supports 1.0.1, it will be returned 540 | // instead of 1.0.0. You can check this with the GetAPIVersion entry point 541 | typedef enum RENDERDOC_Version { 542 | eRENDERDOC_API_Version_1_0_0 = 10000, // RENDERDOC_API_1_0_0 = 1 00 00 543 | eRENDERDOC_API_Version_1_0_1 = 10001, // RENDERDOC_API_1_0_1 = 1 00 01 544 | eRENDERDOC_API_Version_1_0_2 = 10002, // RENDERDOC_API_1_0_2 = 1 00 02 545 | eRENDERDOC_API_Version_1_1_0 = 10100, // RENDERDOC_API_1_1_0 = 1 01 00 546 | eRENDERDOC_API_Version_1_1_1 = 10101, // RENDERDOC_API_1_1_1 = 1 01 01 547 | eRENDERDOC_API_Version_1_1_2 = 10102, // RENDERDOC_API_1_1_2 = 1 01 02 548 | eRENDERDOC_API_Version_1_2_0 = 10200, // RENDERDOC_API_1_2_0 = 1 02 00 549 | eRENDERDOC_API_Version_1_3_0 = 10300, // RENDERDOC_API_1_3_0 = 1 03 00 550 | eRENDERDOC_API_Version_1_4_0 = 10400, // RENDERDOC_API_1_4_0 = 1 04 00 551 | eRENDERDOC_API_Version_1_4_1 = 10401, // RENDERDOC_API_1_4_1 = 1 04 01 552 | eRENDERDOC_API_Version_1_4_2 = 10402, // RENDERDOC_API_1_4_2 = 1 04 02 553 | } RENDERDOC_Version; 554 | 555 | // API version changelog: 556 | // 557 | // 1.0.0 - initial release 558 | // 1.0.1 - Bugfix: IsFrameCapturing() was returning false for captures that were triggered 559 | // by keypress or TriggerCapture, instead of Start/EndFrameCapture. 560 | // 1.0.2 - Refactor: Renamed eRENDERDOC_Option_DebugDeviceMode to eRENDERDOC_Option_APIValidation 561 | // 1.1.0 - Add feature: TriggerMultiFrameCapture(). Backwards compatible with 1.0.x since the new 562 | // function pointer is added to the end of the struct, the original layout is identical 563 | // 1.1.1 - Refactor: Renamed remote access to target control (to better disambiguate from remote 564 | // replay/remote server concept in replay UI) 565 | // 1.1.2 - Refactor: Renamed "log file" in function names to just capture, to clarify that these 566 | // are captures and not debug logging files. This is the first API version in the v1.0 567 | // branch. 568 | // 1.2.0 - Added feature: SetCaptureFileComments() to add comments to a capture file that will be 569 | // displayed in the UI program on load. 570 | // 1.3.0 - Added feature: New capture option eRENDERDOC_Option_AllowUnsupportedVendorExtensions 571 | // which allows users to opt-in to allowing unsupported vendor extensions to function. 572 | // Should be used at the user's own risk. 
573 | // Refactor: Renamed eRENDERDOC_Option_VerifyMapWrites to 574 | // eRENDERDOC_Option_VerifyBufferAccess, which now also controls initialisation to 575 | // 0xdddddddd of uninitialised buffer contents. 576 | // 1.4.0 - Added feature: DiscardFrameCapture() to discard a frame capture in progress and stop 577 | // capturing without saving anything to disk. 578 | // 1.4.1 - Refactor: Renamed Shutdown to RemoveHooks to better clarify what is happening 579 | // 1.4.2 - Refactor: Renamed 'draws' to 'actions' in callstack capture option. 580 | 581 | typedef struct RENDERDOC_API_1_4_2 582 | { 583 | pRENDERDOC_GetAPIVersion GetAPIVersion; 584 | 585 | pRENDERDOC_SetCaptureOptionU32 SetCaptureOptionU32; 586 | pRENDERDOC_SetCaptureOptionF32 SetCaptureOptionF32; 587 | 588 | pRENDERDOC_GetCaptureOptionU32 GetCaptureOptionU32; 589 | pRENDERDOC_GetCaptureOptionF32 GetCaptureOptionF32; 590 | 591 | pRENDERDOC_SetFocusToggleKeys SetFocusToggleKeys; 592 | pRENDERDOC_SetCaptureKeys SetCaptureKeys; 593 | 594 | pRENDERDOC_GetOverlayBits GetOverlayBits; 595 | pRENDERDOC_MaskOverlayBits MaskOverlayBits; 596 | 597 | // Shutdown was renamed to RemoveHooks in 1.4.1. 598 | // These unions allow old code to continue compiling without changes 599 | union 600 | { 601 | pRENDERDOC_Shutdown Shutdown; 602 | pRENDERDOC_RemoveHooks RemoveHooks; 603 | }; 604 | pRENDERDOC_UnloadCrashHandler UnloadCrashHandler; 605 | 606 | // Get/SetLogFilePathTemplate was renamed to Get/SetCaptureFilePathTemplate in 1.1.2. 607 | // These unions allow old code to continue compiling without changes 608 | union 609 | { 610 | // deprecated name 611 | pRENDERDOC_SetLogFilePathTemplate SetLogFilePathTemplate; 612 | // current name 613 | pRENDERDOC_SetCaptureFilePathTemplate SetCaptureFilePathTemplate; 614 | }; 615 | union 616 | { 617 | // deprecated name 618 | pRENDERDOC_GetLogFilePathTemplate GetLogFilePathTemplate; 619 | // current name 620 | pRENDERDOC_GetCaptureFilePathTemplate GetCaptureFilePathTemplate; 621 | }; 622 | 623 | pRENDERDOC_GetNumCaptures GetNumCaptures; 624 | pRENDERDOC_GetCapture GetCapture; 625 | 626 | pRENDERDOC_TriggerCapture TriggerCapture; 627 | 628 | // IsRemoteAccessConnected was renamed to IsTargetControlConnected in 1.1.1.
629 | // This union allows old code to continue compiling without changes 630 | union 631 | { 632 | // deprecated name 633 | pRENDERDOC_IsRemoteAccessConnected IsRemoteAccessConnected; 634 | // current name 635 | pRENDERDOC_IsTargetControlConnected IsTargetControlConnected; 636 | }; 637 | pRENDERDOC_LaunchReplayUI LaunchReplayUI; 638 | 639 | pRENDERDOC_SetActiveWindow SetActiveWindow; 640 | 641 | pRENDERDOC_StartFrameCapture StartFrameCapture; 642 | pRENDERDOC_IsFrameCapturing IsFrameCapturing; 643 | pRENDERDOC_EndFrameCapture EndFrameCapture; 644 | 645 | // new function in 1.1.0 646 | pRENDERDOC_TriggerMultiFrameCapture TriggerMultiFrameCapture; 647 | 648 | // new function in 1.2.0 649 | pRENDERDOC_SetCaptureFileComments SetCaptureFileComments; 650 | 651 | // new function in 1.4.0 652 | pRENDERDOC_DiscardFrameCapture DiscardFrameCapture; 653 | } RENDERDOC_API_1_4_2; 654 | 655 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_0_0; 656 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_0_1; 657 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_0_2; 658 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_1_0; 659 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_1_1; 660 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_1_2; 661 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_2_0; 662 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_3_0; 663 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_4_0; 664 | typedef RENDERDOC_API_1_4_2 RENDERDOC_API_1_4_1; 665 | 666 | ////////////////////////////////////////////////////////////////////////////////////////////////// 667 | // RenderDoc API entry point 668 | // 669 | // This entry point can be obtained via GetProcAddress/dlsym if RenderDoc is available. 670 | // 671 | // The name is the same as the typedef - "RENDERDOC_GetAPI" 672 | // 673 | // This function is not thread safe, and should not be called on multiple threads at once. 674 | // Ideally, call this once as early as possible in your application's startup, before doing 675 | // any API work, since some configuration functionality etc has to be done also before 676 | // initialising any APIs. 677 | // 678 | // Parameters: 679 | // version is a single value from the RENDERDOC_Version above. 680 | // 681 | // outAPIPointers will be filled out with a pointer to the corresponding struct of function 682 | // pointers. 683 | // 684 | // Returns: 685 | // 1 - if the outAPIPointers has been filled with a pointer to the API struct requested 686 | // 0 - if the requested version is not supported or the arguments are invalid. 687 | // 688 | typedef int(RENDERDOC_CC *pRENDERDOC_GetAPI)(RENDERDOC_Version version, void **outAPIPointers); 689 | 690 | #ifdef __cplusplus 691 | } // extern "C" 692 | #endif --------------------------------------------------------------------------------
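
The `RENDERDOC_GetAPI` entry point documented above is the only symbol an application needs to look up at runtime; everything else is reached through the returned struct of function pointers. Below is a minimal usage sketch, not part of `renderdoc_app.h` nor of this repository's sources: the helper names `rdoc_load_api` / `run_captured_work` and the include path are made up for illustration, and it assumes RenderDoc has already injected its module into the process (`renderdoc.dll` on Windows, `librenderdoc.so` on Linux).

```cpp
// Sketch only: query the RenderDoc in-application API and wrap some GL work
// in an explicit frame capture, using the entry points declared above.
#include "renderdoc_app.h" // assumed to be on the include path

#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

static RENDERDOC_API_1_4_2* rdoc_api = nullptr;

// Look up RENDERDOC_GetAPI if the RenderDoc module is already loaded (i.e. the
// application was launched or injected by RenderDoc), then request API 1.4.2.
void rdoc_load_api()
{
#ifdef _WIN32
    if (HMODULE mod = GetModuleHandleA("renderdoc.dll"))
    {
        pRENDERDOC_GetAPI get_api = (pRENDERDOC_GetAPI) GetProcAddress(mod, "RENDERDOC_GetAPI");
        // Returns 1 and fills rdoc_api on success (see the documentation above).
        if (!get_api || get_api(eRENDERDOC_API_Version_1_4_2, (void**) &rdoc_api) != 1)
            rdoc_api = nullptr;
    }
#else
    if (void* mod = dlopen("librenderdoc.so", RTLD_NOW | RTLD_NOLOAD))
    {
        pRENDERDOC_GetAPI get_api = (pRENDERDOC_GetAPI) dlsym(mod, "RENDERDOC_GetAPI");
        if (!get_api || get_api(eRENDERDOC_API_Version_1_4_2, (void**) &rdoc_api) != 1)
            rdoc_api = nullptr;
    }
#endif
}

// Explicitly capture a block of API work. Passing NULL for device and window
// acts as a wildcard match, as described in the capturing section above.
void run_captured_work()
{
    if (rdoc_api) rdoc_api->StartFrameCapture(nullptr, nullptr);

    // ... issue the GL compute dispatches to inspect ...

    if (rdoc_api) rdoc_api->EndFrameCapture(nullptr, nullptr);
}
```

If no convenient capture boundary exists, `rdoc_api->TriggerCapture()` can be called instead to capture the next frame presented on the active window.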