├── examples ├── gl_merge │ ├── CMakeLists.txt │ └── gl_merge.cxx ├── gl_sort │ ├── CMakeLists.txt │ └── gl_sort.cxx ├── gl_transform │ ├── CMakeLists.txt │ └── gl_transform.cxx ├── thrust_test │ └── thrust_test.cu ├── vk_merge │ ├── CMakeLists.txt │ └── vk_merge.cxx ├── vk_radix │ ├── CMakeLists.txt │ ├── timings.jpg │ └── vk_radix.cxx ├── vk_radix2 │ ├── CMakeLists.txt │ └── vk_radix2.cxx ├── vk_scan │ ├── CMakeLists.txt │ └── vk_scan.cxx ├── vk_sort │ ├── CMakeLists.txt │ └── vk_sort.cxx └── vk_transform │ ├── CMakeLists.txt │ └── vk_transform.cxx ├── inc └── mgpu │ ├── common │ ├── bindings.hxx │ ├── cta_merge.hxx │ ├── cta_mergesort.hxx │ ├── cta_radix.hxx │ ├── cta_scan.hxx │ ├── kernel_merge.hxx │ ├── kernel_mergesort.hxx │ ├── loadstore.hxx │ ├── meta.hxx │ ├── sort_networks.hxx │ ├── subgroup.hxx │ └── types.hxx │ ├── gl │ ├── app.hxx │ ├── buffer.hxx │ ├── merge.hxx │ ├── mergesort.hxx │ ├── partition.hxx │ └── transform.hxx │ └── vk │ ├── VkBootstrap.h │ ├── context.hxx │ ├── launch.hxx │ ├── merge.hxx │ ├── mergesort.hxx │ ├── partition.hxx │ ├── radix.hxx │ ├── scan.hxx │ ├── transform.hxx │ └── vk_mem_alloc.h └── src └── vk ├── VkBootstrap.cpp └── context.cxx /examples/gl_merge/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(gl_merge) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | gl_merge.cxx 8 | ../../inc 9 | ) 10 | 11 | set_source_files_properties(gl_merge.cxx PROPERTIES COMPILE_FLAGS -shader) 12 | 13 | add_executable(gl_merge ${SOURCE_FILES}) 14 | 15 | target_link_libraries(gl_merge 16 | glfw 17 | gl3w 18 | GL 19 | ) 20 | -------------------------------------------------------------------------------- /examples/gl_merge/gl_merge.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | using namespace mgpu::gl; 8 | 9 | template > 10 | std::vector gpu_merge(const std::vector& a, 11 | const std::vector& b, comp_t comp = comp_t()) { 12 | 13 | // Upload keys to OpenGL SSBOs. 14 | gl_buffer_t a_keys(a); 15 | gl_buffer_t b_keys(b); 16 | 17 | // Allocate an SSBO for the result. 18 | gl_buffer_t c_keys(a.size() + b.size()); 19 | 20 | // Merge the keys! 21 | merge_pipeline_t pipeline; 22 | pipeline.launch(a_keys, a.size(), b_keys, b.size(), c_keys, comp); 23 | 24 | // Return the result in host memory. 25 | return c_keys.get_data(); 26 | } 27 | 28 | int main() { 29 | app_t app("merge demo"); 30 | 31 | int a_count = 10000; 32 | int b_count = 10000; 33 | std::vector a(a_count), b(b_count); 34 | a[:] = rand() % 100000...; std::sort(a.begin(), a.end()); 35 | b[:] = rand() % 100000...; std::sort(b.begin(), b.end()); 36 | 37 | std::vector c = gpu_merge<128, 7>(a, b); 38 | printf("%d: %f\n", @range(), c[:])...; 39 | } -------------------------------------------------------------------------------- /examples/gl_sort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(gl_sort) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | gl_sort.cxx 8 | ) 9 | 10 | set_source_files_properties(gl_sort.cxx PROPERTIES COMPILE_FLAGS -shader) 11 | 12 | add_executable(gl_sort ${SOURCE_FILES}) 13 | 14 | target_link_libraries(gl_sort 15 | glfw 16 | gl3w 17 | GL 18 | ) 19 | -------------------------------------------------------------------------------- /examples/gl_sort/gl_sort.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using namespace mgpu::gl; 6 | 7 | // key-index sort. 8 | // sort keys in place and fill with gather indices. 9 | 10 | template 11 | void gpu_sort(std::vector& data) { 12 | gl_buffer_t a(data); 13 | gl_buffer_t b(data.size()); 14 | 15 | mergesort_pipeline_t pipeline; 16 | pipeline.template sort_keys_indices(a, b, data.size()); 17 | 18 | std::vector indices = b.get_data(); 19 | std::vector gathered = [data[indices[:]]...]; 20 | 21 | a.get_data(data.data()); 22 | 23 | bool is_inverse = gathered == data; 24 | printf("is_inverse = %d\n", is_inverse); 25 | 26 | } 27 | 28 | int main() { 29 | app_t app("sort demo"); 30 | 31 | const int nt = 128; 32 | const int vt = 7; 33 | int count = 10000; 34 | std::vector data(count); 35 | for(int i = 0; i < count; ++i) 36 | data[i] = rand() % 10000; 37 | 38 | gpu_sort(data); 39 | 40 | 41 | bool is_sorted = (... && (data[:] <= data[1:])); 42 | printf("IS SORTED = %d\n", is_sorted); 43 | } -------------------------------------------------------------------------------- /examples/gl_transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(gl_transform) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | gl_transform.cxx 8 | ) 9 | 10 | set_source_files_properties(gl_transform.cxx PROPERTIES COMPILE_FLAGS -shader) 11 | 12 | add_executable(gl_transform ${SOURCE_FILES}) 13 | 14 | target_link_libraries(gl_transform 15 | glfw 16 | gl3w 17 | GL 18 | ) 19 | -------------------------------------------------------------------------------- /examples/gl_transform/gl_transform.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace mgpu::gl; 7 | 8 | int main() { 9 | // Initialize OpenGL and create an offscreen window. 10 | app_t app("lambda test"); 11 | 12 | // Allocate storage for 10000 floats. 13 | int count = 10000; 14 | gl_buffer_t data(count); 15 | 16 | // Bind to binding=0. Return a buffer_iterator_t that samples 17 | // shader_buffer<0, float[]>. 18 | auto p = data.bind_ssbo<0>(); 19 | 20 | // Launch a compute shader from a lambda. 21 | gl_transform([=](int index) { 22 | p[index] = sqrt((float)index); 23 | }, count); 24 | 25 | std::vector data2 = data.get_data(); 26 | printf("%5d: %f\n", @range(), data2[:])...; 27 | } -------------------------------------------------------------------------------- /examples/thrust_test/thrust_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | int main(void) { 11 | typedef uint type_t; 12 | int max_count = 50'000'000; 13 | 14 | thrust::host_vector host(max_count); 15 | for(int i = 0; i < max_count; ++i) 16 | host[i] = rand() + 2 * rand(); // fill all 32 bits. 17 | 18 | // Copy in host data. 19 | thrust::device_vector gpu = host; 20 | 21 | int sizes[] { 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 25, 30, 35, 40, 45, 50 }; 22 | 23 | for(int size : sizes) { 24 | // Sort 5 billion keys at least. 25 | int count = 1'000'000 * size; 26 | int num_iterations = (int)ceil(5.0e9 / count); 27 | 28 | cudaDeviceSynchronize(); 29 | timespec start; 30 | clock_gettime(CLOCK_REALTIME, &start); 31 | 32 | for(int i = 0; i < num_iterations; ++i) 33 | thrust::sort(gpu.begin(), gpu.begin() + count); 34 | 35 | cudaDeviceSynchronize(); 36 | 37 | timespec end; 38 | clock_gettime(CLOCK_REALTIME, &end); 39 | 40 | double elapsed = (end.tv_sec - start.tv_sec) + 41 | (end.tv_nsec - start.tv_nsec) * 1.0e-9; 42 | 43 | double rate = (double)count * num_iterations / elapsed / 1.0e6; 44 | 45 | printf("%9d: %20.5f time=%f, iterations=%d\n", count, rate, elapsed, 46 | num_iterations); 47 | } 48 | 49 | return 0; 50 | } -------------------------------------------------------------------------------- /examples/vk_merge/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_merge) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_merge.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_merge.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_merge ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_merge 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_merge/vk_merge.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include 4 | #include 5 | 6 | using namespace mgpu::vk; 7 | 8 | int main() { 9 | context_t context; 10 | 11 | int a_count = 10000; 12 | int b_count = 10000; 13 | int count = a_count + b_count; 14 | 15 | float* a_host = context.alloc_cpu(a_count); 16 | float* b_host = context.alloc_cpu(b_count); 17 | float* c_host = context.alloc_cpu(a_count + b_count); 18 | 19 | // Generate test data into the staging buffers. 20 | for(int i = 0; i < a_count; ++i) 21 | a_host[i] = rand() % 100000; 22 | for(int i = 0; i < b_count; ++i) 23 | b_host[i] = rand() % 100000; 24 | 25 | // Sort both input sequences. 26 | std::sort(a_host, a_host + a_count); 27 | std::sort(b_host, b_host + b_count); 28 | 29 | float* a_gpu = context.alloc_gpu(a_count); 30 | float* b_gpu = context.alloc_gpu(b_count); 31 | float* c_gpu = context.alloc_gpu(count); 32 | 33 | // Create a command buffer. 34 | cmd_buffer_t cmd_buffer(context); 35 | cmd_buffer.begin(); 36 | 37 | // Upload to GPU memory. 38 | cmd_buffer.memcpy(a_gpu, a_host, sizeof(float) * a_count); 39 | cmd_buffer.memcpy(b_gpu, b_host, sizeof(float) * b_count); 40 | cmd_buffer.host_barrier(); 41 | 42 | // Execute the parallel merge. 43 | void* aux_data = nullptr; 44 | size_t aux_size = 0; 45 | merge(aux_data, aux_size, cmd_buffer, a_gpu, a_count, b_gpu, b_count, c_gpu, 46 | std::less()); 47 | aux_data = context.alloc_gpu(aux_size); 48 | 49 | merge(aux_data, aux_size, cmd_buffer, a_gpu, a_count, b_gpu, b_count, c_gpu, 50 | std::less()); 51 | 52 | // Retrieve the results. 53 | cmd_buffer.memcpy(c_host, c_gpu, sizeof(float) * count); 54 | cmd_buffer.host_barrier(); 55 | 56 | // End and submite the command buffer. 57 | cmd_buffer.end(); 58 | context.submit(cmd_buffer); 59 | 60 | vkQueueWaitIdle(context.queue); 61 | 62 | // Print our results. 63 | for(int i = 0; i < count; ++i) 64 | printf("%5d: %f\n", i, c_host[i]); 65 | 66 | context.free(aux_data); 67 | context.free(a_host); 68 | context.free(b_host); 69 | context.free(c_host); 70 | context.free(a_gpu); 71 | context.free(b_gpu); 72 | context.free(c_gpu); 73 | } -------------------------------------------------------------------------------- /examples/vk_radix/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_radix) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_radix.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_radix.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_radix ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_radix 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_radix/timings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seanbaxter/mgpu-shaders/ba92f0bd7db5c5a4e728823d7dc37cf07450eb5f/examples/vk_radix/timings.jpg -------------------------------------------------------------------------------- /examples/vk_radix/vk_radix.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include "mgpu/vk/radix.hxx" 4 | #include 5 | #include 6 | 7 | using namespace mgpu; 8 | using namespace mgpu::vk; 9 | 10 | int main() { 11 | context_t context; 12 | 13 | typedef uint type_t; 14 | int max_count = 50'000'000; 15 | 16 | type_t* host = context.alloc_cpu(max_count); 17 | for(int i = 0; i < max_count; ++i) 18 | host[i] = rand() + 2 * rand(); // fill all 32 bits. 19 | 20 | type_t* gpu = context.alloc_gpu(max_count); 21 | 22 | // Create a command buffer. 23 | cmd_buffer_t cmd_buffer(context); 24 | cmd_buffer.begin(); 25 | 26 | // Copy in host data. 27 | cmd_buffer.memcpy(gpu, host, sizeof(type_t) * max_count); 28 | 29 | cmd_buffer.end(); 30 | context.submit(cmd_buffer); 31 | 32 | // Allocate auxiliary storage. 33 | void* aux_data; 34 | size_t aux_size = 0; 35 | radix_sort<128, 4, 4>(aux_data, aux_size, cmd_buffer, gpu, max_count); 36 | aux_data = context.alloc_gpu(aux_size); 37 | 38 | vkQueueWaitIdle(context.queue); 39 | 40 | int sizes[] { 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 25, 30, 35, 40, 45, 50 }; 41 | 42 | enum { nt = 256, vt = 16 }; 43 | for(int size : sizes) { 44 | // Sort 5 billion keys at least. 45 | int count = 1'000'000 * size; 46 | int num_iterations = (int)ceil(5.0e9 / count); 47 | 48 | cmd_buffer.reset(); 49 | cmd_buffer.begin(); 50 | 51 | timespec start; 52 | clock_gettime(CLOCK_REALTIME, &start); 53 | 54 | cmd_buffer.begin(); 55 | for(int i = 0; i < num_iterations; ++i) 56 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 57 | cmd_buffer.end(); 58 | 59 | context.submit(cmd_buffer); 60 | vkQueueWaitIdle(context.queue); 61 | 62 | timespec end; 63 | clock_gettime(CLOCK_REALTIME, &end); 64 | 65 | double elapsed = (end.tv_sec - start.tv_sec) + 66 | (end.tv_nsec - start.tv_nsec) * 1.0e-9; 67 | 68 | double rate = (double)count * num_iterations / elapsed / 1.0e6; 69 | 70 | printf("%9d: %20.5f time=%f, iterations=%d\n", count, rate, elapsed, 71 | num_iterations); 72 | } 73 | 74 | context.free(aux_data); 75 | context.free(gpu); 76 | context.free(host); 77 | } 78 | 79 | -------------------------------------------------------------------------------- /examples/vk_radix2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_radix2) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_radix2.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_radix2.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_radix2 ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_radix2 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_radix2/vk_radix2.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include "mgpu/vk/radix.hxx" 4 | #include 5 | #include 6 | 7 | using namespace mgpu; 8 | using namespace mgpu::vk; 9 | 10 | int main() { 11 | context_t context; 12 | 13 | // Allocate test data storage. 14 | enum { nt = 256, num_bits = 8, num_bins = 1<< num_bits, vt = 1, nv = nt * vt }; 15 | 16 | typedef uint type_t; 17 | int count = nv * 32 * 32 * 32; 18 | int num_ctas = div_up(count, nv); 19 | std::vector ref(count); 20 | 21 | type_t* host = context.alloc_cpu(count); 22 | type_t* gpu = context.alloc_gpu(count); 23 | 24 | // Generate test data. 25 | for(int i = 0; i < count; ++i) { 26 | ref[i] = host[i] = rand(); 27 | } 28 | 29 | //for(int i = 0; i < count; i += nv) { 30 | // std::sort(ref.begin() + i, ref.begin() + std::min(count, i + nv)); 31 | //} 32 | std::sort(ref.begin(), ref.end()); 33 | 34 | // Create a command buffer. 35 | cmd_buffer_t cmd_buffer(context); 36 | cmd_buffer.begin(); 37 | 38 | // Upload test data to GPU memory. 39 | cmd_buffer.memcpy(gpu, host, sizeof(type_t) * count); 40 | cmd_buffer.host_barrier(); 41 | 42 | void* aux_data = nullptr; 43 | size_t aux_size = 0; 44 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 45 | aux_data = context.alloc_gpu(aux_size); 46 | 47 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 48 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 49 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 50 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 51 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 52 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 53 | 54 | // Retrieve the results. 55 | cmd_buffer.memcpy(host, gpu, sizeof(type_t) * count); 56 | cmd_buffer.host_barrier(); 57 | 58 | // End and submit the command buffer. 59 | cmd_buffer.end(); 60 | context.submit(cmd_buffer); 61 | 62 | // And wait for it to be done. 63 | vkQueueWaitIdle(context.queue); 64 | 65 | for(int i = 0; i < count; ++i) { 66 | // printf("%6d: %9d\n", i, host[i]); // - %3d - %3d\n", i, host[i], ref[i], scans[i / 32][i % 32]); 67 | // printf("%3d: %5d \n", i, host[i]); 68 | 69 | if(host[i] != ref[i]) { 70 | printf("Error at %d: %d vs %d\n", i, host[i], ref[i]); 71 | exit(1); 72 | } 73 | } 74 | 75 | printf("MATCH\n"); 76 | 77 | 78 | context.free(aux_data); 79 | context.free(host); 80 | context.free(gpu); 81 | 82 | } 83 | 84 | -------------------------------------------------------------------------------- /examples/vk_scan/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_scan) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_scan.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_scan.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_scan ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_scan 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_scan/vk_scan.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace mgpu; 11 | using namespace mgpu::vk; 12 | 13 | int main() { 14 | context_t context; 15 | 16 | // Allocate test data storage. 17 | int count = 10000; 18 | int* host = context.alloc_cpu(count); 19 | int* gpu = context.alloc_gpu(count); 20 | 21 | // Generate test data. 22 | for(int i = 0; i < count; ++i) 23 | host[i] = i; 24 | 25 | // Create a command buffer. 26 | cmd_buffer_t cmd_buffer(context); 27 | cmd_buffer.begin(); 28 | 29 | // Upload test data to GPU memory. 30 | cmd_buffer.memcpy(gpu, host, sizeof(int) * count); 31 | cmd_buffer.host_barrier(); 32 | 33 | // Execute the scan. 34 | void* aux_data = nullptr; 35 | size_t aux_size = 0; 36 | vk::scan(aux_data, aux_size, cmd_buffer, gpu, count); 37 | printf("aux size = %ld\n", aux_size); 38 | aux_data = context.alloc_gpu(aux_size); 39 | 40 | vk::scan(aux_data, aux_size, cmd_buffer, gpu, count); 41 | 42 | // Retrieve the results. 43 | cmd_buffer.memcpy(host, gpu, sizeof(int) * count); 44 | cmd_buffer.host_barrier(); 45 | 46 | // End and submit the command buffer. 47 | cmd_buffer.end(); 48 | context.submit(cmd_buffer); 49 | 50 | // And wait for it to be done. 51 | vkQueueWaitIdle(context.queue); 52 | 53 | for(int i = 0; i < count; ++i) 54 | printf("%3d: %2d\n", i, host[i]); 55 | 56 | context.free(aux_data); 57 | context.free(host); 58 | context.free(gpu); 59 | 60 | return 0; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /examples/vk_sort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_sort) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_sort.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_sort.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_sort ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_sort 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_sort/vk_sort.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include 4 | #include 5 | 6 | using namespace mgpu::vk; 7 | 8 | int main() { 9 | context_t context; 10 | 11 | int count = 10000; 12 | float* host = context.alloc_cpu(count); 13 | float* gpu = context.alloc_gpu(count); 14 | 15 | // Generate test data into the staging buffers. 16 | for(int i = 0; i < count; ++i) 17 | host[i] = rand() % 100000; 18 | 19 | // Create a command buffer. 20 | cmd_buffer_t cmd_buffer(context); 21 | cmd_buffer.begin(); 22 | 23 | // Upload to GPU memory. 24 | cmd_buffer.memcpy(gpu, host, sizeof(float) * count); 25 | cmd_buffer.host_barrier(); 26 | 27 | // Execute the parallel mergesort. 28 | void* aux_data = nullptr; 29 | size_t aux_size = 0; 30 | mergesort_keys(aux_data, aux_size, cmd_buffer, gpu, count); 31 | aux_data = context.alloc_gpu(aux_size); 32 | 33 | mergesort_keys(aux_data, aux_size, cmd_buffer, gpu, count); 34 | 35 | // Retrieve the results. 36 | cmd_buffer.memcpy(host, gpu, sizeof(float) * count); 37 | cmd_buffer.host_barrier(); 38 | 39 | // End and submit the command buffer. 40 | cmd_buffer.end(); 41 | context.submit(cmd_buffer); 42 | 43 | vkQueueWaitIdle(context.queue); 44 | 45 | // Print our results. 46 | for(int i = 0; i < count; ++i) 47 | printf("%5d: %f\n", i, host[i]); 48 | 49 | context.free(aux_data); 50 | context.free(host); 51 | context.free(gpu); 52 | } -------------------------------------------------------------------------------- /examples/vk_transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_transform) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_transform.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | ) 11 | 12 | set_source_files_properties(vk_transform.cxx PROPERTIES COMPILE_FLAGS -shader) 13 | 14 | add_executable(vk_transform ${SOURCE_FILES}) 15 | 16 | target_link_libraries(vk_transform 17 | glfw 18 | vulkan 19 | dl 20 | pthread 21 | ) 22 | -------------------------------------------------------------------------------- /examples/vk_transform/vk_transform.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace mgpu::vk; 5 | 6 | template 7 | [[using spirv: comp, local_size(NT), push]] 8 | void compute_shader(int count, type_t a, type_t* x, type_t* y) { 9 | int gid = glcomp_GlobalInvocationID.x; 10 | if(gid < count) { 11 | x[gid] = 2 * gid; // Even values. 12 | y[gid] = 2 * gid + 1; // Odd values. 13 | } 14 | } 15 | 16 | int main() { 17 | context_t context; 18 | 19 | // Allocate test data storage. 20 | int count = 100; 21 | float a = 1.618f; // A saxpy coefficient. 22 | float* x = context.alloc_gpu(count); 23 | float* y = context.alloc_gpu(count); 24 | 25 | // Create a command buffer. 26 | cmd_buffer_t cmd_buffer(context); 27 | cmd_buffer.begin(); 28 | 29 | // Three ways to launch kernels with mgpu-shaders for Vulkan: 30 | 31 | // 1. Use chevron launch syntax. This calls spirv_chevron_comp and 32 | // passes the shader reference as the template argument. In mgpu's 33 | // implementation, num_blocks and cmd_buffer are the chevron arguments. 34 | const int NT = 64; // Use 64 threads per block. 35 | int num_blocks = mgpu::div_up(count, NT); 36 | compute_shader<<>>(count, M_PIf32, x, y); 37 | 38 | // 2. Use launch(). This is like a chevron launch, but you don't even have 39 | // to write a shader. Pass it a function object or lambda. The lambda 40 | // gets called back with the glcomp_LocalInvocation.x and 41 | // glcomp_WorkGroupID.x values. As with the chevron launch, specify the 42 | // grid size. You must also specify the workgroup size as a template 43 | // argument, since you aren't defining a compute on which to attach 44 | // local_size. 45 | launch(num_blocks, cmd_buffer, [=](int tid, int cta) { 46 | // tid and cta are the thread and workgroup IDs. 47 | // combine them for a global ID or read glcomp_GlobalInvocationID.x. 48 | int gid = tid + NT * cta; 49 | 50 | // Use the default-copy closure to capture the kernel parameters. 51 | if(gid < count) { 52 | // SAXPY these terms. 53 | y[gid] += a * x[gid]; 54 | } 55 | }); 56 | 57 | // 3. Use transform(). This is for embarrassingly parallel tasks. It 58 | // executes the function object once for each request. You can pass it 59 | // the group size as an optional template argument, or use an 60 | // implementation-defined group size. 61 | transform(count, cmd_buffer, [=](int index) { 62 | x[index] *= sqrt(y[index]); 63 | }); 64 | 65 | // Copy the data to host memory. 66 | float* host = context.alloc_cpu(count); 67 | 68 | cmd_buffer.host_barrier(); 69 | context.memcpy(cmd_buffer, host, x, sizeof(float) * count); 70 | cmd_buffer.host_barrier(); 71 | 72 | // End and submit the command buffer. 73 | cmd_buffer.end(); 74 | context.submit(cmd_buffer); 75 | 76 | // And wait for it to be done. 77 | vkQueueWaitIdle(context.queue); 78 | 79 | // Print our results. 80 | for(int i = 0; i < count; ++i) 81 | printf("%3d: %f\n", i, host[i]); 82 | 83 | context.free(x); 84 | context.free(y); 85 | context.free(host); 86 | 87 | return 0; 88 | } 89 | 90 | -------------------------------------------------------------------------------- /inc/mgpu/common/bindings.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "meta.hxx" 3 | #include 4 | #include 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | template 9 | [[using spirv: in, location((int)index)]] 10 | type_t shader_in; 11 | 12 | template 13 | [[using spirv: out, location((int)index)]] 14 | type_t shader_out; 15 | 16 | template 17 | [[using spirv: uniform, binding((int)index)]] 18 | type_t shader_uniform; 19 | 20 | template 21 | [[using spirv: buffer, readonly, binding(index)]] 22 | type_t shader_readonly; 23 | 24 | template 25 | [[using spirv: buffer, writeonly, binding(index)]] 26 | type_t shader_writeonly; 27 | 28 | template 29 | [[using spirv: buffer, binding(index)]] 30 | type_t shader_buffer; 31 | 32 | //////////////////////////////////////////////////////////////////////////////// 33 | 34 | // Provide an a common iterator type. 35 | template 36 | struct iterator_t : std::iterator_traits*> { 37 | 38 | iterator_t() = default; 39 | explicit iterator_t(int offset) : offset(offset) { } 40 | 41 | iterator_t(const iterator_t&) = default; 42 | iterator_t& operator=(const iterator_t&) = default; 43 | 44 | iterator_t operator+(int diff) const noexcept { 45 | return iterator_t(offset + diff); 46 | } 47 | iterator_t& operator+=(int diff) noexcept { 48 | offset += diff; 49 | return *this; 50 | } 51 | friend iterator_t operator+(int diff, iterator_t rhs) noexcept { 52 | return iterator_t(diff + rhs.offset); 53 | } 54 | 55 | iterator_t operator-(int diff) const noexcept { 56 | return iterator_t(offset - diff); 57 | } 58 | iterator_t& operator-=(int diff) noexcept { 59 | offset -= diff; 60 | return *this; 61 | } 62 | 63 | int operator-(iterator_t rhs) const noexcept { 64 | return offset - rhs.offset; 65 | } 66 | 67 | decltype(auto) operator*() const noexcept { 68 | return accessor_t::access(offset); 69 | } 70 | 71 | decltype(auto) operator[](int index) const noexcept { 72 | return accessor_t::access(offset + index); 73 | } 74 | 75 | int offset = 0; 76 | }; 77 | 78 | template 79 | struct readonly_access_t { 80 | static type_t access(int index) noexcept { 81 | return shader_readonly[index]; 82 | } 83 | }; 84 | 85 | template 86 | using readonly_iterator_t = iterator_t >; 87 | 88 | template 89 | struct writeonly_access_t { 90 | static type_t& access(int index) noexcept { 91 | return shader_writeonly[index]; 92 | } 93 | }; 94 | template 95 | using writeonly_iterator_t = iterator_t >; 96 | 97 | template 98 | struct buffer_access_t { 99 | static type_t& access(int index) noexcept { 100 | return shader_buffer[index]; 101 | } 102 | }; 103 | template 104 | using buffer_iterator_t = iterator_t >; 105 | 106 | struct empty_iterator_t : std::iterator_traits { 107 | // Don't provide additional interface. The caller should check the 108 | // iterator_traits prior to subscripting. 109 | }; 110 | 111 | //////////////////////////////////////////////////////////////////////////////// 112 | 113 | END_MGPU_NAMESPACE 114 | 115 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "loadstore.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | constexpr int num_merge_partitions(int count, int spacing) { 7 | return div_up(count, spacing) + 1; 8 | } 9 | 10 | template 12 | int merge_path(a_keys_it a_keys, int a_count, b_keys_it b_keys, 13 | int b_count, int diag, comp_t comp) { 14 | 15 | typedef typename std::iterator_traits::value_type type_t; 16 | int begin = max(0, diag - b_count); 17 | int end = min(diag, a_count); 18 | 19 | while(begin < end) { 20 | int mid = (begin + end) / 2; 21 | type_t a_key = a_keys[mid]; 22 | type_t b_key = b_keys[diag - 1 - mid]; 23 | bool pred = (bounds_upper == bounds) ? 24 | comp(a_key, b_key) : 25 | !comp(b_key, a_key); 26 | 27 | if(pred) begin = mid + 1; 28 | else end = mid; 29 | } 30 | return begin; 31 | } 32 | 33 | template 34 | int merge_path(keys_it keys, merge_range_t range, int diag, comp_t comp) { 35 | return merge_path( 36 | keys + range.a_begin, range.a_count(), 37 | keys + range.b_begin, range.b_count(), 38 | diag, comp); 39 | } 40 | 41 | template 42 | bool merge_predicate(type_t a_key, type_t b_key, merge_range_t range, 43 | comp_t comp) { 44 | 45 | bool p; 46 | if(range_check && !range.a_valid()) p = false; 47 | else if(range_check && !range.b_valid()) p = true; 48 | else p = (bounds_upper == bounds) ? comp(a_key, b_key) : !comp(b_key, a_key); 49 | return p; 50 | } 51 | 52 | merge_range_t compute_merge_range(int a_count, int b_count, int partition, 53 | int spacing, int mp0, int mp1) { 54 | 55 | int diag0 = spacing * partition; 56 | int diag1 = min(a_count + b_count, diag0 + spacing); 57 | 58 | return merge_range_t { mp0, mp1, diag0 - mp0, diag1 - mp1 }; 59 | } 60 | 61 | template 62 | std::array load_two_streams_reg(a_it a, int a_count, b_it b, 63 | int b_count, int tid) { 64 | 65 | std::array x; 66 | strided_iterate([&](int i, int index) { 67 | x[i] = (index < a_count) ? a[index] : b[index - a_count]; 68 | }, tid, a_count + b_count); 69 | 70 | return x; 71 | } 72 | 73 | template 75 | void load_two_streams_shared(a_it a, int a_count, b_it b, int b_count, 76 | int tid, type_t (&shared)[shared_size]) { 77 | 78 | // Load into register then make an unconditional strided store into memory. 79 | std::array x = load_two_streams_reg( 80 | a, a_count, b, b_count, tid); 81 | reg_to_shared_strided(x, tid, shared); 82 | } 83 | 84 | template 85 | std::array gather_two_streams_strided(a_it a, int a_count, 86 | b_it b, int b_count, std::array indices, int tid) { 87 | 88 | std::array x; 89 | strided_iterate([&](int i, int j) { 90 | x[i] = (indices[i] < a_count) ? a[indices[i]] : b[indices[i] - a_count]; 91 | }, tid, a_count + b_count); 92 | 93 | return x; 94 | } 95 | 96 | template 97 | void transfer_two_streams_strided(a_it a, int a_count, b_it b, 98 | int b_count, std::array indices, int tid, c_it c) { 99 | 100 | typedef typename std::iterator_traits::value_type type_t; 101 | std::array x = gather_two_streams_strided(a, 102 | a_count, b, b_count, indices, tid); 103 | 104 | reg_to_mem_strided(x, tid, a_count + b_count, c); 105 | } 106 | 107 | // This function must be able to dereference keys[a_begin] and keys[b_begin], 108 | // no matter the indices for each. The caller should allocate at least 109 | // nt * vt + 1 elements for keys_shared. 110 | template 111 | merge_pair_t serial_merge(const type_t* keys_shared, 112 | merge_range_t range, comp_t comp, bool sync = true) { 113 | 114 | type_t a_key = keys_shared[range.a_begin]; 115 | type_t b_key = keys_shared[range.b_begin]; 116 | 117 | merge_pair_t merge_pair; 118 | 119 | @meta for(int i = 0; i < vt; ++i) {{ 120 | bool p = merge_predicate(a_key, b_key, range, comp); 121 | int index = p ? range.a_begin : range.b_begin; 122 | 123 | merge_pair.keys[i] = p ? a_key : b_key; 124 | merge_pair.indices[i] = index; 125 | 126 | type_t c_key = keys_shared[++index]; 127 | if(p) a_key = c_key, range.a_begin = index; 128 | else b_key = c_key, range.b_begin = index; 129 | }} 130 | 131 | if(sync) __syncthreads(); 132 | return merge_pair; 133 | } 134 | 135 | // Load arrays a and b from global memory and merge into register. 136 | template 138 | merge_pair_t cta_merge_from_mem(a_it a, b_it b, 139 | merge_range_t range_mem, int tid, comp_t comp, 140 | type_t (&keys_shared)[shared_size]) { 141 | 142 | static_assert(shared_size >= nt * vt + 1, 143 | "cta_merge_from_mem requires temporary storage of at " 144 | "least nt * vt + 1 items"); 145 | 146 | // Load the data into shared memory. 147 | load_two_streams_shared(a + range_mem.a_begin, range_mem.a_count(), 148 | b + range_mem.b_begin, range_mem.b_count(), tid, keys_shared); 149 | 150 | // Run a merge path to find the start of the serial merge for each thread. 151 | merge_range_t range_local = range_mem.to_local(); 152 | int diag = vt * tid; 153 | int mp = merge_path(keys_shared, range_local, diag, comp); 154 | 155 | // Compute the ranges of the sources in shared memory. The end iterators 156 | // of the range are inaccurate, but still facilitate exact merging, because 157 | // only vt elements will be merged. 158 | merge_pair_t merged = serial_merge(keys_shared, 159 | range_local.partition(mp, diag), comp); 160 | 161 | return merged; 162 | }; 163 | 164 | END_MGPU_NAMESPACE 165 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cta_merge.hxx" 3 | #include "sort_networks.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | constexpr int out_of_range_flags(int first, int vt, int count) { 8 | int out_of_range = min(vt, first + vt - count); 9 | int head_flags = 0; 10 | if(out_of_range > 0) { 11 | const int mask = (1<< vt) - 1; 12 | head_flags = mask & (~mask>> out_of_range); 13 | } 14 | return head_flags; 15 | } 16 | 17 | constexpr merge_range_t compute_mergesort_frame(int partition, int coop, 18 | int spacing) { 19 | 20 | int size = spacing * (coop / 2); 21 | int start = ~(coop - 1) & partition; 22 | int a_begin = spacing * start; 23 | int b_begin = spacing * start + size; 24 | 25 | return merge_range_t { 26 | a_begin, 27 | a_begin + size, 28 | b_begin, 29 | b_begin + size 30 | }; 31 | } 32 | 33 | constexpr merge_range_t compute_mergesort_range(int count, int partition, 34 | int coop, int spacing) { 35 | 36 | merge_range_t frame = compute_mergesort_frame(partition, coop, spacing); 37 | return merge_range_t { 38 | frame.a_begin, 39 | min(count, frame.a_end), 40 | min(count, frame.b_begin), 41 | min(count, frame.b_end) 42 | }; 43 | } 44 | 45 | constexpr merge_range_t compute_mergesort_range(int count, int partition, 46 | int coop, int spacing, int mp0, int mp1) { 47 | 48 | merge_range_t range = compute_mergesort_range(count, partition, 49 | coop, spacing); 50 | 51 | // Locate the diagonal from the start of the A sublist. 52 | int diag = spacing * partition - range.a_begin; 53 | 54 | // The end partition of the last cta for each merge operation is computed 55 | // and stored as the begin partition for the subsequent merge. i.e. it is 56 | // the same partition but in the wrong coordinate system, so its 0 when it 57 | // should be listSize. Correct that by checking if this is the last cta 58 | // in this merge operation. 59 | if(coop - 1 != ((coop - 1) & partition)) { 60 | range.a_end = range.a_begin + mp1; 61 | range.b_end = min(count, range.b_begin + diag + spacing - mp1); 62 | } 63 | 64 | range.a_begin = range.a_begin + mp0; 65 | range.b_begin = min(count, range.b_begin + diag - mp0); 66 | 67 | return range; 68 | } 69 | 70 | template 71 | struct cta_sort_t { 72 | static_assert(is_pow2(nt)); 73 | 74 | enum { 75 | has_values = !std::is_same_v, 76 | num_passes = s_log2(nt) 77 | }; 78 | 79 | ALIAS_UNION storage_t { 80 | key_t keys[nt * vt + 1]; 81 | val_t vals[nt * vt]; 82 | }; 83 | 84 | typedef kv_array_t array_t; 85 | 86 | template 87 | static array_t merge_pass(array_t x, int tid, int count, int pass, 88 | comp_t comp, storage_t& storage) { 89 | 90 | // Divide the CTA's keys into lists. 91 | int coop = 2<< pass; 92 | merge_range_t range = compute_mergesort_range(count, tid, coop, vt); 93 | int diag = vt * tid - range.a_begin; 94 | 95 | // Store the keys into shared memory for searching. 96 | reg_to_shared_thread(x.keys, tid, storage.keys); 97 | 98 | // Search for the merge for this thread within its list. 99 | int mp = merge_path(storage.keys, range, diag, comp); 100 | 101 | // Run a serial merge and return. 102 | merge_pair_t merge = serial_merge( 103 | storage.keys, range.partition(mp, diag), comp); 104 | x.keys = merge.keys; 105 | 106 | if constexpr(has_values) { 107 | // Reorder values through shared memory. 108 | reg_to_shared_thread(x.vals, tid, storage.vals); 109 | x.vals = shared_gather(storage.vals, merge.indices); 110 | } 111 | 112 | return x; 113 | } 114 | 115 | template 116 | static array_t block_sort(array_t x, int tid, int count, comp_t comp, 117 | storage_t& storage) { 118 | 119 | // Sort the inputs within each thread. If any threads have fewer than 120 | // vt items, use the segmented sort network to prevent out-of-range 121 | // elements from contaminating the sort. 122 | if(count < nt * vt) { 123 | int head_flags = out_of_range_flags(vt * tid, vt, count); 124 | x = odd_even_sort(x, comp, head_flags); 125 | } else 126 | x = odd_even_sort(x, comp); 127 | 128 | // Merge threads starting with a pair until all values are merged. 129 | for(int pass = 0; pass < num_passes; ++pass) 130 | x = merge_pass(x, tid, count, pass, comp, storage); 131 | 132 | return x; 133 | } 134 | }; 135 | 136 | END_MGPU_NAMESPACE 137 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_radix.hxx: -------------------------------------------------------------------------------- 1 | #include "cta_scan.hxx" 2 | #include 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | template 7 | struct radix_permute_t { 8 | typedef unsigned_int_by_size_t unsigned_type; 9 | typedef signed_int_by_size_t signed_type; 10 | 11 | static unsigned_type to_radix_bits(type_t x) { 12 | if constexpr(std::is_unsigned_v) { 13 | // Do nothing. 14 | return x; 15 | 16 | } else if constexpr(std::is_integral_v) { 17 | // Flip the most significant bit. 18 | return x ^ (1<< (8 * sizeof(type_t) - 1)); 19 | 20 | } else if constexpr(std::is_floating_point_v) { 21 | // Always flip the most significant bit. Flip all other bits if the 22 | // most significant bit started flipped. 23 | unsigned_type y = *reinterpret_cast(&x); 24 | unsigned_type mask = 25 | // Carry-in the sign bit to all lower bits 26 | ((signed_type)y>> (8 * sizeof(type_t) - 1)) | 27 | // Always set the most significant bit 28 | ((unsigned_type)1<< (8 * sizeof(type_t) - 1)); 29 | 30 | return y ^ mask; 31 | 32 | } else { 33 | static_assert("type cannot be converted to radix form"); 34 | } 35 | } 36 | 37 | static type_t from_radix_bits(unsigned_type x) { 38 | if constexpr(std::is_unsigned_v) { 39 | // Do nothing. 40 | return x; 41 | 42 | } else if constexpr(std::is_integral_v) { 43 | // Flip the most significant bit. 44 | return x ^ (1<< (8 * sizeof(type_t) - 1)); 45 | 46 | } else if constexpr(std::is_floating_point_v) { 47 | // Flip the sign bit. 48 | x ^= (unsigned_type)1<< (8 * sizeof(type_t) - 1); 49 | 50 | // Flip the lower bits if the sign bit is set. 51 | unsigned_type mask = 52 | // Carry-in the sign bit to all lower bits 53 | ((signed_type)x>> (8 * sizeof(type_t) - 1)) & 54 | // Always clear the most significant bit 55 | (((unsigned_type)1<< (8 * sizeof(type_t) - 1)) - 1); 56 | x ^= mask; 57 | 58 | return *reinterpret_cast(&x); 59 | 60 | } else { 61 | static_assert("type cannot be converted from radix form"); 62 | } 63 | } 64 | }; 65 | 66 | //////////////////////////////////////////////////////////////////////////////// 67 | 68 | enum radix_kind_t { 69 | radix_kind_shared, 70 | radix_kind_ballot, 71 | }; 72 | 73 | template 74 | struct cta_radix_rank_t; 75 | 76 | //////////////////////////////////////////////////////////////////////////////// 77 | // Use shared memory histogram to rank digits. 78 | 79 | template 80 | struct cta_radix_rank_t { 81 | enum { num_bins = 1<< num_bits, num_slots = num_bins / 2 + 1 }; 82 | typedef cta_scan_t scan_t; 83 | 84 | template 85 | struct result_t { 86 | // All threads return scatter indices for each value. 87 | std::array indices; 88 | 89 | // The first num_bins threads return the corresponding digit count. 90 | uint digit_scan; 91 | }; 92 | 93 | union storage_t { 94 | uint16_t hist16[nt * num_bins]; 95 | uint32_t hist32[nt * num_slots]; 96 | typename scan_t::storage_t scan; 97 | }; 98 | 99 | // Return the cta-wide reduction for each digit in the first num_bins 100 | // threads. 101 | template 102 | uint reduce(std::array x, storage_t& shared) { 103 | int tid = glcomp_LocalInvocationID.x; 104 | 105 | // Cooperatively zero out the histogram smem. 106 | @meta for(int i = 0; i < num_slots; ++i) 107 | shared.hist32[nt * i + tid] = 0; 108 | __syncthreads(); 109 | 110 | // Compute the histogram for each thread. Not great for bank conflicts, but 111 | // at least it's easy. 112 | @meta for(int i = 0; i < vt; ++i) 113 | ++shared.hist16[nt * x[i] + tid]; 114 | __syncthreads(); 115 | 116 | // Cooperatively scan the entire histogram. Each thread loads 9 words, 117 | // which corresponds to 18 histogram slots. The odd grain size avoids 118 | // smem bank conflicts on all architectures. 119 | uint sum = 0; 120 | uint counters[num_slots]; 121 | @meta for(int i = 0; i < num_slots; ++i) { 122 | counters[i] = shared.hist32[num_slots * tid + i]; 123 | sum += counters[i]; 124 | } 125 | __syncthreads(); 126 | 127 | // Scan the reductions. 128 | uint carry_in = scan_t().scan(sum, shared.scan).scan; 129 | carry_in += (carry_in>> 16) | (carry_in<< 16); 130 | 131 | // Write the scanned histogram back to shared memory. 132 | @meta for(int i = 0; i < num_slots; ++i) { 133 | // Add .low to .high 134 | carry_in += counters[i]<< 16; 135 | carry_in += counters[i] + (counters[i]>> 16); 136 | shared.hist32[num_slots * tid + i] = carry_in; 137 | } 138 | __syncthreads(); 139 | 140 | // Get the digit totals. This is a maximally-conflicted operation. 141 | uint digit_count = 0; 142 | if(tid < num_bins) { 143 | digit_count = shared.hist16[nt * tid + nt - 1]; 144 | int left = subgroupShuffleUp(digit_count, 1); 145 | if(tid) 146 | digit_count -= left; 147 | } 148 | 149 | __syncthreads(); 150 | return digit_count; 151 | } 152 | 153 | // Return the scatter indices for all keys plus the cta-wide scan for 154 | // each digit. 155 | template 156 | result_t scatter(std::array x, storage_t& shared) { 157 | int tid = glcomp_LocalInvocationID.x; 158 | 159 | // Cooperatively zero out the histogram smem. 160 | @meta for(int i = 0; i < num_slots; ++i) 161 | shared.hist32[nt * i + tid] = 0; 162 | __syncthreads(); 163 | 164 | // Compute the histogram for each thread. Not great for bank conflicts, but 165 | // at least it's easy. 166 | @meta for(int i = 0; i < vt; ++i) 167 | ++shared.hist16[nt * x[i] + tid]; 168 | __syncthreads(); 169 | 170 | // Cooperatively scan the entire histogram. Each thread loads 9 words, 171 | // which corresponds to 18 histogram slots. The odd grain size avoids 172 | // smem bank conflicts on all architectures. 173 | uint sum = 0; 174 | uint counters[num_slots]; 175 | @meta for(int i = 0; i < num_slots; ++i) { 176 | counters[i] = shared.hist32[num_slots * tid + i]; 177 | sum += counters[i]; 178 | } 179 | __syncthreads(); 180 | 181 | // Scan the reductions. 182 | uint carry_in = scan_t().scan(sum, shared.scan).scan; 183 | carry_in += (carry_in>> 16) | (carry_in<< 16); 184 | 185 | // Write the scanned histogram back to shared memory. 186 | @meta for(int i = 0; i < num_slots; ++i) { 187 | // Add .low to .high 188 | carry_in += counters[i]<< 16; 189 | shared.hist32[num_slots * tid + i] = carry_in; 190 | carry_in += counters[i] + (counters[i]>> 16); 191 | } 192 | __syncthreads(); 193 | 194 | // Get the digit totals. This is a maximally-conflicted operation. 195 | uint digit_scan = tid < num_bins ? shared.hist16[nt * tid] : 0; 196 | __syncthreads(); 197 | 198 | // Make a downsweep pass by counting the digits a second time. 199 | std::array scatter; 200 | @meta for(int i = 0; i < vt; ++i) 201 | scatter[i] = shared.hist16[nt * x[i] + tid]++; 202 | __syncthreads(); 203 | 204 | return { scatter, digit_scan }; 205 | } 206 | }; 207 | 208 | //////////////////////////////////////////////////////////////////////////////// 209 | // Use ballot instruction to rank digits. Currently this only works on 210 | // 32-lane subgroups. 211 | 212 | template 213 | struct cta_radix_rank_t { 214 | enum { 215 | num_bins = 1<< num_bits, 216 | warp_size = 32, 217 | num_warps = nt / warp_size, 218 | counters_per_thread = num_warps * num_bins / nt 219 | }; 220 | 221 | // Simpler to require as many threads as thehre are histogram bins. 222 | static_assert(nt >= num_bins); 223 | static_assert(num_warps * num_bins >= nt); 224 | 225 | typedef cta_scan_t scan_t; 226 | 227 | template 228 | struct result_t { 229 | // All threads return scatter indices for each value. 230 | std::array indices; 231 | 232 | // The first num_bins threads return the corresponding digit count. 233 | uint digit_scan; 234 | }; 235 | 236 | union storage_t { 237 | uint32_t counters[num_warps * num_bins]; 238 | uint32_t hist32[num_warps][num_bins]; 239 | typename scan_t::storage_t scan; 240 | }; 241 | 242 | uint32_t get_matching_lanes(uint digit) { 243 | // Start with all lanes matching. 244 | uint32_t match = -1; 245 | @meta for(int i = 0; i < num_bits; ++i) {{ 246 | const uint flag = 1<< i; 247 | uint mask = flag & digit; 248 | uint32_t b = gl_subgroupBallot(mask).x; 249 | 250 | // Clear lanes from the match if they have a different bit. 251 | if(!mask) b = ~b; 252 | match &= b; 253 | }} 254 | 255 | return match; 256 | } 257 | 258 | template 259 | uint reduce(std::array x, storage_t& shared) { 260 | // Cooperatively zero out the histogram smem. 261 | int tid = glcomp_LocalInvocationID.x; 262 | int lane = gl_SubgroupInvocationID; 263 | int warp = gl_SubgroupID; 264 | 265 | // Cooperatively zero out the shared memory. 266 | @meta for(int i = 0; i < counters_per_thread; ++i) 267 | shared.counters[nt * i + tid] = 0; 268 | __syncthreads(); 269 | 270 | // Process each digit. 271 | @meta for(int i = 0; i < vt; ++i) {{ 272 | // Get a bitfield of lanes with matching digits. 273 | uint32_t match = get_matching_lanes(x[i]); 274 | 275 | // Increment the histogram bin to indicate the digit count. 276 | // Only the lowest lane in the match mask does this. 277 | if(0 == (gl_SubgroupLtMask & match)) 278 | shared.hist32[warp][x[i]] += bitCount(match); 279 | }} 280 | __syncthreads(); 281 | 282 | // Do a digit-wise reduction across warps. 283 | int digit_count = 0; 284 | if(tid < num_bins) { 285 | @meta for(int i = 0; i < num_warps; ++i) 286 | digit_count += shared.hist32[i][tid]; 287 | } 288 | __syncthreads(); 289 | 290 | return digit_count; 291 | } 292 | 293 | // The digits provided to scatter must be in warp-strided order. 294 | // That is, they first vary by i, then by lane. 295 | template 296 | result_t scatter(std::array x, storage_t& shared) { 297 | // Cooperatively zero out the histogram smem. 298 | int tid = glcomp_LocalInvocationID.x; 299 | int lane = gl_SubgroupInvocationID; 300 | int warp = gl_SubgroupID; 301 | 302 | // Cooperatively zero out the shared memory. 303 | @meta for(int i = 0; i < counters_per_thread; ++i) 304 | shared.counters[nt * i + tid] = 0; 305 | __syncthreads(); 306 | 307 | // Process each digit. 308 | uint matches[vt]; 309 | @meta for(int i = 0; i < vt; ++i) { 310 | // Get a bitfield of lanes with matching digits. 311 | matches[i] = get_matching_lanes(x[i]); 312 | if(0 == (gl_SubgroupLtMask & matches[i])) 313 | shared.hist32[warp][x[i]] += bitCount(matches[i]); 314 | } 315 | __syncthreads(); 316 | 317 | std::array counters; 318 | if(tid < num_bins) { 319 | // Reduce the digit counts over the warps and keep a copy of the 320 | // counters. 321 | @meta for(int i = 0; i < num_warps; ++i) 322 | counters[i] = shared.hist32[i][tid]; 323 | } 324 | __syncthreads(); 325 | 326 | // Do a cooperative CTA scan. 327 | auto result = scan_t().scan(counters, shared.scan); 328 | 329 | uint digit_scan = 0; 330 | if(tid < num_bins) { 331 | // Add back into the warp counters. 332 | digit_scan = result.scan[0]; 333 | @meta for(int i = 0; i < num_warps; ++i) 334 | shared.hist32[i][tid] = result.scan[i]; 335 | } 336 | __syncthreads(); 337 | 338 | // Make a second pass and compute scatter indices. 339 | std::array scatter; 340 | @meta for(int i = 0; i < vt; ++i) {{ 341 | uint lower_mask = gl_SubgroupLtMask.x & matches[i]; 342 | uint offset = shared.hist32[warp][x[i]]; 343 | scatter[i] = offset + bitCount(lower_mask); 344 | if(0 == (gl_SubgroupLtMask & matches[i])) 345 | shared.hist32[warp][x[i]] = offset + bitCount(matches[i]); 346 | }} 347 | __syncthreads(); 348 | 349 | return { scatter, digit_scan }; 350 | } 351 | }; 352 | 353 | END_MGPU_NAMESPACE 354 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_scan.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "loadstore.hxx" 3 | #include "subgroup.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | //////////////////////////////////////////////////////////////////////////////// 8 | 9 | template 10 | struct cta_reduce_t { 11 | struct storage_t { 12 | type_t warps[nt / 8]; 13 | }; 14 | 15 | // Reduce the values across a cta. Only thread 0 returns a value. If all 16 | // threads want the value, store to shared memory and broadcast. 17 | template > 18 | type_t reduce(type_t x, storage_t& shared, op_t op = op_t()) { 19 | int warp_size = gl_SubgroupSize; 20 | int num_warps = gl_NumSubgroups; 21 | int lane = gl_SubgroupInvocationID; 22 | int warp = gl_SubgroupID; 23 | 24 | // Reduce within a warp. 25 | for(int offset = 1; offset < warp_size; offset<<= 1) { 26 | type_t y = subgroupShuffleDown(x, offset); 27 | if(lane + offset < warp_size) 28 | x = op(x, y); 29 | } 30 | 31 | // The first lane in each warp writes its reduction. 32 | if(!lane) 33 | shared.warps[warp] = x; 34 | __syncthreads(); 35 | 36 | // Scan the reductions. This assumes we can do it in one shot. 37 | if(lane < num_warps) { 38 | x = shared.warps[lane]; 39 | for(int offset = 1; offset < num_warps; offset<<= 1) { 40 | type_t y = subgroupShuffleDown(x, offset); 41 | if(lane + offset < num_warps) 42 | x = op(x, y); 43 | } 44 | } 45 | __syncthreads(); 46 | 47 | return x; 48 | } 49 | 50 | template > 51 | type_t reduce(std::array x, storage_t& shared, op_t op = op_t()) { 52 | // Reduce within a thread. 53 | @meta for(int i = 1; i < vt; ++i) 54 | x[0] = op(x[0], x[i]); 55 | 56 | // Reduce across threads. 57 | return reduce(x[0], shared, op); 58 | } 59 | }; 60 | 61 | //////////////////////////////////////////////////////////////////////////////// 62 | 63 | enum scan_type_t { 64 | scan_type_exc, 65 | scan_type_inc, 66 | }; 67 | 68 | template 0)> 69 | struct scan_result_t { 70 | type_t scan; 71 | type_t reduction; 72 | }; 73 | 74 | template 75 | struct scan_result_t { 76 | std::array scan; 77 | type_t reduction; 78 | }; 79 | 80 | template 81 | struct cta_scan_t { 82 | struct storage_t { 83 | int warps[nt / 8]; 84 | }; 85 | 86 | // Scalar scan. 87 | template< 88 | scan_type_t scan_type = scan_type_exc, 89 | typename op_t = std::plus 90 | > 91 | scan_result_t scan(type_t x, storage_t& shared, 92 | type_t init = type_t(), op_t op = op_t()) { 93 | 94 | int warp_size = gl_SubgroupSize; 95 | int num_warps = gl_NumSubgroups; 96 | int lane = gl_SubgroupInvocationID; 97 | int warp = gl_SubgroupID; 98 | 99 | // Use subgroupShuffleUp to prefix sum over a warp. 100 | for(int offset = 1; offset < warp_size; offset<<= 1) { 101 | type_t y = subgroupShuffleUp(x, offset); 102 | if(offset <= lane) 103 | x = op(y, x); 104 | } 105 | 106 | // The last lane in each warp writes its reduction. 107 | if(warp_size - 1 == lane) 108 | shared.warps[warp] = x; 109 | __syncthreads(); 110 | 111 | // Scan the reductions. This assumes we can do it in one shot. 112 | if(lane < num_warps) { 113 | type_t x = shared.warps[lane]; 114 | for(int offset = 1; offset < num_warps; offset<<= 1) { 115 | type_t y = subgroupShuffleUp(x, offset); 116 | if(offset <= lane) 117 | x = op(y, x); 118 | } 119 | shared.warps[lane] = x; 120 | } 121 | __syncthreads(); 122 | 123 | if constexpr(scan_type_exc == scan_type) { 124 | // For exclusive scan, get the value of the warp scan to the left. 125 | type_t left = subgroupShuffleUp(x, 1); 126 | x = lane ? left : init; 127 | } 128 | 129 | if(warp) 130 | x = op(shared.warps[warp - 1], x); 131 | type_t reduction = shared.warps[num_warps - 1]; 132 | __syncthreads(); 133 | 134 | return { x, reduction }; 135 | } 136 | 137 | // CTA vectorized scan. Accepts multiple values per thread and adds in 138 | // optional global carry-in. 139 | template< 140 | scan_type_t scan_type = scan_type_exc, 141 | int vt, 142 | typename op_t = std::plus 143 | > 144 | scan_result_t scan(std::array x, storage_t& shared, 145 | type_t carry_in = type_t(), type_t init = type_t(), op_t op = op_t()) { 146 | 147 | int tid = glcomp_LocalInvocationID.x; 148 | 149 | // Reduce the inputs and scan them. 150 | type_t sum = (... + x...[:]); 151 | 152 | // Scan the thread-local reductions for a carry-in for each thread. 153 | scan_result_t result = scan(sum, shared, init, op); 154 | 155 | // Perform the scan downsweep and add both global carry-in and the thread 156 | // carry-in to the values. 157 | result.reduction = op(carry_in, result.reduction); 158 | result.scan = op(carry_in, result.scan); 159 | 160 | if constexpr(scan_type_exc == scan_type) { 161 | @meta for(int i = 0; i < vt; ++i) { 162 | type_t temp = x[i]; 163 | x[i] = result.scan; 164 | result.scan += temp; 165 | } 166 | 167 | } else { 168 | // Add the carry-in. 169 | @meta for(int i = 0; i < vt; ++i) 170 | x[i] = op(result.scan, x[i]); 171 | } 172 | 173 | return { x, result.reduction }; 174 | } 175 | }; 176 | 177 | END_MGPU_NAMESPACE 178 | -------------------------------------------------------------------------------- /inc/mgpu/common/kernel_merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cta_merge.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | //////////////////////////////////////////////////////////////////////////////// 7 | // Generic merge code for a compute kernel. 8 | 9 | template< 10 | int nt, int vt, 11 | typename mp_it, 12 | typename a_keys_it, typename a_vals_it, 13 | typename b_keys_it, typename b_vals_it, 14 | typename c_keys_it, typename c_vals_it, 15 | typename comp_t 16 | > 17 | void kernel_merge( 18 | mp_it mp_data, 19 | a_keys_it a_keys, a_vals_it a_vals, int a_count, 20 | b_keys_it b_keys, b_vals_it b_vals, int b_count, 21 | c_keys_it c_keys, c_vals_it c_vals, comp_t comp) { 22 | 23 | typedef typename std::iterator_traits::value_type key_t; 24 | typedef typename std::iterator_traits::value_type val_t; 25 | 26 | const int nv = nt * vt; 27 | int tid = threadIdx.x; 28 | int cta = blockIdx.x; 29 | 30 | __shared__ ALIAS_UNION { 31 | key_t keys[nv + 1]; 32 | int indices[nv]; 33 | } shared; 34 | 35 | // Load the range for this CTA and merge the values into register. 36 | int mp0 = mp_data[cta + 0]; 37 | int mp1 = mp_data[cta + 1]; 38 | merge_range_t range = compute_merge_range(a_count, b_count, cta, nv, 39 | mp0, mp1); 40 | 41 | merge_pair_t merge = cta_merge_from_mem( 42 | a_keys, b_keys, range, tid, comp, shared.keys); 43 | 44 | int dest_offset = nv * cta; 45 | reg_to_mem_thread(merge.keys, tid, range.total(), c_keys + dest_offset, 46 | shared.keys); 47 | 48 | if constexpr(!std::is_same_v) { 49 | // Transpose the indices from thread order to strided order. 50 | std::array indices = reg_thread_to_strided(merge.indices, tid, 51 | shared.indices); 52 | 53 | // Gather the input values and merge into the output values. 54 | transfer_two_streams_strided(a_vals + range.a_begin, range.a_count(), 55 | b_vals + range.b_begin, range.b_count(), indices, tid, 56 | c_vals + dest_offset); 57 | } 58 | } 59 | 60 | END_MGPU_NAMESPACE 61 | -------------------------------------------------------------------------------- /inc/mgpu/common/kernel_mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cta_mergesort.hxx" 3 | #include "bindings.hxx" 4 | 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | //////////////////////////////////////////////////////////////////////////////// 10 | // Sort full tiles in place. 11 | 12 | template< 13 | bool sort_indices, 14 | int nt, int vt, 15 | typename keys_in_it, typename vals_in_it, 16 | typename keys_out_it, typename vals_out_it, 17 | typename comp_t 18 | > 19 | void kernel_blocksort( 20 | keys_in_it keys_in, vals_in_it vals_in, 21 | keys_out_it keys_out, vals_out_it vals_out, 22 | int count, comp_t comp) { 23 | 24 | typedef typename std::iterator_traits::value_type key_t; 25 | typedef typename std::iterator_traits::value_type val_t; 26 | enum { has_values = !std::is_same::value }; 27 | 28 | typedef cta_sort_t sort_t; 29 | sort_t sort; 30 | 31 | __shared__ typename sort_t::storage_t shared; 32 | 33 | const int nv = nt * vt; 34 | int tid = threadIdx.x; 35 | int cta = blockIdx.x; 36 | range_t tile = get_tile(cta, nv, count); 37 | 38 | // Load the keys and values. 39 | kv_array_t unsorted; 40 | unsorted.keys = mem_to_reg_thread(keys_in + tile.begin, tid, 41 | tile.count(), shared.keys); 42 | 43 | if constexpr(sort_indices) { 44 | // If we're sorting key/index pairs, sythesize the data without sampling 45 | // the counting_iterator, which would perform a trip through shared 46 | // memory. 47 | int index = vt * (nt * cta + tid); 48 | @meta for(int i = 0; i < vt; ++i) 49 | unsorted.vals[i] = index + i; 50 | 51 | } else if constexpr(has_values) { 52 | unsorted.vals = mem_to_reg_thread(vals_in + tile.begin, tid, 53 | tile.count(), shared.vals); 54 | } 55 | 56 | // Blocksort. 57 | kv_array_t sorted = sort_t().block_sort(unsorted, 58 | tid, tile.count(), comp, shared); 59 | 60 | // Store the keys and values. 61 | reg_to_mem_thread(sorted.keys, tid, tile.count(), 62 | keys_out + tile.begin, shared.keys); 63 | 64 | if constexpr(has_values) 65 | reg_to_mem_thread(sorted.vals, tid, tile.count(), 66 | vals_out + tile.begin, shared.vals); 67 | } 68 | 69 | //////////////////////////////////////////////////////////////////////////////// 70 | // Join two fully sorted sequences into one sequence. 71 | 72 | template< 73 | int nt, int vt, 74 | typename mp_it, 75 | typename keys_in_it, typename vals_in_it, 76 | typename keys_out_it, typename vals_out_it, 77 | typename comp_t 78 | > 79 | void kernel_mergesort_pass( 80 | mp_it mp_data, 81 | keys_in_it keys_in, vals_in_it vals_in, 82 | keys_out_it keys_out, vals_out_it vals_out, 83 | int count, int coop, comp_t comp) { 84 | 85 | typedef typename std::iterator_traits::value_type key_t; 86 | typedef typename std::iterator_traits::value_type val_t; 87 | enum { has_values = !std::is_same::value }; 88 | 89 | const int nv = nt * vt; 90 | int tid = threadIdx.x; 91 | int cta = blockIdx.x; 92 | 93 | __shared__ ALIAS_UNION { 94 | key_t keys[nv + 1]; 95 | int indices[nv]; 96 | } shared; 97 | 98 | range_t tile = get_tile(cta, nv, count); 99 | 100 | // Load the range for this CTA and merge the values into register. 101 | merge_range_t range = compute_mergesort_range(count, cta, coop, nv, 102 | mp_data[cta + 0], mp_data[cta + 1]); 103 | 104 | merge_pair_t merge = cta_merge_from_mem( 105 | keys_in, keys_in, range, tid, comp, shared.keys); 106 | 107 | // Store merged values back out. 108 | reg_to_mem_thread(merge.keys, tid, tile.count(), 109 | keys_out + tile.begin, shared.keys); 110 | 111 | if constexpr(has_values) { 112 | // Transpose the indices from thread order to strided order. 113 | std::array indices = reg_thread_to_strided(merge.indices, 114 | tid, shared.indices); 115 | 116 | // Gather the input values and merge into the output values. 117 | transfer_two_streams_strided(vals_in + range.a_begin, 118 | range.a_count(), vals_in + range.b_begin, range.b_count(), 119 | indices, tid, vals_out + tile.begin); 120 | } 121 | } 122 | 123 | END_MGPU_NAMESPACE 124 | -------------------------------------------------------------------------------- /inc/mgpu/common/loadstore.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "types.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | //////////////////////////////////////////////////////////////////////////////// 8 | // reg<->shared 9 | 10 | template 11 | void reg_to_shared_thread(std::array x, int tid, 12 | type_t (&shared)[shared_size], bool sync = true) { 13 | 14 | static_assert(shared_size >= nt * vt, 15 | "reg_to_shared_thread must have at least nt * vt storage"); 16 | 17 | // thread_iterate([&](int i, int j) { 18 | // shared[j] = x[i]; 19 | // }, tid); 20 | 21 | @meta for(int i = 0; i < vt; ++i) 22 | shared[vt * tid + i] = x[i]; 23 | 24 | if(sync) __syncthreads(); 25 | } 26 | 27 | template 28 | std::array shared_to_reg_thread( 29 | const type_t (&shared)[shared_size], int tid, bool sync = true) { 30 | 31 | static_assert(shared_size >= nt * vt, 32 | "reg_to_shared_thread must have at least nt * vt storage"); 33 | 34 | std::array x; 35 | thread_iterate([&](int i, int j) { 36 | x[i] = shared[j]; 37 | }, tid); 38 | if(sync) __syncthreads(); 39 | return x; 40 | } 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | 44 | template 45 | void reg_to_shared_strided(std::array x, int tid, 46 | type_t (&shared)[shared_size], bool sync = true) { 47 | 48 | static_assert(shared_size >= nt * vt, 49 | "reg_to_shared_strided must have at least nt * vt storage"); 50 | 51 | // strided_iterate([&](int i, int j) { shared[j] = x[i]; }, tid); 52 | 53 | @meta for(int i = 0; i < vt; ++i) 54 | shared[nt * i + tid] = x[i]; 55 | 56 | if(sync) __syncthreads(); 57 | } 58 | 59 | template 60 | std::array shared_to_reg_strided( 61 | const type_t (&shared)[shared_size], int tid, bool sync = true) { 62 | 63 | static_assert(shared_size >= nt * vt, 64 | "shared_to_reg_strided must have at least nt * vt storage"); 65 | 66 | std::array x; 67 | strided_iterate([&](int i, int j) { x[i] = shared[j]; }, tid); 68 | if(sync) __syncthreads(); 69 | return x; 70 | } 71 | 72 | //////////////////////////////////////////////////////////////////////////////// 73 | 74 | template 75 | std::array shared_to_reg_warp(const type_t (&shared)[shared_size], 76 | int lane, int warp, int warp_size, bool sync = true) { 77 | 78 | uint cur = vt * warp_size * warp + lane; 79 | 80 | std::array x; 81 | @meta for(int i = 0; i < vt; ++i) 82 | x[i] = shared[cur + i * warp_size]; 83 | if(sync) __syncthreads(); 84 | return x; 85 | } 86 | 87 | //////////////////////////////////////////////////////////////////////////////// 88 | 89 | template 90 | std::array shared_gather(const type_t(&data)[shared_size], 91 | std::array indices, bool sync = true) { 92 | 93 | static_assert(shared_size >= nt * vt, 94 | "shared_gather must have at least nt * vt storage"); 95 | 96 | std::array x { data[indices...[:]]... }; 97 | if(sync) __syncthreads(); 98 | return x; 99 | } 100 | 101 | template 102 | std::array thread_to_strided(std::array x, 103 | int tid, type_t (&shared)[shared_size]) { 104 | 105 | reg_to_shared_thread(x, tid, shared); 106 | return shared_to_reg_strided(shared, tid); 107 | } 108 | 109 | 110 | 111 | //////////////////////////////////////////////////////////////////////////////// 112 | // reg<->memory 113 | 114 | template 115 | void reg_to_mem_strided(std::array x, int tid, 116 | int count, it_t mem) { 117 | 118 | // strided_iterate([=](int i, int j) { 119 | // mem[j] = x[i]; 120 | // }, tid, count); 121 | @meta for(int i = 0; i < vt; ++i) {{ 122 | int k = nt * i + tid; 123 | if(k < count) 124 | mem[k] = x[i]; 125 | }} 126 | } 127 | 128 | template 129 | std::array::value_type, vt> 130 | mem_to_reg_strided(it_t mem, int tid, int count) { 131 | typedef typename std::iterator_traits::value_type type_t; 132 | std::array x; 133 | 134 | // strided_iterate([&](int i, int j) { 135 | // x[i] = mem[j]; 136 | // }, tid, count); 137 | 138 | @meta for(int i = 0; i < vt; ++i) {{ 139 | int k = nt * i + tid; 140 | if(k < count) 141 | x[i] = mem[k]; 142 | }} 143 | 144 | return x; 145 | } 146 | 147 | template 149 | void reg_to_mem_thread(std::array x, int tid, 150 | int count, it_t mem, type_t (&shared)[shared_size]) { 151 | 152 | reg_to_shared_thread(x, tid, shared); 153 | std::array y = shared_to_reg_strided(shared, tid); 154 | reg_to_mem_strided(y, tid, count, mem); 155 | } 156 | 157 | template 159 | std::array mem_to_reg_thread(it_t mem, int tid, 160 | int count, type_t (&shared)[shared_size]) { 161 | 162 | std::array x = mem_to_reg_strided(mem, tid, count); 163 | reg_to_shared_strided(x, tid, shared); 164 | std::array y = shared_to_reg_thread(shared, tid); 165 | return y; 166 | } 167 | 168 | template 169 | void mem_to_mem(input_it input, int tid, int count, 170 | output_it output) { 171 | typedef typename std::iterator_traits::value_type type_t; 172 | type_t x[vt]; 173 | 174 | strided_iterate([&](int i, int j) { 175 | x[i] = input[j]; 176 | }, tid, count); 177 | strided_iterate([&](int i, int j) { 178 | output[j] = x[i]; 179 | }, tid, count); 180 | } 181 | 182 | //////////////////////////////////////////////////////////////////////////////// 183 | // memory<->memory 184 | 185 | template 186 | void mem_to_shared(it_t mem, int tid, int count, type_t* shared, 187 | bool sync = true) { 188 | 189 | std::array x = mem_to_reg_strided(mem, tid, count); 190 | strided_iterate([&](int i, int j) { 191 | shared[j] = x[i]; 192 | }, tid, count); 193 | if(sync) __syncthreads(); 194 | } 195 | 196 | template 197 | void shared_to_mem(const type_t* shared, int tid, int count, 198 | it_t mem, bool sync = true) { 199 | 200 | strided_iterate([&](int i, int j) { 201 | mem[j] = shared[j]; 202 | }, tid, count); 203 | if(sync) __syncthreads(); 204 | } 205 | 206 | //////////////////////////////////////////////////////////////////////////////// 207 | // reg<->reg 208 | 209 | template 210 | std::array reg_thread_to_strided(std::array x, 211 | int tid, type_t (&shared)[shared_size]) { 212 | 213 | reg_to_shared_thread(x, tid, shared); 214 | return shared_to_reg_strided(shared, tid); 215 | } 216 | 217 | template 218 | std::array reg_strided_to_thread(std::array x, 219 | int tid, type_t (&shared)[shared_size]) { 220 | 221 | reg_to_shared_strided(x, tid, shared); 222 | return shared_to_reg_thread(shared, tid); 223 | } 224 | 225 | END_MGPU_NAMESPACE 226 | -------------------------------------------------------------------------------- /inc/mgpu/common/meta.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define BEGIN_MGPU_NAMESPACE namespace mgpu { 9 | #define END_MGPU_NAMESPACE } 10 | 11 | BEGIN_MGPU_NAMESPACE 12 | 13 | struct empty_t { }; 14 | 15 | template 16 | struct tuple_t { 17 | types_t @(int...) ...; 18 | }; 19 | 20 | constexpr int div_up(int x, int y) { 21 | return (x + y - 1) / y; 22 | } 23 | constexpr int64_t div_up(int64_t x, int64_t y) { 24 | return (x + y - 1) / y; 25 | } 26 | constexpr size_t div_up(size_t x, size_t y) { 27 | return (x + y - 1) / y; 28 | } 29 | 30 | template 31 | constexpr bool is_pow2(type_t x) { 32 | static_assert(std::is_integral_v); 33 | return 0 == (x & (x - 1)); 34 | } 35 | 36 | // Find log2(x) and optionally round up to the next integer logarithm. 37 | inline int find_log2(int x, bool round_up = false) { 38 | int a = 31 - __builtin_clz(x); 39 | if(round_up) a += !is_pow2(x); 40 | return a; 41 | } 42 | 43 | constexpr int s_log2(int x) { 44 | int i = 0; 45 | while(x) { 46 | x>>= 1; 47 | ++i; 48 | } 49 | return i; 50 | } 51 | 52 | template 53 | void iterate(func_t f) { 54 | @meta for(int i = 0; i < count; ++i) 55 | f(i); 56 | } 57 | 58 | // Invoke unconditionally. 59 | template 60 | void strided_iterate(func_t f, int tid) { 61 | @meta for(int i = 0; i < vt; ++i) 62 | f(i, nt * i + tid); 63 | } 64 | 65 | // Check range. 66 | template 67 | void strided_iterate(func_t f, int tid, int count) { 68 | // Unroll the first vt0 elements of each thread. 69 | if constexpr(vt0) { 70 | if(vt0 > 1 && count >= nt * vt0) { 71 | strided_iterate(f, tid); // No checking 72 | 73 | } else { 74 | @meta for(int i = 0; i < vt0; ++i) {{ 75 | int index = nt * i + tid; 76 | if(index < count) f(i, index); 77 | }} 78 | } 79 | } 80 | 81 | @meta for(int i = vt0; i < vt; ++i) {{ 82 | int index = nt * i + tid; 83 | if(index < count) f(i, index); 84 | }} 85 | } 86 | 87 | template 88 | void thread_iterate(func_t f, int tid) { 89 | @meta for(int i = 0; i < vt; ++i) 90 | f(i, vt * tid + i); 91 | } 92 | 93 | template 94 | struct integer_by_size_t; 95 | 96 | template<> struct integer_by_size_t<1> { 97 | typedef int8_t signed_type; 98 | typedef uint8_t unsigned_type; 99 | }; 100 | template<> struct integer_by_size_t<2> { 101 | typedef int16_t signed_type; 102 | typedef uint16_t unsigned_type; 103 | }; 104 | template<> struct integer_by_size_t<4> { 105 | typedef int32_t signed_type; 106 | typedef uint32_t unsigned_type; 107 | }; 108 | template<> struct integer_by_size_t<8> { 109 | typedef int64_t signed_type; 110 | typedef uint64_t unsigned_type; 111 | }; 112 | 113 | template 114 | using signed_int_by_size_t = typename integer_by_size_t::signed_type; 115 | 116 | template 117 | using unsigned_int_by_size_t = typename integer_by_size_t::unsigned_type; 118 | 119 | template 120 | type_t* advance_pointer(void*& p, size_t count) { 121 | type_t* p2 = (type_t*)p; 122 | p = p2 + count; 123 | return p2; 124 | } 125 | 126 | #ifdef ALIASED_SMEM 127 | #define ALIAS_UNION union 128 | #pragma spirv GL_EXT_shared_memory_block 129 | #else 130 | #define ALIAS_UNION struct 131 | #endif 132 | 133 | END_MGPU_NAMESPACE 134 | -------------------------------------------------------------------------------- /inc/mgpu/common/sort_networks.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "types.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | //////////////////////////////////////////////////////////////////////////////// 7 | // Odd-even transposition sorting network. Sorts keys and values in-place in 8 | // register. 9 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 10 | 11 | template 12 | std::array odd_even_sort(std::array x, comp_t comp, 13 | int flags = 0) { 14 | 15 | @meta for(int I = 0; I < vt; ++I) { 16 | @meta for(int i = 1 & I; i < vt - 1; i += 2) { 17 | if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i])) 18 | std::swap(x[i], x[i + 1]); 19 | } 20 | } 21 | return x; 22 | } 23 | 24 | template 25 | kv_array_t odd_even_sort(kv_array_t x, 26 | comp_t comp, int flags = 0) { 27 | 28 | @meta for(int I = 0; I < vt; ++I) { 29 | @meta for(int i = 1 & I; i < vt - 1; i += 2) { 30 | if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) { 31 | std::swap(x.keys[i], x.keys[i + 1]); 32 | 33 | if constexpr(!std::is_same_v) 34 | std::swap(x.vals[i], x.vals[i + 1]); 35 | } 36 | } 37 | } 38 | return x; 39 | } 40 | 41 | END_MGPU_NAMESPACE 42 | -------------------------------------------------------------------------------- /inc/mgpu/common/subgroup.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "meta.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | template 7 | type_t subgroupShuffle(type_t x, uint id) { 8 | if constexpr(std::is_array_v || 9 | requires { typename std::tuple_size::type; }) { 10 | 11 | // Shuffle elemnents of arrays and tuples. 12 | x...[:] = subgroupShuffle(x...[:], id)...; 13 | 14 | } else if constexpr(std::is_class_v) { 15 | // Shuffle all public base classes and data members of class objects. 16 | x...[:] = subgroupShuffle(x.@base_values(), id)...; 17 | x...[:] = subgroupShuffle(x.@member_values(), id)...; 18 | 19 | } else { 20 | // Plain shuffle scalars. 21 | x = gl_subgroupShuffle(x, id); 22 | } 23 | 24 | return x; 25 | } 26 | 27 | template 28 | type_t subgroupShuffleDown(type_t x, uint delta) { 29 | if constexpr(std::is_array_v || 30 | requires { typename std::tuple_size::type; }) { 31 | x...[:] = subgroupShuffleDown(x...[:], delta)...; 32 | 33 | } else if constexpr(std::is_class_v) { 34 | x...[:] = subgroupShuffleDown(x.@base_values(), delta)...; 35 | x...[:] = subgroupShuffleDown(x.@member_values(), delta)...; 36 | 37 | } else { 38 | x = gl_subgroupShuffleDown(x, delta); 39 | } 40 | return x; 41 | } 42 | 43 | template 44 | type_t subgroupShuffleUp(type_t x, uint delta) { 45 | if constexpr(std::is_array_v || 46 | requires { typename std::tuple_size::type; }) { 47 | x...[:] = subgroupShuffleUp(x...[:], delta)...; 48 | 49 | } else if constexpr(std::is_class_v) { 50 | x...[:] = subgroupShuffleUp(x.@base_values(), delta)...; 51 | x...[:] = subgroupShuffleUp(x.@member_values(), delta)...; 52 | 53 | } else { 54 | x = gl_subgroupShuffleUp(x, delta); 55 | } 56 | return x; 57 | } 58 | 59 | END_MGPU_NAMESPACE 60 | -------------------------------------------------------------------------------- /inc/mgpu/common/types.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "meta.hxx" 3 | #include 4 | #include 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | struct alignas(16) range_t { 9 | int begin, end; 10 | int size() const noexcept { return end - begin; } 11 | int count() const noexcept { return size(); } 12 | bool valid() const noexcept { return begin < end; } 13 | }; 14 | 15 | inline range_t get_tile(int cta, int nv, int count) noexcept { 16 | return range_t { nv * cta, min(count, nv * (cta + 1)) }; 17 | } 18 | 19 | struct alignas(16) merge_range_t { 20 | int a_begin, a_end, b_begin, b_end; 21 | 22 | int a_count() const noexcept { return a_end - a_begin; } 23 | int b_count() const noexcept { return b_end - b_begin; } 24 | int total() const noexcept { return a_count() + b_count(); } 25 | 26 | range_t a_range() const noexcept { 27 | return { a_begin, a_end }; 28 | } 29 | range_t b_range() const noexcept { 30 | return { b_begin, b_end }; 31 | } 32 | 33 | merge_range_t to_local() const noexcept { 34 | return { 0, a_count(), a_count(), total() }; 35 | } 36 | 37 | // Partition from mp to the end. 38 | merge_range_t partition(int mp0, int diag) const noexcept { 39 | return { a_begin + mp0, a_end, b_begin + diag - mp0, b_end }; 40 | } 41 | 42 | // Partition from mp0 to mp1. 43 | merge_range_t partition(int mp0, int diag0, int mp1, int diag1) const noexcept { 44 | return { 45 | a_begin + mp0, 46 | a_begin + mp1, 47 | b_begin + diag0 - mp0, 48 | b_begin + diag1 - mp1 49 | }; 50 | } 51 | 52 | bool a_valid() const noexcept { 53 | return a_begin < a_end; 54 | } 55 | bool b_valid() const noexcept { 56 | return b_begin < b_end; 57 | } 58 | }; 59 | 60 | template 61 | struct merge_pair_t { 62 | int mp; 63 | std::array keys; 64 | std::array indices; 65 | }; 66 | 67 | template 68 | struct kv_array_t { 69 | std::array keys; 70 | std::array vals; 71 | }; 72 | 73 | enum bounds_t { 74 | bounds_lower, 75 | bounds_upper 76 | }; 77 | 78 | END_MGPU_NAMESPACE 79 | -------------------------------------------------------------------------------- /inc/mgpu/gl/app.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "buffer.hxx" 3 | #include 4 | #include 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace gl { 10 | 11 | struct app_t { 12 | app_t(const char* name); 13 | 14 | protected: 15 | virtual void debug_callback(GLenum source, GLenum type, GLuint id, 16 | GLenum severity, GLsizei length, const GLchar* message); 17 | 18 | GLFWwindow* window = nullptr; 19 | 20 | private: 21 | static void _debug_callback(GLenum source, GLenum type, GLuint id, 22 | GLenum severity, GLsizei length, const GLchar* message, 23 | const void* user_param); 24 | 25 | }; 26 | 27 | app_t::app_t(const char* name) { 28 | glfwInit(); 29 | gl3wInit(); 30 | 31 | glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); 32 | glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6); 33 | 34 | glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); 35 | window = glfwCreateWindow(320, 240, name, nullptr, nullptr); 36 | glfwMakeContextCurrent(window); 37 | glfwSwapInterval(1); 38 | 39 | glEnable(GL_DEBUG_OUTPUT); 40 | glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); 41 | glDebugMessageCallback(_debug_callback, this); 42 | } 43 | 44 | void app_t::debug_callback(GLenum source, GLenum type, GLuint id, 45 | GLenum severity, GLsizei length, const GLchar* message) { 46 | 47 | if(GL_DEBUG_SEVERITY_HIGH == severity) { 48 | printf("OpenGL: %s\n", message); 49 | exit(1); 50 | } 51 | } 52 | 53 | void app_t::_debug_callback(GLenum source, GLenum type, GLuint id, 54 | GLenum severity, GLsizei length, const GLchar* message, 55 | const void* user_param) { 56 | 57 | app_t* app = (app_t*)user_param; 58 | app->debug_callback(source, type, id, severity, length, message); 59 | } 60 | 61 | } // namespace gl 62 | 63 | END_MGPU_NAMESPACE 64 | -------------------------------------------------------------------------------- /inc/mgpu/gl/buffer.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/bindings.hxx" 3 | 4 | #define GL_GLEXT_PROTOTYPES 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace gl { 10 | 11 | template< 12 | typename T, 13 | bool is_array = std::is_array_v, 14 | bool is_const = std::is_const_v 15 | > 16 | struct gl_buffer_t { 17 | typedef std::remove_extent_t type_t; 18 | 19 | gl_buffer_t() : buffer(0), count(0) { } 20 | 21 | gl_buffer_t(int count, const type_t* data = nullptr) noexcept : count(count) { 22 | glCreateBuffers(1, &buffer); 23 | glNamedBufferStorage(buffer, sizeof(type_t) * count, data, 24 | GL_DYNAMIC_STORAGE_BIT); 25 | } 26 | gl_buffer_t(const std::vector& data) noexcept : 27 | gl_buffer_t(data.size(), data.data()) { } 28 | 29 | ~gl_buffer_t() { 30 | if(buffer) 31 | glDeleteBuffers(1, &buffer); 32 | } 33 | 34 | gl_buffer_t(const gl_buffer_t&) = delete; 35 | gl_buffer_t& operator=(const gl_buffer_t) = delete; 36 | 37 | operator GLuint() noexcept { return buffer; } 38 | 39 | void swap(gl_buffer_t& rhs) noexcept { 40 | std::swap(buffer, rhs.buffer); 41 | std::swap(count, rhs.count); 42 | } 43 | 44 | void set_data(const type_t* data) noexcept { 45 | if(count) { 46 | assert(buffer); 47 | glNamedBufferSubData(buffer, 0, sizeof(type_t) * count, data); 48 | } 49 | } 50 | void set_data(const std::vector& data) { 51 | resize(data.size()); 52 | set_data(data.data()); 53 | } 54 | void set_data_range(const type_t* data, int first, int count) { 55 | assert(first + count <= this->count); 56 | if(count) { 57 | assert(buffer); 58 | glNamedBufferSubData(buffer, sizeof(type_t) * first, 59 | count * sizeof(type_t), data); 60 | } 61 | } 62 | 63 | void get_data(type_t* data) noexcept { 64 | if(count) { 65 | assert(buffer); 66 | glGetNamedBufferSubData(buffer, 0, sizeof(type_t) * count, data); 67 | } 68 | } 69 | 70 | void clear_bytes() { 71 | if(count && buffer) { 72 | char zero = 0; 73 | glClearNamedBufferData(buffer, GL_R8I, GL_RED_INTEGER, 74 | GL_UNSIGNED_BYTE, &zero); 75 | } 76 | } 77 | 78 | void bind_ubo(GLuint index) { 79 | glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer); 80 | } 81 | void bind_ssbo(GLuint index) { 82 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, buffer); 83 | } 84 | 85 | template 86 | buffer_iterator_t bind_ssbo() { 87 | bind_ssbo(bind); 88 | return { }; 89 | } 90 | 91 | std::vector get_data() { 92 | std::vector vec(count); 93 | get_data(vec.data()); 94 | return vec; 95 | } 96 | 97 | void resize(int count2, bool preserve = false) { 98 | if(count != count2) { 99 | gl_buffer_t buffer2(count2); 100 | 101 | if(preserve && count && count2) { 102 | // Copy the old data into the new buffer. 103 | glCopyNamedBufferSubData(buffer, buffer2, 0, 0, 104 | std::min(count, count2) * sizeof(type_t)); 105 | } 106 | 107 | std::swap(buffer, buffer2.buffer); 108 | std::swap(count, buffer2.count); 109 | } 110 | } 111 | 112 | GLuint buffer; 113 | int count; 114 | }; 115 | 116 | template 117 | struct gl_buffer_t { 118 | gl_buffer_t(const type_t* data = nullptr) noexcept { 119 | glCreateBuffers(1, &buffer); 120 | glNamedBufferStorage(buffer, sizeof(type_t), data, 121 | GL_DYNAMIC_STORAGE_BIT); 122 | } 123 | 124 | ~gl_buffer_t() { 125 | glDeleteBuffers(1, &buffer); 126 | } 127 | 128 | gl_buffer_t(const gl_buffer_t&) = delete; 129 | gl_buffer_t& operator=(const gl_buffer_t) = delete; 130 | 131 | operator GLuint() noexcept { return buffer; } 132 | 133 | void set_data(const type_t& data) noexcept { 134 | assert(buffer); 135 | glNamedBufferSubData(buffer, 0, sizeof(type_t), &data); 136 | } 137 | void get_data(type_t* data) noexcept { 138 | assert(buffer); 139 | glGetNamedBufferSubData(buffer, 0, sizeof(type_t), data); 140 | } 141 | type_t get_data() noexcept { 142 | type_t x; 143 | get_data(&x); 144 | return x; 145 | } 146 | 147 | void bind_ubo(GLuint index) { 148 | glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer); 149 | } 150 | void bind_ssbo(GLuint index) { 151 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, buffer); 152 | } 153 | 154 | GLuint buffer; 155 | }; 156 | 157 | // A const non-array type keeps a copy of the object on the CPU. 158 | template 159 | struct gl_buffer_t { 160 | typedef std::remove_const_t type_t; 161 | 162 | gl_buffer_t() : buffer(0), invalid(true) { 163 | glCreateBuffers(1, &buffer); 164 | glNamedBufferStorage(buffer, sizeof(type_t), nullptr, 165 | GL_DYNAMIC_STORAGE_BIT); 166 | } 167 | 168 | gl_buffer_t(const type_t& x) : data(x) { 169 | glCreateBuffers(1, &buffer); 170 | glNamedBufferStorage(buffer, sizeof(type_t), &data, 171 | GL_DYNAMIC_STORAGE_BIT); 172 | invalid = false; 173 | } 174 | 175 | ~gl_buffer_t() { 176 | glDeleteBuffers(1, &buffer); 177 | } 178 | 179 | void set_data(const type_t& x) noexcept { 180 | if(!data || memcmp(&x, &data, sizeof(type_t))) { 181 | data.emplace(x); 182 | invalid = true; 183 | } 184 | } 185 | 186 | void update() { 187 | if(invalid) { 188 | assert(data); 189 | glNamedBufferSubData(buffer, 0, sizeof(type_t), &*data); 190 | invalid = false; 191 | } 192 | } 193 | 194 | void bind_ubo(GLuint index) { 195 | update(); 196 | glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer); 197 | } 198 | 199 | void bind_ubo_range(GLuint index, size_t offset, size_t size) { 200 | update(); 201 | glBindBufferRange(GL_UNIFORM_BUFFER, index, buffer, offset, size); 202 | } 203 | 204 | GLuint buffer; 205 | bool invalid; 206 | std::optional data; 207 | }; 208 | 209 | } // namespace gl 210 | 211 | END_MGPU_NAMESPACE 212 | -------------------------------------------------------------------------------- /inc/mgpu/gl/merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_merge.hxx" 3 | #include "../common/bindings.hxx" 4 | #include "partition.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | namespace gl { 9 | 10 | template 11 | [[using spirv: comp, local_size(nt)]] 12 | void kernel_merge() { 13 | params_t params = shader_uniform; 14 | 15 | kernel_merge( 16 | readonly_iterator_t(), 17 | 18 | params.a_keys, 19 | params.a_vals, 20 | params.a_count, 21 | 22 | params.b_keys, 23 | params.b_vals, 24 | params.b_count, 25 | 26 | params.c_keys, 27 | params.c_vals, 28 | 29 | params.comp 30 | ); 31 | } 32 | 33 | template< 34 | typename a_keys_it, 35 | typename a_values_it, 36 | typename b_keys_it, 37 | typename b_values_it, 38 | typename c_keys_it, 39 | typename c_values_it, 40 | typename comp_t> 41 | struct merge_params_t { 42 | a_keys_it a_keys; 43 | b_keys_it b_keys; 44 | c_keys_it c_keys; 45 | 46 | int spacing; // NV * VT 47 | int a_count; 48 | int b_count; 49 | 50 | // Put the potentially empty objects together to take up less space. 51 | a_values_it a_vals; 52 | b_values_it b_vals; 53 | c_values_it c_vals; 54 | comp_t comp; 55 | }; 56 | 57 | template 58 | void launch_merge(int count) { 59 | // First launch the partition kernel. 60 | launch_partition(count, nt * vt); 61 | 62 | // Launch the CTA merge kernel. 63 | int num_ctas = div_up(count, nt * vt); 64 | gl_dispatch_kernel >(num_ctas); 65 | } 66 | 67 | // merge_pipeline_t is a convenient entry point for using the merge 68 | // kernel. It loads data from SSBOs and writes to an SSBO. Storage for 69 | // the parameters UBO and merge paths SSBO is handled automatically. 70 | template > 72 | struct merge_pipeline_t { 73 | void reserve(int count, int spacing) { 74 | int num_partitions = num_merge_partitions(count, spacing); 75 | if(num_partitions > partitions_ssbo.count) 76 | partitions_ssbo.resize(num_partitions); 77 | } 78 | 79 | template 80 | void launch(GLuint a_keys, int a_count, GLuint b_keys, int b_count, 81 | GLuint c_keys, comp_t comp = comp_t()) { 82 | 83 | static_assert(std::is_same_v); 84 | 85 | // Bind the merge path SSBO. 86 | reserve(a_count + b_count, nt * vt); 87 | partitions_ssbo.bind_ssbo(3); 88 | 89 | params_t params { }; 90 | params.spacing = nt * vt; 91 | params.a_count = a_count; 92 | params.b_count = b_count; 93 | params.comp = comp; 94 | 95 | // Upload and bind the UBO. 96 | params_ubo.set_data(params); 97 | params_ubo.bind_ubo(0); 98 | 99 | // Bind the data. 100 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, a_keys); 101 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, b_keys); 102 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, c_keys); 103 | 104 | launch_merge(a_count + b_count); 105 | } 106 | 107 | template 108 | void launch(GLuint a_keys, GLuint a_vals, int a_count, GLuint b_keys, 109 | GLuint b_vals, int b_count, GLuint c_keys, GLuint c_vals, 110 | comp_t comp = comp_t()) { 111 | 112 | static_assert(!std::is_same_v); 113 | 114 | // Bind the merge path SSBO. 115 | reserve(a_count + b_count, nt * vt); 116 | partitions_ssbo.bind_ssbo(3); 117 | 118 | params_t params { }; 119 | params.spacing = nt * vt; 120 | params.a_count = a_count; 121 | params.b_count = b_count; 122 | params.comp = comp; 123 | 124 | // Upload and bind the UBO. 125 | params_ubo.set_data(params); 126 | params_ubo.bind_ubo(0); 127 | 128 | // Bind the data. 129 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, a_keys); 130 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, b_keys); 131 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, c_keys); 132 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, a_vals); 133 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, b_vals); 134 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, c_vals); 135 | 136 | launch_merge(a_count + b_count); 137 | } 138 | 139 | typedef merge_params_t< 140 | // A 141 | readonly_iterator_t, 142 | readonly_iterator_t, 143 | 144 | // B 145 | readonly_iterator_t, 146 | readonly_iterator_t, 147 | 148 | // C 149 | writeonly_iterator_t, 150 | writeonly_iterator_t, 151 | 152 | comp_t 153 | > params_t; 154 | 155 | gl_buffer_t partitions_ssbo; 156 | gl_buffer_t params_ubo; 157 | }; 158 | 159 | } // namespace gl 160 | 161 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/gl/mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_mergesort.hxx" 3 | #include "transform.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace gl { 8 | 9 | template 10 | [[using spirv: comp, local_size(nt)]] 11 | void kernel_blocksort() { 12 | params_t params = shader_uniform; 13 | 14 | kernel_blocksort( 15 | params.keys_block, 16 | params.vals_block, 17 | params.keys_out, 18 | params.vals_out, 19 | params.count, 20 | params.comp 21 | ); 22 | 23 | // Zero out the pass identifiers at the end of the mp data. The partition 24 | // and mergesort pass kernels use these terms to know which pass they're 25 | // working on. 26 | //if(params.num_partitions + threadIdx.x + blockIdx.x) 27 | params.mp_data[params.num_partitions] = 0; 28 | } 29 | 30 | template 31 | void kernel_mergesort_partition(mp_it mp_data, keys_it keys, int count, 32 | int num_partitions, int spacing, int coop, comp_t comp) { 33 | 34 | int index = threadIdx.x + blockDim.x * blockIdx.x; 35 | if(index < num_partitions) { 36 | merge_range_t range = compute_mergesort_range(count, index, coop, spacing); 37 | int diag = min(spacing * index, count) - range.a_begin; 38 | mp_data[index] = merge_path(keys + range.a_begin, 39 | range.a_count(), keys + range.b_begin, range.b_count(), diag, comp); 40 | } 41 | } 42 | 43 | template 44 | [[using spirv: comp, local_size(128)]] 45 | void kernel_mergesort_partition() { 46 | params_t params = shader_uniform; 47 | 48 | // Load the pass. 49 | int pass = params.mp_data[params.num_partitions]; 50 | 51 | // The first thread should increment the pass. 52 | int first_thread = !threadIdx.x && !blockIdx.x; 53 | if(first_thread) 54 | params.mp_data[params.num_partitions + 1] = pass; 55 | 56 | int coop = 2<< pass; 57 | kernel_mergesort_partition( 58 | params.mp_data, 59 | params.keys_in, 60 | params.count, 61 | params.num_partitions, 62 | params.spacing, 63 | coop, 64 | params.comp 65 | ); 66 | } 67 | 68 | template 69 | [[using spirv: comp, local_size(nt)]] 70 | void kernel_mergesort_pass() { 71 | params_t params = shader_uniform; 72 | 73 | // Load the pass. 74 | int pass = params.mp_data[params.num_partitions + 1]; 75 | 76 | // The first thread should increment the pass. 77 | int first_thread = !threadIdx.x && !blockIdx.x; 78 | if(first_thread) 79 | params.mp_data[params.num_partitions] = pass + 1; 80 | 81 | int coop = 2<< pass; 82 | kernel_mergesort_pass( 83 | params.mp_data, 84 | params.keys_in, 85 | params.vals_in, 86 | params.keys_out, 87 | params.vals_out, 88 | params.count, 89 | coop, 90 | params.comp 91 | ); 92 | } 93 | 94 | template< 95 | int mp, 96 | typename keys_block_it, 97 | typename vals_block_it, 98 | typename keys_in_it, 99 | typename vals_in_it, 100 | typename keys_out_it, 101 | typename vals_out_it, 102 | typename comp_t 103 | > struct mergesort_params_t { 104 | buffer_iterator_t mp_data; 105 | 106 | // Inputs to the blocksort. 107 | keys_block_it keys_block; 108 | vals_block_it vals_block; 109 | 110 | // Inputs to the partition and merge passes. 111 | keys_in_it keys_in; 112 | vals_in_it vals_in; 113 | 114 | // Outputs for blocksort and merge passes. 115 | keys_out_it keys_out; 116 | vals_out_it vals_out; 117 | 118 | int count; 119 | int num_partitions; 120 | int spacing; 121 | comp_t comp; 122 | }; 123 | 124 | template> 126 | struct mergesort_pipeline_t { 127 | enum { has_values = !std::is_same_v }; 128 | 129 | struct info_t { 130 | int num_passes; 131 | int num_ctas; 132 | int num_partitions; 133 | int num_partition_ctas; 134 | }; 135 | 136 | info_t reserve(int count, int nv) { 137 | int num_ctas = div_up(count, nv); 138 | int num_passes = find_log2(num_ctas, true); 139 | int num_partitions = num_ctas > 1 ? num_ctas + 1 : 0; 140 | 141 | if(num_passes) { 142 | // Reserve two extra slots for the pass. 143 | partitions_ssbo.resize(num_partitions + 2); 144 | keys_ssbo.resize(count); 145 | if(has_values) 146 | vals_ssbo.resize(count); 147 | } 148 | 149 | int num_partition_ctas = div_up(num_partitions, 128); 150 | return { num_passes, num_ctas, num_partitions, num_partition_ctas }; 151 | } 152 | 153 | template 154 | void sort_keys(GLuint keys, int count, comp_t comp = comp_t()) { 155 | static_assert(!has_values); 156 | const int nv = nt * vt; 157 | 158 | if(!count) return; 159 | 160 | params_t params { }; 161 | info_t info = reserve(count, nv); 162 | 163 | params.count = count; 164 | params.spacing = nv; 165 | params.num_partitions = info.num_partitions; 166 | params.comp = comp; 167 | 168 | // Ping pong with this buffer. 169 | GLuint keys2 = keys_ssbo.buffer; 170 | 171 | // Upload the UBO. 172 | params_ubo.set_data(params); 173 | params_ubo.bind_ubo(0); 174 | 175 | // Bind the partitions buffer. 176 | if(info.num_passes) 177 | partitions_ssbo.bind_ssbo(2); 178 | 179 | // Execute the block sort. 180 | if(info.num_passes % 2) { 181 | // Read the input and write to the aux buffer. 182 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 183 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 184 | std::swap(keys, keys2); 185 | 186 | } else { 187 | // Read the input and write to the input. 188 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 189 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys); 190 | } 191 | 192 | // Launch the blocksort kernel. 193 | gl_dispatch_kernel >( 194 | info.num_ctas 195 | ); 196 | 197 | // Execute the merge passes. 198 | for(int pass = 0; pass < info.num_passes; ++pass) { 199 | // Bind the inputs and outputs. 200 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 201 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 202 | 203 | // Launch the partitions kernel. 204 | gl_dispatch_kernel >( 205 | info.num_partition_ctas); 206 | 207 | // Launch the mergesort pass kernel. 208 | gl_dispatch_kernel >( 209 | info.num_ctas 210 | ); 211 | 212 | // Swap the source and destintation buffers. 213 | std::swap(keys, keys2); 214 | } 215 | } 216 | 217 | template 218 | void sort_keys_indices(GLuint keys, GLuint vals, int count, 219 | comp_t comp = comp_t()) { 220 | 221 | sort_keys_values(keys, vals, count, comp); 222 | } 223 | 224 | template 225 | void sort_keys_values(GLuint keys, GLuint vals, int count, 226 | comp_t comp = comp_t()) { 227 | 228 | static_assert(has_values); 229 | const int nv = nt * vt; 230 | 231 | if(!count) return; 232 | 233 | params_t params { }; 234 | info_t info = reserve(count, nv); 235 | 236 | params.count = count; 237 | params.spacing = nv; 238 | params.num_partitions = info.num_partitions; 239 | params.comp = comp; 240 | 241 | // Ping pong with this buffer. 242 | GLuint keys2 = keys_ssbo.buffer; 243 | GLuint vals2 = vals_ssbo.buffer; 244 | 245 | // Upload the UBO. 246 | params_ubo.set_data(params); 247 | params_ubo.bind_ubo(0); 248 | 249 | // Bind the partitions buffer. 250 | if(info.num_passes) 251 | partitions_ssbo.bind_ssbo(2); 252 | 253 | // Execute the block sort. 254 | if(info.num_passes % 2) { 255 | // Read the input and write to the aux buffer. 256 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 257 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 258 | 259 | if constexpr(!sort_indices) 260 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, vals); 261 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, vals2); 262 | 263 | std::swap(keys, keys2); 264 | std::swap(vals, vals2); 265 | 266 | } else { 267 | // Read the input and write to the input. 268 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 269 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys); 270 | 271 | if constexpr(!sort_indices) 272 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, vals); 273 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, vals); 274 | } 275 | 276 | // Launch the blocksort kernel. 277 | gl_dispatch_kernel >( 278 | info.num_ctas 279 | ); 280 | 281 | // Execute the merge passes. 282 | for(int pass = 0; pass < info.num_passes; ++pass) { 283 | // Bind the inputs and outputs. 284 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 285 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 286 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, vals); 287 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, vals2); 288 | 289 | // Launch the partitions kernel. 290 | gl_dispatch_kernel >( 291 | info.num_partition_ctas); 292 | 293 | // Launch the mergesort pass kernel. 294 | gl_dispatch_kernel >( 295 | info.num_ctas 296 | ); 297 | 298 | // Swap the source and destintation buffers. 299 | std::swap(keys, keys2); 300 | std::swap(vals, vals2); 301 | } 302 | } 303 | 304 | typedef mergesort_params_t< 305 | 2, // 2 is reserved for partitions. 306 | buffer_iterator_t, 307 | buffer_iterator_t, 308 | readonly_iterator_t, 309 | readonly_iterator_t, 310 | buffer_iterator_t, 311 | buffer_iterator_t, 312 | comp_t 313 | > params_t; 314 | 315 | // Keep a parameters UBO. The value is cached so glNamedBufferSubData is only 316 | // called when something changes. 317 | gl_buffer_t params_ubo; 318 | 319 | // Keep storage for keys and values to ping-pong between passes. 320 | gl_buffer_t keys_ssbo; 321 | gl_buffer_t vals_ssbo; 322 | gl_buffer_t partitions_ssbo; 323 | }; 324 | 325 | } // namespace gl 326 | 327 | END_MGPU_NAMESPACE 328 | -------------------------------------------------------------------------------- /inc/mgpu/gl/partition.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_merge.hxx" 3 | #include "transform.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace gl { 8 | 9 | template 10 | [[using spirv: comp, local_size(128)]] 11 | void kernel_partition() { 12 | // Load the kernel parameters from the uniform buffer at binding=ubo. 13 | params_t params = shader_uniform; 14 | int a_count = params.a_count; 15 | int b_count = params.b_count; 16 | int spacing = params.spacing; 17 | 18 | int num_partitions = num_merge_partitions(a_count + b_count, spacing); 19 | int index = threadIdx.x + blockDim.x * blockIdx.x; 20 | 21 | if(index < num_partitions) { 22 | int diag = min(spacing * index, a_count + b_count); 23 | 24 | writeonly_iterator_t mp_data; 25 | mp_data[index] = merge_path(params.a_keys, a_count, params.b_keys, 26 | b_count, diag, params.comp); 27 | } 28 | } 29 | 30 | template 31 | void launch_partition(int count, int spacing) { 32 | int num_ctas = div_up(num_merge_partitions(count, spacing), 128); 33 | gl_dispatch_kernel >(num_ctas); 34 | } 35 | 36 | } // namespace gl 37 | 38 | END_MGPU_NAMESPACE 39 | -------------------------------------------------------------------------------- /inc/mgpu/gl/transform.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "buffer.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace gl { 8 | 9 | template 10 | void gl_dispatch_kernel(int x, bool membar = true) { 11 | static GLuint program = 0; 12 | if(!program) { 13 | GLuint cs = glCreateShader(GL_COMPUTE_SHADER); 14 | glShaderBinary(1, &cs, GL_SHADER_BINARY_FORMAT_SPIR_V_ARB, 15 | __spirv_data, __spirv_size); 16 | glSpecializeShader(cs, @spirv(kernel), 0, nullptr, nullptr); 17 | 18 | program = glCreateProgram(); 19 | glAttachShader(program, cs); 20 | glLinkProgram(program); 21 | } 22 | 23 | if(x) { 24 | glUseProgram(program); 25 | glDispatchCompute(x, 1, 1); 26 | 27 | if(membar) 28 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 29 | } 30 | } 31 | 32 | template 33 | [[using spirv: comp, local_size(nt)]] 34 | void kernel_transform() { 35 | data_t data = shader_uniform; 36 | 37 | int gid = threadIdx.x + nt * blockIdx.x; 38 | if(gid < data.count) 39 | data.func(gid); 40 | } 41 | 42 | template 43 | void gl_transform(func_t func, int count, bool membar = true) { 44 | static_assert(std::is_copy_constructible_v); 45 | 46 | struct data_t { 47 | func_t func; 48 | int count; 49 | }; 50 | 51 | // Keep a cache for the UBO. Only calls glNamedBufferSubData if 52 | // its contents are different from the last bind operation. 53 | static gl_buffer_t buffer; 54 | buffer.set_data({ func, count }); 55 | buffer.bind_ubo(ubo); 56 | 57 | int num_ctas = div_up(count, nt); 58 | gl_dispatch_kernel >(num_ctas, membar); 59 | } 60 | 61 | } // namespace gl 62 | 63 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/vk/VkBootstrap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 3 | * documentation files (the “Software”), to deal in the Software without restriction, including without 4 | * limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 5 | * of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | * 7 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | * 9 | * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 10 | * LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 11 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 12 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 13 | * 14 | * Copyright © 2020 Charles Giessen (charles@lunarg.com) 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | 27 | namespace vkb { 28 | 29 | namespace detail { 30 | 31 | struct Error { 32 | std::error_code type; 33 | VkResult vk_result = VK_SUCCESS; // optional error value if a vulkan call failed 34 | }; 35 | 36 | template class Result { 37 | public: 38 | Result (const T& value) : m_value{ value }, m_init{ true } {} 39 | Result (T&& value) : m_value{ std::move (value) }, m_init{ true } {} 40 | 41 | Result (Error error) : m_error{ error }, m_init{ false } {} 42 | 43 | Result (std::error_code error_code, VkResult result = VK_SUCCESS) 44 | : m_error{ error_code, result }, m_init{ false } {} 45 | 46 | ~Result () { destroy (); } 47 | Result (Result const& expected) : m_init (expected.m_init) { 48 | if (m_init) 49 | new (&m_value) T{ expected.m_value }; 50 | else 51 | m_error = expected.m_error; 52 | } 53 | Result (Result&& expected) : m_init (expected.m_init) { 54 | if (m_init) 55 | new (&m_value) T{ std::move (expected.m_value) }; 56 | else 57 | m_error = std::move (expected.m_error); 58 | expected.destroy (); 59 | } 60 | 61 | Result& operator= (const T& expect) { 62 | destroy (); 63 | m_init = true; 64 | new (&m_value) T{ expect }; 65 | return *this; 66 | } 67 | Result& operator= (T&& expect) { 68 | destroy (); 69 | m_init = true; 70 | new (&m_value) T{ std::move (expect) }; 71 | return *this; 72 | } 73 | Result& operator= (const Error& error) { 74 | destroy (); 75 | m_init = false; 76 | m_error = error; 77 | return *this; 78 | } 79 | Result& operator= (Error&& error) { 80 | destroy (); 81 | m_init = false; 82 | m_error = error; 83 | return *this; 84 | } 85 | // clang-format off 86 | const T* operator-> () const { assert (m_init); return &m_value; } 87 | T* operator-> () { assert (m_init); return &m_value; } 88 | const T& operator* () const& { assert (m_init); return m_value; } 89 | T& operator* () & { assert (m_init); return m_value; } 90 | T&& operator* () && { assert (m_init); return std::move (m_value); } 91 | const T& value () const& { assert (m_init); return m_value; } 92 | T& value () & { assert (m_init); return m_value; } 93 | const T&& value () const&& { assert (m_init); return std::move (m_value); } 94 | T&& value () && { assert (m_init); return std::move (m_value); } 95 | 96 | std::error_code error() const { assert (!m_init); return m_error.type; } 97 | VkResult vk_result() const { assert (!m_init); return m_error.vk_result; } 98 | // clang-format on 99 | 100 | 101 | bool has_value () const { return m_init; } 102 | explicit operator bool () const { return m_init; } 103 | 104 | private: 105 | void destroy () { 106 | if (m_init) m_value.~T (); 107 | } 108 | union { 109 | T m_value; 110 | Error m_error; 111 | }; 112 | bool m_init; 113 | }; 114 | 115 | } // namespace detail 116 | 117 | enum class InstanceError { 118 | vulkan_unavailable, 119 | vulkan_version_unavailable, 120 | vulkan_version_1_1_unavailable, 121 | vulkan_version_1_2_unavailable, 122 | failed_create_instance, 123 | failed_create_debug_messenger, 124 | requested_layers_not_present, 125 | requested_extensions_not_present, 126 | windowing_extensions_not_present, 127 | }; 128 | enum class PhysicalDeviceError { 129 | no_surface_provided, 130 | failed_enumerate_physical_devices, 131 | no_physical_devices_found, 132 | no_suitable_device, 133 | }; 134 | enum class QueueError { 135 | present_unavailable, 136 | graphics_unavailable, 137 | compute_unavailable, 138 | transfer_unavailable, 139 | queue_index_out_of_range, 140 | invalid_queue_family_index 141 | }; 142 | enum class DeviceError { 143 | failed_create_device, 144 | }; 145 | enum class SwapchainError { 146 | surface_handle_not_provided, 147 | failed_query_surface_support_details, 148 | failed_create_swapchain, 149 | failed_get_swapchain_images, 150 | failed_create_swapchain_image_views, 151 | }; 152 | 153 | std::error_code make_error_code (InstanceError instance_error); 154 | std::error_code make_error_code (PhysicalDeviceError physical_device_error); 155 | std::error_code make_error_code (QueueError queue_error); 156 | std::error_code make_error_code (DeviceError device_error); 157 | std::error_code make_error_code (SwapchainError swapchain_error); 158 | 159 | const char* to_string_message_severity (VkDebugUtilsMessageSeverityFlagBitsEXT s); 160 | const char* to_string_message_type (VkDebugUtilsMessageTypeFlagsEXT s); 161 | 162 | const char* to_string (InstanceError err); 163 | const char* to_string (PhysicalDeviceError err); 164 | const char* to_string (QueueError err); 165 | const char* to_string (DeviceError err); 166 | const char* to_string (SwapchainError err); 167 | 168 | // Gathers useful information about the available vulkan capabilities, like layers and instance extensions. 169 | // Use this for enabling features conditionally, ie if you would like an extension but can use a fallback if 170 | // it isn't supported but need to know if support is available first. 171 | struct SystemInfo { 172 | private: 173 | SystemInfo (); 174 | 175 | public: 176 | // Use get_system_info to create a SystemInfo struct. This is because loading vulkan could fail. 177 | static detail::Result get_system_info (); 178 | static detail::Result get_system_info (PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr); 179 | 180 | // Returns true if a layer is available 181 | bool is_layer_available (const char* layer_name) const; 182 | // Returns true if an extension is available 183 | bool is_extension_available (const char* extension_name) const; 184 | 185 | std::vector available_layers; 186 | std::vector available_extensions; 187 | bool validation_layers_available = false; 188 | bool debug_utils_available = false; 189 | }; 190 | 191 | 192 | class InstanceBuilder; 193 | class PhysicalDeviceSelector; 194 | 195 | struct Instance { 196 | VkInstance instance = VK_NULL_HANDLE; 197 | VkDebugUtilsMessengerEXT debug_messenger = VK_NULL_HANDLE; 198 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 199 | 200 | PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr = nullptr; 201 | 202 | private: 203 | bool headless = false; 204 | uint32_t instance_version = VK_MAKE_VERSION (1, 0, 0); 205 | 206 | friend class InstanceBuilder; 207 | friend class PhysicalDeviceSelector; 208 | }; 209 | 210 | void destroy_instance (Instance instance); // release instance resources 211 | 212 | class InstanceBuilder { 213 | public: 214 | // Default constructor, will load vulkan. 215 | explicit InstanceBuilder (); 216 | // Optional: Can use your own PFN_vkGetInstanceProcAddr 217 | explicit InstanceBuilder (PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr); 218 | 219 | // Create a VkInstance. Return an error if it failed. 220 | detail::Result build () const; 221 | 222 | // Sets the name of the application. Defaults to "" if none is provided. 223 | InstanceBuilder& set_app_name (const char* app_name); 224 | // Sets the name of the engine. Defaults to "" if none is provided. 225 | InstanceBuilder& set_engine_name (const char* engine_name); 226 | // Sets the (major, minor, patch) version of the application. 227 | InstanceBuilder& set_app_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 228 | // Sets the (major, minor, patch) version of the engine. 229 | InstanceBuilder& set_engine_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 230 | // Require a vulkan instance API version. Will fail to create if this version isn't available. 231 | InstanceBuilder& require_api_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 232 | // Prefer a vulkan instance API version. If the desired version isn't available, it will use the highest version available. 233 | InstanceBuilder& desire_api_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 234 | 235 | // Adds a layer to be enabled. Will fail to create an instance if the layer isn't available. 236 | InstanceBuilder& enable_layer (const char* layer_name); 237 | // Adds an extension to be enabled. Will fail to create an instance if the extension isn't available. 238 | InstanceBuilder& enable_extension (const char* extension_name); 239 | 240 | // Headless Mode does not load the required extensions for presentation. Defaults to true. 241 | InstanceBuilder& set_headless (bool headless = true); 242 | 243 | // Enables the validation layers. Will fail to create an instance if the validation layers aren't available. 244 | InstanceBuilder& enable_validation_layers (bool require_validation = true); 245 | // Checks if the validation layers are available and loads them if they are. 246 | InstanceBuilder& request_validation_layers (bool enable_validation = true); 247 | 248 | // Use a default debug callback that prints to standard out. 249 | InstanceBuilder& use_default_debug_messenger (); 250 | // Provide a user defined debug callback. 251 | InstanceBuilder& set_debug_callback (PFN_vkDebugUtilsMessengerCallbackEXT callback); 252 | // Set what message severity is needed to trigger the callback. 253 | InstanceBuilder& set_debug_messenger_severity (VkDebugUtilsMessageSeverityFlagsEXT severity); 254 | // Add a message severity to the list that triggers the callback. 255 | InstanceBuilder& add_debug_messenger_severity (VkDebugUtilsMessageSeverityFlagsEXT severity); 256 | // Set what message type triggers the callback. 257 | InstanceBuilder& set_debug_messenger_type (VkDebugUtilsMessageTypeFlagsEXT type); 258 | // Add a message type to the list of that triggers the callback. 259 | InstanceBuilder& add_debug_messenger_type (VkDebugUtilsMessageTypeFlagsEXT type); 260 | 261 | // Disable some validation checks. 262 | // Checks: All, and Shaders 263 | InstanceBuilder& add_validation_disable (VkValidationCheckEXT check); 264 | 265 | // Enables optional parts of the validation layers. 266 | // Parts: best practices, gpu assisted, and gpu assisted reserve binding slot. 267 | InstanceBuilder& add_validation_feature_enable (VkValidationFeatureEnableEXT enable); 268 | 269 | // Disables sections of the validation layers. 270 | // Options: All, shaders, thread safety, api parameters, object lifetimes, core checks, and unique handles. 271 | InstanceBuilder& add_validation_feature_disable (VkValidationFeatureDisableEXT disable); 272 | 273 | // Provide custom allocation callbacks. 274 | InstanceBuilder& set_allocation_callbacks (VkAllocationCallbacks* callbacks); 275 | 276 | private: 277 | struct InstanceInfo { 278 | // VkApplicationInfo 279 | const char* app_name = nullptr; 280 | const char* engine_name = nullptr; 281 | uint32_t application_version = 0; 282 | uint32_t engine_version = 0; 283 | uint32_t required_api_version = VK_MAKE_VERSION (1, 0, 0); 284 | uint32_t desired_api_version = VK_MAKE_VERSION (1, 0, 0); 285 | 286 | // VkInstanceCreateInfo 287 | std::vector layers; 288 | std::vector extensions; 289 | VkInstanceCreateFlags flags = 0; 290 | std::vector pNext_elements; 291 | 292 | // debug callback 293 | PFN_vkDebugUtilsMessengerCallbackEXT debug_callback = nullptr; 294 | VkDebugUtilsMessageSeverityFlagsEXT debug_message_severity = 295 | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; 296 | VkDebugUtilsMessageTypeFlagsEXT debug_message_type = 297 | VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | 298 | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; 299 | 300 | // validation features 301 | std::vector disabled_validation_checks; 302 | std::vector enabled_validation_features; 303 | std::vector disabled_validation_features; 304 | 305 | // Custom allocator 306 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 307 | 308 | bool request_validation_layers = false; 309 | bool enable_validation_layers = false; 310 | bool use_debug_messenger = false; 311 | bool headless_context = false; 312 | 313 | PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr = nullptr; 314 | } info; 315 | }; 316 | 317 | VKAPI_ATTR VkBool32 VKAPI_CALL default_debug_callback (VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, 318 | VkDebugUtilsMessageTypeFlagsEXT messageType, 319 | const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, 320 | void* pUserData); 321 | 322 | void destroy_debug_utils_messenger(VkInstance const instance, VkDebugUtilsMessengerEXT const messenger, VkAllocationCallbacks* allocation_callbacks = nullptr); 323 | 324 | // ---- Physical Device ---- // 325 | class PhysicalDeviceSelector; 326 | class DeviceBuilder; 327 | 328 | struct PhysicalDevice { 329 | VkPhysicalDevice physical_device = VK_NULL_HANDLE; 330 | VkSurfaceKHR surface = VK_NULL_HANDLE; 331 | 332 | VkPhysicalDeviceFeatures features{}; 333 | VkPhysicalDeviceProperties properties{}; 334 | VkPhysicalDeviceMemoryProperties memory_properties{}; 335 | 336 | // Has a queue family that supports compute operations but not graphics nor transfer. 337 | bool has_dedicated_compute_queue () const; 338 | // Has a queue family that supports transfer operations but not graphics nor compute. 339 | bool has_dedicated_transfer_queue () const; 340 | 341 | // Has a queue family that supports transfer operations but not graphics. 342 | bool has_separate_compute_queue () const; 343 | // Has a queue family that supports transfer operations but not graphics. 344 | bool has_separate_transfer_queue () const; 345 | 346 | // Advanced: Get the VkQueueFamilyProperties of the device if special queue setup is needed 347 | std::vector get_queue_families () const; 348 | 349 | private: 350 | std::vector extensions_to_enable; 351 | std::vector queue_families; 352 | bool defer_surface_initialization = false; 353 | friend class PhysicalDeviceSelector; 354 | friend class DeviceBuilder; 355 | }; 356 | 357 | enum class PreferredDeviceType { 358 | other = 0, 359 | integrated = 1, 360 | discrete = 2, 361 | virtual_gpu = 3, 362 | cpu = 4 363 | }; 364 | 365 | class PhysicalDeviceSelector { 366 | public: 367 | // Requires a vkb::Instance to construct, needed to pass instance creation info. 368 | explicit PhysicalDeviceSelector (Instance const& instance); 369 | 370 | detail::Result select () const; 371 | 372 | // Set the surface in which the physical device should render to. 373 | PhysicalDeviceSelector& set_surface (VkSurfaceKHR surface); 374 | // Set the desired physical device type to select. Defaults to PreferredDeviceType::discrete. 375 | PhysicalDeviceSelector& prefer_gpu_device_type (PreferredDeviceType type = PreferredDeviceType::discrete); 376 | // Allow selection of a gpu device type that isn't the preferred physical device type. Defaults to true. 377 | PhysicalDeviceSelector& allow_any_gpu_device_type (bool allow_any_type = true); 378 | 379 | // Require that a physical device supports presentation. Defaults to true. 380 | PhysicalDeviceSelector& require_present (bool require = true); 381 | 382 | // Require a queue family that supports compute operations but not graphics nor transfer. 383 | PhysicalDeviceSelector& require_dedicated_compute_queue (); 384 | // Require a queue family that supports transfer operations but not graphics nor compute. 385 | PhysicalDeviceSelector& require_dedicated_transfer_queue (); 386 | 387 | // Require a queue family that supports compute operations but not graphics. 388 | PhysicalDeviceSelector& require_separate_compute_queue (); 389 | // Require a queue family that supports transfer operations but not graphics. 390 | PhysicalDeviceSelector& require_separate_transfer_queue (); 391 | 392 | // Require a memory heap from VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT with `size` memory available. 393 | PhysicalDeviceSelector& required_device_memory_size (VkDeviceSize size); 394 | // Prefer a memory heap from VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT with `size` memory available. 395 | PhysicalDeviceSelector& desired_device_memory_size (VkDeviceSize size); 396 | 397 | // Require a physical device which supports a specific extension. 398 | PhysicalDeviceSelector& add_required_extension (const char* extension); 399 | // Require a physical device which supports a set of extensions. 400 | PhysicalDeviceSelector& add_required_extensions (std::vector extensions); 401 | 402 | // Prefer a physical device which supports a specific extension. 403 | PhysicalDeviceSelector& add_desired_extension (const char* extension); 404 | // Prefer a physical device which supports a set of extensions. 405 | PhysicalDeviceSelector& add_desired_extensions (std::vector extensions); 406 | 407 | // Prefer a physical device that supports a (major, minor) version of vulkan. 408 | PhysicalDeviceSelector& set_desired_version (uint32_t major, uint32_t minor); 409 | // Require a physical device that supports a (major, minor) version of vulkan. 410 | PhysicalDeviceSelector& set_minimum_version (uint32_t major, uint32_t minor); 411 | 412 | // Require a physical device which supports the features in VkPhysicalDeviceFeatures. 413 | PhysicalDeviceSelector& set_required_features (VkPhysicalDeviceFeatures features); 414 | 415 | // Used when surface creation happens after physical device selection. 416 | // Warning: This disables checking if the physical device supports a given surface. 417 | PhysicalDeviceSelector& defer_surface_initialization (); 418 | 419 | // Ignore all criteria and choose the first physical device that is available. 420 | // Only use when: The first gpu in the list may be set by global user preferences and an application may wish to respect it. 421 | PhysicalDeviceSelector& select_first_device_unconditionally (bool unconditionally = true); 422 | 423 | private: 424 | struct SystemInfo { 425 | VkInstance instance = VK_NULL_HANDLE; 426 | VkSurfaceKHR surface = VK_NULL_HANDLE; 427 | bool headless = false; 428 | } system_info; 429 | 430 | struct PhysicalDeviceDesc { 431 | VkPhysicalDevice phys_device = VK_NULL_HANDLE; 432 | std::vector queue_families; 433 | 434 | VkPhysicalDeviceFeatures device_features{}; 435 | VkPhysicalDeviceProperties device_properties{}; 436 | VkPhysicalDeviceMemoryProperties mem_properties{}; 437 | }; 438 | PhysicalDeviceDesc populate_device_details (VkPhysicalDevice phys_device) const; 439 | 440 | struct SelectionCriteria { 441 | PreferredDeviceType preferred_type = PreferredDeviceType::discrete; 442 | bool allow_any_type = true; 443 | bool require_present = true; 444 | bool require_dedicated_transfer_queue = false; 445 | bool require_dedicated_compute_queue = false; 446 | bool require_separate_transfer_queue = false; 447 | bool require_separate_compute_queue = false; 448 | VkDeviceSize required_mem_size = 0; 449 | VkDeviceSize desired_mem_size = 0; 450 | 451 | std::vector required_extensions; 452 | std::vector desired_extensions; 453 | 454 | uint32_t required_version = VK_MAKE_VERSION (1, 0, 0); 455 | uint32_t desired_version = VK_MAKE_VERSION (1, 0, 0); 456 | 457 | VkPhysicalDeviceFeatures required_features{}; 458 | 459 | bool defer_surface_initialization = false; 460 | bool use_first_gpu_unconditionally = false; 461 | } criteria; 462 | 463 | enum class Suitable { yes, partial, no }; 464 | 465 | Suitable is_device_suitable (PhysicalDeviceDesc phys_device) const; 466 | }; 467 | 468 | // ---- Queue ---- // 469 | enum class QueueType { present, graphics, compute, transfer }; 470 | 471 | // ---- Device ---- // 472 | 473 | struct Device { 474 | VkDevice device = VK_NULL_HANDLE; 475 | PhysicalDevice physical_device; 476 | VkSurfaceKHR surface = VK_NULL_HANDLE; 477 | std::vector queue_families; 478 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 479 | 480 | detail::Result get_queue_index (QueueType type) const; 481 | // Only a compute or transfer queue type is valid. All other queue types do not support a 'dedicated' queue index 482 | detail::Result get_dedicated_queue_index (QueueType type) const; 483 | 484 | detail::Result get_queue (QueueType type) const; 485 | // Only a compute or transfer queue type is valid. All other queue types do not support a 'dedicated' queue 486 | detail::Result get_dedicated_queue (QueueType type) const; 487 | }; 488 | 489 | // For advanced device queue setup 490 | struct CustomQueueDescription { 491 | explicit CustomQueueDescription (uint32_t index, uint32_t count, std::vector priorities); 492 | uint32_t index = 0; 493 | uint32_t count = 0; 494 | std::vector priorities; 495 | }; 496 | 497 | void destroy_device (Device device); 498 | 499 | class DeviceBuilder { 500 | public: 501 | // Any features and extensions that are requested/required in PhysicalDeviceSelector are automatically enabled. 502 | explicit DeviceBuilder (PhysicalDevice physical_device); 503 | 504 | detail::Result build () const; 505 | 506 | // For Advanced Users: specify the exact list of VkDeviceQueueCreateInfo's needed for the application. 507 | // If a custom queue setup is provided, getting the queues and queue indexes is up to the application. 508 | DeviceBuilder& custom_queue_setup (std::vector queue_descriptions); 509 | 510 | // Add a structure to the pNext chain of VkDeviceCreateInfo. 511 | // The structure must be valid when DeviceBuilder::build() is called. 512 | template DeviceBuilder& add_pNext (T* structure) { 513 | info.pNext_chain.push_back (reinterpret_cast (structure)); 514 | return *this; 515 | } 516 | 517 | // Provide custom allocation callbacks. 518 | DeviceBuilder& set_allocation_callbacks (VkAllocationCallbacks* callbacks); 519 | 520 | private: 521 | struct DeviceInfo { 522 | VkDeviceCreateFlags flags = 0; 523 | std::vector pNext_chain; 524 | PhysicalDevice physical_device; 525 | VkSurfaceKHR surface = VK_NULL_HANDLE; 526 | bool defer_surface_initialization = false; 527 | std::vector queue_families; 528 | VkPhysicalDeviceFeatures features{}; 529 | std::vector extensions_to_enable; 530 | std::vector queue_descriptions; 531 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 532 | } info; 533 | }; 534 | 535 | // ---- Swapchain ---- // 536 | struct Swapchain { 537 | VkDevice device = VK_NULL_HANDLE; 538 | VkSwapchainKHR swapchain = VK_NULL_HANDLE; 539 | uint32_t image_count = 0; 540 | VkFormat image_format = VK_FORMAT_UNDEFINED; 541 | VkExtent2D extent = { 0, 0 }; 542 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 543 | 544 | // Returns a vector of VkImage handles to the swapchain. 545 | detail::Result> get_images (); 546 | 547 | // Returns a vector of VkImageView's to the VkImage's of the swapchain. 548 | // VkImageViews must be destroyed. 549 | detail::Result> get_image_views (); 550 | void destroy_image_views (std::vector const& image_views); 551 | }; 552 | 553 | void destroy_swapchain (Swapchain const& swapchain); 554 | 555 | class SwapchainBuilder { 556 | public: 557 | explicit SwapchainBuilder (Device const& device); 558 | explicit SwapchainBuilder (Device const& device, VkSurfaceKHR const surface); 559 | explicit SwapchainBuilder (VkPhysicalDevice const physical_device, VkDevice const device, VkSurfaceKHR const surface, int32_t graphics_queue_index = -1, int32_t present_queue_index = -1); 560 | 561 | detail::Result build () const; 562 | 563 | // Set the oldSwapchain member of VkSwapchainCreateInfoKHR. 564 | // For use in rebuilding a swapchain. 565 | SwapchainBuilder& set_old_swapchain (VkSwapchainKHR old_swapchain); 566 | SwapchainBuilder& set_old_swapchain (Swapchain const& swapchain); 567 | 568 | 569 | // Desired size of the swapchain. By default, the swapchain will use the size 570 | // of the window being drawn to. 571 | SwapchainBuilder& set_desired_extent (uint32_t width, uint32_t height); 572 | 573 | // When determining the surface format, make this the first to be used if supported. 574 | SwapchainBuilder& set_desired_format (VkSurfaceFormatKHR format); 575 | // Add this swapchain format to the end of the list of formats selected from. 576 | SwapchainBuilder& add_fallback_format (VkSurfaceFormatKHR format); 577 | // Use the default swapchain formats. This is done if no formats are provided. 578 | SwapchainBuilder& use_default_format_selection (); 579 | 580 | // When determining the present mode, make this the first to be used if supported. 581 | SwapchainBuilder& set_desired_present_mode (VkPresentModeKHR present_mode); 582 | // Add this present mode to the end of the list of present modes selected from. 583 | SwapchainBuilder& add_fallback_present_mode (VkPresentModeKHR present_mode); 584 | // Use the default presentation mode. This is done if no present modes are provided. 585 | SwapchainBuilder& use_default_present_mode_selection (); 586 | 587 | // Set the bitmask of the image usage for acquired swapchain images. 588 | SwapchainBuilder& set_image_usage_flags (VkImageUsageFlags usage_flags); 589 | // Add a image usage to the bitmask for acquired swapchain images. 590 | SwapchainBuilder& add_image_usage_flags (VkImageUsageFlags usage_flags); 591 | // Use the default image usage bitmask values. This is the default if no image usages 592 | // are provided. The default is VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT 593 | SwapchainBuilder& use_default_image_usage_flags (); 594 | 595 | // Set the number of views in for multiview/stereo surface 596 | SwapchainBuilder& set_image_array_layer_count (uint32_t array_layer_count); 597 | 598 | // Set whether the Vulkan implementation is allowed to discard rendering operations that 599 | // affect regions of the surface that are not visible. Default is true. 600 | // Note: Applications should use the default of true if they do not expect to read back the content 601 | // of presentable images before presenting them or after reacquiring them, and if their fragment 602 | // shaders do not have any side effects that require them to run for all pixels in the presentable image. 603 | SwapchainBuilder& set_clipped (bool clipped = true); 604 | 605 | // Set the VkSwapchainCreateFlagBitsKHR. 606 | SwapchainBuilder& set_create_flags (VkSwapchainCreateFlagBitsKHR create_flags); 607 | // Set the transform to be applied, like a 90 degree rotation. Default is the current transform. 608 | SwapchainBuilder& set_pre_transform_flags (VkSurfaceTransformFlagBitsKHR pre_transform_flags); 609 | // Set the alpha channel to be used with other windows in on the system. Default is VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR. 610 | SwapchainBuilder& set_composite_alpha_flags (VkCompositeAlphaFlagBitsKHR composite_alpha_flags); 611 | 612 | // Add a structure to the pNext chain of VkSwapchainCreateInfoKHR. 613 | // The structure must be valid when SwapchainBuilder::build() is called. 614 | template SwapchainBuilder& add_pNext (T* structure) { 615 | info.pNext_chain.push_back (reinterpret_cast (structure)); 616 | return *this; 617 | } 618 | 619 | // Provide custom allocation callbacks. 620 | SwapchainBuilder& set_allocation_callbacks (VkAllocationCallbacks* callbacks); 621 | 622 | private: 623 | void add_desired_formats (std::vector& formats) const; 624 | void add_desired_present_modes (std::vector& modes) const; 625 | 626 | struct SwapchainInfo { 627 | VkPhysicalDevice physical_device = VK_NULL_HANDLE; 628 | VkDevice device = VK_NULL_HANDLE; 629 | std::vector pNext_chain; 630 | VkSwapchainCreateFlagBitsKHR create_flags = static_cast (0); 631 | VkSurfaceKHR surface = VK_NULL_HANDLE; 632 | std::vector desired_formats; 633 | uint32_t desired_width = 256; 634 | uint32_t desired_height = 256; 635 | uint32_t array_layer_count = 1; 636 | VkImageUsageFlags image_usage_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; 637 | uint32_t graphics_queue_index = 0; 638 | uint32_t present_queue_index = 0; 639 | VkSurfaceTransformFlagBitsKHR pre_transform = static_cast (0); 640 | VkCompositeAlphaFlagBitsKHR composite_alpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; 641 | std::vector desired_present_modes; 642 | bool clipped = true; 643 | VkSwapchainKHR old_swapchain = VK_NULL_HANDLE; 644 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 645 | } info; 646 | }; 647 | 648 | } // namespace vkb 649 | 650 | 651 | namespace std { 652 | template <> struct is_error_code_enum : true_type {}; 653 | template <> struct is_error_code_enum : true_type {}; 654 | template <> struct is_error_code_enum : true_type {}; 655 | template <> struct is_error_code_enum : true_type {}; 656 | template <> struct is_error_code_enum : true_type {}; 657 | } // namespace std -------------------------------------------------------------------------------- /inc/mgpu/vk/context.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/meta.hxx" 3 | #include 4 | #include 5 | #include "vk_mem_alloc.h" 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace vk { 10 | 11 | struct context_t { 12 | context_t(); 13 | ~context_t(); 14 | 15 | // Make it non-copyable. 16 | context_t(const context_t&) = delete; 17 | context_t& operator=(const context_t&) = delete; 18 | 19 | operator VkInstance() const noexcept { return instance; } 20 | operator VkDevice() const noexcept { return device; } 21 | operator VkPhysicalDevice() const noexcept { return physical_device; } 22 | 23 | VkInstance instance; 24 | 25 | VkPhysicalDevice physical_device; 26 | VkDevice device; 27 | 28 | uint32_t queue_index; 29 | VkQueue queue; 30 | 31 | VkCommandPool command_pool; 32 | 33 | VkPipelineCache pipeline_cache; 34 | 35 | VmaAllocator allocator; 36 | 37 | struct buffer_t { 38 | uint32_t size; 39 | uint32_t usage; 40 | VkBuffer buffer; 41 | VmaAllocation allocation; 42 | 43 | bool is_cpu() const noexcept { 44 | return 0x8000'0000 & usage; 45 | } 46 | }; 47 | typedef std::map buffer_map_t; 48 | typedef buffer_map_t::iterator buffer_it_t; 49 | buffer_map_t buffer_map; 50 | void* staging; 51 | 52 | void* alloc_gpu(size_t size, 53 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); 54 | 55 | template 56 | type_t* alloc_gpu(size_t count, 57 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT) { 58 | return (type_t*)alloc_gpu(sizeof(type_t) * count, usage); 59 | } 60 | 61 | void* alloc_cpu(size_t size, 62 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); 63 | 64 | template 65 | type_t* alloc_cpu(size_t count, 66 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT) { 67 | return (type_t*)alloc_cpu(sizeof(type_t) * count, usage); 68 | } 69 | 70 | void free(void* p); 71 | buffer_it_t find_buffer(const void* p); 72 | 73 | // Copy between buffer memory. At least one operand must map to a buffer. 74 | void memcpy(VkCommandBuffer cmd_buffer, void* dest, const void* source, 75 | size_t size); 76 | 77 | std::map modules; 78 | VkShaderModule create_module(const char* data, size_t size); 79 | 80 | struct transform_t { 81 | VkPipelineLayout pipeline_layout; 82 | VkPipeline pipeline; 83 | }; 84 | std::map transforms; 85 | 86 | void dispatch_compute(VkCommandBuffer cmd_buffer, const char* name, 87 | VkShaderModule module, int num_blocks, uint32_t push_size, 88 | const void* push_data); 89 | 90 | void submit(VkCommandBuffer cmd_buffer); 91 | }; 92 | 93 | struct cmd_buffer_t { 94 | cmd_buffer_t(context_t& context); 95 | ~cmd_buffer_t(); 96 | 97 | operator VkCommandBuffer() { return vkCommandBuffer; } 98 | 99 | void reset(); 100 | void begin(); 101 | void end(); 102 | void submit(); 103 | void host_barrier(); 104 | void memcpy(void* dest, const void* source, size_t size); 105 | 106 | context_t& context; 107 | VkCommandBuffer vkCommandBuffer; 108 | }; 109 | 110 | struct memcache_t { 111 | memcache_t(context_t& context); 112 | ~memcache_t(); 113 | 114 | // Request a single allocation. 115 | template 116 | type_t* allocate(size_t count) { 117 | return (type_t*)allocate(sizeof(type_t) * count); 118 | } 119 | 120 | void* allocate(size_t size); 121 | 122 | // Allocate cache-line aligned memories that fit these sizes. 123 | template 124 | std::array allocate(std::array sizes) { 125 | std::array data; 126 | allocate(sizes.data(), count, &data.data()); 127 | return data; 128 | } 129 | void allocate(const size_t* sizes, int count, void** allocations); 130 | 131 | context_t& context; 132 | void* data; 133 | size_t capacity; 134 | }; 135 | 136 | } // namespace vk 137 | 138 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/vk/launch.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if __circle_build__ < 114 4 | #error "Circle build 114 required" 5 | #endif 6 | 7 | #include "context.hxx" 8 | 9 | BEGIN_MGPU_NAMESPACE 10 | 11 | namespace vk { 12 | 13 | // Chevron launch on a SPIR-V compute shader performs ADL lookup to find 14 | // this symbol, and overload resolution to select this overload. 15 | template 16 | static void spirv_chevron_comp(int num_blocks, cmd_buffer_t& cmd_buffer, 17 | params_t... params) { 18 | 19 | static_assert((... && std::is_trivially_copyable_v)); 20 | tuple_t storage { params... }; 21 | 22 | cmd_buffer.context.dispatch_compute( 23 | cmd_buffer, 24 | @spirv(F), 25 | cmd_buffer.context.create_module(__spirv_data, __spirv_size), 26 | num_blocks, 27 | sizeof(storage), 28 | &storage 29 | ); 30 | } 31 | 32 | } // namespace vk 33 | 34 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/vk/merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_merge.hxx" 3 | #include "partition.hxx" // TODO: PORT PARTITION 4 | 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace vk { 10 | 11 | // Key-value merge. 12 | template< 13 | int nt = 128, int vt = 7, // Optional tuning parameters. 14 | typename a_keys_it, typename a_vals_it, 15 | typename b_keys_it, typename b_vals_it, 16 | typename c_keys_it, typename c_vals_it, 17 | typename comp_t 18 | > 19 | void merge( 20 | void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 21 | a_keys_it a_keys, a_vals_it a_vals, int a_count, 22 | b_keys_it b_keys, b_vals_it b_vals, int b_count, 23 | c_keys_it c_keys, c_vals_it c_vals, comp_t comp) { 24 | 25 | typedef typename std::iterator_traits::value_type type_t; 26 | typedef typename std::iterator_traits::value_type val_t; 27 | constexpr int nv = nt * vt; 28 | 29 | int num_partitions = num_merge_partitions(a_count + b_count, nv); 30 | if(!aux_data) { 31 | // Use 1 int per partition. 32 | aux_size += sizeof(int) * num_partitions; 33 | return; 34 | } 35 | 36 | int* partitions = advance_pointer(aux_data, num_partitions); 37 | 38 | merge_path_partitions(cmd_buffer, a_keys, a_count, 39 | b_keys, b_count, partitions, nv, comp); 40 | 41 | int num_blocks = div_up(a_count + b_count, nv); 42 | launch(num_blocks, cmd_buffer, [=](int tid, int block) { 43 | kernel_merge(partitions, a_keys, a_vals, a_count, b_keys, 44 | b_vals, b_count, c_keys, c_vals, comp); 45 | }); 46 | } 47 | 48 | // Key-only merge. 49 | template 52 | void merge(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 53 | a_keys_it a_keys, int a_count, b_keys_it b_keys, int b_count, 54 | c_keys_it c_keys, comp_t comp) { 55 | 56 | merge(aux_data, aux_size, cmd_buffer, a_keys, 57 | (const empty_t*)nullptr, a_count, b_keys, (const empty_t*)nullptr, 58 | b_count, c_keys, (empty_t*)nullptr, comp); 59 | } 60 | 61 | } // namespace vk 62 | 63 | END_MGPU_NAMESPACE 64 | -------------------------------------------------------------------------------- /inc/mgpu/vk/mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_mergesort.hxx" 3 | #include "context.hxx" 4 | #include "transform.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | namespace vk { 9 | 10 | template > 12 | void mergesort_kv( 13 | void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 14 | key_t* keys, val_t* vals, int count, comp_t comp = comp_t()) { 15 | 16 | static_assert(!sort_indices || std::is_same_v); 17 | constexpr bool has_values = !std::is_same_v; 18 | 19 | int num_ctas = div_up(count, nt * vt); 20 | int num_passes = find_log2(num_ctas, true); 21 | 22 | if(0 == num_passes) { 23 | if(!aux_data) return; 24 | 25 | // For a single CTA, sort in place and don't require any cache memory. 26 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 27 | kernel_blocksort(keys, vals, keys, vals, 28 | count, comp); 29 | }); 30 | 31 | } else { 32 | int num_partitions = num_ctas + 1; 33 | 34 | // Allocate temporary storage for the partitions and ping-pong buffers. 35 | if(!aux_data) { 36 | aux_size += sizeof(int) * num_partitions; 37 | aux_size += sizeof(key_t) * count; 38 | if(has_values) 39 | aux_size += sizeof(val_t) * count; 40 | return; 41 | } 42 | 43 | int* mp = advance_pointer(aux_data, num_partitions); 44 | key_t* keys2 = advance_pointer(aux_data, count); 45 | val_t* vals2 = advance_pointer(aux_data, count); 46 | 47 | key_t* keys_blocksort = (1 & num_passes) ? keys2 : keys; 48 | val_t* vals_blocksort = (1 & num_passes) ? vals2 : vals; 49 | 50 | // Blocksort the input. 51 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 52 | kernel_blocksort(keys, vals, keys_blocksort, 53 | vals_blocksort, count, comp); 54 | }); 55 | 56 | if(1 & num_passes) { 57 | std::swap(keys, keys2); 58 | std::swap(vals, vals2); 59 | } 60 | 61 | for(int pass = 0; pass < num_passes; ++pass) { 62 | int coop = 2<< pass; 63 | 64 | // Partition the partially-sorted inputs. 65 | transform(num_partitions, cmd_buffer, [=](int index) { 66 | int spacing = nt * vt; 67 | merge_range_t range = compute_mergesort_range(count, index, coop, 68 | spacing); 69 | int diag = min(spacing * index, count) - range.a_begin; 70 | mp[index] = merge_path(keys + range.a_begin, 71 | range.a_count(), keys + range.b_begin, range.b_count(), diag, comp); 72 | }); 73 | 74 | // Launch the merge pass. 75 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 76 | kernel_mergesort_pass(mp, keys, vals, keys2, vals2, count, 77 | coop, comp); 78 | }); 79 | 80 | std::swap(keys, keys2); 81 | std::swap(vals, vals2); 82 | } 83 | } 84 | } 85 | 86 | template > 88 | void mergesort_keys(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 89 | key_t* keys, int count, comp_t comp = comp_t()) { 90 | 91 | mergesort_kv(aux_data, aux_size, cmd_buffer, keys, 92 | (empty_t*)nullptr, count, comp); 93 | } 94 | 95 | template > 97 | void mergesort_indices(void* aux_data, size_t& aux_size, 98 | cmd_buffer_t& cmd_buffer, key_t* keys, int* indices, int count, 99 | comp_t comp = comp_t()) { 100 | 101 | mergesort_kv(aux_data, aux_size, cmd_buffer, keys, indices, 102 | count, comp); 103 | } 104 | 105 | } // namespace vk 106 | 107 | END_MGPU_NAMESPACE 108 | -------------------------------------------------------------------------------- /inc/mgpu/vk/partition.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_merge.hxx" 3 | #include "transform.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace vk { 8 | 9 | template 11 | void merge_path_partitions(cmd_buffer_t& cmd_buffer, a_keys_it a, 12 | int a_count, b_keys_it b, int b_count, int* mp_data, int spacing, 13 | comp_t comp) { 14 | 15 | int num_partitions = num_merge_partitions(a_count + b_count, spacing); 16 | transform(num_partitions, cmd_buffer, [=](int index) { 17 | int diag = min(spacing * index, a_count + b_count); 18 | mp_data[index] = merge_path(a, a_count, b, b_count, diag, comp); 19 | }); 20 | } 21 | 22 | } // namespace vk 23 | 24 | END_MGPU_NAMESPACE 25 | -------------------------------------------------------------------------------- /inc/mgpu/vk/radix.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_radix.hxx" 3 | #include "scan.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace vk { 8 | 9 | //////////////////////////////////////////////////////////////////////////////// 10 | // Scan counters for all bins. This avoids having to scatter into a single 11 | // ordered array, which becomes inefficient as num_bins becomes large. 12 | 13 | void radix_scan_8(void* aux_data, size_t& aux_size, 14 | cmd_buffer_t& cmd_buffer, uint* counts, int num_frames) { 15 | 16 | enum { nt = 1024, num_bins = 256 }; 17 | int num_ctas = div_up(num_frames, 32); 18 | if(!num_ctas) return; 19 | 20 | if(1 == num_ctas) { 21 | // Require no extra memory for single-CTA case. 22 | if(!aux_data) return; 23 | 24 | // Support up to 32 frames reduction in one CTA. 25 | launch(1, cmd_buffer, [=](int tid, int cta) mutable { 26 | typedef cta_scan_t scan_t; 27 | __shared__ struct { 28 | // Use non-overlapping space for scan and to store counts. 29 | typename scan_t::storage_t scan; 30 | uint counts[nt]; 31 | } shared; 32 | 33 | // The single-cta scan chops up 32 frames into four sections of 8 34 | // registers each. 35 | int section = tid / 256; 36 | int lane = tid & 255; 37 | int frame0 = 8 * section; 38 | 39 | counts += num_bins * frame0 + lane; 40 | num_frames -= frame0; 41 | 42 | uint x[8]; 43 | uint reduction = 0; 44 | @meta for(int i = 0; i < 8; ++i) { 45 | if(i < num_frames) { 46 | x[i] = counts[i * num_bins]; 47 | reduction += x[i]; 48 | } 49 | } 50 | shared.counts[tid] = reduction; 51 | __syncthreads(); 52 | 53 | // Further reduce them across lanes. 54 | uint total = 0; 55 | if(0 == frame0) { 56 | total = reduction; 57 | @meta for(int i = 1; i < nt / num_bins; ++i) 58 | total += shared.counts[tid + i * num_bins]; 59 | } 60 | __syncthreads(); 61 | 62 | // Scan the first 256 array elements. 63 | uint scan = scan_t().scan(total, shared.scan).scan; 64 | 65 | if(tid < num_bins) 66 | counts[tid] = scan; 67 | 68 | // Apply the carry-in to all section reductions. 69 | if(0 == frame0) { 70 | shared.counts[tid] = scan; 71 | scan += reduction; 72 | 73 | @meta for(int i = 1; i < nt / num_bins; ++i) { 74 | reduction = shared.counts[tid + i * num_bins]; 75 | shared.counts[tid + i * num_bins] = scan; 76 | scan += reduction; 77 | } 78 | } 79 | __syncthreads(); 80 | 81 | // Scan and output the cached counts. 82 | scan = shared.counts[tid]; 83 | @meta for(int i = 0; i < 8; ++i) { 84 | if(i < num_frames) { 85 | counts[i * num_bins] = scan; 86 | scan += x[i]; 87 | } 88 | } 89 | }); 90 | 91 | } else { 92 | if(!aux_data) { 93 | // Reserve one 256-item frame for each 32 input frames. 94 | aux_size += sizeof(uint) * num_bins * num_ctas; 95 | radix_scan_8(aux_data, aux_size, cmd_buffer, (uint*)nullptr, num_ctas); 96 | return; 97 | } 98 | 99 | // Write and read to a list of partials. 100 | uint* partials = advance_pointer(aux_data, num_bins * num_ctas); 101 | 102 | // Upsweep to reduce 32 frames into 1 frame. 103 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) mutable { 104 | __shared__ uint shared_counts[nt]; 105 | 106 | int frame0 = tid / 256 + 32 * cta; 107 | int lane = tid & 255; 108 | num_frames -= frame0; 109 | 110 | counts += num_bins * frame0 + lane; 111 | 112 | uint reduction = 0; 113 | @meta for(int i = 0; i < 32; i += 4) { 114 | if(i < num_frames) 115 | reduction += counts[i * num_bins]; 116 | } 117 | shared_counts[tid] = reduction; 118 | __syncthreads(); 119 | 120 | if(tid < num_bins) { 121 | @meta for(int i = 1; i < nt / num_bins; ++i) 122 | reduction += shared_counts[i * num_bins + tid]; 123 | 124 | partials[num_bins * cta + tid] = reduction; 125 | } 126 | }); 127 | 128 | // Recurse on the partials. 129 | radix_scan_8(aux_data, aux_size, cmd_buffer, partials, num_ctas); 130 | 131 | // Downsweep to scan 32 frames with carry-in from the partials. 132 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) mutable { 133 | typedef cta_scan_t scan_t; 134 | __shared__ struct { 135 | // Use non-overlapping space for scan and to store counts. 136 | uint counts[nt]; 137 | typename scan_t::storage_t scan; 138 | } shared; 139 | 140 | // Spread the sections 8 frames across. 141 | int section = tid / 256; 142 | int lane = tid & 255; 143 | int frame0 = 8 * section; 144 | 145 | counts += (32 * cta + frame0) * num_bins + lane; 146 | num_frames -= 32 * cta + frame0; 147 | 148 | uint x[8]; 149 | uint reduction = 0; 150 | @meta for(int i = 0; i < 8; ++i) { 151 | if(i < num_frames) { 152 | x[i] = counts[i * num_bins]; 153 | reduction += x[i]; 154 | } 155 | } 156 | shared.counts[tid] = reduction; 157 | __syncthreads(); 158 | 159 | if(0 == frame0) { 160 | // Add the carry-in from the partials. 161 | uint scan = partials[num_bins * cta + tid]; 162 | 163 | // Apply the carry-in to all section reductions. 164 | shared.counts[tid] = scan; 165 | scan += reduction; 166 | 167 | @meta for(int i = 1; i < nt / num_bins; ++i) { 168 | reduction = shared.counts[i * num_bins + tid]; 169 | shared.counts[i * num_bins + tid] = scan; 170 | scan += reduction; 171 | } 172 | } 173 | __syncthreads(); 174 | 175 | // Scan and output the cached counts. 176 | uint scan = shared.counts[tid]; 177 | @meta for(int i = 0; i < 8; ++i) { 178 | if(i < num_frames) { 179 | counts[i * num_bins] = scan; 180 | scan += x[i]; 181 | } 182 | } 183 | }); 184 | } 185 | } 186 | 187 | 188 | //////////////////////////////////////////////////////////////////////////////// 189 | // Radix sort entry point that accepts 4 bit sort (shared memory histogram) 190 | // or 8 bit sort (ballot with one histogram per warp). The ballot version uses 191 | // much less memory, but the partial reduction scatter is devastating with 192 | // 256 transactions, so a special radix scan is devised to handle both 193 | // implementations. 194 | 195 | template 197 | void radix_sort(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 198 | key_t* data, int count) { 199 | 200 | enum { 201 | nv = nt * vt, 202 | num_bins = 1<< num_bits, 203 | }; 204 | typedef unsigned_int_by_size_t unsigned_type; 205 | 206 | static_assert(4 == num_bits || 8 == num_bits); 207 | 208 | // Require a 32-lane warp for ballot radix sort. 209 | // TODO: Write a 64-lane version. 210 | typedef cta_radix_rank_t< 211 | nt, 212 | num_bits, 213 | 4 == num_bits ? radix_kind_shared : radix_kind_ballot 214 | > radix_t; 215 | const int subgroup_size = 8 == num_bits ? 32 : -1; 216 | 217 | if(!count) return; 218 | int num_ctas = div_up(count, nv); 219 | 220 | 221 | if(1 == num_ctas) { 222 | if(!aux_data) return; 223 | 224 | //////////////////////////////////////////////////////////////////////////// 225 | // Fully radix sort data within a CTA. 226 | 227 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 228 | __shared__ union { 229 | typename radix_t::storage_t radix; 230 | unsigned_type keys[nv]; 231 | } shared; 232 | 233 | int lane = gl_SubgroupInvocationID; 234 | int warp = gl_SubgroupID; 235 | int warp_size = gl_SubgroupSize; 236 | 237 | // Load the data into strided order. 238 | std::array keys; 239 | @meta for(int i = 0; i < vt; ++i) {{ 240 | int offset = nt * i + tid; 241 | if(offset < count) { 242 | // If the key is in range, load it and convert to radix bits. 243 | keys[i] = radix_permute_t::to_radix_bits(data[offset]); 244 | 245 | } else { 246 | // Otherwise set all radix bits so this key is sorted to the end. 247 | keys[i] = -1; 248 | } 249 | }} 250 | 251 | // Move the keys into shared memory. 252 | reg_to_shared_strided(keys, tid, shared.keys); 253 | 254 | @meta for(int bit = 0; bit < 8 * sizeof(num_bits); bit += num_bits) { 255 | // Load the keys from shared memory. 256 | if constexpr(8 == num_bits) 257 | keys = shared_to_reg_warp(shared.keys, lane, warp, warp_size); 258 | else 259 | keys = shared_to_reg_thread(shared.keys, tid); 260 | 261 | // Extract the digits for each key. 262 | std::array digits { 263 | (uint)bitfieldExtract(keys...[:], bit, num_bits) ... 264 | }; 265 | 266 | // Compute the radix rank of each digit. 267 | auto result = radix_t().scatter(digits, shared.radix); 268 | 269 | // Scatter the keys into shared memory. 270 | shared.keys[result.indices...[:]] = keys...[:] ...; 271 | __syncthreads(); 272 | } 273 | 274 | // Write from shared memory to device memory. 275 | @meta for(int i = 0; i < vt; ++i) {{ 276 | int offset = nt * i + tid; 277 | if(offset < count) { 278 | unsigned_type u = shared.keys[offset]; 279 | data[offset] = radix_permute_t::from_radix_bits(u); 280 | } 281 | }} 282 | }); 283 | 284 | } else { 285 | //////////////////////////////////////////////////////////////////////////// 286 | // Make multiple passes to sort the input. 287 | 288 | if(!aux_data) { 289 | // ping-pong keys buffer. 290 | aux_size += sizeof(key_t) * nv * num_ctas; 291 | 292 | // partials reduction buffer. 293 | aux_size += sizeof(uint) * num_bins * num_ctas; 294 | 295 | // scan auxiliary storage. 296 | if constexpr(8 == num_bits) 297 | radix_scan_8(aux_data, aux_size, cmd_buffer, (uint*)nullptr, num_ctas); 298 | else 299 | scan(nullptr, aux_size, cmd_buffer, (uint*)nullptr, num_bins * num_ctas); 300 | 301 | return; 302 | } 303 | 304 | // Allocate a second buffer to ping-pong. 305 | key_t* data2 = advance_pointer(aux_data, nv * num_ctas); 306 | 307 | // Allocate space for each digit count. 308 | uint* partials = advance_pointer(aux_data, num_bins * num_ctas); 309 | 310 | for(int bit = 0; bit < 8 * sizeof(key_t); bit += num_bits) { 311 | 312 | ////////////////////////////////////////////////////////////////////////// 313 | // Upsweep. 314 | 315 | launch(num_ctas, cmd_buffer, 316 | [=](int tid, int cta) mutable { 317 | 318 | __shared__ union { 319 | typename radix_t::storage_t radix; 320 | } shared; 321 | 322 | int cur = nv * cta; 323 | data += cur; 324 | count -= cur; 325 | 326 | // Load the data into strided order. 327 | std::array keys; 328 | @meta for(int i = 0; i < vt; ++i) {{ 329 | int offset = nt * i + tid; 330 | if(offset < count) { 331 | // If the key is in range, load it and convert to radix bits. 332 | keys[i] = radix_permute_t::to_radix_bits(data[offset]); 333 | 334 | } else { 335 | // Otherwise set all radix bits so this key is sorted to the end. 336 | keys[i] = -1; 337 | } 338 | }} 339 | 340 | // The upsweep doesn't care about the order of keys. Keep them in 341 | // strided order. 342 | 343 | // Extract the digits for each key. 344 | std::array digits { 345 | (uint)bitfieldExtract(keys...[:], bit, num_bits) ... 346 | }; 347 | 348 | // Compute the radix rank of each digit. 349 | uint digit_count = radix_t().reduce(digits, shared.radix); 350 | 351 | // Write the reductions to the counter. 352 | if(tid < num_bins) { 353 | int index = 0; 354 | if constexpr(8 == num_bits) { 355 | // Write densely to the output because we use the special radix 356 | // scan. 357 | index = num_bins * cta + tid; 358 | } else { 359 | // Interleave to the output because we use an ordinary scan. 360 | index = num_ctas * tid + cta; 361 | } 362 | 363 | partials[index] = digit_count; 364 | } 365 | }); 366 | 367 | ////////////////////////////////////////////////////////////////////////// 368 | // Scan. 369 | 370 | if constexpr(8 == num_bits) { 371 | radix_scan_8(aux_data, aux_size, cmd_buffer, partials, num_ctas); 372 | 373 | } else { 374 | scan(aux_data, aux_size, cmd_buffer, partials, num_bins * num_ctas); 375 | } 376 | 377 | ////////////////////////////////////////////////////////////////////////// 378 | // Downsweep. 379 | 380 | launch(num_ctas, cmd_buffer, 381 | [=](int tid, int cta) mutable { 382 | 383 | __shared__ union { 384 | typename radix_t::storage_t radix; 385 | unsigned_type keys[nv]; 386 | ivec2 offsets[num_bins]; 387 | } shared; 388 | 389 | int lane = gl_SubgroupInvocationID; 390 | int warp = gl_SubgroupID; 391 | int warp_size = gl_SubgroupSize; 392 | 393 | int cur = nv * cta; 394 | data += cur; 395 | count -= cur; 396 | 397 | // Load the data into strided order. 398 | std::array keys; 399 | @meta for(int i = 0; i < vt; ++i) {{ 400 | int offset = nt * i + tid; 401 | if(offset < count) { 402 | // If the key is in range, load it and convert to radix bits. 403 | keys[i] = radix_permute_t::to_radix_bits(data[offset]); 404 | 405 | } else { 406 | // Otherwise set all radix bits so this key is sorted to the end. 407 | keys[i] = -1; 408 | } 409 | }} 410 | 411 | // Move the keys into shared memory. 412 | reg_to_shared_strided(keys, tid, shared.keys); 413 | 414 | // Load the keys from shared memory. 415 | if constexpr(8 == num_bits) 416 | keys = shared_to_reg_warp(shared.keys, lane, warp, warp_size); 417 | else 418 | keys = shared_to_reg_thread(shared.keys, tid); 419 | 420 | // Extract the digits for each key. 421 | std::array digits { 422 | (uint)bitfieldExtract(keys...[:], bit, num_bits) ... 423 | }; 424 | 425 | // Compute the radix rank of each digit. 426 | auto result = radix_t().scatter(digits, shared.radix); 427 | 428 | // Scatter the keys into shared memory. 429 | shared.keys[result.indices...[:]] = keys...[:] ...; 430 | __syncthreads(); 431 | 432 | // Load keys in strided order. 433 | keys = shared_to_reg_strided(shared.keys, tid); 434 | 435 | // Load the offset for each digit into global output. 436 | if(tid < num_bins) { 437 | int index = 0; 438 | if constexpr(8 == num_bits) 439 | index = num_bins * cta + tid; 440 | else 441 | index = num_ctas * tid + cta; 442 | 443 | shared.offsets[tid] = ivec2( 444 | result.digit_scan, // local digit offset 445 | partials[index] // global digit offset 446 | ); 447 | } 448 | __syncthreads(); 449 | 450 | // Write from shared memory to device memory. 451 | @meta for(int i = 0; i < vt; ++i) {{ 452 | int offset = nt * i + tid; 453 | if(offset < count) { 454 | // Extract the digit. 455 | uint digit = bitfieldExtract(keys[i], bit, num_bits); 456 | 457 | // Look up the first occurrence of this digit within the CTA and 458 | // within the global output for this CTA. The difference is the 459 | // position at which we scatter to device memory. 460 | ivec2 offsets = shared.offsets[digit]; 461 | offset += offsets.y - offsets.x; 462 | data2[offset] = radix_permute_t::from_radix_bits(keys[i]); 463 | } 464 | }} 465 | }); 466 | 467 | std::swap(data, data2); 468 | } 469 | } 470 | } 471 | 472 | //////////////////////////////////////////////////////////////////////////////// 473 | 474 | 475 | 476 | } // namespace vk 477 | 478 | END_MGPU_NAMESPACE 479 | -------------------------------------------------------------------------------- /inc/mgpu/vk/scan.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_scan.hxx" 3 | #include "context.hxx" 4 | #include "transform.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | namespace vk { 9 | 10 | template > 12 | void scan(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 13 | type_t* data, int count, type_t init = type_t(), op_t op = op_t()) { 14 | 15 | enum { nv = nt * vt }; 16 | int num_ctas = div_up(count, nv); 17 | 18 | if(num_ctas <= 8) { 19 | if(!aux_data) { 20 | // Require no aux bytes, so return immediately. 21 | return; 22 | } 23 | 24 | // The small input pass. Perform the scan with a single CTA. 25 | launch(1, cmd_buffer, [=](int tid, int cta) { 26 | typedef cta_scan_t scan_t; 27 | 28 | __shared__ ALIAS_UNION { 29 | typename scan_t::storage_t scan; 30 | type_t values[nv]; 31 | } shared; 32 | 33 | type_t carry_in = type_t(); 34 | for(int cur = 0; cur < count; cur += nv) { 35 | // Load this tile's data. 36 | std::array x = mem_to_reg_thread(data + cur, tid, 37 | count - cur, shared.values); 38 | 39 | // Scan the inputs. 40 | auto result = scan_t().scan(x, shared.scan, carry_in, init, op); 41 | 42 | // Store the scanned values back to global memory. 43 | reg_to_mem_thread(result.scan, tid, count - cur, data + cur, 44 | shared.values); 45 | 46 | carry_in = result.reduction; 47 | } 48 | }); 49 | 50 | } else { 51 | // The recursive kernel. 52 | int num_passes = find_log2(num_ctas, true); 53 | 54 | if(!aux_data) { 55 | // Allocate space for one reduction per tile. 56 | aux_size += sizeof(type_t) * num_ctas; 57 | scan(nullptr, aux_size, cmd_buffer, data, num_ctas, init, op); 58 | return; 59 | } 60 | 61 | // Allocate space for one reduction per tile. 62 | type_t* partials = advance_pointer(aux_data, num_ctas); 63 | 64 | // The upsweep reduces each tile into partials. 65 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 66 | typedef cta_reduce_t reduce_t; 67 | 68 | __shared__ ALIAS_UNION { 69 | typename reduce_t::storage_t reduce; 70 | type_t values[nv]; 71 | } shared; 72 | 73 | int cur = nv * cta; 74 | 75 | // Load this tile's data. 76 | std::array x = mem_to_reg_thread(data + cur, tid, 77 | count - cur, shared.values); 78 | 79 | type_t reduce = reduce_t().reduce(x, shared.reduce); 80 | 81 | // Write to the partials. 82 | if(!tid) 83 | partials[cta] = reduce; 84 | }); 85 | 86 | // Recursively scan the partials. 87 | scan(aux_data, aux_size, cmd_buffer, partials, num_ctas, init, op); 88 | 89 | // The downsweep performs a scan with carry-in. 90 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 91 | typedef cta_scan_t scan_t; 92 | 93 | __shared__ ALIAS_UNION { 94 | typename scan_t::storage_t scan; 95 | type_t values[nv]; 96 | } shared; 97 | 98 | int cur = nv * cta; 99 | 100 | // Load this tile's data. 101 | std::array x = mem_to_reg_thread(data + cur, tid, 102 | count - cur, shared.values); 103 | 104 | // Load the carry-in. 105 | type_t carry_in = partials[cta]; 106 | 107 | // Scan the inputs. 108 | auto result = scan_t().scan(x, shared.scan, carry_in, init, op); 109 | 110 | // Store the scanned values back to global memory. 111 | reg_to_mem_thread(result.scan, tid, count - cur, data + cur, 112 | shared.values); 113 | }); 114 | } 115 | } 116 | 117 | } // namespace vk 118 | 119 | END_MGPU_NAMESPACE 120 | -------------------------------------------------------------------------------- /inc/mgpu/vk/transform.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "launch.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | namespace vk { 7 | 8 | // Launch a grid and pass (tid, cta) 9 | template 10 | [[using spirv: comp, local_size(nt), subgroup_size(subgroup_size), push]] 11 | void launch_cs(func_t func) { 12 | func(threadIdx.x, blockIdx.x); 13 | } 14 | 15 | template 16 | static void launch(int num_blocks, cmd_buffer_t& cmd_buffer, func_t func) { 17 | launch_cs<<>>(func); 18 | } 19 | 20 | // Launch a grid and pass gid. 21 | template 22 | [[using spirv: comp, local_size(nt), subgroup_size(subgroup_size), push]] 23 | void transform_cs(int count, func_t func) { 24 | int gid = glcomp_GlobalInvocationID.x; 25 | 26 | if(gid >= count) 27 | return; 28 | 29 | func(gid); 30 | } 31 | 32 | template 33 | static void transform(int count, cmd_buffer_t& cmd_buffer, func_t func) { 34 | int num_blocks = div_up(count, nt); 35 | transform_cs<<>>(count, func); 36 | } 37 | 38 | } // namespace vk 39 | 40 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /src/vk/context.cxx: -------------------------------------------------------------------------------- 1 | // #define USE_VALIDATION 2 | #include 3 | 4 | #define VMA_IMPLEMENTATION 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | BEGIN_MGPU_NAMESPACE 12 | 13 | namespace vk { 14 | 15 | context_t::context_t() { 16 | // Create the instance. 17 | vkb::InstanceBuilder builder; 18 | auto inst_ret = builder.set_app_name("saxpy") 19 | .require_api_version(1, 2) 20 | #ifdef USE_VALIDATION 21 | .request_validation_layers () 22 | .use_default_debug_messenger () 23 | .add_debug_messenger_severity(VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) 24 | .add_validation_feature_enable(VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT) 25 | .add_validation_feature_disable(VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT) 26 | #endif 27 | .set_headless() 28 | .build (); 29 | if (!inst_ret) { 30 | std::cerr << "Failed to create Vulkan instance. Error: " << inst_ret.error().message() << "\n"; 31 | exit(1); 32 | } 33 | vkb::Instance vkb_inst = inst_ret.value(); 34 | instance = vkb_inst.instance; 35 | 36 | // Create the physical device. 37 | 38 | vkb::PhysicalDeviceSelector selector{ vkb_inst }; 39 | auto phys_ret = selector 40 | .set_minimum_version (1, 2) 41 | .add_required_extension("VK_KHR_buffer_device_address") 42 | .add_required_extension("VK_KHR_shader_non_semantic_info") 43 | .require_dedicated_transfer_queue() 44 | .select(); 45 | if (!phys_ret) { 46 | std::cerr << "Failed to select Vulkan Physical Device. Error: " << phys_ret.error().message() << "\n"; 47 | exit(1); 48 | } 49 | 50 | vkb::PhysicalDevice vkb_phys_device = phys_ret.value(); 51 | physical_device = vkb_phys_device.physical_device; 52 | 53 | // Create the device. 54 | vkb::DeviceBuilder device_builder { vkb_phys_device }; 55 | VkPhysicalDeviceBufferDeviceAddressFeaturesKHR feature1 { 56 | VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR, 57 | nullptr, 58 | true 59 | }; 60 | device_builder.add_pNext(&feature1); 61 | 62 | VkPhysicalDeviceFloat16Int8FeaturesKHR feature2 { 63 | VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, 64 | nullptr, 65 | false, 66 | true 67 | }; 68 | device_builder.add_pNext(&feature2); 69 | 70 | // automatically propagate needed data from instance & physical device 71 | auto dev_ret = device_builder.build(); 72 | if (!dev_ret) { 73 | std::cerr << "Failed to create Vulkan device. Error: " << dev_ret.error().message() << "\n"; 74 | exit(1); 75 | } 76 | 77 | vkb::Device vkb_device = dev_ret.value(); 78 | device = vkb_device.device; 79 | 80 | // Create the compute queue. 81 | // Get the graphics queue with a helper function 82 | auto queue_ret = vkb_device.get_queue(vkb::QueueType::compute); 83 | if (!queue_ret) { 84 | std::cerr << "Failed to get queue. Error: " << queue_ret.error().message() << "\n"; 85 | exit(1); 86 | } 87 | queue = queue_ret.value(); 88 | queue_index = vkb_device.get_queue_index(vkb::QueueType::compute).value(); 89 | 90 | // Create a command pool. 91 | VkCommandPoolCreateInfo cmdPoolInfo { 92 | VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, 93 | nullptr, 94 | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, 95 | queue_index 96 | }; 97 | vkCreateCommandPool(device, &cmdPoolInfo, nullptr, &command_pool); 98 | 99 | // Create the pipeline cache. 100 | VkPipelineCacheCreateInfo pipelineCacheCreateInfo { 101 | VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO 102 | }; 103 | vkCreatePipelineCache(device, &pipelineCacheCreateInfo, nullptr, 104 | &pipeline_cache); 105 | 106 | // Create the allocator. 107 | VmaAllocatorCreateInfo allocatorInfo = {}; 108 | allocatorInfo.vulkanApiVersion = VK_API_VERSION_1_2; 109 | allocatorInfo.physicalDevice = physical_device; 110 | allocatorInfo.device = device; 111 | allocatorInfo.instance = instance; 112 | allocatorInfo.flags = 113 | VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT | 114 | VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT; 115 | 116 | vmaCreateAllocator(&allocatorInfo, &allocator); 117 | 118 | // Allocate a 16MB staging buffer. 119 | staging = alloc_cpu(16<< 20, VK_BUFFER_USAGE_TRANSFER_DST_BIT | 120 | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); 121 | } 122 | 123 | context_t::~context_t() { 124 | // Destroy the staging memory. 125 | free(staging); 126 | 127 | // Destroy the allocator. 128 | assert(!buffer_map.size()); 129 | vmaDestroyAllocator(allocator); 130 | 131 | // Destroy the pipelines. 132 | for(auto it : transforms) { 133 | transform_t& transform = it.second; 134 | vkDestroyPipeline(device, transform.pipeline, nullptr); 135 | vkDestroyPipelineLayout(device, transform.pipeline_layout, nullptr); 136 | } 137 | 138 | // Destroy the shader modules. 139 | for(auto it : modules) 140 | vkDestroyShaderModule(device, it.second, nullptr); 141 | 142 | // Destroy the cache and command pool. 143 | vkDestroyPipelineCache(device, pipeline_cache, nullptr); 144 | vkDestroyCommandPool(device, command_pool, nullptr); 145 | vkDestroyDevice(device, nullptr); 146 | 147 | // Destroy the messenger. 148 | // TODO: 149 | 150 | // Destroy the instance. 151 | vkDestroyInstance(instance, nullptr); 152 | } 153 | 154 | //////////////////////////////////////////////////////////////////////////////// 155 | 156 | void* context_t::alloc_gpu(size_t size, uint32_t usage) { 157 | VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; 158 | bufferInfo.size = std::max(size, 1ul); 159 | bufferInfo.usage = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | usage; 160 | 161 | VmaAllocationCreateInfo allocInfo = {}; 162 | allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; 163 | 164 | VkBuffer buffer; 165 | VmaAllocation allocation; 166 | vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, 167 | &allocation, nullptr); 168 | 169 | VkBufferDeviceAddressInfo addressInfo { 170 | VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, 171 | nullptr, 172 | buffer 173 | }; 174 | VkDeviceAddress address = vkGetBufferDeviceAddress(device, &addressInfo); 175 | void* p = (void*)address; 176 | 177 | buffer_map.insert(std::make_pair(p, buffer_t { size, usage, buffer, allocation })); 178 | return p; 179 | } 180 | 181 | void* context_t::alloc_cpu(size_t size, uint32_t usage) { 182 | VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; 183 | bufferInfo.size = std::max(size, 1ul); 184 | bufferInfo.usage = usage; 185 | 186 | VmaAllocationCreateInfo allocInfo = {}; 187 | allocInfo.usage = VMA_MEMORY_USAGE_CPU_ONLY; 188 | 189 | VkBuffer buffer; 190 | VmaAllocation allocation; 191 | vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, 192 | &allocation, nullptr); 193 | 194 | void* p; 195 | vmaMapMemory(allocator, allocation, &p); 196 | 197 | buffer_map.insert(std::make_pair(p, buffer_t { size, usage | 198 | 0x8000'0000, buffer, allocation })); 199 | return p; 200 | } 201 | void context_t::free(void* p) { 202 | if(!p) return; 203 | 204 | auto it = buffer_map.find(p); 205 | assert(buffer_map.end() != it && p == it->first); 206 | 207 | if(it->second.is_cpu()) 208 | vmaUnmapMemory(allocator, it->second.allocation); 209 | 210 | vmaDestroyBuffer(allocator, it->second.buffer, it->second.allocation); 211 | buffer_map.erase(it); 212 | } 213 | 214 | context_t::buffer_it_t context_t::find_buffer(const void* p) { 215 | buffer_it_t it = buffer_map.lower_bound(p); 216 | if(buffer_map.end() != it) { 217 | // Check the range. 218 | const char* p2 = (const char*)it->first + it->second.size; 219 | if(p >= p2) 220 | it = buffer_map.end(); 221 | } 222 | return it; 223 | } 224 | 225 | void context_t::memcpy(VkCommandBuffer cmd_buffer, void* dest, 226 | const void* source, size_t size) { 227 | 228 | if(!size) return; 229 | 230 | buffer_it_t dest_it = find_buffer(dest); 231 | buffer_it_t source_it = find_buffer(source); 232 | 233 | // For now both sides must be pointers into buffer objects. 234 | assert(buffer_map.end() != dest_it && buffer_map.end() != source_it); 235 | 236 | // Copy between buffers. 237 | VkBufferCopy copyRegion { }; 238 | copyRegion.srcOffset = ((const char*)source - (const char*)source_it->first); 239 | copyRegion.dstOffset = ((const char*)dest - (const char*)dest_it->first); 240 | copyRegion.size = size; 241 | vkCmdCopyBuffer(cmd_buffer, source_it->second.buffer, 242 | dest_it->second.buffer, 1, ©Region); 243 | } 244 | 245 | //////////////////////////////////////////////////////////////////////////////// 246 | 247 | VkShaderModule context_t::create_module(const char* data, size_t size) { 248 | auto it = modules.find(data); 249 | if(modules.end() == it) { 250 | VkShaderModuleCreateInfo createInfo { 251 | VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, 252 | nullptr, 253 | 0, 254 | size, 255 | (const uint32_t*)data 256 | }; 257 | 258 | VkShaderModule module; 259 | vkCreateShaderModule(device, &createInfo, nullptr, &module); 260 | it = modules.insert(std::pair(data, module)).first; 261 | } 262 | 263 | return it->second; 264 | } 265 | 266 | //////////////////////////////////////////////////////////////////////////////// 267 | 268 | void context_t::dispatch_compute(VkCommandBuffer cmd_buffer, const char* name, 269 | VkShaderModule module, int num_blocks, uint32_t push_size, 270 | const void* push_data) { 271 | 272 | auto it = transforms.find(name); 273 | if(transforms.end() == it) { 274 | // Define a pipeline layout that takes only a push constant. 275 | VkPipelineLayoutCreateInfo create_info { 276 | VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO 277 | }; 278 | create_info.pushConstantRangeCount = 1; 279 | 280 | VkPushConstantRange range { VK_SHADER_STAGE_COMPUTE_BIT, 0, push_size }; 281 | create_info.pPushConstantRanges = ⦥ 282 | 283 | VkPipelineLayout pipeline_layout; 284 | vkCreatePipelineLayout(device, &create_info, nullptr, 285 | &pipeline_layout); 286 | 287 | // Create the compute pipeline. 288 | VkComputePipelineCreateInfo computePipelineCreateInfo { 289 | VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 290 | nullptr, 291 | 0, 292 | { 293 | VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 294 | 0, 295 | 0, 296 | VK_SHADER_STAGE_COMPUTE_BIT, 297 | module, 298 | name 299 | }, 300 | pipeline_layout 301 | }; 302 | 303 | VkPipeline pipeline; 304 | vkCreateComputePipelines(device, pipeline_cache, 1, 305 | &computePipelineCreateInfo, nullptr, &pipeline); 306 | 307 | transform_t transform { 308 | pipeline_layout, 309 | pipeline 310 | }; 311 | 312 | it = transforms.insert(std::make_pair(name, transform)).first; 313 | } 314 | 315 | transform_t transform = it->second; 316 | 317 | vkCmdBindPipeline(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, 318 | transform.pipeline); 319 | 320 | vkCmdPushConstants(cmd_buffer, transform.pipeline_layout, 321 | VK_SHADER_STAGE_COMPUTE_BIT, 0, push_size, push_data); 322 | 323 | vkCmdDispatch(cmd_buffer, num_blocks, 1, 1); 324 | 325 | VkMemoryBarrier memoryBarrier = { 326 | VK_STRUCTURE_TYPE_MEMORY_BARRIER, 327 | nullptr, 328 | VK_ACCESS_SHADER_WRITE_BIT, 329 | VK_ACCESS_SHADER_READ_BIT 330 | }; 331 | vkCmdPipelineBarrier(cmd_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 332 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memoryBarrier, 0, nullptr, 0, 333 | nullptr); 334 | } 335 | 336 | void context_t::submit(VkCommandBuffer cmd_buffer) { 337 | // Submit the command buffer. 338 | VkSubmitInfo submitInfo { 339 | VK_STRUCTURE_TYPE_SUBMIT_INFO 340 | }; 341 | submitInfo.commandBufferCount = 1; 342 | submitInfo.pCommandBuffers = &cmd_buffer; 343 | 344 | vkQueueSubmit(queue, 1, &submitInfo, 0); 345 | } 346 | 347 | //////////////////////////////////////////////////////////////////////////////// 348 | 349 | cmd_buffer_t::cmd_buffer_t(context_t& context) : context(context) { 350 | VkCommandBufferAllocateInfo allocInfo { 351 | VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, 352 | nullptr, 353 | context.command_pool, 354 | VK_COMMAND_BUFFER_LEVEL_PRIMARY, 355 | 1 356 | }; 357 | vkAllocateCommandBuffers(context.device, &allocInfo, &vkCommandBuffer); 358 | } 359 | 360 | cmd_buffer_t::~cmd_buffer_t() { 361 | vkFreeCommandBuffers(context.device, context.command_pool, 1, 362 | &vkCommandBuffer); 363 | } 364 | 365 | void cmd_buffer_t::reset() { 366 | vkResetCommandBuffer(vkCommandBuffer, 0); 367 | } 368 | 369 | void cmd_buffer_t::begin() { 370 | VkCommandBufferBeginInfo beginInfo { 371 | VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, 372 | nullptr, 373 | VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT 374 | }; 375 | vkBeginCommandBuffer(vkCommandBuffer, &beginInfo); 376 | } 377 | 378 | void cmd_buffer_t::end() { 379 | vkEndCommandBuffer(vkCommandBuffer); 380 | } 381 | 382 | void cmd_buffer_t::submit() { 383 | context.submit(vkCommandBuffer); 384 | } 385 | 386 | void cmd_buffer_t::host_barrier() { 387 | VkMemoryBarrier memoryBarrier { 388 | VK_STRUCTURE_TYPE_MEMORY_BARRIER, 389 | nullptr, 390 | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT | 391 | VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT, 392 | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT | 393 | VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT 394 | 395 | }; 396 | vkCmdPipelineBarrier(vkCommandBuffer, 397 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, 398 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, 399 | 0, 1, &memoryBarrier, 0, nullptr, 0, nullptr); 400 | } 401 | 402 | void cmd_buffer_t::memcpy(void* dest, const void* source, size_t size) { 403 | context.memcpy(vkCommandBuffer, dest, source, size); 404 | } 405 | 406 | //////////////////////////////////////////////////////////////////////////////// 407 | 408 | 409 | memcache_t::memcache_t(context_t& context) : 410 | context(context), data(nullptr), capacity(0) { } 411 | 412 | memcache_t::~memcache_t() { 413 | context.free(data); 414 | } 415 | 416 | void memcache_t::allocate(const size_t* sizes, int count, void** allocations) { 417 | size_t total = 0; 418 | for(int i = 0; i < count; ++i) { 419 | // Round up to a multiple of 256 bytes. 420 | size_t reserve = ~255 & (sizes[i] + 255); 421 | total += reserve; 422 | } 423 | 424 | // Allocate space in a single buffer. 425 | char* p = allocate(total); 426 | for(int i = 0; i < count; ++i) { 427 | allocations[i] = p; 428 | p += ~255 & (sizes[i] + 255); 429 | } 430 | } 431 | 432 | void* memcache_t::allocate(size_t size) { 433 | if(size > capacity) { 434 | context.free(data); 435 | data = context.alloc_gpu(size); 436 | capacity = size; 437 | } 438 | return data; 439 | } 440 | 441 | 442 | 443 | } // namespace vk 444 | 445 | END_MGPU_NAMESPACE 446 | --------------------------------------------------------------------------------