├── examples ├── gl_merge │ ├── CMakeLists.txt │ └── gl_merge.cxx ├── gl_sort │ ├── CMakeLists.txt │ └── gl_sort.cxx ├── gl_transform │ ├── CMakeLists.txt │ └── gl_transform.cxx ├── thrust_test │ └── thrust_test.cu ├── vk_merge │ ├── CMakeLists.txt │ └── vk_merge.cxx ├── vk_radix │ ├── CMakeLists.txt │ ├── timings.jpg │ └── vk_radix.cxx ├── vk_radix2 │ ├── CMakeLists.txt │ └── vk_radix2.cxx ├── vk_scan │ ├── CMakeLists.txt │ └── vk_scan.cxx ├── vk_sort │ ├── CMakeLists.txt │ └── vk_sort.cxx └── vk_transform │ ├── CMakeLists.txt │ └── vk_transform.cxx ├── inc └── mgpu │ ├── common │ ├── bindings.hxx │ ├── cta_merge.hxx │ ├── cta_mergesort.hxx │ ├── cta_radix.hxx │ ├── cta_scan.hxx │ ├── kernel_merge.hxx │ ├── kernel_mergesort.hxx │ ├── loadstore.hxx │ ├── meta.hxx │ ├── sort_networks.hxx │ ├── subgroup.hxx │ └── types.hxx │ ├── gl │ ├── app.hxx │ ├── buffer.hxx │ ├── merge.hxx │ ├── mergesort.hxx │ ├── partition.hxx │ └── transform.hxx │ └── vk │ ├── VkBootstrap.h │ ├── context.hxx │ ├── launch.hxx │ ├── merge.hxx │ ├── mergesort.hxx │ ├── partition.hxx │ ├── radix.hxx │ ├── scan.hxx │ ├── transform.hxx │ └── vk_mem_alloc.h └── src └── vk ├── VkBootstrap.cpp └── context.cxx /examples/gl_merge/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(gl_merge) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | gl_merge.cxx 8 | ../../inc 9 | ) 10 | 11 | set_source_files_properties(gl_merge.cxx PROPERTIES COMPILE_FLAGS -shader) 12 | 13 | add_executable(gl_merge ${SOURCE_FILES}) 14 | 15 | target_link_libraries(gl_merge 16 | glfw 17 | gl3w 18 | GL 19 | ) 20 | -------------------------------------------------------------------------------- /examples/gl_merge/gl_merge.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | using namespace mgpu::gl; 8 | 9 | template > 10 | std::vector gpu_merge(const std::vector& a, 11 | const std::vector& b, comp_t comp = comp_t()) { 12 | 13 | // Upload keys to OpenGL SSBOs. 14 | gl_buffer_t a_keys(a); 15 | gl_buffer_t b_keys(b); 16 | 17 | // Allocate an SSBO for the result. 18 | gl_buffer_t c_keys(a.size() + b.size()); 19 | 20 | // Merge the keys! 21 | merge_pipeline_t pipeline; 22 | pipeline.launch(a_keys, a.size(), b_keys, b.size(), c_keys, comp); 23 | 24 | // Return the result in host memory. 
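  // (c_keys holds a.size() + b.size() merged keys; get_data() reads the
  // SSBO contents back into a host std::vector. Note that both inputs
  // must already be sorted by comp for the merged output to be ordered.)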
25 | return c_keys.get_data(); 26 | } 27 | 28 | int main() { 29 | app_t app("merge demo"); 30 | 31 | int a_count = 10000; 32 | int b_count = 10000; 33 | std::vector a(a_count), b(b_count); 34 | a[:] = rand() % 100000...; std::sort(a.begin(), a.end()); 35 | b[:] = rand() % 100000...; std::sort(b.begin(), b.end()); 36 | 37 | std::vector c = gpu_merge<128, 7>(a, b); 38 | printf("%d: %f\n", @range(), c[:])...; 39 | } -------------------------------------------------------------------------------- /examples/gl_sort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(gl_sort) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | gl_sort.cxx 8 | ) 9 | 10 | set_source_files_properties(gl_sort.cxx PROPERTIES COMPILE_FLAGS -shader) 11 | 12 | add_executable(gl_sort ${SOURCE_FILES}) 13 | 14 | target_link_libraries(gl_sort 15 | glfw 16 | gl3w 17 | GL 18 | ) 19 | -------------------------------------------------------------------------------- /examples/gl_sort/gl_sort.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using namespace mgpu::gl; 6 | 7 | // key-index sort. 8 | // sort keys in place and fill with gather indices. 9 | 10 | template 11 | void gpu_sort(std::vector& data) { 12 | gl_buffer_t a(data); 13 | gl_buffer_t b(data.size()); 14 | 15 | mergesort_pipeline_t pipeline; 16 | pipeline.template sort_keys_indices(a, b, data.size()); 17 | 18 | std::vector indices = b.get_data(); 19 | std::vector gathered = [data[indices[:]]...]; 20 | 21 | a.get_data(data.data()); 22 | 23 | bool is_inverse = gathered == data; 24 | printf("is_inverse = %d\n", is_inverse); 25 | 26 | } 27 | 28 | int main() { 29 | app_t app("sort demo"); 30 | 31 | const int nt = 128; 32 | const int vt = 7; 33 | int count = 10000; 34 | std::vector data(count); 35 | for(int i = 0; i < count; ++i) 36 | data[i] = rand() % 10000; 37 | 38 | gpu_sort(data); 39 | 40 | 41 | bool is_sorted = (... && (data[:] <= data[1:])); 42 | printf("IS SORTED = %d\n", is_sorted); 43 | } -------------------------------------------------------------------------------- /examples/gl_transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(gl_transform) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | gl_transform.cxx 8 | ) 9 | 10 | set_source_files_properties(gl_transform.cxx PROPERTIES COMPILE_FLAGS -shader) 11 | 12 | add_executable(gl_transform ${SOURCE_FILES}) 13 | 14 | target_link_libraries(gl_transform 15 | glfw 16 | gl3w 17 | GL 18 | ) 19 | -------------------------------------------------------------------------------- /examples/gl_transform/gl_transform.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace mgpu::gl; 7 | 8 | int main() { 9 | // Initialize OpenGL and create an offscreen window. 10 | app_t app("lambda test"); 11 | 12 | // Allocate storage for 10000 floats. 13 | int count = 10000; 14 | gl_buffer_t data(count); 15 | 16 | // Bind to binding=0. Return a buffer_iterator_t that samples 17 | // shader_buffer<0, float[]>. 18 | auto p = data.bind_ssbo<0>(); 19 | 20 | // Launch a compute shader from a lambda. 
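  // gl_transform launches a compute shader that runs this lambda once for
  // every index in [0, count). p was created by bind_ssbo above, so
  // p[index] = ... writes directly into the SSBO bound at binding 0.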
21 | gl_transform([=](int index) { 22 | p[index] = sqrt((float)index); 23 | }, count); 24 | 25 | std::vector data2 = data.get_data(); 26 | printf("%5d: %f\n", @range(), data2[:])...; 27 | } -------------------------------------------------------------------------------- /examples/thrust_test/thrust_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | int main(void) { 11 | typedef uint type_t; 12 | int max_count = 50'000'000; 13 | 14 | thrust::host_vector host(max_count); 15 | for(int i = 0; i < max_count; ++i) 16 | host[i] = rand() + 2 * rand(); // fill all 32 bits. 17 | 18 | // Copy in host data. 19 | thrust::device_vector gpu = host; 20 | 21 | int sizes[] { 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 25, 30, 35, 40, 45, 50 }; 22 | 23 | for(int size : sizes) { 24 | // Sort 5 billion keys at least. 25 | int count = 1'000'000 * size; 26 | int num_iterations = (int)ceil(5.0e9 / count); 27 | 28 | cudaDeviceSynchronize(); 29 | timespec start; 30 | clock_gettime(CLOCK_REALTIME, &start); 31 | 32 | for(int i = 0; i < num_iterations; ++i) 33 | thrust::sort(gpu.begin(), gpu.begin() + count); 34 | 35 | cudaDeviceSynchronize(); 36 | 37 | timespec end; 38 | clock_gettime(CLOCK_REALTIME, &end); 39 | 40 | double elapsed = (end.tv_sec - start.tv_sec) + 41 | (end.tv_nsec - start.tv_nsec) * 1.0e-9; 42 | 43 | double rate = (double)count * num_iterations / elapsed / 1.0e6; 44 | 45 | printf("%9d: %20.5f time=%f, iterations=%d\n", count, rate, elapsed, 46 | num_iterations); 47 | } 48 | 49 | return 0; 50 | } -------------------------------------------------------------------------------- /examples/vk_merge/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_merge) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_merge.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_merge.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_merge ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_merge 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_merge/vk_merge.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include 4 | #include 5 | 6 | using namespace mgpu::vk; 7 | 8 | int main() { 9 | context_t context; 10 | 11 | int a_count = 10000; 12 | int b_count = 10000; 13 | int count = a_count + b_count; 14 | 15 | float* a_host = context.alloc_cpu(a_count); 16 | float* b_host = context.alloc_cpu(b_count); 17 | float* c_host = context.alloc_cpu(a_count + b_count); 18 | 19 | // Generate test data into the staging buffers. 20 | for(int i = 0; i < a_count; ++i) 21 | a_host[i] = rand() % 100000; 22 | for(int i = 0; i < b_count; ++i) 23 | b_host[i] = rand() % 100000; 24 | 25 | // Sort both input sequences. 26 | std::sort(a_host, a_host + a_count); 27 | std::sort(b_host, b_host + b_count); 28 | 29 | float* a_gpu = context.alloc_gpu(a_count); 30 | float* b_gpu = context.alloc_gpu(b_count); 31 | float* c_gpu = context.alloc_gpu(count); 32 | 33 | // Create a command buffer. 34 | cmd_buffer_t cmd_buffer(context); 35 | cmd_buffer.begin(); 36 | 37 | // Upload to GPU memory. 
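  // These copies are only *recorded* here; nothing executes until the
  // command buffer is submitted below. host_barrier() presumably inserts
  // a barrier so the staging-to-GPU copies complete before the merge
  // kernels read a_gpu and b_gpu.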
38 | cmd_buffer.memcpy(a_gpu, a_host, sizeof(float) * a_count); 39 | cmd_buffer.memcpy(b_gpu, b_host, sizeof(float) * b_count); 40 | cmd_buffer.host_barrier(); 41 | 42 | // Execute the parallel merge. 43 | void* aux_data = nullptr; 44 | size_t aux_size = 0; 45 | merge(aux_data, aux_size, cmd_buffer, a_gpu, a_count, b_gpu, b_count, c_gpu, 46 | std::less()); 47 | aux_data = context.alloc_gpu(aux_size); 48 | 49 | merge(aux_data, aux_size, cmd_buffer, a_gpu, a_count, b_gpu, b_count, c_gpu, 50 | std::less()); 51 | 52 | // Retrieve the results. 53 | cmd_buffer.memcpy(c_host, c_gpu, sizeof(float) * count); 54 | cmd_buffer.host_barrier(); 55 | 56 | // End and submite the command buffer. 57 | cmd_buffer.end(); 58 | context.submit(cmd_buffer); 59 | 60 | vkQueueWaitIdle(context.queue); 61 | 62 | // Print our results. 63 | for(int i = 0; i < count; ++i) 64 | printf("%5d: %f\n", i, c_host[i]); 65 | 66 | context.free(aux_data); 67 | context.free(a_host); 68 | context.free(b_host); 69 | context.free(c_host); 70 | context.free(a_gpu); 71 | context.free(b_gpu); 72 | context.free(c_gpu); 73 | } -------------------------------------------------------------------------------- /examples/vk_radix/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_radix) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_radix.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_radix.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_radix ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_radix 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_radix/timings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seanbaxter/mgpu-shaders/ba92f0bd7db5c5a4e728823d7dc37cf07450eb5f/examples/vk_radix/timings.jpg -------------------------------------------------------------------------------- /examples/vk_radix/vk_radix.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include "mgpu/vk/radix.hxx" 4 | #include 5 | #include 6 | 7 | using namespace mgpu; 8 | using namespace mgpu::vk; 9 | 10 | int main() { 11 | context_t context; 12 | 13 | typedef uint type_t; 14 | int max_count = 50'000'000; 15 | 16 | type_t* host = context.alloc_cpu(max_count); 17 | for(int i = 0; i < max_count; ++i) 18 | host[i] = rand() + 2 * rand(); // fill all 32 bits. 19 | 20 | type_t* gpu = context.alloc_gpu(max_count); 21 | 22 | // Create a command buffer. 23 | cmd_buffer_t cmd_buffer(context); 24 | cmd_buffer.begin(); 25 | 26 | // Copy in host data. 27 | cmd_buffer.memcpy(gpu, host, sizeof(type_t) * max_count); 28 | 29 | cmd_buffer.end(); 30 | context.submit(cmd_buffer); 31 | 32 | // Allocate auxiliary storage. 33 | void* aux_data; 34 | size_t aux_size = 0; 35 | radix_sort<128, 4, 4>(aux_data, aux_size, cmd_buffer, gpu, max_count); 36 | aux_data = context.alloc_gpu(aux_size); 37 | 38 | vkQueueWaitIdle(context.queue); 39 | 40 | int sizes[] { 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 25, 30, 35, 40, 45, 50 }; 41 | 42 | enum { nt = 256, vt = 16 }; 43 | for(int size : sizes) { 44 | // Sort 5 billion keys at least. 
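    // num_iterations = ceil(5e9 / count), so every size sorts at least
    // five billion keys in total (e.g. size 10 -> count 10,000,000 ->
    // 500 iterations). The rate printed below is in millions of keys
    // per second.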
45 | int count = 1'000'000 * size; 46 | int num_iterations = (int)ceil(5.0e9 / count); 47 | 48 | cmd_buffer.reset(); 49 | cmd_buffer.begin(); 50 | 51 | timespec start; 52 | clock_gettime(CLOCK_REALTIME, &start); 53 | 54 | cmd_buffer.begin(); 55 | for(int i = 0; i < num_iterations; ++i) 56 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 57 | cmd_buffer.end(); 58 | 59 | context.submit(cmd_buffer); 60 | vkQueueWaitIdle(context.queue); 61 | 62 | timespec end; 63 | clock_gettime(CLOCK_REALTIME, &end); 64 | 65 | double elapsed = (end.tv_sec - start.tv_sec) + 66 | (end.tv_nsec - start.tv_nsec) * 1.0e-9; 67 | 68 | double rate = (double)count * num_iterations / elapsed / 1.0e6; 69 | 70 | printf("%9d: %20.5f time=%f, iterations=%d\n", count, rate, elapsed, 71 | num_iterations); 72 | } 73 | 74 | context.free(aux_data); 75 | context.free(gpu); 76 | context.free(host); 77 | } 78 | 79 | -------------------------------------------------------------------------------- /examples/vk_radix2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_radix2) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_radix2.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_radix2.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_radix2 ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_radix2 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_radix2/vk_radix2.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include "mgpu/vk/radix.hxx" 4 | #include 5 | #include 6 | 7 | using namespace mgpu; 8 | using namespace mgpu::vk; 9 | 10 | int main() { 11 | context_t context; 12 | 13 | // Allocate test data storage. 14 | enum { nt = 256, num_bits = 8, num_bins = 1<< num_bits, vt = 1, nv = nt * vt }; 15 | 16 | typedef uint type_t; 17 | int count = nv * 32 * 32 * 32; 18 | int num_ctas = div_up(count, nv); 19 | std::vector ref(count); 20 | 21 | type_t* host = context.alloc_cpu(count); 22 | type_t* gpu = context.alloc_gpu(count); 23 | 24 | // Generate test data. 25 | for(int i = 0; i < count; ++i) { 26 | ref[i] = host[i] = rand(); 27 | } 28 | 29 | //for(int i = 0; i < count; i += nv) { 30 | // std::sort(ref.begin() + i, ref.begin() + std::min(count, i + nv)); 31 | //} 32 | std::sort(ref.begin(), ref.end()); 33 | 34 | // Create a command buffer. 35 | cmd_buffer_t cmd_buffer(context); 36 | cmd_buffer.begin(); 37 | 38 | // Upload test data to GPU memory. 39 | cmd_buffer.memcpy(gpu, host, sizeof(type_t) * count); 40 | cmd_buffer.host_barrier(); 41 | 42 | void* aux_data = nullptr; 43 | size_t aux_size = 0; 44 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 45 | aux_data = context.alloc_gpu(aux_size); 46 | 47 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 48 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 49 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 50 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 51 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 52 | radix_sort(aux_data, aux_size, cmd_buffer, gpu, count); 53 | 54 | // Retrieve the results. 55 | cmd_buffer.memcpy(host, gpu, sizeof(type_t) * count); 56 | cmd_buffer.host_barrier(); 57 | 58 | // End and submit the command buffer. 
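  // Everything above -- the upload, the first radix_sort call (with
  // aux_data == nullptr it only sizes the auxiliary buffer), the real
  // sorts, and the readback -- was recorded into one command buffer and
  // runs in order on the single submit below.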
59 | cmd_buffer.end(); 60 | context.submit(cmd_buffer); 61 | 62 | // And wait for it to be done. 63 | vkQueueWaitIdle(context.queue); 64 | 65 | for(int i = 0; i < count; ++i) { 66 | // printf("%6d: %9d\n", i, host[i]); // - %3d - %3d\n", i, host[i], ref[i], scans[i / 32][i % 32]); 67 | // printf("%3d: %5d \n", i, host[i]); 68 | 69 | if(host[i] != ref[i]) { 70 | printf("Error at %d: %d vs %d\n", i, host[i], ref[i]); 71 | exit(1); 72 | } 73 | } 74 | 75 | printf("MATCH\n"); 76 | 77 | 78 | context.free(aux_data); 79 | context.free(host); 80 | context.free(gpu); 81 | 82 | } 83 | 84 | -------------------------------------------------------------------------------- /examples/vk_scan/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_scan) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_scan.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_scan.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_scan ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_scan 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_scan/vk_scan.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace mgpu; 11 | using namespace mgpu::vk; 12 | 13 | int main() { 14 | context_t context; 15 | 16 | // Allocate test data storage. 17 | int count = 10000; 18 | int* host = context.alloc_cpu(count); 19 | int* gpu = context.alloc_gpu(count); 20 | 21 | // Generate test data. 22 | for(int i = 0; i < count; ++i) 23 | host[i] = i; 24 | 25 | // Create a command buffer. 26 | cmd_buffer_t cmd_buffer(context); 27 | cmd_buffer.begin(); 28 | 29 | // Upload test data to GPU memory. 30 | cmd_buffer.memcpy(gpu, host, sizeof(int) * count); 31 | cmd_buffer.host_barrier(); 32 | 33 | // Execute the scan. 34 | void* aux_data = nullptr; 35 | size_t aux_size = 0; 36 | vk::scan(aux_data, aux_size, cmd_buffer, gpu, count); 37 | printf("aux size = %ld\n", aux_size); 38 | aux_data = context.alloc_gpu(aux_size); 39 | 40 | vk::scan(aux_data, aux_size, cmd_buffer, gpu, count); 41 | 42 | // Retrieve the results. 43 | cmd_buffer.memcpy(host, gpu, sizeof(int) * count); 44 | cmd_buffer.host_barrier(); 45 | 46 | // End and submit the command buffer. 47 | cmd_buffer.end(); 48 | context.submit(cmd_buffer); 49 | 50 | // And wait for it to be done. 
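  // (vkQueueWaitIdle blocks until the scan and readback finish, so host[]
  // is safe to read below. With inputs host[i] = i and, assuming the
  // library's default of an exclusive scan, the expected printout is
  // host[i] = 0 + 1 + ... + (i - 1) = i * (i - 1) / 2.)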
51 | vkQueueWaitIdle(context.queue); 52 | 53 | for(int i = 0; i < count; ++i) 54 | printf("%3d: %2d\n", i, host[i]); 55 | 56 | context.free(aux_data); 57 | context.free(host); 58 | context.free(gpu); 59 | 60 | return 0; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /examples/vk_sort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_sort) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_sort.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | 11 | ../../inc 12 | ) 13 | 14 | set_source_files_properties(vk_sort.cxx PROPERTIES COMPILE_FLAGS -shader) 15 | 16 | add_executable(vk_sort ${SOURCE_FILES}) 17 | 18 | target_link_libraries(vk_sort 19 | glfw 20 | vulkan 21 | dl 22 | pthread 23 | ) 24 | -------------------------------------------------------------------------------- /examples/vk_sort/vk_sort.cxx: -------------------------------------------------------------------------------- 1 | #define ALIASED_SMEM 2 | 3 | #include 4 | #include 5 | 6 | using namespace mgpu::vk; 7 | 8 | int main() { 9 | context_t context; 10 | 11 | int count = 10000; 12 | float* host = context.alloc_cpu(count); 13 | float* gpu = context.alloc_gpu(count); 14 | 15 | // Generate test data into the staging buffers. 16 | for(int i = 0; i < count; ++i) 17 | host[i] = rand() % 100000; 18 | 19 | // Create a command buffer. 20 | cmd_buffer_t cmd_buffer(context); 21 | cmd_buffer.begin(); 22 | 23 | // Upload to GPU memory. 24 | cmd_buffer.memcpy(gpu, host, sizeof(float) * count); 25 | cmd_buffer.host_barrier(); 26 | 27 | // Execute the parallel mergesort. 28 | void* aux_data = nullptr; 29 | size_t aux_size = 0; 30 | mergesort_keys(aux_data, aux_size, cmd_buffer, gpu, count); 31 | aux_data = context.alloc_gpu(aux_size); 32 | 33 | mergesort_keys(aux_data, aux_size, cmd_buffer, gpu, count); 34 | 35 | // Retrieve the results. 36 | cmd_buffer.memcpy(host, gpu, sizeof(float) * count); 37 | cmd_buffer.host_barrier(); 38 | 39 | // End and submit the command buffer. 40 | cmd_buffer.end(); 41 | context.submit(cmd_buffer); 42 | 43 | vkQueueWaitIdle(context.queue); 44 | 45 | // Print our results. 
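  // A quick host-side sanity check instead of eyeballing the printout
  // (illustrative only):
  //   bool ok = std::is_sorted(host, host + count);
  //   printf("sorted = %d\n", ok);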
46 | for(int i = 0; i < count; ++i) 47 | printf("%5d: %f\n", i, host[i]); 48 | 49 | context.free(aux_data); 50 | context.free(host); 51 | context.free(gpu); 52 | } -------------------------------------------------------------------------------- /examples/vk_transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(vk_transform) 3 | 4 | include_directories(../../inc) 5 | 6 | set(SOURCE_FILES 7 | vk_transform.cxx 8 | ../../src/vk/context.cxx 9 | ../../src/vk/VkBootstrap.cpp 10 | ) 11 | 12 | set_source_files_properties(vk_transform.cxx PROPERTIES COMPILE_FLAGS -shader) 13 | 14 | add_executable(vk_transform ${SOURCE_FILES}) 15 | 16 | target_link_libraries(vk_transform 17 | glfw 18 | vulkan 19 | dl 20 | pthread 21 | ) 22 | -------------------------------------------------------------------------------- /examples/vk_transform/vk_transform.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace mgpu::vk; 5 | 6 | template 7 | [[using spirv: comp, local_size(NT), push]] 8 | void compute_shader(int count, type_t a, type_t* x, type_t* y) { 9 | int gid = glcomp_GlobalInvocationID.x; 10 | if(gid < count) { 11 | x[gid] = 2 * gid; // Even values. 12 | y[gid] = 2 * gid + 1; // Odd values. 13 | } 14 | } 15 | 16 | int main() { 17 | context_t context; 18 | 19 | // Allocate test data storage. 20 | int count = 100; 21 | float a = 1.618f; // A saxpy coefficient. 22 | float* x = context.alloc_gpu(count); 23 | float* y = context.alloc_gpu(count); 24 | 25 | // Create a command buffer. 26 | cmd_buffer_t cmd_buffer(context); 27 | cmd_buffer.begin(); 28 | 29 | // Three ways to launch kernels with mgpu-shaders for Vulkan: 30 | 31 | // 1. Use chevron launch syntax. This calls spirv_chevron_comp and 32 | // passes the shader reference as the template argument. In mgpu's 33 | // implementation, num_blocks and cmd_buffer are the chevron arguments. 34 | const int NT = 64; // Use 64 threads per block. 35 | int num_blocks = mgpu::div_up(count, NT); 36 | compute_shader<<>>(count, M_PIf32, x, y); 37 | 38 | // 2. Use launch(). This is like a chevron launch, but you don't even have 39 | // to write a shader. Pass it a function object or lambda. The lambda 40 | // gets called back with the glcomp_LocalInvocation.x and 41 | // glcomp_WorkGroupID.x values. As with the chevron launch, specify the 42 | // grid size. You must also specify the workgroup size as a template 43 | // argument, since you aren't defining a compute on which to attach 44 | // local_size. 45 | launch(num_blocks, cmd_buffer, [=](int tid, int cta) { 46 | // tid and cta are the thread and workgroup IDs. 47 | // combine them for a global ID or read glcomp_GlobalInvocationID.x. 48 | int gid = tid + NT * cta; 49 | 50 | // Use the default-copy closure to capture the kernel parameters. 51 | if(gid < count) { 52 | // SAXPY these terms. 53 | y[gid] += a * x[gid]; 54 | } 55 | }); 56 | 57 | // 3. Use transform(). This is for embarrassingly parallel tasks. It 58 | // executes the function object once for each request. You can pass it 59 | // the group size as an optional template argument, or use an 60 | // implementation-defined group size. 61 | transform(count, cmd_buffer, [=](int index) { 62 | x[index] *= sqrt(y[index]); 63 | }); 64 | 65 | // Copy the data to host memory. 
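  // Once the submitted work completes, x[i] should equal
  // 2*i * sqrt((2*i + 1) + a * 2*i): compute_shader wrote the even/odd
  // pair (it never reads its a parameter, so passing M_PIf32 there is
  // harmless), launch() added a * x into y, and transform() multiplied
  // x by sqrt(y).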
66 | float* host = context.alloc_cpu(count); 67 | 68 | cmd_buffer.host_barrier(); 69 | context.memcpy(cmd_buffer, host, x, sizeof(float) * count); 70 | cmd_buffer.host_barrier(); 71 | 72 | // End and submit the command buffer. 73 | cmd_buffer.end(); 74 | context.submit(cmd_buffer); 75 | 76 | // And wait for it to be done. 77 | vkQueueWaitIdle(context.queue); 78 | 79 | // Print our results. 80 | for(int i = 0; i < count; ++i) 81 | printf("%3d: %f\n", i, host[i]); 82 | 83 | context.free(x); 84 | context.free(y); 85 | context.free(host); 86 | 87 | return 0; 88 | } 89 | 90 | -------------------------------------------------------------------------------- /inc/mgpu/common/bindings.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "meta.hxx" 3 | #include 4 | #include 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | template 9 | [[using spirv: in, location((int)index)]] 10 | type_t shader_in; 11 | 12 | template 13 | [[using spirv: out, location((int)index)]] 14 | type_t shader_out; 15 | 16 | template 17 | [[using spirv: uniform, binding((int)index)]] 18 | type_t shader_uniform; 19 | 20 | template 21 | [[using spirv: buffer, readonly, binding(index)]] 22 | type_t shader_readonly; 23 | 24 | template 25 | [[using spirv: buffer, writeonly, binding(index)]] 26 | type_t shader_writeonly; 27 | 28 | template 29 | [[using spirv: buffer, binding(index)]] 30 | type_t shader_buffer; 31 | 32 | //////////////////////////////////////////////////////////////////////////////// 33 | 34 | // Provide an a common iterator type. 35 | template 36 | struct iterator_t : std::iterator_traits*> { 37 | 38 | iterator_t() = default; 39 | explicit iterator_t(int offset) : offset(offset) { } 40 | 41 | iterator_t(const iterator_t&) = default; 42 | iterator_t& operator=(const iterator_t&) = default; 43 | 44 | iterator_t operator+(int diff) const noexcept { 45 | return iterator_t(offset + diff); 46 | } 47 | iterator_t& operator+=(int diff) noexcept { 48 | offset += diff; 49 | return *this; 50 | } 51 | friend iterator_t operator+(int diff, iterator_t rhs) noexcept { 52 | return iterator_t(diff + rhs.offset); 53 | } 54 | 55 | iterator_t operator-(int diff) const noexcept { 56 | return iterator_t(offset - diff); 57 | } 58 | iterator_t& operator-=(int diff) noexcept { 59 | offset -= diff; 60 | return *this; 61 | } 62 | 63 | int operator-(iterator_t rhs) const noexcept { 64 | return offset - rhs.offset; 65 | } 66 | 67 | decltype(auto) operator*() const noexcept { 68 | return accessor_t::access(offset); 69 | } 70 | 71 | decltype(auto) operator[](int index) const noexcept { 72 | return accessor_t::access(offset + index); 73 | } 74 | 75 | int offset = 0; 76 | }; 77 | 78 | template 79 | struct readonly_access_t { 80 | static type_t access(int index) noexcept { 81 | return shader_readonly[index]; 82 | } 83 | }; 84 | 85 | template 86 | using readonly_iterator_t = iterator_t >; 87 | 88 | template 89 | struct writeonly_access_t { 90 | static type_t& access(int index) noexcept { 91 | return shader_writeonly[index]; 92 | } 93 | }; 94 | template 95 | using writeonly_iterator_t = iterator_t >; 96 | 97 | template 98 | struct buffer_access_t { 99 | static type_t& access(int index) noexcept { 100 | return shader_buffer[index]; 101 | } 102 | }; 103 | template 104 | using buffer_iterator_t = iterator_t >; 105 | 106 | struct empty_iterator_t : std::iterator_traits { 107 | // Don't provide additional interface. The caller should check the 108 | // iterator_traits prior to subscripting. 
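  // (Presumably this stands in for the values iterator in key-only
  // algorithms; the kernels detect it through is_same-style checks on
  // the value_type and skip the value path entirely.)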
109 | }; 110 | 111 | //////////////////////////////////////////////////////////////////////////////// 112 | 113 | END_MGPU_NAMESPACE 114 | 115 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "loadstore.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | constexpr int num_merge_partitions(int count, int spacing) { 7 | return div_up(count, spacing) + 1; 8 | } 9 | 10 | template 12 | int merge_path(a_keys_it a_keys, int a_count, b_keys_it b_keys, 13 | int b_count, int diag, comp_t comp) { 14 | 15 | typedef typename std::iterator_traits::value_type type_t; 16 | int begin = max(0, diag - b_count); 17 | int end = min(diag, a_count); 18 | 19 | while(begin < end) { 20 | int mid = (begin + end) / 2; 21 | type_t a_key = a_keys[mid]; 22 | type_t b_key = b_keys[diag - 1 - mid]; 23 | bool pred = (bounds_upper == bounds) ? 24 | comp(a_key, b_key) : 25 | !comp(b_key, a_key); 26 | 27 | if(pred) begin = mid + 1; 28 | else end = mid; 29 | } 30 | return begin; 31 | } 32 | 33 | template 34 | int merge_path(keys_it keys, merge_range_t range, int diag, comp_t comp) { 35 | return merge_path( 36 | keys + range.a_begin, range.a_count(), 37 | keys + range.b_begin, range.b_count(), 38 | diag, comp); 39 | } 40 | 41 | template 42 | bool merge_predicate(type_t a_key, type_t b_key, merge_range_t range, 43 | comp_t comp) { 44 | 45 | bool p; 46 | if(range_check && !range.a_valid()) p = false; 47 | else if(range_check && !range.b_valid()) p = true; 48 | else p = (bounds_upper == bounds) ? comp(a_key, b_key) : !comp(b_key, a_key); 49 | return p; 50 | } 51 | 52 | merge_range_t compute_merge_range(int a_count, int b_count, int partition, 53 | int spacing, int mp0, int mp1) { 54 | 55 | int diag0 = spacing * partition; 56 | int diag1 = min(a_count + b_count, diag0 + spacing); 57 | 58 | return merge_range_t { mp0, mp1, diag0 - mp0, diag1 - mp1 }; 59 | } 60 | 61 | template 62 | std::array load_two_streams_reg(a_it a, int a_count, b_it b, 63 | int b_count, int tid) { 64 | 65 | std::array x; 66 | strided_iterate([&](int i, int index) { 67 | x[i] = (index < a_count) ? a[index] : b[index - a_count]; 68 | }, tid, a_count + b_count); 69 | 70 | return x; 71 | } 72 | 73 | template 75 | void load_two_streams_shared(a_it a, int a_count, b_it b, int b_count, 76 | int tid, type_t (&shared)[shared_size]) { 77 | 78 | // Load into register then make an unconditional strided store into memory. 79 | std::array x = load_two_streams_reg( 80 | a, a_count, b, b_count, tid); 81 | reg_to_shared_strided(x, tid, shared); 82 | } 83 | 84 | template 85 | std::array gather_two_streams_strided(a_it a, int a_count, 86 | b_it b, int b_count, std::array indices, int tid) { 87 | 88 | std::array x; 89 | strided_iterate([&](int i, int j) { 90 | x[i] = (indices[i] < a_count) ? a[indices[i]] : b[indices[i] - a_count]; 91 | }, tid, a_count + b_count); 92 | 93 | return x; 94 | } 95 | 96 | template 97 | void transfer_two_streams_strided(a_it a, int a_count, b_it b, 98 | int b_count, std::array indices, int tid, c_it c) { 99 | 100 | typedef typename std::iterator_traits::value_type type_t; 101 | std::array x = gather_two_streams_strided(a, 102 | a_count, b, b_count, indices, tid); 103 | 104 | reg_to_mem_strided(x, tid, a_count + b_count, c); 105 | } 106 | 107 | // This function must be able to dereference keys[a_begin] and keys[b_begin], 108 | // no matter the indices for each. 
The caller should allocate at least 109 | // nt * vt + 1 elements for keys_shared. 110 | template 111 | merge_pair_t serial_merge(const type_t* keys_shared, 112 | merge_range_t range, comp_t comp, bool sync = true) { 113 | 114 | type_t a_key = keys_shared[range.a_begin]; 115 | type_t b_key = keys_shared[range.b_begin]; 116 | 117 | merge_pair_t merge_pair; 118 | 119 | @meta for(int i = 0; i < vt; ++i) {{ 120 | bool p = merge_predicate(a_key, b_key, range, comp); 121 | int index = p ? range.a_begin : range.b_begin; 122 | 123 | merge_pair.keys[i] = p ? a_key : b_key; 124 | merge_pair.indices[i] = index; 125 | 126 | type_t c_key = keys_shared[++index]; 127 | if(p) a_key = c_key, range.a_begin = index; 128 | else b_key = c_key, range.b_begin = index; 129 | }} 130 | 131 | if(sync) __syncthreads(); 132 | return merge_pair; 133 | } 134 | 135 | // Load arrays a and b from global memory and merge into register. 136 | template 138 | merge_pair_t cta_merge_from_mem(a_it a, b_it b, 139 | merge_range_t range_mem, int tid, comp_t comp, 140 | type_t (&keys_shared)[shared_size]) { 141 | 142 | static_assert(shared_size >= nt * vt + 1, 143 | "cta_merge_from_mem requires temporary storage of at " 144 | "least nt * vt + 1 items"); 145 | 146 | // Load the data into shared memory. 147 | load_two_streams_shared(a + range_mem.a_begin, range_mem.a_count(), 148 | b + range_mem.b_begin, range_mem.b_count(), tid, keys_shared); 149 | 150 | // Run a merge path to find the start of the serial merge for each thread. 151 | merge_range_t range_local = range_mem.to_local(); 152 | int diag = vt * tid; 153 | int mp = merge_path(keys_shared, range_local, diag, comp); 154 | 155 | // Compute the ranges of the sources in shared memory. The end iterators 156 | // of the range are inaccurate, but still facilitate exact merging, because 157 | // only vt elements will be merged. 
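  // Each thread owns the vt outputs starting at diagonal vt * tid: the
  // merge_path search above found where that diagonal splits the A and B
  // ranges in shared memory, and serial_merge advances exactly vt times
  // from that split, which is why the sloppy end bounds are harmless.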
158 | merge_pair_t merged = serial_merge(keys_shared, 159 | range_local.partition(mp, diag), comp); 160 | 161 | return merged; 162 | }; 163 | 164 | END_MGPU_NAMESPACE 165 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cta_merge.hxx" 3 | #include "sort_networks.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | constexpr int out_of_range_flags(int first, int vt, int count) { 8 | int out_of_range = min(vt, first + vt - count); 9 | int head_flags = 0; 10 | if(out_of_range > 0) { 11 | const int mask = (1<< vt) - 1; 12 | head_flags = mask & (~mask>> out_of_range); 13 | } 14 | return head_flags; 15 | } 16 | 17 | constexpr merge_range_t compute_mergesort_frame(int partition, int coop, 18 | int spacing) { 19 | 20 | int size = spacing * (coop / 2); 21 | int start = ~(coop - 1) & partition; 22 | int a_begin = spacing * start; 23 | int b_begin = spacing * start + size; 24 | 25 | return merge_range_t { 26 | a_begin, 27 | a_begin + size, 28 | b_begin, 29 | b_begin + size 30 | }; 31 | } 32 | 33 | constexpr merge_range_t compute_mergesort_range(int count, int partition, 34 | int coop, int spacing) { 35 | 36 | merge_range_t frame = compute_mergesort_frame(partition, coop, spacing); 37 | return merge_range_t { 38 | frame.a_begin, 39 | min(count, frame.a_end), 40 | min(count, frame.b_begin), 41 | min(count, frame.b_end) 42 | }; 43 | } 44 | 45 | constexpr merge_range_t compute_mergesort_range(int count, int partition, 46 | int coop, int spacing, int mp0, int mp1) { 47 | 48 | merge_range_t range = compute_mergesort_range(count, partition, 49 | coop, spacing); 50 | 51 | // Locate the diagonal from the start of the A sublist. 52 | int diag = spacing * partition - range.a_begin; 53 | 54 | // The end partition of the last cta for each merge operation is computed 55 | // and stored as the begin partition for the subsequent merge. i.e. it is 56 | // the same partition but in the wrong coordinate system, so its 0 when it 57 | // should be listSize. Correct that by checking if this is the last cta 58 | // in this merge operation. 59 | if(coop - 1 != ((coop - 1) & partition)) { 60 | range.a_end = range.a_begin + mp1; 61 | range.b_end = min(count, range.b_begin + diag + spacing - mp1); 62 | } 63 | 64 | range.a_begin = range.a_begin + mp0; 65 | range.b_begin = min(count, range.b_begin + diag - mp0); 66 | 67 | return range; 68 | } 69 | 70 | template 71 | struct cta_sort_t { 72 | static_assert(is_pow2(nt)); 73 | 74 | enum { 75 | has_values = !std::is_same_v, 76 | num_passes = s_log2(nt) 77 | }; 78 | 79 | ALIAS_UNION storage_t { 80 | key_t keys[nt * vt + 1]; 81 | val_t vals[nt * vt]; 82 | }; 83 | 84 | typedef kv_array_t array_t; 85 | 86 | template 87 | static array_t merge_pass(array_t x, int tid, int count, int pass, 88 | comp_t comp, storage_t& storage) { 89 | 90 | // Divide the CTA's keys into lists. 91 | int coop = 2<< pass; 92 | merge_range_t range = compute_mergesort_range(count, tid, coop, vt); 93 | int diag = vt * tid - range.a_begin; 94 | 95 | // Store the keys into shared memory for searching. 96 | reg_to_shared_thread(x.keys, tid, storage.keys); 97 | 98 | // Search for the merge for this thread within its list. 99 | int mp = merge_path(storage.keys, range, diag, comp); 100 | 101 | // Run a serial merge and return. 
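    // On pass p, groups of coop = 2 << p threads cooperate to merge two
    // sorted lists of vt << p keys each; mp is where this thread's
    // diagonal splits that pair, and the serial merge below emits this
    // thread's vt keys of the result.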
102 | merge_pair_t merge = serial_merge( 103 | storage.keys, range.partition(mp, diag), comp); 104 | x.keys = merge.keys; 105 | 106 | if constexpr(has_values) { 107 | // Reorder values through shared memory. 108 | reg_to_shared_thread(x.vals, tid, storage.vals); 109 | x.vals = shared_gather(storage.vals, merge.indices); 110 | } 111 | 112 | return x; 113 | } 114 | 115 | template 116 | static array_t block_sort(array_t x, int tid, int count, comp_t comp, 117 | storage_t& storage) { 118 | 119 | // Sort the inputs within each thread. If any threads have fewer than 120 | // vt items, use the segmented sort network to prevent out-of-range 121 | // elements from contaminating the sort. 122 | if(count < nt * vt) { 123 | int head_flags = out_of_range_flags(vt * tid, vt, count); 124 | x = odd_even_sort(x, comp, head_flags); 125 | } else 126 | x = odd_even_sort(x, comp); 127 | 128 | // Merge threads starting with a pair until all values are merged. 129 | for(int pass = 0; pass < num_passes; ++pass) 130 | x = merge_pass(x, tid, count, pass, comp, storage); 131 | 132 | return x; 133 | } 134 | }; 135 | 136 | END_MGPU_NAMESPACE 137 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_radix.hxx: -------------------------------------------------------------------------------- 1 | #include "cta_scan.hxx" 2 | #include 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | template 7 | struct radix_permute_t { 8 | typedef unsigned_int_by_size_t unsigned_type; 9 | typedef signed_int_by_size_t signed_type; 10 | 11 | static unsigned_type to_radix_bits(type_t x) { 12 | if constexpr(std::is_unsigned_v) { 13 | // Do nothing. 14 | return x; 15 | 16 | } else if constexpr(std::is_integral_v) { 17 | // Flip the most significant bit. 18 | return x ^ (1<< (8 * sizeof(type_t) - 1)); 19 | 20 | } else if constexpr(std::is_floating_point_v) { 21 | // Always flip the most significant bit. Flip all other bits if the 22 | // most significant bit started flipped. 23 | unsigned_type y = *reinterpret_cast(&x); 24 | unsigned_type mask = 25 | // Carry-in the sign bit to all lower bits 26 | ((signed_type)y>> (8 * sizeof(type_t) - 1)) | 27 | // Always set the most significant bit 28 | ((unsigned_type)1<< (8 * sizeof(type_t) - 1)); 29 | 30 | return y ^ mask; 31 | 32 | } else { 33 | static_assert("type cannot be converted to radix form"); 34 | } 35 | } 36 | 37 | static type_t from_radix_bits(unsigned_type x) { 38 | if constexpr(std::is_unsigned_v) { 39 | // Do nothing. 40 | return x; 41 | 42 | } else if constexpr(std::is_integral_v) { 43 | // Flip the most significant bit. 44 | return x ^ (1<< (8 * sizeof(type_t) - 1)); 45 | 46 | } else if constexpr(std::is_floating_point_v) { 47 | // Flip the sign bit. 48 | x ^= (unsigned_type)1<< (8 * sizeof(type_t) - 1); 49 | 50 | // Flip the lower bits if the sign bit is set. 
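      // (Inverse of to_radix_bits above: there a positive float only has
      // its sign bit flipped, while a negative float has every bit
      // flipped so that more-negative values order first. The sign bit
      // was undone just above; the mask below undoes the remaining bits
      // for negatives.)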
51 | unsigned_type mask = 52 | // Carry-in the sign bit to all lower bits 53 | ((signed_type)x>> (8 * sizeof(type_t) - 1)) & 54 | // Always clear the most significant bit 55 | (((unsigned_type)1<< (8 * sizeof(type_t) - 1)) - 1); 56 | x ^= mask; 57 | 58 | return *reinterpret_cast(&x); 59 | 60 | } else { 61 | static_assert("type cannot be converted from radix form"); 62 | } 63 | } 64 | }; 65 | 66 | //////////////////////////////////////////////////////////////////////////////// 67 | 68 | enum radix_kind_t { 69 | radix_kind_shared, 70 | radix_kind_ballot, 71 | }; 72 | 73 | template 74 | struct cta_radix_rank_t; 75 | 76 | //////////////////////////////////////////////////////////////////////////////// 77 | // Use shared memory histogram to rank digits. 78 | 79 | template 80 | struct cta_radix_rank_t { 81 | enum { num_bins = 1<< num_bits, num_slots = num_bins / 2 + 1 }; 82 | typedef cta_scan_t scan_t; 83 | 84 | template 85 | struct result_t { 86 | // All threads return scatter indices for each value. 87 | std::array indices; 88 | 89 | // The first num_bins threads return the corresponding digit count. 90 | uint digit_scan; 91 | }; 92 | 93 | union storage_t { 94 | uint16_t hist16[nt * num_bins]; 95 | uint32_t hist32[nt * num_slots]; 96 | typename scan_t::storage_t scan; 97 | }; 98 | 99 | // Return the cta-wide reduction for each digit in the first num_bins 100 | // threads. 101 | template 102 | uint reduce(std::array x, storage_t& shared) { 103 | int tid = glcomp_LocalInvocationID.x; 104 | 105 | // Cooperatively zero out the histogram smem. 106 | @meta for(int i = 0; i < num_slots; ++i) 107 | shared.hist32[nt * i + tid] = 0; 108 | __syncthreads(); 109 | 110 | // Compute the histogram for each thread. Not great for bank conflicts, but 111 | // at least it's easy. 112 | @meta for(int i = 0; i < vt; ++i) 113 | ++shared.hist16[nt * x[i] + tid]; 114 | __syncthreads(); 115 | 116 | // Cooperatively scan the entire histogram. Each thread loads 9 words, 117 | // which corresponds to 18 histogram slots. The odd grain size avoids 118 | // smem bank conflicts on all architectures. 119 | uint sum = 0; 120 | uint counters[num_slots]; 121 | @meta for(int i = 0; i < num_slots; ++i) { 122 | counters[i] = shared.hist32[num_slots * tid + i]; 123 | sum += counters[i]; 124 | } 125 | __syncthreads(); 126 | 127 | // Scan the reductions. 128 | uint carry_in = scan_t().scan(sum, shared.scan).scan; 129 | carry_in += (carry_in>> 16) | (carry_in<< 16); 130 | 131 | // Write the scanned histogram back to shared memory. 132 | @meta for(int i = 0; i < num_slots; ++i) { 133 | // Add .low to .high 134 | carry_in += counters[i]<< 16; 135 | carry_in += counters[i] + (counters[i]>> 16); 136 | shared.hist32[num_slots * tid + i] = carry_in; 137 | } 138 | __syncthreads(); 139 | 140 | // Get the digit totals. This is a maximally-conflicted operation. 141 | uint digit_count = 0; 142 | if(tid < num_bins) { 143 | digit_count = shared.hist16[nt * tid + nt - 1]; 144 | int left = subgroupShuffleUp(digit_count, 1); 145 | if(tid) 146 | digit_count -= left; 147 | } 148 | 149 | __syncthreads(); 150 | return digit_count; 151 | } 152 | 153 | // Return the scatter indices for all keys plus the cta-wide scan for 154 | // each digit. 155 | template 156 | result_t scatter(std::array x, storage_t& shared) { 157 | int tid = glcomp_LocalInvocationID.x; 158 | 159 | // Cooperatively zero out the histogram smem. 
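    // Plan for scatter(): histogram the digits per thread, scan the
    // histogram across the CTA, then count the digits a second time so
    // each key's scatter index becomes its digit's scanned offset plus
    // its occurrence number within the CTA.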
160 | @meta for(int i = 0; i < num_slots; ++i) 161 | shared.hist32[nt * i + tid] = 0; 162 | __syncthreads(); 163 | 164 | // Compute the histogram for each thread. Not great for bank conflicts, but 165 | // at least it's easy. 166 | @meta for(int i = 0; i < vt; ++i) 167 | ++shared.hist16[nt * x[i] + tid]; 168 | __syncthreads(); 169 | 170 | // Cooperatively scan the entire histogram. Each thread loads 9 words, 171 | // which corresponds to 18 histogram slots. The odd grain size avoids 172 | // smem bank conflicts on all architectures. 173 | uint sum = 0; 174 | uint counters[num_slots]; 175 | @meta for(int i = 0; i < num_slots; ++i) { 176 | counters[i] = shared.hist32[num_slots * tid + i]; 177 | sum += counters[i]; 178 | } 179 | __syncthreads(); 180 | 181 | // Scan the reductions. 182 | uint carry_in = scan_t().scan(sum, shared.scan).scan; 183 | carry_in += (carry_in>> 16) | (carry_in<< 16); 184 | 185 | // Write the scanned histogram back to shared memory. 186 | @meta for(int i = 0; i < num_slots; ++i) { 187 | // Add .low to .high 188 | carry_in += counters[i]<< 16; 189 | shared.hist32[num_slots * tid + i] = carry_in; 190 | carry_in += counters[i] + (counters[i]>> 16); 191 | } 192 | __syncthreads(); 193 | 194 | // Get the digit totals. This is a maximally-conflicted operation. 195 | uint digit_scan = tid < num_bins ? shared.hist16[nt * tid] : 0; 196 | __syncthreads(); 197 | 198 | // Make a downsweep pass by counting the digits a second time. 199 | std::array scatter; 200 | @meta for(int i = 0; i < vt; ++i) 201 | scatter[i] = shared.hist16[nt * x[i] + tid]++; 202 | __syncthreads(); 203 | 204 | return { scatter, digit_scan }; 205 | } 206 | }; 207 | 208 | //////////////////////////////////////////////////////////////////////////////// 209 | // Use ballot instruction to rank digits. Currently this only works on 210 | // 32-lane subgroups. 211 | 212 | template 213 | struct cta_radix_rank_t { 214 | enum { 215 | num_bins = 1<< num_bits, 216 | warp_size = 32, 217 | num_warps = nt / warp_size, 218 | counters_per_thread = num_warps * num_bins / nt 219 | }; 220 | 221 | // Simpler to require as many threads as thehre are histogram bins. 222 | static_assert(nt >= num_bins); 223 | static_assert(num_warps * num_bins >= nt); 224 | 225 | typedef cta_scan_t scan_t; 226 | 227 | template 228 | struct result_t { 229 | // All threads return scatter indices for each value. 230 | std::array indices; 231 | 232 | // The first num_bins threads return the corresponding digit count. 233 | uint digit_scan; 234 | }; 235 | 236 | union storage_t { 237 | uint32_t counters[num_warps * num_bins]; 238 | uint32_t hist32[num_warps][num_bins]; 239 | typename scan_t::storage_t scan; 240 | }; 241 | 242 | uint32_t get_matching_lanes(uint digit) { 243 | // Start with all lanes matching. 244 | uint32_t match = -1; 245 | @meta for(int i = 0; i < num_bits; ++i) {{ 246 | const uint flag = 1<< i; 247 | uint mask = flag & digit; 248 | uint32_t b = gl_subgroupBallot(mask).x; 249 | 250 | // Clear lanes from the match if they have a different bit. 251 | if(!mask) b = ~b; 252 | match &= b; 253 | }} 254 | 255 | return match; 256 | } 257 | 258 | template 259 | uint reduce(std::array x, storage_t& shared) { 260 | // Cooperatively zero out the histogram smem. 261 | int tid = glcomp_LocalInvocationID.x; 262 | int lane = gl_SubgroupInvocationID; 263 | int warp = gl_SubgroupID; 264 | 265 | // Cooperatively zero out the shared memory. 
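    // Ballot-based ranking: get_matching_lanes() returns, for each key,
    // the mask of lanes in this subgroup that hold the same digit. Only
    // the lowest lane of each match group bumps the per-warp bin,
    // adding bitCount(match) in one step.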
266 | @meta for(int i = 0; i < counters_per_thread; ++i) 267 | shared.counters[nt * i + tid] = 0; 268 | __syncthreads(); 269 | 270 | // Process each digit. 271 | @meta for(int i = 0; i < vt; ++i) {{ 272 | // Get a bitfield of lanes with matching digits. 273 | uint32_t match = get_matching_lanes(x[i]); 274 | 275 | // Increment the histogram bin to indicate the digit count. 276 | // Only the lowest lane in the match mask does this. 277 | if(0 == (gl_SubgroupLtMask & match)) 278 | shared.hist32[warp][x[i]] += bitCount(match); 279 | }} 280 | __syncthreads(); 281 | 282 | // Do a digit-wise reduction across warps. 283 | int digit_count = 0; 284 | if(tid < num_bins) { 285 | @meta for(int i = 0; i < num_warps; ++i) 286 | digit_count += shared.hist32[i][tid]; 287 | } 288 | __syncthreads(); 289 | 290 | return digit_count; 291 | } 292 | 293 | // The digits provided to scatter must be in warp-strided order. 294 | // That is, they first vary by i, then by lane. 295 | template 296 | result_t scatter(std::array x, storage_t& shared) { 297 | // Cooperatively zero out the histogram smem. 298 | int tid = glcomp_LocalInvocationID.x; 299 | int lane = gl_SubgroupInvocationID; 300 | int warp = gl_SubgroupID; 301 | 302 | // Cooperatively zero out the shared memory. 303 | @meta for(int i = 0; i < counters_per_thread; ++i) 304 | shared.counters[nt * i + tid] = 0; 305 | __syncthreads(); 306 | 307 | // Process each digit. 308 | uint matches[vt]; 309 | @meta for(int i = 0; i < vt; ++i) { 310 | // Get a bitfield of lanes with matching digits. 311 | matches[i] = get_matching_lanes(x[i]); 312 | if(0 == (gl_SubgroupLtMask & matches[i])) 313 | shared.hist32[warp][x[i]] += bitCount(matches[i]); 314 | } 315 | __syncthreads(); 316 | 317 | std::array counters; 318 | if(tid < num_bins) { 319 | // Reduce the digit counts over the warps and keep a copy of the 320 | // counters. 321 | @meta for(int i = 0; i < num_warps; ++i) 322 | counters[i] = shared.hist32[i][tid]; 323 | } 324 | __syncthreads(); 325 | 326 | // Do a cooperative CTA scan. 327 | auto result = scan_t().scan(counters, shared.scan); 328 | 329 | uint digit_scan = 0; 330 | if(tid < num_bins) { 331 | // Add back into the warp counters. 332 | digit_scan = result.scan[0]; 333 | @meta for(int i = 0; i < num_warps; ++i) 334 | shared.hist32[i][tid] = result.scan[i]; 335 | } 336 | __syncthreads(); 337 | 338 | // Make a second pass and compute scatter indices. 339 | std::array scatter; 340 | @meta for(int i = 0; i < vt; ++i) {{ 341 | uint lower_mask = gl_SubgroupLtMask.x & matches[i]; 342 | uint offset = shared.hist32[warp][x[i]]; 343 | scatter[i] = offset + bitCount(lower_mask); 344 | if(0 == (gl_SubgroupLtMask & matches[i])) 345 | shared.hist32[warp][x[i]] = offset + bitCount(matches[i]); 346 | }} 347 | __syncthreads(); 348 | 349 | return { scatter, digit_scan }; 350 | } 351 | }; 352 | 353 | END_MGPU_NAMESPACE 354 | -------------------------------------------------------------------------------- /inc/mgpu/common/cta_scan.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "loadstore.hxx" 3 | #include "subgroup.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | //////////////////////////////////////////////////////////////////////////////// 8 | 9 | template 10 | struct cta_reduce_t { 11 | struct storage_t { 12 | type_t warps[nt / 8]; 13 | }; 14 | 15 | // Reduce the values across a cta. Only thread 0 returns a value. If all 16 | // threads want the value, store to shared memory and broadcast. 
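  // Two-level reduction: a subgroupShuffleDown tree combines values
  // inside each subgroup, the first lane of each subgroup parks its
  // partial in shared.warps, and the same shuffle pattern then reduces
  // those partials. (warps[] is sized nt / 8, which presumably assumes
  // subgroups of at least 8 invocations.)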
17 | template > 18 | type_t reduce(type_t x, storage_t& shared, op_t op = op_t()) { 19 | int warp_size = gl_SubgroupSize; 20 | int num_warps = gl_NumSubgroups; 21 | int lane = gl_SubgroupInvocationID; 22 | int warp = gl_SubgroupID; 23 | 24 | // Reduce within a warp. 25 | for(int offset = 1; offset < warp_size; offset<<= 1) { 26 | type_t y = subgroupShuffleDown(x, offset); 27 | if(lane + offset < warp_size) 28 | x = op(x, y); 29 | } 30 | 31 | // The first lane in each warp writes its reduction. 32 | if(!lane) 33 | shared.warps[warp] = x; 34 | __syncthreads(); 35 | 36 | // Scan the reductions. This assumes we can do it in one shot. 37 | if(lane < num_warps) { 38 | x = shared.warps[lane]; 39 | for(int offset = 1; offset < num_warps; offset<<= 1) { 40 | type_t y = subgroupShuffleDown(x, offset); 41 | if(lane + offset < num_warps) 42 | x = op(x, y); 43 | } 44 | } 45 | __syncthreads(); 46 | 47 | return x; 48 | } 49 | 50 | template > 51 | type_t reduce(std::array x, storage_t& shared, op_t op = op_t()) { 52 | // Reduce within a thread. 53 | @meta for(int i = 1; i < vt; ++i) 54 | x[0] = op(x[0], x[i]); 55 | 56 | // Reduce across threads. 57 | return reduce(x[0], shared, op); 58 | } 59 | }; 60 | 61 | //////////////////////////////////////////////////////////////////////////////// 62 | 63 | enum scan_type_t { 64 | scan_type_exc, 65 | scan_type_inc, 66 | }; 67 | 68 | template 0)> 69 | struct scan_result_t { 70 | type_t scan; 71 | type_t reduction; 72 | }; 73 | 74 | template 75 | struct scan_result_t { 76 | std::array scan; 77 | type_t reduction; 78 | }; 79 | 80 | template 81 | struct cta_scan_t { 82 | struct storage_t { 83 | int warps[nt / 8]; 84 | }; 85 | 86 | // Scalar scan. 87 | template< 88 | scan_type_t scan_type = scan_type_exc, 89 | typename op_t = std::plus 90 | > 91 | scan_result_t scan(type_t x, storage_t& shared, 92 | type_t init = type_t(), op_t op = op_t()) { 93 | 94 | int warp_size = gl_SubgroupSize; 95 | int num_warps = gl_NumSubgroups; 96 | int lane = gl_SubgroupInvocationID; 97 | int warp = gl_SubgroupID; 98 | 99 | // Use subgroupShuffleUp to prefix sum over a warp. 100 | for(int offset = 1; offset < warp_size; offset<<= 1) { 101 | type_t y = subgroupShuffleUp(x, offset); 102 | if(offset <= lane) 103 | x = op(y, x); 104 | } 105 | 106 | // The last lane in each warp writes its reduction. 107 | if(warp_size - 1 == lane) 108 | shared.warps[warp] = x; 109 | __syncthreads(); 110 | 111 | // Scan the reductions. This assumes we can do it in one shot. 112 | if(lane < num_warps) { 113 | type_t x = shared.warps[lane]; 114 | for(int offset = 1; offset < num_warps; offset<<= 1) { 115 | type_t y = subgroupShuffleUp(x, offset); 116 | if(offset <= lane) 117 | x = op(y, x); 118 | } 119 | shared.warps[lane] = x; 120 | } 121 | __syncthreads(); 122 | 123 | if constexpr(scan_type_exc == scan_type) { 124 | // For exclusive scan, get the value of the warp scan to the left. 125 | type_t left = subgroupShuffleUp(x, 1); 126 | x = lane ? left : init; 127 | } 128 | 129 | if(warp) 130 | x = op(shared.warps[warp - 1], x); 131 | type_t reduction = shared.warps[num_warps - 1]; 132 | __syncthreads(); 133 | 134 | return { x, reduction }; 135 | } 136 | 137 | // CTA vectorized scan. Accepts multiple values per thread and adds in 138 | // optional global carry-in. 
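  // Strategy: each thread first reduces its vt values, the scalar scan
  // above turns those sums (plus the optional global carry_in) into
  // per-thread carry-ins, and the downsweep below rebuilds the
  // per-element scan. E.g. with vt = 2 and thread values [a,b] [c,d],
  // the exclusive result is [0, a] [a+b, a+b+c].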
139 | template< 140 | scan_type_t scan_type = scan_type_exc, 141 | int vt, 142 | typename op_t = std::plus 143 | > 144 | scan_result_t scan(std::array x, storage_t& shared, 145 | type_t carry_in = type_t(), type_t init = type_t(), op_t op = op_t()) { 146 | 147 | int tid = glcomp_LocalInvocationID.x; 148 | 149 | // Reduce the inputs and scan them. 150 | type_t sum = (... + x...[:]); 151 | 152 | // Scan the thread-local reductions for a carry-in for each thread. 153 | scan_result_t result = scan(sum, shared, init, op); 154 | 155 | // Perform the scan downsweep and add both global carry-in and the thread 156 | // carry-in to the values. 157 | result.reduction = op(carry_in, result.reduction); 158 | result.scan = op(carry_in, result.scan); 159 | 160 | if constexpr(scan_type_exc == scan_type) { 161 | @meta for(int i = 0; i < vt; ++i) { 162 | type_t temp = x[i]; 163 | x[i] = result.scan; 164 | result.scan += temp; 165 | } 166 | 167 | } else { 168 | // Add the carry-in. 169 | @meta for(int i = 0; i < vt; ++i) 170 | x[i] = op(result.scan, x[i]); 171 | } 172 | 173 | return { x, result.reduction }; 174 | } 175 | }; 176 | 177 | END_MGPU_NAMESPACE 178 | -------------------------------------------------------------------------------- /inc/mgpu/common/kernel_merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cta_merge.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | //////////////////////////////////////////////////////////////////////////////// 7 | // Generic merge code for a compute kernel. 8 | 9 | template< 10 | int nt, int vt, 11 | typename mp_it, 12 | typename a_keys_it, typename a_vals_it, 13 | typename b_keys_it, typename b_vals_it, 14 | typename c_keys_it, typename c_vals_it, 15 | typename comp_t 16 | > 17 | void kernel_merge( 18 | mp_it mp_data, 19 | a_keys_it a_keys, a_vals_it a_vals, int a_count, 20 | b_keys_it b_keys, b_vals_it b_vals, int b_count, 21 | c_keys_it c_keys, c_vals_it c_vals, comp_t comp) { 22 | 23 | typedef typename std::iterator_traits::value_type key_t; 24 | typedef typename std::iterator_traits::value_type val_t; 25 | 26 | const int nv = nt * vt; 27 | int tid = threadIdx.x; 28 | int cta = blockIdx.x; 29 | 30 | __shared__ ALIAS_UNION { 31 | key_t keys[nv + 1]; 32 | int indices[nv]; 33 | } shared; 34 | 35 | // Load the range for this CTA and merge the values into register. 36 | int mp0 = mp_data[cta + 0]; 37 | int mp1 = mp_data[cta + 1]; 38 | merge_range_t range = compute_merge_range(a_count, b_count, cta, nv, 39 | mp0, mp1); 40 | 41 | merge_pair_t merge = cta_merge_from_mem( 42 | a_keys, b_keys, range, tid, comp, shared.keys); 43 | 44 | int dest_offset = nv * cta; 45 | reg_to_mem_thread(merge.keys, tid, range.total(), c_keys + dest_offset, 46 | shared.keys); 47 | 48 | if constexpr(!std::is_same_v) { 49 | // Transpose the indices from thread order to strided order. 50 | std::array indices = reg_thread_to_strided(merge.indices, tid, 51 | shared.indices); 52 | 53 | // Gather the input values and merge into the output values. 
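    // merge.indices index the "A then B" concatenation that was staged in
    // shared memory, so the same indices gather the corresponding values
    // straight from a_vals / b_vals without re-running any comparisons.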
54 | transfer_two_streams_strided(a_vals + range.a_begin, range.a_count(), 55 | b_vals + range.b_begin, range.b_count(), indices, tid, 56 | c_vals + dest_offset); 57 | } 58 | } 59 | 60 | END_MGPU_NAMESPACE 61 | -------------------------------------------------------------------------------- /inc/mgpu/common/kernel_mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cta_mergesort.hxx" 3 | #include "bindings.hxx" 4 | 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | //////////////////////////////////////////////////////////////////////////////// 10 | // Sort full tiles in place. 11 | 12 | template< 13 | bool sort_indices, 14 | int nt, int vt, 15 | typename keys_in_it, typename vals_in_it, 16 | typename keys_out_it, typename vals_out_it, 17 | typename comp_t 18 | > 19 | void kernel_blocksort( 20 | keys_in_it keys_in, vals_in_it vals_in, 21 | keys_out_it keys_out, vals_out_it vals_out, 22 | int count, comp_t comp) { 23 | 24 | typedef typename std::iterator_traits::value_type key_t; 25 | typedef typename std::iterator_traits::value_type val_t; 26 | enum { has_values = !std::is_same::value }; 27 | 28 | typedef cta_sort_t sort_t; 29 | sort_t sort; 30 | 31 | __shared__ typename sort_t::storage_t shared; 32 | 33 | const int nv = nt * vt; 34 | int tid = threadIdx.x; 35 | int cta = blockIdx.x; 36 | range_t tile = get_tile(cta, nv, count); 37 | 38 | // Load the keys and values. 39 | kv_array_t unsorted; 40 | unsorted.keys = mem_to_reg_thread(keys_in + tile.begin, tid, 41 | tile.count(), shared.keys); 42 | 43 | if constexpr(sort_indices) { 44 | // If we're sorting key/index pairs, sythesize the data without sampling 45 | // the counting_iterator, which would perform a trip through shared 46 | // memory. 47 | int index = vt * (nt * cta + tid); 48 | @meta for(int i = 0; i < vt; ++i) 49 | unsorted.vals[i] = index + i; 50 | 51 | } else if constexpr(has_values) { 52 | unsorted.vals = mem_to_reg_thread(vals_in + tile.begin, tid, 53 | tile.count(), shared.vals); 54 | } 55 | 56 | // Blocksort. 57 | kv_array_t sorted = sort_t().block_sort(unsorted, 58 | tid, tile.count(), comp, shared); 59 | 60 | // Store the keys and values. 61 | reg_to_mem_thread(sorted.keys, tid, tile.count(), 62 | keys_out + tile.begin, shared.keys); 63 | 64 | if constexpr(has_values) 65 | reg_to_mem_thread(sorted.vals, tid, tile.count(), 66 | vals_out + tile.begin, shared.vals); 67 | } 68 | 69 | //////////////////////////////////////////////////////////////////////////////// 70 | // Join two fully sorted sequences into one sequence. 71 | 72 | template< 73 | int nt, int vt, 74 | typename mp_it, 75 | typename keys_in_it, typename vals_in_it, 76 | typename keys_out_it, typename vals_out_it, 77 | typename comp_t 78 | > 79 | void kernel_mergesort_pass( 80 | mp_it mp_data, 81 | keys_in_it keys_in, vals_in_it vals_in, 82 | keys_out_it keys_out, vals_out_it vals_out, 83 | int count, int coop, comp_t comp) { 84 | 85 | typedef typename std::iterator_traits::value_type key_t; 86 | typedef typename std::iterator_traits::value_type val_t; 87 | enum { has_values = !std::is_same::value }; 88 | 89 | const int nv = nt * vt; 90 | int tid = threadIdx.x; 91 | int cta = blockIdx.x; 92 | 93 | __shared__ ALIAS_UNION { 94 | key_t keys[nv + 1]; 95 | int indices[nv]; 96 | } shared; 97 | 98 | range_t tile = get_tile(cta, nv, count); 99 | 100 | // Load the range for this CTA and merge the values into register. 
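  // Each pass joins pairs of already-sorted sublists that live in the
  // same keys_in array (hence keys_in is passed as both A and B below);
  // coop CTAs cooperate on one merge, and mp_data holds the merge-path
  // splits, presumably produced by the separate partitioning kernel.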
101 | merge_range_t range = compute_mergesort_range(count, cta, coop, nv, 102 | mp_data[cta + 0], mp_data[cta + 1]); 103 | 104 | merge_pair_t merge = cta_merge_from_mem( 105 | keys_in, keys_in, range, tid, comp, shared.keys); 106 | 107 | // Store merged values back out. 108 | reg_to_mem_thread(merge.keys, tid, tile.count(), 109 | keys_out + tile.begin, shared.keys); 110 | 111 | if constexpr(has_values) { 112 | // Transpose the indices from thread order to strided order. 113 | std::array indices = reg_thread_to_strided(merge.indices, 114 | tid, shared.indices); 115 | 116 | // Gather the input values and merge into the output values. 117 | transfer_two_streams_strided(vals_in + range.a_begin, 118 | range.a_count(), vals_in + range.b_begin, range.b_count(), 119 | indices, tid, vals_out + tile.begin); 120 | } 121 | } 122 | 123 | END_MGPU_NAMESPACE 124 | -------------------------------------------------------------------------------- /inc/mgpu/common/loadstore.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "types.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | //////////////////////////////////////////////////////////////////////////////// 8 | // reg<->shared 9 | 10 | template 11 | void reg_to_shared_thread(std::array x, int tid, 12 | type_t (&shared)[shared_size], bool sync = true) { 13 | 14 | static_assert(shared_size >= nt * vt, 15 | "reg_to_shared_thread must have at least nt * vt storage"); 16 | 17 | // thread_iterate([&](int i, int j) { 18 | // shared[j] = x[i]; 19 | // }, tid); 20 | 21 | @meta for(int i = 0; i < vt; ++i) 22 | shared[vt * tid + i] = x[i]; 23 | 24 | if(sync) __syncthreads(); 25 | } 26 | 27 | template 28 | std::array shared_to_reg_thread( 29 | const type_t (&shared)[shared_size], int tid, bool sync = true) { 30 | 31 | static_assert(shared_size >= nt * vt, 32 | "reg_to_shared_thread must have at least nt * vt storage"); 33 | 34 | std::array x; 35 | thread_iterate([&](int i, int j) { 36 | x[i] = shared[j]; 37 | }, tid); 38 | if(sync) __syncthreads(); 39 | return x; 40 | } 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | 44 | template 45 | void reg_to_shared_strided(std::array x, int tid, 46 | type_t (&shared)[shared_size], bool sync = true) { 47 | 48 | static_assert(shared_size >= nt * vt, 49 | "reg_to_shared_strided must have at least nt * vt storage"); 50 | 51 | // strided_iterate([&](int i, int j) { shared[j] = x[i]; }, tid); 52 | 53 | @meta for(int i = 0; i < vt; ++i) 54 | shared[nt * i + tid] = x[i]; 55 | 56 | if(sync) __syncthreads(); 57 | } 58 | 59 | template 60 | std::array shared_to_reg_strided( 61 | const type_t (&shared)[shared_size], int tid, bool sync = true) { 62 | 63 | static_assert(shared_size >= nt * vt, 64 | "shared_to_reg_strided must have at least nt * vt storage"); 65 | 66 | std::array x; 67 | strided_iterate([&](int i, int j) { x[i] = shared[j]; }, tid); 68 | if(sync) __syncthreads(); 69 | return x; 70 | } 71 | 72 | //////////////////////////////////////////////////////////////////////////////// 73 | 74 | template 75 | std::array shared_to_reg_warp(const type_t (&shared)[shared_size], 76 | int lane, int warp, int warp_size, bool sync = true) { 77 | 78 | uint cur = vt * warp_size * warp + lane; 79 | 80 | std::array x; 81 | @meta for(int i = 0; i < vt; ++i) 82 | x[i] = shared[cur + i * warp_size]; 83 | if(sync) __syncthreads(); 84 | return x; 85 | } 86 | 87 | 
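// Illustrative aside (standalone C++, not part of loadstore.hxx): the helpers
// above stage registers through shared memory to convert between "thread
// order" (thread tid owns elements vt*tid .. vt*tid + vt - 1) and "strided
// order" (thread tid owns elements tid, nt + tid, 2*nt + tid, ...). A minimal
// host-side model of that round trip, assuming nt = 4 and vt = 2:

#include <array>
#include <cstdio>

int main() {
  const int nt = 4, vt = 2, nv = nt * vt;
  int shared[nv];
  std::array<std::array<int, vt>, nt> regs{};     // regs[tid][i]

  // Each "thread" starts with a thread-order slice of 0 .. nv - 1.
  for (int tid = 0; tid < nt; ++tid)
    for (int i = 0; i < vt; ++i)
      regs[tid][i] = vt * tid + i;

  // reg_to_shared_thread: shared[vt * tid + i] = x[i] (then a barrier).
  for (int tid = 0; tid < nt; ++tid)
    for (int i = 0; i < vt; ++i)
      shared[vt * tid + i] = regs[tid][i];

  // shared_to_reg_strided: x[i] = shared[nt * i + tid].
  for (int tid = 0; tid < nt; ++tid)
    for (int i = 0; i < vt; ++i)
      regs[tid][i] = shared[nt * i + tid];

  // Thread 1 now holds elements {1, 5}: one from each group of nt.
  printf("tid 1 holds %d and %d\n", regs[1][0], regs[1][1]);
}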
//////////////////////////////////////////////////////////////////////////////// 88 | 89 | template 90 | std::array shared_gather(const type_t(&data)[shared_size], 91 | std::array indices, bool sync = true) { 92 | 93 | static_assert(shared_size >= nt * vt, 94 | "shared_gather must have at least nt * vt storage"); 95 | 96 | std::array x { data[indices...[:]]... }; 97 | if(sync) __syncthreads(); 98 | return x; 99 | } 100 | 101 | template 102 | std::array thread_to_strided(std::array x, 103 | int tid, type_t (&shared)[shared_size]) { 104 | 105 | reg_to_shared_thread(x, tid, shared); 106 | return shared_to_reg_strided(shared, tid); 107 | } 108 | 109 | 110 | 111 | //////////////////////////////////////////////////////////////////////////////// 112 | // reg<->memory 113 | 114 | template 115 | void reg_to_mem_strided(std::array x, int tid, 116 | int count, it_t mem) { 117 | 118 | // strided_iterate([=](int i, int j) { 119 | // mem[j] = x[i]; 120 | // }, tid, count); 121 | @meta for(int i = 0; i < vt; ++i) {{ 122 | int k = nt * i + tid; 123 | if(k < count) 124 | mem[k] = x[i]; 125 | }} 126 | } 127 | 128 | template 129 | std::array::value_type, vt> 130 | mem_to_reg_strided(it_t mem, int tid, int count) { 131 | typedef typename std::iterator_traits::value_type type_t; 132 | std::array x; 133 | 134 | // strided_iterate([&](int i, int j) { 135 | // x[i] = mem[j]; 136 | // }, tid, count); 137 | 138 | @meta for(int i = 0; i < vt; ++i) {{ 139 | int k = nt * i + tid; 140 | if(k < count) 141 | x[i] = mem[k]; 142 | }} 143 | 144 | return x; 145 | } 146 | 147 | template 149 | void reg_to_mem_thread(std::array x, int tid, 150 | int count, it_t mem, type_t (&shared)[shared_size]) { 151 | 152 | reg_to_shared_thread(x, tid, shared); 153 | std::array y = shared_to_reg_strided(shared, tid); 154 | reg_to_mem_strided(y, tid, count, mem); 155 | } 156 | 157 | template 159 | std::array mem_to_reg_thread(it_t mem, int tid, 160 | int count, type_t (&shared)[shared_size]) { 161 | 162 | std::array x = mem_to_reg_strided(mem, tid, count); 163 | reg_to_shared_strided(x, tid, shared); 164 | std::array y = shared_to_reg_thread(shared, tid); 165 | return y; 166 | } 167 | 168 | template 169 | void mem_to_mem(input_it input, int tid, int count, 170 | output_it output) { 171 | typedef typename std::iterator_traits::value_type type_t; 172 | type_t x[vt]; 173 | 174 | strided_iterate([&](int i, int j) { 175 | x[i] = input[j]; 176 | }, tid, count); 177 | strided_iterate([&](int i, int j) { 178 | output[j] = x[i]; 179 | }, tid, count); 180 | } 181 | 182 | //////////////////////////////////////////////////////////////////////////////// 183 | // memory<->memory 184 | 185 | template 186 | void mem_to_shared(it_t mem, int tid, int count, type_t* shared, 187 | bool sync = true) { 188 | 189 | std::array x = mem_to_reg_strided(mem, tid, count); 190 | strided_iterate([&](int i, int j) { 191 | shared[j] = x[i]; 192 | }, tid, count); 193 | if(sync) __syncthreads(); 194 | } 195 | 196 | template 197 | void shared_to_mem(const type_t* shared, int tid, int count, 198 | it_t mem, bool sync = true) { 199 | 200 | strided_iterate([&](int i, int j) { 201 | mem[j] = shared[j]; 202 | }, tid, count); 203 | if(sync) __syncthreads(); 204 | } 205 | 206 | //////////////////////////////////////////////////////////////////////////////// 207 | // reg<->reg 208 | 209 | template 210 | std::array reg_thread_to_strided(std::array x, 211 | int tid, type_t (&shared)[shared_size]) { 212 | 213 | reg_to_shared_thread(x, tid, shared); 214 | return 
shared_to_reg_strided(shared, tid); 215 | } 216 | 217 | template 218 | std::array reg_strided_to_thread(std::array x, 219 | int tid, type_t (&shared)[shared_size]) { 220 | 221 | reg_to_shared_strided(x, tid, shared); 222 | return shared_to_reg_thread(shared, tid); 223 | } 224 | 225 | END_MGPU_NAMESPACE 226 | -------------------------------------------------------------------------------- /inc/mgpu/common/meta.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define BEGIN_MGPU_NAMESPACE namespace mgpu { 9 | #define END_MGPU_NAMESPACE } 10 | 11 | BEGIN_MGPU_NAMESPACE 12 | 13 | struct empty_t { }; 14 | 15 | template 16 | struct tuple_t { 17 | types_t @(int...) ...; 18 | }; 19 | 20 | constexpr int div_up(int x, int y) { 21 | return (x + y - 1) / y; 22 | } 23 | constexpr int64_t div_up(int64_t x, int64_t y) { 24 | return (x + y - 1) / y; 25 | } 26 | constexpr size_t div_up(size_t x, size_t y) { 27 | return (x + y - 1) / y; 28 | } 29 | 30 | template 31 | constexpr bool is_pow2(type_t x) { 32 | static_assert(std::is_integral_v); 33 | return 0 == (x & (x - 1)); 34 | } 35 | 36 | // Find log2(x) and optionally round up to the next integer logarithm. 37 | inline int find_log2(int x, bool round_up = false) { 38 | int a = 31 - __builtin_clz(x); 39 | if(round_up) a += !is_pow2(x); 40 | return a; 41 | } 42 | 43 | constexpr int s_log2(int x) { 44 | int i = 0; 45 | while(x) { 46 | x>>= 1; 47 | ++i; 48 | } 49 | return i; 50 | } 51 | 52 | template 53 | void iterate(func_t f) { 54 | @meta for(int i = 0; i < count; ++i) 55 | f(i); 56 | } 57 | 58 | // Invoke unconditionally. 59 | template 60 | void strided_iterate(func_t f, int tid) { 61 | @meta for(int i = 0; i < vt; ++i) 62 | f(i, nt * i + tid); 63 | } 64 | 65 | // Check range. 66 | template 67 | void strided_iterate(func_t f, int tid, int count) { 68 | // Unroll the first vt0 elements of each thread. 
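// Illustrative aside (standalone C++, not part of meta.hxx): the guarded
// strided_iterate below only bounds-checks iterations that can step past
// `count`; a full tile (count >= nt * vt0) runs its first vt0 iterations
// unchecked. A minimal host-side model of the strided index pattern and the
// guard, assuming nt = 4, vt = 2 and a partial tile of 6 valid elements:

#include <cstdio>

int main() {
  const int nt = 4, vt = 2;
  const int count = 6;                       // tile has nt * vt = 8 slots
  for (int tid = 0; tid < nt; ++tid)
    for (int i = 0; i < vt; ++i) {
      int index = nt * i + tid;              // strided order: nt * i + tid
      if (index < count)                     // the check the guard adds
        printf("thread %d handles element %d\n", tid, index);
    }
}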
69 | if constexpr(vt0) { 70 | if(vt0 > 1 && count >= nt * vt0) { 71 | strided_iterate(f, tid); // No checking 72 | 73 | } else { 74 | @meta for(int i = 0; i < vt0; ++i) {{ 75 | int index = nt * i + tid; 76 | if(index < count) f(i, index); 77 | }} 78 | } 79 | } 80 | 81 | @meta for(int i = vt0; i < vt; ++i) {{ 82 | int index = nt * i + tid; 83 | if(index < count) f(i, index); 84 | }} 85 | } 86 | 87 | template 88 | void thread_iterate(func_t f, int tid) { 89 | @meta for(int i = 0; i < vt; ++i) 90 | f(i, vt * tid + i); 91 | } 92 | 93 | template 94 | struct integer_by_size_t; 95 | 96 | template<> struct integer_by_size_t<1> { 97 | typedef int8_t signed_type; 98 | typedef uint8_t unsigned_type; 99 | }; 100 | template<> struct integer_by_size_t<2> { 101 | typedef int16_t signed_type; 102 | typedef uint16_t unsigned_type; 103 | }; 104 | template<> struct integer_by_size_t<4> { 105 | typedef int32_t signed_type; 106 | typedef uint32_t unsigned_type; 107 | }; 108 | template<> struct integer_by_size_t<8> { 109 | typedef int64_t signed_type; 110 | typedef uint64_t unsigned_type; 111 | }; 112 | 113 | template 114 | using signed_int_by_size_t = typename integer_by_size_t::signed_type; 115 | 116 | template 117 | using unsigned_int_by_size_t = typename integer_by_size_t::unsigned_type; 118 | 119 | template 120 | type_t* advance_pointer(void*& p, size_t count) { 121 | type_t* p2 = (type_t*)p; 122 | p = p2 + count; 123 | return p2; 124 | } 125 | 126 | #ifdef ALIASED_SMEM 127 | #define ALIAS_UNION union 128 | #pragma spirv GL_EXT_shared_memory_block 129 | #else 130 | #define ALIAS_UNION struct 131 | #endif 132 | 133 | END_MGPU_NAMESPACE 134 | -------------------------------------------------------------------------------- /inc/mgpu/common/sort_networks.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "types.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | //////////////////////////////////////////////////////////////////////////////// 7 | // Odd-even transposition sorting network. Sorts keys and values in-place in 8 | // register. 9 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 10 | 11 | template 12 | std::array odd_even_sort(std::array x, comp_t comp, 13 | int flags = 0) { 14 | 15 | @meta for(int I = 0; I < vt; ++I) { 16 | @meta for(int i = 1 & I; i < vt - 1; i += 2) { 17 | if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i])) 18 | std::swap(x[i], x[i + 1]); 19 | } 20 | } 21 | return x; 22 | } 23 | 24 | template 25 | kv_array_t odd_even_sort(kv_array_t x, 26 | comp_t comp, int flags = 0) { 27 | 28 | @meta for(int I = 0; I < vt; ++I) { 29 | @meta for(int i = 1 & I; i < vt - 1; i += 2) { 30 | if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) { 31 | std::swap(x.keys[i], x.keys[i + 1]); 32 | 33 | if constexpr(!std::is_same_v) 34 | std::swap(x.vals[i], x.vals[i + 1]); 35 | } 36 | } 37 | } 38 | return x; 39 | } 40 | 41 | END_MGPU_NAMESPACE 42 | -------------------------------------------------------------------------------- /inc/mgpu/common/subgroup.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "meta.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | template 7 | type_t subgroupShuffle(type_t x, uint id) { 8 | if constexpr(std::is_array_v || 9 | requires { typename std::tuple_size::type; }) { 10 | 11 | // Shuffle elemnents of arrays and tuples. 
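// Illustrative aside (standalone C++, not part of these headers): the
// odd-even transposition network in sort_networks.hxx above sorts a thread's
// vt registers with a fixed, data-independent sequence of compare-and-swap
// passes, which is why it can be fully unrolled at compile time. A minimal
// host-side model with vt = 5, std::less ordering and no `flags` short-cuts:

#include <algorithm>
#include <array>
#include <cassert>

int main() {
  const int vt = 5;
  std::array<int, vt> x{4, 1, 5, 3, 2};

  for (int pass = 0; pass < vt; ++pass)           // vt passes in total
    for (int i = pass & 1; i < vt - 1; i += 2)    // even/odd pairs alternate
      if (x[i + 1] < x[i])
        std::swap(x[i], x[i + 1]);

  assert(std::is_sorted(x.begin(), x.end()));
}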
12 | x...[:] = subgroupShuffle(x...[:], id)...; 13 | 14 | } else if constexpr(std::is_class_v) { 15 | // Shuffle all public base classes and data members of class objects. 16 | x...[:] = subgroupShuffle(x.@base_values(), id)...; 17 | x...[:] = subgroupShuffle(x.@member_values(), id)...; 18 | 19 | } else { 20 | // Plain shuffle scalars. 21 | x = gl_subgroupShuffle(x, id); 22 | } 23 | 24 | return x; 25 | } 26 | 27 | template 28 | type_t subgroupShuffleDown(type_t x, uint delta) { 29 | if constexpr(std::is_array_v || 30 | requires { typename std::tuple_size::type; }) { 31 | x...[:] = subgroupShuffleDown(x...[:], delta)...; 32 | 33 | } else if constexpr(std::is_class_v) { 34 | x...[:] = subgroupShuffleDown(x.@base_values(), delta)...; 35 | x...[:] = subgroupShuffleDown(x.@member_values(), delta)...; 36 | 37 | } else { 38 | x = gl_subgroupShuffleDown(x, delta); 39 | } 40 | return x; 41 | } 42 | 43 | template 44 | type_t subgroupShuffleUp(type_t x, uint delta) { 45 | if constexpr(std::is_array_v || 46 | requires { typename std::tuple_size::type; }) { 47 | x...[:] = subgroupShuffleUp(x...[:], delta)...; 48 | 49 | } else if constexpr(std::is_class_v) { 50 | x...[:] = subgroupShuffleUp(x.@base_values(), delta)...; 51 | x...[:] = subgroupShuffleUp(x.@member_values(), delta)...; 52 | 53 | } else { 54 | x = gl_subgroupShuffleUp(x, delta); 55 | } 56 | return x; 57 | } 58 | 59 | END_MGPU_NAMESPACE 60 | -------------------------------------------------------------------------------- /inc/mgpu/common/types.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "meta.hxx" 3 | #include 4 | #include 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | struct alignas(16) range_t { 9 | int begin, end; 10 | int size() const noexcept { return end - begin; } 11 | int count() const noexcept { return size(); } 12 | bool valid() const noexcept { return begin < end; } 13 | }; 14 | 15 | inline range_t get_tile(int cta, int nv, int count) noexcept { 16 | return range_t { nv * cta, min(count, nv * (cta + 1)) }; 17 | } 18 | 19 | struct alignas(16) merge_range_t { 20 | int a_begin, a_end, b_begin, b_end; 21 | 22 | int a_count() const noexcept { return a_end - a_begin; } 23 | int b_count() const noexcept { return b_end - b_begin; } 24 | int total() const noexcept { return a_count() + b_count(); } 25 | 26 | range_t a_range() const noexcept { 27 | return { a_begin, a_end }; 28 | } 29 | range_t b_range() const noexcept { 30 | return { b_begin, b_end }; 31 | } 32 | 33 | merge_range_t to_local() const noexcept { 34 | return { 0, a_count(), a_count(), total() }; 35 | } 36 | 37 | // Partition from mp to the end. 38 | merge_range_t partition(int mp0, int diag) const noexcept { 39 | return { a_begin + mp0, a_end, b_begin + diag - mp0, b_end }; 40 | } 41 | 42 | // Partition from mp0 to mp1. 
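// Illustrative aside (standalone C++, not part of types.hxx): the
// partition(mp0, diag0, mp1, diag1) member defined just below carves out the
// slice of the merge between two cross-diagonals; mp of those outputs come
// from A and diag - mp from B. A worked example using a simplified stand-in
// struct (range2_t is an illustrative name, not the real merge_range_t):

#include <cassert>

struct range2_t { int a_begin, a_end, b_begin, b_end; };

range2_t partition(range2_t r, int mp0, int diag0, int mp1, int diag1) {
  return { r.a_begin + mp0, r.a_begin + mp1,
           r.b_begin + diag0 - mp0, r.b_begin + diag1 - mp1 };
}

int main() {
  range2_t full { 0, 8, 0, 8 };        // 8 keys in A and 8 in B
  int mp0 = 1, diag0 = 4;              // outputs [4, 8) of the merge...
  int mp1 = 4, diag1 = 8;              // ...of which mp1 - mp0 = 3 come from A
  range2_t part = partition(full, mp0, diag0, mp1, diag1);

  int a_count = part.a_end - part.a_begin;     // 3 keys taken from A
  int b_count = part.b_end - part.b_begin;     // 1 key taken from B
  assert(a_count + b_count == diag1 - diag0);  // the slice covers 4 outputs
}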
43 | merge_range_t partition(int mp0, int diag0, int mp1, int diag1) const noexcept { 44 | return { 45 | a_begin + mp0, 46 | a_begin + mp1, 47 | b_begin + diag0 - mp0, 48 | b_begin + diag1 - mp1 49 | }; 50 | } 51 | 52 | bool a_valid() const noexcept { 53 | return a_begin < a_end; 54 | } 55 | bool b_valid() const noexcept { 56 | return b_begin < b_end; 57 | } 58 | }; 59 | 60 | template 61 | struct merge_pair_t { 62 | int mp; 63 | std::array keys; 64 | std::array indices; 65 | }; 66 | 67 | template 68 | struct kv_array_t { 69 | std::array keys; 70 | std::array vals; 71 | }; 72 | 73 | enum bounds_t { 74 | bounds_lower, 75 | bounds_upper 76 | }; 77 | 78 | END_MGPU_NAMESPACE 79 | -------------------------------------------------------------------------------- /inc/mgpu/gl/app.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "buffer.hxx" 3 | #include 4 | #include 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace gl { 10 | 11 | struct app_t { 12 | app_t(const char* name); 13 | 14 | protected: 15 | virtual void debug_callback(GLenum source, GLenum type, GLuint id, 16 | GLenum severity, GLsizei length, const GLchar* message); 17 | 18 | GLFWwindow* window = nullptr; 19 | 20 | private: 21 | static void _debug_callback(GLenum source, GLenum type, GLuint id, 22 | GLenum severity, GLsizei length, const GLchar* message, 23 | const void* user_param); 24 | 25 | }; 26 | 27 | app_t::app_t(const char* name) { 28 | glfwInit(); 29 | gl3wInit(); 30 | 31 | glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4); 32 | glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6); 33 | 34 | glfwWindowHint(GLFW_VISIBLE, GLFW_FALSE); 35 | window = glfwCreateWindow(320, 240, name, nullptr, nullptr); 36 | glfwMakeContextCurrent(window); 37 | glfwSwapInterval(1); 38 | 39 | glEnable(GL_DEBUG_OUTPUT); 40 | glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); 41 | glDebugMessageCallback(_debug_callback, this); 42 | } 43 | 44 | void app_t::debug_callback(GLenum source, GLenum type, GLuint id, 45 | GLenum severity, GLsizei length, const GLchar* message) { 46 | 47 | if(GL_DEBUG_SEVERITY_HIGH == severity) { 48 | printf("OpenGL: %s\n", message); 49 | exit(1); 50 | } 51 | } 52 | 53 | void app_t::_debug_callback(GLenum source, GLenum type, GLuint id, 54 | GLenum severity, GLsizei length, const GLchar* message, 55 | const void* user_param) { 56 | 57 | app_t* app = (app_t*)user_param; 58 | app->debug_callback(source, type, id, severity, length, message); 59 | } 60 | 61 | } // namespace gl 62 | 63 | END_MGPU_NAMESPACE 64 | -------------------------------------------------------------------------------- /inc/mgpu/gl/buffer.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/bindings.hxx" 3 | 4 | #define GL_GLEXT_PROTOTYPES 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace gl { 10 | 11 | template< 12 | typename T, 13 | bool is_array = std::is_array_v, 14 | bool is_const = std::is_const_v 15 | > 16 | struct gl_buffer_t { 17 | typedef std::remove_extent_t type_t; 18 | 19 | gl_buffer_t() : buffer(0), count(0) { } 20 | 21 | gl_buffer_t(int count, const type_t* data = nullptr) noexcept : count(count) { 22 | glCreateBuffers(1, &buffer); 23 | glNamedBufferStorage(buffer, sizeof(type_t) * count, data, 24 | GL_DYNAMIC_STORAGE_BIT); 25 | } 26 | gl_buffer_t(const std::vector& data) noexcept : 27 | gl_buffer_t(data.size(), data.data()) { } 28 | 29 | ~gl_buffer_t() { 30 | if(buffer) 31 | glDeleteBuffers(1, &buffer); 32 | } 33 | 
34 | gl_buffer_t(const gl_buffer_t&) = delete; 35 | gl_buffer_t& operator=(const gl_buffer_t) = delete; 36 | 37 | operator GLuint() noexcept { return buffer; } 38 | 39 | void swap(gl_buffer_t& rhs) noexcept { 40 | std::swap(buffer, rhs.buffer); 41 | std::swap(count, rhs.count); 42 | } 43 | 44 | void set_data(const type_t* data) noexcept { 45 | if(count) { 46 | assert(buffer); 47 | glNamedBufferSubData(buffer, 0, sizeof(type_t) * count, data); 48 | } 49 | } 50 | void set_data(const std::vector& data) { 51 | resize(data.size()); 52 | set_data(data.data()); 53 | } 54 | void set_data_range(const type_t* data, int first, int count) { 55 | assert(first + count <= this->count); 56 | if(count) { 57 | assert(buffer); 58 | glNamedBufferSubData(buffer, sizeof(type_t) * first, 59 | count * sizeof(type_t), data); 60 | } 61 | } 62 | 63 | void get_data(type_t* data) noexcept { 64 | if(count) { 65 | assert(buffer); 66 | glGetNamedBufferSubData(buffer, 0, sizeof(type_t) * count, data); 67 | } 68 | } 69 | 70 | void clear_bytes() { 71 | if(count && buffer) { 72 | char zero = 0; 73 | glClearNamedBufferData(buffer, GL_R8I, GL_RED_INTEGER, 74 | GL_UNSIGNED_BYTE, &zero); 75 | } 76 | } 77 | 78 | void bind_ubo(GLuint index) { 79 | glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer); 80 | } 81 | void bind_ssbo(GLuint index) { 82 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, buffer); 83 | } 84 | 85 | template 86 | buffer_iterator_t bind_ssbo() { 87 | bind_ssbo(bind); 88 | return { }; 89 | } 90 | 91 | std::vector get_data() { 92 | std::vector vec(count); 93 | get_data(vec.data()); 94 | return vec; 95 | } 96 | 97 | void resize(int count2, bool preserve = false) { 98 | if(count != count2) { 99 | gl_buffer_t buffer2(count2); 100 | 101 | if(preserve && count && count2) { 102 | // Copy the old data into the new buffer. 103 | glCopyNamedBufferSubData(buffer, buffer2, 0, 0, 104 | std::min(count, count2) * sizeof(type_t)); 105 | } 106 | 107 | std::swap(buffer, buffer2.buffer); 108 | std::swap(count, buffer2.count); 109 | } 110 | } 111 | 112 | GLuint buffer; 113 | int count; 114 | }; 115 | 116 | template 117 | struct gl_buffer_t { 118 | gl_buffer_t(const type_t* data = nullptr) noexcept { 119 | glCreateBuffers(1, &buffer); 120 | glNamedBufferStorage(buffer, sizeof(type_t), data, 121 | GL_DYNAMIC_STORAGE_BIT); 122 | } 123 | 124 | ~gl_buffer_t() { 125 | glDeleteBuffers(1, &buffer); 126 | } 127 | 128 | gl_buffer_t(const gl_buffer_t&) = delete; 129 | gl_buffer_t& operator=(const gl_buffer_t) = delete; 130 | 131 | operator GLuint() noexcept { return buffer; } 132 | 133 | void set_data(const type_t& data) noexcept { 134 | assert(buffer); 135 | glNamedBufferSubData(buffer, 0, sizeof(type_t), &data); 136 | } 137 | void get_data(type_t* data) noexcept { 138 | assert(buffer); 139 | glGetNamedBufferSubData(buffer, 0, sizeof(type_t), data); 140 | } 141 | type_t get_data() noexcept { 142 | type_t x; 143 | get_data(&x); 144 | return x; 145 | } 146 | 147 | void bind_ubo(GLuint index) { 148 | glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer); 149 | } 150 | void bind_ssbo(GLuint index) { 151 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, buffer); 152 | } 153 | 154 | GLuint buffer; 155 | }; 156 | 157 | // A const non-array type keeps a copy of the object on the CPU. 
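// Illustrative usage sketch (not part of the original header): a typical
// round trip through the array specialisation above, assuming a current
// OpenGL 4.5+ context is already bound (for example one created by the app_t
// helper in app.hxx). The function name is hypothetical.
inline std::vector<float> gl_buffer_roundtrip_example() {
  std::vector<float> host(1024, 1.0f);
  gl_buffer_t<float[]> ssbo(host);     // glCreateBuffers + glNamedBufferStorage
  ssbo.bind_ssbo(0);                   // bind to GL_SHADER_STORAGE_BUFFER slot 0
  // ... dispatch a compute shader that reads/writes binding = 0 here ...
  return ssbo.get_data();              // glGetNamedBufferSubData back to host
}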
158 | template 159 | struct gl_buffer_t { 160 | typedef std::remove_const_t type_t; 161 | 162 | gl_buffer_t() : buffer(0), invalid(true) { 163 | glCreateBuffers(1, &buffer); 164 | glNamedBufferStorage(buffer, sizeof(type_t), nullptr, 165 | GL_DYNAMIC_STORAGE_BIT); 166 | } 167 | 168 | gl_buffer_t(const type_t& x) : data(x) { 169 | glCreateBuffers(1, &buffer); 170 | glNamedBufferStorage(buffer, sizeof(type_t), &data, 171 | GL_DYNAMIC_STORAGE_BIT); 172 | invalid = false; 173 | } 174 | 175 | ~gl_buffer_t() { 176 | glDeleteBuffers(1, &buffer); 177 | } 178 | 179 | void set_data(const type_t& x) noexcept { 180 | if(!data || memcmp(&x, &data, sizeof(type_t))) { 181 | data.emplace(x); 182 | invalid = true; 183 | } 184 | } 185 | 186 | void update() { 187 | if(invalid) { 188 | assert(data); 189 | glNamedBufferSubData(buffer, 0, sizeof(type_t), &*data); 190 | invalid = false; 191 | } 192 | } 193 | 194 | void bind_ubo(GLuint index) { 195 | update(); 196 | glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer); 197 | } 198 | 199 | void bind_ubo_range(GLuint index, size_t offset, size_t size) { 200 | update(); 201 | glBindBufferRange(GL_UNIFORM_BUFFER, index, buffer, offset, size); 202 | } 203 | 204 | GLuint buffer; 205 | bool invalid; 206 | std::optional data; 207 | }; 208 | 209 | } // namespace gl 210 | 211 | END_MGPU_NAMESPACE 212 | -------------------------------------------------------------------------------- /inc/mgpu/gl/merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_merge.hxx" 3 | #include "../common/bindings.hxx" 4 | #include "partition.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | namespace gl { 9 | 10 | template 11 | [[using spirv: comp, local_size(nt)]] 12 | void kernel_merge() { 13 | params_t params = shader_uniform; 14 | 15 | kernel_merge( 16 | readonly_iterator_t(), 17 | 18 | params.a_keys, 19 | params.a_vals, 20 | params.a_count, 21 | 22 | params.b_keys, 23 | params.b_vals, 24 | params.b_count, 25 | 26 | params.c_keys, 27 | params.c_vals, 28 | 29 | params.comp 30 | ); 31 | } 32 | 33 | template< 34 | typename a_keys_it, 35 | typename a_values_it, 36 | typename b_keys_it, 37 | typename b_values_it, 38 | typename c_keys_it, 39 | typename c_values_it, 40 | typename comp_t> 41 | struct merge_params_t { 42 | a_keys_it a_keys; 43 | b_keys_it b_keys; 44 | c_keys_it c_keys; 45 | 46 | int spacing; // NV * VT 47 | int a_count; 48 | int b_count; 49 | 50 | // Put the potentially empty objects together to take up less space. 51 | a_values_it a_vals; 52 | b_values_it b_vals; 53 | c_values_it c_vals; 54 | comp_t comp; 55 | }; 56 | 57 | template 58 | void launch_merge(int count) { 59 | // First launch the partition kernel. 60 | launch_partition(count, nt * vt); 61 | 62 | // Launch the CTA merge kernel. 63 | int num_ctas = div_up(count, nt * vt); 64 | gl_dispatch_kernel >(num_ctas); 65 | } 66 | 67 | // merge_pipeline_t is a convenient entry point for using the merge 68 | // kernel. It loads data from SSBOs and writes to an SSBO. Storage for 69 | // the parameters UBO and merge paths SSBO is handled automatically. 
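// Illustrative usage sketch (not part of the original header): a typical call
// site in user code, once the merge_pipeline_t defined just below is
// available. The pipeline owns the parameters UBO and the merge-path SSBO, so
// the caller only supplies the three key SSBOs. The function name is
// hypothetical; float keys and the default std::less comparator are assumed.
//
//   template<int nt = 128, int vt = 7>
//   void merge_two_ssbos(GLuint a_keys, int a_count,
//                        GLuint b_keys, int b_count, GLuint c_keys) {
//     static merge_pipeline_t<nt, vt, float> pipeline;  // reuses UBO/partitions
//     pipeline.launch(a_keys, a_count, b_keys, b_count, c_keys);
//   }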
70 | template > 72 | struct merge_pipeline_t { 73 | void reserve(int count, int spacing) { 74 | int num_partitions = num_merge_partitions(count, spacing); 75 | if(num_partitions > partitions_ssbo.count) 76 | partitions_ssbo.resize(num_partitions); 77 | } 78 | 79 | template 80 | void launch(GLuint a_keys, int a_count, GLuint b_keys, int b_count, 81 | GLuint c_keys, comp_t comp = comp_t()) { 82 | 83 | static_assert(std::is_same_v); 84 | 85 | // Bind the merge path SSBO. 86 | reserve(a_count + b_count, nt * vt); 87 | partitions_ssbo.bind_ssbo(3); 88 | 89 | params_t params { }; 90 | params.spacing = nt * vt; 91 | params.a_count = a_count; 92 | params.b_count = b_count; 93 | params.comp = comp; 94 | 95 | // Upload and bind the UBO. 96 | params_ubo.set_data(params); 97 | params_ubo.bind_ubo(0); 98 | 99 | // Bind the data. 100 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, a_keys); 101 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, b_keys); 102 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, c_keys); 103 | 104 | launch_merge(a_count + b_count); 105 | } 106 | 107 | template 108 | void launch(GLuint a_keys, GLuint a_vals, int a_count, GLuint b_keys, 109 | GLuint b_vals, int b_count, GLuint c_keys, GLuint c_vals, 110 | comp_t comp = comp_t()) { 111 | 112 | static_assert(!std::is_same_v); 113 | 114 | // Bind the merge path SSBO. 115 | reserve(a_count + b_count, nt * vt); 116 | partitions_ssbo.bind_ssbo(3); 117 | 118 | params_t params { }; 119 | params.spacing = nt * vt; 120 | params.a_count = a_count; 121 | params.b_count = b_count; 122 | params.comp = comp; 123 | 124 | // Upload and bind the UBO. 125 | params_ubo.set_data(params); 126 | params_ubo.bind_ubo(0); 127 | 128 | // Bind the data. 129 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, a_keys); 130 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, b_keys); 131 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, c_keys); 132 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, a_vals); 133 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, b_vals); 134 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, c_vals); 135 | 136 | launch_merge(a_count + b_count); 137 | } 138 | 139 | typedef merge_params_t< 140 | // A 141 | readonly_iterator_t, 142 | readonly_iterator_t, 143 | 144 | // B 145 | readonly_iterator_t, 146 | readonly_iterator_t, 147 | 148 | // C 149 | writeonly_iterator_t, 150 | writeonly_iterator_t, 151 | 152 | comp_t 153 | > params_t; 154 | 155 | gl_buffer_t partitions_ssbo; 156 | gl_buffer_t params_ubo; 157 | }; 158 | 159 | } // namespace gl 160 | 161 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/gl/mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_mergesort.hxx" 3 | #include "transform.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace gl { 8 | 9 | template 10 | [[using spirv: comp, local_size(nt)]] 11 | void kernel_blocksort() { 12 | params_t params = shader_uniform; 13 | 14 | kernel_blocksort( 15 | params.keys_block, 16 | params.vals_block, 17 | params.keys_out, 18 | params.vals_out, 19 | params.count, 20 | params.comp 21 | ); 22 | 23 | // Zero out the pass identifiers at the end of the mp data. The partition 24 | // and mergesort pass kernels use these terms to know which pass they're 25 | // working on. 
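// Note (not in the original source): concretely, two counters live just past
// the merge-path entries in mp_data. The blocksort zeroes
// mp_data[num_partitions]; each partition dispatch reads its pass index from
// that slot and echoes it into mp_data[num_partitions + 1]; each mergesort
// pass dispatch reads the echoed value and writes pass + 1 back to
// mp_data[num_partitions]. This is what lets the host enqueue every pass up
// front with a single UBO upload, as the pipeline code further below does.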
26 | //if(params.num_partitions + threadIdx.x + blockIdx.x) 27 | params.mp_data[params.num_partitions] = 0; 28 | } 29 | 30 | template 31 | void kernel_mergesort_partition(mp_it mp_data, keys_it keys, int count, 32 | int num_partitions, int spacing, int coop, comp_t comp) { 33 | 34 | int index = threadIdx.x + blockDim.x * blockIdx.x; 35 | if(index < num_partitions) { 36 | merge_range_t range = compute_mergesort_range(count, index, coop, spacing); 37 | int diag = min(spacing * index, count) - range.a_begin; 38 | mp_data[index] = merge_path(keys + range.a_begin, 39 | range.a_count(), keys + range.b_begin, range.b_count(), diag, comp); 40 | } 41 | } 42 | 43 | template 44 | [[using spirv: comp, local_size(128)]] 45 | void kernel_mergesort_partition() { 46 | params_t params = shader_uniform; 47 | 48 | // Load the pass. 49 | int pass = params.mp_data[params.num_partitions]; 50 | 51 | // The first thread should increment the pass. 52 | int first_thread = !threadIdx.x && !blockIdx.x; 53 | if(first_thread) 54 | params.mp_data[params.num_partitions + 1] = pass; 55 | 56 | int coop = 2<< pass; 57 | kernel_mergesort_partition( 58 | params.mp_data, 59 | params.keys_in, 60 | params.count, 61 | params.num_partitions, 62 | params.spacing, 63 | coop, 64 | params.comp 65 | ); 66 | } 67 | 68 | template 69 | [[using spirv: comp, local_size(nt)]] 70 | void kernel_mergesort_pass() { 71 | params_t params = shader_uniform; 72 | 73 | // Load the pass. 74 | int pass = params.mp_data[params.num_partitions + 1]; 75 | 76 | // The first thread should increment the pass. 77 | int first_thread = !threadIdx.x && !blockIdx.x; 78 | if(first_thread) 79 | params.mp_data[params.num_partitions] = pass + 1; 80 | 81 | int coop = 2<< pass; 82 | kernel_mergesort_pass( 83 | params.mp_data, 84 | params.keys_in, 85 | params.vals_in, 86 | params.keys_out, 87 | params.vals_out, 88 | params.count, 89 | coop, 90 | params.comp 91 | ); 92 | } 93 | 94 | template< 95 | int mp, 96 | typename keys_block_it, 97 | typename vals_block_it, 98 | typename keys_in_it, 99 | typename vals_in_it, 100 | typename keys_out_it, 101 | typename vals_out_it, 102 | typename comp_t 103 | > struct mergesort_params_t { 104 | buffer_iterator_t mp_data; 105 | 106 | // Inputs to the blocksort. 107 | keys_block_it keys_block; 108 | vals_block_it vals_block; 109 | 110 | // Inputs to the partition and merge passes. 111 | keys_in_it keys_in; 112 | vals_in_it vals_in; 113 | 114 | // Outputs for blocksort and merge passes. 115 | keys_out_it keys_out; 116 | vals_out_it vals_out; 117 | 118 | int count; 119 | int num_partitions; 120 | int spacing; 121 | comp_t comp; 122 | }; 123 | 124 | template> 126 | struct mergesort_pipeline_t { 127 | enum { has_values = !std::is_same_v }; 128 | 129 | struct info_t { 130 | int num_passes; 131 | int num_ctas; 132 | int num_partitions; 133 | int num_partition_ctas; 134 | }; 135 | 136 | info_t reserve(int count, int nv) { 137 | int num_ctas = div_up(count, nv); 138 | int num_passes = find_log2(num_ctas, true); 139 | int num_partitions = num_ctas > 1 ? num_ctas + 1 : 0; 140 | 141 | if(num_passes) { 142 | // Reserve two extra slots for the pass. 
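// Illustrative aside (standalone C++, not part of mergesort.hxx): the sizing
// in reserve() above follows directly from the tile size. A blocksort tile
// holds nv = nt * vt keys, each merge pass halves the number of sorted lists,
// and one merge-path split point is needed per tile boundary plus the two
// pass counters. A host-side model of those numbers, assuming nt = 128,
// vt = 7 and one million keys:

#include <cstdio>

int main() {
  const int nt = 128, vt = 7, count = 1000000;
  int nv = nt * vt;                                   // 896 keys per tile
  int num_ctas = (count + nv - 1) / nv;               // div_up -> 1117 tiles
  int num_passes = 0;                                 // ceil(log2(num_ctas))
  while ((1 << num_passes) < num_ctas) ++num_passes;  // -> 11 merge passes
  int num_partitions = num_ctas > 1 ? num_ctas + 1 : 0;

  // num_passes is odd here, so the blocksort writes into the aux buffer and
  // the data lands back in the caller's buffer after the final ping-pong swap.
  printf("tiles=%d passes=%d partitions=%d\n",
         num_ctas, num_passes, num_partitions);
}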
143 | partitions_ssbo.resize(num_partitions + 2); 144 | keys_ssbo.resize(count); 145 | if(has_values) 146 | vals_ssbo.resize(count); 147 | } 148 | 149 | int num_partition_ctas = div_up(num_partitions, 128); 150 | return { num_passes, num_ctas, num_partitions, num_partition_ctas }; 151 | } 152 | 153 | template 154 | void sort_keys(GLuint keys, int count, comp_t comp = comp_t()) { 155 | static_assert(!has_values); 156 | const int nv = nt * vt; 157 | 158 | if(!count) return; 159 | 160 | params_t params { }; 161 | info_t info = reserve(count, nv); 162 | 163 | params.count = count; 164 | params.spacing = nv; 165 | params.num_partitions = info.num_partitions; 166 | params.comp = comp; 167 | 168 | // Ping pong with this buffer. 169 | GLuint keys2 = keys_ssbo.buffer; 170 | 171 | // Upload the UBO. 172 | params_ubo.set_data(params); 173 | params_ubo.bind_ubo(0); 174 | 175 | // Bind the partitions buffer. 176 | if(info.num_passes) 177 | partitions_ssbo.bind_ssbo(2); 178 | 179 | // Execute the block sort. 180 | if(info.num_passes % 2) { 181 | // Read the input and write to the aux buffer. 182 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 183 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 184 | std::swap(keys, keys2); 185 | 186 | } else { 187 | // Read the input and write to the input. 188 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 189 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys); 190 | } 191 | 192 | // Launch the blocksort kernel. 193 | gl_dispatch_kernel >( 194 | info.num_ctas 195 | ); 196 | 197 | // Execute the merge passes. 198 | for(int pass = 0; pass < info.num_passes; ++pass) { 199 | // Bind the inputs and outputs. 200 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 201 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 202 | 203 | // Launch the partitions kernel. 204 | gl_dispatch_kernel >( 205 | info.num_partition_ctas); 206 | 207 | // Launch the mergesort pass kernel. 208 | gl_dispatch_kernel >( 209 | info.num_ctas 210 | ); 211 | 212 | // Swap the source and destintation buffers. 213 | std::swap(keys, keys2); 214 | } 215 | } 216 | 217 | template 218 | void sort_keys_indices(GLuint keys, GLuint vals, int count, 219 | comp_t comp = comp_t()) { 220 | 221 | sort_keys_values(keys, vals, count, comp); 222 | } 223 | 224 | template 225 | void sort_keys_values(GLuint keys, GLuint vals, int count, 226 | comp_t comp = comp_t()) { 227 | 228 | static_assert(has_values); 229 | const int nv = nt * vt; 230 | 231 | if(!count) return; 232 | 233 | params_t params { }; 234 | info_t info = reserve(count, nv); 235 | 236 | params.count = count; 237 | params.spacing = nv; 238 | params.num_partitions = info.num_partitions; 239 | params.comp = comp; 240 | 241 | // Ping pong with this buffer. 242 | GLuint keys2 = keys_ssbo.buffer; 243 | GLuint vals2 = vals_ssbo.buffer; 244 | 245 | // Upload the UBO. 246 | params_ubo.set_data(params); 247 | params_ubo.bind_ubo(0); 248 | 249 | // Bind the partitions buffer. 250 | if(info.num_passes) 251 | partitions_ssbo.bind_ssbo(2); 252 | 253 | // Execute the block sort. 254 | if(info.num_passes % 2) { 255 | // Read the input and write to the aux buffer. 
256 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 257 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 258 | 259 | if constexpr(!sort_indices) 260 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, vals); 261 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, vals2); 262 | 263 | std::swap(keys, keys2); 264 | std::swap(vals, vals2); 265 | 266 | } else { 267 | // Read the input and write to the input. 268 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 269 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys); 270 | 271 | if constexpr(!sort_indices) 272 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, vals); 273 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, vals); 274 | } 275 | 276 | // Launch the blocksort kernel. 277 | gl_dispatch_kernel >( 278 | info.num_ctas 279 | ); 280 | 281 | // Execute the merge passes. 282 | for(int pass = 0; pass < info.num_passes; ++pass) { 283 | // Bind the inputs and outputs. 284 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, keys); 285 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, keys2); 286 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, vals); 287 | glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, vals2); 288 | 289 | // Launch the partitions kernel. 290 | gl_dispatch_kernel >( 291 | info.num_partition_ctas); 292 | 293 | // Launch the mergesort pass kernel. 294 | gl_dispatch_kernel >( 295 | info.num_ctas 296 | ); 297 | 298 | // Swap the source and destintation buffers. 299 | std::swap(keys, keys2); 300 | std::swap(vals, vals2); 301 | } 302 | } 303 | 304 | typedef mergesort_params_t< 305 | 2, // 2 is reserved for partitions. 306 | buffer_iterator_t, 307 | buffer_iterator_t, 308 | readonly_iterator_t, 309 | readonly_iterator_t, 310 | buffer_iterator_t, 311 | buffer_iterator_t, 312 | comp_t 313 | > params_t; 314 | 315 | // Keep a parameters UBO. The value is cached so glNamedBufferSubData is only 316 | // called when something changes. 317 | gl_buffer_t params_ubo; 318 | 319 | // Keep storage for keys and values to ping-pong between passes. 320 | gl_buffer_t keys_ssbo; 321 | gl_buffer_t vals_ssbo; 322 | gl_buffer_t partitions_ssbo; 323 | }; 324 | 325 | } // namespace gl 326 | 327 | END_MGPU_NAMESPACE 328 | -------------------------------------------------------------------------------- /inc/mgpu/gl/partition.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_merge.hxx" 3 | #include "transform.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace gl { 8 | 9 | template 10 | [[using spirv: comp, local_size(128)]] 11 | void kernel_partition() { 12 | // Load the kernel parameters from the uniform buffer at binding=ubo. 
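// Illustrative aside (standalone C++, not part of partition.hxx): the kernel
// below computes one merge-path split point per tile boundary, including both
// ends, so for total = a_count + b_count keys and a tile of `spacing` outputs
// there are div_up(total, spacing) + 1 diagonals (this is what
// num_merge_partitions is assumed to return). A host-side model of the
// diagonals evaluated, for total = 10 and spacing = 4:

#include <algorithm>
#include <cstdio>

int main() {
  const int total = 10, spacing = 4;
  int num_partitions = (total + spacing - 1) / spacing + 1;   // 4 split points
  for (int index = 0; index < num_partitions; ++index) {
    int diag = std::min(spacing * index, total);              // 0, 4, 8, 10
    printf("partition %d -> diagonal %d\n", index, diag);
  }
}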
13 | params_t params = shader_uniform; 14 | int a_count = params.a_count; 15 | int b_count = params.b_count; 16 | int spacing = params.spacing; 17 | 18 | int num_partitions = num_merge_partitions(a_count + b_count, spacing); 19 | int index = threadIdx.x + blockDim.x * blockIdx.x; 20 | 21 | if(index < num_partitions) { 22 | int diag = min(spacing * index, a_count + b_count); 23 | 24 | writeonly_iterator_t mp_data; 25 | mp_data[index] = merge_path(params.a_keys, a_count, params.b_keys, 26 | b_count, diag, params.comp); 27 | } 28 | } 29 | 30 | template 31 | void launch_partition(int count, int spacing) { 32 | int num_ctas = div_up(num_merge_partitions(count, spacing), 128); 33 | gl_dispatch_kernel >(num_ctas); 34 | } 35 | 36 | } // namespace gl 37 | 38 | END_MGPU_NAMESPACE 39 | -------------------------------------------------------------------------------- /inc/mgpu/gl/transform.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "buffer.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace gl { 8 | 9 | template 10 | void gl_dispatch_kernel(int x, bool membar = true) { 11 | static GLuint program = 0; 12 | if(!program) { 13 | GLuint cs = glCreateShader(GL_COMPUTE_SHADER); 14 | glShaderBinary(1, &cs, GL_SHADER_BINARY_FORMAT_SPIR_V_ARB, 15 | __spirv_data, __spirv_size); 16 | glSpecializeShader(cs, @spirv(kernel), 0, nullptr, nullptr); 17 | 18 | program = glCreateProgram(); 19 | glAttachShader(program, cs); 20 | glLinkProgram(program); 21 | } 22 | 23 | if(x) { 24 | glUseProgram(program); 25 | glDispatchCompute(x, 1, 1); 26 | 27 | if(membar) 28 | glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 29 | } 30 | } 31 | 32 | template 33 | [[using spirv: comp, local_size(nt)]] 34 | void kernel_transform() { 35 | data_t data = shader_uniform; 36 | 37 | int gid = threadIdx.x + nt * blockIdx.x; 38 | if(gid < data.count) 39 | data.func(gid); 40 | } 41 | 42 | template 43 | void gl_transform(func_t func, int count, bool membar = true) { 44 | static_assert(std::is_copy_constructible_v); 45 | 46 | struct data_t { 47 | func_t func; 48 | int count; 49 | }; 50 | 51 | // Keep a cache for the UBO. Only calls glNamedBufferSubData if 52 | // its contents are different from the last bind operation. 53 | static gl_buffer_t buffer; 54 | buffer.set_data({ func, count }); 55 | buffer.bind_ubo(ubo); 56 | 57 | int num_ctas = div_up(count, nt); 58 | gl_dispatch_kernel >(num_ctas, membar); 59 | } 60 | 61 | } // namespace gl 62 | 63 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/vk/VkBootstrap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 3 | * documentation files (the “Software”), to deal in the Software without restriction, including without 4 | * limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 5 | * of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | * 7 | * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | * 9 | * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 10 | * LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
11 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 12 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 13 | * 14 | * Copyright © 2020 Charles Giessen (charles@lunarg.com) 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | 27 | namespace vkb { 28 | 29 | namespace detail { 30 | 31 | struct Error { 32 | std::error_code type; 33 | VkResult vk_result = VK_SUCCESS; // optional error value if a vulkan call failed 34 | }; 35 | 36 | template class Result { 37 | public: 38 | Result (const T& value) : m_value{ value }, m_init{ true } {} 39 | Result (T&& value) : m_value{ std::move (value) }, m_init{ true } {} 40 | 41 | Result (Error error) : m_error{ error }, m_init{ false } {} 42 | 43 | Result (std::error_code error_code, VkResult result = VK_SUCCESS) 44 | : m_error{ error_code, result }, m_init{ false } {} 45 | 46 | ~Result () { destroy (); } 47 | Result (Result const& expected) : m_init (expected.m_init) { 48 | if (m_init) 49 | new (&m_value) T{ expected.m_value }; 50 | else 51 | m_error = expected.m_error; 52 | } 53 | Result (Result&& expected) : m_init (expected.m_init) { 54 | if (m_init) 55 | new (&m_value) T{ std::move (expected.m_value) }; 56 | else 57 | m_error = std::move (expected.m_error); 58 | expected.destroy (); 59 | } 60 | 61 | Result& operator= (const T& expect) { 62 | destroy (); 63 | m_init = true; 64 | new (&m_value) T{ expect }; 65 | return *this; 66 | } 67 | Result& operator= (T&& expect) { 68 | destroy (); 69 | m_init = true; 70 | new (&m_value) T{ std::move (expect) }; 71 | return *this; 72 | } 73 | Result& operator= (const Error& error) { 74 | destroy (); 75 | m_init = false; 76 | m_error = error; 77 | return *this; 78 | } 79 | Result& operator= (Error&& error) { 80 | destroy (); 81 | m_init = false; 82 | m_error = error; 83 | return *this; 84 | } 85 | // clang-format off 86 | const T* operator-> () const { assert (m_init); return &m_value; } 87 | T* operator-> () { assert (m_init); return &m_value; } 88 | const T& operator* () const& { assert (m_init); return m_value; } 89 | T& operator* () & { assert (m_init); return m_value; } 90 | T&& operator* () && { assert (m_init); return std::move (m_value); } 91 | const T& value () const& { assert (m_init); return m_value; } 92 | T& value () & { assert (m_init); return m_value; } 93 | const T&& value () const&& { assert (m_init); return std::move (m_value); } 94 | T&& value () && { assert (m_init); return std::move (m_value); } 95 | 96 | std::error_code error() const { assert (!m_init); return m_error.type; } 97 | VkResult vk_result() const { assert (!m_init); return m_error.vk_result; } 98 | // clang-format on 99 | 100 | 101 | bool has_value () const { return m_init; } 102 | explicit operator bool () const { return m_init; } 103 | 104 | private: 105 | void destroy () { 106 | if (m_init) m_value.~T (); 107 | } 108 | union { 109 | T m_value; 110 | Error m_error; 111 | }; 112 | bool m_init; 113 | }; 114 | 115 | } // namespace detail 116 | 117 | enum class InstanceError { 118 | vulkan_unavailable, 119 | vulkan_version_unavailable, 120 | vulkan_version_1_1_unavailable, 121 | vulkan_version_1_2_unavailable, 122 | failed_create_instance, 123 | failed_create_debug_messenger, 124 | requested_layers_not_present, 125 | requested_extensions_not_present, 126 | windowing_extensions_not_present, 127 | }; 128 | enum 
class PhysicalDeviceError { 129 | no_surface_provided, 130 | failed_enumerate_physical_devices, 131 | no_physical_devices_found, 132 | no_suitable_device, 133 | }; 134 | enum class QueueError { 135 | present_unavailable, 136 | graphics_unavailable, 137 | compute_unavailable, 138 | transfer_unavailable, 139 | queue_index_out_of_range, 140 | invalid_queue_family_index 141 | }; 142 | enum class DeviceError { 143 | failed_create_device, 144 | }; 145 | enum class SwapchainError { 146 | surface_handle_not_provided, 147 | failed_query_surface_support_details, 148 | failed_create_swapchain, 149 | failed_get_swapchain_images, 150 | failed_create_swapchain_image_views, 151 | }; 152 | 153 | std::error_code make_error_code (InstanceError instance_error); 154 | std::error_code make_error_code (PhysicalDeviceError physical_device_error); 155 | std::error_code make_error_code (QueueError queue_error); 156 | std::error_code make_error_code (DeviceError device_error); 157 | std::error_code make_error_code (SwapchainError swapchain_error); 158 | 159 | const char* to_string_message_severity (VkDebugUtilsMessageSeverityFlagBitsEXT s); 160 | const char* to_string_message_type (VkDebugUtilsMessageTypeFlagsEXT s); 161 | 162 | const char* to_string (InstanceError err); 163 | const char* to_string (PhysicalDeviceError err); 164 | const char* to_string (QueueError err); 165 | const char* to_string (DeviceError err); 166 | const char* to_string (SwapchainError err); 167 | 168 | // Gathers useful information about the available vulkan capabilities, like layers and instance extensions. 169 | // Use this for enabling features conditionally, ie if you would like an extension but can use a fallback if 170 | // it isn't supported but need to know if support is available first. 171 | struct SystemInfo { 172 | private: 173 | SystemInfo (); 174 | 175 | public: 176 | // Use get_system_info to create a SystemInfo struct. This is because loading vulkan could fail. 177 | static detail::Result get_system_info (); 178 | static detail::Result get_system_info (PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr); 179 | 180 | // Returns true if a layer is available 181 | bool is_layer_available (const char* layer_name) const; 182 | // Returns true if an extension is available 183 | bool is_extension_available (const char* extension_name) const; 184 | 185 | std::vector available_layers; 186 | std::vector available_extensions; 187 | bool validation_layers_available = false; 188 | bool debug_utils_available = false; 189 | }; 190 | 191 | 192 | class InstanceBuilder; 193 | class PhysicalDeviceSelector; 194 | 195 | struct Instance { 196 | VkInstance instance = VK_NULL_HANDLE; 197 | VkDebugUtilsMessengerEXT debug_messenger = VK_NULL_HANDLE; 198 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 199 | 200 | PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr = nullptr; 201 | 202 | private: 203 | bool headless = false; 204 | uint32_t instance_version = VK_MAKE_VERSION (1, 0, 0); 205 | 206 | friend class InstanceBuilder; 207 | friend class PhysicalDeviceSelector; 208 | }; 209 | 210 | void destroy_instance (Instance instance); // release instance resources 211 | 212 | class InstanceBuilder { 213 | public: 214 | // Default constructor, will load vulkan. 215 | explicit InstanceBuilder (); 216 | // Optional: Can use your own PFN_vkGetInstanceProcAddr 217 | explicit InstanceBuilder (PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr); 218 | 219 | // Create a VkInstance. Return an error if it failed. 
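// Illustrative usage sketch (not part of the original header): a typical
// InstanceBuilder flow, assuming validation layers are merely requested
// rather than required.
//
//   vkb::InstanceBuilder builder;
//   auto inst_ret = builder.set_app_name("mgpu vk demo")
//                          .require_api_version(1, 1, 0)
//                          .request_validation_layers()
//                          .use_default_debug_messenger()
//                          .set_headless()
//                          .build();
//   if (!inst_ret) { /* inspect inst_ret.error() */ }
//   vkb::Instance instance = inst_ret.value();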
220 | detail::Result build () const; 221 | 222 | // Sets the name of the application. Defaults to "" if none is provided. 223 | InstanceBuilder& set_app_name (const char* app_name); 224 | // Sets the name of the engine. Defaults to "" if none is provided. 225 | InstanceBuilder& set_engine_name (const char* engine_name); 226 | // Sets the (major, minor, patch) version of the application. 227 | InstanceBuilder& set_app_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 228 | // Sets the (major, minor, patch) version of the engine. 229 | InstanceBuilder& set_engine_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 230 | // Require a vulkan instance API version. Will fail to create if this version isn't available. 231 | InstanceBuilder& require_api_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 232 | // Prefer a vulkan instance API version. If the desired version isn't available, it will use the highest version available. 233 | InstanceBuilder& desire_api_version (uint32_t major, uint32_t minor, uint32_t patch = 0); 234 | 235 | // Adds a layer to be enabled. Will fail to create an instance if the layer isn't available. 236 | InstanceBuilder& enable_layer (const char* layer_name); 237 | // Adds an extension to be enabled. Will fail to create an instance if the extension isn't available. 238 | InstanceBuilder& enable_extension (const char* extension_name); 239 | 240 | // Headless Mode does not load the required extensions for presentation. Defaults to true. 241 | InstanceBuilder& set_headless (bool headless = true); 242 | 243 | // Enables the validation layers. Will fail to create an instance if the validation layers aren't available. 244 | InstanceBuilder& enable_validation_layers (bool require_validation = true); 245 | // Checks if the validation layers are available and loads them if they are. 246 | InstanceBuilder& request_validation_layers (bool enable_validation = true); 247 | 248 | // Use a default debug callback that prints to standard out. 249 | InstanceBuilder& use_default_debug_messenger (); 250 | // Provide a user defined debug callback. 251 | InstanceBuilder& set_debug_callback (PFN_vkDebugUtilsMessengerCallbackEXT callback); 252 | // Set what message severity is needed to trigger the callback. 253 | InstanceBuilder& set_debug_messenger_severity (VkDebugUtilsMessageSeverityFlagsEXT severity); 254 | // Add a message severity to the list that triggers the callback. 255 | InstanceBuilder& add_debug_messenger_severity (VkDebugUtilsMessageSeverityFlagsEXT severity); 256 | // Set what message type triggers the callback. 257 | InstanceBuilder& set_debug_messenger_type (VkDebugUtilsMessageTypeFlagsEXT type); 258 | // Add a message type to the list of that triggers the callback. 259 | InstanceBuilder& add_debug_messenger_type (VkDebugUtilsMessageTypeFlagsEXT type); 260 | 261 | // Disable some validation checks. 262 | // Checks: All, and Shaders 263 | InstanceBuilder& add_validation_disable (VkValidationCheckEXT check); 264 | 265 | // Enables optional parts of the validation layers. 266 | // Parts: best practices, gpu assisted, and gpu assisted reserve binding slot. 267 | InstanceBuilder& add_validation_feature_enable (VkValidationFeatureEnableEXT enable); 268 | 269 | // Disables sections of the validation layers. 270 | // Options: All, shaders, thread safety, api parameters, object lifetimes, core checks, and unique handles. 
271 | InstanceBuilder& add_validation_feature_disable (VkValidationFeatureDisableEXT disable); 272 | 273 | // Provide custom allocation callbacks. 274 | InstanceBuilder& set_allocation_callbacks (VkAllocationCallbacks* callbacks); 275 | 276 | private: 277 | struct InstanceInfo { 278 | // VkApplicationInfo 279 | const char* app_name = nullptr; 280 | const char* engine_name = nullptr; 281 | uint32_t application_version = 0; 282 | uint32_t engine_version = 0; 283 | uint32_t required_api_version = VK_MAKE_VERSION (1, 0, 0); 284 | uint32_t desired_api_version = VK_MAKE_VERSION (1, 0, 0); 285 | 286 | // VkInstanceCreateInfo 287 | std::vector layers; 288 | std::vector extensions; 289 | VkInstanceCreateFlags flags = 0; 290 | std::vector pNext_elements; 291 | 292 | // debug callback 293 | PFN_vkDebugUtilsMessengerCallbackEXT debug_callback = nullptr; 294 | VkDebugUtilsMessageSeverityFlagsEXT debug_message_severity = 295 | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; 296 | VkDebugUtilsMessageTypeFlagsEXT debug_message_type = 297 | VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | 298 | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; 299 | 300 | // validation features 301 | std::vector disabled_validation_checks; 302 | std::vector enabled_validation_features; 303 | std::vector disabled_validation_features; 304 | 305 | // Custom allocator 306 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 307 | 308 | bool request_validation_layers = false; 309 | bool enable_validation_layers = false; 310 | bool use_debug_messenger = false; 311 | bool headless_context = false; 312 | 313 | PFN_vkGetInstanceProcAddr fp_vkGetInstanceProcAddr = nullptr; 314 | } info; 315 | }; 316 | 317 | VKAPI_ATTR VkBool32 VKAPI_CALL default_debug_callback (VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, 318 | VkDebugUtilsMessageTypeFlagsEXT messageType, 319 | const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, 320 | void* pUserData); 321 | 322 | void destroy_debug_utils_messenger(VkInstance const instance, VkDebugUtilsMessengerEXT const messenger, VkAllocationCallbacks* allocation_callbacks = nullptr); 323 | 324 | // ---- Physical Device ---- // 325 | class PhysicalDeviceSelector; 326 | class DeviceBuilder; 327 | 328 | struct PhysicalDevice { 329 | VkPhysicalDevice physical_device = VK_NULL_HANDLE; 330 | VkSurfaceKHR surface = VK_NULL_HANDLE; 331 | 332 | VkPhysicalDeviceFeatures features{}; 333 | VkPhysicalDeviceProperties properties{}; 334 | VkPhysicalDeviceMemoryProperties memory_properties{}; 335 | 336 | // Has a queue family that supports compute operations but not graphics nor transfer. 337 | bool has_dedicated_compute_queue () const; 338 | // Has a queue family that supports transfer operations but not graphics nor compute. 339 | bool has_dedicated_transfer_queue () const; 340 | 341 | // Has a queue family that supports transfer operations but not graphics. 342 | bool has_separate_compute_queue () const; 343 | // Has a queue family that supports transfer operations but not graphics. 
344 | bool has_separate_transfer_queue () const; 345 | 346 | // Advanced: Get the VkQueueFamilyProperties of the device if special queue setup is needed 347 | std::vector get_queue_families () const; 348 | 349 | private: 350 | std::vector extensions_to_enable; 351 | std::vector queue_families; 352 | bool defer_surface_initialization = false; 353 | friend class PhysicalDeviceSelector; 354 | friend class DeviceBuilder; 355 | }; 356 | 357 | enum class PreferredDeviceType { 358 | other = 0, 359 | integrated = 1, 360 | discrete = 2, 361 | virtual_gpu = 3, 362 | cpu = 4 363 | }; 364 | 365 | class PhysicalDeviceSelector { 366 | public: 367 | // Requires a vkb::Instance to construct, needed to pass instance creation info. 368 | explicit PhysicalDeviceSelector (Instance const& instance); 369 | 370 | detail::Result select () const; 371 | 372 | // Set the surface in which the physical device should render to. 373 | PhysicalDeviceSelector& set_surface (VkSurfaceKHR surface); 374 | // Set the desired physical device type to select. Defaults to PreferredDeviceType::discrete. 375 | PhysicalDeviceSelector& prefer_gpu_device_type (PreferredDeviceType type = PreferredDeviceType::discrete); 376 | // Allow selection of a gpu device type that isn't the preferred physical device type. Defaults to true. 377 | PhysicalDeviceSelector& allow_any_gpu_device_type (bool allow_any_type = true); 378 | 379 | // Require that a physical device supports presentation. Defaults to true. 380 | PhysicalDeviceSelector& require_present (bool require = true); 381 | 382 | // Require a queue family that supports compute operations but not graphics nor transfer. 383 | PhysicalDeviceSelector& require_dedicated_compute_queue (); 384 | // Require a queue family that supports transfer operations but not graphics nor compute. 385 | PhysicalDeviceSelector& require_dedicated_transfer_queue (); 386 | 387 | // Require a queue family that supports compute operations but not graphics. 388 | PhysicalDeviceSelector& require_separate_compute_queue (); 389 | // Require a queue family that supports transfer operations but not graphics. 390 | PhysicalDeviceSelector& require_separate_transfer_queue (); 391 | 392 | // Require a memory heap from VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT with `size` memory available. 393 | PhysicalDeviceSelector& required_device_memory_size (VkDeviceSize size); 394 | // Prefer a memory heap from VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT with `size` memory available. 395 | PhysicalDeviceSelector& desired_device_memory_size (VkDeviceSize size); 396 | 397 | // Require a physical device which supports a specific extension. 398 | PhysicalDeviceSelector& add_required_extension (const char* extension); 399 | // Require a physical device which supports a set of extensions. 400 | PhysicalDeviceSelector& add_required_extensions (std::vector extensions); 401 | 402 | // Prefer a physical device which supports a specific extension. 403 | PhysicalDeviceSelector& add_desired_extension (const char* extension); 404 | // Prefer a physical device which supports a set of extensions. 405 | PhysicalDeviceSelector& add_desired_extensions (std::vector extensions); 406 | 407 | // Prefer a physical device that supports a (major, minor) version of vulkan. 408 | PhysicalDeviceSelector& set_desired_version (uint32_t major, uint32_t minor); 409 | // Require a physical device that supports a (major, minor) version of vulkan. 
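// Illustrative usage sketch (not part of the original header): selecting a
// physical device and building a vkb::Device for headless compute, assuming
// the instance was created with set_headless() so no surface is needed.
//
//   vkb::PhysicalDeviceSelector selector(instance);
//   auto phys_ret = selector.set_minimum_version(1, 1)
//                           .require_present(false)
//                           .select();
//   if (!phys_ret) { /* inspect phys_ret.error() */ }
//
//   vkb::DeviceBuilder device_builder(phys_ret.value());
//   auto dev_ret = device_builder.build();
//   vkb::Device device = dev_ret.value();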
410 | PhysicalDeviceSelector& set_minimum_version (uint32_t major, uint32_t minor); 411 | 412 | // Require a physical device which supports the features in VkPhysicalDeviceFeatures. 413 | PhysicalDeviceSelector& set_required_features (VkPhysicalDeviceFeatures features); 414 | 415 | // Used when surface creation happens after physical device selection. 416 | // Warning: This disables checking if the physical device supports a given surface. 417 | PhysicalDeviceSelector& defer_surface_initialization (); 418 | 419 | // Ignore all criteria and choose the first physical device that is available. 420 | // Only use when: The first gpu in the list may be set by global user preferences and an application may wish to respect it. 421 | PhysicalDeviceSelector& select_first_device_unconditionally (bool unconditionally = true); 422 | 423 | private: 424 | struct SystemInfo { 425 | VkInstance instance = VK_NULL_HANDLE; 426 | VkSurfaceKHR surface = VK_NULL_HANDLE; 427 | bool headless = false; 428 | } system_info; 429 | 430 | struct PhysicalDeviceDesc { 431 | VkPhysicalDevice phys_device = VK_NULL_HANDLE; 432 | std::vector queue_families; 433 | 434 | VkPhysicalDeviceFeatures device_features{}; 435 | VkPhysicalDeviceProperties device_properties{}; 436 | VkPhysicalDeviceMemoryProperties mem_properties{}; 437 | }; 438 | PhysicalDeviceDesc populate_device_details (VkPhysicalDevice phys_device) const; 439 | 440 | struct SelectionCriteria { 441 | PreferredDeviceType preferred_type = PreferredDeviceType::discrete; 442 | bool allow_any_type = true; 443 | bool require_present = true; 444 | bool require_dedicated_transfer_queue = false; 445 | bool require_dedicated_compute_queue = false; 446 | bool require_separate_transfer_queue = false; 447 | bool require_separate_compute_queue = false; 448 | VkDeviceSize required_mem_size = 0; 449 | VkDeviceSize desired_mem_size = 0; 450 | 451 | std::vector required_extensions; 452 | std::vector desired_extensions; 453 | 454 | uint32_t required_version = VK_MAKE_VERSION (1, 0, 0); 455 | uint32_t desired_version = VK_MAKE_VERSION (1, 0, 0); 456 | 457 | VkPhysicalDeviceFeatures required_features{}; 458 | 459 | bool defer_surface_initialization = false; 460 | bool use_first_gpu_unconditionally = false; 461 | } criteria; 462 | 463 | enum class Suitable { yes, partial, no }; 464 | 465 | Suitable is_device_suitable (PhysicalDeviceDesc phys_device) const; 466 | }; 467 | 468 | // ---- Queue ---- // 469 | enum class QueueType { present, graphics, compute, transfer }; 470 | 471 | // ---- Device ---- // 472 | 473 | struct Device { 474 | VkDevice device = VK_NULL_HANDLE; 475 | PhysicalDevice physical_device; 476 | VkSurfaceKHR surface = VK_NULL_HANDLE; 477 | std::vector queue_families; 478 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 479 | 480 | detail::Result get_queue_index (QueueType type) const; 481 | // Only a compute or transfer queue type is valid. All other queue types do not support a 'dedicated' queue index 482 | detail::Result get_dedicated_queue_index (QueueType type) const; 483 | 484 | detail::Result get_queue (QueueType type) const; 485 | // Only a compute or transfer queue type is valid. 
All other queue types do not support a 'dedicated' queue 486 | detail::Result get_dedicated_queue (QueueType type) const; 487 | }; 488 | 489 | // For advanced device queue setup 490 | struct CustomQueueDescription { 491 | explicit CustomQueueDescription (uint32_t index, uint32_t count, std::vector priorities); 492 | uint32_t index = 0; 493 | uint32_t count = 0; 494 | std::vector priorities; 495 | }; 496 | 497 | void destroy_device (Device device); 498 | 499 | class DeviceBuilder { 500 | public: 501 | // Any features and extensions that are requested/required in PhysicalDeviceSelector are automatically enabled. 502 | explicit DeviceBuilder (PhysicalDevice physical_device); 503 | 504 | detail::Result build () const; 505 | 506 | // For Advanced Users: specify the exact list of VkDeviceQueueCreateInfo's needed for the application. 507 | // If a custom queue setup is provided, getting the queues and queue indexes is up to the application. 508 | DeviceBuilder& custom_queue_setup (std::vector queue_descriptions); 509 | 510 | // Add a structure to the pNext chain of VkDeviceCreateInfo. 511 | // The structure must be valid when DeviceBuilder::build() is called. 512 | template DeviceBuilder& add_pNext (T* structure) { 513 | info.pNext_chain.push_back (reinterpret_cast (structure)); 514 | return *this; 515 | } 516 | 517 | // Provide custom allocation callbacks. 518 | DeviceBuilder& set_allocation_callbacks (VkAllocationCallbacks* callbacks); 519 | 520 | private: 521 | struct DeviceInfo { 522 | VkDeviceCreateFlags flags = 0; 523 | std::vector pNext_chain; 524 | PhysicalDevice physical_device; 525 | VkSurfaceKHR surface = VK_NULL_HANDLE; 526 | bool defer_surface_initialization = false; 527 | std::vector queue_families; 528 | VkPhysicalDeviceFeatures features{}; 529 | std::vector extensions_to_enable; 530 | std::vector queue_descriptions; 531 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 532 | } info; 533 | }; 534 | 535 | // ---- Swapchain ---- // 536 | struct Swapchain { 537 | VkDevice device = VK_NULL_HANDLE; 538 | VkSwapchainKHR swapchain = VK_NULL_HANDLE; 539 | uint32_t image_count = 0; 540 | VkFormat image_format = VK_FORMAT_UNDEFINED; 541 | VkExtent2D extent = { 0, 0 }; 542 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 543 | 544 | // Returns a vector of VkImage handles to the swapchain. 545 | detail::Result> get_images (); 546 | 547 | // Returns a vector of VkImageView's to the VkImage's of the swapchain. 548 | // VkImageViews must be destroyed. 549 | detail::Result> get_image_views (); 550 | void destroy_image_views (std::vector const& image_views); 551 | }; 552 | 553 | void destroy_swapchain (Swapchain const& swapchain); 554 | 555 | class SwapchainBuilder { 556 | public: 557 | explicit SwapchainBuilder (Device const& device); 558 | explicit SwapchainBuilder (Device const& device, VkSurfaceKHR const surface); 559 | explicit SwapchainBuilder (VkPhysicalDevice const physical_device, VkDevice const device, VkSurfaceKHR const surface, int32_t graphics_queue_index = -1, int32_t present_queue_index = -1); 560 | 561 | detail::Result build () const; 562 | 563 | // Set the oldSwapchain member of VkSwapchainCreateInfoKHR. 564 | // For use in rebuilding a swapchain. 565 | SwapchainBuilder& set_old_swapchain (VkSwapchainKHR old_swapchain); 566 | SwapchainBuilder& set_old_swapchain (Swapchain const& swapchain); 567 | 568 | 569 | // Desired size of the swapchain. By default, the swapchain will use the size 570 | // of the window being drawn to. 
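// The desired extent is only used when the surface does not already report a
// fixed current extent; otherwise the surface's own size wins. Minimal rebuild
// sketch, assuming a built vkb::Device `dev` and an existing Swapchain
// `old_swap` (names and values are illustrative):
//
//   vkb::SwapchainBuilder builder{ dev };
//   auto swap_ret = builder.set_old_swapchain(old_swap)
//                          .set_desired_extent(1280, 720)
//                          .set_desired_present_mode(VK_PRESENT_MODE_FIFO_KHR)
//                          .build();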
571 | SwapchainBuilder& set_desired_extent (uint32_t width, uint32_t height); 572 | 573 | // When determining the surface format, make this the first to be used if supported. 574 | SwapchainBuilder& set_desired_format (VkSurfaceFormatKHR format); 575 | // Add this swapchain format to the end of the list of formats selected from. 576 | SwapchainBuilder& add_fallback_format (VkSurfaceFormatKHR format); 577 | // Use the default swapchain formats. This is done if no formats are provided. 578 | SwapchainBuilder& use_default_format_selection (); 579 | 580 | // When determining the present mode, make this the first to be used if supported. 581 | SwapchainBuilder& set_desired_present_mode (VkPresentModeKHR present_mode); 582 | // Add this present mode to the end of the list of present modes selected from. 583 | SwapchainBuilder& add_fallback_present_mode (VkPresentModeKHR present_mode); 584 | // Use the default presentation mode. This is done if no present modes are provided. 585 | SwapchainBuilder& use_default_present_mode_selection (); 586 | 587 | // Set the bitmask of the image usage for acquired swapchain images. 588 | SwapchainBuilder& set_image_usage_flags (VkImageUsageFlags usage_flags); 589 | // Add a image usage to the bitmask for acquired swapchain images. 590 | SwapchainBuilder& add_image_usage_flags (VkImageUsageFlags usage_flags); 591 | // Use the default image usage bitmask values. This is the default if no image usages 592 | // are provided. The default is VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT 593 | SwapchainBuilder& use_default_image_usage_flags (); 594 | 595 | // Set the number of views in for multiview/stereo surface 596 | SwapchainBuilder& set_image_array_layer_count (uint32_t array_layer_count); 597 | 598 | // Set whether the Vulkan implementation is allowed to discard rendering operations that 599 | // affect regions of the surface that are not visible. Default is true. 600 | // Note: Applications should use the default of true if they do not expect to read back the content 601 | // of presentable images before presenting them or after reacquiring them, and if their fragment 602 | // shaders do not have any side effects that require them to run for all pixels in the presentable image. 603 | SwapchainBuilder& set_clipped (bool clipped = true); 604 | 605 | // Set the VkSwapchainCreateFlagBitsKHR. 606 | SwapchainBuilder& set_create_flags (VkSwapchainCreateFlagBitsKHR create_flags); 607 | // Set the transform to be applied, like a 90 degree rotation. Default is the current transform. 608 | SwapchainBuilder& set_pre_transform_flags (VkSurfaceTransformFlagBitsKHR pre_transform_flags); 609 | // Set the alpha channel to be used with other windows in on the system. Default is VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR. 610 | SwapchainBuilder& set_composite_alpha_flags (VkCompositeAlphaFlagBitsKHR composite_alpha_flags); 611 | 612 | // Add a structure to the pNext chain of VkSwapchainCreateInfoKHR. 613 | // The structure must be valid when SwapchainBuilder::build() is called. 614 | template SwapchainBuilder& add_pNext (T* structure) { 615 | info.pNext_chain.push_back (reinterpret_cast (structure)); 616 | return *this; 617 | } 618 | 619 | // Provide custom allocation callbacks. 
620 | SwapchainBuilder& set_allocation_callbacks (VkAllocationCallbacks* callbacks); 621 | 622 | private: 623 | void add_desired_formats (std::vector& formats) const; 624 | void add_desired_present_modes (std::vector& modes) const; 625 | 626 | struct SwapchainInfo { 627 | VkPhysicalDevice physical_device = VK_NULL_HANDLE; 628 | VkDevice device = VK_NULL_HANDLE; 629 | std::vector pNext_chain; 630 | VkSwapchainCreateFlagBitsKHR create_flags = static_cast (0); 631 | VkSurfaceKHR surface = VK_NULL_HANDLE; 632 | std::vector desired_formats; 633 | uint32_t desired_width = 256; 634 | uint32_t desired_height = 256; 635 | uint32_t array_layer_count = 1; 636 | VkImageUsageFlags image_usage_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; 637 | uint32_t graphics_queue_index = 0; 638 | uint32_t present_queue_index = 0; 639 | VkSurfaceTransformFlagBitsKHR pre_transform = static_cast (0); 640 | VkCompositeAlphaFlagBitsKHR composite_alpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; 641 | std::vector desired_present_modes; 642 | bool clipped = true; 643 | VkSwapchainKHR old_swapchain = VK_NULL_HANDLE; 644 | VkAllocationCallbacks* allocation_callbacks = VK_NULL_HANDLE; 645 | } info; 646 | }; 647 | 648 | } // namespace vkb 649 | 650 | 651 | namespace std { 652 | template <> struct is_error_code_enum : true_type {}; 653 | template <> struct is_error_code_enum : true_type {}; 654 | template <> struct is_error_code_enum : true_type {}; 655 | template <> struct is_error_code_enum : true_type {}; 656 | template <> struct is_error_code_enum : true_type {}; 657 | } // namespace std -------------------------------------------------------------------------------- /inc/mgpu/vk/context.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/meta.hxx" 3 | #include 4 | #include 5 | #include "vk_mem_alloc.h" 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace vk { 10 | 11 | struct context_t { 12 | context_t(); 13 | ~context_t(); 14 | 15 | // Make it non-copyable. 
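// context_t owns the instance, device, queue, command pool, pipeline cache and
// VMA allocator it creates (see src/vk/context.cxx) and destroys them in its
// destructor, so copying would double-destroy those handles.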
16 | context_t(const context_t&) = delete; 17 | context_t& operator=(const context_t&) = delete; 18 | 19 | operator VkInstance() const noexcept { return instance; } 20 | operator VkDevice() const noexcept { return device; } 21 | operator VkPhysicalDevice() const noexcept { return physical_device; } 22 | 23 | VkInstance instance; 24 | 25 | VkPhysicalDevice physical_device; 26 | VkDevice device; 27 | 28 | uint32_t queue_index; 29 | VkQueue queue; 30 | 31 | VkCommandPool command_pool; 32 | 33 | VkPipelineCache pipeline_cache; 34 | 35 | VmaAllocator allocator; 36 | 37 | struct buffer_t { 38 | uint32_t size; 39 | uint32_t usage; 40 | VkBuffer buffer; 41 | VmaAllocation allocation; 42 | 43 | bool is_cpu() const noexcept { 44 | return 0x8000'0000 & usage; 45 | } 46 | }; 47 | typedef std::map buffer_map_t; 48 | typedef buffer_map_t::iterator buffer_it_t; 49 | buffer_map_t buffer_map; 50 | void* staging; 51 | 52 | void* alloc_gpu(size_t size, 53 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); 54 | 55 | template 56 | type_t* alloc_gpu(size_t count, 57 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT) { 58 | return (type_t*)alloc_gpu(sizeof(type_t) * count, usage); 59 | } 60 | 61 | void* alloc_cpu(size_t size, 62 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); 63 | 64 | template 65 | type_t* alloc_cpu(size_t count, 66 | uint32_t usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT) { 67 | return (type_t*)alloc_cpu(sizeof(type_t) * count, usage); 68 | } 69 | 70 | void free(void* p); 71 | buffer_it_t find_buffer(const void* p); 72 | 73 | // Copy between buffer memory. At least one operand must map to a buffer. 74 | void memcpy(VkCommandBuffer cmd_buffer, void* dest, const void* source, 75 | size_t size); 76 | 77 | std::map modules; 78 | VkShaderModule create_module(const char* data, size_t size); 79 | 80 | struct transform_t { 81 | VkPipelineLayout pipeline_layout; 82 | VkPipeline pipeline; 83 | }; 84 | std::map transforms; 85 | 86 | void dispatch_compute(VkCommandBuffer cmd_buffer, const char* name, 87 | VkShaderModule module, int num_blocks, uint32_t push_size, 88 | const void* push_data); 89 | 90 | void submit(VkCommandBuffer cmd_buffer); 91 | }; 92 | 93 | struct cmd_buffer_t { 94 | cmd_buffer_t(context_t& context); 95 | ~cmd_buffer_t(); 96 | 97 | operator VkCommandBuffer() { return vkCommandBuffer; } 98 | 99 | void reset(); 100 | void begin(); 101 | void end(); 102 | void submit(); 103 | void host_barrier(); 104 | void memcpy(void* dest, const void* source, size_t size); 105 | 106 | context_t& context; 107 | VkCommandBuffer vkCommandBuffer; 108 | }; 109 | 110 | struct memcache_t { 111 | memcache_t(context_t& context); 112 | ~memcache_t(); 113 | 114 | // Request a single allocation. 115 | template 116 | type_t* allocate(size_t count) { 117 | return (type_t*)allocate(sizeof(type_t) * count); 118 | } 119 | 120 | void* allocate(size_t size); 121 | 122 | // Allocate cache-line aligned memories that fit these sizes. 
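// Each request is rounded up to a 256-byte boundary; see memcache_t::allocate
// in src/vk/context.cxx. The algorithm entry points in merge.hxx,
// mergesort.hxx, scan.hxx and radix.hxx follow a two-pass convention: call
// them once with aux_data == nullptr to accumulate the required scratch bytes
// into aux_size, allocate that much, then call them again with the real
// pointer. Minimal sketch (names and the <128, 7> tuning are illustrative):
//
//   memcache_t cache(context);
//   size_t aux_size = 0;
//   mergesort_keys<128, 7>(nullptr, aux_size, cmd, keys, count);  // size query
//   void* aux = cache.allocate(aux_size);
//   mergesort_keys<128, 7>(aux, aux_size, cmd, keys, count);      // real work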
123 | template 124 | std::array allocate(std::array sizes) { 125 | std::array data; 126 | allocate(sizes.data(), count, &data.data()); 127 | return data; 128 | } 129 | void allocate(const size_t* sizes, int count, void** allocations); 130 | 131 | context_t& context; 132 | void* data; 133 | size_t capacity; 134 | }; 135 | 136 | } // namespace vk 137 | 138 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/vk/launch.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if __circle_build__ < 114 4 | #error "Circle build 114 required" 5 | #endif 6 | 7 | #include "context.hxx" 8 | 9 | BEGIN_MGPU_NAMESPACE 10 | 11 | namespace vk { 12 | 13 | // Chevron launch on a SPIR-V compute shader performs ADL lookup to find 14 | // this symbol, and overload resolution to select this overload. 15 | template 16 | static void spirv_chevron_comp(int num_blocks, cmd_buffer_t& cmd_buffer, 17 | params_t... params) { 18 | 19 | static_assert((... && std::is_trivially_copyable_v)); 20 | tuple_t storage { params... }; 21 | 22 | cmd_buffer.context.dispatch_compute( 23 | cmd_buffer, 24 | @spirv(F), 25 | cmd_buffer.context.create_module(__spirv_data, __spirv_size), 26 | num_blocks, 27 | sizeof(storage), 28 | &storage 29 | ); 30 | } 31 | 32 | } // namespace vk 33 | 34 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /inc/mgpu/vk/merge.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_merge.hxx" 3 | #include "partition.hxx" // TODO: PORT PARTITION 4 | 5 | #include 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | namespace vk { 10 | 11 | // Key-value merge. 12 | template< 13 | int nt = 128, int vt = 7, // Optional tuning parameters. 14 | typename a_keys_it, typename a_vals_it, 15 | typename b_keys_it, typename b_vals_it, 16 | typename c_keys_it, typename c_vals_it, 17 | typename comp_t 18 | > 19 | void merge( 20 | void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 21 | a_keys_it a_keys, a_vals_it a_vals, int a_count, 22 | b_keys_it b_keys, b_vals_it b_vals, int b_count, 23 | c_keys_it c_keys, c_vals_it c_vals, comp_t comp) { 24 | 25 | typedef typename std::iterator_traits::value_type type_t; 26 | typedef typename std::iterator_traits::value_type val_t; 27 | constexpr int nv = nt * vt; 28 | 29 | int num_partitions = num_merge_partitions(a_count + b_count, nv); 30 | if(!aux_data) { 31 | // Use 1 int per partition. 32 | aux_size += sizeof(int) * num_partitions; 33 | return; 34 | } 35 | 36 | int* partitions = advance_pointer(aux_data, num_partitions); 37 | 38 | merge_path_partitions(cmd_buffer, a_keys, a_count, 39 | b_keys, b_count, partitions, nv, comp); 40 | 41 | int num_blocks = div_up(a_count + b_count, nv); 42 | launch(num_blocks, cmd_buffer, [=](int tid, int block) { 43 | kernel_merge(partitions, a_keys, a_vals, a_count, b_keys, 44 | b_vals, b_count, c_keys, c_vals, comp); 45 | }); 46 | } 47 | 48 | // Key-only merge. 
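// The key-only overload below forwards null empty_t iterators as the value
// streams, so the key-value kernel above is reused without generating any
// value traffic.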
49 | template 52 | void merge(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 53 | a_keys_it a_keys, int a_count, b_keys_it b_keys, int b_count, 54 | c_keys_it c_keys, comp_t comp) { 55 | 56 | merge(aux_data, aux_size, cmd_buffer, a_keys, 57 | (const empty_t*)nullptr, a_count, b_keys, (const empty_t*)nullptr, 58 | b_count, c_keys, (empty_t*)nullptr, comp); 59 | } 60 | 61 | } // namespace vk 62 | 63 | END_MGPU_NAMESPACE 64 | -------------------------------------------------------------------------------- /inc/mgpu/vk/mergesort.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/kernel_mergesort.hxx" 3 | #include "context.hxx" 4 | #include "transform.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | namespace vk { 9 | 10 | template > 12 | void mergesort_kv( 13 | void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 14 | key_t* keys, val_t* vals, int count, comp_t comp = comp_t()) { 15 | 16 | static_assert(!sort_indices || std::is_same_v); 17 | constexpr bool has_values = !std::is_same_v; 18 | 19 | int num_ctas = div_up(count, nt * vt); 20 | int num_passes = find_log2(num_ctas, true); 21 | 22 | if(0 == num_passes) { 23 | if(!aux_data) return; 24 | 25 | // For a single CTA, sort in place and don't require any cache memory. 26 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 27 | kernel_blocksort(keys, vals, keys, vals, 28 | count, comp); 29 | }); 30 | 31 | } else { 32 | int num_partitions = num_ctas + 1; 33 | 34 | // Allocate temporary storage for the partitions and ping-pong buffers. 35 | if(!aux_data) { 36 | aux_size += sizeof(int) * num_partitions; 37 | aux_size += sizeof(key_t) * count; 38 | if(has_values) 39 | aux_size += sizeof(val_t) * count; 40 | return; 41 | } 42 | 43 | int* mp = advance_pointer(aux_data, num_partitions); 44 | key_t* keys2 = advance_pointer(aux_data, count); 45 | val_t* vals2 = advance_pointer(aux_data, count); 46 | 47 | key_t* keys_blocksort = (1 & num_passes) ? keys2 : keys; 48 | val_t* vals_blocksort = (1 & num_passes) ? vals2 : vals; 49 | 50 | // Blocksort the input. 51 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 52 | kernel_blocksort(keys, vals, keys_blocksort, 53 | vals_blocksort, count, comp); 54 | }); 55 | 56 | if(1 & num_passes) { 57 | std::swap(keys, keys2); 58 | std::swap(vals, vals2); 59 | } 60 | 61 | for(int pass = 0; pass < num_passes; ++pass) { 62 | int coop = 2<< pass; 63 | 64 | // Partition the partially-sorted inputs. 65 | transform(num_partitions, cmd_buffer, [=](int index) { 66 | int spacing = nt * vt; 67 | merge_range_t range = compute_mergesort_range(count, index, coop, 68 | spacing); 69 | int diag = min(spacing * index, count) - range.a_begin; 70 | mp[index] = merge_path(keys + range.a_begin, 71 | range.a_count(), keys + range.b_begin, range.b_count(), diag, comp); 72 | }); 73 | 74 | // Launch the merge pass. 
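// Pass `pass` merges pairs of sorted runs of coop / 2 = (1 << pass) tiles into
// runs of coop tiles, using the merge-path partitions computed just above;
// after num_passes doublings the input is a single sorted run, and the
// blocksort/ping-pong parity chosen earlier leaves it in the caller's
// keys/vals buffers.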
75 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 76 | kernel_mergesort_pass(mp, keys, vals, keys2, vals2, count, 77 | coop, comp); 78 | }); 79 | 80 | std::swap(keys, keys2); 81 | std::swap(vals, vals2); 82 | } 83 | } 84 | } 85 | 86 | template > 88 | void mergesort_keys(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 89 | key_t* keys, int count, comp_t comp = comp_t()) { 90 | 91 | mergesort_kv(aux_data, aux_size, cmd_buffer, keys, 92 | (empty_t*)nullptr, count, comp); 93 | } 94 | 95 | template > 97 | void mergesort_indices(void* aux_data, size_t& aux_size, 98 | cmd_buffer_t& cmd_buffer, key_t* keys, int* indices, int count, 99 | comp_t comp = comp_t()) { 100 | 101 | mergesort_kv(aux_data, aux_size, cmd_buffer, keys, indices, 102 | count, comp); 103 | } 104 | 105 | } // namespace vk 106 | 107 | END_MGPU_NAMESPACE 108 | -------------------------------------------------------------------------------- /inc/mgpu/vk/partition.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_merge.hxx" 3 | #include "transform.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace vk { 8 | 9 | template 11 | void merge_path_partitions(cmd_buffer_t& cmd_buffer, a_keys_it a, 12 | int a_count, b_keys_it b, int b_count, int* mp_data, int spacing, 13 | comp_t comp) { 14 | 15 | int num_partitions = num_merge_partitions(a_count + b_count, spacing); 16 | transform(num_partitions, cmd_buffer, [=](int index) { 17 | int diag = min(spacing * index, a_count + b_count); 18 | mp_data[index] = merge_path(a, a_count, b, b_count, diag, comp); 19 | }); 20 | } 21 | 22 | } // namespace vk 23 | 24 | END_MGPU_NAMESPACE 25 | -------------------------------------------------------------------------------- /inc/mgpu/vk/radix.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_radix.hxx" 3 | #include "scan.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | namespace vk { 8 | 9 | //////////////////////////////////////////////////////////////////////////////// 10 | // Scan counters for all bins. This avoids having to scatter into a single 11 | // ordered array, which becomes inefficient as num_bins becomes large. 12 | 13 | void radix_scan_8(void* aux_data, size_t& aux_size, 14 | cmd_buffer_t& cmd_buffer, uint* counts, int num_frames) { 15 | 16 | enum { nt = 1024, num_bins = 256 }; 17 | int num_ctas = div_up(num_frames, 32); 18 | if(!num_ctas) return; 19 | 20 | if(1 == num_ctas) { 21 | // Require no extra memory for single-CTA case. 22 | if(!aux_data) return; 23 | 24 | // Support up to 32 frames reduction in one CTA. 25 | launch(1, cmd_buffer, [=](int tid, int cta) mutable { 26 | typedef cta_scan_t scan_t; 27 | __shared__ struct { 28 | // Use non-overlapping space for scan and to store counts. 29 | typename scan_t::storage_t scan; 30 | uint counts[nt]; 31 | } shared; 32 | 33 | // The single-cta scan chops up 32 frames into four sections of 8 34 | // registers each. 35 | int section = tid / 256; 36 | int lane = tid & 255; 37 | int frame0 = 8 * section; 38 | 39 | counts += num_bins * frame0 + lane; 40 | num_frames -= frame0; 41 | 42 | uint x[8]; 43 | uint reduction = 0; 44 | @meta for(int i = 0; i < 8; ++i) { 45 | if(i < num_frames) { 46 | x[i] = counts[i * num_bins]; 47 | reduction += x[i]; 48 | } 49 | } 50 | shared.counts[tid] = reduction; 51 | __syncthreads(); 52 | 53 | // Further reduce them across lanes. 
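// Each 256-thread section has stored the partial sum of its eight frames in
// shared.counts; the threads of section 0 now add in the other sections'
// partials so that `total` holds the per-bin count across all 32 frames.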
54 | uint total = 0; 55 | if(0 == frame0) { 56 | total = reduction; 57 | @meta for(int i = 1; i < nt / num_bins; ++i) 58 | total += shared.counts[tid + i * num_bins]; 59 | } 60 | __syncthreads(); 61 | 62 | // Scan the first 256 array elements. 63 | uint scan = scan_t().scan(total, shared.scan).scan; 64 | 65 | if(tid < num_bins) 66 | counts[tid] = scan; 67 | 68 | // Apply the carry-in to all section reductions. 69 | if(0 == frame0) { 70 | shared.counts[tid] = scan; 71 | scan += reduction; 72 | 73 | @meta for(int i = 1; i < nt / num_bins; ++i) { 74 | reduction = shared.counts[tid + i * num_bins]; 75 | shared.counts[tid + i * num_bins] = scan; 76 | scan += reduction; 77 | } 78 | } 79 | __syncthreads(); 80 | 81 | // Scan and output the cached counts. 82 | scan = shared.counts[tid]; 83 | @meta for(int i = 0; i < 8; ++i) { 84 | if(i < num_frames) { 85 | counts[i * num_bins] = scan; 86 | scan += x[i]; 87 | } 88 | } 89 | }); 90 | 91 | } else { 92 | if(!aux_data) { 93 | // Reserve one 256-item frame for each 32 input frames. 94 | aux_size += sizeof(uint) * num_bins * num_ctas; 95 | radix_scan_8(aux_data, aux_size, cmd_buffer, (uint*)nullptr, num_ctas); 96 | return; 97 | } 98 | 99 | // Write and read to a list of partials. 100 | uint* partials = advance_pointer(aux_data, num_bins * num_ctas); 101 | 102 | // Upsweep to reduce 32 frames into 1 frame. 103 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) mutable { 104 | __shared__ uint shared_counts[nt]; 105 | 106 | int frame0 = tid / 256 + 32 * cta; 107 | int lane = tid & 255; 108 | num_frames -= frame0; 109 | 110 | counts += num_bins * frame0 + lane; 111 | 112 | uint reduction = 0; 113 | @meta for(int i = 0; i < 32; i += 4) { 114 | if(i < num_frames) 115 | reduction += counts[i * num_bins]; 116 | } 117 | shared_counts[tid] = reduction; 118 | __syncthreads(); 119 | 120 | if(tid < num_bins) { 121 | @meta for(int i = 1; i < nt / num_bins; ++i) 122 | reduction += shared_counts[i * num_bins + tid]; 123 | 124 | partials[num_bins * cta + tid] = reduction; 125 | } 126 | }); 127 | 128 | // Recurse on the partials. 129 | radix_scan_8(aux_data, aux_size, cmd_buffer, partials, num_ctas); 130 | 131 | // Downsweep to scan 32 frames with carry-in from the partials. 132 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) mutable { 133 | typedef cta_scan_t scan_t; 134 | __shared__ struct { 135 | // Use non-overlapping space for scan and to store counts. 136 | uint counts[nt]; 137 | typename scan_t::storage_t scan; 138 | } shared; 139 | 140 | // Spread the sections 8 frames across. 141 | int section = tid / 256; 142 | int lane = tid & 255; 143 | int frame0 = 8 * section; 144 | 145 | counts += (32 * cta + frame0) * num_bins + lane; 146 | num_frames -= 32 * cta + frame0; 147 | 148 | uint x[8]; 149 | uint reduction = 0; 150 | @meta for(int i = 0; i < 8; ++i) { 151 | if(i < num_frames) { 152 | x[i] = counts[i * num_bins]; 153 | reduction += x[i]; 154 | } 155 | } 156 | shared.counts[tid] = reduction; 157 | __syncthreads(); 158 | 159 | if(0 == frame0) { 160 | // Add the carry-in from the partials. 161 | uint scan = partials[num_bins * cta + tid]; 162 | 163 | // Apply the carry-in to all section reductions. 164 | shared.counts[tid] = scan; 165 | scan += reduction; 166 | 167 | @meta for(int i = 1; i < nt / num_bins; ++i) { 168 | reduction = shared.counts[i * num_bins + tid]; 169 | shared.counts[i * num_bins + tid] = scan; 170 | scan += reduction; 171 | } 172 | } 173 | __syncthreads(); 174 | 175 | // Scan and output the cached counts. 
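// shared.counts[tid] now holds this thread's starting offset: the carry-in
// applied above plus everything before its eight frames. The loop below
// replays the cached x[i] values to write an exclusive per-bin scan back into
// counts in place.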
176 | uint scan = shared.counts[tid]; 177 | @meta for(int i = 0; i < 8; ++i) { 178 | if(i < num_frames) { 179 | counts[i * num_bins] = scan; 180 | scan += x[i]; 181 | } 182 | } 183 | }); 184 | } 185 | } 186 | 187 | 188 | //////////////////////////////////////////////////////////////////////////////// 189 | // Radix sort entry point that accepts 4 bit sort (shared memory histogram) 190 | // or 8 bit sort (ballot with one histogram per warp). The ballot version uses 191 | // much less memory, but the partial reduction scatter is devastating with 192 | // 256 transactions, so a special radix scan is devised to handle both 193 | // implementations. 194 | 195 | template 197 | void radix_sort(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 198 | key_t* data, int count) { 199 | 200 | enum { 201 | nv = nt * vt, 202 | num_bins = 1<< num_bits, 203 | }; 204 | typedef unsigned_int_by_size_t unsigned_type; 205 | 206 | static_assert(4 == num_bits || 8 == num_bits); 207 | 208 | // Require a 32-lane warp for ballot radix sort. 209 | // TODO: Write a 64-lane version. 210 | typedef cta_radix_rank_t< 211 | nt, 212 | num_bits, 213 | 4 == num_bits ? radix_kind_shared : radix_kind_ballot 214 | > radix_t; 215 | const int subgroup_size = 8 == num_bits ? 32 : -1; 216 | 217 | if(!count) return; 218 | int num_ctas = div_up(count, nv); 219 | 220 | 221 | if(1 == num_ctas) { 222 | if(!aux_data) return; 223 | 224 | //////////////////////////////////////////////////////////////////////////// 225 | // Fully radix sort data within a CTA. 226 | 227 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 228 | __shared__ union { 229 | typename radix_t::storage_t radix; 230 | unsigned_type keys[nv]; 231 | } shared; 232 | 233 | int lane = gl_SubgroupInvocationID; 234 | int warp = gl_SubgroupID; 235 | int warp_size = gl_SubgroupSize; 236 | 237 | // Load the data into strided order. 238 | std::array keys; 239 | @meta for(int i = 0; i < vt; ++i) {{ 240 | int offset = nt * i + tid; 241 | if(offset < count) { 242 | // If the key is in range, load it and convert to radix bits. 243 | keys[i] = radix_permute_t::to_radix_bits(data[offset]); 244 | 245 | } else { 246 | // Otherwise set all radix bits so this key is sorted to the end. 247 | keys[i] = -1; 248 | } 249 | }} 250 | 251 | // Move the keys into shared memory. 252 | reg_to_shared_strided(keys, tid, shared.keys); 253 | 254 | @meta for(int bit = 0; bit < 8 * sizeof(num_bits); bit += num_bits) { 255 | // Load the keys from shared memory. 256 | if constexpr(8 == num_bits) 257 | keys = shared_to_reg_warp(shared.keys, lane, warp, warp_size); 258 | else 259 | keys = shared_to_reg_thread(shared.keys, tid); 260 | 261 | // Extract the digits for each key. 262 | std::array digits { 263 | (uint)bitfieldExtract(keys...[:], bit, num_bits) ... 264 | }; 265 | 266 | // Compute the radix rank of each digit. 267 | auto result = radix_t().scatter(digits, shared.radix); 268 | 269 | // Scatter the keys into shared memory. 270 | shared.keys[result.indices...[:]] = keys...[:] ...; 271 | __syncthreads(); 272 | } 273 | 274 | // Write from shared memory to device memory. 275 | @meta for(int i = 0; i < vt; ++i) {{ 276 | int offset = nt * i + tid; 277 | if(offset < count) { 278 | unsigned_type u = shared.keys[offset]; 279 | data[offset] = radix_permute_t::from_radix_bits(u); 280 | } 281 | }} 282 | }); 283 | 284 | } else { 285 | //////////////////////////////////////////////////////////////////////////// 286 | // Make multiple passes to sort the input. 
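// Each pass handles num_bits of the key: an upsweep histograms the digits per
// CTA, the per-CTA counts are scanned (radix_scan_8 for 8-bit digits, the
// generic scan otherwise), and a downsweep ranks keys within each CTA and
// scatters them to their global positions, ping-ponging between data and
// data2. For 32-bit keys with 8-bit digits that is four passes, so the sorted
// keys end up back in the caller's buffer.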
287 | 288 | if(!aux_data) { 289 | // ping-pong keys buffer. 290 | aux_size += sizeof(key_t) * nv * num_ctas; 291 | 292 | // partials reduction buffer. 293 | aux_size += sizeof(uint) * num_bins * num_ctas; 294 | 295 | // scan auxiliary storage. 296 | if constexpr(8 == num_bits) 297 | radix_scan_8(aux_data, aux_size, cmd_buffer, (uint*)nullptr, num_ctas); 298 | else 299 | scan(nullptr, aux_size, cmd_buffer, (uint*)nullptr, num_bins * num_ctas); 300 | 301 | return; 302 | } 303 | 304 | // Allocate a second buffer to ping-pong. 305 | key_t* data2 = advance_pointer(aux_data, nv * num_ctas); 306 | 307 | // Allocate space for each digit count. 308 | uint* partials = advance_pointer(aux_data, num_bins * num_ctas); 309 | 310 | for(int bit = 0; bit < 8 * sizeof(key_t); bit += num_bits) { 311 | 312 | ////////////////////////////////////////////////////////////////////////// 313 | // Upsweep. 314 | 315 | launch(num_ctas, cmd_buffer, 316 | [=](int tid, int cta) mutable { 317 | 318 | __shared__ union { 319 | typename radix_t::storage_t radix; 320 | } shared; 321 | 322 | int cur = nv * cta; 323 | data += cur; 324 | count -= cur; 325 | 326 | // Load the data into strided order. 327 | std::array keys; 328 | @meta for(int i = 0; i < vt; ++i) {{ 329 | int offset = nt * i + tid; 330 | if(offset < count) { 331 | // If the key is in range, load it and convert to radix bits. 332 | keys[i] = radix_permute_t::to_radix_bits(data[offset]); 333 | 334 | } else { 335 | // Otherwise set all radix bits so this key is sorted to the end. 336 | keys[i] = -1; 337 | } 338 | }} 339 | 340 | // The upsweep doesn't care about the order of keys. Keep them in 341 | // strided order. 342 | 343 | // Extract the digits for each key. 344 | std::array digits { 345 | (uint)bitfieldExtract(keys...[:], bit, num_bits) ... 346 | }; 347 | 348 | // Compute the radix rank of each digit. 349 | uint digit_count = radix_t().reduce(digits, shared.radix); 350 | 351 | // Write the reductions to the counter. 352 | if(tid < num_bins) { 353 | int index = 0; 354 | if constexpr(8 == num_bits) { 355 | // Write densely to the output because we use the special radix 356 | // scan. 357 | index = num_bins * cta + tid; 358 | } else { 359 | // Interleave to the output because we use an ordinary scan. 360 | index = num_ctas * tid + cta; 361 | } 362 | 363 | partials[index] = digit_count; 364 | } 365 | }); 366 | 367 | ////////////////////////////////////////////////////////////////////////// 368 | // Scan. 369 | 370 | if constexpr(8 == num_bits) { 371 | radix_scan_8(aux_data, aux_size, cmd_buffer, partials, num_ctas); 372 | 373 | } else { 374 | scan(aux_data, aux_size, cmd_buffer, partials, num_bins * num_ctas); 375 | } 376 | 377 | ////////////////////////////////////////////////////////////////////////// 378 | // Downsweep. 379 | 380 | launch(num_ctas, cmd_buffer, 381 | [=](int tid, int cta) mutable { 382 | 383 | __shared__ union { 384 | typename radix_t::storage_t radix; 385 | unsigned_type keys[nv]; 386 | ivec2 offsets[num_bins]; 387 | } shared; 388 | 389 | int lane = gl_SubgroupInvocationID; 390 | int warp = gl_SubgroupID; 391 | int warp_size = gl_SubgroupSize; 392 | 393 | int cur = nv * cta; 394 | data += cur; 395 | count -= cur; 396 | 397 | // Load the data into strided order. 398 | std::array keys; 399 | @meta for(int i = 0; i < vt; ++i) {{ 400 | int offset = nt * i + tid; 401 | if(offset < count) { 402 | // If the key is in range, load it and convert to radix bits. 
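// radix_permute_t's to_radix_bits / from_radix_bits (declared in the common
// headers) map keys to an order-preserving unsigned encoding and back, the
// usual trick that lets signed and floating-point keys be ranked digit by
// digit as plain unsigned integers; out-of-range slots are set to all ones so
// the padding sorts to the end.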
403 | keys[i] = radix_permute_t::to_radix_bits(data[offset]); 404 | 405 | } else { 406 | // Otherwise set all radix bits so this key is sorted to the end. 407 | keys[i] = -1; 408 | } 409 | }} 410 | 411 | // Move the keys into shared memory. 412 | reg_to_shared_strided(keys, tid, shared.keys); 413 | 414 | // Load the keys from shared memory. 415 | if constexpr(8 == num_bits) 416 | keys = shared_to_reg_warp(shared.keys, lane, warp, warp_size); 417 | else 418 | keys = shared_to_reg_thread(shared.keys, tid); 419 | 420 | // Extract the digits for each key. 421 | std::array digits { 422 | (uint)bitfieldExtract(keys...[:], bit, num_bits) ... 423 | }; 424 | 425 | // Compute the radix rank of each digit. 426 | auto result = radix_t().scatter(digits, shared.radix); 427 | 428 | // Scatter the keys into shared memory. 429 | shared.keys[result.indices...[:]] = keys...[:] ...; 430 | __syncthreads(); 431 | 432 | // Load keys in strided order. 433 | keys = shared_to_reg_strided(shared.keys, tid); 434 | 435 | // Load the offset for each digit into global output. 436 | if(tid < num_bins) { 437 | int index = 0; 438 | if constexpr(8 == num_bits) 439 | index = num_bins * cta + tid; 440 | else 441 | index = num_ctas * tid + cta; 442 | 443 | shared.offsets[tid] = ivec2( 444 | result.digit_scan, // local digit offset 445 | partials[index] // global digit offset 446 | ); 447 | } 448 | __syncthreads(); 449 | 450 | // Write from shared memory to device memory. 451 | @meta for(int i = 0; i < vt; ++i) {{ 452 | int offset = nt * i + tid; 453 | if(offset < count) { 454 | // Extract the digit. 455 | uint digit = bitfieldExtract(keys[i], bit, num_bits); 456 | 457 | // Look up the first occurrence of this digit within the CTA and 458 | // within the global output for this CTA. The difference is the 459 | // position at which we scatter to device memory. 460 | ivec2 offsets = shared.offsets[digit]; 461 | offset += offsets.y - offsets.x; 462 | data2[offset] = radix_permute_t::from_radix_bits(keys[i]); 463 | } 464 | }} 465 | }); 466 | 467 | std::swap(data, data2); 468 | } 469 | } 470 | } 471 | 472 | //////////////////////////////////////////////////////////////////////////////// 473 | 474 | 475 | 476 | } // namespace vk 477 | 478 | END_MGPU_NAMESPACE 479 | -------------------------------------------------------------------------------- /inc/mgpu/vk/scan.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../common/cta_scan.hxx" 3 | #include "context.hxx" 4 | #include "transform.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | namespace vk { 9 | 10 | template > 12 | void scan(void* aux_data, size_t& aux_size, cmd_buffer_t& cmd_buffer, 13 | type_t* data, int count, type_t init = type_t(), op_t op = op_t()) { 14 | 15 | enum { nv = nt * vt }; 16 | int num_ctas = div_up(count, nv); 17 | 18 | if(num_ctas <= 8) { 19 | if(!aux_data) { 20 | // Require no aux bytes, so return immediately. 21 | return; 22 | } 23 | 24 | // The small input pass. Perform the scan with a single CTA. 25 | launch(1, cmd_buffer, [=](int tid, int cta) { 26 | typedef cta_scan_t scan_t; 27 | 28 | __shared__ ALIAS_UNION { 29 | typename scan_t::storage_t scan; 30 | type_t values[nv]; 31 | } shared; 32 | 33 | type_t carry_in = type_t(); 34 | for(int cur = 0; cur < count; cur += nv) { 35 | // Load this tile's data. 36 | std::array x = mem_to_reg_thread(data + cur, tid, 37 | count - cur, shared.values); 38 | 39 | // Scan the inputs. 
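// scan_t().scan returns both the scanned tile and the tile's reduction; the
// reduction becomes carry_in for the next nv-sized chunk, which is why this
// small-input path needs no auxiliary storage.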
40 | auto result = scan_t().scan(x, shared.scan, carry_in, init, op); 41 | 42 | // Store the scanned values back to global memory. 43 | reg_to_mem_thread(result.scan, tid, count - cur, data + cur, 44 | shared.values); 45 | 46 | carry_in = result.reduction; 47 | } 48 | }); 49 | 50 | } else { 51 | // The recursive kernel. 52 | int num_passes = find_log2(num_ctas, true); 53 | 54 | if(!aux_data) { 55 | // Allocate space for one reduction per tile. 56 | aux_size += sizeof(type_t) * num_ctas; 57 | scan(nullptr, aux_size, cmd_buffer, data, num_ctas, init, op); 58 | return; 59 | } 60 | 61 | // Allocate space for one reduction per tile. 62 | type_t* partials = advance_pointer(aux_data, num_ctas); 63 | 64 | // The upsweep reduces each tile into partials. 65 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 66 | typedef cta_reduce_t reduce_t; 67 | 68 | __shared__ ALIAS_UNION { 69 | typename reduce_t::storage_t reduce; 70 | type_t values[nv]; 71 | } shared; 72 | 73 | int cur = nv * cta; 74 | 75 | // Load this tile's data. 76 | std::array x = mem_to_reg_thread(data + cur, tid, 77 | count - cur, shared.values); 78 | 79 | type_t reduce = reduce_t().reduce(x, shared.reduce); 80 | 81 | // Write to the partials. 82 | if(!tid) 83 | partials[cta] = reduce; 84 | }); 85 | 86 | // Recursively scan the partials. 87 | scan(aux_data, aux_size, cmd_buffer, partials, num_ctas, init, op); 88 | 89 | // The downsweep performs a scan with carry-in. 90 | launch(num_ctas, cmd_buffer, [=](int tid, int cta) { 91 | typedef cta_scan_t scan_t; 92 | 93 | __shared__ ALIAS_UNION { 94 | typename scan_t::storage_t scan; 95 | type_t values[nv]; 96 | } shared; 97 | 98 | int cur = nv * cta; 99 | 100 | // Load this tile's data. 101 | std::array x = mem_to_reg_thread(data + cur, tid, 102 | count - cur, shared.values); 103 | 104 | // Load the carry-in. 105 | type_t carry_in = partials[cta]; 106 | 107 | // Scan the inputs. 108 | auto result = scan_t().scan(x, shared.scan, carry_in, init, op); 109 | 110 | // Store the scanned values back to global memory. 111 | reg_to_mem_thread(result.scan, tid, count - cur, data + cur, 112 | shared.values); 113 | }); 114 | } 115 | } 116 | 117 | } // namespace vk 118 | 119 | END_MGPU_NAMESPACE 120 | -------------------------------------------------------------------------------- /inc/mgpu/vk/transform.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "launch.hxx" 3 | 4 | BEGIN_MGPU_NAMESPACE 5 | 6 | namespace vk { 7 | 8 | // Launch a grid and pass (tid, cta) 9 | template 10 | [[using spirv: comp, local_size(nt), subgroup_size(subgroup_size), push]] 11 | void launch_cs(func_t func) { 12 | func(threadIdx.x, blockIdx.x); 13 | } 14 | 15 | template 16 | static void launch(int num_blocks, cmd_buffer_t& cmd_buffer, func_t func) { 17 | launch_cs<<>>(func); 18 | } 19 | 20 | // Launch a grid and pass gid. 
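// launch() above hands each thread its (tid, cta) pair and always runs full
// blocks; transform() below flattens the grid into one call per global index
// in [0, count), skipping the tail threads of the last block. Minimal sketch,
// assuming x and y came from context_t::alloc_gpu<float> and cmd is a
// cmd_buffer_t (names are illustrative):
//
//   transform(count, cmd, [=](int i) {
//     y[i] = 2.0f * x[i] + y[i];
//   });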
21 | template 22 | [[using spirv: comp, local_size(nt), subgroup_size(subgroup_size), push]] 23 | void transform_cs(int count, func_t func) { 24 | int gid = glcomp_GlobalInvocationID.x; 25 | 26 | if(gid >= count) 27 | return; 28 | 29 | func(gid); 30 | } 31 | 32 | template 33 | static void transform(int count, cmd_buffer_t& cmd_buffer, func_t func) { 34 | int num_blocks = div_up(count, nt); 35 | transform_cs<<>>(count, func); 36 | } 37 | 38 | } // namespace vk 39 | 40 | END_MGPU_NAMESPACE -------------------------------------------------------------------------------- /src/vk/context.cxx: -------------------------------------------------------------------------------- 1 | // #define USE_VALIDATION 2 | #include 3 | 4 | #define VMA_IMPLEMENTATION 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | BEGIN_MGPU_NAMESPACE 12 | 13 | namespace vk { 14 | 15 | context_t::context_t() { 16 | // Create the instance. 17 | vkb::InstanceBuilder builder; 18 | auto inst_ret = builder.set_app_name("saxpy") 19 | .require_api_version(1, 2) 20 | #ifdef USE_VALIDATION 21 | .request_validation_layers () 22 | .use_default_debug_messenger () 23 | .add_debug_messenger_severity(VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) 24 | .add_validation_feature_enable(VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT) 25 | .add_validation_feature_disable(VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT) 26 | #endif 27 | .set_headless() 28 | .build (); 29 | if (!inst_ret) { 30 | std::cerr << "Failed to create Vulkan instance. Error: " << inst_ret.error().message() << "\n"; 31 | exit(1); 32 | } 33 | vkb::Instance vkb_inst = inst_ret.value(); 34 | instance = vkb_inst.instance; 35 | 36 | // Create the physical device. 37 | 38 | vkb::PhysicalDeviceSelector selector{ vkb_inst }; 39 | auto phys_ret = selector 40 | .set_minimum_version (1, 2) 41 | .add_required_extension("VK_KHR_buffer_device_address") 42 | .add_required_extension("VK_KHR_shader_non_semantic_info") 43 | .require_dedicated_transfer_queue() 44 | .select(); 45 | if (!phys_ret) { 46 | std::cerr << "Failed to select Vulkan Physical Device. Error: " << phys_ret.error().message() << "\n"; 47 | exit(1); 48 | } 49 | 50 | vkb::PhysicalDevice vkb_phys_device = phys_ret.value(); 51 | physical_device = vkb_phys_device.physical_device; 52 | 53 | // Create the device. 54 | vkb::DeviceBuilder device_builder { vkb_phys_device }; 55 | VkPhysicalDeviceBufferDeviceAddressFeaturesKHR feature1 { 56 | VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR, 57 | nullptr, 58 | true 59 | }; 60 | device_builder.add_pNext(&feature1); 61 | 62 | VkPhysicalDeviceFloat16Int8FeaturesKHR feature2 { 63 | VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES, 64 | nullptr, 65 | false, 66 | true 67 | }; 68 | device_builder.add_pNext(&feature2); 69 | 70 | // automatically propagate needed data from instance & physical device 71 | auto dev_ret = device_builder.build(); 72 | if (!dev_ret) { 73 | std::cerr << "Failed to create Vulkan device. Error: " << dev_ret.error().message() << "\n"; 74 | exit(1); 75 | } 76 | 77 | vkb::Device vkb_device = dev_ret.value(); 78 | device = vkb_device.device; 79 | 80 | // Create the compute queue. 81 | // Get the graphics queue with a helper function 82 | auto queue_ret = vkb_device.get_queue(vkb::QueueType::compute); 83 | if (!queue_ret) { 84 | std::cerr << "Failed to get queue. 
Error: " << queue_ret.error().message() << "\n"; 85 | exit(1); 86 | } 87 | queue = queue_ret.value(); 88 | queue_index = vkb_device.get_queue_index(vkb::QueueType::compute).value(); 89 | 90 | // Create a command pool. 91 | VkCommandPoolCreateInfo cmdPoolInfo { 92 | VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, 93 | nullptr, 94 | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, 95 | queue_index 96 | }; 97 | vkCreateCommandPool(device, &cmdPoolInfo, nullptr, &command_pool); 98 | 99 | // Create the pipeline cache. 100 | VkPipelineCacheCreateInfo pipelineCacheCreateInfo { 101 | VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO 102 | }; 103 | vkCreatePipelineCache(device, &pipelineCacheCreateInfo, nullptr, 104 | &pipeline_cache); 105 | 106 | // Create the allocator. 107 | VmaAllocatorCreateInfo allocatorInfo = {}; 108 | allocatorInfo.vulkanApiVersion = VK_API_VERSION_1_2; 109 | allocatorInfo.physicalDevice = physical_device; 110 | allocatorInfo.device = device; 111 | allocatorInfo.instance = instance; 112 | allocatorInfo.flags = 113 | VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT | 114 | VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT; 115 | 116 | vmaCreateAllocator(&allocatorInfo, &allocator); 117 | 118 | // Allocate a 16MB staging buffer. 119 | staging = alloc_cpu(16<< 20, VK_BUFFER_USAGE_TRANSFER_DST_BIT | 120 | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); 121 | } 122 | 123 | context_t::~context_t() { 124 | // Destroy the staging memory. 125 | free(staging); 126 | 127 | // Destroy the allocator. 128 | assert(!buffer_map.size()); 129 | vmaDestroyAllocator(allocator); 130 | 131 | // Destroy the pipelines. 132 | for(auto it : transforms) { 133 | transform_t& transform = it.second; 134 | vkDestroyPipeline(device, transform.pipeline, nullptr); 135 | vkDestroyPipelineLayout(device, transform.pipeline_layout, nullptr); 136 | } 137 | 138 | // Destroy the shader modules. 139 | for(auto it : modules) 140 | vkDestroyShaderModule(device, it.second, nullptr); 141 | 142 | // Destroy the cache and command pool. 143 | vkDestroyPipelineCache(device, pipeline_cache, nullptr); 144 | vkDestroyCommandPool(device, command_pool, nullptr); 145 | vkDestroyDevice(device, nullptr); 146 | 147 | // Destroy the messenger. 148 | // TODO: 149 | 150 | // Destroy the instance. 
151 | vkDestroyInstance(instance, nullptr); 152 | } 153 | 154 | //////////////////////////////////////////////////////////////////////////////// 155 | 156 | void* context_t::alloc_gpu(size_t size, uint32_t usage) { 157 | VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; 158 | bufferInfo.size = std::max(size, 1ul); 159 | bufferInfo.usage = VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | usage; 160 | 161 | VmaAllocationCreateInfo allocInfo = {}; 162 | allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; 163 | 164 | VkBuffer buffer; 165 | VmaAllocation allocation; 166 | vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, 167 | &allocation, nullptr); 168 | 169 | VkBufferDeviceAddressInfo addressInfo { 170 | VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, 171 | nullptr, 172 | buffer 173 | }; 174 | VkDeviceAddress address = vkGetBufferDeviceAddress(device, &addressInfo); 175 | void* p = (void*)address; 176 | 177 | buffer_map.insert(std::make_pair(p, buffer_t { size, usage, buffer, allocation })); 178 | return p; 179 | } 180 | 181 | void* context_t::alloc_cpu(size_t size, uint32_t usage) { 182 | VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; 183 | bufferInfo.size = std::max(size, 1ul); 184 | bufferInfo.usage = usage; 185 | 186 | VmaAllocationCreateInfo allocInfo = {}; 187 | allocInfo.usage = VMA_MEMORY_USAGE_CPU_ONLY; 188 | 189 | VkBuffer buffer; 190 | VmaAllocation allocation; 191 | vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, 192 | &allocation, nullptr); 193 | 194 | void* p; 195 | vmaMapMemory(allocator, allocation, &p); 196 | 197 | buffer_map.insert(std::make_pair(p, buffer_t { size, usage | 198 | 0x8000'0000, buffer, allocation })); 199 | return p; 200 | } 201 | void context_t::free(void* p) { 202 | if(!p) return; 203 | 204 | auto it = buffer_map.find(p); 205 | assert(buffer_map.end() != it && p == it->first); 206 | 207 | if(it->second.is_cpu()) 208 | vmaUnmapMemory(allocator, it->second.allocation); 209 | 210 | vmaDestroyBuffer(allocator, it->second.buffer, it->second.allocation); 211 | buffer_map.erase(it); 212 | } 213 | 214 | context_t::buffer_it_t context_t::find_buffer(const void* p) { 215 | buffer_it_t it = buffer_map.lower_bound(p); 216 | if(buffer_map.end() != it) { 217 | // Check the range. 218 | const char* p2 = (const char*)it->first + it->second.size; 219 | if(p >= p2) 220 | it = buffer_map.end(); 221 | } 222 | return it; 223 | } 224 | 225 | void context_t::memcpy(VkCommandBuffer cmd_buffer, void* dest, 226 | const void* source, size_t size) { 227 | 228 | if(!size) return; 229 | 230 | buffer_it_t dest_it = find_buffer(dest); 231 | buffer_it_t source_it = find_buffer(source); 232 | 233 | // For now both sides must be pointers into buffer objects. 234 | assert(buffer_map.end() != dest_it && buffer_map.end() != source_it); 235 | 236 | // Copy between buffers. 
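// dest and source may point anywhere inside allocations returned by
// alloc_gpu/alloc_cpu: find_buffer resolves each interior pointer to its
// owning buffer_t, and the offsets below are just the distances from the
// corresponding base addresses.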
237 | VkBufferCopy copyRegion { }; 238 | copyRegion.srcOffset = ((const char*)source - (const char*)source_it->first); 239 | copyRegion.dstOffset = ((const char*)dest - (const char*)dest_it->first); 240 | copyRegion.size = size; 241 | vkCmdCopyBuffer(cmd_buffer, source_it->second.buffer, 242 | dest_it->second.buffer, 1, ©Region); 243 | } 244 | 245 | //////////////////////////////////////////////////////////////////////////////// 246 | 247 | VkShaderModule context_t::create_module(const char* data, size_t size) { 248 | auto it = modules.find(data); 249 | if(modules.end() == it) { 250 | VkShaderModuleCreateInfo createInfo { 251 | VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, 252 | nullptr, 253 | 0, 254 | size, 255 | (const uint32_t*)data 256 | }; 257 | 258 | VkShaderModule module; 259 | vkCreateShaderModule(device, &createInfo, nullptr, &module); 260 | it = modules.insert(std::pair(data, module)).first; 261 | } 262 | 263 | return it->second; 264 | } 265 | 266 | //////////////////////////////////////////////////////////////////////////////// 267 | 268 | void context_t::dispatch_compute(VkCommandBuffer cmd_buffer, const char* name, 269 | VkShaderModule module, int num_blocks, uint32_t push_size, 270 | const void* push_data) { 271 | 272 | auto it = transforms.find(name); 273 | if(transforms.end() == it) { 274 | // Define a pipeline layout that takes only a push constant. 275 | VkPipelineLayoutCreateInfo create_info { 276 | VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO 277 | }; 278 | create_info.pushConstantRangeCount = 1; 279 | 280 | VkPushConstantRange range { VK_SHADER_STAGE_COMPUTE_BIT, 0, push_size }; 281 | create_info.pPushConstantRanges = ⦥ 282 | 283 | VkPipelineLayout pipeline_layout; 284 | vkCreatePipelineLayout(device, &create_info, nullptr, 285 | &pipeline_layout); 286 | 287 | // Create the compute pipeline. 288 | VkComputePipelineCreateInfo computePipelineCreateInfo { 289 | VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, 290 | nullptr, 291 | 0, 292 | { 293 | VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, 294 | 0, 295 | 0, 296 | VK_SHADER_STAGE_COMPUTE_BIT, 297 | module, 298 | name 299 | }, 300 | pipeline_layout 301 | }; 302 | 303 | VkPipeline pipeline; 304 | vkCreateComputePipelines(device, pipeline_cache, 1, 305 | &computePipelineCreateInfo, nullptr, &pipeline); 306 | 307 | transform_t transform { 308 | pipeline_layout, 309 | pipeline 310 | }; 311 | 312 | it = transforms.insert(std::make_pair(name, transform)).first; 313 | } 314 | 315 | transform_t transform = it->second; 316 | 317 | vkCmdBindPipeline(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, 318 | transform.pipeline); 319 | 320 | vkCmdPushConstants(cmd_buffer, transform.pipeline_layout, 321 | VK_SHADER_STAGE_COMPUTE_BIT, 0, push_size, push_data); 322 | 323 | vkCmdDispatch(cmd_buffer, num_blocks, 1, 1); 324 | 325 | VkMemoryBarrier memoryBarrier = { 326 | VK_STRUCTURE_TYPE_MEMORY_BARRIER, 327 | nullptr, 328 | VK_ACCESS_SHADER_WRITE_BIT, 329 | VK_ACCESS_SHADER_READ_BIT 330 | }; 331 | vkCmdPipelineBarrier(cmd_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 332 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memoryBarrier, 0, nullptr, 0, 333 | nullptr); 334 | } 335 | 336 | void context_t::submit(VkCommandBuffer cmd_buffer) { 337 | // Submit the command buffer. 
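// No fence is passed to vkQueueSubmit, so callers must synchronize separately,
// e.g. with vkQueueWaitIdle(queue), before reading results back through the
// mapped staging buffer.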
338 | VkSubmitInfo submitInfo { 339 | VK_STRUCTURE_TYPE_SUBMIT_INFO 340 | }; 341 | submitInfo.commandBufferCount = 1; 342 | submitInfo.pCommandBuffers = &cmd_buffer; 343 | 344 | vkQueueSubmit(queue, 1, &submitInfo, 0); 345 | } 346 | 347 | //////////////////////////////////////////////////////////////////////////////// 348 | 349 | cmd_buffer_t::cmd_buffer_t(context_t& context) : context(context) { 350 | VkCommandBufferAllocateInfo allocInfo { 351 | VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, 352 | nullptr, 353 | context.command_pool, 354 | VK_COMMAND_BUFFER_LEVEL_PRIMARY, 355 | 1 356 | }; 357 | vkAllocateCommandBuffers(context.device, &allocInfo, &vkCommandBuffer); 358 | } 359 | 360 | cmd_buffer_t::~cmd_buffer_t() { 361 | vkFreeCommandBuffers(context.device, context.command_pool, 1, 362 | &vkCommandBuffer); 363 | } 364 | 365 | void cmd_buffer_t::reset() { 366 | vkResetCommandBuffer(vkCommandBuffer, 0); 367 | } 368 | 369 | void cmd_buffer_t::begin() { 370 | VkCommandBufferBeginInfo beginInfo { 371 | VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, 372 | nullptr, 373 | VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT 374 | }; 375 | vkBeginCommandBuffer(vkCommandBuffer, &beginInfo); 376 | } 377 | 378 | void cmd_buffer_t::end() { 379 | vkEndCommandBuffer(vkCommandBuffer); 380 | } 381 | 382 | void cmd_buffer_t::submit() { 383 | context.submit(vkCommandBuffer); 384 | } 385 | 386 | void cmd_buffer_t::host_barrier() { 387 | VkMemoryBarrier memoryBarrier { 388 | VK_STRUCTURE_TYPE_MEMORY_BARRIER, 389 | nullptr, 390 | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT | 391 | VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT, 392 | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT | 393 | VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT 394 | 395 | }; 396 | vkCmdPipelineBarrier(vkCommandBuffer, 397 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, 398 | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, 399 | 0, 1, &memoryBarrier, 0, nullptr, 0, nullptr); 400 | } 401 | 402 | void cmd_buffer_t::memcpy(void* dest, const void* source, size_t size) { 403 | context.memcpy(vkCommandBuffer, dest, source, size); 404 | } 405 | 406 | //////////////////////////////////////////////////////////////////////////////// 407 | 408 | 409 | memcache_t::memcache_t(context_t& context) : 410 | context(context), data(nullptr), capacity(0) { } 411 | 412 | memcache_t::~memcache_t() { 413 | context.free(data); 414 | } 415 | 416 | void memcache_t::allocate(const size_t* sizes, int count, void** allocations) { 417 | size_t total = 0; 418 | for(int i = 0; i < count; ++i) { 419 | // Round up to a multiple of 256 bytes. 420 | size_t reserve = ~255 & (sizes[i] + 255); 421 | total += reserve; 422 | } 423 | 424 | // Allocate space in a single buffer. 425 | char* p = allocate(total); 426 | for(int i = 0; i < count; ++i) { 427 | allocations[i] = p; 428 | p += ~255 & (sizes[i] + 255); 429 | } 430 | } 431 | 432 | void* memcache_t::allocate(size_t size) { 433 | if(size > capacity) { 434 | context.free(data); 435 | data = context.alloc_gpu(size); 436 | capacity = size; 437 | } 438 | return data; 439 | } 440 | 441 | 442 | 443 | } // namespace vk 444 | 445 | END_MGPU_NAMESPACE 446 | --------------------------------------------------------------------------------
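Putting the Vulkan backend together, the following is a minimal host-side sketch of a complete round trip: stage keys into a device buffer, mergesort them on the GPU, and read them back through the staging buffer. It only uses the mgpu::vk API declared above; the <128, 7> tuning parameters, the key count, and all variable names are illustrative rather than prescriptive.

#include <mgpu/vk/context.hxx>
#include <mgpu/vk/mergesort.hxx>

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

using namespace mgpu::vk;

int main() {
  context_t context;
  cmd_buffer_t cmd(context);
  memcache_t cache(context);

  // Random keys on the host.
  int count = 1 << 20;
  std::vector<int> host(count);
  for(int& x : host) x = rand();

  // Device buffer for the keys; write them through the mapped staging buffer.
  int* keys = context.alloc_gpu<int>(count);
  std::memcpy(context.staging, host.data(), sizeof(int) * count);

  cmd.begin();
  cmd.memcpy(keys, context.staging, sizeof(int) * count);
  cmd.host_barrier();

  // Two-pass convention: query the scratch size, then record the real sort.
  size_t aux_size = 0;
  mergesort_keys<128, 7>(nullptr, aux_size, cmd, keys, count);
  void* aux = cache.allocate(aux_size);
  mergesort_keys<128, 7>(aux, aux_size, cmd, keys, count);

  // Copy the sorted keys back into staging memory.
  cmd.host_barrier();
  cmd.memcpy(context.staging, keys, sizeof(int) * count);
  cmd.end();

  cmd.submit();
  vkQueueWaitIdle(context.queue);

  std::memcpy(host.data(), context.staging, sizeof(int) * count);
  printf("sorted = %d\n", (int)std::is_sorted(host.begin(), host.end()));

  context.free(keys);
}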