├── .gitignore ├── .gitmodules ├── src ├── root.zig ├── kernels.cl ├── utils.zig ├── idx.zig ├── defines.zig ├── main.zig ├── vtk.zig ├── opencl_hello_world.zig ├── ibm.zig ├── lbm.zig └── cl.zig ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | .zig-cache 3 | zig-out -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "OpenCL-Headers"] 2 | path = OpenCL-Headers 3 | url = https://github.com/KhronosGroup/OpenCL-Headers 4 | -------------------------------------------------------------------------------- /src/root.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const testing = std.testing; 3 | 4 | comptime { 5 | _ = @import("lbm.zig"); 6 | _ = @import("vtk.zig"); 7 | } 8 | 9 | export fn add(a: i32, b: i32) i32 { 10 | return a + b; 11 | } 12 | 13 | test "basic add functionality" { 14 | try testing.expect(add(3, 7) == 10); 15 | } 16 | -------------------------------------------------------------------------------- /src/kernels.cl: -------------------------------------------------------------------------------- 1 | __kernel void square_array(__global int* input_array, __global int* output_array) { 2 | int i = get_global_id(0); 3 | int value = input_array[i]; 4 | output_array[i] = value * value; 5 | } 6 | 7 | __kernel void lbm_kernel( 8 | __global float* popA, 9 | __global float* popB, 10 | __global float* u, 11 | __global float* rho, 12 | __global float* force_ibm, 13 | const int time_step 14 | ) { 15 | // streaming (popA, popB) 16 | 17 | // macroscopics 18 | 19 | // collision 20 | 21 | // macroscopics 22 | } -------------------------------------------------------------------------------- /src/utils.zig: 
-------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | pub fn writeArrayListToFile(filename: []const u8, content: []const u8) !void { 4 | // Open the file for writing 5 | var file = try std.fs.cwd().createFile(filename, .{}); 6 | defer file.close(); 7 | 8 | // Get a writer for the file 9 | var writer = file.writer(); 10 | 11 | // Write the contents of the ArrayList to the file 12 | try writer.writeAll(content); 13 | } 14 | 15 | // Function to append formatted strings to the list 16 | pub fn appendFormatted(list: *std.ArrayList(u8), comptime format_string: []const u8, args: anytype) !void { 17 | var buffer: [512]u8 = undefined; 18 | const buffer_slice = buffer[0..]; 19 | 20 | const str_add = try std.fmt.bufPrint(buffer_slice, format_string, args); 21 | 22 | try list.appendSlice(str_add); 23 | } 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # zLBM 2 | 3 | LBM (lattice Boltzmann method) solver written in Zig for learning purposes. 4 | 5 | The project is done by [Waine](https://github.com/wainejr/) and you can find the videos on its developing on his [YouTube](https://www.youtube.com/@waine_jr), in the [zLBM playlist](https://www.youtube.com/watch?v=BZobw0vnSHo&list=PL2WQTg3Tx5wO79IqfPwQhvgTqZsfIob9V). 6 | 7 | ## Cloning 8 | 9 | Don't forget to clone and initialize the submodules 10 | 11 | ```bash 12 | git clone --recursive https://github.com/wainejr/zLBM 13 | # or 14 | git clone https://github.com/wainejr/zLBM 15 | git submodule update --init --recursive 16 | ``` 17 | 18 | ## Building & Running 19 | 20 | To build the project, make sure you have [Zig](https://ziglang.org/) installed. 21 | The solver was developed and tested under version 0.13.0 and 0.14 on development. 
22 | 23 | After that, you can build the program running 24 | 25 | ```bash 26 | zig build 27 | ``` 28 | 29 | And then run with 30 | 31 | ```bash 32 | ./zig-out/bin/zLBM 33 | ``` 34 | 35 | Or just run the project directly with 36 | 37 | ```bash 38 | zig run src/main.zig 39 | ``` 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Waine Junior 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/idx.zig: -------------------------------------------------------------------------------- 1 | const defs = @import("defines.zig"); 2 | 3 | pub inline fn idx2pos(idx: usize) [defs.dim]u32 { 4 | if (defs.dim == 2) { 5 | return .{ @intCast(idx % defs.domain_size[0]), @intCast(idx / defs.domain_size[0]) }; 6 | } else { 7 | return .{ @intCast(idx % defs.domain_size[0]), @intCast((idx / defs.domain_size[0]) % defs.domain_size[1]), @intCast(idx / (defs.domain_size[0] * defs.domain_size[1])) }; 8 | } 9 | } 10 | 11 | pub inline fn pos2idx(pos: [defs.dim]u32) usize { 12 | if (defs.dim == 2) { 13 | return pos[0] + pos[1] * defs.domain_size[0]; 14 | } else { 15 | return pos[0] + defs.domain_size[0] * (pos[1] + pos[2] * defs.domain_size[1]); 16 | } 17 | } 18 | 19 | pub inline fn idxPop(pos: [defs.dim]u32, i: u8) usize { 20 | return i + defs.n_pop * (pos2idx(pos)); 21 | } 22 | 23 | test "test Idx" { 24 | const std = @import("std"); 25 | const assert = std.debug.assert; 26 | var count: usize = 0; 27 | for (0..defs.n_nodes) |idx| { 28 | const pos = idx2pos(idx); 29 | const retIdx = pos2idx(pos); 30 | assert(retIdx == idx); 31 | for (0..defs.dim) |d| { 32 | assert(pos[d] >= 0); 33 | assert(pos[d] < defs.domain_size[d]); 34 | } 35 | for (0..defs.n_pop) |i| { 36 | const popIdx = idxPop(pos, @intCast(i)); 37 | assert(count == popIdx); 38 | count += 1; 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/defines.zig: -------------------------------------------------------------------------------- 1 | const utils = @import("utils.zig"); 2 | 3 | pub const cs2: f32 = 1.0 / 3.0; 4 | pub const VelSet = enum { D2Q9, D3Q19 }; 5 | pub const dim = switch (vel_set_use) { 6 | VelSet.D2Q9 => 2, 7 | VelSet.D3Q19 => 3, 8 | }; 9 | pub const n_pop = switch (vel_set_use) { 10 | VelSet.D2Q9 => 9, 11 | VelSet.D3Q19 => 19, 12 | }; 13 | pub const pop_dir: 
[n_pop][dim]i8 = switch (vel_set_use) { 14 | VelSet.D2Q9 => .{ [_]i8{ 0, 0 }, [_]i8{ 1, 0 }, [_]i8{ 0, 1 }, [_]i8{ -1, 0 }, [_]i8{ 0, -1 }, [_]i8{ 1, 1 }, [_]i8{ -1, 1 }, [_]i8{ -1, -1 }, [_]i8{ 1, -1 } }, 15 | VelSet.D3Q19 => .{ 16 | [_]i8{ 0, 0, 0 }, 17 | [_]i8{ 1, 0, 0 }, 18 | [_]i8{ -1, 0, 0 }, 19 | [_]i8{ 0, 1, 0 }, 20 | [_]i8{ 0, -1, 0 }, 21 | [_]i8{ 0, 0, 1 }, 22 | [_]i8{ 0, 0, -1 }, 23 | [_]i8{ 1, 1, 0 }, 24 | [_]i8{ -1, -1, 0 }, 25 | [_]i8{ 1, 0, 1 }, 26 | [_]i8{ -1, 0, -1 }, 27 | [_]i8{ 0, 1, 1 }, 28 | [_]i8{ 0, -1, -1 }, 29 | [_]i8{ 1, -1, 0 }, 30 | [_]i8{ -1, 1, 0 }, 31 | [_]i8{ 1, 0, -1 }, 32 | [_]i8{ -1, 0, 1 }, 33 | [_]i8{ 0, 1, -1 }, 34 | [_]i8{ 0, -1, 1 }, 35 | }, 36 | }; 37 | pub const n_nodes = domain_size[0] * domain_size[1] * (if (dim == 2) 1 else domain_size[2]); 38 | pub const pop_weights: [n_pop]f32 = switch (vel_set_use) { 39 | VelSet.D2Q9 => .{ 4.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 36.0, 1.0 / 36.0, 1.0 / 36.0, 1.0 / 36.0 }, 40 | VelSet.D3Q19 => .{1.0 / 3.0} ++ [_]f32{1.0 / 18.0} ** 6 ++ [_]f32{1.0 / 36.0} ** 12, 41 | }; 42 | 43 | // Parameters 44 | pub const tau: f32 = 0.9; 45 | pub const domain_size: [dim]u32 = .{ 96, 96, 96 }; 46 | pub const vel_set_use = VelSet.D3Q19; 47 | pub const freq_export = 100; 48 | pub const n_steps = 5000; 49 | pub const global_force: [dim]f32 = .{ 0, 0, 0 }; 50 | 51 | // IBM parameters 52 | pub const forces_relaxation_factor = 0.5; 53 | pub const ibm_n_iterations = 1; 54 | -------------------------------------------------------------------------------- /src/main.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const lbm = @import("lbm.zig"); 3 | const ibm = @import("ibm.zig"); 4 | const vtk = @import("vtk.zig"); 5 | const defs = @import("defines.zig"); 6 | const utils = @import("utils.zig"); 7 | 8 | pub fn main() !void { 9 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 10 | defer 
arena.deinit(); 11 | 12 | const allocator = arena.allocator(); 13 | 14 | const lbm_arrays = try lbm.allocate_arrs(&allocator); 15 | lbm_arrays.initialize(); 16 | // const body_ibm = try ibm.BodyIBM.create_basic_body(allocator); 17 | // try body_ibm.export_csv(allocator, "output/body_pos_0.csv"); 18 | const bodies: [0]ibm.BodyIBM = .{}; 19 | 20 | try lbm_arrays.export_arrays(allocator, 0); 21 | var timer = try std.time.Timer.start(); 22 | 23 | for (1..(defs.n_steps + 1)) |time_step| { 24 | lbm.run_time_step(lbm_arrays, @intCast(time_step)); 25 | // for (0..defs.ibm_n_iterations) |_| { 26 | for (0..1) |_| { // Use 1 iteration to not update macrs 27 | lbm.run_IBM_iteration(bodies[0..], lbm_arrays, @intCast(time_step)); 28 | } 29 | if (time_step % defs.freq_export == 0) { 30 | try lbm_arrays.export_arrays(allocator, @intCast(time_step)); 31 | 32 | for (bodies) |b| { 33 | var buffer: [100]u8 = undefined; 34 | const buffer_slice = buffer[0..]; 35 | const body_path = try std.fmt.bufPrint(buffer_slice, "output/body_pos_{}.csv", .{time_step}); 36 | try b.export_csv(allocator, body_path); 37 | } 38 | 39 | std.debug.print("Exported arrays in time step {}\n", .{time_step}); 40 | } 41 | } 42 | 43 | const time_passed_nano: f64 = @floatFromInt(timer.lap()); 44 | const time_passed_sec: f64 = time_passed_nano / 1e9; 45 | 46 | const mlups: f64 = (@as(usize, @intCast(defs.n_nodes)) * defs.n_steps) / (time_passed_sec * 1e6); 47 | 48 | std.debug.print("Finished simulation!\n", .{}); 49 | std.debug.print("MLUPS {d:0.2}\n", .{mlups}); 50 | std.debug.print("Time elapsed {d:0.2}s\n", .{time_passed_sec}); 51 | } 52 | -------------------------------------------------------------------------------- /src/vtk.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const utils = @import("utils.zig"); 3 | 4 | fn write_vtk_header(vtk_string: *std.ArrayList(u8), dims: []const u32) !void { 5 | const dims_use: [3]u32 = .{ dims[0], 
dims[1], if (dims.len < 3) 1 else dims[2] }; 6 | 7 | try vtk_string.appendSlice("# vtk DataFile Version 3.0\nData\nBINARY\nDATASET STRUCTURED_POINTS\n"); 8 | 9 | // "DIMENSIONS "+to_string(Nx)+" "+to_string(Ny)+" "+to_string(Nz)+"\n" 10 | try vtk_string.appendSlice("DIMENSIONS "); 11 | for (dims_use) |d| { 12 | try utils.appendFormatted(vtk_string, "{} ", .{d}); 13 | } 14 | try vtk_string.appendSlice("\n"); 15 | // "ORIGIN "+to_string(origin.x)+" "+to_string(origin.y)+" "+to_string(origin.z)+"\n" 16 | try vtk_string.appendSlice("ORIGIN "); 17 | for (dims_use) |_| { 18 | try utils.appendFormatted(vtk_string, "{} ", .{0}); 19 | } 20 | try vtk_string.appendSlice("\n"); 21 | // "SPACING "+to_string(spacing)+" "+to_string(spacing)+" "+to_string(spacing)+"\n" 22 | try vtk_string.appendSlice("SPACING "); 23 | for (dims_use) |_| { 24 | try utils.appendFormatted(vtk_string, "{} ", .{1}); 25 | } 26 | try vtk_string.appendSlice("\n"); 27 | // "POINT_DATA "+to_string((ulong)Nx*(ulong)Ny*(ulong)Nz)+ 28 | try utils.appendFormatted(vtk_string, "POINT_DATA {}", .{dims_use[0] * dims_use[1] * dims_use[2]}); 29 | // "\nSCALARS data "+vtk_type()+" "+to_string(dimensions())+"\nLOOKUP_TABLE default\n" 30 | } 31 | 32 | fn write_vtk_data(vtk_string: *std.ArrayList(u8), scalar_name: []const u8, arr: []const f32) !void { 33 | try utils.appendFormatted(vtk_string, "\nSCALARS {s} float 1", .{scalar_name}); 34 | // std.debug.print("my string {s}\n", .{vtk_string.items}); 35 | try vtk_string.appendSlice("\nLOOKUP_TABLE default\n"); 36 | 37 | // Write the scalar data in big-endian format 38 | var be_scalar: [4]u8 = undefined; 39 | for (arr) |value| { 40 | be_scalar = @bitCast(value); 41 | // std.debug.print("value {} bits {b:0>32} rev {b:0>32} scalars {x:} {x:} {x:} {x:}\n", .{ value, be_value, be_rev, be_scalar[0], be_scalar[1], be_scalar[2], be_scalar[3] }); 42 | const be_use: [4]u8 = .{ be_scalar[3], be_scalar[2], be_scalar[1], be_scalar[0] }; 43 | try vtk_string.appendSlice(be_use[0..4]); 44 
| } 45 | } 46 | 47 | pub fn write_vtk(vtk_string: *std.ArrayList(u8), kv_arr: std.StringArrayHashMap([]const f32), dims: []const u32) !void { 48 | try write_vtk_header(vtk_string, dims); 49 | 50 | var arr_size: usize = 1; 51 | for (dims) |d| { 52 | arr_size *= d; 53 | } 54 | 55 | var it = kv_arr.iterator(); 56 | while (it.next()) |entry| { 57 | std.debug.assert(entry.value_ptr.len == arr_size); 58 | try write_vtk_data(vtk_string, entry.key_ptr.*, entry.value_ptr.*); 59 | } 60 | } 61 | 62 | test "export array" { 63 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 64 | defer arena.deinit(); 65 | 66 | const allocator = arena.allocator(); 67 | 68 | // Define a new ArrayHashMap with keys of type []const u8 (strings) and values of type i32 (integers) 69 | var map = std.StringArrayHashMap([]const f32).init(allocator); 70 | defer map.deinit(); 71 | 72 | const dims: [2]u32 = .{ 2, 4 }; 73 | 74 | inline for (.{ "rho", "ux" }) |macr| { 75 | const my_arr: [8]f32 = .{ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 }; 76 | try map.put(macr, my_arr[0..my_arr.len]); 77 | } 78 | var data_wr = std.ArrayList(u8).init(allocator); 79 | 80 | try write_vtk(&data_wr, map, &dims); 81 | 82 | try utils.writeArrayListToFile("teste.vtk", data_wr.items); 83 | } 84 | -------------------------------------------------------------------------------- /src/opencl_hello_world.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const info = std.log.info; 3 | 4 | const c = @cImport({ 5 | @cDefine("CL_TARGET_OPENCL_VERSION", "110"); 6 | @cInclude("CL/cl.h"); 7 | }); 8 | 9 | const program_src = 10 | \\__kernel void square_array(__global int* input_array, __global int* output_array) { 11 | \\ int i = get_global_id(0); 12 | \\ int value = input_array[i]; 13 | \\ output_array[i] = value * value; 14 | \\} 15 | ; 16 | 17 | const CLError = error{ 18 | GetPlatformsFailed, 19 | GetPlatformInfoFailed, 20 | NoPlatformsFound, 21 | 
GetDevicesFailed, 22 | GetDeviceInfoFailed, 23 | NoDevicesFound, 24 | CreateContextFailed, 25 | CreateCommandQueueFailed, 26 | CreateProgramFailed, 27 | BuildProgramFailed, 28 | CreateKernelFailed, 29 | SetKernelArgFailed, 30 | EnqueueNDRangeKernel, 31 | CreateBufferFailed, 32 | EnqueueWriteBufferFailed, 33 | EnqueueReadBufferFailed, 34 | }; 35 | 36 | fn get_cl_device() CLError!c.cl_device_id { 37 | var platform_ids: [16]c.cl_platform_id = undefined; 38 | var platform_count: c.cl_uint = undefined; 39 | if (c.clGetPlatformIDs(platform_ids.len, &platform_ids, &platform_count) != c.CL_SUCCESS) { 40 | return CLError.GetPlatformsFailed; 41 | } 42 | info("{} cl platform(s) found:", .{@as(u32, platform_count)}); 43 | 44 | for (platform_ids[0..platform_count], 0..) |id, i| { 45 | var name: [1024]u8 = undefined; 46 | var name_len: usize = undefined; 47 | if (c.clGetPlatformInfo(id, c.CL_PLATFORM_NAME, name.len, &name, &name_len) != c.CL_SUCCESS) { 48 | return CLError.GetPlatformInfoFailed; 49 | } 50 | info(" platform {}: {s}", .{ i, name[0..name_len] }); 51 | } 52 | 53 | if (platform_count == 0) { 54 | return CLError.NoPlatformsFound; 55 | } 56 | 57 | info("choosing platform 0...", .{}); 58 | 59 | var device_ids: [16]c.cl_device_id = undefined; 60 | var device_count: c.cl_uint = undefined; 61 | if (c.clGetDeviceIDs(platform_ids[0], c.CL_DEVICE_TYPE_ALL, device_ids.len, &device_ids, &device_count) != c.CL_SUCCESS) { 62 | return CLError.GetDevicesFailed; 63 | } 64 | info("{} cl device(s) found on platform 0:", .{@as(u32, device_count)}); 65 | 66 | for (device_ids[0..device_count], 0..) 
|id, i| { 67 | var name: [1024]u8 = undefined; 68 | var name_len: usize = undefined; 69 | if (c.clGetDeviceInfo(id, c.CL_DEVICE_NAME, name.len, &name, &name_len) != c.CL_SUCCESS) { 70 | return CLError.GetDeviceInfoFailed; 71 | } 72 | info(" device {}: {s}", .{ i, name[0..name_len] }); 73 | } 74 | 75 | if (device_count == 0) { 76 | return CLError.NoDevicesFound; 77 | } 78 | 79 | info("choosing device 0...", .{}); 80 | 81 | return device_ids[0]; 82 | } 83 | 84 | fn run_test(device: c.cl_device_id) CLError!void { 85 | info("** running test **", .{}); 86 | 87 | const ctx = c.clCreateContext(null, 1, &device, null, null, null); // future: last arg is error code 88 | if (ctx == null) { 89 | return CLError.CreateContextFailed; 90 | } 91 | defer _ = c.clReleaseContext(ctx); 92 | 93 | const command_queue = c.clCreateCommandQueue(ctx, device, 0, null); // future: last arg is error code 94 | if (command_queue == null) { 95 | return CLError.CreateCommandQueueFailed; 96 | } 97 | defer { 98 | _ = c.clFlush(command_queue); 99 | _ = c.clFinish(command_queue); 100 | _ = c.clReleaseCommandQueue(command_queue); 101 | } 102 | 103 | var program_src_c: [*c]const u8 = program_src; 104 | const program = c.clCreateProgramWithSource(ctx, 1, &program_src_c, null, null); // future: last arg is error code 105 | if (program == null) { 106 | return CLError.CreateProgramFailed; 107 | } 108 | defer _ = c.clReleaseProgram(program); 109 | 110 | if (c.clBuildProgram(program, 1, &device, null, null, null) != c.CL_SUCCESS) { 111 | return CLError.BuildProgramFailed; 112 | } 113 | 114 | const kernel = c.clCreateKernel(program, "square_array", null); 115 | if (kernel == null) { 116 | return CLError.CreateKernelFailed; 117 | } 118 | defer _ = c.clReleaseKernel(kernel); 119 | 120 | // Create buffers 121 | var input_array = init: { 122 | var init_value: [1024]i32 = undefined; 123 | for (0..1024) |i| { 124 | init_value[i] = @as(i32, @intCast(i)); 125 | } 126 | break :init init_value; 127 | }; 128 | 129 | var 
input_buffer = c.clCreateBuffer(ctx, c.CL_MEM_READ_ONLY, input_array.len * @sizeOf(i32), null, null); 130 | if (input_buffer == null) { 131 | return CLError.CreateBufferFailed; 132 | } 133 | defer _ = c.clReleaseMemObject(input_buffer); 134 | 135 | var output_buffer = c.clCreateBuffer(ctx, c.CL_MEM_WRITE_ONLY, input_array.len * @sizeOf(i32), null, null); 136 | if (output_buffer == null) { 137 | return CLError.CreateBufferFailed; 138 | } 139 | defer _ = c.clReleaseMemObject(output_buffer); 140 | 141 | // Fill input buffer 142 | if (c.clEnqueueWriteBuffer(command_queue, input_buffer, c.CL_TRUE, 0, input_array.len * @sizeOf(i32), &input_array, 0, null, null) != c.CL_SUCCESS) { 143 | return CLError.EnqueueWriteBufferFailed; 144 | } 145 | 146 | // Execute kernel 147 | if (c.clSetKernelArg(kernel, 0, @sizeOf(c.cl_mem), @ptrCast(&input_buffer)) != c.CL_SUCCESS) { 148 | return CLError.SetKernelArgFailed; 149 | } 150 | if (c.clSetKernelArg(kernel, 1, @sizeOf(c.cl_mem), @ptrCast(&output_buffer)) != c.CL_SUCCESS) { 151 | return CLError.SetKernelArgFailed; 152 | } 153 | 154 | var global_item_size: usize = input_array.len; 155 | var local_item_size: usize = 64; 156 | if (c.clEnqueueNDRangeKernel(command_queue, kernel, 1, null, &global_item_size, &local_item_size, 0, null, null) != c.CL_SUCCESS) { 157 | return CLError.EnqueueNDRangeKernel; 158 | } 159 | 160 | var output_array: [1024]i32 = undefined; 161 | if (c.clEnqueueReadBuffer(command_queue, output_buffer, c.CL_TRUE, 0, output_array.len * @sizeOf(i32), &output_array, 0, null, null) != c.CL_SUCCESS) { 162 | return CLError.EnqueueReadBufferFailed; 163 | } 164 | 165 | info("** done **", .{}); 166 | 167 | info("** results **", .{}); 168 | 169 | for (output_array, 0..) 
|val, i| { 170 | if (i % 100 == 0) { 171 | info("{} ^ 2 = {}", .{ i, val }); 172 | } 173 | } 174 | 175 | info("** done, exiting **", .{}); 176 | } 177 | 178 | pub fn main() anyerror!void { 179 | const device = try get_cl_device(); 180 | try run_test(device); 181 | } 182 | -------------------------------------------------------------------------------- /src/ibm.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const utils = @import("utils.zig"); 3 | const defs = @import("defines.zig"); 4 | const fidx = @import("idx.zig"); 5 | const Allocator = std.mem.Allocator; 6 | 7 | const DIRAC_RADIUS = 1.5; 8 | 9 | fn dirac_delta(r: f32) f32 { 10 | const ar = if (r < 0) -r else r; 11 | if (ar >= 1.5) { 12 | return 0; 13 | } 14 | if (ar >= 0.5) { 15 | return 9.0 / 8.0 - 3.0 * ar / 2.0 + r * r / 2.0; 16 | } 17 | return 0.75 - ar * ar; 18 | } 19 | 20 | pub const NodeIBM = struct { 21 | const Self = @This(); 22 | 23 | pos: [defs.dim]f32, 24 | u_interp: [defs.dim]f32, 25 | rho_interp: f32, 26 | area: f32, 27 | f_spread: [defs.dim]f32, 28 | dirac_sum: f32, 29 | 30 | pub fn init() NodeIBM { 31 | const n: NodeIBM = .{ 32 | .pos = .{ 0, 0, 0 }, 33 | .u_interp = .{ 0, 0, 0 }, 34 | .f_spread = .{ 0, 0, 0 }, 35 | .rho_interp = 0, 36 | .area = 0, 37 | .dirac_sum = 0, 38 | }; 39 | return n; 40 | } 41 | 42 | pub fn interp(self: *Self, rho: []f32, u: [defs.dim][]f32) void { 43 | const npos = self.pos; 44 | const r = DIRAC_RADIUS; 45 | const min_pos: [3]usize = .{ @intFromFloat(@ceil(npos[0] - r)), @intFromFloat(@ceil(npos[1] - r)), @intFromFloat(@ceil(npos[2] - r)) }; 46 | const max_pos: [3]usize = .{ @intFromFloat(@floor(npos[0] + r)), @intFromFloat(@floor(npos[1] + r)), @intFromFloat(@floor(npos[2] + r)) }; 47 | 48 | self.rho_interp = 0; 49 | self.u_interp = .{ 0, 0, 0 }; 50 | self.dirac_sum = 0; 51 | for (min_pos[2]..max_pos[2] + 1) |z| { 52 | const pz: f32 = @floatFromInt(z); 53 | const rz = pz - npos[2]; 54 | const 
dirac_z = dirac_delta(rz); 55 | for (min_pos[1]..max_pos[1] + 1) |y| { 56 | const py: f32 = @floatFromInt(y); 57 | const ry = py - npos[1]; 58 | const dirac_y = dirac_delta(ry); 59 | for (min_pos[0]..max_pos[0] + 1) |x| { 60 | const px: f32 = @floatFromInt(x); 61 | const rx = px - npos[0]; 62 | const dirac_x = dirac_delta(rx); 63 | 64 | const lpos: [defs.dim]u32 = .{ @intCast(x), @intCast(y), @intCast(z) }; 65 | const idx = fidx.pos2idx(lpos); 66 | const rho_local = rho[idx]; 67 | const u_local = .{ u[0][idx], u[1][idx], u[2][idx] }; 68 | 69 | const dirac = dirac_x * dirac_y * dirac_z; 70 | 71 | self.rho_interp += rho_local * dirac; 72 | self.u_interp[0] += u_local[0] * dirac; 73 | self.u_interp[1] += u_local[1] * dirac; 74 | self.u_interp[2] += u_local[2] * dirac; 75 | self.dirac_sum += dirac; 76 | } 77 | } 78 | } 79 | } 80 | 81 | pub fn update_f_spread(self: *Self) void { 82 | self.f_spread[0] = 2 * self.rho_interp * (-self.u_interp[0]) * self.area * defs.forces_relaxation_factor; 83 | self.f_spread[1] = 2 * self.rho_interp * (-self.u_interp[1]) * self.area * defs.forces_relaxation_factor; 84 | self.f_spread[2] = 2 * self.rho_interp * (-self.u_interp[2]) * self.area * defs.forces_relaxation_factor; 85 | } 86 | 87 | pub fn spread(self: Self, force: [defs.dim][]f32) void { 88 | const npos = self.pos; 89 | const r = DIRAC_RADIUS; 90 | const min_pos: [3]usize = .{ @intFromFloat(@ceil(npos[0] - r)), @intFromFloat(@ceil(npos[1] - r)), @intFromFloat(@ceil(npos[2] - r)) }; 91 | const max_pos: [3]usize = .{ @intFromFloat(@floor(npos[0] + r)), @intFromFloat(@floor(npos[1] + r)), @intFromFloat(@floor(npos[2] + r)) }; 92 | 93 | for (min_pos[2]..max_pos[2] + 1) |z| { 94 | const pz: f32 = @floatFromInt(z); 95 | const rz = pz - npos[2]; 96 | const dirac_z = dirac_delta(rz); 97 | for (min_pos[1]..max_pos[1] + 1) |y| { 98 | const py: f32 = @floatFromInt(y); 99 | const ry = py - npos[1]; 100 | const dirac_y = dirac_delta(ry); 101 | for (min_pos[0]..max_pos[0] + 1) |x| { 102 | 
const px: f32 = @floatFromInt(x); 103 | const rx = px - npos[0]; 104 | const dirac_x = dirac_delta(rx); 105 | 106 | const lpos: [defs.dim]u32 = .{ @intCast(x), @intCast(y), @intCast(z) }; 107 | const idx = fidx.pos2idx(lpos); 108 | 109 | const dirac = dirac_x * dirac_y * dirac_z; 110 | 111 | force[0][idx] += self.f_spread[0] * dirac; 112 | force[1][idx] += self.f_spread[1] * dirac; 113 | force[2][idx] += self.f_spread[2] * dirac; 114 | } 115 | } 116 | } 117 | } 118 | }; 119 | 120 | pub const BodyIBM = struct { 121 | const Self = @This(); 122 | 123 | nodes: []NodeIBM, 124 | 125 | pub fn create_basic_body(alloc: Allocator) !BodyIBM { 126 | const xmin = 10; 127 | const xmax = defs.domain_size[0] - 10; 128 | const zmin = 10; 129 | const zmax = defs.domain_size[1] - 10; 130 | const y_use = 5; 131 | 132 | const n_nodes = (xmax - xmin) * (zmax - zmin); 133 | 134 | var body_nodes = try alloc.alloc(NodeIBM, n_nodes); 135 | for (0..n_nodes) |idx| { 136 | body_nodes[idx] = NodeIBM.init(); 137 | } 138 | 139 | var i: usize = 0; 140 | for (xmin..xmax) |x| { 141 | for (zmin..zmax) |z| { 142 | body_nodes[i].pos = .{ @floatFromInt(x), y_use, @floatFromInt(z) }; 143 | body_nodes[i].area = 1; 144 | i += 1; 145 | } 146 | } 147 | 148 | return .{ .nodes = body_nodes }; 149 | } 150 | 151 | pub fn export_csv(self: Self, allocator: Allocator, path: []const u8) !void { 152 | var data_wr = std.ArrayList(u8).init(allocator); 153 | try data_wr.appendSlice("x,y,z,rho,ux,uy,uz,fx,fy,fz\n"); 154 | for (self.nodes) |n| { 155 | try utils.appendFormatted(&data_wr, "{}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n", .{ 156 | n.pos[0], 157 | n.pos[1], 158 | n.pos[2], 159 | n.rho_interp, 160 | n.u_interp[0], 161 | n.u_interp[1], 162 | n.u_interp[2], 163 | n.f_spread[0], 164 | n.f_spread[1], 165 | n.f_spread[2], 166 | }); 167 | } 168 | try utils.writeArrayListToFile(path, data_wr.items); 169 | data_wr.clearAndFree(); 170 | } 171 | 172 | pub fn interpolate_spread(self: Self, rho: []f32, u: [defs.dim][]f32, 
force: [defs.dim][]f32) void { 173 | for (0..self.nodes.len) |idx| { 174 | self.nodes[idx].interp(rho, u); 175 | self.nodes[idx].update_f_spread(); 176 | self.nodes[idx].spread(force); 177 | } 178 | } 179 | 180 | pub fn run_ibm(self: *const Self, rho: []f32, u: [defs.dim][]f32, force: [defs.dim][]f32) void { 181 | self.interpolate_spread(rho, u, force); 182 | } 183 | }; 184 | 185 | test "create basic body" { 186 | const allocator = std.testing.allocator; 187 | 188 | const body = try BodyIBM.create_basic_body(allocator); 189 | defer allocator.free(body.nodes); 190 | for (body.nodes) |node| { 191 | try std.testing.expectEqual(node.area, 1); 192 | } 193 | } 194 | 195 | test "interpolate body" { 196 | const lbm = @import("lbm.zig"); 197 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 198 | defer arena.deinit(); 199 | 200 | const allocator = arena.allocator(); 201 | 202 | // const allocator = std.testing.allocator; 203 | 204 | const body = try BodyIBM.create_basic_body(allocator); 205 | defer allocator.free(body.nodes); 206 | 207 | const lbm_arrays = try lbm.allocate_arrs(&allocator); 208 | lbm_arrays.initialize(); 209 | 210 | body.interpolate_spread(lbm_arrays.rho, lbm_arrays.u, lbm_arrays.force_ibm); 211 | 212 | for (body.nodes) |node| { 213 | try std.testing.expectApproxEqAbs(1, node.dirac_sum, 0.01); 214 | } 215 | } 216 | 217 | test "spread body" { 218 | const lbm = @import("lbm.zig"); 219 | var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); 220 | defer arena.deinit(); 221 | 222 | const allocator = arena.allocator(); 223 | 224 | const body = try BodyIBM.create_basic_body(allocator); 225 | defer allocator.free(body.nodes); 226 | 227 | const lbm_arrays = try lbm.allocate_arrs(&allocator); 228 | lbm_arrays.initialize(); 229 | 230 | body.run_ibm(lbm_arrays.rho, lbm_arrays.u, lbm_arrays.force_ibm); 231 | } 232 | -------------------------------------------------------------------------------- /src/lbm.zig: 
--------------------------------------------------------------------------------
const std = @import("std");
const vtk = @import("vtk.zig");
const utils = @import("utils.zig");
const defs = @import("defines.zig");
const fidx = @import("idx.zig");
const ibm = @import("ibm.zig");
const Allocator = std.mem.Allocator;

/// Dot product of two fixed-size vectors of length `defs.dim`.
inline fn dot_prod(comptime T: type, x: *const [defs.dim]T, y: *const [defs.dim]T) T {
    var sum: T = 0;
    for (x, y) |i, j| {
        sum += i * j;
    }
    return sum;
}

/// Second-order (in velocity) equilibrium distribution for population `i`,
/// given density `rho` and macroscopic velocity `u`.
inline fn func_feq(rho: f32, u: [defs.dim]f32, comptime i: usize) f32 {
    var popDir: [defs.dim]f32 = undefined;
    inline for (0..defs.dim) |d| {
        popDir[d] = @floatFromInt(defs.pop_dir[i][d]);
    }
    const uc: f32 = dot_prod(f32, &u, &popDir);
    const uu: f32 = dot_prod(f32, &u, &u);

    return rho * defs.pop_weights[i] * (1 + uc / defs.cs2 + (uc * uc) / (2 * defs.cs2 * defs.cs2) - (uu) / (2 * defs.cs2));
}

/// Forcing source term for population `i` (Guo-style scheme), given the
/// macroscopic velocity `u` and body force `force`.
inline fn source_term(u: [defs.dim]f32, force: [defs.dim]f32, comptime i: usize) f32 {
    var popDir: [defs.dim]f32 = undefined;
    inline for (0..defs.dim) |d| {
        popDir[d] = @floatFromInt(defs.pop_dir[i][d]);
    }
    var si: f32 = 0;
    // NOTE(review): the Guo forcing prefactor is usually (1 - 1/(2*tau));
    // this computes (1 - 2/tau). Kept as-is to preserve behavior — confirm
    // against the intended forcing scheme.
    const mul_term = (1 - 2 / defs.tau) * defs.pop_weights[i];
    inline for (0..defs.dim) |alfa| {
        const cia = popDir[alfa];
        si += mul_term * cia / defs.cs2 * force[alfa];
        inline for (0..defs.dim) |beta| {
            const cib = popDir[beta];
            const ciab = cia * cib;
            const k_dirac: f32 = if (alfa == beta) 1 else 0;
            si += mul_term * force[alfa] * (u[beta] * (ciab - defs.cs2 * k_dirac) / (defs.cs2 * defs.cs2));
        }
    }
    return si;
}

test "func_eq const" {
    // At rest (u = 0, rho = 1) the equilibrium reduces to the lattice weights.
    const assert = std.debug.assert;
    const rho: f32 = 1;
    const u: [defs.dim]f32 = .{0} ** defs.dim;
    inline for (0..defs.n_pop) |i| {
        const feq = func_feq(rho, u, i);
        assert(feq == defs.pop_weights[i]);
    }
}

/// Computes the macroscopic density `rho` and velocity `u` from the
/// populations `pop`, applying the half-force correction to the momentum
/// (u = (sum c_i f_i + F/2) / rho).
pub fn macroscopics(idx: usize, pop: *[defs.n_pop]f32, rho: *f32, u: *[defs.dim]f32, force: *[defs.dim]f32) void {
    _ = idx;
    rho.* = 0;
    inline for (pop) |p| {
        rho.* += p;
    }
    u.* = .{0} ** defs.dim;
    inline for (0..defs.n_pop) |j| {
        inline for (0..defs.dim) |d| {
            // Directions with zero component contribute nothing to momentum.
            if (defs.pop_dir[j][d] == 0) {
                continue;
            }
            const fdir: f32 = @floatFromInt(defs.pop_dir[j][d]);
            u.*[d] += pop[j] * fdir;
        }
    }
    inline for (0..defs.dim) |d| {
        u.*[d] += force.*[d] / 2;
    }

    inline for (0..defs.dim) |d| {
        u.*[d] /= rho.*;
    }
}

/// BGK collision with forcing: relaxes `pop` towards equilibrium with
/// relaxation time `defs.tau` and applies the source term in place.
pub fn collision(idx: usize, pop: *[defs.n_pop]f32, rho: f32, u: [defs.dim]f32, force: [defs.dim]f32) void {
    _ = idx;

    inline for (0..defs.n_pop) |i| {
        const feq = func_feq(rho, u, i);
        const si = source_term(u, force, i);
        const f_coll = pop[i] - (pop[i] - feq) / defs.tau - si;
        pop[i] = f_coll;
    }
}

/// Pull streaming: fills `pop` for the node at flat index `idx` by reading,
/// for each direction, the population at the upstream neighbor in
/// `popStream_arr`, with periodic wrapping on every axis.
pub fn streaming(idx: usize, pop: *[defs.n_pop]f32, popStream_arr: []f32) void {
    const pos = fidx.idx2pos(idx);
    for (0..defs.n_pop) |i| {
        var popDir: [defs.dim]i32 = undefined;
        inline for (0..defs.dim) |d| {
            popDir[d] = @intCast(defs.pop_dir[i][d]);
        }
        var posFrom: [defs.dim]i32 = undefined;
        inline for (0..defs.dim) |d| {
            posFrom[d] = @intCast(pos[d]);
            // posFrom = (pos - dir) mod L; the extra -L is absorbed by @mod,
            // which always yields a non-negative result for positive L.
            posFrom[d] -= popDir[d] + defs.domain_size[d];
            posFrom[d] = @mod(posFrom[d], defs.domain_size[d]);
        }
        var posFromU: [defs.dim]u32 = undefined;
        inline for (0..defs.dim) |d| {
            posFromU[d] = @intCast(posFrom[d]);
        }
        pop[i] = popStream_arr[fidx.idxPop(posFromU, @intCast(i))];
    }
}

/// Owning view of all simulation fields. Populations are double-buffered
/// (popA/popB, swapped every time step); `u` and `force_ibm` are stored as
/// one array per spatial component (structure-of-arrays).
const LBMArrays = struct {
    const Self = @This();

    popA: []f32,
    popB: []f32,
    u: [defs.dim][]f32,
    rho: []f32,
    force_ibm: [defs.dim][]f32,

    /// Sets the initial condition: rho = 1 everywhere, a simple shear
    /// profile ux(y), zero IBM forces, and both population buffers at
    /// equilibrium for that state.
    pub fn initialize(self: *const Self) void {
        for (0..defs.n_nodes) |idx| {
            const pos = fidx.idx2pos(idx);

            self.rho[idx] = 1;
            var posF: [defs.dim]f32 = undefined;
            var posNorm: [defs.dim]f32 = undefined;
            inline for (0..defs.dim) |d| {
                posF[d] = @floatFromInt(pos[d]);
                posNorm[d] = posF[d] / defs.domain_size[d];
            }

            const velNorm = 0.01;

            self.u[0][idx] = velNorm * ((1 - posNorm[1]) * posNorm[1]);
            self.u[1][idx] = 0;
            if (defs.dim == 3) {
                self.u[2][idx] = 0;
            }

            var u: [defs.dim]f32 = undefined;
            inline for (0..defs.dim) |d| {
                u[d] = self.u[d][idx];
                self.force_ibm[d][idx] = 0;
            }

            inline for (0..defs.n_pop) |j| {
                self.popA[fidx.idxPop(pos, j)] = func_feq(self.rho[idx], u, j);
                self.popB[fidx.idxPop(pos, j)] = func_feq(self.rho[idx], u, j);
            }
        }
    }

    /// Recomputes rho and u for every node from `pop_arr`, including the
    /// global body force and the accumulated IBM force in the half-force
    /// momentum correction.
    pub fn update_macroscopics(self: Self, pop_arr: []f32) void {
        for (0..defs.n_nodes) |idx| {
            var pop: [defs.n_pop]f32 = undefined;
            const pos = fidx.idx2pos(idx);
            inline for (0..defs.n_pop) |j| {
                pop[j] = pop_arr[fidx.idxPop(pos, @intCast(j))];
            }
            var rho: f32 = 0;
            var u: [defs.dim]f32 = .{0} ** defs.dim;
            var force: [defs.dim]f32 = .{0} ** defs.dim;
            inline for (0..defs.dim) |d| {
                force[d] += defs.global_force[d];
                force[d] += self.force_ibm[d][idx];
            }
            macroscopics(idx, &pop, &rho, &u, &force);
            self.rho[idx] = rho;
            inline for (0..defs.dim) |d| {
                self.u[d][idx] = u[d];
            }
        }
    }

    /// Writes rho, velocity components and IBM force components to
    /// `output/macrsNNNNN.vtk` for the given time step. `allocator` is used
    /// only for the temporary name map and the serialized VTK buffer.
    pub fn export_arrays(self: *const Self, allocator: std.mem.Allocator, time_step: u32) !void {
        var buff: [50]u8 = undefined;
        const buff_slice = buff[0..];

        var map = std.StringArrayHashMap([]const f32).init(allocator);
        defer map.deinit();
        var data_wr = std.ArrayList(u8).init(allocator);
        defer data_wr.deinit();

        try map.put("rho", @field(self, "rho"));
        const u_names: [defs.dim][]const u8 = if (defs.dim == 2) .{ "ux", "uy" } else .{ "ux", "uy", "uz" };
        inline for (0..defs.dim, u_names) |d, macr_name| {
            try map.put(macr_name, self.u[d]);
        }
        const f_names: [defs.dim][]const u8 = if (defs.dim == 2) .{ "force_IBMx", "force_IBMy" } else .{ "force_IBMx", "force_IBMy", "force_IBMz" };
        inline for (0..defs.dim, f_names) |d, macr_name| {
            try map.put(macr_name, self.force_ibm[d]);
        }

        const filename_use = try std.fmt.bufPrint(buff_slice, "output/macrs{d:0>5}.vtk", .{time_step});
        try vtk.write_vtk(&data_wr, map, &defs.domain_size);
        try utils.writeArrayListToFile(filename_use, data_wr.items);
    }
};

/// Runs the immersed-boundary step for every body, accumulating forces into
/// `lbm_arr.force_ibm`. No-op when there are no bodies.
pub fn run_IBM_iteration(bodies: []const ibm.BodyIBM, lbm_arr: LBMArrays, time_step: u32) void {
    _ = time_step;
    if (bodies.len == 0) {
        return;
    }

    for (bodies) |b| {
        b.run_ibm(lbm_arr.rho, lbm_arr.u, lbm_arr.force_ibm);
    }
}

/// One full LBM time step: stream (pull) from the current buffer, compute
/// macroscopics, collide, and write post-collision populations into the
/// other buffer. Buffers alternate with the parity of `time_step`.
pub fn run_time_step(lbm_arr: LBMArrays, time_step: u32) void {
    const popMain_arr = if (time_step % 2 == 0) lbm_arr.popA else lbm_arr.popB;
    const popAux_arr = if (time_step % 2 == 1) lbm_arr.popA else lbm_arr.popB;

    for (0..defs.n_nodes) |idx| {
        var pop: [defs.n_pop]f32 = undefined;
        streaming(idx, &pop, popMain_arr);

        var rho: f32 = 0;
        var u: [defs.dim]f32 = .{0} ** defs.dim;
        var force: [defs.dim]f32 = .{0} ** defs.dim;
        var reset_forces = false;
        inline for (0..defs.dim) |d| {
            force[d] += defs.global_force[d];
            // Apply the IBM force accumulated by run_IBM_iteration.
            // (Previously hard-coded to 0, which made this branch dead and
            // meant IBM forces never fed back into the fluid.)
            const fibm = lbm_arr.force_ibm[d][idx];
            if (fibm != 0) {
                force[d] += fibm;
                reset_forces = true;
            }
        }
        macroscopics(idx, &pop, &rho, &u, &force);
        collision(idx, &pop, rho, u, force);

        // Update populations
        const pos = fidx.idx2pos(idx);
        inline for (0..defs.n_pop) |j| {
            popAux_arr[fidx.idxPop(pos, @intCast(j))] = pop[j];
        }

        // Update and save post collision macrs
        macroscopics(idx, &pop, &rho, &u, &force);
        lbm_arr.rho[idx] = rho;
        inline for (0..defs.dim) |d| {
            lbm_arr.u[d][idx] = u[d];
        }
        // IBM forces are consumed each step; clear them so the next
        // IBM iteration starts from zero.
        if (reset_forces) {
            inline for (0..defs.dim) |d| {
                lbm_arr.force_ibm[d][idx] = 0;
            }
        }
    }
}

/// Allocates every simulation field. Caller owns the returned arrays and
/// must free them with the same allocator.
pub fn allocate_arrs(allocator: *const Allocator) !LBMArrays {
    const popA: []f32 = try allocator.alloc(f32, defs.n_nodes * defs.n_pop);
    const popB: []f32 = try allocator.alloc(f32, defs.n_nodes * defs.n_pop);
    const rho: []f32 = try allocator.alloc(f32, defs.n_nodes);
    var u: [defs.dim][]f32 = undefined;
    var force_ibm: [defs.dim][]f32 = undefined;
    inline for (0..defs.dim) |d| {
        u[d] = try allocator.alloc(f32, defs.n_nodes);
        force_ibm[d] = try allocator.alloc(f32, defs.n_nodes);
    }

    return LBMArrays{ .popA = popA, .popB = popB, .rho = rho, .u = u, .force_ibm = force_ibm };
}
--------------------------------------------------------------------------------
/src/cl.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const info = std.log.info;

const c = @cImport({
    @cDefine("CL_TARGET_OPENCL_VERSION", "110");
    @cInclude("CL/cl.h");
});

const CLError = error{
    GetPlatformsFailed,
    GetPlatformInfoFailed,
    NoPlatformsFound,
    GetDevicesFailed,
    GetDeviceInfoFailed,
    NoDevicesFound,
    CreateContextFailed,
    CreateCommandQueueFailed,
    CreateProgramFailed,
    BuildProgramFailed,
    FreeProgramFailed,
    CreateKernelFailed,
    FreeKernelFailed,
    SetKernelArgFailed,
    EnqueueNDRangeKernel,
    CreateBufferFailed,
    EnqueueWriteBufferFailed,
    EnqueueReadBufferFailed,
};

/// Enumerates OpenCL platforms and devices (logging their names) and
/// returns device 0 of platform 0.
pub fn cl_get_device() CLError!c.cl_device_id {
    var platform_ids: [16]c.cl_platform_id = undefined;
    var platform_count: c.cl_uint = undefined;
    if (c.clGetPlatformIDs(platform_ids.len, &platform_ids, &platform_count) != c.CL_SUCCESS) {
        return CLError.GetPlatformsFailed;
    }
    info("{} cl platform(s) found:", .{@as(u32, platform_count)});

    for (platform_ids[0..platform_count], 0..) |id, i| {
        var name: [1024]u8 = undefined;
        var name_len: usize = undefined;
        if (c.clGetPlatformInfo(id, c.CL_PLATFORM_NAME, name.len, &name, &name_len) != c.CL_SUCCESS) {
            return CLError.GetPlatformInfoFailed;
        }
        info("  platform {}: {s}", .{ i, name[0..name_len] });
    }

    if (platform_count == 0) {
        return CLError.NoPlatformsFound;
    }

    info("choosing platform 0...", .{});

    var device_ids: [16]c.cl_device_id = undefined;
    var device_count: c.cl_uint = undefined;
    if (c.clGetDeviceIDs(platform_ids[0], c.CL_DEVICE_TYPE_ALL, device_ids.len, &device_ids, &device_count) != c.CL_SUCCESS) {
        return CLError.GetDevicesFailed;
    }
    info("{} cl device(s) found on platform 0:", .{@as(u32, device_count)});

    for (device_ids[0..device_count], 0..) |id, i| {
        var name: [1024]u8 = undefined;
        var name_len: usize = undefined;
        if (c.clGetDeviceInfo(id, c.CL_DEVICE_NAME, name.len, &name, &name_len) != c.CL_SUCCESS) {
            return CLError.GetDeviceInfoFailed;
        }
        info("  device {}: {s}", .{ i, name[0..name_len] });
    }

    if (device_count == 0) {
        return CLError.NoDevicesFound;
    }

    info("choosing device 0...", .{});

    return device_ids[0];
}

/// RAII-style wrapper around a cl_mem device buffer of `size` bytes.
pub const CLBuffer = struct {
    const Self = @This();

    ctx: c.cl_context,
    d_buff: c.cl_mem,
    size: usize,

    /// Creates a read/write device buffer of `size` bytes in `ctx`.
    pub fn init(size: usize, ctx: c.cl_context) CLError!CLBuffer {
        const input_buffer = c.clCreateBuffer(ctx, c.CL_MEM_READ_WRITE, size, null, null);
        if (input_buffer == null) {
            return CLError.CreateBufferFailed;
        }
        return .{ .ctx = ctx, .d_buff = input_buffer.?, .size = size };
    }

    /// Releases the device buffer. Deallocation must not fail, so errors
    /// are only logged.
    pub fn free(self: Self) void {
        if (c.clReleaseMemObject(self.d_buff) != c.CL_SUCCESS) {
            std.log.err("Error on buffer free. {any}", .{self});
        }
    }

    /// Blocking device-to-host copy of the whole buffer into `h_buff`
    /// (must point to at least `self.size` bytes).
    pub fn read(self: Self, h_buff: ?*anyopaque, cmd_queue: CLQueue) CLError!void {
        if (c.clEnqueueReadBuffer(cmd_queue.queue, self.d_buff, c.CL_TRUE, 0, self.size, h_buff, 0, null, null) != c.CL_SUCCESS) {
            return CLError.EnqueueReadBufferFailed;
        }
    }

    /// Blocking host-to-device copy of the whole buffer from `h_buff`
    /// (must point to at least `self.size` bytes).
    pub fn write(self: Self, h_buff: ?*const anyopaque, cmd_queue: CLQueue) CLError!void {
        if (c.clEnqueueWriteBuffer(cmd_queue.queue, self.d_buff, c.CL_TRUE, 0, self.size, h_buff, 0, null, null) != c.CL_SUCCESS) {
            return CLError.EnqueueWriteBufferFailed;
        }
    }
};

/// Wrapper around an in-order cl_command_queue.
pub const CLQueue = struct {
    const Self = @This();

    queue: c.cl_command_queue,

    pub fn init(ctx: c.cl_context, device: c.cl_device_id) CLError!Self {
        const command_queue = c.clCreateCommandQueue(ctx, device, 0, null); // future: last arg is error code
        if (command_queue == null) {
            return CLError.CreateCommandQueueFailed;
        }
        return .{ .queue = command_queue };
    }

    /// Drains pending work, then releases the queue.
    pub fn free(self: Self) void {
        _ = c.clFlush(self.queue);
        _ = c.clFinish(self.queue);
        _ = c.clReleaseCommandQueue(self.queue);
    }
};

test "test OpenCL memory buffer" {
    const device = try cl_get_device();
    const ctx = c.clCreateContext(null, 1, &device, null, null, null); // future: last arg is error code
    if (ctx == null) {
        return CLError.CreateContextFailed;
    }
    defer _ = c.clReleaseContext(ctx);

    const hbuff_write: [3]u8 = .{ 1, 2, 3 };
    var hbuff_read: [3]u8 = undefined;

    const queue = try CLQueue.init(ctx, device);
    var dbuff = try CLBuffer.init(3, ctx);
    defer {
        dbuff.free();
        queue.free();
    }

    try dbuff.write(&hbuff_write, queue);
    try dbuff.read(&hbuff_read, queue);
    _ = c.clFlush(queue.queue);
    _ = c.clFinish(queue.queue);

    try std.testing.expectEqual(1, hbuff_read[0]);
    try std.testing.expectEqual(2, hbuff_read[1]);
    try std.testing.expectEqual(3, hbuff_read[2]);
}

/// Wrapper around a built cl_program.
pub const CLProgram = struct {
    const Self = @This();

    program: c.cl_program,

    /// Creates and builds a program from `program_src_c` for `device`.
    /// The source need not be null-terminated.
    pub fn init(ctx: c.cl_context, device: c.cl_device_id, program_src_c: []const u8) CLError!Self {
        // Pass the source length explicitly: with lengths == null,
        // clCreateProgramWithSource requires a null-terminated string,
        // which a Zig slice does not guarantee.
        const src_ptr: [*c]const u8 = @ptrCast(program_src_c.ptr);
        const src_len: usize = program_src_c.len;
        const program = c.clCreateProgramWithSource(ctx, 1, &src_ptr, &src_len, null); // future: last arg is error code
        if (program == null) {
            return CLError.CreateProgramFailed;
        }
        if (c.clBuildProgram(program, 1, &device, null, null, null) != c.CL_SUCCESS) {
            return CLError.BuildProgramFailed;
        }
        return .{ .program = program };
    }

    pub fn free(self: Self) void {
        if (c.clReleaseProgram(self.program) != c.CL_SUCCESS) {
            std.log.err("Error on program free. {any}", .{self});
        }
    }
};

test "test OpenCL program" {
    const program_src =
        \\__kernel void square_array(__global int* input_array, __global int* output_array) {
        \\    int i = get_global_id(0);
        \\    int value = input_array[i];
        \\    output_array[i] = value * value;
        \\}
    ;

    const device = try cl_get_device();
    const ctx = c.clCreateContext(null, 1, &device, null, null, null); // future: last arg is error code
    if (ctx == null) {
        return CLError.CreateContextFailed;
    }
    defer _ = c.clReleaseContext(ctx);

    const program = try CLProgram.init(ctx, device, program_src);
    program.free();
}

/// Wrapper around a cl_kernel.
pub const CLKernel = struct {
    const Self = @This();

    kernel: c.cl_kernel,

    /// Looks up `kernel_name` in the built program. clCreateKernel requires
    /// a C string, so the name is a sentinel-terminated slice (string
    /// literals coerce to it automatically).
    pub fn init(program: CLProgram, kernel_name: [:0]const u8) CLError!CLKernel {
        const kernel = c.clCreateKernel(program.program, kernel_name.ptr, null);
        if (kernel == null) {
            return CLError.CreateKernelFailed;
        }
        return .{ .kernel = kernel };
    }

    pub fn free(self: Self) void {
        if (c.clReleaseKernel(self.kernel) != c.CL_SUCCESS) {
            std.log.err("Error on kernel free. {any}", .{self});
        }
    }
};

/// One kernel invocation: binds `args` (ints, floats or buffers) by
/// position, then enqueues the ND-range.
pub const CLKernelCall = struct {
    const Self = @This();
    const ArgType = union(enum) {
        int: i32,
        float: f32,
        buffer: CLBuffer,
    };

    kernel: CLKernel,
    queue: CLQueue,

    args: []ArgType,
    work_dim: u32,
    global_work_size: [3]usize,
    local_work_size: [3]usize,

    pub fn call(self: Self) CLError!void {
        for (self.args, 0..) |arg, i| {
            switch (arg) {
                .int => |v| {
                    if (c.clSetKernelArg(self.kernel.kernel, @intCast(i), @sizeOf(i32), &v) != c.CL_SUCCESS) {
                        return CLError.SetKernelArgFailed;
                    }
                },
                .float => |v| {
                    if (c.clSetKernelArg(self.kernel.kernel, @intCast(i), @sizeOf(f32), &v) != c.CL_SUCCESS) {
                        return CLError.SetKernelArgFailed;
                    }
                },
                .buffer => |v| {
                    // Buffer args are passed as the cl_mem handle itself.
                    if (c.clSetKernelArg(self.kernel.kernel, @intCast(i), @sizeOf(c.cl_mem), @ptrCast(&v.d_buff)) != c.CL_SUCCESS) {
                        return CLError.SetKernelArgFailed;
                    }
                },
            }
        }
        if (c.clEnqueueNDRangeKernel(self.queue.queue, self.kernel.kernel, self.work_dim, null, &self.global_work_size, &self.local_work_size, 0, null, null) != c.CL_SUCCESS) {
            return CLError.EnqueueNDRangeKernel;
        }
    }
};

test "test OpenCL kernel" {
    const program_src =
        \\__kernel void square_array(__global int* input_array, __global int* output_array) {
        \\    int i = get_global_id(0);
        \\    int value = input_array[i];
        \\    output_array[i] = value * value;
        \\}
    ;

    const device = try cl_get_device();
    const ctx = c.clCreateContext(null, 1, &device, null, null, null); // future: last arg is error code
    if (ctx == null) {
        return CLError.CreateContextFailed;
    }
    defer _ = c.clReleaseContext(ctx);

    const program = try CLProgram.init(ctx, device, program_src);
    defer program.free();
    const kernel = try CLKernel.init(program, "square_array");
    defer kernel.free();
}

test "test OpenCL kernel call" {
    const program_src =
        \\__kernel void square_array(__global int* input_array, __global int* output_array) {
        \\    int i = get_global_id(0);
        \\    int value = input_array[i];
        \\    output_array[i] = value * value;
        \\}
    ;

    const device = try cl_get_device();
    const ctx = c.clCreateContext(null, 1, &device, null, null, null); // future: last arg is error code
    if (ctx == null) {
        return CLError.CreateContextFailed;
    }
    defer _ = c.clReleaseContext(ctx);
    const queue = try CLQueue.init(ctx, device);

    const program = try CLProgram.init(ctx, device, program_src);
    defer program.free();
    const kernel = try CLKernel.init(program, "square_array");
    defer kernel.free();

    // Create buffers
    var input_array = init: {
        var init_value: [1024]i32 = undefined;
        for (0..1024) |i| {
            init_value[i] = @as(i32, @intCast(i));
        }
        break :init init_value;
    };
    const input_buffer = try CLBuffer.init(1024 * @sizeOf(i32), ctx);
    defer input_buffer.free();
    try input_buffer.write(&input_array, queue);
    const output_buffer = try CLBuffer.init(1024 * @sizeOf(i32), ctx);
    defer output_buffer.free();

    const ArgType = CLKernelCall.ArgType;
    const args: [2]ArgType = .{
        ArgType{ .buffer = input_buffer },
        ArgType{ .buffer = output_buffer },
    };

    const kernel_call: CLKernelCall = .{
        .kernel = kernel,
        .queue = queue,
        .args = @ptrCast(@constCast(&args)),
        .global_work_size = .{ input_array.len, 0, 0 },
        .local_work_size = .{ 64, 0, 0 },
        .work_dim = 1,
    };
    try kernel_call.call();

    var output_array: [1024]i32 = undefined;
    try output_buffer.read(&output_array, queue);
    for (output_array, 0..) |val, i| {
        if (i % 100 == 0) {
            try std.testing.expect(val == (i * i));
            info("{} ^ 2 = {}", .{ i, val });
        }
    }
}
--------------------------------------------------------------------------------