├── .gitignore
└── src
    ├── main_device.zig
    ├── hip.zig
    ├── main.zig
    └── cuda.zig
/.gitignore:
--------------------------------------------------------------------------------
zig-out
zig-cache
.zig-cache
--------------------------------------------------------------------------------
/src/main_device.zig:
--------------------------------------------------------------------------------
//! Entry-point wrapper for device code. This file contains everything that
//! needs to be exported for the device binary.

const std = @import("std");

// Custom panic handler, to avoid pulling in stack traces etc. on this target.
pub fn panic(msg: []const u8, stack_trace: ?*std.builtin.StackTrace, _: ?usize) noreturn {
    _ = msg;
    _ = stack_trace;
    unreachable;
}

comptime {
    @export(&@import("main.zig").shallenge, .{ .name = "shallenge" });
}
--------------------------------------------------------------------------------
/src/hip.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const assert = std.debug.assert;

pub const c = @cImport({
    @cDefine("__HIP_PLATFORM_AMD__", "1");
    @cInclude("hip/hip_runtime.h");
});

pub fn unexpected(err: c_uint) noreturn {
    std.log.err("unexpected hip result: {s}", .{c.hipGetErrorName(err)});
    @panic("unexpected HIP error");
}

pub fn init() void {
    // The HIP runtime initializes itself lazily on first use, so there is nothing to do here.
}

pub fn malloc(comptime T: type, n: usize) ![]T {
    var result: [*]T = undefined;
    return switch (c.hipMalloc(
        @ptrCast(&result),
        n * @sizeOf(T),
    )) {
        c.hipSuccess => result[0..n],
        c.hipErrorMemoryAllocation => error.OutOfMemory,
        else => |err| unexpected(err),
    };
}

pub fn free(ptr: anytype) void {
    const actual_ptr = switch (@typeInfo(@TypeOf(ptr)).pointer.size) {
        .slice => ptr.ptr,
        else => ptr,
    };

    assert(c.hipFree(actual_ptr) == c.hipSuccess);
}

const CopyDir = enum {
    host_to_device,
    device_to_host,
    host_to_host,
    device_to_device,

    fn toC(self: CopyDir) c_uint {
        return switch (self) {
            .host_to_device => c.hipMemcpyHostToDevice,
            .device_to_host => c.hipMemcpyDeviceToHost,
            .host_to_host => c.hipMemcpyHostToHost,
            .device_to_device => c.hipMemcpyDeviceToDevice,
        };
    }
};

pub fn memcpy(comptime T: type, dst: []T, src: []const T, direction: CopyDir) void {
    assert(dst.len >= src.len);
    switch (c.hipMemcpy(
        dst.ptr,
        src.ptr,
        @sizeOf(T) * src.len,
        direction.toC(),
    )) {
        c.hipSuccess => {},
        else => |err| unexpected(err),
    }
}
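
// Illustrative only (not compiled): a minimal sketch of the allocate/copy/free
// round trip these wrappers are used for. Assumes a HIP device is present and
// that the calling function can propagate errors with `try`.
//
//     const d_buf = try malloc(u32, 4);
//     defer free(d_buf);
//     var host = [_]u32{ 1, 2, 3, 4 };
//     memcpy(u32, d_buf, &host, .host_to_device);
//     memcpy(u32, &host, d_buf, .device_to_host);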

pub const Module = struct {
    handle: c.hipModule_t,

    pub fn loadData(image: *const anyopaque) !Module {
        var module: Module = undefined;
        return switch (c.hipModuleLoadData(&module.handle, image)) {
            c.hipSuccess => module,
            c.hipErrorOutOfMemory => error.OutOfMemory,
            c.hipErrorSharedObjectInitFailed => error.SharedObjectInitFailed,
            else => |err| unexpected(err),
        };
    }

    pub fn unload(self: Module) void {
        assert(c.hipModuleUnload(self.handle) == c.hipSuccess);
    }

    pub fn getFunction(self: Module, name: [*:0]const u8) !Function {
        var function: Function = undefined;
        return switch (c.hipModuleGetFunction(&function.handle, self.handle, name)) {
            c.hipSuccess => function,
            c.hipErrorNotFound => error.NotFound,
            else => |err| unexpected(err),
        };
    }
};

pub const Dim3 = struct {
    x: u32 = 1,
    y: u32 = 1,
    z: u32 = 1,
};

pub const LaunchConfig = struct {
    grid_dim: Dim3 = .{},
    block_dim: Dim3 = .{},
    shared_mem_per_block: u32 = 0,
    stream: c.hipStream_t = null,
};

pub const Function = struct {
    handle: c.hipFunction_t,

    pub fn launch(
        self: Function,
        cfg: LaunchConfig,
        args: anytype,
    ) void {
        var args_buf: [args.len]?*anyopaque = undefined;
        inline for (&args_buf, 0..) |*arg_buf, i| {
            arg_buf.* = @constCast(@ptrCast(&args[i]));
        }

        switch (c.hipModuleLaunchKernel(
            self.handle,
            cfg.grid_dim.x,
            cfg.grid_dim.y,
            cfg.grid_dim.z,
            cfg.block_dim.x,
            cfg.block_dim.y,
            cfg.block_dim.z,
            cfg.shared_mem_per_block,
            cfg.stream,
            &args_buf,
            null,
        )) {
            c.hipSuccess => {},
            else => |err| unexpected(err),
        }
    }
};

pub const Event = struct {
    handle: c.hipEvent_t,

    pub fn create() Event {
        var event: Event = undefined;
        return switch (c.hipEventCreate(&event.handle)) {
            c.hipSuccess => event,
            else => |err| unexpected(err),
        };
    }

    pub fn destroy(self: Event) void {
        assert(c.hipEventDestroy(self.handle) == c.hipSuccess);
    }

    pub fn record(self: Event, stream: c.hipStream_t) void {
        switch (c.hipEventRecord(self.handle, stream)) {
            c.hipSuccess => {},
            else => |err| unexpected(err),
        }
    }

    pub fn synchronize(self: Event) void {
        switch (c.hipEventSynchronize(self.handle)) {
            c.hipSuccess => {},
            else => |err| unexpected(err),
        }
    }

    pub fn elapsed(start: Event, stop: Event) f32 {
        var result: f32 = undefined;
        return switch (c.hipEventElapsedTime(&result, start.handle, stop.handle)) {
            c.hipSuccess => result,
            else => |err| unexpected(err),
        };
    }
};
--------------------------------------------------------------------------------
/src/main.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const Hash = std.crypto.hash.sha2.Sha256;
const assert = std.debug.assert;

const build_options = @import("build_options");

pub const std_options: std.Options = .{
    .log_level = .info,
};

const hip = switch (build_options.gpu_runtime) {
    .hip => @import("hip.zig"),
    .cuda => @import("cuda.zig"),
};

const block_size = 256;
const grid_size = 65536;
const items_per_thread = 256;

const Seed = struct {
    bid: u16,
    tid: u8,
    item: u8,
    epoch: u32,

    inline fn zerosAndSeedBits(self: Seed, zeros: u32) ZerosAndSeedBits {
        return .{
            .zeros = zeros,
            .bid = self.bid,
            .tid = self.tid,
            .item = self.item,
        };
    }
};

const ZerosAndSeedBits = packed struct(u64) {
    bid: u16,
    tid: u8,
    item: u8,
    // Zeros at the end so that this struct can be compared using max().
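    // (In a packed struct the first field occupies the least significant bits,
    // so `zeros` lands in bits 32..63 of the backing u64. Bitcasting to u64 and
    // taking the maximum therefore compares the zero count first and only uses
    // the seed bits to break ties.)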
    zeros: u32,

    inline fn seed(self: ZerosAndSeedBits, epoch: u32) Seed {
        return .{
            .epoch = epoch,
            .bid = self.bid,
            .tid = self.tid,
            .item = self.item,
        };
    }
};

inline fn string(seed: Seed) [36]u8 {
    const elo = (seed.epoch & 0x0F0F_0F0F) + 0x6161_6161;
    const ehi = ((seed.epoch & 0xF0F0_F0F0) >> 4) + 0x6161_6161;

    // Extracting the nibbles one by one seems to be faster here than the
    // whole-word mask-and-add trick used for the epoch above.
    var nonce: [8]u8 = undefined;
    nonce[0] = @truncate('a' + ((seed.bid >> 12) & 0xF));
    nonce[1] = @truncate('a' + ((seed.bid >> 8) & 0xF));
    nonce[2] = @truncate('a' + ((seed.bid >> 4) & 0xF));
    nonce[3] = @truncate('a' + (seed.bid & 0xF));
    nonce[4] = @truncate('a' + ((seed.tid >> 4) & 0xF));
    nonce[5] = @truncate('a' + (seed.tid & 0xF));
    nonce[6] = @truncate('a' + ((seed.item >> 4) & 0xF));
    nonce[7] = @truncate('a' + (seed.item & 0xF));
    return ("snektron/zig+amdgcn+" ++ std.mem.toBytes(ehi) ++ std.mem.toBytes(elo) ++ std.mem.toBytes(nonce)).*;
}

pub fn shallenge(
    epoch: *const addrspace(.global) u32,
    out: *addrspace(.global) u64,
) callconv(.kernel) void {
    const e = epoch.*;
    const bid = @workGroupId(0);
    const tid = @workItemId(0);

    var max: u64 = 0;
    for (0..items_per_thread) |i| {
        const seed: Seed = .{
            .bid = @truncate(bid),
            .tid = @truncate(tid),
            .item = @truncate(i),
            .epoch = e,
        };

        const str = string(seed);

        var digest: [Hash.digest_length]u8 align(8) = undefined;
        Hash.hash(&str, &digest, .{});

        const init_word = @byteSwap(std.mem.bytesAsValue(u64, digest[0..8]).*);
        const zeros = @clz(init_word);
        const zeros_and_seed_bits: u64 = @bitCast(seed.zerosAndSeedBits(zeros));

        max = @max(max, zeros_and_seed_bits);
    }

    _ = @atomicRmw(u64, out, .Max, max, .monotonic);
}

pub fn main() !void {
    hip.init();

    const d_out = try hip.malloc(u64, 1);
    defer hip.free(d_out);

    const d_epoch = try hip.malloc(u32, 1);
    defer hip.free(d_epoch);

    var zero: u64 = 0;
    hip.memcpy(u64, d_out, (&zero)[0..1], .host_to_device);

    std.log.debug("loading module", .{});
    const module = try hip.Module.loadData(@embedFile("offload-bundle"));
    defer module.unload();

    const kernel = try module.getFunction("shallenge");

    var epoch: u32 = @bitReverse(@as(u32, @truncate(@as(u64, @bitCast(std.time.milliTimestamp())))));
    var timer = try std.time.Timer.start();
    var max: u64 = 0;
    while (true) : (epoch +%= 1) {
        hip.memcpy(u32, d_epoch, (&epoch)[0..1], .host_to_device);

        kernel.launch(
            .{
                .grid_dim = .{ .x = grid_size },
                .block_dim = .{ .x = block_size },
            },
            .{ d_epoch.ptr, d_out.ptr },
        );

        var raw_bits: u64 = undefined;
        hip.memcpy(u64, (&raw_bits)[0..1], d_out, .device_to_host);
        const zeros_and_seed_bits: ZerosAndSeedBits = @bitCast(raw_bits);

        const seed = zeros_and_seed_bits.seed(epoch);
        const zeros = zeros_and_seed_bits.zeros;

        const hashes: f32 = @floatFromInt(block_size * grid_size * items_per_thread);
        const elapsed: f32 = @floatFromInt(timer.lap());

        const str = string(seed);

        var digest: [Hash.digest_length]u8 align(8) = undefined;
        Hash.hash(&str, &digest, .{});

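        // The first 8 digest bytes are reloaded as a u64 and byte-swapped so that
        // digest[0] ends up in the most significant byte on this little-endian
        // host; @clz then counts the hash's leading zero bits, and dividing by 4
        // below converts that to leading hex digits.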
        const init_word = @byteSwap(std.mem.bytesAsValue(u64, digest[0..8]).*);
        const zeros_actual = @clz(init_word);

        if (zeros_actual <= max) {
            continue;
        }

        max = zeros_actual;

        std.log.info("performance: {d} GH/s", .{hashes / (elapsed / std.time.ns_per_s) / 1_000_000_000});
        std.log.info("epoch: {}", .{epoch});
        std.log.info("zeros: {} ({} digits)", .{ zeros, zeros / 4 });
        std.log.info("zeros (actual): {}", .{zeros_actual});
        std.log.info("seed: {}", .{seed});
        std.log.info("string: {s}", .{str});
        std.log.info("hash: {x}", .{&digest});
    }
}
--------------------------------------------------------------------------------
/src/cuda.zig:
--------------------------------------------------------------------------------
const std = @import("std");
const assert = std.debug.assert;

pub const c = struct {
    const CUDA_SUCCESS = 0;
    const CUDA_ERROR_MEMORY_ALLOCATION = 2;
    const CUDA_ERROR_NOT_FOUND = 500;
    const CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303;

    const CUresult = c_uint;
    const CUmodule = *opaque {};
    const CUfunction = *opaque {};
    const CUstream = *opaque {};
    const CUevent = *opaque {};
    const CUdevice = *opaque {};
    const CUcontext = *opaque {};

    pub extern fn cuGetErrorName(err: CUresult, msg: *[*:0]const u8) CUresult;
    pub extern fn cuInit(flags: c_uint) CUresult;
    pub extern fn cuDeviceGetCount(count: *c_int) CUresult;
    pub extern fn cuDeviceGet(device: *CUdevice, ordinal: c_int) CUresult;
    pub extern fn cuCtxCreate(context: *CUcontext, flags: c_uint, device: CUdevice) CUresult;
    pub extern fn cuMemAlloc(ptr: **anyopaque, size: usize) CUresult;
    pub extern fn cuMemFree(ptr: *anyopaque) CUresult;
    pub extern fn cuMemcpyHtoD(dst_dev: *anyopaque, src_host: *const anyopaque, size: usize) CUresult;
    pub extern fn cuMemcpyDtoH(dst_host: *anyopaque, src_dev: *const anyopaque, size: usize) CUresult;
    pub extern fn cuEventCreate(event: *CUevent) CUresult;
    pub extern fn cuEventDestroy(event: CUevent) CUresult;
    pub extern fn cuEventRecord(event: CUevent, stream: ?CUstream) CUresult;
    pub extern fn cuEventSynchronize(event: CUevent) CUresult;
    pub extern fn cuEventElapsedTime(result: *f32, a: CUevent, b: CUevent) CUresult;
    pub extern fn cuModuleLoadData(module: *CUmodule, image: *const anyopaque) CUresult;
    pub extern fn cuModuleUnload(module: CUmodule) CUresult;
    pub extern fn cuModuleGetFunction(function: *CUfunction, module: CUmodule, name: [*:0]const u8) CUresult;
    pub extern fn cuLaunchKernel(
        function: CUfunction,
        gdx: c_uint,
        gdy: c_uint,
        gdz: c_uint,
        bdx: c_uint,
        bdy: c_uint,
        bdz: c_uint,
        shmem: c_uint,
        stream: ?CUstream,
        params: ?[*]?*anyopaque,
        extra: ?[*]?*anyopaque,
    ) CUresult;
};
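
// These driver-API bindings are declared by hand instead of via @cImport, so no
// CUDA headers are needed to build; only the entry points used below are listed.
// Note that in the actual driver API, CUdevice is an integer rather than a
// pointer; it is simply treated as an opaque handle here.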

pub fn unexpected(err: c_uint) noreturn {
    var msg: [*:0]const u8 = undefined;
    switch (c.cuGetErrorName(err, &msg)) {
        c.CUDA_SUCCESS => {},
        else => unreachable,
    }
    std.log.err("unexpected cuda result: {s} ({})", .{ msg, err });
    @panic("unexpected CUDA error");
}

pub fn init() void {
    switch (c.cuInit(0)) {
        c.CUDA_SUCCESS => {},
        else => |err| unexpected(err),
    }

    var count: c_int = undefined;
    switch (c.cuDeviceGetCount(&count)) {
        c.CUDA_SUCCESS => {},
        else => |err| unexpected(err),
    }

    var device: c.CUdevice = undefined;
    switch (c.cuDeviceGet(&device, 0)) {
        c.CUDA_SUCCESS => {},
        else => |err| unexpected(err),
    }

    var context: c.CUcontext = undefined;
    switch (c.cuCtxCreate(&context, 0, device)) {
        c.CUDA_SUCCESS => {},
        else => |err| unexpected(err),
    }
}

pub fn malloc(comptime T: type, n: usize) ![]T {
    var result: usize = 0; // The CUDA driver does not write the upper bytes, so initialize to zero!
    return switch (c.cuMemAlloc(
        @ptrCast(&result),
        n * @sizeOf(T),
    )) {
        c.CUDA_SUCCESS => @as([*]T, @ptrFromInt(result))[0..n],
        c.CUDA_ERROR_MEMORY_ALLOCATION => error.OutOfMemory,
        else => |err| unexpected(err),
    };
}

pub fn free(ptr: anytype) void {
    const actual_ptr = switch (@typeInfo(@TypeOf(ptr)).pointer.size) {
        .slice => ptr.ptr,
        else => ptr,
    };

    assert(c.cuMemFree(actual_ptr) == c.CUDA_SUCCESS);
}

const CopyDir = enum {
    host_to_device,
    device_to_host,
};

pub fn memcpy(comptime T: type, dst: []T, src: []const T, direction: CopyDir) void {
    assert(dst.len >= src.len);
    switch (direction) {
        .host_to_device => switch (c.cuMemcpyHtoD(
            dst.ptr,
            src.ptr,
            @sizeOf(T) * src.len,
        )) {
            c.CUDA_SUCCESS => {},
            else => |err| unexpected(err),
        },
        .device_to_host => switch (c.cuMemcpyDtoH(
            dst.ptr,
            src.ptr,
            @sizeOf(T) * src.len,
        )) {
            c.CUDA_SUCCESS => {},
            else => |err| unexpected(err),
        },
    }
}

pub const Module = struct {
    handle: c.CUmodule,

    pub fn loadData(image: *const anyopaque) !Module {
        var module: Module = undefined;
        return switch (c.cuModuleLoadData(&module.handle, image)) {
            c.CUDA_SUCCESS => module,
            c.CUDA_ERROR_MEMORY_ALLOCATION => error.OutOfMemory,
            c.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED => error.SharedObjectInitFailed,
            else => |err| unexpected(err),
        };
    }

    pub fn unload(self: Module) void {
        assert(c.cuModuleUnload(self.handle) == c.CUDA_SUCCESS);
    }

    pub fn getFunction(self: Module, name: [*:0]const u8) !Function {
        var function: Function = undefined;
        return switch (c.cuModuleGetFunction(&function.handle, self.handle, name)) {
            c.CUDA_SUCCESS => function,
            c.CUDA_ERROR_NOT_FOUND => error.NotFound,
            else => |err| unexpected(err),
        };
    }
};

pub const Dim3 = struct {
    x: u32 = 1,
    y: u32 = 1,
    z: u32 = 1,
};

pub const LaunchConfig = struct {
    grid_dim: Dim3 = .{},
    block_dim: Dim3 = .{},
    shared_mem_per_block: u32 = 0,
    stream: ?c.CUstream = null,
};
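
// cuLaunchKernel receives the kernel arguments as an array of pointers, one per
// argument, each pointing at that argument's value. `Function.launch` below
// builds this array from the fields of the `args` tuple it is given.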

pub const Function = struct {
    handle: c.CUfunction,

    pub fn launch(
        self: Function,
        cfg: LaunchConfig,
        args: anytype,
    ) void {
        var args_buf: [args.len]?*anyopaque = undefined;
        inline for (&args_buf, 0..) |*arg_buf, i| {
            arg_buf.* = @constCast(@ptrCast(&args[i]));
        }

        switch (c.cuLaunchKernel(
            self.handle,
            cfg.grid_dim.x,
            cfg.grid_dim.y,
            cfg.grid_dim.z,
            cfg.block_dim.x,
            cfg.block_dim.y,
            cfg.block_dim.z,
            cfg.shared_mem_per_block,
            cfg.stream,
            &args_buf,
            null,
        )) {
            c.CUDA_SUCCESS => {},
            else => |err| unexpected(err),
        }
    }
};

pub const Event = struct {
    handle: c.CUevent,

    pub fn create() Event {
        var event: Event = undefined;
        return switch (c.cuEventCreate(&event.handle)) {
            c.CUDA_SUCCESS => event,
            else => |err| unexpected(err),
        };
    }

    pub fn destroy(self: Event) void {
        assert(c.cuEventDestroy(self.handle) == c.CUDA_SUCCESS);
    }

    pub fn record(self: Event, stream: ?c.CUstream) void {
        switch (c.cuEventRecord(self.handle, stream)) {
            c.CUDA_SUCCESS => {},
            else => |err| unexpected(err),
        }
    }

    pub fn synchronize(self: Event) void {
        switch (c.cuEventSynchronize(self.handle)) {
            c.CUDA_SUCCESS => {},
            else => |err| unexpected(err),
        }
    }

    pub fn elapsed(start: Event, stop: Event) f32 {
        var result: f32 = undefined;
        return switch (c.cuEventElapsedTime(&result, start.handle, stop.handle)) {
            c.CUDA_SUCCESS => result,
            else => |err| unexpected(err),
        };
    }
};
--------------------------------------------------------------------------------