├── .gitignore ├── .ignore ├── README.md ├── bindings.odin ├── context.odin ├── examples └── main.odin ├── kernel.odin ├── params.odin ├── tensor.odin └── utils.odin /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | ols.json 3 | odinfmt.json 4 | Justfile 5 | -------------------------------------------------------------------------------- /.ignore: -------------------------------------------------------------------------------- 1 | build/ 2 | ols.json 3 | odinfmt.json 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A odin ggpu compute library in odin 2 | Code a bit sloppy, and everything else. Burnout rn so. And i have no clue on how to make libraries so 3 | ## Dependencies: 4 | - wgpu [wgpu-odin](https://github.com/Capati/wgpu-odin) 5 | 6 | ## Documentation 7 | Too lazy as of now, so docs are code 8 | 9 | ## Examples: 10 | There is an example in the examples folder 11 | 12 | ## TODO: 13 | - [ ] Maybe make use of sdl3gpu? 14 | -------------------------------------------------------------------------------- /bindings.odin: -------------------------------------------------------------------------------- 1 | package odgpu 2 | 3 | Bindings :: struct { 4 | tensors: []Tensor, 5 | params: Maybe(Params), 6 | } 7 | 8 | create_bindings :: proc( 9 | ctx: Context, 10 | tensors: ..Tensor, 11 | params: Maybe(Params) = nil, 12 | ) -> ( 13 | b: Bindings, 14 | ) { 15 | n := u32(len(tensors)) 16 | oparams, ok := params.? 17 | if ok {n += 1;b.params = oparams} 18 | b.tensors = tensors 19 | return 20 | } 21 | 22 | -------------------------------------------------------------------------------- /context.odin: -------------------------------------------------------------------------------- 1 | package odgpu 2 | 3 | import "base:runtime" 4 | 5 | import "core:fmt" 6 | 7 | import wgpu "shared:wgpu-odin/wrapper" 8 | 9 | Context :: struct { 10 | instance: wgpu.Instance, 11 | adapter: wgpu.Adapter, 12 | device: wgpu.Device, 13 | queue: wgpu.Queue, 14 | } 15 | 16 | @(private = "file") 17 | _log_callback :: proc "c" ( 18 | level: wgpu.Log_Level, 19 | message: cstring, 20 | user_data: rawptr, 21 | ) { 22 | context = runtime.default_context() 23 | fmt.printfln("[wgpu] [%v] %s", level, message) 24 | } 25 | 26 | create_context :: proc( 27 | log_level: wgpu.Log_Level, 28 | ) -> ( 29 | ctx: Context, 30 | ok: bool = true, 31 | ) { 32 | wgpu.set_log_level(log_level) 33 | if log_level != .Off { 34 | wgpu.set_log_callback(_log_callback, nil) 35 | } 36 | 37 | ctx.instance = wgpu.create_instance( 38 | wgpu.Instance_Descriptor{backends = wgpu.Instance_Backend_Primary}, 39 | ) or_return 40 | 41 | ctx.adapter = wgpu.instance_request_adapter( 42 | ctx.instance, 43 | wgpu.Request_Adapter_Options { 44 | compatible_surface = nil, 45 | power_preference = .High_Performance, 46 | }, 47 | ) or_return 48 | 49 | adapter_info := wgpu.adapter_get_info(ctx.adapter) or_return 50 | 51 | ctx.device = wgpu.adapter_request_device( 52 | ctx.adapter, 53 | wgpu.Device_Descriptor{label = adapter_info.description}, 54 | ) or_return 55 | 56 | ctx.queue = wgpu.device_get_queue(ctx.device) 57 | 58 | return 59 | } 60 | 61 | context_sync :: proc(ctx: Context) -> bool { 62 | command_encoder := wgpu.device_create_command_encoder(ctx.device) 63 | command_buffer := wgpu.command_encoder_finish(command_encoder) or_return 64 | wgpu.queue_submit(ctx.queue, command_buffer) 65 | wgpu.device_poll(ctx.device) 66 | return true 67 | } 68 | 69 | context_destroy :: proc(ctx: ^Context) { 70 | if ctx.queue != nil do wgpu.queue_release(ctx.queue) 71 | if ctx.adapter != nil do wgpu.adapter_release(ctx.adapter) 72 | if ctx.instance != nil do wgpu.instance_release(ctx.instance) 73 | if ctx.device != nil do wgpu.device_poll(ctx.device) 74 | if ctx.device != nil do wgpu.device_release(ctx.device) 75 | } 76 | 77 | -------------------------------------------------------------------------------- /examples/main.odin: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "core:fmt" 4 | 5 | import ogp "../" 6 | 7 | code :: ` 8 | struct Params { 9 | t: f32, 10 | } 11 | 12 | @group(0) @binding(0) var stuff: array; 13 | @group(0) @binding(3) var params: Params; 14 | 15 | @compute 16 | @workgroup_size({{workgroup_size}}) 17 | fn main(@builtin(global_invocation_id) global_id: vec3u) { 18 | if (global_id.x > arrayLength(&stuff)) {return;} 19 | stuff[global_id.x] *= params.t; 20 | } 21 | ` 22 | 23 | 24 | main :: proc() { 25 | ctx, ok := ogp.create_context(.Warn);defer ogp.context_destroy(&ctx) 26 | assert(ok, "Failed to initialize context") 27 | 28 | data := [5]f32{1, 2, 3, 4, 5} 29 | tensor := ogp.create_tensor( 30 | ctx, 31 | {len(data)}, 32 | data[:], 33 | );defer ogp.tensor_destroy(tensor) 34 | 35 | Parameters :: struct { 36 | t: f32, 37 | } 38 | 39 | p := Parameters { 40 | t = 2, 41 | } 42 | params := ogp.create_params(ctx, p);defer ogp.params_destroy(params) 43 | 44 | kernel, _ := ogp.create_kernel( 45 | ctx, 46 | ogp.create_kernel_code(code), 47 | ogp.create_bindings(ctx, tensor, params = params), 48 | );defer ogp.kernel_destroy(kernel) 49 | 50 | ogp.kernel_dispatch(ctx, &kernel, ogp.Workgroup_Size{5, 1, 1}) 51 | ogp.wait(ctx) 52 | 53 | res, cok := ogp.tensor_copy_cpu(ctx, tensor, f32);defer delete(res) 54 | fmt.println(data) 55 | } 56 | 57 | -------------------------------------------------------------------------------- /kernel.odin: -------------------------------------------------------------------------------- 1 | package odgpu 2 | 3 | import "base:intrinsics" 4 | 5 | import "core:fmt" 6 | import "core:log" 7 | import "core:mem" 8 | import "core:slice" 9 | import "core:strings" 10 | 11 | import wgpu "shared:wgpu-odin/wrapper" 12 | 13 | Workgroup_Size :: distinct [3]u32 14 | 15 | Kernel_Code :: struct { 16 | code: cstring, 17 | workgroup_size: Workgroup_Size, 18 | label, entry_point: string, 19 | } 20 | 21 | create_kernel_code :: proc( 22 | code: string, 23 | workgroup_size: Workgroup_Size = {256, 1, 1}, 24 | label: string = "kernel code", 25 | entry_point: string = "main", 26 | ) -> ( 27 | kc: Kernel_Code, 28 | ) { 29 | t, _ := strings.replace_all( 30 | code, 31 | "{{workgroup_size}}", 32 | fmt.tprintf( 33 | "%d, %d, %d", 34 | workgroup_size.x, 35 | workgroup_size.y, 36 | workgroup_size.z, 37 | ), 38 | ) 39 | kc.code = strings.clone_to_cstring(t) 40 | kc.workgroup_size = workgroup_size 41 | kc.label = label 42 | kc.entry_point = entry_point 43 | return 44 | } 45 | 46 | Kernel :: struct { 47 | bindings: Bindings, 48 | code: Kernel_Code, 49 | shader_module: wgpu.Shader_Module, 50 | pipeline: wgpu.Compute_Pipeline, 51 | bind_group: wgpu.Bind_Group, 52 | command_buffer: wgpu.Command_Buffer, 53 | } 54 | 55 | create_kernel :: proc( 56 | ctx: Context, 57 | code: Kernel_Code, 58 | bindings: Bindings, 59 | ) -> ( 60 | k: Kernel, 61 | ok: bool = true, 62 | ) { 63 | k.bindings = bindings 64 | k.code = code 65 | 66 | log.debug("Creating shadermodule") 67 | k.shader_module = wgpu.device_create_shader_module( 68 | ctx.device, 69 | wgpu.Shader_Module_Descriptor { 70 | label = strings.clone_to_cstring(k.code.label), 71 | source = k.code.code, 72 | }, 73 | ) or_return 74 | 75 | log.debug("Creating pipeline") 76 | k.pipeline = wgpu.device_create_compute_pipeline( 77 | ctx.device, 78 | wgpu.Compute_Pipeline_Descriptor { 79 | label = strings.clone_to_cstring(k.code.label), 80 | module = k.shader_module, 81 | entry_point = strings.clone_to_cstring(k.code.entry_point), 82 | }, 83 | ) or_return 84 | 85 | log.debug("Creating bindgroup layout") 86 | bind_group_layout := wgpu.compute_pipeline_get_bind_group_layout( 87 | k.pipeline, 88 | 0, 89 | ) or_return;defer wgpu.bind_group_layout_release(bind_group_layout) 90 | 91 | n := len(bindings.tensors) 92 | 93 | params, pok := k.bindings.params.? 94 | if pok do n += 1 95 | 96 | entries := make([]wgpu.Bind_Group_Entry, n);defer delete(entries) 97 | 98 | for i in 0 ..< len(bindings.tensors) { 99 | entries[i] = wgpu.Bind_Group_Entry { 100 | binding = u32(i), 101 | resource = wgpu.buffer_as_entire_binding( 102 | k.bindings.tensors[i].buffer, 103 | ), 104 | } 105 | } 106 | if pok do entries[n - 1] = wgpu.Bind_Group_Entry { 107 | binding = u32(n) - 1, 108 | resource = wgpu.buffer_as_entire_binding(params.buffer), 109 | } 110 | 111 | log.debug("Creating bindgroup") 112 | k.bind_group = wgpu.device_create_bind_group( 113 | ctx.device, 114 | wgpu.Bind_Group_Descriptor { 115 | label = "Bindgroup", 116 | layout = bind_group_layout, 117 | entries = entries, 118 | }, 119 | ) or_return 120 | 121 | return 122 | } 123 | 124 | // wgpu.device_poll is needed after calling this 125 | kernel_dispatch :: proc( 126 | ctx: Context, 127 | kernel: ^Kernel, 128 | workgroup_size: Workgroup_Size, 129 | // tensor_to_copy: Maybe(Tensor) = nil, 130 | // done: Maybe(^bool) = nil, 131 | ) -> bool { 132 | command_encoder := wgpu.device_create_command_encoder(ctx.device) 133 | comp_pass := wgpu.command_encoder_begin_compute_pass( 134 | command_encoder, 135 | wgpu.Compute_Pass_Descriptor{label = "Compute pass"}, 136 | ) or_return 137 | 138 | wgpu.compute_pass_set_pipeline(comp_pass, kernel.pipeline) 139 | wgpu.compute_pass_set_bind_group(comp_pass, 0, kernel.bind_group) 140 | wgpu.compute_pass_dispatch_workgroups( 141 | comp_pass, 142 | workgroup_size.x, 143 | workgroup_size.y, 144 | workgroup_size.z, 145 | ) 146 | wgpu.compute_pass_end(comp_pass) 147 | 148 | wgpu.compute_pass_release(comp_pass) 149 | 150 | kernel.command_buffer = wgpu.command_encoder_finish( 151 | command_encoder, 152 | ) or_return 153 | wgpu.queue_submit(ctx.queue, kernel.command_buffer) 154 | 155 | return true 156 | } 157 | 158 | kernel_destroy :: proc(kernel: Kernel) { 159 | if kernel.command_buffer != nil do wgpu.command_buffer_release(kernel.command_buffer) 160 | if kernel.bind_group != nil do wgpu.bind_group_release(kernel.bind_group) 161 | if kernel.pipeline != nil do wgpu.compute_pipeline_release(kernel.pipeline) 162 | if kernel.shader_module != nil do wgpu.shader_module_release(kernel.shader_module) 163 | } 164 | 165 | -------------------------------------------------------------------------------- /params.odin: -------------------------------------------------------------------------------- 1 | package odgpu 2 | 3 | import "base:intrinsics" 4 | 5 | import "core:mem" 6 | import "core:reflect" 7 | 8 | import wgpu "shared:wgpu-odin/wrapper" 9 | 10 | Params :: struct { 11 | buffer: wgpu.Buffer, 12 | } 13 | 14 | create_params :: proc( 15 | ctx: Context, 16 | params: $T, 17 | usage: wgpu.Buffer_Usage_Flags = {.Copy_Dst, .Uniform}, 18 | ) -> ( 19 | p: Params, 20 | ) where intrinsics.type_is_struct(T) { 21 | p.buffer = wgpu.device_create_buffer_with_data( 22 | ctx.device, 23 | wgpu.Buffer_Data_Descriptor { 24 | label = "params buffer", 25 | usage = usage, 26 | contents = wgpu.to_bytes(params), 27 | }, 28 | ) 29 | return 30 | } 31 | 32 | params_update_single :: proc( 33 | ctx: Context, 34 | params: Params, 35 | field_offset: uintptr, 36 | value: $T, 37 | ) { 38 | wgpu.queue_write_buffer( 39 | ctx.queue, 40 | params.buffer, 41 | cast(wgpu.Buffer_Address)field_offset, 42 | mem.any_to_bytes(value), 43 | ) 44 | } 45 | 46 | params_update_whole :: proc( 47 | ctx: Context, 48 | params: Params, 49 | new: $T, 50 | ) where intrinsics.type_is_struct(T) { 51 | wgpu.queue_write_buffer(ctx.queue, params.buffer, 0, mem.any_to_bytes(new)) 52 | } 53 | 54 | params_update :: proc { 55 | params_update_single, 56 | params_update_whole, 57 | } 58 | 59 | params_destroy :: proc(params: Params) { 60 | wgpu.buffer_release(params.buffer) 61 | } 62 | 63 | -------------------------------------------------------------------------------- /tensor.odin: -------------------------------------------------------------------------------- 1 | package odgpu 2 | 3 | import "base:intrinsics" 4 | 5 | import "core:fmt" 6 | import "core:log" 7 | import "core:slice" 8 | import "core:time" 9 | 10 | import wgpu "shared:wgpu-odin/wrapper" 11 | 12 | Tensor_Shape :: distinct []u32 13 | DEFAULT_BUFFER_FLAGS :: wgpu.Buffer_Usage_Flags{.Storage, .Copy_Dst, .Copy_Src} 14 | 15 | ssize :: proc(ts: Tensor_Shape) -> (n: u32 = 1) { 16 | for i in 0 ..< len(ts) do n *= ts[i] 17 | return 18 | } 19 | 20 | Tensor :: struct { 21 | buffer: wgpu.Buffer, 22 | usage: wgpu.Buffer_Usage_Flags, 23 | size: u64, 24 | _t_size: u32, 25 | _i: u32, 26 | } 27 | 28 | create_tensor_empty :: proc( 29 | ctx: Context, 30 | shape: Tensor_Shape, 31 | $T: typeid, 32 | usage: wgpu.Buffer_Usage_Flags = DEFAULT_BUFFER_FLAGS, 33 | ) -> ( 34 | t: Tensor, 35 | ) where intrinsics.type_is_numeric(T) { 36 | num_elems := ssize(shape) 37 | size := num_elems * size_of(T) 38 | t.usage = usage 39 | t.size = u64(size) 40 | t._t_size = u32(size_of(T)) 41 | t._i = 0 42 | 43 | t.buffer = wgpu.device_create_buffer( 44 | ctx.device, 45 | wgpu.Buffer_Descriptor { 46 | label = "tensor", 47 | usage = usage, 48 | size = u64(size), 49 | }, 50 | ) 51 | 52 | return 53 | } 54 | 55 | create_tensor_data :: proc( 56 | ctx: Context, 57 | shape: Tensor_Shape, 58 | data: $E/[]$T, 59 | usage: wgpu.Buffer_Usage_Flags = DEFAULT_BUFFER_FLAGS, 60 | ) -> ( 61 | t: Tensor, 62 | ) { 63 | num_elems := ssize(shape) 64 | assert(num_elems == u32(len(data)), "Provided data size must match shape") 65 | assert(num_elems != 0) 66 | 67 | size := num_elems * size_of(T) 68 | t.usage = usage 69 | t.size = u64(size) 70 | t._t_size = u32(size_of(T)) 71 | t._i = num_elems 72 | 73 | t.buffer = wgpu.device_create_buffer_with_data( 74 | ctx.device, 75 | wgpu.Buffer_Data_Descriptor { 76 | label = "tensor", 77 | usage = usage, 78 | contents = wgpu.to_bytes(data), 79 | }, 80 | ) 81 | 82 | return 83 | } 84 | 85 | create_tensor :: proc { 86 | create_tensor_empty, 87 | create_tensor_data, 88 | } 89 | 90 | tensor_update :: proc( 91 | ctx: Context, 92 | tensor: ^Tensor, 93 | data: []$T, 94 | offset: u64 = 0, 95 | ) { 96 | assert( 97 | u32(len(data)) <= tensor.size / tensor._t_size, 98 | "new data length must be the same or less as tensors max length", 99 | ) 100 | tensor._i = max(tensor._i, u32(len(data))) 101 | wgpu.queue_write_buffer( 102 | ctx.queue, 103 | tensor.buffer, 104 | offset, 105 | wgpu.to_bytes(data), 106 | ) 107 | } 108 | 109 | tensor_sync_with :: proc(ctx: Context, to: Tensor, from: Tensor) -> bool { 110 | command_encoder := wgpu.device_create_command_encoder( 111 | ctx.device, 112 | );defer wgpu.command_encoder_release(command_encoder) 113 | 114 | wgpu.command_encoder_copy_buffer_to_buffer( 115 | command_encoder, 116 | from.buffer, 117 | 0, 118 | to.buffer, 119 | 0, 120 | from.size, 121 | ) or_return 122 | 123 | command_buffer := wgpu.command_encoder_finish(command_encoder) or_return 124 | wgpu.queue_submit(ctx.queue, command_buffer) 125 | return true 126 | } 127 | 128 | tensor_append :: proc(ctx: Context, tensor: ^Tensor, data: $T) -> bool { 129 | if tensor._i > (tensor.size / tensor._t_size) do return false 130 | 131 | wgpu.queue_write_buffer( 132 | ctx.queue, 133 | tensor.buffer, 134 | cast(wgpu.Buffer_Address)tensor._i * tensor._t_size, 135 | wgpu.to_bytes(data), 136 | ) 137 | tensor._i += 1 138 | 139 | return true 140 | } 141 | 142 | tensor_write_offset :: proc( 143 | ctx: Context, 144 | tensor: Tensor, 145 | data: $T, 146 | offset: u64, 147 | ) -> bool { 148 | if offset > tensor.size do return false 149 | 150 | wgpu.queue_write_buffer( 151 | ctx.queue, 152 | tensor.buffer, 153 | offset, 154 | wgpu.to_bytes(data), 155 | ) 156 | 157 | return true 158 | } 159 | 160 | 161 | tensor_write_index :: proc( 162 | ctx: Context, 163 | tensor: Tensor, 164 | data: $T, 165 | index: u32, 166 | ) -> bool { 167 | return tensor_write_offset(ctx, tensor, data, u64(index * tensor._t_size)) 168 | } 169 | 170 | tensor_copy_cpu :: proc( 171 | ctx: Context, 172 | tensor: Tensor, 173 | $T: typeid, 174 | offset: u64 = 0, 175 | ) -> ( 176 | data: []T, 177 | ok: bool = true, 178 | ) { 179 | // first copy the data over 180 | command_encoder := wgpu.device_create_command_encoder( 181 | ctx.device, 182 | );defer wgpu.command_encoder_release(command_encoder) 183 | 184 | copy_size := tensor._t_size * tensor._i 185 | if copy_size == 0 do return 186 | 187 | staging_buffer := wgpu.device_create_buffer( 188 | ctx.device, 189 | wgpu.Buffer_Descriptor { 190 | usage = {.Map_Read, .Copy_Dst}, 191 | size = u64(copy_size), 192 | }, 193 | );defer wgpu.buffer_release(staging_buffer) 194 | 195 | wgpu.command_encoder_copy_buffer_to_buffer( 196 | command_encoder, 197 | tensor.buffer, 198 | 0, 199 | staging_buffer, 200 | 0, 201 | u64(copy_size), 202 | ) or_return 203 | 204 | command_buffer := wgpu.command_encoder_finish(command_encoder) or_return 205 | wgpu.queue_submit(ctx.queue, command_buffer) 206 | wgpu.device_poll(ctx.device) 207 | 208 | result: wgpu.Buffer_Map_Async_Status 209 | 210 | handle_buffer_map := proc "c" ( 211 | status: wgpu.Buffer_Map_Async_Status, 212 | user_data: rawptr, 213 | ) { 214 | result := cast(^wgpu.Buffer_Map_Async_Status)user_data 215 | result^ = status 216 | } 217 | 218 | wgpu.buffer_map_async( 219 | staging_buffer, 220 | {.Read}, 221 | handle_buffer_map, 222 | &result, 223 | wgpu.Buffer_Range{offset = offset, size = u64(copy_size)}, 224 | ) or_return 225 | wgpu.device_poll(ctx.device) or_return 226 | 227 | if result != .Success {log.error(result);return nil, false} 228 | tmp := wgpu.buffer_get_mapped_range_bytes( 229 | staging_buffer, 230 | wgpu.Buffer_Range{offset = offset, size = u64(copy_size)}, 231 | ) 232 | otmp := slice.reinterpret([]T, tmp) 233 | data = make([]T, len(otmp)) 234 | copy(data, otmp) 235 | wgpu.buffer_unmap(staging_buffer) 236 | 237 | return 238 | } 239 | 240 | tensor_destroy :: proc(tensor: Tensor) { 241 | wgpu.buffer_release(tensor.buffer) 242 | } 243 | 244 | -------------------------------------------------------------------------------- /utils.odin: -------------------------------------------------------------------------------- 1 | package odgpu 2 | 3 | import "core:math" 4 | 5 | import wgpu "shared:wgpu-odin/wrapper" 6 | 7 | @(private = "file") 8 | ccdiv :: proc(n, d: u32) -> u32 {return (n + d - 1) / d} 9 | 10 | cdiv :: proc(total, group: Workgroup_Size) -> (res: Workgroup_Size) { 11 | for i in 0 ..< len(total) do res[i] = ccdiv(total[i], group[i]) 12 | return 13 | } 14 | 15 | wait :: proc(ctx: Context) {wgpu.device_poll(ctx.device)} 16 | 17 | --------------------------------------------------------------------------------