├── README.md ├── test_data ├── kernels.cu └── kernels.ptx ├── cublas ├── conversions.go ├── pointer_mode.go ├── errors.go ├── cublas.go ├── helpers_test.go ├── extensions_test.go ├── extensions.go ├── level2_test.go ├── level3_test.go ├── level2.go ├── level3.go ├── level1_test.go └── level1.go ├── helpers_test.go ├── device_test.go ├── LICENSE ├── curand ├── errors.go ├── curand_test.go └── curand.go ├── module_test.go ├── context.go ├── buffer_test.go ├── stream.go ├── doc.go ├── allocator_bfc.go ├── errors.go ├── allocator.go ├── module.go ├── buffer.go └── device.go /README.md: -------------------------------------------------------------------------------- 1 | # cuda 2 | 3 | This is a [Go](https://golang.org) package for interacting with [CUDA](https://en.wikipedia.org/wiki/CUDA). See the [GoDoc](https://godoc.org/github.com/unixpickle/cuda) for detailed usage information. 4 | 5 | # License 6 | 7 | This is licensed under a BSD 2-clause license. See [LICENSE](LICENSE). 8 | -------------------------------------------------------------------------------- /test_data/kernels.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ 2 | void my_fancy_kernel(int n, float a, double b, int c, unsigned int d, double * out1, float * out2) { 3 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 4 | if (tid < n) { 5 | out1[tid] = (double)tid + (double)a + b + (double)c; 6 | out2[tid] = (float)c + (float)d - a; 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /cublas/conversions.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import "C" 4 | 5 | func safeUintToC(x uint) C.uint { 6 | if x > uint(^C.uint(0)) { 7 | panic("uint value out of bounds") 8 | } 9 | return C.uint(x) 10 | } 11 | 12 | func safeIntToC(x int) C.int { 13 | if x > int(C.int(^C.uint(0)/2)) { 14 | panic("int value out of bounds") 15 | } else if x < int((-C.int(^C.uint(0)/2))-1) { 16 | panic("int value out of bounds") 17 | } 18 | return C.int(x) 19 | } 20 | -------------------------------------------------------------------------------- /helpers_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import "testing" 4 | 5 | var testingContext *Context 6 | var testingAllocator Allocator 7 | 8 | func setupTest(t *testing.T) (*Context, Allocator) { 9 | if testingContext != nil { 10 | return testingContext, testingAllocator 11 | } 12 | devices, err := AllDevices() 13 | if err != nil { 14 | t.Fatal(err) 15 | } 16 | if len(devices) == 0 { 17 | t.Fatal("no CUDA devices") 18 | } 19 | testingContext, err = NewContext(devices[0], 10) 20 | if err != nil { 21 | t.Fatal(err) 22 | } 23 | testingAllocator = GCAllocator(NativeAllocator(testingContext), 0) 24 | return testingContext, testingAllocator 25 | } 26 | -------------------------------------------------------------------------------- /device_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import "testing" 4 | 5 | func TestDeviceName(t *testing.T) { 6 | devices, err := AllDevices() 7 | if err != nil { 8 | t.Fatal(err) 9 | } 10 | for i, d := range devices { 11 | name, err := d.Name() 12 | if err != nil { 13 | t.Errorf("device %d: %v", i, err) 14 | } else if len(name) == 0 { 15 | t.Errorf("device %d: empty name", i) 16 | } 17 | } 18 | } 19 | 20 | func TestDeviceAttr(t *testing.T) { 21 | devices, err := 
AllDevices()
22 | 	if err != nil {
23 | 		t.Fatal(err)
24 | 	}
25 | 	for i, d := range devices {
26 | 		rate, err := d.Attr(DevAttrClockRate)
27 | 		if err != nil {
28 | 			t.Errorf("device %d: %v", i, err)
29 | 		} else if rate == 0 {
30 | 			t.Errorf("device %d: clock rate 0", i)
31 | 		}
32 | 	}
33 | }
34 | 
35 | func TestDeviceTotalMem(t *testing.T) {
36 | 	devices, err := AllDevices()
37 | 	if err != nil {
38 | 		t.Fatal(err)
39 | 	}
40 | 	for i, d := range devices {
41 | 		mem, err := d.TotalMem()
42 | 		if err != nil {
43 | 			t.Errorf("device %d: %v", i, err)
44 | 		} else if mem == 0 {
45 | 			t.Errorf("device %d: no memory", i)
46 | 		}
47 | 	}
48 | }
49 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017, Alexander Nichol.
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 |    list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 |    this list of conditions and the following disclaimer in the documentation
11 |    and/or other materials provided with the distribution.
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | 
--------------------------------------------------------------------------------
/cublas/pointer_mode.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | 
6 | const cublasPointerMode_t goCublasPointerModeHost = CUBLAS_POINTER_MODE_HOST;
7 | const cublasPointerMode_t goCublasPointerModeDevice = CUBLAS_POINTER_MODE_DEVICE;
8 | */
9 | import "C"
10 | 
11 | // PointerMode determines how BLAS APIs receive and return
12 | // scalar values.
13 | //
14 | // There are two types of scalar values in the API: scalar
15 | // inputs and scalar return values.
16 | // The current pointer mode affects both types of values.
17 | //
18 | // If the pointer mode is Device, then all scalar inputs
19 | // and outputs must be cuda.Buffer objects.
20 | //
21 | // If the pointer mode is Host, then all scalar inputs
22 | // must be float32, float64, *float32, or *float64;
23 | // all scalar outputs must be *float32 or *float64.
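// As an illustrative sketch only (h, matA, x, y, alphaBuf, and
// betaBuf are assumed names; package qualifiers omitted): in Host
// mode the scalars live in Go memory, while in Device mode they
// must already sit in CUDA memory, mirroring the gemv tests later
// in this package.
//
//	// Host mode (the default):
//	alpha := float32(2.5)
//	err := h.Sgemv(NoTrans, 3, 2, &alpha, matA, 4, x, -2, float32(1), y, 3)
//
//	// Device mode: alphaBuf and betaBuf are cuda.Buffers
//	// each holding a single float32.
//	h.SetPointerMode(Device)
//	err = h.Sgemv(NoTrans, 3, 2, alphaBuf, matA, 4, x, -2, betaBuf, y, 3)
//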
24 | type PointerMode int
25 | 
26 | const (
27 | 	Host PointerMode = iota
28 | 	Device
29 | )
30 | 
31 | func (p PointerMode) cPointerMode() C.cublasPointerMode_t {
32 | 	switch p {
33 | 	case Host:
34 | 		return C.goCublasPointerModeHost
35 | 	case Device:
36 | 		return C.goCublasPointerModeDevice
37 | 	default:
38 | 		panic("invalid PointerMode")
39 | 	}
40 | }
41 | 
42 | // pointerizeInputs replaces float32 and float64 values
43 | // with *float32 and *float64 values.
44 | func pointerizeInputs(args ...*interface{}) {
45 | 	for _, x := range args {
46 | 		switch val := (*x).(type) {
47 | 		case float32:
48 | 			*x = &val
49 | 		case float64:
50 | 			*x = &val
51 | 		}
52 | 	}
53 | }
54 | 
--------------------------------------------------------------------------------
/cublas/errors.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | 
6 | // Needed to check for NULL from Cgo.
7 | const char * go_cublas_null_message = NULL;
8 | 
9 | const char * go_cublas_err(cublasStatus_t s) {
10 | 	switch (s) {
11 | 	case CUBLAS_STATUS_SUCCESS:
12 | 		return NULL;
13 | 	case CUBLAS_STATUS_NOT_INITIALIZED:
14 | 		return "CUBLAS_STATUS_NOT_INITIALIZED";
15 | 	case CUBLAS_STATUS_ALLOC_FAILED:
16 | 		return "CUBLAS_STATUS_ALLOC_FAILED";
17 | 	case CUBLAS_STATUS_INVALID_VALUE:
18 | 		return "CUBLAS_STATUS_INVALID_VALUE";
19 | 	case CUBLAS_STATUS_ARCH_MISMATCH:
20 | 		return "CUBLAS_STATUS_ARCH_MISMATCH";
21 | 	case CUBLAS_STATUS_MAPPING_ERROR:
22 | 		return "CUBLAS_STATUS_MAPPING_ERROR";
23 | 	case CUBLAS_STATUS_EXECUTION_FAILED:
24 | 		return "CUBLAS_STATUS_EXECUTION_FAILED";
25 | 	case CUBLAS_STATUS_INTERNAL_ERROR:
26 | 		return "CUBLAS_STATUS_INTERNAL_ERROR";
27 | 	case CUBLAS_STATUS_NOT_SUPPORTED:
28 | 		return "CUBLAS_STATUS_NOT_SUPPORTED";
29 | 	case CUBLAS_STATUS_LICENSE_ERROR:
30 | 		return "CUBLAS_STATUS_LICENSE_ERROR";
31 | 	default:
32 | 		return "unknown cuBLAS error";
33 | 	}
34 | }
35 | */
36 | import "C"
37 | 
38 | import "github.com/unixpickle/cuda"
39 | 
40 | // newError creates an Error from the result of a cuBLAS
41 | // API call.
42 | //
43 | // If e is CUBLAS_STATUS_SUCCESS, nil is returned.
44 | func newError(context string, e C.cublasStatus_t) error {
45 | 	cstr := C.go_cublas_err(e)
46 | 	if cstr == C.go_cublas_null_message {
47 | 		return nil
48 | 	}
49 | 	name := C.GoString(cstr)
50 | 	return &cuda.Error{
51 | 		Context: context,
52 | 		Name:    name,
53 | 		Message: name,
54 | 	}
55 | }
56 | 
--------------------------------------------------------------------------------
/cublas/cublas.go:
--------------------------------------------------------------------------------
1 | // Package cublas provides bindings for the CUDA cuBLAS
2 | // library.
3 | package cublas
4 | 
5 | /*
6 | #include <cublas_v2.h>
7 | */
8 | import "C"
9 | 
10 | import (
11 | 	"runtime"
12 | 
13 | 	"github.com/unixpickle/cuda"
14 | )
15 | 
16 | // A Handle is used to make cuBLAS calls.
17 | //
18 | // A given Handle is bound to a specific cuda.Context.
19 | type Handle struct {
20 | 	handle C.cublasHandle_t
21 | 	ctx    *cuda.Context
22 | 
23 | 	ptrMode PointerMode
24 | }
25 | 
26 | // NewHandle creates a new cuBLAS handle.
27 | //
28 | // This must be called inside the cuda.Context.
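// A minimal usage sketch (ctx is an assumed *cuda.Context; error
// handling abbreviated):
//
//	err := <-ctx.Run(func() error {
//		handle, err := cublas.NewHandle(ctx)
//		if err != nil {
//			return err
//		}
//		// Optionally bind a stream to the handle:
//		// stream, _ := cuda.NewStream(false)
//		// handle.SetStream(stream)
//		return nil
//	})
//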
29 | func NewHandle(ctx *cuda.Context) (*Handle, error) {
30 | 	res := &Handle{ctx: ctx, ptrMode: Host}
31 | 	err := newError("cublasCreate", C.cublasCreate(&res.handle))
32 | 	if err != nil {
33 | 		return nil, err
34 | 	}
35 | 	runtime.SetFinalizer(res, func(obj *Handle) {
36 | 		go obj.ctx.Run(func() error {
37 | 			C.cublasDestroy(obj.handle)
38 | 			return nil
39 | 		})
40 | 	})
41 | 	return res, nil
42 | }
43 | 
44 | // PointerMode returns the current PointerMode.
45 | //
46 | // This must be called inside the cuda.Context.
47 | func (h *Handle) PointerMode() PointerMode {
48 | 	return h.ptrMode
49 | }
50 | 
51 | // SetPointerMode updates the current PointerMode.
52 | //
53 | // This must be called inside the cuda.Context.
54 | func (h *Handle) SetPointerMode(p PointerMode) error {
55 | 	res := C.cublasSetPointerMode(h.handle, p.cPointerMode())
56 | 	if err := newError("cublasSetPointerMode", res); err != nil {
57 | 		return err
58 | 	}
59 | 	h.ptrMode = p
60 | 	return nil
61 | }
62 | 
63 | // SetStream tells the handle which stream to use for its
64 | // computations.
65 | //
66 | // This must be called inside the cuda.Context.
67 | func (h *Handle) SetStream(s *cuda.Stream) error {
68 | 	res := C.cublasSetStream(h.handle, C.cudaStream_t(s.Pointer()))
69 | 	return newError("cublasSetStream", res)
70 | }
71 | 
--------------------------------------------------------------------------------
/curand/errors.go:
--------------------------------------------------------------------------------
1 | package curand
2 | 
3 | /*
4 | #include <curand.h>
5 | 
6 | // Needed to check for NULL from Cgo.
7 | const char * goCurandNULLMessage = NULL;
8 | 
9 | const char * go_curand_err(curandStatus_t s) {
10 | 	switch (s) {
11 | 	case CURAND_STATUS_SUCCESS:
12 | 		return NULL;
13 | 	case CURAND_STATUS_VERSION_MISMATCH:
14 | 		return "CURAND_STATUS_VERSION_MISMATCH";
15 | 	case CURAND_STATUS_NOT_INITIALIZED:
16 | 		return "CURAND_STATUS_NOT_INITIALIZED";
17 | 	case CURAND_STATUS_ALLOCATION_FAILED:
18 | 		return "CURAND_STATUS_ALLOCATION_FAILED";
19 | 	case CURAND_STATUS_TYPE_ERROR:
20 | 		return "CURAND_STATUS_TYPE_ERROR";
21 | 	case CURAND_STATUS_OUT_OF_RANGE:
22 | 		return "CURAND_STATUS_OUT_OF_RANGE";
23 | 	case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
24 | 		return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
25 | 	case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
26 | 		return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
27 | 	case CURAND_STATUS_LAUNCH_FAILURE:
28 | 		return "CURAND_STATUS_LAUNCH_FAILURE";
29 | 	case CURAND_STATUS_PREEXISTING_FAILURE:
30 | 		return "CURAND_STATUS_PREEXISTING_FAILURE";
31 | 	case CURAND_STATUS_INITIALIZATION_FAILED:
32 | 		return "CURAND_STATUS_INITIALIZATION_FAILED";
33 | 	case CURAND_STATUS_ARCH_MISMATCH:
34 | 		return "CURAND_STATUS_ARCH_MISMATCH";
35 | 	case CURAND_STATUS_INTERNAL_ERROR:
36 | 		return "CURAND_STATUS_INTERNAL_ERROR";
37 | 	default:
38 | 		return "unknown cuRAND error";
39 | 	}
40 | }
41 | */
42 | import "C"
43 | 
44 | import "github.com/unixpickle/cuda"
45 | 
46 | // newError creates an Error from the result of a cuRAND
47 | // API call.
48 | //
49 | // If e is CURAND_STATUS_SUCCESS, nil is returned.
50 | func newError(context string, e C.curandStatus_t) error { 51 | msg := C.go_curand_err(e) 52 | if msg == C.goCurandNULLMessage { 53 | return nil 54 | } 55 | name := C.GoString(msg) 56 | return &cuda.Error{ 57 | Context: context, 58 | Name: name, 59 | Message: name, 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /test_data/kernels.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-21313162 5 | // Cuda compilation tools, release 8.0, V8.0.53 6 | // Based on LLVM 3.4svn 7 | // 8 | 9 | .version 4.3 10 | .target sm_30 11 | .address_size 64 12 | 13 | // .globl my_fancy_kernel 14 | 15 | .visible .entry my_fancy_kernel( 16 | .param .u32 my_fancy_kernel_param_0, 17 | .param .f32 my_fancy_kernel_param_1, 18 | .param .f64 my_fancy_kernel_param_2, 19 | .param .u32 my_fancy_kernel_param_3, 20 | .param .u32 my_fancy_kernel_param_4, 21 | .param .u64 my_fancy_kernel_param_5, 22 | .param .u64 my_fancy_kernel_param_6 23 | ) 24 | { 25 | .reg .pred %p<2>; 26 | .reg .f32 %f<6>; 27 | .reg .b32 %r<8>; 28 | .reg .f64 %fd<8>; 29 | .reg .b64 %rd<9>; 30 | 31 | 32 | ld.param.u32 %r4, [my_fancy_kernel_param_0]; 33 | ld.param.f32 %f1, [my_fancy_kernel_param_1]; 34 | ld.param.f64 %fd1, [my_fancy_kernel_param_2]; 35 | ld.param.u32 %r2, [my_fancy_kernel_param_3]; 36 | ld.param.u32 %r3, [my_fancy_kernel_param_4]; 37 | ld.param.u64 %rd1, [my_fancy_kernel_param_5]; 38 | ld.param.u64 %rd2, [my_fancy_kernel_param_6]; 39 | mov.u32 %r5, %ctaid.x; 40 | mov.u32 %r6, %ntid.x; 41 | mov.u32 %r7, %tid.x; 42 | mad.lo.s32 %r1, %r6, %r5, %r7; 43 | setp.ge.s32 %p1, %r1, %r4; 44 | @%p1 bra BB0_2; 45 | 46 | cvta.to.global.u64 %rd3, %rd1; 47 | cvt.f64.f32 %fd2, %f1; 48 | cvt.rn.f64.s32 %fd3, %r1; 49 | add.f64 %fd4, %fd2, %fd3; 50 | add.f64 %fd5, %fd4, %fd1; 51 | cvt.rn.f64.s32 %fd6, %r2; 52 | add.f64 %fd7, %fd6, %fd5; 53 | mul.wide.s32 %rd4, %r1, 8; 54 | add.s64 %rd5, %rd3, %rd4; 55 | st.global.f64 [%rd5], %fd7; 56 | cvt.rn.f32.u32 %f2, %r3; 57 | cvt.rn.f32.s32 %f3, %r2; 58 | add.f32 %f4, %f3, %f2; 59 | sub.f32 %f5, %f4, %f1; 60 | cvta.to.global.u64 %rd6, %rd2; 61 | mul.wide.s32 %rd7, %r1, 4; 62 | add.s64 %rd8, %rd6, %rd7; 63 | st.global.f32 [%rd8], %f5; 64 | 65 | BB0_2: 66 | ret; 67 | } 68 | 69 | 70 | -------------------------------------------------------------------------------- /module_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import ( 4 | "io/ioutil" 5 | "math" 6 | "testing" 7 | "unsafe" 8 | ) 9 | 10 | func TestModule(t *testing.T) { 11 | ptx, err := ioutil.ReadFile("test_data/kernels.ptx") 12 | if err != nil { 13 | t.Fatal(err) 14 | } 15 | ctx, a := setupTest(t) 16 | 17 | runTest := func(t *testing.T, stream *Stream) { 18 | mod, err := NewModule(ctx, string(ptx)) 19 | if err != nil { 20 | t.Error(err) 21 | return 22 | } 23 | 24 | doubleBuf, err := AllocBuffer(a, 8*1550) 25 | if err != nil { 26 | t.Error(err) 27 | return 28 | } 29 | floatBuf, err := AllocBuffer(a, 4*1550) 30 | if err != nil { 31 | t.Error(err) 32 | return 33 | } 34 | 35 | floatBuf.WithPtr(func(ptr unsafe.Pointer) { 36 | err = mod.Launch("my_fancy_kernel", 13, 1, 1, 128, 1, 1, 0, stream, int(1550), 37 | float32(3.7), float64(2.5), int(-3), uint(5), doubleBuf, ptr) 38 | }) 39 | 40 | if err != nil { 41 | t.Error(err) 42 | return 43 | } 44 | 45 | res32 := make([]float32, 1550) 46 | res64 := make([]float64, 1550) 47 | 48 | if err 
:= ReadBuffer(res32, floatBuf); err != nil { 49 | t.Error(err) 50 | return 51 | } 52 | if err := ReadBuffer(res64, doubleBuf); err != nil { 53 | t.Error(err) 54 | return 55 | } 56 | 57 | expFloat := float32(-3 + 5 - 3.7) 58 | for i, a := range res32 { 59 | if math.Abs(float64(a-expFloat)) > 1e-4 { 60 | t.Errorf("entry %d: expected %v but got %v", i, expFloat, a) 61 | break 62 | } 63 | } 64 | 65 | for i, a := range res64 { 66 | x := float64(i) + 3.7 + 2.5 - 3 67 | if math.Abs(x-a) > 1e-5 { 68 | t.Errorf("entry %d: expected %v but got %v", i, x, a) 69 | break 70 | } 71 | } 72 | return 73 | } 74 | 75 | t.Run("NoStream", func(t *testing.T) { 76 | <-ctx.Run(func() error { 77 | runTest(t, nil) 78 | return nil 79 | }) 80 | }) 81 | 82 | t.Run("Stream", func(t *testing.T) { 83 | <-ctx.Run(func() error { 84 | stream, err := NewStream(false) 85 | if err != nil { 86 | t.Error(err) 87 | return nil 88 | } 89 | defer stream.Close() 90 | runTest(t, stream) 91 | return nil 92 | }) 93 | }) 94 | } 95 | -------------------------------------------------------------------------------- /cublas/helpers_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "errors" 5 | "math" 6 | "testing" 7 | 8 | "github.com/unixpickle/cuda" 9 | ) 10 | 11 | var testContext *cuda.Context 12 | var testAllocator cuda.Allocator 13 | var testHandle *Handle 14 | 15 | func setupTest(t *testing.T, inBuffers ...interface{}) (*cuda.Context, *Handle, []cuda.Buffer) { 16 | if testContext == nil { 17 | devices, err := cuda.AllDevices() 18 | if err != nil { 19 | t.Fatal(err) 20 | } 21 | if len(devices) == 0 { 22 | t.Fatal("no CUDA devices") 23 | } 24 | testContext, err = cuda.NewContext(devices[0], -1) 25 | if err != nil { 26 | t.Fatal(err) 27 | } 28 | testAllocator = cuda.GCAllocator(cuda.NativeAllocator(testContext), 0) 29 | } 30 | if testHandle == nil { 31 | err := <-testContext.Run(func() (err error) { 32 | testHandle, err = NewHandle(testContext) 33 | return 34 | }) 35 | if err != nil { 36 | t.Fatal(err) 37 | } 38 | } 39 | 40 | outBufs := make([]cuda.Buffer, len(inBuffers)) 41 | for i, x := range inBuffers { 42 | err := <-testContext.Run(func() (err error) { 43 | switch x := x.(type) { 44 | case []float32: 45 | outBufs[i], err = cuda.AllocBuffer(testAllocator, uintptr(len(x)*4)) 46 | case []float64: 47 | outBufs[i], err = cuda.AllocBuffer(testAllocator, uintptr(len(x)*8)) 48 | case []int32: 49 | outBufs[i], err = cuda.AllocBuffer(testAllocator, uintptr(len(x)*4)) 50 | default: 51 | err = errors.New("unknown buffer type") 52 | } 53 | if err == nil { 54 | err = cuda.WriteBuffer(outBufs[i], x) 55 | } 56 | return 57 | }) 58 | if err != nil { 59 | t.Fatalf("buffer %d: %s", i, err) 60 | } 61 | } 62 | 63 | return testContext, testHandle, outBufs 64 | } 65 | 66 | func maxDelta32(v1, v2 []float32) float32 { 67 | var delta float32 68 | for i, x := range v1 { 69 | y := v2[i] 70 | diff := float32(math.Abs(float64(x - y))) 71 | if diff > delta { 72 | delta = diff 73 | } 74 | } 75 | return delta 76 | } 77 | 78 | func maxDelta64(v1, v2 []float64) float64 { 79 | var delta float64 80 | for i, x := range v1 { 81 | y := v2[i] 82 | diff := math.Abs(x - y) 83 | if diff > delta { 84 | delta = diff 85 | } 86 | } 87 | return delta 88 | } 89 | -------------------------------------------------------------------------------- /curand/curand_test.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | import ( 4 | "math/rand" 5 | 
"testing" 6 | 7 | "github.com/unixpickle/approb" 8 | "github.com/unixpickle/cuda" 9 | ) 10 | 11 | func TestGeneratorPseudo(t *testing.T) { 12 | devices, err := cuda.AllDevices() 13 | if err != nil { 14 | t.Fatal(err) 15 | } else if len(devices) == 0 { 16 | t.Fatal("no CUDA devices") 17 | } 18 | ctx, err := cuda.NewContext(devices[0], -1) 19 | if err != nil { 20 | t.Fatal(ctx) 21 | } 22 | allocator := cuda.GCAllocator(cuda.NativeAllocator(ctx), 0) 23 | err = <-ctx.Run(func() (resErr error) { 24 | defer func() { 25 | if err := recover(); err != nil { 26 | resErr = err.(error) 27 | } 28 | }() 29 | gen, err := NewGenerator(ctx, PseudoDefault) 30 | if err != nil { 31 | t.Error(err) 32 | return nil 33 | } 34 | samplers := testingSampleFuncs(allocator, gen) 35 | groundTruth := []func() float64{rand.NormFloat64, rand.NormFloat64, 36 | rand.Float64, rand.Float64} 37 | for i, sampler := range samplers { 38 | realSampler := groundTruth[i] 39 | corr := approb.Correlation(10000, 0.1, sampler, realSampler) 40 | if corr < 0.99 { 41 | t.Errorf("distribution %d was wrong", i) 42 | } 43 | } 44 | return nil 45 | }) 46 | if err != nil { 47 | t.Error(err) 48 | } 49 | } 50 | 51 | func testingSampleFuncs(allocator cuda.Allocator, g *Generator) []func() float64 { 52 | buf, err := cuda.AllocBuffer(allocator, 16) 53 | if err != nil { 54 | panic(err) 55 | } 56 | getValue32 := func() float32 { 57 | res := make([]float32, 1) 58 | if err := cuda.ReadBuffer(res, buf); err != nil { 59 | panic(err) 60 | } 61 | return res[0] 62 | } 63 | getValue64 := func() float64 { 64 | res := make([]float64, 1) 65 | if err := cuda.ReadBuffer(res, buf); err != nil { 66 | panic(err) 67 | } 68 | return res[0] 69 | } 70 | return []func() float64{ 71 | func() float64 { 72 | if err := g.Normal(buf, 0, 1); err != nil { 73 | panic(err) 74 | } 75 | return float64(getValue32()) 76 | }, 77 | func() float64 { 78 | if err := g.NormalDouble(buf, 0, 1); err != nil { 79 | panic(err) 80 | } 81 | return getValue64() 82 | }, 83 | func() float64 { 84 | if err := g.Uniform(buf); err != nil { 85 | panic(err) 86 | } 87 | return float64(getValue32()) 88 | }, 89 | func() float64 { 90 | if err := g.UniformDouble(buf); err != nil { 91 | panic(err) 92 | } 93 | return getValue64() 94 | }, 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /cublas/extensions_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | func TestSdgmm(t *testing.T) { 10 | ctx, handle, buffers := setupTest(t, 11 | []float32{1, 2, 3, 7, 4, 5, 6, 0}, 12 | []float32{0.5, 5, -2, 5, 3}, 13 | []float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) 14 | <-ctx.Run(func() error { 15 | err := handle.Sdgmm(Left, 3, 2, buffers[0], 4, buffers[1], 2, 16 | buffers[2], 5) 17 | if err != nil { 18 | t.Error(err) 19 | return nil 20 | } 21 | 22 | actual := make([]float32, 10) 23 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 24 | t.Error(err) 25 | return nil 26 | } 27 | expected := []float32{0.5, -4, 9, 0, 0, 2, -10, 18, 0, 0} 28 | 29 | if maxDelta32(actual, expected) > 1e-4 { 30 | t.Errorf("expected %v but got %v", expected, actual) 31 | } 32 | 33 | err = handle.Sdgmm(Right, 3, 2, buffers[0], 3, buffers[1], 3, 34 | buffers[2], 3) 35 | if err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | 40 | actual = make([]float32, 10) 41 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 42 | t.Error(err) 43 | return nil 44 
44 | 		}
45 | 		expected = []float32{0.5, 1, 1.5, 35, 20, 25, -10, 18, 0, 0}
46 | 
47 | 		if maxDelta32(actual, expected) > 1e-4 {
48 | 			t.Errorf("expected %v but got %v", expected, actual)
49 | 		}
50 | 		return nil
51 | 	})
52 | }
53 | 
54 | func TestDdgmm(t *testing.T) {
55 | 	ctx, handle, buffers := setupTest(t,
56 | 		[]float64{1, 2, 3, 7, 4, 5, 6, 0},
57 | 		[]float64{0.5, 5, -2, 5, 3},
58 | 		[]float64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
59 | 	<-ctx.Run(func() error {
60 | 		err := handle.Ddgmm(Left, 3, 2, buffers[0], 4, buffers[1], 2,
61 | 			buffers[2], 5)
62 | 		if err != nil {
63 | 			t.Error(err)
64 | 			return nil
65 | 		}
66 | 
67 | 		actual := make([]float64, 10)
68 | 		if err := cuda.ReadBuffer(actual, buffers[2]); err != nil {
69 | 			t.Error(err)
70 | 			return nil
71 | 		}
72 | 		expected := []float64{0.5, -4, 9, 0, 0, 2, -10, 18, 0, 0}
73 | 
74 | 		if maxDelta64(actual, expected) > 1e-4 {
75 | 			t.Errorf("expected %v but got %v", expected, actual)
76 | 		}
77 | 
78 | 		err = handle.Ddgmm(Right, 3, 2, buffers[0], 3, buffers[1], 3,
79 | 			buffers[2], 3)
80 | 		if err != nil {
81 | 			t.Error(err)
82 | 			return nil
83 | 		}
84 | 
85 | 		actual = make([]float64, 10)
86 | 		if err := cuda.ReadBuffer(actual, buffers[2]); err != nil {
87 | 			t.Error(err)
88 | 			return nil
89 | 		}
90 | 		expected = []float64{0.5, 1, 1.5, 35, 20, 25, -10, 18, 0, 0}
91 | 
92 | 		if maxDelta64(actual, expected) > 1e-4 {
93 | 			t.Errorf("expected %v but got %v", expected, actual)
94 | 		}
95 | 		return nil
96 | 	})
97 | }
98 | 
--------------------------------------------------------------------------------
/cublas/extensions.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | 
6 | const cublasSideMode_t goCublasLeft = CUBLAS_SIDE_LEFT;
7 | const cublasSideMode_t goCublasRight = CUBLAS_SIDE_RIGHT;
8 | */
9 | import "C"
10 | 
11 | import (
12 | 	"unsafe"
13 | 
14 | 	"github.com/unixpickle/cuda"
15 | )
16 | 
17 | // A SideMode specifies the side on which a matrix should
18 | // be applied to another matrix.
19 | type SideMode int
20 | 
21 | const (
22 | 	Left SideMode = iota
23 | 	Right
24 | )
25 | 
26 | func (s SideMode) cValue() C.cublasSideMode_t {
27 | 	switch s {
28 | 	case Left:
29 | 		return C.goCublasLeft
30 | 	case Right:
31 | 		return C.goCublasRight
32 | 	default:
33 | 		panic("invalid SideMode")
34 | 	}
35 | }
36 | 
37 | // Sdgmm multiplies a dense matrix by a diagonal matrix.
38 | //
39 | // The mode argument indicates on which side the diagonal
40 | // matrix should be placed.
41 | //
42 | // This must be called inside the cuda.Context.
43 | func (h *Handle) Sdgmm(mode SideMode, m, n int, matA cuda.Buffer, lda int,
44 | 	x cuda.Buffer, incx int, matC cuda.Buffer, ldc int) error {
45 | 	checkDgmm(mode, m, n, matA.Size()/4, lda, x.Size()/4, incx, matC.Size()/4, ldc)
46 | 	var res C.cublasStatus_t
47 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
48 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
49 | 			matC.WithPtr(func(cPtr unsafe.Pointer) {
50 | 				res = C.cublasSdgmm(h.handle, mode.cValue(),
51 | 					safeIntToC(m), safeIntToC(n),
52 | 					(*C.float)(aPtr), safeIntToC(lda),
53 | 					(*C.float)(xPtr), safeIntToC(incx),
54 | 					(*C.float)(cPtr), safeIntToC(ldc))
55 | 			})
56 | 		})
57 | 	})
58 | 	return newError("cublasSdgmm", res)
59 | }
60 | 
61 | // Ddgmm is like Sdgmm, but for double-precision.
62 | //
63 | // The mode argument indicates on which side the diagonal
64 | // matrix should be placed.
65 | //
66 | // This must be called inside the cuda.Context.
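// With mode == Left the result is C = diag(x) * A; with mode ==
// Right it is C = A * diag(x). A call shape taken from TestDdgmm
// above (3x2 matrix A with leading dimension 4, stride 2 through
// x; matA, x, and matC are assumed cuda.Buffers):
//
//	err := handle.Ddgmm(Left, 3, 2, matA, 4, x, 2, matC, 5)
//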
67 | func (h *Handle) Ddgmm(mode SideMode, m, n int, matA cuda.Buffer, lda int,
68 | 	x cuda.Buffer, incx int, matC cuda.Buffer, ldc int) error {
69 | 	checkDgmm(mode, m, n, matA.Size()/8, lda, x.Size()/8, incx, matC.Size()/8, ldc)
70 | 	var res C.cublasStatus_t
71 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
72 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
73 | 			matC.WithPtr(func(cPtr unsafe.Pointer) {
74 | 				res = C.cublasDdgmm(h.handle, mode.cValue(),
75 | 					safeIntToC(m), safeIntToC(n),
76 | 					(*C.double)(aPtr), safeIntToC(lda),
77 | 					(*C.double)(xPtr), safeIntToC(incx),
78 | 					(*C.double)(cPtr), safeIntToC(ldc))
79 | 			})
80 | 		})
81 | 	})
82 | 	return newError("cublasDdgmm", res)
83 | }
84 | 
85 | func checkDgmm(mode SideMode, m, n int, matA uintptr, lda int, x uintptr, incx int,
86 | 	matC uintptr, ldc int) {
87 | 	checkMatrix(NoTrans, lda, m, n, matA)
88 | 	checkMatrix(NoTrans, ldc, m, n, matC)
89 | 
90 | 	neededX := uintptr(m)
91 | 	if mode == Right {
92 | 		neededX = uintptr(n)
93 | 	}
94 | 	if stridedSize(x, incx) < neededX {
95 | 		panic("index out of bounds")
96 | 	}
97 | }
98 | 
--------------------------------------------------------------------------------
/context.go:
--------------------------------------------------------------------------------
1 | package cuda
2 | 
3 | /*
4 | #include <cuda.h>
5 | */
6 | import "C"
7 | import (
8 | 	"os"
9 | 	"runtime"
10 | 	"strconv"
11 | )
12 | 
13 | const defaultContextBuffer = 20
14 | 
15 | func init() {
16 | 	if err := newErrorDriver("cuInit", C.cuInit(0)); err != nil {
17 | 		panic(err)
18 | 	}
19 | }
20 | 
21 | // A Context maintains a CUDA-dedicated thread.
22 | // All CUDA code should be run by a Context.
23 | type Context struct {
24 | 	msgs chan<- *contextMsg
25 | 	ctx  C.CUcontext
26 | }
27 | 
28 | // NewContext creates a new Context on the Device.
29 | //
30 | // The bufferSize is the maximum number of asynchronous
31 | // calls that can be queued up at once.
32 | // A larger buffer size means that Run() is less likely
33 | // to block, all else equal.
34 | //
35 | // If bufferSize is -1, then the CUDA_CTX_BUFFER
36 | // environment variable is used.
37 | // If bufferSize is -1 and CUDA_CTX_BUFFER is not set, a
38 | // reasonable default is used.
39 | func NewContext(d *Device, bufferSize int) (*Context, error) {
40 | 	if bufferSize < -1 {
41 | 		panic("buffer size out of range")
42 | 	} else if bufferSize == -1 {
43 | 		bufferSize = defaultContextBuffer
44 | 		if bs := os.Getenv("CUDA_CTX_BUFFER"); bs != "" {
45 | 			parsed, err := strconv.Atoi(bs)
46 | 			if err == nil && parsed >= 0 {
47 | 				bufferSize = parsed
48 | 			}
49 | 		}
50 | 	}
51 | 	msgs := make(chan *contextMsg, bufferSize)
52 | 	go contextLoop(msgs)
53 | 	res := &Context{msgs: msgs}
54 | 	err := <-res.Run(func() error {
55 | 		return newErrorDriver("cuCtxCreate", C.cuCtxCreate(&res.ctx, 0, d.id))
56 | 	})
57 | 	if err != nil {
58 | 		close(msgs)
59 | 		return nil, err
60 | 	}
61 | 	runtime.SetFinalizer(res, func(obj *Context) {
62 | 		obj.Run(func() error {
63 | 			C.cuCtxDestroy(obj.ctx)
64 | 			return nil
65 | 		})
66 | 		close(obj.msgs)
67 | 	})
68 | 	return res, nil
69 | }
70 | 
71 | // Run runs f in the Context and returns a channel that
72 | // will be sent the result of f when f completes.
73 | //
74 | // This may block until some queued up functions have
75 | // finished running on the Context.
76 | //
77 | // If you are not interested in the result of f, you can
78 | // simply ignore the returned channel.
79 | //
80 | // While f is running, no other function can run on the
81 | // Context.
82 | // This means that, to avoid deadlock, f should not use 83 | // the Context. 84 | func (c *Context) Run(f func() error) <-chan error { 85 | ch := make(chan error, 1) 86 | msg := &contextMsg{ 87 | f: f, 88 | doneChan: ch, 89 | } 90 | c.msgs <- msg 91 | runtime.KeepAlive(c) 92 | return ch 93 | } 94 | 95 | type contextMsg struct { 96 | f func() error 97 | doneChan chan<- error 98 | } 99 | 100 | func contextLoop(msgs <-chan *contextMsg) { 101 | runtime.LockOSThread() 102 | for msg := range msgs { 103 | msg.doneChan <- msg.f() 104 | close(msg.doneChan) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /buffer_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestBufferIO(t *testing.T) { 9 | ctx, a := setupTest(t) 10 | <-ctx.Run(func() error { 11 | const floatSize = 4 12 | buf1, err := AllocBuffer(a, floatSize*10) 13 | if err != nil { 14 | t.Error(err) 15 | return nil 16 | } 17 | buf2, err := AllocBuffer(a, floatSize*15) 18 | if err != nil { 19 | t.Error(err) 20 | return nil 21 | } 22 | if err := ClearBuffer(buf1); err != nil { 23 | t.Error(err) 24 | return nil 25 | } 26 | if err := ClearBuffer(buf2); err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | if err := WriteBuffer(buf1, []float32{1, 2, 3, 0, 0, 4, 5}); err != nil { 31 | t.Error(err) 32 | return nil 33 | } 34 | actual := make([]float32, 8) 35 | if err := ReadBuffer(actual, buf1); err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | expected := []float32{1, 2, 3, 0, 0, 4, 5, 0} 40 | if !reflect.DeepEqual(actual, expected) { 41 | t.Errorf("expected %v but got %v", expected, actual) 42 | } 43 | 44 | if err := CopyBuffer(buf2, buf1); err != nil { 45 | t.Error(err) 46 | return nil 47 | } 48 | 49 | actual = make([]float32, 15) 50 | if err := ReadBuffer(actual, buf2); err != nil { 51 | t.Error(err) 52 | return nil 53 | } 54 | expected = []float32{1, 2, 3, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0} 55 | if !reflect.DeepEqual(actual, expected) { 56 | t.Errorf("expected %v but got %v", expected, actual) 57 | } 58 | 59 | return nil 60 | }) 61 | } 62 | 63 | func TestSlice(t *testing.T) { 64 | ctx, a := setupTest(t) 65 | <-ctx.Run(func() error { 66 | buf1, err := AllocBuffer(a, 32) 67 | if err != nil { 68 | t.Error(err) 69 | return nil 70 | } 71 | if err := ClearBuffer(buf1); err != nil { 72 | t.Error(err) 73 | return nil 74 | } 75 | if err := WriteBuffer(Slice(buf1, 8, 15), []byte{1, 2, 3, 4}); err != nil { 76 | t.Error(err) 77 | return nil 78 | } 79 | actual := make([]byte, 12) 80 | if err := ReadBuffer(actual, Slice(buf1, 4, 12)); err != nil { 81 | t.Error(err) 82 | return nil 83 | } 84 | expected := []byte{0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0} 85 | if !reflect.DeepEqual(actual, expected) { 86 | t.Errorf("expected %v but got %v", expected, actual) 87 | } 88 | 89 | if err := CopyBuffer(Slice(buf1, 0, 4), Slice(buf1, 8, 16)); err != nil { 90 | t.Error(err) 91 | return nil 92 | } 93 | 94 | actual = make([]byte, 14) 95 | if err := ReadBuffer(actual, buf1); err != nil { 96 | t.Error(err) 97 | return nil 98 | } 99 | expected = []byte{1, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0} 100 | if !reflect.DeepEqual(actual, expected) { 101 | t.Errorf("expected %v but got %v", expected, actual) 102 | } 103 | 104 | if !Overlap(Slice(buf1, 0, 5), Slice(buf1, 3, 5)) { 105 | t.Error("should overlap") 106 | } 107 | if Overlap(Slice(buf1, 0, 5), Slice(buf1, 5, 10)) { 108 | t.Error("should not 
overlap") 109 | } 110 | 111 | return nil 112 | }) 113 | } 114 | -------------------------------------------------------------------------------- /stream.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | 6 | const unsigned int streamNonBlockingFlag = CU_STREAM_NON_BLOCKING; 7 | const CUstream nullStream = NULL; 8 | */ 9 | import "C" 10 | import "unsafe" 11 | 12 | // Synchronize waits for asynchronous operations to 13 | // complete. 14 | // 15 | // This should be called in a Context. 16 | func Synchronize() error { 17 | return newErrorDriver("cuCtxSynchronize", C.cuCtxSynchronize()) 18 | } 19 | 20 | // A Stream manages a pipeline of CUDA operations. 21 | // Streams can be employed to achieve parallelism. 22 | type Stream struct { 23 | stream C.CUstream 24 | closed bool 25 | } 26 | 27 | // NewStream creates a new Stream. 28 | // 29 | // If nonBlocking is true, then this stream will be able 30 | // to run concurrently with the default stream. 31 | // 32 | // This should be called in a Context. 33 | func NewStream(nonBlocking bool) (*Stream, error) { 34 | res := &Stream{} 35 | status := C.cuStreamCreate(&res.stream, streamCreationFlags(nonBlocking)) 36 | if err := newErrorDriver("cuStreamCreate", status); err != nil { 37 | return nil, err 38 | } 39 | return res, nil 40 | } 41 | 42 | // NewStreamPriority is like NewStream, but the resulting 43 | // stream is assigned a certain priority. 44 | // 45 | // This should be called in a Context. 46 | func NewStreamPriority(nonBlocking bool, priority int) (*Stream, error) { 47 | res := &Stream{} 48 | status := C.cuStreamCreateWithPriority(&res.stream, streamCreationFlags(nonBlocking), 49 | safeIntToC(priority)) 50 | if err := newErrorDriver("cuStreamCreate", status); err != nil { 51 | return nil, err 52 | } 53 | return res, nil 54 | } 55 | 56 | // Synchronize waits for the stream's tasks to complete. 57 | func (s *Stream) Synchronize() error { 58 | s.assertOpen() 59 | return newErrorDriver("cuStreamSynchronize", C.cuStreamSynchronize(s.stream)) 60 | } 61 | 62 | // Close destroys the stream. 63 | // 64 | // This will return immediately, even if the stream is 65 | // still doing work. 66 | // 67 | // A stream should not be used after it is closed. 68 | // 69 | // This should be called in a Context. 70 | func (s *Stream) Close() error { 71 | if s.closed { 72 | return nil 73 | } 74 | s.closed = true 75 | return newErrorDriver("cuStreamDestroy", C.cuStreamDestroy(s.stream)) 76 | } 77 | 78 | // Pointer returns the raw pointer value of the underlying 79 | // stream object. 80 | // 81 | // If s is nil, then a NULL pointer is returned. 82 | // 83 | // This should be called in a Context. 
84 | func (s *Stream) Pointer() unsafe.Pointer { 85 | if s == nil { 86 | return unsafe.Pointer(C.nullStream) 87 | } 88 | s.assertOpen() 89 | return unsafe.Pointer(s.stream) 90 | } 91 | 92 | func (s *Stream) cuStream() C.CUstream { 93 | if s == nil { 94 | return C.nullStream 95 | } 96 | s.assertOpen() 97 | return s.stream 98 | } 99 | 100 | func (s *Stream) assertOpen() { 101 | if s != nil && s.closed { 102 | panic("stream closed") 103 | } 104 | } 105 | 106 | func streamCreationFlags(nonBlocking bool) C.uint { 107 | if nonBlocking { 108 | return C.streamNonBlockingFlag 109 | } else { 110 | return 0 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /cublas/level2_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | func TestSgemv(t *testing.T) { 10 | ctx, handle, buffers := setupTest(t, 11 | []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 12 | []float32{3, 2, 1}, 13 | []float32{0, 0, 7, 6, 0, 0, 0, 0, 0, 0}, 14 | []float32{2.5}, 15 | []float32{3.1}) 16 | <-ctx.Run(func() error { 17 | alpha := float32(2.5) 18 | err := handle.Sgemv(NoTrans, 3, 2, &alpha, buffers[0], 4, buffers[1], -2, 19 | float32(1), buffers[2], 3) 20 | if err != nil { 21 | t.Error(err) 22 | return nil 23 | } 24 | 25 | actual := make([]float32, 10) 26 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | expected := []float32{40, 0, 7, 56, 0, 0, 60, 0, 0, 0} 31 | if maxDelta32(actual, expected) > 1e-4 { 32 | t.Errorf("expected %v but got %v", expected, actual) 33 | } 34 | 35 | if err := handle.SetPointerMode(Device); err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | defer handle.SetPointerMode(Host) 40 | 41 | err = handle.Sgemv(Trans, 3, 2, buffers[3], buffers[0], 5, 42 | buffers[1], -1, buffers[4], buffers[2], 5) 43 | if err != nil { 44 | t.Error(err) 45 | return nil 46 | } 47 | 48 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 49 | t.Error(err) 50 | return nil 51 | } 52 | expected = []float32{159, 0, 7, 56, 0, 110, 60, 0, 0, 0} 53 | if maxDelta32(actual, expected) > 1e-4 { 54 | t.Errorf("expected %v but got %v", expected, actual) 55 | } 56 | 57 | return nil 58 | }) 59 | } 60 | 61 | func TestDgemv(t *testing.T) { 62 | ctx, handle, buffers := setupTest(t, 63 | []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 64 | []float64{3, 2, 1}, 65 | []float64{0, 0, 7, 6, 0, 0, 0, 0, 0, 0}, 66 | []float64{2.5}, 67 | []float64{3.1}) 68 | <-ctx.Run(func() error { 69 | alpha := float64(2.5) 70 | err := handle.Dgemv(NoTrans, 3, 2, &alpha, buffers[0], 4, buffers[1], -2, 71 | float64(1), buffers[2], 3) 72 | if err != nil { 73 | t.Error(err) 74 | return nil 75 | } 76 | 77 | actual := make([]float64, 10) 78 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 79 | t.Error(err) 80 | return nil 81 | } 82 | expected := []float64{40, 0, 7, 56, 0, 0, 60, 0, 0, 0} 83 | if maxDelta64(actual, expected) > 1e-4 { 84 | t.Errorf("expected %v but got %v", expected, actual) 85 | } 86 | 87 | if err := handle.SetPointerMode(Device); err != nil { 88 | t.Error(err) 89 | return nil 90 | } 91 | defer handle.SetPointerMode(Host) 92 | 93 | err = handle.Dgemv(Trans, 3, 2, buffers[3], buffers[0], 5, 94 | buffers[1], -1, buffers[4], buffers[2], 5) 95 | if err != nil { 96 | t.Error(err) 97 | return nil 98 | } 99 | 100 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 101 | t.Error(err) 102 | return 
nil 103 | } 104 | expected = []float64{159, 0, 7, 56, 0, 110, 60, 0, 0, 0} 105 | if maxDelta64(actual, expected) > 1e-4 { 106 | t.Errorf("expected %v but got %v", expected, actual) 107 | } 108 | 109 | return nil 110 | }) 111 | } 112 | -------------------------------------------------------------------------------- /cublas/level3_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | func TestSgemm(t *testing.T) { 10 | ctx, handle, buffers := setupTest(t, 11 | []float32{1, 2, 3, 0, 4, 5, 6, 0}, 12 | []float32{-2, 0, 1, 2, -1, -1}, 13 | []float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 14 | []float32{2.5}, 15 | []float32{3.1}) 16 | <-ctx.Run(func() error { 17 | alpha := float32(2.5) 18 | err := handle.Sgemm(NoTrans, Trans, 3, 3, 2, &alpha, buffers[0], 4, buffers[1], 3, 19 | float32(0), buffers[2], 3) 20 | if err != nil { 21 | t.Error(err) 22 | return nil 23 | } 24 | 25 | actual := make([]float32, 10) 26 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | expected := []float32{15, 15, 15, -10, -12.5, -15, -7.5, -7.5, -7.5, 0} 31 | if maxDelta32(actual, expected) > 1e-4 { 32 | t.Errorf("expected %v but got %v", expected, actual) 33 | } 34 | 35 | if err := handle.SetPointerMode(Device); err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | defer handle.SetPointerMode(Host) 40 | 41 | err = handle.Sgemm(Trans, NoTrans, 2, 2, 3, buffers[3], buffers[0], 4, 42 | buffers[1], 3, buffers[4], buffers[2], 5) 43 | if err != nil { 44 | t.Error(err) 45 | return nil 46 | } 47 | 48 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 49 | t.Error(err) 50 | return nil 51 | } 52 | expected = []float32{49, 41.5, 15, -10, -12.5, -54, -30.750, -7.5, -7.5, 0} 53 | if maxDelta32(actual, expected) > 1e-4 { 54 | t.Errorf("expected %v but got %v", expected, actual) 55 | } 56 | 57 | return nil 58 | }) 59 | } 60 | 61 | func TestDgemm(t *testing.T) { 62 | ctx, handle, buffers := setupTest(t, 63 | []float64{1, 2, 3, 0, 4, 5, 6, 0}, 64 | []float64{-2, 0, 1, 2, -1, -1}, 65 | []float64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 66 | []float64{2.5}, 67 | []float64{3.1}) 68 | <-ctx.Run(func() error { 69 | alpha := float64(2.5) 70 | err := handle.Dgemm(NoTrans, Trans, 3, 3, 2, &alpha, buffers[0], 4, buffers[1], 3, 71 | float64(0), buffers[2], 3) 72 | if err != nil { 73 | t.Error(err) 74 | return nil 75 | } 76 | 77 | actual := make([]float64, 10) 78 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 79 | t.Error(err) 80 | return nil 81 | } 82 | expected := []float64{15, 15, 15, -10, -12.5, -15, -7.5, -7.5, -7.5, 0} 83 | if maxDelta64(actual, expected) > 1e-4 { 84 | t.Errorf("expected %v but got %v", expected, actual) 85 | } 86 | 87 | if err := handle.SetPointerMode(Device); err != nil { 88 | t.Error(err) 89 | return nil 90 | } 91 | defer handle.SetPointerMode(Host) 92 | 93 | err = handle.Dgemm(Trans, NoTrans, 2, 2, 3, buffers[3], buffers[0], 4, 94 | buffers[1], 3, buffers[4], buffers[2], 5) 95 | if err != nil { 96 | t.Error(err) 97 | return nil 98 | } 99 | 100 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 101 | t.Error(err) 102 | return nil 103 | } 104 | expected = []float64{49, 41.5, 15, -10, -12.5, -54, -30.750, -7.5, -7.5, 0} 105 | if maxDelta64(actual, expected) > 1e-4 { 106 | t.Errorf("expected %v but got %v", expected, actual) 107 | } 108 | 109 | return nil 110 | }) 111 | } 112 | 
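The gemm tests above pin down the calling convention for the level-3 routines: matrices are column-major, lda/ldb/ldc are leading dimensions, and alpha/beta follow the handle's pointer mode (Go scalars or pointers in Host mode, cuda.Buffers in Device mode). Below is a condensed host-mode sketch of C = alpha*A*B + beta*C; the names ctx, handle, bufA, bufB, bufC, and the dimensions m, n, k are assumptions, set up as in the test helpers above, not part of the package itself:

```go
alpha, beta := float32(2.5), float32(0)
err := <-ctx.Run(func() error {
	// C (m×n) = alpha * A (m×k) * B (k×n) + beta * C, all column-major.
	return handle.Sgemm(cublas.NoTrans, cublas.NoTrans, m, n, k,
		&alpha, bufA, m, bufB, k, &beta, bufC, m)
})
if err != nil {
	// Handle the cuBLAS error.
}
```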
-------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Package cuda provides bindings to the CUDA library. 2 | // 3 | // Building 4 | // 5 | // To use this package, you must tell Go how to link with 6 | // CUDA. 7 | // On Mac OS X, this might look like: 8 | // 9 | // export CUDA_PATH="/Developer/NVIDIA/CUDA-8.0" 10 | // export DYLD_LIBRARY_PATH="$CUDA_PATH/lib":$DYLD_LIBRARY_PATH 11 | // export CPATH="$CUDA_PATH/include/" 12 | // export CGO_LDFLAGS="/usr/local/cuda/lib/libcuda.dylib $CUDA_PATH/lib/libcudart.dylib $CUDA_PATH/lib/libcublas.dylib $CUDA_PATH/lib/libcurand.dylib" 13 | // 14 | // On Linux, this might look like: 15 | // 16 | // export CUDA_PATH=/usr/local/cuda 17 | // export CPATH="$CUDA_PATH/include/" 18 | // export CGO_LDFLAGS="$CUDA_PATH/lib64/libcublas.so $CUDA_PATH/lib64/libcudart.so $CUDA_PATH/lib64/stubs/libcuda.so $CUDA_PATH/lib64/libcurand.so" 19 | // export LD_LIBRARY_PATH=$CUDA_PATH/lib64/ 20 | // 21 | // Contexts 22 | // 23 | // Virtually every cuda API must be run from within a 24 | // Context, which can be created like so: 25 | // 26 | // devices, err := cuda.AllDevices() 27 | // if err != nil { 28 | // // Handle error. 29 | // } 30 | // if len(devices) == 0 { 31 | // // No devices found. 32 | // } 33 | // ctx, err := cuda.NewContext(devices[0], 10) 34 | // if err != nil { 35 | // // Handle error. 36 | // } 37 | // 38 | // To run code in a Context asynchronously, you can do the 39 | // following: 40 | // 41 | // ctx.Run(func() error { 42 | // // My code here. 43 | // }) 44 | // 45 | // To run code synchronously, simply read from the 46 | // resulting channel: 47 | // 48 | // <-ctx.Run(func() error { 49 | // // My code here. 50 | // }) 51 | // 52 | // You should never call ctx.Run() inside another call to 53 | // ctx.Run(), for reasons that are documented on the 54 | // Context.Run() method. 55 | // 56 | // Memory Management 57 | // 58 | // There are two ways to deal with memory: using Buffers, 59 | // or using an Allocator directly with unsafe.Pointers. 60 | // The Buffer API provides a high-level buffer interface 61 | // with garbage collection and bounds checking. 62 | // Most APIs use Buffers, including the APIs provided by 63 | // sub-packages. 64 | // 65 | // No matter what, you will need an Allocator if you want 66 | // to allocate memory. 67 | // You can create an Allocator directly on top of CUDA: 68 | // 69 | // allocator := cuda.GCAllocator(cuda.NativeAllocator(ctx), 0) 70 | // 71 | // Once you have an allocator, you can use it to allocate 72 | // Buffer objects like so: 73 | // 74 | // err := <-ctx.Run(func() error { 75 | // // Allocate 16 bytes. 76 | // buffer, err := cuda.AllocBuffer(allocator, 16) 77 | // if err != nil { 78 | // return err 79 | // } 80 | // // Use the buffer here... 81 | // }) 82 | // 83 | // There are various functions to help you deal with 84 | // buffers. 85 | // The WriteBuffer() and ReadBuffer() functions allow you 86 | // to copy Go slices to and from buffers. 87 | // The Slice() function allows you to get a Buffer which 88 | // points to a sub-region of a parent Buffer. 89 | // 90 | // Kernels 91 | // 92 | // To run kernels, you will use a Module. 93 | // You can pass various Go primitives, unsafe.Pointers, 94 | // and Buffers as kernel arguments. 
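//
// For example, a hypothetical launch sketch (ptxSource, the kernel
// name, the grid/block sizes, and the buf argument are all
// placeholders, not part of the package):
//
//     err := <-ctx.Run(func() error {
//         mod, err := cuda.NewModule(ctx, ptxSource)
//         if err != nil {
//             return err
//         }
//         return mod.Launch("my_kernel", gridX, 1, 1, blockX, 1, 1, 0, nil, buf)
//     })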
95 | //
96 | // Sub-packages
97 | //
98 | // The cublas and curand sub-packages provide basic linear
99 | // algebra routines and random number generators,
100 | // respectively.
101 | package cuda
102 | 
--------------------------------------------------------------------------------
/cublas/level2.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | */
6 | import "C"
7 | 
8 | import (
9 | 	"unsafe"
10 | 
11 | 	"github.com/unixpickle/cuda"
12 | )
13 | 
14 | // Sgemv performs single-precision matrix-vector
15 | // multiplication.
16 | //
17 | // Matrices are stored in column-major order.
18 | //
19 | // The leading dimension lda may not be 0.
20 | //
21 | // The type of alpha and beta depends on the pointer mode.
22 | // In Host mode, use float32 or *float32.
23 | // In Device mode, use a cuda.Buffer.
24 | //
25 | // This must be called inside the cuda.Context.
26 | func (h *Handle) Sgemv(trans Operation, m, n int, alpha interface{},
27 | 	matA cuda.Buffer, lda int, x cuda.Buffer, incx int, beta interface{},
28 | 	y cuda.Buffer, incy int) error {
29 | 	checkGemv(trans, m, n, matA.Size()/4, lda, x.Size()/4, incx, y.Size()/4, incy)
30 | 
31 | 	var res C.cublasStatus_t
32 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
33 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
34 | 			y.WithPtr(func(yPtr unsafe.Pointer) {
35 | 				if h.PointerMode() == Host {
36 | 					pointerizeInputs(&alpha, &beta)
37 | 					res = C.cublasSgemv(h.handle,
38 | 						trans.cValue(),
39 | 						safeIntToC(m), safeIntToC(n),
40 | 						(*C.float)(alpha.(*float32)),
41 | 						(*C.float)(aPtr), safeIntToC(lda),
42 | 						(*C.float)(xPtr), safeIntToC(incx),
43 | 						(*C.float)(beta.(*float32)),
44 | 						(*C.float)(yPtr), safeIntToC(incy))
45 | 				} else {
46 | 					alphaBeta32(alpha, beta, func(alpha, beta *C.float) {
47 | 						res = C.cublasSgemv(h.handle,
48 | 							trans.cValue(),
49 | 							safeIntToC(m), safeIntToC(n),
50 | 							alpha,
51 | 							(*C.float)(aPtr), safeIntToC(lda),
52 | 							(*C.float)(xPtr), safeIntToC(incx),
53 | 							beta,
54 | 							(*C.float)(yPtr), safeIntToC(incy))
55 | 					})
56 | 				}
57 | 			})
58 | 		})
59 | 	})
60 | 
61 | 	return newError("cublasSgemv", res)
62 | }
63 | 
64 | // Dgemv performs double-precision matrix-vector
65 | // multiplication.
66 | //
67 | // Matrices are stored in column-major order.
68 | //
69 | // The leading dimension lda may not be 0.
70 | //
71 | // The type of alpha and beta depends on the pointer mode.
72 | // In Host mode, use float64 or *float64.
73 | // In Device mode, use a cuda.Buffer.
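//
// A host-mode call shape taken from TestDgemv in this package
// (y = 2.5*A*x + y, with A 3x2 at leading dimension 4 and x
// traversed with stride -2; matA, x, and y are assumed buffers):
//
//	alpha := float64(2.5)
//	err := h.Dgemv(NoTrans, 3, 2, &alpha, matA, 4, x, -2, float64(1), y, 3)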
74 | //
75 | // This must be called inside the cuda.Context.
76 | func (h *Handle) Dgemv(trans Operation, m, n int, alpha interface{},
77 | 	matA cuda.Buffer, lda int, x cuda.Buffer, incx int, beta interface{},
78 | 	y cuda.Buffer, incy int) error {
79 | 	checkGemv(trans, m, n, matA.Size()/8, lda, x.Size()/8, incx, y.Size()/8, incy)
80 | 
81 | 	var res C.cublasStatus_t
82 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
83 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
84 | 			y.WithPtr(func(yPtr unsafe.Pointer) {
85 | 				if h.PointerMode() == Host {
86 | 					pointerizeInputs(&alpha, &beta)
87 | 					res = C.cublasDgemv(h.handle,
88 | 						trans.cValue(),
89 | 						safeIntToC(m), safeIntToC(n),
90 | 						(*C.double)(alpha.(*float64)),
91 | 						(*C.double)(aPtr), safeIntToC(lda),
92 | 						(*C.double)(xPtr), safeIntToC(incx),
93 | 						(*C.double)(beta.(*float64)),
94 | 						(*C.double)(yPtr), safeIntToC(incy))
95 | 				} else {
96 | 					alphaBeta64(alpha, beta, func(alpha, beta *C.double) {
97 | 						res = C.cublasDgemv(h.handle,
98 | 							trans.cValue(),
99 | 							safeIntToC(m), safeIntToC(n),
100 | 							alpha,
101 | 							(*C.double)(aPtr), safeIntToC(lda),
102 | 							(*C.double)(xPtr), safeIntToC(incx),
103 | 							beta,
104 | 							(*C.double)(yPtr), safeIntToC(incy))
105 | 					})
106 | 				}
107 | 			})
108 | 		})
109 | 	})
110 | 
111 | 	return newError("cublasDgemv", res)
112 | }
113 | 
114 | func checkGemv(trans Operation, m, n int, matA uintptr, lda int, x uintptr,
115 | 	incx int, y uintptr, incy int) {
116 | 	if trans != NoTrans {
117 | 		m, n = n, m
118 | 	}
119 | 	checkMatrix(trans, lda, m, n, matA)
120 | 	if stridedSize(x, incx) < uintptr(n) {
121 | 		panic("index out of bounds")
122 | 	}
123 | 	if stridedSize(y, incy) < uintptr(m) {
124 | 		panic("index out of bounds")
125 | 	}
126 | }
127 | 
--------------------------------------------------------------------------------
/allocator_bfc.go:
--------------------------------------------------------------------------------
1 | package cuda
2 | 
3 | /*
4 | #include <cuda.h>
5 | #include <cuda_runtime_api.h>
6 | */
7 | import "C"
8 | import (
9 | 	"errors"
10 | 	"os"
11 | 	"runtime"
12 | 	"strconv"
13 | 	"unsafe"
14 | 
15 | 	"github.com/unixpickle/memalloc"
16 | )
17 | 
18 | const (
19 | 	minAllocatorSize = 1 << 20
20 | 	maxAllocators    = 5
21 | 
22 | 	allocAlignment = 32
23 | 	allocHeadroom  = 1 << 22
24 | )
25 | 
26 | type bfcAllocator struct {
27 | 	a   []*memalloc.MemAllocator
28 | 	ctx *Context
29 | }
30 | 
31 | // BFCAllocator creates an Allocator that uses memory
32 | // coalescing and best-fitting to reduce memory
33 | // fragmentation.
34 | //
35 | // You should wrap the returned allocator with GCAllocator
36 | // if you plan to use the Buffer API.
37 | //
38 | // The maxSize argument specifies the maximum amount of
39 | // memory to claim for the allocator.
40 | // If it is 0, the allocator may claim nearly all of the
41 | // available device memory.
42 | //
43 | // If the CUDA_BFC_HEADROOM environment variable is set,
44 | // it is used as the minimum number of bytes to leave
45 | // free.
46 | //
47 | // If the CUDA_BFC_MAX environment variable is set, it is
48 | // used as an upper memory bound (in addition to maxSize).
49 | //
50 | // This should be called from a Context.
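//
// A minimal sketch (a maxSize of 0 lets the allocator claim nearly
// all free device memory; wrapping with GCAllocator enables the
// Buffer API's garbage collection):
//
//	err := <-ctx.Run(func() error {
//		bfc, err := cuda.BFCAllocator(ctx, 0)
//		if err != nil {
//			return err
//		}
//		allocator := cuda.GCAllocator(bfc, 0)
//		buf, err := cuda.AllocBuffer(allocator, 1<<20)
//		_ = buf
//		return err
//	})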
51 | func BFCAllocator(ctx *Context, maxSize uintptr) (Allocator, error) {
52 | 	if maxSizeEnv := os.Getenv("CUDA_BFC_MAX"); maxSizeEnv != "" {
53 | 		size, err := strconv.ParseUint(maxSizeEnv, 10, 64)
54 | 		if err == nil && (maxSize == 0 || uintptr(size) < maxSize) {
55 | 			maxSize = uintptr(size)
56 | 		}
57 | 	}
58 | 
59 | 	if maxSize == 0 {
60 | 		var err error
61 | 		maxSize, err = maxBFCMemory()
62 | 		if err != nil {
63 | 			return nil, err
64 | 		}
65 | 	}
66 | 
67 | 	// The allocator size must fit in an int.
68 | 	for int(maxSize) < 0 || uintptr(int(maxSize)) != maxSize {
69 | 		maxSize >>= 1
70 | 	}
71 | 
72 | 	var allocs []*memalloc.MemAllocator
73 | 	for len(allocs) < maxAllocators && maxSize >= minAllocatorSize {
74 | 		// No reason to reserve a misaligned amount of bytes.
75 | 		// Doing so would probably cause fragmentation, knowing
76 | 		// how bad cudaMalloc() is with fragmentation.
77 | 		maxSize = (maxSize / allocAlignment) * allocAlignment
78 | 
79 | 		var region unsafe.Pointer
80 | 		err := newErrorRuntime("cudaMalloc", C.cudaMalloc(&region, C.size_t(maxSize)))
81 | 		if err != nil {
82 | 			maxSize >>= 1
83 | 			continue
84 | 		}
85 | 		allocs = append(allocs, &memalloc.MemAllocator{
86 | 			Start:     region,
87 | 			Size:      int(maxSize),
88 | 			Allocator: memalloc.NewBFC(int(maxSize), allocAlignment),
89 | 		})
90 | 
91 | 		newMax, err := maxBFCMemory()
92 | 		if err != nil {
93 | 			return nil, err
94 | 		} else if newMax < maxSize {
95 | 			maxSize = newMax
96 | 		}
97 | 	}
98 | 	if len(allocs) == 0 {
99 | 		return nil, errors.New("BFC init: not enough free memory")
100 | 	}
101 | 
102 | 	res := &bfcAllocator{a: allocs, ctx: ctx}
103 | 
104 | 	runtime.SetFinalizer(res, func(b *bfcAllocator) {
105 | 		go ctx.Run(func() error {
106 | 			for _, x := range b.a {
107 | 				C.cudaFree(x.Start)
108 | 			}
109 | 			return nil
110 | 		})
111 | 	})
112 | 
113 | 	return res, nil
114 | }
115 | 
116 | func (b *bfcAllocator) Context() *Context {
117 | 	return b.ctx
118 | }
119 | 
120 | func (b *bfcAllocator) Alloc(size uintptr) (unsafe.Pointer, error) {
121 | 	if int(size) < 0 || uintptr(int(size)) != size {
122 | 		return nil, errors.New("BFC alloc: size must fit in int")
123 | 	}
124 | 	for _, x := range b.a {
125 | 		ptr, err := x.Alloc(int(size))
126 | 		if err == nil {
127 | 			return ptr, nil
128 | 		}
129 | 	}
130 | 	return nil, errors.New("BFC alloc: out of memory")
131 | }
132 | 
133 | func (b *bfcAllocator) Free(ptr unsafe.Pointer, size uintptr) {
134 | 	for _, x := range b.a {
135 | 		if x.Contains(ptr) {
136 | 			x.Free(ptr)
137 | 			return
138 | 		}
139 | 	}
140 | 	panic("invalid pointer was freed")
141 | }
142 | 
143 | func maxBFCMemory() (uintptr, error) {
144 | 	headroom := uintptr(allocHeadroom)
145 | 	if roomStr := os.Getenv("CUDA_BFC_HEADROOM"); roomStr != "" {
146 | 		val, err := strconv.ParseUint(roomStr, 10, 64)
147 | 		if err == nil {
148 | 			headroom = uintptr(val)
149 | 		}
150 | 	}
151 | 
152 | 	free, _, err := MemInfo()
153 | 	if err != nil {
154 | 		return 0, err
155 | 	}
156 | 	res := uintptr(free)
157 | 	if res < headroom {
158 | 		return 0, nil
159 | 	}
160 | 	return res - headroom, nil
161 | }
162 | 
--------------------------------------------------------------------------------
/errors.go:
--------------------------------------------------------------------------------
1 | package cuda
2 | 
3 | /*
4 | #include <cuda.h>
5 | #include <cuda_runtime_api.h>
6 | 
7 | // Needed to check for NULL from Cgo.
8 | const char * nullMessage = NULL; 9 | 10 | const char * go_cuda_cu_err(CUresult res) { 11 | switch (res) { 12 | case CUDA_SUCCESS: 13 | return NULL; 14 | case CUDA_ERROR_INVALID_VALUE: 15 | return "CUDA_ERROR_INVALID_VALUE"; 16 | case CUDA_ERROR_OUT_OF_MEMORY: 17 | return "CUDA_ERROR_OUT_OF_MEMORY"; 18 | case CUDA_ERROR_NOT_INITIALIZED: 19 | return "CUDA_ERROR_NOT_INITIALIZED"; 20 | case CUDA_ERROR_DEINITIALIZED: 21 | return "CUDA_ERROR_DEINITIALIZED"; 22 | case CUDA_ERROR_NO_DEVICE: 23 | return "CUDA_ERROR_NO_DEVICE"; 24 | case CUDA_ERROR_INVALID_DEVICE: 25 | return "CUDA_ERROR_INVALID_DEVICE"; 26 | case CUDA_ERROR_INVALID_IMAGE: 27 | return "CUDA_ERROR_INVALID_IMAGE"; 28 | case CUDA_ERROR_INVALID_CONTEXT: 29 | return "CUDA_ERROR_INVALID_CONTEXT"; 30 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: 31 | return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; 32 | case CUDA_ERROR_MAP_FAILED: 33 | return "CUDA_ERROR_MAP_FAILED"; 34 | case CUDA_ERROR_UNMAP_FAILED: 35 | return "CUDA_ERROR_UNMAP_FAILED"; 36 | case CUDA_ERROR_ARRAY_IS_MAPPED: 37 | return "CUDA_ERROR_ARRAY_IS_MAPPED"; 38 | case CUDA_ERROR_ALREADY_MAPPED: 39 | return "CUDA_ERROR_ALREADY_MAPPED"; 40 | case CUDA_ERROR_NO_BINARY_FOR_GPU: 41 | return "CUDA_ERROR_NO_BINARY_FOR_GPU"; 42 | case CUDA_ERROR_ALREADY_ACQUIRED: 43 | return "CUDA_ERROR_ALREADY_ACQUIRED"; 44 | case CUDA_ERROR_NOT_MAPPED: 45 | return "CUDA_ERROR_NOT_MAPPED"; 46 | case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: 47 | return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; 48 | case CUDA_ERROR_NOT_MAPPED_AS_POINTER: 49 | return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; 50 | case CUDA_ERROR_ECC_UNCORRECTABLE: 51 | return "CUDA_ERROR_ECC_UNCORRECTABLE"; 52 | case CUDA_ERROR_UNSUPPORTED_LIMIT: 53 | return "CUDA_ERROR_UNSUPPORTED_LIMIT"; 54 | case CUDA_ERROR_INVALID_SOURCE: 55 | return "CUDA_ERROR_INVALID_SOURCE"; 56 | case CUDA_ERROR_FILE_NOT_FOUND: 57 | return "CUDA_ERROR_FILE_NOT_FOUND"; 58 | case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: 59 | return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; 60 | case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: 61 | return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; 62 | case CUDA_ERROR_OPERATING_SYSTEM: 63 | return "CUDA_ERROR_OPERATING_SYSTEM"; 64 | case CUDA_ERROR_INVALID_HANDLE: 65 | return "CUDA_ERROR_INVALID_HANDLE"; 66 | case CUDA_ERROR_NOT_FOUND: 67 | return "CUDA_ERROR_NOT_FOUND"; 68 | case CUDA_ERROR_NOT_READY: 69 | return "CUDA_ERROR_NOT_READY"; 70 | case CUDA_ERROR_LAUNCH_FAILED: 71 | return "CUDA_ERROR_LAUNCH_FAILED"; 72 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: 73 | return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; 74 | case CUDA_ERROR_LAUNCH_TIMEOUT: 75 | return "CUDA_ERROR_LAUNCH_TIMEOUT"; 76 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: 77 | return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; 78 | default: 79 | return "CUDA_ERROR_UNKNOWN"; 80 | } 81 | } 82 | */ 83 | import "C" 84 | 85 | // Error is a CUDA-related error. 86 | type Error struct { 87 | // Context is typically a C function name. 88 | Context string 89 | 90 | // Name is the C constant name for the error, 91 | // such as "CURAND_STATUS_INTERNAL_ERROR". 92 | Name string 93 | 94 | // Message is the main error message. 95 | // 96 | // This may be human-readable, although it may often be 97 | // the same as Name. 98 | Message string 99 | } 100 | 101 | // newErrorDriver creates an Error from the result of a 102 | // CUDA driver API call. 103 | // 104 | // If e is CUDA_SUCCESS, nil is returned. 
105 | func newErrorDriver(context string, e C.CUresult) error { 106 | return newErrorCStr(context, C.go_cuda_cu_err(e)) 107 | } 108 | 109 | // newErrorRuntime creates an Error from the result of a 110 | // CUDA runtime API call. 111 | // 112 | // If e is cudaSuccess, nil is returned. 113 | func newErrorRuntime(context string, e C.cudaError_t) error { 114 | if e == C.cudaSuccess { 115 | return nil 116 | } 117 | return newErrorCStr(context, C.cudaGetErrorString(e)) 118 | } 119 | 120 | func newErrorCStr(context string, cstr *C.char) error { 121 | if cstr == C.nullMessage { 122 | return nil 123 | } 124 | name := C.GoString(cstr) 125 | return &Error{ 126 | Context: context, 127 | Name: name, 128 | Message: name, 129 | } 130 | } 131 | 132 | // Error generates a message "context: message". 133 | func (e *Error) Error() string { 134 | return e.Context + ": " + e.Message 135 | } 136 | -------------------------------------------------------------------------------- /curand/curand.go: -------------------------------------------------------------------------------- 1 | // Package curand binds the CUDA cuRAND API to Go. 2 | package curand 3 | 4 | /* 5 | #include <curand.h> 6 | 7 | curandRngType_t go_curand_rng_type(int idx) { 8 | curandRngType_t options[] = { 9 | CURAND_RNG_PSEUDO_DEFAULT, 10 | CURAND_RNG_PSEUDO_XORWOW, 11 | CURAND_RNG_PSEUDO_MRG32K3A, 12 | CURAND_RNG_PSEUDO_MTGP32, 13 | CURAND_RNG_PSEUDO_MT19937, 14 | CURAND_RNG_PSEUDO_PHILOX4_32_10, 15 | CURAND_RNG_QUASI_DEFAULT, 16 | CURAND_RNG_QUASI_SOBOL32, 17 | CURAND_RNG_QUASI_SCRAMBLED_SOBOL32, 18 | CURAND_RNG_QUASI_SOBOL64, 19 | CURAND_RNG_QUASI_SCRAMBLED_SOBOL64, 20 | }; 21 | return options[idx]; 22 | } 23 | */ 24 | import "C" 25 | import ( 26 | "runtime" 27 | "unsafe" 28 | 29 | "github.com/unixpickle/cuda" 30 | ) 31 | 32 | type Type int 33 | 34 | // The available generator types from the cuRAND API. 35 | const ( 36 | PseudoDefault Type = iota 37 | PseudoXORWOW 38 | PseudoMRG32K3A 39 | PseudoMTGP32 40 | PseudoMT19937 41 | PseudoPHILOX43210 42 | QuasiDefault 43 | QuasiSobol32 44 | QuasiScrambledSobol32 45 | QuasiSobol64 46 | QuasiScrambledSobol64 47 | ) 48 | 49 | // A Generator generates random numbers. 50 | type Generator struct { 51 | ctx *cuda.Context 52 | gen C.curandGenerator_t 53 | } 54 | 55 | // NewGenerator creates a Generator for the given type. 56 | // 57 | // This must be called inside the cuda.Context. 58 | func NewGenerator(c *cuda.Context, t Type) (*Generator, error) { 59 | if t > QuasiScrambledSobol64 || t < 0 { 60 | panic("type out of bounds") 61 | } 62 | realType := C.go_curand_rng_type(C.int(t)) 63 | res := &Generator{ctx: c} 64 | code := C.curandCreateGenerator(&res.gen, realType) 65 | if err := newError("curandCreateGenerator", code); err != nil { 66 | return nil, err 67 | } 68 | runtime.SetFinalizer(res, func(g *Generator) { 69 | go g.ctx.Run(func() error { 70 | C.curandDestroyGenerator(g.gen) 71 | return nil 72 | }) 73 | }) 74 | return res, nil 75 | } 76 | 77 | // Seed sets the seed for a pseudo-random generator. 78 | func (g *Generator) Seed(seed int64) error { 79 | status := C.curandSetPseudoRandomGeneratorSeed(g.gen, C.ulonglong(seed)) 80 | return newError("curandSetPseudoRandomGeneratorSeed", status) 81 | } 82 | 83 | // GenerateSeeds initializes the generator. 84 | // 85 | // Generally, you will not need to call GenerateSeeds 86 | // yourself. 87 | // This is because other functions (e.g. Uniform) perform 88 | // the initialization automatically if needed. 89 | // 90 | // This must be called inside a cuda.Context. 
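//
// A minimal end-to-end sketch (ctx and alloc are assumed to be an
// existing *cuda.Context and cuda.Allocator; the sizes are
// illustrative):
//
//     gen, err := curand.NewGenerator(ctx, curand.PseudoDefault)
//     if err != nil {
//         return err
//     }
//     if err := gen.Seed(1337); err != nil {
//         return err
//     }
//     buf, err := cuda.AllocBuffer(alloc, 4*1024) // room for 1024 float32s
//     if err != nil {
//         return err
//     }
//     if err := gen.Uniform(buf); err != nil {
//         return err
//     }
//     out := make([]float32, 1024)
//     return cuda.ReadBuffer(out, buf)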
91 | func (g *Generator) GenerateSeeds() error { 92 | return newError("curandGenerateSeeds", C.curandGenerateSeeds(g.gen)) 93 | } 94 | 95 | // Uniform generates uniformly-distributed 32-bit floats 96 | // and saves them to the buffer. 97 | // 98 | // This must be called inside a cuda.Context. 99 | func (g *Generator) Uniform(buf cuda.Buffer) error { 100 | var res error 101 | buf.WithPtr(func(ptr unsafe.Pointer) { 102 | status := C.curandGenerateUniform(g.gen, (*C.float)(ptr), 103 | C.size_t(buf.Size()/4)) 104 | res = newError("curandGenerateUniform", status) 105 | }) 106 | return res 107 | } 108 | 109 | // UniformDouble is like Uniform, but for 64-bit floats. 110 | // 111 | // This must be called inside a cuda.Context. 112 | func (g *Generator) UniformDouble(buf cuda.Buffer) error { 113 | var res error 114 | buf.WithPtr(func(ptr unsafe.Pointer) { 115 | status := C.curandGenerateUniformDouble(g.gen, (*C.double)(ptr), 116 | C.size_t(buf.Size()/8)) 117 | res = newError("curandGenerateUniformDouble", status) 118 | }) 119 | return res 120 | } 121 | 122 | // Normal generates normally distributed floats. 123 | // 124 | // cuRAND may require that the number of floats is 125 | // divisible by 2. 126 | // 127 | // This must be called inside a cuda.Context. 128 | func (g *Generator) Normal(buf cuda.Buffer, mean, stddev float32) error { 129 | var res error 130 | buf.WithPtr(func(ptr unsafe.Pointer) { 131 | status := C.curandGenerateNormal(g.gen, (*C.float)(ptr), 132 | C.size_t(buf.Size()/4), C.float(mean), C.float(stddev)) 133 | res = newError("curandGenerateNormal", status) 134 | }) 135 | return res 136 | } 137 | 138 | // NormalDouble generates normally distributed doubles. 139 | // 140 | // This must be called inside a cuda.Context. 141 | func (g *Generator) NormalDouble(buf cuda.Buffer, mean, stddev float64) error { 142 | var res error 143 | buf.WithPtr(func(ptr unsafe.Pointer) { 144 | status := C.curandGenerateNormalDouble(g.gen, (*C.double)(ptr), 145 | C.size_t(buf.Size()/8), C.double(mean), C.double(stddev)) 146 | res = newError("curandGenerateNormalDouble", status) 147 | }) 148 | return res 149 | } 150 | -------------------------------------------------------------------------------- /allocator.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | #include 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "os" 11 | "runtime" 12 | "strconv" 13 | "unsafe" 14 | ) 15 | 16 | const minGCThresh = 1 << 15 17 | 18 | // An Allocator allocates and frees CUDA memory. 19 | // 20 | // In general, Allocators are bound to a Context, meaning 21 | // that they should only be used from within that Context. 22 | // 23 | // Usually, you should prefer to use the Buffer type over 24 | // a direct memory allocation, since Buffers take care of 25 | // garbage collection for you. 26 | // 27 | // Allocators are not responsible for zeroing out returned 28 | // memory. 29 | type Allocator interface { 30 | // Get the Context in which all calls to this Allocator 31 | // should be made. 32 | // 33 | // Unlike Alloc and Free, this needn't be called from the 34 | // allocator's Context. 35 | Context() *Context 36 | 37 | // Allocate a chunk of CUDA memory. 38 | // 39 | // This should only be called from the Context. 40 | Alloc(size uintptr) (unsafe.Pointer, error) 41 | 42 | // Free a chunk of CUDA memory. 43 | // 44 | // The size passed to Free must be the same size that was 45 | // passed to Alloc(). 
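// Requiring the size lets implementations avoid tracking
// per-pointer sizes themselves. As a sketch, a hypothetical
// wrapper that counts bytes in use relies on this:
//
//     type countingAllocator struct {
//         Allocator
//         inUse uintptr
//     }
//
//     func (c *countingAllocator) Alloc(size uintptr) (unsafe.Pointer, error) {
//         ptr, err := c.Allocator.Alloc(size)
//         if err == nil {
//             c.inUse += size
//         }
//         return ptr, err
//     }
//
//     func (c *countingAllocator) Free(ptr unsafe.Pointer, size uintptr) {
//         c.Allocator.Free(ptr, size)
//         c.inUse -= size
//     }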
46 | // 47 | // This should only be called from the Context. 48 | Free(ptr unsafe.Pointer, size uintptr) 49 | } 50 | 51 | // MemInfo gets the free and total amount of memory 52 | // available for allocation on the current device. 53 | // 54 | // This must be called in a Context. 55 | func MemInfo() (free, total uint64, err error) { 56 | var cFree, cTotal C.size_t 57 | err = newErrorRuntime("cudaMemGetInfo", C.cudaMemGetInfo(&cFree, &cTotal)) 58 | free, total = uint64(cFree), uint64(cTotal) 59 | return 60 | } 61 | 62 | // A nativeAllocator allocates directly using CUDA. 63 | type nativeAllocator struct { 64 | ctx *Context 65 | } 66 | 67 | // NativeAllocator returns an Allocator that allocates 68 | // directly from the CUDA APIs. 69 | // 70 | // The resulting Allocator should be wrapped with 71 | // GCAllocator if you plan to use it with the Buffer API. 72 | // 73 | // This need not be called in a Context. 74 | func NativeAllocator(ctx *Context) Allocator { 75 | return &nativeAllocator{ctx: ctx} 76 | } 77 | 78 | func (n *nativeAllocator) Context() *Context { 79 | return n.ctx 80 | } 81 | 82 | func (n *nativeAllocator) Alloc(size uintptr) (unsafe.Pointer, error) { 83 | var ptr unsafe.Pointer 84 | return ptr, newErrorRuntime("cudaMalloc", C.cudaMalloc(&ptr, C.size_t(size))) 85 | } 86 | 87 | func (n *nativeAllocator) Free(ptr unsafe.Pointer, size uintptr) { 88 | C.cudaFree(ptr) 89 | } 90 | 91 | type gcAllocator struct { 92 | Allocator 93 | 94 | inUse uintptr 95 | thresh uintptr 96 | ratio float64 97 | } 98 | 99 | // GCAllocator wraps an Allocator in a new Allocator which 100 | // automatically triggers garbage collections. 101 | // 102 | // The frac argument behaves similarly to the GOGC 103 | // environment variable, except that GOGC is a percentage 104 | // whereas frac is a ratio. 105 | // Thus, a frac of 1.0 is equivalent to GOGC=100. 106 | // If frac is 0, the value for GOGC is used. 107 | // 108 | // If you are implementing your own Allocator, you will 109 | // likely want to wrap it with GCAllocator so that it 110 | // works nicely with the Buffer API. 111 | // 112 | // This need not be called in a Context. 113 | func GCAllocator(a Allocator, frac float64) Allocator { 114 | if frac == 0 { 115 | frac = 1 116 | if gogc := os.Getenv("GOGC"); gogc != "" { 117 | val, err := strconv.ParseFloat(gogc, 64) 118 | if err == nil { 119 | frac = val / 100 120 | } 121 | } 122 | } 123 | if frac <= 0 { 124 | panic("invalid frac argument") 125 | } 126 | 127 | return &gcAllocator{ 128 | Allocator: a, 129 | inUse: 0, 130 | thresh: minGCThresh, 131 | ratio: frac + 1, 132 | } 133 | } 134 | 135 | func (g *gcAllocator) Alloc(size uintptr) (unsafe.Pointer, error) { 136 | res, err := g.Allocator.Alloc(size) 137 | if err != nil { 138 | return res, err 139 | } 140 | g.inUse += size 141 | if g.inUse > g.thresh { 142 | g.thresh = g.updatedThresh() 143 | runtime.GC() 144 | } 145 | return res, nil 146 | } 147 | 148 | func (g *gcAllocator) Free(ptr unsafe.Pointer, size uintptr) { 149 | g.Allocator.Free(ptr, size) 150 | if size > g.inUse { 151 | panic("more memory was freed than allocated") 152 | } 153 | g.inUse -= size 154 | t := g.updatedThresh() 155 | if t < g.thresh { 156 | g.thresh = t 157 | } 158 | } 159 | 160 | func (g *gcAllocator) updatedThresh() uintptr { 161 | newVal := float64(g.inUse) * g.ratio 162 | 163 | // Only matters on 32-bit systems. 
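// There, float64(g.inUse) * g.ratio can exceed the largest
// uintptr, so the result is clamped to the maximum value below.
// As a worked example of the threshold math: with frac = 1
// (so ratio = 2) and 48 MiB in use, the next collection
// triggers once usage exceeds 96 MiB.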
164 | if newVal > float64(^uintptr(0)) { 165 | return ^uintptr(0) 166 | } 167 | 168 | res := uintptr(newVal) 169 | if res > minGCThresh { 170 | return res 171 | } else { 172 | return minGCThresh 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /module.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | #include 6 | #include 7 | 8 | const size_t ptrSize = sizeof(void *); 9 | const size_t maxArgSize = 8; 10 | const CUjit_option * nullJitOptions = NULL; 11 | const void ** nullPtrPtr = NULL; 12 | */ 13 | import "C" 14 | import ( 15 | "runtime" 16 | "unsafe" 17 | ) 18 | 19 | // A Module manages a set of compiled kernels. 20 | type Module struct { 21 | module C.CUmodule 22 | cache map[string]C.CUfunction 23 | ctx *Context 24 | } 25 | 26 | // NewModule creates a Module by compiling a chunk of PTX 27 | // code. 28 | // 29 | // This should be called from within the Context. 30 | // 31 | // You can build PTX code using the nvcc compiler like so: 32 | // 33 | // nvcc --gpu-architecture=compute_30 --gpu-code=compute_30 --ptx kernels.cu 34 | // 35 | // In the above example, you build "kernels.cu" to a PTX 36 | // file called "kernels.ptx". 37 | // 38 | // The word size of the PTX should match the word size of 39 | // the Go program. 40 | // Depending on your use case, you may want to compile 41 | // separate PTX files for 32-bit and 64-bit hosts. 42 | func NewModule(ctx *Context, ptx string) (*Module, error) { 43 | cstr := unsafe.Pointer(C.CString(ptx)) 44 | defer C.free(cstr) 45 | 46 | var module C.CUmodule 47 | res := C.cuModuleLoadDataEx(&module, cstr, 0, C.nullJitOptions, C.nullPtrPtr) 48 | if err := newErrorDriver("cuModuleLoadDataEx", res); err != nil { 49 | return nil, err 50 | } 51 | 52 | m := &Module{module: module, cache: map[string]C.CUfunction{}, ctx: ctx} 53 | runtime.SetFinalizer(m, func(obj *Module) { 54 | go obj.ctx.Run(func() error { 55 | C.cuModuleUnload(obj.module) 56 | return nil 57 | }) 58 | }) 59 | 60 | return m, nil 61 | } 62 | 63 | // Launch launches a kernel (which is referenced by name). 64 | // 65 | // This should be called from within the same Context that 66 | // NewModule was called from. 67 | // 68 | // Currently, the following types may be used as kernel 69 | // arguments: 70 | // 71 | // uint 72 | // int 73 | // float32 74 | // float64 75 | // unsafe.Pointer 76 | // Buffer 77 | // 78 | // To wait for the launched kernel to complete, use 79 | // Synchronize() or stream.Synchronize() if you specified 80 | // a non-nil stream. 
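//
// A minimal launch sketch (the kernel name and sizes are
// illustrative; mod is a *Module from NewModule and buf a Buffer):
//
//     n := 4096
//     block := uint(128)
//     grid := (uint(n) + block - 1) / block
//     err := mod.Launch("scale_kernel", grid, 1, 1, block, 1, 1, 0, nil,
//         n, float32(2), buf)
//     if err != nil {
//         return err
//     }
//     return Synchronize()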
81 | func (m *Module) Launch(kernel string, gridX, gridY, gridZ, blockX, blockY, blockZ, 82 | sharedMem uint, stream *Stream, args ...interface{}) error { 83 | res := cleanKernelArguments(args, nil, func(rawArgs []unsafe.Pointer) error { 84 | f, err := m.lookupKernel(kernel) 85 | if err != nil { 86 | return err 87 | } 88 | res := C.cuLaunchKernel(f, safeUintToC(gridX), safeUintToC(gridY), 89 | safeUintToC(gridZ), safeUintToC(blockX), safeUintToC(blockY), 90 | safeUintToC(blockZ), safeUintToC(sharedMem), stream.cuStream(), 91 | &rawArgs[0], C.nullPtrPtr) 92 | return newErrorDriver("cuLaunchKernel", res) 93 | }) 94 | runtime.KeepAlive(m) 95 | return res 96 | } 97 | 98 | func (m *Module) lookupKernel(name string) (C.CUfunction, error) { 99 | if f, ok := m.cache[name]; ok { 100 | return f, nil 101 | } 102 | cName := C.CString(name) 103 | defer C.free(unsafe.Pointer(cName)) 104 | var kernel C.CUfunction 105 | cuRes := C.cuModuleGetFunction(&kernel, m.module, cName) 106 | if err := newErrorDriver("cuModuleGetFunction", cuRes); err != nil { 107 | return kernel, err 108 | } 109 | m.cache[name] = kernel 110 | runtime.KeepAlive(m) 111 | return kernel, nil 112 | } 113 | 114 | func cleanKernelArguments(args []interface{}, newArgs []unsafe.Pointer, 115 | f func(args []unsafe.Pointer) error) error { 116 | if len(args) == 0 { 117 | return f(newArgs) 118 | } 119 | 120 | if buf, ok := args[0].(Buffer); ok { 121 | var res error 122 | buf.WithPtr(func(ptr unsafe.Pointer) { 123 | tempArgs := append([]interface{}{ptr}, args[1:]...) 124 | res = cleanKernelArguments(tempArgs, newArgs, f) 125 | }) 126 | return res 127 | } 128 | 129 | valPtr := unsafe.Pointer(C.malloc(C.maxArgSize)) 130 | defer C.free(valPtr) 131 | 132 | switch x := args[0].(type) { 133 | case uint: 134 | val := safeUintToC(x) 135 | C.memcpy(valPtr, unsafe.Pointer(&val), 4) 136 | case int: 137 | val := safeIntToC(x) 138 | C.memcpy(valPtr, unsafe.Pointer(&val), 4) 139 | case float32: 140 | val := C.float(x) 141 | C.memcpy(valPtr, unsafe.Pointer(&val), 4) 142 | case float64: 143 | val := C.double(x) 144 | C.memcpy(valPtr, unsafe.Pointer(&val), 8) 145 | case unsafe.Pointer: 146 | C.memcpy(valPtr, unsafe.Pointer(&x), C.ptrSize) 147 | } 148 | 149 | return cleanKernelArguments(args[1:], append(newArgs, valPtr), f) 150 | } 151 | 152 | func safeUintToC(x uint) C.uint { 153 | if x > uint(^C.uint(0)) { 154 | panic("uint value out of bounds") 155 | } 156 | return C.uint(x) 157 | } 158 | 159 | func safeIntToC(x int) C.int { 160 | if x > int(C.int(^C.uint(0)/2)) { 161 | panic("int value out of bounds") 162 | } else if x < int((-C.int(^C.uint(0)/2))-1) { 163 | panic("int value out of bounds") 164 | } 165 | return C.int(x) 166 | } 167 | -------------------------------------------------------------------------------- /cublas/level3.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | /* 4 | #include 5 | 6 | const cublasOperation_t goCublasOpN = CUBLAS_OP_N; 7 | const cublasOperation_t goCublasOpT = CUBLAS_OP_T; 8 | const cublasOperation_t goCublasOpC = CUBLAS_OP_C; 9 | */ 10 | import "C" 11 | 12 | import ( 13 | "unsafe" 14 | 15 | "github.com/unixpickle/cuda" 16 | "github.com/unixpickle/essentials" 17 | ) 18 | 19 | // Operation specifies a matrix operation. 
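//
// In BLAS terms, NoTrans uses op(A) = A, Trans uses op(A) = A^T,
// and ConjTrans uses op(A) = A^H (the conjugate transpose).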
20 | type Operation int 21 | 22 | const ( 23 | NoTrans Operation = iota 24 | Trans 25 | ConjTrans 26 | ) 27 | 28 | func (o Operation) cValue() C.cublasOperation_t { 29 | switch o { 30 | case NoTrans: 31 | return C.goCublasOpN 32 | case Trans: 33 | return C.goCublasOpT 34 | case ConjTrans: 35 | return C.goCublasOpC 36 | default: 37 | panic("invalid Operation") 38 | } 39 | } 40 | 41 | // Sgemm performs single-precision matrix multiplication. 42 | // 43 | // Matrices are stored in column-major order. 44 | // 45 | // The leading dimensions lda, ldb, and ldc may not be 0. 46 | // 47 | // The type of alpha and beta depends on the pointer mode. 48 | // In Host mode, use float32 or *float32. 49 | // In Device mode, use cuda.Buffer. 50 | // 51 | // This must be called inside the cuda.Context. 52 | func (h *Handle) Sgemm(transA, transB Operation, m, n, k int, alpha interface{}, 53 | matA cuda.Buffer, lda int, matB cuda.Buffer, ldb int, beta interface{}, 54 | matC cuda.Buffer, ldc int) error { 55 | checkGemm(transA, transB, m, n, k, 56 | matA.Size()/4, lda, 57 | matB.Size()/4, ldb, 58 | matC.Size()/4, ldc) 59 | 60 | var res C.cublasStatus_t 61 | matA.WithPtr(func(aPtr unsafe.Pointer) { 62 | matB.WithPtr(func(bPtr unsafe.Pointer) { 63 | matC.WithPtr(func(cPtr unsafe.Pointer) { 64 | if h.PointerMode() == Host { 65 | pointerizeInputs(&alpha, &beta) 66 | res = C.cublasSgemm(h.handle, 67 | transA.cValue(), transB.cValue(), 68 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 69 | (*C.float)(alpha.(*float32)), 70 | (*C.float)(aPtr), safeIntToC(lda), 71 | (*C.float)(bPtr), safeIntToC(ldb), 72 | (*C.float)(beta.(*float32)), 73 | (*C.float)(cPtr), safeIntToC(ldc)) 74 | } else { 75 | alphaBeta32(alpha, beta, func(alpha, beta *C.float) { 76 | res = C.cublasSgemm(h.handle, 77 | transA.cValue(), transB.cValue(), 78 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 79 | alpha, 80 | (*C.float)(aPtr), safeIntToC(lda), 81 | (*C.float)(bPtr), safeIntToC(ldb), 82 | beta, 83 | (*C.float)(cPtr), safeIntToC(ldc)) 84 | }) 85 | } 86 | }) 87 | }) 88 | }) 89 | 90 | return newError("cublasSgemm", res) 91 | } 92 | 93 | // Dgemm is like Sgemm, but for double-precision. 94 | // 95 | // The type of alpha and beta depends on the pointer mode. 96 | // In Host mode, use float64 or *float64. 97 | // In Device mode, use cuda.Buffer. 
98 | // 99 | // This must be called inside the cuda.Context. 100 | func (h *Handle) Dgemm(transA, transB Operation, m, n, k int, alpha interface{}, 101 | matA cuda.Buffer, lda int, matB cuda.Buffer, ldb int, beta interface{}, 102 | matC cuda.Buffer, ldc int) error { 103 | checkGemm(transA, transB, m, n, k, 104 | matA.Size()/8, lda, 105 | matB.Size()/8, ldb, 106 | matC.Size()/8, ldc) 107 | 108 | var res C.cublasStatus_t 109 | matA.WithPtr(func(aPtr unsafe.Pointer) { 110 | matB.WithPtr(func(bPtr unsafe.Pointer) { 111 | matC.WithPtr(func(cPtr unsafe.Pointer) { 112 | if h.PointerMode() == Host { 113 | pointerizeInputs(&alpha, &beta) 114 | res = C.cublasDgemm(h.handle, 115 | transA.cValue(), transB.cValue(), 116 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 117 | (*C.double)(alpha.(*float64)), 118 | (*C.double)(aPtr), safeIntToC(lda), 119 | (*C.double)(bPtr), safeIntToC(ldb), 120 | (*C.double)(beta.(*float64)), 121 | (*C.double)(cPtr), safeIntToC(ldc)) 122 | } else { 123 | alphaBeta64(alpha, beta, func(alpha, beta *C.double) { 124 | res = C.cublasDgemm(h.handle, 125 | transA.cValue(), transB.cValue(), 126 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 127 | alpha, 128 | (*C.double)(aPtr), safeIntToC(lda), 129 | (*C.double)(bPtr), safeIntToC(ldb), 130 | beta, 131 | (*C.double)(cPtr), safeIntToC(ldc)) 132 | }) 133 | } 134 | }) 135 | }) 136 | }) 137 | 138 | return newError("cublasDgemm", res) 139 | } 140 | 141 | func alphaBeta32(alpha, beta interface{}, f func(alpha, beta *C.float)) { 142 | b1 := alpha.(cuda.Buffer) 143 | b2 := beta.(cuda.Buffer) 144 | if b1.Size() < 4 || b2.Size() < 4 { 145 | panic("buffer underflow") 146 | } 147 | b1.WithPtr(func(ptr1 unsafe.Pointer) { 148 | b2.WithPtr(func(ptr2 unsafe.Pointer) { 149 | f((*C.float)(ptr1), (*C.float)(ptr2)) 150 | }) 151 | }) 152 | } 153 | 154 | func alphaBeta64(alpha, beta interface{}, f func(alpha, beta *C.double)) { 155 | b1 := alpha.(cuda.Buffer) 156 | b2 := beta.(cuda.Buffer) 157 | if b1.Size() < 8 || b2.Size() < 8 { 158 | panic("buffer underflow") 159 | } 160 | b1.WithPtr(func(ptr1 unsafe.Pointer) { 161 | b2.WithPtr(func(ptr2 unsafe.Pointer) { 162 | f((*C.double)(ptr1), (*C.double)(ptr2)) 163 | }) 164 | }) 165 | } 166 | 167 | func checkGemm(transA, transB Operation, m, n, k int, A uintptr, lda int, B uintptr, 168 | ldb int, C uintptr, ldc int) { 169 | checkMatrix(transA, lda, m, k, A) 170 | checkMatrix(transB, ldb, k, n, B) 171 | checkMatrix(NoTrans, ldc, m, n, C) 172 | } 173 | 174 | // checkMatrix ensures that op(A) fits in size elements, 175 | // given that op(A) is a-by-b and has leading dimension 176 | // lda. 
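//
// For example, with op == NoTrans, a = 3, b = 2, and lda = 4, the
// check passes only if size/4 >= 2, i.e. the buffer must hold at
// least the lda*b = 8 elements that column-major storage requires.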
177 | func checkMatrix(op Operation, lda, a, b int, size uintptr) { 178 | if a < 0 || b < 0 { 179 | panic("negative matrix dimension") 180 | } 181 | if op == NoTrans { 182 | if lda < essentials.MaxInt(1, a) { 183 | panic("leading dimension out of bounds") 184 | } 185 | if size/uintptr(lda) < uintptr(b) { 186 | panic("index out of bounds") 187 | } 188 | } else { 189 | if lda < essentials.MaxInt(1, b) { 190 | panic("leading dimension out of bounds") 191 | } 192 | if size/uintptr(lda) < uintptr(a) { 193 | panic("index out of bounds") 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /buffer.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include <cuda.h> 5 | #include <cuda_runtime.h> 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | "runtime" 12 | "unsafe" 13 | ) 14 | 15 | // A Buffer provides a high-level interface into an 16 | // underlying CUDA buffer. 17 | type Buffer interface { 18 | // Allocator is the Allocator from which the Buffer was 19 | // allocated. 20 | Allocator() Allocator 21 | 22 | // Size is the size of the Buffer. 23 | Size() uintptr 24 | 25 | // WithPtr runs f with the pointer contained inside the 26 | // Buffer. 27 | // During the call to f, it is guaranteed that the Buffer 28 | // will not be garbage collected. 29 | // However, nothing should store a reference to ptr after 30 | // f has completed. 31 | WithPtr(f func(ptr unsafe.Pointer)) 32 | } 33 | 34 | type buffer struct { 35 | alloc Allocator 36 | size uintptr 37 | ptr unsafe.Pointer 38 | } 39 | 40 | // AllocBuffer allocates a new Buffer. 41 | // 42 | // This must be called in the Allocator's Context. 43 | // 44 | // This does not zero out the returned memory. 45 | // To do that, you should use ClearBuffer(). 46 | func AllocBuffer(a Allocator, size uintptr) (Buffer, error) { 47 | ptr, err := a.Alloc(size) 48 | if err != nil { 49 | return nil, err 50 | } 51 | return WrapPointer(a, ptr, size), nil 52 | } 53 | 54 | // WrapPointer wraps a pointer in a Buffer. 55 | // You must specify the Allocator from which the pointer 56 | // originated and the size of the buffer. 57 | // 58 | // After calling this, you should not use the pointer 59 | // outside of the buffer. 60 | // The Buffer will automatically free the pointer. 61 | func WrapPointer(a Allocator, ptr unsafe.Pointer, size uintptr) Buffer { 62 | res := &buffer{alloc: a, size: size, ptr: ptr} 63 | runtime.SetFinalizer(res, func(obj *buffer) { 64 | allocator := obj.alloc 65 | go allocator.Context().Run(func() error { 66 | allocator.Free(obj.ptr, obj.size) 67 | return nil 68 | }) 69 | }) 70 | return res 71 | } 72 | 73 | func (b *buffer) Allocator() Allocator { 74 | return b.alloc 75 | } 76 | 77 | func (b *buffer) Size() uintptr { 78 | return b.size 79 | } 80 | 81 | func (b *buffer) WithPtr(f func(p unsafe.Pointer)) { 82 | f(b.ptr) 83 | runtime.KeepAlive(b) 84 | } 85 | 86 | type slice struct { 87 | Buffer 88 | off uintptr 89 | size uintptr 90 | } 91 | 92 | // Slice creates a Buffer which views some part of the 93 | // contents of another Buffer. 94 | // The start and end indexes are inclusive and exclusive, 95 | // respectively. 
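//
// For example, viewing the middle four bytes of an 8-byte buffer:
//
//     sub := Slice(buf, 2, 6)
//     // sub.Size() == 4, and sub shares memory with buf.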
96 | func Slice(b Buffer, start, end uintptr) Buffer { 97 | if start > end || start > b.Size() || end > b.Size() { 98 | panic("index out of bounds") 99 | } 100 | return &slice{ 101 | Buffer: b, 102 | off: start, 103 | size: end - start, 104 | } 105 | } 106 | 107 | func (s *slice) Size() uintptr { 108 | return s.size 109 | } 110 | 111 | func (s *slice) WithPtr(f func(p unsafe.Pointer)) { 112 | s.Buffer.WithPtr(func(p unsafe.Pointer) { 113 | f(unsafe.Pointer(uintptr(p) + s.off)) 114 | }) 115 | } 116 | 117 | // Overlap checks if two buffers overlap in memory. 118 | func Overlap(b1, b2 Buffer) bool { 119 | var overlap bool 120 | b1.WithPtr(func(ptr1 unsafe.Pointer) { 121 | b2.WithPtr(func(ptr2 unsafe.Pointer) { 122 | overlap = uintptr(ptr1) < uintptr(ptr2)+uintptr(b2.Size()) && 123 | uintptr(ptr2) < uintptr(ptr1)+uintptr(b1.Size()) 124 | }) 125 | }) 126 | return overlap 127 | } 128 | 129 | // ClearBuffer writes zeros over the contents of a Buffer. 130 | // It must be called from the correct Context. 131 | func ClearBuffer(b Buffer) error { 132 | var res C.cudaError_t 133 | b.WithPtr(func(ptr unsafe.Pointer) { 134 | res = C.cudaMemset(ptr, 0, C.size_t(b.Size())) 135 | }) 136 | return newErrorRuntime("cudaMemset", res) 137 | } 138 | 139 | // WriteBuffer writes the data from a slice into a Buffer. 140 | // It must be called from the correct Context. 141 | // 142 | // Supported slice types are: 143 | // 144 | // []byte 145 | // []float64 146 | // []float32 147 | // []int32 148 | // []uint32 149 | // 150 | // Similar to the copy() built-in, the maximum possible 151 | // amount of data will be copied. 152 | func WriteBuffer(b Buffer, val interface{}) error { 153 | size := bytesForSlice(val) 154 | if size > b.Size() { 155 | size = b.Size() 156 | } 157 | if size == 0 { 158 | return nil 159 | } 160 | 161 | var res C.cudaError_t 162 | b.WithPtr(func(ptr unsafe.Pointer) { 163 | switch val := val.(type) { 164 | case []byte: 165 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 166 | C.cudaMemcpyHostToDevice) 167 | case []float64: 168 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 169 | C.cudaMemcpyHostToDevice) 170 | case []float32: 171 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 172 | C.cudaMemcpyHostToDevice) 173 | case []int32: 174 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 175 | C.cudaMemcpyHostToDevice) 176 | case []uint32: 177 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 178 | C.cudaMemcpyHostToDevice) 179 | } 180 | }) 181 | 182 | return newErrorRuntime("cudaMemcpy", res) 183 | } 184 | 185 | // ReadBuffer reads the data from a Buffer into a slice. 186 | // This must be called from the correct Context. 187 | // 188 | // See WriteBuffer for details on supported slice types. 
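//
// A round-trip sketch (buf is assumed to hold at least 12 bytes and
// the calls are made inside the proper Context):
//
//     if err := WriteBuffer(buf, []float32{1, 2, 3}); err != nil {
//         return err
//     }
//     out := make([]float32, 3)
//     if err := ReadBuffer(out, buf); err != nil {
//         return err
//     }
//     // out now holds {1, 2, 3}.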
189 | func ReadBuffer(val interface{}, b Buffer) error { 190 | size := bytesForSlice(val) 191 | if size > b.Size() { 192 | size = b.Size() 193 | } 194 | if size == 0 { 195 | return nil 196 | } 197 | 198 | var res C.cudaError_t 199 | b.WithPtr(func(ptr unsafe.Pointer) { 200 | switch val := val.(type) { 201 | case []byte: 202 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 203 | C.cudaMemcpyDeviceToHost) 204 | case []float64: 205 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 206 | C.cudaMemcpyDeviceToHost) 207 | case []float32: 208 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 209 | C.cudaMemcpyDeviceToHost) 210 | case []int32: 211 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 212 | C.cudaMemcpyDeviceToHost) 213 | case []uint32: 214 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 215 | C.cudaMemcpyDeviceToHost) 216 | } 217 | }) 218 | 219 | return newErrorRuntime("cudaMemcpy", res) 220 | } 221 | 222 | // CopyBuffer copies as many bytes as possible from src 223 | // into dst. 224 | // 225 | // The two Buffers must not contain overlapping regions of 226 | // memory. 227 | func CopyBuffer(dst, src Buffer) error { 228 | size := dst.Size() 229 | if src.Size() < size { 230 | size = src.Size() 231 | } 232 | if size == 0 { 233 | return nil 234 | } 235 | 236 | var res C.cudaError_t 237 | dst.WithPtr(func(dstPtr unsafe.Pointer) { 238 | src.WithPtr(func(srcPtr unsafe.Pointer) { 239 | res = C.cudaMemcpy(dstPtr, srcPtr, C.size_t(size), 240 | C.cudaMemcpyDeviceToDevice) 241 | }) 242 | }) 243 | 244 | return newErrorRuntime("cudaMemcpy", res) 245 | } 246 | 247 | func bytesForSlice(val interface{}) uintptr { 248 | switch val := val.(type) { 249 | case []byte: 250 | return uintptr(len(val)) 251 | case []float64: 252 | return 8 * uintptr(len(val)) 253 | case []float32: 254 | return 4 * uintptr(len(val)) 255 | case []int32: 256 | return 4 * uintptr(len(val)) 257 | case []uint32: 258 | return 4 * uintptr(len(val)) 259 | default: 260 | panic(fmt.Sprintf("unsupported type: %T", val)) 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /device.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | #include 6 | 7 | int devattr_for_idx(int i, CUdevice_attribute * res) { 8 | CUdevice_attribute attrs[] = { 9 | CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 10 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 11 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, 12 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, 13 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 14 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, 15 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, 16 | CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, 17 | CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, 18 | CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, 19 | CU_DEVICE_ATTRIBUTE_WARP_SIZE, 20 | CU_DEVICE_ATTRIBUTE_MAX_PITCH, 21 | CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, 22 | CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK, 23 | CU_DEVICE_ATTRIBUTE_CLOCK_RATE, 24 | CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, 25 | CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, 26 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 27 | CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, 28 | CU_DEVICE_ATTRIBUTE_INTEGRATED, 29 | CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, 30 | CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, 31 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, 32 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, 33 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, 34 | 
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, 35 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, 36 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, 37 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, 38 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, 39 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, 40 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH, 41 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT, 42 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES, 43 | CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, 44 | CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, 45 | CU_DEVICE_ATTRIBUTE_ECC_ENABLED, 46 | CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, 47 | CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, 48 | CU_DEVICE_ATTRIBUTE_TCC_DRIVER, 49 | CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, 50 | CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, 51 | CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 52 | CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 53 | CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, 54 | CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, 55 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, 56 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, 57 | CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER, 58 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH, 59 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT, 60 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE, 61 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE, 62 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE, 63 | CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, 64 | CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, 65 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH, 66 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH, 67 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS, 68 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, 69 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, 70 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, 71 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, 72 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, 73 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, 74 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH, 75 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS, 76 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH, 77 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT, 78 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS, 79 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH, 80 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH, 81 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS, 82 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH, 83 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH, 84 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT, 85 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH, 86 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH, 87 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT, 88 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, 89 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 90 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH, 91 | CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED, 92 | CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED, 93 | CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED, 94 | CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, 95 | CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, 96 | CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, 97 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, 98 | #ifdef CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED 99 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID, 100 | 
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED, 101 | CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO, 102 | CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, 103 | CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, 104 | CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, 105 | CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM 106 | #else 107 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID 108 | #endif 109 | }; 110 | if (i < 0 || i >= sizeof(attrs)/sizeof(CUdevice_attribute)) { 111 | return 1; 112 | } 113 | *res = attrs[i]; 114 | return 0; 115 | } 116 | */ 117 | import "C" 118 | 119 | import ( 120 | "fmt" 121 | "unsafe" 122 | ) 123 | 124 | // DevAttr is a CUDA device attribute. 125 | type DevAttr int 126 | 127 | func (d DevAttr) cValue() (C.CUdevice_attribute, error) { 128 | if d < 0 || d > DevAttrCanUseHostPointerForRegisteredMem { 129 | panic("invalid DevAttr") 130 | } 131 | var res C.CUdevice_attribute 132 | status := C.devattr_for_idx(C.int(d), &res) 133 | if status == 0 { 134 | return res, nil 135 | } else { 136 | return 0, fmt.Errorf("unsupported device attribute: %d", int(d)) 137 | } 138 | } 139 | 140 | // All supported device attributes. 141 | const ( 142 | DevAttrMaxThreadsPerBlock DevAttr = iota 143 | DevAttrMaxBlockDimX 144 | DevAttrMaxBlockDimY 145 | DevAttrMaxBlockDimZ 146 | DevAttrMaxGridDimX 147 | DevAttrMaxGridDimY 148 | DevAttrMaxGridDimZ 149 | DevAttrMaxSharedMemoryPerBlock 150 | DevAttrSharedMemoryPerBlock 151 | DevAttrTotalConstantMemory 152 | DevAttrWarpSize 153 | DevAttrMaxPitch 154 | DevAttrMaxRegistersPerBlock 155 | DevAttrRegistersPerBlock 156 | DevAttrClockRate 157 | DevAttrTextureAlignment 158 | DevAttrGPUOverlap 159 | DevAttrMultiprocessorCount 160 | DevAttrKernelExecTimeout 161 | DevAttrIntegrated 162 | DevAttrCanMapHostMemory 163 | DevAttrComputeMode 164 | DevAttrMaximumTexture1DWidth 165 | DevAttrMaximumTexture2DWidth 166 | DevAttrMaximumTexture2DHeight 167 | DevAttrMaximumTexture3DWidth 168 | DevAttrMaximumTexture3DHeight 169 | DevAttrMaximumTexture3DDepth 170 | DevAttrMaximumTexture2DLayeredWidth 171 | DevAttrMaximumTexture2DLayeredHeight 172 | DevAttrMaximumTexture2DLayeredLayers 173 | DevAttrMaximumTexture2DArrayWidth 174 | DevAttrMaximumTexture2DArrayHeight 175 | DevAttrMaximumTexture2DArrayNumslices 176 | DevAttrSurfaceAlignment 177 | DevAttrConcurrentKernels 178 | DevAttrECCEnabled 179 | DevAttrPCIBusID 180 | DevAttrPCIDeviceID 181 | DevAttrTCCDriver 182 | DevAttrMemoryClockRate 183 | DevAttrGlobalMemoryBusWidth 184 | DevAttrL2CacheSize 185 | DevAttrMaxThreadsPerMultiprocessor 186 | DevAttrAsyncEngineCount 187 | DevAttrUnifiedAddressing 188 | DevAttrMaximumTexture1DLayeredWidth 189 | DevAttrMaximumTexture1DLayeredLayers 190 | DevAttrCanTex2DGather 191 | DevAttrMaximumTexture2DGatherWidth 192 | DevAttrMaximumTexture2DGatherHeight 193 | DevAttrMaximumTexture3DWidthAlternate 194 | DevAttrMaximumTexture3DHeightAlternate 195 | DevAttrMaximumTexture3DDepthAlternate 196 | DevAttrPCIDomainID 197 | DevAttrTexturePitchAlignment 198 | DevAttrMaximumTexturecubemapWidth 199 | DevAttrMaximumTexturecubemapLayeredWidth 200 | DevAttrMaximumTexturecubemapLayeredLayers 201 | DevAttrMaximumSurface1DWidth 202 | DevAttrMaximumSurface2DWidth 203 | DevAttrMaximumSurface2DHeight 204 | DevAttrMaximumSurface3DWidth 205 | DevAttrMaximumSurface3DHeight 206 | DevAttrMaximumSurface3DDepth 207 | DevAttrMaximumSurface1DLayeredWidth 208 | DevAttrMaximumSurface1DLayeredLayers 209 | DevAttrMaximumSurface2DLayeredWidth 210 | DevAttrMaximumSurface2DLayeredHeight 211 | 
DevAttrMaximumSurface2DLayeredLayers 212 | DevAttrMaximumSurfacecubemapWidth 213 | DevAttrMaximumSurfacecubemapLayeredWidth 214 | DevAttrMaximumSurfacecubemapLayeredLayers 215 | DevAttrMaximumTexture1DLinearWidth 216 | DevAttrMaximumTexture2DLinearWidth 217 | DevAttrMaximumTexture2DLinearHeight 218 | DevAttrMaximumTexture2DLinearPitch 219 | DevAttrMaximumTexture2DMipmappedWidth 220 | DevAttrMaximumTexture2DMipmappedHeight 221 | DevAttrComputeCapabilityMajor 222 | DevAttrComputeCapabilityMinor 223 | DevAttrMaximumTexture1DMipmappedWidth 224 | DevAttrStreamPrioritiesSupported 225 | DevAttrGlobalL1CacheSupported 226 | DevAttrLocalL1CacheSupported 227 | DevAttrMaxSharedMemoryPerMultiprocessor 228 | DevAttrMaxRegistersPerMultiprocessor 229 | DevAttrManagedMemory 230 | DevAttrMultiGPUBoard 231 | DevAttrMultiGPUBoardGroupID 232 | DevAttrHostNativeAtomicSupported 233 | DevAttrSingleToDoublePrecisionPerfRatio 234 | DevAttrPageableMemoryAccess 235 | DevAttrConcurrentManagedAccess 236 | DevAttrComputePreemptionSupported 237 | DevAttrCanUseHostPointerForRegisteredMem 238 | ) 239 | 240 | // Device contains a unique ID for a CUDA device. 241 | type Device struct { 242 | id C.CUdevice 243 | } 244 | 245 | // AllDevices lists the available CUDA devices. 246 | // 247 | // This needn't be called from a Context. 248 | func AllDevices() ([]*Device, error) { 249 | var count C.int 250 | cuRes := C.cuDeviceGetCount(&count) 251 | if err := newErrorDriver("cuDeviceGetCount", cuRes); err != nil { 252 | return nil, err 253 | } 254 | var res []*Device 255 | for i := C.int(0); i < count; i++ { 256 | var dev C.CUdevice 257 | cuRes = C.cuDeviceGet(&dev, i) 258 | if err := newErrorDriver("cuDeviceGet", cuRes); err != nil { 259 | return nil, err 260 | } 261 | res = append(res, &Device{id: dev}) 262 | } 263 | return res, nil 264 | } 265 | 266 | // Name gets the device's identifier string. 267 | // 268 | // This needn't be called from a Context. 269 | func (d *Device) Name() (string, error) { 270 | res := (*C.char)(C.malloc(0x100)) 271 | defer C.free(unsafe.Pointer(res)) 272 | cuRes := C.cuDeviceGetName(res, 0xff, d.id) 273 | if err := newErrorDriver("cuDeviceGetName", cuRes); err != nil { 274 | return "", err 275 | } 276 | return C.GoString(res), nil 277 | } 278 | 279 | // Attr gets an attribute of the device. 280 | // 281 | // This needn't be called from a Context. 282 | func (d *Device) Attr(attr DevAttr) (int, error) { 283 | var res C.int 284 | cAttr, err := attr.cValue() 285 | if err != nil { 286 | return 0, err 287 | } 288 | cuRes := C.cuDeviceGetAttribute(&res, cAttr, d.id) 289 | if err := newErrorDriver("cuDeviceGetAttribute", cuRes); err != nil { 290 | return 0, err 291 | } 292 | return int(res), nil 293 | } 294 | 295 | // TotalMem gets the device's total memory. 296 | // 297 | // This needn't be called from a Context. 
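//
// A short enumeration sketch combining Name, Attr, and TotalMem:
//
//     devices, err := AllDevices()
//     if err != nil {
//         return err
//     }
//     for _, d := range devices {
//         name, _ := d.Name()
//         major, _ := d.Attr(DevAttrComputeCapabilityMajor)
//         minor, _ := d.Attr(DevAttrComputeCapabilityMinor)
//         mem, _ := d.TotalMem()
//         fmt.Printf("%s: compute %d.%d, %d bytes\n", name, major, minor, mem)
//     }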
298 | func (d *Device) TotalMem() (uint64, error) { 299 | var res C.size_t 300 | cuRes := C.cuDeviceTotalMem(&res, d.id) 301 | if err := newErrorDriver("cuDeviceTotalMem", cuRes); err != nil { 302 | return 0, err 303 | } 304 | return uint64(res), nil 305 | } 306 | -------------------------------------------------------------------------------- /cublas/level1_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | 7 | "github.com/unixpickle/cuda" 8 | ) 9 | 10 | func TestSdot(t *testing.T) { 11 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -2, -3, 5}, 12 | []float32{3, -1, 2, 3, -2, 0, 4, 2.5, 3.5}, []float32{0}) 13 | 14 | <-ctx.Run(func() error { 15 | var res float32 16 | err := handle.Sdot(3, buffers[0], 1, buffers[1], 1, &res) 17 | if err != nil { 18 | t.Error(err) 19 | return nil 20 | } 21 | if math.Abs(float64(res)-7) > 1e-4 { 22 | t.Errorf("bad value: %f", res) 23 | } 24 | 25 | err = handle.Sdot(5, buffers[0], 1, buffers[1], 2, &res) 26 | if err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | if math.Abs(float64(res)-10) > 1e-4 { 31 | t.Errorf("bad value: %f", res) 32 | } 33 | 34 | err = handle.SetPointerMode(Device) 35 | if err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | defer handle.SetPointerMode(Host) 40 | 41 | err = handle.Sdot(3, buffers[0], 3, buffers[1], 4, buffers[2]) 42 | if err != nil { 43 | t.Error(err) 44 | return nil 45 | } 46 | 47 | resSlice := make([]float32, 1) 48 | err = cuda.ReadBuffer(resSlice, buffers[2]) 49 | if err != nil { 50 | t.Error(err) 51 | return nil 52 | } 53 | 54 | if math.Abs(float64(resSlice[0])-12.5) > 1e-4 { 55 | t.Errorf("bad value: %f", resSlice[0]) 56 | } 57 | 58 | return nil 59 | }) 60 | } 61 | 62 | func TestDdot(t *testing.T) { 63 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -2, -3, 5}, 64 | []float64{3, -1, 2, 3, -2, 0, 4, 2.5, 3.5}, []float64{0}) 65 | 66 | <-ctx.Run(func() error { 67 | var res float64 68 | err := handle.Ddot(3, buffers[0], 1, buffers[1], 1, &res) 69 | if err != nil { 70 | t.Error(err) 71 | return nil 72 | } 73 | if math.Abs(res-7) > 1e-4 { 74 | t.Errorf("bad value: %f", res) 75 | } 76 | 77 | err = handle.Ddot(5, buffers[0], 1, buffers[1], 2, &res) 78 | if err != nil { 79 | t.Error(err) 80 | return nil 81 | } 82 | if math.Abs(res-10) > 1e-4 { 83 | t.Errorf("bad value: %f", res) 84 | } 85 | 86 | err = handle.SetPointerMode(Device) 87 | if err != nil { 88 | t.Error(err) 89 | return nil 90 | } 91 | defer handle.SetPointerMode(Host) 92 | 93 | err = handle.Ddot(3, buffers[0], 3, buffers[1], 4, buffers[2]) 94 | if err != nil { 95 | t.Error(err) 96 | return nil 97 | } 98 | 99 | resSlice := make([]float64, 1) 100 | err = cuda.ReadBuffer(resSlice, buffers[2]) 101 | if err != nil { 102 | t.Error(err) 103 | return nil 104 | } 105 | 106 | if math.Abs(resSlice[0]-12.5) > 1e-4 { 107 | t.Errorf("bad value: %f", resSlice[0]) 108 | } 109 | 110 | return nil 111 | }) 112 | } 113 | 114 | func TestSscal(t *testing.T) { 115 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -2, -3, 5}, []float32{0.25}) 116 | <-ctx.Run(func() error { 117 | actions := []func() error{ 118 | func() error { 119 | return handle.Sscal(4, float32(2), buffers[0], 2) 120 | }, 121 | func() error { 122 | scaler := float32(2) 123 | return handle.Sscal(3, &scaler, buffers[0], 1) 124 | }, 125 | func() error { 126 | if err := handle.SetPointerMode(Device); err != nil { 127 | t.Error(err) 128 | return nil 129 | } 130 | defer 
handle.SetPointerMode(Host) 131 | return handle.Sscal(7, buffers[1], buffers[0], 1) 132 | }, 133 | } 134 | expected := [][]float32{ 135 | {2, 2, 6, 4, -4, -3, 10}, 136 | {4, 4, 12, 4, -4, -3, 10}, 137 | {1, 1, 3, 1, -1, -0.75, 2.5}, 138 | } 139 | runTestActions32(t, actions, expected, buffers[0]) 140 | return nil 141 | }) 142 | } 143 | 144 | func TestDscal(t *testing.T) { 145 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -2, -3, 5}, []float64{0.25}) 146 | <-ctx.Run(func() error { 147 | actions := []func() error{ 148 | func() error { 149 | return handle.Dscal(4, float64(2), buffers[0], 2) 150 | }, 151 | func() error { 152 | scaler := float64(2) 153 | return handle.Dscal(3, &scaler, buffers[0], 1) 154 | }, 155 | func() error { 156 | if err := handle.SetPointerMode(Device); err != nil { 157 | t.Error(err) 158 | return nil 159 | } 160 | defer handle.SetPointerMode(Host) 161 | return handle.Dscal(7, buffers[1], buffers[0], 1) 162 | }, 163 | } 164 | expected := [][]float64{ 165 | {2, 2, 6, 4, -4, -3, 10}, 166 | {4, 4, 12, 4, -4, -3, 10}, 167 | {1, 1, 3, 1, -1, -0.75, 2.5}, 168 | } 169 | runTestActions64(t, actions, expected, buffers[0]) 170 | return nil 171 | }) 172 | } 173 | 174 | func TestSaxpy(t *testing.T) { 175 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -2, -3, 5}, 176 | []float32{1, 0, -1, 0, 1, 2, -2, 3, 0}, []float32{3}) 177 | <-ctx.Run(func() error { 178 | actions := []func() error{ 179 | func() error { 180 | return handle.Saxpy(5, float32(2), buffers[0], 1, buffers[1], 2) 181 | }, 182 | func() error { 183 | scaler := float32(-2) 184 | return handle.Saxpy(3, &scaler, buffers[0], 2, buffers[1], 3) 185 | }, 186 | func() error { 187 | if err := handle.SetPointerMode(Device); err != nil { 188 | t.Error(err) 189 | return nil 190 | } 191 | defer handle.SetPointerMode(Host) 192 | return handle.Saxpy(2, buffers[2], buffers[0], 1, buffers[1], 1) 193 | }, 194 | } 195 | expected := [][]float32{ 196 | {3, 0, 3, 0, 7, 2, 6, 3, -4}, 197 | {1, 0, 3, -6, 7, 2, 10, 3, -4}, 198 | {4, 6, 3, -6, 7, 2, 10, 3, -4}, 199 | } 200 | runTestActions32(t, actions, expected, buffers[1]) 201 | return nil 202 | }) 203 | } 204 | 205 | func TestDaxpy(t *testing.T) { 206 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -2, -3, 5}, 207 | []float64{1, 0, -1, 0, 1, 2, -2, 3, 0}, []float64{3}) 208 | <-ctx.Run(func() error { 209 | actions := []func() error{ 210 | func() error { 211 | return handle.Daxpy(5, float64(2), buffers[0], 1, buffers[1], 2) 212 | }, 213 | func() error { 214 | scaler := float64(-2) 215 | return handle.Daxpy(3, &scaler, buffers[0], 2, buffers[1], 3) 216 | }, 217 | func() error { 218 | if err := handle.SetPointerMode(Device); err != nil { 219 | t.Error(err) 220 | return nil 221 | } 222 | defer handle.SetPointerMode(Host) 223 | return handle.Daxpy(2, buffers[2], buffers[0], 1, buffers[1], 1) 224 | }, 225 | } 226 | expected := [][]float64{ 227 | {3, 0, 3, 0, 7, 2, 6, 3, -4}, 228 | {1, 0, 3, -6, 7, 2, 10, 3, -4}, 229 | {4, 6, 3, -6, 7, 2, 10, 3, -4}, 230 | } 231 | runTestActions64(t, actions, expected, buffers[1]) 232 | return nil 233 | }) 234 | } 235 | 236 | func TestIsamax(t *testing.T) { 237 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -3, -2, -5}, 238 | []int32{3}) 239 | <-ctx.Run(func() error { 240 | var idx int 241 | if err := handle.Isamax(6, buffers[0], 1, &idx); err != nil { 242 | t.Error(err) 243 | return nil 244 | } 245 | if idx != 4 { 246 | t.Errorf("expected 4 but got %v", idx) 247 | } 248 | 249 | if err := 
handle.SetPointerMode(Device); err != nil { 250 | t.Error(err) 251 | return nil 252 | } 253 | defer handle.SetPointerMode(Host) 254 | 255 | if err := handle.Isamax(4, buffers[0], 2, buffers[1]); err != nil { 256 | t.Error(err) 257 | return nil 258 | } 259 | 260 | resSlice := make([]int32, 1) 261 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 262 | t.Error(err) 263 | return nil 264 | } 265 | if resSlice[0] != 4 { 266 | t.Errorf("expected 4 but got %v", resSlice[0]) 267 | } 268 | 269 | return nil 270 | }) 271 | } 272 | 273 | func TestIdamax(t *testing.T) { 274 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -3, -2, -5}, 275 | []int32{3}) 276 | <-ctx.Run(func() error { 277 | var idx int 278 | if err := handle.Idamax(6, buffers[0], 1, &idx); err != nil { 279 | t.Error(err) 280 | return nil 281 | } 282 | if idx != 4 { 283 | t.Errorf("expected 4 but got %v", idx) 284 | } 285 | 286 | if err := handle.SetPointerMode(Device); err != nil { 287 | t.Error(err) 288 | return nil 289 | } 290 | defer handle.SetPointerMode(Host) 291 | 292 | if err := handle.Idamax(4, buffers[0], 2, buffers[1]); err != nil { 293 | t.Error(err) 294 | return nil 295 | } 296 | 297 | resSlice := make([]int32, 1) 298 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 299 | t.Error(err) 300 | return nil 301 | } 302 | if resSlice[0] != 4 { 303 | t.Errorf("expected 4 but got %v", resSlice[0]) 304 | } 305 | 306 | return nil 307 | }) 308 | } 309 | 310 | func TestSasum(t *testing.T) { 311 | testNorm32(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 312 | return h.Sasum(n, x, inc, res) 313 | }, 1) 314 | } 315 | 316 | func TestDasum(t *testing.T) { 317 | testNorm64(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 318 | return h.Dasum(n, x, inc, res) 319 | }, 1) 320 | } 321 | 322 | func TestSnrm2(t *testing.T) { 323 | testNorm32(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 324 | return h.Snrm2(n, x, inc, res) 325 | }, 2) 326 | } 327 | 328 | func TestDnrm2(t *testing.T) { 329 | testNorm64(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 330 | return h.Dnrm2(n, x, inc, res) 331 | }, 2) 332 | } 333 | 334 | func runTestActions32(t *testing.T, fs []func() error, expected [][]float32, buf cuda.Buffer) { 335 | for i, f := range fs { 336 | if err := f(); err != nil { 337 | t.Errorf("action %d: %s", i, err) 338 | return 339 | } 340 | x := expected[i] 341 | actual := make([]float32, len(x)) 342 | if err := cuda.ReadBuffer(actual, buf); err != nil { 343 | t.Error(err) 344 | return 345 | } 346 | if maxDelta32(actual, x) > 1e-4 { 347 | t.Errorf("action %d: expected %v but got %v", i, x, actual) 348 | } 349 | } 350 | } 351 | 352 | func runTestActions64(t *testing.T, fs []func() error, expected [][]float64, buf cuda.Buffer) { 353 | for i, f := range fs { 354 | if err := f(); err != nil { 355 | t.Errorf("action %d: %s", i, err) 356 | return 357 | } 358 | x := expected[i] 359 | actual := make([]float64, len(x)) 360 | if err := cuda.ReadBuffer(actual, buf); err != nil { 361 | t.Error(err) 362 | return 363 | } 364 | if maxDelta64(actual, x) > 1e-4 { 365 | t.Errorf("action %d: expected %v but got %v", i, x, actual) 366 | } 367 | } 368 | } 369 | 370 | func testNorm32(t *testing.T, f func(h *Handle, n int, x cuda.Buffer, inc int, 371 | res interface{}) error, base int) { 372 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, -1, -2, -4}, []float32{0.156}) 373 | 374 | stride2Ans := map[int]float32{1: 6, 
2: float32(math.Sqrt(14))} 375 | stride1Ans := map[int]float32{1: 13, 2: float32(math.Sqrt(35))} 376 | 377 | <-ctx.Run(func() error { 378 | var res float32 379 | if err := f(handle, 3, buffers[0], 2, &res); err != nil { 380 | t.Error(err) 381 | return nil 382 | } 383 | if math.Abs(float64(res-stride2Ans[base])) > 1e-4 { 384 | t.Errorf("expected %v but got %v", stride2Ans[base], res) 385 | } 386 | 387 | if err := handle.SetPointerMode(Device); err != nil { 388 | t.Error(err) 389 | return nil 390 | } 391 | defer handle.SetPointerMode(Host) 392 | 393 | if err := f(handle, 6, buffers[0], 1, buffers[1]); err != nil { 394 | t.Error(err) 395 | return nil 396 | } 397 | resSlice := make([]float32, 1) 398 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 399 | t.Error(err) 400 | return nil 401 | } 402 | res = resSlice[0] 403 | if math.Abs(float64(res-stride1Ans[base])) > 1e-4 { 404 | t.Errorf("expected %v but got %v", stride1Ans[base], res) 405 | } 406 | return nil 407 | }) 408 | } 409 | 410 | func testNorm64(t *testing.T, f func(h *Handle, n int, x cuda.Buffer, inc int, 411 | res interface{}) error, base int) { 412 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, -1, -2, -4}, []float64{0.156}) 413 | 414 | stride2Ans := map[int]float64{0: 3, 1: 6, 2: math.Sqrt(14)} 415 | stride1Ans := map[int]float64{0: 4, 1: 13, 2: math.Sqrt(35)} 416 | 417 | <-ctx.Run(func() error { 418 | var res float64 419 | if err := f(handle, 3, buffers[0], 2, &res); err != nil { 420 | t.Error(err) 421 | return nil 422 | } 423 | if math.Abs(res-stride2Ans[base]) > 1e-4 { 424 | t.Errorf("expected %v but got %v", stride2Ans[base], res) 425 | } 426 | 427 | if err := handle.SetPointerMode(Device); err != nil { 428 | t.Error(err) 429 | return nil 430 | } 431 | defer handle.SetPointerMode(Host) 432 | 433 | if err := f(handle, 6, buffers[0], 1, buffers[1]); err != nil { 434 | t.Error(err) 435 | return nil 436 | } 437 | resSlice := make([]float64, 1) 438 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 439 | t.Error(err) 440 | return nil 441 | } 442 | res = resSlice[0] 443 | if math.Abs(res-stride1Ans[base]) > 1e-4 { 444 | t.Errorf("expected %v but got %v", stride1Ans[base], res) 445 | } 446 | return nil 447 | }) 448 | } 449 | -------------------------------------------------------------------------------- /cublas/level1.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "unsafe" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | /* 10 | #include 11 | */ 12 | import "C" 13 | 14 | // Sdot performs a single-precision dot product. 15 | // 16 | // The result argument's type depends on the pointer mode. 17 | // In the Host pointer mode, it should be *float32. 18 | // In the Device pointer mode, it should be a cuda.Buffer. 19 | // 20 | // This must be called inside the cuda.Context. 
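//
// A Host-mode sketch (x and y assumed to hold at least n float32
// values each):
//
//     var dot float32
//     if err := handle.Sdot(n, x, 1, y, 1, &dot); err != nil {
//         return err
//     }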
21 | func (h *Handle) Sdot(n int, x cuda.Buffer, incx int, y cuda.Buffer, incy int, 22 | result interface{}) error { 23 | if n < 0 { 24 | panic("size out of bounds") 25 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 26 | panic("index out of bounds") 27 | } else if stridedSize(y.Size()/4, incy) < uintptr(n) { 28 | panic("index out of bounds") 29 | } 30 | var res C.cublasStatus_t 31 | x.WithPtr(func(xPtr unsafe.Pointer) { 32 | y.WithPtr(func(yPtr unsafe.Pointer) { 33 | if h.PointerMode() == Host { 34 | res = C.cublasSdot(h.handle, safeIntToC(n), 35 | (*C.float)(xPtr), safeIntToC(incx), 36 | (*C.float)(yPtr), safeIntToC(incy), 37 | (*C.float)(result.(*float32))) 38 | } else { 39 | b := result.(cuda.Buffer) 40 | if b.Size() < 4 { 41 | panic("buffer underflow") 42 | } 43 | b.WithPtr(func(outPtr unsafe.Pointer) { 44 | res = C.cublasSdot(h.handle, safeIntToC(n), 45 | (*C.float)(xPtr), safeIntToC(incx), 46 | (*C.float)(yPtr), safeIntToC(incy), 47 | (*C.float)(outPtr)) 48 | }) 49 | } 50 | }) 51 | }) 52 | return newError("cublasSdot", res) 53 | } 54 | 55 | // Ddot performs a double-precision dot product. 56 | // 57 | // The result argument's type depends on the pointer mode. 58 | // In the Host pointer mode, it should be *float64. 59 | // In the Device pointer mode, it should be a cuda.Buffer. 60 | // 61 | // This must be called inside the cuda.Context. 62 | func (h *Handle) Ddot(n int, x cuda.Buffer, incx int, y cuda.Buffer, incy int, 63 | result interface{}) error { 64 | if n < 0 { 65 | panic("size out of bounds") 66 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 67 | panic("index out of bounds") 68 | } else if stridedSize(y.Size()/8, incy) < uintptr(n) { 69 | panic("index out of bounds") 70 | } 71 | var res C.cublasStatus_t 72 | x.WithPtr(func(xPtr unsafe.Pointer) { 73 | y.WithPtr(func(yPtr unsafe.Pointer) { 74 | if h.PointerMode() == Host { 75 | res = C.cublasDdot(h.handle, safeIntToC(n), 76 | (*C.double)(xPtr), safeIntToC(incx), 77 | (*C.double)(yPtr), safeIntToC(incy), 78 | (*C.double)(result.(*float64))) 79 | } else { 80 | b := result.(cuda.Buffer) 81 | if b.Size() < 8 { 82 | panic("buffer underflow") 83 | } 84 | b.WithPtr(func(outPtr unsafe.Pointer) { 85 | res = C.cublasDdot(h.handle, safeIntToC(n), 86 | (*C.double)(xPtr), safeIntToC(incx), 87 | (*C.double)(yPtr), safeIntToC(incy), 88 | (*C.double)(outPtr)) 89 | }) 90 | } 91 | }) 92 | }) 93 | return newError("cublasDdot", res) 94 | } 95 | 96 | // Sscal scales a single-precision vector. 97 | // 98 | // The argument alpha's type depends on the pointer mode. 99 | // In the Host pointer mode, use float32 or *float32. 100 | // In the Device pointer mode, use cuda.Buffer. 101 | // 102 | // This must be called inside the cuda.Context. 
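//
// For example, doubling every other element of x in Host mode
// (x assumed to hold at least 7 float32 values):
//
//     if err := handle.Sscal(4, float32(2), x, 2); err != nil {
//         return err
//     }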
103 | func (h *Handle) Sscal(n int, alpha interface{}, x cuda.Buffer, incx int) error { 104 | if n < 0 { 105 | panic("size out of bounds") 106 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 107 | panic("index out of bounds") 108 | } 109 | 110 | var res C.cublasStatus_t 111 | x.WithPtr(func(ptr unsafe.Pointer) { 112 | if h.PointerMode() == Host { 113 | pointerizeInputs(&alpha) 114 | res = C.cublasSscal(h.handle, safeIntToC(n), (*C.float)(alpha.(*float32)), 115 | (*C.float)(ptr), safeIntToC(incx)) 116 | } else { 117 | b := alpha.(cuda.Buffer) 118 | if b.Size() < 4 { 119 | panic("buffer underflow") 120 | } 121 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 122 | res = C.cublasSscal(h.handle, safeIntToC(n), (*C.float)(alphaPtr), 123 | (*C.float)(ptr), safeIntToC(incx)) 124 | }) 125 | } 126 | }) 127 | 128 | return newError("cublasSscal", res) 129 | } 130 | 131 | // Dscal is like Sscal, but for double-precision. 132 | // 133 | // The argument alpha's type depends on the pointer mode. 134 | // In the Host pointer mode, use float64 or *float64. 135 | // In the Device pointer mode, use cuda.Buffer. 136 | // 137 | // This must be called inside the cuda.Context. 138 | func (h *Handle) Dscal(n int, alpha interface{}, x cuda.Buffer, incx int) error { 139 | if n < 0 { 140 | panic("size out of bounds") 141 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 142 | panic("index out of bounds") 143 | } 144 | 145 | var res C.cublasStatus_t 146 | x.WithPtr(func(ptr unsafe.Pointer) { 147 | if h.PointerMode() == Host { 148 | pointerizeInputs(&alpha) 149 | res = C.cublasDscal(h.handle, safeIntToC(n), (*C.double)(alpha.(*float64)), 150 | (*C.double)(ptr), safeIntToC(incx)) 151 | } else { 152 | b := alpha.(cuda.Buffer) 153 | if b.Size() < 8 { 154 | panic("buffer underflow") 155 | } 156 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 157 | res = C.cublasDscal(h.handle, safeIntToC(n), (*C.double)(alphaPtr), 158 | (*C.double)(ptr), safeIntToC(incx)) 159 | }) 160 | } 161 | }) 162 | 163 | return newError("cublasDscal", res) 164 | } 165 | 166 | // Saxpy computes single-precision "ax plus y". 167 | // 168 | // The argument alpha's type depends on the pointer mode. 169 | // In the Host pointer mode, use float32 or *float32. 170 | // In the Device pointer mode, use cuda.Buffer. 171 | // 172 | // This must be called inside the cuda.Context. 173 | func (h *Handle) Saxpy(n int, alpha interface{}, x cuda.Buffer, incx int, 174 | y cuda.Buffer, incy int) error { 175 | if n < 0 { 176 | panic("size out of bounds") 177 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 178 | panic("index out of bounds") 179 | } else if stridedSize(y.Size()/4, incy) < uintptr(n) { 180 | panic("index out of bounds") 181 | } 182 | 183 | var res C.cublasStatus_t 184 | x.WithPtr(func(xPtr unsafe.Pointer) { 185 | y.WithPtr(func(yPtr unsafe.Pointer) { 186 | if h.PointerMode() == Host { 187 | pointerizeInputs(&alpha) 188 | res = C.cublasSaxpy(h.handle, safeIntToC(n), (*C.float)(alpha.(*float32)), 189 | (*C.float)(xPtr), safeIntToC(incx), 190 | (*C.float)(yPtr), safeIntToC(incy)) 191 | } else { 192 | b := alpha.(cuda.Buffer) 193 | if b.Size() < 4 { 194 | panic("buffer underflow") 195 | } 196 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 197 | res = C.cublasSaxpy(h.handle, safeIntToC(n), (*C.float)(alphaPtr), 198 | (*C.float)(xPtr), safeIntToC(incx), 199 | (*C.float)(yPtr), safeIntToC(incy)) 200 | }) 201 | } 202 | }) 203 | }) 204 | 205 | return newError("cublasSaxpy", res) 206 | } 207 | 208 | // Daxpy is like Saxpy, but for double-precision. 
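// For instance, in the Host pointer mode the sketch
//
//	err := h.Daxpy(n, float64(2), x, 1, y, 1)
//
// overwrites y with 2*x + y (x, y, and n are placeholders; both
// buffers are assumed to hold at least n float64 values).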
209 | // 210 | // The argument alpha's type depends on the pointer mode. 211 | // In the Host pointer mode, use float64 or *float64. 212 | // In the Device pointer mode, use cuda.Buffer. 213 | // 214 | // This must be called inside the cuda.Context. 215 | func (h *Handle) Daxpy(n int, alpha interface{}, x cuda.Buffer, incx int, 216 | y cuda.Buffer, incy int) error { 217 | if n < 0 { 218 | panic("size out of bounds") 219 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 220 | panic("index out of bounds") 221 | } else if stridedSize(y.Size()/8, incy) < uintptr(n) { 222 | panic("index out of bounds") 223 | } 224 | 225 | var res C.cublasStatus_t 226 | x.WithPtr(func(xPtr unsafe.Pointer) { 227 | y.WithPtr(func(yPtr unsafe.Pointer) { 228 | if h.PointerMode() == Host { 229 | pointerizeInputs(&alpha) 230 | res = C.cublasDaxpy(h.handle, safeIntToC(n), (*C.double)(alpha.(*float64)), 231 | (*C.double)(xPtr), safeIntToC(incx), 232 | (*C.double)(yPtr), safeIntToC(incy)) 233 | } else { 234 | b := alpha.(cuda.Buffer) 235 | if b.Size() < 8 { 236 | panic("buffer underflow") 237 | } 238 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 239 | res = C.cublasDaxpy(h.handle, safeIntToC(n), (*C.double)(alphaPtr), 240 | (*C.double)(xPtr), safeIntToC(incx), 241 | (*C.double)(yPtr), safeIntToC(incy)) 242 | }) 243 | } 244 | }) 245 | }) 246 | 247 | return newError("cublasDaxpy", res) 248 | } 249 | 250 | // Isamax gets the index of the first single-precision 251 | // vector component with the max absolute value. 252 | // The resulting indices start at one, not zero. 253 | // 254 | // The result argument's type depends on the pointer mode. 255 | // In the Host pointer mode, use *int. 256 | // In the Device pointer mode, use cuda.Buffer. 257 | // 258 | // This must be called inside the cuda.Context. 259 | func (h *Handle) Isamax(n int, x cuda.Buffer, incx int, result interface{}) error { 260 | if n < 0 { 261 | panic("size out of bounds") 262 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 263 | panic("index out of bounds") 264 | } 265 | 266 | var res C.cublasStatus_t 267 | x.WithPtr(func(xPtr unsafe.Pointer) { 268 | if h.PointerMode() == Host { 269 | var resInt C.int 270 | res = C.cublasIsamax(h.handle, safeIntToC(n), (*C.float)(xPtr), 271 | safeIntToC(incx), &resInt) 272 | *(result.(*int)) = int(resInt) 273 | } else { 274 | b := result.(cuda.Buffer) 275 | if b.Size() < 4 { 276 | panic("buffer underflow") 277 | } 278 | b.WithPtr(func(resPtr unsafe.Pointer) { 279 | res = C.cublasIsamax(h.handle, safeIntToC(n), (*C.float)(xPtr), 280 | safeIntToC(incx), (*C.int)(resPtr)) 281 | }) 282 | } 283 | }) 284 | 285 | return newError("cublasIsamax", res) 286 | } 287 | 288 | // Idamax is like Isamax, but for double-precision. 289 | // 290 | // The result argument's type depends on the pointer mode. 291 | // In the Host pointer mode, use *int. 292 | // In the Device pointer mode, use cuda.Buffer. 293 | // 294 | // This must be called inside the cuda.Context. 
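//
// A Host-mode sketch (x and n are placeholders; x is assumed to
// hold at least n float64 values; the index written to idx is
// 1-based, per the BLAS convention):
//
//	var idx int
//	if err := h.Idamax(n, x, 1, &idx); err != nil {
//		// handle the cuBLAS error
//	}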
295 | func (h *Handle) Idamax(n int, x cuda.Buffer, incx int, result interface{}) error { 296 | if n < 0 { 297 | panic("size out of bounds") 298 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 299 | panic("index out of bounds") 300 | } 301 | 302 | var res C.cublasStatus_t 303 | x.WithPtr(func(xPtr unsafe.Pointer) { 304 | if h.PointerMode() == Host { 305 | var resInt C.int 306 | res = C.cublasIdamax(h.handle, safeIntToC(n), (*C.double)(xPtr), 307 | safeIntToC(incx), &resInt) 308 | *(result.(*int)) = int(resInt) 309 | } else { 310 | b := result.(cuda.Buffer) 311 | if b.Size() < 4 { 312 | panic("buffer underflow") 313 | } 314 | b.WithPtr(func(resPtr unsafe.Pointer) { 315 | res = C.cublasIdamax(h.handle, safeIntToC(n), (*C.double)(xPtr), 316 | safeIntToC(incx), (*C.int)(resPtr)) 317 | }) 318 | } 319 | }) 320 | 321 | return newError("cublasIdamax", res) 322 | } 323 | 324 | // Sasum sums the absolute values of the components in a 325 | // single-precision vector. 326 | // 327 | // The result argument's type depends on the pointer mode. 328 | // In the Host pointer mode, use *float32. 329 | // In the Device pointer mode, use cuda.Buffer. 330 | // 331 | // This must be called inside the cuda.Context. 332 | func (h *Handle) Sasum(n int, x cuda.Buffer, incx int, result interface{}) error { 333 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.float, arg4 C.int, 334 | arg5 *C.float) C.cublasStatus_t { 335 | return C.cublasSasum(arg1, arg2, arg3, arg4, arg5) 336 | } 337 | return newError("cublasSasum", h.norm32(n, x, incx, result, f)) 338 | } 339 | 340 | // Dasum is like Sasum, but for double-precision. 341 | // 342 | // The result argument's type depends on the pointer mode. 343 | // In the Host pointer mode, use *float64. 344 | // In the Device pointer mode, use cuda.Buffer. 345 | // 346 | // This must be called inside the cuda.Context. 347 | func (h *Handle) Dasum(n int, x cuda.Buffer, incx int, result interface{}) error { 348 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.double, arg4 C.int, 349 | arg5 *C.double) C.cublasStatus_t { 350 | return C.cublasDasum(arg1, arg2, arg3, arg4, arg5) 351 | } 352 | return newError("cublasDasum", h.norm64(n, x, incx, result, f)) 353 | } 354 | 355 | // Snrm2 computes the Euclidean norm of a single-precision 356 | // vector. 357 | // 358 | // The result argument's type depends on the pointer mode. 359 | // In the Host pointer mode, use *float32. 360 | // In the Device pointer mode, use cuda.Buffer. 361 | // 362 | // This must be called inside the cuda.Context. 363 | func (h *Handle) Snrm2(n int, x cuda.Buffer, incx int, result interface{}) error { 364 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.float, arg4 C.int, 365 | arg5 *C.float) C.cublasStatus_t { 366 | return C.cublasSnrm2(arg1, arg2, arg3, arg4, arg5) 367 | } 368 | return newError("cublasSnrm2", h.norm32(n, x, incx, result, f)) 369 | } 370 | 371 | // Dnrm2 is like Snrm2, but for double-precision. 372 | // 373 | // The result argument's type depends on the pointer mode. 374 | // In the Host pointer mode, use *float64. 375 | // In the Device pointer mode, use cuda.Buffer. 376 | // 377 | // This must be called inside the cuda.Context. 
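//
// A Device-mode sketch (x, out, and n are placeholders; out is
// assumed to be a cuda.Buffer of at least 8 bytes, and the pointer
// mode is assumed to have been set with h.SetPointerMode(Device)):
//
//	if err := h.Dnrm2(n, x, 1, out); err != nil {
//		// handle the cuBLAS error
//	}
//
// The norm can then be copied back to the host with cuda.ReadBuffer.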
378 | func (h *Handle) Dnrm2(n int, x cuda.Buffer, incx int, result interface{}) error { 379 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.double, arg4 C.int, 380 | arg5 *C.double) C.cublasStatus_t { 381 | return C.cublasDnrm2(arg1, arg2, arg3, arg4, arg5) 382 | } 383 | return newError("cublasDnrm2", h.norm64(n, x, incx, result, f)) 384 | } 385 | 386 | func (h *Handle) norm32(n int, x cuda.Buffer, incx int, result interface{}, 387 | f func(C.cublasHandle_t, C.int, *C.float, C.int, *C.float) C.cublasStatus_t) C.cublasStatus_t { 388 | if n < 0 { 389 | panic("size out of bounds") 390 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 391 | panic("index out of bounds") 392 | } 393 | 394 | var res C.cublasStatus_t 395 | x.WithPtr(func(xPtr unsafe.Pointer) { 396 | if h.PointerMode() == Host { 397 | res = f(h.handle, safeIntToC(n), (*C.float)(xPtr), 398 | safeIntToC(incx), (*C.float)(result.(*float32))) 399 | } else { 400 | b := result.(cuda.Buffer) 401 | if b.Size() < 4 { 402 | panic("buffer underflow") 403 | } 404 | b.WithPtr(func(resPtr unsafe.Pointer) { 405 | res = f(h.handle, safeIntToC(n), (*C.float)(xPtr), 406 | safeIntToC(incx), (*C.float)(resPtr)) 407 | }) 408 | } 409 | }) 410 | 411 | return res 412 | } 413 | 414 | func (h *Handle) norm64(n int, x cuda.Buffer, incx int, result interface{}, 415 | f func(C.cublasHandle_t, C.int, *C.double, C.int, 416 | *C.double) C.cublasStatus_t) C.cublasStatus_t { 417 | if n < 0 { 418 | panic("size out of bounds") 419 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 420 | panic("index out of bounds") 421 | } 422 | 423 | var res C.cublasStatus_t 424 | x.WithPtr(func(xPtr unsafe.Pointer) { 425 | if h.PointerMode() == Host { 426 | res = f(h.handle, safeIntToC(n), (*C.double)(xPtr), 427 | safeIntToC(incx), (*C.double)(result.(*float64))) 428 | } else { 429 | b := result.(cuda.Buffer) 430 | if b.Size() < 8 { 431 | panic("buffer underflow") 432 | } 433 | b.WithPtr(func(resPtr unsafe.Pointer) { 434 | res = f(h.handle, safeIntToC(n), (*C.double)(xPtr), 435 | safeIntToC(incx), (*C.double)(resPtr)) 436 | }) 437 | } 438 | }) 439 | 440 | return res 441 | } 442 | 443 | func stridedSize(totalCount uintptr, inc int) uintptr { 444 | if inc == 0 { 445 | panic("zero increment") 446 | } else if inc < 0 { 447 | inc = -inc 448 | } 449 | // Do this in such a way that we never risk overflow. 450 | res := totalCount / uintptr(inc) 451 | if totalCount%uintptr(inc) != 0 { 452 | res++ 453 | } 454 | return res 455 | } 456 | --------------------------------------------------------------------------------