├── README.md ├── test_data ├── kernels.cu └── kernels.ptx ├── cublas ├── conversions.go ├── pointer_mode.go ├── errors.go ├── cublas.go ├── helpers_test.go ├── extensions_test.go ├── extensions.go ├── level2_test.go ├── level3_test.go ├── level2.go ├── level3.go ├── level1_test.go └── level1.go ├── helpers_test.go ├── device_test.go ├── LICENSE ├── curand ├── errors.go ├── curand_test.go └── curand.go ├── module_test.go ├── context.go ├── buffer_test.go ├── stream.go ├── doc.go ├── allocator_bfc.go ├── errors.go ├── allocator.go ├── module.go ├── buffer.go └── device.go /README.md: -------------------------------------------------------------------------------- 1 | # cuda 2 | 3 | This is a [Go](https://golang.org) package for interacting with [CUDA](https://en.wikipedia.org/wiki/CUDA). See the [GoDoc](https://godoc.org/github.com/unixpickle/cuda) for detailed usage information. 4 | 5 | # License 6 | 7 | This is licensed under a BSD 2-clause license. See [LICENSE](LICENSE). 8 | -------------------------------------------------------------------------------- /test_data/kernels.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ 2 | void my_fancy_kernel(int n, float a, double b, int c, unsigned int d, double * out1, float * out2) { 3 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 4 | if (tid < n) { 5 | out1[tid] = (double)tid + (double)a + b + (double)c; 6 | out2[tid] = (float)c + (float)d - a; 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /cublas/conversions.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import "C" 4 | 5 | func safeUintToC(x uint) C.uint { 6 | if x > uint(^C.uint(0)) { 7 | panic("uint value out of bounds") 8 | } 9 | return C.uint(x) 10 | } 11 | 12 | func safeIntToC(x int) C.int { 13 | if x > int(C.int(^C.uint(0)/2)) { 14 | panic("int value out of bounds") 15 | } else if x < int((-C.int(^C.uint(0)/2))-1) { 16 | panic("int value out of bounds") 17 | } 18 | return C.int(x) 19 | } 20 | -------------------------------------------------------------------------------- /helpers_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import "testing" 4 | 5 | var testingContext *Context 6 | var testingAllocator Allocator 7 | 8 | func setupTest(t *testing.T) (*Context, Allocator) { 9 | if testingContext != nil { 10 | return testingContext, testingAllocator 11 | } 12 | devices, err := AllDevices() 13 | if err != nil { 14 | t.Fatal(err) 15 | } 16 | if len(devices) == 0 { 17 | t.Fatal("no CUDA devices") 18 | } 19 | testingContext, err = NewContext(devices[0], 10) 20 | if err != nil { 21 | t.Fatal(err) 22 | } 23 | testingAllocator = GCAllocator(NativeAllocator(testingContext), 0) 24 | return testingContext, testingAllocator 25 | } 26 | -------------------------------------------------------------------------------- /device_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import "testing" 4 | 5 | func TestDeviceName(t *testing.T) { 6 | devices, err := AllDevices() 7 | if err != nil { 8 | t.Fatal(err) 9 | } 10 | for i, d := range devices { 11 | name, err := d.Name() 12 | if err != nil { 13 | t.Errorf("device %d: %v", i, err) 14 | } else if len(name) == 0 { 15 | t.Errorf("device %d: empty name", i) 16 | } 17 | } 18 | } 19 | 20 | func TestDeviceAttr(t *testing.T) { 21 | devices, err := 
AllDevices()
22 | 	if err != nil {
23 | 		t.Fatal(err)
24 | 	}
25 | 	for i, d := range devices {
26 | 		rate, err := d.Attr(DevAttrClockRate)
27 | 		if err != nil {
28 | 			t.Errorf("device %d: %v", i, err)
29 | 		} else if rate == 0 {
30 | 			t.Errorf("device %d: clock rate 0", i)
31 | 		}
32 | 	}
33 | }
34 | 
35 | func TestDeviceTotalMem(t *testing.T) {
36 | 	devices, err := AllDevices()
37 | 	if err != nil {
38 | 		t.Fatal(err)
39 | 	}
40 | 	for i, d := range devices {
41 | 		mem, err := d.TotalMem()
42 | 		if err != nil {
43 | 			t.Errorf("device %d: %v", i, err)
44 | 		} else if mem == 0 {
45 | 			t.Errorf("device %d: no memory", i)
46 | 		}
47 | 	}
48 | }
49 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017, Alexander Nichol.
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 |    list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 |    this list of conditions and the following disclaimer in the documentation
11 |    and/or other materials provided with the distribution.
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | 
--------------------------------------------------------------------------------
/cublas/pointer_mode.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | 
6 | const cublasPointerMode_t goCublasPointerModeHost = CUBLAS_POINTER_MODE_HOST;
7 | const cublasPointerMode_t goCublasPointerModeDevice = CUBLAS_POINTER_MODE_DEVICE;
8 | */
9 | import "C"
10 | 
11 | // PointerMode determines how BLAS APIs receive and return
12 | // scalar values.
13 | //
14 | // There are two types of scalar values in the API: scalar
15 | // inputs and scalar return values.
16 | // The current pointer mode affects both types of values.
17 | //
18 | // If the pointer mode is Device, then all scalar inputs
19 | // and outputs must be cuda.Buffer objects.
20 | //
21 | // If the pointer mode is Host, then all scalar inputs
22 | // must be float32, float64, *float32, or *float64;
23 | // all scalar outputs must be *float32 or *float64.
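// As an illustrative sketch only (h, matA, x, y, alphaBuf, and
// betaBuf are assumed names; package qualifiers omitted): in Host
// mode the scalars live in Go memory, while in Device mode they
// must already sit in CUDA memory, mirroring the gemv tests later
// in this package.
//
//	// Host mode (the default):
//	alpha := float32(2.5)
//	err := h.Sgemv(NoTrans, 3, 2, &alpha, matA, 4, x, -2, float32(1), y, 3)
//
//	// Device mode: alphaBuf and betaBuf are cuda.Buffers
//	// each holding a single float32.
//	h.SetPointerMode(Device)
//	err = h.Sgemv(NoTrans, 3, 2, alphaBuf, matA, 4, x, -2, betaBuf, y, 3)
//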
24 | type PointerMode int
25 | 
26 | const (
27 | 	Host PointerMode = iota
28 | 	Device
29 | )
30 | 
31 | func (p PointerMode) cPointerMode() C.cublasPointerMode_t {
32 | 	switch p {
33 | 	case Host:
34 | 		return C.goCublasPointerModeHost
35 | 	case Device:
36 | 		return C.goCublasPointerModeDevice
37 | 	default:
38 | 		panic("invalid PointerMode")
39 | 	}
40 | }
41 | 
42 | // pointerizeInputs replaces float32 and float64 values
43 | // with *float32 and *float64 values.
44 | func pointerizeInputs(args ...*interface{}) {
45 | 	for _, x := range args {
46 | 		switch val := (*x).(type) {
47 | 		case float32:
48 | 			*x = &val
49 | 		case float64:
50 | 			*x = &val
51 | 		}
52 | 	}
53 | }
54 | 
--------------------------------------------------------------------------------
/cublas/errors.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | 
6 | // Needed to check for NULL from Cgo.
7 | const char * go_cublas_null_message = NULL;
8 | 
9 | const char * go_cublas_err(cublasStatus_t s) {
10 | 	switch (s) {
11 | 	case CUBLAS_STATUS_SUCCESS:
12 | 		return NULL;
13 | 	case CUBLAS_STATUS_NOT_INITIALIZED:
14 | 		return "CUBLAS_STATUS_NOT_INITIALIZED";
15 | 	case CUBLAS_STATUS_ALLOC_FAILED:
16 | 		return "CUBLAS_STATUS_ALLOC_FAILED";
17 | 	case CUBLAS_STATUS_INVALID_VALUE:
18 | 		return "CUBLAS_STATUS_INVALID_VALUE";
19 | 	case CUBLAS_STATUS_ARCH_MISMATCH:
20 | 		return "CUBLAS_STATUS_ARCH_MISMATCH";
21 | 	case CUBLAS_STATUS_MAPPING_ERROR:
22 | 		return "CUBLAS_STATUS_MAPPING_ERROR";
23 | 	case CUBLAS_STATUS_EXECUTION_FAILED:
24 | 		return "CUBLAS_STATUS_EXECUTION_FAILED";
25 | 	case CUBLAS_STATUS_INTERNAL_ERROR:
26 | 		return "CUBLAS_STATUS_INTERNAL_ERROR";
27 | 	case CUBLAS_STATUS_NOT_SUPPORTED:
28 | 		return "CUBLAS_STATUS_NOT_SUPPORTED";
29 | 	case CUBLAS_STATUS_LICENSE_ERROR:
30 | 		return "CUBLAS_STATUS_LICENSE_ERROR";
31 | 	default:
32 | 		return "unknown cuBLAS error";
33 | 	}
34 | }
35 | */
36 | import "C"
37 | 
38 | import "github.com/unixpickle/cuda"
39 | 
40 | // newError creates an Error from the result of a cuBLAS
41 | // API call.
42 | //
43 | // If e is CUBLAS_STATUS_SUCCESS, nil is returned.
44 | func newError(context string, e C.cublasStatus_t) error {
45 | 	cstr := C.go_cublas_err(e)
46 | 	if cstr == C.go_cublas_null_message {
47 | 		return nil
48 | 	}
49 | 	name := C.GoString(cstr)
50 | 	return &cuda.Error{
51 | 		Context: context,
52 | 		Name:    name,
53 | 		Message: name,
54 | 	}
55 | }
56 | 
--------------------------------------------------------------------------------
/cublas/cublas.go:
--------------------------------------------------------------------------------
1 | // Package cublas provides bindings for the CUDA cuBLAS
2 | // library.
3 | package cublas
4 | 
5 | /*
6 | #include <cublas_v2.h>
7 | */
8 | import "C"
9 | 
10 | import (
11 | 	"runtime"
12 | 
13 | 	"github.com/unixpickle/cuda"
14 | )
15 | 
16 | // A Handle is used to make cuBLAS calls.
17 | //
18 | // A given Handle is bound to a specific cuda.Context.
19 | type Handle struct {
20 | 	handle C.cublasHandle_t
21 | 	ctx    *cuda.Context
22 | 
23 | 	ptrMode PointerMode
24 | }
25 | 
26 | // NewHandle creates a new cuBLAS handle.
27 | //
28 | // This must be called inside the cuda.Context.
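// A minimal usage sketch (ctx is an assumed *cuda.Context; error
// handling abbreviated):
//
//	err := <-ctx.Run(func() error {
//		handle, err := cublas.NewHandle(ctx)
//		if err != nil {
//			return err
//		}
//		// Optionally bind a stream to the handle:
//		// stream, _ := cuda.NewStream(false)
//		// handle.SetStream(stream)
//		return nil
//	})
//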
29 | func NewHandle(ctx *cuda.Context) (*Handle, error) {
30 | 	res := &Handle{ctx: ctx, ptrMode: Host}
31 | 	err := newError("cublasCreate", C.cublasCreate(&res.handle))
32 | 	if err != nil {
33 | 		return nil, err
34 | 	}
35 | 	runtime.SetFinalizer(res, func(obj *Handle) {
36 | 		go obj.ctx.Run(func() error {
37 | 			C.cublasDestroy(obj.handle)
38 | 			return nil
39 | 		})
40 | 	})
41 | 	return res, nil
42 | }
43 | 
44 | // PointerMode returns the current PointerMode.
45 | //
46 | // This must be called inside the cuda.Context.
47 | func (h *Handle) PointerMode() PointerMode {
48 | 	return h.ptrMode
49 | }
50 | 
51 | // SetPointerMode updates the current PointerMode.
52 | //
53 | // This must be called inside the cuda.Context.
54 | func (h *Handle) SetPointerMode(p PointerMode) error {
55 | 	res := C.cublasSetPointerMode(h.handle, p.cPointerMode())
56 | 	if err := newError("cublasSetPointerMode", res); err != nil {
57 | 		return err
58 | 	}
59 | 	h.ptrMode = p
60 | 	return nil
61 | }
62 | 
63 | // SetStream tells the handle which stream to use for its
64 | // computations.
65 | //
66 | // This must be called inside the cuda.Context.
67 | func (h *Handle) SetStream(s *cuda.Stream) error {
68 | 	res := C.cublasSetStream(h.handle, C.cudaStream_t(s.Pointer()))
69 | 	return newError("cublasSetStream", res)
70 | }
71 | 
--------------------------------------------------------------------------------
/curand/errors.go:
--------------------------------------------------------------------------------
1 | package curand
2 | 
3 | /*
4 | #include <curand.h>
5 | 
6 | // Needed to check for NULL from Cgo.
7 | const char * goCurandNULLMessage = NULL;
8 | 
9 | const char * go_curand_err(curandStatus_t s) {
10 | 	switch (s) {
11 | 	case CURAND_STATUS_SUCCESS:
12 | 		return NULL;
13 | 	case CURAND_STATUS_VERSION_MISMATCH:
14 | 		return "CURAND_STATUS_VERSION_MISMATCH";
15 | 	case CURAND_STATUS_NOT_INITIALIZED:
16 | 		return "CURAND_STATUS_NOT_INITIALIZED";
17 | 	case CURAND_STATUS_ALLOCATION_FAILED:
18 | 		return "CURAND_STATUS_ALLOCATION_FAILED";
19 | 	case CURAND_STATUS_TYPE_ERROR:
20 | 		return "CURAND_STATUS_TYPE_ERROR";
21 | 	case CURAND_STATUS_OUT_OF_RANGE:
22 | 		return "CURAND_STATUS_OUT_OF_RANGE";
23 | 	case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
24 | 		return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
25 | 	case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
26 | 		return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
27 | 	case CURAND_STATUS_LAUNCH_FAILURE:
28 | 		return "CURAND_STATUS_LAUNCH_FAILURE";
29 | 	case CURAND_STATUS_PREEXISTING_FAILURE:
30 | 		return "CURAND_STATUS_PREEXISTING_FAILURE";
31 | 	case CURAND_STATUS_INITIALIZATION_FAILED:
32 | 		return "CURAND_STATUS_INITIALIZATION_FAILED";
33 | 	case CURAND_STATUS_ARCH_MISMATCH:
34 | 		return "CURAND_STATUS_ARCH_MISMATCH";
35 | 	case CURAND_STATUS_INTERNAL_ERROR:
36 | 		return "CURAND_STATUS_INTERNAL_ERROR";
37 | 	default:
38 | 		return "unknown cuRAND error";
39 | 	}
40 | }
41 | */
42 | import "C"
43 | 
44 | import "github.com/unixpickle/cuda"
45 | 
46 | // newError creates an Error from the result of a cuRAND
47 | // API call.
48 | //
49 | // If e is CURAND_STATUS_SUCCESS, nil is returned.
50 | func newError(context string, e C.curandStatus_t) error { 51 | msg := C.go_curand_err(e) 52 | if msg == C.goCurandNULLMessage { 53 | return nil 54 | } 55 | name := C.GoString(msg) 56 | return &cuda.Error{ 57 | Context: context, 58 | Name: name, 59 | Message: name, 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /test_data/kernels.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-21313162 5 | // Cuda compilation tools, release 8.0, V8.0.53 6 | // Based on LLVM 3.4svn 7 | // 8 | 9 | .version 4.3 10 | .target sm_30 11 | .address_size 64 12 | 13 | // .globl my_fancy_kernel 14 | 15 | .visible .entry my_fancy_kernel( 16 | .param .u32 my_fancy_kernel_param_0, 17 | .param .f32 my_fancy_kernel_param_1, 18 | .param .f64 my_fancy_kernel_param_2, 19 | .param .u32 my_fancy_kernel_param_3, 20 | .param .u32 my_fancy_kernel_param_4, 21 | .param .u64 my_fancy_kernel_param_5, 22 | .param .u64 my_fancy_kernel_param_6 23 | ) 24 | { 25 | .reg .pred %p<2>; 26 | .reg .f32 %f<6>; 27 | .reg .b32 %r<8>; 28 | .reg .f64 %fd<8>; 29 | .reg .b64 %rd<9>; 30 | 31 | 32 | ld.param.u32 %r4, [my_fancy_kernel_param_0]; 33 | ld.param.f32 %f1, [my_fancy_kernel_param_1]; 34 | ld.param.f64 %fd1, [my_fancy_kernel_param_2]; 35 | ld.param.u32 %r2, [my_fancy_kernel_param_3]; 36 | ld.param.u32 %r3, [my_fancy_kernel_param_4]; 37 | ld.param.u64 %rd1, [my_fancy_kernel_param_5]; 38 | ld.param.u64 %rd2, [my_fancy_kernel_param_6]; 39 | mov.u32 %r5, %ctaid.x; 40 | mov.u32 %r6, %ntid.x; 41 | mov.u32 %r7, %tid.x; 42 | mad.lo.s32 %r1, %r6, %r5, %r7; 43 | setp.ge.s32 %p1, %r1, %r4; 44 | @%p1 bra BB0_2; 45 | 46 | cvta.to.global.u64 %rd3, %rd1; 47 | cvt.f64.f32 %fd2, %f1; 48 | cvt.rn.f64.s32 %fd3, %r1; 49 | add.f64 %fd4, %fd2, %fd3; 50 | add.f64 %fd5, %fd4, %fd1; 51 | cvt.rn.f64.s32 %fd6, %r2; 52 | add.f64 %fd7, %fd6, %fd5; 53 | mul.wide.s32 %rd4, %r1, 8; 54 | add.s64 %rd5, %rd3, %rd4; 55 | st.global.f64 [%rd5], %fd7; 56 | cvt.rn.f32.u32 %f2, %r3; 57 | cvt.rn.f32.s32 %f3, %r2; 58 | add.f32 %f4, %f3, %f2; 59 | sub.f32 %f5, %f4, %f1; 60 | cvta.to.global.u64 %rd6, %rd2; 61 | mul.wide.s32 %rd7, %r1, 4; 62 | add.s64 %rd8, %rd6, %rd7; 63 | st.global.f32 [%rd8], %f5; 64 | 65 | BB0_2: 66 | ret; 67 | } 68 | 69 | 70 | -------------------------------------------------------------------------------- /module_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import ( 4 | "io/ioutil" 5 | "math" 6 | "testing" 7 | "unsafe" 8 | ) 9 | 10 | func TestModule(t *testing.T) { 11 | ptx, err := ioutil.ReadFile("test_data/kernels.ptx") 12 | if err != nil { 13 | t.Fatal(err) 14 | } 15 | ctx, a := setupTest(t) 16 | 17 | runTest := func(t *testing.T, stream *Stream) { 18 | mod, err := NewModule(ctx, string(ptx)) 19 | if err != nil { 20 | t.Error(err) 21 | return 22 | } 23 | 24 | doubleBuf, err := AllocBuffer(a, 8*1550) 25 | if err != nil { 26 | t.Error(err) 27 | return 28 | } 29 | floatBuf, err := AllocBuffer(a, 4*1550) 30 | if err != nil { 31 | t.Error(err) 32 | return 33 | } 34 | 35 | floatBuf.WithPtr(func(ptr unsafe.Pointer) { 36 | err = mod.Launch("my_fancy_kernel", 13, 1, 1, 128, 1, 1, 0, stream, int(1550), 37 | float32(3.7), float64(2.5), int(-3), uint(5), doubleBuf, ptr) 38 | }) 39 | 40 | if err != nil { 41 | t.Error(err) 42 | return 43 | } 44 | 45 | res32 := make([]float32, 1550) 46 | res64 := make([]float64, 1550) 47 | 48 | if err 
:= ReadBuffer(res32, floatBuf); err != nil { 49 | t.Error(err) 50 | return 51 | } 52 | if err := ReadBuffer(res64, doubleBuf); err != nil { 53 | t.Error(err) 54 | return 55 | } 56 | 57 | expFloat := float32(-3 + 5 - 3.7) 58 | for i, a := range res32 { 59 | if math.Abs(float64(a-expFloat)) > 1e-4 { 60 | t.Errorf("entry %d: expected %v but got %v", i, expFloat, a) 61 | break 62 | } 63 | } 64 | 65 | for i, a := range res64 { 66 | x := float64(i) + 3.7 + 2.5 - 3 67 | if math.Abs(x-a) > 1e-5 { 68 | t.Errorf("entry %d: expected %v but got %v", i, x, a) 69 | break 70 | } 71 | } 72 | return 73 | } 74 | 75 | t.Run("NoStream", func(t *testing.T) { 76 | <-ctx.Run(func() error { 77 | runTest(t, nil) 78 | return nil 79 | }) 80 | }) 81 | 82 | t.Run("Stream", func(t *testing.T) { 83 | <-ctx.Run(func() error { 84 | stream, err := NewStream(false) 85 | if err != nil { 86 | t.Error(err) 87 | return nil 88 | } 89 | defer stream.Close() 90 | runTest(t, stream) 91 | return nil 92 | }) 93 | }) 94 | } 95 | -------------------------------------------------------------------------------- /cublas/helpers_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "errors" 5 | "math" 6 | "testing" 7 | 8 | "github.com/unixpickle/cuda" 9 | ) 10 | 11 | var testContext *cuda.Context 12 | var testAllocator cuda.Allocator 13 | var testHandle *Handle 14 | 15 | func setupTest(t *testing.T, inBuffers ...interface{}) (*cuda.Context, *Handle, []cuda.Buffer) { 16 | if testContext == nil { 17 | devices, err := cuda.AllDevices() 18 | if err != nil { 19 | t.Fatal(err) 20 | } 21 | if len(devices) == 0 { 22 | t.Fatal("no CUDA devices") 23 | } 24 | testContext, err = cuda.NewContext(devices[0], -1) 25 | if err != nil { 26 | t.Fatal(err) 27 | } 28 | testAllocator = cuda.GCAllocator(cuda.NativeAllocator(testContext), 0) 29 | } 30 | if testHandle == nil { 31 | err := <-testContext.Run(func() (err error) { 32 | testHandle, err = NewHandle(testContext) 33 | return 34 | }) 35 | if err != nil { 36 | t.Fatal(err) 37 | } 38 | } 39 | 40 | outBufs := make([]cuda.Buffer, len(inBuffers)) 41 | for i, x := range inBuffers { 42 | err := <-testContext.Run(func() (err error) { 43 | switch x := x.(type) { 44 | case []float32: 45 | outBufs[i], err = cuda.AllocBuffer(testAllocator, uintptr(len(x)*4)) 46 | case []float64: 47 | outBufs[i], err = cuda.AllocBuffer(testAllocator, uintptr(len(x)*8)) 48 | case []int32: 49 | outBufs[i], err = cuda.AllocBuffer(testAllocator, uintptr(len(x)*4)) 50 | default: 51 | err = errors.New("unknown buffer type") 52 | } 53 | if err == nil { 54 | err = cuda.WriteBuffer(outBufs[i], x) 55 | } 56 | return 57 | }) 58 | if err != nil { 59 | t.Fatalf("buffer %d: %s", i, err) 60 | } 61 | } 62 | 63 | return testContext, testHandle, outBufs 64 | } 65 | 66 | func maxDelta32(v1, v2 []float32) float32 { 67 | var delta float32 68 | for i, x := range v1 { 69 | y := v2[i] 70 | diff := float32(math.Abs(float64(x - y))) 71 | if diff > delta { 72 | delta = diff 73 | } 74 | } 75 | return delta 76 | } 77 | 78 | func maxDelta64(v1, v2 []float64) float64 { 79 | var delta float64 80 | for i, x := range v1 { 81 | y := v2[i] 82 | diff := math.Abs(x - y) 83 | if diff > delta { 84 | delta = diff 85 | } 86 | } 87 | return delta 88 | } 89 | -------------------------------------------------------------------------------- /curand/curand_test.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | import ( 4 | "math/rand" 5 | 
"testing" 6 | 7 | "github.com/unixpickle/approb" 8 | "github.com/unixpickle/cuda" 9 | ) 10 | 11 | func TestGeneratorPseudo(t *testing.T) { 12 | devices, err := cuda.AllDevices() 13 | if err != nil { 14 | t.Fatal(err) 15 | } else if len(devices) == 0 { 16 | t.Fatal("no CUDA devices") 17 | } 18 | ctx, err := cuda.NewContext(devices[0], -1) 19 | if err != nil { 20 | t.Fatal(ctx) 21 | } 22 | allocator := cuda.GCAllocator(cuda.NativeAllocator(ctx), 0) 23 | err = <-ctx.Run(func() (resErr error) { 24 | defer func() { 25 | if err := recover(); err != nil { 26 | resErr = err.(error) 27 | } 28 | }() 29 | gen, err := NewGenerator(ctx, PseudoDefault) 30 | if err != nil { 31 | t.Error(err) 32 | return nil 33 | } 34 | samplers := testingSampleFuncs(allocator, gen) 35 | groundTruth := []func() float64{rand.NormFloat64, rand.NormFloat64, 36 | rand.Float64, rand.Float64} 37 | for i, sampler := range samplers { 38 | realSampler := groundTruth[i] 39 | corr := approb.Correlation(10000, 0.1, sampler, realSampler) 40 | if corr < 0.99 { 41 | t.Errorf("distribution %d was wrong", i) 42 | } 43 | } 44 | return nil 45 | }) 46 | if err != nil { 47 | t.Error(err) 48 | } 49 | } 50 | 51 | func testingSampleFuncs(allocator cuda.Allocator, g *Generator) []func() float64 { 52 | buf, err := cuda.AllocBuffer(allocator, 16) 53 | if err != nil { 54 | panic(err) 55 | } 56 | getValue32 := func() float32 { 57 | res := make([]float32, 1) 58 | if err := cuda.ReadBuffer(res, buf); err != nil { 59 | panic(err) 60 | } 61 | return res[0] 62 | } 63 | getValue64 := func() float64 { 64 | res := make([]float64, 1) 65 | if err := cuda.ReadBuffer(res, buf); err != nil { 66 | panic(err) 67 | } 68 | return res[0] 69 | } 70 | return []func() float64{ 71 | func() float64 { 72 | if err := g.Normal(buf, 0, 1); err != nil { 73 | panic(err) 74 | } 75 | return float64(getValue32()) 76 | }, 77 | func() float64 { 78 | if err := g.NormalDouble(buf, 0, 1); err != nil { 79 | panic(err) 80 | } 81 | return getValue64() 82 | }, 83 | func() float64 { 84 | if err := g.Uniform(buf); err != nil { 85 | panic(err) 86 | } 87 | return float64(getValue32()) 88 | }, 89 | func() float64 { 90 | if err := g.UniformDouble(buf); err != nil { 91 | panic(err) 92 | } 93 | return getValue64() 94 | }, 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /cublas/extensions_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | func TestSdgmm(t *testing.T) { 10 | ctx, handle, buffers := setupTest(t, 11 | []float32{1, 2, 3, 7, 4, 5, 6, 0}, 12 | []float32{0.5, 5, -2, 5, 3}, 13 | []float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}) 14 | <-ctx.Run(func() error { 15 | err := handle.Sdgmm(Left, 3, 2, buffers[0], 4, buffers[1], 2, 16 | buffers[2], 5) 17 | if err != nil { 18 | t.Error(err) 19 | return nil 20 | } 21 | 22 | actual := make([]float32, 10) 23 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 24 | t.Error(err) 25 | return nil 26 | } 27 | expected := []float32{0.5, -4, 9, 0, 0, 2, -10, 18, 0, 0} 28 | 29 | if maxDelta32(actual, expected) > 1e-4 { 30 | t.Errorf("expected %v but got %v", expected, actual) 31 | } 32 | 33 | err = handle.Sdgmm(Right, 3, 2, buffers[0], 3, buffers[1], 3, 34 | buffers[2], 3) 35 | if err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | 40 | actual = make([]float32, 10) 41 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 42 | t.Error(err) 43 | return nil 44 
44 | 		}
45 | 		expected = []float32{0.5, 1, 1.5, 35, 20, 25, -10, 18, 0, 0}
46 | 
47 | 		if maxDelta32(actual, expected) > 1e-4 {
48 | 			t.Errorf("expected %v but got %v", expected, actual)
49 | 		}
50 | 		return nil
51 | 	})
52 | }
53 | 
54 | func TestDdgmm(t *testing.T) {
55 | 	ctx, handle, buffers := setupTest(t,
56 | 		[]float64{1, 2, 3, 7, 4, 5, 6, 0},
57 | 		[]float64{0.5, 5, -2, 5, 3},
58 | 		[]float64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0})
59 | 	<-ctx.Run(func() error {
60 | 		err := handle.Ddgmm(Left, 3, 2, buffers[0], 4, buffers[1], 2,
61 | 			buffers[2], 5)
62 | 		if err != nil {
63 | 			t.Error(err)
64 | 			return nil
65 | 		}
66 | 
67 | 		actual := make([]float64, 10)
68 | 		if err := cuda.ReadBuffer(actual, buffers[2]); err != nil {
69 | 			t.Error(err)
70 | 			return nil
71 | 		}
72 | 		expected := []float64{0.5, -4, 9, 0, 0, 2, -10, 18, 0, 0}
73 | 
74 | 		if maxDelta64(actual, expected) > 1e-4 {
75 | 			t.Errorf("expected %v but got %v", expected, actual)
76 | 		}
77 | 
78 | 		err = handle.Ddgmm(Right, 3, 2, buffers[0], 3, buffers[1], 3,
79 | 			buffers[2], 3)
80 | 		if err != nil {
81 | 			t.Error(err)
82 | 			return nil
83 | 		}
84 | 
85 | 		actual = make([]float64, 10)
86 | 		if err := cuda.ReadBuffer(actual, buffers[2]); err != nil {
87 | 			t.Error(err)
88 | 			return nil
89 | 		}
90 | 		expected = []float64{0.5, 1, 1.5, 35, 20, 25, -10, 18, 0, 0}
91 | 
92 | 		if maxDelta64(actual, expected) > 1e-4 {
93 | 			t.Errorf("expected %v but got %v", expected, actual)
94 | 		}
95 | 		return nil
96 | 	})
97 | }
98 | 
--------------------------------------------------------------------------------
/cublas/extensions.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | 
6 | const cublasSideMode_t goCublasLeft = CUBLAS_SIDE_LEFT;
7 | const cublasSideMode_t goCublasRight = CUBLAS_SIDE_RIGHT;
8 | */
9 | import "C"
10 | 
11 | import (
12 | 	"unsafe"
13 | 
14 | 	"github.com/unixpickle/cuda"
15 | )
16 | 
17 | // A SideMode specifies the side on which a matrix should
18 | // be applied to another matrix.
19 | type SideMode int
20 | 
21 | const (
22 | 	Left SideMode = iota
23 | 	Right
24 | )
25 | 
26 | func (s SideMode) cValue() C.cublasSideMode_t {
27 | 	switch s {
28 | 	case Left:
29 | 		return C.goCublasLeft
30 | 	case Right:
31 | 		return C.goCublasRight
32 | 	default:
33 | 		panic("invalid SideMode")
34 | 	}
35 | }
36 | 
37 | // Sdgmm multiplies a dense matrix by a diagonal matrix.
38 | //
39 | // The mode argument indicates on which side the diagonal
40 | // matrix should be placed.
41 | //
42 | // This must be called inside the cuda.Context.
43 | func (h *Handle) Sdgmm(mode SideMode, m, n int, matA cuda.Buffer, lda int,
44 | 	x cuda.Buffer, incx int, matC cuda.Buffer, ldc int) error {
45 | 	checkDgmm(mode, m, n, matA.Size()/4, lda, x.Size()/4, incx, matC.Size()/4, ldc)
46 | 	var res C.cublasStatus_t
47 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
48 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
49 | 			matC.WithPtr(func(cPtr unsafe.Pointer) {
50 | 				res = C.cublasSdgmm(h.handle, mode.cValue(),
51 | 					safeIntToC(m), safeIntToC(n),
52 | 					(*C.float)(aPtr), safeIntToC(lda),
53 | 					(*C.float)(xPtr), safeIntToC(incx),
54 | 					(*C.float)(cPtr), safeIntToC(ldc))
55 | 			})
56 | 		})
57 | 	})
58 | 	return newError("cublasSdgmm", res)
59 | }
60 | 
61 | // Ddgmm is like Sdgmm, but for double-precision.
62 | //
63 | // The mode argument indicates on which side the diagonal
64 | // matrix should be placed.
65 | //
66 | // This must be called inside the cuda.Context.
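// With mode == Left the result is C = diag(x) * A; with mode ==
// Right it is C = A * diag(x). A call shape taken from TestDdgmm
// above (3x2 matrix A with leading dimension 4, stride 2 through
// x; matA, x, and matC are assumed cuda.Buffers):
//
//	err := handle.Ddgmm(Left, 3, 2, matA, 4, x, 2, matC, 5)
//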
67 | func (h *Handle) Ddgmm(mode SideMode, m, n int, matA cuda.Buffer, lda int,
68 | 	x cuda.Buffer, incx int, matC cuda.Buffer, ldc int) error {
69 | 	checkDgmm(mode, m, n, matA.Size()/8, lda, x.Size()/8, incx, matC.Size()/8, ldc)
70 | 	var res C.cublasStatus_t
71 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
72 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
73 | 			matC.WithPtr(func(cPtr unsafe.Pointer) {
74 | 				res = C.cublasDdgmm(h.handle, mode.cValue(),
75 | 					safeIntToC(m), safeIntToC(n),
76 | 					(*C.double)(aPtr), safeIntToC(lda),
77 | 					(*C.double)(xPtr), safeIntToC(incx),
78 | 					(*C.double)(cPtr), safeIntToC(ldc))
79 | 			})
80 | 		})
81 | 	})
82 | 	return newError("cublasDdgmm", res)
83 | }
84 | 
85 | func checkDgmm(mode SideMode, m, n int, matA uintptr, lda int, x uintptr, incx int,
86 | 	matC uintptr, ldc int) {
87 | 	checkMatrix(NoTrans, lda, m, n, matA)
88 | 	checkMatrix(NoTrans, ldc, m, n, matC)
89 | 
90 | 	neededX := uintptr(m)
91 | 	if mode == Right {
92 | 		neededX = uintptr(n)
93 | 	}
94 | 	if stridedSize(x, incx) < neededX {
95 | 		panic("index out of bounds")
96 | 	}
97 | }
98 | 
--------------------------------------------------------------------------------
/context.go:
--------------------------------------------------------------------------------
1 | package cuda
2 | 
3 | /*
4 | #include <cuda.h>
5 | */
6 | import "C"
7 | import (
8 | 	"os"
9 | 	"runtime"
10 | 	"strconv"
11 | )
12 | 
13 | const defaultContextBuffer = 20
14 | 
15 | func init() {
16 | 	if err := newErrorDriver("cuInit", C.cuInit(0)); err != nil {
17 | 		panic(err)
18 | 	}
19 | }
20 | 
21 | // A Context maintains a CUDA-dedicated thread.
22 | // All CUDA code should be run by a Context.
23 | type Context struct {
24 | 	msgs chan<- *contextMsg
25 | 	ctx  C.CUcontext
26 | }
27 | 
28 | // NewContext creates a new Context on the Device.
29 | //
30 | // The bufferSize is the maximum number of asynchronous
31 | // calls that can be queued up at once.
32 | // A larger buffer size means that Run() is less likely
33 | // to block, all else equal.
34 | //
35 | // If bufferSize is -1, then the CUDA_CTX_BUFFER
36 | // environment variable is used.
37 | // If bufferSize is -1 and CUDA_CTX_BUFFER is not set, a
38 | // reasonable default is used.
39 | func NewContext(d *Device, bufferSize int) (*Context, error) {
40 | 	if bufferSize < -1 {
41 | 		panic("buffer size out of range")
42 | 	} else if bufferSize == -1 {
43 | 		bufferSize = defaultContextBuffer
44 | 		if bs := os.Getenv("CUDA_CTX_BUFFER"); bs != "" {
45 | 			parsed, err := strconv.Atoi(bs)
46 | 			if err == nil && parsed >= 0 {
47 | 				bufferSize = parsed
48 | 			}
49 | 		}
50 | 	}
51 | 	msgs := make(chan *contextMsg, bufferSize)
52 | 	go contextLoop(msgs)
53 | 	res := &Context{msgs: msgs}
54 | 	err := <-res.Run(func() error {
55 | 		return newErrorDriver("cuCtxCreate", C.cuCtxCreate(&res.ctx, 0, d.id))
56 | 	})
57 | 	if err != nil {
58 | 		close(msgs)
59 | 		return nil, err
60 | 	}
61 | 	runtime.SetFinalizer(res, func(obj *Context) {
62 | 		obj.Run(func() error {
63 | 			C.cuCtxDestroy(obj.ctx)
64 | 			return nil
65 | 		})
66 | 		close(obj.msgs)
67 | 	})
68 | 	return res, nil
69 | }
70 | 
71 | // Run runs f in the Context and returns a channel that
72 | // will be sent the result of f when f completes.
73 | //
74 | // This may block until some queued up functions have
75 | // finished running on the Context.
76 | //
77 | // If you are not interested in the result of f, you can
78 | // simply ignore the returned channel.
79 | //
80 | // While f is running, no other function can run on the
81 | // Context.
82 | // This means that, to avoid deadlock, f should not use 83 | // the Context. 84 | func (c *Context) Run(f func() error) <-chan error { 85 | ch := make(chan error, 1) 86 | msg := &contextMsg{ 87 | f: f, 88 | doneChan: ch, 89 | } 90 | c.msgs <- msg 91 | runtime.KeepAlive(c) 92 | return ch 93 | } 94 | 95 | type contextMsg struct { 96 | f func() error 97 | doneChan chan<- error 98 | } 99 | 100 | func contextLoop(msgs <-chan *contextMsg) { 101 | runtime.LockOSThread() 102 | for msg := range msgs { 103 | msg.doneChan <- msg.f() 104 | close(msg.doneChan) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /buffer_test.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestBufferIO(t *testing.T) { 9 | ctx, a := setupTest(t) 10 | <-ctx.Run(func() error { 11 | const floatSize = 4 12 | buf1, err := AllocBuffer(a, floatSize*10) 13 | if err != nil { 14 | t.Error(err) 15 | return nil 16 | } 17 | buf2, err := AllocBuffer(a, floatSize*15) 18 | if err != nil { 19 | t.Error(err) 20 | return nil 21 | } 22 | if err := ClearBuffer(buf1); err != nil { 23 | t.Error(err) 24 | return nil 25 | } 26 | if err := ClearBuffer(buf2); err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | if err := WriteBuffer(buf1, []float32{1, 2, 3, 0, 0, 4, 5}); err != nil { 31 | t.Error(err) 32 | return nil 33 | } 34 | actual := make([]float32, 8) 35 | if err := ReadBuffer(actual, buf1); err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | expected := []float32{1, 2, 3, 0, 0, 4, 5, 0} 40 | if !reflect.DeepEqual(actual, expected) { 41 | t.Errorf("expected %v but got %v", expected, actual) 42 | } 43 | 44 | if err := CopyBuffer(buf2, buf1); err != nil { 45 | t.Error(err) 46 | return nil 47 | } 48 | 49 | actual = make([]float32, 15) 50 | if err := ReadBuffer(actual, buf2); err != nil { 51 | t.Error(err) 52 | return nil 53 | } 54 | expected = []float32{1, 2, 3, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0} 55 | if !reflect.DeepEqual(actual, expected) { 56 | t.Errorf("expected %v but got %v", expected, actual) 57 | } 58 | 59 | return nil 60 | }) 61 | } 62 | 63 | func TestSlice(t *testing.T) { 64 | ctx, a := setupTest(t) 65 | <-ctx.Run(func() error { 66 | buf1, err := AllocBuffer(a, 32) 67 | if err != nil { 68 | t.Error(err) 69 | return nil 70 | } 71 | if err := ClearBuffer(buf1); err != nil { 72 | t.Error(err) 73 | return nil 74 | } 75 | if err := WriteBuffer(Slice(buf1, 8, 15), []byte{1, 2, 3, 4}); err != nil { 76 | t.Error(err) 77 | return nil 78 | } 79 | actual := make([]byte, 12) 80 | if err := ReadBuffer(actual, Slice(buf1, 4, 12)); err != nil { 81 | t.Error(err) 82 | return nil 83 | } 84 | expected := []byte{0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0} 85 | if !reflect.DeepEqual(actual, expected) { 86 | t.Errorf("expected %v but got %v", expected, actual) 87 | } 88 | 89 | if err := CopyBuffer(Slice(buf1, 0, 4), Slice(buf1, 8, 16)); err != nil { 90 | t.Error(err) 91 | return nil 92 | } 93 | 94 | actual = make([]byte, 14) 95 | if err := ReadBuffer(actual, buf1); err != nil { 96 | t.Error(err) 97 | return nil 98 | } 99 | expected = []byte{1, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0} 100 | if !reflect.DeepEqual(actual, expected) { 101 | t.Errorf("expected %v but got %v", expected, actual) 102 | } 103 | 104 | if !Overlap(Slice(buf1, 0, 5), Slice(buf1, 3, 5)) { 105 | t.Error("should overlap") 106 | } 107 | if Overlap(Slice(buf1, 0, 5), Slice(buf1, 5, 10)) { 108 | t.Error("should not 
overlap") 109 | } 110 | 111 | return nil 112 | }) 113 | } 114 | -------------------------------------------------------------------------------- /stream.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | 6 | const unsigned int streamNonBlockingFlag = CU_STREAM_NON_BLOCKING; 7 | const CUstream nullStream = NULL; 8 | */ 9 | import "C" 10 | import "unsafe" 11 | 12 | // Synchronize waits for asynchronous operations to 13 | // complete. 14 | // 15 | // This should be called in a Context. 16 | func Synchronize() error { 17 | return newErrorDriver("cuCtxSynchronize", C.cuCtxSynchronize()) 18 | } 19 | 20 | // A Stream manages a pipeline of CUDA operations. 21 | // Streams can be employed to achieve parallelism. 22 | type Stream struct { 23 | stream C.CUstream 24 | closed bool 25 | } 26 | 27 | // NewStream creates a new Stream. 28 | // 29 | // If nonBlocking is true, then this stream will be able 30 | // to run concurrently with the default stream. 31 | // 32 | // This should be called in a Context. 33 | func NewStream(nonBlocking bool) (*Stream, error) { 34 | res := &Stream{} 35 | status := C.cuStreamCreate(&res.stream, streamCreationFlags(nonBlocking)) 36 | if err := newErrorDriver("cuStreamCreate", status); err != nil { 37 | return nil, err 38 | } 39 | return res, nil 40 | } 41 | 42 | // NewStreamPriority is like NewStream, but the resulting 43 | // stream is assigned a certain priority. 44 | // 45 | // This should be called in a Context. 46 | func NewStreamPriority(nonBlocking bool, priority int) (*Stream, error) { 47 | res := &Stream{} 48 | status := C.cuStreamCreateWithPriority(&res.stream, streamCreationFlags(nonBlocking), 49 | safeIntToC(priority)) 50 | if err := newErrorDriver("cuStreamCreate", status); err != nil { 51 | return nil, err 52 | } 53 | return res, nil 54 | } 55 | 56 | // Synchronize waits for the stream's tasks to complete. 57 | func (s *Stream) Synchronize() error { 58 | s.assertOpen() 59 | return newErrorDriver("cuStreamSynchronize", C.cuStreamSynchronize(s.stream)) 60 | } 61 | 62 | // Close destroys the stream. 63 | // 64 | // This will return immediately, even if the stream is 65 | // still doing work. 66 | // 67 | // A stream should not be used after it is closed. 68 | // 69 | // This should be called in a Context. 70 | func (s *Stream) Close() error { 71 | if s.closed { 72 | return nil 73 | } 74 | s.closed = true 75 | return newErrorDriver("cuStreamDestroy", C.cuStreamDestroy(s.stream)) 76 | } 77 | 78 | // Pointer returns the raw pointer value of the underlying 79 | // stream object. 80 | // 81 | // If s is nil, then a NULL pointer is returned. 82 | // 83 | // This should be called in a Context. 
84 | func (s *Stream) Pointer() unsafe.Pointer { 85 | if s == nil { 86 | return unsafe.Pointer(C.nullStream) 87 | } 88 | s.assertOpen() 89 | return unsafe.Pointer(s.stream) 90 | } 91 | 92 | func (s *Stream) cuStream() C.CUstream { 93 | if s == nil { 94 | return C.nullStream 95 | } 96 | s.assertOpen() 97 | return s.stream 98 | } 99 | 100 | func (s *Stream) assertOpen() { 101 | if s != nil && s.closed { 102 | panic("stream closed") 103 | } 104 | } 105 | 106 | func streamCreationFlags(nonBlocking bool) C.uint { 107 | if nonBlocking { 108 | return C.streamNonBlockingFlag 109 | } else { 110 | return 0 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /cublas/level2_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | func TestSgemv(t *testing.T) { 10 | ctx, handle, buffers := setupTest(t, 11 | []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 12 | []float32{3, 2, 1}, 13 | []float32{0, 0, 7, 6, 0, 0, 0, 0, 0, 0}, 14 | []float32{2.5}, 15 | []float32{3.1}) 16 | <-ctx.Run(func() error { 17 | alpha := float32(2.5) 18 | err := handle.Sgemv(NoTrans, 3, 2, &alpha, buffers[0], 4, buffers[1], -2, 19 | float32(1), buffers[2], 3) 20 | if err != nil { 21 | t.Error(err) 22 | return nil 23 | } 24 | 25 | actual := make([]float32, 10) 26 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | expected := []float32{40, 0, 7, 56, 0, 0, 60, 0, 0, 0} 31 | if maxDelta32(actual, expected) > 1e-4 { 32 | t.Errorf("expected %v but got %v", expected, actual) 33 | } 34 | 35 | if err := handle.SetPointerMode(Device); err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | defer handle.SetPointerMode(Host) 40 | 41 | err = handle.Sgemv(Trans, 3, 2, buffers[3], buffers[0], 5, 42 | buffers[1], -1, buffers[4], buffers[2], 5) 43 | if err != nil { 44 | t.Error(err) 45 | return nil 46 | } 47 | 48 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 49 | t.Error(err) 50 | return nil 51 | } 52 | expected = []float32{159, 0, 7, 56, 0, 110, 60, 0, 0, 0} 53 | if maxDelta32(actual, expected) > 1e-4 { 54 | t.Errorf("expected %v but got %v", expected, actual) 55 | } 56 | 57 | return nil 58 | }) 59 | } 60 | 61 | func TestDgemv(t *testing.T) { 62 | ctx, handle, buffers := setupTest(t, 63 | []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 64 | []float64{3, 2, 1}, 65 | []float64{0, 0, 7, 6, 0, 0, 0, 0, 0, 0}, 66 | []float64{2.5}, 67 | []float64{3.1}) 68 | <-ctx.Run(func() error { 69 | alpha := float64(2.5) 70 | err := handle.Dgemv(NoTrans, 3, 2, &alpha, buffers[0], 4, buffers[1], -2, 71 | float64(1), buffers[2], 3) 72 | if err != nil { 73 | t.Error(err) 74 | return nil 75 | } 76 | 77 | actual := make([]float64, 10) 78 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 79 | t.Error(err) 80 | return nil 81 | } 82 | expected := []float64{40, 0, 7, 56, 0, 0, 60, 0, 0, 0} 83 | if maxDelta64(actual, expected) > 1e-4 { 84 | t.Errorf("expected %v but got %v", expected, actual) 85 | } 86 | 87 | if err := handle.SetPointerMode(Device); err != nil { 88 | t.Error(err) 89 | return nil 90 | } 91 | defer handle.SetPointerMode(Host) 92 | 93 | err = handle.Dgemv(Trans, 3, 2, buffers[3], buffers[0], 5, 94 | buffers[1], -1, buffers[4], buffers[2], 5) 95 | if err != nil { 96 | t.Error(err) 97 | return nil 98 | } 99 | 100 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 101 | t.Error(err) 102 | return 
nil 103 | } 104 | expected = []float64{159, 0, 7, 56, 0, 110, 60, 0, 0, 0} 105 | if maxDelta64(actual, expected) > 1e-4 { 106 | t.Errorf("expected %v but got %v", expected, actual) 107 | } 108 | 109 | return nil 110 | }) 111 | } 112 | -------------------------------------------------------------------------------- /cublas/level3_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | func TestSgemm(t *testing.T) { 10 | ctx, handle, buffers := setupTest(t, 11 | []float32{1, 2, 3, 0, 4, 5, 6, 0}, 12 | []float32{-2, 0, 1, 2, -1, -1}, 13 | []float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 14 | []float32{2.5}, 15 | []float32{3.1}) 16 | <-ctx.Run(func() error { 17 | alpha := float32(2.5) 18 | err := handle.Sgemm(NoTrans, Trans, 3, 3, 2, &alpha, buffers[0], 4, buffers[1], 3, 19 | float32(0), buffers[2], 3) 20 | if err != nil { 21 | t.Error(err) 22 | return nil 23 | } 24 | 25 | actual := make([]float32, 10) 26 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | expected := []float32{15, 15, 15, -10, -12.5, -15, -7.5, -7.5, -7.5, 0} 31 | if maxDelta32(actual, expected) > 1e-4 { 32 | t.Errorf("expected %v but got %v", expected, actual) 33 | } 34 | 35 | if err := handle.SetPointerMode(Device); err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | defer handle.SetPointerMode(Host) 40 | 41 | err = handle.Sgemm(Trans, NoTrans, 2, 2, 3, buffers[3], buffers[0], 4, 42 | buffers[1], 3, buffers[4], buffers[2], 5) 43 | if err != nil { 44 | t.Error(err) 45 | return nil 46 | } 47 | 48 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 49 | t.Error(err) 50 | return nil 51 | } 52 | expected = []float32{49, 41.5, 15, -10, -12.5, -54, -30.750, -7.5, -7.5, 0} 53 | if maxDelta32(actual, expected) > 1e-4 { 54 | t.Errorf("expected %v but got %v", expected, actual) 55 | } 56 | 57 | return nil 58 | }) 59 | } 60 | 61 | func TestDgemm(t *testing.T) { 62 | ctx, handle, buffers := setupTest(t, 63 | []float64{1, 2, 3, 0, 4, 5, 6, 0}, 64 | []float64{-2, 0, 1, 2, -1, -1}, 65 | []float64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 66 | []float64{2.5}, 67 | []float64{3.1}) 68 | <-ctx.Run(func() error { 69 | alpha := float64(2.5) 70 | err := handle.Dgemm(NoTrans, Trans, 3, 3, 2, &alpha, buffers[0], 4, buffers[1], 3, 71 | float64(0), buffers[2], 3) 72 | if err != nil { 73 | t.Error(err) 74 | return nil 75 | } 76 | 77 | actual := make([]float64, 10) 78 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 79 | t.Error(err) 80 | return nil 81 | } 82 | expected := []float64{15, 15, 15, -10, -12.5, -15, -7.5, -7.5, -7.5, 0} 83 | if maxDelta64(actual, expected) > 1e-4 { 84 | t.Errorf("expected %v but got %v", expected, actual) 85 | } 86 | 87 | if err := handle.SetPointerMode(Device); err != nil { 88 | t.Error(err) 89 | return nil 90 | } 91 | defer handle.SetPointerMode(Host) 92 | 93 | err = handle.Dgemm(Trans, NoTrans, 2, 2, 3, buffers[3], buffers[0], 4, 94 | buffers[1], 3, buffers[4], buffers[2], 5) 95 | if err != nil { 96 | t.Error(err) 97 | return nil 98 | } 99 | 100 | if err := cuda.ReadBuffer(actual, buffers[2]); err != nil { 101 | t.Error(err) 102 | return nil 103 | } 104 | expected = []float64{49, 41.5, 15, -10, -12.5, -54, -30.750, -7.5, -7.5, 0} 105 | if maxDelta64(actual, expected) > 1e-4 { 106 | t.Errorf("expected %v but got %v", expected, actual) 107 | } 108 | 109 | return nil 110 | }) 111 | } 112 | 
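The gemm tests above pin down the calling convention for the level-3 routines: matrices are column-major, lda/ldb/ldc are leading dimensions, and alpha/beta follow the handle's pointer mode (Go scalars or pointers in Host mode, cuda.Buffers in Device mode). Below is a condensed host-mode sketch of C = alpha*A*B + beta*C; the names ctx, handle, bufA, bufB, bufC, and the dimensions m, n, k are assumptions, set up as in the test helpers above, not part of the package itself:

```go
alpha, beta := float32(2.5), float32(0)
err := <-ctx.Run(func() error {
	// C (m×n) = alpha * A (m×k) * B (k×n) + beta * C, all column-major.
	return handle.Sgemm(cublas.NoTrans, cublas.NoTrans, m, n, k,
		&alpha, bufA, m, bufB, k, &beta, bufC, m)
})
if err != nil {
	// Handle the cuBLAS error.
}
```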
-------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Package cuda provides bindings to the CUDA library. 2 | // 3 | // Building 4 | // 5 | // To use this package, you must tell Go how to link with 6 | // CUDA. 7 | // On Mac OS X, this might look like: 8 | // 9 | // export CUDA_PATH="/Developer/NVIDIA/CUDA-8.0" 10 | // export DYLD_LIBRARY_PATH="$CUDA_PATH/lib":$DYLD_LIBRARY_PATH 11 | // export CPATH="$CUDA_PATH/include/" 12 | // export CGO_LDFLAGS="/usr/local/cuda/lib/libcuda.dylib $CUDA_PATH/lib/libcudart.dylib $CUDA_PATH/lib/libcublas.dylib $CUDA_PATH/lib/libcurand.dylib" 13 | // 14 | // On Linux, this might look like: 15 | // 16 | // export CUDA_PATH=/usr/local/cuda 17 | // export CPATH="$CUDA_PATH/include/" 18 | // export CGO_LDFLAGS="$CUDA_PATH/lib64/libcublas.so $CUDA_PATH/lib64/libcudart.so $CUDA_PATH/lib64/stubs/libcuda.so $CUDA_PATH/lib64/libcurand.so" 19 | // export LD_LIBRARY_PATH=$CUDA_PATH/lib64/ 20 | // 21 | // Contexts 22 | // 23 | // Virtually every cuda API must be run from within a 24 | // Context, which can be created like so: 25 | // 26 | // devices, err := cuda.AllDevices() 27 | // if err != nil { 28 | // // Handle error. 29 | // } 30 | // if len(devices) == 0 { 31 | // // No devices found. 32 | // } 33 | // ctx, err := cuda.NewContext(devices[0], 10) 34 | // if err != nil { 35 | // // Handle error. 36 | // } 37 | // 38 | // To run code in a Context asynchronously, you can do the 39 | // following: 40 | // 41 | // ctx.Run(func() error { 42 | // // My code here. 43 | // }) 44 | // 45 | // To run code synchronously, simply read from the 46 | // resulting channel: 47 | // 48 | // <-ctx.Run(func() error { 49 | // // My code here. 50 | // }) 51 | // 52 | // You should never call ctx.Run() inside another call to 53 | // ctx.Run(), for reasons that are documented on the 54 | // Context.Run() method. 55 | // 56 | // Memory Management 57 | // 58 | // There are two ways to deal with memory: using Buffers, 59 | // or using an Allocator directly with unsafe.Pointers. 60 | // The Buffer API provides a high-level buffer interface 61 | // with garbage collection and bounds checking. 62 | // Most APIs use Buffers, including the APIs provided by 63 | // sub-packages. 64 | // 65 | // No matter what, you will need an Allocator if you want 66 | // to allocate memory. 67 | // You can create an Allocator directly on top of CUDA: 68 | // 69 | // allocator := cuda.GCAllocator(cuda.NativeAllocator(ctx), 0) 70 | // 71 | // Once you have an allocator, you can use it to allocate 72 | // Buffer objects like so: 73 | // 74 | // err := <-ctx.Run(func() error { 75 | // // Allocate 16 bytes. 76 | // buffer, err := cuda.AllocBuffer(allocator, 16) 77 | // if err != nil { 78 | // return err 79 | // } 80 | // // Use the buffer here... 81 | // }) 82 | // 83 | // There are various functions to help you deal with 84 | // buffers. 85 | // The WriteBuffer() and ReadBuffer() functions allow you 86 | // to copy Go slices to and from buffers. 87 | // The Slice() function allows you to get a Buffer which 88 | // points to a sub-region of a parent Buffer. 89 | // 90 | // Kernels 91 | // 92 | // To run kernels, you will use a Module. 93 | // You can pass various Go primitives, unsafe.Pointers, 94 | // and Buffers as kernel arguments. 
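//
// For example, a hypothetical launch sketch (ptxSource, the kernel
// name, the grid/block sizes, and the buf argument are all
// placeholders, not part of the package):
//
//     err := <-ctx.Run(func() error {
//         mod, err := cuda.NewModule(ctx, ptxSource)
//         if err != nil {
//             return err
//         }
//         return mod.Launch("my_kernel", gridX, 1, 1, blockX, 1, 1, 0, nil, buf)
//     })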
95 | //
96 | // Sub-packages
97 | //
98 | // The cublas and curand sub-packages provide basic linear
99 | // algebra routines and random number generators,
100 | // respectively.
101 | package cuda
102 | 
--------------------------------------------------------------------------------
/cublas/level2.go:
--------------------------------------------------------------------------------
1 | package cublas
2 | 
3 | /*
4 | #include <cublas_v2.h>
5 | */
6 | import "C"
7 | 
8 | import (
9 | 	"unsafe"
10 | 
11 | 	"github.com/unixpickle/cuda"
12 | )
13 | 
14 | // Sgemv performs single-precision matrix-vector
15 | // multiplication.
16 | //
17 | // Matrices are stored in column-major order.
18 | //
19 | // The leading dimension lda may not be 0.
20 | //
21 | // The type of alpha and beta depends on the pointer mode.
22 | // In Host mode, use float32 or *float32.
23 | // In Device mode, use a cuda.Buffer.
24 | //
25 | // This must be called inside the cuda.Context.
26 | func (h *Handle) Sgemv(trans Operation, m, n int, alpha interface{},
27 | 	matA cuda.Buffer, lda int, x cuda.Buffer, incx int, beta interface{},
28 | 	y cuda.Buffer, incy int) error {
29 | 	checkGemv(trans, m, n, matA.Size()/4, lda, x.Size()/4, incx, y.Size()/4, incy)
30 | 
31 | 	var res C.cublasStatus_t
32 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
33 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
34 | 			y.WithPtr(func(yPtr unsafe.Pointer) {
35 | 				if h.PointerMode() == Host {
36 | 					pointerizeInputs(&alpha, &beta)
37 | 					res = C.cublasSgemv(h.handle,
38 | 						trans.cValue(),
39 | 						safeIntToC(m), safeIntToC(n),
40 | 						(*C.float)(alpha.(*float32)),
41 | 						(*C.float)(aPtr), safeIntToC(lda),
42 | 						(*C.float)(xPtr), safeIntToC(incx),
43 | 						(*C.float)(beta.(*float32)),
44 | 						(*C.float)(yPtr), safeIntToC(incy))
45 | 				} else {
46 | 					alphaBeta32(alpha, beta, func(alpha, beta *C.float) {
47 | 						res = C.cublasSgemv(h.handle,
48 | 							trans.cValue(),
49 | 							safeIntToC(m), safeIntToC(n),
50 | 							alpha,
51 | 							(*C.float)(aPtr), safeIntToC(lda),
52 | 							(*C.float)(xPtr), safeIntToC(incx),
53 | 							beta,
54 | 							(*C.float)(yPtr), safeIntToC(incy))
55 | 					})
56 | 				}
57 | 			})
58 | 		})
59 | 	})
60 | 
61 | 	return newError("cublasSgemv", res)
62 | }
63 | 
64 | // Dgemv performs double-precision matrix-vector
65 | // multiplication.
66 | //
67 | // Matrices are stored in column-major order.
68 | //
69 | // The leading dimension lda may not be 0.
70 | //
71 | // The type of alpha and beta depends on the pointer mode.
72 | // In Host mode, use float64 or *float64.
73 | // In Device mode, use a cuda.Buffer.
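//
// A host-mode call shape taken from TestDgemv in this package
// (y = 2.5*A*x + y, with A 3x2 at leading dimension 4 and x
// traversed with stride -2; matA, x, and y are assumed buffers):
//
//	alpha := float64(2.5)
//	err := h.Dgemv(NoTrans, 3, 2, &alpha, matA, 4, x, -2, float64(1), y, 3)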
74 | //
75 | // This must be called inside the cuda.Context.
76 | func (h *Handle) Dgemv(trans Operation, m, n int, alpha interface{},
77 | 	matA cuda.Buffer, lda int, x cuda.Buffer, incx int, beta interface{},
78 | 	y cuda.Buffer, incy int) error {
79 | 	checkGemv(trans, m, n, matA.Size()/8, lda, x.Size()/8, incx, y.Size()/8, incy)
80 | 
81 | 	var res C.cublasStatus_t
82 | 	matA.WithPtr(func(aPtr unsafe.Pointer) {
83 | 		x.WithPtr(func(xPtr unsafe.Pointer) {
84 | 			y.WithPtr(func(yPtr unsafe.Pointer) {
85 | 				if h.PointerMode() == Host {
86 | 					pointerizeInputs(&alpha, &beta)
87 | 					res = C.cublasDgemv(h.handle,
88 | 						trans.cValue(),
89 | 						safeIntToC(m), safeIntToC(n),
90 | 						(*C.double)(alpha.(*float64)),
91 | 						(*C.double)(aPtr), safeIntToC(lda),
92 | 						(*C.double)(xPtr), safeIntToC(incx),
93 | 						(*C.double)(beta.(*float64)),
94 | 						(*C.double)(yPtr), safeIntToC(incy))
95 | 				} else {
96 | 					alphaBeta64(alpha, beta, func(alpha, beta *C.double) {
97 | 						res = C.cublasDgemv(h.handle,
98 | 							trans.cValue(),
99 | 							safeIntToC(m), safeIntToC(n),
100 | 							alpha,
101 | 							(*C.double)(aPtr), safeIntToC(lda),
102 | 							(*C.double)(xPtr), safeIntToC(incx),
103 | 							beta,
104 | 							(*C.double)(yPtr), safeIntToC(incy))
105 | 					})
106 | 				}
107 | 			})
108 | 		})
109 | 	})
110 | 
111 | 	return newError("cublasDgemv", res)
112 | }
113 | 
114 | func checkGemv(trans Operation, m, n int, matA uintptr, lda int, x uintptr,
115 | 	incx int, y uintptr, incy int) {
116 | 	if trans != NoTrans {
117 | 		m, n = n, m
118 | 	}
119 | 	checkMatrix(trans, lda, m, n, matA)
120 | 	if stridedSize(x, incx) < uintptr(n) {
121 | 		panic("index out of bounds")
122 | 	}
123 | 	if stridedSize(y, incy) < uintptr(m) {
124 | 		panic("index out of bounds")
125 | 	}
126 | }
127 | 
--------------------------------------------------------------------------------
/allocator_bfc.go:
--------------------------------------------------------------------------------
1 | package cuda
2 | 
3 | /*
4 | #include <cuda.h>
5 | #include <cuda_runtime_api.h>
6 | */
7 | import "C"
8 | import (
9 | 	"errors"
10 | 	"os"
11 | 	"runtime"
12 | 	"strconv"
13 | 	"unsafe"
14 | 
15 | 	"github.com/unixpickle/memalloc"
16 | )
17 | 
18 | const (
19 | 	minAllocatorSize = 1 << 20
20 | 	maxAllocators    = 5
21 | 
22 | 	allocAlignment = 32
23 | 	allocHeadroom  = 1 << 22
24 | )
25 | 
26 | type bfcAllocator struct {
27 | 	a   []*memalloc.MemAllocator
28 | 	ctx *Context
29 | }
30 | 
31 | // BFCAllocator creates an Allocator that uses memory
32 | // coalescing and best-fitting to reduce memory
33 | // fragmentation.
34 | //
35 | // You should wrap the returned allocator with GCAllocator
36 | // if you plan to use the Buffer API.
37 | //
38 | // The maxSize argument specifies the maximum amount of
39 | // memory to claim for the allocator.
40 | // If it is 0, the allocator may claim nearly all of the
41 | // available device memory.
42 | //
43 | // If the CUDA_BFC_HEADROOM environment variable is set,
44 | // it is used as the minimum number of bytes to leave
45 | // free.
46 | //
47 | // If the CUDA_BFC_MAX environment variable is set, it is
48 | // used as an upper memory bound (in addition to maxSize).
49 | //
50 | // This should be called from a Context.
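//
// A minimal sketch (a maxSize of 0 lets the allocator claim nearly
// all free device memory; wrapping with GCAllocator enables the
// Buffer API's garbage collection):
//
//	err := <-ctx.Run(func() error {
//		bfc, err := cuda.BFCAllocator(ctx, 0)
//		if err != nil {
//			return err
//		}
//		allocator := cuda.GCAllocator(bfc, 0)
//		buf, err := cuda.AllocBuffer(allocator, 1<<20)
//		_ = buf
//		return err
//	})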
51 | func BFCAllocator(ctx *Context, maxSize uintptr) (Allocator, error) {
52 | 	if maxSizeEnv := os.Getenv("CUDA_BFC_MAX"); maxSizeEnv != "" {
53 | 		size, err := strconv.ParseUint(maxSizeEnv, 10, 64)
54 | 		if err == nil && (maxSize == 0 || uintptr(size) < maxSize) {
55 | 			maxSize = uintptr(size)
56 | 		}
57 | 	}
58 | 
59 | 	if maxSize == 0 {
60 | 		var err error
61 | 		maxSize, err = maxBFCMemory()
62 | 		if err != nil {
63 | 			return nil, err
64 | 		}
65 | 	}
66 | 
67 | 	// The allocator size must fit in an int.
68 | 	for int(maxSize) < 0 || uintptr(int(maxSize)) != maxSize {
69 | 		maxSize >>= 1
70 | 	}
71 | 
72 | 	var allocs []*memalloc.MemAllocator
73 | 	for len(allocs) < maxAllocators && maxSize >= minAllocatorSize {
74 | 		// No reason to reserve a misaligned amount of bytes.
75 | 		// Doing so would probably cause fragmentation, knowing
76 | 		// how bad cudaMalloc() is with fragmentation.
77 | 		maxSize = (maxSize / allocAlignment) * allocAlignment
78 | 
79 | 		var region unsafe.Pointer
80 | 		err := newErrorRuntime("cudaMalloc", C.cudaMalloc(&region, C.size_t(maxSize)))
81 | 		if err != nil {
82 | 			maxSize >>= 1
83 | 			continue
84 | 		}
85 | 		allocs = append(allocs, &memalloc.MemAllocator{
86 | 			Start:     region,
87 | 			Size:      int(maxSize),
88 | 			Allocator: memalloc.NewBFC(int(maxSize), allocAlignment),
89 | 		})
90 | 
91 | 		newMax, err := maxBFCMemory()
92 | 		if err != nil {
93 | 			return nil, err
94 | 		} else if newMax < maxSize {
95 | 			maxSize = newMax
96 | 		}
97 | 	}
98 | 	if len(allocs) == 0 {
99 | 		return nil, errors.New("BFC init: not enough free memory")
100 | 	}
101 | 
102 | 	res := &bfcAllocator{a: allocs, ctx: ctx}
103 | 
104 | 	runtime.SetFinalizer(res, func(b *bfcAllocator) {
105 | 		go ctx.Run(func() error {
106 | 			for _, x := range b.a {
107 | 				C.cudaFree(x.Start)
108 | 			}
109 | 			return nil
110 | 		})
111 | 	})
112 | 
113 | 	return res, nil
114 | }
115 | 
116 | func (b *bfcAllocator) Context() *Context {
117 | 	return b.ctx
118 | }
119 | 
120 | func (b *bfcAllocator) Alloc(size uintptr) (unsafe.Pointer, error) {
121 | 	if int(size) < 0 || uintptr(int(size)) != size {
122 | 		return nil, errors.New("BFC alloc: size must fit in int")
123 | 	}
124 | 	for _, x := range b.a {
125 | 		ptr, err := x.Alloc(int(size))
126 | 		if err == nil {
127 | 			return ptr, nil
128 | 		}
129 | 	}
130 | 	return nil, errors.New("BFC alloc: out of memory")
131 | }
132 | 
133 | func (b *bfcAllocator) Free(ptr unsafe.Pointer, size uintptr) {
134 | 	for _, x := range b.a {
135 | 		if x.Contains(ptr) {
136 | 			x.Free(ptr)
137 | 			return
138 | 		}
139 | 	}
140 | 	panic("invalid pointer was freed")
141 | }
142 | 
143 | func maxBFCMemory() (uintptr, error) {
144 | 	headroom := uintptr(allocHeadroom)
145 | 	if roomStr := os.Getenv("CUDA_BFC_HEADROOM"); roomStr != "" {
146 | 		val, err := strconv.ParseUint(roomStr, 10, 64)
147 | 		if err == nil {
148 | 			headroom = uintptr(val)
149 | 		}
150 | 	}
151 | 
152 | 	free, _, err := MemInfo()
153 | 	if err != nil {
154 | 		return 0, err
155 | 	}
156 | 	res := uintptr(free)
157 | 	if res < headroom {
158 | 		return 0, nil
159 | 	}
160 | 	return res - headroom, nil
161 | }
162 | 
--------------------------------------------------------------------------------
/errors.go:
--------------------------------------------------------------------------------
1 | package cuda
2 | 
3 | /*
4 | #include <cuda.h>
5 | #include <cuda_runtime_api.h>
6 | 
7 | // Needed to check for NULL from Cgo.
8 | const char * nullMessage = NULL; 9 | 10 | const char * go_cuda_cu_err(CUresult res) { 11 | switch (res) { 12 | case CUDA_SUCCESS: 13 | return NULL; 14 | case CUDA_ERROR_INVALID_VALUE: 15 | return "CUDA_ERROR_INVALID_VALUE"; 16 | case CUDA_ERROR_OUT_OF_MEMORY: 17 | return "CUDA_ERROR_OUT_OF_MEMORY"; 18 | case CUDA_ERROR_NOT_INITIALIZED: 19 | return "CUDA_ERROR_NOT_INITIALIZED"; 20 | case CUDA_ERROR_DEINITIALIZED: 21 | return "CUDA_ERROR_DEINITIALIZED"; 22 | case CUDA_ERROR_NO_DEVICE: 23 | return "CUDA_ERROR_NO_DEVICE"; 24 | case CUDA_ERROR_INVALID_DEVICE: 25 | return "CUDA_ERROR_INVALID_DEVICE"; 26 | case CUDA_ERROR_INVALID_IMAGE: 27 | return "CUDA_ERROR_INVALID_IMAGE"; 28 | case CUDA_ERROR_INVALID_CONTEXT: 29 | return "CUDA_ERROR_INVALID_CONTEXT"; 30 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: 31 | return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; 32 | case CUDA_ERROR_MAP_FAILED: 33 | return "CUDA_ERROR_MAP_FAILED"; 34 | case CUDA_ERROR_UNMAP_FAILED: 35 | return "CUDA_ERROR_UNMAP_FAILED"; 36 | case CUDA_ERROR_ARRAY_IS_MAPPED: 37 | return "CUDA_ERROR_ARRAY_IS_MAPPED"; 38 | case CUDA_ERROR_ALREADY_MAPPED: 39 | return "CUDA_ERROR_ALREADY_MAPPED"; 40 | case CUDA_ERROR_NO_BINARY_FOR_GPU: 41 | return "CUDA_ERROR_NO_BINARY_FOR_GPU"; 42 | case CUDA_ERROR_ALREADY_ACQUIRED: 43 | return "CUDA_ERROR_ALREADY_ACQUIRED"; 44 | case CUDA_ERROR_NOT_MAPPED: 45 | return "CUDA_ERROR_NOT_MAPPED"; 46 | case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: 47 | return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; 48 | case CUDA_ERROR_NOT_MAPPED_AS_POINTER: 49 | return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; 50 | case CUDA_ERROR_ECC_UNCORRECTABLE: 51 | return "CUDA_ERROR_ECC_UNCORRECTABLE"; 52 | case CUDA_ERROR_UNSUPPORTED_LIMIT: 53 | return "CUDA_ERROR_UNSUPPORTED_LIMIT"; 54 | case CUDA_ERROR_INVALID_SOURCE: 55 | return "CUDA_ERROR_INVALID_SOURCE"; 56 | case CUDA_ERROR_FILE_NOT_FOUND: 57 | return "CUDA_ERROR_FILE_NOT_FOUND"; 58 | case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: 59 | return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; 60 | case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: 61 | return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; 62 | case CUDA_ERROR_OPERATING_SYSTEM: 63 | return "CUDA_ERROR_OPERATING_SYSTEM"; 64 | case CUDA_ERROR_INVALID_HANDLE: 65 | return "CUDA_ERROR_INVALID_HANDLE"; 66 | case CUDA_ERROR_NOT_FOUND: 67 | return "CUDA_ERROR_NOT_FOUND"; 68 | case CUDA_ERROR_NOT_READY: 69 | return "CUDA_ERROR_NOT_READY"; 70 | case CUDA_ERROR_LAUNCH_FAILED: 71 | return "CUDA_ERROR_LAUNCH_FAILED"; 72 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: 73 | return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; 74 | case CUDA_ERROR_LAUNCH_TIMEOUT: 75 | return "CUDA_ERROR_LAUNCH_TIMEOUT"; 76 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: 77 | return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; 78 | default: 79 | return "CUDA_ERROR_UNKNOWN"; 80 | } 81 | } 82 | */ 83 | import "C" 84 | 85 | // Error is a CUDA-related error. 86 | type Error struct { 87 | // Context is typically a C function name. 88 | Context string 89 | 90 | // Name is the C constant name for the error, 91 | // such as "CURAND_STATUS_INTERNAL_ERROR". 92 | Name string 93 | 94 | // Message is the main error message. 95 | // 96 | // This may be human-readable, although it may often be 97 | // the same as Name. 98 | Message string 99 | } 100 | 101 | // newErrorDriver creates an Error from the result of a 102 | // CUDA driver API call. 103 | // 104 | // If e is CUDA_SUCCESS, nil is returned. 
105 | func newErrorDriver(context string, e C.CUresult) error { 106 | return newErrorCStr(context, C.go_cuda_cu_err(e)) 107 | } 108 | 109 | // newErrorRuntime creates an Error from the result of a 110 | // CUDA runtime API call. 111 | // 112 | // If e is cudaSuccess, nil is returned. 113 | func newErrorRuntime(context string, e C.cudaError_t) error { 114 | if e == C.cudaSuccess { 115 | return nil 116 | } 117 | return newErrorCStr(context, C.cudaGetErrorString(e)) 118 | } 119 | 120 | func newErrorCStr(context string, cstr *C.char) error { 121 | if cstr == C.nullMessage { 122 | return nil 123 | } 124 | name := C.GoString(cstr) 125 | return &Error{ 126 | Context: context, 127 | Name: name, 128 | Message: name, 129 | } 130 | } 131 | 132 | // Error generates a message "context: message". 133 | func (e *Error) Error() string { 134 | return e.Context + ": " + e.Message 135 | } 136 | -------------------------------------------------------------------------------- /curand/curand.go: -------------------------------------------------------------------------------- 1 | // Package curand binds the CUDA cuRAND API to Go. 2 | package curand 3 | 4 | /* 5 | #include <curand.h> 6 | 7 | curandRngType_t go_curand_rng_type(int idx) { 8 | curandRngType_t options[] = { 9 | CURAND_RNG_PSEUDO_DEFAULT, 10 | CURAND_RNG_PSEUDO_XORWOW, 11 | CURAND_RNG_PSEUDO_MRG32K3A, 12 | CURAND_RNG_PSEUDO_MTGP32, 13 | CURAND_RNG_PSEUDO_MT19937, 14 | CURAND_RNG_PSEUDO_PHILOX4_32_10, 15 | CURAND_RNG_QUASI_DEFAULT, 16 | CURAND_RNG_QUASI_SOBOL32, 17 | CURAND_RNG_QUASI_SCRAMBLED_SOBOL32, 18 | CURAND_RNG_QUASI_SOBOL64, 19 | CURAND_RNG_QUASI_SCRAMBLED_SOBOL64, 20 | }; 21 | return options[idx]; 22 | } 23 | */ 24 | import "C" 25 | import ( 26 | "runtime" 27 | "unsafe" 28 | 29 | "github.com/unixpickle/cuda" 30 | ) 31 | 32 | type Type int 33 | 34 | // The available generator types from the cuRAND API. 35 | const ( 36 | PseudoDefault Type = iota 37 | PseudoXORWOW 38 | PseudoMRG32K3A 39 | PseudoMTGP32 40 | PseudoMT19937 41 | PseudoPHILOX43210 42 | QuasiDefault 43 | QuasiSobol32 44 | QuasiScrambledSobol32 45 | QuasiSobol64 46 | QuasiScrambledSobol64 47 | ) 48 | 49 | // A Generator generates random numbers. 50 | type Generator struct { 51 | ctx *cuda.Context 52 | gen C.curandGenerator_t 53 | } 54 | 55 | // NewGenerator creates a Generator for the given type. 56 | // 57 | // This must be called inside the cuda.Context. 58 | func NewGenerator(c *cuda.Context, t Type) (*Generator, error) { 59 | if t > QuasiScrambledSobol64 || t < 0 { 60 | panic("type out of bounds") 61 | } 62 | realType := C.go_curand_rng_type(C.int(t)) 63 | res := &Generator{ctx: c} 64 | code := C.curandCreateGenerator(&res.gen, realType) 65 | if err := newError("curandCreateGenerator", code); err != nil { 66 | return nil, err 67 | } 68 | runtime.SetFinalizer(res, func(g *Generator) { 69 | go g.ctx.Run(func() error { 70 | C.curandDestroyGenerator(g.gen) 71 | return nil 72 | }) 73 | }) 74 | return res, nil 75 | } 76 | 77 | // Seed sets the seed for a pseudo-random generator. 78 | func (g *Generator) Seed(seed int64) error { 79 | status := C.curandSetPseudoRandomGeneratorSeed(g.gen, C.ulonglong(seed)) 80 | return newError("curandSetPseudoRandomGeneratorSeed", status) 81 | } 82 | 83 | // GenerateSeeds initializes the generator. 84 | // 85 | // Generally, you will not need to call GenerateSeeds 86 | // yourself. 87 | // This is because other functions (e.g. Uniform) perform 88 | // the initialization automatically if needed. 89 | // 90 | // This must be called inside a cuda.Context. 
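//
// A minimal end-to-end sketch (ctx and alloc are assumed to be an
// existing *cuda.Context and cuda.Allocator; the sizes are
// illustrative):
//
//     gen, err := curand.NewGenerator(ctx, curand.PseudoDefault)
//     if err != nil {
//         return err
//     }
//     if err := gen.Seed(1337); err != nil {
//         return err
//     }
//     buf, err := cuda.AllocBuffer(alloc, 4*1024) // room for 1024 float32s
//     if err != nil {
//         return err
//     }
//     if err := gen.Uniform(buf); err != nil {
//         return err
//     }
//     out := make([]float32, 1024)
//     return cuda.ReadBuffer(out, buf)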
91 | func (g *Generator) GenerateSeeds() error { 92 | return newError("curandGenerateSeeds", C.curandGenerateSeeds(g.gen)) 93 | } 94 | 95 | // Uniform generates uniformly-distributed 32-bit floats 96 | // and saves them to the buffer. 97 | // 98 | // This must be called inside a cuda.Context. 99 | func (g *Generator) Uniform(buf cuda.Buffer) error { 100 | var res error 101 | buf.WithPtr(func(ptr unsafe.Pointer) { 102 | status := C.curandGenerateUniform(g.gen, (*C.float)(ptr), 103 | C.size_t(buf.Size()/4)) 104 | res = newError("curandGenerateUniform", status) 105 | }) 106 | return res 107 | } 108 | 109 | // UniformDouble is like Uniform, but for 64-bit floats. 110 | // 111 | // This must be called inside a cuda.Context. 112 | func (g *Generator) UniformDouble(buf cuda.Buffer) error { 113 | var res error 114 | buf.WithPtr(func(ptr unsafe.Pointer) { 115 | status := C.curandGenerateUniformDouble(g.gen, (*C.double)(ptr), 116 | C.size_t(buf.Size()/8)) 117 | res = newError("curandGenerateUniformDouble", status) 118 | }) 119 | return res 120 | } 121 | 122 | // Normal generates normally distributed floats. 123 | // 124 | // cuRAND may require that the number of floats is 125 | // divisible by 2. 126 | // 127 | // This must be called inside a cuda.Context. 128 | func (g *Generator) Normal(buf cuda.Buffer, mean, stddev float32) error { 129 | var res error 130 | buf.WithPtr(func(ptr unsafe.Pointer) { 131 | status := C.curandGenerateNormal(g.gen, (*C.float)(ptr), 132 | C.size_t(buf.Size()/4), C.float(mean), C.float(stddev)) 133 | res = newError("curandGenerateNormal", status) 134 | }) 135 | return res 136 | } 137 | 138 | // NormalDouble generates normally distributed doubles. 139 | // 140 | // This must be called inside a cuda.Context. 141 | func (g *Generator) NormalDouble(buf cuda.Buffer, mean, stddev float64) error { 142 | var res error 143 | buf.WithPtr(func(ptr unsafe.Pointer) { 144 | status := C.curandGenerateNormalDouble(g.gen, (*C.double)(ptr), 145 | C.size_t(buf.Size()/8), C.double(mean), C.double(stddev)) 146 | res = newError("curandGenerateNormalDouble", status) 147 | }) 148 | return res 149 | } 150 | -------------------------------------------------------------------------------- /allocator.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | #include 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "os" 11 | "runtime" 12 | "strconv" 13 | "unsafe" 14 | ) 15 | 16 | const minGCThresh = 1 << 15 17 | 18 | // An Allocator allocates and frees CUDA memory. 19 | // 20 | // In general, Allocators are bound to a Context, meaning 21 | // that they should only be used from within that Context. 22 | // 23 | // Usually, you should prefer to use the Buffer type over 24 | // a direct memory allocation, since Buffers take care of 25 | // garbage collection for you. 26 | // 27 | // Allocators are not responsible for zeroing out returned 28 | // memory. 29 | type Allocator interface { 30 | // Get the Context in which all calls to this Allocator 31 | // should be made. 32 | // 33 | // Unlike Alloc and Free, this needn't be called from the 34 | // allocator's Context. 35 | Context() *Context 36 | 37 | // Allocate a chunk of CUDA memory. 38 | // 39 | // This should only be called from the Context. 40 | Alloc(size uintptr) (unsafe.Pointer, error) 41 | 42 | // Free a chunk of CUDA memory. 43 | // 44 | // The size passed to Free must be the same size that was 45 | // passed to Alloc(). 
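// Requiring the size lets implementations avoid tracking
// per-pointer sizes themselves. As a sketch, a hypothetical
// wrapper that counts bytes in use relies on this:
//
//     type countingAllocator struct {
//         Allocator
//         inUse uintptr
//     }
//
//     func (c *countingAllocator) Alloc(size uintptr) (unsafe.Pointer, error) {
//         ptr, err := c.Allocator.Alloc(size)
//         if err == nil {
//             c.inUse += size
//         }
//         return ptr, err
//     }
//
//     func (c *countingAllocator) Free(ptr unsafe.Pointer, size uintptr) {
//         c.Allocator.Free(ptr, size)
//         c.inUse -= size
//     }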
46 | // 47 | // This should only be called from the Context. 48 | Free(ptr unsafe.Pointer, size uintptr) 49 | } 50 | 51 | // MemInfo gets the free and total amount of memory 52 | // available for allocation on the current device. 53 | // 54 | // This must be called in a Context. 55 | func MemInfo() (free, total uint64, err error) { 56 | var cFree, cTotal C.size_t 57 | err = newErrorRuntime("cudaMemGetInfo", C.cudaMemGetInfo(&cFree, &cTotal)) 58 | free, total = uint64(cFree), uint64(cTotal) 59 | return 60 | } 61 | 62 | // A nativeAllocator allocates directly using CUDA. 63 | type nativeAllocator struct { 64 | ctx *Context 65 | } 66 | 67 | // NativeAllocator returns an Allocator that allocates 68 | // directly from the CUDA APIs. 69 | // 70 | // The resulting Allocator should be wrapped with 71 | // GCAllocator if you plan to use it with the Buffer API. 72 | // 73 | // This need not be called in a Context. 74 | func NativeAllocator(ctx *Context) Allocator { 75 | return &nativeAllocator{ctx: ctx} 76 | } 77 | 78 | func (n *nativeAllocator) Context() *Context { 79 | return n.ctx 80 | } 81 | 82 | func (n *nativeAllocator) Alloc(size uintptr) (unsafe.Pointer, error) { 83 | var ptr unsafe.Pointer 84 | return ptr, newErrorRuntime("cudaMalloc", C.cudaMalloc(&ptr, C.size_t(size))) 85 | } 86 | 87 | func (n *nativeAllocator) Free(ptr unsafe.Pointer, size uintptr) { 88 | C.cudaFree(ptr) 89 | } 90 | 91 | type gcAllocator struct { 92 | Allocator 93 | 94 | inUse uintptr 95 | thresh uintptr 96 | ratio float64 97 | } 98 | 99 | // GCAllocator wraps an Allocator in a new Allocator which 100 | // automatically triggers garbage collections. 101 | // 102 | // The frac argument behaves similarly to the GOGC 103 | // environment variable, except that GOGC is a percentage 104 | // whereas frac is a ratio. 105 | // Thus, a frac of 1.0 is equivalent to GOGC=100. 106 | // If frac is 0, the value for GOGC is used. 107 | // 108 | // If you are implementing your own Allocator, you will 109 | // likely want to wrap it with GCAllocator so that it 110 | // works nicely with the Buffer API. 111 | // 112 | // This need not be called in a Context. 113 | func GCAllocator(a Allocator, frac float64) Allocator { 114 | if frac == 0 { 115 | frac = 1 116 | if gogc := os.Getenv("GOGC"); gogc != "" { 117 | val, err := strconv.ParseFloat(gogc, 64) 118 | if err == nil { 119 | frac = val / 100 120 | } 121 | } 122 | } 123 | if frac <= 0 { 124 | panic("invalid frac argument") 125 | } 126 | 127 | return &gcAllocator{ 128 | Allocator: a, 129 | inUse: 0, 130 | thresh: minGCThresh, 131 | ratio: frac + 1, 132 | } 133 | } 134 | 135 | func (g *gcAllocator) Alloc(size uintptr) (unsafe.Pointer, error) { 136 | res, err := g.Allocator.Alloc(size) 137 | if err != nil { 138 | return res, err 139 | } 140 | g.inUse += size 141 | if g.inUse > g.thresh { 142 | g.thresh = g.updatedThresh() 143 | runtime.GC() 144 | } 145 | return res, nil 146 | } 147 | 148 | func (g *gcAllocator) Free(ptr unsafe.Pointer, size uintptr) { 149 | g.Allocator.Free(ptr, size) 150 | if size > g.inUse { 151 | panic("more memory was freed than allocated") 152 | } 153 | g.inUse -= size 154 | t := g.updatedThresh() 155 | if t < g.thresh { 156 | g.thresh = t 157 | } 158 | } 159 | 160 | func (g *gcAllocator) updatedThresh() uintptr { 161 | newVal := float64(g.inUse) * g.ratio 162 | 163 | // Only matters on 32-bit systems. 
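// There, float64(g.inUse) * g.ratio can exceed the largest
// uintptr, so the result is clamped to the maximum value below.
// As a worked example of the threshold math: with frac = 1
// (so ratio = 2) and 48 MiB in use, the next collection
// triggers once usage exceeds 96 MiB.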
164 | if newVal > float64(^uintptr(0)) { 165 | return ^uintptr(0) 166 | } 167 | 168 | res := uintptr(newVal) 169 | if res > minGCThresh { 170 | return res 171 | } else { 172 | return minGCThresh 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /module.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | #include 6 | #include 7 | 8 | const size_t ptrSize = sizeof(void *); 9 | const size_t maxArgSize = 8; 10 | const CUjit_option * nullJitOptions = NULL; 11 | const void ** nullPtrPtr = NULL; 12 | */ 13 | import "C" 14 | import ( 15 | "runtime" 16 | "unsafe" 17 | ) 18 | 19 | // A Module manages a set of compiled kernels. 20 | type Module struct { 21 | module C.CUmodule 22 | cache map[string]C.CUfunction 23 | ctx *Context 24 | } 25 | 26 | // NewModule creates a Module by compiling a chunk of PTX 27 | // code. 28 | // 29 | // This should be called from within the Context. 30 | // 31 | // You can build PTX code using the nvcc compiler like so: 32 | // 33 | // nvcc --gpu-architecture=compute_30 --gpu-code=compute_30 --ptx kernels.cu 34 | // 35 | // In the above example, you build "kernels.cu" to a PTX 36 | // file called "kernels.ptx". 37 | // 38 | // The word size of the PTX should match the word size of 39 | // the Go program. 40 | // Depending on your use case, you may want to compile 41 | // separate PTX files for 32-bit and 64-bit hosts. 42 | func NewModule(ctx *Context, ptx string) (*Module, error) { 43 | cstr := unsafe.Pointer(C.CString(ptx)) 44 | defer C.free(cstr) 45 | 46 | var module C.CUmodule 47 | res := C.cuModuleLoadDataEx(&module, cstr, 0, C.nullJitOptions, C.nullPtrPtr) 48 | if err := newErrorDriver("cuModuleLoadDataEx", res); err != nil { 49 | return nil, err 50 | } 51 | 52 | m := &Module{module: module, cache: map[string]C.CUfunction{}, ctx: ctx} 53 | runtime.SetFinalizer(m, func(obj *Module) { 54 | go obj.ctx.Run(func() error { 55 | C.cuModuleUnload(obj.module) 56 | return nil 57 | }) 58 | }) 59 | 60 | return m, nil 61 | } 62 | 63 | // Launch launches a kernel (which is referenced by name). 64 | // 65 | // This should be called from within the same Context that 66 | // NewModule was called from. 67 | // 68 | // Currently, the following types may be used as kernel 69 | // arguments: 70 | // 71 | // uint 72 | // int 73 | // float32 74 | // float64 75 | // unsafe.Pointer 76 | // Buffer 77 | // 78 | // To wait for the launched kernel to complete, use 79 | // Synchronize() or stream.Synchronize() if you specified 80 | // a non-nil stream. 
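//
// A minimal launch sketch (the kernel name and sizes are
// illustrative; mod is a *Module from NewModule and buf a Buffer):
//
//     n := 4096
//     block := uint(128)
//     grid := (uint(n) + block - 1) / block
//     err := mod.Launch("scale_kernel", grid, 1, 1, block, 1, 1, 0, nil,
//         n, float32(2), buf)
//     if err != nil {
//         return err
//     }
//     return Synchronize()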
81 | func (m *Module) Launch(kernel string, gridX, gridY, gridZ, blockX, blockY, blockZ, 82 | sharedMem uint, stream *Stream, args ...interface{}) error { 83 | res := cleanKernelArguments(args, nil, func(rawArgs []unsafe.Pointer) error { 84 | f, err := m.lookupKernel(kernel) 85 | if err != nil { 86 | return err 87 | } 88 | res := C.cuLaunchKernel(f, safeUintToC(gridX), safeUintToC(gridY), 89 | safeUintToC(gridZ), safeUintToC(blockX), safeUintToC(blockY), 90 | safeUintToC(blockZ), safeUintToC(sharedMem), stream.cuStream(), 91 | &rawArgs[0], C.nullPtrPtr) 92 | return newErrorDriver("cuLaunchKernel", res) 93 | }) 94 | runtime.KeepAlive(m) 95 | return res 96 | } 97 | 98 | func (m *Module) lookupKernel(name string) (C.CUfunction, error) { 99 | if f, ok := m.cache[name]; ok { 100 | return f, nil 101 | } 102 | cName := C.CString(name) 103 | defer C.free(unsafe.Pointer(cName)) 104 | var kernel C.CUfunction 105 | cuRes := C.cuModuleGetFunction(&kernel, m.module, cName) 106 | if err := newErrorDriver("cuModuleGetFunction", cuRes); err != nil { 107 | return kernel, err 108 | } 109 | m.cache[name] = kernel 110 | runtime.KeepAlive(m) 111 | return kernel, nil 112 | } 113 | 114 | func cleanKernelArguments(args []interface{}, newArgs []unsafe.Pointer, 115 | f func(args []unsafe.Pointer) error) error { 116 | if len(args) == 0 { 117 | return f(newArgs) 118 | } 119 | 120 | if buf, ok := args[0].(Buffer); ok { 121 | var res error 122 | buf.WithPtr(func(ptr unsafe.Pointer) { 123 | tempArgs := append([]interface{}{ptr}, args[1:]...) 124 | res = cleanKernelArguments(tempArgs, newArgs, f) 125 | }) 126 | return res 127 | } 128 | 129 | valPtr := unsafe.Pointer(C.malloc(C.maxArgSize)) 130 | defer C.free(valPtr) 131 | 132 | switch x := args[0].(type) { 133 | case uint: 134 | val := safeUintToC(x) 135 | C.memcpy(valPtr, unsafe.Pointer(&val), 4) 136 | case int: 137 | val := safeIntToC(x) 138 | C.memcpy(valPtr, unsafe.Pointer(&val), 4) 139 | case float32: 140 | val := C.float(x) 141 | C.memcpy(valPtr, unsafe.Pointer(&val), 4) 142 | case float64: 143 | val := C.double(x) 144 | C.memcpy(valPtr, unsafe.Pointer(&val), 8) 145 | case unsafe.Pointer: 146 | C.memcpy(valPtr, unsafe.Pointer(&x), C.ptrSize) 147 | } 148 | 149 | return cleanKernelArguments(args[1:], append(newArgs, valPtr), f) 150 | } 151 | 152 | func safeUintToC(x uint) C.uint { 153 | if x > uint(^C.uint(0)) { 154 | panic("uint value out of bounds") 155 | } 156 | return C.uint(x) 157 | } 158 | 159 | func safeIntToC(x int) C.int { 160 | if x > int(C.int(^C.uint(0)/2)) { 161 | panic("int value out of bounds") 162 | } else if x < int((-C.int(^C.uint(0)/2))-1) { 163 | panic("int value out of bounds") 164 | } 165 | return C.int(x) 166 | } 167 | -------------------------------------------------------------------------------- /cublas/level3.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | /* 4 | #include 5 | 6 | const cublasOperation_t goCublasOpN = CUBLAS_OP_N; 7 | const cublasOperation_t goCublasOpT = CUBLAS_OP_T; 8 | const cublasOperation_t goCublasOpC = CUBLAS_OP_C; 9 | */ 10 | import "C" 11 | 12 | import ( 13 | "unsafe" 14 | 15 | "github.com/unixpickle/cuda" 16 | "github.com/unixpickle/essentials" 17 | ) 18 | 19 | // Operation specifies a matrix operation. 
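//
// In BLAS terms, NoTrans uses op(A) = A, Trans uses op(A) = A^T,
// and ConjTrans uses op(A) = A^H (the conjugate transpose).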
20 | type Operation int 21 | 22 | const ( 23 | NoTrans Operation = iota 24 | Trans 25 | ConjTrans 26 | ) 27 | 28 | func (o Operation) cValue() C.cublasOperation_t { 29 | switch o { 30 | case NoTrans: 31 | return C.goCublasOpN 32 | case Trans: 33 | return C.goCublasOpT 34 | case ConjTrans: 35 | return C.goCublasOpC 36 | default: 37 | panic("invalid Operation") 38 | } 39 | } 40 | 41 | // Sgemm performs single-precision matrix multiplication. 42 | // 43 | // Matrices are stored in column-major order. 44 | // 45 | // The leading dimensions lda, ldb, and ldc may not be 0. 46 | // 47 | // The type of alpha and beta depends on the pointer mode. 48 | // In Host mode, use float32 or *float32. 49 | // In Device mode, use cuda.Buffer. 50 | // 51 | // This must be called inside the cuda.Context. 52 | func (h *Handle) Sgemm(transA, transB Operation, m, n, k int, alpha interface{}, 53 | matA cuda.Buffer, lda int, matB cuda.Buffer, ldb int, beta interface{}, 54 | matC cuda.Buffer, ldc int) error { 55 | checkGemm(transA, transB, m, n, k, 56 | matA.Size()/4, lda, 57 | matB.Size()/4, ldb, 58 | matC.Size()/4, ldc) 59 | 60 | var res C.cublasStatus_t 61 | matA.WithPtr(func(aPtr unsafe.Pointer) { 62 | matB.WithPtr(func(bPtr unsafe.Pointer) { 63 | matC.WithPtr(func(cPtr unsafe.Pointer) { 64 | if h.PointerMode() == Host { 65 | pointerizeInputs(&alpha, &beta) 66 | res = C.cublasSgemm(h.handle, 67 | transA.cValue(), transB.cValue(), 68 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 69 | (*C.float)(alpha.(*float32)), 70 | (*C.float)(aPtr), safeIntToC(lda), 71 | (*C.float)(bPtr), safeIntToC(ldb), 72 | (*C.float)(beta.(*float32)), 73 | (*C.float)(cPtr), safeIntToC(ldc)) 74 | } else { 75 | alphaBeta32(alpha, beta, func(alpha, beta *C.float) { 76 | res = C.cublasSgemm(h.handle, 77 | transA.cValue(), transB.cValue(), 78 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 79 | alpha, 80 | (*C.float)(aPtr), safeIntToC(lda), 81 | (*C.float)(bPtr), safeIntToC(ldb), 82 | beta, 83 | (*C.float)(cPtr), safeIntToC(ldc)) 84 | }) 85 | } 86 | }) 87 | }) 88 | }) 89 | 90 | return newError("cublasSgemm", res) 91 | } 92 | 93 | // Dgemm is like Sgemm, but for double-precision. 94 | // 95 | // The type of alpha and beta depends on the pointer mode. 96 | // In Host mode, use float64 or *float64. 97 | // In Device mode, use cuda.Buffer. 
98 | // 99 | // This must be called inside the cuda.Context. 100 | func (h *Handle) Dgemm(transA, transB Operation, m, n, k int, alpha interface{}, 101 | matA cuda.Buffer, lda int, matB cuda.Buffer, ldb int, beta interface{}, 102 | matC cuda.Buffer, ldc int) error { 103 | checkGemm(transA, transB, m, n, k, 104 | matA.Size()/8, lda, 105 | matB.Size()/8, ldb, 106 | matC.Size()/8, ldc) 107 | 108 | var res C.cublasStatus_t 109 | matA.WithPtr(func(aPtr unsafe.Pointer) { 110 | matB.WithPtr(func(bPtr unsafe.Pointer) { 111 | matC.WithPtr(func(cPtr unsafe.Pointer) { 112 | if h.PointerMode() == Host { 113 | pointerizeInputs(&alpha, &beta) 114 | res = C.cublasDgemm(h.handle, 115 | transA.cValue(), transB.cValue(), 116 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 117 | (*C.double)(alpha.(*float64)), 118 | (*C.double)(aPtr), safeIntToC(lda), 119 | (*C.double)(bPtr), safeIntToC(ldb), 120 | (*C.double)(beta.(*float64)), 121 | (*C.double)(cPtr), safeIntToC(ldc)) 122 | } else { 123 | alphaBeta64(alpha, beta, func(alpha, beta *C.double) { 124 | res = C.cublasDgemm(h.handle, 125 | transA.cValue(), transB.cValue(), 126 | safeIntToC(m), safeIntToC(n), safeIntToC(k), 127 | alpha, 128 | (*C.double)(aPtr), safeIntToC(lda), 129 | (*C.double)(bPtr), safeIntToC(ldb), 130 | beta, 131 | (*C.double)(cPtr), safeIntToC(ldc)) 132 | }) 133 | } 134 | }) 135 | }) 136 | }) 137 | 138 | return newError("cublasDgemm", res) 139 | } 140 | 141 | func alphaBeta32(alpha, beta interface{}, f func(alpha, beta *C.float)) { 142 | b1 := alpha.(cuda.Buffer) 143 | b2 := beta.(cuda.Buffer) 144 | if b1.Size() < 4 || b2.Size() < 4 { 145 | panic("buffer underflow") 146 | } 147 | b1.WithPtr(func(ptr1 unsafe.Pointer) { 148 | b2.WithPtr(func(ptr2 unsafe.Pointer) { 149 | f((*C.float)(ptr1), (*C.float)(ptr2)) 150 | }) 151 | }) 152 | } 153 | 154 | func alphaBeta64(alpha, beta interface{}, f func(alpha, beta *C.double)) { 155 | b1 := alpha.(cuda.Buffer) 156 | b2 := beta.(cuda.Buffer) 157 | if b1.Size() < 8 || b2.Size() < 8 { 158 | panic("buffer underflow") 159 | } 160 | b1.WithPtr(func(ptr1 unsafe.Pointer) { 161 | b2.WithPtr(func(ptr2 unsafe.Pointer) { 162 | f((*C.double)(ptr1), (*C.double)(ptr2)) 163 | }) 164 | }) 165 | } 166 | 167 | func checkGemm(transA, transB Operation, m, n, k int, A uintptr, lda int, B uintptr, 168 | ldb int, C uintptr, ldc int) { 169 | checkMatrix(transA, lda, m, k, A) 170 | checkMatrix(transB, ldb, k, n, B) 171 | checkMatrix(NoTrans, ldc, m, n, C) 172 | } 173 | 174 | // checkMatrix ensures that op(A) fits in size elements, 175 | // given that op(A) is a-by-b and has leading dimension 176 | // lda. 
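//
// For example, with op == NoTrans, a = 3, b = 2, and lda = 4, the
// check passes only if size/4 >= 2, i.e. the buffer must hold at
// least the lda*b = 8 elements that column-major storage requires.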
177 | func checkMatrix(op Operation, lda, a, b int, size uintptr) { 178 | if a < 0 || b < 0 { 179 | panic("negative matrix dimension") 180 | } 181 | if op == NoTrans { 182 | if lda < essentials.MaxInt(1, a) { 183 | panic("leading dimension out of bounds") 184 | } 185 | if size/uintptr(lda) < uintptr(b) { 186 | panic("index out of bounds") 187 | } 188 | } else { 189 | if lda < essentials.MaxInt(1, b) { 190 | panic("leading dimension out of bounds") 191 | } 192 | if size/uintptr(lda) < uintptr(a) { 193 | panic("index out of bounds") 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /buffer.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include <cuda.h> 5 | #include <cuda_runtime.h> 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | "runtime" 12 | "unsafe" 13 | ) 14 | 15 | // A Buffer provides a high-level interface into an 16 | // underlying CUDA buffer. 17 | type Buffer interface { 18 | // Allocator is the Allocator from which the Buffer was 19 | // allocated. 20 | Allocator() Allocator 21 | 22 | // Size is the size of the Buffer. 23 | Size() uintptr 24 | 25 | // WithPtr runs f with the pointer contained inside the 26 | // Buffer. 27 | // During the call to f, it is guaranteed that the Buffer 28 | // will not be garbage collected. 29 | // However, nothing should store a reference to ptr after 30 | // f has completed. 31 | WithPtr(f func(ptr unsafe.Pointer)) 32 | } 33 | 34 | type buffer struct { 35 | alloc Allocator 36 | size uintptr 37 | ptr unsafe.Pointer 38 | } 39 | 40 | // AllocBuffer allocates a new Buffer. 41 | // 42 | // This must be called in the Allocator's Context. 43 | // 44 | // This does not zero out the returned memory. 45 | // To do that, you should use ClearBuffer(). 46 | func AllocBuffer(a Allocator, size uintptr) (Buffer, error) { 47 | ptr, err := a.Alloc(size) 48 | if err != nil { 49 | return nil, err 50 | } 51 | return WrapPointer(a, ptr, size), nil 52 | } 53 | 54 | // WrapPointer wraps a pointer in a Buffer. 55 | // You must specify the Allocator from which the pointer 56 | // originated and the size of the buffer. 57 | // 58 | // After calling this, you should not use the pointer 59 | // outside of the buffer. 60 | // The Buffer will automatically free the pointer. 61 | func WrapPointer(a Allocator, ptr unsafe.Pointer, size uintptr) Buffer { 62 | res := &buffer{alloc: a, size: size, ptr: ptr} 63 | runtime.SetFinalizer(res, func(obj *buffer) { 64 | allocator := obj.alloc 65 | go allocator.Context().Run(func() error { 66 | allocator.Free(obj.ptr, obj.size) 67 | return nil 68 | }) 69 | }) 70 | return res 71 | } 72 | 73 | func (b *buffer) Allocator() Allocator { 74 | return b.alloc 75 | } 76 | 77 | func (b *buffer) Size() uintptr { 78 | return b.size 79 | } 80 | 81 | func (b *buffer) WithPtr(f func(p unsafe.Pointer)) { 82 | f(b.ptr) 83 | runtime.KeepAlive(b) 84 | } 85 | 86 | type slice struct { 87 | Buffer 88 | off uintptr 89 | size uintptr 90 | } 91 | 92 | // Slice creates a Buffer which views some part of the 93 | // contents of another Buffer. 94 | // The start and end indexes are inclusive and exclusive, 95 | // respectively. 
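//
// For example, viewing the middle four bytes of an 8-byte buffer:
//
//     sub := Slice(buf, 2, 6)
//     // sub.Size() == 4, and sub shares memory with buf.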
96 | func Slice(b Buffer, start, end uintptr) Buffer { 97 | if start > end || start > b.Size() || end > b.Size() { 98 | panic("index out of bounds") 99 | } 100 | return &slice{ 101 | Buffer: b, 102 | off: start, 103 | size: end - start, 104 | } 105 | } 106 | 107 | func (s *slice) Size() uintptr { 108 | return s.size 109 | } 110 | 111 | func (s *slice) WithPtr(f func(p unsafe.Pointer)) { 112 | s.Buffer.WithPtr(func(p unsafe.Pointer) { 113 | f(unsafe.Pointer(uintptr(p) + s.off)) 114 | }) 115 | } 116 | 117 | // Overlap checks if two buffers overlap in memory. 118 | func Overlap(b1, b2 Buffer) bool { 119 | var overlap bool 120 | b1.WithPtr(func(ptr1 unsafe.Pointer) { 121 | b2.WithPtr(func(ptr2 unsafe.Pointer) { 122 | overlap = uintptr(ptr1) < uintptr(ptr2)+uintptr(b2.Size()) && 123 | uintptr(ptr2) < uintptr(ptr1)+uintptr(b1.Size()) 124 | }) 125 | }) 126 | return overlap 127 | } 128 | 129 | // ClearBuffer writes zeros over the contents of a Buffer. 130 | // It must be called from the correct Context. 131 | func ClearBuffer(b Buffer) error { 132 | var res C.cudaError_t 133 | b.WithPtr(func(ptr unsafe.Pointer) { 134 | res = C.cudaMemset(ptr, 0, C.size_t(b.Size())) 135 | }) 136 | return newErrorRuntime("cudaMemset", res) 137 | } 138 | 139 | // WriteBuffer writes the data from a slice into a Buffer. 140 | // It must be called from the correct Context. 141 | // 142 | // Supported slice types are: 143 | // 144 | // []byte 145 | // []float64 146 | // []float32 147 | // []int32 148 | // []uint32 149 | // 150 | // Similar to the copy() built-in, the maximum possible 151 | // amount of data will be copied. 152 | func WriteBuffer(b Buffer, val interface{}) error { 153 | size := bytesForSlice(val) 154 | if size > b.Size() { 155 | size = b.Size() 156 | } 157 | if size == 0 { 158 | return nil 159 | } 160 | 161 | var res C.cudaError_t 162 | b.WithPtr(func(ptr unsafe.Pointer) { 163 | switch val := val.(type) { 164 | case []byte: 165 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 166 | C.cudaMemcpyHostToDevice) 167 | case []float64: 168 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 169 | C.cudaMemcpyHostToDevice) 170 | case []float32: 171 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 172 | C.cudaMemcpyHostToDevice) 173 | case []int32: 174 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 175 | C.cudaMemcpyHostToDevice) 176 | case []uint32: 177 | res = C.cudaMemcpy(ptr, unsafe.Pointer(&val[0]), C.size_t(size), 178 | C.cudaMemcpyHostToDevice) 179 | } 180 | }) 181 | 182 | return newErrorRuntime("cudaMemcpy", res) 183 | } 184 | 185 | // ReadBuffer reads the data from a Buffer into a slice. 186 | // This must be called from the correct Context. 187 | // 188 | // See WriteBuffer for details on supported slice types. 
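//
// A round-trip sketch (buf is assumed to hold at least 12 bytes and
// the calls are made inside the proper Context):
//
//     if err := WriteBuffer(buf, []float32{1, 2, 3}); err != nil {
//         return err
//     }
//     out := make([]float32, 3)
//     if err := ReadBuffer(out, buf); err != nil {
//         return err
//     }
//     // out now holds {1, 2, 3}.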
189 | func ReadBuffer(val interface{}, b Buffer) error { 190 | size := bytesForSlice(val) 191 | if size > b.Size() { 192 | size = b.Size() 193 | } 194 | if size == 0 { 195 | return nil 196 | } 197 | 198 | var res C.cudaError_t 199 | b.WithPtr(func(ptr unsafe.Pointer) { 200 | switch val := val.(type) { 201 | case []byte: 202 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 203 | C.cudaMemcpyDeviceToHost) 204 | case []float64: 205 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 206 | C.cudaMemcpyDeviceToHost) 207 | case []float32: 208 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 209 | C.cudaMemcpyDeviceToHost) 210 | case []int32: 211 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 212 | C.cudaMemcpyDeviceToHost) 213 | case []uint32: 214 | res = C.cudaMemcpy(unsafe.Pointer(&val[0]), ptr, C.size_t(size), 215 | C.cudaMemcpyDeviceToHost) 216 | } 217 | }) 218 | 219 | return newErrorRuntime("cudaMemcpy", res) 220 | } 221 | 222 | // CopyBuffer copies as many bytes as possible from src 223 | // into dst. 224 | // 225 | // The two Buffers must not contain overlapping regions of 226 | // memory. 227 | func CopyBuffer(dst, src Buffer) error { 228 | size := dst.Size() 229 | if src.Size() < size { 230 | size = src.Size() 231 | } 232 | if size == 0 { 233 | return nil 234 | } 235 | 236 | var res C.cudaError_t 237 | dst.WithPtr(func(dstPtr unsafe.Pointer) { 238 | src.WithPtr(func(srcPtr unsafe.Pointer) { 239 | res = C.cudaMemcpy(dstPtr, srcPtr, C.size_t(size), 240 | C.cudaMemcpyDeviceToDevice) 241 | }) 242 | }) 243 | 244 | return newErrorRuntime("cudaMemcpy", res) 245 | } 246 | 247 | func bytesForSlice(val interface{}) uintptr { 248 | switch val := val.(type) { 249 | case []byte: 250 | return uintptr(len(val)) 251 | case []float64: 252 | return 8 * uintptr(len(val)) 253 | case []float32: 254 | return 4 * uintptr(len(val)) 255 | case []int32: 256 | return 4 * uintptr(len(val)) 257 | case []uint32: 258 | return 4 * uintptr(len(val)) 259 | default: 260 | panic(fmt.Sprintf("unsupported type: %T", val)) 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /device.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | /* 4 | #include 5 | #include 6 | 7 | int devattr_for_idx(int i, CUdevice_attribute * res) { 8 | CUdevice_attribute attrs[] = { 9 | CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 10 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 11 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, 12 | CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, 13 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 14 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, 15 | CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, 16 | CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, 17 | CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, 18 | CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, 19 | CU_DEVICE_ATTRIBUTE_WARP_SIZE, 20 | CU_DEVICE_ATTRIBUTE_MAX_PITCH, 21 | CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, 22 | CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK, 23 | CU_DEVICE_ATTRIBUTE_CLOCK_RATE, 24 | CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, 25 | CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, 26 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 27 | CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, 28 | CU_DEVICE_ATTRIBUTE_INTEGRATED, 29 | CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, 30 | CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, 31 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, 32 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, 33 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, 34 | 
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, 35 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, 36 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, 37 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, 38 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, 39 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, 40 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH, 41 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT, 42 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES, 43 | CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, 44 | CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, 45 | CU_DEVICE_ATTRIBUTE_ECC_ENABLED, 46 | CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, 47 | CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, 48 | CU_DEVICE_ATTRIBUTE_TCC_DRIVER, 49 | CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, 50 | CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, 51 | CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 52 | CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 53 | CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, 54 | CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, 55 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, 56 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, 57 | CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER, 58 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH, 59 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT, 60 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE, 61 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE, 62 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE, 63 | CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, 64 | CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, 65 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH, 66 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH, 67 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS, 68 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, 69 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, 70 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, 71 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, 72 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, 73 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, 74 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH, 75 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS, 76 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH, 77 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT, 78 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS, 79 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH, 80 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH, 81 | CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS, 82 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH, 83 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH, 84 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT, 85 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH, 86 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH, 87 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT, 88 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, 89 | CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 90 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH, 91 | CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED, 92 | CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED, 93 | CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED, 94 | CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, 95 | CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, 96 | CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, 97 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, 98 | #ifdef CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED 99 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID, 100 | 
CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED, 101 | CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO, 102 | CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, 103 | CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, 104 | CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, 105 | CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM 106 | #else 107 | CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID 108 | #endif 109 | }; 110 | if (i < 0 || i >= sizeof(attrs)/sizeof(CUdevice_attribute)) { 111 | return 1; 112 | } 113 | *res = attrs[i]; 114 | return 0; 115 | } 116 | */ 117 | import "C" 118 | 119 | import ( 120 | "fmt" 121 | "unsafe" 122 | ) 123 | 124 | // DevAttr is a CUDA device attribute. 125 | type DevAttr int 126 | 127 | func (d DevAttr) cValue() (C.CUdevice_attribute, error) { 128 | if d < 0 || d > DevAttrCanUseHostPointerForRegisteredMem { 129 | panic("invalid DevAttr") 130 | } 131 | var res C.CUdevice_attribute 132 | status := C.devattr_for_idx(C.int(d), &res) 133 | if status == 0 { 134 | return res, nil 135 | } else { 136 | return 0, fmt.Errorf("unsupported device attribute: %d", int(d)) 137 | } 138 | } 139 | 140 | // All supported device attributes. 141 | const ( 142 | DevAttrMaxThreadsPerBlock DevAttr = iota 143 | DevAttrMaxBlockDimX 144 | DevAttrMaxBlockDimY 145 | DevAttrMaxBlockDimZ 146 | DevAttrMaxGridDimX 147 | DevAttrMaxGridDimY 148 | DevAttrMaxGridDimZ 149 | DevAttrMaxSharedMemoryPerBlock 150 | DevAttrSharedMemoryPerBlock 151 | DevAttrTotalConstantMemory 152 | DevAttrWarpSize 153 | DevAttrMaxPitch 154 | DevAttrMaxRegistersPerBlock 155 | DevAttrRegistersPerBlock 156 | DevAttrClockRate 157 | DevAttrTextureAlignment 158 | DevAttrGPUOverlap 159 | DevAttrMultiprocessorCount 160 | DevAttrKernelExecTimeout 161 | DevAttrIntegrated 162 | DevAttrCanMapHostMemory 163 | DevAttrComputeMode 164 | DevAttrMaximumTexture1DWidth 165 | DevAttrMaximumTexture2DWidth 166 | DevAttrMaximumTexture2DHeight 167 | DevAttrMaximumTexture3DWidth 168 | DevAttrMaximumTexture3DHeight 169 | DevAttrMaximumTexture3DDepth 170 | DevAttrMaximumTexture2DLayeredWidth 171 | DevAttrMaximumTexture2DLayeredHeight 172 | DevAttrMaximumTexture2DLayeredLayers 173 | DevAttrMaximumTexture2DArrayWidth 174 | DevAttrMaximumTexture2DArrayHeight 175 | DevAttrMaximumTexture2DArrayNumslices 176 | DevAttrSurfaceAlignment 177 | DevAttrConcurrentKernels 178 | DevAttrECCEnabled 179 | DevAttrPCIBusID 180 | DevAttrPCIDeviceID 181 | DevAttrTCCDriver 182 | DevAttrMemoryClockRate 183 | DevAttrGlobalMemoryBusWidth 184 | DevAttrL2CacheSize 185 | DevAttrMaxThreadsPerMultiprocessor 186 | DevAttrAsyncEngineCount 187 | DevAttrUnifiedAddressing 188 | DevAttrMaximumTexture1DLayeredWidth 189 | DevAttrMaximumTexture1DLayeredLayers 190 | DevAttrCanTex2DGather 191 | DevAttrMaximumTexture2DGatherWidth 192 | DevAttrMaximumTexture2DGatherHeight 193 | DevAttrMaximumTexture3DWidthAlternate 194 | DevAttrMaximumTexture3DHeightAlternate 195 | DevAttrMaximumTexture3DDepthAlternate 196 | DevAttrPCIDomainID 197 | DevAttrTexturePitchAlignment 198 | DevAttrMaximumTexturecubemapWidth 199 | DevAttrMaximumTexturecubemapLayeredWidth 200 | DevAttrMaximumTexturecubemapLayeredLayers 201 | DevAttrMaximumSurface1DWidth 202 | DevAttrMaximumSurface2DWidth 203 | DevAttrMaximumSurface2DHeight 204 | DevAttrMaximumSurface3DWidth 205 | DevAttrMaximumSurface3DHeight 206 | DevAttrMaximumSurface3DDepth 207 | DevAttrMaximumSurface1DLayeredWidth 208 | DevAttrMaximumSurface1DLayeredLayers 209 | DevAttrMaximumSurface2DLayeredWidth 210 | DevAttrMaximumSurface2DLayeredHeight 211 | 
DevAttrMaximumSurface2DLayeredLayers 212 | DevAttrMaximumSurfacecubemapWidth 213 | DevAttrMaximumSurfacecubemapLayeredWidth 214 | DevAttrMaximumSurfacecubemapLayeredLayers 215 | DevAttrMaximumTexture1DLinearWidth 216 | DevAttrMaximumTexture2DLinearWidth 217 | DevAttrMaximumTexture2DLinearHeight 218 | DevAttrMaximumTexture2DLinearPitch 219 | DevAttrMaximumTexture2DMipmappedWidth 220 | DevAttrMaximumTexture2DMipmappedHeight 221 | DevAttrComputeCapabilityMajor 222 | DevAttrComputeCapabilityMinor 223 | DevAttrMaximumTexture1DMipmappedWidth 224 | DevAttrStreamPrioritiesSupported 225 | DevAttrGlobalL1CacheSupported 226 | DevAttrLocalL1CacheSupported 227 | DevAttrMaxSharedMemoryPerMultiprocessor 228 | DevAttrMaxRegistersPerMultiprocessor 229 | DevAttrManagedMemory 230 | DevAttrMultiGPUBoard 231 | DevAttrMultiGPUBoardGroupID 232 | DevAttrHostNativeAtomicSupported 233 | DevAttrSingleToDoublePrecisionPerfRatio 234 | DevAttrPageableMemoryAccess 235 | DevAttrConcurrentManagedAccess 236 | DevAttrComputePreemptionSupported 237 | DevAttrCanUseHostPointerForRegisteredMem 238 | ) 239 | 240 | // Device contains a unique ID for a CUDA device. 241 | type Device struct { 242 | id C.CUdevice 243 | } 244 | 245 | // AllDevices lists the available CUDA devices. 246 | // 247 | // This needn't be called from a Context. 248 | func AllDevices() ([]*Device, error) { 249 | var count C.int 250 | cuRes := C.cuDeviceGetCount(&count) 251 | if err := newErrorDriver("cuDeviceGetCount", cuRes); err != nil { 252 | return nil, err 253 | } 254 | var res []*Device 255 | for i := C.int(0); i < count; i++ { 256 | var dev C.CUdevice 257 | cuRes = C.cuDeviceGet(&dev, i) 258 | if err := newErrorDriver("cuDeviceGet", cuRes); err != nil { 259 | return nil, err 260 | } 261 | res = append(res, &Device{id: dev}) 262 | } 263 | return res, nil 264 | } 265 | 266 | // Name gets the device's identifier string. 267 | // 268 | // This needn't be called from a Context. 269 | func (d *Device) Name() (string, error) { 270 | res := (*C.char)(C.malloc(0x100)) 271 | defer C.free(unsafe.Pointer(res)) 272 | cuRes := C.cuDeviceGetName(res, 0xff, d.id) 273 | if err := newErrorDriver("cuDeviceGetName", cuRes); err != nil { 274 | return "", err 275 | } 276 | return C.GoString(res), nil 277 | } 278 | 279 | // Attr gets an attribute of the device. 280 | // 281 | // This needn't be called from a Context. 282 | func (d *Device) Attr(attr DevAttr) (int, error) { 283 | var res C.int 284 | cAttr, err := attr.cValue() 285 | if err != nil { 286 | return 0, err 287 | } 288 | cuRes := C.cuDeviceGetAttribute(&res, cAttr, d.id) 289 | if err := newErrorDriver("cuDeviceGetAttribute", cuRes); err != nil { 290 | return 0, err 291 | } 292 | return int(res), nil 293 | } 294 | 295 | // TotalMem gets the device's total memory. 296 | // 297 | // This needn't be called from a Context. 
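//
// A short enumeration sketch combining Name, Attr, and TotalMem:
//
//     devices, err := AllDevices()
//     if err != nil {
//         return err
//     }
//     for _, d := range devices {
//         name, _ := d.Name()
//         major, _ := d.Attr(DevAttrComputeCapabilityMajor)
//         minor, _ := d.Attr(DevAttrComputeCapabilityMinor)
//         mem, _ := d.TotalMem()
//         fmt.Printf("%s: compute %d.%d, %d bytes\n", name, major, minor, mem)
//     }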
298 | func (d *Device) TotalMem() (uint64, error) { 299 | var res C.size_t 300 | cuRes := C.cuDeviceTotalMem(&res, d.id) 301 | if err := newErrorDriver("cuDeviceTotalMem", cuRes); err != nil { 302 | return 0, err 303 | } 304 | return uint64(res), nil 305 | } 306 | -------------------------------------------------------------------------------- /cublas/level1_test.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | 7 | "github.com/unixpickle/cuda" 8 | ) 9 | 10 | func TestSdot(t *testing.T) { 11 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -2, -3, 5}, 12 | []float32{3, -1, 2, 3, -2, 0, 4, 2.5, 3.5}, []float32{0}) 13 | 14 | <-ctx.Run(func() error { 15 | var res float32 16 | err := handle.Sdot(3, buffers[0], 1, buffers[1], 1, &res) 17 | if err != nil { 18 | t.Error(err) 19 | return nil 20 | } 21 | if math.Abs(float64(res)-7) > 1e-4 { 22 | t.Errorf("bad value: %f", res) 23 | } 24 | 25 | err = handle.Sdot(5, buffers[0], 1, buffers[1], 2, &res) 26 | if err != nil { 27 | t.Error(err) 28 | return nil 29 | } 30 | if math.Abs(float64(res)-10) > 1e-4 { 31 | t.Errorf("bad value: %f", res) 32 | } 33 | 34 | err = handle.SetPointerMode(Device) 35 | if err != nil { 36 | t.Error(err) 37 | return nil 38 | } 39 | defer handle.SetPointerMode(Host) 40 | 41 | err = handle.Sdot(3, buffers[0], 3, buffers[1], 4, buffers[2]) 42 | if err != nil { 43 | t.Error(err) 44 | return nil 45 | } 46 | 47 | resSlice := make([]float32, 1) 48 | err = cuda.ReadBuffer(resSlice, buffers[2]) 49 | if err != nil { 50 | t.Error(err) 51 | return nil 52 | } 53 | 54 | if math.Abs(float64(resSlice[0])-12.5) > 1e-4 { 55 | t.Errorf("bad value: %f", resSlice[0]) 56 | } 57 | 58 | return nil 59 | }) 60 | } 61 | 62 | func TestDdot(t *testing.T) { 63 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -2, -3, 5}, 64 | []float64{3, -1, 2, 3, -2, 0, 4, 2.5, 3.5}, []float64{0}) 65 | 66 | <-ctx.Run(func() error { 67 | var res float64 68 | err := handle.Ddot(3, buffers[0], 1, buffers[1], 1, &res) 69 | if err != nil { 70 | t.Error(err) 71 | return nil 72 | } 73 | if math.Abs(res-7) > 1e-4 { 74 | t.Errorf("bad value: %f", res) 75 | } 76 | 77 | err = handle.Ddot(5, buffers[0], 1, buffers[1], 2, &res) 78 | if err != nil { 79 | t.Error(err) 80 | return nil 81 | } 82 | if math.Abs(res-10) > 1e-4 { 83 | t.Errorf("bad value: %f", res) 84 | } 85 | 86 | err = handle.SetPointerMode(Device) 87 | if err != nil { 88 | t.Error(err) 89 | return nil 90 | } 91 | defer handle.SetPointerMode(Host) 92 | 93 | err = handle.Ddot(3, buffers[0], 3, buffers[1], 4, buffers[2]) 94 | if err != nil { 95 | t.Error(err) 96 | return nil 97 | } 98 | 99 | resSlice := make([]float64, 1) 100 | err = cuda.ReadBuffer(resSlice, buffers[2]) 101 | if err != nil { 102 | t.Error(err) 103 | return nil 104 | } 105 | 106 | if math.Abs(resSlice[0]-12.5) > 1e-4 { 107 | t.Errorf("bad value: %f", resSlice[0]) 108 | } 109 | 110 | return nil 111 | }) 112 | } 113 | 114 | func TestSscal(t *testing.T) { 115 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -2, -3, 5}, []float32{0.25}) 116 | <-ctx.Run(func() error { 117 | actions := []func() error{ 118 | func() error { 119 | return handle.Sscal(4, float32(2), buffers[0], 2) 120 | }, 121 | func() error { 122 | scaler := float32(2) 123 | return handle.Sscal(3, &scaler, buffers[0], 1) 124 | }, 125 | func() error { 126 | if err := handle.SetPointerMode(Device); err != nil { 127 | t.Error(err) 128 | return nil 129 | } 130 | defer 
handle.SetPointerMode(Host) 131 | return handle.Sscal(7, buffers[1], buffers[0], 1) 132 | }, 133 | } 134 | expected := [][]float32{ 135 | {2, 2, 6, 4, -4, -3, 10}, 136 | {4, 4, 12, 4, -4, -3, 10}, 137 | {1, 1, 3, 1, -1, -0.75, 2.5}, 138 | } 139 | runTestActions32(t, actions, expected, buffers[0]) 140 | return nil 141 | }) 142 | } 143 | 144 | func TestDscal(t *testing.T) { 145 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -2, -3, 5}, []float64{0.25}) 146 | <-ctx.Run(func() error { 147 | actions := []func() error{ 148 | func() error { 149 | return handle.Dscal(4, float64(2), buffers[0], 2) 150 | }, 151 | func() error { 152 | scaler := float64(2) 153 | return handle.Dscal(3, &scaler, buffers[0], 1) 154 | }, 155 | func() error { 156 | if err := handle.SetPointerMode(Device); err != nil { 157 | t.Error(err) 158 | return nil 159 | } 160 | defer handle.SetPointerMode(Host) 161 | return handle.Dscal(7, buffers[1], buffers[0], 1) 162 | }, 163 | } 164 | expected := [][]float64{ 165 | {2, 2, 6, 4, -4, -3, 10}, 166 | {4, 4, 12, 4, -4, -3, 10}, 167 | {1, 1, 3, 1, -1, -0.75, 2.5}, 168 | } 169 | runTestActions64(t, actions, expected, buffers[0]) 170 | return nil 171 | }) 172 | } 173 | 174 | func TestSaxpy(t *testing.T) { 175 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -2, -3, 5}, 176 | []float32{1, 0, -1, 0, 1, 2, -2, 3, 0}, []float32{3}) 177 | <-ctx.Run(func() error { 178 | actions := []func() error{ 179 | func() error { 180 | return handle.Saxpy(5, float32(2), buffers[0], 1, buffers[1], 2) 181 | }, 182 | func() error { 183 | scaler := float32(-2) 184 | return handle.Saxpy(3, &scaler, buffers[0], 2, buffers[1], 3) 185 | }, 186 | func() error { 187 | if err := handle.SetPointerMode(Device); err != nil { 188 | t.Error(err) 189 | return nil 190 | } 191 | defer handle.SetPointerMode(Host) 192 | return handle.Saxpy(2, buffers[2], buffers[0], 1, buffers[1], 1) 193 | }, 194 | } 195 | expected := [][]float32{ 196 | {3, 0, 3, 0, 7, 2, 6, 3, -4}, 197 | {1, 0, 3, -6, 7, 2, 10, 3, -4}, 198 | {4, 6, 3, -6, 7, 2, 10, 3, -4}, 199 | } 200 | runTestActions32(t, actions, expected, buffers[1]) 201 | return nil 202 | }) 203 | } 204 | 205 | func TestDaxpy(t *testing.T) { 206 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -2, -3, 5}, 207 | []float64{1, 0, -1, 0, 1, 2, -2, 3, 0}, []float64{3}) 208 | <-ctx.Run(func() error { 209 | actions := []func() error{ 210 | func() error { 211 | return handle.Daxpy(5, float64(2), buffers[0], 1, buffers[1], 2) 212 | }, 213 | func() error { 214 | scaler := float64(-2) 215 | return handle.Daxpy(3, &scaler, buffers[0], 2, buffers[1], 3) 216 | }, 217 | func() error { 218 | if err := handle.SetPointerMode(Device); err != nil { 219 | t.Error(err) 220 | return nil 221 | } 222 | defer handle.SetPointerMode(Host) 223 | return handle.Daxpy(2, buffers[2], buffers[0], 1, buffers[1], 1) 224 | }, 225 | } 226 | expected := [][]float64{ 227 | {3, 0, 3, 0, 7, 2, 6, 3, -4}, 228 | {1, 0, 3, -6, 7, 2, 10, 3, -4}, 229 | {4, 6, 3, -6, 7, 2, 10, 3, -4}, 230 | } 231 | runTestActions64(t, actions, expected, buffers[1]) 232 | return nil 233 | }) 234 | } 235 | 236 | func TestIsamax(t *testing.T) { 237 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, 4, -3, -2, -5}, 238 | []int32{3}) 239 | <-ctx.Run(func() error { 240 | var idx int 241 | if err := handle.Isamax(6, buffers[0], 1, &idx); err != nil { 242 | t.Error(err) 243 | return nil 244 | } 245 | if idx != 4 { 246 | t.Errorf("expected 4 but got %v", idx) 247 | } 248 | 249 | if err := 
handle.SetPointerMode(Device); err != nil { 250 | t.Error(err) 251 | return nil 252 | } 253 | defer handle.SetPointerMode(Host) 254 | 255 | if err := handle.Isamax(4, buffers[0], 2, buffers[1]); err != nil { 256 | t.Error(err) 257 | return nil 258 | } 259 | 260 | resSlice := make([]int32, 1) 261 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 262 | t.Error(err) 263 | return nil 264 | } 265 | if resSlice[0] != 4 { 266 | t.Errorf("expected 4 but got %v", resSlice[0]) 267 | } 268 | 269 | return nil 270 | }) 271 | } 272 | 273 | func TestIdamax(t *testing.T) { 274 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, 4, -3, -2, -5}, 275 | []int32{3}) 276 | <-ctx.Run(func() error { 277 | var idx int 278 | if err := handle.Idamax(6, buffers[0], 1, &idx); err != nil { 279 | t.Error(err) 280 | return nil 281 | } 282 | if idx != 4 { 283 | t.Errorf("expected 4 but got %v", idx) 284 | } 285 | 286 | if err := handle.SetPointerMode(Device); err != nil { 287 | t.Error(err) 288 | return nil 289 | } 290 | defer handle.SetPointerMode(Host) 291 | 292 | if err := handle.Idamax(4, buffers[0], 2, buffers[1]); err != nil { 293 | t.Error(err) 294 | return nil 295 | } 296 | 297 | resSlice := make([]int32, 1) 298 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 299 | t.Error(err) 300 | return nil 301 | } 302 | if resSlice[0] != 4 { 303 | t.Errorf("expected 4 but got %v", resSlice[0]) 304 | } 305 | 306 | return nil 307 | }) 308 | } 309 | 310 | func TestSasum(t *testing.T) { 311 | testNorm32(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 312 | return h.Sasum(n, x, inc, res) 313 | }, 1) 314 | } 315 | 316 | func TestDasum(t *testing.T) { 317 | testNorm64(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 318 | return h.Dasum(n, x, inc, res) 319 | }, 1) 320 | } 321 | 322 | func TestSnrm2(t *testing.T) { 323 | testNorm32(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 324 | return h.Snrm2(n, x, inc, res) 325 | }, 2) 326 | } 327 | 328 | func TestDnrm2(t *testing.T) { 329 | testNorm64(t, func(h *Handle, n int, x cuda.Buffer, inc int, res interface{}) error { 330 | return h.Dnrm2(n, x, inc, res) 331 | }, 2) 332 | } 333 | 334 | func runTestActions32(t *testing.T, fs []func() error, expected [][]float32, buf cuda.Buffer) { 335 | for i, f := range fs { 336 | if err := f(); err != nil { 337 | t.Errorf("action %d: %s", i, err) 338 | return 339 | } 340 | x := expected[i] 341 | actual := make([]float32, len(x)) 342 | if err := cuda.ReadBuffer(actual, buf); err != nil { 343 | t.Error(err) 344 | return 345 | } 346 | if maxDelta32(actual, x) > 1e-4 { 347 | t.Errorf("action %d: expected %v but got %v", i, x, actual) 348 | } 349 | } 350 | } 351 | 352 | func runTestActions64(t *testing.T, fs []func() error, expected [][]float64, buf cuda.Buffer) { 353 | for i, f := range fs { 354 | if err := f(); err != nil { 355 | t.Errorf("action %d: %s", i, err) 356 | return 357 | } 358 | x := expected[i] 359 | actual := make([]float64, len(x)) 360 | if err := cuda.ReadBuffer(actual, buf); err != nil { 361 | t.Error(err) 362 | return 363 | } 364 | if maxDelta64(actual, x) > 1e-4 { 365 | t.Errorf("action %d: expected %v but got %v", i, x, actual) 366 | } 367 | } 368 | } 369 | 370 | func testNorm32(t *testing.T, f func(h *Handle, n int, x cuda.Buffer, inc int, 371 | res interface{}) error, base int) { 372 | ctx, handle, buffers := setupTest(t, []float32{1, 2, 3, -1, -2, -4}, []float32{0.156}) 373 | 374 | stride2Ans := map[int]float32{1: 6, 
2: float32(math.Sqrt(14))} 375 | stride1Ans := map[int]float32{1: 13, 2: float32(math.Sqrt(35))} 376 | 377 | <-ctx.Run(func() error { 378 | var res float32 379 | if err := f(handle, 3, buffers[0], 2, &res); err != nil { 380 | t.Error(err) 381 | return nil 382 | } 383 | if math.Abs(float64(res-stride2Ans[base])) > 1e-4 { 384 | t.Errorf("expected %v but got %v", stride2Ans[base], res) 385 | } 386 | 387 | if err := handle.SetPointerMode(Device); err != nil { 388 | t.Error(err) 389 | return nil 390 | } 391 | defer handle.SetPointerMode(Host) 392 | 393 | if err := f(handle, 6, buffers[0], 1, buffers[1]); err != nil { 394 | t.Error(err) 395 | return nil 396 | } 397 | resSlice := make([]float32, 1) 398 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 399 | t.Error(err) 400 | return nil 401 | } 402 | res = resSlice[0] 403 | if math.Abs(float64(res-stride1Ans[base])) > 1e-4 { 404 | t.Errorf("expected %v but got %v", stride1Ans[base], res) 405 | } 406 | return nil 407 | }) 408 | } 409 | 410 | func testNorm64(t *testing.T, f func(h *Handle, n int, x cuda.Buffer, inc int, 411 | res interface{}) error, base int) { 412 | ctx, handle, buffers := setupTest(t, []float64{1, 2, 3, -1, -2, -4}, []float64{0.156}) 413 | 414 | stride2Ans := map[int]float64{0: 3, 1: 6, 2: math.Sqrt(14)} 415 | stride1Ans := map[int]float64{0: 4, 1: 13, 2: math.Sqrt(35)} 416 | 417 | <-ctx.Run(func() error { 418 | var res float64 419 | if err := f(handle, 3, buffers[0], 2, &res); err != nil { 420 | t.Error(err) 421 | return nil 422 | } 423 | if math.Abs(res-stride2Ans[base]) > 1e-4 { 424 | t.Errorf("expected %v but got %v", stride2Ans[base], res) 425 | } 426 | 427 | if err := handle.SetPointerMode(Device); err != nil { 428 | t.Error(err) 429 | return nil 430 | } 431 | defer handle.SetPointerMode(Host) 432 | 433 | if err := f(handle, 6, buffers[0], 1, buffers[1]); err != nil { 434 | t.Error(err) 435 | return nil 436 | } 437 | resSlice := make([]float64, 1) 438 | if err := cuda.ReadBuffer(resSlice, buffers[1]); err != nil { 439 | t.Error(err) 440 | return nil 441 | } 442 | res = resSlice[0] 443 | if math.Abs(res-stride1Ans[base]) > 1e-4 { 444 | t.Errorf("expected %v but got %v", stride1Ans[base], res) 445 | } 446 | return nil 447 | }) 448 | } 449 | -------------------------------------------------------------------------------- /cublas/level1.go: -------------------------------------------------------------------------------- 1 | package cublas 2 | 3 | import ( 4 | "unsafe" 5 | 6 | "github.com/unixpickle/cuda" 7 | ) 8 | 9 | /* 10 | #include 11 | */ 12 | import "C" 13 | 14 | // Sdot performs a single-precision dot product. 15 | // 16 | // The result argument's type depends on the pointer mode. 17 | // In the Host pointer mode, it should be *float32. 18 | // In the Device pointer mode, it should be a cuda.Buffer. 19 | // 20 | // This must be called inside the cuda.Context. 
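//
// A Host-mode sketch (x and y assumed to hold at least n float32
// values each):
//
//     var dot float32
//     if err := handle.Sdot(n, x, 1, y, 1, &dot); err != nil {
//         return err
//     }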
21 | func (h *Handle) Sdot(n int, x cuda.Buffer, incx int, y cuda.Buffer, incy int, 22 | result interface{}) error { 23 | if n < 0 { 24 | panic("size out of bounds") 25 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 26 | panic("index out of bounds") 27 | } else if stridedSize(y.Size()/4, incy) < uintptr(n) { 28 | panic("index out of bounds") 29 | } 30 | var res C.cublasStatus_t 31 | x.WithPtr(func(xPtr unsafe.Pointer) { 32 | y.WithPtr(func(yPtr unsafe.Pointer) { 33 | if h.PointerMode() == Host { 34 | res = C.cublasSdot(h.handle, safeIntToC(n), 35 | (*C.float)(xPtr), safeIntToC(incx), 36 | (*C.float)(yPtr), safeIntToC(incy), 37 | (*C.float)(result.(*float32))) 38 | } else { 39 | b := result.(cuda.Buffer) 40 | if b.Size() < 4 { 41 | panic("buffer underflow") 42 | } 43 | b.WithPtr(func(outPtr unsafe.Pointer) { 44 | res = C.cublasSdot(h.handle, safeIntToC(n), 45 | (*C.float)(xPtr), safeIntToC(incx), 46 | (*C.float)(yPtr), safeIntToC(incy), 47 | (*C.float)(outPtr)) 48 | }) 49 | } 50 | }) 51 | }) 52 | return newError("cublasSdot", res) 53 | } 54 | 55 | // Ddot performs a double-precision dot product. 56 | // 57 | // The result argument's type depends on the pointer mode. 58 | // In the Host pointer mode, it should be *float64. 59 | // In the Device pointer mode, it should be a cuda.Buffer. 60 | // 61 | // This must be called inside the cuda.Context. 62 | func (h *Handle) Ddot(n int, x cuda.Buffer, incx int, y cuda.Buffer, incy int, 63 | result interface{}) error { 64 | if n < 0 { 65 | panic("size out of bounds") 66 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 67 | panic("index out of bounds") 68 | } else if stridedSize(y.Size()/8, incy) < uintptr(n) { 69 | panic("index out of bounds") 70 | } 71 | var res C.cublasStatus_t 72 | x.WithPtr(func(xPtr unsafe.Pointer) { 73 | y.WithPtr(func(yPtr unsafe.Pointer) { 74 | if h.PointerMode() == Host { 75 | res = C.cublasDdot(h.handle, safeIntToC(n), 76 | (*C.double)(xPtr), safeIntToC(incx), 77 | (*C.double)(yPtr), safeIntToC(incy), 78 | (*C.double)(result.(*float64))) 79 | } else { 80 | b := result.(cuda.Buffer) 81 | if b.Size() < 8 { 82 | panic("buffer underflow") 83 | } 84 | b.WithPtr(func(outPtr unsafe.Pointer) { 85 | res = C.cublasDdot(h.handle, safeIntToC(n), 86 | (*C.double)(xPtr), safeIntToC(incx), 87 | (*C.double)(yPtr), safeIntToC(incy), 88 | (*C.double)(outPtr)) 89 | }) 90 | } 91 | }) 92 | }) 93 | return newError("cublasDdot", res) 94 | } 95 | 96 | // Sscal scales a single-precision vector. 97 | // 98 | // The argument alpha's type depends on the pointer mode. 99 | // In the Host pointer mode, use float32 or *float32. 100 | // In the Device pointer mode, use cuda.Buffer. 101 | // 102 | // This must be called inside the cuda.Context. 
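//
// For example, doubling every other element of x in Host mode
// (x assumed to hold at least 7 float32 values):
//
//     if err := handle.Sscal(4, float32(2), x, 2); err != nil {
//         return err
//     }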
103 | func (h *Handle) Sscal(n int, alpha interface{}, x cuda.Buffer, incx int) error { 104 | if n < 0 { 105 | panic("size out of bounds") 106 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 107 | panic("index out of bounds") 108 | } 109 | 110 | var res C.cublasStatus_t 111 | x.WithPtr(func(ptr unsafe.Pointer) { 112 | if h.PointerMode() == Host { 113 | pointerizeInputs(&alpha) 114 | res = C.cublasSscal(h.handle, safeIntToC(n), (*C.float)(alpha.(*float32)), 115 | (*C.float)(ptr), safeIntToC(incx)) 116 | } else { 117 | b := alpha.(cuda.Buffer) 118 | if b.Size() < 4 { 119 | panic("buffer underflow") 120 | } 121 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 122 | res = C.cublasSscal(h.handle, safeIntToC(n), (*C.float)(alphaPtr), 123 | (*C.float)(ptr), safeIntToC(incx)) 124 | }) 125 | } 126 | }) 127 | 128 | return newError("cublasSscal", res) 129 | } 130 | 131 | // Dscal is like Sscal, but for double-precision. 132 | // 133 | // The argument alpha's type depends on the pointer mode. 134 | // In the Host pointer mode, use float64 or *float64. 135 | // In the Device pointer mode, use cuda.Buffer. 136 | // 137 | // This must be called inside the cuda.Context. 138 | func (h *Handle) Dscal(n int, alpha interface{}, x cuda.Buffer, incx int) error { 139 | if n < 0 { 140 | panic("size out of bounds") 141 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 142 | panic("index out of bounds") 143 | } 144 | 145 | var res C.cublasStatus_t 146 | x.WithPtr(func(ptr unsafe.Pointer) { 147 | if h.PointerMode() == Host { 148 | pointerizeInputs(&alpha) 149 | res = C.cublasDscal(h.handle, safeIntToC(n), (*C.double)(alpha.(*float64)), 150 | (*C.double)(ptr), safeIntToC(incx)) 151 | } else { 152 | b := alpha.(cuda.Buffer) 153 | if b.Size() < 8 { 154 | panic("buffer underflow") 155 | } 156 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 157 | res = C.cublasDscal(h.handle, safeIntToC(n), (*C.double)(alphaPtr), 158 | (*C.double)(ptr), safeIntToC(incx)) 159 | }) 160 | } 161 | }) 162 | 163 | return newError("cublasDscal", res) 164 | } 165 | 166 | // Saxpy computes single-precision "ax plus y". 167 | // 168 | // The argument alpha's type depends on the pointer mode. 169 | // In the Host pointer mode, use float32 or *float32. 170 | // In the Device pointer mode, use cuda.Buffer. 171 | // 172 | // This must be called inside the cuda.Context. 173 | func (h *Handle) Saxpy(n int, alpha interface{}, x cuda.Buffer, incx int, 174 | y cuda.Buffer, incy int) error { 175 | if n < 0 { 176 | panic("size out of bounds") 177 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 178 | panic("index out of bounds") 179 | } else if stridedSize(y.Size()/4, incy) < uintptr(n) { 180 | panic("index out of bounds") 181 | } 182 | 183 | var res C.cublasStatus_t 184 | x.WithPtr(func(xPtr unsafe.Pointer) { 185 | y.WithPtr(func(yPtr unsafe.Pointer) { 186 | if h.PointerMode() == Host { 187 | pointerizeInputs(&alpha) 188 | res = C.cublasSaxpy(h.handle, safeIntToC(n), (*C.float)(alpha.(*float32)), 189 | (*C.float)(xPtr), safeIntToC(incx), 190 | (*C.float)(yPtr), safeIntToC(incy)) 191 | } else { 192 | b := alpha.(cuda.Buffer) 193 | if b.Size() < 4 { 194 | panic("buffer underflow") 195 | } 196 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 197 | res = C.cublasSaxpy(h.handle, safeIntToC(n), (*C.float)(alphaPtr), 198 | (*C.float)(xPtr), safeIntToC(incx), 199 | (*C.float)(yPtr), safeIntToC(incy)) 200 | }) 201 | } 202 | }) 203 | }) 204 | 205 | return newError("cublasSaxpy", res) 206 | } 207 | 208 | // Daxpy is like Saxpy, but for double-precision. 
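// For instance, in the Host pointer mode the sketch
//
//	err := h.Daxpy(n, float64(2), x, 1, y, 1)
//
// overwrites y with 2*x + y (x, y, and n are placeholders; both
// buffers are assumed to hold at least n float64 values).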
209 | // 210 | // The argument alpha's type depends on the pointer mode. 211 | // In the Host pointer mode, use float64 or *float64. 212 | // In the Device pointer mode, use cuda.Buffer. 213 | // 214 | // This must be called inside the cuda.Context. 215 | func (h *Handle) Daxpy(n int, alpha interface{}, x cuda.Buffer, incx int, 216 | y cuda.Buffer, incy int) error { 217 | if n < 0 { 218 | panic("size out of bounds") 219 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 220 | panic("index out of bounds") 221 | } else if stridedSize(y.Size()/8, incy) < uintptr(n) { 222 | panic("index out of bounds") 223 | } 224 | 225 | var res C.cublasStatus_t 226 | x.WithPtr(func(xPtr unsafe.Pointer) { 227 | y.WithPtr(func(yPtr unsafe.Pointer) { 228 | if h.PointerMode() == Host { 229 | pointerizeInputs(&alpha) 230 | res = C.cublasDaxpy(h.handle, safeIntToC(n), (*C.double)(alpha.(*float64)), 231 | (*C.double)(xPtr), safeIntToC(incx), 232 | (*C.double)(yPtr), safeIntToC(incy)) 233 | } else { 234 | b := alpha.(cuda.Buffer) 235 | if b.Size() < 8 { 236 | panic("buffer underflow") 237 | } 238 | b.WithPtr(func(alphaPtr unsafe.Pointer) { 239 | res = C.cublasDaxpy(h.handle, safeIntToC(n), (*C.double)(alphaPtr), 240 | (*C.double)(xPtr), safeIntToC(incx), 241 | (*C.double)(yPtr), safeIntToC(incy)) 242 | }) 243 | } 244 | }) 245 | }) 246 | 247 | return newError("cublasDaxpy", res) 248 | } 249 | 250 | // Isamax gets the index of the first single-precision 251 | // vector component with the max absolute value. 252 | // The resulting indices start at one, not zero. 253 | // 254 | // The result argument's type depends on the pointer mode. 255 | // In the Host pointer mode, use *int. 256 | // In the Device pointer mode, use cuda.Buffer. 257 | // 258 | // This must be called inside the cuda.Context. 259 | func (h *Handle) Isamax(n int, x cuda.Buffer, incx int, result interface{}) error { 260 | if n < 0 { 261 | panic("size out of bounds") 262 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 263 | panic("index out of bounds") 264 | } 265 | 266 | var res C.cublasStatus_t 267 | x.WithPtr(func(xPtr unsafe.Pointer) { 268 | if h.PointerMode() == Host { 269 | var resInt C.int 270 | res = C.cublasIsamax(h.handle, safeIntToC(n), (*C.float)(xPtr), 271 | safeIntToC(incx), &resInt) 272 | *(result.(*int)) = int(resInt) 273 | } else { 274 | b := result.(cuda.Buffer) 275 | if b.Size() < 4 { 276 | panic("buffer underflow") 277 | } 278 | b.WithPtr(func(resPtr unsafe.Pointer) { 279 | res = C.cublasIsamax(h.handle, safeIntToC(n), (*C.float)(xPtr), 280 | safeIntToC(incx), (*C.int)(resPtr)) 281 | }) 282 | } 283 | }) 284 | 285 | return newError("cublasIsamax", res) 286 | } 287 | 288 | // Idamax is like Isamax, but for double-precision. 289 | // 290 | // The result argument's type depends on the pointer mode. 291 | // In the Host pointer mode, use *int. 292 | // In the Device pointer mode, use cuda.Buffer. 293 | // 294 | // This must be called inside the cuda.Context. 
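//
// A Host-mode sketch (x and n are placeholders; x is assumed to
// hold at least n float64 values; the index written to idx is
// 1-based, per the BLAS convention):
//
//	var idx int
//	if err := h.Idamax(n, x, 1, &idx); err != nil {
//		// handle the cuBLAS error
//	}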
295 | func (h *Handle) Idamax(n int, x cuda.Buffer, incx int, result interface{}) error { 296 | if n < 0 { 297 | panic("size out of bounds") 298 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 299 | panic("index out of bounds") 300 | } 301 | 302 | var res C.cublasStatus_t 303 | x.WithPtr(func(xPtr unsafe.Pointer) { 304 | if h.PointerMode() == Host { 305 | var resInt C.int 306 | res = C.cublasIdamax(h.handle, safeIntToC(n), (*C.double)(xPtr), 307 | safeIntToC(incx), &resInt) 308 | *(result.(*int)) = int(resInt) 309 | } else { 310 | b := result.(cuda.Buffer) 311 | if b.Size() < 4 { 312 | panic("buffer underflow") 313 | } 314 | b.WithPtr(func(resPtr unsafe.Pointer) { 315 | res = C.cublasIdamax(h.handle, safeIntToC(n), (*C.double)(xPtr), 316 | safeIntToC(incx), (*C.int)(resPtr)) 317 | }) 318 | } 319 | }) 320 | 321 | return newError("cublasIdamax", res) 322 | } 323 | 324 | // Sasum sums the absolute values of the components in a 325 | // single-precision vector. 326 | // 327 | // The result argument's type depends on the pointer mode. 328 | // In the Host pointer mode, use *float32. 329 | // In the Device pointer mode, use cuda.Buffer. 330 | // 331 | // This must be called inside the cuda.Context. 332 | func (h *Handle) Sasum(n int, x cuda.Buffer, incx int, result interface{}) error { 333 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.float, arg4 C.int, 334 | arg5 *C.float) C.cublasStatus_t { 335 | return C.cublasSasum(arg1, arg2, arg3, arg4, arg5) 336 | } 337 | return newError("cublasSasum", h.norm32(n, x, incx, result, f)) 338 | } 339 | 340 | // Dasum is like Sasum, but for double-precision. 341 | // 342 | // The result argument's type depends on the pointer mode. 343 | // In the Host pointer mode, use *float64. 344 | // In the Device pointer mode, use cuda.Buffer. 345 | // 346 | // This must be called inside the cuda.Context. 347 | func (h *Handle) Dasum(n int, x cuda.Buffer, incx int, result interface{}) error { 348 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.double, arg4 C.int, 349 | arg5 *C.double) C.cublasStatus_t { 350 | return C.cublasDasum(arg1, arg2, arg3, arg4, arg5) 351 | } 352 | return newError("cublasDasum", h.norm64(n, x, incx, result, f)) 353 | } 354 | 355 | // Snrm2 computes the Euclidean norm of a single-precision 356 | // vector. 357 | // 358 | // The result argument's type depends on the pointer mode. 359 | // In the Host pointer mode, use *float32. 360 | // In the Device pointer mode, use cuda.Buffer. 361 | // 362 | // This must be called inside the cuda.Context. 363 | func (h *Handle) Snrm2(n int, x cuda.Buffer, incx int, result interface{}) error { 364 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.float, arg4 C.int, 365 | arg5 *C.float) C.cublasStatus_t { 366 | return C.cublasSnrm2(arg1, arg2, arg3, arg4, arg5) 367 | } 368 | return newError("cublasSnrm2", h.norm32(n, x, incx, result, f)) 369 | } 370 | 371 | // Dnrm2 is like Snrm2, but for double-precision. 372 | // 373 | // The result argument's type depends on the pointer mode. 374 | // In the Host pointer mode, use *float64. 375 | // In the Device pointer mode, use cuda.Buffer. 376 | // 377 | // This must be called inside the cuda.Context. 
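//
// A Device-mode sketch (x, out, and n are placeholders; out is
// assumed to be a cuda.Buffer of at least 8 bytes, and the pointer
// mode is assumed to have been set with h.SetPointerMode(Device)):
//
//	if err := h.Dnrm2(n, x, 1, out); err != nil {
//		// handle the cuBLAS error
//	}
//
// The norm can then be copied back to the host with cuda.ReadBuffer.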
378 | func (h *Handle) Dnrm2(n int, x cuda.Buffer, incx int, result interface{}) error { 379 | f := func(arg1 C.cublasHandle_t, arg2 C.int, arg3 *C.double, arg4 C.int, 380 | arg5 *C.double) C.cublasStatus_t { 381 | return C.cublasDnrm2(arg1, arg2, arg3, arg4, arg5) 382 | } 383 | return newError("cublasDnrm2", h.norm64(n, x, incx, result, f)) 384 | } 385 | 386 | func (h *Handle) norm32(n int, x cuda.Buffer, incx int, result interface{}, 387 | f func(C.cublasHandle_t, C.int, *C.float, C.int, *C.float) C.cublasStatus_t) C.cublasStatus_t { 388 | if n < 0 { 389 | panic("size out of bounds") 390 | } else if stridedSize(x.Size()/4, incx) < uintptr(n) { 391 | panic("index out of bounds") 392 | } 393 | 394 | var res C.cublasStatus_t 395 | x.WithPtr(func(xPtr unsafe.Pointer) { 396 | if h.PointerMode() == Host { 397 | res = f(h.handle, safeIntToC(n), (*C.float)(xPtr), 398 | safeIntToC(incx), (*C.float)(result.(*float32))) 399 | } else { 400 | b := result.(cuda.Buffer) 401 | if b.Size() < 4 { 402 | panic("buffer underflow") 403 | } 404 | b.WithPtr(func(resPtr unsafe.Pointer) { 405 | res = f(h.handle, safeIntToC(n), (*C.float)(xPtr), 406 | safeIntToC(incx), (*C.float)(resPtr)) 407 | }) 408 | } 409 | }) 410 | 411 | return res 412 | } 413 | 414 | func (h *Handle) norm64(n int, x cuda.Buffer, incx int, result interface{}, 415 | f func(C.cublasHandle_t, C.int, *C.double, C.int, 416 | *C.double) C.cublasStatus_t) C.cublasStatus_t { 417 | if n < 0 { 418 | panic("size out of bounds") 419 | } else if stridedSize(x.Size()/8, incx) < uintptr(n) { 420 | panic("index out of bounds") 421 | } 422 | 423 | var res C.cublasStatus_t 424 | x.WithPtr(func(xPtr unsafe.Pointer) { 425 | if h.PointerMode() == Host { 426 | res = f(h.handle, safeIntToC(n), (*C.double)(xPtr), 427 | safeIntToC(incx), (*C.double)(result.(*float64))) 428 | } else { 429 | b := result.(cuda.Buffer) 430 | if b.Size() < 8 { 431 | panic("buffer underflow") 432 | } 433 | b.WithPtr(func(resPtr unsafe.Pointer) { 434 | res = f(h.handle, safeIntToC(n), (*C.double)(xPtr), 435 | safeIntToC(incx), (*C.double)(resPtr)) 436 | }) 437 | } 438 | }) 439 | 440 | return res 441 | } 442 | 443 | func stridedSize(totalCount uintptr, inc int) uintptr { 444 | if inc == 0 { 445 | panic("zero increment") 446 | } else if inc < 0 { 447 | inc = -inc 448 | } 449 | // Do this in such a way that we never risk overflow. 450 | res := totalCount / uintptr(inc) 451 | if totalCount%uintptr(inc) != 0 { 452 | res++ 453 | } 454 | return res 455 | } 456 | --------------------------------------------------------------------------------