├── .gitignore ├── Makefile ├── README.md ├── cu ├── Makefile ├── README ├── cgoflags.go ├── context.go ├── context_test.go ├── device.go ├── device_test.go ├── dim3.go ├── doc.go ├── execution.go ├── function.go ├── init.go ├── init_test.go ├── memory.go ├── memory_test.go ├── memset.go ├── module.go ├── module_test.go ├── peer.go ├── result.go ├── runtimeapi.go ├── stream.go ├── testdata │ ├── testmodule.cu │ └── testmodule.ptx ├── version.go └── version_test.go ├── cuda ├── Makefile ├── README ├── cgoflags.go └── device.go ├── cufft ├── Makefile ├── README ├── cgoflags.go ├── doc.go ├── fft_test.go ├── init_test.go ├── mode.go ├── plan.go ├── result.go └── type.go ├── curand ├── Makefile ├── README ├── cgoflags.go ├── generator.go └── status.go ├── doc.go ├── gophergpu.png └── safe ├── Makefile ├── README ├── complex128s.go ├── complex128s_test.go ├── complex64s.go ├── complex64s_test.go ├── doc.go ├── fft1d_test.go ├── fft1dc2r.go ├── fft1dr2c.go ├── fft3d_test.go ├── fft3dc2r.go ├── fft3dd2z.go ├── fft3dr2c.go ├── fft3dz2d.go ├── fftplan.go ├── float32s.go ├── float32s_test.go ├── float64s.go ├── float64s_test.go ├── init.go ├── slice.go └── subs.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.{6,8,5,o} 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 6g doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go install -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | go-optview -c -w *.go 32 | gofmt -w *.go 33 | 34 | opt: 35 | go-optview -w *.go 36 | gofmt -w *.go 37 | 38 | doc: 39 | godoc github.com/barnex/cuda5 > README 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Go bindings for CUDA 2 | 3 | Go bindings for nVIDIA CUDA 5 and later. This package compiles with both gc and gccgo. 4 | 5 | ![fig](gophergpu.png) 6 | -------------------------------------------------------------------------------- /cu/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/cu > README 34 | -------------------------------------------------------------------------------- /cu/README: -------------------------------------------------------------------------------- 1 | PACKAGE 2 | 3 | package cu 4 | import "github.com/barnex/cuda5/cu" 5 | 6 | Go bindings for the CUDA driver API. 
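A minimal end-to-end sketch (not part of the package; it assumes a CUDA-capable device at ordinal 0 and that the caller imports unsafe):

    cu.Init(0)                                  // must precede any other driver call
    dev := cu.DeviceGet(0)
    ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, dev)
    cu.CtxSetCurrent(ctx)

    host := []float32{1, 2, 3, 4}
    bytes := int64(len(host)) * cu.SIZEOF_FLOAT32
    dptr := cu.MemAlloc(bytes)                            // device memory
    cu.MemcpyHtoD(dptr, unsafe.Pointer(&host[0]), bytes)  // host -> device
    cu.MemcpyDtoH(unsafe.Pointer(&host[0]), dptr, bytes)  // device -> host
    cu.CtxSynchronize()

    dptr.Free()
    (&ctx).Destroy()

All calls used here are documented in the sections below.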
7 | 8 | CONSTANTS 9 | 10 | const ( 11 | // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 12 | CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO 13 | // Spin when waiting for results from the GPU. 14 | CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN 15 | // Yield its thread when waiting for results from the GPU. 16 | CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD 17 | // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. 18 | CTX_BLOCKING_SYNC 19 | // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 20 | CTX_MAP_HOST = C.CU_CTX_MAP_HOST 21 | //Do not reduce local memory after resizing local memory for a kernel. 22 | CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX 23 | ) 24 | Flags for CtxCreate 25 | const ( 26 | SIZEOF_FLOAT32 = 4 27 | SIZEOF_FLOAT64 = 8 28 | SIZEOF_COMPLEX64 = 8 29 | SIZEOF_COMPLEX128 = 16 30 | ) 31 | Type size in bytes 32 | 33 | 34 | FUNCTIONS 35 | 36 | func CtxDestroy(ctx *Context) 37 | Destroys the CUDA context specified by ctx. If the context usage count 38 | is not equal to 1, or the context is current to any CPU thread other 39 | than the current one, this function fails. Floating contexts (detached 40 | from a CPU thread via cuCtxPopCurrent()) may be destroyed by this 41 | function. 42 | 43 | func CtxDisablePeerAccess(peer Context) 44 | Reverses CtxEnablePeerAccess(). 45 | 46 | func CtxEnablePeerAccess(peer Context) 47 | Make allocations from the peer Context available to the current context. 48 | 49 | func CtxGetApiVersion(ctx Context) (version int) 50 | Returns the API version to create the context. 51 | 52 | func CtxSetCurrent(ctx Context) 53 | Sets the current active context. 54 | 55 | func CtxSynchronize() 56 | Blocks until the device has completed all preceding requested tasks, if 57 | the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. 58 | 59 | func DeviceCanAccessPeer(dev, peer Device) bool 60 | Returns true if CtxEnablePeerAccess can be called on a context for dev 61 | and peerDev. 62 | 63 | func DeviceComputeCapability(device Device) (major, minor int) 64 | Returns the compute capability of the device. 65 | 66 | func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int 67 | Gets the value of a device attribute. 68 | 69 | func DeviceGetCount() int 70 | Returns the number of devices with compute capability greater than or 71 | equal to 1.0 that are available for execution. 72 | 73 | func DeviceGetName(dev Device) string 74 | Gets the name of the device. 75 | 76 | func DeviceTotalMem(device Device) int64 77 | Returns the total amount of memory available on the device in bytes. 78 | 79 | func FuncGetAttribute(attrib FunctionAttribute, function Function) int 80 | 81 | func Init(flags int) 82 | Initialize the CUDA driver API. Currently, flags must be 0. If Init() 83 | has not been called, any function from the driver API will panic with 84 | ERROR_NOT_INITIALIZED. 85 | 86 | func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) 87 | 88 | func MemAllocHost(bytes int64) unsafe.Pointer 89 | 90 | func MemFree(ptr *DevicePtr) 91 | Frees device memory allocated by MemAlloc(). Overwrites the pointer with 92 | NULL. It is safe to double-free. 
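For LaunchKernel (documented above), each entry of kernelParams must point at the value of one kernel argument. A hedged sketch, assuming a hypothetical kernel __global__ void scale(float *dst, float factor, int n) compiled to a hypothetical kernels.ptx:

    mod := cu.ModuleLoad("kernels.ptx") // hypothetical PTX file
    f := mod.GetFunction("scale")       // hypothetical kernel name
    n := int32(1024)
    dst := cu.MemAlloc(int64(n) * cu.SIZEOF_FLOAT32)
    factor := float32(2)
    args := []unsafe.Pointer{
        unsafe.Pointer(&dst),    // float*: pass the address of the DevicePtr
        unsafe.Pointer(&factor), // float
        unsafe.Pointer(&n),      // int
    }
    block := 128
    grid := (int(n) + block - 1) / block
    str := cu.StreamCreate()
    cu.LaunchKernel(f, grid, 1, 1, block, 1, 1, 0, str, args)
    str.Synchronize()

Module, Function and Stream are documented under TYPES below.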
93 | 94 | func MemFreeHost(ptr unsafe.Pointer) 95 | 96 | func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) 97 | Returns the base address and size of the allocation (by MemAlloc) that 98 | contains the input pointer ptr. 99 | 100 | func MemGetInfo() (free, total int64) 101 | Returns the free and total amount of memroy in the current Context (in 102 | bytes). 103 | 104 | func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) 105 | Page-locks memory specified by the pointer and bytes. The pointer and 106 | byte size must be aligned to the host page size (4KB) See also: 107 | MemHostUnregister() 108 | 109 | func MemHostUnregister(ptr unsafe.Pointer) 110 | Unmaps memory locked by MemHostRegister(). 111 | 112 | func Memcpy(dst, src DevicePtr, bytes int64) 113 | Copies a number of bytes on the current device. Requires unified 114 | addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually 115 | an auto copy for device and/or host memory 116 | 117 | func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) 118 | Asynchronously copies a number of bytes on the current device. 119 | 120 | func MemcpyDtoD(dst, src DevicePtr, bytes int64) 121 | Copies a number of bytes from host to device. 122 | 123 | func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) 124 | Asynchronously copies a number of bytes from host to device. 125 | 126 | func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) 127 | Copies a number of bytes from device to host. 128 | 129 | func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) 130 | Asynchronously copies a number of bytes device host to host. The host 131 | memory must be page-locked (see MemRegister) 132 | 133 | func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) 134 | Copies a number of bytes from host to device. 135 | 136 | func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) 137 | Asynchronously copies a number of bytes from host to device. The host 138 | memory must be page-locked (see MemRegister) 139 | 140 | func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) 141 | Copies from device memory in one context (device) to another. 142 | 143 | func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) 144 | Asynchronously copies from device memory in one context (device) to 145 | another. 146 | 147 | func MemsetD32(deviceptr DevicePtr, value uint32, N int64) 148 | Sets the first N 32-bit values of dst array to value. Asynchronous. 149 | 150 | func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) 151 | Asynchronously sets the first N 32-bit values of dst array to value. 152 | 153 | func MemsetD8(deviceptr DevicePtr, value uint8, N int64) 154 | Sets the first N 8-bit values of dst array to value. Asynchronous. 155 | 156 | func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) 157 | Asynchronously sets the first N 32-bit values of dst array to value. 158 | 159 | func StreamDestroy(stream *Stream) 160 | Destroys an asynchronous stream 161 | 162 | func StreamSynchronize(stream Stream) 163 | Blocks until the stream has completed. 164 | 165 | func Version() int 166 | Returns the CUDA driver version. 167 | 168 | 169 | TYPES 170 | 171 | type Context uintptr 172 | CUDA context. 173 | 174 | func CtxCreate(flags uint, dev Device) Context 175 | Create a CUDA context. 
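The peer-access functions (DeviceCanAccessPeer, CtxEnablePeerAccess, MemcpyPeer) come together as in the following hedged sketch, which assumes two devices (ordinals 0 and 1) are installed:

    bytes := int64(1 << 20)
    ctx0 := cu.CtxCreate(cu.CTX_SCHED_AUTO, cu.DeviceGet(0))
    ctx1 := cu.CtxCreate(cu.CTX_SCHED_AUTO, cu.DeviceGet(1))

    cu.CtxSetCurrent(ctx0)
    src := cu.MemAlloc(bytes) // lives in ctx0 (device 0)
    cu.CtxSetCurrent(ctx1)
    dst := cu.MemAlloc(bytes) // lives in ctx1 (device 1)

    if cu.DeviceCanAccessPeer(cu.DeviceGet(1), cu.DeviceGet(0)) {
        ctx0.EnablePeerAccess() // let the current context (ctx1) access ctx0's memory
    }
    cu.MemcpyPeer(dst, ctx1, src, ctx0, bytes) // copy across devices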
176 | 177 | func CtxGetCurrent() Context 178 | Gets the current active context. 179 | 180 | func (ctx Context) ApiVersion() (version int) 181 | Returns the API version to create the context. 182 | 183 | func (ctx *Context) Destroy() 184 | Destroys the CUDA context. 185 | 186 | func (peer Context) DisablePeerAccess() 187 | Reverses EnablePeerAccess(). 188 | 189 | func (peer Context) EnablePeerAccess() 190 | Make allocations from the peer Context available to the current context. 191 | 192 | func (ctx Context) SetCurrent() 193 | Sets the current active context. 194 | 195 | type DevProp struct { 196 | MaxThreadsPerBlock int 197 | MaxThreadsDim [3]int 198 | MaxGridSize [3]int 199 | SharedMemPerBlock int 200 | TotalConstantMemory int 201 | SIMDWidth int 202 | MemPitch int 203 | RegsPerBlock int 204 | ClockRate int 205 | TextureAlign int 206 | } 207 | Device properties 208 | 209 | func DeviceGetProperties(dev Device) (prop DevProp) 210 | Returns the device's properties. 211 | 212 | type Device int 213 | CUDA Device number. 214 | 215 | func CtxGetDevice() Device 216 | Returns the ordinal of the current context's device. 217 | 218 | func DeviceGet(ordinal int) Device 219 | Returns in a device handle given an ordinal in the range [0, 220 | DeviceGetCount()-1]. 221 | 222 | func (dev Device) Attribute(attrib DeviceAttribute) int 223 | Gets the value of a device attribute. 224 | 225 | func (dev Device) CanAccessPeer(peer Device) bool 226 | Returns true if CtxEnablePeerAccess can be called on a context for dev 227 | and peerDev. 228 | 229 | func (device Device) ComputeCapability() (major, minor int) 230 | Returns the compute capability of the device. 231 | 232 | func (dev Device) Name() string 233 | Gets the name of the device. 234 | 235 | func (dev Device) Properties() DevProp 236 | Returns the device's properties. 237 | 238 | func (device Device) TotalMem() int64 239 | Returns the total amount of memory available on the device in bytes. 
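A short sketch that enumerates the installed devices and prints a few of the attributes listed below (it mirrors cu/device_test.go and assumes fmt is imported):

    for i := 0; i < cu.DeviceGetCount(); i++ {
        dev := cu.DeviceGet(i)
        major, minor := dev.ComputeCapability()
        fmt.Printf("%s: compute capability %d.%d, %d MB, %d multiprocessors\n",
            dev.Name(), major, minor,
            dev.TotalMem()/(1024*1024),
            dev.Attribute(cu.MULTIPROCESSOR_COUNT))
    }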
240 | 241 | type DeviceAttribute int 242 | 243 | const ( 244 | MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block 245 | MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X 246 | MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y 247 | MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z 248 | MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X 249 | MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y 250 | MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z 251 | MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes 252 | TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes 253 | WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads 254 | MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies 255 | MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block 256 | CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz 257 | TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures 258 | MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device 259 | KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels 260 | INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 261 | CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space 262 | COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) 263 | MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width 264 | MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width 265 | MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height 266 | MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width 267 | MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height 268 | MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth 269 | MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width 270 | MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height 271 | MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered 
texture 272 | SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces 273 | CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently 274 | ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled 275 | PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device 276 | PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device 277 | TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model 278 | MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz 279 | GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits 280 | L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes 281 | MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor 282 | ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines 283 | UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host 284 | MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width 285 | MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture 286 | ) 287 | 288 | type DevicePtr uintptr 289 | 290 | func MemAlloc(bytes int64) DevicePtr 291 | Allocates a number of bytes of device memory. 292 | 293 | func (ptr DevicePtr) Bytes() (bytes int64) 294 | Returns the size of the allocation (by MemAlloc) that contains the input 295 | pointer ptr. 296 | 297 | func (ptr *DevicePtr) Free() 298 | Frees device memory allocated by MemAlloc(). Overwrites the pointer with 299 | NULL. It is safe to double-free. 300 | 301 | func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) 302 | Returns the base address and size of the allocation (by MemAlloc) that 303 | contains the input pointer ptr. 304 | 305 | func (ptr DevicePtr) MemoryType() MemoryType 306 | Returns the physical memory type that ptr addresses. 307 | 308 | func (p DevicePtr) String() string 309 | 310 | type Dim3 struct { 311 | X, Y, Z int 312 | } 313 | 314 | type Function uintptr 315 | Represents a CUDA CUfunction, a reference to a function within a module. 316 | 317 | func ModuleGetFunction(module Module, name string) Function 318 | Returns a Function handle. 319 | 320 | func (f Function) GetAttribute(attrib FunctionAttribute) int 321 | 322 | type FunctionAttribute int 323 | 324 | const ( 325 | FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. 326 | FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. 327 | FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. 
328 | FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. 329 | FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. 330 | FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. 331 | FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. 332 | ) 333 | 334 | type MemHostRegisterFlag int 335 | 336 | const ( 337 | // Memory is pinned in all CUDA contexts. 338 | MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE 339 | // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() 340 | MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP 341 | ) 342 | Flag for MemHostRegister 343 | 344 | type MemoryType uint 345 | 346 | const ( 347 | MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST 348 | MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE 349 | MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY 350 | MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED 351 | ) 352 | 353 | func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) 354 | Returns the physical memory type that ptr addresses. 355 | 356 | func (t MemoryType) String() string 357 | 358 | type Module uintptr 359 | Represents a CUDA CUmodule, a reference to executable device code. 360 | 361 | func ModuleLoad(fname string) Module 362 | Loads a compute module from file 363 | 364 | func ModuleLoadData(image string) Module 365 | Loads a compute module from string 366 | 367 | func (m Module) GetFunction(name string) Function 368 | Returns a Function handle. 369 | 370 | type Result int 371 | CUDA error status. CUDA error statuses are not returned by functions but 372 | checked and passed to panic() when not successful. If desired, they can 373 | be caught by recover(). 
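A sketch of turning such a panic back into an error value with recover(), for callers that prefer explicit error handling (tryAlloc is a hypothetical helper, not part of the package):

    func tryAlloc(bytes int64) (ptr cu.DevicePtr, err cu.Result) {
        defer func() {
            if e := recover(); e != nil {
                err = e.(cu.Result) // panics from this package carry a Result value
            }
        }()
        return cu.MemAlloc(bytes), cu.SUCCESS
    }

A failed allocation then reports, e.g., ERROR_OUT_OF_MEMORY instead of unwinding the whole program.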
374 | 375 | const ( 376 | SUCCESS Result = C.CUDA_SUCCESS 377 | ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE 378 | ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY 379 | ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED 380 | ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED 381 | ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED 382 | ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED 383 | ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED 384 | ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED 385 | ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE 386 | ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE 387 | ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE 388 | ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT 389 | ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT 390 | ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED 391 | ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED 392 | ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED 393 | ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED 394 | ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU 395 | ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED 396 | ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED 397 | ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY 398 | ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER 399 | ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 400 | ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT 401 | ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE 402 | ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE 403 | ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND 404 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND 405 | ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 406 | ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM 407 | ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE 408 | ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND 409 | ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY 410 | ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED 411 | ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES 412 | ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT 413 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING 414 | ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED 415 | ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED 416 | ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE 417 | ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED 418 | ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT 419 | ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS 420 | ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED 421 | ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED 422 | ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN 423 | ) 424 | 425 | func StreamQuery(stream Stream) Result 426 | Returns Success if all operations have completed, ErrorNotReady 427 | otherwise 428 | 429 | func (err Result) String() string 430 | Message string for the error 431 | 432 | type Stream 
uintptr 433 | CUDA stream. 434 | 435 | func StreamCreate() Stream 436 | Creates an asynchronous stream 437 | 438 | func (stream *Stream) Destroy() 439 | Destroys the asynchronous stream 440 | 441 | func (stream Stream) Query() Result 442 | Returns Success if all operations have completed, ErrorNotReady 443 | otherwise 444 | 445 | func (stream Stream) Synchronize() 446 | Blocks until the stream has completed. 447 | 448 | 449 | -------------------------------------------------------------------------------- /cu/cgoflags.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file provides CGO flags to find CUDA libraries and headers. 4 | 5 | //#cgo LDFLAGS:-lcuda -lcudart 6 | // 7 | ////default location: 8 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 9 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 10 | // 11 | ////default location if not properly symlinked: 12 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 13 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 14 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 15 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 16 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 17 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 18 | // 19 | ////arch linux: 20 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 21 | //#cgo CFLAGS: -I/opt/cuda/include 22 | // 23 | ////WINDOWS: 24 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 25 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include 26 | import "C" 27 | -------------------------------------------------------------------------------- /cu/context.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver context management 4 | 5 | //#include 6 | import "C" 7 | import "unsafe" 8 | 9 | // CUDA context. 10 | type Context uintptr 11 | 12 | // Create a CUDA context. 13 | func CtxCreate(flags uint, dev Device) Context { 14 | var ctx C.CUcontext 15 | err := Result(C.cuCtxCreate(&ctx, C.uint(flags), C.CUdevice(dev))) 16 | if err != SUCCESS { 17 | panic(err) 18 | } 19 | return Context(uintptr(unsafe.Pointer(ctx))) 20 | } 21 | 22 | //Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. 23 | func CtxDestroy(ctx *Context) { 24 | err := Result(C.cuCtxDestroy(C.CUcontext(unsafe.Pointer(uintptr(*ctx))))) 25 | *ctx = 0 26 | if err != SUCCESS { 27 | panic(err) 28 | } 29 | } 30 | 31 | //Destroys the CUDA context. 32 | func (ctx *Context) Destroy() { 33 | CtxDestroy(ctx) 34 | } 35 | 36 | // Returns the API version to create the context. 37 | func CtxGetApiVersion(ctx Context) (version int) { 38 | var cversion C.uint 39 | err := Result(C.cuCtxGetApiVersion(C.CUcontext(unsafe.Pointer(uintptr(ctx))), &cversion)) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | version = int(cversion) 44 | return 45 | } 46 | 47 | // Returns the API version to create the context. 48 | func (ctx Context) ApiVersion() (version int) { 49 | return CtxGetApiVersion(ctx) 50 | } 51 | 52 | // Gets the current active context. 
53 | func CtxGetCurrent() Context { 54 | var ctx C.CUcontext 55 | err := Result(C.cuCtxGetCurrent(&ctx)) 56 | if err != SUCCESS { 57 | panic(err) 58 | } 59 | return Context(uintptr(unsafe.Pointer(ctx))) 60 | } 61 | 62 | // Returns the ordinal of the current context's device. 63 | func CtxGetDevice() Device { 64 | var dev C.CUdevice 65 | err := Result(C.cuCtxGetDevice(&dev)) 66 | if err != SUCCESS { 67 | panic(err) 68 | } 69 | return Device(dev) 70 | } 71 | 72 | // Sets the current active context. 73 | func CtxSetCurrent(ctx Context) { 74 | err := Result(C.cuCtxSetCurrent(C.CUcontext(unsafe.Pointer(uintptr(ctx))))) 75 | if err != SUCCESS { 76 | panic(err) 77 | } 78 | } 79 | 80 | // Sets the current active context. 81 | func (ctx Context) SetCurrent() { 82 | CtxSetCurrent(ctx) 83 | } 84 | 85 | // Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. 86 | func CtxSynchronize() { 87 | err := Result(C.cuCtxSynchronize()) 88 | if err != SUCCESS { 89 | panic(err) 90 | } 91 | } 92 | 93 | // Flags for CtxCreate 94 | const ( 95 | // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 96 | CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO 97 | // Spin when waiting for results from the GPU. 98 | CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN 99 | // Yield its thread when waiting for results from the GPU. 100 | CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD 101 | // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. 102 | CTX_BLOCKING_SYNC 103 | // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 104 | CTX_MAP_HOST = C.CU_CTX_MAP_HOST 105 | //Do not reduce local memory after resizing local memory for a kernel. 106 | CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX 107 | ) 108 | -------------------------------------------------------------------------------- /cu/context_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestContext(t *testing.T) { 9 | fmt.Println("CtxCreate") 10 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 11 | fmt.Println("CtxSetCurrent") 12 | CtxSetCurrent(ctx) 13 | fmt.Println("CtxGetApiVersion:", ctx.ApiVersion()) 14 | fmt.Println("CtxGetDevice:", CtxGetDevice()) 15 | (&ctx).Destroy() 16 | } 17 | 18 | func BenchmarkGetContext(b *testing.B) { 19 | b.StopTimer() 20 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 21 | CtxSetCurrent(ctx) 22 | b.StartTimer() 23 | for i := 0; i < b.N; i++ { 24 | CtxGetCurrent() 25 | } 26 | } 27 | 28 | func BenchmarkSetContext(b *testing.B) { 29 | b.StopTimer() 30 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 31 | b.StartTimer() 32 | for i := 0; i < b.N; i++ { 33 | ctx.SetCurrent() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /cu/device.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver device management 4 | 5 | //#include 6 | import "C" 7 | 8 | import () 9 | 10 | // CUDA Device number. 11 | type Device int 12 | 13 | // Returns the compute capability of the device. 
14 | func DeviceComputeCapability(device Device) (major, minor int) { 15 | var maj, min C.int 16 | err := Result(C.cuDeviceComputeCapability(&maj, &min, C.CUdevice(device))) 17 | if err != SUCCESS { 18 | panic(err) 19 | } 20 | major = int(maj) 21 | minor = int(min) 22 | return 23 | } 24 | 25 | // Returns the compute capability of the device. 26 | func (device Device) ComputeCapability() (major, minor int) { 27 | return DeviceComputeCapability(device) 28 | } 29 | 30 | // Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. 31 | func DeviceGet(ordinal int) Device { 32 | var device C.CUdevice 33 | err := Result(C.cuDeviceGet(&device, C.int(ordinal))) 34 | if err != SUCCESS { 35 | panic(err) 36 | } 37 | return Device(device) 38 | } 39 | 40 | // Gets the value of a device attribute. 41 | func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int { 42 | var attr C.int 43 | err := Result(C.cuDeviceGetAttribute(&attr, C.CUdevice_attribute(attrib), C.CUdevice(dev))) 44 | if err != SUCCESS { 45 | panic(err) 46 | } 47 | return int(attr) 48 | } 49 | 50 | // Gets the value of a device attribute. 51 | func (dev Device) Attribute(attrib DeviceAttribute) int { 52 | return DeviceGetAttribute(attrib, dev) 53 | } 54 | 55 | // Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. 56 | func DeviceGetCount() int { 57 | var count C.int 58 | err := Result(C.cuDeviceGetCount(&count)) 59 | if err != SUCCESS { 60 | panic(err) 61 | } 62 | return int(count) 63 | } 64 | 65 | // Gets the name of the device. 66 | func DeviceGetName(dev Device) string { 67 | size := 256 68 | buf := make([]byte, size) 69 | cstr := C.CString(string(buf)) 70 | err := Result(C.cuDeviceGetName(cstr, C.int(size), C.CUdevice(dev))) 71 | if err != SUCCESS { 72 | panic(err) 73 | } 74 | return C.GoString(cstr) 75 | } 76 | 77 | // Gets the name of the device. 78 | func (dev Device) Name() string { 79 | return DeviceGetName(dev) 80 | } 81 | 82 | // Device properties 83 | type DevProp struct { 84 | MaxThreadsPerBlock int 85 | MaxThreadsDim [3]int 86 | MaxGridSize [3]int 87 | SharedMemPerBlock int 88 | TotalConstantMemory int 89 | SIMDWidth int 90 | MemPitch int 91 | RegsPerBlock int 92 | ClockRate int 93 | TextureAlign int 94 | } 95 | 96 | // Returns the device's properties. 97 | func DeviceGetProperties(dev Device) (prop DevProp) { 98 | var cprop C.CUdevprop 99 | err := Result(C.cuDeviceGetProperties(&cprop, C.CUdevice(dev))) 100 | if err != SUCCESS { 101 | panic(err) 102 | } 103 | prop.MaxThreadsPerBlock = int(cprop.maxThreadsPerBlock) 104 | prop.MaxThreadsDim[0] = int(cprop.maxThreadsDim[0]) 105 | prop.MaxThreadsDim[1] = int(cprop.maxThreadsDim[1]) 106 | prop.MaxThreadsDim[2] = int(cprop.maxThreadsDim[2]) 107 | prop.MaxGridSize[0] = int(cprop.maxGridSize[0]) 108 | prop.MaxGridSize[1] = int(cprop.maxGridSize[1]) 109 | prop.MaxGridSize[2] = int(cprop.maxGridSize[2]) 110 | prop.SharedMemPerBlock = int(cprop.sharedMemPerBlock) 111 | prop.TotalConstantMemory = int(cprop.totalConstantMemory) 112 | prop.SIMDWidth = int(cprop.SIMDWidth) 113 | prop.MemPitch = int(cprop.memPitch) 114 | prop.RegsPerBlock = int(cprop.regsPerBlock) 115 | prop.ClockRate = int(cprop.clockRate) 116 | prop.TextureAlign = int(cprop.textureAlign) 117 | return 118 | } 119 | 120 | // Returns the device's properties. 121 | func (dev Device) Properties() DevProp { 122 | return DeviceGetProperties(dev) 123 | } 124 | 125 | // Returns the total amount of memory available on the device in bytes. 
126 | func (device Device) TotalMem() int64 { 127 | return DeviceTotalMem(device) 128 | } 129 | 130 | // Returns the total amount of memory available on the device in bytes. 131 | func DeviceTotalMem(device Device) int64 { 132 | var bytes C.size_t 133 | err := Result(C.cuDeviceTotalMem(&bytes, C.CUdevice(device))) 134 | if err != SUCCESS { 135 | panic(err) 136 | } 137 | return int64(bytes) 138 | } 139 | 140 | type DeviceAttribute int 141 | 142 | const ( 143 | MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block 144 | MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X 145 | MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y 146 | MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z 147 | MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X 148 | MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y 149 | MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z 150 | MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes 151 | TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes 152 | WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads 153 | MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies 154 | MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block 155 | CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz 156 | TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures 157 | MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device 158 | KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels 159 | INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 160 | CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space 161 | COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) 162 | MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width 163 | MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width 164 | MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height 165 | MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width 166 | MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height 167 | MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth 168 | MAXIMUM_TEXTURE2D_LAYERED_WIDTH 
DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width 169 | MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height 170 | MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture 171 | SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces 172 | CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently 173 | ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled 174 | PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device 175 | PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device 176 | TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model 177 | MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz 178 | GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits 179 | L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes 180 | MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor 181 | ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines 182 | UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host 183 | MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width 184 | MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture 185 | ) 186 | -------------------------------------------------------------------------------- /cu/device_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestDevice(t *testing.T) { 9 | fmt.Println("DeviceGetCount:", DeviceGetCount()) 10 | for i := 0; i < DeviceGetCount(); i++ { 11 | fmt.Println("DeviceGet", i) 12 | dev := DeviceGet(i) 13 | major, minor := dev.ComputeCapability() 14 | fmt.Println("Name: ", dev.Name()) 15 | fmt.Println("ComputeCapability: ", major, minor) 16 | fmt.Println("TotalMem: ", dev.TotalMem()) 17 | 18 | fmt.Println("ATTRIBUTE_MAX_THREADS_PER_BLOCK :", dev.Attribute(MAX_THREADS_PER_BLOCK)) 19 | fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_X :", dev.Attribute(MAX_BLOCK_DIM_X)) 20 | fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Y :", dev.Attribute(MAX_BLOCK_DIM_Y)) 21 | fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Z :", dev.Attribute(MAX_BLOCK_DIM_Z)) 22 | fmt.Println("ATTRIBUTE_MAX_GRID_DIM_X :", dev.Attribute(MAX_GRID_DIM_X)) 23 | fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Y :", dev.Attribute(MAX_GRID_DIM_Y)) 24 | fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Z :", dev.Attribute(MAX_GRID_DIM_Z)) 25 | fmt.Println("ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK :", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK)) 26 | fmt.Println("ATTRIBUTE_TOTAL_CONSTANT_MEMORY :", 
dev.Attribute(TOTAL_CONSTANT_MEMORY)) 27 | fmt.Println("ATTRIBUTE_WARP_SIZE :", dev.Attribute(WARP_SIZE)) 28 | fmt.Println("ATTRIBUTE_MAX_PITCH :", dev.Attribute(MAX_PITCH)) 29 | fmt.Println("ATTRIBUTE_MAX_REGISTERS_PER_BLOCK :", dev.Attribute(MAX_REGISTERS_PER_BLOCK)) 30 | fmt.Println("ATTRIBUTE_CLOCK_RATE :", dev.Attribute(CLOCK_RATE)) 31 | fmt.Println("ATTRIBUTE_TEXTURE_ALIGNMENT :", dev.Attribute(TEXTURE_ALIGNMENT)) 32 | fmt.Println("ATTRIBUTE_MULTIPROCESSOR_COUNT :", dev.Attribute(MULTIPROCESSOR_COUNT)) 33 | fmt.Println("ATTRIBUTE_KERNEL_EXEC_TIMEOUT :", dev.Attribute(KERNEL_EXEC_TIMEOUT)) 34 | fmt.Println("ATTRIBUTE_INTEGRATED :", dev.Attribute(INTEGRATED)) 35 | fmt.Println("ATTRIBUTE_CAN_MAP_HOST_MEMORY :", dev.Attribute(CAN_MAP_HOST_MEMORY)) 36 | fmt.Println("ATTRIBUTE_COMPUTE_MODE :", dev.Attribute(COMPUTE_MODE)) 37 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH)) 38 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH)) 39 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT)) 40 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH)) 41 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT)) 42 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH :", dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH)) 43 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH)) 44 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT)) 45 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS)) 46 | fmt.Println("ATTRIBUTE_SURFACE_ALIGNMENT :", dev.Attribute(SURFACE_ALIGNMENT)) 47 | fmt.Println("ATTRIBUTE_CONCURRENT_KERNELS :", dev.Attribute(CONCURRENT_KERNELS)) 48 | fmt.Println("ATTRIBUTE_ECC_ENABLED :", dev.Attribute(ECC_ENABLED)) 49 | fmt.Println("ATTRIBUTE_PCI_BUS_ID :", dev.Attribute(PCI_BUS_ID)) 50 | fmt.Println("ATTRIBUTE_PCI_DEVICE_ID :", dev.Attribute(PCI_DEVICE_ID)) 51 | fmt.Println("ATTRIBUTE_TCC_DRIVER :", dev.Attribute(TCC_DRIVER)) 52 | fmt.Println("ATTRIBUTE_MEMORY_CLOCK_RATE :", dev.Attribute(MEMORY_CLOCK_RATE)) 53 | fmt.Println("ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH :", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH)) 54 | fmt.Println("ATTRIBUTE_L2_CACHE_SIZE :", dev.Attribute(L2_CACHE_SIZE)) 55 | fmt.Println("ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR :", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR)) 56 | fmt.Println("ATTRIBUTE_ASYNC_ENGINE_COUNT :", dev.Attribute(ASYNC_ENGINE_COUNT)) 57 | fmt.Println("ATTRIBUTE_UNIFIED_ADDRESSING :", dev.Attribute(UNIFIED_ADDRESSING)) 58 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH)) 59 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS)) 60 | 61 | fmt.Printf("Properties:%#v\n", dev.Properties()) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /cu/dim3.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | type Dim3 struct { 4 | X, Y, Z int 5 | } 6 | -------------------------------------------------------------------------------- /cu/doc.go: -------------------------------------------------------------------------------- 1 | // Go bindings for the CUDA driver API. 
2 | package cu 3 | -------------------------------------------------------------------------------- /cu/execution.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements execution of CUDA kernels 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | const pointerSize = 8 // sorry, 64 bits only. 13 | 14 | func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) { 15 | 16 | // Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer, 17 | // so we copy the argument values go C memory first. 18 | argv := C.malloc(C.size_t(len(kernelParams) * pointerSize)) 19 | argp := C.malloc(C.size_t(len(kernelParams) * pointerSize)) 20 | defer C.free(argv) 21 | defer C.free(argp) 22 | for i := range kernelParams { 23 | *((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i) // argp[i] = &argv[i] 24 | *((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i] 25 | } 26 | 27 | err := Result(C.cuLaunchKernel( 28 | C.CUfunction(unsafe.Pointer(uintptr(f))), 29 | C.uint(gridDimX), 30 | C.uint(gridDimY), 31 | C.uint(gridDimZ), 32 | C.uint(blockDimX), 33 | C.uint(blockDimY), 34 | C.uint(blockDimZ), 35 | C.uint(sharedMemBytes), 36 | C.CUstream(unsafe.Pointer(uintptr(stream))), 37 | (*unsafe.Pointer)(argp), 38 | (*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) 39 | if err != SUCCESS { 40 | panic(err) 41 | } 42 | } 43 | 44 | func offset(ptr unsafe.Pointer, i int) unsafe.Pointer { 45 | return unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i)) 46 | } 47 | -------------------------------------------------------------------------------- /cu/function.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements manipulations on CUDA functions 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Represents a CUDA CUfunction, a reference to a function within a module. 13 | type Function uintptr 14 | 15 | func FuncGetAttribute(attrib FunctionAttribute, function Function) int { 16 | var attr C.int 17 | err := Result(C.cuFuncGetAttribute(&attr, C.CUfunction_attribute(attrib), C.CUfunction(unsafe.Pointer(uintptr(function))))) 18 | if err != SUCCESS { 19 | panic(err) 20 | } 21 | return int(attr) 22 | } 23 | 24 | func (f Function) GetAttribute(attrib FunctionAttribute) int { 25 | return FuncGetAttribute(attrib, f) 26 | } 27 | 28 | type FunctionAttribute int 29 | 30 | const ( 31 | FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. 32 | FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. 33 | FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. 34 | FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. 35 | FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. 
36 | FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. 37 | FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. 38 | ) 39 | -------------------------------------------------------------------------------- /cu/init.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver initialization 4 | 5 | //#include 6 | import "C" 7 | 8 | // Initialize the CUDA driver API. 9 | // Currently, flags must be 0. 10 | // If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 11 | func Init(flags int) { 12 | err := Result(C.cuInit(C.uint(flags))) 13 | if err != SUCCESS { 14 | panic(err) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /cu/init_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | // needed for all other tests. 8 | func init() { 9 | Init(0) 10 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 11 | CtxSetCurrent(ctx) 12 | fmt.Println("Created CUDA context") 13 | } 14 | -------------------------------------------------------------------------------- /cu/memory.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA memory management on the driver level 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type DevicePtr uintptr 14 | 15 | // Allocates a number of bytes of device memory. 16 | func MemAlloc(bytes int64) DevicePtr { 17 | var devptr C.CUdeviceptr 18 | err := Result(C.cuMemAlloc(&devptr, C.size_t(bytes))) 19 | if err != SUCCESS { 20 | panic(err) 21 | } 22 | return DevicePtr(devptr) 23 | } 24 | 25 | // Frees device memory allocated by MemAlloc(). 26 | // It is safe to double-free. 27 | func MemFree(p DevicePtr) { 28 | if p == DevicePtr(uintptr(0)) { 29 | return // Allready freed 30 | } 31 | err := Result(C.cuMemFree(C.CUdeviceptr(p))) 32 | if err != SUCCESS { 33 | panic(err) 34 | } 35 | } 36 | 37 | // Frees device memory allocated by MemAlloc(). 38 | // Overwrites the pointer with NULL. 39 | // It is safe to double-free. 40 | func (ptr DevicePtr) Free() { 41 | MemFree(ptr) 42 | } 43 | 44 | // Copies a number of bytes on the current device. 45 | // Requires unified addressing to be supported. 46 | // See also: MemcpyDtoD(). 47 | func Memcpy(dst, src DevicePtr, bytes int64) { 48 | err := Result(C.cuMemcpy(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) 49 | if err != SUCCESS { 50 | panic(err) 51 | } 52 | } 53 | 54 | // Asynchronously copies a number of bytes on the current device. 55 | func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) { 56 | err := Result(C.cuMemcpyAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 57 | if err != SUCCESS { 58 | panic(err) 59 | } 60 | } 61 | 62 | // Copies a number of bytes from host to device. 63 | func MemcpyDtoD(dst, src DevicePtr, bytes int64) { 64 | err := Result(C.cuMemcpyDtoD(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) 65 | if err != SUCCESS { 66 | panic(err) 67 | } 68 | } 69 | 70 | // Asynchronously copies a number of bytes from host to device. 
71 | func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) { 72 | err := Result(C.cuMemcpyDtoDAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 73 | if err != SUCCESS { 74 | panic(err) 75 | } 76 | } 77 | 78 | // Copies a number of bytes from host to device. 79 | func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) { 80 | err := Result(C.cuMemcpyHtoD(C.CUdeviceptr(dst), src, C.size_t(bytes))) 81 | if err != SUCCESS { 82 | panic(err) 83 | } 84 | } 85 | 86 | // Asynchronously copies a number of bytes from host to device. 87 | // The host memory must be page-locked (see MemRegister) 88 | func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) { 89 | err := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 90 | if err != SUCCESS { 91 | panic(err) 92 | } 93 | } 94 | 95 | // Copies a number of bytes from device to host. 96 | func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) { 97 | err := Result(C.cuMemcpyDtoH(dst, C.CUdeviceptr(src), C.size_t(bytes))) 98 | if err != SUCCESS { 99 | panic(err) 100 | } 101 | } 102 | 103 | // Asynchronously copies a number of bytes device host to host. 104 | // The host memory must be page-locked (see MemRegister) 105 | func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) { 106 | err := Result(C.cuMemcpyDtoHAsync(dst, C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 107 | if err != SUCCESS { 108 | panic(err) 109 | } 110 | } 111 | 112 | // Copies from device memory in one context (device) to another. 113 | func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) { 114 | err := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes))) 115 | if err != SUCCESS { 116 | panic(err) 117 | } 118 | } 119 | 120 | // Asynchronously copies from device memory in one context (device) to another. 121 | func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) { 122 | err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 123 | if err != SUCCESS { 124 | panic(err) 125 | } 126 | } 127 | 128 | // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. 129 | func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) { 130 | var cbytes C.size_t 131 | var cptr C.CUdeviceptr 132 | err := Result(C.cuMemGetAddressRange(&cptr, &cbytes, C.CUdeviceptr(ptr))) 133 | if err != SUCCESS { 134 | panic(err) 135 | } 136 | bytes = int64(cbytes) 137 | base = DevicePtr(cptr) 138 | return 139 | } 140 | 141 | // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. 142 | func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) { 143 | return MemGetAddressRange(ptr) 144 | } 145 | 146 | // Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. 
147 | func (ptr DevicePtr) Bytes() (bytes int64) { 148 | bytes, _ = MemGetAddressRange(ptr) 149 | return 150 | } 151 | 152 | // Returns the free and total amount of memroy in the current Context (in bytes). 153 | func MemGetInfo() (free, total int64) { 154 | var cfree, ctotal C.size_t 155 | err := Result(C.cuMemGetInfo(&cfree, &ctotal)) 156 | if err != SUCCESS { 157 | panic(err) 158 | } 159 | free = int64(cfree) 160 | total = int64(ctotal) 161 | return 162 | } 163 | 164 | // Page-locks memory specified by the pointer and bytes. 165 | // The pointer and byte size must be aligned to the host page size (4KB) 166 | // See also: MemHostUnregister() 167 | // doesn't link with cuda6.5 168 | //func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) { 169 | // err := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags))) 170 | // if err != SUCCESS { 171 | // panic(err) 172 | // } 173 | //} 174 | 175 | // Unmaps memory locked by MemHostRegister(). 176 | // doesn't link with cuda6.5 177 | //func MemHostUnregister(ptr unsafe.Pointer) { 178 | // err := Result(C.cuMemHostUnregister(ptr)) 179 | // if err != SUCCESS { 180 | // panic(err) 181 | // } 182 | //} 183 | 184 | func MemAllocHost(bytes int64) unsafe.Pointer { 185 | var p unsafe.Pointer 186 | err := Result(C.cuMemAllocHost(&p, C.size_t(bytes))) 187 | if err != SUCCESS { 188 | panic(err) 189 | } 190 | return p 191 | } 192 | 193 | func MemFreeHost(ptr unsafe.Pointer) { 194 | err := Result(C.cuMemFreeHost(ptr)) 195 | if err != SUCCESS { 196 | panic(err) 197 | } 198 | } 199 | 200 | type MemHostRegisterFlag int 201 | 202 | // Flag for MemHostRegister 203 | const ( 204 | // Memory is pinned in all CUDA contexts. 205 | MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE 206 | // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() 207 | MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP 208 | ) 209 | 210 | func (p DevicePtr) String() string { 211 | return fmt.Sprint(unsafe.Pointer(uintptr(p))) 212 | } 213 | 214 | // Type size in bytes 215 | const ( 216 | SIZEOF_FLOAT32 = 4 217 | SIZEOF_FLOAT64 = 8 218 | SIZEOF_COMPLEX64 = 8 219 | SIZEOF_COMPLEX128 = 16 220 | ) 221 | 222 | // Physical memory type of device pointer. 223 | type MemoryType uint 224 | 225 | const ( 226 | MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST 227 | MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE 228 | MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY 229 | MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED 230 | ) 231 | 232 | var memorytype = map[MemoryType]string{ 233 | MemoryTypeHost: "MemoryTypeHost", 234 | MemoryTypeDevice: "MemoryTypeDevice", 235 | MemoryTypeArray: "MemoryTypeArray", 236 | MemoryTypeUnified: "MemoryTypeUnified"} 237 | 238 | func (t MemoryType) String() string { 239 | if s, ok := memorytype[t]; ok { 240 | return s 241 | } 242 | return "MemoryTypeUnknown" 243 | } 244 | 245 | // Returns the physical memory type that ptr addresses. 246 | func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) { 247 | var typ uint64 // foresee enough memory just to be safe 248 | err = Result(C.cuPointerGetAttribute(unsafe.Pointer(&typ), 249 | C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, C.CUdeviceptr(uintptr(ptr)))) 250 | return MemoryType(uint(typ)), err 251 | } 252 | 253 | // Returns the physical memory type that ptr addresses. 
254 | func (ptr DevicePtr) MemoryType() MemoryType { 255 | t, err := PointerGetAttributeMemoryType(ptr) 256 | if err != SUCCESS { 257 | panic(err) 258 | } 259 | return t 260 | } 261 | -------------------------------------------------------------------------------- /cu/memory_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "testing" 7 | "unsafe" 8 | ) 9 | 10 | func TestMalloc(t *testing.T) { 11 | for i := 0; i < 1024; i++ { 12 | pointer := MemAlloc(16 * 1024 * 1024) 13 | pointer.Free() 14 | } 15 | for i := 0; i < 1024; i++ { 16 | pointer := MemAlloc(16 * 1024 * 1024) 17 | MemFree(pointer) 18 | } 19 | } 20 | 21 | func BenchmarkMallocFree1B(b *testing.B) { 22 | for i := 0; i < b.N; i++ { 23 | m := MemAlloc(1) 24 | m.Free() 25 | } 26 | } 27 | 28 | func BenchmarkMallocFree1kB(b *testing.B) { 29 | for i := 0; i < b.N; i++ { 30 | m := MemAlloc(1024) 31 | m.Free() 32 | } 33 | } 34 | 35 | func BenchmarkMallocFree1MB(b *testing.B) { 36 | for i := 0; i < b.N; i++ { 37 | m := MemAlloc(1024 * 1024) 38 | m.Free() 39 | } 40 | } 41 | 42 | func TestMemAddressRange(t *testing.T) { 43 | N := 12345 44 | ptr := MemAlloc(int64(N)) 45 | size, base := MemGetAddressRange(ptr) 46 | if size != int64(N) { 47 | t.Fail() 48 | } 49 | if base != ptr { 50 | t.Fail() 51 | } 52 | size, base = 0, DevicePtr(0) 53 | size, base = ptr.GetAddressRange() 54 | if ptr.Bytes() != int64(N) { 55 | t.Fail() 56 | } 57 | } 58 | 59 | func TestMemGetInfo(t *testing.T) { 60 | free, total := MemGetInfo() 61 | fmt.Println("MemGetInfo: ", free, "/", total) 62 | if free > total { 63 | t.Fail() 64 | } 65 | if total == 0 { 66 | t.Fail() 67 | } 68 | } 69 | 70 | func TestMemsetAsync(t *testing.T) { 71 | N := int64(32 * 1024) 72 | host1 := make([]float32, N) 73 | for i := range host1 { 74 | host1[i] = float32(i) 75 | } 76 | host2 := make([]float32, N) 77 | dev1 := MemAlloc(int64(4 * N)) 78 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 79 | str := StreamCreate() 80 | MemsetD32Async(dev1, math.Float32bits(42), N, str) 81 | MemsetD32Async(dev1, math.Float32bits(21), N/2, str) 82 | MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) 83 | str.Synchronize() 84 | (&str).Destroy() 85 | for i := 0; i < len(host2)/2; i++ { 86 | if host2[i] != 21 { 87 | t.Fail() 88 | } 89 | } 90 | for i := len(host2) / 2; i < len(host2); i++ { 91 | if host2[i] != 42 { 92 | t.Fail() 93 | } 94 | } 95 | dev1.Free() 96 | } 97 | 98 | func TestMemset(t *testing.T) { 99 | N := int64(32 * 1024) 100 | host1 := make([]float32, N) 101 | for i := range host1 { 102 | host1[i] = float32(i) 103 | } 104 | host2 := make([]float32, N) 105 | dev1 := MemAlloc(int64(4 * N)) 106 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 107 | MemsetD32(dev1, math.Float32bits(42), N) 108 | MemsetD32(dev1, math.Float32bits(21), N/2) 109 | MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) 110 | for i := 0; i < len(host2)/2; i++ { 111 | if host2[i] != 21 { 112 | t.Fail() 113 | } 114 | } 115 | for i := len(host2) / 2; i < len(host2); i++ { 116 | if host2[i] != 42 { 117 | t.Fail() 118 | } 119 | } 120 | dev1.Free() 121 | } 122 | 123 | func TestMemcpy(t *testing.T) { 124 | N := int64(32 * 1024) 125 | host1 := make([]float32, N) 126 | for i := range host1 { 127 | host1[i] = float32(i) 128 | } 129 | host2 := make([]float32, N) 130 | dev1 := MemAlloc(int64(4 * N)) 131 | dev2 := MemAlloc(int64(4 * N)) 132 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 133 | MemcpyDtoD(dev2, dev1, 4*N) 134 | 
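// Copy dev2 back to the host and verify the data survived the host→device→device→host round trip.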
MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) 135 | for i := range host2 { 136 | if host2[i] != float32(i) { 137 | t.Fail() 138 | } 139 | } 140 | dev1.Free() 141 | dev2.Free() 142 | } 143 | 144 | func TestMemcpyAsync(t *testing.T) { 145 | N := int64(32 * 1024) 146 | host1 := make([]float32, N) 147 | for i := range host1 { 148 | host1[i] = float32(i) 149 | } 150 | host2 := make([]float32, N) 151 | dev1 := MemAlloc(int64(4 * N)) 152 | dev2 := MemAlloc(int64(4 * N)) 153 | stream := StreamCreate() 154 | MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) 155 | MemcpyDtoDAsync(dev2, dev1, 4*N, stream) 156 | MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) 157 | stream.Synchronize() 158 | for i := range host2 { 159 | if host2[i] != float32(i) { 160 | t.Fail() 161 | } 162 | } 163 | dev1.Free() 164 | dev2.Free() 165 | } 166 | 167 | func TestMemcpyAsyncRegistered(t *testing.T) { 168 | N := int64(32 * 1024) 169 | host1 := make([]float32, N) 170 | for i := range host1 { 171 | host1[i] = float32(i) 172 | } 173 | host2 := make([]float32, N) 174 | dev1 := MemAlloc(int64(4 * N)) 175 | dev2 := MemAlloc(int64(4 * N)) 176 | stream := StreamCreate() 177 | MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) 178 | MemcpyDtoDAsync(dev2, dev1, 4*N, stream) 179 | MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) 180 | stream.Synchronize() 181 | for i := range host2 { 182 | if host2[i] != float32(i) { 183 | t.Fail() 184 | } 185 | } 186 | dev1.Free() 187 | dev2.Free() 188 | } 189 | 190 | func BenchmarkMemcpy(b *testing.B) { 191 | b.StopTimer() 192 | N := int64(32 * 1024 * 1024) 193 | host1 := make([]float32, N) 194 | host2 := make([]float32, N) 195 | dev1 := MemAlloc(int64(4 * N)) 196 | defer dev1.Free() 197 | dev2 := MemAlloc(int64(4 * N)) 198 | defer dev2.Free() 199 | b.SetBytes(4 * N) 200 | b.StartTimer() 201 | for i := 0; i < b.N; i++ { 202 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 203 | MemcpyDtoD(dev2, dev1, 4*N) 204 | MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /cu/memset.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA memset functions. 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Sets the first N 32-bit values of dst array to value. 13 | // Asynchronous. 14 | func MemsetD32(deviceptr DevicePtr, value uint32, N int64) { 15 | err := Result(C.cuMemsetD32(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N))) 16 | if err != SUCCESS { 17 | panic(err) 18 | } 19 | } 20 | 21 | // Asynchronously sets the first N 32-bit values of dst array to value. 22 | func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) { 23 | err := Result(C.cuMemsetD32Async(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) 24 | if err != SUCCESS { 25 | panic(err) 26 | } 27 | } 28 | 29 | // Sets the first N 8-bit values of dst array to value. 30 | // Asynchronous. 31 | func MemsetD8(deviceptr DevicePtr, value uint8, N int64) { 32 | err := Result(C.cuMemsetD8(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N))) 33 | if err != SUCCESS { 34 | panic(err) 35 | } 36 | } 37 | 38 | // Asynchronously sets the first N 32-bit values of dst array to value. 
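// (N counts 8-bit elements here.) Illustrative sketch, assuming buf and stream
// were created earlier with MemAlloc and StreamCreate: clear a device buffer
// without blocking the host.
//
//	MemsetD8Async(buf, 0, buf.Bytes(), stream)
//	stream.Synchronize()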
39 | func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) { 40 | err := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) 41 | if err != SUCCESS { 42 | panic(err) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /cu/module.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements loading of CUDA ptx modules 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Represents a CUDA CUmodule, a reference to executable device code. 13 | type Module uintptr 14 | 15 | // Loads a compute module from file 16 | func ModuleLoad(fname string) Module { 17 | //fmt.Fprintln(os.Stderr, "driver.ModuleLoad", fname) 18 | var mod C.CUmodule 19 | err := Result(C.cuModuleLoad(&mod, C.CString(fname))) 20 | if err != SUCCESS { 21 | panic(err) 22 | } 23 | return Module(uintptr(unsafe.Pointer(mod))) 24 | } 25 | 26 | // Loads a compute module from string 27 | func ModuleLoadData(image string) Module { 28 | var mod C.CUmodule 29 | err := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(C.CString(image)))) 30 | if err != SUCCESS { 31 | panic(err) 32 | } 33 | return Module(uintptr(unsafe.Pointer(mod))) 34 | } 35 | 36 | // Returns a Function handle. 37 | func ModuleGetFunction(module Module, name string) Function { 38 | var function C.CUfunction 39 | err := Result(C.cuModuleGetFunction( 40 | &function, 41 | C.CUmodule(unsafe.Pointer(uintptr(module))), 42 | C.CString(name))) 43 | if err != SUCCESS { 44 | panic(err) 45 | } 46 | return Function(uintptr(unsafe.Pointer(function))) 47 | } 48 | 49 | // Returns a Function handle. 50 | func (m Module) GetFunction(name string) Function { 51 | return ModuleGetFunction(m, name) 52 | } 53 | -------------------------------------------------------------------------------- /cu/module_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "testing" 5 | "unsafe" 6 | //"fmt" 7 | ) 8 | 9 | func TestModule(test *testing.T) { 10 | mod := ModuleLoad("/testdata/testmodule.ptx") 11 | f := mod.GetFunction("testMemset") 12 | 13 | N := 1000 14 | N4 := 4 * int64(N) 15 | a := make([]float32, N) 16 | A := MemAlloc(N4) 17 | defer A.Free() 18 | aptr := unsafe.Pointer(&a[0]) 19 | MemcpyHtoD(A, aptr, N4) 20 | 21 | var value float32 22 | value = 42 23 | 24 | var n int 25 | n = N / 2 26 | 27 | block := 128 28 | grid := DivUp(N, block) 29 | shmem := 0 30 | args := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)} 31 | LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args) 32 | 33 | MemcpyDtoH(aptr, A, N4) 34 | for i := 0; i < N/2; i++ { 35 | if a[i] != 42 { 36 | test.Fail() 37 | } 38 | } 39 | for i := N / 2; i < N; i++ { 40 | if a[i] != 0 { 41 | test.Fail() 42 | } 43 | } 44 | //fmt.Println(a) 45 | } 46 | 47 | // Integer division rounded up. 48 | func DivUp(x, y int) int { 49 | return ((x - 1) / y) + 1 50 | } 51 | -------------------------------------------------------------------------------- /cu/peer.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA unified addressing. 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Make allocations from the peer Context available to the current context. 
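// Rough sketch (dev0, dev1 and peerCtx are hypothetical names; peer access only
// works when DeviceCanAccessPeer reports true):
//
//	if DeviceCanAccessPeer(dev0, dev1) {
//		CtxEnablePeerAccess(peerCtx) // peerCtx belongs to dev1, the current context to dev0
//	}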
13 | func CtxEnablePeerAccess(peer Context) { 14 | err := Result(C.cuCtxEnablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))), C.uint(0))) 15 | if err != SUCCESS { 16 | panic(err) 17 | } 18 | } 19 | 20 | // Make allocations from the peer Context available to the current context. 21 | func (peer Context) EnablePeerAccess() { 22 | CtxEnablePeerAccess(peer) 23 | } 24 | 25 | // Reverses CtxEnablePeerAccess(). 26 | func CtxDisablePeerAccess(peer Context) { 27 | err := Result(C.cuCtxDisablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))))) 28 | if err != SUCCESS { 29 | panic(err) 30 | } 31 | } 32 | 33 | // Reverses EnablePeerAccess(). 34 | func (peer Context) DisablePeerAccess() { 35 | CtxDisablePeerAccess(peer) 36 | } 37 | 38 | // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. 39 | func DeviceCanAccessPeer(dev, peer Device) bool { 40 | var canAccessPeer C.int 41 | err := Result(C.cuDeviceCanAccessPeer(&canAccessPeer, C.CUdevice(dev), C.CUdevice(peer))) 42 | if err != SUCCESS { 43 | panic(err) 44 | } 45 | return int(canAccessPeer) != 0 46 | } 47 | 48 | // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. 49 | func (dev Device) CanAccessPeer(peer Device) bool { 50 | return DeviceCanAccessPeer(dev, peer) 51 | } 52 | -------------------------------------------------------------------------------- /cu/result.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file provides access to CUDA driver error statuses (type CUresult). 4 | 5 | //#include 6 | import "C" 7 | import ( 8 | "fmt" 9 | ) 10 | 11 | // CUDA error status. 12 | // CUDA error statuses are not returned by functions but checked and passed to 13 | // panic() when not successful. If desired, they can be caught by 14 | // recover(). 
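// A minimal recover() sketch (illustrative only):
//
//	func tryMemAlloc(n int64) (p DevicePtr, err Result) {
//		defer func() {
//			if r := recover(); r != nil {
//				err = r.(Result) // panics from this package carry a Result
//			}
//		}()
//		return MemAlloc(n), SUCCESS
//	}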
15 | type Result int 16 | 17 | // Message string for the error 18 | func (err Result) String() string { 19 | str, ok := errorString[err] 20 | if !ok { 21 | return "Unknown CUresult: " + fmt.Sprint(int(err)) 22 | } 23 | return str 24 | } 25 | 26 | const ( 27 | SUCCESS Result = C.CUDA_SUCCESS 28 | ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE 29 | ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY 30 | ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED 31 | ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED 32 | ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED 33 | ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED 34 | ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED 35 | ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED 36 | ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE 37 | ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE 38 | ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE 39 | ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT 40 | ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT 41 | ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED 42 | ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED 43 | ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED 44 | ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED 45 | ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU 46 | ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED 47 | ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED 48 | ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY 49 | ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER 50 | ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 51 | ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT 52 | ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE 53 | ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE 54 | ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND 55 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND 56 | ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 57 | ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM 58 | ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE 59 | ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND 60 | ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY 61 | ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED 62 | ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES 63 | ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT 64 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING 65 | ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED 66 | ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED 67 | ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE 68 | ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED 69 | ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT 70 | ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS 71 | ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED 72 | ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED 73 | ERROR_HARDWARE_STACK_ERROR Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR 74 | ERROR_ILLEGAL_INSTRUCTION 
Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION 75 | ERROR_MISALIGNED_ADDRESS Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS 76 | ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE 77 | ERROR_INVALID_PC Result = 718 //C.CUDA_ERROR_INVALID_PC 78 | ERROR_NOT_PERMITTED Result = 800 //C.CUDA_ERROR_NOT_PERMITTED 79 | ERROR_NOT_SUPPORTED Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED 80 | ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN 81 | ) 82 | 83 | // Map with error strings for Result error numbers 84 | var errorString map[Result]string = map[Result]string{ 85 | SUCCESS: "CUDA_SUCCESS", 86 | ERROR_INVALID_VALUE: "CUDA_ERROR_INVALID_VALUE", 87 | ERROR_OUT_OF_MEMORY: "CUDA_ERROR_OUT_OF_MEMORY", 88 | ERROR_NOT_INITIALIZED: "CUDA_ERROR_NOT_INITIALIZED", 89 | ERROR_DEINITIALIZED: "CUDA_ERROR_DEINITIALIZED", 90 | ERROR_PROFILER_DISABLED: "CUDA_ERROR_PROFILER_DISABLED", 91 | ERROR_PROFILER_NOT_INITIALIZED: "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 92 | ERROR_PROFILER_ALREADY_STARTED: "CUDA_ERROR_PROFILER_ALREADY_STARTED", 93 | ERROR_PROFILER_ALREADY_STOPPED: "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 94 | ERROR_NO_DEVICE: "CUDA_ERROR_NO_DEVICE", 95 | ERROR_INVALID_DEVICE: "CUDA_ERROR_INVALID_DEVICE", 96 | ERROR_INVALID_IMAGE: "CUDA_ERROR_INVALID_IMAGE", 97 | ERROR_INVALID_CONTEXT: "CUDA_ERROR_INVALID_CONTEXT", 98 | ERROR_CONTEXT_ALREADY_CURRENT: "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 99 | ERROR_MAP_FAILED: "CUDA_ERROR_MAP_FAILED", 100 | ERROR_UNMAP_FAILED: "CUDA_ERROR_UNMAP_FAILED", 101 | ERROR_ARRAY_IS_MAPPED: "CUDA_ERROR_ARRAY_IS_MAPPED", 102 | ERROR_ALREADY_MAPPED: "CUDA_ERROR_ALREADY_MAPPED", 103 | ERROR_NO_BINARY_FOR_GPU: "CUDA_ERROR_NO_BINARY_FOR_GPU", 104 | ERROR_ALREADY_ACQUIRED: "CUDA_ERROR_ALREADY_ACQUIRED", 105 | ERROR_NOT_MAPPED: "CUDA_ERROR_NOT_MAPPED", 106 | ERROR_NOT_MAPPED_AS_ARRAY: "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 107 | ERROR_NOT_MAPPED_AS_POINTER: "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 108 | ERROR_ECC_UNCORRECTABLE: "CUDA_ERROR_ECC_UNCORRECTABLE", 109 | ERROR_UNSUPPORTED_LIMIT: "CUDA_ERROR_UNSUPPORTED_LIMIT", 110 | ERROR_CONTEXT_ALREADY_IN_USE: "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 111 | ERROR_INVALID_SOURCE: "CUDA_ERROR_INVALID_SOURCE", 112 | ERROR_FILE_NOT_FOUND: "CUDA_ERROR_FILE_NOT_FOUND", 113 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 114 | ERROR_SHARED_OBJECT_INIT_FAILED: "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 115 | ERROR_OPERATING_SYSTEM: "CUDA_ERROR_OPERATING_SYSTEM", 116 | ERROR_INVALID_HANDLE: "CUDA_ERROR_INVALID_HANDLE", 117 | ERROR_NOT_FOUND: "CUDA_ERROR_NOT_FOUND", 118 | ERROR_NOT_READY: "CUDA_ERROR_NOT_READY", 119 | ERROR_LAUNCH_OUT_OF_RESOURCES: "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 120 | ERROR_LAUNCH_TIMEOUT: "CUDA_ERROR_LAUNCH_TIMEOUT", 121 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 122 | ERROR_PEER_ACCESS_ALREADY_ENABLED: "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 123 | ERROR_PEER_ACCESS_NOT_ENABLED: "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 124 | ERROR_PRIMARY_CONTEXT_ACTIVE: "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 125 | ERROR_CONTEXT_IS_DESTROYED: "CUDA_ERROR_CONTEXT_IS_DESTROYED", 126 | ERROR_ASSERT: "CUDA_ERROR_ASSERT", 127 | ERROR_TOO_MANY_PEERS: "CUDA_ERROR_TOO_MANY_PEERS", 128 | ERROR_HOST_MEMORY_ALREADY_REGISTERED: "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 129 | ERROR_HOST_MEMORY_NOT_REGISTERED: "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 130 | ERROR_HARDWARE_STACK_ERROR: "CUDA_ERROR_HARDWARE_STACK_ERROR", 131 | ERROR_ILLEGAL_INSTRUCTION: "CUDA_ERROR_ILLEGAL_INSTRUCTION", 
132 | ERROR_MISALIGNED_ADDRESS: "CUDA_ERROR_MISALIGNED_ADDRESS", 133 | ERROR_INVALID_ADDRESS_SPACE: "CUDA_ERROR_INVALID_ADDRESS_SPACE", 134 | ERROR_INVALID_PC: "CUDA_ERROR_INVALID_PC", 135 | ERROR_LAUNCH_FAILED: "CUDA_ERROR_LAUNCH_FAILED", 136 | ERROR_NOT_PERMITTED: "CUDA_ERROR_NOT_PERMITTED", 137 | ERROR_NOT_SUPPORTED: "CUDA_ERROR_NOT_SUPPORTED", 138 | ERROR_UNKNOWN: "CUDA_ERROR_UNKNOWN"} 139 | -------------------------------------------------------------------------------- /cu/runtimeapi.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements parts of the CUDA runtime api instead of the driver 4 | // api the rest of this package uses. 5 | // It might be useful to move this to a seperate package at some point. 6 | 7 | //#include 8 | import "C" 9 | import "unsafe" 10 | 11 | // Set the device as current. 12 | func SetDevice(device Device) { 13 | err := Result(C.cudaSetDevice(C.int(device))) 14 | if err != SUCCESS { 15 | panic(err) 16 | } 17 | } 18 | 19 | // Reset the state of the current device. 20 | func DeviceReset() { 21 | err := Result(C.cudaDeviceReset()) 22 | if err != SUCCESS { 23 | panic(err) 24 | } 25 | } 26 | 27 | // Set CUDA device flags. 28 | func SetDeviceFlags(flags uint) { 29 | err := Result(C.cudaSetDeviceFlags(C.uint(flags))) 30 | if err != SUCCESS { 31 | panic(err) 32 | } 33 | } 34 | 35 | //Flags for SetDeviceFlasgs 36 | const ( 37 | // The default, decides to yield or not based on active CUDA threads and processors. 38 | DeviceAuto = C.cudaDeviceScheduleAuto 39 | // Actively spin while waiting for device. 40 | DeviceSpin = C.cudaDeviceScheduleSpin 41 | // Yield when waiting. 42 | DeviceYield = C.cudaDeviceScheduleYield 43 | // ScheduleBlockingSync block CPU on sync. 44 | DeviceScheduleBlockingSync = C.cudaDeviceScheduleBlockingSync 45 | // ScheduleBlockingSync block CPU on sync. 
Deprecated since cuda 4.0 46 | DeviceBlockingSync = C.cudaDeviceBlockingSync 47 | // For use with pinned host memory 48 | DeviceMapHost = C.cudaDeviceMapHost 49 | // Do not reduce local memory to try and prevent thrashing 50 | DeviceLmemResizeToMax = C.cudaDeviceLmemResizeToMax 51 | ) 52 | 53 | func Malloc(bytes int64) DevicePtr { 54 | var devptr unsafe.Pointer 55 | err := Result(C.cudaMalloc(&devptr, C.size_t(bytes))) 56 | if err != SUCCESS { 57 | panic(err) 58 | } 59 | return DevicePtr(devptr) 60 | } 61 | 62 | func MallocHost(bytes int64) unsafe.Pointer { 63 | var p unsafe.Pointer 64 | err := Result(C.cudaMallocHost(&p, C.size_t(bytes))) 65 | if err != SUCCESS { 66 | panic(err) 67 | } 68 | return p 69 | } 70 | 71 | func FreeHost(ptr unsafe.Pointer) { 72 | err := Result(C.cudaFreeHost(ptr)) 73 | if err != SUCCESS { 74 | panic(err) 75 | } 76 | } 77 | 78 | // Copies a number of bytes in the direction specified by flags 79 | func MemCpy(dst, src unsafe.Pointer, bytes int64, flags uint) { 80 | err := Result(C.cudaMemcpy(dst, src, C.size_t(bytes), uint32(flags))) 81 | if err != SUCCESS { 82 | panic(err) 83 | } 84 | } 85 | 86 | //Flags for memory copy types 87 | const ( 88 | // Host to Host 89 | HtoH = C.cudaMemcpyHostToHost 90 | // Host to Device 91 | HtoD = C.cudaMemcpyHostToDevice 92 | // Device to Host 93 | DtoH = C.cudaMemcpyDeviceToHost 94 | // Device to Device 95 | DtoD = C.cudaMemcpyDeviceToDevice 96 | // Default, unified virtual address space 97 | Virt = C.cudaMemcpyDefault 98 | ) 99 | -------------------------------------------------------------------------------- /cu/stream.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA streams 4 | 5 | //#include 6 | import "C" 7 | import "unsafe" 8 | 9 | // CUDA stream. 10 | type Stream uintptr 11 | 12 | // Creates an asynchronous stream 13 | func StreamCreate() Stream { 14 | var stream C.CUstream 15 | err := Result(C.cuStreamCreate(&stream, C.uint(0))) // flags has to be zero 16 | if err != SUCCESS { 17 | panic(err) 18 | } 19 | return Stream(uintptr(unsafe.Pointer(stream))) 20 | } 21 | 22 | // Destroys the asynchronous stream 23 | func (stream *Stream) Destroy() { 24 | str := *stream 25 | err := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str))))) 26 | *stream = 0 27 | if err != SUCCESS { 28 | panic(err) 29 | } 30 | } 31 | 32 | // Destroys an asynchronous stream 33 | func StreamDestroy(stream *Stream) { 34 | stream.Destroy() 35 | } 36 | 37 | // Blocks until the stream has completed. 38 | func (stream Stream) Synchronize() { 39 | err := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream))))) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | } 44 | 45 | // Returns Success if all operations have completed, ErrorNotReady otherwise 46 | func (stream Stream) Query() Result { 47 | return Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream))))) 48 | } 49 | 50 | // Returns Success if all operations have completed, ErrorNotReady otherwise 51 | func StreamQuery(stream Stream) Result { 52 | return stream.Query() 53 | } 54 | 55 | // Blocks until the stream has completed. 56 | func StreamSynchronize(stream Stream) { 57 | stream.Synchronize() 58 | } 59 | -------------------------------------------------------------------------------- /cu/testdata/testmodule.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Module to test CUDA module loading and execution. 
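 * (Exercised by module_test.go, which loads the generated PTX with ModuleLoad
 * and runs testMemset through LaunchKernel.)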
3 | * To be compiled with: 4 | * nvcc -ptx testmodule.cu 5 | */ 6 | 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | #define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x ) 13 | 14 | /// Sets the first N elements of array to value. 15 | __global__ void testMemset(float* array, float value, int N){ 16 | int i = threadindex; 17 | if(i < N){ 18 | array[i] = value; 19 | } 20 | } 21 | 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | -------------------------------------------------------------------------------- /cu/testdata/testmodule.ptx: -------------------------------------------------------------------------------- 1 | .version 1.4 2 | .target sm_10, map_f64_to_f32 3 | // compiled with /usr/local/cuda/open64/lib//be 4 | // nvopencc 4.0 built on 2011-02-18 5 | 6 | //----------------------------------------------------------- 7 | // Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T) 8 | //----------------------------------------------------------- 9 | 10 | //----------------------------------------------------------- 11 | // Options: 12 | //----------------------------------------------------------- 13 | // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64 14 | // -O3 (Optimization level) 15 | // -g0 (Debug level) 16 | // -m2 (Report advisories) 17 | //----------------------------------------------------------- 18 | 19 | .file 1 "" 20 | .file 2 "/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu" 21 | .file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h" 22 | .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h" 23 | .file 5 "/usr/local/cuda/bin/../include/host_defines.h" 24 | .file 6 "/usr/local/cuda/bin/../include/builtin_types.h" 25 | .file 7 "/usr/local/cuda/bin/../include/device_types.h" 26 | .file 8 "/usr/local/cuda/bin/../include/driver_types.h" 27 | .file 9 "/usr/local/cuda/bin/../include/surface_types.h" 28 | .file 10 "/usr/local/cuda/bin/../include/texture_types.h" 29 | .file 11 "/usr/local/cuda/bin/../include/vector_types.h" 30 | .file 12 "/usr/local/cuda/bin/../include/device_launch_parameters.h" 31 | .file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h" 32 | .file 14 "/usr/include/bits/types.h" 33 | .file 15 "/usr/include/time.h" 34 | .file 16 "testmodule.cu" 35 | .file 17 "/usr/local/cuda/bin/../include/common_functions.h" 36 | .file 18 "/usr/local/cuda/bin/../include/math_functions.h" 37 | .file 19 "/usr/local/cuda/bin/../include/math_constants.h" 38 | .file 20 "/usr/local/cuda/bin/../include/device_functions.h" 39 | .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" 40 | .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" 41 | .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" 42 | .file 24 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h" 43 | .file 25 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h" 44 | .file 26 "/usr/local/cuda/bin/../include/surface_functions.h" 45 | .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" 46 | .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h" 47 | 48 | 49 | .entry testMemset ( 50 | .param .u64 __cudaparm_testMemset_array, 51 | .param .f32 __cudaparm_testMemset_value, 52 | .param .s32 __cudaparm_testMemset_N) 53 | { 54 | .reg .u16 %rh<4>; 55 | .reg .u32 %r<10>; 56 | .reg .u64 %rd<6>; 57 | .reg .f32 %f<3>; 58 | .reg .pred %p<3>; 59 | .loc 16 7 0 60 | $LDWbegin_testMemset: 61 | mov.u16 %rh1, %nctaid.x; 62 | mov.u16 %rh2, %ctaid.y; 63 | 
mul.wide.u16 %r1, %rh1, %rh2; 64 | cvt.u32.u16 %r2, %ctaid.x; 65 | add.u32 %r3, %r2, %r1; 66 | cvt.u32.u16 %r4, %ntid.x; 67 | mul.lo.u32 %r5, %r4, %r3; 68 | cvt.u32.u16 %r6, %tid.x; 69 | add.u32 %r7, %r6, %r5; 70 | ld.param.s32 %r8, [__cudaparm_testMemset_N]; 71 | setp.le.s32 %p1, %r8, %r7; 72 | @%p1 bra $Lt_0_1026; 73 | .loc 16 10 0 74 | ld.param.f32 %f1, [__cudaparm_testMemset_value]; 75 | ld.param.u64 %rd1, [__cudaparm_testMemset_array]; 76 | cvt.s64.s32 %rd2, %r7; 77 | mul.wide.s32 %rd3, %r7, 4; 78 | add.u64 %rd4, %rd1, %rd3; 79 | st.global.f32 [%rd4+0], %f1; 80 | $Lt_0_1026: 81 | .loc 16 12 0 82 | exit; 83 | $LDWend_testMemset: 84 | } // testMemset 85 | 86 | -------------------------------------------------------------------------------- /cu/version.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver version management 4 | 5 | //#include 6 | import "C" 7 | 8 | // Returns the CUDA driver version. 9 | func Version() int { 10 | var version C.int 11 | err := Result(C.cuDriverGetVersion(&version)) 12 | if err != SUCCESS { 13 | panic(err) 14 | } 15 | return int(version) 16 | } 17 | -------------------------------------------------------------------------------- /cu/version_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestVersion(t *testing.T) { 9 | fmt.Println("CUDA driver version: ", Version()) 10 | } 11 | -------------------------------------------------------------------------------- /cuda/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/cu > README 34 | -------------------------------------------------------------------------------- /cuda/README: -------------------------------------------------------------------------------- 1 | PACKAGE 2 | 3 | package cu 4 | import "github.com/barnex/cuda5/cu" 5 | 6 | Go bindings for the CUDA driver API. 7 | 8 | CONSTANTS 9 | 10 | const ( 11 | // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 12 | CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO 13 | // Spin when waiting for results from the GPU. 14 | CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN 15 | // Yield its thread when waiting for results from the GPU. 16 | CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD 17 | // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. 18 | CTX_BLOCKING_SYNC 19 | // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 20 | CTX_MAP_HOST = C.CU_CTX_MAP_HOST 21 | //Do not reduce local memory after resizing local memory for a kernel. 
22 | CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX 23 | ) 24 | Flags for CtxCreate 25 | const ( 26 | SIZEOF_FLOAT32 = 4 27 | SIZEOF_FLOAT64 = 8 28 | SIZEOF_COMPLEX64 = 8 29 | SIZEOF_COMPLEX128 = 16 30 | ) 31 | Type size in bytes 32 | 33 | 34 | FUNCTIONS 35 | 36 | func CtxDestroy(ctx *Context) 37 | Destroys the CUDA context specified by ctx. If the context usage count 38 | is not equal to 1, or the context is current to any CPU thread other 39 | than the current one, this function fails. Floating contexts (detached 40 | from a CPU thread via cuCtxPopCurrent()) may be destroyed by this 41 | function. 42 | 43 | func CtxDisablePeerAccess(peer Context) 44 | Reverses CtxEnablePeerAccess(). 45 | 46 | func CtxEnablePeerAccess(peer Context) 47 | Make allocations from the peer Context available to the current context. 48 | 49 | func CtxGetApiVersion(ctx Context) (version int) 50 | Returns the API version to create the context. 51 | 52 | func CtxSetCurrent(ctx Context) 53 | Sets the current active context. 54 | 55 | func CtxSynchronize() 56 | Blocks until the device has completed all preceding requested tasks, if 57 | the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. 58 | 59 | func DeviceCanAccessPeer(dev, peer Device) bool 60 | Returns true if CtxEnablePeerAccess can be called on a context for dev 61 | and peerDev. 62 | 63 | func DeviceComputeCapability(device Device) (major, minor int) 64 | Returns the compute capability of the device. 65 | 66 | func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int 67 | Gets the value of a device attribute. 68 | 69 | func DeviceGetCount() int 70 | Returns the number of devices with compute capability greater than or 71 | equal to 1.0 that are available for execution. 72 | 73 | func DeviceGetName(dev Device) string 74 | Gets the name of the device. 75 | 76 | func DeviceTotalMem(device Device) int64 77 | Returns the total amount of memory available on the device in bytes. 78 | 79 | func FuncGetAttribute(attrib FunctionAttribute, function Function) int 80 | 81 | func Init(flags int) 82 | Initialize the CUDA driver API. Currently, flags must be 0. If Init() 83 | has not been called, any function from the driver API will panic with 84 | ERROR_NOT_INITIALIZED. 85 | 86 | func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) 87 | 88 | func MemAllocHost(bytes int64) unsafe.Pointer 89 | 90 | func MemFree(ptr *DevicePtr) 91 | Frees device memory allocated by MemAlloc(). Overwrites the pointer with 92 | NULL. It is safe to double-free. 93 | 94 | func MemFreeHost(ptr unsafe.Pointer) 95 | 96 | func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) 97 | Returns the base address and size of the allocation (by MemAlloc) that 98 | contains the input pointer ptr. 99 | 100 | func MemGetInfo() (free, total int64) 101 | Returns the free and total amount of memroy in the current Context (in 102 | bytes). 103 | 104 | func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) 105 | Page-locks memory specified by the pointer and bytes. The pointer and 106 | byte size must be aligned to the host page size (4KB) See also: 107 | MemHostUnregister() 108 | 109 | func MemHostUnregister(ptr unsafe.Pointer) 110 | Unmaps memory locked by MemHostRegister(). 111 | 112 | func Memcpy(dst, src DevicePtr, bytes int64) 113 | Copies a number of bytes on the current device. Requires unified 114 | addressing to be supported. 
See also: MemcpyDtoD(). TODO(a): is actually 115 | an auto copy for device and/or host memory 116 | 117 | func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) 118 | Asynchronously copies a number of bytes on the current device. 119 | 120 | func MemcpyDtoD(dst, src DevicePtr, bytes int64) 121 | Copies a number of bytes from host to device. 122 | 123 | func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) 124 | Asynchronously copies a number of bytes from host to device. 125 | 126 | func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) 127 | Copies a number of bytes from device to host. 128 | 129 | func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) 130 | Asynchronously copies a number of bytes device host to host. The host 131 | memory must be page-locked (see MemRegister) 132 | 133 | func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) 134 | Copies a number of bytes from host to device. 135 | 136 | func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) 137 | Asynchronously copies a number of bytes from host to device. The host 138 | memory must be page-locked (see MemRegister) 139 | 140 | func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) 141 | Copies from device memory in one context (device) to another. 142 | 143 | func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) 144 | Asynchronously copies from device memory in one context (device) to 145 | another. 146 | 147 | func MemsetD32(deviceptr DevicePtr, value uint32, N int64) 148 | Sets the first N 32-bit values of dst array to value. Asynchronous. 149 | 150 | func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) 151 | Asynchronously sets the first N 32-bit values of dst array to value. 152 | 153 | func MemsetD8(deviceptr DevicePtr, value uint8, N int64) 154 | Sets the first N 8-bit values of dst array to value. Asynchronous. 155 | 156 | func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) 157 | Asynchronously sets the first N 32-bit values of dst array to value. 158 | 159 | func StreamDestroy(stream *Stream) 160 | Destroys an asynchronous stream 161 | 162 | func StreamSynchronize(stream Stream) 163 | Blocks until the stream has completed. 164 | 165 | func Version() int 166 | Returns the CUDA driver version. 167 | 168 | 169 | TYPES 170 | 171 | type Context uintptr 172 | CUDA context. 173 | 174 | func CtxCreate(flags uint, dev Device) Context 175 | Create a CUDA context. 176 | 177 | func CtxGetCurrent() Context 178 | Gets the current active context. 179 | 180 | func (ctx Context) ApiVersion() (version int) 181 | Returns the API version to create the context. 182 | 183 | func (ctx *Context) Destroy() 184 | Destroys the CUDA context. 185 | 186 | func (peer Context) DisablePeerAccess() 187 | Reverses EnablePeerAccess(). 188 | 189 | func (peer Context) EnablePeerAccess() 190 | Make allocations from the peer Context available to the current context. 191 | 192 | func (ctx Context) SetCurrent() 193 | Sets the current active context. 
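    A typical start-up sketch (mirrors the init used in cufft/init_test.go;
    assumes at least one CUDA-capable device):

        cu.Init(0)
        ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0)
        cu.CtxSetCurrent(ctx)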
194 | 195 | type DevProp struct { 196 | MaxThreadsPerBlock int 197 | MaxThreadsDim [3]int 198 | MaxGridSize [3]int 199 | SharedMemPerBlock int 200 | TotalConstantMemory int 201 | SIMDWidth int 202 | MemPitch int 203 | RegsPerBlock int 204 | ClockRate int 205 | TextureAlign int 206 | } 207 | Device properties 208 | 209 | func DeviceGetProperties(dev Device) (prop DevProp) 210 | Returns the device's properties. 211 | 212 | type Device int 213 | CUDA Device number. 214 | 215 | func CtxGetDevice() Device 216 | Returns the ordinal of the current context's device. 217 | 218 | func DeviceGet(ordinal int) Device 219 | Returns in a device handle given an ordinal in the range [0, 220 | DeviceGetCount()-1]. 221 | 222 | func (dev Device) Attribute(attrib DeviceAttribute) int 223 | Gets the value of a device attribute. 224 | 225 | func (dev Device) CanAccessPeer(peer Device) bool 226 | Returns true if CtxEnablePeerAccess can be called on a context for dev 227 | and peerDev. 228 | 229 | func (device Device) ComputeCapability() (major, minor int) 230 | Returns the compute capability of the device. 231 | 232 | func (dev Device) Name() string 233 | Gets the name of the device. 234 | 235 | func (dev Device) Properties() DevProp 236 | Returns the device's properties. 237 | 238 | func (device Device) TotalMem() int64 239 | Returns the total amount of memory available on the device in bytes. 240 | 241 | type DeviceAttribute int 242 | 243 | const ( 244 | MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block 245 | MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X 246 | MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y 247 | MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z 248 | MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X 249 | MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y 250 | MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z 251 | MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes 252 | TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes 253 | WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads 254 | MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies 255 | MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block 256 | CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz 257 | TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures 258 | MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device 259 | KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels 260 | INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 261 | CAN_MAP_HOST_MEMORY DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space 262 | COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) 263 | MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width 264 | MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width 265 | MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height 266 | MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width 267 | MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height 268 | MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth 269 | MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width 270 | MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height 271 | MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture 272 | SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces 273 | CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently 274 | ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled 275 | PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device 276 | PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device 277 | TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model 278 | MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz 279 | GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits 280 | L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes 281 | MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor 282 | ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines 283 | UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host 284 | MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width 285 | MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture 286 | ) 287 | 288 | type DevicePtr uintptr 289 | 290 | func MemAlloc(bytes int64) DevicePtr 291 | Allocates a number of bytes of device memory. 292 | 293 | func (ptr DevicePtr) Bytes() (bytes int64) 294 | Returns the size of the allocation (by MemAlloc) that contains the input 295 | pointer ptr. 296 | 297 | func (ptr *DevicePtr) Free() 298 | Frees device memory allocated by MemAlloc(). 
Overwrites the pointer with 299 | NULL. It is safe to double-free. 300 | 301 | func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) 302 | Returns the base address and size of the allocation (by MemAlloc) that 303 | contains the input pointer ptr. 304 | 305 | func (ptr DevicePtr) MemoryType() MemoryType 306 | Returns the physical memory type that ptr addresses. 307 | 308 | func (p DevicePtr) String() string 309 | 310 | type Dim3 struct { 311 | X, Y, Z int 312 | } 313 | 314 | type Function uintptr 315 | Represents a CUDA CUfunction, a reference to a function within a module. 316 | 317 | func ModuleGetFunction(module Module, name string) Function 318 | Returns a Function handle. 319 | 320 | func (f Function) GetAttribute(attrib FunctionAttribute) int 321 | 322 | type FunctionAttribute int 323 | 324 | const ( 325 | FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. 326 | FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. 327 | FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. 328 | FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. 329 | FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. 330 | FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. 331 | FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. 332 | ) 333 | 334 | type MemHostRegisterFlag int 335 | 336 | const ( 337 | // Memory is pinned in all CUDA contexts. 338 | MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE 339 | // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() 340 | MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP 341 | ) 342 | Flag for MemHostRegister 343 | 344 | type MemoryType uint 345 | Physical memory type of device pointer. 346 | 347 | const ( 348 | MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST 349 | MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE 350 | MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY 351 | MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED 352 | ) 353 | 354 | func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) 355 | Returns the physical memory type that ptr addresses. 356 | 357 | func (t MemoryType) String() string 358 | 359 | type Module uintptr 360 | Represents a CUDA CUmodule, a reference to executable device code. 361 | 362 | func ModuleLoad(fname string) Module 363 | Loads a compute module from file 364 | 365 | func ModuleLoadData(image string) Module 366 | Loads a compute module from string 367 | 368 | func (m Module) GetFunction(name string) Function 369 | Returns a Function handle. 370 | 371 | type Result int 372 | CUDA error status. CUDA error statuses are not returned by functions but 373 | checked and passed to panic() when not successful. If desired, they can 374 | be caught by recover(). 
375 | 376 | const ( 377 | SUCCESS Result = C.CUDA_SUCCESS 378 | ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE 379 | ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY 380 | ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED 381 | ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED 382 | ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED 383 | ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED 384 | ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED 385 | ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED 386 | ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE 387 | ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE 388 | ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE 389 | ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT 390 | ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT 391 | ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED 392 | ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED 393 | ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED 394 | ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED 395 | ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU 396 | ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED 397 | ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED 398 | ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY 399 | ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER 400 | ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 401 | ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT 402 | ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE 403 | ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE 404 | ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND 405 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND 406 | ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 407 | ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM 408 | ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE 409 | ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND 410 | ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY 411 | ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED 412 | ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES 413 | ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT 414 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING 415 | ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED 416 | ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED 417 | ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE 418 | ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED 419 | ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT 420 | ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS 421 | ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED 422 | ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED 423 | ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN 424 | ) 425 | 426 | func StreamQuery(stream Stream) Result 427 | Returns Success if all operations have completed, ErrorNotReady 428 | otherwise 429 | 430 | func (err Result) String() string 431 | Message string for the error 432 | 433 | type Stream 
uintptr 434 | CUDA stream. 435 | 436 | func StreamCreate() Stream 437 | Creates an asynchronous stream 438 | 439 | func (stream *Stream) Destroy() 440 | Destroys the asynchronous stream 441 | 442 | func (stream Stream) Query() Result 443 | Returns Success if all operations have completed, ErrorNotReady 444 | otherwise 445 | 446 | func (stream Stream) Synchronize() 447 | Blocks until the stream has completed. 448 | 449 | 450 | -------------------------------------------------------------------------------- /cuda/cgoflags.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | // This file provides CGO flags. 4 | 5 | import "C" 6 | 7 | //#cgo LDFLAGS:-lcudart 8 | // 9 | ////default location: 10 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 11 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 12 | // 13 | ////default location if not properly symlinked: 14 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 15 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 16 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 17 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 18 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 19 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 20 | // 21 | ////arch linux: 22 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 23 | //#cgo CFLAGS: -I/opt/cuda/include 24 | // 25 | ////WINDOWS: 26 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 27 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include 28 | import "C" 29 | -------------------------------------------------------------------------------- /cuda/device.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | //#include 4 | //#include 5 | import "C" 6 | 7 | import ( 8 | "github.com/barnex/cuda5/cu" 9 | ) 10 | 11 | // Reset the current GPU device. 12 | func DeviceReset() { 13 | err := cu.Result(C.cudaDeviceReset()) 14 | if err != cu.SUCCESS { 15 | panic(err) 16 | } 17 | } 18 | 19 | // Set preference for more cache or shared memory. 20 | func DeviceSetCacheConfig(cacheConfig FuncCache) { 21 | err := cu.Result(C.cudaDeviceSetCacheConfig(uint32(cacheConfig))) 22 | if err != cu.SUCCESS { 23 | panic(err) 24 | } 25 | } 26 | 27 | // Cache preference option. 28 | type FuncCache int 29 | 30 | const ( 31 | FUNC_CACHE_PREFER_NONE FuncCache = C.CU_FUNC_CACHE_PREFER_NONE 32 | FUNC_CACHE_PREFER_SHARED FuncCache = C.CU_FUNC_CACHE_PREFER_SHARED 33 | FUNC_CACHE_PREFER_L1 FuncCache = C.CU_FUNC_CACHE_PREFER_L1 34 | FUNC_CACHE_PREFER_EQUAL FuncCache = C.CU_FUNC_CACHE_PREFER_EQUAL 35 | ) 36 | -------------------------------------------------------------------------------- /cufft/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. 
-compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/cufft > README 34 | -------------------------------------------------------------------------------- /cufft/README: -------------------------------------------------------------------------------- 1 | PACKAGE DOCUMENTATION 2 | 3 | package cufft 4 | import "github.com/barnex/cuda5/cufft" 5 | 6 | Go bindings for the CUDA CUFFT API. 7 | 8 | 9 | CONSTANTS 10 | 11 | const ( 12 | FORWARD = -1 // Forward FFT 13 | INVERSE = 1 // Inverse FFT 14 | ) 15 | 16 | 17 | TYPES 18 | 19 | type CompatibilityMode int 20 | CUFFT compatibility mode 21 | 22 | const ( 23 | COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE 24 | COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING 25 | COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC 26 | COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL 27 | ) 28 | 29 | 30 | func (t CompatibilityMode) String() string 31 | 32 | 33 | type Handle uintptr 34 | FFT plan handle, reference type to a plan 35 | 36 | 37 | func Plan1d(nx int, typ Type, batch int) Handle 38 | 1D FFT plan 39 | 40 | 41 | func Plan2d(nx, ny int, typ Type) Handle 42 | 2D FFT plan 43 | 44 | 45 | func Plan3d(nx, ny, nz int, typ Type) Handle 46 | 3D FFT plan 47 | 48 | 49 | func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle 50 | 1D,2D or 3D FFT plan 51 | 52 | 53 | func (plan *Handle) Destroy() 54 | Destroys the plan. 55 | 56 | func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) 57 | Execute Complex-to-Complex plan 58 | 59 | func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) 60 | Execute Complex-to-Real plan 61 | 62 | func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) 63 | Execute Double Real-to-Complex plan 64 | 65 | func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) 66 | Execute Real-to-Complex plan 67 | 68 | func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) 69 | Execute Double Complex-to-Real plan 70 | 71 | func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) 72 | Execute Double Complex-to-Complex plan 73 | 74 | func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) 75 | Sets the FFTW compatibility mode 76 | 77 | func (plan Handle) SetStream(stream cu.Stream) 78 | Sets the cuda stream for this plan 79 | 80 | 81 | type Result int 82 | FFT result 83 | 84 | const ( 85 | SUCCESS Result = C.CUFFT_SUCCESS 86 | INVALID_PLAN Result = C.CUFFT_INVALID_PLAN 87 | ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED 88 | INVALID_TYPE Result = C.CUFFT_INVALID_TYPE 89 | INVALID_VALUE Result = C.CUFFT_INVALID_VALUE 90 | INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR 91 | EXEC_FAILED Result = C.CUFFT_EXEC_FAILED 92 | SETUP_FAILED Result = C.CUFFT_SETUP_FAILED 93 | INVALID_SIZE Result = C.CUFFT_INVALID_SIZE 94 | UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA 95 | ) 96 | FFT result value 97 | 98 | 99 | func (r Result) String() string 100 | 101 | 102 | type Type int 103 | FFT type 104 | 105 | const ( 106 | R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) 107 | C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real 108 | C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved 109 | D2Z Type = C.CUFFT_D2Z // Double to Double-Complex 110 | Z2D Type = C.CUFFT_Z2D // Double-Complex to Double 111 | Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex 112 | ) 113 | 114 | 115 | func (t Type) String() string 116 | 117 | 
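A minimal real-to-complex sketch (illustrative; assumes a current CUDA context
and device buffers devIn/devOut of suitable sizes; see ExampleFFT1D in
fft_test.go for a complete, runnable version):

    plan := cufft.Plan1d(n, cufft.R2C, 1)
    defer plan.Destroy()
    plan.ExecR2C(devIn, devOut)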
118 | 119 | -------------------------------------------------------------------------------- /cufft/cgoflags.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | // This file provides CGO flags to find CUDA libraries and headers. 4 | 5 | //#cgo LDFLAGS:-lcufft 6 | // 7 | ////default location: 8 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 9 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 10 | // 11 | ////default location if not properly symlinked: 12 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 13 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 14 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 15 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 16 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 17 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 18 | // 19 | ////arch linux: 20 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 21 | //#cgo CFLAGS: -I/opt/cuda/include 22 | // 23 | ////WINDOWS: 24 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 25 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w 26 | import "C" 27 | -------------------------------------------------------------------------------- /cufft/doc.go: -------------------------------------------------------------------------------- 1 | // Go bindings for the CUDA CUFFT API. 2 | package cufft 3 | -------------------------------------------------------------------------------- /cufft/fft_test.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | "unsafe" 7 | ) 8 | 9 | func ExampleFFT1D() { 10 | N := 8 11 | 12 | hostIn := make([]float32, N) 13 | hostIn[0] = 1 14 | 15 | devIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32) 16 | defer cu.MemFree(&devIn) 17 | cu.MemcpyHtoD(devIn, unsafe.Pointer(&hostIn[0]), devIn.Bytes()) 18 | 19 | hostOut := make([]complex64, N/2+1) 20 | devOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64) 21 | defer cu.MemFree(&devOut) 22 | 23 | plan := Plan1d(N, R2C, 1) 24 | defer plan.Destroy() 25 | plan.ExecR2C(devIn, devOut) 26 | 27 | cu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), devOut, devOut.Bytes()) 28 | 29 | fmt.Println("hostIn:", hostIn) 30 | fmt.Println("hostOut:", hostOut) 31 | 32 | // Output: 33 | // hostIn: [1 0 0 0 0 0 0 0] 34 | // hostOut: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] 35 | } 36 | -------------------------------------------------------------------------------- /cufft/init_test.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | ) 7 | 8 | // needed for all other tests. 
9 | func init() { 10 | cu.Init(0) 11 | ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0) 12 | cu.CtxSetCurrent(ctx) 13 | fmt.Println("Created CUDA context") 14 | } 15 | -------------------------------------------------------------------------------- /cufft/mode.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | // CUFFT compatibility mode 11 | type CompatibilityMode int 12 | 13 | const ( 14 | COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE 15 | COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING 16 | COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC 17 | COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL 18 | ) 19 | 20 | func (t CompatibilityMode) String() string { 21 | if str, ok := compatibilityModeString[t]; ok { 22 | return str 23 | } 24 | return fmt.Sprint("CUFFT Compatibility mode with unknown number:", int(t)) 25 | } 26 | 27 | var compatibilityModeString map[CompatibilityMode]string = map[CompatibilityMode]string{ 28 | COMPATIBILITY_NATIVE: "CUFFT_COMPATIBILITY_NATIVE", 29 | COMPATIBILITY_FFTW_PADDING: "CUFFT_COMPATIBILITY_FFTW_PADDING", 30 | COMPATIBILITY_FFTW_ASYMMETRIC: "CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC", 31 | COMPATIBILITY_FFTW_ALL: "CUFFT_COMPATIBILITY_FFTW_ALL"} 32 | -------------------------------------------------------------------------------- /cufft/plan.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved. 2 | // Use of this source code is governed by a freeBSD 3 | // license that can be found in the LICENSE.txt file. 
4 | 5 | package cufft 6 | 7 | //#include 8 | import "C" 9 | 10 | import ( 11 | "github.com/barnex/cuda5/cu" 12 | "unsafe" 13 | ) 14 | 15 | // FFT plan handle, reference type to a plan 16 | type Handle uintptr 17 | 18 | // 1D FFT plan 19 | func Plan1d(nx int, typ Type, batch int) Handle { 20 | var handle C.cufftHandle 21 | err := Result(C.cufftPlan1d( 22 | &handle, 23 | C.int(nx), 24 | C.cufftType(typ), 25 | C.int(batch))) 26 | if err != SUCCESS { 27 | panic(err) 28 | } 29 | return Handle(handle) 30 | } 31 | 32 | // 2D FFT plan 33 | func Plan2d(nx, ny int, typ Type) Handle { 34 | var handle C.cufftHandle 35 | err := Result(C.cufftPlan2d( 36 | &handle, 37 | C.int(nx), 38 | C.int(ny), 39 | C.cufftType(typ))) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | return Handle(handle) 44 | } 45 | 46 | // 3D FFT plan 47 | func Plan3d(nx, ny, nz int, typ Type) Handle { 48 | var handle C.cufftHandle 49 | err := Result(C.cufftPlan3d( 50 | &handle, 51 | C.int(nx), 52 | C.int(ny), 53 | C.int(nz), 54 | C.cufftType(typ))) 55 | if err != SUCCESS { 56 | panic(err) 57 | } 58 | return Handle(handle) 59 | } 60 | 61 | //cufftPlanMany( 62 | // cufftHandle *plan, int rank, int *n, int *inembed, 63 | // int istride, int idist, int *onembed, int ostride, 64 | // int odist, cufftType type, int batch ); 65 | 66 | // 1D,2D or 3D FFT plan 67 | func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle { 68 | var handle C.cufftHandle 69 | 70 | NULL := (*C.int)(unsafe.Pointer(uintptr(0))) 71 | 72 | inembedptr := NULL 73 | idist := 0 74 | if inembed != nil { 75 | inembedptr = (*C.int)(unsafe.Pointer(&inembed[0])) 76 | idist = inembed[0] 77 | } 78 | 79 | oembedptr := NULL 80 | odist := 0 81 | if oembed != nil { 82 | oembedptr = (*C.int)(unsafe.Pointer(&oembed[0])) 83 | odist = oembed[0] 84 | } 85 | 86 | err := Result(C.cufftPlanMany( 87 | &handle, 88 | C.int(len(n)), // rank 89 | (*C.int)(unsafe.Pointer(&n[0])), // n 90 | inembedptr, 91 | C.int(istride), 92 | C.int(idist), 93 | oembedptr, 94 | C.int(ostride), 95 | C.int(odist), 96 | C.cufftType(typ), 97 | C.int(batch))) 98 | if err != SUCCESS { 99 | panic(err) 100 | } 101 | return Handle(handle) 102 | } 103 | 104 | // Execute Complex-to-Complex plan 105 | func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) { 106 | err := Result(C.cufftExecC2C( 107 | C.cufftHandle(plan), 108 | (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), 109 | (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))), 110 | C.int(direction))) 111 | if err != SUCCESS { 112 | panic(err) 113 | } 114 | } 115 | 116 | // Execute Real-to-Complex plan 117 | func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) { 118 | err := Result(C.cufftExecR2C( 119 | C.cufftHandle(plan), 120 | (*C.cufftReal)(unsafe.Pointer(uintptr(idata))), 121 | (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))))) 122 | if err != SUCCESS { 123 | panic(err) 124 | } 125 | } 126 | 127 | // Execute Complex-to-Real plan 128 | func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) { 129 | err := Result(C.cufftExecC2R( 130 | C.cufftHandle(plan), 131 | (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), 132 | (*C.cufftReal)(unsafe.Pointer(uintptr(odata))))) 133 | if err != SUCCESS { 134 | panic(err) 135 | } 136 | } 137 | 138 | // Execute Double Complex-to-Complex plan 139 | func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) { 140 | err := Result(C.cufftExecZ2Z( 141 | C.cufftHandle(plan), 142 | (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), 143 | 
(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))), 144 | C.int(direction))) 145 | if err != SUCCESS { 146 | panic(err) 147 | } 148 | } 149 | 150 | // Execute Double Real-to-Complex plan 151 | func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) { 152 | err := Result(C.cufftExecD2Z( 153 | C.cufftHandle(plan), 154 | (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata))), 155 | (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))))) 156 | if err != SUCCESS { 157 | panic(err) 158 | } 159 | } 160 | 161 | // Execute Double Complex-to-Real plan 162 | func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) { 163 | err := Result(C.cufftExecZ2D( 164 | C.cufftHandle(plan), 165 | (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), 166 | (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata))))) 167 | if err != SUCCESS { 168 | panic(err) 169 | } 170 | } 171 | 172 | // Destroys the plan. 173 | func (plan *Handle) Destroy() { 174 | err := Result(C.cufftDestroy(C.cufftHandle(*plan))) 175 | *plan = 0 // make sure plan is not used anymore 176 | if err != SUCCESS { 177 | panic(err) 178 | } 179 | } 180 | 181 | // Sets the cuda stream for this plan 182 | func (plan Handle) SetStream(stream cu.Stream) { 183 | err := Result(C.cufftSetStream( 184 | C.cufftHandle(plan), 185 | C.cudaStream_t(unsafe.Pointer(uintptr(stream))))) 186 | if err != SUCCESS { 187 | panic(err) 188 | } 189 | } 190 | 191 | // Sets the FFTW compatibility mode 192 | func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) { 193 | err := Result(C.cufftSetCompatibilityMode( 194 | C.cufftHandle(plan), 195 | C.cufftCompatibility(mode))) 196 | if err != SUCCESS { 197 | panic(err) 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /cufft/result.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | // FFT result 11 | type Result int 12 | 13 | // FFT result value 14 | const ( 15 | SUCCESS Result = C.CUFFT_SUCCESS 16 | INVALID_PLAN Result = C.CUFFT_INVALID_PLAN 17 | ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED 18 | INVALID_TYPE Result = C.CUFFT_INVALID_TYPE 19 | INVALID_VALUE Result = C.CUFFT_INVALID_VALUE 20 | INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR 21 | EXEC_FAILED Result = C.CUFFT_EXEC_FAILED 22 | SETUP_FAILED Result = C.CUFFT_SETUP_FAILED 23 | INVALID_SIZE Result = C.CUFFT_INVALID_SIZE 24 | UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA 25 | INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h 26 | INVALID_DEVICE Result = 0xB 27 | PARSE_ERROR Result = 0xC 28 | NO_WORKSPACE Result = 0xD 29 | ) 30 | 31 | func (r Result) String() string { 32 | if str, ok := resultString[r]; ok { 33 | return str 34 | } 35 | return fmt.Sprint("CUFFT Result with unknown error number:", int(r)) 36 | } 37 | 38 | var resultString map[Result]string = map[Result]string{ 39 | SUCCESS: "CUFFT_SUCCESS", 40 | INVALID_PLAN: "CUFFT_INVALID_PLAN", 41 | ALLOC_FAILED: "CUFFT_ALLOC_FAILED", 42 | INVALID_TYPE: "CUFFT_INVALID_TYPE", 43 | INVALID_VALUE: "CUFFT_INVALID_VALUE", 44 | INTERNAL_ERROR: "CUFFT_INTERNAL_ERROR", 45 | EXEC_FAILED: "CUFFT_EXEC_FAILED", 46 | SETUP_FAILED: "CUFFT_SETUP_FAILED", 47 | INVALID_SIZE: "CUFFT_INVALID_SIZE", 48 | UNALIGNED_DATA: "CUFFT_UNALIGNED_DATA", 49 | INCOMPLETE_PARAMETER_LIST: "CUFFT_INCOMPLETE_PARAMETER_LIST", 50 | INVALID_DEVICE: "CUFFT_INVALID_DEVICE", 51 | PARSE_ERROR: "CUFFT_PARSE_ERROR", 52 | NO_WORKSPACE: 
"CUFFT_NO_WORKSPACE"} 53 | -------------------------------------------------------------------------------- /cufft/type.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | // FFT type 11 | type Type int 12 | 13 | const ( 14 | R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) 15 | C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real 16 | C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved 17 | D2Z Type = C.CUFFT_D2Z // Double to Double-Complex 18 | Z2D Type = C.CUFFT_Z2D // Double-Complex to Double 19 | Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex 20 | ) 21 | 22 | const ( 23 | FORWARD = -1 // Forward FFT 24 | INVERSE = 1 // Inverse FFT 25 | ) 26 | 27 | func (t Type) String() string { 28 | if str, ok := typeString[t]; ok { 29 | return str 30 | } 31 | return fmt.Sprint("CUFFT Type with unknown number:", int(t)) 32 | } 33 | 34 | var typeString map[Type]string = map[Type]string{ 35 | R2C: "CUFFT_R2C", 36 | C2R: "CUFFT_C2R", 37 | C2C: "CUFFT_C2C", 38 | D2Z: "CUFFT_D2Z", 39 | Z2D: "CUFFT_Z2D", 40 | Z2Z: "CUFFT_Z2Z"} 41 | -------------------------------------------------------------------------------- /curand/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. 
-compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/curand > README 34 | -------------------------------------------------------------------------------- /curand/README: -------------------------------------------------------------------------------- 1 | PACKAGE DOCUMENTATION 2 | 3 | package curand 4 | import "github.com/barnex/cuda5/curand" 5 | 6 | 7 | 8 | TYPES 9 | 10 | type Generator uintptr 11 | 12 | 13 | func CreateGenerator(rngType RngType) Generator 14 | 15 | 16 | func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) 17 | 18 | func (g Generator) SetSeed(seed int64) 19 | 20 | 21 | type RngType int 22 | 23 | const ( 24 | PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator 25 | PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator 26 | QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator 27 | QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator 28 | QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator 29 | QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator 30 | QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator 31 | ) 32 | 33 | 34 | 35 | type Status int 36 | 37 | const ( 38 | SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors 39 | VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match 40 | NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized 41 | ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed 42 | TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type 43 | OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range 44 | LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension 45 | LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure 46 | PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry 47 | INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed 48 | ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature 49 | INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error 50 | ) 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /curand/cgoflags.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | // This file provides CGO flags to find CUDA libraries and headers. 
4 | 5 | //#cgo LDFLAGS:-lcurand 6 | // 7 | ////default location: 8 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 9 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 10 | // 11 | ////default location if not properly symlinked: 12 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 13 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 14 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 15 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 16 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 17 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 18 | // 19 | ////arch linux: 20 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 21 | //#cgo CFLAGS: -I/opt/cuda/include 22 | // 23 | ////WINDOWS: 24 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 25 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w 26 | import "C" 27 | -------------------------------------------------------------------------------- /curand/generator.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "unsafe" 8 | ) 9 | 10 | type Generator uintptr 11 | 12 | type RngType int 13 | 14 | const ( 15 | PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator 16 | PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator 17 | QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator 18 | QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator 19 | QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator 20 | QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator 21 | QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator 22 | ) 23 | 24 | func CreateGenerator(rngType RngType) Generator { 25 | var rng C.curandGenerator_t 26 | err := Status(C.curandCreateGenerator(&rng, C.curandRngType_t(rngType))) 27 | if err != SUCCESS { 28 | panic(err) 29 | } 30 | return Generator(uintptr(unsafe.Pointer(rng))) // cgo 31 | } 32 | 33 | func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) { 34 | err := Status(C.curandGenerateNormal( 35 | C.curandGenerator_t(unsafe.Pointer(uintptr(g))), 36 | (*C.float)(unsafe.Pointer(output)), 37 | C.size_t(n), 38 | C.float(mean), 39 | C.float(stddev))) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | } 44 | 45 | func (g Generator) SetSeed(seed int64) { 46 | err := Status(C.curandSetPseudoRandomGeneratorSeed(C.curandGenerator_t(unsafe.Pointer(uintptr(g))), _Ctype_ulonglong(seed))) 47 | if err != SUCCESS { 48 | panic(err) 49 | } 50 | } 51 | 52 | // Documentation was taken from the curand headers. 
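A rough usage sketch for the generator above (editorial, not from the package tests; it assumes a context set up as in the cufft tests, and that cu.DevicePtr converts to uintptr, as the other wrappers in this repository assume):

    package main

    import (
        "github.com/barnex/cuda5/cu"
        "github.com/barnex/cuda5/curand"
    )

    func main() {
        cu.Init(0)
        cu.CtxSetCurrent(cu.CtxCreate(cu.CTX_SCHED_AUTO, 0))

        const n = 1024 // CURAND requires an even count for normal generation
        buf := cu.MemAlloc(n * cu.SIZEOF_FLOAT32)
        defer cu.MemFree(&buf)

        g := curand.CreateGenerator(curand.PSEUDO_DEFAULT)
        g.SetSeed(42)
        g.GenerateNormal(uintptr(buf), n, 0, 1) // mean 0, stddev 1, written to device memory
    }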
53 | -------------------------------------------------------------------------------- /curand/status.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | type Status int 11 | 12 | const ( 13 | SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors 14 | VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match 15 | NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized 16 | ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed 17 | TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type 18 | OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range 19 | LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension 20 | LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure 21 | PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry 22 | INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed 23 | ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature 24 | INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error 25 | ) 26 | 27 | func (s Status) String() string { 28 | if str, ok := statusStr[s]; ok { 29 | return str 30 | } else { 31 | return fmt.Sprint("CURAND ERROR NUMBER ", int(s)) 32 | } 33 | } 34 | 35 | var statusStr = map[Status]string{ 36 | SUCCESS: "CURAND_STATUS_SUCCESS", 37 | VERSION_MISMATCH: "CURAND_STATUS_VERSION_MISMATCH", 38 | NOT_INITIALIZED: "CURAND_STATUS_NOT_INITIALIZED", 39 | ALLOCATION_FAILED: "CURAND_STATUS_ALLOCATION_FAILED", 40 | TYPE_ERROR: "CURAND_STATUS_TYPE_ERROR", 41 | OUT_OF_RANGE: "CURAND_STATUS_OUT_OF_RANGE", 42 | LENGTH_NOT_MULTIPLE: "CURAND_STATUS_LENGTH_NOT_MULTIPLE", 43 | LAUNCH_FAILURE: "CURAND_STATUS_LAUNCH_FAILURE", 44 | PREEXISTING_FAILURE: "CURAND_STATUS_PREEXISTING_FAILURE", 45 | INITIALIZATION_FAILED: "CURAND_STATUS_INITIALIZATION_FAILED", 46 | ARCH_MISMATCH: "CURAND_STATUS_ARCH_MISMATCH", 47 | INTERNAL_ERROR: "CURAND_STATUS_INTERNAL_ERROR", 48 | } 49 | 50 | // Documentation was taken from the curand headers. 51 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Go bindings for nVIDIA CUDA 5. 3 | This package compiles with both gc and gccgo. 4 | */ 5 | package cuda5 6 | 7 | // Dummy imports so that 8 | // go get github.com/barnex/cuda5 9 | // will install everything. 
10 | import ( 11 | _ "github.com/barnex/cuda5/cu" 12 | _ "github.com/barnex/cuda5/cufft" 13 | _ "github.com/barnex/cuda5/safe" 14 | ) 15 | -------------------------------------------------------------------------------- /gophergpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/barnex/cuda5/da30a9b287d8f7ad210d42d911e33ef5c511544b/gophergpu.png -------------------------------------------------------------------------------- /safe/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g doc #gccgo 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | go-optview -c -w *.go 32 | gofmt -w *.go 33 | 34 | opt: 35 | go-optview -w *.go 36 | gofmt -w *.go 37 | 38 | doc: 39 | godoc github.com/barnex/cuda5/safe > README 40 | -------------------------------------------------------------------------------- /safe/README: -------------------------------------------------------------------------------- 1 | PACKAGE 2 | 3 | package safe 4 | import "github.com/barnex/cuda5/safe" 5 | 6 | Safe and more idiomatic wrappers for the low-level CUDA functions. 7 | 8 | FUNCTIONS 9 | 10 | func InitCuda() 11 | 12 | 13 | TYPES 14 | 15 | type Complex128s struct { 16 | // contains filtered or unexported fields 17 | } 18 | Slice of complex128's on the GPU. 19 | 20 | func MakeComplex128s(len_ int) Complex128s 21 | Make a slice of complex128's on the GPU. Initialized to zero. 22 | 23 | func (s *Complex128s) Cap() int 24 | Slice capacity. 25 | 26 | func (dst Complex128s) CopyDtoD(src Complex128s) 27 | Copy src on host to dst on host. 28 | 29 | func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) 30 | Copy src on host to dst on host, asynchronously. 31 | 32 | func (src Complex128s) CopyDtoH(dst []complex128) 33 | Copy src form device to dst on host. 34 | 35 | func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) 36 | Copy src form device to dst on host, asynchronously. 37 | 38 | func (dst Complex128s) CopyHtoD(src []complex128) 39 | Copy src from host to dst on the device. 40 | 41 | func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) 42 | Copy src from host to dst on the device, asynchronously. 43 | 44 | func (s Complex128s) Float() Float64s 45 | Re-interpret the array as float numbers, in interleaved format. 46 | Underlying storage is shared. 47 | 48 | func (s *Complex128s) Free() 49 | Free the underlying storage. To be used with care. Free() should only be 50 | called on a slice created by MakeXXX(), not on a slice created by 51 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 52 | 53 | func (src Complex128s) Host() []complex128 54 | Returns a fresh copy on host. 55 | 56 | func (s *Complex128s) Len() int 57 | Slice length (number of elements). 58 | 59 | func (s *Complex128s) Pointer() cu.DevicePtr 60 | Pointer to the first element. 61 | 62 | func (s Complex128s) Slice(start, stop int) Complex128s 63 | Return a slice from start (inclusive) to stop (exclusive), sharing the 64 | underlying storage with the original slice. 
Slices obtained in this way 65 | should not be Free()'d 66 | 67 | func (s *Complex128s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 68 | Manually set the pointer, length and capacity. Side-steps the security 69 | mechanisms, use with caution. 70 | 71 | type Complex64s struct { 72 | // contains filtered or unexported fields 73 | } 74 | Slice of complex64's on the GPU. 75 | 76 | func MakeComplex64s(len_ int) Complex64s 77 | Make a slice of complex64's on the GPU. Initialized to zero. 78 | 79 | func (s *Complex64s) Cap() int 80 | Slice capacity. 81 | 82 | func (dst Complex64s) CopyDtoD(src Complex64s) 83 | Copy src on host to dst on host. 84 | 85 | func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) 86 | Copy src on host to dst on host, asynchronously. 87 | 88 | func (src Complex64s) CopyDtoH(dst []complex64) 89 | Copy src form device to dst on host. 90 | 91 | func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) 92 | Copy src form device to dst on host, asynchronously. 93 | 94 | func (dst Complex64s) CopyHtoD(src []complex64) 95 | Copy src from host to dst on the device. 96 | 97 | func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) 98 | Copy src from host to dst on the device, asynchronously. 99 | 100 | func (s Complex64s) Float() Float32s 101 | Re-interpret the array as float numbers, in interleaved format. 102 | Underlying storage is shared. 103 | 104 | func (s *Complex64s) Free() 105 | Free the underlying storage. To be used with care. Free() should only be 106 | called on a slice created by MakeXXX(), not on a slice created by 107 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 108 | 109 | func (src Complex64s) Host() []complex64 110 | Returns a fresh copy on host. 111 | 112 | func (s *Complex64s) Len() int 113 | Slice length (number of elements). 114 | 115 | func (s *Complex64s) Pointer() cu.DevicePtr 116 | Pointer to the first element. 117 | 118 | func (s Complex64s) Slice(start, stop int) Complex64s 119 | Return a slice from start (inclusive) to stop (exclusive), sharing the 120 | underlying storage with the original slice. Slices obtained in this way 121 | should not be Free()'d 122 | 123 | func (s *Complex64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 124 | Manually set the pointer, length and capacity. Side-steps the security 125 | mechanisms, use with caution. 126 | 127 | type FFT1DC2RPlan struct { 128 | // contains filtered or unexported fields 129 | } 130 | 1D single-precission complex-to-real FFT plan. 131 | 132 | func FFT1DC2R(size, batch int) FFT1DC2RPlan 133 | 1D single-precission complex-to-real FFT plan. 134 | 135 | func (p FFT1DC2RPlan) Destroy() 136 | Releases all resources associated with the FFT plan. 137 | 138 | func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) 139 | Execute the FFT plan. Synchronized. 140 | 141 | func (p FFT1DC2RPlan) InputLen() int 142 | Required length of the output array. 143 | 144 | func (p FFT1DC2RPlan) OutputLen() int 145 | Required length of the input array. 146 | 147 | func (p FFT1DC2RPlan) SetStream(stream cu.Stream) 148 | Associates a CUDA stream with the FFT plan. If a stream is set, 149 | plan.Stream().Synchronize() can to be called to wait for the execution 150 | to finish. 151 | 152 | func (s FFT1DC2RPlan) Size() int 153 | Returns the logical size of the FFT: the number of elements (real or 154 | complex) it transforms. 
155 | 156 | func (p FFT1DC2RPlan) Stream() cu.Stream 157 | Returns the CUDA stream associated with the FFT plan. 158 | 159 | type FFT1DR2CPlan struct { 160 | // contains filtered or unexported fields 161 | } 162 | 1D single-precission real-to-complex FFT plan. 163 | 164 | func FFT1DR2C(size, batch int) FFT1DR2CPlan 165 | 1D single-precission real-to-complex FFT plan. 166 | 167 | func (p FFT1DR2CPlan) Destroy() 168 | Releases all resources associated with the FFT plan. 169 | 170 | func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) 171 | Execute the FFT plan. Synchronized. 172 | 173 | func (p FFT1DR2CPlan) InputLen() int 174 | Required length of the input array. 175 | 176 | func (p FFT1DR2CPlan) OutputLen() int 177 | Required length of the output array. 178 | 179 | func (p FFT1DR2CPlan) SetStream(stream cu.Stream) 180 | Associates a CUDA stream with the FFT plan. If a stream is set, 181 | plan.Stream().Synchronize() can to be called to wait for the execution 182 | to finish. 183 | 184 | func (s FFT1DR2CPlan) Size() int 185 | Returns the logical size of the FFT: the number of elements (real or 186 | complex) it transforms. 187 | 188 | func (p FFT1DR2CPlan) Stream() cu.Stream 189 | Returns the CUDA stream associated with the FFT plan. 190 | 191 | type FFT3DC2RPlan struct { 192 | // contains filtered or unexported fields 193 | } 194 | 3D single-precission real-to-complex FFT plan. 195 | 196 | func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan 197 | 3D single-precission real-to-complex FFT plan. 198 | 199 | func (p FFT3DC2RPlan) Destroy() 200 | Releases all resources associated with the FFT plan. 201 | 202 | func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) 203 | Execute the FFT plan. src and dst are 3D arrays stored 1D arrays. 204 | 205 | func (p FFT3DC2RPlan) InputLen() int 206 | Required length of the (1D) input array. 207 | 208 | func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) 209 | 3D size of the input array. 210 | 211 | func (p FFT3DC2RPlan) OutputLen() int 212 | Required length of the (1D) output array. 213 | 214 | func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) 215 | 3D size of the output array. 216 | 217 | func (p FFT3DC2RPlan) SetStream(stream cu.Stream) 218 | Associates a CUDA stream with the FFT plan. If a stream is set, 219 | plan.Stream().Synchronize() can to be called to wait for the execution 220 | to finish. 221 | 222 | func (s FFT3DC2RPlan) Size() (Nx, Ny, Nz int) 223 | Returns the logical size of the FFT: the number of elements (real or 224 | complex) it transforms. 225 | 226 | func (p FFT3DC2RPlan) Stream() cu.Stream 227 | Returns the CUDA stream associated with the FFT plan. 228 | 229 | type FFT3DD2ZPlan struct { 230 | // contains filtered or unexported fields 231 | } 232 | 3D single-precission real-to-complex FFT plan. 233 | 234 | func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan 235 | 3D single-precission real-to-complex FFT plan. 236 | 237 | func (p FFT3DD2ZPlan) Destroy() 238 | Releases all resources associated with the FFT plan. 239 | 240 | func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) 241 | Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D 242 | arrays. 243 | 244 | func (p FFT3DD2ZPlan) InputLen() int 245 | Required length of the (1D) input array. 246 | 247 | func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) 248 | 3D size of the input array. 249 | 250 | func (p FFT3DD2ZPlan) OutputLen() int 251 | Required length of the (1D) output array. 
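For the real-to-complex style plans the two lengths above follow the usual half-spectrum rule: OutputLen = Nx*Ny*(Nz/2+1) while InputLen = Nx*Ny*Nz. A small illustrative sketch (editorial, not from the package tests):

    package main

    import (
        "fmt"
        "github.com/barnex/cuda5/safe"
    )

    func main() {
        safe.InitCuda()
        p := safe.FFT3DD2Z(2, 4, 8)
        defer p.Destroy()
        // InputLen  = Nx*Ny*Nz       = 2*4*8 = 64 float64 values
        // OutputLen = Nx*Ny*(Nz/2+1) = 2*4*5 = 40 complex128 values
        fmt.Println(p.InputLen(), p.OutputLen()) // 64 40
    }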
252 | 253 | func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) 254 | 3D size of the output array. 255 | 256 | func (p FFT3DD2ZPlan) SetStream(stream cu.Stream) 257 | Associates a CUDA stream with the FFT plan. If a stream is set, 258 | plan.Stream().Synchronize() can to be called to wait for the execution 259 | to finish. 260 | 261 | func (s FFT3DD2ZPlan) Size() (Nx, Ny, Nz int) 262 | Returns the logical size of the FFT: the number of elements (real or 263 | complex) it transforms. 264 | 265 | func (p FFT3DD2ZPlan) Stream() cu.Stream 266 | Returns the CUDA stream associated with the FFT plan. 267 | 268 | type FFT3DR2CPlan struct { 269 | // contains filtered or unexported fields 270 | } 271 | 3D single-precission real-to-complex FFT plan. 272 | 273 | func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan 274 | 3D single-precission real-to-complex FFT plan. 275 | 276 | func (p FFT3DR2CPlan) Destroy() 277 | Releases all resources associated with the FFT plan. 278 | 279 | func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) 280 | Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D 281 | arrays. 282 | 283 | func (p FFT3DR2CPlan) InputLen() int 284 | Required length of the (1D) input array. 285 | 286 | func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) 287 | 3D size of the input array. 288 | 289 | func (p FFT3DR2CPlan) OutputLen() int 290 | Required length of the (1D) output array. 291 | 292 | func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) 293 | 3D size of the output array. 294 | 295 | func (p FFT3DR2CPlan) SetStream(stream cu.Stream) 296 | Associates a CUDA stream with the FFT plan. If a stream is set, 297 | plan.Stream().Synchronize() can to be called to wait for the execution 298 | to finish. 299 | 300 | func (s FFT3DR2CPlan) Size() (Nx, Ny, Nz int) 301 | Returns the logical size of the FFT: the number of elements (real or 302 | complex) it transforms. 303 | 304 | func (p FFT3DR2CPlan) Stream() cu.Stream 305 | Returns the CUDA stream associated with the FFT plan. 306 | 307 | type FFT3DZ2DPlan struct { 308 | // contains filtered or unexported fields 309 | } 310 | 3D single-precission real-to-complex FFT plan. 311 | 312 | func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan 313 | 3D single-precission real-to-complex FFT plan. 314 | 315 | func (p FFT3DZ2DPlan) Destroy() 316 | Releases all resources associated with the FFT plan. 317 | 318 | func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) 319 | Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D 320 | arrays. 321 | 322 | func (p FFT3DZ2DPlan) InputLen() int 323 | Required length of the (1D) input array. 324 | 325 | func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) 326 | 3D size of the input array. 327 | 328 | func (p FFT3DZ2DPlan) OutputLen() int 329 | Required length of the (1D) output array. 330 | 331 | func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) 332 | 3D size of the output array. 333 | 334 | func (p FFT3DZ2DPlan) SetStream(stream cu.Stream) 335 | Associates a CUDA stream with the FFT plan. If a stream is set, 336 | plan.Stream().Synchronize() can to be called to wait for the execution 337 | to finish. 338 | 339 | func (s FFT3DZ2DPlan) Size() (Nx, Ny, Nz int) 340 | Returns the logical size of the FFT: the number of elements (real or 341 | complex) it transforms. 342 | 343 | func (p FFT3DZ2DPlan) Stream() cu.Stream 344 | Returns the CUDA stream associated with the FFT plan. 345 | 346 | type Float32s struct { 347 | // contains filtered or unexported fields 348 | } 349 | Slice of float32's on the GPU. 
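A minimal sketch of working with such a device slice (editorial, not part of the generated docs; it uses only the methods documented below):

    package main

    import (
        "fmt"
        "github.com/barnex/cuda5/safe"
    )

    func main() {
        safe.InitCuda()

        a := safe.MakeFloat32s(16) // device memory, zero-initialized
        defer a.Free()

        host := make([]float32, 16)
        host[0] = 42
        a.CopyHtoD(host)         // upload
        a.Slice(8, 16).Memset(1) // shares storage with a, so this writes into a
        fmt.Println(a.Host())    // fresh host copy: [42 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
    }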
350 | 351 | func MakeFloat32s(len_ int) Float32s 352 | Make a slice of float32's on the GPU. Initialized to zero. 353 | 354 | func (s *Float32s) Cap() int 355 | Slice capacity. 356 | 357 | func (s Float32s) Complex() Complex64s 358 | Re-interpret the array as complex numbers, in interleaved format. 359 | Underlying storage is shared. 360 | 361 | func (dst Float32s) CopyDtoD(src Float32s) 362 | Copy src on host to dst on host. 363 | 364 | func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) 365 | Copy src on host to dst on host, asynchronously. 366 | 367 | func (src Float32s) CopyDtoH(dst []float32) 368 | Copy src form device to dst on host. 369 | 370 | func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) 371 | Copy src form device to dst on host, asynchronously. 372 | 373 | func (dst Float32s) CopyHtoD(src []float32) 374 | Copy src from host to dst on the device. 375 | 376 | func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) 377 | Copy src from host to dst on the device, asynchronously. 378 | 379 | func (s *Float32s) Free() 380 | Free the underlying storage. To be used with care. Free() should only be 381 | called on a slice created by MakeXXX(), not on a slice created by 382 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 383 | 384 | func (src Float32s) Host() []float32 385 | Returns a fresh copy on host. 386 | 387 | func (s *Float32s) Len() int 388 | Slice length (number of elements). 389 | 390 | func (s Float32s) Memset(value float32) 391 | Set the entire slice to this value. 392 | 393 | func (s Float32s) MemsetAsync(value float32, stream cu.Stream) 394 | Set the entire slice to this value, asynchronously. 395 | 396 | func (s *Float32s) Pointer() cu.DevicePtr 397 | Pointer to the first element. 398 | 399 | func (s Float32s) Slice(start, stop int) Float32s 400 | Return a slice from start (inclusive) to stop (exclusive), sharing the 401 | underlying storage with the original slice. Slices obtained in this way 402 | should not be Free()'d 403 | 404 | func (s *Float32s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 405 | Manually set the pointer, length and capacity. Side-steps the security 406 | mechanisms, use with caution. 407 | 408 | type Float64s struct { 409 | // contains filtered or unexported fields 410 | } 411 | Slice of float64's on the GPU. 412 | 413 | func MakeFloat64s(len_ int) Float64s 414 | Make a slice of float64's on the GPU. Initialized to zero. 415 | 416 | func (s *Float64s) Cap() int 417 | Slice capacity. 418 | 419 | func (s Float64s) Complex() Complex128s 420 | Re-interpret the array as complex numbers, in interleaved format. 421 | Underlying storage is shared. 422 | 423 | func (dst Float64s) CopyDtoD(src Float64s) 424 | Copy src on host to dst on host. 425 | 426 | func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) 427 | Copy src on host to dst on host, asynchronously. 428 | 429 | func (src Float64s) CopyDtoH(dst []float64) 430 | Copy src form device to dst on host. 431 | 432 | func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) 433 | Copy src form device to dst on host, asynchronously. 434 | 435 | func (dst Float64s) CopyHtoD(src []float64) 436 | Copy src from host to dst on the device. 437 | 438 | func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) 439 | Copy src from host to dst on the device, asynchronously. 440 | 441 | func (s *Float64s) Free() 442 | Free the underlying storage. To be used with care. 
Free() should only be 443 | called on a slice created by MakeXXX(), not on a slice created by 444 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 445 | 446 | func (src Float64s) Host() []float64 447 | Returns a fresh copy on host. 448 | 449 | func (s *Float64s) Len() int 450 | Slice length (number of elements). 451 | 452 | func (s *Float64s) Pointer() cu.DevicePtr 453 | Pointer to the first element. 454 | 455 | func (s Float64s) Slice(start, stop int) Float64s 456 | Return a slice from start (inclusive) to stop (exclusive), sharing the 457 | underlying storage with the original slice. Slices obtained in this way 458 | should not be Free()'d 459 | 460 | func (s *Float64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 461 | Manually set the pointer, length and capacity. Side-steps the security 462 | mechanisms, use with caution. 463 | 464 | 465 | -------------------------------------------------------------------------------- /safe/complex128s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "github.com/barnex/cuda5/cu" 5 | "unsafe" 6 | ) 7 | 8 | // Slice of complex128's on the GPU. 9 | type Complex128s struct{ slice } 10 | 11 | // Make a slice of complex128's on the GPU. 12 | // Initialized to zero. 13 | func MakeComplex128s(len_ int) Complex128s { 14 | return Complex128s{makeslice(len_, cu.SIZEOF_COMPLEX128)} 15 | } 16 | 17 | // Return a slice from start (inclusive) to stop (exclusive), 18 | // sharing the underlying storage with the original slice. 19 | // Slices obtained in this way should not be Free()'d 20 | func (s Complex128s) Slice(start, stop int) Complex128s { 21 | return Complex128s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX128)} 22 | } 23 | 24 | // Copy src from host to dst on the device. 25 | func (dst Complex128s) CopyHtoD(src []complex128) { 26 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128) 27 | } 28 | 29 | // Copy src form device to dst on host. 30 | func (src Complex128s) CopyDtoH(dst []complex128) { 31 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128) 32 | } 33 | 34 | // Copy src on host to dst on host. 35 | func (dst Complex128s) CopyDtoD(src Complex128s) { 36 | dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX128) 37 | } 38 | 39 | // Copy src from host to dst on the device, asynchronously. 40 | func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) { 41 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128, stream) 42 | } 43 | 44 | // Copy src form device to dst on host, asynchronously. 45 | func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) { 46 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128, stream) 47 | } 48 | 49 | // Copy src on host to dst on host, asynchronously. 50 | func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) { 51 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX128, stream) 52 | } 53 | 54 | // Returns a fresh copy on host. 55 | func (src Complex128s) Host() []complex128 { 56 | cpy := make([]complex128, src.Len()) 57 | src.CopyDtoH(cpy) 58 | return cpy 59 | } 60 | 61 | // Re-interpret the array as float numbers, 62 | // in interleaved format. Underlying storage 63 | // is shared. 
64 | func (s Complex128s) Float() Float64s { 65 | return Float64s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}} 66 | } 67 | -------------------------------------------------------------------------------- /safe/complex128s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestComplex128sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeComplex128s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]complex128, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]complex128, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestComplex128sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeComplex128s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestComplex128sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeComplex128s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestComplex128sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]complex128, 100) 86 | 87 | b := MakeComplex128s(100) 88 | defer b.Free() 89 | 90 | c := MakeComplex128s(100) 91 | defer c.Free() 92 | 93 | d := make([]complex128, 200) 94 | 95 | for i := range a { 96 | a[i] = complex(float64(i), float64(2*i)) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]complex128, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/complex64s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "github.com/barnex/cuda5/cu" 5 | "unsafe" 6 | ) 7 | 8 | // Slice of complex64's on the GPU. 9 | type Complex64s struct{ slice } 10 | 11 | // Make a slice of complex64's on the GPU. 12 | // Initialized to zero. 13 | func MakeComplex64s(len_ int) Complex64s { 14 | return Complex64s{makeslice(len_, cu.SIZEOF_COMPLEX64)} 15 | } 16 | 17 | // Return a slice from start (inclusive) to stop (exclusive), 18 | // sharing the underlying storage with the original slice. 19 | // Slices obtained in this way should not be Free()'d 20 | func (s Complex64s) Slice(start, stop int) Complex64s { 21 | return Complex64s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX64)} 22 | } 23 | 24 | // Copy src from host to dst on the device. 
25 | func (dst Complex64s) CopyHtoD(src []complex64) { 26 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64) 27 | } 28 | 29 | // Copy src form device to dst on host. 30 | func (src Complex64s) CopyDtoH(dst []complex64) { 31 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64) 32 | } 33 | 34 | // Copy src on host to dst on host. 35 | func (dst Complex64s) CopyDtoD(src Complex64s) { 36 | dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX64) 37 | } 38 | 39 | // Copy src from host to dst on the device, asynchronously. 40 | func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) { 41 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64, stream) 42 | } 43 | 44 | // Copy src form device to dst on host, asynchronously. 45 | func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) { 46 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64, stream) 47 | } 48 | 49 | // Copy src on host to dst on host, asynchronously. 50 | func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) { 51 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX64, stream) 52 | } 53 | 54 | // Returns a fresh copy on host. 55 | func (src Complex64s) Host() []complex64 { 56 | cpy := make([]complex64, src.Len()) 57 | src.CopyDtoH(cpy) 58 | return cpy 59 | } 60 | 61 | // Re-interpret the array as float numbers, 62 | // in interleaved format. Underlying storage 63 | // is shared. 64 | func (s Complex64s) Float() Float32s { 65 | return Float32s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}} 66 | } 67 | -------------------------------------------------------------------------------- /safe/complex64s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestComplex64sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeComplex64s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]complex64, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]complex64, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestComplex64sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeComplex64s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestComplex64sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeComplex64s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestComplex64sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]complex64, 100) 86 | 87 | b := MakeComplex64s(100) 88 | defer b.Free() 89 | 90 | c := MakeComplex64s(100) 91 | defer c.Free() 92 | 93 
| d := make([]complex64, 200) 94 | 95 | for i := range a { 96 | a[i] = complex(float32(i), float32(2*i)) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]complex64, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Safe and more idiomatic wrappers for the low-level CUDA functions. 3 | */ 4 | package safe 5 | -------------------------------------------------------------------------------- /safe/fft1d_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func ExampleFFT1DR2C() { 8 | InitCuda() 9 | 10 | N := 8 11 | batch := 1 12 | 13 | fft := FFT1DR2C(N, batch) 14 | defer fft.Destroy() 15 | 16 | input := MakeFloat32s(N) 17 | defer input.Free() 18 | input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0}) 19 | 20 | output := MakeComplex64s(fft.OutputLen()) 21 | defer output.Free() 22 | 23 | fft.Exec(input, output) 24 | 25 | fmt.Println("input:", input.Host()) 26 | fmt.Println("output:", output.Host()) 27 | 28 | // Output: 29 | // input: [1 0 0 0 0 0 0 0] 30 | // output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] 31 | } 32 | 33 | func ExampleFFT1DR2C_Inplace() { 34 | InitCuda() 35 | 36 | N := 8 37 | batch := 2 38 | 39 | fft := FFT1DR2C(N, batch) 40 | defer fft.Destroy() 41 | 42 | output := MakeComplex64s(fft.OutputLen()) 43 | defer output.Free() 44 | 45 | input := output.Float().Slice(0, fft.InputLen()) 46 | // input uses same layout as out-of-place transform 47 | // (CUFFT native layout) 48 | input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}) 49 | fmt.Println("input:", input.Host()) 50 | 51 | fft.Exec(input, output) 52 | fmt.Println("output:", output.Host()) 53 | 54 | inverse := FFT1DC2R(N, batch) 55 | defer inverse.Destroy() 56 | inverse.Exec(output, input) 57 | fmt.Println("input:", input.Host()) 58 | 59 | // Output: 60 | // input: [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0] 61 | // output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] 62 | // input: [8 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0] 63 | } 64 | func ExampleFFT1DC2R() { 65 | InitCuda() 66 | 67 | N := 8 68 | batch := 1 69 | 70 | fft := FFT1DC2R(N, batch) 71 | defer fft.Destroy() 72 | 73 | input := MakeComplex64s(fft.InputLen()) 74 | defer input.Free() 75 | input.CopyHtoD([]complex64{(1 + 0i), (+1 + 0i), (+1 + 0i), (+1 - 0i), (+1 + 0i)}) 76 | 77 | output := MakeFloat32s(fft.OutputLen()) 78 | defer output.Free() 79 | 80 | fft.Exec(input, output) 81 | 82 | fmt.Println("input:", input.Host()) 83 | fmt.Println("output:", output.Host()) 84 | 85 | // Output: 86 | // input: [(1+0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i)] 87 | // output: [8 0 0 0 0 0 0 0] 88 | } 89 | -------------------------------------------------------------------------------- /safe/fft1dc2r.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 1D single-precission complex-to-real FFT plan. 9 | type FFT1DC2RPlan struct { 10 | fftplan 11 | size1D 12 | batch int 13 | } 14 | 15 | // 1D single-precission complex-to-real FFT plan. 
16 | func FFT1DC2R(size, batch int) FFT1DC2RPlan { 17 | handle := cufft.Plan1d(size, cufft.C2R, batch) 18 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 19 | return FFT1DC2RPlan{fftplan{handle, 0}, size1D(size), batch} 20 | } 21 | 22 | // Execute the FFT plan. Synchronized. 23 | func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecC2R(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // Required length of the input array. 37 | func (p FFT1DC2RPlan) OutputLen() int { 38 | return p.batch * p.Size() 39 | } 40 | 41 | // Required length of the output array. 42 | func (p FFT1DC2RPlan) InputLen() int { 43 | return p.batch * (p.Size()/2 + 1) 44 | } 45 | -------------------------------------------------------------------------------- /safe/fft1dr2c.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 1D single-precission real-to-complex FFT plan. 9 | type FFT1DR2CPlan struct { 10 | fftplan 11 | size1D 12 | batch int 13 | } 14 | 15 | // 1D single-precission real-to-complex FFT plan. 16 | func FFT1DR2C(size, batch int) FFT1DR2CPlan { 17 | handle := cufft.Plan1d(size, cufft.R2C, batch) 18 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 19 | return FFT1DR2CPlan{fftplan{handle, 0}, size1D(size), batch} 20 | } 21 | 22 | // Execute the FFT plan. Synchronized. 23 | func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecR2C(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // Required length of the input array. 37 | func (p FFT1DR2CPlan) InputLen() int { 38 | return p.batch * p.Size() 39 | } 40 | 41 | // Required length of the output array. 
42 | func (p FFT1DR2CPlan) OutputLen() int { 43 | return p.batch * (p.Size()/2 + 1) 44 | } 45 | -------------------------------------------------------------------------------- /safe/fft3d_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func ExampleFFT3DR2C() { 8 | InitCuda() 9 | 10 | Nx, Ny, Nz := 2, 4, 8 11 | 12 | fft := FFT3DR2C(Nx, Ny, Nz) 13 | defer fft.Destroy() 14 | 15 | input := MakeFloat32s(fft.InputLen()) 16 | defer input.Free() 17 | 18 | inputData := make([]float32, Nx*Ny*Nz) 19 | inputData[0*Ny*Nz] = 1 20 | inputData[1*Ny*Nz] = 1 21 | input.CopyHtoD(inputData) 22 | 23 | output := MakeComplex64s(fft.OutputLen()) 24 | defer output.Free() 25 | 26 | fft.Exec(input, output) 27 | 28 | fmt.Println("input:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz)) 29 | Ox, Oy, Oz := fft.OutputSize() 30 | fmt.Println("output:", Reshape3DComplex64(output.Host(), Ox, Oy, Oz)) 31 | 32 | // Output: 33 | // input: [[[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 34 | // output: [[[(2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)]] [[(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)]]] 35 | } 36 | 37 | func ExampleFFT3DC2R() { 38 | InitCuda() 39 | 40 | Nx, Ny, Nz := 2, 4, 8 41 | 42 | fft := FFT3DC2R(Nx, Ny, Nz) 43 | defer fft.Destroy() 44 | 45 | input := MakeComplex64s(fft.InputLen()) 46 | defer input.Free() 47 | 48 | inputData := make([]complex64, fft.InputLen()) 49 | for i := range inputData { 50 | inputData[i] = 2 51 | } 52 | input.CopyHtoD(inputData) 53 | 54 | output := MakeFloat32s(fft.OutputLen()) 55 | defer output.Free() 56 | 57 | fft.Exec(input, output) 58 | 59 | Ix, Iy, Iz := fft.InputSize() 60 | fmt.Println("input:", Reshape3DComplex64(input.Host(), Ix, Iy, Iz)) 61 | fmt.Println("output:", Reshape3DFloat32(output.Host(), Nx, Ny, Nz)) 62 | 63 | // Output: 64 | // input: [[[(2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]] [[(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]]] 65 | // output: [[[128 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 66 | } 67 | 68 | func ExampleFFT3D() { 69 | InitCuda() 70 | 71 | Nx, Ny, Nz := 2, 4, 8 72 | 73 | forward := FFT3DR2C(Nx, Ny, Nz) 74 | defer forward.Destroy() 75 | 76 | input := MakeFloat32s(forward.InputLen()) 77 | defer input.Free() 78 | 79 | inputData := make([]float32, forward.InputLen()) 80 | inputData[5] = 1 81 | input.CopyHtoD(inputData) 82 | 83 | output := MakeComplex64s(forward.OutputLen()) 84 | defer output.Free() 85 | 86 | forward.Exec(input, output) 87 | 88 | backward := FFT3DC2R(Nx, Ny, Nz) 89 | backward.Exec(output, input) 90 | 91 | fmt.Println("input:", Reshape3DFloat32(inputData, Nx, Ny, Nz)) 92 | fmt.Println("forward+inverse:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz)) 93 | 94 | // Output: 95 | // input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 
0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 96 | // forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 97 | } 98 | 99 | //func ExampleFFT3D64() { 100 | // InitCuda() 101 | // 102 | // Nx, Ny, Nz := 2, 4, 8 103 | // 104 | // forward := FFT3DD2Z(Nx, Ny, Nz) 105 | // defer forward.Destroy() 106 | // 107 | // input := MakeFloat64s(forward.InputLen()) 108 | // defer input.Free() 109 | // 110 | // inputData := make([]float64, forward.InputLen()) 111 | // inputData[5] = 1 112 | // input.CopyHtoD(inputData) 113 | // 114 | // output := MakeComplex128s(forward.OutputLen()) 115 | // defer output.Free() 116 | // 117 | // forward.Exec(input, output) 118 | // 119 | // backward := FFT3DZ2D(Nx, Ny, Nz) 120 | // backward.Exec(output, input) 121 | // 122 | // fmt.Println("input:", Reshape3DFloat64(inputData, Nx, Ny, Nz)) 123 | // fmt.Println("forward+inverse:", Reshape3DFloat64(input.Host(), Nx, Ny, Nz)) 124 | // 125 | // // Output: 126 | // // input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 127 | // // forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 128 | //} 129 | -------------------------------------------------------------------------------- /safe/fft3dc2r.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D single-precision complex-to-real FFT plan. 9 | type FFT3DC2RPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D single-precision complex-to-real FFT plan. 15 | func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.C2R) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DC2RPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecC2R(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 39 | } 40 | 41 | // 3D size of the output array. 42 | func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2] 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DC2RPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array.
52 | func (p FFT3DC2RPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fft3dd2z.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D double-precision real-to-complex FFT plan. 9 | type FFT3DD2ZPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D double-precision real-to-complex FFT plan. 15 | func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.D2Z) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DD2ZPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecD2Z(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2] 39 | } 40 | 41 | // 3D size of the output array. 42 | func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DD2ZPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array. 52 | func (p FFT3DD2ZPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fft3dr2c.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D single-precision real-to-complex FFT plan. 9 | type FFT3DR2CPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D single-precision real-to-complex FFT plan. 15 | func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.R2C) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DR2CPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecR2C(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2] 39 | } 40 | 41 | // 3D size of the output array.
42 | func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DR2CPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array. 52 | func (p FFT3DR2CPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fft3dz2d.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D double-precision complex-to-real FFT plan. 9 | type FFT3DZ2DPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D double-precision complex-to-real FFT plan. 15 | func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.Z2D) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DZ2DPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecZ2D(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 39 | } 40 | 41 | // 3D size of the output array. 42 | func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2] 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DZ2DPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array. 52 | func (p FFT3DZ2DPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fftplan.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | // INTERNAL 4 | // Base implementation for all FFT plans. 5 | 6 | import ( 7 | "github.com/barnex/cuda5/cu" 8 | "github.com/barnex/cuda5/cufft" 9 | ) 10 | 11 | // Base implementation for all FFT plans. 12 | type fftplan struct { 13 | handle cufft.Handle 14 | stream cu.Stream 15 | } 16 | 17 | // For the sake of embedding. 18 | type size1D int 19 | 20 | // Returns the logical size of the FFT: 21 | // the number of elements (real or complex) 22 | // it transforms. 23 | func (s size1D) Size() int { return int(s) } 24 | 25 | // For the sake of embedding. 26 | type size3D [3]int 27 | 28 | // Returns the logical size of the FFT: 29 | // the number of elements (real or complex) 30 | // it transforms. 31 | func (s size3D) Size() (Nx, Ny, Nz int) { return s[0], s[1], s[2] } 32 | 33 | func prod3(x, y, z int) int { 34 | return x * y * z 35 | } 36 | 37 | // Releases all resources associated with the FFT plan. 38 | func (p fftplan) Destroy() { p.handle.Destroy() } 39 | 40 | // Associates a CUDA stream with the FFT plan.
41 | // If a stream is set, plan.Stream().Synchronize() can 42 | // be called to wait for the execution to finish. 43 | func (p fftplan) SetStream(stream cu.Stream) { 44 | p.handle.SetStream(stream) 45 | p.stream = stream 46 | } 47 | 48 | // Returns the CUDA stream associated with the FFT plan. 49 | func (p fftplan) Stream() cu.Stream { 50 | return p.stream 51 | } 52 | -------------------------------------------------------------------------------- /safe/float32s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | "math" 7 | "unsafe" 8 | ) 9 | 10 | // Slice of float32's on the GPU. 11 | type Float32s struct{ slice } 12 | 13 | // Make a slice of float32's on the GPU. 14 | // Initialized to zero. 15 | func MakeFloat32s(len_ int) Float32s { 16 | return Float32s{makeslice(len_, cu.SIZEOF_FLOAT32)} 17 | } 18 | 19 | // Return a slice from start (inclusive) to stop (exclusive), 20 | // sharing the underlying storage with the original slice. 21 | // Slices obtained in this way should not be Free()'d. 22 | func (s Float32s) Slice(start, stop int) Float32s { 23 | return Float32s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT32)} 24 | } 25 | 26 | // Copy src from host to dst on the device. 27 | func (dst Float32s) CopyHtoD(src []float32) { 28 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32) 29 | } 30 | 31 | // Copy src from device to dst on host. 32 | func (src Float32s) CopyDtoH(dst []float32) { 33 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32) 34 | } 35 | 36 | // Copy src to dst, both on the device. 37 | func (dst Float32s) CopyDtoD(src Float32s) { 38 | dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT32) 39 | } 40 | 41 | // Copy src from host to dst on the device, asynchronously. 42 | func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) { 43 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32, stream) 44 | } 45 | 46 | // Copy src from device to dst on host, asynchronously. 47 | func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) { 48 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32, stream) 49 | } 50 | 51 | // Copy src to dst, both on the device, asynchronously. 52 | func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) { 53 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT32, stream) 54 | } 55 | 56 | // Returns a fresh copy on host. 57 | func (src Float32s) Host() []float32 { 58 | cpy := make([]float32, src.Len()) 59 | src.CopyDtoH(cpy) 60 | return cpy 61 | } 62 | 63 | // Set the entire slice to this value. 64 | func (s Float32s) Memset(value float32) { 65 | cu.MemsetD32(s.Pointer(), math.Float32bits(value), int64(s.Len())) 66 | cu.CtxSynchronize() 67 | } 68 | 69 | // Set the entire slice to this value, asynchronously. 70 | func (s Float32s) MemsetAsync(value float32, stream cu.Stream) { 71 | cu.MemsetD32Async(s.Pointer(), math.Float32bits(value), int64(s.Len()), stream) 72 | } 73 | 74 | // Re-interpret the array as complex numbers, 75 | // in interleaved format. Underlying storage 76 | // is shared.
77 | func (s Float32s) Complex() Complex64s { 78 | if s.Len()%2 != 0 { 79 | panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) 80 | } 81 | return Complex64s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} 82 | } 83 | -------------------------------------------------------------------------------- /safe/float32s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestFloat32sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeFloat32s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]float32, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]float32, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestFloat32sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeFloat32s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestFloat32sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeFloat32s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestFloat32sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]float32, 100) 86 | 87 | b := MakeFloat32s(100) 88 | defer b.Free() 89 | 90 | c := MakeFloat32s(100) 91 | defer c.Free() 92 | 93 | d := make([]float32, 200) 94 | 95 | for i := range a { 96 | a[i] = float32(i) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]float32, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/float64s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | "unsafe" 7 | ) 8 | 9 | // Slice of float64's on the GPU. 10 | type Float64s struct{ slice } 11 | 12 | // Make a slice of float64's on the GPU. 13 | // Initialized to zero. 14 | func MakeFloat64s(len_ int) Float64s { 15 | return Float64s{makeslice(len_, cu.SIZEOF_FLOAT64)} 16 | } 17 | 18 | // Return a slice from start (inclusive) to stop (exclusive), 19 | // sharing the underlying storage with the original slice. 20 | // Slices obtained in this way should not be Free()'d 21 | func (s Float64s) Slice(start, stop int) Float64s { 22 | return Float64s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT64)} 23 | } 24 | 25 | // Copy src from host to dst on the device. 
26 | func (dst Float64s) CopyHtoD(src []float64) { 27 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64) 28 | } 29 | 30 | // Copy src from device to dst on host. 31 | func (src Float64s) CopyDtoH(dst []float64) { 32 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64) 33 | } 34 | 35 | // Copy src to dst, both on the device. 36 | func (dst Float64s) CopyDtoD(src Float64s) { 37 | dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT64) 38 | } 39 | 40 | // Copy src from host to dst on the device, asynchronously. 41 | func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) { 42 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64, stream) 43 | } 44 | 45 | // Copy src from device to dst on host, asynchronously. 46 | func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) { 47 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64, stream) 48 | } 49 | 50 | // Copy src to dst, both on the device, asynchronously. 51 | func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) { 52 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT64, stream) 53 | } 54 | 55 | // Returns a fresh copy on host. 56 | func (src Float64s) Host() []float64 { 57 | cpy := make([]float64, src.Len()) 58 | src.CopyDtoH(cpy) 59 | return cpy 60 | } 61 | 62 | // Re-interpret the array as complex numbers, 63 | // in interleaved format. Underlying storage 64 | // is shared. 65 | func (s Float64s) Complex() Complex128s { 66 | if s.Len()%2 != 0 { 67 | panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) 68 | } 69 | return Complex128s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} 70 | } 71 | -------------------------------------------------------------------------------- /safe/float64s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestFloat64sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeFloat64s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]float64, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]float64, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestFloat64sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeFloat64s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestFloat64sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeFloat64s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestFloat64sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]float64, 100) 86 | 87 | b := MakeFloat64s(100) 88 | defer b.Free() 89 |
90 | c := MakeFloat64s(100) 91 | defer c.Free() 92 | 93 | d := make([]float64, 200) 94 | 95 | for i := range a { 96 | a[i] = float64(i) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]float64, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/init.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "github.com/barnex/cuda5/cu" 5 | "runtime" 6 | ) 7 | 8 | func InitCuda() { 9 | runtime.LockOSThread() 10 | cu.Init(0) 11 | cu.CtxCreate(cu.CTX_SCHED_AUTO, 0).SetCurrent() 12 | } 13 | -------------------------------------------------------------------------------- /safe/slice.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | // INTERNAL. 4 | // This file implements common functionality for all slice types 5 | // (Float32s, Float64s, Complex64s, ...). 6 | 7 | import ( 8 | "fmt" 9 | "github.com/barnex/cuda5/cu" 10 | "unsafe" 11 | ) 12 | 13 | // internal base func for all makeXXX() functions 14 | func makeslice(len_ int, elemsize int) slice { 15 | bytes := int64(len_) * int64(elemsize) 16 | s := slice{0, len_, len_} 17 | if bytes > 0 { 18 | s.ptr_ = cu.MemAlloc(bytes) 19 | cu.MemsetD8(s.ptr_, 0, bytes) 20 | cu.CtxSynchronize() 21 | } 22 | return s 23 | } 24 | 25 | // internal base type for all slices 26 | type slice struct { 27 | ptr_ cu.DevicePtr // address offset of first element 28 | len_ int // number of elements 29 | cap_ int 30 | } 31 | 32 | // Pointer to the first element. 33 | func (s *slice) Pointer() cu.DevicePtr { return s.ptr_ } 34 | 35 | // Slice length (number of elements). 36 | func (s *slice) Len() int { return s.len_ } 37 | 38 | // Slice capacity. 39 | func (s *slice) Cap() int { return s.cap_ } 40 | 41 | // Free the underlying storage. 42 | // To be used with care. Free() should only be called on 43 | // a slice created by MakeXXX(), not on a slice created 44 | // by x.Slice(). Freeing a slice invalidates all other 45 | // slices referring to it. 
46 | func (s *slice) Free() { 47 | s.ptr_.Free() 48 | s.len_ = 0 49 | s.cap_ = 0 50 | } 51 | 52 | // internal base func for all slice() functions 53 | func (s *slice) slice(start, stop int, elemsize uintptr) slice { 54 | if start >= s.cap_ || start < 0 || stop > s.cap_ || stop < 0 { 55 | panic("cuda5/safe: slice index out of bounds") 56 | } 57 | if start > stop { 58 | panic("cuda5/safe: inverted slice range") 59 | } 60 | return slice{cu.DevicePtr(uintptr(s.ptr_) + uintptr(start)*elemsize), stop - start, s.cap_ - start} 61 | } 62 | 63 | func (dst *slice) copyHtoD(src unsafe.Pointer, srclen int, elemsize int) { 64 | if srclen != dst.Len() { 65 | panic(fmt.Errorf("cuda5/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) 66 | } 67 | cu.MemcpyHtoD(dst.Pointer(), src, int64(elemsize)*int64(srclen)) 68 | } 69 | 70 | func (src *slice) copyDtoH(dst unsafe.Pointer, dstlen int, elemsize int) { 71 | if dstlen != src.Len() { 72 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) 73 | } 74 | cu.MemcpyDtoH(dst, src.Pointer(), int64(elemsize)*int64(dstlen)) 75 | } 76 | 77 | func (dst *slice) copyDtoD(src *slice, elemsize int) { 78 | if dst.Len() != src.Len() { 79 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) 80 | } 81 | cu.MemcpyDtoD(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len())) 82 | } 83 | 84 | func (dst *slice) copyHtoDAsync(src unsafe.Pointer, srclen int, elemsize int, stream cu.Stream) { 85 | if srclen != dst.Len() { 86 | panic(fmt.Errorf("cuda5/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) 87 | } 88 | cu.MemcpyHtoDAsync(dst.Pointer(), src, int64(elemsize)*int64(srclen), stream) 89 | } 90 | 91 | func (src *slice) copyDtoHAsync(dst unsafe.Pointer, dstlen int, elemsize int, stream cu.Stream) { 92 | if dstlen != src.Len() { 93 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) 94 | } 95 | cu.MemcpyDtoHAsync(dst, src.Pointer(), int64(elemsize)*int64(dstlen), stream) 96 | } 97 | 98 | func (dst *slice) copyDtoDAsync(src *slice, elemsize int, stream cu.Stream) { 99 | if dst.Len() != src.Len() { 100 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) 101 | } 102 | cu.MemcpyDtoDAsync(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len()), stream) 103 | } 104 | 105 | // Manually set the pointer, length and capacity. 106 | // Side-steps the safety mechanisms; use with caution. 107 | func (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) { 108 | s.ptr_ = cu.DevicePtr(uintptr(pointer)) 109 | s.len_ = length 110 | s.cap_ = capacity 111 | } 112 | -------------------------------------------------------------------------------- /safe/subs.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | subs32='s/loat32/loat64/g;' 4 | subs32+='s/FLOAT32/FLOAT64/g;' 5 | 6 | #sed $subs32 float32s.go > float64s.go 7 | #sed $subs32 float32s_test.go > float64s_test.go 8 | 9 | subsc64='s/Float32/Complex64/g;' 10 | subsc64+='s/float32/complex64/g;' 11 | subsc64+='s/FLOAT32/COMPLEX64/g;' 12 | #sed $subsc64 float32s_test.go > complex64s_test.go 13 | #sed $subsc64 float32s.go > complex64s.go 14 | 15 | 16 | subsc128='s/omplex64/omplex128/g;' 17 | subsc128+='s/COMPLEX64/COMPLEX128/g;' 18 | sed $subsc128 complex64s.go > complex128s.go 19 | sed $subsc128 complex64s_test.go > complex128s_test.go 20 | --------------------------------------------------------------------------------
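
Note (not part of the repository): the listings above show the synchronous copy and FFT calls; the asynchronous variants (CopyHtoDAsync, MemsetAsync) take an explicit cu.Stream. The sketch below illustrates how they combine. It only uses functions shown in this listing, except cu.StreamCreate(), which is assumed to exist in the cu package with the signature used here; treat the whole snippet as illustrative rather than as code shipped with these packages.

package main

import (
	"fmt"

	"github.com/barnex/cuda5/cu"
	"github.com/barnex/cuda5/safe"
)

func main() {
	safe.InitCuda()

	// Assumed stream constructor (hypothetical here, wrapping the driver API's cuStreamCreate).
	stream := cu.StreamCreate()

	a := safe.MakeFloat32s(1024)
	defer a.Free()

	host := make([]float32, 1024)
	for i := range host {
		host[i] = float32(i)
	}

	a.CopyHtoDAsync(host, stream)           // queue host-to-device copy on the stream
	a.Slice(0, 512).MemsetAsync(0, stream)  // queue a memset of the first half on the same stream
	stream.Synchronize()                    // wait until both queued operations have finished

	back := make([]float32, 1024)
	a.CopyDtoH(back) // synchronous device-to-host copy
	fmt.Println(back[510:514])
}

Because the two asynchronous calls are issued on the same stream, the memset is guaranteed to run after the copy, so back[510:514] would read [0 0 512 513] under these assumptions.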