├── .gitignore ├── Makefile ├── README.md ├── cu ├── Makefile ├── README ├── cgoflags.go ├── context.go ├── context_test.go ├── device.go ├── device_test.go ├── dim3.go ├── doc.go ├── execution.go ├── function.go ├── init.go ├── init_test.go ├── memory.go ├── memory_test.go ├── memset.go ├── module.go ├── module_test.go ├── peer.go ├── result.go ├── runtimeapi.go ├── stream.go ├── testdata │ ├── testmodule.cu │ └── testmodule.ptx ├── version.go └── version_test.go ├── cuda ├── Makefile ├── README ├── cgoflags.go └── device.go ├── cufft ├── Makefile ├── README ├── cgoflags.go ├── doc.go ├── fft_test.go ├── init_test.go ├── mode.go ├── plan.go ├── result.go └── type.go ├── curand ├── Makefile ├── README ├── cgoflags.go ├── generator.go └── status.go ├── doc.go ├── gophergpu.png └── safe ├── Makefile ├── README ├── complex128s.go ├── complex128s_test.go ├── complex64s.go ├── complex64s_test.go ├── doc.go ├── fft1d_test.go ├── fft1dc2r.go ├── fft1dr2c.go ├── fft3d_test.go ├── fft3dc2r.go ├── fft3dd2z.go ├── fft3dr2c.go ├── fft3dz2d.go ├── fftplan.go ├── float32s.go ├── float32s_test.go ├── float64s.go ├── float64s_test.go ├── init.go ├── slice.go └── subs.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.{6,8,5,o} 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 6g doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go install -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | go-optview -c -w *.go 32 | gofmt -w *.go 33 | 34 | opt: 35 | go-optview -w *.go 36 | gofmt -w *.go 37 | 38 | doc: 39 | godoc github.com/barnex/cuda5 > README 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Go bindings for CUDA 2 | 3 | Go bindings for nVIDIA CUDA 5 and later. This package compiles with both gc and gccgo. 4 | 5 | ![fig](gophergpu.png) 6 | -------------------------------------------------------------------------------- /cu/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/cu > README 34 | -------------------------------------------------------------------------------- /cu/README: -------------------------------------------------------------------------------- 1 | PACKAGE 2 | 3 | package cu 4 | import "github.com/barnex/cuda5/cu" 5 | 6 | Go bindings for the CUDA driver API. 
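A minimal end-to-end sketch (not part of the package; it assumes a CUDA-capable device at ordinal 0 and that the caller imports unsafe):

    cu.Init(0)                                  // must precede any other driver call
    dev := cu.DeviceGet(0)
    ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, dev)
    cu.CtxSetCurrent(ctx)

    host := []float32{1, 2, 3, 4}
    bytes := int64(len(host)) * cu.SIZEOF_FLOAT32
    dptr := cu.MemAlloc(bytes)                            // device memory
    cu.MemcpyHtoD(dptr, unsafe.Pointer(&host[0]), bytes)  // host -> device
    cu.MemcpyDtoH(unsafe.Pointer(&host[0]), dptr, bytes)  // device -> host
    cu.CtxSynchronize()

    dptr.Free()
    (&ctx).Destroy()

All calls used here are documented in the sections below.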
7 | 8 | CONSTANTS 9 | 10 | const ( 11 | // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 12 | CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO 13 | // Spin when waiting for results from the GPU. 14 | CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN 15 | // Yield its thread when waiting for results from the GPU. 16 | CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD 17 | // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. 18 | CTX_BLOCKING_SYNC 19 | // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 20 | CTX_MAP_HOST = C.CU_CTX_MAP_HOST 21 | //Do not reduce local memory after resizing local memory for a kernel. 22 | CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX 23 | ) 24 | Flags for CtxCreate 25 | const ( 26 | SIZEOF_FLOAT32 = 4 27 | SIZEOF_FLOAT64 = 8 28 | SIZEOF_COMPLEX64 = 8 29 | SIZEOF_COMPLEX128 = 16 30 | ) 31 | Type size in bytes 32 | 33 | 34 | FUNCTIONS 35 | 36 | func CtxDestroy(ctx *Context) 37 | Destroys the CUDA context specified by ctx. If the context usage count 38 | is not equal to 1, or the context is current to any CPU thread other 39 | than the current one, this function fails. Floating contexts (detached 40 | from a CPU thread via cuCtxPopCurrent()) may be destroyed by this 41 | function. 42 | 43 | func CtxDisablePeerAccess(peer Context) 44 | Reverses CtxEnablePeerAccess(). 45 | 46 | func CtxEnablePeerAccess(peer Context) 47 | Make allocations from the peer Context available to the current context. 48 | 49 | func CtxGetApiVersion(ctx Context) (version int) 50 | Returns the API version to create the context. 51 | 52 | func CtxSetCurrent(ctx Context) 53 | Sets the current active context. 54 | 55 | func CtxSynchronize() 56 | Blocks until the device has completed all preceding requested tasks, if 57 | the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. 58 | 59 | func DeviceCanAccessPeer(dev, peer Device) bool 60 | Returns true if CtxEnablePeerAccess can be called on a context for dev 61 | and peerDev. 62 | 63 | func DeviceComputeCapability(device Device) (major, minor int) 64 | Returns the compute capability of the device. 65 | 66 | func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int 67 | Gets the value of a device attribute. 68 | 69 | func DeviceGetCount() int 70 | Returns the number of devices with compute capability greater than or 71 | equal to 1.0 that are available for execution. 72 | 73 | func DeviceGetName(dev Device) string 74 | Gets the name of the device. 75 | 76 | func DeviceTotalMem(device Device) int64 77 | Returns the total amount of memory available on the device in bytes. 78 | 79 | func FuncGetAttribute(attrib FunctionAttribute, function Function) int 80 | 81 | func Init(flags int) 82 | Initialize the CUDA driver API. Currently, flags must be 0. If Init() 83 | has not been called, any function from the driver API will panic with 84 | ERROR_NOT_INITIALIZED. 85 | 86 | func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) 87 | 88 | func MemAllocHost(bytes int64) unsafe.Pointer 89 | 90 | func MemFree(ptr *DevicePtr) 91 | Frees device memory allocated by MemAlloc(). Overwrites the pointer with 92 | NULL. It is safe to double-free. 
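For LaunchKernel (documented above), each entry of kernelParams must point at the value of one kernel argument. A hedged sketch, assuming a hypothetical kernel __global__ void scale(float *dst, float factor, int n) compiled to a hypothetical kernels.ptx:

    mod := cu.ModuleLoad("kernels.ptx") // hypothetical PTX file
    f := mod.GetFunction("scale")       // hypothetical kernel name
    n := int32(1024)
    dst := cu.MemAlloc(int64(n) * cu.SIZEOF_FLOAT32)
    factor := float32(2)
    args := []unsafe.Pointer{
        unsafe.Pointer(&dst),    // float*: pass the address of the DevicePtr
        unsafe.Pointer(&factor), // float
        unsafe.Pointer(&n),      // int
    }
    block := 128
    grid := (int(n) + block - 1) / block
    str := cu.StreamCreate()
    cu.LaunchKernel(f, grid, 1, 1, block, 1, 1, 0, str, args)
    str.Synchronize()

Module, Function and Stream are documented under TYPES below.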
93 | 94 | func MemFreeHost(ptr unsafe.Pointer) 95 | 96 | func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) 97 | Returns the base address and size of the allocation (by MemAlloc) that 98 | contains the input pointer ptr. 99 | 100 | func MemGetInfo() (free, total int64) 101 | Returns the free and total amount of memroy in the current Context (in 102 | bytes). 103 | 104 | func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) 105 | Page-locks memory specified by the pointer and bytes. The pointer and 106 | byte size must be aligned to the host page size (4KB) See also: 107 | MemHostUnregister() 108 | 109 | func MemHostUnregister(ptr unsafe.Pointer) 110 | Unmaps memory locked by MemHostRegister(). 111 | 112 | func Memcpy(dst, src DevicePtr, bytes int64) 113 | Copies a number of bytes on the current device. Requires unified 114 | addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually 115 | an auto copy for device and/or host memory 116 | 117 | func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) 118 | Asynchronously copies a number of bytes on the current device. 119 | 120 | func MemcpyDtoD(dst, src DevicePtr, bytes int64) 121 | Copies a number of bytes from host to device. 122 | 123 | func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) 124 | Asynchronously copies a number of bytes from host to device. 125 | 126 | func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) 127 | Copies a number of bytes from device to host. 128 | 129 | func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) 130 | Asynchronously copies a number of bytes device host to host. The host 131 | memory must be page-locked (see MemRegister) 132 | 133 | func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) 134 | Copies a number of bytes from host to device. 135 | 136 | func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) 137 | Asynchronously copies a number of bytes from host to device. The host 138 | memory must be page-locked (see MemRegister) 139 | 140 | func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) 141 | Copies from device memory in one context (device) to another. 142 | 143 | func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) 144 | Asynchronously copies from device memory in one context (device) to 145 | another. 146 | 147 | func MemsetD32(deviceptr DevicePtr, value uint32, N int64) 148 | Sets the first N 32-bit values of dst array to value. Asynchronous. 149 | 150 | func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) 151 | Asynchronously sets the first N 32-bit values of dst array to value. 152 | 153 | func MemsetD8(deviceptr DevicePtr, value uint8, N int64) 154 | Sets the first N 8-bit values of dst array to value. Asynchronous. 155 | 156 | func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) 157 | Asynchronously sets the first N 32-bit values of dst array to value. 158 | 159 | func StreamDestroy(stream *Stream) 160 | Destroys an asynchronous stream 161 | 162 | func StreamSynchronize(stream Stream) 163 | Blocks until the stream has completed. 164 | 165 | func Version() int 166 | Returns the CUDA driver version. 167 | 168 | 169 | TYPES 170 | 171 | type Context uintptr 172 | CUDA context. 173 | 174 | func CtxCreate(flags uint, dev Device) Context 175 | Create a CUDA context. 
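The peer-access functions (DeviceCanAccessPeer, CtxEnablePeerAccess, MemcpyPeer) come together as in the following hedged sketch, which assumes two devices (ordinals 0 and 1) are installed:

    bytes := int64(1 << 20)
    ctx0 := cu.CtxCreate(cu.CTX_SCHED_AUTO, cu.DeviceGet(0))
    ctx1 := cu.CtxCreate(cu.CTX_SCHED_AUTO, cu.DeviceGet(1))

    cu.CtxSetCurrent(ctx0)
    src := cu.MemAlloc(bytes) // lives in ctx0 (device 0)
    cu.CtxSetCurrent(ctx1)
    dst := cu.MemAlloc(bytes) // lives in ctx1 (device 1)

    if cu.DeviceCanAccessPeer(cu.DeviceGet(1), cu.DeviceGet(0)) {
        ctx0.EnablePeerAccess() // let the current context (ctx1) access ctx0's memory
    }
    cu.MemcpyPeer(dst, ctx1, src, ctx0, bytes) // copy across devices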
176 | 177 | func CtxGetCurrent() Context 178 | Gets the current active context. 179 | 180 | func (ctx Context) ApiVersion() (version int) 181 | Returns the API version to create the context. 182 | 183 | func (ctx *Context) Destroy() 184 | Destroys the CUDA context. 185 | 186 | func (peer Context) DisablePeerAccess() 187 | Reverses EnablePeerAccess(). 188 | 189 | func (peer Context) EnablePeerAccess() 190 | Make allocations from the peer Context available to the current context. 191 | 192 | func (ctx Context) SetCurrent() 193 | Sets the current active context. 194 | 195 | type DevProp struct { 196 | MaxThreadsPerBlock int 197 | MaxThreadsDim [3]int 198 | MaxGridSize [3]int 199 | SharedMemPerBlock int 200 | TotalConstantMemory int 201 | SIMDWidth int 202 | MemPitch int 203 | RegsPerBlock int 204 | ClockRate int 205 | TextureAlign int 206 | } 207 | Device properties 208 | 209 | func DeviceGetProperties(dev Device) (prop DevProp) 210 | Returns the device's properties. 211 | 212 | type Device int 213 | CUDA Device number. 214 | 215 | func CtxGetDevice() Device 216 | Returns the ordinal of the current context's device. 217 | 218 | func DeviceGet(ordinal int) Device 219 | Returns in a device handle given an ordinal in the range [0, 220 | DeviceGetCount()-1]. 221 | 222 | func (dev Device) Attribute(attrib DeviceAttribute) int 223 | Gets the value of a device attribute. 224 | 225 | func (dev Device) CanAccessPeer(peer Device) bool 226 | Returns true if CtxEnablePeerAccess can be called on a context for dev 227 | and peerDev. 228 | 229 | func (device Device) ComputeCapability() (major, minor int) 230 | Returns the compute capability of the device. 231 | 232 | func (dev Device) Name() string 233 | Gets the name of the device. 234 | 235 | func (dev Device) Properties() DevProp 236 | Returns the device's properties. 237 | 238 | func (device Device) TotalMem() int64 239 | Returns the total amount of memory available on the device in bytes. 
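A short sketch that enumerates the installed devices and prints a few of the attributes listed below (it mirrors cu/device_test.go and assumes fmt is imported):

    for i := 0; i < cu.DeviceGetCount(); i++ {
        dev := cu.DeviceGet(i)
        major, minor := dev.ComputeCapability()
        fmt.Printf("%s: compute capability %d.%d, %d MB, %d multiprocessors\n",
            dev.Name(), major, minor,
            dev.TotalMem()/(1024*1024),
            dev.Attribute(cu.MULTIPROCESSOR_COUNT))
    }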
240 | 241 | type DeviceAttribute int 242 | 243 | const ( 244 | MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block 245 | MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X 246 | MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y 247 | MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z 248 | MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X 249 | MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y 250 | MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z 251 | MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes 252 | TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes 253 | WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads 254 | MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies 255 | MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block 256 | CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz 257 | TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures 258 | MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device 259 | KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels 260 | INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 261 | CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space 262 | COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) 263 | MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width 264 | MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width 265 | MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height 266 | MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width 267 | MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height 268 | MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth 269 | MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width 270 | MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height 271 | MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered 
texture 272 | SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces 273 | CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently 274 | ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled 275 | PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device 276 | PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device 277 | TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model 278 | MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz 279 | GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits 280 | L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes 281 | MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor 282 | ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines 283 | UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host 284 | MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width 285 | MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture 286 | ) 287 | 288 | type DevicePtr uintptr 289 | 290 | func MemAlloc(bytes int64) DevicePtr 291 | Allocates a number of bytes of device memory. 292 | 293 | func (ptr DevicePtr) Bytes() (bytes int64) 294 | Returns the size of the allocation (by MemAlloc) that contains the input 295 | pointer ptr. 296 | 297 | func (ptr *DevicePtr) Free() 298 | Frees device memory allocated by MemAlloc(). Overwrites the pointer with 299 | NULL. It is safe to double-free. 300 | 301 | func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) 302 | Returns the base address and size of the allocation (by MemAlloc) that 303 | contains the input pointer ptr. 304 | 305 | func (ptr DevicePtr) MemoryType() MemoryType 306 | Returns the physical memory type that ptr addresses. 307 | 308 | func (p DevicePtr) String() string 309 | 310 | type Dim3 struct { 311 | X, Y, Z int 312 | } 313 | 314 | type Function uintptr 315 | Represents a CUDA CUfunction, a reference to a function within a module. 316 | 317 | func ModuleGetFunction(module Module, name string) Function 318 | Returns a Function handle. 319 | 320 | func (f Function) GetAttribute(attrib FunctionAttribute) int 321 | 322 | type FunctionAttribute int 323 | 324 | const ( 325 | FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. 326 | FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. 327 | FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. 
328 | FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. 329 | FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. 330 | FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. 331 | FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. 332 | ) 333 | 334 | type MemHostRegisterFlag int 335 | 336 | const ( 337 | // Memory is pinned in all CUDA contexts. 338 | MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE 339 | // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() 340 | MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP 341 | ) 342 | Flag for MemHostRegister 343 | 344 | type MemoryType uint 345 | 346 | const ( 347 | MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST 348 | MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE 349 | MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY 350 | MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED 351 | ) 352 | 353 | func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) 354 | Returns the physical memory type that ptr addresses. 355 | 356 | func (t MemoryType) String() string 357 | 358 | type Module uintptr 359 | Represents a CUDA CUmodule, a reference to executable device code. 360 | 361 | func ModuleLoad(fname string) Module 362 | Loads a compute module from file 363 | 364 | func ModuleLoadData(image string) Module 365 | Loads a compute module from string 366 | 367 | func (m Module) GetFunction(name string) Function 368 | Returns a Function handle. 369 | 370 | type Result int 371 | CUDA error status. CUDA error statuses are not returned by functions but 372 | checked and passed to panic() when not successful. If desired, they can 373 | be caught by recover(). 
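A sketch of turning such a panic back into an error value with recover(), for callers that prefer explicit error handling (tryAlloc is a hypothetical helper, not part of the package):

    func tryAlloc(bytes int64) (ptr cu.DevicePtr, err cu.Result) {
        defer func() {
            if e := recover(); e != nil {
                err = e.(cu.Result) // panics from this package carry a Result value
            }
        }()
        return cu.MemAlloc(bytes), cu.SUCCESS
    }

A failed allocation then reports, e.g., ERROR_OUT_OF_MEMORY instead of unwinding the whole program.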
374 | 375 | const ( 376 | SUCCESS Result = C.CUDA_SUCCESS 377 | ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE 378 | ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY 379 | ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED 380 | ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED 381 | ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED 382 | ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED 383 | ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED 384 | ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED 385 | ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE 386 | ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE 387 | ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE 388 | ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT 389 | ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT 390 | ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED 391 | ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED 392 | ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED 393 | ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED 394 | ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU 395 | ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED 396 | ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED 397 | ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY 398 | ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER 399 | ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 400 | ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT 401 | ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE 402 | ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE 403 | ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND 404 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND 405 | ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 406 | ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM 407 | ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE 408 | ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND 409 | ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY 410 | ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED 411 | ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES 412 | ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT 413 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING 414 | ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED 415 | ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED 416 | ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE 417 | ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED 418 | ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT 419 | ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS 420 | ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED 421 | ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED 422 | ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN 423 | ) 424 | 425 | func StreamQuery(stream Stream) Result 426 | Returns Success if all operations have completed, ErrorNotReady 427 | otherwise 428 | 429 | func (err Result) String() string 430 | Message string for the error 431 | 432 | type Stream 
uintptr 433 | CUDA stream. 434 | 435 | func StreamCreate() Stream 436 | Creates an asynchronous stream 437 | 438 | func (stream *Stream) Destroy() 439 | Destroys the asynchronous stream 440 | 441 | func (stream Stream) Query() Result 442 | Returns Success if all operations have completed, ErrorNotReady 443 | otherwise 444 | 445 | func (stream Stream) Synchronize() 446 | Blocks until the stream has completed. 447 | 448 | 449 | -------------------------------------------------------------------------------- /cu/cgoflags.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file provides CGO flags to find CUDA libraries and headers. 4 | 5 | //#cgo LDFLAGS:-lcuda -lcudart 6 | // 7 | ////default location: 8 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 9 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 10 | // 11 | ////default location if not properly symlinked: 12 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 13 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 14 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 15 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 16 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 17 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 18 | // 19 | ////arch linux: 20 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 21 | //#cgo CFLAGS: -I/opt/cuda/include 22 | // 23 | ////WINDOWS: 24 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 25 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include 26 | import "C" 27 | -------------------------------------------------------------------------------- /cu/context.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver context management 4 | 5 | //#include 6 | import "C" 7 | import "unsafe" 8 | 9 | // CUDA context. 10 | type Context uintptr 11 | 12 | // Create a CUDA context. 13 | func CtxCreate(flags uint, dev Device) Context { 14 | var ctx C.CUcontext 15 | err := Result(C.cuCtxCreate(&ctx, C.uint(flags), C.CUdevice(dev))) 16 | if err != SUCCESS { 17 | panic(err) 18 | } 19 | return Context(uintptr(unsafe.Pointer(ctx))) 20 | } 21 | 22 | //Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. 23 | func CtxDestroy(ctx *Context) { 24 | err := Result(C.cuCtxDestroy(C.CUcontext(unsafe.Pointer(uintptr(*ctx))))) 25 | *ctx = 0 26 | if err != SUCCESS { 27 | panic(err) 28 | } 29 | } 30 | 31 | //Destroys the CUDA context. 32 | func (ctx *Context) Destroy() { 33 | CtxDestroy(ctx) 34 | } 35 | 36 | // Returns the API version to create the context. 37 | func CtxGetApiVersion(ctx Context) (version int) { 38 | var cversion C.uint 39 | err := Result(C.cuCtxGetApiVersion(C.CUcontext(unsafe.Pointer(uintptr(ctx))), &cversion)) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | version = int(cversion) 44 | return 45 | } 46 | 47 | // Returns the API version to create the context. 48 | func (ctx Context) ApiVersion() (version int) { 49 | return CtxGetApiVersion(ctx) 50 | } 51 | 52 | // Gets the current active context. 
53 | func CtxGetCurrent() Context { 54 | var ctx C.CUcontext 55 | err := Result(C.cuCtxGetCurrent(&ctx)) 56 | if err != SUCCESS { 57 | panic(err) 58 | } 59 | return Context(uintptr(unsafe.Pointer(ctx))) 60 | } 61 | 62 | // Returns the ordinal of the current context's device. 63 | func CtxGetDevice() Device { 64 | var dev C.CUdevice 65 | err := Result(C.cuCtxGetDevice(&dev)) 66 | if err != SUCCESS { 67 | panic(err) 68 | } 69 | return Device(dev) 70 | } 71 | 72 | // Sets the current active context. 73 | func CtxSetCurrent(ctx Context) { 74 | err := Result(C.cuCtxSetCurrent(C.CUcontext(unsafe.Pointer(uintptr(ctx))))) 75 | if err != SUCCESS { 76 | panic(err) 77 | } 78 | } 79 | 80 | // Sets the current active context. 81 | func (ctx Context) SetCurrent() { 82 | CtxSetCurrent(ctx) 83 | } 84 | 85 | // Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. 86 | func CtxSynchronize() { 87 | err := Result(C.cuCtxSynchronize()) 88 | if err != SUCCESS { 89 | panic(err) 90 | } 91 | } 92 | 93 | // Flags for CtxCreate 94 | const ( 95 | // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 96 | CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO 97 | // Spin when waiting for results from the GPU. 98 | CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN 99 | // Yield its thread when waiting for results from the GPU. 100 | CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD 101 | // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. 102 | CTX_BLOCKING_SYNC 103 | // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 104 | CTX_MAP_HOST = C.CU_CTX_MAP_HOST 105 | //Do not reduce local memory after resizing local memory for a kernel. 106 | CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX 107 | ) 108 | -------------------------------------------------------------------------------- /cu/context_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestContext(t *testing.T) { 9 | fmt.Println("CtxCreate") 10 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 11 | fmt.Println("CtxSetCurrent") 12 | CtxSetCurrent(ctx) 13 | fmt.Println("CtxGetApiVersion:", ctx.ApiVersion()) 14 | fmt.Println("CtxGetDevice:", CtxGetDevice()) 15 | (&ctx).Destroy() 16 | } 17 | 18 | func BenchmarkGetContext(b *testing.B) { 19 | b.StopTimer() 20 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 21 | CtxSetCurrent(ctx) 22 | b.StartTimer() 23 | for i := 0; i < b.N; i++ { 24 | CtxGetCurrent() 25 | } 26 | } 27 | 28 | func BenchmarkSetContext(b *testing.B) { 29 | b.StopTimer() 30 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 31 | b.StartTimer() 32 | for i := 0; i < b.N; i++ { 33 | ctx.SetCurrent() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /cu/device.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver device management 4 | 5 | //#include 6 | import "C" 7 | 8 | import () 9 | 10 | // CUDA Device number. 11 | type Device int 12 | 13 | // Returns the compute capability of the device. 
14 | func DeviceComputeCapability(device Device) (major, minor int) { 15 | var maj, min C.int 16 | err := Result(C.cuDeviceComputeCapability(&maj, &min, C.CUdevice(device))) 17 | if err != SUCCESS { 18 | panic(err) 19 | } 20 | major = int(maj) 21 | minor = int(min) 22 | return 23 | } 24 | 25 | // Returns the compute capability of the device. 26 | func (device Device) ComputeCapability() (major, minor int) { 27 | return DeviceComputeCapability(device) 28 | } 29 | 30 | // Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. 31 | func DeviceGet(ordinal int) Device { 32 | var device C.CUdevice 33 | err := Result(C.cuDeviceGet(&device, C.int(ordinal))) 34 | if err != SUCCESS { 35 | panic(err) 36 | } 37 | return Device(device) 38 | } 39 | 40 | // Gets the value of a device attribute. 41 | func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int { 42 | var attr C.int 43 | err := Result(C.cuDeviceGetAttribute(&attr, C.CUdevice_attribute(attrib), C.CUdevice(dev))) 44 | if err != SUCCESS { 45 | panic(err) 46 | } 47 | return int(attr) 48 | } 49 | 50 | // Gets the value of a device attribute. 51 | func (dev Device) Attribute(attrib DeviceAttribute) int { 52 | return DeviceGetAttribute(attrib, dev) 53 | } 54 | 55 | // Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. 56 | func DeviceGetCount() int { 57 | var count C.int 58 | err := Result(C.cuDeviceGetCount(&count)) 59 | if err != SUCCESS { 60 | panic(err) 61 | } 62 | return int(count) 63 | } 64 | 65 | // Gets the name of the device. 66 | func DeviceGetName(dev Device) string { 67 | size := 256 68 | buf := make([]byte, size) 69 | cstr := C.CString(string(buf)) 70 | err := Result(C.cuDeviceGetName(cstr, C.int(size), C.CUdevice(dev))) 71 | if err != SUCCESS { 72 | panic(err) 73 | } 74 | return C.GoString(cstr) 75 | } 76 | 77 | // Gets the name of the device. 78 | func (dev Device) Name() string { 79 | return DeviceGetName(dev) 80 | } 81 | 82 | // Device properties 83 | type DevProp struct { 84 | MaxThreadsPerBlock int 85 | MaxThreadsDim [3]int 86 | MaxGridSize [3]int 87 | SharedMemPerBlock int 88 | TotalConstantMemory int 89 | SIMDWidth int 90 | MemPitch int 91 | RegsPerBlock int 92 | ClockRate int 93 | TextureAlign int 94 | } 95 | 96 | // Returns the device's properties. 97 | func DeviceGetProperties(dev Device) (prop DevProp) { 98 | var cprop C.CUdevprop 99 | err := Result(C.cuDeviceGetProperties(&cprop, C.CUdevice(dev))) 100 | if err != SUCCESS { 101 | panic(err) 102 | } 103 | prop.MaxThreadsPerBlock = int(cprop.maxThreadsPerBlock) 104 | prop.MaxThreadsDim[0] = int(cprop.maxThreadsDim[0]) 105 | prop.MaxThreadsDim[1] = int(cprop.maxThreadsDim[1]) 106 | prop.MaxThreadsDim[2] = int(cprop.maxThreadsDim[2]) 107 | prop.MaxGridSize[0] = int(cprop.maxGridSize[0]) 108 | prop.MaxGridSize[1] = int(cprop.maxGridSize[1]) 109 | prop.MaxGridSize[2] = int(cprop.maxGridSize[2]) 110 | prop.SharedMemPerBlock = int(cprop.sharedMemPerBlock) 111 | prop.TotalConstantMemory = int(cprop.totalConstantMemory) 112 | prop.SIMDWidth = int(cprop.SIMDWidth) 113 | prop.MemPitch = int(cprop.memPitch) 114 | prop.RegsPerBlock = int(cprop.regsPerBlock) 115 | prop.ClockRate = int(cprop.clockRate) 116 | prop.TextureAlign = int(cprop.textureAlign) 117 | return 118 | } 119 | 120 | // Returns the device's properties. 121 | func (dev Device) Properties() DevProp { 122 | return DeviceGetProperties(dev) 123 | } 124 | 125 | // Returns the total amount of memory available on the device in bytes. 
126 | func (device Device) TotalMem() int64 { 127 | return DeviceTotalMem(device) 128 | } 129 | 130 | // Returns the total amount of memory available on the device in bytes. 131 | func DeviceTotalMem(device Device) int64 { 132 | var bytes C.size_t 133 | err := Result(C.cuDeviceTotalMem(&bytes, C.CUdevice(device))) 134 | if err != SUCCESS { 135 | panic(err) 136 | } 137 | return int64(bytes) 138 | } 139 | 140 | type DeviceAttribute int 141 | 142 | const ( 143 | MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block 144 | MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X 145 | MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y 146 | MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z 147 | MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X 148 | MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y 149 | MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z 150 | MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes 151 | TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes 152 | WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads 153 | MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies 154 | MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block 155 | CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz 156 | TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures 157 | MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device 158 | KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels 159 | INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 160 | CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space 161 | COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) 162 | MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width 163 | MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width 164 | MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height 165 | MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width 166 | MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height 167 | MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth 168 | MAXIMUM_TEXTURE2D_LAYERED_WIDTH 
DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width 169 | MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height 170 | MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture 171 | SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces 172 | CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently 173 | ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled 174 | PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device 175 | PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device 176 | TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model 177 | MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz 178 | GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits 179 | L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes 180 | MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor 181 | ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines 182 | UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host 183 | MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width 184 | MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture 185 | ) 186 | -------------------------------------------------------------------------------- /cu/device_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestDevice(t *testing.T) { 9 | fmt.Println("DeviceGetCount:", DeviceGetCount()) 10 | for i := 0; i < DeviceGetCount(); i++ { 11 | fmt.Println("DeviceGet", i) 12 | dev := DeviceGet(i) 13 | major, minor := dev.ComputeCapability() 14 | fmt.Println("Name: ", dev.Name()) 15 | fmt.Println("ComputeCapability: ", major, minor) 16 | fmt.Println("TotalMem: ", dev.TotalMem()) 17 | 18 | fmt.Println("ATTRIBUTE_MAX_THREADS_PER_BLOCK :", dev.Attribute(MAX_THREADS_PER_BLOCK)) 19 | fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_X :", dev.Attribute(MAX_BLOCK_DIM_X)) 20 | fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Y :", dev.Attribute(MAX_BLOCK_DIM_Y)) 21 | fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Z :", dev.Attribute(MAX_BLOCK_DIM_Z)) 22 | fmt.Println("ATTRIBUTE_MAX_GRID_DIM_X :", dev.Attribute(MAX_GRID_DIM_X)) 23 | fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Y :", dev.Attribute(MAX_GRID_DIM_Y)) 24 | fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Z :", dev.Attribute(MAX_GRID_DIM_Z)) 25 | fmt.Println("ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK :", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK)) 26 | fmt.Println("ATTRIBUTE_TOTAL_CONSTANT_MEMORY :", 
dev.Attribute(TOTAL_CONSTANT_MEMORY)) 27 | fmt.Println("ATTRIBUTE_WARP_SIZE :", dev.Attribute(WARP_SIZE)) 28 | fmt.Println("ATTRIBUTE_MAX_PITCH :", dev.Attribute(MAX_PITCH)) 29 | fmt.Println("ATTRIBUTE_MAX_REGISTERS_PER_BLOCK :", dev.Attribute(MAX_REGISTERS_PER_BLOCK)) 30 | fmt.Println("ATTRIBUTE_CLOCK_RATE :", dev.Attribute(CLOCK_RATE)) 31 | fmt.Println("ATTRIBUTE_TEXTURE_ALIGNMENT :", dev.Attribute(TEXTURE_ALIGNMENT)) 32 | fmt.Println("ATTRIBUTE_MULTIPROCESSOR_COUNT :", dev.Attribute(MULTIPROCESSOR_COUNT)) 33 | fmt.Println("ATTRIBUTE_KERNEL_EXEC_TIMEOUT :", dev.Attribute(KERNEL_EXEC_TIMEOUT)) 34 | fmt.Println("ATTRIBUTE_INTEGRATED :", dev.Attribute(INTEGRATED)) 35 | fmt.Println("ATTRIBUTE_CAN_MAP_HOST_MEMORY :", dev.Attribute(CAN_MAP_HOST_MEMORY)) 36 | fmt.Println("ATTRIBUTE_COMPUTE_MODE :", dev.Attribute(COMPUTE_MODE)) 37 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH)) 38 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH)) 39 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT)) 40 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH)) 41 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT)) 42 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH :", dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH)) 43 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH)) 44 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT)) 45 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS)) 46 | fmt.Println("ATTRIBUTE_SURFACE_ALIGNMENT :", dev.Attribute(SURFACE_ALIGNMENT)) 47 | fmt.Println("ATTRIBUTE_CONCURRENT_KERNELS :", dev.Attribute(CONCURRENT_KERNELS)) 48 | fmt.Println("ATTRIBUTE_ECC_ENABLED :", dev.Attribute(ECC_ENABLED)) 49 | fmt.Println("ATTRIBUTE_PCI_BUS_ID :", dev.Attribute(PCI_BUS_ID)) 50 | fmt.Println("ATTRIBUTE_PCI_DEVICE_ID :", dev.Attribute(PCI_DEVICE_ID)) 51 | fmt.Println("ATTRIBUTE_TCC_DRIVER :", dev.Attribute(TCC_DRIVER)) 52 | fmt.Println("ATTRIBUTE_MEMORY_CLOCK_RATE :", dev.Attribute(MEMORY_CLOCK_RATE)) 53 | fmt.Println("ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH :", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH)) 54 | fmt.Println("ATTRIBUTE_L2_CACHE_SIZE :", dev.Attribute(L2_CACHE_SIZE)) 55 | fmt.Println("ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR :", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR)) 56 | fmt.Println("ATTRIBUTE_ASYNC_ENGINE_COUNT :", dev.Attribute(ASYNC_ENGINE_COUNT)) 57 | fmt.Println("ATTRIBUTE_UNIFIED_ADDRESSING :", dev.Attribute(UNIFIED_ADDRESSING)) 58 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH)) 59 | fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS)) 60 | 61 | fmt.Printf("Properties:%#v\n", dev.Properties()) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /cu/dim3.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | type Dim3 struct { 4 | X, Y, Z int 5 | } 6 | -------------------------------------------------------------------------------- /cu/doc.go: -------------------------------------------------------------------------------- 1 | // Go bindings for the CUDA driver API. 
2 | package cu 3 | -------------------------------------------------------------------------------- /cu/execution.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements execution of CUDA kernels 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | const pointerSize = 8 // sorry, 64 bits only. 13 | 14 | func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) { 15 | 16 | // Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer, 17 | // so we copy the argument values go C memory first. 18 | argv := C.malloc(C.size_t(len(kernelParams) * pointerSize)) 19 | argp := C.malloc(C.size_t(len(kernelParams) * pointerSize)) 20 | defer C.free(argv) 21 | defer C.free(argp) 22 | for i := range kernelParams { 23 | *((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i) // argp[i] = &argv[i] 24 | *((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i] 25 | } 26 | 27 | err := Result(C.cuLaunchKernel( 28 | C.CUfunction(unsafe.Pointer(uintptr(f))), 29 | C.uint(gridDimX), 30 | C.uint(gridDimY), 31 | C.uint(gridDimZ), 32 | C.uint(blockDimX), 33 | C.uint(blockDimY), 34 | C.uint(blockDimZ), 35 | C.uint(sharedMemBytes), 36 | C.CUstream(unsafe.Pointer(uintptr(stream))), 37 | (*unsafe.Pointer)(argp), 38 | (*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) 39 | if err != SUCCESS { 40 | panic(err) 41 | } 42 | } 43 | 44 | func offset(ptr unsafe.Pointer, i int) unsafe.Pointer { 45 | return unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i)) 46 | } 47 | -------------------------------------------------------------------------------- /cu/function.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements manipulations on CUDA functions 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Represents a CUDA CUfunction, a reference to a function within a module. 13 | type Function uintptr 14 | 15 | func FuncGetAttribute(attrib FunctionAttribute, function Function) int { 16 | var attr C.int 17 | err := Result(C.cuFuncGetAttribute(&attr, C.CUfunction_attribute(attrib), C.CUfunction(unsafe.Pointer(uintptr(function))))) 18 | if err != SUCCESS { 19 | panic(err) 20 | } 21 | return int(attr) 22 | } 23 | 24 | func (f Function) GetAttribute(attrib FunctionAttribute) int { 25 | return FuncGetAttribute(attrib, f) 26 | } 27 | 28 | type FunctionAttribute int 29 | 30 | const ( 31 | FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. 32 | FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. 33 | FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. 34 | FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. 35 | FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. 
36 | FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. 37 | FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. 38 | ) 39 | -------------------------------------------------------------------------------- /cu/init.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver initialization 4 | 5 | //#include 6 | import "C" 7 | 8 | // Initialize the CUDA driver API. 9 | // Currently, flags must be 0. 10 | // If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 11 | func Init(flags int) { 12 | err := Result(C.cuInit(C.uint(flags))) 13 | if err != SUCCESS { 14 | panic(err) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /cu/init_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | // needed for all other tests. 8 | func init() { 9 | Init(0) 10 | ctx := CtxCreate(CTX_SCHED_AUTO, 0) 11 | CtxSetCurrent(ctx) 12 | fmt.Println("Created CUDA context") 13 | } 14 | -------------------------------------------------------------------------------- /cu/memory.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA memory management on the driver level 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type DevicePtr uintptr 14 | 15 | // Allocates a number of bytes of device memory. 16 | func MemAlloc(bytes int64) DevicePtr { 17 | var devptr C.CUdeviceptr 18 | err := Result(C.cuMemAlloc(&devptr, C.size_t(bytes))) 19 | if err != SUCCESS { 20 | panic(err) 21 | } 22 | return DevicePtr(devptr) 23 | } 24 | 25 | // Frees device memory allocated by MemAlloc(). 26 | // It is safe to double-free. 27 | func MemFree(p DevicePtr) { 28 | if p == DevicePtr(uintptr(0)) { 29 | return // Allready freed 30 | } 31 | err := Result(C.cuMemFree(C.CUdeviceptr(p))) 32 | if err != SUCCESS { 33 | panic(err) 34 | } 35 | } 36 | 37 | // Frees device memory allocated by MemAlloc(). 38 | // Overwrites the pointer with NULL. 39 | // It is safe to double-free. 40 | func (ptr DevicePtr) Free() { 41 | MemFree(ptr) 42 | } 43 | 44 | // Copies a number of bytes on the current device. 45 | // Requires unified addressing to be supported. 46 | // See also: MemcpyDtoD(). 47 | func Memcpy(dst, src DevicePtr, bytes int64) { 48 | err := Result(C.cuMemcpy(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) 49 | if err != SUCCESS { 50 | panic(err) 51 | } 52 | } 53 | 54 | // Asynchronously copies a number of bytes on the current device. 55 | func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) { 56 | err := Result(C.cuMemcpyAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 57 | if err != SUCCESS { 58 | panic(err) 59 | } 60 | } 61 | 62 | // Copies a number of bytes from host to device. 63 | func MemcpyDtoD(dst, src DevicePtr, bytes int64) { 64 | err := Result(C.cuMemcpyDtoD(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) 65 | if err != SUCCESS { 66 | panic(err) 67 | } 68 | } 69 | 70 | // Asynchronously copies a number of bytes from host to device. 
71 | func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) { 72 | err := Result(C.cuMemcpyDtoDAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 73 | if err != SUCCESS { 74 | panic(err) 75 | } 76 | } 77 | 78 | // Copies a number of bytes from host to device. 79 | func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) { 80 | err := Result(C.cuMemcpyHtoD(C.CUdeviceptr(dst), src, C.size_t(bytes))) 81 | if err != SUCCESS { 82 | panic(err) 83 | } 84 | } 85 | 86 | // Asynchronously copies a number of bytes from host to device. 87 | // The host memory must be page-locked (see MemRegister) 88 | func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) { 89 | err := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 90 | if err != SUCCESS { 91 | panic(err) 92 | } 93 | } 94 | 95 | // Copies a number of bytes from device to host. 96 | func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) { 97 | err := Result(C.cuMemcpyDtoH(dst, C.CUdeviceptr(src), C.size_t(bytes))) 98 | if err != SUCCESS { 99 | panic(err) 100 | } 101 | } 102 | 103 | // Asynchronously copies a number of bytes device host to host. 104 | // The host memory must be page-locked (see MemRegister) 105 | func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) { 106 | err := Result(C.cuMemcpyDtoHAsync(dst, C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 107 | if err != SUCCESS { 108 | panic(err) 109 | } 110 | } 111 | 112 | // Copies from device memory in one context (device) to another. 113 | func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) { 114 | err := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes))) 115 | if err != SUCCESS { 116 | panic(err) 117 | } 118 | } 119 | 120 | // Asynchronously copies from device memory in one context (device) to another. 121 | func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) { 122 | err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) 123 | if err != SUCCESS { 124 | panic(err) 125 | } 126 | } 127 | 128 | // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. 129 | func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) { 130 | var cbytes C.size_t 131 | var cptr C.CUdeviceptr 132 | err := Result(C.cuMemGetAddressRange(&cptr, &cbytes, C.CUdeviceptr(ptr))) 133 | if err != SUCCESS { 134 | panic(err) 135 | } 136 | bytes = int64(cbytes) 137 | base = DevicePtr(cptr) 138 | return 139 | } 140 | 141 | // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. 142 | func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) { 143 | return MemGetAddressRange(ptr) 144 | } 145 | 146 | // Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. 
147 | func (ptr DevicePtr) Bytes() (bytes int64) { 148 | bytes, _ = MemGetAddressRange(ptr) 149 | return 150 | } 151 | 152 | // Returns the free and total amount of memroy in the current Context (in bytes). 153 | func MemGetInfo() (free, total int64) { 154 | var cfree, ctotal C.size_t 155 | err := Result(C.cuMemGetInfo(&cfree, &ctotal)) 156 | if err != SUCCESS { 157 | panic(err) 158 | } 159 | free = int64(cfree) 160 | total = int64(ctotal) 161 | return 162 | } 163 | 164 | // Page-locks memory specified by the pointer and bytes. 165 | // The pointer and byte size must be aligned to the host page size (4KB) 166 | // See also: MemHostUnregister() 167 | // doesn't link with cuda6.5 168 | //func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) { 169 | // err := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags))) 170 | // if err != SUCCESS { 171 | // panic(err) 172 | // } 173 | //} 174 | 175 | // Unmaps memory locked by MemHostRegister(). 176 | // doesn't link with cuda6.5 177 | //func MemHostUnregister(ptr unsafe.Pointer) { 178 | // err := Result(C.cuMemHostUnregister(ptr)) 179 | // if err != SUCCESS { 180 | // panic(err) 181 | // } 182 | //} 183 | 184 | func MemAllocHost(bytes int64) unsafe.Pointer { 185 | var p unsafe.Pointer 186 | err := Result(C.cuMemAllocHost(&p, C.size_t(bytes))) 187 | if err != SUCCESS { 188 | panic(err) 189 | } 190 | return p 191 | } 192 | 193 | func MemFreeHost(ptr unsafe.Pointer) { 194 | err := Result(C.cuMemFreeHost(ptr)) 195 | if err != SUCCESS { 196 | panic(err) 197 | } 198 | } 199 | 200 | type MemHostRegisterFlag int 201 | 202 | // Flag for MemHostRegister 203 | const ( 204 | // Memory is pinned in all CUDA contexts. 205 | MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE 206 | // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() 207 | MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP 208 | ) 209 | 210 | func (p DevicePtr) String() string { 211 | return fmt.Sprint(unsafe.Pointer(uintptr(p))) 212 | } 213 | 214 | // Type size in bytes 215 | const ( 216 | SIZEOF_FLOAT32 = 4 217 | SIZEOF_FLOAT64 = 8 218 | SIZEOF_COMPLEX64 = 8 219 | SIZEOF_COMPLEX128 = 16 220 | ) 221 | 222 | // Physical memory type of device pointer. 223 | type MemoryType uint 224 | 225 | const ( 226 | MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST 227 | MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE 228 | MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY 229 | MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED 230 | ) 231 | 232 | var memorytype = map[MemoryType]string{ 233 | MemoryTypeHost: "MemoryTypeHost", 234 | MemoryTypeDevice: "MemoryTypeDevice", 235 | MemoryTypeArray: "MemoryTypeArray", 236 | MemoryTypeUnified: "MemoryTypeUnified"} 237 | 238 | func (t MemoryType) String() string { 239 | if s, ok := memorytype[t]; ok { 240 | return s 241 | } 242 | return "MemoryTypeUnknown" 243 | } 244 | 245 | // Returns the physical memory type that ptr addresses. 246 | func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) { 247 | var typ uint64 // foresee enough memory just to be safe 248 | err = Result(C.cuPointerGetAttribute(unsafe.Pointer(&typ), 249 | C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, C.CUdeviceptr(uintptr(ptr)))) 250 | return MemoryType(uint(typ)), err 251 | } 252 | 253 | // Returns the physical memory type that ptr addresses. 
254 | func (ptr DevicePtr) MemoryType() MemoryType { 255 | t, err := PointerGetAttributeMemoryType(ptr) 256 | if err != SUCCESS { 257 | panic(err) 258 | } 259 | return t 260 | } 261 | -------------------------------------------------------------------------------- /cu/memory_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "testing" 7 | "unsafe" 8 | ) 9 | 10 | func TestMalloc(t *testing.T) { 11 | for i := 0; i < 1024; i++ { 12 | pointer := MemAlloc(16 * 1024 * 1024) 13 | pointer.Free() 14 | } 15 | for i := 0; i < 1024; i++ { 16 | pointer := MemAlloc(16 * 1024 * 1024) 17 | MemFree(pointer) 18 | } 19 | } 20 | 21 | func BenchmarkMallocFree1B(b *testing.B) { 22 | for i := 0; i < b.N; i++ { 23 | m := MemAlloc(1) 24 | m.Free() 25 | } 26 | } 27 | 28 | func BenchmarkMallocFree1kB(b *testing.B) { 29 | for i := 0; i < b.N; i++ { 30 | m := MemAlloc(1024) 31 | m.Free() 32 | } 33 | } 34 | 35 | func BenchmarkMallocFree1MB(b *testing.B) { 36 | for i := 0; i < b.N; i++ { 37 | m := MemAlloc(1024 * 1024) 38 | m.Free() 39 | } 40 | } 41 | 42 | func TestMemAddressRange(t *testing.T) { 43 | N := 12345 44 | ptr := MemAlloc(int64(N)) 45 | size, base := MemGetAddressRange(ptr) 46 | if size != int64(N) { 47 | t.Fail() 48 | } 49 | if base != ptr { 50 | t.Fail() 51 | } 52 | size, base = 0, DevicePtr(0) 53 | size, base = ptr.GetAddressRange() 54 | if ptr.Bytes() != int64(N) { 55 | t.Fail() 56 | } 57 | } 58 | 59 | func TestMemGetInfo(t *testing.T) { 60 | free, total := MemGetInfo() 61 | fmt.Println("MemGetInfo: ", free, "/", total) 62 | if free > total { 63 | t.Fail() 64 | } 65 | if total == 0 { 66 | t.Fail() 67 | } 68 | } 69 | 70 | func TestMemsetAsync(t *testing.T) { 71 | N := int64(32 * 1024) 72 | host1 := make([]float32, N) 73 | for i := range host1 { 74 | host1[i] = float32(i) 75 | } 76 | host2 := make([]float32, N) 77 | dev1 := MemAlloc(int64(4 * N)) 78 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 79 | str := StreamCreate() 80 | MemsetD32Async(dev1, math.Float32bits(42), N, str) 81 | MemsetD32Async(dev1, math.Float32bits(21), N/2, str) 82 | MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) 83 | str.Synchronize() 84 | (&str).Destroy() 85 | for i := 0; i < len(host2)/2; i++ { 86 | if host2[i] != 21 { 87 | t.Fail() 88 | } 89 | } 90 | for i := len(host2) / 2; i < len(host2); i++ { 91 | if host2[i] != 42 { 92 | t.Fail() 93 | } 94 | } 95 | dev1.Free() 96 | } 97 | 98 | func TestMemset(t *testing.T) { 99 | N := int64(32 * 1024) 100 | host1 := make([]float32, N) 101 | for i := range host1 { 102 | host1[i] = float32(i) 103 | } 104 | host2 := make([]float32, N) 105 | dev1 := MemAlloc(int64(4 * N)) 106 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 107 | MemsetD32(dev1, math.Float32bits(42), N) 108 | MemsetD32(dev1, math.Float32bits(21), N/2) 109 | MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) 110 | for i := 0; i < len(host2)/2; i++ { 111 | if host2[i] != 21 { 112 | t.Fail() 113 | } 114 | } 115 | for i := len(host2) / 2; i < len(host2); i++ { 116 | if host2[i] != 42 { 117 | t.Fail() 118 | } 119 | } 120 | dev1.Free() 121 | } 122 | 123 | func TestMemcpy(t *testing.T) { 124 | N := int64(32 * 1024) 125 | host1 := make([]float32, N) 126 | for i := range host1 { 127 | host1[i] = float32(i) 128 | } 129 | host2 := make([]float32, N) 130 | dev1 := MemAlloc(int64(4 * N)) 131 | dev2 := MemAlloc(int64(4 * N)) 132 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 133 | MemcpyDtoD(dev2, dev1, 4*N) 134 | 
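// Copy dev2 back to the host and verify the data survived the host→device→device→host round trip.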
MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) 135 | for i := range host2 { 136 | if host2[i] != float32(i) { 137 | t.Fail() 138 | } 139 | } 140 | dev1.Free() 141 | dev2.Free() 142 | } 143 | 144 | func TestMemcpyAsync(t *testing.T) { 145 | N := int64(32 * 1024) 146 | host1 := make([]float32, N) 147 | for i := range host1 { 148 | host1[i] = float32(i) 149 | } 150 | host2 := make([]float32, N) 151 | dev1 := MemAlloc(int64(4 * N)) 152 | dev2 := MemAlloc(int64(4 * N)) 153 | stream := StreamCreate() 154 | MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) 155 | MemcpyDtoDAsync(dev2, dev1, 4*N, stream) 156 | MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) 157 | stream.Synchronize() 158 | for i := range host2 { 159 | if host2[i] != float32(i) { 160 | t.Fail() 161 | } 162 | } 163 | dev1.Free() 164 | dev2.Free() 165 | } 166 | 167 | func TestMemcpyAsyncRegistered(t *testing.T) { 168 | N := int64(32 * 1024) 169 | host1 := make([]float32, N) 170 | for i := range host1 { 171 | host1[i] = float32(i) 172 | } 173 | host2 := make([]float32, N) 174 | dev1 := MemAlloc(int64(4 * N)) 175 | dev2 := MemAlloc(int64(4 * N)) 176 | stream := StreamCreate() 177 | MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) 178 | MemcpyDtoDAsync(dev2, dev1, 4*N, stream) 179 | MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) 180 | stream.Synchronize() 181 | for i := range host2 { 182 | if host2[i] != float32(i) { 183 | t.Fail() 184 | } 185 | } 186 | dev1.Free() 187 | dev2.Free() 188 | } 189 | 190 | func BenchmarkMemcpy(b *testing.B) { 191 | b.StopTimer() 192 | N := int64(32 * 1024 * 1024) 193 | host1 := make([]float32, N) 194 | host2 := make([]float32, N) 195 | dev1 := MemAlloc(int64(4 * N)) 196 | defer dev1.Free() 197 | dev2 := MemAlloc(int64(4 * N)) 198 | defer dev2.Free() 199 | b.SetBytes(4 * N) 200 | b.StartTimer() 201 | for i := 0; i < b.N; i++ { 202 | MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) 203 | MemcpyDtoD(dev2, dev1, 4*N) 204 | MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /cu/memset.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA memset functions. 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Sets the first N 32-bit values of dst array to value. 13 | // Asynchronous. 14 | func MemsetD32(deviceptr DevicePtr, value uint32, N int64) { 15 | err := Result(C.cuMemsetD32(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N))) 16 | if err != SUCCESS { 17 | panic(err) 18 | } 19 | } 20 | 21 | // Asynchronously sets the first N 32-bit values of dst array to value. 22 | func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) { 23 | err := Result(C.cuMemsetD32Async(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) 24 | if err != SUCCESS { 25 | panic(err) 26 | } 27 | } 28 | 29 | // Sets the first N 8-bit values of dst array to value. 30 | // Asynchronous. 31 | func MemsetD8(deviceptr DevicePtr, value uint8, N int64) { 32 | err := Result(C.cuMemsetD8(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N))) 33 | if err != SUCCESS { 34 | panic(err) 35 | } 36 | } 37 | 38 | // Asynchronously sets the first N 32-bit values of dst array to value. 
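// (N counts 8-bit elements here.) Illustrative sketch, assuming buf and stream
// were created earlier with MemAlloc and StreamCreate: clear a device buffer
// without blocking the host.
//
//	MemsetD8Async(buf, 0, buf.Bytes(), stream)
//	stream.Synchronize()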
39 | func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) { 40 | err := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) 41 | if err != SUCCESS { 42 | panic(err) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /cu/module.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements loading of CUDA ptx modules 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Represents a CUDA CUmodule, a reference to executable device code. 13 | type Module uintptr 14 | 15 | // Loads a compute module from file 16 | func ModuleLoad(fname string) Module { 17 | //fmt.Fprintln(os.Stderr, "driver.ModuleLoad", fname) 18 | var mod C.CUmodule 19 | err := Result(C.cuModuleLoad(&mod, C.CString(fname))) 20 | if err != SUCCESS { 21 | panic(err) 22 | } 23 | return Module(uintptr(unsafe.Pointer(mod))) 24 | } 25 | 26 | // Loads a compute module from string 27 | func ModuleLoadData(image string) Module { 28 | var mod C.CUmodule 29 | err := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(C.CString(image)))) 30 | if err != SUCCESS { 31 | panic(err) 32 | } 33 | return Module(uintptr(unsafe.Pointer(mod))) 34 | } 35 | 36 | // Returns a Function handle. 37 | func ModuleGetFunction(module Module, name string) Function { 38 | var function C.CUfunction 39 | err := Result(C.cuModuleGetFunction( 40 | &function, 41 | C.CUmodule(unsafe.Pointer(uintptr(module))), 42 | C.CString(name))) 43 | if err != SUCCESS { 44 | panic(err) 45 | } 46 | return Function(uintptr(unsafe.Pointer(function))) 47 | } 48 | 49 | // Returns a Function handle. 50 | func (m Module) GetFunction(name string) Function { 51 | return ModuleGetFunction(m, name) 52 | } 53 | -------------------------------------------------------------------------------- /cu/module_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "testing" 5 | "unsafe" 6 | //"fmt" 7 | ) 8 | 9 | func TestModule(test *testing.T) { 10 | mod := ModuleLoad("/testdata/testmodule.ptx") 11 | f := mod.GetFunction("testMemset") 12 | 13 | N := 1000 14 | N4 := 4 * int64(N) 15 | a := make([]float32, N) 16 | A := MemAlloc(N4) 17 | defer A.Free() 18 | aptr := unsafe.Pointer(&a[0]) 19 | MemcpyHtoD(A, aptr, N4) 20 | 21 | var value float32 22 | value = 42 23 | 24 | var n int 25 | n = N / 2 26 | 27 | block := 128 28 | grid := DivUp(N, block) 29 | shmem := 0 30 | args := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)} 31 | LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args) 32 | 33 | MemcpyDtoH(aptr, A, N4) 34 | for i := 0; i < N/2; i++ { 35 | if a[i] != 42 { 36 | test.Fail() 37 | } 38 | } 39 | for i := N / 2; i < N; i++ { 40 | if a[i] != 0 { 41 | test.Fail() 42 | } 43 | } 44 | //fmt.Println(a) 45 | } 46 | 47 | // Integer division rounded up. 48 | func DivUp(x, y int) int { 49 | return ((x - 1) / y) + 1 50 | } 51 | -------------------------------------------------------------------------------- /cu/peer.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA unified addressing. 4 | 5 | //#include 6 | import "C" 7 | 8 | import ( 9 | "unsafe" 10 | ) 11 | 12 | // Make allocations from the peer Context available to the current context. 
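// Rough sketch (dev0, dev1 and peerCtx are hypothetical names; peer access only
// works when DeviceCanAccessPeer reports true):
//
//	if DeviceCanAccessPeer(dev0, dev1) {
//		CtxEnablePeerAccess(peerCtx) // peerCtx belongs to dev1, the current context to dev0
//	}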
13 | func CtxEnablePeerAccess(peer Context) { 14 | err := Result(C.cuCtxEnablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))), C.uint(0))) 15 | if err != SUCCESS { 16 | panic(err) 17 | } 18 | } 19 | 20 | // Make allocations from the peer Context available to the current context. 21 | func (peer Context) EnablePeerAccess() { 22 | CtxEnablePeerAccess(peer) 23 | } 24 | 25 | // Reverses CtxEnablePeerAccess(). 26 | func CtxDisablePeerAccess(peer Context) { 27 | err := Result(C.cuCtxDisablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))))) 28 | if err != SUCCESS { 29 | panic(err) 30 | } 31 | } 32 | 33 | // Reverses EnablePeerAccess(). 34 | func (peer Context) DisablePeerAccess() { 35 | CtxDisablePeerAccess(peer) 36 | } 37 | 38 | // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. 39 | func DeviceCanAccessPeer(dev, peer Device) bool { 40 | var canAccessPeer C.int 41 | err := Result(C.cuDeviceCanAccessPeer(&canAccessPeer, C.CUdevice(dev), C.CUdevice(peer))) 42 | if err != SUCCESS { 43 | panic(err) 44 | } 45 | return int(canAccessPeer) != 0 46 | } 47 | 48 | // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. 49 | func (dev Device) CanAccessPeer(peer Device) bool { 50 | return DeviceCanAccessPeer(dev, peer) 51 | } 52 | -------------------------------------------------------------------------------- /cu/result.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file provides access to CUDA driver error statuses (type CUresult). 4 | 5 | //#include 6 | import "C" 7 | import ( 8 | "fmt" 9 | ) 10 | 11 | // CUDA error status. 12 | // CUDA error statuses are not returned by functions but checked and passed to 13 | // panic() when not successful. If desired, they can be caught by 14 | // recover(). 
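// A minimal recover() sketch (illustrative only):
//
//	func tryMemAlloc(n int64) (p DevicePtr, err Result) {
//		defer func() {
//			if r := recover(); r != nil {
//				err = r.(Result) // panics from this package carry a Result
//			}
//		}()
//		return MemAlloc(n), SUCCESS
//	}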
15 | type Result int 16 | 17 | // Message string for the error 18 | func (err Result) String() string { 19 | str, ok := errorString[err] 20 | if !ok { 21 | return "Unknown CUresult: " + fmt.Sprint(int(err)) 22 | } 23 | return str 24 | } 25 | 26 | const ( 27 | SUCCESS Result = C.CUDA_SUCCESS 28 | ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE 29 | ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY 30 | ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED 31 | ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED 32 | ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED 33 | ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED 34 | ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED 35 | ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED 36 | ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE 37 | ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE 38 | ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE 39 | ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT 40 | ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT 41 | ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED 42 | ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED 43 | ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED 44 | ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED 45 | ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU 46 | ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED 47 | ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED 48 | ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY 49 | ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER 50 | ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 51 | ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT 52 | ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE 53 | ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE 54 | ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND 55 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND 56 | ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 57 | ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM 58 | ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE 59 | ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND 60 | ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY 61 | ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED 62 | ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES 63 | ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT 64 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING 65 | ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED 66 | ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED 67 | ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE 68 | ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED 69 | ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT 70 | ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS 71 | ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED 72 | ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED 73 | ERROR_HARDWARE_STACK_ERROR Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR 74 | ERROR_ILLEGAL_INSTRUCTION 
Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION 75 | ERROR_MISALIGNED_ADDRESS Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS 76 | ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE 77 | ERROR_INVALID_PC Result = 718 //C.CUDA_ERROR_INVALID_PC 78 | ERROR_NOT_PERMITTED Result = 800 //C.CUDA_ERROR_NOT_PERMITTED 79 | ERROR_NOT_SUPPORTED Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED 80 | ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN 81 | ) 82 | 83 | // Map with error strings for Result error numbers 84 | var errorString map[Result]string = map[Result]string{ 85 | SUCCESS: "CUDA_SUCCESS", 86 | ERROR_INVALID_VALUE: "CUDA_ERROR_INVALID_VALUE", 87 | ERROR_OUT_OF_MEMORY: "CUDA_ERROR_OUT_OF_MEMORY", 88 | ERROR_NOT_INITIALIZED: "CUDA_ERROR_NOT_INITIALIZED", 89 | ERROR_DEINITIALIZED: "CUDA_ERROR_DEINITIALIZED", 90 | ERROR_PROFILER_DISABLED: "CUDA_ERROR_PROFILER_DISABLED", 91 | ERROR_PROFILER_NOT_INITIALIZED: "CUDA_ERROR_PROFILER_NOT_INITIALIZED", 92 | ERROR_PROFILER_ALREADY_STARTED: "CUDA_ERROR_PROFILER_ALREADY_STARTED", 93 | ERROR_PROFILER_ALREADY_STOPPED: "CUDA_ERROR_PROFILER_ALREADY_STOPPED", 94 | ERROR_NO_DEVICE: "CUDA_ERROR_NO_DEVICE", 95 | ERROR_INVALID_DEVICE: "CUDA_ERROR_INVALID_DEVICE", 96 | ERROR_INVALID_IMAGE: "CUDA_ERROR_INVALID_IMAGE", 97 | ERROR_INVALID_CONTEXT: "CUDA_ERROR_INVALID_CONTEXT", 98 | ERROR_CONTEXT_ALREADY_CURRENT: "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", 99 | ERROR_MAP_FAILED: "CUDA_ERROR_MAP_FAILED", 100 | ERROR_UNMAP_FAILED: "CUDA_ERROR_UNMAP_FAILED", 101 | ERROR_ARRAY_IS_MAPPED: "CUDA_ERROR_ARRAY_IS_MAPPED", 102 | ERROR_ALREADY_MAPPED: "CUDA_ERROR_ALREADY_MAPPED", 103 | ERROR_NO_BINARY_FOR_GPU: "CUDA_ERROR_NO_BINARY_FOR_GPU", 104 | ERROR_ALREADY_ACQUIRED: "CUDA_ERROR_ALREADY_ACQUIRED", 105 | ERROR_NOT_MAPPED: "CUDA_ERROR_NOT_MAPPED", 106 | ERROR_NOT_MAPPED_AS_ARRAY: "CUDA_ERROR_NOT_MAPPED_AS_ARRAY", 107 | ERROR_NOT_MAPPED_AS_POINTER: "CUDA_ERROR_NOT_MAPPED_AS_POINTER", 108 | ERROR_ECC_UNCORRECTABLE: "CUDA_ERROR_ECC_UNCORRECTABLE", 109 | ERROR_UNSUPPORTED_LIMIT: "CUDA_ERROR_UNSUPPORTED_LIMIT", 110 | ERROR_CONTEXT_ALREADY_IN_USE: "CUDA_ERROR_CONTEXT_ALREADY_IN_USE", 111 | ERROR_INVALID_SOURCE: "CUDA_ERROR_INVALID_SOURCE", 112 | ERROR_FILE_NOT_FOUND: "CUDA_ERROR_FILE_NOT_FOUND", 113 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", 114 | ERROR_SHARED_OBJECT_INIT_FAILED: "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", 115 | ERROR_OPERATING_SYSTEM: "CUDA_ERROR_OPERATING_SYSTEM", 116 | ERROR_INVALID_HANDLE: "CUDA_ERROR_INVALID_HANDLE", 117 | ERROR_NOT_FOUND: "CUDA_ERROR_NOT_FOUND", 118 | ERROR_NOT_READY: "CUDA_ERROR_NOT_READY", 119 | ERROR_LAUNCH_OUT_OF_RESOURCES: "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", 120 | ERROR_LAUNCH_TIMEOUT: "CUDA_ERROR_LAUNCH_TIMEOUT", 121 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", 122 | ERROR_PEER_ACCESS_ALREADY_ENABLED: "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", 123 | ERROR_PEER_ACCESS_NOT_ENABLED: "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", 124 | ERROR_PRIMARY_CONTEXT_ACTIVE: "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", 125 | ERROR_CONTEXT_IS_DESTROYED: "CUDA_ERROR_CONTEXT_IS_DESTROYED", 126 | ERROR_ASSERT: "CUDA_ERROR_ASSERT", 127 | ERROR_TOO_MANY_PEERS: "CUDA_ERROR_TOO_MANY_PEERS", 128 | ERROR_HOST_MEMORY_ALREADY_REGISTERED: "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", 129 | ERROR_HOST_MEMORY_NOT_REGISTERED: "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", 130 | ERROR_HARDWARE_STACK_ERROR: "CUDA_ERROR_HARDWARE_STACK_ERROR", 131 | ERROR_ILLEGAL_INSTRUCTION: "CUDA_ERROR_ILLEGAL_INSTRUCTION", 
132 | ERROR_MISALIGNED_ADDRESS: "CUDA_ERROR_MISALIGNED_ADDRESS", 133 | ERROR_INVALID_ADDRESS_SPACE: "CUDA_ERROR_INVALID_ADDRESS_SPACE", 134 | ERROR_INVALID_PC: "CUDA_ERROR_INVALID_PC", 135 | ERROR_LAUNCH_FAILED: "CUDA_ERROR_LAUNCH_FAILED", 136 | ERROR_NOT_PERMITTED: "CUDA_ERROR_NOT_PERMITTED", 137 | ERROR_NOT_SUPPORTED: "CUDA_ERROR_NOT_SUPPORTED", 138 | ERROR_UNKNOWN: "CUDA_ERROR_UNKNOWN"} 139 | -------------------------------------------------------------------------------- /cu/runtimeapi.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements parts of the CUDA runtime api instead of the driver 4 | // api the rest of this package uses. 5 | // It might be useful to move this to a seperate package at some point. 6 | 7 | //#include 8 | import "C" 9 | import "unsafe" 10 | 11 | // Set the device as current. 12 | func SetDevice(device Device) { 13 | err := Result(C.cudaSetDevice(C.int(device))) 14 | if err != SUCCESS { 15 | panic(err) 16 | } 17 | } 18 | 19 | // Reset the state of the current device. 20 | func DeviceReset() { 21 | err := Result(C.cudaDeviceReset()) 22 | if err != SUCCESS { 23 | panic(err) 24 | } 25 | } 26 | 27 | // Set CUDA device flags. 28 | func SetDeviceFlags(flags uint) { 29 | err := Result(C.cudaSetDeviceFlags(C.uint(flags))) 30 | if err != SUCCESS { 31 | panic(err) 32 | } 33 | } 34 | 35 | //Flags for SetDeviceFlasgs 36 | const ( 37 | // The default, decides to yield or not based on active CUDA threads and processors. 38 | DeviceAuto = C.cudaDeviceScheduleAuto 39 | // Actively spin while waiting for device. 40 | DeviceSpin = C.cudaDeviceScheduleSpin 41 | // Yield when waiting. 42 | DeviceYield = C.cudaDeviceScheduleYield 43 | // ScheduleBlockingSync block CPU on sync. 44 | DeviceScheduleBlockingSync = C.cudaDeviceScheduleBlockingSync 45 | // ScheduleBlockingSync block CPU on sync. 
Deprecated since cuda 4.0 46 | DeviceBlockingSync = C.cudaDeviceBlockingSync 47 | // For use with pinned host memory 48 | DeviceMapHost = C.cudaDeviceMapHost 49 | // Do not reduce local memory to try and prevent thrashing 50 | DeviceLmemResizeToMax = C.cudaDeviceLmemResizeToMax 51 | ) 52 | 53 | func Malloc(bytes int64) DevicePtr { 54 | var devptr unsafe.Pointer 55 | err := Result(C.cudaMalloc(&devptr, C.size_t(bytes))) 56 | if err != SUCCESS { 57 | panic(err) 58 | } 59 | return DevicePtr(devptr) 60 | } 61 | 62 | func MallocHost(bytes int64) unsafe.Pointer { 63 | var p unsafe.Pointer 64 | err := Result(C.cudaMallocHost(&p, C.size_t(bytes))) 65 | if err != SUCCESS { 66 | panic(err) 67 | } 68 | return p 69 | } 70 | 71 | func FreeHost(ptr unsafe.Pointer) { 72 | err := Result(C.cudaFreeHost(ptr)) 73 | if err != SUCCESS { 74 | panic(err) 75 | } 76 | } 77 | 78 | // Copies a number of bytes in the direction specified by flags 79 | func MemCpy(dst, src unsafe.Pointer, bytes int64, flags uint) { 80 | err := Result(C.cudaMemcpy(dst, src, C.size_t(bytes), uint32(flags))) 81 | if err != SUCCESS { 82 | panic(err) 83 | } 84 | } 85 | 86 | //Flags for memory copy types 87 | const ( 88 | // Host to Host 89 | HtoH = C.cudaMemcpyHostToHost 90 | // Host to Device 91 | HtoD = C.cudaMemcpyHostToDevice 92 | // Device to Host 93 | DtoH = C.cudaMemcpyDeviceToHost 94 | // Device to Device 95 | DtoD = C.cudaMemcpyDeviceToDevice 96 | // Default, unified virtual address space 97 | Virt = C.cudaMemcpyDefault 98 | ) 99 | -------------------------------------------------------------------------------- /cu/stream.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA streams 4 | 5 | //#include 6 | import "C" 7 | import "unsafe" 8 | 9 | // CUDA stream. 10 | type Stream uintptr 11 | 12 | // Creates an asynchronous stream 13 | func StreamCreate() Stream { 14 | var stream C.CUstream 15 | err := Result(C.cuStreamCreate(&stream, C.uint(0))) // flags has to be zero 16 | if err != SUCCESS { 17 | panic(err) 18 | } 19 | return Stream(uintptr(unsafe.Pointer(stream))) 20 | } 21 | 22 | // Destroys the asynchronous stream 23 | func (stream *Stream) Destroy() { 24 | str := *stream 25 | err := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str))))) 26 | *stream = 0 27 | if err != SUCCESS { 28 | panic(err) 29 | } 30 | } 31 | 32 | // Destroys an asynchronous stream 33 | func StreamDestroy(stream *Stream) { 34 | stream.Destroy() 35 | } 36 | 37 | // Blocks until the stream has completed. 38 | func (stream Stream) Synchronize() { 39 | err := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream))))) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | } 44 | 45 | // Returns Success if all operations have completed, ErrorNotReady otherwise 46 | func (stream Stream) Query() Result { 47 | return Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream))))) 48 | } 49 | 50 | // Returns Success if all operations have completed, ErrorNotReady otherwise 51 | func StreamQuery(stream Stream) Result { 52 | return stream.Query() 53 | } 54 | 55 | // Blocks until the stream has completed. 56 | func StreamSynchronize(stream Stream) { 57 | stream.Synchronize() 58 | } 59 | -------------------------------------------------------------------------------- /cu/testdata/testmodule.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Module to test CUDA module loading and execution. 
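 * (Exercised by module_test.go, which loads the generated PTX with ModuleLoad
 * and runs testMemset through LaunchKernel.)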
3 | * To be compiled with: 4 | * nvcc -ptx testmodule.cu 5 | */ 6 | 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | #define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x ) 13 | 14 | /// Sets the first N elements of array to value. 15 | __global__ void testMemset(float* array, float value, int N){ 16 | int i = threadindex; 17 | if(i < N){ 18 | array[i] = value; 19 | } 20 | } 21 | 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | -------------------------------------------------------------------------------- /cu/testdata/testmodule.ptx: -------------------------------------------------------------------------------- 1 | .version 1.4 2 | .target sm_10, map_f64_to_f32 3 | // compiled with /usr/local/cuda/open64/lib//be 4 | // nvopencc 4.0 built on 2011-02-18 5 | 6 | //----------------------------------------------------------- 7 | // Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T) 8 | //----------------------------------------------------------- 9 | 10 | //----------------------------------------------------------- 11 | // Options: 12 | //----------------------------------------------------------- 13 | // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64 14 | // -O3 (Optimization level) 15 | // -g0 (Debug level) 16 | // -m2 (Report advisories) 17 | //----------------------------------------------------------- 18 | 19 | .file 1 "" 20 | .file 2 "/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu" 21 | .file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h" 22 | .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h" 23 | .file 5 "/usr/local/cuda/bin/../include/host_defines.h" 24 | .file 6 "/usr/local/cuda/bin/../include/builtin_types.h" 25 | .file 7 "/usr/local/cuda/bin/../include/device_types.h" 26 | .file 8 "/usr/local/cuda/bin/../include/driver_types.h" 27 | .file 9 "/usr/local/cuda/bin/../include/surface_types.h" 28 | .file 10 "/usr/local/cuda/bin/../include/texture_types.h" 29 | .file 11 "/usr/local/cuda/bin/../include/vector_types.h" 30 | .file 12 "/usr/local/cuda/bin/../include/device_launch_parameters.h" 31 | .file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h" 32 | .file 14 "/usr/include/bits/types.h" 33 | .file 15 "/usr/include/time.h" 34 | .file 16 "testmodule.cu" 35 | .file 17 "/usr/local/cuda/bin/../include/common_functions.h" 36 | .file 18 "/usr/local/cuda/bin/../include/math_functions.h" 37 | .file 19 "/usr/local/cuda/bin/../include/math_constants.h" 38 | .file 20 "/usr/local/cuda/bin/../include/device_functions.h" 39 | .file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" 40 | .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" 41 | .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" 42 | .file 24 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h" 43 | .file 25 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h" 44 | .file 26 "/usr/local/cuda/bin/../include/surface_functions.h" 45 | .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" 46 | .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h" 47 | 48 | 49 | .entry testMemset ( 50 | .param .u64 __cudaparm_testMemset_array, 51 | .param .f32 __cudaparm_testMemset_value, 52 | .param .s32 __cudaparm_testMemset_N) 53 | { 54 | .reg .u16 %rh<4>; 55 | .reg .u32 %r<10>; 56 | .reg .u64 %rd<6>; 57 | .reg .f32 %f<3>; 58 | .reg .pred %p<3>; 59 | .loc 16 7 0 60 | $LDWbegin_testMemset: 61 | mov.u16 %rh1, %nctaid.x; 62 | mov.u16 %rh2, %ctaid.y; 63 | 
mul.wide.u16 %r1, %rh1, %rh2; 64 | cvt.u32.u16 %r2, %ctaid.x; 65 | add.u32 %r3, %r2, %r1; 66 | cvt.u32.u16 %r4, %ntid.x; 67 | mul.lo.u32 %r5, %r4, %r3; 68 | cvt.u32.u16 %r6, %tid.x; 69 | add.u32 %r7, %r6, %r5; 70 | ld.param.s32 %r8, [__cudaparm_testMemset_N]; 71 | setp.le.s32 %p1, %r8, %r7; 72 | @%p1 bra $Lt_0_1026; 73 | .loc 16 10 0 74 | ld.param.f32 %f1, [__cudaparm_testMemset_value]; 75 | ld.param.u64 %rd1, [__cudaparm_testMemset_array]; 76 | cvt.s64.s32 %rd2, %r7; 77 | mul.wide.s32 %rd3, %r7, 4; 78 | add.u64 %rd4, %rd1, %rd3; 79 | st.global.f32 [%rd4+0], %f1; 80 | $Lt_0_1026: 81 | .loc 16 12 0 82 | exit; 83 | $LDWend_testMemset: 84 | } // testMemset 85 | 86 | -------------------------------------------------------------------------------- /cu/version.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | // This file implements CUDA driver version management 4 | 5 | //#include 6 | import "C" 7 | 8 | // Returns the CUDA driver version. 9 | func Version() int { 10 | var version C.int 11 | err := Result(C.cuDriverGetVersion(&version)) 12 | if err != SUCCESS { 13 | panic(err) 14 | } 15 | return int(version) 16 | } 17 | -------------------------------------------------------------------------------- /cu/version_test.go: -------------------------------------------------------------------------------- 1 | package cu 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestVersion(t *testing.T) { 9 | fmt.Println("CUDA driver version: ", Version()) 10 | } 11 | -------------------------------------------------------------------------------- /cuda/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/cu > README 34 | -------------------------------------------------------------------------------- /cuda/README: -------------------------------------------------------------------------------- 1 | PACKAGE 2 | 3 | package cu 4 | import "github.com/barnex/cuda5/cu" 5 | 6 | Go bindings for the CUDA driver API. 7 | 8 | CONSTANTS 9 | 10 | const ( 11 | // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 12 | CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO 13 | // Spin when waiting for results from the GPU. 14 | CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN 15 | // Yield its thread when waiting for results from the GPU. 16 | CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD 17 | // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. 18 | CTX_BLOCKING_SYNC 19 | // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 20 | CTX_MAP_HOST = C.CU_CTX_MAP_HOST 21 | //Do not reduce local memory after resizing local memory for a kernel. 
22 | CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX 23 | ) 24 | Flags for CtxCreate 25 | const ( 26 | SIZEOF_FLOAT32 = 4 27 | SIZEOF_FLOAT64 = 8 28 | SIZEOF_COMPLEX64 = 8 29 | SIZEOF_COMPLEX128 = 16 30 | ) 31 | Type size in bytes 32 | 33 | 34 | FUNCTIONS 35 | 36 | func CtxDestroy(ctx *Context) 37 | Destroys the CUDA context specified by ctx. If the context usage count 38 | is not equal to 1, or the context is current to any CPU thread other 39 | than the current one, this function fails. Floating contexts (detached 40 | from a CPU thread via cuCtxPopCurrent()) may be destroyed by this 41 | function. 42 | 43 | func CtxDisablePeerAccess(peer Context) 44 | Reverses CtxEnablePeerAccess(). 45 | 46 | func CtxEnablePeerAccess(peer Context) 47 | Make allocations from the peer Context available to the current context. 48 | 49 | func CtxGetApiVersion(ctx Context) (version int) 50 | Returns the API version to create the context. 51 | 52 | func CtxSetCurrent(ctx Context) 53 | Sets the current active context. 54 | 55 | func CtxSynchronize() 56 | Blocks until the device has completed all preceding requested tasks, if 57 | the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. 58 | 59 | func DeviceCanAccessPeer(dev, peer Device) bool 60 | Returns true if CtxEnablePeerAccess can be called on a context for dev 61 | and peerDev. 62 | 63 | func DeviceComputeCapability(device Device) (major, minor int) 64 | Returns the compute capability of the device. 65 | 66 | func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int 67 | Gets the value of a device attribute. 68 | 69 | func DeviceGetCount() int 70 | Returns the number of devices with compute capability greater than or 71 | equal to 1.0 that are available for execution. 72 | 73 | func DeviceGetName(dev Device) string 74 | Gets the name of the device. 75 | 76 | func DeviceTotalMem(device Device) int64 77 | Returns the total amount of memory available on the device in bytes. 78 | 79 | func FuncGetAttribute(attrib FunctionAttribute, function Function) int 80 | 81 | func Init(flags int) 82 | Initialize the CUDA driver API. Currently, flags must be 0. If Init() 83 | has not been called, any function from the driver API will panic with 84 | ERROR_NOT_INITIALIZED. 85 | 86 | func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) 87 | 88 | func MemAllocHost(bytes int64) unsafe.Pointer 89 | 90 | func MemFree(ptr *DevicePtr) 91 | Frees device memory allocated by MemAlloc(). Overwrites the pointer with 92 | NULL. It is safe to double-free. 93 | 94 | func MemFreeHost(ptr unsafe.Pointer) 95 | 96 | func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) 97 | Returns the base address and size of the allocation (by MemAlloc) that 98 | contains the input pointer ptr. 99 | 100 | func MemGetInfo() (free, total int64) 101 | Returns the free and total amount of memroy in the current Context (in 102 | bytes). 103 | 104 | func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) 105 | Page-locks memory specified by the pointer and bytes. The pointer and 106 | byte size must be aligned to the host page size (4KB) See also: 107 | MemHostUnregister() 108 | 109 | func MemHostUnregister(ptr unsafe.Pointer) 110 | Unmaps memory locked by MemHostRegister(). 111 | 112 | func Memcpy(dst, src DevicePtr, bytes int64) 113 | Copies a number of bytes on the current device. Requires unified 114 | addressing to be supported. 
See also: MemcpyDtoD(). TODO(a): is actually 115 | an auto copy for device and/or host memory 116 | 117 | func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) 118 | Asynchronously copies a number of bytes on the current device. 119 | 120 | func MemcpyDtoD(dst, src DevicePtr, bytes int64) 121 | Copies a number of bytes from host to device. 122 | 123 | func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) 124 | Asynchronously copies a number of bytes from host to device. 125 | 126 | func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) 127 | Copies a number of bytes from device to host. 128 | 129 | func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) 130 | Asynchronously copies a number of bytes device host to host. The host 131 | memory must be page-locked (see MemRegister) 132 | 133 | func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) 134 | Copies a number of bytes from host to device. 135 | 136 | func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) 137 | Asynchronously copies a number of bytes from host to device. The host 138 | memory must be page-locked (see MemRegister) 139 | 140 | func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) 141 | Copies from device memory in one context (device) to another. 142 | 143 | func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) 144 | Asynchronously copies from device memory in one context (device) to 145 | another. 146 | 147 | func MemsetD32(deviceptr DevicePtr, value uint32, N int64) 148 | Sets the first N 32-bit values of dst array to value. Asynchronous. 149 | 150 | func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) 151 | Asynchronously sets the first N 32-bit values of dst array to value. 152 | 153 | func MemsetD8(deviceptr DevicePtr, value uint8, N int64) 154 | Sets the first N 8-bit values of dst array to value. Asynchronous. 155 | 156 | func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) 157 | Asynchronously sets the first N 32-bit values of dst array to value. 158 | 159 | func StreamDestroy(stream *Stream) 160 | Destroys an asynchronous stream 161 | 162 | func StreamSynchronize(stream Stream) 163 | Blocks until the stream has completed. 164 | 165 | func Version() int 166 | Returns the CUDA driver version. 167 | 168 | 169 | TYPES 170 | 171 | type Context uintptr 172 | CUDA context. 173 | 174 | func CtxCreate(flags uint, dev Device) Context 175 | Create a CUDA context. 176 | 177 | func CtxGetCurrent() Context 178 | Gets the current active context. 179 | 180 | func (ctx Context) ApiVersion() (version int) 181 | Returns the API version to create the context. 182 | 183 | func (ctx *Context) Destroy() 184 | Destroys the CUDA context. 185 | 186 | func (peer Context) DisablePeerAccess() 187 | Reverses EnablePeerAccess(). 188 | 189 | func (peer Context) EnablePeerAccess() 190 | Make allocations from the peer Context available to the current context. 191 | 192 | func (ctx Context) SetCurrent() 193 | Sets the current active context. 
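    A typical start-up sketch (mirrors the init used in cufft/init_test.go;
    assumes at least one CUDA-capable device):

        cu.Init(0)
        ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0)
        cu.CtxSetCurrent(ctx)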
194 | 195 | type DevProp struct { 196 | MaxThreadsPerBlock int 197 | MaxThreadsDim [3]int 198 | MaxGridSize [3]int 199 | SharedMemPerBlock int 200 | TotalConstantMemory int 201 | SIMDWidth int 202 | MemPitch int 203 | RegsPerBlock int 204 | ClockRate int 205 | TextureAlign int 206 | } 207 | Device properties 208 | 209 | func DeviceGetProperties(dev Device) (prop DevProp) 210 | Returns the device's properties. 211 | 212 | type Device int 213 | CUDA Device number. 214 | 215 | func CtxGetDevice() Device 216 | Returns the ordinal of the current context's device. 217 | 218 | func DeviceGet(ordinal int) Device 219 | Returns in a device handle given an ordinal in the range [0, 220 | DeviceGetCount()-1]. 221 | 222 | func (dev Device) Attribute(attrib DeviceAttribute) int 223 | Gets the value of a device attribute. 224 | 225 | func (dev Device) CanAccessPeer(peer Device) bool 226 | Returns true if CtxEnablePeerAccess can be called on a context for dev 227 | and peerDev. 228 | 229 | func (device Device) ComputeCapability() (major, minor int) 230 | Returns the compute capability of the device. 231 | 232 | func (dev Device) Name() string 233 | Gets the name of the device. 234 | 235 | func (dev Device) Properties() DevProp 236 | Returns the device's properties. 237 | 238 | func (device Device) TotalMem() int64 239 | Returns the total amount of memory available on the device in bytes. 240 | 241 | type DeviceAttribute int 242 | 243 | const ( 244 | MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block 245 | MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X 246 | MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y 247 | MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z 248 | MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X 249 | MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y 250 | MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z 251 | MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes 252 | TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes 253 | WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads 254 | MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies 255 | MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block 256 | CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz 257 | TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures 258 | MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device 259 | KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels 260 | INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 261 | CAN_MAP_HOST_MEMORY DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space 262 | COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) 263 | MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width 264 | MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width 265 | MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height 266 | MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width 267 | MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height 268 | MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth 269 | MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width 270 | MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height 271 | MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture 272 | SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces 273 | CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently 274 | ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled 275 | PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device 276 | PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device 277 | TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model 278 | MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz 279 | GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits 280 | L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes 281 | MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor 282 | ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines 283 | UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host 284 | MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width 285 | MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture 286 | ) 287 | 288 | type DevicePtr uintptr 289 | 290 | func MemAlloc(bytes int64) DevicePtr 291 | Allocates a number of bytes of device memory. 292 | 293 | func (ptr DevicePtr) Bytes() (bytes int64) 294 | Returns the size of the allocation (by MemAlloc) that contains the input 295 | pointer ptr. 296 | 297 | func (ptr *DevicePtr) Free() 298 | Frees device memory allocated by MemAlloc(). 
Overwrites the pointer with 299 | NULL. It is safe to double-free. 300 | 301 | func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) 302 | Returns the base address and size of the allocation (by MemAlloc) that 303 | contains the input pointer ptr. 304 | 305 | func (ptr DevicePtr) MemoryType() MemoryType 306 | Returns the physical memory type that ptr addresses. 307 | 308 | func (p DevicePtr) String() string 309 | 310 | type Dim3 struct { 311 | X, Y, Z int 312 | } 313 | 314 | type Function uintptr 315 | Represents a CUDA CUfunction, a reference to a function within a module. 316 | 317 | func ModuleGetFunction(module Module, name string) Function 318 | Returns a Function handle. 319 | 320 | func (f Function) GetAttribute(attrib FunctionAttribute) int 321 | 322 | type FunctionAttribute int 323 | 324 | const ( 325 | FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. 326 | FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. 327 | FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. 328 | FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. 329 | FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. 330 | FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. 331 | FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. 332 | ) 333 | 334 | type MemHostRegisterFlag int 335 | 336 | const ( 337 | // Memory is pinned in all CUDA contexts. 338 | MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE 339 | // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() 340 | MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP 341 | ) 342 | Flag for MemHostRegister 343 | 344 | type MemoryType uint 345 | Physical memory type of device pointer. 346 | 347 | const ( 348 | MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST 349 | MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE 350 | MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY 351 | MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED 352 | ) 353 | 354 | func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) 355 | Returns the physical memory type that ptr addresses. 356 | 357 | func (t MemoryType) String() string 358 | 359 | type Module uintptr 360 | Represents a CUDA CUmodule, a reference to executable device code. 361 | 362 | func ModuleLoad(fname string) Module 363 | Loads a compute module from file 364 | 365 | func ModuleLoadData(image string) Module 366 | Loads a compute module from string 367 | 368 | func (m Module) GetFunction(name string) Function 369 | Returns a Function handle. 370 | 371 | type Result int 372 | CUDA error status. CUDA error statuses are not returned by functions but 373 | checked and passed to panic() when not successful. If desired, they can 374 | be caught by recover(). 
375 | 376 | const ( 377 | SUCCESS Result = C.CUDA_SUCCESS 378 | ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE 379 | ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY 380 | ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED 381 | ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED 382 | ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED 383 | ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED 384 | ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED 385 | ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED 386 | ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE 387 | ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE 388 | ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE 389 | ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT 390 | ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT 391 | ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED 392 | ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED 393 | ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED 394 | ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED 395 | ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU 396 | ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED 397 | ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED 398 | ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY 399 | ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER 400 | ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 401 | ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT 402 | ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE 403 | ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE 404 | ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND 405 | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND 406 | ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED 407 | ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM 408 | ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE 409 | ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND 410 | ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY 411 | ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED 412 | ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES 413 | ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT 414 | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING 415 | ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED 416 | ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED 417 | ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE 418 | ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED 419 | ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT 420 | ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS 421 | ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED 422 | ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED 423 | ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN 424 | ) 425 | 426 | func StreamQuery(stream Stream) Result 427 | Returns Success if all operations have completed, ErrorNotReady 428 | otherwise 429 | 430 | func (err Result) String() string 431 | Message string for the error 432 | 433 | type Stream 
uintptr 434 | CUDA stream. 435 | 436 | func StreamCreate() Stream 437 | Creates an asynchronous stream 438 | 439 | func (stream *Stream) Destroy() 440 | Destroys the asynchronous stream 441 | 442 | func (stream Stream) Query() Result 443 | Returns Success if all operations have completed, ErrorNotReady 444 | otherwise 445 | 446 | func (stream Stream) Synchronize() 447 | Blocks until the stream has completed. 448 | 449 | 450 | -------------------------------------------------------------------------------- /cuda/cgoflags.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | // This file provides CGO flags. 4 | 5 | import "C" 6 | 7 | //#cgo LDFLAGS:-lcudart 8 | // 9 | ////default location: 10 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 11 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 12 | // 13 | ////default location if not properly symlinked: 14 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 15 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 16 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 17 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 18 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 19 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 20 | // 21 | ////arch linux: 22 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 23 | //#cgo CFLAGS: -I/opt/cuda/include 24 | // 25 | ////WINDOWS: 26 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 27 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include 28 | import "C" 29 | -------------------------------------------------------------------------------- /cuda/device.go: -------------------------------------------------------------------------------- 1 | package cuda 2 | 3 | //#include 4 | //#include 5 | import "C" 6 | 7 | import ( 8 | "github.com/barnex/cuda5/cu" 9 | ) 10 | 11 | // Reset the current GPU device. 12 | func DeviceReset() { 13 | err := cu.Result(C.cudaDeviceReset()) 14 | if err != cu.SUCCESS { 15 | panic(err) 16 | } 17 | } 18 | 19 | // Set preference for more cache or shared memory. 20 | func DeviceSetCacheConfig(cacheConfig FuncCache) { 21 | err := cu.Result(C.cudaDeviceSetCacheConfig(uint32(cacheConfig))) 22 | if err != cu.SUCCESS { 23 | panic(err) 24 | } 25 | } 26 | 27 | // Cache preference option. 28 | type FuncCache int 29 | 30 | const ( 31 | FUNC_CACHE_PREFER_NONE FuncCache = C.CU_FUNC_CACHE_PREFER_NONE 32 | FUNC_CACHE_PREFER_SHARED FuncCache = C.CU_FUNC_CACHE_PREFER_SHARED 33 | FUNC_CACHE_PREFER_L1 FuncCache = C.CU_FUNC_CACHE_PREFER_L1 34 | FUNC_CACHE_PREFER_EQUAL FuncCache = C.CU_FUNC_CACHE_PREFER_EQUAL 35 | ) 36 | -------------------------------------------------------------------------------- /cufft/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. 
-compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/cufft > README 34 | -------------------------------------------------------------------------------- /cufft/README: -------------------------------------------------------------------------------- 1 | PACKAGE DOCUMENTATION 2 | 3 | package cufft 4 | import "github.com/barnex/cuda5/cufft" 5 | 6 | Go bindings for the CUDA CUFFT API. 7 | 8 | 9 | CONSTANTS 10 | 11 | const ( 12 | FORWARD = -1 // Forward FFT 13 | INVERSE = 1 // Inverse FFT 14 | ) 15 | 16 | 17 | TYPES 18 | 19 | type CompatibilityMode int 20 | CUFFT compatibility mode 21 | 22 | const ( 23 | COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE 24 | COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING 25 | COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC 26 | COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL 27 | ) 28 | 29 | 30 | func (t CompatibilityMode) String() string 31 | 32 | 33 | type Handle uintptr 34 | FFT plan handle, reference type to a plan 35 | 36 | 37 | func Plan1d(nx int, typ Type, batch int) Handle 38 | 1D FFT plan 39 | 40 | 41 | func Plan2d(nx, ny int, typ Type) Handle 42 | 2D FFT plan 43 | 44 | 45 | func Plan3d(nx, ny, nz int, typ Type) Handle 46 | 3D FFT plan 47 | 48 | 49 | func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle 50 | 1D,2D or 3D FFT plan 51 | 52 | 53 | func (plan *Handle) Destroy() 54 | Destroys the plan. 55 | 56 | func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) 57 | Execute Complex-to-Complex plan 58 | 59 | func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) 60 | Execute Complex-to-Real plan 61 | 62 | func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) 63 | Execute Double Real-to-Complex plan 64 | 65 | func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) 66 | Execute Real-to-Complex plan 67 | 68 | func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) 69 | Execute Double Complex-to-Real plan 70 | 71 | func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) 72 | Execute Double Complex-to-Complex plan 73 | 74 | func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) 75 | Sets the FFTW compatibility mode 76 | 77 | func (plan Handle) SetStream(stream cu.Stream) 78 | Sets the cuda stream for this plan 79 | 80 | 81 | type Result int 82 | FFT result 83 | 84 | const ( 85 | SUCCESS Result = C.CUFFT_SUCCESS 86 | INVALID_PLAN Result = C.CUFFT_INVALID_PLAN 87 | ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED 88 | INVALID_TYPE Result = C.CUFFT_INVALID_TYPE 89 | INVALID_VALUE Result = C.CUFFT_INVALID_VALUE 90 | INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR 91 | EXEC_FAILED Result = C.CUFFT_EXEC_FAILED 92 | SETUP_FAILED Result = C.CUFFT_SETUP_FAILED 93 | INVALID_SIZE Result = C.CUFFT_INVALID_SIZE 94 | UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA 95 | ) 96 | FFT result value 97 | 98 | 99 | func (r Result) String() string 100 | 101 | 102 | type Type int 103 | FFT type 104 | 105 | const ( 106 | R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) 107 | C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real 108 | C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved 109 | D2Z Type = C.CUFFT_D2Z // Double to Double-Complex 110 | Z2D Type = C.CUFFT_Z2D // Double-Complex to Double 111 | Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex 112 | ) 113 | 114 | 115 | func (t Type) String() string 116 | 117 | 
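A minimal real-to-complex sketch (illustrative; assumes a current CUDA context
and device buffers devIn/devOut of suitable sizes; see ExampleFFT1D in
fft_test.go for a complete, runnable version):

    plan := cufft.Plan1d(n, cufft.R2C, 1)
    defer plan.Destroy()
    plan.ExecR2C(devIn, devOut)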
118 | 119 | -------------------------------------------------------------------------------- /cufft/cgoflags.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | // This file provides CGO flags to find CUDA libraries and headers. 4 | 5 | //#cgo LDFLAGS:-lcufft 6 | // 7 | ////default location: 8 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 9 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 10 | // 11 | ////default location if not properly symlinked: 12 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 13 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 14 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 15 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 16 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 17 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 18 | // 19 | ////arch linux: 20 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 21 | //#cgo CFLAGS: -I/opt/cuda/include 22 | // 23 | ////WINDOWS: 24 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 25 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w 26 | import "C" 27 | -------------------------------------------------------------------------------- /cufft/doc.go: -------------------------------------------------------------------------------- 1 | // Go bindings for the CUDA CUFFT API. 2 | package cufft 3 | -------------------------------------------------------------------------------- /cufft/fft_test.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | "unsafe" 7 | ) 8 | 9 | func ExampleFFT1D() { 10 | N := 8 11 | 12 | hostIn := make([]float32, N) 13 | hostIn[0] = 1 14 | 15 | devIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32) 16 | defer cu.MemFree(&devIn) 17 | cu.MemcpyHtoD(devIn, unsafe.Pointer(&hostIn[0]), devIn.Bytes()) 18 | 19 | hostOut := make([]complex64, N/2+1) 20 | devOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64) 21 | defer cu.MemFree(&devOut) 22 | 23 | plan := Plan1d(N, R2C, 1) 24 | defer plan.Destroy() 25 | plan.ExecR2C(devIn, devOut) 26 | 27 | cu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), devOut, devOut.Bytes()) 28 | 29 | fmt.Println("hostIn:", hostIn) 30 | fmt.Println("hostOut:", hostOut) 31 | 32 | // Output: 33 | // hostIn: [1 0 0 0 0 0 0 0] 34 | // hostOut: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] 35 | } 36 | -------------------------------------------------------------------------------- /cufft/init_test.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | ) 7 | 8 | // needed for all other tests. 
9 | func init() { 10 | cu.Init(0) 11 | ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0) 12 | cu.CtxSetCurrent(ctx) 13 | fmt.Println("Created CUDA context") 14 | } 15 | -------------------------------------------------------------------------------- /cufft/mode.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | // CUFFT compatibility mode 11 | type CompatibilityMode int 12 | 13 | const ( 14 | COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE 15 | COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING 16 | COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC 17 | COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL 18 | ) 19 | 20 | func (t CompatibilityMode) String() string { 21 | if str, ok := compatibilityModeString[t]; ok { 22 | return str 23 | } 24 | return fmt.Sprint("CUFFT Compatibility mode with unknown number:", int(t)) 25 | } 26 | 27 | var compatibilityModeString map[CompatibilityMode]string = map[CompatibilityMode]string{ 28 | COMPATIBILITY_NATIVE: "CUFFT_COMPATIBILITY_NATIVE", 29 | COMPATIBILITY_FFTW_PADDING: "CUFFT_COMPATIBILITY_FFTW_PADDING", 30 | COMPATIBILITY_FFTW_ASYMMETRIC: "CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC", 31 | COMPATIBILITY_FFTW_ALL: "CUFFT_COMPATIBILITY_FFTW_ALL"} 32 | -------------------------------------------------------------------------------- /cufft/plan.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved. 2 | // Use of this source code is governed by a freeBSD 3 | // license that can be found in the LICENSE.txt file. 
4 | 5 | package cufft 6 | 7 | //#include 8 | import "C" 9 | 10 | import ( 11 | "github.com/barnex/cuda5/cu" 12 | "unsafe" 13 | ) 14 | 15 | // FFT plan handle, reference type to a plan 16 | type Handle uintptr 17 | 18 | // 1D FFT plan 19 | func Plan1d(nx int, typ Type, batch int) Handle { 20 | var handle C.cufftHandle 21 | err := Result(C.cufftPlan1d( 22 | &handle, 23 | C.int(nx), 24 | C.cufftType(typ), 25 | C.int(batch))) 26 | if err != SUCCESS { 27 | panic(err) 28 | } 29 | return Handle(handle) 30 | } 31 | 32 | // 2D FFT plan 33 | func Plan2d(nx, ny int, typ Type) Handle { 34 | var handle C.cufftHandle 35 | err := Result(C.cufftPlan2d( 36 | &handle, 37 | C.int(nx), 38 | C.int(ny), 39 | C.cufftType(typ))) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | return Handle(handle) 44 | } 45 | 46 | // 3D FFT plan 47 | func Plan3d(nx, ny, nz int, typ Type) Handle { 48 | var handle C.cufftHandle 49 | err := Result(C.cufftPlan3d( 50 | &handle, 51 | C.int(nx), 52 | C.int(ny), 53 | C.int(nz), 54 | C.cufftType(typ))) 55 | if err != SUCCESS { 56 | panic(err) 57 | } 58 | return Handle(handle) 59 | } 60 | 61 | //cufftPlanMany( 62 | // cufftHandle *plan, int rank, int *n, int *inembed, 63 | // int istride, int idist, int *onembed, int ostride, 64 | // int odist, cufftType type, int batch ); 65 | 66 | // 1D,2D or 3D FFT plan 67 | func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle { 68 | var handle C.cufftHandle 69 | 70 | NULL := (*C.int)(unsafe.Pointer(uintptr(0))) 71 | 72 | inembedptr := NULL 73 | idist := 0 74 | if inembed != nil { 75 | inembedptr = (*C.int)(unsafe.Pointer(&inembed[0])) 76 | idist = inembed[0] 77 | } 78 | 79 | oembedptr := NULL 80 | odist := 0 81 | if oembed != nil { 82 | oembedptr = (*C.int)(unsafe.Pointer(&oembed[0])) 83 | odist = oembed[0] 84 | } 85 | 86 | err := Result(C.cufftPlanMany( 87 | &handle, 88 | C.int(len(n)), // rank 89 | (*C.int)(unsafe.Pointer(&n[0])), // n 90 | inembedptr, 91 | C.int(istride), 92 | C.int(idist), 93 | oembedptr, 94 | C.int(ostride), 95 | C.int(odist), 96 | C.cufftType(typ), 97 | C.int(batch))) 98 | if err != SUCCESS { 99 | panic(err) 100 | } 101 | return Handle(handle) 102 | } 103 | 104 | // Execute Complex-to-Complex plan 105 | func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) { 106 | err := Result(C.cufftExecC2C( 107 | C.cufftHandle(plan), 108 | (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), 109 | (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))), 110 | C.int(direction))) 111 | if err != SUCCESS { 112 | panic(err) 113 | } 114 | } 115 | 116 | // Execute Real-to-Complex plan 117 | func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) { 118 | err := Result(C.cufftExecR2C( 119 | C.cufftHandle(plan), 120 | (*C.cufftReal)(unsafe.Pointer(uintptr(idata))), 121 | (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))))) 122 | if err != SUCCESS { 123 | panic(err) 124 | } 125 | } 126 | 127 | // Execute Complex-to-Real plan 128 | func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) { 129 | err := Result(C.cufftExecC2R( 130 | C.cufftHandle(plan), 131 | (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), 132 | (*C.cufftReal)(unsafe.Pointer(uintptr(odata))))) 133 | if err != SUCCESS { 134 | panic(err) 135 | } 136 | } 137 | 138 | // Execute Double Complex-to-Complex plan 139 | func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) { 140 | err := Result(C.cufftExecZ2Z( 141 | C.cufftHandle(plan), 142 | (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), 143 | 
(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))), 144 | C.int(direction))) 145 | if err != SUCCESS { 146 | panic(err) 147 | } 148 | } 149 | 150 | // Execute Double Real-to-Complex plan 151 | func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) { 152 | err := Result(C.cufftExecD2Z( 153 | C.cufftHandle(plan), 154 | (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata))), 155 | (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))))) 156 | if err != SUCCESS { 157 | panic(err) 158 | } 159 | } 160 | 161 | // Execute Double Complex-to-Real plan 162 | func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) { 163 | err := Result(C.cufftExecZ2D( 164 | C.cufftHandle(plan), 165 | (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), 166 | (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata))))) 167 | if err != SUCCESS { 168 | panic(err) 169 | } 170 | } 171 | 172 | // Destroys the plan. 173 | func (plan *Handle) Destroy() { 174 | err := Result(C.cufftDestroy(C.cufftHandle(*plan))) 175 | *plan = 0 // make sure plan is not used anymore 176 | if err != SUCCESS { 177 | panic(err) 178 | } 179 | } 180 | 181 | // Sets the cuda stream for this plan 182 | func (plan Handle) SetStream(stream cu.Stream) { 183 | err := Result(C.cufftSetStream( 184 | C.cufftHandle(plan), 185 | C.cudaStream_t(unsafe.Pointer(uintptr(stream))))) 186 | if err != SUCCESS { 187 | panic(err) 188 | } 189 | } 190 | 191 | // Sets the FFTW compatibility mode 192 | func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) { 193 | err := Result(C.cufftSetCompatibilityMode( 194 | C.cufftHandle(plan), 195 | C.cufftCompatibility(mode))) 196 | if err != SUCCESS { 197 | panic(err) 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /cufft/result.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | // FFT result 11 | type Result int 12 | 13 | // FFT result value 14 | const ( 15 | SUCCESS Result = C.CUFFT_SUCCESS 16 | INVALID_PLAN Result = C.CUFFT_INVALID_PLAN 17 | ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED 18 | INVALID_TYPE Result = C.CUFFT_INVALID_TYPE 19 | INVALID_VALUE Result = C.CUFFT_INVALID_VALUE 20 | INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR 21 | EXEC_FAILED Result = C.CUFFT_EXEC_FAILED 22 | SETUP_FAILED Result = C.CUFFT_SETUP_FAILED 23 | INVALID_SIZE Result = C.CUFFT_INVALID_SIZE 24 | UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA 25 | INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h 26 | INVALID_DEVICE Result = 0xB 27 | PARSE_ERROR Result = 0xC 28 | NO_WORKSPACE Result = 0xD 29 | ) 30 | 31 | func (r Result) String() string { 32 | if str, ok := resultString[r]; ok { 33 | return str 34 | } 35 | return fmt.Sprint("CUFFT Result with unknown error number:", int(r)) 36 | } 37 | 38 | var resultString map[Result]string = map[Result]string{ 39 | SUCCESS: "CUFFT_SUCCESS", 40 | INVALID_PLAN: "CUFFT_INVALID_PLAN", 41 | ALLOC_FAILED: "CUFFT_ALLOC_FAILED", 42 | INVALID_TYPE: "CUFFT_INVALID_TYPE", 43 | INVALID_VALUE: "CUFFT_INVALID_VALUE", 44 | INTERNAL_ERROR: "CUFFT_INTERNAL_ERROR", 45 | EXEC_FAILED: "CUFFT_EXEC_FAILED", 46 | SETUP_FAILED: "CUFFT_SETUP_FAILED", 47 | INVALID_SIZE: "CUFFT_INVALID_SIZE", 48 | UNALIGNED_DATA: "CUFFT_UNALIGNED_DATA", 49 | INCOMPLETE_PARAMETER_LIST: "CUFFT_INCOMPLETE_PARAMETER_LIST", 50 | INVALID_DEVICE: "CUFFT_INVALID_DEVICE", 51 | PARSE_ERROR: "CUFFT_PARSE_ERROR", 52 | NO_WORKSPACE: 
"CUFFT_NO_WORKSPACE"} 53 | -------------------------------------------------------------------------------- /cufft/type.go: -------------------------------------------------------------------------------- 1 | package cufft 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | // FFT type 11 | type Type int 12 | 13 | const ( 14 | R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) 15 | C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real 16 | C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved 17 | D2Z Type = C.CUFFT_D2Z // Double to Double-Complex 18 | Z2D Type = C.CUFFT_Z2D // Double-Complex to Double 19 | Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex 20 | ) 21 | 22 | const ( 23 | FORWARD = -1 // Forward FFT 24 | INVERSE = 1 // Inverse FFT 25 | ) 26 | 27 | func (t Type) String() string { 28 | if str, ok := typeString[t]; ok { 29 | return str 30 | } 31 | return fmt.Sprint("CUFFT Type with unknown number:", int(t)) 32 | } 33 | 34 | var typeString map[Type]string = map[Type]string{ 35 | R2C: "CUFFT_R2C", 36 | C2R: "CUFFT_C2R", 37 | C2C: "CUFFT_C2C", 38 | D2Z: "CUFFT_D2Z", 39 | Z2D: "CUFFT_Z2D", 40 | Z2Z: "CUFFT_Z2Z"} 41 | -------------------------------------------------------------------------------- /curand/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g gccgo doc 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. 
-compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | 32 | doc: 33 | godoc github.com/barnex/cuda5/curand > README 34 | -------------------------------------------------------------------------------- /curand/README: -------------------------------------------------------------------------------- 1 | PACKAGE DOCUMENTATION 2 | 3 | package curand 4 | import "github.com/barnex/cuda5/curand" 5 | 6 | 7 | 8 | TYPES 9 | 10 | type Generator uintptr 11 | 12 | 13 | func CreateGenerator(rngType RngType) Generator 14 | 15 | 16 | func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) 17 | 18 | func (g Generator) SetSeed(seed int64) 19 | 20 | 21 | type RngType int 22 | 23 | const ( 24 | PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator 25 | PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator 26 | QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator 27 | QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator 28 | QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator 29 | QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator 30 | QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator 31 | ) 32 | 33 | 34 | 35 | type Status int 36 | 37 | const ( 38 | SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors 39 | VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match 40 | NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized 41 | ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed 42 | TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type 43 | OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range 44 | LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension 45 | LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure 46 | PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry 47 | INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed 48 | ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature 49 | INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error 50 | ) 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /curand/cgoflags.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | // This file provides CGO flags to find CUDA libraries and headers. 
4 | 5 | //#cgo LDFLAGS:-lcurand 6 | // 7 | ////default location: 8 | //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib 9 | //#cgo CFLAGS: -I/usr/local/cuda/include/ 10 | // 11 | ////default location if not properly symlinked: 12 | //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib 13 | //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib 14 | //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib 15 | //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ 16 | //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ 17 | //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ 18 | // 19 | ////arch linux: 20 | //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib 21 | //#cgo CFLAGS: -I/opt/cuda/include 22 | // 23 | ////WINDOWS: 24 | //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 25 | //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w 26 | import "C" 27 | -------------------------------------------------------------------------------- /curand/generator.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "unsafe" 8 | ) 9 | 10 | type Generator uintptr 11 | 12 | type RngType int 13 | 14 | const ( 15 | PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator 16 | PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator 17 | QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator 18 | QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator 19 | QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator 20 | QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator 21 | QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator 22 | ) 23 | 24 | func CreateGenerator(rngType RngType) Generator { 25 | var rng C.curandGenerator_t 26 | err := Status(C.curandCreateGenerator(&rng, C.curandRngType_t(rngType))) 27 | if err != SUCCESS { 28 | panic(err) 29 | } 30 | return Generator(uintptr(unsafe.Pointer(rng))) // cgo 31 | } 32 | 33 | func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) { 34 | err := Status(C.curandGenerateNormal( 35 | C.curandGenerator_t(unsafe.Pointer(uintptr(g))), 36 | (*C.float)(unsafe.Pointer(output)), 37 | C.size_t(n), 38 | C.float(mean), 39 | C.float(stddev))) 40 | if err != SUCCESS { 41 | panic(err) 42 | } 43 | } 44 | 45 | func (g Generator) SetSeed(seed int64) { 46 | err := Status(C.curandSetPseudoRandomGeneratorSeed(C.curandGenerator_t(unsafe.Pointer(uintptr(g))), _Ctype_ulonglong(seed))) 47 | if err != SUCCESS { 48 | panic(err) 49 | } 50 | } 51 | 52 | // Documentation was taken from the curand headers. 
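A rough usage sketch for the generator above (editorial, not from the package tests; it assumes a context set up as in the cufft tests, and that cu.DevicePtr converts to uintptr, as the other wrappers in this repository assume):

    package main

    import (
        "github.com/barnex/cuda5/cu"
        "github.com/barnex/cuda5/curand"
    )

    func main() {
        cu.Init(0)
        cu.CtxSetCurrent(cu.CtxCreate(cu.CTX_SCHED_AUTO, 0))

        const n = 1024 // CURAND requires an even count for normal generation
        buf := cu.MemAlloc(n * cu.SIZEOF_FLOAT32)
        defer cu.MemFree(&buf)

        g := curand.CreateGenerator(curand.PSEUDO_DEFAULT)
        g.SetSeed(42)
        g.GenerateNormal(uintptr(buf), n, 0, 1) // mean 0, stddev 1, written to device memory
    }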
53 | -------------------------------------------------------------------------------- /curand/status.go: -------------------------------------------------------------------------------- 1 | package curand 2 | 3 | //#include 4 | import "C" 5 | 6 | import ( 7 | "fmt" 8 | ) 9 | 10 | type Status int 11 | 12 | const ( 13 | SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors 14 | VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match 15 | NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized 16 | ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed 17 | TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type 18 | OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range 19 | LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension 20 | LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure 21 | PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry 22 | INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed 23 | ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature 24 | INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error 25 | ) 26 | 27 | func (s Status) String() string { 28 | if str, ok := statusStr[s]; ok { 29 | return str 30 | } else { 31 | return fmt.Sprint("CURAND ERROR NUMBER ", int(s)) 32 | } 33 | } 34 | 35 | var statusStr = map[Status]string{ 36 | SUCCESS: "CURAND_STATUS_SUCCESS", 37 | VERSION_MISMATCH: "CURAND_STATUS_VERSION_MISMATCH", 38 | NOT_INITIALIZED: "CURAND_STATUS_NOT_INITIALIZED", 39 | ALLOCATION_FAILED: "CURAND_STATUS_ALLOCATION_FAILED", 40 | TYPE_ERROR: "CURAND_STATUS_TYPE_ERROR", 41 | OUT_OF_RANGE: "CURAND_STATUS_OUT_OF_RANGE", 42 | LENGTH_NOT_MULTIPLE: "CURAND_STATUS_LENGTH_NOT_MULTIPLE", 43 | LAUNCH_FAILURE: "CURAND_STATUS_LAUNCH_FAILURE", 44 | PREEXISTING_FAILURE: "CURAND_STATUS_PREEXISTING_FAILURE", 45 | INITIALIZATION_FAILED: "CURAND_STATUS_INITIALIZATION_FAILED", 46 | ARCH_MISMATCH: "CURAND_STATUS_ARCH_MISMATCH", 47 | INTERNAL_ERROR: "CURAND_STATUS_INTERNAL_ERROR", 48 | } 49 | 50 | // Documentation was taken from the curand headers. 51 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Go bindings for nVIDIA CUDA 5. 3 | This package compiles with both gc and gccgo. 4 | */ 5 | package cuda5 6 | 7 | // Dummy imports so that 8 | // go get github.com/barnex/cuda5 9 | // will install everything. 
10 | import ( 11 | _ "github.com/barnex/cuda5/cu" 12 | _ "github.com/barnex/cuda5/cufft" 13 | _ "github.com/barnex/cuda5/safe" 14 | ) 15 | -------------------------------------------------------------------------------- /gophergpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/barnex/cuda5/da30a9b287d8f7ad210d42d911e33ef5c511544b/gophergpu.png -------------------------------------------------------------------------------- /safe/Makefile: -------------------------------------------------------------------------------- 1 | all: 6g doc #gccgo 2 | 3 | 6g: 4 | go install -v 5 | go tool vet *.go 6 | gofmt -w *.go 7 | 8 | GCCGO=gccgo -gccgoflags '-static-libgcc -O3' 9 | 10 | gccgo: 11 | go build -v -compiler $(GCCGO) 12 | 13 | test: 6gtest gccgotest 14 | 15 | 6gtest: 16 | go test 17 | 18 | gccgotest: 19 | go test -compiler $(GCCGO) 20 | 21 | bench: 6gbench gccgobench 22 | 23 | 6gbench: 24 | go test -bench=. 25 | 26 | gccgobench: 27 | go test -bench=. -compiler $(GCCGO) 28 | 29 | clean: 30 | go clean 31 | go-optview -c -w *.go 32 | gofmt -w *.go 33 | 34 | opt: 35 | go-optview -w *.go 36 | gofmt -w *.go 37 | 38 | doc: 39 | godoc github.com/barnex/cuda5/safe > README 40 | -------------------------------------------------------------------------------- /safe/README: -------------------------------------------------------------------------------- 1 | PACKAGE 2 | 3 | package safe 4 | import "github.com/barnex/cuda5/safe" 5 | 6 | Safe and more idiomatic wrappers for the low-level CUDA functions. 7 | 8 | FUNCTIONS 9 | 10 | func InitCuda() 11 | 12 | 13 | TYPES 14 | 15 | type Complex128s struct { 16 | // contains filtered or unexported fields 17 | } 18 | Slice of complex128's on the GPU. 19 | 20 | func MakeComplex128s(len_ int) Complex128s 21 | Make a slice of complex128's on the GPU. Initialized to zero. 22 | 23 | func (s *Complex128s) Cap() int 24 | Slice capacity. 25 | 26 | func (dst Complex128s) CopyDtoD(src Complex128s) 27 | Copy src on host to dst on host. 28 | 29 | func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) 30 | Copy src on host to dst on host, asynchronously. 31 | 32 | func (src Complex128s) CopyDtoH(dst []complex128) 33 | Copy src form device to dst on host. 34 | 35 | func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) 36 | Copy src form device to dst on host, asynchronously. 37 | 38 | func (dst Complex128s) CopyHtoD(src []complex128) 39 | Copy src from host to dst on the device. 40 | 41 | func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) 42 | Copy src from host to dst on the device, asynchronously. 43 | 44 | func (s Complex128s) Float() Float64s 45 | Re-interpret the array as float numbers, in interleaved format. 46 | Underlying storage is shared. 47 | 48 | func (s *Complex128s) Free() 49 | Free the underlying storage. To be used with care. Free() should only be 50 | called on a slice created by MakeXXX(), not on a slice created by 51 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 52 | 53 | func (src Complex128s) Host() []complex128 54 | Returns a fresh copy on host. 55 | 56 | func (s *Complex128s) Len() int 57 | Slice length (number of elements). 58 | 59 | func (s *Complex128s) Pointer() cu.DevicePtr 60 | Pointer to the first element. 61 | 62 | func (s Complex128s) Slice(start, stop int) Complex128s 63 | Return a slice from start (inclusive) to stop (exclusive), sharing the 64 | underlying storage with the original slice. 
Slices obtained in this way 65 | should not be Free()'d 66 | 67 | func (s *Complex128s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 68 | Manually set the pointer, length and capacity. Side-steps the security 69 | mechanisms, use with caution. 70 | 71 | type Complex64s struct { 72 | // contains filtered or unexported fields 73 | } 74 | Slice of complex64's on the GPU. 75 | 76 | func MakeComplex64s(len_ int) Complex64s 77 | Make a slice of complex64's on the GPU. Initialized to zero. 78 | 79 | func (s *Complex64s) Cap() int 80 | Slice capacity. 81 | 82 | func (dst Complex64s) CopyDtoD(src Complex64s) 83 | Copy src on host to dst on host. 84 | 85 | func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) 86 | Copy src on host to dst on host, asynchronously. 87 | 88 | func (src Complex64s) CopyDtoH(dst []complex64) 89 | Copy src form device to dst on host. 90 | 91 | func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) 92 | Copy src form device to dst on host, asynchronously. 93 | 94 | func (dst Complex64s) CopyHtoD(src []complex64) 95 | Copy src from host to dst on the device. 96 | 97 | func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) 98 | Copy src from host to dst on the device, asynchronously. 99 | 100 | func (s Complex64s) Float() Float32s 101 | Re-interpret the array as float numbers, in interleaved format. 102 | Underlying storage is shared. 103 | 104 | func (s *Complex64s) Free() 105 | Free the underlying storage. To be used with care. Free() should only be 106 | called on a slice created by MakeXXX(), not on a slice created by 107 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 108 | 109 | func (src Complex64s) Host() []complex64 110 | Returns a fresh copy on host. 111 | 112 | func (s *Complex64s) Len() int 113 | Slice length (number of elements). 114 | 115 | func (s *Complex64s) Pointer() cu.DevicePtr 116 | Pointer to the first element. 117 | 118 | func (s Complex64s) Slice(start, stop int) Complex64s 119 | Return a slice from start (inclusive) to stop (exclusive), sharing the 120 | underlying storage with the original slice. Slices obtained in this way 121 | should not be Free()'d 122 | 123 | func (s *Complex64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 124 | Manually set the pointer, length and capacity. Side-steps the security 125 | mechanisms, use with caution. 126 | 127 | type FFT1DC2RPlan struct { 128 | // contains filtered or unexported fields 129 | } 130 | 1D single-precission complex-to-real FFT plan. 131 | 132 | func FFT1DC2R(size, batch int) FFT1DC2RPlan 133 | 1D single-precission complex-to-real FFT plan. 134 | 135 | func (p FFT1DC2RPlan) Destroy() 136 | Releases all resources associated with the FFT plan. 137 | 138 | func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) 139 | Execute the FFT plan. Synchronized. 140 | 141 | func (p FFT1DC2RPlan) InputLen() int 142 | Required length of the output array. 143 | 144 | func (p FFT1DC2RPlan) OutputLen() int 145 | Required length of the input array. 146 | 147 | func (p FFT1DC2RPlan) SetStream(stream cu.Stream) 148 | Associates a CUDA stream with the FFT plan. If a stream is set, 149 | plan.Stream().Synchronize() can to be called to wait for the execution 150 | to finish. 151 | 152 | func (s FFT1DC2RPlan) Size() int 153 | Returns the logical size of the FFT: the number of elements (real or 154 | complex) it transforms. 
155 | 156 | func (p FFT1DC2RPlan) Stream() cu.Stream 157 | Returns the CUDA stream associated with the FFT plan. 158 | 159 | type FFT1DR2CPlan struct { 160 | // contains filtered or unexported fields 161 | } 162 | 1D single-precission real-to-complex FFT plan. 163 | 164 | func FFT1DR2C(size, batch int) FFT1DR2CPlan 165 | 1D single-precission real-to-complex FFT plan. 166 | 167 | func (p FFT1DR2CPlan) Destroy() 168 | Releases all resources associated with the FFT plan. 169 | 170 | func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) 171 | Execute the FFT plan. Synchronized. 172 | 173 | func (p FFT1DR2CPlan) InputLen() int 174 | Required length of the input array. 175 | 176 | func (p FFT1DR2CPlan) OutputLen() int 177 | Required length of the output array. 178 | 179 | func (p FFT1DR2CPlan) SetStream(stream cu.Stream) 180 | Associates a CUDA stream with the FFT plan. If a stream is set, 181 | plan.Stream().Synchronize() can to be called to wait for the execution 182 | to finish. 183 | 184 | func (s FFT1DR2CPlan) Size() int 185 | Returns the logical size of the FFT: the number of elements (real or 186 | complex) it transforms. 187 | 188 | func (p FFT1DR2CPlan) Stream() cu.Stream 189 | Returns the CUDA stream associated with the FFT plan. 190 | 191 | type FFT3DC2RPlan struct { 192 | // contains filtered or unexported fields 193 | } 194 | 3D single-precission real-to-complex FFT plan. 195 | 196 | func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan 197 | 3D single-precission real-to-complex FFT plan. 198 | 199 | func (p FFT3DC2RPlan) Destroy() 200 | Releases all resources associated with the FFT plan. 201 | 202 | func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) 203 | Execute the FFT plan. src and dst are 3D arrays stored 1D arrays. 204 | 205 | func (p FFT3DC2RPlan) InputLen() int 206 | Required length of the (1D) input array. 207 | 208 | func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) 209 | 3D size of the input array. 210 | 211 | func (p FFT3DC2RPlan) OutputLen() int 212 | Required length of the (1D) output array. 213 | 214 | func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) 215 | 3D size of the output array. 216 | 217 | func (p FFT3DC2RPlan) SetStream(stream cu.Stream) 218 | Associates a CUDA stream with the FFT plan. If a stream is set, 219 | plan.Stream().Synchronize() can to be called to wait for the execution 220 | to finish. 221 | 222 | func (s FFT3DC2RPlan) Size() (Nx, Ny, Nz int) 223 | Returns the logical size of the FFT: the number of elements (real or 224 | complex) it transforms. 225 | 226 | func (p FFT3DC2RPlan) Stream() cu.Stream 227 | Returns the CUDA stream associated with the FFT plan. 228 | 229 | type FFT3DD2ZPlan struct { 230 | // contains filtered or unexported fields 231 | } 232 | 3D single-precission real-to-complex FFT plan. 233 | 234 | func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan 235 | 3D single-precission real-to-complex FFT plan. 236 | 237 | func (p FFT3DD2ZPlan) Destroy() 238 | Releases all resources associated with the FFT plan. 239 | 240 | func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) 241 | Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D 242 | arrays. 243 | 244 | func (p FFT3DD2ZPlan) InputLen() int 245 | Required length of the (1D) input array. 246 | 247 | func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) 248 | 3D size of the input array. 249 | 250 | func (p FFT3DD2ZPlan) OutputLen() int 251 | Required length of the (1D) output array. 
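For the real-to-complex style plans the two lengths above follow the usual half-spectrum rule: OutputLen = Nx*Ny*(Nz/2+1) while InputLen = Nx*Ny*Nz. A small illustrative sketch (editorial, not from the package tests):

    package main

    import (
        "fmt"
        "github.com/barnex/cuda5/safe"
    )

    func main() {
        safe.InitCuda()
        p := safe.FFT3DD2Z(2, 4, 8)
        defer p.Destroy()
        // InputLen  = Nx*Ny*Nz       = 2*4*8 = 64 float64 values
        // OutputLen = Nx*Ny*(Nz/2+1) = 2*4*5 = 40 complex128 values
        fmt.Println(p.InputLen(), p.OutputLen()) // 64 40
    }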
252 | 253 | func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) 254 | 3D size of the output array. 255 | 256 | func (p FFT3DD2ZPlan) SetStream(stream cu.Stream) 257 | Associates a CUDA stream with the FFT plan. If a stream is set, 258 | plan.Stream().Synchronize() can to be called to wait for the execution 259 | to finish. 260 | 261 | func (s FFT3DD2ZPlan) Size() (Nx, Ny, Nz int) 262 | Returns the logical size of the FFT: the number of elements (real or 263 | complex) it transforms. 264 | 265 | func (p FFT3DD2ZPlan) Stream() cu.Stream 266 | Returns the CUDA stream associated with the FFT plan. 267 | 268 | type FFT3DR2CPlan struct { 269 | // contains filtered or unexported fields 270 | } 271 | 3D single-precission real-to-complex FFT plan. 272 | 273 | func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan 274 | 3D single-precission real-to-complex FFT plan. 275 | 276 | func (p FFT3DR2CPlan) Destroy() 277 | Releases all resources associated with the FFT plan. 278 | 279 | func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) 280 | Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D 281 | arrays. 282 | 283 | func (p FFT3DR2CPlan) InputLen() int 284 | Required length of the (1D) input array. 285 | 286 | func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) 287 | 3D size of the input array. 288 | 289 | func (p FFT3DR2CPlan) OutputLen() int 290 | Required length of the (1D) output array. 291 | 292 | func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) 293 | 3D size of the output array. 294 | 295 | func (p FFT3DR2CPlan) SetStream(stream cu.Stream) 296 | Associates a CUDA stream with the FFT plan. If a stream is set, 297 | plan.Stream().Synchronize() can to be called to wait for the execution 298 | to finish. 299 | 300 | func (s FFT3DR2CPlan) Size() (Nx, Ny, Nz int) 301 | Returns the logical size of the FFT: the number of elements (real or 302 | complex) it transforms. 303 | 304 | func (p FFT3DR2CPlan) Stream() cu.Stream 305 | Returns the CUDA stream associated with the FFT plan. 306 | 307 | type FFT3DZ2DPlan struct { 308 | // contains filtered or unexported fields 309 | } 310 | 3D single-precission real-to-complex FFT plan. 311 | 312 | func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan 313 | 3D single-precission real-to-complex FFT plan. 314 | 315 | func (p FFT3DZ2DPlan) Destroy() 316 | Releases all resources associated with the FFT plan. 317 | 318 | func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) 319 | Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D 320 | arrays. 321 | 322 | func (p FFT3DZ2DPlan) InputLen() int 323 | Required length of the (1D) input array. 324 | 325 | func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) 326 | 3D size of the input array. 327 | 328 | func (p FFT3DZ2DPlan) OutputLen() int 329 | Required length of the (1D) output array. 330 | 331 | func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) 332 | 3D size of the output array. 333 | 334 | func (p FFT3DZ2DPlan) SetStream(stream cu.Stream) 335 | Associates a CUDA stream with the FFT plan. If a stream is set, 336 | plan.Stream().Synchronize() can to be called to wait for the execution 337 | to finish. 338 | 339 | func (s FFT3DZ2DPlan) Size() (Nx, Ny, Nz int) 340 | Returns the logical size of the FFT: the number of elements (real or 341 | complex) it transforms. 342 | 343 | func (p FFT3DZ2DPlan) Stream() cu.Stream 344 | Returns the CUDA stream associated with the FFT plan. 345 | 346 | type Float32s struct { 347 | // contains filtered or unexported fields 348 | } 349 | Slice of float32's on the GPU. 
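A minimal sketch of working with such a device slice (editorial, not part of the generated docs; it uses only the methods documented below):

    package main

    import (
        "fmt"
        "github.com/barnex/cuda5/safe"
    )

    func main() {
        safe.InitCuda()

        a := safe.MakeFloat32s(16) // device memory, zero-initialized
        defer a.Free()

        host := make([]float32, 16)
        host[0] = 42
        a.CopyHtoD(host)         // upload
        a.Slice(8, 16).Memset(1) // shares storage with a, so this writes into a
        fmt.Println(a.Host())    // fresh host copy: [42 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
    }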
350 | 351 | func MakeFloat32s(len_ int) Float32s 352 | Make a slice of float32's on the GPU. Initialized to zero. 353 | 354 | func (s *Float32s) Cap() int 355 | Slice capacity. 356 | 357 | func (s Float32s) Complex() Complex64s 358 | Re-interpret the array as complex numbers, in interleaved format. 359 | Underlying storage is shared. 360 | 361 | func (dst Float32s) CopyDtoD(src Float32s) 362 | Copy src on host to dst on host. 363 | 364 | func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) 365 | Copy src on host to dst on host, asynchronously. 366 | 367 | func (src Float32s) CopyDtoH(dst []float32) 368 | Copy src form device to dst on host. 369 | 370 | func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) 371 | Copy src form device to dst on host, asynchronously. 372 | 373 | func (dst Float32s) CopyHtoD(src []float32) 374 | Copy src from host to dst on the device. 375 | 376 | func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) 377 | Copy src from host to dst on the device, asynchronously. 378 | 379 | func (s *Float32s) Free() 380 | Free the underlying storage. To be used with care. Free() should only be 381 | called on a slice created by MakeXXX(), not on a slice created by 382 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 383 | 384 | func (src Float32s) Host() []float32 385 | Returns a fresh copy on host. 386 | 387 | func (s *Float32s) Len() int 388 | Slice length (number of elements). 389 | 390 | func (s Float32s) Memset(value float32) 391 | Set the entire slice to this value. 392 | 393 | func (s Float32s) MemsetAsync(value float32, stream cu.Stream) 394 | Set the entire slice to this value, asynchronously. 395 | 396 | func (s *Float32s) Pointer() cu.DevicePtr 397 | Pointer to the first element. 398 | 399 | func (s Float32s) Slice(start, stop int) Float32s 400 | Return a slice from start (inclusive) to stop (exclusive), sharing the 401 | underlying storage with the original slice. Slices obtained in this way 402 | should not be Free()'d 403 | 404 | func (s *Float32s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 405 | Manually set the pointer, length and capacity. Side-steps the security 406 | mechanisms, use with caution. 407 | 408 | type Float64s struct { 409 | // contains filtered or unexported fields 410 | } 411 | Slice of float64's on the GPU. 412 | 413 | func MakeFloat64s(len_ int) Float64s 414 | Make a slice of float64's on the GPU. Initialized to zero. 415 | 416 | func (s *Float64s) Cap() int 417 | Slice capacity. 418 | 419 | func (s Float64s) Complex() Complex128s 420 | Re-interpret the array as complex numbers, in interleaved format. 421 | Underlying storage is shared. 422 | 423 | func (dst Float64s) CopyDtoD(src Float64s) 424 | Copy src on host to dst on host. 425 | 426 | func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) 427 | Copy src on host to dst on host, asynchronously. 428 | 429 | func (src Float64s) CopyDtoH(dst []float64) 430 | Copy src form device to dst on host. 431 | 432 | func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) 433 | Copy src form device to dst on host, asynchronously. 434 | 435 | func (dst Float64s) CopyHtoD(src []float64) 436 | Copy src from host to dst on the device. 437 | 438 | func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) 439 | Copy src from host to dst on the device, asynchronously. 440 | 441 | func (s *Float64s) Free() 442 | Free the underlying storage. To be used with care. 
Free() should only be 443 | called on a slice created by MakeXXX(), not on a slice created by 444 | x.Slice(). Freeing a slice invalidates all other slices referring to it. 445 | 446 | func (src Float64s) Host() []float64 447 | Returns a fresh copy on host. 448 | 449 | func (s *Float64s) Len() int 450 | Slice length (number of elements). 451 | 452 | func (s *Float64s) Pointer() cu.DevicePtr 453 | Pointer to the first element. 454 | 455 | func (s Float64s) Slice(start, stop int) Float64s 456 | Return a slice from start (inclusive) to stop (exclusive), sharing the 457 | underlying storage with the original slice. Slices obtained in this way 458 | should not be Free()'d 459 | 460 | func (s *Float64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) 461 | Manually set the pointer, length and capacity. Side-steps the security 462 | mechanisms, use with caution. 463 | 464 | 465 | -------------------------------------------------------------------------------- /safe/complex128s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "github.com/barnex/cuda5/cu" 5 | "unsafe" 6 | ) 7 | 8 | // Slice of complex128's on the GPU. 9 | type Complex128s struct{ slice } 10 | 11 | // Make a slice of complex128's on the GPU. 12 | // Initialized to zero. 13 | func MakeComplex128s(len_ int) Complex128s { 14 | return Complex128s{makeslice(len_, cu.SIZEOF_COMPLEX128)} 15 | } 16 | 17 | // Return a slice from start (inclusive) to stop (exclusive), 18 | // sharing the underlying storage with the original slice. 19 | // Slices obtained in this way should not be Free()'d 20 | func (s Complex128s) Slice(start, stop int) Complex128s { 21 | return Complex128s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX128)} 22 | } 23 | 24 | // Copy src from host to dst on the device. 25 | func (dst Complex128s) CopyHtoD(src []complex128) { 26 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128) 27 | } 28 | 29 | // Copy src form device to dst on host. 30 | func (src Complex128s) CopyDtoH(dst []complex128) { 31 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128) 32 | } 33 | 34 | // Copy src on host to dst on host. 35 | func (dst Complex128s) CopyDtoD(src Complex128s) { 36 | dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX128) 37 | } 38 | 39 | // Copy src from host to dst on the device, asynchronously. 40 | func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) { 41 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128, stream) 42 | } 43 | 44 | // Copy src form device to dst on host, asynchronously. 45 | func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) { 46 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128, stream) 47 | } 48 | 49 | // Copy src on host to dst on host, asynchronously. 50 | func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) { 51 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX128, stream) 52 | } 53 | 54 | // Returns a fresh copy on host. 55 | func (src Complex128s) Host() []complex128 { 56 | cpy := make([]complex128, src.Len()) 57 | src.CopyDtoH(cpy) 58 | return cpy 59 | } 60 | 61 | // Re-interpret the array as float numbers, 62 | // in interleaved format. Underlying storage 63 | // is shared. 
64 | func (s Complex128s) Float() Float64s { 65 | return Float64s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}} 66 | } 67 | -------------------------------------------------------------------------------- /safe/complex128s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestComplex128sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeComplex128s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]complex128, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]complex128, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestComplex128sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeComplex128s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestComplex128sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeComplex128s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestComplex128sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]complex128, 100) 86 | 87 | b := MakeComplex128s(100) 88 | defer b.Free() 89 | 90 | c := MakeComplex128s(100) 91 | defer c.Free() 92 | 93 | d := make([]complex128, 200) 94 | 95 | for i := range a { 96 | a[i] = complex(float64(i), float64(2*i)) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]complex128, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/complex64s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "github.com/barnex/cuda5/cu" 5 | "unsafe" 6 | ) 7 | 8 | // Slice of complex64's on the GPU. 9 | type Complex64s struct{ slice } 10 | 11 | // Make a slice of complex64's on the GPU. 12 | // Initialized to zero. 13 | func MakeComplex64s(len_ int) Complex64s { 14 | return Complex64s{makeslice(len_, cu.SIZEOF_COMPLEX64)} 15 | } 16 | 17 | // Return a slice from start (inclusive) to stop (exclusive), 18 | // sharing the underlying storage with the original slice. 19 | // Slices obtained in this way should not be Free()'d 20 | func (s Complex64s) Slice(start, stop int) Complex64s { 21 | return Complex64s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX64)} 22 | } 23 | 24 | // Copy src from host to dst on the device. 
25 | func (dst Complex64s) CopyHtoD(src []complex64) { 26 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64) 27 | } 28 | 29 | // Copy src form device to dst on host. 30 | func (src Complex64s) CopyDtoH(dst []complex64) { 31 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64) 32 | } 33 | 34 | // Copy src on host to dst on host. 35 | func (dst Complex64s) CopyDtoD(src Complex64s) { 36 | dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX64) 37 | } 38 | 39 | // Copy src from host to dst on the device, asynchronously. 40 | func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) { 41 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64, stream) 42 | } 43 | 44 | // Copy src form device to dst on host, asynchronously. 45 | func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) { 46 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64, stream) 47 | } 48 | 49 | // Copy src on host to dst on host, asynchronously. 50 | func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) { 51 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX64, stream) 52 | } 53 | 54 | // Returns a fresh copy on host. 55 | func (src Complex64s) Host() []complex64 { 56 | cpy := make([]complex64, src.Len()) 57 | src.CopyDtoH(cpy) 58 | return cpy 59 | } 60 | 61 | // Re-interpret the array as float numbers, 62 | // in interleaved format. Underlying storage 63 | // is shared. 64 | func (s Complex64s) Float() Float32s { 65 | return Float32s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}} 66 | } 67 | -------------------------------------------------------------------------------- /safe/complex64s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestComplex64sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeComplex64s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]complex64, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]complex64, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestComplex64sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeComplex64s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestComplex64sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeComplex64s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestComplex64sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]complex64, 100) 86 | 87 | b := MakeComplex64s(100) 88 | defer b.Free() 89 | 90 | c := MakeComplex64s(100) 91 | defer c.Free() 92 | 93 
| d := make([]complex64, 200) 94 | 95 | for i := range a { 96 | a[i] = complex(float32(i), float32(2*i)) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]complex64, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Safe and more idiomatic wrappers for the low-level CUDA functions. 3 | */ 4 | package safe 5 | -------------------------------------------------------------------------------- /safe/fft1d_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func ExampleFFT1DR2C() { 8 | InitCuda() 9 | 10 | N := 8 11 | batch := 1 12 | 13 | fft := FFT1DR2C(N, batch) 14 | defer fft.Destroy() 15 | 16 | input := MakeFloat32s(N) 17 | defer input.Free() 18 | input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0}) 19 | 20 | output := MakeComplex64s(fft.OutputLen()) 21 | defer output.Free() 22 | 23 | fft.Exec(input, output) 24 | 25 | fmt.Println("input:", input.Host()) 26 | fmt.Println("output:", output.Host()) 27 | 28 | // Output: 29 | // input: [1 0 0 0 0 0 0 0] 30 | // output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] 31 | } 32 | 33 | func ExampleFFT1DR2C_Inplace() { 34 | InitCuda() 35 | 36 | N := 8 37 | batch := 2 38 | 39 | fft := FFT1DR2C(N, batch) 40 | defer fft.Destroy() 41 | 42 | output := MakeComplex64s(fft.OutputLen()) 43 | defer output.Free() 44 | 45 | input := output.Float().Slice(0, fft.InputLen()) 46 | // input uses same layout as out-of-place transform 47 | // (CUFFT native layout) 48 | input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}) 49 | fmt.Println("input:", input.Host()) 50 | 51 | fft.Exec(input, output) 52 | fmt.Println("output:", output.Host()) 53 | 54 | inverse := FFT1DC2R(N, batch) 55 | defer inverse.Destroy() 56 | inverse.Exec(output, input) 57 | fmt.Println("input:", input.Host()) 58 | 59 | // Output: 60 | // input: [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0] 61 | // output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] 62 | // input: [8 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0] 63 | } 64 | func ExampleFFT1DC2R() { 65 | InitCuda() 66 | 67 | N := 8 68 | batch := 1 69 | 70 | fft := FFT1DC2R(N, batch) 71 | defer fft.Destroy() 72 | 73 | input := MakeComplex64s(fft.InputLen()) 74 | defer input.Free() 75 | input.CopyHtoD([]complex64{(1 + 0i), (+1 + 0i), (+1 + 0i), (+1 - 0i), (+1 + 0i)}) 76 | 77 | output := MakeFloat32s(fft.OutputLen()) 78 | defer output.Free() 79 | 80 | fft.Exec(input, output) 81 | 82 | fmt.Println("input:", input.Host()) 83 | fmt.Println("output:", output.Host()) 84 | 85 | // Output: 86 | // input: [(1+0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i)] 87 | // output: [8 0 0 0 0 0 0 0] 88 | } 89 | -------------------------------------------------------------------------------- /safe/fft1dc2r.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 1D single-precission complex-to-real FFT plan. 9 | type FFT1DC2RPlan struct { 10 | fftplan 11 | size1D 12 | batch int 13 | } 14 | 15 | // 1D single-precission complex-to-real FFT plan. 
16 | func FFT1DC2R(size, batch int) FFT1DC2RPlan { 17 | handle := cufft.Plan1d(size, cufft.C2R, batch) 18 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 19 | return FFT1DC2RPlan{fftplan{handle, 0}, size1D(size), batch} 20 | } 21 | 22 | // Execute the FFT plan. Synchronized. 23 | func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecC2R(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // Required length of the input array. 37 | func (p FFT1DC2RPlan) OutputLen() int { 38 | return p.batch * p.Size() 39 | } 40 | 41 | // Required length of the output array. 42 | func (p FFT1DC2RPlan) InputLen() int { 43 | return p.batch * (p.Size()/2 + 1) 44 | } 45 | -------------------------------------------------------------------------------- /safe/fft1dr2c.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 1D single-precission real-to-complex FFT plan. 9 | type FFT1DR2CPlan struct { 10 | fftplan 11 | size1D 12 | batch int 13 | } 14 | 15 | // 1D single-precission real-to-complex FFT plan. 16 | func FFT1DR2C(size, batch int) FFT1DR2CPlan { 17 | handle := cufft.Plan1d(size, cufft.R2C, batch) 18 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 19 | return FFT1DR2CPlan{fftplan{handle, 0}, size1D(size), batch} 20 | } 21 | 22 | // Execute the FFT plan. Synchronized. 23 | func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecR2C(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // Required length of the input array. 37 | func (p FFT1DR2CPlan) InputLen() int { 38 | return p.batch * p.Size() 39 | } 40 | 41 | // Required length of the output array. 
42 | func (p FFT1DR2CPlan) OutputLen() int { 43 | return p.batch * (p.Size()/2 + 1) 44 | } 45 | -------------------------------------------------------------------------------- /safe/fft3d_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func ExampleFFT3DR2C() { 8 | InitCuda() 9 | 10 | Nx, Ny, Nz := 2, 4, 8 11 | 12 | fft := FFT3DR2C(Nx, Ny, Nz) 13 | defer fft.Destroy() 14 | 15 | input := MakeFloat32s(fft.InputLen()) 16 | defer input.Free() 17 | 18 | inputData := make([]float32, Nx*Ny*Nz) 19 | inputData[0*Ny*Nz] = 1 20 | inputData[1*Ny*Nz] = 1 21 | input.CopyHtoD(inputData) 22 | 23 | output := MakeComplex64s(fft.OutputLen()) 24 | defer output.Free() 25 | 26 | fft.Exec(input, output) 27 | 28 | fmt.Println("input:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz)) 29 | Ox, Oy, Oz := fft.OutputSize() 30 | fmt.Println("output:", Reshape3DComplex64(output.Host(), Ox, Oy, Oz)) 31 | 32 | // Output: 33 | // input: [[[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 34 | // output: [[[(2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)]] [[(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)]]] 35 | } 36 | 37 | func ExampleFFT3DC2R() { 38 | InitCuda() 39 | 40 | Nx, Ny, Nz := 2, 4, 8 41 | 42 | fft := FFT3DC2R(Nx, Ny, Nz) 43 | defer fft.Destroy() 44 | 45 | input := MakeComplex64s(fft.InputLen()) 46 | defer input.Free() 47 | 48 | inputData := make([]complex64, fft.InputLen()) 49 | for i := range inputData { 50 | inputData[i] = 2 51 | } 52 | input.CopyHtoD(inputData) 53 | 54 | output := MakeFloat32s(fft.OutputLen()) 55 | defer output.Free() 56 | 57 | fft.Exec(input, output) 58 | 59 | Ix, Iy, Iz := fft.InputSize() 60 | fmt.Println("input:", Reshape3DComplex64(input.Host(), Ix, Iy, Iz)) 61 | fmt.Println("output:", Reshape3DFloat32(output.Host(), Nx, Ny, Nz)) 62 | 63 | // Output: 64 | // input: [[[(2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]] [[(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]]] 65 | // output: [[[128 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 66 | } 67 | 68 | func ExampleFFT3D() { 69 | InitCuda() 70 | 71 | Nx, Ny, Nz := 2, 4, 8 72 | 73 | forward := FFT3DR2C(Nx, Ny, Nz) 74 | defer forward.Destroy() 75 | 76 | input := MakeFloat32s(forward.InputLen()) 77 | defer input.Free() 78 | 79 | inputData := make([]float32, forward.InputLen()) 80 | inputData[5] = 1 81 | input.CopyHtoD(inputData) 82 | 83 | output := MakeComplex64s(forward.OutputLen()) 84 | defer output.Free() 85 | 86 | forward.Exec(input, output) 87 | 88 | backward := FFT3DC2R(Nx, Ny, Nz) 89 | backward.Exec(output, input) 90 | 91 | fmt.Println("input:", Reshape3DFloat32(inputData, Nx, Ny, Nz)) 92 | fmt.Println("forward+inverse:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz)) 93 | 94 | // Output: 95 | // input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 
0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 96 | // forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 97 | } 98 | 99 | //func ExampleFFT3D64() { 100 | // InitCuda() 101 | // 102 | // Nx, Ny, Nz := 2, 4, 8 103 | // 104 | // forward := FFT3DD2Z(Nx, Ny, Nz) 105 | // defer forward.Destroy() 106 | // 107 | // input := MakeFloat64s(forward.InputLen()) 108 | // defer input.Free() 109 | // 110 | // inputData := make([]float64, forward.InputLen()) 111 | // inputData[5] = 1 112 | // input.CopyHtoD(inputData) 113 | // 114 | // output := MakeComplex128s(forward.OutputLen()) 115 | // defer output.Free() 116 | // 117 | // forward.Exec(input, output) 118 | // 119 | // backward := FFT3DZ2D(Nx, Ny, Nz) 120 | // backward.Exec(output, input) 121 | // 122 | // fmt.Println("input:", Reshape3DFloat64(inputData, Nx, Ny, Nz)) 123 | // fmt.Println("forward+inverse:", Reshape3DFloat64(input.Host(), Nx, Ny, Nz)) 124 | // 125 | // // Output: 126 | // // input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 127 | // // forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]] 128 | //} 129 | -------------------------------------------------------------------------------- /safe/fft3dc2r.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D single-precision complex-to-real FFT plan. 9 | type FFT3DC2RPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D single-precision complex-to-real FFT plan. 15 | func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.C2R) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DC2RPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecC2R(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 39 | } 40 | 41 | // 3D size of the output array. 42 | func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2] 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DC2RPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array.
52 | func (p FFT3DC2RPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fft3dd2z.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D double-precision real-to-complex FFT plan. 9 | type FFT3DD2ZPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D double-precision real-to-complex FFT plan. 15 | func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.D2Z) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DD2ZPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecD2Z(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2] 39 | } 40 | 41 | // 3D size of the output array. 42 | func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DD2ZPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array. 52 | func (p FFT3DD2ZPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fft3dr2c.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D single-precision real-to-complex FFT plan. 9 | type FFT3DR2CPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D single-precision real-to-complex FFT plan. 15 | func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.R2C) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DR2CPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecR2C(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2] 39 | } 40 | 41 | // 3D size of the output array.
42 | func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DR2CPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array. 52 | func (p FFT3DR2CPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fft3dz2d.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cufft" 6 | ) 7 | 8 | // 3D double-precision complex-to-real FFT plan. 9 | type FFT3DZ2DPlan struct { 10 | fftplan 11 | size3D 12 | } 13 | 14 | // 3D double-precision complex-to-real FFT plan. 15 | func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan { 16 | handle := cufft.Plan3d(Nx, Ny, Nz, cufft.Z2D) 17 | handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE) 18 | return FFT3DZ2DPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}} 19 | } 20 | 21 | // Execute the FFT plan. Synchronized. 22 | // src and dst are 3D arrays stored as 1D arrays. 23 | func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) { 24 | oksrclen := p.InputLen() 25 | if src.Len() != oksrclen { 26 | panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) 27 | } 28 | okdstlen := p.OutputLen() 29 | if dst.Len() != okdstlen { 30 | panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) 31 | } 32 | p.handle.ExecZ2D(src.Pointer(), dst.Pointer()) 33 | p.stream.Synchronize() //! 34 | } 35 | 36 | // 3D size of the input array. 37 | func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) { 38 | return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1 39 | } 40 | 41 | // 3D size of the output array. 42 | func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) { 43 | return p.size3D[0], p.size3D[1], p.size3D[2] 44 | } 45 | 46 | // Required length of the (1D) input array. 47 | func (p FFT3DZ2DPlan) InputLen() int { 48 | return prod3(p.InputSize()) 49 | } 50 | 51 | // Required length of the (1D) output array. 52 | func (p FFT3DZ2DPlan) OutputLen() int { 53 | return prod3(p.OutputSize()) 54 | } 55 | -------------------------------------------------------------------------------- /safe/fftplan.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | // INTERNAL 4 | // Base implementation for all FFT plans. 5 | 6 | import ( 7 | "github.com/barnex/cuda5/cu" 8 | "github.com/barnex/cuda5/cufft" 9 | ) 10 | 11 | // Base implementation for all FFT plans. 12 | type fftplan struct { 13 | handle cufft.Handle 14 | stream cu.Stream 15 | } 16 | 17 | // For the sake of embedding. 18 | type size1D int 19 | 20 | // Returns the logical size of the FFT: 21 | // the number of elements (real or complex) 22 | // it transforms. 23 | func (s size1D) Size() int { return int(s) } 24 | 25 | // For the sake of embedding. 26 | type size3D [3]int 27 | 28 | // Returns the logical size of the FFT: 29 | // the number of elements (real or complex) 30 | // it transforms. 31 | func (s size3D) Size() (Nx, Ny, Nz int) { return s[0], s[1], s[2] } 32 | 33 | func prod3(x, y, z int) int { 34 | return x * y * z 35 | } 36 | 37 | // Releases all resources associated with the FFT plan. 38 | func (p fftplan) Destroy() { p.handle.Destroy() } 39 | 40 | // Associates a CUDA stream with the FFT plan.
41 | // If a stream is set, plan.Stream().Synchronize() can 42 | // be called to wait for the execution to finish. 43 | func (p fftplan) SetStream(stream cu.Stream) { 44 | p.handle.SetStream(stream) 45 | p.stream = stream 46 | } 47 | 48 | // Returns the CUDA stream associated with the FFT plan. 49 | func (p fftplan) Stream() cu.Stream { 50 | return p.stream 51 | } 52 | -------------------------------------------------------------------------------- /safe/float32s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | "math" 7 | "unsafe" 8 | ) 9 | 10 | // Slice of float32's on the GPU. 11 | type Float32s struct{ slice } 12 | 13 | // Make a slice of float32's on the GPU. 14 | // Initialized to zero. 15 | func MakeFloat32s(len_ int) Float32s { 16 | return Float32s{makeslice(len_, cu.SIZEOF_FLOAT32)} 17 | } 18 | 19 | // Return a slice from start (inclusive) to stop (exclusive), 20 | // sharing the underlying storage with the original slice. 21 | // Slices obtained in this way should not be Free()'d. 22 | func (s Float32s) Slice(start, stop int) Float32s { 23 | return Float32s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT32)} 24 | } 25 | 26 | // Copy src from host to dst on the device. 27 | func (dst Float32s) CopyHtoD(src []float32) { 28 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32) 29 | } 30 | 31 | // Copy src from device to dst on host. 32 | func (src Float32s) CopyDtoH(dst []float32) { 33 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32) 34 | } 35 | 36 | // Copy src to dst, both on the device. 37 | func (dst Float32s) CopyDtoD(src Float32s) { 38 | dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT32) 39 | } 40 | 41 | // Copy src from host to dst on the device, asynchronously. 42 | func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) { 43 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32, stream) 44 | } 45 | 46 | // Copy src from device to dst on host, asynchronously. 47 | func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) { 48 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32, stream) 49 | } 50 | 51 | // Copy src to dst, both on the device, asynchronously. 52 | func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) { 53 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT32, stream) 54 | } 55 | 56 | // Returns a fresh copy on host. 57 | func (src Float32s) Host() []float32 { 58 | cpy := make([]float32, src.Len()) 59 | src.CopyDtoH(cpy) 60 | return cpy 61 | } 62 | 63 | // Set the entire slice to this value. 64 | func (s Float32s) Memset(value float32) { 65 | cu.MemsetD32(s.Pointer(), math.Float32bits(value), int64(s.Len())) 66 | cu.CtxSynchronize() 67 | } 68 | 69 | // Set the entire slice to this value, asynchronously. 70 | func (s Float32s) MemsetAsync(value float32, stream cu.Stream) { 71 | cu.MemsetD32Async(s.Pointer(), math.Float32bits(value), int64(s.Len()), stream) 72 | } 73 | 74 | // Re-interpret the array as complex numbers, 75 | // in interleaved format. Underlying storage 76 | // is shared.
77 | func (s Float32s) Complex() Complex64s { 78 | if s.Len()%2 != 0 { 79 | panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) 80 | } 81 | return Complex64s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} 82 | } 83 | -------------------------------------------------------------------------------- /safe/float32s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestFloat32sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeFloat32s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]float32, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]float32, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestFloat32sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeFloat32s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestFloat32sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeFloat32s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestFloat32sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]float32, 100) 86 | 87 | b := MakeFloat32s(100) 88 | defer b.Free() 89 | 90 | c := MakeFloat32s(100) 91 | defer c.Free() 92 | 93 | d := make([]float32, 200) 94 | 95 | for i := range a { 96 | a[i] = float32(i) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]float32, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/float64s.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "fmt" 5 | "github.com/barnex/cuda5/cu" 6 | "unsafe" 7 | ) 8 | 9 | // Slice of float64's on the GPU. 10 | type Float64s struct{ slice } 11 | 12 | // Make a slice of float64's on the GPU. 13 | // Initialized to zero. 14 | func MakeFloat64s(len_ int) Float64s { 15 | return Float64s{makeslice(len_, cu.SIZEOF_FLOAT64)} 16 | } 17 | 18 | // Return a slice from start (inclusive) to stop (exclusive), 19 | // sharing the underlying storage with the original slice. 20 | // Slices obtained in this way should not be Free()'d 21 | func (s Float64s) Slice(start, stop int) Float64s { 22 | return Float64s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT64)} 23 | } 24 | 25 | // Copy src from host to dst on the device. 
26 | func (dst Float64s) CopyHtoD(src []float64) { 27 | dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64) 28 | } 29 | 30 | // Copy src from device to dst on host. 31 | func (src Float64s) CopyDtoH(dst []float64) { 32 | src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64) 33 | } 34 | 35 | // Copy src to dst, both on the device. 36 | func (dst Float64s) CopyDtoD(src Float64s) { 37 | dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT64) 38 | } 39 | 40 | // Copy src from host to dst on the device, asynchronously. 41 | func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) { 42 | dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64, stream) 43 | } 44 | 45 | // Copy src from device to dst on host, asynchronously. 46 | func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) { 47 | src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64, stream) 48 | } 49 | 50 | // Copy src to dst, both on the device, asynchronously. 51 | func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) { 52 | dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT64, stream) 53 | } 54 | 55 | // Returns a fresh copy on host. 56 | func (src Float64s) Host() []float64 { 57 | cpy := make([]float64, src.Len()) 58 | src.CopyDtoH(cpy) 59 | return cpy 60 | } 61 | 62 | // Re-interpret the array as complex numbers, 63 | // in interleaved format. Underlying storage 64 | // is shared. 65 | func (s Float64s) Complex() Complex128s { 66 | if s.Len()%2 != 0 { 67 | panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) 68 | } 69 | return Complex128s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} 70 | } 71 | -------------------------------------------------------------------------------- /safe/float64s_test.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestFloat64sSlice(test *testing.T) { 9 | InitCuda() 10 | 11 | a := MakeFloat64s(100) 12 | defer a.Free() 13 | 14 | if !reflect.DeepEqual(a.Host(), make([]float64, 100)) { 15 | test.Error(a.Host()) 16 | } 17 | 18 | b := make([]float64, 100) 19 | 20 | if a.Len() != len(b) { 21 | test.Error("len:", a.Len(), "!=", cap(b)) 22 | } 23 | if a.Cap() != cap(b) { 24 | test.Error("cap:", a.Cap(), "!=", cap(b)) 25 | } 26 | 27 | c := a.Slice(20, 30) 28 | d := b[20:30] 29 | 30 | if c.Len() != len(d) { 31 | test.Error("sliced len:", c.Len(), "!=", cap(d)) 32 | } 33 | if c.Cap() != cap(d) { 34 | test.Error("sliced cap:", c.Cap(), "!=", cap(d)) 35 | } 36 | 37 | e := a.Slice(0, 50) 38 | f := b[0:50] 39 | 40 | if e.Len() != len(f) { 41 | test.Error("sliced len:", e.Len(), "!=", cap(f)) 42 | } 43 | if e.Cap() != cap(f) { 44 | test.Error("sliced cap:", e.Cap(), "!=", cap(f)) 45 | } 46 | } 47 | 48 | func TestFloat64sPanic1(test *testing.T) { 49 | InitCuda() 50 | 51 | defer func() { 52 | err := recover() 53 | test.Log("recovered:", err) 54 | if err == nil { 55 | test.Fail() 56 | } 57 | }() 58 | 59 | a := MakeFloat64s(100) 60 | defer a.Free() 61 | 62 | a.Slice(-1, 10) 63 | } 64 | 65 | func TestFloat64sPanic2(test *testing.T) { 66 | InitCuda() 67 | 68 | defer func() { 69 | err := recover() 70 | test.Log("recovered:", err) 71 | if err == nil { 72 | test.Fail() 73 | } 74 | }() 75 | 76 | a := MakeFloat64s(100) 77 | defer a.Free() 78 | 79 | a.Slice(0, 101) 80 | } 81 | 82 | func TestFloat64sCopy(test *testing.T) { 83 | InitCuda() 84 | 85 | a := make([]float64, 100) 86 | 87 | b := MakeFloat64s(100) 88 | defer b.Free() 89 |
90 | c := MakeFloat64s(100) 91 | defer c.Free() 92 | 93 | d := make([]float64, 200) 94 | 95 | for i := range a { 96 | a[i] = float64(i) 97 | } 98 | 99 | b.CopyHtoD(a) 100 | 101 | c.CopyDtoD(b) 102 | 103 | c.CopyDtoH(d[:100]) 104 | 105 | if !reflect.DeepEqual(a, d[:100]) { 106 | test.Error(d) 107 | } 108 | if !reflect.DeepEqual(d[100:], make([]float64, 100)) { 109 | test.Error(d) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /safe/init.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | import ( 4 | "github.com/barnex/cuda5/cu" 5 | "runtime" 6 | ) 7 | 8 | func InitCuda() { 9 | runtime.LockOSThread() 10 | cu.Init(0) 11 | cu.CtxCreate(cu.CTX_SCHED_AUTO, 0).SetCurrent() 12 | } 13 | -------------------------------------------------------------------------------- /safe/slice.go: -------------------------------------------------------------------------------- 1 | package safe 2 | 3 | // INTERNAL. 4 | // This file implements common functionality for all slice types 5 | // (Float32s, Float64s, Complex64s, ...). 6 | 7 | import ( 8 | "fmt" 9 | "github.com/barnex/cuda5/cu" 10 | "unsafe" 11 | ) 12 | 13 | // internal base func for all makeXXX() functions 14 | func makeslice(len_ int, elemsize int) slice { 15 | bytes := int64(len_) * int64(elemsize) 16 | s := slice{0, len_, len_} 17 | if bytes > 0 { 18 | s.ptr_ = cu.MemAlloc(bytes) 19 | cu.MemsetD8(s.ptr_, 0, bytes) 20 | cu.CtxSynchronize() 21 | } 22 | return s 23 | } 24 | 25 | // internal base type for all slices 26 | type slice struct { 27 | ptr_ cu.DevicePtr // address offset of first element 28 | len_ int // number of elements 29 | cap_ int 30 | } 31 | 32 | // Pointer to the first element. 33 | func (s *slice) Pointer() cu.DevicePtr { return s.ptr_ } 34 | 35 | // Slice length (number of elements). 36 | func (s *slice) Len() int { return s.len_ } 37 | 38 | // Slice capacity. 39 | func (s *slice) Cap() int { return s.cap_ } 40 | 41 | // Free the underlying storage. 42 | // To be used with care. Free() should only be called on 43 | // a slice created by MakeXXX(), not on a slice created 44 | // by x.Slice(). Freeing a slice invalidates all other 45 | // slices referring to it. 
46 | func (s *slice) Free() { 47 | s.ptr_.Free() 48 | s.len_ = 0 49 | s.cap_ = 0 50 | } 51 | 52 | // internal base func for all slice() functions 53 | func (s *slice) slice(start, stop int, elemsize uintptr) slice { 54 | if start >= s.cap_ || start < 0 || stop > s.cap_ || stop < 0 { 55 | panic("cuda5/safe: slice index out of bounds") 56 | } 57 | if start > stop { 58 | panic("cuda5/safe: inverted slice range") 59 | } 60 | return slice{cu.DevicePtr(uintptr(s.ptr_) + uintptr(start)*elemsize), stop - start, s.cap_ - start} 61 | } 62 | 63 | func (dst *slice) copyHtoD(src unsafe.Pointer, srclen int, elemsize int) { 64 | if srclen != dst.Len() { 65 | panic(fmt.Errorf("cuda5/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) 66 | } 67 | cu.MemcpyHtoD(dst.Pointer(), src, int64(elemsize)*int64(srclen)) 68 | } 69 | 70 | func (src *slice) copyDtoH(dst unsafe.Pointer, dstlen int, elemsize int) { 71 | if dstlen != src.Len() { 72 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) 73 | } 74 | cu.MemcpyDtoH(dst, src.Pointer(), int64(elemsize)*int64(dstlen)) 75 | } 76 | 77 | func (dst *slice) copyDtoD(src *slice, elemsize int) { 78 | if dst.Len() != src.Len() { 79 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) 80 | } 81 | cu.MemcpyDtoD(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len())) 82 | } 83 | 84 | func (dst *slice) copyHtoDAsync(src unsafe.Pointer, srclen int, elemsize int, stream cu.Stream) { 85 | if srclen != dst.Len() { 86 | panic(fmt.Errorf("cuda5/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) 87 | } 88 | cu.MemcpyHtoDAsync(dst.Pointer(), src, int64(elemsize)*int64(srclen), stream) 89 | } 90 | 91 | func (src *slice) copyDtoHAsync(dst unsafe.Pointer, dstlen int, elemsize int, stream cu.Stream) { 92 | if dstlen != src.Len() { 93 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) 94 | } 95 | cu.MemcpyDtoHAsync(dst, src.Pointer(), int64(elemsize)*int64(dstlen), stream) 96 | } 97 | 98 | func (dst *slice) copyDtoDAsync(src *slice, elemsize int, stream cu.Stream) { 99 | if dst.Len() != src.Len() { 100 | panic(fmt.Errorf("cuda5/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) 101 | } 102 | cu.MemcpyDtoDAsync(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len()), stream) 103 | } 104 | 105 | // Manually set the pointer, length and capacity. 106 | // Side-steps the safety mechanisms; use with caution. 107 | func (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) { 108 | s.ptr_ = cu.DevicePtr(uintptr(pointer)) 109 | s.len_ = length 110 | s.cap_ = capacity 111 | } 112 | -------------------------------------------------------------------------------- /safe/subs.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | subs32='s/loat32/loat64/g;' 4 | subs32+='s/FLOAT32/FLOAT64/g;' 5 | 6 | #sed $subs32 float32s.go > float64s.go 7 | #sed $subs32 float32s_test.go > float64s_test.go 8 | 9 | subsc64='s/Float32/Complex64/g;' 10 | subsc64+='s/float32/complex64/g;' 11 | subsc64+='s/FLOAT32/COMPLEX64/g;' 12 | #sed $subsc64 float32s_test.go > complex64s_test.go 13 | #sed $subsc64 float32s.go > complex64s.go 14 | 15 | 16 | subsc128='s/omplex64/omplex128/g;' 17 | subsc128+='s/COMPLEX64/COMPLEX128/g;' 18 | sed $subsc128 complex64s.go > complex128s.go 19 | sed $subsc128 complex64s_test.go > complex128s_test.go 20 | --------------------------------------------------------------------------------
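
Note (not part of the repository): the listings above show the synchronous copy and FFT calls; the asynchronous variants (CopyHtoDAsync, MemsetAsync) take an explicit cu.Stream. The sketch below illustrates how they combine. It only uses functions shown in this listing, except cu.StreamCreate(), which is assumed to exist in the cu package with the signature used here; treat the whole snippet as illustrative rather than as code shipped with these packages.

package main

import (
	"fmt"

	"github.com/barnex/cuda5/cu"
	"github.com/barnex/cuda5/safe"
)

func main() {
	safe.InitCuda()

	// Assumed stream constructor (hypothetical here, wrapping the driver API's cuStreamCreate).
	stream := cu.StreamCreate()

	a := safe.MakeFloat32s(1024)
	defer a.Free()

	host := make([]float32, 1024)
	for i := range host {
		host[i] = float32(i)
	}

	a.CopyHtoDAsync(host, stream)           // queue host-to-device copy on the stream
	a.Slice(0, 512).MemsetAsync(0, stream)  // queue a memset of the first half on the same stream
	stream.Synchronize()                    // wait until both queued operations have finished

	back := make([]float32, 1024)
	a.CopyDtoH(back) // synchronous device-to-host copy
	fmt.Println(back[510:514])
}

Because the two asynchronous calls are issued on the same stream, the memset is guaranteed to run after the copy, so back[510:514] would read [0 0 512 513] under these assumptions.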