├── .gitignore ├── .pylintrc ├── LICENSE ├── README.rst ├── cuda ├── __init__.py ├── cu │ ├── __init__.py │ └── cudadrv.py ├── cublas │ ├── __init__.py │ └── cublas.py ├── cuda │ ├── __init__.py │ └── cudart.py ├── cufft │ ├── __init__.py │ └── cufft.py ├── sugar │ ├── __init__.py │ ├── blas │ │ ├── __init__.py │ │ ├── saxpy.py │ │ ├── sdot.py │ │ └── sgemm.py │ ├── fft │ │ ├── __init__.py │ │ ├── conv_gold.py │ │ ├── fft.py │ │ ├── fftconvolve.py │ │ └── fftconvolve2d_kernel.cu │ ├── kernel │ │ ├── __init__.py │ │ ├── compiler.py │ │ ├── kernelfactorydrv.py │ │ ├── kernelfactoryrt.py │ │ └── tests │ │ │ ├── matrix_mul.py │ │ │ └── matrix_mul_kernel.cu │ ├── memory │ │ ├── __init__.py │ │ └── linear.py │ └── query │ │ ├── __init__.py │ │ ├── bandwidth.py │ │ ├── cu_utils.py │ │ └── cuda_utils.py └── utils │ ├── __init__.py │ ├── decorator.py │ ├── libutils.py │ └── logger.py ├── ez_setup.py ├── mkdist ├── oldcode ├── cu │ ├── __init__.py │ ├── cu_api.py │ └── cu_defs.py ├── cublas │ ├── __init__.py │ ├── cublas_api.py │ └── cublas_defs.py ├── cuda │ ├── __init__.py │ ├── cuda_api.py │ └── cuda_defs.py ├── cufft │ ├── __init__.py │ ├── cufft_api.py │ └── cufft_defs.py ├── examples │ ├── TODO │ ├── __init__.py │ └── bw_test.py └── misc │ ├── README │ ├── cf │ ├── cmpG │ ├── compileC │ ├── compileCG │ ├── compileCX │ ├── compileG │ ├── compileGso │ ├── cpuFunctions.c │ ├── cpuFunctions.py │ ├── ctypes_array.py │ ├── ctypes_array_test.py │ ├── ctypes_extra.py │ ├── devinfo_cr.py │ ├── devinfo_cu.py │ ├── gpuFunctions.cu │ ├── gpuFunctions.cubin │ ├── gpuFunctions.linkinfo │ ├── gpuFunctions.ptx │ ├── gpuFunctions.py │ ├── kernelGL.cu │ ├── matadd.txt │ ├── mklMath.py │ ├── sgemmN │ ├── sgemmN.cu │ ├── sgemmN.log │ ├── simple.cu │ ├── simple.cubin │ ├── simple.ptx │ ├── simple.py │ ├── utilities.py │ └── vector.c ├── setup.py ├── tests ├── cu │ └── todo │ │ ├── cu_add.py │ │ ├── cu_blsc.py │ │ ├── cu_gflops.py │ │ ├── cu_poly.py │ │ ├── cu_saxpy.py │ │ ├── cu_sgemm.py │ 
│ ├── cu_streams.py │ │ └── cu_trig.py ├── cuda │ └── todo │ │ ├── cuda_GL.py │ │ ├── cuda_GLimg.png │ │ ├── cuda_QtGL.py │ │ ├── cuda_add.py │ │ ├── cuda_blsc.py │ │ ├── cuda_gflops.py │ │ ├── cuda_poly.py │ │ ├── cuda_saxpy.py │ │ ├── cuda_sgemm.py │ │ ├── cuda_streams.py │ │ └── cuda_trig.py ├── cufft │ ├── cufft_fft.py │ ├── fftlab.py │ └── todo │ │ ├── bfft.py │ │ ├── cu_fft.py │ │ ├── cuda_fft.py │ │ ├── dfft.py │ │ ├── gfft_cu.py │ │ ├── gfft_cuda.py │ │ ├── manyfft.py │ │ ├── sfft.py │ │ └── xfft.py └── test_cublas.py └── xml ├── createbindings.py ├── cublas.py ├── cublas.xml ├── cudadrv.py ├── cudadrv.xml ├── cudart.py ├── cudart.xml ├── cufft.py ├── cufft.xml ├── generate-xml.sh ├── generate-xml.sh.orig ├── generate-xml_linux.sh ├── generate-xml_macosx.sh └── my_CUDA2100_vector_types.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.py~ 3 | junk 4 | build 5 | *egg* 6 | *testbed* 7 | *.so 8 | *.linkinfo 9 | *.swp 10 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Description 2 | =========== 3 | 4 | Python bindings for CUDA 2.1 with numpy integration 5 | 6 | Authors 7 | ------- 8 | 9 | Justin Riley (jtriley@mit.edu) 10 | Nicolas Pinto (pinto@mit.edu) 11 | 12 | Mailing List 13 | ============ 14 | 15 | http://groups.google.com/group/python-cuda 16 | 17 | Bug Tracker 18 | =========== 19 | 20 | http://npinto.lighthouseapp.com/projects/24960-python-cuda 21 | 22 | License 23 | ======= 24 | 25 | see the LICENSE file 26 | 27 | -------------------------------------------------------------------------------- /cuda/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import cuda 4 | import cu 5 | 6 | import cublas 7 | import cufft 8 | 9 | import sugar 10 | import utils 11 | 12 | import platform 13 | 14 
| # add $CUDA_ROOT/bin to %PATH% in windows 15 | if platform.system() == "Windows": 16 | import _winreg as wreg 17 | reg = wreg.ConnectRegistry(None, wreg.HKEY_LOCAL_MACHINE) 18 | key = wreg.OpenKey(reg, r"SOFTWARE\NVIDIA Corporation\Installed Products\NVIDIA CUDA") 19 | import os 20 | cuda_bin = os.path.join(wreg.QueryValueEx(key, "InstallDir")[0],"bin") 21 | os.environ['PATH'] += os.path.pathsep + cuda_bin 22 | 23 | import atexit 24 | atexit.register(cuda.cudaThreadExit) 25 | 26 | def debug(): 27 | utils.enable_debug() 28 | -------------------------------------------------------------------------------- /cuda/cu/__init__.py: -------------------------------------------------------------------------------- 1 | from cudadrv import * 2 | -------------------------------------------------------------------------------- /cuda/cublas/__init__.py: -------------------------------------------------------------------------------- 1 | from cublas import * 2 | -------------------------------------------------------------------------------- /cuda/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | from cudart import * 2 | -------------------------------------------------------------------------------- /cuda/cufft/__init__.py: -------------------------------------------------------------------------------- 1 | from cufft import * 2 | CUFFT_FORWARD = -1 3 | CUFFT_INVERSE = 1 4 | -------------------------------------------------------------------------------- /cuda/sugar/__init__.py: -------------------------------------------------------------------------------- 1 | import memory 2 | import kernel 3 | import fft 4 | import blas 5 | import query 6 | -------------------------------------------------------------------------------- /cuda/sugar/blas/__init__.py: -------------------------------------------------------------------------------- 1 | from saxpy import * 2 | from sdot import * 3 | from sgemm import * 4 | 
def gpu_saxpy(a, b, alpha):
    """Compute ``alpha * a + b`` on the GPU with CUBLAS (single precision).

    Parameters
    ----------
    a, b : numpy arrays holding the same number of elements. They are
        uploaded through ``Linear.from_numpy`` unchanged; test() in this
        module builds them as float32 -- presumably that is required,
        TODO confirm against cuda.sugar.memory.Linear.
    alpha : scalar multiplier applied to ``a``.

    Returns
    -------
    The updated copy of ``b`` as returned by ``Linear.to_numpy()``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not contain the same number of elements.
    """
    # cublasSaxpy walks both device vectors for the same element count, so a
    # size mismatch would read or write past the shorter buffer.
    if a.size != b.size:
        raise ValueError("gpu_saxpy: operands differ in size (%d != %d)"
                         % (a.size, b.size))

    # init cublas lib
    cublasInit()
    try:
        # allocate device vectors from host
        d_X = Linear(a.shape).from_numpy(a)
        d_Y = Linear(b.shape).from_numpy(b)

        # Use the total element count rather than shape[1] so the call is
        # not tied to (1, n)-shaped inputs; for such inputs both are equal.
        cublasSaxpy(a.size, alpha, d_X.ref, 1, d_Y.ref, 1)
        cudaThreadSynchronize()

        # Copy the result back before CUBLAS is shut down.
        return d_Y.to_numpy()
    finally:
        # Mirror gpu_sdot / gpu_sgemm, which release CUBLAS after each call.
        cublasShutdown()
def gpu_sdot(a, b):
    """Return the single-precision dot product of ``a`` and ``b`` via CUBLAS.

    Parameters
    ----------
    a : 2-D numpy array used as a row vector; ``a.shape[1]`` is passed to
        cublasSdot as the element count.
    b : 2-D numpy array with ``b.shape[1] == a.shape[0]`` and the same
        number of elements as ``a`` (test() passes the transpose of a
        ``(1, n)`` array).

    Returns
    -------
    The value returned by ``cublas.cublasSdot``.

    Raises
    ------
    AssertionError
        If the size/shape preconditions above do not hold.
    """
    assert a.size == b.size
    assert a.shape[0] == b.shape[1]

    cublas.cublasInit()
    try:
        # NOTE(review): freeing a null device pointer right after init was in
        # the original code; its purpose is not visible here -- confirm
        # before removing.
        cublas.cublasFree(0)

        d_X = Linear(a.shape).from_numpy(a)
        d_Y = Linear(b.shape).from_numpy(b)
        gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1)
        cuda.cudaThreadSynchronize()
    finally:
        # Release CUBLAS even when allocation or the BLAS call fails.
        cublas.cublasShutdown()
    return gpu_result
def test():
    """Print the numpy reference product and the gpu_sgemm result for
    a random (3, N) x (N, 5) float32 multiplication."""
    # Inner dimension shared by both operands: A is (3, N) and B is (N, 5).
    N = 2

    # allocate host matrices
    A = randn(3, N).astype('float32')
    # B needs N rows; the earlier (3, 5) shape made numpy.dot (and the
    # a.shape[1] == b.shape[0] assertion in gpu_sgemm) fail.
    B = randn(N, 5).astype('float32')

    # compute the cpu reference
    ref = numpy.dot(A, B)

    # Single-argument print(...) yields the same output whether print is a
    # statement or a function.
    print('-' * 80)
    print(ref)
    print('-' * 80)

    print('-' * 80)
    print(gpu_sgemm(A, B))
    print('-' * 80)
c.x}; 20 | a = t; 21 | } 22 | 23 | extern "C" void printComplexArray(Complex * arr, int rows, int cols) { 24 | printf("arr[%d,%d] = %f \\n", 0, 0, arr[0].x); 25 | for(int i=0; i < rows; ++i) { 26 | for(int j=0; j < cols; ++j) { 27 | //printf("arr[%d,%d] = %f+i%f \\n", i, j, arr[i*cols+j].x, arr[i*cols+j].y); 28 | } 29 | } 30 | } 31 | 32 | extern "C" int checkResults(Complex *h_ResultCPU, Complex *h_ResultGPU, int DATA_W, int DATA_H, int FFT_W) { 33 | Complex rCPU, rGPU; 34 | double max_delta_ref, delta, ref, sum_delta2, sum_ref2, L2norm; 35 | 36 | sum_delta2 = 0; 37 | sum_ref2 = 0; 38 | max_delta_ref = 0; 39 | 40 | for(int y = 0; y < DATA_H; y++) 41 | for(int x = 0; x < DATA_W; x++){ 42 | rCPU = h_ResultCPU[y * DATA_W + x]; 43 | rGPU = h_ResultGPU[y * FFT_W + x]; 44 | delta = (rCPU.x - rGPU.x) * (rCPU.x - rGPU.x) + (rCPU.y - rGPU.y) * (rCPU.y - rGPU.y); 45 | ref = rCPU.x * rCPU.x + rCPU.y * rCPU.y; 46 | if((delta / ref) > max_delta_ref) max_delta_ref = delta / ref; 47 | sum_delta2 += delta; 48 | sum_ref2 += ref; 49 | } 50 | L2norm = sqrt(sum_delta2 / sum_ref2); 51 | printf("Max delta / CPU value %E\\n", sqrt(max_delta_ref)); 52 | printf("L2 norm: %E\\n", L2norm); 53 | printf((L2norm < 1e-6) ? 
def _load_dll():
    """Compile the embedded C++ reference code and load it with ctypes.

    The module-level ``source`` string is written to a private temporary
    directory, compiled with g++ into ``/tmp/conv_gold.so`` and loaded.

    Returns
    -------
    The ``ctypes`` library object for the freshly built shared library.

    Raises
    ------
    OSError
        If g++ cannot be started or the library cannot be loaded.
    RuntimeError
        If g++ exits with a non-zero status.
    """
    import shutil
    import subprocess
    import tempfile

    lib_path = '/tmp/conv_gold.so'

    # Build in a scratch directory instead of the current working directory,
    # which may be read-only or shared with other processes.
    build_dir = tempfile.mkdtemp()
    try:
        cpp_path = os.path.join(build_dir, 'conv_gold.cpp')
        cpp_file = open(cpp_path, 'w')
        try:
            cpp_file.write(source)
        finally:
            cpp_file.close()

        # Remove the library that will actually be loaded, so a failed build
        # can never fall back to a stale binary from an earlier run.
        if os.path.exists(lib_path):
            os.remove(lib_path)

        status = subprocess.call(
            ['g++', '-fPIC', '-shared', '-o', lib_path, cpp_path])
        if status != 0:
            raise RuntimeError(
                'g++ failed to build %s (exit status %s)' % (lib_path, status))
    finally:
        shutil.rmtree(build_dir, ignore_errors=True)

    return ctypes.cdll.LoadLibrary(lib_path)
_get_cufft_signal(numpy_array): 19 | dsignal = ctypes.c_void_p() 20 | cuda.cudaMalloc(ctypes.byref(dsignal), numpy_array.nbytes) 21 | cuda.cudaMemcpy(dsignal, numpy_array.ctypes.data, numpy_array.nbytes, cuda.cudaMemcpyHostToDevice) 22 | return ctypes.cast(dsignal,ctypes.POINTER(cufft.cufftComplex)) 23 | 24 | def _get_plan(shape): 25 | ndims = len(shape) 26 | if ndims == 1: 27 | return _get_1dplan(shape) 28 | elif ndims == 2: 29 | return _get_2dplan(shape) 30 | elif ndims == 3: 31 | return _get_3dplan(shape) 32 | else: 33 | error('_get_plan: invalid size (todo: throw exception)') 34 | 35 | def _get_1dplan(shape,batch=1): 36 | debug("[*] Creating a 1D FFT plan...") 37 | plan = cufft.cufftHandle() 38 | cufft.cufftPlan1d(plan, shape[0], cufft.CUFFT_C2C, batch) 39 | return plan 40 | 41 | def _get_2dplan(shape): 42 | debug("[*] Creating a 2D FFT plan...") 43 | plan = cufft.cufftHandle() 44 | cufft.cufftPlan2d(plan, shape[0], shape[1], cufft.CUFFT_C2C) 45 | return plan 46 | 47 | def _get_3dplan(shape): 48 | debug("[*] Creating a 3D FFT plan...") 49 | plan = cufft.cufftHandle() 50 | cufft.cufftPlan3d(plan, shape[0], shape[1], shape[2], cufft.CUFFT_C2C) 51 | return plan 52 | 53 | def _get_data(device_ptr,numpy_array): 54 | result = np.empty_like(numpy_array) 55 | cuda.cudaMemcpy(result.ctypes.data, device_ptr, numpy_array.nbytes, cuda.cudaMemcpyDeviceToHost) 56 | return result 57 | 58 | def _get_inverse_data(device_ptr,numpy_array): 59 | result = _get_data(device_ptr, numpy_array) 60 | return result/float(numpy_array.size) 61 | 62 | def _cuda_fft(numpy_array, leave_on_device=False): 63 | dsignal = _get_cufft_signal(numpy_array) 64 | plan = _get_plan(numpy_array.shape) 65 | #print "[*] Using the CUFFT plan to forward transform the signal in place..." 
def _cuda_ifft(numpy_array, leave_on_device=False):
    """Run an in-place inverse complex-to-complex CUFFT on *numpy_array*.

    Parameters
    ----------
    numpy_array : host array; it is copied to the device by
        ``_get_cufft_signal`` and its shape selects the plan.
    leave_on_device : when true, return the device pointer instead of
        copying the result back. In that case the data is NOT scaled and
        the caller owns (and must free) the device buffer.

    Returns
    -------
    The host result from ``_get_inverse_data`` (which divides by the number
    of elements), or the device pointer if *leave_on_device* is true.

    Raises
    ------
    ValueError
        If no plan could be created for the array's shape.
    """
    dsignal = _get_cufft_signal(numpy_array)
    release_signal = True
    try:
        plan = _get_plan(numpy_array.shape)
        if plan is None:
            raise ValueError(
                '_cuda_ifft: no CUFFT plan available for shape %r'
                % (numpy_array.shape,))
        try:
            debug("[*] Using the CUFFT plan to inverse transform the signal in place...")
            # Identical input and output pointers request an in-place transform.
            cufft.cufftExecC2C(plan, dsignal, dsignal, cufft.CUFFT_INVERSE)
        finally:
            # Destroy the plan even if the transform itself failed.
            debug("[*] Destroying CUFFT plan...")
            cufft.cufftDestroy(plan)

        if leave_on_device:
            # Ownership of the buffer passes to the caller.
            release_signal = False
            return dsignal
        return _get_inverse_data(dsignal, numpy_array)
    finally:
        # Free the device buffer on every path except the hand-over above,
        # so errors no longer leak device memory.
        if release_signal:
            cuda.cudaFree(dsignal)
def ifftn(numpy_array, leave_on_device=False):
    """Inverse FFT of a 1-, 2- or 3-dimensional array on the GPU.

    Parameters
    ----------
    numpy_array : host array with ``ndim`` between 1 and 3.
    leave_on_device : forwarded to ``_cuda_ifft``.

    Returns
    -------
    Whatever ``_cuda_ifft`` returns for the given arguments.

    Raises
    ------
    ValueError
        If the array has an unsupported number of dimensions. The previous
        behaviour was to print a message and return ``None``, which only
        postponed the failure to the caller.
    """
    if not 1 <= numpy_array.ndim <= 3:
        raise ValueError(
            'cuda.sugar.fft.ifftn: ndim must be 1, 2 or 3 (got %d)'
            % numpy_array.ndim)
    return _cuda_ifft(numpy_array, leave_on_device)
Users and possessors of this source code 8 | * are hereby granted a nonexclusive, royalty-free license to use this code 9 | * in individual and commercial software. 10 | * 11 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 12 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 13 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 14 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 15 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 16 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 17 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 18 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 19 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 20 | * OR PERFORMANCE OF THIS SOURCE CODE. 21 | * 22 | * U.S. Government End Users. This source code is a "commercial item" as 23 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 24 | * "commercial computer software" and "commercial computer software 25 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 26 | * and is provided to the U.S. Government only as a commercial end item. 27 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 28 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 29 | * source code with only those rights set forth herein. 30 | * 31 | * Any use of this source code in individual and commercial software must 32 | * include, in the user documentation and internal comments to the code, 33 | * the above Disclaimer and U.S. Government End Users Notice. 
////////////////////////////////////////////////////////////////////////////////
// Cyclically shift convolution kernel, so that the center is at (0, 0)
////////////////////////////////////////////////////////////////////////////////

// Copies the kernelW x kernelH convolution kernel (read through texKernel)
// into the fftW-wide padded buffer d_PaddedKernel. Each thread handles one
// kernel element (x, y); the element at (kernelX, kernelY) is written to
// index 0 of the padded buffer.
__global__ void padKernel(
    Complex *d_PaddedKernel,
    int fftW,
    int fftH,
    int kernelW,
    int kernelH,
    int kernelX,
    int kernelY
){
    // Element coordinates of this thread (IMUL expands to __mul24).
    const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
    const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;

    // Threads outside the kernel extent do nothing.
    if(x < kernelW && y < kernelH){
        // Shift by the kernel centre; negative coordinates wrap around to
        // the far side of the fftW x fftH buffer.
        int kx = x - kernelX; if(kx < 0) kx += fftW;
        int ky = y - kernelY; if(ky < 0) ky += fftH;
        // The +0.5f offsets address the centre of texel (x, y).
        d_PaddedKernel[IMUL(ky, fftW) + kx] =
            tex2D(texKernel, (float)x + 0.5f, (float)y + 0.5f);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Modulate Fourier image of padded data by Fourier image of padded kernel
// and normalize by FFT size
////////////////////////////////////////////////////////////////////////////////

// a = c * (a * b), where a * b is the complex product and c a real scale.
// The result is built in a temporary so both components are computed from
// the original value of a.
__device__ void complexMulAndScale(Complex& a, Complex b, float c){
    Complex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};
    a = t;
}

// Multiplies d_PaddedData element-wise (in place) by d_PaddedKernel and
// scales every product by 1 / dataN. dataN is the number of Complex
// elements processed in both arrays.
__global__ void modulateAndNormalize(
    Complex *d_PaddedData,
    Complex *d_PaddedKernel,
    int dataN
){
    // Global thread index along x and total number of threads along x.
    const int tid = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
    const int threadN = IMUL(blockDim.x, gridDim.x);
    const float q = 1.0f / (float)dataN;

    // Each thread handles elements tid, tid + threadN, tid + 2*threadN, ...
    // so any launch size covers all dataN elements.
    for(int i = tid; i < dataN; i += threadN)
        complexMulAndScale(d_PaddedData[i], d_PaddedKernel[i], q);
}
def compile_plain(source, options, keep, nvcc, cache_dir):
    """Compile CUDA *source* into a shared library with nvcc and load it.

    Parameters
    ----------
    source : CUDA C source text.
    options : sequence of extra nvcc command-line options (left unmodified).
    keep : if true, pass ``--keep`` to nvcc and leave the build directory
        in place.
    nvcc : name or path of the nvcc executable.
    cache_dir : directory holding compiled libraries, keyed by an MD5 digest
        of the source, the options and the nvcc version string. A false
        value disables caching.

    Returns
    -------
    The ctypes library object for the compiled code.

    Raises
    ------
    OSError
        If nvcc cannot be started.
    CompileError
        If nvcc exits with a non-zero status. The build directory is left
        behind in that case so the failing source can be inspected.
    """
    import shutil
    import sys
    from os.path import join
    from platform import architecture
    from tempfile import mkdtemp

    # Work on a private copy: the list is extended below and must not leak
    # those changes back to the caller.
    options = list(options)

    if architecture()[0] == "64bit":
        options.insert(0, "-Xcompiler='-fPIC'")

    cache_path = None
    if cache_dir:
        try:
            import hashlib
            checksum = hashlib.md5()
        except ImportError:
            # for Python << 2.5
            import md5
            checksum = md5.new()

        checksum.update(source)
        for option in options:
            checksum.update(option)
        checksum.update(get_nvcc_version(nvcc))

        cache_path = join(cache_dir, checksum.hexdigest() + ".so")

        try:
            return cdll.LoadLibrary(cache_path)
        except OSError:
            # Cache miss (or unloadable entry): fall through and compile.
            pass

    file_dir = mkdtemp()
    file_root = "kernel"

    cu_file_name = file_root + ".cu"
    cu_file_path = join(file_dir, cu_file_name)
    so_file_path = join(file_dir, file_root + ".so")

    options.extend(["-o", so_file_path])

    outf = open(cu_file_path, "w")
    try:
        outf.write(str(source))
    finally:
        outf.close()

    if keep:
        options.append("--keep")
        print("*** compiler output in %s" % file_dir)

    print("Compiling kernel using the following options: ")
    print(' '.join([nvcc, "--shared"] + options + [cu_file_name]))

    try:
        process = Popen([nvcc, "--shared"] + options + [cu_file_path],
                        stdout=PIPE, cwd=file_dir)
        output = process.communicate()[0]
    except OSError:
        # sys.exc_info() keeps this spelling valid on old and new Pythons.
        raise OSError("%s was not found (is it on the PATH?) [%s]" % (
            nvcc, str(sys.exc_info()[1])))

    if output:
        print('Compiler output below:')
        print(output)

    if process.returncode != 0:
        raise CompileError("nvcc compilation of %s failed" % cu_file_path)

    if cache_path is None:
        # Caching disabled: load straight from the build directory before it
        # is cleaned up (there is no cached copy to load afterwards).
        kdll = cdll.LoadLibrary(so_file_path)
    else:
        # copyfile transfers bytes verbatim; text-mode read/write could
        # corrupt the binary on platforms that translate line endings.
        shutil.copyfile(so_file_path, cache_path)
        kdll = None

    if not keep:
        shutil.rmtree(file_dir, ignore_errors=True)

    if kdll is None:
        kdll = cdll.LoadLibrary(cache_path)

    return kdll


def compile(source, nvcc="nvcc", options=[], keep=False,
            no_extern_c=False, arch=None, code=None, cache_dir=None,
            include_dirs=[]):
    """Compile CUDA *source* with nvcc and return the loaded library.

    Parameters
    ----------
    source : CUDA C source text.
    nvcc : name or path of the nvcc executable.
    options : extra nvcc options; the sequence is copied, never modified.
    keep : forwarded to ``compile_plain``.
    no_extern_c : unless true, the source is wrapped in ``extern "C" { }``.
    arch, code : if not None, passed to nvcc as ``-arch`` / ``-code``.
    cache_dir : cache directory; when None a per-user directory below the
        system temp directory is used (and created if missing).
    include_dirs : directories added to the nvcc include path with ``-I``.

    Returns
    -------
    The ctypes library object produced by ``compile_plain``.
    """
    if not no_extern_c:
        source = 'extern "C" {\n%s\n}\n' % source

    # The default lists are shared between calls, so only ever use a copy.
    options = list(options)

    # TODO: derive a default arch from the active device once python-cuda
    # offers an equivalent of pycuda's Context.get_device().

    if cache_dir is None:
        import os
        from tempfile import gettempdir

        # os.getuid does not exist on every platform; fall back to the
        # USERNAME environment variable without patching the os module.
        getuid = getattr(os, 'getuid', None)
        if getuid is not None:
            user_id = getuid()
        else:
            user_id = os.getenv('USERNAME')

        cache_dir = os.path.join(
            gettempdir(),
            "python-cuda-compiler-cache-v1-uid%s" % user_id)

        if not os.path.exists(cache_dir):
            try:
                os.mkdir(cache_dir)
            except OSError:
                # Another process may have created it in the meantime.
                if not os.path.isdir(cache_dir):
                    raise

    if arch is not None:
        options.extend(["-arch", arch])

    if code is not None:
        options.extend(["-code", code])

    for include_dir in include_dirs:
        options.append("-I" + include_dir)

    return compile_plain(source, options, keep, nvcc, cache_dir)
class KernelGetter(object):
    """ Wraps a ctypes CDLL instance for accessing CUDA kernels.

    NOTE: this class is a placeholder. The constructor unconditionally
    raises NotImplementedError, so the usage below describes the intended
    interface only (see the commented-out draft that follows).

    Example
    -------
    from ctypes import cdll
    mykernels = KernelGetter(cdll.LoadLibrary('libmykernels.so'))
    mykernels.FastKernel(grid, block)(x, y)
    # Equivalent CUDA call:
    # FastKernel<<<grid, block>>>(x, y)
    """

    def __init__(self, dll):
        # Not implemented yet; everything after this line is a disabled draft.
        raise NotImplementedError
        # self.dll = dll

    # def __getattr__(self, name):
    #     mangled_name = '__device_stub_%s' % name
    #     try:
    #         funcptr = getattr(self.dll, mangled_name)
    #     except AttributeError:
    #         raise AttributeError("could not find kernel named %r in %r" % (name, self.dll))

    #     # Return a factory function that will create the Kernel object.
    #     factory = lambda *args, **kwds: Kernel(funcptr, *args, **kwds)

    #     return factory


# class Kernel(object):
#     """ Configure a CUDA kernel.
#     """

#     def __init__(self, funcptr, gridDim, blockDim, sharedMem=0, tokens=0):
#         # The function pointer to the kernel.
#         self.funcptr = funcptr

#         # The configuration parameters for the call. These are the arguments
#         # inside the <<<>>> brackets in CUDA.
#         self.gridDim = gridDim
#         self.blockDim = blockDim
#         self.sharedMem = sharedMem
#         self.tokens = tokens

#     # Delegate .restype and .argtypes attribute access to the underlying
#     # function pointer.
class SourceModule(object):
    """ Compile CUDA source and expose its kernels as attributes.

    The source is handed to ``compile`` and the returned library is kept
    in ``self.dll``.  Looking up an attribute ``name`` resolves the symbol
    ``__device_stub_<name>`` in that library and returns a factory that
    builds a ``Kernel`` for a given launch configuration.

    Example
    -------
    mykernels = SourceModule(open('mykernels.cu').read())
    mykernels.FastKernel(grid, block)(x, y)
    # Equivalent CUDA call:
    # FastKernel<<<grid, block>>>(x, y)
    """
    def __init__(self, source, nvcc="nvcc", options=[], keep=False,
            no_extern_c=False, arch=None, code=None, cache_dir=None,
            include_dirs=[]):
        # The list defaults are only forwarded, never modified here.
        self.dll = compile(source, nvcc, options, keep, no_extern_c,
                arch, code, cache_dir, include_dirs)

    def __getattr__(self, name):
        """ Return a Kernel factory for the kernel called ``name``.

        Raises AttributeError when the library has no matching symbol.
        """
        # __getattr__ only runs after normal lookup failed.  Read ``dll``
        # straight from the instance dict: going through ``self.dll`` on
        # an instance that has no ``dll`` yet (failed __init__, object
        # built with __new__) would re-enter this method forever.
        try:
            dll = self.__dict__['dll']
        except KeyError:
            raise AttributeError(name)

        mangled_name = '__device_stub_%s' % name
        try:
            funcptr = getattr(dll, mangled_name)
        except AttributeError:
            raise AttributeError("could not find kernel named %r in %r" % (name, dll))

        # Return a factory function that will create the Kernel object.
        def factory(*args, **kwds):
            return Kernel(funcptr, *args, **kwds)

        return factory


class Kernel(object):
    """ A kernel function pointer bound to a launch configuration.

    ``gridDim``, ``blockDim``, ``sharedMem`` and ``tokens`` are the
    arguments that go inside the <<<>>> brackets in CUDA.  Calling the
    object configures the launch and then invokes the function pointer.
    """

    def __init__(self, funcptr, gridDim, blockDim, sharedMem=0, tokens=0):
        # The function pointer to the kernel.
        self.funcptr = funcptr

        # The configuration parameters for the call.
        self.gridDim = gridDim
        self.blockDim = blockDim
        self.sharedMem = sharedMem
        self.tokens = tokens

    # Delegate .restype and .argtypes attribute access to the underlying
    # function pointer.
    def _get_restype(self):
        return self.funcptr.restype
    def _set_restype(self, val):
        self.funcptr.restype = val
    restype = property(_get_restype, _set_restype)

    def _get_argtypes(self):
        return self.funcptr.argtypes
    def _set_argtypes(self, val):
        self.funcptr.argtypes = val
    argtypes = property(_get_argtypes, _set_argtypes)

    def __call__(self, *args):
        """ Call the kernel as configured, passing ``args`` to it.
        """
        cudart.cudaConfigureCall(self.gridDim, self.blockDim, self.sharedMem, self.tokens)
        self.funcptr(*args)
        # NOTE(review): nothing checks the runtime status after the
        # launch, so a failed launch is not reported from here.
"""Multiply two random matrices on the GPU and compare with numpy.dot."""
import os

import numpy as np

# The helper packages live below cuda.sugar in this source tree.
from cuda.sugar.memory import Linear
from cuda.sugar.kernel.kernelfactoryrt import SourceModule
from cuda.cuda import dim3


# Matrix dimensions are multiples of the thread block size; the kernel
# source defines the same BLOCK_SIZE.
BLOCK_SIZE = 16
# Matrix A width
WA = (3 * BLOCK_SIZE)
# Matrix A height
HA = (5 * BLOCK_SIZE)
# Matrix B width
WB = (8 * BLOCK_SIZE)
# Matrix B height
HB = WA
# Matrix C width
WC = WB
# Matrix C height
HC = HA

# Locate the kernel next to this script so the test does not depend on the
# current working directory, and make sure the file gets closed.
kernel_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'matrix_mul_kernel.cu')
kernel_file = open(kernel_path, 'r')
try:
    matrixMul = SourceModule(kernel_file.read())
finally:
    kernel_file.close()

nA = np.random.random(size=(HA, WA)).astype(np.float32)
nB = np.random.random(size=(HB, WB)).astype(np.float32)

print('Allocating arrays')
dA = Linear(nA.shape).from_numpy(nA)
dB = Linear(nB.shape).from_numpy(nB)
dC = Linear((HC,WC))

print('Calling kernel')
grid = dim3(WC // BLOCK_SIZE, HC // BLOCK_SIZE, 1)
block = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
Mul = matrixMul.matrixMul(grid, block)
Mul(dC.ref, dA.ref, dB.ref, WA, WB)

print('Collecting results')
nC = dC.to_numpy()
# reshape returns a new array object; keep it instead of discarding it.
nC = nC.reshape((HC, WC))

print('Freeing data')
dA._free()
dB._free()
dC._free()

print('Calculating error')
print('')
goldC = np.dot(nA, nB)
err = nC - goldC
print('L2 err: %r' % np.linalg.norm(err, 2))
print('L1 err: %r' % np.linalg.norm(err, 1))
print('Linf err: %r' % np.linalg.norm(err, np.inf))
print('Lfro err: %r' % np.linalg.norm(err, 'fro'))
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. Users and possessors of this source code 8 | * are hereby granted a nonexclusive, royalty-free license to use this code 9 | * in individual and commercial software. 10 | * 11 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 12 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 13 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 14 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 15 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 16 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 17 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 18 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 19 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 20 | * OR PERFORMANCE OF THIS SOURCE CODE. 21 | * 22 | * U.S. Government End Users. This source code is a "commercial item" as 23 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 24 | * "commercial computer software" and "commercial computer software 25 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 26 | * and is provided to the U.S. Government only as a commercial end item. 27 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 28 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 29 | * source code with only those rights set forth herein. 30 | * 31 | * Any use of this source code in individual and commercial software must 32 | * include, in the user documentation and internal comments to the code, 33 | * the above Disclaimer and U.S. 
Government End Users Notice. 34 | */ 35 | 36 | /* Matrix multiplication: C = A * B. 37 | * Device code. 38 | */ 39 | 40 | #include 41 | 42 | #define CHECK_BANK_CONFLICTS 0 43 | #if CHECK_BANK_CONFLICTS 44 | #define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j)) 45 | #define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j)) 46 | #else 47 | #define AS(i, j) As[i][j] 48 | #define BS(i, j) Bs[i][j] 49 | #endif 50 | 51 | // Thread block size 52 | #define BLOCK_SIZE 16 53 | 54 | // Matrix dimensions 55 | // (chosen as multiples of the thread block size for simplicity) 56 | #define WA (3 * BLOCK_SIZE) // Matrix A width 57 | #define HA (5 * BLOCK_SIZE) // Matrix A height 58 | #define WB (8 * BLOCK_SIZE) // Matrix B width 59 | #define HB WA // Matrix B height 60 | #define WC WB // Matrix C width 61 | #define HC HA // Matrix C height 62 | 63 | //////////////////////////////////////////////////////////////////////////////// 64 | //! Matrix multiplication on the device: C = A * B 65 | //! 
wA is A's width and wB is B's width 66 | //////////////////////////////////////////////////////////////////////////////// 67 | __global__ void 68 | matrixMul( float* C, float* A, float* B, int wA, int wB) 69 | { 70 | // Block index 71 | int bx = blockIdx.x; 72 | int by = blockIdx.y; 73 | 74 | // Thread index 75 | int tx = threadIdx.x; 76 | int ty = threadIdx.y; 77 | 78 | // Index of the first sub-matrix of A processed by the block 79 | int aBegin = wA * BLOCK_SIZE * by; 80 | 81 | // Index of the last sub-matrix of A processed by the block 82 | int aEnd = aBegin + wA - 1; 83 | 84 | // Step size used to iterate through the sub-matrices of A 85 | int aStep = BLOCK_SIZE; 86 | 87 | // Index of the first sub-matrix of B processed by the block 88 | int bBegin = BLOCK_SIZE * bx; 89 | 90 | // Step size used to iterate through the sub-matrices of B 91 | int bStep = BLOCK_SIZE * wB; 92 | 93 | // Csub is used to store the element of the block sub-matrix 94 | // that is computed by the thread 95 | float Csub = 0; 96 | 97 | // Loop over all the sub-matrices of A and B 98 | // required to compute the block sub-matrix 99 | for (int a = aBegin, b = bBegin; 100 | a <= aEnd; 101 | a += aStep, b += bStep) { 102 | 103 | // Declaration of the shared memory array As used to 104 | // store the sub-matrix of A 105 | __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 106 | 107 | // Declaration of the shared memory array Bs used to 108 | // store the sub-matrix of B 109 | __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 110 | 111 | // Load the matrices from device memory 112 | // to shared memory; each thread loads 113 | // one element of each matrix 114 | AS(ty, tx) = A[a + wA * ty + tx]; 115 | BS(ty, tx) = B[b + wB * ty + tx]; 116 | 117 | // Synchronize to make sure the matrices are loaded 118 | __syncthreads(); 119 | 120 | // Multiply the two matrices together; 121 | // each thread computes one element 122 | // of the block sub-matrix 123 | for (int k = 0; k < BLOCK_SIZE; ++k) 124 | Csub += AS(ty, 
# numpy dtype name -> ctypes element type used on the device
cudaDtypes = {'float32': ctypes.c_float,
              'int32': ctypes.c_int,
              'complex64': ctypes.c_float*2,
              }

class Linear(object):
    """Array-like wrapper around a linear block of CUDA device memory.

    The constructor allocates ``prod(shape)`` elements of ``dtype`` with
    cudaMalloc.  ``from_numpy`` / ``to_numpy`` copy data between a numpy
    array and the device buffer, and ``ref`` is a typed ctypes pointer to
    the buffer.  The memory is released by ``_free`` (also called from
    ``__del__``).
    """

    ref = property(fget=lambda self: self._get_ref())

    def __init__(self, shape=None, dtype='float32', order=None):
        # Set the bookkeeping first so that __del__/_free stay safe even
        # when one of the statements below raises (e.g. unsupported dtype).
        self.allocated = False
        self.data = None
        self.shape = shape
        # int(): numpy.prod returns a numpy scalar (a float for an empty
        # shape tuple); sizes handed to the CUDA calls should be plain ints.
        self.size = int(numpy.prod(shape))
        self.dtype = numpy.dtype(dtype)
        self.order = order
        self.ctype = self._convert_type(self.dtype)
        self.nbytes = self.size*ctypes.sizeof(self.ctype)
        self._alloc()

    def __del__(self):
        self._free()

    def _convert_type(self, dtype):
        """Return the ctypes type for a numpy dtype; TypeError if unsupported."""
        ct = cudaDtypes.get(dtype.name, None)
        if ct is None:
            raise TypeError("Unsupported dtype")
        return ct

    def _get_ref(self):
        """Return the device pointer cast to a pointer to the element type."""
        return ctypes.cast(self.data, ctypes.POINTER(self._convert_type(self.dtype)))

    def _alloc(self):
        """Allocate ``self.nbytes`` bytes of device memory."""
        self.data = ctypes.c_void_p()
        # NOTE(review): the status returned by cudaMalloc is not inspected
        # here; presumably the binding raises on failure -- confirm.
        cudaMalloc(ctypes.byref(self.data), self.nbytes)
        self.allocated = True

    def _free(self):
        """Release the device memory; safe to call more than once."""
        # getattr: the instance may be only partially initialised when
        # __del__ runs after a failed __init__.
        if getattr(self, 'allocated', False):
            cudaFree(self.data)
            self.data = None
            self.allocated = False

    def to_numpy(self, a=None):
        """Copy the device buffer into a numpy array and return it.

        ``a`` may be an existing array to fill; it must have the same
        size and dtype as this buffer and be contiguous.  Without ``a`` a
        new array of ``self.shape`` is created.

        Raises Exception when the buffer is not allocated and ValueError
        when ``a`` is unsuitable.
        """
        if not self.allocated:
            raise Exception("Must first allocate")
        if a is None:
            a = numpy.empty(self.shape, dtype=self.dtype, order=self.order)
        else:
            # Check that the given array is appropriate.
            if a.size != self.size:
                raise ValueError("need an array of size %s; got %s" % (self.size, a.size))
            if a.dtype.name != self.dtype.name:
                # XXX: compare dtypes directly? issubdtype?
                raise ValueError("need an array of dtype %r; got %r" % (self.dtype, a.dtype))
            # The copy below writes nbytes consecutive bytes starting at
            # a.ctypes.data, which is only correct for a contiguous array.
            if not (a.flags.c_contiguous or a.flags.f_contiguous):
                raise ValueError("need a contiguous array")
        cudaMemcpy(a.ctypes.data, self.ref, self.nbytes, cudaMemcpyDeviceToHost)
        a = a.reshape(self.shape, order=self.order)
        return a

    def from_numpy(self, a):
        """Copy numpy array ``a`` to the device buffer and return ``self``.

        Raises Exception when the buffer is not allocated and ValueError
        when the size or dtype of ``a`` does not match.
        """
        if not self.allocated:
            raise Exception("Must first allocate")
        # Explicit checks instead of assert: asserts vanish under -O and a
        # mismatched array would then be read past its end by the copy.
        if a.size != self.size:
            raise ValueError("size must be the same")
        if a.dtype != self.dtype:
            raise ValueError("dtype must be the same")
        # Make the host data contiguous in the requested memory order.
        if self.order == 'F':
            a = numpy.asfortranarray(a)
        else:
            a = numpy.ascontiguousarray(a)
        cudaMemcpy(self.data, a.ctypes.data, self.nbytes, cudaMemcpyHostToDevice)
        return self
self.device = None 13 | self.context = None 14 | self.module = None 15 | self.deviceID = -1 16 | cuInit(flags) 17 | device_count = c_int() 18 | cuDeviceGetCount(byref(device_count)) 19 | if cu_CUDA.usedDevices >= device_count.value: 20 | print "No more uninitialized devices available" 21 | return 22 | self.device = CUdevice() 23 | self.context = CUcontext() 24 | self.modules = list() 25 | self.functions = dict() 26 | self.deviceID = cu_CUDA.usedDevices 27 | cuDeviceGet(byref(self.device),self.deviceID) 28 | cu_CUDA.usedDevices += 1 29 | status = cuCtxCreate(byref(self.context),0,self.device) 30 | if status != CUDA_SUCCESS: 31 | cuCtxDetach(self.context) 32 | raise GPUException("Failed to create CUDA context") 33 | self.getInfo() 34 | 35 | def getSourceModule(self,name=None): 36 | if name is None: 37 | name = "gpuFunctions.cubin" 38 | module = CUmodule() 39 | status = cuModuleLoad(byref(module),name) 40 | if status != CUDA_SUCCESS: 41 | print "File not found: %s" % name 42 | self.modules.append(module) 43 | return module 44 | 45 | def getFunction(self,name): 46 | missing = True 47 | function = CUfunction() 48 | for module in self.modules: 49 | status = cuModuleGetFunction(function,module,name) 50 | if status != CUDA_SUCCESS: 51 | continue 52 | else: 53 | self.functions[name] = function 54 | missing = False 55 | break 56 | if missing: 57 | print "Function not found: %s" % name 58 | return None 59 | return function 60 | 61 | def getInfo(self): 62 | device = self.device 63 | info = dict() 64 | count = c_int() 65 | cuDeviceGetCount(byref(count)) 66 | info["count"] = count.value 67 | name = (c_char*256)() 68 | cuDeviceGetName(name,256,device) 69 | info["name"] = name.value 70 | memsize = c_uint() 71 | cuDeviceTotalMem(byref(memsize),device) 72 | info["memory"] = memsize.value 73 | free,total = c_uint(),c_uint() 74 | cuMemGetInfo(byref(free),byref(total)) 75 | info["free"] = free.value 76 | major,minor = c_int(),c_int() 77 | 
def cutilSafeCall(retval):
    """Log an error when a CUDA runtime call returned a non-zero status.

    The status is only logged; nothing is raised.
    """
    if retval != 0:
        log.error('error! %s', retval)


def get_device_count():
    """Return the device count reported by cudaGetDeviceCount."""
    device_count = c_int()
    cutilSafeCall(cuda.cudaGetDeviceCount(byref(device_count)))
    return device_count.value


def has_cuda_device():
    """Return True when at least one reported device supports CUDA.

    A device whose properties report major == minor == 9999 is treated
    as not supporting CUDA.
    """
    dev_count = get_device_count()
    if dev_count > 0:
        log.debug("Found %d gpu devices", dev_count)
    else:
        log.debug("There is no device supporting CUDA")
        return False

    cuda_enabled = False

    for dev in range(0, dev_count):
        dev_prop = cuda.cudaDeviceProp()
        retval = cuda.cudaGetDeviceProperties(byref(dev_prop), dev)
        if retval != 0:
            # The struct was not filled in; do not mistake its contents
            # for a usable device.
            cutilSafeCall(retval)
            continue
        if dev_prop.major == 9999 and dev_prop.minor == 9999:
            log.debug("Device %s does not support cuda.", dev)
            continue
        cuda_enabled = True
        break

    if not cuda_enabled:
        log.debug("There is no device supporting CUDA")
    return cuda_enabled


def needs_emulation():
    """Return True when no CUDA capable device is available."""
    # Emulation is needed exactly when there is no usable device; the
    # previous implementation returned the opposite.
    return not has_cuda_device()


def get_devices():
    """Log the properties of every device and return them.

    Returns a list with the cudaDeviceProp structure of each device that
    was queried successfully (empty when there is none).
    """
    dev_count = get_device_count()
    if dev_count > 0:
        log.debug("found %d gpu devices", dev_count)
    else:
        log.debug("there is no device supporting cuda")

    devices = []
    for dev in range(0, dev_count):
        dev_prop = cuda.cudaDeviceProp()
        retval = cuda.cudaGetDeviceProperties(byref(dev_prop), dev)
        # NOTE(review): 3 is presumably the runtime's initialization
        # error code -- confirm against the cudaError enumeration.
        if retval == 3:
            log.debug("there is no device supporting cuda")
            break
        elif dev == 0:
            if dev_prop.major == 9999 and dev_prop.minor == 9999:
                log.debug("there is no device supporting cuda.")
            elif dev_count == 1:
                log.debug("there is 1 device supporting cuda")
            else:
                log.debug("there are %d devices supporting cuda", dev_count)

        log.debug('Device %d: "%s"', dev, dev_prop.name)
        log.debug("Major revision number: %d", dev_prop.major)
        log.debug("Minor revision number: %d", dev_prop.minor)
        log.debug("Total amount of global memory: %u bytes", dev_prop.totalGlobalMem)

        if CUDART_VERSION >= 2000:
            log.debug("Number of multiprocessors: %d", dev_prop.multiProcessorCount)
            # NOTE(review): assumes 8 cores per multiprocessor -- this is
            # hard-coded and may not hold for every device generation.
            log.debug("Number of cores: %d", 8 * dev_prop.multiProcessorCount)

        log.debug("Total amount of constant memory: %u bytes", dev_prop.totalConstMem)
        log.debug("Total amount of shared memory per block: %u bytes", dev_prop.sharedMemPerBlock)
        log.debug("Total number of registers available per block: %d", dev_prop.regsPerBlock)
        log.debug("Warp size: %d", dev_prop.warpSize)
        log.debug("Maximum number of threads per block: %d", dev_prop.maxThreadsPerBlock)
        log.debug("Maximum sizes of each dimension of a block: %d x %d x %d",
                  dev_prop.maxThreadsDim[0], dev_prop.maxThreadsDim[1], dev_prop.maxThreadsDim[2])
        log.debug("Maximum sizes of each dimension of a grid: %d x %d x %d",
                  dev_prop.maxGridSize[0], dev_prop.maxGridSize[1], dev_prop.maxGridSize[2])
        log.debug("Maximum memory pitch: %u bytes", dev_prop.memPitch)
        log.debug("Texture alignment: %u bytes", dev_prop.textureAlignment)
        log.debug("Clock rate: %.2f GHz", dev_prop.clockRate * (1e-6))

        if CUDART_VERSION >= 2000:
            log.debug("Concurrent copy and execution: %s", bool(dev_prop.deviceOverlap))

        devices.append(dev_prop)

    return devices
# Custom level just above INFO whose messages are written without a
# trailing newline.
INFO_NO_NEWLINE = logging.INFO + 1

_DETAILED_FORMAT = "%(filename)s:%(lineno)d - %(levelname)s - %(message)s\n"


class MultipleFormatHandler(logging.StreamHandler):
    """Stream handler that chooses the formatter from the record level.

    The format strings carry their own line ending because ``emit``
    writes the formatted message as is.  Levels without an entry in
    ``formatters`` use ``default_formatter``.
    """

    formatters = { logging.INFO: logging.Formatter(">>> %(message)s\n"),
                   INFO_NO_NEWLINE: logging.Formatter(">>> %(message)s"),
                   logging.DEBUG: logging.Formatter(_DETAILED_FORMAT),
                   logging.WARN: logging.Formatter(_DETAILED_FORMAT),
                   logging.CRITICAL: logging.Formatter(_DETAILED_FORMAT),
                   logging.ERROR: logging.Formatter(_DETAILED_FORMAT)}

    # Used for any level that has no dedicated entry above, so that such
    # records are still written instead of failing with KeyError.
    default_formatter = logging.Formatter(_DETAILED_FORMAT)

    def format(self, record):
        """Format ``record`` with the formatter registered for its level."""
        formatter = self.formatters.get(record.levelno, self.default_formatter)
        return formatter.format(record)

    def emit(self, record):
        """Write the formatted record to the stream and flush it."""
        try:
            msg = self.format(record)
            try:
                self.stream.write(msg)
            except UnicodeError:
                # The stream could not take the text; retry as UTF-8 bytes.
                self.stream.write(msg.encode("UTF-8"))
            self.flush()
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)


logger = logging.getLogger('python-cuda')
logger.setLevel(logging.INFO)

mfh = MultipleFormatHandler()
logger.addHandler(mfh)

if platform.system() == "Linux":
    formatter = logging.Formatter(_DETAILED_FORMAT)
    try:
        syslog_handler = logging.handlers.SysLogHandler(address='/dev/log')
    except EnvironmentError:
        # No syslog socket (e.g. minimal containers): keep console logging
        # working instead of failing the import of this module.
        syslog_handler = None
    else:
        syslog_handler.setFormatter(formatter)
        logger.addHandler(syslog_handler)


def enable_debug():
    """Lower the python-cuda logger threshold to DEBUG."""
    logger.setLevel(logging.DEBUG)
###define CUBLAS_STATUS_SUCCESS 0x00000000 13 | ###define CUBLAS_STATUS_NOT_INITIALIZED 0x00000001 14 | ###define CUBLAS_STATUS_ALLOC_FAILED 0x00000003 15 | ###define CUBLAS_STATUS_INVALID_VALUE 0x00000007 16 | ###define CUBLAS_STATUS_MAPPING_ERROR 0x0000000B 17 | ###define CUBLAS_STATUS_EXECUTION_FAILED 0x0000000D 18 | ###define CUBLAS_STATUS_INTERNAL_ERROR 0x0000000E 19 | CUBLAS_STATUS_SUCCESS = 0x00000000 20 | CUBLAS_STATUS_NOT_INITIALIZED = 0x00000001 21 | CUBLAS_STATUS_ALLOC_FAILED = 0x00000003 22 | CUBLAS_STATUS_INVALID_VALUE = 0x00000007 23 | CUBLAS_STATUS_MAPPING_ERROR = 0x0000000B 24 | CUBLAS_STATUS_EXECUTION_FAILED = 0x0000000D 25 | CUBLAS_STATUS_INTERNAL_ERROR = 0x0000000E 26 | 27 | ##/* CUBLAS data types */ 28 | ##typedef unsigned int cublasStatus; 29 | cublasStatus = c_uint 30 | -------------------------------------------------------------------------------- /oldcode/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | from cuda_api import * 2 | 3 | -------------------------------------------------------------------------------- /oldcode/cufft/__init__.py: -------------------------------------------------------------------------------- 1 | from cufft_api import * 2 | -------------------------------------------------------------------------------- /oldcode/cufft/cufft_api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | # XXX 5 | # -- 6 | # Code forked from python-cuda-2.0_42 © Arno Pähler, 2007-08 7 | # -- 8 | 9 | from cufft_defs import * 10 | from cuda.utils import libutils 11 | 12 | cufft = libutils.get_lib("cufft", RTLD_GLOBAL) 13 | 14 | #cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, 15 | # int nx, 16 | # cufftType type, 17 | # int batch); 18 | cufftPlan1d = cufft.cufftPlan1d 19 | cufftPlan1d.restype = cufftResult 20 | cufftPlan1d.argtypes = [ cufftHandle_p, 21 | c_int, cufftType, c_int ] 22 | 23 | 
class _DevicePointer(object):
    """``argtypes`` adapter for the data pointers taken by cufftExec*.

    The C prototypes take real pointers (``cufftComplex *``, ``cufftReal *``).
    Declaring them as ``c_uint`` truncates addresses wider than 32 bits, so
    the arguments are marshalled as ``c_void_p`` instead.  Everything the old
    declaration accepted still works: plain integers, and simple ctypes
    integers such as ``c_uint`` (unwrapped through their ``value``).
    """

    @classmethod
    def from_param(cls, obj):
        try:
            return c_void_p.from_param(obj)
        except TypeError:
            # Legacy callers hand over the address wrapped in c_uint/c_ulong.
            return c_void_p(obj.value)


#cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
#                                 int nx, int ny,
#                                 cufftType type);
cufftPlan2d = cufft.cufftPlan2d
cufftPlan2d.restype = cufftResult
cufftPlan2d.argtypes = [cufftHandle_p, c_int, c_int, cufftType]

#cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
#                                 int nx, int ny, int nz,
#                                 cufftType type);
cufftPlan3d = cufft.cufftPlan3d
cufftPlan3d.restype = cufftResult
cufftPlan3d.argtypes = [cufftHandle_p, c_int, c_int, c_int, cufftType]

#cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
cufftDestroy = cufft.cufftDestroy
cufftDestroy.restype = cufftResult
cufftDestroy.argtypes = [cufftHandle]

#cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
#                                  cufftComplex *idata,
#                                  cufftComplex *odata,
#                                  int direction);
cufftExecC2C = cufft.cufftExecC2C
cufftExecC2C.restype = cufftResult
cufftExecC2C.argtypes = [cufftHandle, _DevicePointer, _DevicePointer, c_int]

#cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
#                                  cufftReal *idata,
#                                  cufftComplex *odata);
cufftExecR2C = cufft.cufftExecR2C
cufftExecR2C.restype = cufftResult
cufftExecR2C.argtypes = [cufftHandle, _DevicePointer, _DevicePointer]

#cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
#                                  cufftComplex *idata,
#                                  cufftReal *odata);
cufftExecC2R = cufft.cufftExecC2R
cufftExecC2R.restype = cufftResult
cufftExecC2R.argtypes = [cufftHandle, _DevicePointer, _DevicePointer]
# Return type of every CUFFT call (C enum cufftResult_t) and its values.
cufftResult = c_int

CUFFT_SUCCESS        = 0
CUFFT_INVALID_PLAN   = 1
CUFFT_ALLOC_FAILED   = 2
CUFFT_INVALID_TYPE   = 3
CUFFT_INVALID_VALUE  = 4
CUFFT_INTERNAL_ERROR = 5
CUFFT_EXEC_FAILED    = 6
CUFFT_SETUP_FAILED   = 7
CUFFT_INVALID_SIZE   = 8

# Data types, following the C typedefs:
#   typedef unsigned int cufftHandle;   -- handle used to access a plan
#   typedef float cufftReal;            -- single-precision real
#   typedef float cufftComplex[2];      -- interleaved real/imaginary parts
cufftHandle = c_uint
cufftReal = c_float
cufftComplex = c_float * 2

cufftHandle_p = POINTER(cufftHandle)

# Transform directions.
CUFFT_FORWARD = -1
CUFFT_INVERSE = 1

# Transform kinds (C enum cufftType_t).
cufftType = c_int

CUFFT_R2C = 0x2a  # real to complex (interleaved)
CUFFT_C2R = 0x2c  # complex (interleaved) to real
CUFFT_C2C = 0x29  # complex to complex, interleaved
1 | - put 6.963 examples (converted to python-cuda) 2 | - put SDK examples (converted to python-cuda) 3 | - anything else! -------------------------------------------------------------------------------- /oldcode/examples/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /oldcode/examples/bw_test.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | 4 | from cuda.cuda import * 5 | from cuda.utils.cuda_utils import mallocHost 6 | from cuda.utils.ctypes_array import convert 7 | 8 | from time import time 9 | from numpy import abs,max 10 | 11 | PAGEABLE = 0 12 | PINNED = 1 13 | MEMCOPY_ITERATIONS = 250 14 | 15 | def compare(a,b): 16 | a1 = convert(a) 17 | b1 = convert(b) 18 | diff = max(abs(a1-b1)) 19 | return diff 20 | 21 | #/////////////////////////////////////////////////////////////////////////////// 22 | #// test the bandwidth of a device to host memcopy of a specific size 23 | #/////////////////////////////////////////////////////////////////////////////// 24 | def testDeviceToHostTransfer(size,mode): 25 | dtype = c_int 26 | memSize = size*sizeof(dtype) 27 | amountCopied = memSize*MEMCOPY_ITERATIONS 28 | d_idata = c_void_p() 29 | 30 | h_idata = mallocHost(size,dtype,mode) 31 | h_odata = mallocHost(size,dtype,mode) 32 | for i in range(size): 33 | h_idata[i] = dtype(size-i) 34 | h_odata[i] = dtype(123) 35 | 36 | cudaMalloc(byref(d_idata),memSize) 37 | cudaMemcpy(d_idata,h_idata,memSize,cudaMemcpyHostToDevice) 38 | t0 = time() 39 | for i in range(MEMCOPY_ITERATIONS): 40 | cudaMemcpy(h_odata,d_idata,memSize,cudaMemcpyDeviceToHost) 41 | t1 = time()-t0 42 | diff = compare(h_idata,h_odata) 43 | print "Max abs difference = %3s" % diff, 44 | bandwidthInGBs = amountCopied/(t1*float((1 << 30))) 45 | 46 | if mode == PINNED: 47 | 
cudaFreeHost(h_idata) 48 | cudaFreeHost(h_odata) 49 | cudaFree(d_idata) 50 | print "Device To Host : %4.1f GB/s" % bandwidthInGBs 51 | return bandwidthInGBs 52 | 53 | #/////////////////////////////////////////////////////////////////////////////// 54 | #//! test the bandwidth of a host to device memcopy of a specific size 55 | #/////////////////////////////////////////////////////////////////////////////// 56 | def testHostToDeviceTransfer(size,mode): 57 | dtype = c_float 58 | memSize = size*sizeof(dtype) 59 | amountCopied = memSize*MEMCOPY_ITERATIONS 60 | d_idata = c_void_p() 61 | 62 | h_idata = mallocHost(size,dtype,mode) 63 | h_odata = mallocHost(size,dtype,mode) 64 | for i in range(size): 65 | h_idata[i] = dtype(size-i) 66 | h_odata[i] = dtype(456) 67 | 68 | cudaMalloc(byref(d_idata),memSize) 69 | t0 = time() 70 | for i in range(MEMCOPY_ITERATIONS): 71 | cudaMemcpy(d_idata,h_idata,memSize,cudaMemcpyHostToDevice) 72 | cudaMemcpy(h_odata,d_idata,memSize,cudaMemcpyDeviceToHost) 73 | t1 = time()-t0 74 | diff = compare(h_idata,h_odata) 75 | print "Max abs difference = %3s" % diff, 76 | bandwidthInGBs = amountCopied/(t1*float((1 << 30))) 77 | 78 | if mode == PINNED: 79 | cudaFreeHost(h_idata) 80 | cudaFreeHost(h_odata) 81 | cudaFree(d_idata) 82 | print "Host To Device : %4.1f GB/s" % bandwidthInGBs 83 | return bandwidthInGBs 84 | 85 | #/////////////////////////////////////////////////////////////////////////////// 86 | #//! 
test the bandwidth of a device to device memcopy of a specific size 87 | #/////////////////////////////////////////////////////////////////////////////// 88 | def testDeviceToDeviceTransfer(size,mode): 89 | dtype = c_double 90 | memSize = size*sizeof(dtype) 91 | amountCopied = memSize*MEMCOPY_ITERATIONS 92 | d_idata = c_void_p() 93 | d_odata = c_void_p() 94 | 95 | h_idata = mallocHost(size,dtype,mode) 96 | h_odata = mallocHost(size,dtype,mode) 97 | for i in range(size): 98 | h_idata[i] = dtype(size-i) 99 | h_odata[i] = dtype(789) 100 | 101 | cudaMalloc(byref(d_idata),memSize) 102 | cudaMalloc(byref(d_odata),memSize) 103 | cudaMemcpy(d_idata,h_idata,memSize,cudaMemcpyHostToDevice) 104 | t0 = time() 105 | for i in range(MEMCOPY_ITERATIONS): 106 | cudaMemcpy(d_odata,d_idata,memSize,cudaMemcpyDeviceToDevice) 107 | cudaThreadSynchronize() 108 | t1 = time()-t0 109 | cudaMemcpy(h_odata,d_odata,memSize,cudaMemcpyDeviceToHost) 110 | diff = compare(h_idata,h_odata) 111 | print "Max abs difference = %3s" % diff, 112 | bandwidthInGBs = (2.*amountCopied)/(t1*float((1 << 30))) 113 | 114 | if mode == PINNED: 115 | cudaFreeHost(h_idata) 116 | cudaFreeHost(h_odata) 117 | cudaFree(d_idata) 118 | cudaFree(d_odata) 119 | print "Device To Device: %4.1f GB/s" % bandwidthInGBs 120 | return bandwidthInGBs 121 | 122 | if __name__ == "__main__": 123 | size = 1024*1024 124 | memtype = {PAGEABLE:"pageable ",PINNED:"page-locked"} 125 | 126 | for mode in (PAGEABLE,PINNED): 127 | print 128 | print "+-------------------------+" 129 | print "| Bandwidth transfer test |" 130 | print "| using CUDA runtime API |" 131 | print "| with %s memory |" % memtype[mode] 132 | print "+-------------------------+\n" 133 | 134 | testHostToDeviceTransfer(size,mode) 135 | testDeviceToHostTransfer(size,mode) 136 | testDeviceToDeviceTransfer(size,mode) 137 | -------------------------------------------------------------------------------- /oldcode/misc/README: 
#!/bin/tcsh -f
# Compile ${1}.c into the stripped shared object _${1}.so.
# usage: compileC <source-basename>
set inc="-I/opt/local/Library/Frameworks/Python.framework/Versions/2.6/include/"
#set inc="-I/usr/include/python2.5"
set lib=""
# -malign-double is disabled.  csh only starts a comment when '#' begins a
# word, so the note has to be separated from the closing quote by a blank;
# glued to the quote it becomes part of the value and leaves a quote unmatched.
set flags="-fPIC -O2 -msse2" # -malign-double
gcc -c $flags $inc ${1}.c
gcc -shared $lib -o _${1}.so ${1}.o
strip -x _${1}.so
# Remove only the object produced above, not every *.o in the directory.
rm ${1}.o
# coding:utf-8: © Arno Pähler, 2007-08
"""ctypes wrappers around the compiled CPU helper library (_cpuFunctions)."""

from ctypes import *


def _load_library():
    """Load the helper library from the current directory.

    The historical ``.dylib`` name is tried first, then the ``.so`` name that
    the compileC build script writes, so the module imports on either setup.
    """
    candidates = ("./_cpuFunctions.dylib", "./_cpuFunctions.so")
    for path in candidates:
        try:
            return CDLL(path)
        except OSError:
            continue
    raise OSError("could not load any of: %s" % ", ".join(candidates))


def _count(seq, n):
    """Return ``n``, defaulting to ``len(seq)`` when ``n`` is None."""
    if n is None:
        return len(seq)
    return n


mm = _load_library()

_cvp = c_void_p
_cfl = c_float
_cui = c_uint

#
# Utility functions
#

mm.rdtsc.restype = c_uint64
mm.rdtsc.argtypes = None

def rdtsc():
    """Return the value of the library's ``rdtsc`` routine."""
    return mm.rdtsc()

ReadTimestampCounter = rdtsc

mm.microtime.restype = c_long
mm.microtime.argtypes = None

def microtime():
    """Return the value of the library's ``microtime`` routine."""
    return mm.microtime()

mm.scale.restype = None
mm.scale.argtypes = [ _cvp, _cfl, _cui ]

def scale(a,s,n=None):
    """Call ``scale(a, s, n)``; ``n`` defaults to ``len(a)``."""
    mm.scale(a,s,_count(a,n))

mm.l1norm.restype = _cfl
mm.l1norm.argtypes = [ _cvp, _cvp, _cui ]

def l1norm(a,b,n=None):
    """Return ``l1norm(a, b, n)``; ``n`` defaults to ``len(a)``."""
    return mm.l1norm(a,b,_count(a,n))

mm.arrayInit.restype = None
mm.arrayInit.argtypes = [ _cvp, _cui ]

def arrayInit(a,n=None):
    """Call ``arrayInit(a, n)``; ``n`` defaults to ``len(a)``."""
    mm.arrayInit(a,_count(a,n))

vectorInit = arrayInit

mm.fixedInit.restype = None
mm.fixedInit.argtypes = [ _cvp, _cui ]

def fixedInit(a,n=None):
    """Call ``fixedInit(a, n)``; ``n`` defaults to ``len(a)``."""
    mm.fixedInit(a,_count(a,n))

mm.randInit.restype = None
mm.randInit.argtypes = [ _cvp, _cui, _cfl, _cfl ]

def randInit(a,l,h,n=None):
    """Call ``randInit(a, n, l, h)``; note the C argument order."""
    mm.randInit(a,_count(a,n),l,h)

mm.setZero.restype = None
mm.setZero.argtypes = [ _cvp, _cui ]

def setZero(a,n=None):
    """Call ``setZero(a, n)``; ``n`` defaults to ``len(a)``."""
    mm.setZero(a,_count(a,n))

mm.checkError.restype = None
mm.checkError.argtypes = [ _cvp, _cvp, _cui, _cvp, _cvp ]

def checkError(a,b,n=None):
    """Return the two floats ``checkError`` writes through its out-pointers."""
    err = c_float()
    mxe = c_float()
    mm.checkError(a,b,_count(a,n),byref(err),byref(mxe))
    return err.value,mxe.value

mm.checkTrig.restype = None
mm.checkTrig.argtypes = [ _cvp, _cvp, _cvp, _cvp, _cui ]

def checkTrig(a,b,n=None):
    """Return the two floats ``checkTrig`` writes through its out-pointers."""
    e = c_float()
    m = c_float()
    mm.checkTrig(byref(e),byref(m),a,b,_count(a,n))
    return e.value,m.value

#
# Math functions
#

mm.gflops.restype = None
mm.gflops.argtypes = [ ]

def cpuGFLOPS():
    """Run the library's ``gflops`` routine."""
    mm.gflops()

mm.blsc.restype = None
mm.blsc.argtypes = [ _cvp, _cvp, _cvp,
    _cvp, _cvp, _cfl, _cfl, _cui ]

def cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,size):
    """Run ``blsc`` on the given buffers with parameters R, V and ``size``."""
    mm.blsc(h_C,h_P,h_S,h_X,h_T,R,V,size)

mm.poly5.restype = None
mm.poly5.argtypes = [ _cvp, _cvp, _cui ]

mm.poly10.restype = None
mm.poly10.argtypes = [ _cvp, _cvp, _cui ]

mm.poly20.restype = None
mm.poly20.argtypes = [ _cvp, _cvp, _cui ]

mm.poly40.restype = None
mm.poly40.argtypes = [ _cvp, _cvp, _cui ]

def cpuPOLY5(x,y,n=None):
    """Call ``poly5(x, y, n)``; ``n`` defaults to ``len(x)``."""
    mm.poly5(x,y,_count(x,n))

def cpuPOLY10(x,y,n=None):
    """Call ``poly10(x, y, n)``; ``n`` defaults to ``len(x)``."""
    mm.poly10(x,y,_count(x,n))

def cpuPOLY20(x,y,n=None):
    """Call ``poly20(x, y, n)``; ``n`` defaults to ``len(x)``."""
    mm.poly20(x,y,_count(x,n))

def cpuPOLY40(x,y,n=None):
    """Call ``poly40(x, y, n)``; ``n`` defaults to ``len(x)``."""
    mm.poly40(x,y,_count(x,n))

mm.saxpy.restype = None
mm.saxpy.argtypes = [ _cfl, _cvp, _cvp, _cui ]

def cpuSAXPY(a,x,y,n=None):
    """Call ``saxpy(a, x, y, n)``; ``n`` defaults to ``len(x)``."""
    mm.saxpy(a,x,y,_count(x,n))

mm.vadd.restype = None
mm.vadd.argtypes = [ _cvp, _cvp, _cui ]

def cpuVADD(x,y,n=None):
    """Call ``vadd(x, y, n)``; ``n`` defaults to ``len(x)``."""
    mm.vadd(x,y,_count(x,n))

mm.sdot.restype = c_float
mm.sdot.argtypes = [ _cvp, _cvp, _cui ]

def cpuSDOT(x,y,n=None):
    """Return ``sdot(x, y, n)``; ``n`` defaults to ``len(x)``."""
    return mm.sdot(x,y,_count(x,n))

mm.sgemm.restype = None
mm.sgemm.argtypes = [
    _cvp, _cvp, _cvp,
    _cui, _cui, _cui ]

def cpuSGEMM(C,A,B,m,k,n):
    """Call ``sgemm(C, A, B, m, k, n)``."""
    mm.sgemm(C,A,B,m,k,n)

mm.trig.restype = None
mm.trig.argtypes = [ _cvp, _cvp, _cvp, _cui ]

def cpuTRIG(a,x,y,n=None):
    """Call ``trig(a, x, y, n)``; ``n`` defaults to ``len(a)``."""
    mm.trig(a,x,y,_count(a,n))
# © Arno Pähler, 2007-08
# important details are © Thomas Heller
from sys import byteorder
from ctypes import *
from numpy import array as multi_array  # historical module-level name, kept
from numpy import asarray
from numpy import isfortran

__all__ = ["convert","className","typeName"]

################################################################
# some shortcut utilities

# use this as eval(PRODUCT,whatever)
PRODUCT = "reduce(lambda x,y:x*y,%s,1)"

# Kind names this module uses for the three ctypes metaclasses.
_CTYPES_KINDS = ("ArrayType","PointerType","SimpleType")

def _legacyKind(name):
    """Map newer ctypes metaclass names onto the names used here.

    Newer Python versions call the metaclasses ``PyCArrayType``,
    ``PyCPointerType`` and ``PyCSimpleType``; stripping the prefix keeps
    every comparison below working on old and new interpreters alike.
    """
    if name[:3] == "PyC" and name[3:] in _CTYPES_KINDS:
        return name[3:]
    return name

def className(o):
    """Return the kind of *o* as a string.

    For ctypes objects and ctypes classes this is the metaclass kind
    ("ArrayType", "PointerType", "SimpleType"); for an instance of an
    ordinary class it is the class name (e.g. "ndarray").
    """
    kind = type(o)
    if issubclass(kind, type):
        # o is itself a class, so kind already is its metaclass
        name = kind.__name__
    else:
        meta = type(kind)
        if meta is type:
            name = kind.__name__
        else:
            name = meta.__name__
    return _legacyKind(name)

def typeName(o):
    """Return the name of the simple ctypes type underlying *o*.

    Arrays (also nested ones) and pointers are unwrapped through their
    ``_type_`` attribute until a simple type such as ``c_float`` is found.
    """
    if isSimpleType(o):
        return o.__class__.__name__
    while not isSimpleType(o):
        try:
            o = o._type_
        except AttributeError:
            o = type(o)
            break
    return o.__name__

def isCtypesFamily(o):
    """True for ctypes instances (by class-name prefix) and ctypes classes."""
    cn = o.__class__.__name__
    isObject = cn[:2] == "c_" or cn[:5] =="LP_c_"
    isType = _legacyKind(cn) in _CTYPES_KINDS
    return isObject or isType

def isNumpyArray(o):
    return className(o) == "ndarray"

def isArrayType(o):
    return className(o) == "ArrayType"

def isSimpleType(o):
    return className(o) == "SimpleType"

if byteorder == "little":
    T = "<"
else:
    T = ">"

def _buildTables():
    """Build the name -> typestr, typestr -> ctype and name -> ctype tables.

    Item sizes come from ``sizeof`` because the width of ``c_long`` differs
    between platforms.  Where several ctypes names share a typestr the first
    entry of the list wins, which makes the reverse table deterministic.
    """
    scalars = (
        ("c_byte", c_byte, "i"),
        ("c_short", c_short, "i"),
        ("c_int", c_int, "i"),
        ("c_long", c_long, "i"),
        ("c_longlong", c_longlong, "i"),
        ("c_ubyte", c_ubyte, "u"),
        ("c_ushort", c_ushort, "u"),
        ("c_uint", c_uint, "u"),
        ("c_ulong", c_ulong, "u"),
        ("c_ulonglong", c_ulonglong, "u"),
        ("c_float", c_float, "f"),
        ("c_double", c_double, "f"),
    )
    by_name = {}
    by_typestr = {}
    ctype_by_name = {}
    for name, ctype, kind in scalars:
        typestr = "%s%s%d" % (T, kind, sizeof(ctype))
        by_name[name] = typestr
        ctype_by_name[name] = ctype
        by_typestr.setdefault(typestr, ctype)
        if sizeof(ctype) == 1:
            # numpy reports single-byte types without a byte order
            by_typestr.setdefault("|" + typestr[1:], ctype)
    return by_name, by_typestr, ctype_by_name

c_Dict, n_Dict, _C_TYPES = _buildTables()

class _InterfaceView(object):
    """Hand exactly one ``__array_interface__`` dict to numpy.

    A ctypes array also offers the buffer protocol, through which numpy
    would see only its native one-dimensional layout.  Wrapping the dict
    makes numpy honour the requested shape and strides; the ``owner``
    reference keeps the ctypes memory alive as long as the numpy array is.
    """
    def __init__(self, interface, owner):
        self.__array_interface__ = interface
        self.owner = owner

################################################################
# public functions

def convert(obj,dims=None,order="C",out=None):
    """Convert a ctypes array to a numpy array and vice versa.

    The input kind (ctypes or numpy) is detected and an object of the
    opposite kind is returned.  Simple ctypes values, Python scalars,
    lists and tuples are turned into numpy arrays.

    obj   -- object to convert
    dims  -- ctypes input only: shape of the resulting numpy array; by
             default the shape is taken from the (nested) ctypes array
    order -- "C" or "F".  ctypes input with dims: "F" gives the result
             column-major strides.  numpy input: a non-Fortran array is
             transposed first when "F" is requested.
    out   -- numpy input only: existing ctypes array that receives a copy
             (at most len(out) items are copied)

    Unless a copy is required (read-only source, or out given) the result
    shares memory with obj.  The returned ctypes array keeps a reference
    to the numpy array it was made from, so it stays valid even after the
    caller has dropped or rebound the original name.

    When comparing a Fortran-order result with C-order results, the order
    of matrix-matrix or matrix-vector multiplication must be inverted.

    This code is based on similar code posted by Thomas Heller"""

    # for obj in simple c_types (e.g. c_float(1.))
    if isSimpleType(obj):
        return asarray((obj.value,))

    # for obj in scalars (e.g. 1.), lists and tuples
    if not (isCtypesFamily(obj) or isNumpyArray(obj)):
        try:
            obj = tuple(obj)
        except TypeError:
            # a bare scalar is not iterable; treat it as one element
            obj = (obj,)
        return asarray(obj)

    # numpy ==> ctypes
    if isNumpyArray(obj):
        # if obj is C order and return object should be
        # Fortran order, transpose obj for Fortran order
        if not isfortran(obj) and order == "F":
            obj = obj.T

        ai = obj.__array_interface__
        addr,readonly = ai["data"]

        ## code below does not consider strides
        i_size = obj.itemsize
        if out is None:
            t = n_Dict[ai["typestr"]]
            for dim in ai["shape"][::-1]:
                t = t*dim
            if readonly: # make a copy
                out = t()
                memmove(out,addr,obj.size*i_size)
            else:
                out = t.from_address(addr)
        else:
            size = min(obj.size,len(out))*i_size
            memmove(out,addr,size)
        out.__array_interface__ = ai
        out.__keep = ai
        # ai only holds the raw address; keep the array itself alive too
        out._owner = obj
        return out

    # ctypes ==> numpy
    typestr = c_Dict[typeName(obj)]
    strides = None
    if dims is None:
        shape = []
        o = obj
        while isArrayType(o):
            shape.append(o._length_)
            o = o._type_
        shape = tuple(shape)
    else:
        shape = tuple(dims)
        p = sizeof(_C_TYPES[typeName(obj)])
        products = [p]
        for d in dims[:-1]:
            p *= d
            products.append(p)
        if order == "F":
            strides = tuple(products)

    ao = addressof(obj)
    ai = \
    {
        'descr'  : [('',typestr)],
        '__ref'  : ao,
        'strides': strides,
        'shape'  : shape,
        'version': 3,
        'typestr': typestr,
        'data'   : (ao,False)
    }
    obj.__array_interface__ = ai

    return asarray(_InterfaceView(ai,obj))
50 | df = (eval(typeName(bf))*af.size)() 51 | convert(ac,None,None,dc) 52 | convert(af,None,None,df) 53 | print "type dc,df",type(dc),type(df) 54 | 55 | delta = (ac-dc).flatten() 56 | print "L2-norm ac-dc",norm2(delta) 57 | delta = (af-df).flatten() 58 | print "L2-norm af-df",norm2(delta) 59 | delta = af.flatten()-ac.flatten() 60 | print "\ncomparing flattened ac,af" 61 | print "L2-norm af-ac",norm2(delta) 62 | set_printoptions(precision=3) 63 | print "\nac[:3],af[:3]" 64 | print ac.flatten()[:3] 65 | print af.flatten()[:3] 66 | print "\ndc[:3],df[:3]" 67 | print "[%6.3f %6.3f %6.3f]" % tuple(dc[:3]) 68 | print "[%6.3f %6.3f %6.3f]" % tuple(df[:3]) 69 | 70 | print "\nConvert to numpy: dc => ec, df => ef" 71 | ec = convert(dc,(4,8),"C") 72 | ef = convert(df,(4,8),"F") 73 | print "1D dc,df ctypes objects=> 2D numpy arrays ec,ef" 74 | print ; print "ac, ec" 75 | print pif(ac) 76 | print pif(ec) 77 | print ; print "af, ef" 78 | print pif(af) 79 | print pif(ef) 80 | print "\nL2-norm ac-ec, af-ef, ec-ef" 81 | print norm2(ac-ec) 82 | print norm2(af-ef) 83 | print norm2(ec-ef) 84 | print "\nL2-norm ac-ef, af-ec (flattened)" 85 | print norm2(ac.flatten()-ef.flatten()) 86 | print norm2(af.flatten()-ec.flatten()) 87 | 88 | 89 | -------------------------------------------------------------------------------- /oldcode/misc/ctypes_extra.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from numpy import * 5 | from numpy.random import rand 6 | from ctypes_array import convert 7 | 8 | ao = addressof 9 | def fa(a,o,n,dtype=None): 10 | if dtype is None: 11 | t = a.__class__._type_ 12 | else: 13 | t = dtype 14 | s = sizeof(t) 15 | return (t*n).from_address(ao(a)+o*s) 16 | 17 | class x_float(c_float): 18 | pass 19 | 20 | b = convert(rand(10)) 21 | a = (x_float*10)(*b) 22 | z = (c_float*10)(*b) 23 | 24 | try: 25 | u = (c_float*2).from_address(ao(a[6])) 26 | 
su = sizeof(u._type_) 27 | print "0x%8.8x" % ao(u) 28 | print "%10.7f %10.7f" % (u[0],u[1]) 29 | except TypeError: 30 | print "x_float does not work" 31 | 32 | try: 33 | v = fa(a,6,2,c_float) 34 | sv = sizeof(v._type_) 35 | except TypeError: 36 | print "x_float does not work" 37 | 38 | try: 39 | w = fa(z,6,2) 40 | sw = sizeof(w._type_) 41 | except TypeError: 42 | print "c_float does not work" 43 | 44 | sz = sizeof(z._type_) 45 | for i in range(len(v)): 46 | print "0x%8.8x 0x%8.8x 0x%8.8x 0x%8.8x" % ( 47 | ao(a[i+6]),ao(v)+i*sv,ao(z)+(i+6)*sz,ao(w)+i*sw) 48 | print "%10.7f %10.7f %10.7f % 10.7f" % (a[i+6].value,v[i],z[i+6],w[i]) 49 | -------------------------------------------------------------------------------- /oldcode/misc/devinfo_cr.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | 4 | from ctypes import * 5 | from cuda.cuda_api import * 6 | 7 | if __name__ == "__main__": 8 | print "+------------------------+" 9 | print "| CUDA Device Info |" 10 | print "| using CUDA runtime API |" 11 | print "+------------------------+\n" 12 | count = c_int() 13 | cudaGetDeviceCount(byref(count)) 14 | print "number of devices =", count.value 15 | props = cudaDeviceProp() 16 | cudaGetDeviceProperties(props, 0) 17 | print props 18 | -------------------------------------------------------------------------------- /oldcode/misc/devinfo_cu.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | 4 | from ctypes import * 5 | from cuda.cu_defs import CUdevprop 6 | from cuda.cu_api import * 7 | 8 | if __name__ == "__main__": 9 | print "+-----------------------+" 10 | print "| CUDA Device Info |" 11 | print "| using CUDA driver API |" 12 | print "+-----------------------+\n" 13 | cuInit(0) 14 | count = c_int() 15 | cuDeviceGetCount(byref(count)) 16 | device = CUdevice() 17 | name = 
# coding:utf-8: © Arno Pähler, 2007-08
"""ctypes prototypes for the kernel launch stubs in libgpuFunctions.so."""
from ctypes import *

cvp = c_void_p
_cf = c_float
_ci = c_int

lib = CDLL("./libgpuFunctions.so")


def _bind(kernel, *argtypes):
    """Return the ``__device_stub_<kernel>`` symbol with its prototype set.

    All stubs return void; ``argtypes`` follow the kernel's C signature.
    """
    stub = getattr(lib, "__device_stub_" + kernel)
    stub.restype = None
    stub.argtypes = list(argtypes)
    return stub


#__global__ void gpuGFLOPS()
gpuGFLOPS = _bind("gpuGFLOPS")

#__global__ void gpuBLSC(
#float *d_Calls, float *d_Puts,
#float *d_S, float *d_X, float *d_T,
#float R, float V, int OptN)
gpuBLSC = _bind("gpuBLSC", cvp, cvp, cvp, cvp, cvp, _cf, _cf, _ci)

#__global__ void gpuPOLY5(
#float *d_In1, float *d_Out1, int size )
gpuPOLY5 = _bind("gpuPOLY5", cvp, cvp, _ci)

#__global__ void gpuPOLY10(
#float *d_In1, float *d_Out1, int size )
gpuPOLY10 = _bind("gpuPOLY10", cvp, cvp, _ci)

#__global__ void gpuPOLY20(
#float *d_In1, float *d_Out1, int size )
gpuPOLY20 = _bind("gpuPOLY20", cvp, cvp, _ci)

#__global__ void gpuPOLY40(
#float *d_In1, float *d_Out1, int size )
gpuPOLY40 = _bind("gpuPOLY40", cvp, cvp, _ci)

#__global__ void gpuSAXPY(
#float Factor, float *d_In1, float *d_In2, int size )
gpuSAXPY = _bind("gpuSAXPY", _cf, cvp, cvp, _ci)

#__global__ void gpuVADD(
#float *d_In1, float *d_In2, int size )
gpuVADD = _bind("gpuVADD", cvp, cvp, _ci)

#__global__ void gpuSGEMM(
#float* C, float* A, float* B, int wA, int wB )
gpuSGEMM = _bind("gpuSGEMM", cvp, cvp, cvp, _ci, _ci)

#__global__ void gpuTRIG(
#float *d_Out1, float *d_Out2, float *d_In1, int size )
gpuTRIG = _bind("gpuTRIG", cvp, cvp, cvp, _ci)

#__global__ void gpuScale(
#float *d_Out1, _F *d_In1, _F scale, int size )
gpuScale = _bind("gpuScale", cvp, cvp, _cf, _ci)

#// for streams example
#__global__ void init_array(
#int *g_data, int *factor){
# Both parameters are pointers in the C signature, so they are passed as
# void pointers like every other buffer argument in this module; declaring
# them as c_int would cut a 64-bit address down to 32 bits.
init_array = _bind("init_array", cvp, cvp)
-------------------------------------------------------------------------------- 1 | //CPU 2 | void addMatrix(float *a, float *b, 3 | float *c, int N) 4 | { 5 | int i, j, index; 6 | for (i = 0; i < N; i++) { 7 | for (j = 0; j < N; j++) { 8 | index = i + j * N; 9 | c[index]=a[index] + b[index]; 10 | } 11 | } 12 | } 13 | void main() 14 | { 15 | ..... 16 | addMatrix(a, b, c, N); 17 | } 18 | //GPU 19 | __global__ void addMatrix(float *a,float *b, 20 | float *c, int N) 21 | { 22 | int i=blockIdx.x*blockDim.x+threadIdx.x; 23 | int j=blockIdx.y*blockDim.y+threadIdx.y; 24 | int index = i + j * N; 25 | if ( i < N && j < N) 26 | c[index]= a[index] + b[index]; 27 | } 28 | void main() 29 | { 30 | ..... // allocate & transfer data to GPU 31 | dim3 dimBlk (blocksize, blocksize); 32 | dim3 dimGrd (N/dimBlk.x, N/dimBlk.y); 33 | addMatrix<<<dimGrd, dimBlk>>>(a, b, c,N); 34 | } 35 | //GPU 36 | // Compute vector sum C = A+B 37 | // Each thread performs one pair-wise addition 38 | __global__ void vecAdd(float* A, float* B, float* C) 39 | { 40 | int i = threadIdx.x + blockDim.x * blockIdx.x; 41 | C[i] = A[i] + B[i]; 42 | } 43 | __global__ void vecAdd(float* A, float* B, float* C); 44 | void main() 45 | { 46 | // Execute on N/256 blocks of 256 threads each 47 | vecAdd<<< N/256, 256>>>(d_A, d_B, d_C); 48 | } 49 | //GPU 50 | __global__ void transpose_naive(float *odata, float *idata, int width, int height) 51 | { 52 | unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x; 53 | unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y; 54 | if (xIndex < width && yIndex < height) 55 | { 56 | unsigned int index_in = xIndex + width * yIndex; 57 | unsigned int index_out = yIndex + height * xIndex; 58 | odata[index_out] = idata[index_in]; 59 | } 60 | } 61 | __global__ void transpose(float *odata, float *idata, int width, int height) 62 | { 63 | __shared__ float block[(BLOCK_DIM+1)*BLOCK_DIM]; 64 | unsigned int xBlock = __mul24(blockDim.x, blockIdx.x); 65 | unsigned int yBlock = __mul24(blockDim.y, 
blockIdx.y); 66 | unsigned int xIndex = xBlock + threadIdx.x; 67 | unsigned int yIndex = yBlock + threadIdx.y; 68 | unsigned int index_out, index_transpose; 69 | if (xIndex < width && yIndex < height) 70 | { 71 | unsigned int index_in = __mul24(width, yIndex) + xIndex; 72 | unsigned int index_block = __mul24(threadIdx.y, BLOCK_DIM+1) + threadIdx.x; 73 | block[index_block] = idata[index_in]; 74 | index_transpose = __mul24(threadIdx.x, BLOCK_DIM+1) + threadIdx.y; 75 | index_out = __mul24(height, xBlock + threadIdx.y) + yBlock + threadIdx.x; 76 | } 77 | __syncthreads(); 78 | if (xIndex < width && yIndex < height) 79 | odata[index_out] = block[index_transpose]; 80 | } 81 | //GPU 82 | template <unsigned int blockSize> 83 | __global__ void reduce6(int *g_idata, int *g_odata, unsigned int n) 84 | { 85 | extern __shared__ int sdata[]; 86 | unsigned int tid = threadIdx.x; 87 | unsigned int i = blockIdx.x*(blockSize*2) + tid; //Final Optimized Kernel 88 | unsigned int gridSize = blockSize*2*gridDim.x; 89 | sdata[tid] = 0; 90 | do { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; } while (i < n); 91 | __syncthreads(); 92 | if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } 93 | if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } 94 | if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); } 95 | if (tid < 32) { 96 | if (blockSize >= 64) sdata[tid] += sdata[tid + 32]; 97 | if (blockSize >= 32) sdata[tid] += sdata[tid + 16]; 98 | if (blockSize >= 16) sdata[tid] += sdata[tid + 8]; 99 | if (blockSize >= 8) sdata[tid] += sdata[tid + 4]; 100 | if (blockSize >= 4) sdata[tid] += sdata[tid + 2]; 101 | if (blockSize >= 2) sdata[tid] += sdata[tid + 1]; 102 | } 103 | if (tid == 0) g_odata[blockIdx.x] = sdata[0]; 104 | } 105 | // textures vs __constant__ 106 | 107 | __constant__ short hash_g[1024]; 108 | __constant__ short hash_h[8192]; 109 | ... 
# coding:utf-8: © Arno Pähler, 2007-08
"""Bindings for a few single-precision vector routines in ./_vector.so."""

from ctypes import CDLL
from math import *

vml = CDLL("./_vector.so")


def _vml_routine(symbol):
    """Fetch `symbol` from the vector library and mark it as returning void."""
    routine = getattr(vml, symbol)
    routine.restype = None
    return routine


vcos = _vml_routine("vsCos")
vsin = _vml_routine("vsSin")
vsincos = _vml_routine("vsSinCos")
vexp = _vml_routine("vsExp")
vlog = _vml_routine("vsLn")
vlog10 = _vml_routine("vsLog10")
vsqrt = _vml_routine("vsSqrt")


def cpuTRIG(h_Y,h_Z,h_X):
    """Run vsSinCos over h_X, passing h_Z and h_Y as the two output buffers.

    The separate vcos(size,h_X,h_Y) / vsin(size,h_X,h_Z) pair was a
    permanently disabled alternative; the combined call is about 20% faster.
    """
    vsincos(len(h_X),h_X,h_Z,h_Y)
##////////////////////////////////////////////////////////////////////////////
## CPU reference for the Black-Scholes benchmark: price one option (call and
## put) per array element.
##////////////////////////////////////////////////////////////////////////////
A1 = 0.319381530
A2 = -0.356563782
A3 = 1.781477937
A4 = -1.821255978
A5 = 1.3302744290
RSQRT2PI = 0.3989422804


def CND(d):
    """Polynomial approximation of the cumulative normal distribution at d."""
    K = 1.0 / (1.0 + 0.2316419 * abs(d))

    # Horner evaluation of K*(A1 + K*(A2 + K*(A3 + K*(A4 + K*A5))))
    acc = A5
    for coeff in (A4, A3, A2, A1):
        acc = coeff + K * acc
    tail = RSQRT2PI * exp(-0.5 * d * d) * (K * acc)

    # The polynomial gives the upper tail for |d|; mirror it for positive d.
    if d > 0:
        return 1.0 - tail
    return tail


def BlackScholesBody(
    S, X, T, R, V ):
    """Return (call, put) from the Black-Scholes formula.

    S -- stock price
    X -- option strike
    T -- option years
    R -- riskless rate
    V -- volatility rate
    """
    root_t = sqrt(T)
    vol_root_t = V * root_t
    d1 = (log(S / X) + (R + 0.5 * V * V) * T) / vol_root_t
    d2 = d1 - vol_root_t

    n_d1 = CND(d1)
    n_d2 = CND(d2)

    # Call and put share the discounted strike X*exp(-R*T).
    discounted_strike = X * exp(- R * T)
    call = S * n_d1 - discounted_strike * n_d2
    put = discounted_strike * (1.0 - n_d2) - S * (1.0 - n_d1)
    return call,put


def cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,size):
    """Fill h_C / h_P with call / put prices for the first `size` options."""
    for idx in range(size):
        call,put = BlackScholesBody(h_S[idx],h_X[idx],h_T[idx],R,V)
        h_C[idx] = call
        h_P[idx] = put
// © Arno Pähler, 2007-08
extern "C" {
typedef float _F;
typedef const float _cF;
typedef const unsigned int _cI;

// Texture reference read with tex1Dfetch() in TRIGTex.  The template
// arguments were missing from the declaration; <float, 1,
// cudaReadModeElementType> is the form tex1Dfetch() needs to return float.
texture<float, 1, cudaReadModeElementType> Arg;

// d_Out1[i] = cos(d_In1[i]), d_Out2[i] = sin(d_In1[i]) for i < size.
// Each thread starts at its global index and advances by the total number
// of launched threads, so any grid size covers the whole vector.
__global__ void TRIG
(_F *d_Out1, _F *d_Out2, _cF *d_In1, _cI size )
{
    _cI tid = blockDim.x * blockIdx.x + threadIdx.x;
    _cI tsz = blockDim.x * gridDim.x;

    for (unsigned int i = tid; i < size; i += tsz)
    {
        d_Out1[i] = cosf(d_In1[i]);
        d_Out2[i] = sinf(d_In1[i]);
    }
}

// Same as TRIG, but the argument is fetched through the texture Arg.
__global__ void TRIGTex
(_F *d_Out1, _F *d_Out2, _cI size )
{
    _cI tid = blockDim.x * blockIdx.x + threadIdx.x;
    _cI tsz = blockDim.x * gridDim.x;

    for (unsigned int i = tid; i < size; i += tsz)
    {
        // Per-thread value.  It used to be `__shared__ float x`, i.e. one
        // slot written by every thread of the block without any
        // synchronisation, so a thread could read another thread's argument.
        const float x = tex1Dfetch(Arg,i);
        d_Out1[i] = cosf(x);
        d_Out2[i] = sinf(x);
    }
}
}
0xd00e081d 0xa0c00780 47 | 0x300607fd 0x640047c8 0x20028204 0x20058810 48 | 0x1000c003 0x00000280 0x00000e01 0xe4200782 49 | 0xf0000001 0xe0000001 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /oldcode/misc/simple.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | from cuda.cu_defs import * 7 | from cuda.cu_api import * 8 | from cuda.cu_utils import * 9 | 10 | from cpuFunctions import checkError,checkTrig,vectorInit 11 | 12 | UseVML = True 13 | if UseVML: 14 | from mklMath import cpuTRIG 15 | else: 16 | from cpuFunctions import cpuTRIG 17 | 18 | BLOCK_SIZE = 320 19 | GRID_SIZE = 8 20 | 21 | S4 = sizeof(c_float) 22 | 23 | def main(device,vlength = 128,loops = 1): 24 | print "+-----------------------+" 25 | print "| Simple TRIG Test |" 26 | print "| using CUDA driver API |" 27 | print "+-----------------------+" 28 | print "params: %2d %5dK %3d\n" % (log2n,vlength >> 10,loops), 29 | 30 | n2 = vlength ## Vector length 31 | 32 | # TRIGTex is about 1.5x faster than TRIG 33 | # name = "TRIG" 34 | name = "TRIGTex" 35 | 36 | TRIG = device.functions[name] 37 | mod0 = device.modules[0] 38 | 39 | sizeV = S4*n2 40 | h_Arg = (c_float*n2)() 41 | h_Cos = (c_float*n2)() 42 | h_Sin = (c_float*n2)() 43 | 44 | vectorInit(h_Arg) 45 | 46 | d_Arg = getMemory(h_Arg) 47 | d_Cos = getMemory(n2) 48 | d_Sin = getMemory(n2) 49 | 50 | tex = devMemToTex(mod0,"Arg",d_Arg,sizeV) 51 | 52 | cuFuncSetBlockShape(TRIG,BLOCK_SIZE,1,1) 53 | cuParamSeti(TRIG,0,d_Cos) 54 | cuParamSeti(TRIG,4,d_Sin) 55 | if name != "TRIGTex": 56 | cuParamSeti(TRIG,8,d_Arg) 57 | cuParamSeti(TRIG,12,n2) 58 | cuParamSetSize(TRIG,16) 59 | else: 60 | cuParamSetTexRef(TRIG,CU_PARAM_TR_DEFAULT,tex) 61 | cuParamSeti(TRIG,8,n2) 62 | cuParamSetSize(TRIG,12) 63 | cuCtxSynchronize() 64 | 65 | t0 = time() 66 | for i in 
range(loops): 67 | cuLaunchGrid(TRIG,GRID_SIZE,1) 68 | cuCtxSynchronize() 69 | t0 = time()-t0 70 | 71 | g_Cos = (c_float*n2)() 72 | g_Sin = (c_float*n2)() 73 | cuMemcpyDtoH(g_Cos,d_Cos,sizeV) 74 | cuMemcpyDtoH(g_Sin,d_Sin,sizeV) 75 | cuCtxSynchronize() 76 | 77 | cuMemFree(d_Arg) 78 | cuMemFree(d_Cos) 79 | cuMemFree(d_Sin) 80 | 81 | t1 = time() 82 | for i in range(loops): 83 | cpuTRIG(h_Cos,h_Sin,h_Arg) 84 | t1 = time()-t1 85 | 86 | flopsg = (2.e-6*n2)*float(loops) 87 | flopsc = flopsg 88 | 89 | t0 *= 1.e3; 90 | t1 *= 1.e3; 91 | print "\n time[msec] GFlops\n" 92 | print "GPU: %12.1f%10.2f" % (t0,flopsg/t0) 93 | print "CPU: %12.1f%10.2f" % (t1,flopsc/t1) 94 | print " %12.1f" % (t1/t0) 95 | 96 | x = float(1 << 23) 97 | e,m = checkTrig(g_Cos,g_Sin) 98 | print "\n",name, "internal check GPU" 99 | print "%8.1e %8.1e" % (e,m) 100 | print "%8.1f %8.1f" % (e*x,m*x) 101 | 102 | e,m = checkTrig(h_Cos,h_Sin) 103 | print "\n",name, "internal check CPU" 104 | print "%8.1e %8.1e" % (e,m) 105 | print "%8.1f %8.1f" % (e*x,m*x) 106 | 107 | print "\n","check between CPU and GPU" 108 | err,mxe = checkError(h_Cos,g_Cos) 109 | print "Avg and max abs error (cos) = %8.1e %8.1e" % (err,mxe) 110 | print " %8.1f %8.1f" % (err*x,mxe*x) 111 | err,mxe = checkError(h_Sin,g_Sin) 112 | print "Avg and max abs error (sin) = %8.1e %8.1e" % (err,mxe) 113 | print " %8.1f %8.1f" % (err*x,mxe*x) 114 | 115 | if __name__ == "__main__": 116 | import sys 117 | device = cu_CUDA() 118 | device.getSourceModule("simple.cubin") 119 | device.getFunction("TRIG") 120 | device.getFunction("TRIGTex") 121 | 122 | log2n,loops = 15,1 123 | if len(sys.argv) > 1: 124 | log2n = int(sys.argv[1]) 125 | log2n = max(0,min(log2n,25)) 126 | if len(sys.argv) > 2: 127 | loops = int(sys.argv[2]) 128 | vlength = 1 << log2n 129 | main(device,vlength,loops) 130 | cuCtxDetach(device.context) 131 | -------------------------------------------------------------------------------- /oldcode/misc/utilities.py: 
# coding:utf-8: © Arno Pähler, 2007-08
"""Helpers: ctypes<->numpy copies, a wall-clock/TSC timer, a shell runner,
a compression-aware open(), timing output and dictionary inversion."""

from bz2 import BZ2File
from gzip import GzipFile
from collections import defaultdict
from ctypes import memmove
from os.path import splitext
from subprocess import Popen,PIPE
from sys import platform
from time import ctime,time

try:
    from numpy import empty,float32
    def c2n_array(a,m,n=1):
        """Copy 4*m*n bytes from buffer `a` into a new (m,n) float32 array."""
        ## scipy needs array in Fortran order
        aa = empty((m,n),dtype=float32,order='F')
        memmove(aa.ctypes.data,a,4*(m*n))
        return aa
    def n2c_array(a,aa):
        """Copy 4*a.size bytes from numpy array `a` into `aa`; return `aa`."""
        ## aa should already have been allocated!
        memmove(aa,a.ctypes.data,4*a.size)
        return aa
except ImportError:
    def c2n_array(a,m,n=1):
        """Fallback without numpy: return `a` unchanged."""
        return a
    def n2c_array(a,aa):
        """Fallback without numpy: return `a` unchanged (`aa` is ignored)."""
        return a

try:
    from cpuFunctions import ReadTimestampCounter
except ImportError:
    # Only Timer(useTSC=True) needs the helper; everything else in this
    # module stays usable without it.
    ReadTimestampCounter = None

class Timer(object):
    """Stopwatch based on time.time() or, with useTSC, on the timestamp counter.

    Fixes over the previous version: getFrequency()/getTime() referenced the
    undefined names `freqs`/`rfreq`; stop() returned an unbound local when
    the timer was not running; start()/reset() measured the new origin
    relative to the old one; __str__ failed on unlisted platforms.
    """

    fakeClock = 1.e-9/2.8 # denominator: clock in GHz

    def __init__(self,startNow=False,useTSC=False):
        self.useTSC = useTSC
        self.wall = 0. ## later
        self.cpu = 0. ## later
        self.running = False
        self.torg = 0.
        self.time = 0.
        self.counters = []
        self.freqs = None

        if useTSC:
            self.getFrequency(0)

        if startNow:
            self.start()

    def __str__(self):
        pf = {"linux":"Linux","linux2":"Linux","win32":"Windows"}
        # unknown platforms are shown by their sys.platform name
        s =["System: %s" % pf.get(platform,platform)]
        useTSC = self.useTSC
        s.append("UseTSC: %s" % bool(useTSC))
        if useTSC:
            s.append("Cores : %d" % len(self.freqs))
            clock = 1.e-9/self.rtfreq
            s.append("Clock : %.3f GHz" % clock)
        return "\n".join(s)

    def _readFrequencies(self):
        """Return seconds-per-tick for each "cpu MHz" line of /proc/cpuinfo.

        Returns [] when not on Linux or when the file cannot be opened."""
        if not platform.startswith("linux"):
            return []
        try:
            cpuinfo = open("/proc/cpuinfo")
        except IOError:
            return []
        try:
            return [1.e-6/float(line.split(":")[1])
                    for line in cpuinfo if line.startswith("cpu MHz")]
        finally:
            cpuinfo.close()

    def getFrequency(self,core=0):
        """Select the tick length of `core` as self.rtfreq and return it.

        The per-core table is read once; if nothing can be read it falls
        back to the single entry `fakeClock`."""
        if self.freqs is None:
            self.core = core
            self.freqs = self._readFrequencies() or [self.fakeClock]
        self.rtfreq = self.freqs[core]
        return self.rtfreq

    def getTime(self):
        """Return the current clock reading minus the origin self.torg."""
        if self.useTSC:
            if ReadTimestampCounter is None:
                raise RuntimeError(
                    "useTSC requires cpuFunctions.ReadTimestampCounter")
            t = ReadTimestampCounter()*self.rtfreq
        else:
            t = time()
        return t-self.torg

    def start(self):
        """(Re)start: set the origin to now and clear the split list."""
        if self.useTSC and self.freqs is None:
            self.getFrequency()
        self.running = True
        # getTime() subtracts torg, so clear it before sampling the origin;
        # otherwise reset() on a running timer stores now-minus-old-origin.
        self.torg = 0.
        self.torg = self.getTime()
        self.time = 0.
        self.counters = []

    def split(self):
        """Record the elapsed time in self.counters and return it."""
        t = self.time
        if self.running:
            t = self.getTime()
            self.time = t
        self.counters.append(t)
        return t

    def read(self,all = True):
        """Return (origin, elapsed[, splits]) without recording a split."""
        o = self.torg
        t = self.time
        if self.running:
            t = self.getTime()
            self.time = t
        if all:
            return o,t,self.counters
        else:
            return o,t

    def reset(self):
        self.start()

    def stop(self):
        """Stop the timer and return the elapsed time.

        On a timer that is not running the last recorded time is returned."""
        t = self.time
        if self.running:
            self.running = False
            t = self.getTime()
            self.time = t
            self.torg = 0.
        return t

BSZ = 1024

## system execution of 'command' with input 'input' to 'command'
def System(command,input = ""):
    """system execution of 'command' with input 'input' to 'command'
    Returns tuple (status,output to stdout,output to stderr)
    with outputs split on newlines and returned as lists."""
    options = dict(shell = True,bufsize = BSZ,
                   stdin = PIPE,stdout = PIPE,stderr = PIPE,
                   universal_newlines = True)
    if platform != "win32":
        options["close_fds"] = True
    run = Popen(command,**options)
    # communicate() closes stdin and drains stdout and stderr together.
    # Writing to stdin without closing it and then reading stdout to EOF
    # before stderr could block forever.
    if input != "":
        out,err = run.communicate(input+"\n")
    else:
        out,err = run.communicate()
    return run.returncode,out.splitlines(),err.splitlines()

## allow to open ordinary, gzipped, bzipped files
def xOpen(name,mode = "r"):
    """open file depending on extension (.gz,.bz2) so that ordinary
    as well as compressed files can be opened with the same syntax.
    Returns a Python file object."""
    openers = {".gz":GzipFile,".bz2":BZ2File}
    opener = openers.get(splitext(name)[-1],open)
    return opener(name,mode)

## print timing info: t0 = start time, t1 = final time
def printTiming(t0,t1):
    """ Given start time t0 and end time t1 in seconds,
    as returned by time.time(),
    print a nice representation like

    Started : Mon Jan 7 23:18:21 2008
    Finished : Mon Jan 7 23:18:22 2008
    Elapsed : 0.9 (00:00:00.9)

    Elapsed time as given in seconds and as (hh:mm:ss.s)"""

    dt = t1-t0
    dh = int(dt/3600.)
    du = dt-float(3600*dh)
    dm = int(du/60.)
    ds = du-float(60*dm)

    # single-argument print(...) behaves the same as the print statement
    print("\nStarted : %s" % ctime(t0))
    print("Finished : %s" % ctime(t1))
    print("Elapsed : %.1f (%2.2d:%2.2d:%04.1f)\n" % (dt,dh,dm,ds))

## invert a dictionary swapping keys and values
def invertDict(org_dict):
    """Invert a dictionary of (key,val): return a defaultdict that maps
    each val to the list of keys that had it.
    Fails if a val is unhashable, e.g. a list."""
    invdict = defaultdict(list)
    for k in org_dict:
        invdict[org_dict[k]].append(k)
    return invdict
'python-cuda', 13 | 14 | version = '2.1-0.0.1', 15 | 16 | packages = ['cuda', 17 | 'cuda.cu', 18 | 'cuda.cuda', 19 | 'cuda.cublas', 20 | 'cuda.cufft', 21 | 'cuda.sugar', 22 | 'cuda.sugar.memory', 23 | 'cuda.sugar.kernel', 24 | 'cuda.sugar.fft', 25 | 'cuda.sugar.blas', 26 | 'cuda.sugar.query', 27 | 'cuda.utils'], 28 | 29 | package_dir = {'cuda':'cuda'}, 30 | 31 | package_data = {'cuda.sugar.fft': ['*.cu'] }, 32 | 33 | install_requires=[ 34 | "numpy>=1.3.0", 35 | "scipy>=0.7.0", 36 | ], 37 | 38 | 39 | # author='', 40 | # author_email='', 41 | # url='', 42 | # description='Python bindings for CUDA 2.1 with numpy integration', 43 | # long_description = """ """, 44 | # download_url='', 45 | # license='?', 46 | # package_data = {} 47 | 48 | ) 49 | -------------------------------------------------------------------------------- /tests/cu/todo/cu_add.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | #from cuda.cu_defs import * 7 | from cu.cu_defs import * 8 | #from cuda.cu_api import * 9 | from cu.cu_api import * 10 | #from cuda.cu_utils import * 11 | from utils.cu_utils import * 12 | 13 | from cpuFunctions import fixedInit,cpuVADD,checkError 14 | 15 | BLOCK_SIZE = 64 16 | GRID_SIZE = 256 17 | S4 = sizeof(c_float) 18 | checkErrorFlag = False 19 | 20 | def main(device,vlength = 128,loops = 1): 21 | 22 | n2 = vlength ## Vector length 23 | gpuVADD = device.functions["gpuVADD"] 24 | 25 | h_X = (c_float*n2)() 26 | h_Y = (c_float*n2)() 27 | g_Y = (c_float*n2)() 28 | 29 | fixedInit(h_X) 30 | 31 | d_X = getMemory(h_X) 32 | d_Y = getMemory(h_Y) 33 | 34 | cuFuncSetBlockShape(gpuVADD,BLOCK_SIZE,1,1) 35 | cuParamSeti(gpuVADD,0,d_X) 36 | cuParamSeti(gpuVADD,4,d_Y) 37 | cuParamSeti(gpuVADD,8,n2) 38 | cuParamSetSize(gpuVADD,12) 39 | 40 | cuCtxSynchronize() 41 | t0 = time() 42 | for i in range(loops): 43 | 
cuLaunchGrid(gpuVADD,GRID_SIZE,1) 44 | cuCtxSynchronize() 45 | t0 = time()-t0 46 | 47 | flops = (1.e-9*n2)*float(loops) 48 | cuMemcpyDtoH(g_Y,d_Y,n2*S4) 49 | cuCtxSynchronize() 50 | 51 | cuMemFree(d_X) 52 | cuMemFree(d_Y) 53 | 54 | t1 = time() 55 | for i in range(loops): 56 | cpuVADD(h_X,h_Y) 57 | t1 = time()-t1 58 | print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0) 59 | 60 | if checkErrorFlag: 61 | err,mxe = checkError(h_Y,g_Y) 62 | print "Avg and max rel error = %.2e %.2e" % (err,mxe) 63 | 64 | if __name__ == "__main__": 65 | import sys 66 | 67 | device = cu_CUDA() 68 | device.getSourceModule("gpuFunctions.cubin") 69 | device.getFunction("gpuVADD") 70 | 71 | lmin,lmax = 7,24 72 | if len(sys.argv) > 1: 73 | lmin = lmax = int(sys.argv[1]) 74 | loopx = -1 75 | if len(sys.argv) > 2: 76 | loopx = int(sys.argv[2]) 77 | lmax = min(max(0,lmax),24) 78 | lmin = min(max(0,lmin),lmax) 79 | for l in range(lmin,lmax+1): 80 | if l < 10: 81 | loops = 25000 82 | elif l < 17: 83 | loops = 10000 84 | elif l < 21: 85 | loops = 250 86 | else: 87 | loops = 25 88 | vlength = 1 << l 89 | if loopx > 0: 90 | loops = loopx 91 | print "%5d %5d" % (l,loops), 92 | main(device,vlength,loops) 93 | cuCtxDetach(device.context) 94 | -------------------------------------------------------------------------------- /tests/cu/todo/cu_blsc.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | from cu.cu_defs import * 7 | from cu.cu_api import * 8 | from utils.cu_utils import * 9 | 10 | from cpuFunctions import randInit,checkError 11 | 12 | UseVML = False 13 | if UseVML: 14 | from mklMath import cpuBLSC 15 | else: 16 | from cpuFunctions import cpuBLSC 17 | 18 | BLOCK_SIZE = 128 19 | GRID_SIZE = 192 20 | checkErrorFlag = False 21 | 22 | S4 = sizeof(c_float) 23 | 24 | def main(device,vlength = 128,loops = 1): 25 | 26 | n2 = vlength ## Vector length 
27 | 28 | gpuBLSC = device.functions["gpuBLSC"] 29 | 30 | h_S = (c_float*n2)() 31 | h_X = (c_float*n2)() 32 | h_T = (c_float*n2)() 33 | h_C = (c_float*n2)() 34 | h_P = (c_float*n2)() 35 | 36 | 37 | randInit(h_S,5.,30.) 38 | randInit(h_X,1.,100.) 39 | randInit(h_T,.25,10.) 40 | R,V = .03,.3 41 | 42 | d_S = getMemory(h_S) 43 | d_X = getMemory(h_X) 44 | d_T = getMemory(h_T) 45 | d_C = getMemory(h_C) 46 | d_P = getMemory(h_P) 47 | 48 | cuFuncSetBlockShape(gpuBLSC,BLOCK_SIZE,1,1) 49 | cuParamSeti(gpuBLSC, 0,d_C) 50 | cuParamSeti(gpuBLSC, 4,d_P) 51 | cuParamSeti(gpuBLSC, 8,d_S) 52 | cuParamSeti(gpuBLSC,12,d_X) 53 | cuParamSeti(gpuBLSC,16,d_T) 54 | cuParamSetf(gpuBLSC,20,R) 55 | cuParamSetf(gpuBLSC,24,V) 56 | cuParamSeti(gpuBLSC,28,n2) 57 | cuParamSetSize(gpuBLSC,32) 58 | 59 | cuCtxSynchronize() 60 | t0 = time() 61 | for i in range(loops): 62 | cuLaunchGrid(gpuBLSC,GRID_SIZE,1) 63 | cuCtxSynchronize() 64 | t0 = time()-t0 65 | 66 | flops = (2.e-6*n2)*float(loops) 67 | g_C = (c_float*n2)() 68 | g_P = (c_float*n2)() 69 | cuMemcpyDtoH(g_C,d_C,n2*S4) 70 | cuMemcpyDtoH(g_P,d_P,n2*S4) 71 | cuCtxSynchronize() 72 | 73 | cuMemFree(d_S) 74 | cuMemFree(d_X) 75 | cuMemFree(d_T) 76 | cuMemFree(d_C) 77 | cuMemFree(d_P) 78 | 79 | t1 = time() 80 | for i in range(loops): 81 | cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2) 82 | t1 = time()-t1 83 | print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0) 84 | 85 | if checkErrorFlag: 86 | err,mxe = checkError(h_C,g_C) 87 | print "Avg rel error (call) = %.2e" % (err,) 88 | err,mxe = checkError(h_P,g_P) 89 | print "Avg rel error (put) = %.2e" % (err,) 90 | 91 | if __name__ == "__main__": 92 | import sys 93 | 94 | device = cu_CUDA() 95 | device.getSourceModule("gpuFunctions.cubin") 96 | device.getFunction("gpuBLSC") 97 | 98 | lmin,lmax = 7,23 99 | if len(sys.argv) > 1: 100 | lmin = lmax = int(sys.argv[1]) 101 | lmax = min(max(0,lmax),23) 102 | lmin = min(max(0,lmin),lmax) 103 | for l in range(lmin,lmax+1): 104 | if l < 10: 105 | loops = 1000 106 | elif l < 
13: 107 | loops = 500 108 | elif l < 17: 109 | loops = 100 110 | elif l < 21: 111 | loops = 10 112 | else: 113 | loops = 5 114 | loops = 2 115 | vlength = 1 << l 116 | print "%5d %5d" % (l,loops), 117 | main(device,vlength,loops) 118 | cuCtxDetach(device.context) 119 | -------------------------------------------------------------------------------- /tests/cu/todo/cu_gflops.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | from cu.cu_defs import * 7 | from cu.cu_api import * 8 | from utils.cu_utils import * 9 | 10 | from cpuFunctions import cpuGFLOPS 11 | 12 | BLOCK_SIZE_C = 192 13 | ITERATIONS_C = 512 14 | 15 | BLOCK_SIZE_G = 512 16 | GRID_SIZE_G = 512 17 | ITERATIONS_G = 512 18 | 19 | def main(device,loops = 1): 20 | 21 | gpuGFLOPS = device.functions["gpuGFLOPS"] 22 | 23 | cuFuncSetBlockShape(gpuGFLOPS,BLOCK_SIZE_G,1,1) 24 | 25 | t0 = time() 26 | for i in range(loops): 27 | cuCtxSynchronize() 28 | cuLaunchGrid(gpuGFLOPS,GRID_SIZE_G,1) 29 | cuCtxSynchronize() 30 | t0 = time()-t0 31 | 32 | flopsc = 4096.*ITERATIONS_C*BLOCK_SIZE_C 33 | flopsg = 4096.*ITERATIONS_G*BLOCK_SIZE_G*GRID_SIZE_G 34 | 35 | flopsc *= 1.e-9*float(loops) 36 | flopsg *= 1.e-9*float(loops) 37 | 38 | t1 = time() 39 | for i in range(loops): 40 | cpuGFLOPS() 41 | t1 = time()-t1 42 | # peakg = 4.*8.*2.*1.458 # 4MP*8SP/MP*2flops/SP/clock*clock[GHz] (8600GTS) 43 | peakg = 14.*8.*2.*1.512 # 14MP*8SP/MP*2flops/SP/clock*clock[GHz] (9800GT) 44 | print "%8.3f%8.2f%8.3f%8.2f [%.2f]" % ( 45 | t1,flopsc/t1,t0,flopsg/t0,peakg) 46 | 47 | if __name__ == "__main__": 48 | import sys 49 | 50 | device = cu_CUDA() 51 | device.getSourceModule("gpuFunctions.cubin") 52 | device.getFunction("gpuGFLOPS") 53 | 54 | loops = 1 55 | if len(sys.argv) > 1: 56 | loops = int(sys.argv[1]) 57 | print "%5d" % (loops), 58 | main(device,loops) 59 | cuCtxDetach(device.context) 60 | 
-------------------------------------------------------------------------------- /tests/cu/todo/cu_poly.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | from cu.cu_defs import * 7 | from cu.cu_api import * 8 | from utils.cu_utils import * 9 | 10 | from cpuFunctions import cpuPOLY5,cpuPOLY10,cpuPOLY20,cpuPOLY40 11 | 12 | BLOCK_SIZE = 144 13 | GRID_SIZE = 192 14 | ##BLOCK_SIZE = 320 15 | ##GRID_SIZE = 8 16 | checkErrorFlag = False 17 | 18 | S4 = sizeof(c_float) 19 | psize = 5 20 | 21 | def main(device,vlength = 128,loops = 1,m1 = 1): 22 | print "%5d %5d %5d" % (l,loops,m1), 23 | 24 | alfa = c_float(.5) 25 | n2 = vlength ## Vector length 26 | 27 | mp = 1 << (m1-1) 28 | print "%5d" % (mp*psize), 29 | fcn = "gpuPOLY%d"%(mp*psize) 30 | gpuPOLY = device.functions[fcn] 31 | h_X = (c_float*n2)() 32 | h_Y = (c_float*n2)() 33 | g_Y = (c_float*n2)() 34 | 35 | vectorInit(h_X) 36 | 37 | d_X = getMemory(h_X) 38 | d_Y = getMemory(h_Y) 39 | 40 | cuFuncSetBlockShape(gpuPOLY,BLOCK_SIZE,1,1) 41 | cuParamSeti(gpuPOLY,0,d_X) 42 | cuParamSeti(gpuPOLY,4,d_Y) 43 | cuParamSeti(gpuPOLY,8,n2) 44 | cuParamSetSize(gpuPOLY,12) 45 | 46 | cuCtxSynchronize() 47 | cuLaunchGrid(gpuPOLY,GRID_SIZE,1) 48 | t0 = time() 49 | for i in range(loops): 50 | cuLaunchGrid(gpuPOLY,GRID_SIZE,1) 51 | cuCtxSynchronize() 52 | t0 = time()-t0 53 | 54 | flops = (2.e-9*m1*n2*(psize-1))*float(loops) 55 | cuMemcpyDtoH(g_Y,d_Y,n2*S4) 56 | cuCtxSynchronize() 57 | 58 | cuMemFree(d_X) 59 | cuMemFree(d_Y) 60 | 61 | cpuPOLY = eval("cpuPOLY%d" % (mp*psize)) 62 | t1 = time() 63 | for i in range(loops): 64 | cpuPOLY(h_X,h_Y) 65 | t1 = time()-t1 66 | print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0) 67 | 68 | if checkErrorFlag: 69 | err,mxe = checkError(h_Y,g_Y) 70 | print "Avg and max rel error = %.2e %.2e" % (err,mxe) 71 | 72 | if __name__ == "__main__": 73 | import sys 
74 | 75 | lmin,lmax = 7,23 76 | if len(sys.argv) > 1: 77 | lmin = lmax = int(sys.argv[1]) 78 | loopx = -1 79 | if len(sys.argv) > 2: 80 | loopx = int(sys.argv[2]) 81 | m1 = 4 82 | if len(sys.argv) > 3: 83 | m1 = min(4,int(sys.argv[3])) 84 | lmax = min(max(0,lmax),23) 85 | lmin = min(max(0,lmin),lmax) 86 | 87 | mp = 1 << (m1-1) 88 | device = cu_CUDA() 89 | device.getSourceModule("gpuFunctions.cubin") 90 | fcn = "gpuPOLY%d"%(mp*psize) 91 | device.getFunction(fcn) 92 | 93 | for l in range(lmin,lmax+1): 94 | if l < 10: 95 | loops = 10000/m1 96 | elif l < 13: 97 | loops = 5000/m1 98 | elif l < 17: 99 | loops = 500/m1 100 | elif l < 21: 101 | loops = 250/m1 102 | else: 103 | loops = 100/m1 104 | vlength = 1 << l 105 | if loopx > 0: 106 | loops = loopx 107 | main(device,vlength,loops,m1) 108 | cuCtxDetach(device.context) 109 | -------------------------------------------------------------------------------- /tests/cu/todo/cu_saxpy.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | from cu.cu_defs import * 7 | from cu.cu_api import * 8 | from utils.cu_utils import * 9 | 10 | from cpuFunctions import fixedInit,cpuSAXPY,checkError 11 | 12 | BLOCK_SIZE = 64 13 | GRID_SIZE = 256 14 | S4 = sizeof(c_float) 15 | checkErrorFlag = False 16 | 17 | def main(device,vlength = 128,loops = 1): 18 | 19 | alfa = c_float(.5) 20 | n2 = vlength ## Vector length 21 | gpuSAXPY = device.functions["gpuSAXPY"] 22 | 23 | h_X = (c_float*n2)() 24 | h_Y = (c_float*n2)() 25 | g_Y = (c_float*n2)() 26 | 27 | fixedInit(h_X) 28 | 29 | d_X = getMemory(h_X) 30 | d_Y = getMemory(h_Y) 31 | 32 | cuFuncSetBlockShape(gpuSAXPY,BLOCK_SIZE,1,1) 33 | cuParamSetf(gpuSAXPY,0,alfa) 34 | cuParamSeti(gpuSAXPY,4,d_X) 35 | cuParamSeti(gpuSAXPY,8,d_Y) 36 | cuParamSeti(gpuSAXPY,12,n2) 37 | cuParamSetSize(gpuSAXPY,16) 38 | 39 | cuCtxSynchronize() 40 | t0 = time() 41 | for 
i in range(loops): 42 | cuLaunchGrid(gpuSAXPY,GRID_SIZE,1) 43 | cuCtxSynchronize() 44 | t0 = time()-t0 45 | 46 | flops = (2.e-9*n2)*float(loops) 47 | cuMemcpyDtoH(g_Y,d_Y,n2*S4) 48 | cuCtxSynchronize() 49 | 50 | cuMemFree(d_X) 51 | cuMemFree(d_Y) 52 | 53 | t1 = time() 54 | for i in range(loops): 55 | cpuSAXPY(alfa,h_X,h_Y) 56 | t1 = time()-t1 57 | print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0) 58 | 59 | if checkErrorFlag: 60 | err,mxe = checkError(h_Y,g_Y) 61 | print "Avg and max rel error = %.2e %.2e" % (err,mxe) 62 | 63 | if __name__ == "__main__": 64 | import sys 65 | 66 | device = cu_CUDA() 67 | device.getSourceModule("gpuFunctions.cubin") 68 | device.getFunction("gpuSAXPY") 69 | 70 | lmin,lmax = 7,24 71 | if len(sys.argv) > 1: 72 | lmin = lmax = int(sys.argv[1]) 73 | loopx = -1 74 | if len(sys.argv) > 2: 75 | loopx = int(sys.argv[2]) 76 | lmax = min(max(0,lmax),24) 77 | lmin = min(max(0,lmin),lmax) 78 | for l in range(lmin,lmax+1): 79 | if l < 10: 80 | loops = 25000 81 | elif l < 17: 82 | loops = 10000 83 | elif l < 21: 84 | loops = 250 85 | else: 86 | loops = 25 87 | vlength = 1 << l 88 | if loopx > 0: 89 | loops = loopx 90 | print "%5d %5d" % (l,loops), 91 | main(device,vlength,loops) 92 | cuCtxDetach(device.context) 93 | -------------------------------------------------------------------------------- /tests/cu/todo/cu_sgemm.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | from cu.cu_defs import * 7 | from cu.cu_api import * 8 | from utils.cu_utils import * 9 | 10 | from cpuFunctions import arrayInit,cpuSGEMM,checkError 11 | from ctypes_array import * 12 | 13 | useSciPy = True 14 | if useSciPy: 15 | from scipy.lib.blas.fblas import sgemm as _sgemm 16 | # C : A*B (on the GPU) 17 | # F : (A*B).T = B.T * A.T (scipy) 18 | def sgemm(z,x,y,m,n,k): 19 | nx = convert(x,(m,k),"C").T 20 | ny = 
convert(y,(k,n),"C").T 21 | nz = _sgemm(1.,ny,nx) 22 | convert(nz,out=z) 23 | return z 24 | else: 25 | # C : A*B (on the CPU) (in C) 26 | sgemm = cpuSGEMM 27 | 28 | BLOCK_SIZE = 1 << 4 29 | S4 = sizeof(c_float) 30 | 31 | def main(N = 1024,L = 100): 32 | M = N 33 | K = N >> 1 34 | N = N << 1 35 | flops = (2.e-9*M*N)*float(K*L) 36 | print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops) 37 | na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0. 38 | 39 | t0 = time() 40 | device = cu_CUDA() 41 | device.getSourceModule("gpuFunctions.cubin") 42 | gpuSGEMM = device.getFunction("gpuSGEMM") 43 | 44 | sizeA = M*K 45 | sizeB = K*N 46 | sizeC = M*N 47 | 48 | h_A = (c_float*sizeA)() 49 | h_B = (c_float*sizeB)() 50 | 51 | arrayInit(h_A) 52 | arrayInit(h_B) 53 | 54 | d_A = getMemory(h_A) 55 | d_B = getMemory(h_B) 56 | d_C = getMemory(sizeC) 57 | 58 | cuFuncSetBlockShape(gpuSGEMM,BLOCK_SIZE,BLOCK_SIZE,1) 59 | cuFuncSetSharedSize(gpuSGEMM,2*BLOCK_SIZE*BLOCK_SIZE*S4) 60 | cuParamSeti(gpuSGEMM,0,d_C) 61 | cuParamSeti(gpuSGEMM,4,d_A) 62 | cuParamSeti(gpuSGEMM,8,d_B) 63 | cuParamSeti(gpuSGEMM,12,K) 64 | cuParamSeti(gpuSGEMM,16,N) 65 | cuParamSetSize(gpuSGEMM,20) 66 | tt = t0 = time()-t0 67 | print "Overhead driver API: %.3f sec\n" % t0 68 | 69 | t0 = time() 70 | cuCtxSynchronize() 71 | for i in range(L): 72 | cuLaunchGrid(gpuSGEMM,N/BLOCK_SIZE,M/BLOCK_SIZE) 73 | cuCtxSynchronize() 74 | t0 = time()-t0 75 | tt += t0 76 | 77 | h_C = (c_float*sizeC)() 78 | cuMemcpyDtoH(h_C,d_C,S4*sizeC) 79 | cuCtxSynchronize() 80 | 81 | cuMemFree(d_A) 82 | cuMemFree(d_B) 83 | cuMemFree(d_C) 84 | cuCtxDetach(device.context) 85 | 86 | print "Processing time: %.3g (%.3g) sec" % (t0,tt) 87 | print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt) 88 | 89 | ref = (c_float*sizeC)() 90 | 91 | t1 = time() 92 | for i in range(L): 93 | sgemm(ref,h_A,h_B,M,N,K) 94 | t1 = time()-t1 95 | print "\nProcessing time: %.3g sec" % t1 96 | print "Gigaflops CPU : %.2f" % (flops/t1) 97 | print "Speedup GPU/CPU: %.2f" % 
(t1/t0) 98 | 99 | err,mxe = checkError(ref,h_C) 100 | print "\nAvg and max rel error = %.2e %.2e" % (err,mxe) 101 | 102 | if __name__ == "__main__": 103 | import sys 104 | 105 | M, L = 1024, 100 106 | if len(sys.argv) > 1: 107 | M = int(sys.argv[1]) 108 | M = (M >> 5) << 5 # multiple of (2*BLOCK_SIZE) 109 | if len(sys.argv) > 2: 110 | L = int(sys.argv[2]) 111 | 112 | print "+-----------------------+" 113 | print "| Matrix Multiplication |" 114 | print "| using CUDA driver API |" 115 | print "+-----------------------+\n" 116 | main(M,L) 117 | -------------------------------------------------------------------------------- /tests/cu/todo/cu_streams.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | 5 | from cu.cu_defs import * 6 | from cu.cu_api import * 7 | from utils.cu_utils import * 8 | 9 | from ctypes_array import * 10 | from numpy import all,int32,zeros 11 | 12 | MB = 1024*1024 13 | SI = sizeof(c_int) 14 | 15 | def check_results(a,n,c): 16 | u = (c_int*n).from_address(a.value) 17 | a = convert(u,(n,)) 18 | c = c.value 19 | return all(a==c) 20 | 21 | def main(device): 22 | nstreams = 8 23 | nreps = 10 24 | n = 16*MB 25 | nbytes = n*SI 26 | 27 | count = c_int() 28 | cuDeviceGetCount(byref(count)) 29 | if count == 0: 30 | print "no CUDA capable device found" 31 | return 32 | 33 | major = c_int() 34 | minor = c_int() 35 | cuDeviceComputeCapability(byref(major),byref(minor),device.device) 36 | if major.value == 1 and minor.value < 1: 37 | print "%s does not support streams" % props.name 38 | return 39 | 40 | init_array = device.functions["init_array"] 41 | u = zeros((n,),dtype=int32)+5 42 | x = convert(u) 43 | c = c_int(x[0]) 44 | a = c_void_p() 45 | cuMemAllocHost(byref(a),nbytes) 46 | 47 | d_a = getMemory(n) 48 | d_c = getMemory(x) 49 | 50 | streams = (CUstream*nstreams)() 51 | for i in range(nstreams): 52 | stream = CUstream() 53 | 
cuStreamCreate(byref(stream),0) 54 | streams[i] = stream 55 | 56 | ev_start = CUevent() 57 | ev_stop = CUevent() 58 | cuEventCreate(byref(ev_start),0) 59 | cuEventCreate(byref(ev_stop),0) 60 | 61 | cuEventRecord(ev_start,streams[0]) 62 | cuMemcpyDtoHAsync(a,d_a,nbytes,streams[0]) 63 | cuEventRecord(ev_stop,streams[0]) 64 | cuEventSynchronize(ev_stop) 65 | t_copy = c_float() 66 | cuEventElapsedTime(byref(t_copy),ev_start,ev_stop) 67 | t_copy = t_copy.value 68 | 69 | cuFuncSetBlockShape(init_array,512,1,1) 70 | cuParamSeti(init_array,0,d_a) 71 | cuParamSeti(init_array,4,d_c) 72 | cuParamSetSize(init_array,8) 73 | 74 | cuEventRecord(ev_start,streams[0]) 75 | cuLaunchGrid(init_array,n/512,1) 76 | cuEventRecord(ev_stop,streams[0]) 77 | cuEventSynchronize(ev_stop) 78 | t_kernel = c_float() 79 | cuEventElapsedTime(byref(t_kernel),ev_start,ev_stop) 80 | t_kernel = t_kernel.value 81 | 82 | cuFuncSetBlockShape(init_array,512,1,1) 83 | cuParamSeti(init_array,0,d_a) 84 | cuParamSeti(init_array,4,d_c) 85 | cuParamSetSize(init_array,8) 86 | 87 | cuEventRecord(ev_start,streams[0]) 88 | for i in range(nreps): 89 | cuLaunchGrid(init_array,n/512,1) 90 | cuMemcpyDtoH(a,d_a,nbytes) 91 | cuEventRecord(ev_stop,streams[0]) 92 | cuEventSynchronize(ev_stop) 93 | elapsed0 = c_float() 94 | cuEventElapsedTime(byref(elapsed0),ev_start,ev_stop) 95 | elapsed0 = elapsed0.value 96 | 97 | memset(a,255,nbytes) 98 | cuMemsetD32(d_a,0,n) 99 | cuEventRecord(ev_start,streams[0]) 100 | a_0 = a.value 101 | off = n*SI/nstreams 102 | for k in range(nreps): 103 | for i in range(nstreams): 104 | d_ai = d_a+i*n*SI/nstreams 105 | cuParamSeti(init_array,0,d_ai) 106 | cuLaunchGridAsync(init_array,n/(nstreams*512),1,streams[i]) 107 | for i in range(nstreams): 108 | ai = a_0+i*off 109 | di = d_c+i*off 110 | cuMemcpyDtoHAsync(ai,di,nbytes/nstreams,streams[i]) 111 | cuEventRecord(ev_stop,streams[0]) 112 | cuEventSynchronize(ev_stop) 113 | elapsed1 = c_float() 114 | cuEventElapsedTime(byref(elapsed1),ev_start,ev_stop) 
115 | elapsed1 = elapsed1.value 116 | 117 | passed = check_results(a,n,c) 118 | 119 | for i in range(nstreams): 120 | cuStreamDestroy(streams[i]) 121 | cuEventDestroy(ev_start) 122 | cuEventDestroy(ev_stop) 123 | 124 | cuMemFree(d_a) 125 | cuMemFree(d_c) 126 | cuMemFreeHost(a) 127 | 128 | print "memcopy:\t%.2f" % t_copy 129 | print "kernel:\t\t%.2f" % t_kernel 130 | print "non-streamed:\t%.2f (%.2f expected)" % ( 131 | elapsed0/nreps,t_kernel+t_copy) 132 | print "%d streams:\t%.2f (%.2f expected)" % ( 133 | nstreams,elapsed1/nreps,t_kernel+t_copy/nstreams) 134 | 135 | print "-------------------------------" 136 | if passed: 137 | print "Test PASSED" 138 | else: 139 | print "Test FAILED" 140 | 141 | if __name__ == "__main__": 142 | device = cu_CUDA() 143 | device.getSourceModule("gpuFunctions.cubin") 144 | device.getFunction("init_array") 145 | main(device) 146 | cuCtxDetach(device.context) 147 | -------------------------------------------------------------------------------- /tests/cu/todo/cu_trig.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | from ctypes import * 4 | from time import time 5 | 6 | from cu.cu_defs import * 7 | from cu.cu_api import * 8 | from utils.cu_utils import * 9 | 10 | from cpuFunctions import vectorInit,checkError 11 | 12 | UseVML = True 13 | if UseVML: 14 | from mklMath import cpuTRIG 15 | else: 16 | from cpuFunctions import cpuTRIG 17 | 18 | BLOCK_SIZE = 128 19 | GRID_SIZE = 192 20 | checkErrorFlag = False 21 | 22 | S4 = sizeof(c_float) 23 | 24 | def main(device,vlength = 128,loops = 1): 25 | 26 | n2 = vlength ## Vector length 27 | gpuTRIG = device.functions["gpuTRIG"] 28 | 29 | h_X = (c_float*n2)() 30 | h_Y = (c_float*n2)() 31 | h_Z = (c_float*n2)() 32 | 33 | vectorInit(h_X) 34 | 35 | d_X = getMemory(h_X) 36 | d_Y = getMemory(h_Y) 37 | d_Z = getMemory(h_Z) 38 | 39 | cuFuncSetBlockShape(gpuTRIG,BLOCK_SIZE,1,1) 40 | 
cuParamSeti(gpuTRIG,0,d_Y) 41 | cuParamSeti(gpuTRIG,4,d_Z) 42 | cuParamSeti(gpuTRIG,8,d_X) 43 | cuParamSeti(gpuTRIG,12,n2) 44 | cuParamSetSize(gpuTRIG,16) 45 | 46 | cuCtxSynchronize() 47 | t0 = time() 48 | for i in range(loops): 49 | cuLaunchGrid(gpuTRIG,GRID_SIZE,1) 50 | cuCtxSynchronize() 51 | t0 = time()-t0 52 | 53 | flops = (8.e-9*n2)*float(loops) 54 | g_Y = (c_float*n2)() 55 | g_Z = (c_float*n2)() 56 | cuMemcpyDtoH(g_Y,d_Y,S4*n2) 57 | cuMemcpyDtoH(g_Z,d_Z,S4*n2) 58 | cuCtxSynchronize() 59 | 60 | cuMemFree(d_X) 61 | cuMemFree(d_Y) 62 | cuMemFree(d_Z) 63 | 64 | t1 = time() 65 | for i in range(loops): 66 | cpuTRIG(h_Y,h_Z,h_X) 67 | t1 = time()-t1 68 | print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0) 69 | 70 | if checkErrorFlag: 71 | err,mxe = checkError(h_Y,g_Y,n2) 72 | print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe) 73 | err,mxe = checkError(h_Z,g_Z,n2) 74 | print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe) 75 | 76 | if __name__ == "__main__": 77 | import sys 78 | 79 | device = cu_CUDA() 80 | device.getSourceModule("gpuFunctions.cubin") 81 | device.getFunction("gpuTRIG") 82 | 83 | lmin,lmax = 7,23 84 | if len(sys.argv) > 1: 85 | lmin = lmax = int(sys.argv[1]) 86 | lmax = min(max(0,lmax),23) 87 | lmin = min(max(0,lmin),lmax) 88 | for l in range(lmin,lmax+1): 89 | if l < 10: 90 | loops = 10000 91 | elif l < 13: 92 | loops = 2000 93 | elif l < 17: 94 | loops = 250 95 | elif l < 21: 96 | loops = 100 97 | else: 98 | loops = 50 99 | vlength = 1 << l 100 | print "%5d %5d" % (l,loops), 101 | main(device,vlength,loops) 102 | cuCtxDetach(device.context) 103 | -------------------------------------------------------------------------------- /tests/cuda/todo/cuda_GL.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | # coding:utf-8: © Arno Pähler, 2007-08 3 | # GLUT version 4 | from ctypes import * 5 | 6 | from ogl.gl import * 7 | from OpenGL.GLUT import * 8 | 9 | from cuda.cuda_defs 
def runCuda(vbo):
    """Map the GL buffer into CUDA, run the kernel on it, and unmap it.

    vbo -- GL buffer object previously registered with CUDA

    The kernel is launched with 8x8 blocks over a grid covering
    mesh_width x mesh_height, using the module-level `kernel` and `anim`.
    The process exits if mapping, launch configuration or unmapping
    reports a non-zero status.
    """
    vptr = c_void_p()
    # The map and configure status codes used to be overwritten without
    # being looked at, so a failed map still launched the kernel.
    # NOTE(review): assumes these wrappers return an integer status like
    # cudaGLUnmapBufferObject does below -- confirm.
    if cudaGLMapBufferObject(byref(vptr),vbo) != 0:
        exit()

    block = dim3(8,8,1)
    grid = dim3(mesh_width/block.x,mesh_height/block.y,1)
    configured = cudaConfigureCall(grid,block,0,0) == 0
    if configured:
        kernel(vptr,mesh_width,mesh_height,anim)

    # unmap even when the launch was skipped: the buffer is mapped here
    unmapped = cudaGLUnmapBufferObject(vbo) == 0
    if not (configured and unmapped):
        exit()
def deleteVBO():
    """Release the module-level vertex buffer on both the CUDA and GL side.

    Unregisters `vbo` from CUDA, deletes the GL buffer object and resets
    the module-level `vbo` to 0.
    """
    global vbo

    # Release CUDA's registration while the GL buffer still exists; the
    # original deleted the buffer first and unregistered afterwards.
    cudaGLUnregisterBufferObject(vbo)

    # The bind target must be a buffer target enum; the original passed
    # the literal 1 here.
    glBindBuffer(GL_ARRAY_BUFFER,vbo)
    # pass the name by reference, as glGenBuffers(1,byref(vbo)) does
    glDeleteBuffers(1,byref(vbo))

    vbo = 0