├── .gitignore
├── .pylintrc
├── LICENSE
├── README.rst
├── cuda
    ├── __init__.py
    ├── cu
    │   ├── __init__.py
    │   └── cudadrv.py
    ├── cublas
    │   ├── __init__.py
    │   └── cublas.py
    ├── cuda
    │   ├── __init__.py
    │   └── cudart.py
    ├── cufft
    │   ├── __init__.py
    │   └── cufft.py
    ├── sugar
    │   ├── __init__.py
    │   ├── blas
    │   │   ├── __init__.py
    │   │   ├── saxpy.py
    │   │   ├── sdot.py
    │   │   └── sgemm.py
    │   ├── fft
    │   │   ├── __init__.py
    │   │   ├── conv_gold.py
    │   │   ├── fft.py
    │   │   ├── fftconvolve.py
    │   │   └── fftconvolve2d_kernel.cu
    │   ├── kernel
    │   │   ├── __init__.py
    │   │   ├── compiler.py
    │   │   ├── kernelfactorydrv.py
    │   │   ├── kernelfactoryrt.py
    │   │   └── tests
    │   │   │   ├── matrix_mul.py
    │   │   │   └── matrix_mul_kernel.cu
    │   ├── memory
    │   │   ├── __init__.py
    │   │   └── linear.py
    │   └── query
    │   │   ├── __init__.py
    │   │   ├── bandwidth.py
    │   │   ├── cu_utils.py
    │   │   └── cuda_utils.py
    └── utils
    │   ├── __init__.py
    │   ├── decorator.py
    │   ├── libutils.py
    │   └── logger.py
├── ez_setup.py
├── mkdist
├── oldcode
    ├── cu
    │   ├── __init__.py
    │   ├── cu_api.py
    │   └── cu_defs.py
    ├── cublas
    │   ├── __init__.py
    │   ├── cublas_api.py
    │   └── cublas_defs.py
    ├── cuda
    │   ├── __init__.py
    │   ├── cuda_api.py
    │   └── cuda_defs.py
    ├── cufft
    │   ├── __init__.py
    │   ├── cufft_api.py
    │   └── cufft_defs.py
    ├── examples
    │   ├── TODO
    │   ├── __init__.py
    │   └── bw_test.py
    └── misc
    │   ├── README
    │   ├── cf
    │   ├── cmpG
    │   ├── compileC
    │   ├── compileCG
    │   ├── compileCX
    │   ├── compileG
    │   ├── compileGso
    │   ├── cpuFunctions.c
    │   ├── cpuFunctions.py
    │   ├── ctypes_array.py
    │   ├── ctypes_array_test.py
    │   ├── ctypes_extra.py
    │   ├── devinfo_cr.py
    │   ├── devinfo_cu.py
    │   ├── gpuFunctions.cu
    │   ├── gpuFunctions.cubin
    │   ├── gpuFunctions.linkinfo
    │   ├── gpuFunctions.ptx
    │   ├── gpuFunctions.py
    │   ├── kernelGL.cu
    │   ├── matadd.txt
    │   ├── mklMath.py
    │   ├── sgemmN
    │   ├── sgemmN.cu
    │   ├── sgemmN.log
    │   ├── simple.cu
    │   ├── simple.cubin
    │   ├── simple.ptx
    │   ├── simple.py
    │   ├── utilities.py
    │   └── vector.c
├── setup.py
├── tests
    ├── cu
    │   └── todo
    │   │   ├── cu_add.py
    │   │   ├── cu_blsc.py
    │   │   ├── cu_gflops.py
    │   │   ├── cu_poly.py
    │   │   ├── cu_saxpy.py
    │   │   ├── cu_sgemm.py
    │   │   ├── cu_streams.py
    │   │   └── cu_trig.py
    ├── cuda
    │   └── todo
    │   │   ├── cuda_GL.py
    │   │   ├── cuda_GLimg.png
    │   │   ├── cuda_QtGL.py
    │   │   ├── cuda_add.py
    │   │   ├── cuda_blsc.py
    │   │   ├── cuda_gflops.py
    │   │   ├── cuda_poly.py
    │   │   ├── cuda_saxpy.py
    │   │   ├── cuda_sgemm.py
    │   │   ├── cuda_streams.py
    │   │   └── cuda_trig.py
    ├── cufft
    │   ├── cufft_fft.py
    │   ├── fftlab.py
    │   └── todo
    │   │   ├── bfft.py
    │   │   ├── cu_fft.py
    │   │   ├── cuda_fft.py
    │   │   ├── dfft.py
    │   │   ├── gfft_cu.py
    │   │   ├── gfft_cuda.py
    │   │   ├── manyfft.py
    │   │   ├── sfft.py
    │   │   └── xfft.py
    └── test_cublas.py
└── xml
    ├── createbindings.py
    ├── cublas.py
    ├── cublas.xml
    ├── cudadrv.py
    ├── cudadrv.xml
    ├── cudart.py
    ├── cudart.xml
    ├── cufft.py
    ├── cufft.xml
    ├── generate-xml.sh
    ├── generate-xml.sh.orig
    ├── generate-xml_linux.sh
    ├── generate-xml_macosx.sh
    └── my_CUDA2100_vector_types.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.py~
 3 | junk
 4 | build
 5 | *egg*
 6 | *testbed*
 7 | *.so
 8 | *.linkinfo
 9 | *.swp
10 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Description
 2 | ===========
 3 | 
 4 | Python bindings for CUDA 2.1 with numpy integration
 5 | 
 6 | Authors
 7 | -------
 8 | 
 9 | Justin Riley (jtriley@mit.edu)
10 | Nicolas Pinto (pinto@mit.edu)
11 | 
12 | Mailing List
13 | ============
14 | 
15 | http://groups.google.com/group/python-cuda
16 | 
17 | Bug Tracker
18 | ===========
19 | 
20 | http://npinto.lighthouseapp.com/projects/24960-python-cuda
21 | 
22 | License
23 | =======
24 | 
25 | see the LICENSE file
26 | 
27 | 


--------------------------------------------------------------------------------
/cuda/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import cuda
 4 | import cu
 5 | 
 6 | import cublas
 7 | import cufft
 8 | 
 9 | import sugar
10 | import utils
11 | 
12 | import platform
13 | 
14 | # On Windows, append <CUDA InstallDir>\bin (looked up in the registry) to %PATH%
15 | if platform.system() == "Windows":
16 |     import _winreg as wreg
17 |     reg = wreg.ConnectRegistry(None, wreg.HKEY_LOCAL_MACHINE)
18 |     key = wreg.OpenKey(reg, r"SOFTWARE\NVIDIA Corporation\Installed Products\NVIDIA CUDA")
19 |     import os
20 |     cuda_bin = os.path.join(wreg.QueryValueEx(key, "InstallDir")[0],"bin")  # [0] is the value, [1] its registry type
21 |     os.environ['PATH'] += os.path.pathsep + cuda_bin
22 | 
23 | import atexit
24 | atexit.register(cuda.cudaThreadExit)  # call cudaThreadExit when the interpreter exits
25 | 
26 | def debug():  # package-level shortcut that delegates to utils.enable_debug()
27 |     utils.enable_debug()
28 | 


--------------------------------------------------------------------------------
/cuda/cu/__init__.py:
--------------------------------------------------------------------------------
1 | from cudadrv import *
2 | 


--------------------------------------------------------------------------------
/cuda/cublas/__init__.py:
--------------------------------------------------------------------------------
1 | from cublas import *
2 | 


--------------------------------------------------------------------------------
/cuda/cuda/__init__.py:
--------------------------------------------------------------------------------
1 | from cudart import *
2 | 


--------------------------------------------------------------------------------
/cuda/cufft/__init__.py:
--------------------------------------------------------------------------------
1 | from cufft import *
2 | CUFFT_FORWARD = -1  # direction flag passed to cufftExec* for a forward transform
3 | CUFFT_INVERSE = 1  # direction flag passed to cufftExec* for an inverse transform
4 | 


--------------------------------------------------------------------------------
/cuda/sugar/__init__.py:
--------------------------------------------------------------------------------
1 | import memory
2 | import kernel
3 | import fft 
4 | import blas
5 | import query
6 | 


--------------------------------------------------------------------------------
/cuda/sugar/blas/__init__.py:
--------------------------------------------------------------------------------
1 | from saxpy import *
2 | from sdot import *
3 | from sgemm import *
4 | 


--------------------------------------------------------------------------------
/cuda/sugar/blas/saxpy.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | from time import time
 3 | from ctypes import cast,c_float, POINTER
 4 | 
 5 | from numpy import empty_like,dot
 6 | from numpy.random import randn
 7 | 
 8 | from cuda.cublas import *
 9 | from cuda.cuda import cudaThreadSynchronize
10 | from cuda.sugar.memory import Linear
11 | 
12 | def embed_ipython():
13 |     from IPython.Shell import IPShellEmbed
14 |     ipshell = IPShellEmbed(user_ns = dict())
15 |     ipshell()
16 | 
17 | def cpu_saxpy(a,b, alpha):  # host-side reference for saxpy: returns alpha*a + b
18 |     return (alpha*a+b)
19 | 
20 | def gpu_saxpy(a,b,alpha):
21 |     # init cublas lib
22 |     cublasInit()
23 | 
24 |     # allocate device vectors from host
25 |     d_X = Linear(a.shape).from_numpy(a)
26 |     d_Y = Linear(b.shape).from_numpy(b)
27 | 
28 |     # execute cublasSaxpy and sync threads
29 |     cublasSaxpy(a.shape[1],alpha,d_X.ref,1,d_Y.ref,1)
30 |     cudaThreadSynchronize()
31 | 
32 |     return d_Y.to_numpy()
33 | 
34 | def test():  # demo: print random inputs followed by the CPU and GPU saxpy results
35 |     vlength = 8192  # number of elements per vector
36 |     alpha = 1  # scalar multiplier applied to h_X
37 | 
38 |     # allocate host vectors: random (1, vlength) arrays converted to float32
39 |     h_X = randn(1,vlength).astype('float32')
40 |     h_Y = randn(1,vlength).astype('float32')
41 | 
42 |     print "-"*80
43 |     print 'h_X:'
44 |     print h_X
45 |     print "-"*80
46 | 
47 |     print "-"*80
48 |     print 'h_Y:'
49 |     print h_Y
50 |     print "-"*80
51 | 
52 |     print "-"*80
53 |     print 'CPU RESULT:'
54 |     print cpu_saxpy(h_X,h_Y,alpha)  # host reference
55 |     print "-"*80
56 | 
57 |     print "-"*80
58 |     print 'GPU RESULT:'
59 |     print gpu_saxpy(h_X, h_Y, alpha)  # same operation through cublasSaxpy
60 |     print "-"*80
61 | 
62 | if __name__ == "__main__":
63 |     test()
64 | 


--------------------------------------------------------------------------------
/cuda/sugar/blas/sdot.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | 
 3 | from ctypes import c_float
 4 | from time import time
 5 | 
 6 | import cuda.cublas as cublas
 7 | import cuda.cuda as cuda
 8 | from cuda.sugar.memory import Linear
 9 | 
10 | import numpy
11 | from numpy.random import randn
12 | 
13 | def gpu_sdot(a,b):
14 |     assert a.size == b.size
15 |     assert a.shape[0] == b.shape[1]
16 |     cublas.cublasInit()
17 |     cublas.cublasFree(0)
18 |     d_X = Linear(a.shape).from_numpy(a)
19 |     d_Y = Linear(b.shape).from_numpy(b)
20 |     gpu_result = cublas.cublasSdot(a.shape[1], d_X.ref, 1, d_Y.ref, 1)
21 |     cuda.cudaThreadSynchronize()
22 |     cublas.cublasShutdown()
23 |     return gpu_result
24 | 
25 | def test():  # demo: print two random vectors, numpy's dot product and the cublasSdot result
26 |     vlength = 1024
27 | 
28 |     n2 = vlength*vlength  # actual vector length used below
29 | 
30 |     h_X = randn(1,n2).astype('float32')
31 |     h_Y = randn(1,n2).astype('float32')
32 | 
33 |     print "-"*80
34 |     print "h_X:"
35 |     print h_X
36 |     print "-"*80
37 | 
38 |     print "-"*80
39 |     print "h_Y:"
40 |     print h_Y
41 |     print "-"*80
42 | 
43 |     print "-"*80
44 |     print numpy.dot(h_X,h_Y.transpose())[0][0]  # CPU reference; (1, n2) x (n2, 1) gives a 1x1 array
45 |     print "-"*80
46 | 
47 |     print "-"*80
48 |     print "cublasSdot(d_X,d_Y):"
49 |     print gpu_sdot(h_X, h_Y.transpose())  # b is passed as (n2, 1) to satisfy gpu_sdot's shape assert
50 |     print "-"*80
51 | 
52 | if __name__ == "__main__":
53 |     test()
54 | 


--------------------------------------------------------------------------------
/cuda/sugar/blas/sgemm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from cuda.cuda import cudaThreadSynchronize
 3 | from cuda.cublas import cublasInit, cublasShutdown, cublasSgemm
 4 | from cuda.sugar.memory import Linear
 5 | 
 6 | import numpy 
 7 | from numpy.random import randn
 8 | 
 9 | def gpu_sgemm(a,b, alpha=1):
10 |     """ Single Precision Matrix Multiplication on GPU, expects two, two-dimensional numpy arrays as input. Arrays must be such that a.shape[1] == b.shape[0]. Optionally specify alpha for scalar multiplication"""
11 |     # init cublas
12 |     cublasInit()
13 | 
14 |     assert a.shape[1] == b.shape[0]
15 | 
16 |     c_shape = (a.shape[0], b.shape[1])
17 |     # allocate device matrices from host
18 |     dA = Linear(a.shape, order='F').from_numpy(a)
19 |     dB = Linear(b.shape, order='F').from_numpy(b)
20 |     dC = Linear(c_shape, order='F')
21 | 
22 |     # transpose a/b ? t = yes, n = no
23 |     transa = 'n'
24 |     transb = 'n'
25 | 
26 |     # compute with CUBLAS
27 |     cublasSgemm( transa, transb, a.shape[0], b.shape[1], a.shape[1], alpha, dA.ref, a.shape[0], dB.ref, b.shape[0], 0, dC.ref, a.shape[0] )
28 |     cudaThreadSynchronize()
29 |     # shutdown
30 |     cublasShutdown() 
31 |     return dC.to_numpy()
32 | 
33 | 
34 | 
35 | def test():
36 |     # Size of square matrix
37 |     N = 2
38 | 
39 |     # allocate host matrices
40 |     A = randn(3,N).astype('float32')
41 |     B = randn(3,5).astype('float32')
42 | 
43 |     # compute the cpu reference
44 |     ref = numpy.dot(A,B)
45 | 
46 |     print '-'*80
47 |     print ref 
48 |     print '-'*80
49 | 
50 |     print '-'*80
51 |     print gpu_sgemm(A,B) 
52 |     print '-'*80
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     test()
57 | 


--------------------------------------------------------------------------------
/cuda/sugar/fft/__init__.py:
--------------------------------------------------------------------------------
1 | from fft import *
2 | from fftconvolve import *
3 | 


--------------------------------------------------------------------------------
/cuda/sugar/fft/conv_gold.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import ctypes
  3 | import numpy as np
  4 | from scipy.signal import convolve2d, fftconvolve
  5 | 
  6 | 
  7 | source = '''
  8 | #include <stdio.h>
  9 | #include <math.h>
 10 | 
 11 | typedef struct{
 12 |     float x, y;
 13 | } Complex;
 14 | 
 15 | const Complex CPLX_ZERO = {0, 0};
 16 | 
 17 | //a += b * c
 18 | extern "C" void complexMAD(Complex& a, Complex b, Complex c){
 19 |     Complex t = {a.x + b.x * c.x - b.y * c.y, a.y + b.x * c.y + b.y * c.x};
 20 |     a = t;
 21 | }
 22 | 
 23 | extern "C" void printComplexArray(Complex * arr, int rows, int cols) {
 24 |     printf("arr[%d,%d] = %f \\n", 0, 0, arr[0].x);
 25 |     for(int i=0; i < rows; ++i) {
 26 |         for(int j=0; j < cols; ++j) {
 27 |             //printf("arr[%d,%d] = %f+i%f \\n", i, j, arr[i*cols+j].x, arr[i*cols+j].y);
 28 |         }
 29 |     }
 30 | }
 31 | 
 32 | extern "C" int checkResults(Complex *h_ResultCPU, Complex *h_ResultGPU, int DATA_W, int DATA_H, int FFT_W) {
 33 |     Complex rCPU, rGPU;
 34 |     double max_delta_ref, delta, ref, sum_delta2, sum_ref2, L2norm;
 35 | 
 36 |     sum_delta2 = 0;
 37 |     sum_ref2   = 0;
 38 |     max_delta_ref = 0;
 39 | 
 40 |     for(int y = 0; y < DATA_H; y++)
 41 |         for(int x = 0; x < DATA_W; x++){
 42 |             rCPU = h_ResultCPU[y * DATA_W + x];
 43 |             rGPU = h_ResultGPU[y * FFT_W  + x];
 44 |             delta = (rCPU.x - rGPU.x) * (rCPU.x - rGPU.x) + (rCPU.y - rGPU.y) * (rCPU.y - rGPU.y);
 45 |             ref   = rCPU.x * rCPU.x + rCPU.y * rCPU.y;
 46 |             if((delta / ref) > max_delta_ref) max_delta_ref = delta / ref;
 47 |             sum_delta2 += delta;
 48 |             sum_ref2   += ref;
 49 |         }
 50 |     L2norm = sqrt(sum_delta2 / sum_ref2);
 51 |     printf("Max delta / CPU value %E\\n", sqrt(max_delta_ref));
 52 |     printf("L2 norm: %E\\n", L2norm);
 53 |     printf((L2norm < 1e-6) ? "TEST PASSED\\n" : "TEST FAILED\\n");
 54 |     return 0;
 55 | }
 56 | 
 57 | ////////////////////////////////////////////////////////////////////////////////
 58 | // Reference straightfroward CPU convolution
 59 | ////////////////////////////////////////////////////////////////////////////////
 60 | extern "C" void convolutionCPU(
 61 |     Complex *h_Result,
 62 |     Complex *h_Data,
 63 |     Complex *h_Kernel,
 64 |     int dataW,
 65 |     int dataH,
 66 |     int kernelW,
 67 |     int kernelH,
 68 |     int kernelX,
 69 |     int kernelY
 70 | ){
 71 |     //for(int y=0; y < kernelH; y++) {
 72 |     //    for(int x=0; x < kernelW; x++) {
 73 |     //        printf("k[%d,%d] = %f + %fj\\n", x,y,h_Kernel[y*kernelW+x].x,h_Kernel[y*kernelW+x].y);
 74 |     //    }
 75 |     //}
 76 |     for(int y = 0; y < dataH; y++)
 77 |         for(int x = 0; x < dataW; x++){
 78 |             //printf("[%d,%d] = %f + %fj\\n", x,y,h_Data[x*dataW+y].x,h_Data[x*dataW+y].y);
 79 |      //       printf("d[%d,%d] = %f + %fj\\n", x,y,h_Data[y*dataW+x].x,h_Data[y*dataW+x].y);
 80 | 
 81 |             Complex sum = CPLX_ZERO;
 82 | 
 83 |             for(int ky = -(kernelH - kernelY - 1); ky <= kernelY; ky++)
 84 |                 for(int kx = -(kernelW - kernelX - 1); kx <= kernelX; kx++){
 85 |                     int dx = x + kx;
 86 |                     int dy = y + ky;
 87 |                     if(dx < 0) dx = 0;
 88 |                     if(dy < 0) dy = 0;
 89 |                     if(dx >= dataW) dx = dataW - 1;
 90 |                     if(dy >= dataH) dy = dataH - 1;
 91 | 
 92 |                     complexMAD(
 93 |                         sum,
 94 |                         h_Data[dy * dataW + dx],
 95 |                         h_Kernel[(kernelY - ky) * kernelW + (kernelX - kx)]
 96 |                     );
 97 |                 }
 98 | 
 99 |             h_Result[y * dataW + x] = sum;
100 |         }
101 | }
102 | '''
103 | 
104 | 
105 | 
106 | class float2(ctypes.Structure):
107 |     pass
108 | float2._fields_ = [
109 |     ('x', ctypes.c_float),
110 |     ('y', ctypes.c_float),
111 | ]
112 | 
113 | def _get_float2_ptr(numpy_array):  # expose the array's data buffer as a POINTER(float2) for the C calls
114 |     return numpy_array.ctypes.data_as(ctypes.POINTER(float2))
115 | 
116 | def _load_dll():
117 |     file = open('conv_gold.cpp','w')
118 |     file.write(source)
119 |     file.close()
120 | 
121 |     os.system('rm -f conv_gold.so')
122 |     os.system('g++ -fPIC -shared -o /tmp/conv_gold.so conv_gold.cpp')
123 |     os.system('rm conv_gold.cpp')
124 |     return ctypes.cdll.LoadLibrary('/tmp/conv_gold.so')
125 | 
126 | DLL = _load_dll()  # compiled and loaded once, when this module is imported
127 | 
128 | def get_dll():  # accessor for the module-level library handle
129 |     return DLL
130 | 
131 | def get_convolution_cpu():
132 |     conv_gold = get_dll()
133 |     return conv_gold.convolutionCPU
134 | 
135 | def get_check_results():
136 |     conv_gold = get_dll()
137 |     return conv_gold.checkResults
138 | 
139 | def get_print_complex():
140 |     conv_gold = get_dll()
141 |     return conv_gold.printComplexArray
142 | 
143 | def run():  # demo: run convolutionCPU on random 3x3 inputs and print it next to scipy's fftconvolve
144 |     print_complex = get_print_complex()  # fetched but not used below
145 |     convolutionCPU = get_convolution_cpu()
146 |     check_results = get_check_results()  # fetched but not used below
147 | 
148 |     # random 3x3 complex64 inputs (imaginary parts are zero), stored in Fortran order
149 |     data = np.asfortranarray(np.random.randn(3,3).astype('complex64'))
150 |     # NOTE(review): the C code indexes row-major (y * width + x) -- confirm Fortran order is intended
151 |     kernel = np.asfortranarray(np.random.randn(3,3).astype('complex64'))
152 |     result = np.asfortranarray(np.zeros_like(data).astype('complex64'))
153 | 
154 |     convolutionCPU(_get_float2_ptr(result), _get_float2_ptr(data), _get_float2_ptr(kernel), data.shape[1], data.shape[0], kernel.shape[1], kernel.shape[0], 1, 6)  # last two args are kernelX=1, kernelY=6 -- NOTE(review): 6 exceeds the 3x3 kernel, confirm
155 | 
156 |     print
157 |     print kernel
158 |     print
159 |     print data
160 |     print
161 | 
162 |     s1 = np.array(data.shape)  # s1 and s2 are not used afterwards
163 |     s2 = np.array(kernel.shape)
164 | 
165 |     print result
166 |     print 
167 |     print fftconvolve(data.real, kernel.real, mode='full').astype('complex64')  # 'full' output is 5x5 vs the 3x3 result: visual comparison only
168 | 
169 | if __name__ == "__main__":
170 |     run()
171 | 


--------------------------------------------------------------------------------
/cuda/sugar/fft/fft.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import os
  3 | import sys
  4 | import ctypes
  5 | 
  6 | import numpy as np
  7 | 
  8 | import cuda.cuda as cuda
  9 | import cuda.cufft as cufft
 10 | import logging 
 11 | 
 12 | logger = logging.getLogger(os.path.basename(__file__))  # logger named after this source file
 13 | info = logger.info  # module-level shortcuts to the logger's methods
 14 | debug = logger.debug
 15 | warn = logger.warn
 16 | error = logger.error
 17 | 
 18 | def _get_cufft_signal(numpy_array):  # upload numpy_array's bytes to new device memory; returns a cufftComplex pointer
 19 |     dsignal = ctypes.c_void_p()  # receives the device address from cudaMalloc
 20 |     cuda.cudaMalloc(ctypes.byref(dsignal), numpy_array.nbytes)
 21 |     cuda.cudaMemcpy(dsignal, numpy_array.ctypes.data, numpy_array.nbytes, cuda.cudaMemcpyHostToDevice)
 22 |     return ctypes.cast(dsignal,ctypes.POINTER(cufft.cufftComplex))  # not freed here; _cuda_fft/_cuda_ifft call cudaFree
 23 | 
 24 | def _get_plan(shape):
 25 |     ndims = len(shape)
 26 |     if ndims == 1:
 27 |         return _get_1dplan(shape)
 28 |     elif ndims == 2:
 29 |         return _get_2dplan(shape)
 30 |     elif ndims == 3:
 31 |         return _get_3dplan(shape)
 32 |     else:
 33 |         error('_get_plan: invalid size (todo: throw exception)')
 34 | 
 35 | def _get_1dplan(shape,batch=1):  # 1-D complex-to-complex plan of length shape[0] for `batch` transforms
 36 |     debug("[*] Creating a 1D FFT plan...")
 37 |     plan = cufft.cufftHandle()  # handle object that cufftPlan1d fills in
 38 |     cufft.cufftPlan1d(plan, shape[0], cufft.CUFFT_C2C, batch)
 39 |     return plan  # the callers in this file release it with cufftDestroy
 40 | 
 41 | def _get_2dplan(shape):  # 2-D complex-to-complex plan of size shape[0] x shape[1]
 42 |     debug("[*] Creating a 2D FFT plan...")
 43 |     plan = cufft.cufftHandle()  # handle object that cufftPlan2d fills in
 44 |     cufft.cufftPlan2d(plan, shape[0], shape[1], cufft.CUFFT_C2C)
 45 |     return plan  # the callers in this file release it with cufftDestroy
 46 | 
 47 | def _get_3dplan(shape):  # 3-D complex-to-complex plan of size shape[0] x shape[1] x shape[2]
 48 |     debug("[*] Creating a 3D FFT plan...")
 49 |     plan = cufft.cufftHandle()  # handle object that cufftPlan3d fills in
 50 |     cufft.cufftPlan3d(plan, shape[0], shape[1], shape[2], cufft.CUFFT_C2C)
 51 |     return plan  # the callers in this file release it with cufftDestroy
 52 | 
 53 | def _get_data(device_ptr,numpy_array):  # download a device buffer into a new host array
 54 |     result = np.empty_like(numpy_array)  # numpy_array only supplies shape, dtype and byte count
 55 |     cuda.cudaMemcpy(result.ctypes.data, device_ptr, numpy_array.nbytes, cuda.cudaMemcpyDeviceToHost)
 56 |     return result
 57 | 
 58 | def _get_inverse_data(device_ptr,numpy_array):
 59 |     result = _get_data(device_ptr, numpy_array)
 60 |     return result/float(numpy_array.size)
 61 | 
 62 | def _cuda_fft(numpy_array, leave_on_device=False):
 63 |     dsignal = _get_cufft_signal(numpy_array)
 64 |     plan = _get_plan(numpy_array.shape)
 65 |     #print "[*] Using the CUFFT plan to forward transform the signal in place..."
 66 |     #print "(*) cufftExecC2C note: Identical pointers to input and output arrays "
 67 |     #print "    implies in-place transformation"
 68 |     cufft.cufftExecC2C(plan, dsignal, dsignal, cufft.CUFFT_FORWARD)
 69 |     debug("[*] Destroying CUFFT plan...")
 70 |     cufft.cufftDestroy(plan)
 71 |     if not leave_on_device:
 72 |         result = _get_data(dsignal, numpy_array)
 73 |         #result = result.reshape(numpy_array.shape)
 74 |         cuda.cudaFree(dsignal)
 75 |         return result
 76 |     else:
 77 |         return dsignal
 78 | 
 79 | def _cuda_ifft(numpy_array, leave_on_device=False):
 80 |     dsignal = _get_cufft_signal(numpy_array)
 81 |     plan = _get_plan(numpy_array.shape)
 82 |     debug("[*] Using the CUFFT plan to inverse transform the signal in place...")
 83 |     cufft.cufftExecC2C(plan, dsignal, dsignal, cufft.CUFFT_INVERSE)
 84 |     debug("[*] Destroying CUFFT plan...")
 85 |     cufft.cufftDestroy(plan)
 86 |     if not leave_on_device:
 87 |         result = _get_inverse_data(dsignal, numpy_array)
 88 |         #result = result.reshape(numpy_array.shape)
 89 |         cuda.cudaFree(dsignal)
 90 |         return result
 91 |     else:
 92 |         return dsignal
 93 | 
 94 | def fft(numpy_array, leave_on_device=False):
 95 |     if numpy_array.ndim == 1:
 96 |         return _cuda_fft(numpy_array, leave_on_device)
 97 |     else:
 98 |         print 'cuda.sugar.fft.fft: ndim != 1, throw exception '
 99 | 
100 | def fft2(numpy_array, leave_on_device=False):
101 |     if numpy_array.ndim == 2:
102 |         return _cuda_fft(numpy_array, leave_on_device)
103 |     else:
104 |         print 'cuda.sugar.fft.fft2: ndim !=2, throw exception'
105 | 
106 | def fftn(numpy_array, leave_on_device=False):
107 |     if numpy_array.ndim > 3:
108 |         print 'cuda.sugar.fft.fftn: ndim > 3, throw exception'
109 |     else:
110 |         return _cuda_fft(numpy_array, leave_on_device)
111 | 
112 | def ifft(numpy_array, leave_on_device=False):
113 |     if numpy_array.ndim == 1:
114 |         return _cuda_ifft(numpy_array, leave_on_device)
115 |     else:
116 |         print 'cuda.sugar.fft.ifft: ndim != 1, throw exception '
117 | 
118 | def ifft2(numpy_array, leave_on_device=False):
119 |     if numpy_array.ndim == 2:
120 |         return _cuda_ifft(numpy_array, leave_on_device)
121 |     else:
122 |         print 'cuda.sugar.fft.ifft2: ndim != 2, throw exception '
123 | 
124 | def ifftn(numpy_array, leave_on_device=False):
125 |     if numpy_array.ndim > 3:
126 |         print 'cuda.sugar.fft.ifftn: ndim > 3, throw exception '
127 |     else:
128 |         return _cuda_ifft(numpy_array, leave_on_device)
129 | 
130 | def main():  # demo: GPU fft/ifft round trip on a random signal, compared against numpy.fft
131 |     print "-"*55
132 |     print "--                                                   --"
133 |     print "--    python-cuda versions of numpy.fft.{fft,ifft}   --"
134 |     print "--                                                   --"
135 |     print "-"*55
136 |     print
137 |     print ">>> Creating host signal..."
138 | 
139 |     try:
140 |         size = int(sys.argv[1])  # optional first command-line argument: signal length
141 |     except Exception,e:
142 |         size = 10  # default when the argument is missing or not an integer
143 | 
144 |     print "size = %s" % size
145 | 
146 |     numpy_array = np.random.randn(size).astype('complex64')
147 |     numpy_array -= numpy_array.mean()  # shift to zero mean ...
148 |     numpy_array /= numpy_array.std()  # ... and scale to unit standard deviation
149 | 
150 |     print ">>> Computing ffts with GPU..."
151 |     print "[*] Forward fft on gpu ..."
152 |     fft_res = fft(numpy_array)
153 | 
154 |     print "[*] Inverse fft on gpu ..."
155 |     ifft_res = ifft(fft_res) 
156 | 
157 |     print ">>> Computing references with numpy..."
158 | 
159 |     print "[*] Forward fft"
160 |     forward_ref = np.fft.fft(numpy_array)
161 | 
162 |     print "[*] Inverse fft"
163 |     inverse_ref = np.fft.ifft(forward_ref)
164 | 
165 |     print "l2norm fft: ", np.linalg.norm(fft_res - forward_ref)  # distance between GPU and numpy results
166 | 
167 |     print "l2norm ifft: ", np.linalg.norm(ifft_res - inverse_ref)
168 | 
169 | if __name__ == "__main__":
170 |     main()
171 | 


--------------------------------------------------------------------------------
/cuda/sugar/fft/fftconvolve2d_kernel.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO USER:   
  5 |  *
  6 |  * This source code is subject to NVIDIA ownership rights under U.S. and 
  7 |  * international Copyright laws.  Users and possessors of this source code 
  8 |  * are hereby granted a nonexclusive, royalty-free license to use this code 
  9 |  * in individual and commercial software.
 10 |  *
 11 |  * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
 12 |  * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
 13 |  * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
 14 |  * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
 15 |  * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 16 |  * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
 17 |  * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
 18 |  * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
 19 |  * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE 
 20 |  * OR PERFORMANCE OF THIS SOURCE CODE.  
 21 |  *
 22 |  * U.S. Government End Users.   This source code is a "commercial item" as 
 23 |  * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of 
 24 |  * "commercial computer  software"  and "commercial computer software 
 25 |  * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) 
 26 |  * and is provided to the U.S. Government only as a commercial end item.  
 27 |  * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
 28 |  * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
 29 |  * source code with only those rights set forth herein. 
 30 |  *
 31 |  * Any use of this source code in individual and commercial software must 
 32 |  * include, in the user documentation and internal comments to the code,
 33 |  * the above Disclaimer and U.S. Government End Users Notice.
 34 |  */
 35 | 
 36 | 
 37 | #define IMUL(a, b) __mul24(a, b)
 38 | 
 39 | typedef float2 Complex;
 40 | 
 41 | texture<Complex, 2, cudaReadModeElementType> texKernel;
 42 | texture<Complex, 2, cudaReadModeElementType> texData;
 43 | 
 44 | extern "C" {
 45 | 
 46 | ////////////////////////////////////////////////////////////////////////////////
 47 | // Cyclically shift convolution kernel, so that the center is at (0, 0)
 48 | ////////////////////////////////////////////////////////////////////////////////
 49 | 
 50 | __global__ void padKernel(
 51 |     Complex *d_PaddedKernel,
 52 |     int fftW,
 53 |     int fftH,
 54 |     int kernelW,
 55 |     int kernelH,
 56 |     int kernelX,
 57 |     int kernelY
 58 | ){
 59 |     // One thread per kernel element (x, y); threads outside the kernel do nothing.
 60 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
 61 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
 62 |     if(x >= kernelW || y >= kernelH) return;
 63 | 
 64 |     // Shift by the centre (kernelX, kernelY); negative coordinates wrap around the fftW x fftH grid.
 65 |     const int dstX = (x < kernelX) ? x - kernelX + fftW : x - kernelX;
 66 |     const int dstY = (y < kernelY) ? y - kernelY + fftH : y - kernelY;
 67 |     d_PaddedKernel[IMUL(dstY, fftW) + dstX] = tex2D(texKernel, (float)x + 0.5f, (float)y + 0.5f);
 68 | }
 69 | 
 70 | 
 71 | ////////////////////////////////////////////////////////////////////////////////
 72 | // Copy input data array to the upper left corner and pad by border values
 73 | ////////////////////////////////////////////////////////////////////////////////
 74 | 
 75 | __global__ void padData(
 76 |     Complex *d_PaddedData,
 77 |     int fftW,
 78 |     int fftH,
 79 |     int dataW,
 80 |     int dataH,
 81 |     int kernelW,
 82 |     int kernelH,
 83 |     int kernelX,
 84 |     int kernelY
 85 | ){
 86 |     // One thread per element (x, y) of the padded fftW x fftH output.
 87 |     const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
 88 |     const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;
 89 |     if(x >= fftW || y >= fftH) return;
 90 | 
 91 |     // End of the band that replicates the last data column / row.
 92 |     const int borderW = dataW + kernelX;
 93 |     const int borderH = dataH + kernelY;
 94 | 
 95 |     // Source coordinate: inside the data -> the element itself; inside the
 96 |     // band -> the last column / row; at or past the band's end -> column /
 97 |     // row 0. kernelW and kernelH are not used by this kernel.
 98 |     const int dx = (x >= borderW) ? 0 : ((x >= dataW) ? dataW - 1 : x);
 99 |     const int dy = (y >= borderH) ? 0 : ((y >= dataH) ? dataH - 1 : y);
100 | 
101 |     // the texture is sampled at (integer coordinate + 0.5)
102 |     d_PaddedData[IMUL(y, fftW) + x] =
103 |         tex2D(texData, (float)dx + 0.5f, (float)dy + 0.5f);
104 | }
105 | 
106 | 
107 | 
108 | ////////////////////////////////////////////////////////////////////////////////
109 | // Modulate Fourier image of padded data by Fourier image of padded kernel
110 | // and normalize by FFT size
111 | ////////////////////////////////////////////////////////////////////////////////
112 | __device__ void complexMulAndScale(Complex& a, Complex b, float c){  // in place: a = c * (a * b), complex product
113 |     Complex t = {c * (a.x * b.x - a.y * b.y), c * (a.y * b.x + a.x * b.y)};  // {real, imaginary}; built in a temporary because both parts read the old a
114 |     a = t;
115 | }
116 | 
117 | __global__ void modulateAndNormalize(
118 |     Complex *d_PaddedData,
119 |     Complex *d_PaddedKernel,
120 |     int dataN
121 | ){
122 |     // d_PaddedData[i] = d_PaddedData[i] * d_PaddedKernel[i] / dataN for every i < dataN
123 |     const int  first = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
124 |     const int stride = IMUL(blockDim.x, gridDim.x);
125 |     const float scale = 1.0f / (float)dataN;
126 |     for(int i = first; i < dataN; i += stride)
127 |         complexMulAndScale(d_PaddedData[i], d_PaddedKernel[i], scale);
128 | }
129 | }
130 | 


--------------------------------------------------------------------------------
/cuda/sugar/kernel/__init__.py:
--------------------------------------------------------------------------------
1 | from kernelfactorydrv import *
2 | from kernelfactoryrt import *
3 | 


--------------------------------------------------------------------------------
/cuda/sugar/kernel/compiler.py:
--------------------------------------------------------------------------------
  1 | from ctypes import cdll
  2 | import commands
  3 | from subprocess import Popen, PIPE
  4 | from cuda.utils import memoize
  5 | 
  6 | class CompileError(Exception):
  7 |     pass
  8 | 
  9 | @memoize
 10 | def get_nvcc_version(nvcc):
 11 |     try:
 12 |         return Popen([nvcc, "--version"], stdout=PIPE).communicate()[0]
 13 |     except OSError, e:
 14 |         raise OSError, "%s was not found (is it on the PATH?) [%s]" % (
 15 |                 nvcc, str(e))
 16 | 
 17 | def compile_plain(source, options, keep, nvcc, cache_dir):
 18 |     from os.path import join
 19 |     from platform import architecture
 20 | 
 21 |     if architecture()[0] == "64bit":
 22 |         options.insert(0,"-Xcompiler='-fPIC'")
 23 | 
 24 |     if cache_dir:
 25 |         try:
 26 |             import hashlib
 27 |             checksum = hashlib.md5()
 28 |         except ImportError:
 29 |             # for Python << 2.5
 30 |             import md5
 31 |             checksum = md5.new()
 32 | 
 33 |         checksum.update(source)
 34 |         for option in options: 
 35 |             checksum.update(option)
 36 |         checksum.update(get_nvcc_version(nvcc))
 37 | 
 38 |         cache_file = checksum.hexdigest()
 39 |         cache_path = join(cache_dir, cache_file + ".so")
 40 | 
 41 |         try:
 42 |             #return open(cache_path, "r").read()
 43 |             return cdll.LoadLibrary(cache_path)
 44 |         except:
 45 |             pass
 46 | 
 47 |     from tempfile import mkdtemp
 48 |     file_dir = mkdtemp()
 49 |     file_root = "kernel"
 50 | 
 51 |     cu_file_name = file_root + ".cu"
 52 |     cu_file_path = join(file_dir, cu_file_name)
 53 | 
 54 |     options.append("-o")
 55 |     options.append("%s.so" % join(file_dir,file_root))
 56 | 
 57 |     outf = open(cu_file_path, "w")
 58 |     outf.write(str(source))
 59 |     outf.close()
 60 | 
 61 |     if keep:
 62 |         options = options[:]
 63 |         options.append("--keep")
 64 | 
 65 |         print "*** compiler output in %s" % file_dir
 66 | 
 67 |     #from pytools.prefork import call
 68 |     try:
 69 | 
 70 |         print "Compiling kernel using the following options: " 
 71 |         print ' '.join([nvcc, "--shared"] + options + [cu_file_name])
 72 | 
 73 |         process = Popen([nvcc, "--shared"] + options + [cu_file_path], stdout=PIPE, cwd=file_dir)
 74 |         output = process.communicate()[0]
 75 |         result = process.returncode
 76 | 
 77 |         if output:
 78 |             print 'Compiler output below:'
 79 |             print output
 80 | 
 81 |     except OSError, e:
 82 |         raise OSError, "%s was not found (is it on the PATH?) [%s]" % (
 83 |                 nvcc, str(e))
 84 | 
 85 |     if result != 0:
 86 |         raise CompileError, "nvcc compilation of %s failed" % cu_file_path
 87 | 
 88 |     kdll = open(join(file_dir, file_root + ".so"), "r").read()
 89 | 
 90 |     if cache_dir:
 91 |         outf = open(cache_path, "w")
 92 |         outf.write(kdll)
 93 |         outf.close()
 94 | 
 95 |     if not keep:
 96 |         from os import listdir, unlink, rmdir
 97 |         for name in listdir(file_dir):
 98 |             unlink(join(file_dir, name))
 99 |         rmdir(file_dir)
100 | 
101 |     kdll = cdll.LoadLibrary(cache_path)
102 | 
103 |     return kdll
104 | 
105 | def compile(source, nvcc="nvcc", options=[], keep=False,
106 |         no_extern_c=False, arch=None, code=None, cache_dir=None,
107 |         include_dirs=[]):
108 | 
109 |     if not no_extern_c:
110 |         source = 'extern "C" {\n%s\n}\n' % source
111 | 
112 |     options = options[:]
113 |     if arch is None:
114 |         try:
115 |             # todo replace this with python-cuda equivalent
116 |             #from pycuda.driver import Context 
117 |             #arch = "sm_%d%d" % Context.get_device().compute_capability()
118 |             arch = None
119 |         except RuntimeError:
120 |             pass
121 | 
122 |     if cache_dir is None:
123 |         from os.path import expanduser, join, exists
124 |         import os
125 |         try:
126 |             getattr( os , 'getuid' )	
127 |         except:
128 |             def getuid():
129 |                 return os.getenv('USERNAME')
130 |             os.getuid = getuid
131 | 
132 |         from tempfile import gettempdir
133 |         cache_dir = join(gettempdir(), 
134 |                 "python-cuda-compiler-cache-v1-uid%s" % os.getuid())
135 | 
136 |         if not exists(cache_dir):
137 |             from os import mkdir
138 |             mkdir(cache_dir)
139 | 
140 |     if arch is not None:
141 |         options.extend(["-arch", arch])
142 | 
143 |     if code is not None:
144 |         options.extend(["-code", code])
145 | 
146 |     include_dirs = include_dirs[:]
147 | 
148 |     for i in include_dirs:
149 |         options.append("-I"+i)
150 | 
151 |     return compile_plain(source, options, keep, nvcc, cache_dir)
152 | 


--------------------------------------------------------------------------------
/cuda/sugar/kernel/kernelfactorydrv.py:
--------------------------------------------------------------------------------
 1 | import ctypes
 2 | from cuda.cuda import *
 3 | 
 4 | 
 5 | 
 6 | class KernelGetter(object):
 7 |     """ Placeholder for a ctypes CDLL wrapper giving access to CUDA kernels.
 8 | 
 9 |     Not implemented: constructing an instance raises NotImplementedError.
10 |     Intended usage, per the commented-out implementation below
11 |     ----------------------------------------------------------
12 |     mykernels = KernelGetter(cdll.LoadLibrary('libmykernels.so'))
13 |     mykernels.FastKernel(grid, block)(x, y)
14 |     # Equivalent CUDA call:
15 |     #   FastKernel<<<grid, block>>>(x, y)
16 |     """
17 | 
18 |     def __init__(self, dll):
19 |         raise NotImplementedError  # the real implementation is commented out below
20 | #         self.dll = dll
21 | 
22 | #     def __getattr__(self, name):
23 | #         mangled_name = '__device_stub_%s' % name
24 | #         try:
25 | #             funcptr = getattr(self.dll, mangled_name)
26 | #         except AttributeError:
27 | #             raise AttributeError("could not find kernel named %r in %r" % (name, self.dll))
28 | 
29 | #         # Return a factory function that will create the Kernel object.
30 | #         factory = lambda *args, **kwds: Kernel(funcptr, *args, **kwds)
31 | 
32 | #         return factory
33 | 
34 | 
35 | # class Kernel(object):
36 | #     """ Configure a CUDA kernel.
37 | #     """
38 | 
39 | #     def __init__(self, funcptr, gridDim, blockDim, sharedMem=0, tokens=0):
40 | #         # The function pointer to the kernel.
41 | #         self.funcptr = funcptr
42 | 
43 | #         # The configuration parameters for the call. These are the arguments
44 | #         # inside the <<<>>> brackets in CUDA.
45 | #         self.gridDim = gridDim
46 | #         self.blockDim = blockDim
47 | #         self.sharedMem = sharedMem
48 | #         self.tokens = tokens
49 | 
50 | #     # Delegate .restype and .argtypes attribute access to the underlying
51 | #     # function pointer.
52 | #     def _get_restype(self):
53 | #         return self.funcptr.restype
54 | #     def _set_restype(self, val):
55 | #         self.funcptr.restype = val
56 | #     restype = property(_get_restype, _set_restype)
57 | 
58 | #     def _get_argtypes(self):
59 | #         return self.funcptr.argtypes
60 | #     def _set_argtypes(self, val):
61 | #         self.funcptr.argtypes = val
62 | #     argtypes = property(_get_argtypes, _set_argtypes)
63 | 
64 | 
65 | #     def __call__(self, *args):
66 | #         """ Call the kernel as configured.
67 | #         """
68 | #         cudart.cudaConfigureCall(self.gridDim, self.blockDim, self.sharedMem, self.tokens)
69 | #         self.funcptr(*args)
70 | #         # Check to make sure we didn't get an error.
71 | #         err = cudart.getLastError()
72 | #         cudart._checkCudaStatus(err)
73 | 


--------------------------------------------------------------------------------
/cuda/sugar/kernel/kernelfactoryrt.py:
--------------------------------------------------------------------------------
 1 | import ctypes
 2 | from cuda.cuda import *
 3 | from cuda.sugar.kernel.compiler import compile
 4 | 
 5 | class SourceModule(object):
 6 |     """ Wraps a ctypes CDLL instance for accessing CUDA kernels.
 7 | 
 8 |     Example
 9 |     -------
10 |     from ctypes import cdll
11 |     mykernels = KernelGetter(cdll.LoadLibrary('libmykernels.so'))
12 |     mykernels.FastKernel(grid, block)(x, y)
13 |     # Equivalent CUDA call:
14 |     #   FastKernel<<<grid, block>>>(x, y)
15 |     """
16 |     def __init__(self, source, nvcc="nvcc", options=[], keep=False,
17 |             no_extern_c=False, arch=None, code=None, cache_dir=None,
18 |             include_dirs=[]):
19 | 
20 |         self.dll = compile(source, nvcc, options, keep, no_extern_c, 
21 |                 arch, code, cache_dir, include_dirs)
22 | 
23 |     def __getattr__(self, name):
24 |         mangled_name = '__device_stub_%s' % name
25 |         try:
26 |             funcptr = getattr(self.dll, mangled_name)
27 |         except AttributeError:
28 |             raise AttributeError("could not find kernel named %r in %r" % (name, self.dll))
29 | 
30 |         # Return a factory function that will create the Kernel object.
31 |         factory = lambda *args, **kwds: Kernel(funcptr, *args, **kwds)
32 | 
33 |         return factory
34 | 
35 | class Kernel(object):
36 |     """ Configure a CUDA kernel.
37 |     """
38 | 
39 |     def __init__(self, funcptr, gridDim, blockDim, sharedMem=0, tokens=0):
40 |         # The function pointer to the kernel.
41 |         self.funcptr = funcptr
42 | 
43 |         # The configuration parameters for the call. These are the arguments
44 |         # inside the <<<>>> brackets in CUDA.
45 |         self.gridDim = gridDim
46 |         self.blockDim = blockDim
47 |         self.sharedMem = sharedMem
48 |         self.tokens = tokens
49 | 
50 |     # Delegate .restype and .argtypes attribute access to the underlying
51 |     # function pointer.
52 |     def _get_restype(self):
53 |         return self.funcptr.restype
54 |     def _set_restype(self, val):
55 |         self.funcptr.restype = val
56 |     restype = property(_get_restype, _set_restype)
57 | 
58 |     def _get_argtypes(self):
59 |         return self.funcptr.argtypes
60 |     def _set_argtypes(self, val):
61 |         self.funcptr.argtypes = val
62 |     argtypes = property(_get_argtypes, _set_argtypes)
63 | 
64 | 
65 |     def __call__(self, *args):
66 |         """ Call the kernel as configured.
67 |         """
68 |         cudart.cudaConfigureCall(self.gridDim, self.blockDim, self.sharedMem, self.tokens)
69 |         self.funcptr(*args)
70 |         # Check to make sure we didn't get an error.
71 |         #err = cudart.getLastError()
72 |         #cudart._checkCudaStatus(err)
73 | 


--------------------------------------------------------------------------------
/cuda/sugar/kernel/tests/matrix_mul.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | from cuda.memory import Linear
 4 | from cuda.kernel.kernelfactoryrt import SourceModule
 5 | from cuda.cuda import dim3
 6 | 
 7 | #from IPython.Shell import IPShellEmbed
 8 | #ipshell = IPShellEmbed(argv=[])
 9 | 
10 | 
11 | BLOCK_SIZE = 16
12 | # Matrix A width
13 | WA = (3 * BLOCK_SIZE)
14 | # Matrix A height
15 | HA = (5 * BLOCK_SIZE)
16 | # Matrix B width
17 | WB = (8 * BLOCK_SIZE)
18 | # Matrix B height
19 | HB = WA
20 | # Matrix C width 
21 | WC = WB
22 | # Matrix C height
23 | HC = HA
24 | 
25 | matrixMul = SourceModule(open('matrix_mul_kernel.cu','r').read())
26 | 
27 | nA = np.random.random(size=(HA, WA)).astype(np.float32)
28 | nB = np.random.random(size=(HB, WB)).astype(np.float32)
29 | 
30 | print 'Allocating arrays'
31 | dA = Linear(nA.shape).from_numpy(nA)
32 | dB = Linear(nB.shape).from_numpy(nB)
33 | dC = Linear((HC,WC))
34 | 
35 | print 'Calling kernel'
36 | grid = dim3(WC // BLOCK_SIZE, HC // BLOCK_SIZE, 1)
37 | block = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
38 | Mul = matrixMul.matrixMul(grid, block)
39 | Mul(dC.ref, dA.ref, dB.ref, WA, WB)
40 | 
41 | print 'Collecting results'
42 | nC = dC.to_numpy()
43 | nC.reshape((HC, WC))
44 | 
45 | print 'Freeing data'
46 | dA._free()
47 | dB._free()
48 | dC._free()
49 | 
50 | print 'Calculating error'
51 | print
52 | goldC = np.dot(nA, nB)
53 | err = nC - goldC
54 | print 'L2 err: %r' % np.linalg.norm(err, 2)
55 | print 'L1 err: %r' % np.linalg.norm(err, 1)
56 | print 'Linf err: %r' % np.linalg.norm(err, np.inf)
57 | print 'Lfro err: %r' % np.linalg.norm(err, 'fro')
58 | 


--------------------------------------------------------------------------------
/cuda/sugar/kernel/tests/matrix_mul_kernel.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO USER:
  5 |  *
  6 |  * This source code is subject to NVIDIA ownership rights under U.S. and
  7 |  * international Copyright laws.  Users and possessors of this source code
  8 |  * are hereby granted a nonexclusive, royalty-free license to use this code
  9 |  * in individual and commercial software.
 10 |  *
 11 |  * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 12 |  * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 13 |  * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 14 |  * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 15 |  * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 16 |  * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 17 |  * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 18 |  * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 19 |  * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 20 |  * OR PERFORMANCE OF THIS SOURCE CODE.
 21 |  *
 22 |  * U.S. Government End Users.   This source code is a "commercial item" as
 23 |  * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 24 |  * "commercial computer  software"  and "commercial computer software
 25 |  * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 26 |  * and is provided to the U.S. Government only as a commercial end item.
 27 |  * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 28 |  * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 29 |  * source code with only those rights set forth herein.
 30 |  *
 31 |  * Any use of this source code in individual and commercial software must
 32 |  * include, in the user documentation and internal comments to the code,
 33 |  * the above Disclaimer and U.S. Government End Users Notice.
 34 |  */
 35 | 
 36 | /* Matrix multiplication: C = A * B.
 37 |  * Device code.
 38 |  */
 39 | 
 40 | #include <stdio.h>
 41 | 
 42 | #define CHECK_BANK_CONFLICTS 0
 43 | #if CHECK_BANK_CONFLICTS
 44 | #define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
 45 | #define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
 46 | #else
 47 | #define AS(i, j) As[i][j]
 48 | #define BS(i, j) Bs[i][j]
 49 | #endif
 50 | 
 51 | // Thread block size
 52 | #define BLOCK_SIZE 16
 53 | 
 54 | // Matrix dimensions
 55 | // (chosen as multiples of the thread block size for simplicity)
 56 | #define WA (3 * BLOCK_SIZE) // Matrix A width
 57 | #define HA (5 * BLOCK_SIZE) // Matrix A height
 58 | #define WB (8 * BLOCK_SIZE) // Matrix B width
 59 | #define HB WA  // Matrix B height
 60 | #define WC WB  // Matrix C width 
 61 | #define HC HA  // Matrix C height
 62 | 
 63 | ////////////////////////////////////////////////////////////////////////////////
 64 | //! Matrix multiplication on the device: C = A * B (row-major, no bounds checks)
 65 | //! wA is A's width and wB is B's width; As/Bs tiles are BLOCK_SIZE x BLOCK_SIZE
 66 | ////////////////////////////////////////////////////////////////////////////////
 67 | __global__ void
 68 | matrixMul( float* C, float* A, float* B, int wA, int wB)
 69 | {
 70 |     // Block index
 71 |     int bx = blockIdx.x;
 72 |     int by = blockIdx.y;
 73 | 
 74 |     // Thread index
 75 |     int tx = threadIdx.x;
 76 |     int ty = threadIdx.y;
 77 | 
 78 |     // Index of the first sub-matrix of A processed by the block
 79 |     int aBegin = wA * BLOCK_SIZE * by;
 80 | 
 81 |     // Index of the last sub-matrix of A processed by the block
 82 |     int aEnd   = aBegin + wA - 1;
 83 | 
 84 |     // Step size used to iterate through the sub-matrices of A
 85 |     int aStep  = BLOCK_SIZE;
 86 | 
 87 |     // Index of the first sub-matrix of B processed by the block
 88 |     int bBegin = BLOCK_SIZE * bx;
 89 | 
 90 |     // Step size used to iterate through the sub-matrices of B
 91 |     int bStep  = BLOCK_SIZE * wB;
 92 | 
 93 |     // Csub is used to store the element of the block sub-matrix
 94 |     // that is computed by the thread
 95 |     float Csub = 0;
 96 | 
 97 |     // Loop over all the sub-matrices of A and B
 98 |     // required to compute the block sub-matrix
 99 |     for (int a = aBegin, b = bBegin;
100 |              a <= aEnd;
101 |              a += aStep, b += bStep) {
102 | 
103 |         // Declaration of the shared memory array As used to
104 |         // store the sub-matrix of A
105 |         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
106 | 
107 |         // Declaration of the shared memory array Bs used to
108 |         // store the sub-matrix of B
109 |         __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
110 | 
111 |         // Load the matrices from device memory
112 |         // to shared memory; each thread loads
113 |         // one element of each matrix
114 |         AS(ty, tx) = A[a + wA * ty + tx];
115 |         BS(ty, tx) = B[b + wB * ty + tx];
116 | 
117 |         // Synchronize to make sure the matrices are loaded
118 |         __syncthreads();
119 | 
120 |         // Multiply the two matrices together;
121 |         // each thread computes one element
122 |         // of the block sub-matrix
123 |         for (int k = 0; k < BLOCK_SIZE; ++k)
124 |             Csub += AS(ty, k) * BS(k, tx);
125 | 
126 |         // Synchronize to make sure that the preceding
127 |         // computation is done before loading two new
128 |         // sub-matrices of A and B in the next iteration
129 |         __syncthreads();
130 |     }
131 | 
132 |     // Write the block sub-matrix to device memory;
133 |     // each thread writes one element
134 |     int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
135 |     C[c + wB * ty + tx] = Csub;
136 | }
137 | 


--------------------------------------------------------------------------------
/cuda/sugar/memory/__init__.py:
--------------------------------------------------------------------------------
1 | from linear import *
2 | 


--------------------------------------------------------------------------------
/cuda/sugar/memory/linear.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Array-like objects for CUDA."""
 3 | 
 4 | from cuda.cuda import *
 5 | from cuda.cublas import *
 6 | import numpy
 7 | import ctypes
 8 | from ctypes import *
 9 | 
10 | # cuda <-> dtype conversion: numpy dtype name -> ctypes element type
11 | cudaDtypes = {'float32': ctypes.c_float,
12 |               'int32': ctypes.c_int,
13 |               'complex64': ctypes.c_float*2,  # array of two floats per element
14 |              }
15 | 
16 | class Linear(object):
17 | 
18 |     ref = property(fget=lambda self: self._get_ref())
19 | 
20 |     def __init__(self, shape=None, dtype='float32', order=None):
21 |         self.shape = shape
22 |         self.size = numpy.prod(shape)
23 |         self.dtype = numpy.dtype(dtype)
24 |         self.order = order
25 |         self.ctype = self._convert_type(self.dtype)
26 |         self.nbytes = self.size*ctypes.sizeof(self.ctype)
27 |         self.allocated = False
28 |         self.data = None
29 |         self._alloc()
30 | 
31 |     def __del__(self):
32 |         self._free()
33 | 
34 |     def _convert_type(self, dtype):
35 |         ct = cudaDtypes.get(dtype.name, None)
36 |         if ct is None:
37 |             raise TypeError("Unsupported dtype")
38 |         return ct
39 | 
40 |     def _get_ref(self):
41 |         return cast(self.data,POINTER(self._convert_type(self.dtype)))
42 | 
43 |     def _alloc(self):
44 |         self.data = c_void_p()
45 |         cudaMalloc(byref(self.data), self.nbytes)
46 |         self.allocated = True
47 | 
48 |     def _free(self):
49 |         if self.allocated:
50 |             cudaFree(self.data)
51 |             self.data = None
52 |             self.allocated = False
53 | 
54 |     def to_numpy(self, a=None):
55 |         if not self.allocated:
56 |             raise Exception("Must first allocate")
57 |         if a is None:
58 |             a = numpy.empty(self.shape, dtype=self.dtype, order=self.order)
59 |         else:
60 |             # Check that the given array is appropriate.
61 |             if a.size != self.size:
62 |                 raise ValueError("need an array of size %s; got %s" % (self.size, a.size))
63 |             if a.dtype.name != self.dtype.name:
64 |                 # XXX: compare dtypes directly? issubdtype?
65 |                 raise ValueError("need an array of dtype %r; got %r" % (self.dtype, a.dtype))
66 |         cudaMemcpy(a.ctypes.data, self.ref, self.nbytes, cudaMemcpyDeviceToHost)
67 |         a = a.reshape(self.shape, order=self.order)
68 |         return a
69 | 
70 |     def from_numpy(self, a):
71 |         if not self.allocated:
72 |             raise Exception("Must first allocate")
73 |         assert a.size == self.size, "size must be the same"
74 |         assert a.dtype == self.dtype, "dtype must be the same"
75 |         a = numpy.ascontiguousarray(a,dtype=None)
76 |         if self.order == 'F':
77 |             a = numpy.asfortranarray(a)
78 |         cudaMemcpy(self.data, a.ctypes.data, self.nbytes, cudaMemcpyHostToDevice)
79 |         return self
80 | 


--------------------------------------------------------------------------------
/cuda/sugar/query/__init__.py:
--------------------------------------------------------------------------------
1 | from cu_utils import *
2 | from cuda_utils import *
3 | 


--------------------------------------------------------------------------------
/cuda/sugar/query/cu_utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from cuda.cu import * 
  3 | from ctypes import *
  4 | 
  5 | class GPUException(Exception):
  6 |     pass
  7 | 
  8 | class cu_CUDA(object):
  9 |     usedDevices = 0
 10 |     def __init__(self):
 11 |         flags = 0 # see manual
 12 |         self.device = None
 13 |         self.context = None
 14 |         self.module = None
 15 |         self.deviceID  = -1
 16 |         cuInit(flags)
 17 |         device_count = c_int()
 18 |         cuDeviceGetCount(byref(device_count))
 19 |         if cu_CUDA.usedDevices >= device_count.value:
 20 |             print "No more uninitialized devices available"
 21 |             return
 22 |         self.device = CUdevice()
 23 |         self.context = CUcontext()
 24 |         self.modules = list()
 25 |         self.functions = dict()
 26 |         self.deviceID = cu_CUDA.usedDevices
 27 |         cuDeviceGet(byref(self.device),self.deviceID)
 28 |         cu_CUDA.usedDevices += 1
 29 |         status = cuCtxCreate(byref(self.context),0,self.device)
 30 |         if status != CUDA_SUCCESS:
 31 |             cuCtxDetach(self.context)
 32 |             raise GPUException("Failed to create CUDA context")
 33 |         self.getInfo()
 34 | 
 35 |     def getSourceModule(self,name=None):
 36 |         if name is None:
 37 |             name = "gpuFunctions.cubin"
 38 |         module   = CUmodule()
 39 |         status = cuModuleLoad(byref(module),name)
 40 |         if status != CUDA_SUCCESS:
 41 |             print "File not found: %s" % name
 42 |         self.modules.append(module)
 43 |         return module
 44 | 
 45 |     def getFunction(self,name):
 46 |         missing = True
 47 |         function = CUfunction()
 48 |         for module in self.modules:
 49 |             status = cuModuleGetFunction(function,module,name)
 50 |             if status != CUDA_SUCCESS:
 51 |                 continue
 52 |             else:
 53 |                 self.functions[name] = function
 54 |                 missing = False
 55 |                 break
 56 |         if missing:
 57 |             print "Function not found: %s" % name
 58 |             return None
 59 |         return function
 60 | 
 61 |     def getInfo(self):
 62 |         device = self.device
 63 |         info = dict()
 64 |         count = c_int()
 65 |         cuDeviceGetCount(byref(count))
 66 |         info["count"] = count.value
 67 |         name = (c_char*256)()
 68 |         cuDeviceGetName(name,256,device)
 69 |         info["name"] = name.value
 70 |         memsize = c_uint()
 71 |         cuDeviceTotalMem(byref(memsize),device)
 72 |         info["memory"] = memsize.value
 73 |         free,total = c_uint(),c_uint()
 74 |         cuMemGetInfo(byref(free),byref(total))
 75 |         info["free"] = free.value
 76 |         major,minor = c_int(),c_int()
 77 |         cuDeviceComputeCapability(byref(major),byref(minor),device)
 78 |         info["capability"] = (major.value,minor.value)
 79 |         props = CUdevprop()
 80 |         cuDeviceGetProperties(byref(props),device)
 81 |         info["properties"] = props
 82 |         self.info = info
 83 | 
 84 |     def __str__(self):
 85 |         s = ["Device Info:\n"]
 86 |         i = self.info
 87 |         s.append("%-19s = %d" % (
 88 |             "number of devices",i["count"]))
 89 |         s.append("%-19s = %d" % (
 90 |             "current device ID",self.deviceID))
 91 |         s.append("%-19s = %s" % (
 92 |             "device name =",i["name"]))
 93 |         s.append("%-19s = %.f MB" % (
 94 |             "memory size",i["memory"]/1024.**2))
 95 |         s.append("%-19s = %.f MB" % (
 96 |             "memory free",i["free"]/1024.**2))
 97 |         s.append("%-19s = %.f MHz" % (
 98 |             "clock rate",i["properties"].clockRate/1000.))
 99 |         s.append("%-19s = %d" % (
100 |             "major",i["capability"][0]))
101 |         s.append("%-19s = %d" % (
102 |             "minor",i["capability"][1]))
103 |         s.append(21*"-")
104 |         s.append(str(i["properties"]))
105 |         return "\n".join(s)
106 | 


--------------------------------------------------------------------------------
/cuda/sugar/query/cuda_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import cuda.cuda as cuda
 3 | from ctypes import byref, c_int
 4 | import logging
 5 | log = logging.getLogger('python-cuda')  # every function below logs through this logger
 6 | 
 7 | CUDART_VERSION = 2010  # compared against 2000 in get_devices to enable extra fields
 8 | 
 9 | def cutilSafeCall(retval):  # non-zero status codes are only logged, never raised
10 |     if retval != 0:
11 |         log.error( 'error! %s' % retval)
12 | 
13 | def get_device_count():
14 |     device_count = c_int()
15 |     cutilSafeCall(cuda.cudaGetDeviceCount(byref(device_count)));
16 |     return device_count.value
17 | 
18 | def has_cuda_device():
19 |     dev_count = get_device_count()
20 |     if dev_count > 0:
21 |         log.debug("Found %d gpu devices" % dev_count)
22 |     else:
23 |         log.debug("There is no device supporting CUDA")
24 |         return False
25 | 
26 |     cuda_enabled = False
27 | 
28 |     for dev in range(0, dev_count):
29 |         dev_prop = cuda.cudaDeviceProp()
30 |         retval = cuda.cudaGetDeviceProperties(byref(dev_prop), dev)
31 |         if dev_prop.major == 9999 and dev_prop.minor == 9999:
32 |             log.debug( "Device %s does not support cuda." % dev)
33 |             continue
34 |         cuda_enabled = True
35 |         break
36 | 
37 |     if not cuda_enabled:
38 |         log.debug("There is no device supporting CUDA")
39 |     return cuda_enabled
40 | 
41 | def needs_emulation():
42 |     return has_cuda_device()
43 | 
44 | def get_devices():  # logs the properties of each device it can query at DEBUG level; returns None
45 |     dev_count = get_device_count()
46 |     if dev_count > 0:
47 |         log.debug("found %d gpu devices" % dev_count)
48 |     else:
49 |         log.debug("there is no device supporting cuda")
50 |     # With dev_count <= 0 the loop below simply does not run.
51 |     for dev in range(0, dev_count):
52 |         dev_prop = cuda.cudaDeviceProp()
53 |         retval = cuda.cudaGetDeviceProperties(byref(dev_prop), dev)
54 |         if retval == 3:  # NOTE(review): raw status code 3 - confirm which cudaError_t this denotes
55 |             log.debug( "there is no device supporting cuda")
56 |             break
57 |         elif dev == 0: 
58 |             if dev_prop.major == 9999 and dev_prop.minor == 9999:
59 |                 log.debug( "there is no device supporting cuda.")
60 |             elif dev_count == 1:
61 |                 log.debug( "there is 1 device supporting cuda")
62 |             else:
63 |                 log.debug( "there are %d devices supporting cuda" % dev_count)
64 |         # The summary above is logged once (for device 0); the details follow per device.
65 |         log.debug('Device %d: "%s"' % (dev, dev_prop.name))
66 |         log.debug("Major revision number:                         %d" % dev_prop.major)
67 |         log.debug("Minor revision number:                         %d" % dev_prop.minor)
68 |         log.debug("Total amount of global memory:                 %u bytes" % dev_prop.totalGlobalMem)
69 | 
70 |         if CUDART_VERSION >= 2000:
71 |             log.debug("Number of multiprocessors:                     %d", dev_prop.multiProcessorCount);
72 |             log.debug("Number of cores:                               %d", 8 * dev_prop.multiProcessorCount);  # assumes 8 cores per multiprocessor - TODO confirm
73 | 
74 |         log.debug( "Total amount of constant memory:               %u bytes" % dev_prop.totalConstMem)
75 |         log.debug( "Total amount of shared memory per block:       %u bytes" % dev_prop.sharedMemPerBlock)
76 |         log.debug( "Total number of registers available per block: %d" % dev_prop.regsPerBlock)
77 |         log.debug( "Warp size:                                     %d" % dev_prop.warpSize)
78 |         log.debug( "Maximum number of threads per block:           %d" % dev_prop.maxThreadsPerBlock)
79 |         log.debug( "Maximum sizes of each dimension of a block:    %d x %d x %d" % (dev_prop.maxThreadsDim[0], dev_prop.maxThreadsDim[1], dev_prop.maxThreadsDim[2]))
80 |         log.debug( "Maximum sizes of each dimension of a grid:     %d x %d x %d" % (dev_prop.maxGridSize[0], dev_prop.maxGridSize[1], dev_prop.maxGridSize[2]))
81 |         log.debug( "Maximum memory pitch:                          %u bytes" % dev_prop.memPitch)
82 |         log.debug( "Texture alignment:                             %u bytes" % dev_prop.textureAlignment)
83 |         log.debug( "Clock rate:                                    %.2f GHz" % (dev_prop.clockRate * (1e-6)))  # presumably clockRate is in kHz - verify
84 | 
85 |         if CUDART_VERSION >= 2000:
86 |             log.debug("Concurrent copy and execution:                 %s" % bool(dev_prop.deviceOverlap))
87 | 


--------------------------------------------------------------------------------
/cuda/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from logger import *
2 | from libutils import *
3 | from decorator import memoize
4 | 
5 | 


--------------------------------------------------------------------------------
/cuda/utils/libutils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import ctypes 
 3 | import os
 4 | import platform
 5 | 
 6 | OSNAME = platform.system()
 7 | 
 8 | def get_lib(name, cdll_opts = None):
 9 |     libname = None
10 |     if OSNAME == "Linux": 
11 |         libname = "lib" + name + ".so"
12 |     elif OSNAME == "Darwin": 
13 |         libname = "lib" + name + ".dylib"
14 |     elif OSNAME == "Windows": 
15 |         import _winreg as wreg
16 |         reg = wreg.ConnectRegistry(None, wreg.HKEY_LOCAL_MACHINE)
17 |         key = wreg.OpenKey(reg, r"SOFTWARE\NVIDIA Corporation\Installed Products\NVIDIA CUDA")
18 |         cuda_bin = os.path.join(wreg.QueryValueEx(key, "InstallDir")[0],"bin")
19 |         libname = os.path.join(cuda_bin, "%s.dll" % name)
20 |         if name == "cuda":
21 |             libname = "nvcuda.dll"
22 |         lib = ctypes.windll.LoadLibrary( libname ) 
23 |         return lib
24 |     if cdll_opts:
25 |         lib = ctypes.CDLL(libname, cdll_opts)
26 |     else: 
27 |         lib = ctypes.CDLL(libname)
28 |     return lib
29 | 
30 | if __name__ == "__main__":
31 |     try:
32 |         print "Loading libcuda..."
33 |         get_lib("cuda")
34 |         print "Test PASSED"
35 |     except:
36 |         print "Test FAILED"
37 | 


--------------------------------------------------------------------------------
/cuda/utils/logger.py:
--------------------------------------------------------------------------------
 1 | # Setup logging globally (ie root logger)
 2 | import types
 3 | import logging
 4 | import logging.handlers
 5 | import platform
 6 | 
 7 | INFO_NO_NEWLINE = logging.INFO + 1
 8 | 
 9 | class MultipleFormatHandler(logging.StreamHandler):
10 | 
11 |     formatters = {  logging.INFO: logging.Formatter(">>> %(message)s\n"),
12 |                     INFO_NO_NEWLINE: logging.Formatter(">>> %(message)s"),
13 |                     logging.DEBUG: logging.Formatter("%(filename)s:%(lineno)d - %(levelname)s - %(message)s\n"),
14 |                     logging.WARN: logging.Formatter("%(filename)s:%(lineno)d - %(levelname)s - %(message)s\n"),
15 |                     logging.CRITICAL: logging.Formatter("%(filename)s:%(lineno)d - %(levelname)s - %(message)s\n"),
16 |                     logging.ERROR: logging.Formatter("%(filename)s:%(lineno)d - %(levelname)s - %(message)s\n")}
17 | 
18 |     def format(self,record):
19 |         return self.formatters[record.levelno].format(record)
20 | 
21 |     def emit(self, record):
22 |         try:
23 |             msg = self.format(record)
24 |             fs = "%s"
25 |             if not hasattr(types, "UnicodeType"): #if no unicode support...
26 |                 self.stream.write(fs % msg)
27 |             else:
28 |                 try:
29 |                     self.stream.write(fs % msg)
30 |                 except UnicodeError:
31 |                     self.stream.write(fs % msg.encode("UTF-8"))
32 |             self.flush()
33 |         except (KeyboardInterrupt, SystemExit):
34 |             raise
35 |         except:
36 |             self.handleError(record)
37 | 
38 | logger = logging.getLogger('python-cuda')
39 | logger.setLevel(logging.INFO)
40 | 
41 | mfh = MultipleFormatHandler()
42 | logger.addHandler(mfh)
43 | 
44 | if platform.system() == "Linux":
45 |     syslog_handler = logging.handlers.SysLogHandler(address='/dev/log')
46 |     formatter = logging.Formatter("%(filename)s:%(lineno)d - %(levelname)s - %(message)s\n")
47 |     syslog_handler.setFormatter(formatter)
48 |     logger.addHandler(syslog_handler)
49 | 
50 | def enable_debug():
51 |     logger.setLevel(logging.DEBUG)
52 | 


--------------------------------------------------------------------------------
/mkdist:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | rm -rf build MANIFEST  # start clean: drop any previous build tree and manifest
3 | python setup.py bdist --formats=gztar  # build a binary distribution as a gzipped tarball
4 | rm -rf build MANIFEST  # remove the intermediates again afterwards
5 | 


--------------------------------------------------------------------------------
/oldcode/cu/__init__.py:
--------------------------------------------------------------------------------
1 | from cu_api import *
2 | 


--------------------------------------------------------------------------------
/oldcode/cublas/__init__.py:
--------------------------------------------------------------------------------
1 | from cublas_api import *
2 | 


--------------------------------------------------------------------------------
/oldcode/cublas/cublas_defs.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | 
 3 | from ctypes import *
 4 | 
 5 | c_int_p   = POINTER(c_int)  # int *
 6 | c_uint_p  = POINTER(c_uint)  # unsigned int *
 7 | c_float_p = POINTER(c_float)  # float *
 8 | 
 9 | ###include "cuComplex.h"   /* import complex data type */
10 | ##
11 | ##/* CUBLAS status returns */
12 | ###define CUBLAS_STATUS_SUCCESS           0x00000000
13 | ###define CUBLAS_STATUS_NOT_INITIALIZED   0x00000001
14 | ###define CUBLAS_STATUS_ALLOC_FAILED      0x00000003
15 | ###define CUBLAS_STATUS_INVALID_VALUE     0x00000007
16 | ###define CUBLAS_STATUS_MAPPING_ERROR     0x0000000B
17 | ###define CUBLAS_STATUS_EXECUTION_FAILED  0x0000000D
18 | ###define CUBLAS_STATUS_INTERNAL_ERROR    0x0000000E
19 | CUBLAS_STATUS_SUCCESS          = 0x00000000
20 | CUBLAS_STATUS_NOT_INITIALIZED  = 0x00000001
21 | CUBLAS_STATUS_ALLOC_FAILED     = 0x00000003
22 | CUBLAS_STATUS_INVALID_VALUE    = 0x00000007
23 | CUBLAS_STATUS_MAPPING_ERROR    = 0x0000000B
24 | CUBLAS_STATUS_EXECUTION_FAILED = 0x0000000D
25 | CUBLAS_STATUS_INTERNAL_ERROR   = 0x0000000E
26 | 
27 | ##/* CUBLAS data types */
28 | ##typedef unsigned int cublasStatus;
29 | cublasStatus = c_uint  # ctypes counterpart of the C typedef quoted above
30 | 


--------------------------------------------------------------------------------
/oldcode/cuda/__init__.py:
--------------------------------------------------------------------------------
1 | from cuda_api import *
2 | 
3 | 


--------------------------------------------------------------------------------
/oldcode/cufft/__init__.py:
--------------------------------------------------------------------------------
1 | from cufft_api import *
2 | 


--------------------------------------------------------------------------------
/oldcode/cufft/cufft_api.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*- 
 3 | 
 4 | # XXX
 5 | # --
 6 | # Code forked from python-cuda-2.0_42 © Arno Pähler, 2007-08
 7 | # -- 
 8 | 
 9 | from cufft_defs import *
10 | from cuda.utils import libutils
11 | 
12 | cufft = libutils.get_lib("cufft", RTLD_GLOBAL)  # RTLD_GLOBAL arrives via cufft_defs' "from ctypes import *"
13 | 
14 | #cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, 
15 | #                                 int nx, 
16 | #                                 cufftType type, 
17 | #                                 int batch);
18 | cufftPlan1d = cufft.cufftPlan1d
19 | cufftPlan1d.restype = cufftResult
20 | cufftPlan1d.argtypes = [ cufftHandle_p,
21 |                         c_int, cufftType, c_int ]
22 | 
23 | #cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, 
24 | #                                 int nx, int ny,
25 | #                                 cufftType type);
26 | cufftPlan2d = cufft.cufftPlan2d
27 | cufftPlan2d.restype = cufftResult
28 | cufftPlan2d.argtypes = [ cufftHandle_p,
29 |                         c_int, c_int, cufftType ]
30 | 
31 | #cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, 
32 | #                                 int nx, int ny, int nz, 
33 | #                                 cufftType type);
34 | cufftPlan3d = cufft.cufftPlan3d
35 | cufftPlan3d.restype = cufftResult
36 | cufftPlan3d.argtypes = [ cufftHandle_p,
37 |                         c_int, c_int, c_int, cufftType ]
38 | 
39 | #cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
40 | cufftDestroy = cufft.cufftDestroy
41 | cufftDestroy.restype = cufftResult
42 | cufftDestroy.argtypes = [ cufftHandle ]
43 | 
44 | #cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, 
45 | #                                  cufftComplex *idata,
46 | #                                  cufftComplex *odata,
47 | #                                  int direction);
48 | cufftExecC2C = cufft.cufftExecC2C
49 | cufftExecC2C.restype = cufftResult
50 | cufftExecC2C.argtypes = [ cufftHandle, c_uint, c_uint, c_int ]  # NOTE(review): idata/odata are pointers in the C prototype but declared c_uint here; looks like it would truncate 64-bit addresses - confirm
51 | 
52 | #cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, 
53 | #                                  cufftReal *idata,
54 | #                                  cufftComplex *odata);
55 | cufftExecR2C = cufft.cufftExecR2C
56 | cufftExecR2C.restype = cufftResult
57 | cufftExecR2C.argtypes = [ cufftHandle, c_uint, c_uint ]  # NOTE(review): same pointer-as-c_uint declaration as cufftExecC2C - confirm
58 | 
59 | #cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, 
60 | #                                  cufftComplex *idata,
61 | #                                  cufftReal *odata);
62 | cufftExecC2R = cufft.cufftExecC2R
63 | cufftExecC2R.restype = cufftResult
64 | cufftExecC2R.argtypes = [ cufftHandle, c_uint, c_uint ]  # NOTE(review): same pointer-as-c_uint declaration as cufftExecC2C - confirm
65 | 


--------------------------------------------------------------------------------
/oldcode/cufft/cufft_defs.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | 
 3 | from ctypes import *
 4 | 
 5 | #// CUFFT API function return values 
 6 | #typedef enum cufftResult_t {
 7 | #    CUFFT_SUCCESS        = 0x0,
 8 | #    CUFFT_INVALID_PLAN   = 0x1,
 9 | #    CUFFT_ALLOC_FAILED   = 0x2,
10 | #    CUFFT_INVALID_TYPE   = 0x3,
11 | #    CUFFT_INVALID_VALUE  = 0x4,
12 | #    CUFFT_INTERNAL_ERROR = 0x5,
13 | #    CUFFT_EXEC_FAILED    = 0x6,
14 | #    CUFFT_SETUP_FAILED   = 0x7,
15 | #    CUFFT_INVALID_SIZE   = 0x8
16 | #} cufftResult;
17 | 
18 | cufftResult = c_int  # the C enum is carried as a plain int
19 | 
20 | CUFFT_SUCCESS        = 0x0
21 | CUFFT_INVALID_PLAN   = 0x1
22 | CUFFT_ALLOC_FAILED   = 0x2
23 | CUFFT_INVALID_TYPE   = 0x3
24 | CUFFT_INVALID_VALUE  = 0x4
25 | CUFFT_INTERNAL_ERROR = 0x5
26 | CUFFT_EXEC_FAILED    = 0x6
27 | CUFFT_SETUP_FAILED   = 0x7
28 | CUFFT_INVALID_SIZE   = 0x8
29 | 
30 | #// CUFFT defines and supports the following data types
31 | #
32 | #// cufftHandle is a handle type used to store and access CUFFT plans.
33 | #typedef unsigned int cufftHandle;
34 | #
35 | #// cufftReal is a single-precision, floating-point real data type.
36 | #typedef float cufftReal;
37 | #
38 | #// cufftComplex is a single-precision, floating-point complex data type that 
39 | #// consists of interleaved real and imaginary components.
40 | #typedef float cufftComplex[2];
41 | 
42 | cufftHandle  = c_uint
43 | cufftReal    = c_float
44 | cufftComplex = (c_float*2)  # array of two floats, matching the float[2] typedef above
45 | 
46 | cufftHandle_p = POINTER(cufftHandle)  # cufftHandle *
47 | 
48 | #// CUFFT transform directions 
49 | ##define CUFFT_FORWARD -1 // Forward FFT
50 | ##define CUFFT_INVERSE  1 // Inverse FFT
51 | 
52 | CUFFT_FORWARD = -1  ## Forward FFT
53 | CUFFT_INVERSE =  1  ## Inverse FFT
54 | 
55 | #// CUFFT supports the following transform types 
56 | #typedef enum cufftType_t {
57 | #    CUFFT_R2C = 0x2a, // Real to Complex (interleaved)
58 | #    CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
59 | #    CUFFT_C2C = 0x29  // Complex to Complex, interleaved
60 | #} cufftType;
61 | 
62 | cufftType = c_int  # the C enum is carried as a plain int
63 | 
64 | CUFFT_R2C = 0x2a  ## Real to Complex (interleaved)
65 | CUFFT_C2R = 0x2c  ## Complex (interleaved) to Real
66 | CUFFT_C2C = 0x29  ## Complex to Complex, interleaved
67 | 


--------------------------------------------------------------------------------
/oldcode/examples/TODO:
--------------------------------------------------------------------------------
1 | - put 6.963 examples (converted to python-cuda)
2 | - put SDK examples (converted to python-cuda)
3 | - anything else!


--------------------------------------------------------------------------------
/oldcode/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/oldcode/examples/bw_test.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | 
  4 | from cuda.cuda import *
  5 | from cuda.utils.cuda_utils import mallocHost
  6 | from cuda.utils.ctypes_array import convert
  7 | 
  8 | from time import time
  9 | from numpy import abs,max
 10 | 
 11 | PAGEABLE = 0
 12 | PINNED = 1
 13 | MEMCOPY_ITERATIONS = 250
 14 | 
 15 | def compare(a,b):
 16 |     a1 = convert(a)
 17 |     b1 = convert(b)
 18 |     diff = max(abs(a1-b1))
 19 |     return diff
 20 | 
 21 | #///////////////////////////////////////////////////////////////////////////////
 22 | #//  test the bandwidth of a device to host memcopy of a specific size
 23 | #///////////////////////////////////////////////////////////////////////////////
 24 | def testDeviceToHostTransfer(size,mode):  # times MEMCOPY_ITERATIONS device-to-host copies of `size` c_int elements
 25 |     dtype = c_int
 26 |     memSize = size*sizeof(dtype)  # bytes moved per copy
 27 |     amountCopied = memSize*MEMCOPY_ITERATIONS  # bytes moved by the whole timed loop
 28 |     d_idata = c_void_p()  # filled in by cudaMalloc below
 29 | 
 30 |     h_idata = mallocHost(size,dtype,mode)
 31 |     h_odata = mallocHost(size,dtype,mode)
 32 |     for i in range(size):
 33 |         h_idata[i] = dtype(size-i)
 34 |         h_odata[i] = dtype(123)  # placeholder, overwritten by the copy back
 35 | 
 36 |     cudaMalloc(byref(d_idata),memSize)
 37 |     cudaMemcpy(d_idata,h_idata,memSize,cudaMemcpyHostToDevice)  # untimed upload of the reference data
 38 |     t0 = time()
 39 |     for i in range(MEMCOPY_ITERATIONS):
 40 |         cudaMemcpy(h_odata,d_idata,memSize,cudaMemcpyDeviceToHost)
 41 |     t1 = time()-t0  # elapsed seconds for all iterations
 42 |     diff = compare(h_idata,h_odata)  # 0 when the data came back unchanged
 43 |     print "Max abs difference = %3s" % diff,
 44 |     bandwidthInGBs = amountCopied/(t1*float((1 << 30)))  # bytes per second divided by 2**30
 45 | 
 46 |     if mode == PINNED:  # cudaFreeHost is called for PINNED buffers only
 47 |         cudaFreeHost(h_idata)
 48 |         cudaFreeHost(h_odata)
 49 |     cudaFree(d_idata)
 50 |     print "Device To Host  : %4.1f GB/s" % bandwidthInGBs
 51 |     return bandwidthInGBs
 52 | 
 53 | #///////////////////////////////////////////////////////////////////////////////
 54 | #//! test the bandwidth of a host to device memcopy of a specific size
 55 | #///////////////////////////////////////////////////////////////////////////////
 56 | def testHostToDeviceTransfer(size,mode):  # times MEMCOPY_ITERATIONS host-to-device copies of `size` c_float elements
 57 |     dtype = c_float
 58 |     memSize = size*sizeof(dtype)  # bytes moved per copy
 59 |     amountCopied = memSize*MEMCOPY_ITERATIONS  # bytes moved by the timed loop
 60 |     d_idata = c_void_p()  # filled in by cudaMalloc below
 61 | 
 62 |     h_idata = mallocHost(size,dtype,mode)
 63 |     h_odata = mallocHost(size,dtype,mode)
 64 |     for i in range(size):
 65 |         h_idata[i] = dtype(size-i)
 66 |         h_odata[i] = dtype(456)  # placeholder, overwritten by the copy back
 67 | 
 68 |     cudaMalloc(byref(d_idata),memSize)
 69 |     t0 = time()
 70 |     for i in range(MEMCOPY_ITERATIONS):
 71 |         cudaMemcpy(d_idata,h_idata,memSize,cudaMemcpyHostToDevice)
 72 |     cudaMemcpy(h_odata,d_idata,memSize,cudaMemcpyDeviceToHost)  # NOTE(review): this read-back sits inside the timed region but is not counted in amountCopied - confirm intended
 73 |     t1 = time()-t0  # elapsed seconds, including the read-back above
 74 |     diff = compare(h_idata,h_odata)  # 0 when the data came back unchanged
 75 |     print "Max abs difference = %3s" % diff,
 76 |     bandwidthInGBs = amountCopied/(t1*float((1 << 30)))  # bytes per second divided by 2**30
 77 | 
 78 |     if mode == PINNED:  # cudaFreeHost is called for PINNED buffers only
 79 |         cudaFreeHost(h_idata)
 80 |         cudaFreeHost(h_odata)
 81 |     cudaFree(d_idata)
 82 |     print "Host To Device  : %4.1f GB/s" % bandwidthInGBs
 83 |     return bandwidthInGBs
 84 | 
 85 | #///////////////////////////////////////////////////////////////////////////////
 86 | #//! test the bandwidth of a device to device memcopy of a specific size
 87 | #///////////////////////////////////////////////////////////////////////////////
 88 | def testDeviceToDeviceTransfer(size,mode):  # times MEMCOPY_ITERATIONS device-to-device copies of `size` c_double elements
 89 |     dtype = c_double
 90 |     memSize = size*sizeof(dtype)  # bytes moved per copy
 91 |     amountCopied = memSize*MEMCOPY_ITERATIONS  # bytes moved by the timed loop
 92 |     d_idata = c_void_p()  # source buffer on the device
 93 |     d_odata = c_void_p()  # destination buffer on the device
 94 | 
 95 |     h_idata = mallocHost(size,dtype,mode)
 96 |     h_odata = mallocHost(size,dtype,mode)
 97 |     for i in range(size):
 98 |         h_idata[i] = dtype(size-i)
 99 |         h_odata[i] = dtype(789)  # placeholder, overwritten by the copy back
100 | 
101 |     cudaMalloc(byref(d_idata),memSize)
102 |     cudaMalloc(byref(d_odata),memSize)
103 |     cudaMemcpy(d_idata,h_idata,memSize,cudaMemcpyHostToDevice)  # untimed upload of the reference data
104 |     t0 = time()
105 |     for i in range(MEMCOPY_ITERATIONS):
106 |         cudaMemcpy(d_odata,d_idata,memSize,cudaMemcpyDeviceToDevice)
107 |     cudaThreadSynchronize()  # called before the clock is read
108 |     t1 = time()-t0  # elapsed seconds for all iterations
109 |     cudaMemcpy(h_odata,d_odata,memSize,cudaMemcpyDeviceToHost)  # untimed read-back for verification
110 |     diff = compare(h_idata,h_odata)  # 0 when the data came back unchanged
111 |     print "Max abs difference = %3s" % diff,
112 |     bandwidthInGBs = (2.*amountCopied)/(t1*float((1 << 30)))  # factor 2: presumably one read plus one write per copy - confirm
113 | 
114 |     if mode == PINNED:  # cudaFreeHost is called for PINNED buffers only
115 |         cudaFreeHost(h_idata)
116 |         cudaFreeHost(h_odata)
117 |     cudaFree(d_idata)
118 |     cudaFree(d_odata)
119 |     print "Device To Device: %4.1f GB/s" % bandwidthInGBs
120 |     return bandwidthInGBs
121 | 
122 | if __name__ == "__main__":
123 |     size = 1024*1024  # elements per buffer
124 |     memtype = {PAGEABLE:"pageable   ",PINNED:"page-locked"}  # labels padded to equal width for the banner
125 | 
126 |     for mode in (PAGEABLE,PINNED):  # run the three tests once per host-memory mode
127 |         print
128 |         print "+-------------------------+"
129 |         print "| Bandwidth transfer test |"
130 |         print "| using CUDA runtime API  |"
131 |         print "| with %s memory |" % memtype[mode]
132 |         print "+-------------------------+\n"
133 | 
134 |         testHostToDeviceTransfer(size,mode)
135 |         testDeviceToHostTransfer(size,mode)
136 |         testDeviceToDeviceTransfer(size,mode)
137 | 


--------------------------------------------------------------------------------
/oldcode/misc/README:
--------------------------------------------------------------------------------
1 | A temporary place for files that have not been sorted yet (e.g. from the old 'examples').
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/oldcode/misc/cf:
--------------------------------------------------------------------------------
1 | cu_fft.py


--------------------------------------------------------------------------------
/oldcode/misc/cmpG:
--------------------------------------------------------------------------------
1 | #!/bin/tcsh -f
2 | set flopt="--maxrregcount 32 --use_fast_math --gpu-name sm_11"
3 | set lib=""  # set but not referenced below
4 | nvcc -o ${1} -O3 $flopt ${1}.cu -lcublas  # build <arg>.cu into the executable <arg>, linked against cublas
5 | 


--------------------------------------------------------------------------------
/oldcode/misc/compileC:
--------------------------------------------------------------------------------
 1 | #!/bin/tcsh -f
 2 | set inc="-I/opt/local/Library/Frameworks/Python.framework/Versions/2.6/include/"
 3 | #set inc="-I/usr/include/python2.5"
 4 | set lib=""
 5 | set flags="-fPIC -O2 -msse2" # -malign-double (disabled; the space before the hash is needed to start a comment)
 6 | gcc -c $flags $inc ${1}.c  # compile <arg>.c to <arg>.o
 7 | gcc -shared $lib -o _${1}.so ${1}.o  # link the object into _<arg>.so
 8 | strip -x _${1}.so
 9 | rm *.o  # removes every object file in the current directory
10 | 


--------------------------------------------------------------------------------
/oldcode/misc/compileCG:
--------------------------------------------------------------------------------
1 | #!/bin/sh 
2 | echo "Compiling CPU functions"
3 | compileC cpuFunctions  # helper scripts are invoked by bare name, so they are resolved through PATH
4 | echo "Compiling GPU functions"
5 | compileG gpuFunctions
6 | compileG simple
7 | 


--------------------------------------------------------------------------------
/oldcode/misc/compileCX:
--------------------------------------------------------------------------------
 1 | #!/bin/tcsh -f
 2 | 
 3 | # Link all needed Intel routines into _vector.so
 4 | # thus avoiding later problems with dynamic loading
 5 | 
 6 | set src="vector"
 7 | set mkl="/opt/intel/mkl/10.0.1.014/lib/32"
 8 | set lib="$mkl/libmkl_intel.a $mkl/libmkl_intel_thread.a $mkl/libmkl_core.a $mkl/libguide.a -lpthread"
 9 | set flags="-static -fPIC -O2 -msse2 -malign-double"
10 | gcc -c $flags $src.c  # compile vector.c to vector.o
11 | gcc -shared -o _$src.so $src.o $lib  # link the object and the static MKL archives into _vector.so
12 | strip -x _$src.so
13 | rm *.o  # removes every object file in the current directory
14 | 


--------------------------------------------------------------------------------
/oldcode/misc/compileG:
--------------------------------------------------------------------------------
1 | #!/bin/tcsh -f
2 | set flag1="--ptx"
3 | set flag2="--cubin"
4 | #set flopt="--maxrregcount 12 --use_fast_math --gpu-architecture sm_11"
5 | set flopt="--use_fast_math --gpu-architecture sm_11"
6 | set lib=""  # set but not referenced below
7 | nvcc $flag1 $flopt ${1}.cu  |& grep -iv warning  # step 1: <arg>.cu to PTX; output lines containing warning are filtered out
8 | nvcc $flag2 $flopt ${1}.ptx |& grep -iv warning  # step 2: <arg>.ptx to cubin, same filtering
9 | 


--------------------------------------------------------------------------------
/oldcode/misc/compileGso:
--------------------------------------------------------------------------------
 1 | #!/bin/tcsh -f
 2 | set flag1=""
 3 | #set flopt="--maxrregcount 12 --use_fast_math --gpu-code sm_11"
 4 | set flopt="--use_fast_math --gpu-code sm_11 --ptxas-options=-v"
 5 | set lib="-L$CUDA/lib -lcudart -lcuda"  # expects the CUDA variable to be set by the caller
 6 | #nvcc ${flag1} ${flopt} ${1}.cu -c -o ${1}.o |& grep -iv warning
 7 | nvcc ${flag1} ${flopt} ${1}.cu -c -o ${1}.o  # compile <arg>.cu to <arg>.o
 8 | g++ -shared ${lib} -o lib${1}.so ${1}.o  # link the object into lib<arg>.so
 9 | strip -x lib${1}.so
10 | rm ${1}.o 
11 | 


--------------------------------------------------------------------------------
/oldcode/misc/cpuFunctions.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8: © Arno Pähler, 2007-08
  2 | 
  3 | from ctypes import *
  4 | 
  5 | #mm = CDLL("./_cpuFunctions.so")
  6 | mm = CDLL("./_cpuFunctions.dylib")  # path is relative to the current working directory
  7 | 
  8 | _cvp = c_void_p  # short aliases used in the argtypes lists below
  9 | _cfl = c_float
 10 | _cui = c_uint
 11 | 
 12 | #
 13 | # Utility functions
 14 | #
 15 | 
 16 | mm.rdtsc.restype = c_uint64;
 17 | mm.rdtsc.argtypes = None
 18 | 
 19 | def rdtsc():
 20 |     return mm.rdtsc()
 21 | 
 22 | ReadTimestampCounter = rdtsc  # alias for rdtsc
 23 | 
 24 | mm.microtime.restype = c_long;
 25 | mm.microtime.argtypes = None
 26 | 
 27 | def microtime():
 28 |     return mm.microtime()
 29 | 
 30 | mm.scale.restype = None
 31 | mm.scale.argtypes = [ _cvp, _cfl, _cui ]
 32 | 
 33 | def scale(a,s,n=None):  # n defaults to len(a), as in the wrappers below
 34 |     if n is None:
 35 |         n = len(a)
 36 |     mm.scale(a,s,n)
 37 | 
 38 | mm.l1norm.restype = _cfl
 39 | mm.l1norm.argtypes = [ _cvp, _cvp, _cui ]
 40 | 
 41 | def l1norm(a,b,n=None):
 42 |     if n is None:
 43 |         n = len(a)
 44 |     return mm.l1norm(a,b,n)
 45 | 
 46 | mm.arrayInit.restype = None
 47 | mm.arrayInit.argtypes = [ _cvp, _cui ]
 48 | 
 49 | def arrayInit(a,n=None):
 50 |     if n is None:
 51 |         n = len(a)
 52 |     mm.arrayInit(a,n)
 53 | 
 54 | vectorInit = arrayInit  # alias for arrayInit
 55 | 
 56 | mm.fixedInit.restype = None
 57 | mm.fixedInit.argtypes = [ _cvp, _cui ]
 58 | 
 59 | def fixedInit(a,n=None):
 60 |     if n is None:
 61 |         n = len(a)
 62 |     mm.fixedInit(a,n)
 63 | 
 64 | mm.randInit.restype = None
 65 | mm.randInit.argtypes = [ _cvp, _cui, _cfl, _cfl ]
 66 | 
 67 | def randInit(a,l,h,n=None):
 68 |     if n is None:
 69 |         n = len(a)
 70 |     mm.randInit(a,n,l,h)  # the C routine takes the count before l and h
 71 | 
 72 | mm.setZero.restype = None
 73 | mm.setZero.argtypes = [ _cvp, _cui ]
 74 | 
 75 | def setZero(a,n=None):
 76 |     if n is None:
 77 |         n = len(a)
 78 |     mm.setZero(a,n)
 79 | 
 80 | mm.checkError.restype = None
 81 | mm.checkError.argtypes = [ _cvp, _cvp, _cui, _cvp, _cvp ]
 82 | 
 83 | def checkError(a,b,n=None):
 84 |     if n is None:
 85 |         n = len(a)
 86 |     err = c_float()
 87 |     mxe = c_float()
 88 |     mm.checkError(a,b,n,byref(err),byref(mxe))  # results come back through the two c_float out-parameters
 89 |     return err.value,mxe.value
 90 | 
 91 | mm.checkTrig.restype = None
 92 | mm.checkTrig.argtypes = [ _cvp, _cvp, _cvp, _cvp, _cui ]
 93 | 
 94 | def checkTrig(a,b,n=None):
 95 |     if n is None:
 96 |         n = len(a)
 97 |     e = c_float()
 98 |     m = c_float()
 99 |     mm.checkTrig(byref(e),byref(m),a,b,n)  # here the two out-parameters come first
100 |     return e.value,m.value
101 | 
102 | #
103 | # Math functions
104 | #
105 | 
106 | mm.gflops.restype = None
107 | mm.gflops.argtypes = [ ]
108 | 
109 | def cpuGFLOPS():
110 |     mm.gflops()
111 | 
112 | mm.blsc.restype = None
113 | mm.blsc.argtypes = [ _cvp, _cvp, _cvp,
114 |                    _cvp, _cvp, _cfl, _cfl, _cui ]
115 | 
116 | def cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,size):  # arguments are forwarded unchanged; size has no default here
117 |     mm.blsc(h_C,h_P,h_S,h_X,h_T,R,V,size)
118 | 
119 | mm.poly5.restype = None
120 | mm.poly5.argtypes = [ _cvp, _cvp, _cui ]
121 | 
122 | mm.poly10.restype = None
123 | mm.poly10.argtypes = [ _cvp, _cvp, _cui ]
124 | 
125 | mm.poly20.restype = None
126 | mm.poly20.argtypes = [ _cvp, _cvp, _cui ]
127 | 
128 | mm.poly40.restype = None
129 | mm.poly40.argtypes = [ _cvp, _cvp, _cui ]
130 | 
131 | def cpuPOLY5(x,y,n=None):  # n defaults to len(x), as in the wrappers below
132 |     if n is None:
133 |         n = len(x)
134 |     mm.poly5(x,y,n)
135 | 
136 | def cpuPOLY10(x,y,n=None):
137 |     if n is None:
138 |         n = len(x)
139 |     mm.poly10(x,y,n)
140 | 
141 | def cpuPOLY20(x,y,n=None):
142 |     if n is None:
143 |         n = len(x)
144 |     mm.poly20(x,y,n)
145 | 
146 | def cpuPOLY40(x,y,n=None):
147 |     if n is None:
148 |         n = len(x)
149 |     mm.poly40(x,y,n)
150 | 
151 | mm.saxpy.restype = None
152 | mm.saxpy.argtypes = [ _cfl, _cvp, _cvp, _cui ]
153 | 
154 | def cpuSAXPY(a,x,y,n=None):
155 |     if n is None:
156 |         n = len(x)
157 |     mm.saxpy(a,x,y,n)
158 | 
159 | mm.vadd.restype = None
160 | mm.vadd.argtypes = [ _cvp, _cvp, _cui ]
161 | 
162 | def cpuVADD(x,y,n=None):
163 |     if n is None:
164 |         n = len(x)
165 |     mm.vadd(x,y,n)
166 | 
167 | mm.sdot.restype = c_float
168 | mm.sdot.argtypes = [ _cvp, _cvp, _cui ]
169 | 
170 | def cpuSDOT(x,y,n=None):
171 |     if n is None:
172 |         n = len(x)
173 |     return mm.sdot(x,y,n)
174 | 
175 | mm.sgemm.restype = None
176 | mm.sgemm.argtypes = [
177 |     _cvp, _cvp, _cvp,
178 |     _cui, _cui, _cui ]
179 | 
180 | def cpuSGEMM(C,A,B,m,k,n):  # arguments are forwarded unchanged; the three sizes have no defaults
181 |     mm.sgemm(C,A,B,m,k,n)
182 | 
183 | mm.trig.restype = None
184 | mm.trig.argtypes = [ _cvp, _cvp, _cvp, _cui ]
185 | 
186 | def cpuTRIG(a,x,y,n=None):  # unlike the wrappers above, n defaults to len(a), the first argument
187 |     if n is None:
188 |         n = len(a)
189 |     mm.trig(a,x,y,n)
190 | 


--------------------------------------------------------------------------------
/oldcode/misc/ctypes_array.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8: © Arno Pähler, 2007-08
  2 | # important details are © Thomas Heller
  3 | from sys import byteorder
  4 | from ctypes import *
  5 | from numpy.core.multiarray import array as multi_array
  6 | from numpy import isfortran
  7 | 
  8 | __all__ = ["convert","className","typeName"]
  9 | 
 10 | ################################################################
 11 | # some shortcut utilities
 12 | 
 13 | # use this as eval(PRODUCT % "<sequence expression>"), e.g. eval(PRODUCT % "obj.shape")
 14 | PRODUCT = "reduce(lambda x,y:x*y,%s,1)"
 15 | 
 16 | def className(o):
 17 |     o = type(o)
 18 |     c = o.__class__.__name__
 19 |     # if gone too far, back up one level
 20 |     if c == "type":
 21 |         c = o.__name__
 22 |     return c
 23 | 
 24 | def typeName(o):
 25 |     if isSimpleType(o):
 26 |         return o.__class__.__name__
 27 |     while not isSimpleType(o):
 28 |         try:
 29 |             o = o._type_
 30 |         except AttributeError:
 31 |             o = type(o)
 32 |             break
 33 |     return o.__name__
 34 | 
 35 | def isCtypesFamily(o):
 36 |     c_n = "ArrayType","PointerType","SimpleType"
 37 |     cn = o.__class__.__name__
 38 |     isObject = cn[:2] == "c_" or cn[:5] =="LP_c_"
 39 |     isType = cn in c_n
 40 |     return isObject or isType
 41 | 
 42 | def isNumpyArray(o):
 43 |     return className(o) == "ndarray"
 44 | 
 45 | def isArrayType(o):
 46 |     return className(o) == "ArrayType"
 47 | 
 48 | def isSimpleType(o):
 49 |     return className(o) == "SimpleType"
 50 | 
 51 | if byteorder == "little":
 52 |     T = "<"
 53 | else:
 54 |     T = ">"
 55 | 
 56 | c_Dict = {
 57 |     "c_byte"      : "%si1" % T,
 58 |     "c_short"     : "%si2" % T,
 59 |     "c_int"       : "%si4" % T,
 60 |     "c_long"      : "%si4" % T,
 61 |     "c_longlong"  : "%si8" % T,
 62 |     "c_ubyte"     : "%su1" % T,
 63 |     "c_ushort"    : "%su2" % T,
 64 |     "c_uint"      : "%su4" % T,
 65 |     "c_ulong"     : "%su4" % T,
 66 |     "c_ulonglong" : "%su8" % T,
 67 |     "c_float"     : "%sf4" % T,
 68 |     "c_double"    : "%sf8" % T,}
 69 | 
 70 | n_Dict = dict(
 71 |     [(v,eval(k)) for k,v in c_Dict.items()])
 72 | 
 73 | ################################################################
 74 | # public functions
 75 | 
 76 | def convert(obj,dims=None,order="C",out=None):
 77 |     """Converts ctypes array to numpy array and vice versa
 78 |     convert determines the input type (ctypes or numpy)
 79 |     internally and returns an object of the opposite type.
 80 | 
 81 |     NOTE: do NOT do the following:
 82 |     n1 = numpy_array; c1 = convert(n1); n1 = convert(n1)
 83 |                                         ^^           ^^
 84 |     nasty things will happen! it is ok to do
 85 |     n1 = numpy_array; c1 = convert(n1); n2 = convert(n1)
 86 |                                         ^^           ^^
 87 |     A 1D ctypes array ca can be converted to a (m,n,k,...)
 88 |     numpy array na in C order with convert(ca,(m,n,k,...),"C")
 89 |     and to na in F order with convert(ca,(m,n,k,...),"F").
 90 |     (m,n,k,...) is reversed internally and the order of
 91 |     matrix-matrix or matrix-vector multiplication must be
 92 |     inverted, when comparing with C order results.
 93 | 
 94 |     This code is based on similar code posted by Thomas Heller"""
 95 | 
 96 |     # for obj in simple c_types (e.g.c_float(1.))
 97 |     if isSimpleType(obj):
 98 |         """convert simple ctype to numpy array"""
 99 |         obj = obj.value,  # trailing comma: wrap the value in a 1-tuple
100 |         return multi_array(obj,copy=False)
101 | 
102 |     # for obj in scalars(e.g. 1.), lists and tuples
103 |     if not (isCtypesFamily(obj) or isNumpyArray(obj)):
104 |         """convert Python scalar, list or tuple to numpy array"""
105 |         obj = tuple(obj)  # NOTE(review): tuple() raises TypeError for a bare scalar such as 1. - confirm scalars are really supported
106 |         return multi_array(obj,copy=False)
107 | 
108 |     # numpy ==> ctypes
109 |     if isNumpyArray(obj):
110 |         """convert numpy array to ctypes array"""
111 |         do_copy = False
112 | 
113 |         # if obj is C order and return object should be
114 |         # Fortran order, transpose obj for Fortran order
115 |         if not isfortran(obj) and order == "F":
116 |             obj = obj.T
117 | 
118 |         ai = obj.__array_interface__
119 |         if ai["strides"]:
120 |             pass  # strided input is currently not treated specially
121 |             # do something sensible
122 | #            obj = obj.T
123 | #            ai = obj.__array_interface__
124 | 
125 |         addr,readonly = ai["data"]
126 |         if readonly: # make a copy
127 |             do_copy = True
128 | 
129 |         ## code below should consider strides
130 |         i_size = obj.itemsize
131 |         if out is None:
132 | #            print "SIZE",eval(PRODUCT % "obj.shape")
133 |             t = n_Dict[ai["typestr"]]
134 |             for dim in ai["shape"][::-1]:  # build the nested ctypes array type, last dimension innermost
135 |                 t = t*dim
136 |             if do_copy:
137 |                 out = t()
138 |                 memmove(out,addr,obj.size*i_size)
139 |             else:
140 |                 out = t.from_address(addr)  # no copy: the ctypes array is created at the numpy buffer's address
141 |             out.__array_interface__ = ai
142 |             out.__keep = ai  # NOTE(review): presumably meant to keep the source alive, but ai is only the interface dict - confirm
143 |             return out
144 |         else:
145 |             size1 = obj.size
146 |             size2 = len(out)
147 |             size  = min(size1,size2)*i_size  # copy no more than the smaller of the two buffers
148 |             memmove(out,addr,size)
149 |             out.__array_interface__ = ai
150 |             out.__keep = ai
151 |             return out
152 |     # ctypes ==> numpy
153 |     else:
154 |         """convert ctypes array to numpy array"""
155 |         typestr = c_Dict[typeName(obj)]
156 |         strides = None
157 |         if dims is None:
158 |             shape = []
159 |             o = obj
160 |             while isArrayType(o):  # collect one length per nesting level of the ctypes array
161 |                 shape.append(o._length_)
162 |                 o = o._type_
163 |             shape = tuple(shape)
164 |         else:
165 |             shape = tuple(dims)
166 |             p = sizeof(eval(typeName(obj)))  # element size in bytes
167 |             products = [p]
168 |             for d in dims[:-1]:  # running byte strides with the first dimension varying fastest
169 |                 p *= d
170 |                 products.append(p)
171 |             if order == "F":  # for any other order strides stays None
172 |                 strides = tuple(products)
173 | 
174 |         ao = addressof(obj)
175 |         ai = \
176 |             {
177 |             'descr'  : [('',typestr)],
178 |             '__ref'  : ao,
179 |             'strides': strides,
180 |             'shape'  : shape,
181 |             'version': 3,
182 |             'typestr': typestr,
183 |             'data'   : (ao,False)
184 |             }
185 |         obj.__array_interface__ = ai
186 | 
187 |         return multi_array(obj,copy=False)
188 | 


--------------------------------------------------------------------------------
/oldcode/misc/ctypes_array_test.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | 
 3 | from ctypes import *
 4 | from ctypes_array import *
 5 | from numpy import *
 6 | from numpy.random import rand
 7 | 
 8 | def pif(a):
 9 |     ai = a.__array_interface__
10 |     b = ai["strides"] is not None
11 |     return " ".join([
12 |         "%-5s:" % str(b),
13 |         ":",className(a),
14 |         str(ai["strides"]),
15 |         str(ai["shape"]),])
16 | 
# Test data: a random 4x8 array (ac) and a copy of it requested
# in Fortran order (af).
ac = rand(4,8)
af = array(ac,order="F")#ac.T
19 | 
def norm2(a):
    """Return the L2 norm of a, rounded to three decimals."""
    squares = a*a
    return round(sqrt(sum(squares)),3)
22 | 
# --- Report the array interface of the two original numpy arrays.
print "\nOriginal arrays ac, af = ac.T"
print "hasStrides(ac):",pif(ac)
print "hasStrides(af):",pif(af)
print "type ac,af",type(ac),type(af)

# --- numpy => ctypes, letting convert() create the result objects.
print "\nConvert to ctypes: ac => bc, af => bf"
bc = convert(ac); print "hasStrides(bc):",pif(bc)
bf = convert(af); print "hasStrides(bf):",pif(bf)
print "\ntype bc,bf",type(bc),type(bf)

# --- Mixed arithmetic between numpy arrays and the converted objects;
# --- the printed norms are the difference between source and result.
print "\nCan combine numpy arrays and ctypes objects with array interface"
delta = (ac-bc).flatten()
print "L2-norm ac-bc",norm2(delta)
delta = (af-bf).flatten()
print "L2-norm af-bf",norm2(delta)

# --- ctypes => numpy, once with the default order and once with order="F".
print "\nConvert to numpy: bc => cc, bf => cf, bf => CF (Fortran)"
cc = convert(bc);print "hasStrides(cc):",pif(cc)
cf = convert(bf);print "hasStrides(cf):",pif(cf)
CF = convert(bf,order="F"); print "hasStrides(CF):",pif(CF)
print "\ntype cc,cf,CF",type(cc),type(cf),type(CF)

delta = (af-CF).flatten()
print "L2-norm af-CF",norm2(delta)

# --- numpy => ctypes into caller-allocated flat arrays dc and df, passed
# --- as the fourth positional argument of convert().
print "\nConvert to ctypes ac => dc, af => df (dc,df = 1D)"
dc = (eval(typeName(bc))*ac.size)()
df = (eval(typeName(bf))*af.size)()
convert(ac,None,None,dc)
convert(af,None,None,df)
print "type dc,df",type(dc),type(df)

delta = (ac-dc).flatten()
print "L2-norm ac-dc",norm2(delta)
delta = (af-df).flatten()
print "L2-norm af-df",norm2(delta)
delta = af.flatten()-ac.flatten()
print "\ncomparing flattened ac,af"
print "L2-norm af-ac",norm2(delta)
set_printoptions(precision=3)
# --- Show the first three elements of each flattened array for a visual check.
print "\nac[:3],af[:3]"
print ac.flatten()[:3]
print af.flatten()[:3]
print "\ndc[:3],df[:3]"
print "[%6.3f %6.3f %6.3f]" % tuple(dc[:3])
print "[%6.3f %6.3f %6.3f]" % tuple(df[:3])

# --- flat ctypes => 2D numpy, giving explicit dims (4,8) and an order flag.
print "\nConvert to numpy: dc => ec, df => ef"
ec = convert(dc,(4,8),"C")
ef = convert(df,(4,8),"F")
print "1D dc,df ctypes objects=> 2D numpy arrays ec,ef"
print ; print "ac, ec"
print pif(ac)
print pif(ec)
print ; print "af, ef"
print pif(af)
print pif(ef)
# --- Compare the round-tripped arrays with the originals and each other.
print "\nL2-norm ac-ec, af-ef, ec-ef"
print norm2(ac-ec)
print norm2(af-ef)
print norm2(ec-ef)
print "\nL2-norm ac-ef, af-ec (flattened)"
print norm2(ac.flatten()-ef.flatten())
print norm2(af.flatten()-ec.flatten())
87 | 
88 | 
89 | 


--------------------------------------------------------------------------------
/oldcode/misc/ctypes_extra.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | from ctypes import *
 4 | from numpy import *
 5 | from numpy.random import rand
 6 | from ctypes_array import convert
 7 | 
 8 | ao = addressof
 9 | def fa(a,o,n,dtype=None):
10 |     if dtype is None:
11 |         t = a.__class__._type_
12 |     else:
13 |         t = dtype
14 |     s = sizeof(t)
15 |     return (t*n).from_address(ao(a)+o*s)
16 |     
# Subclass of c_float; used below to compare arrays of a derived element
# type with arrays of plain c_float.
class x_float(c_float):
    pass

# The same ten random values in three containers: b (result of convert),
# a (array of x_float) and z (array of c_float).
b = convert(rand(10))
a = (x_float*10)(*b)
z = (c_float*10)(*b)

# u: two c_float elements placed directly at the address of a[6].
try:
    u = (c_float*2).from_address(ao(a[6]))
    su = sizeof(u._type_)
    print "0x%8.8x" % ao(u)
    print "%10.7f %10.7f" % (u[0],u[1])
except TypeError:
    print "x_float does not work"

# v: the same window on a, obtained through fa() with an explicit c_float type.
try:
    v = fa(a,6,2,c_float)
    sv = sizeof(v._type_)
except TypeError:
    print "x_float does not work"

# w: the corresponding window on the plain c_float array z.
try:
    w = fa(z,6,2)
    sw = sizeof(w._type_)
except TypeError:
    print "c_float does not work"

# Print element addresses and values of the four views side by side.
# NOTE(review): v, sv, w and sw are only bound when the try blocks above
# succeed; after a TypeError this loop raises NameError.
sz = sizeof(z._type_)
for i in range(len(v)):
    print "0x%8.8x 0x%8.8x 0x%8.8x 0x%8.8x" % (
    ao(a[i+6]),ao(v)+i*sv,ao(z)+(i+6)*sz,ao(w)+i*sw)
    print "%10.7f %10.7f %10.7f % 10.7f" % (a[i+6].value,v[i],z[i+6],w[i])
49 | 


--------------------------------------------------------------------------------
/oldcode/misc/devinfo_cr.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | 
 4 | from ctypes import *
 5 | from cuda.cuda_api import *
 6 | 
# Script entry point: print the device count and the properties of
# device 0 through the CUDA runtime API wrappers.
if __name__ == "__main__":
    print "+------------------------+"
    print "| CUDA Device Info       |"
    print "| using CUDA runtime API |"
    print "+------------------------+\n"
    # cudaGetDeviceCount writes its result through the pointer argument
    count = c_int()
    cudaGetDeviceCount(byref(count))
    print "number of devices  =", count.value
    # query device 0 and print the structure as formatted by cudaDeviceProp
    props = cudaDeviceProp()
    cudaGetDeviceProperties(props, 0)
    print props
18 | 


--------------------------------------------------------------------------------
/oldcode/misc/devinfo_cu.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | 
 4 | from ctypes import *
 5 | from cuda.cu_defs import CUdevprop
 6 | from cuda.cu_api import *
 7 | 
 8 | if __name__ == "__main__":
 9 |     print "+-----------------------+"
10 |     print "| CUDA Device Info      |"
11 |     print "| using CUDA driver API |"
12 |     print "+-----------------------+\n"
13 |     cuInit(0)
14 |     count = c_int()
15 |     cuDeviceGetCount(byref(count))
16 |     device = CUdevice()
17 |     name = (c_char*256)()
18 |     cuDeviceGet(byref(device),0)
19 |     cuDeviceGetName(name,256,device)
20 |     memsize = c_uint()
21 |     cuDeviceTotalMem(byref(memsize),device)
22 |     major,minor = c_int(),c_int()
23 |     cuDeviceComputeCapability(byref(major),byref(minor),device)
24 |     props = CUdevprop()
25 |     cuDeviceGetProperties(byref(props),device)
26 | 
27 |     cuContext = CUcontext()
28 |     cuCtxCreate(byref(cuContext),0,device)
29 |     free,total = c_uint(),c_uint()
30 |     cuMemGetInfo(byref(free),byref(total))
31 |     free = free.value
32 |     cuCtxDetach(cuContext)
33 | 
34 |     print "%-19s = %d" % ("number of devices",count.value)
35 |     print "%-19s = %s" % ("device name =",name.value)
36 |     print "%-19s = %.f MB" % ("memory size",memsize.value/1024.**2)
37 |     print "%-19s = %.f MB" % ("memory free",free/1024.**2)
38 |     print "%-19s = %.f MHz" % ("clock rate",props.clockRate/1000.)
39 |     print "%-19s = %d" % ("major",major.value)
40 |     print "%-19s = %d" % ("minor",minor.value)
41 |     print 21*"-"
42 |     print props
43 | 


--------------------------------------------------------------------------------
/oldcode/misc/gpuFunctions.linkinfo:
--------------------------------------------------------------------------------
1 |  --import %laneid,%ctaid,%nctaid,%smid,A7,%pm3,%pm2,%pm1,__CC-temp__0__,%pm0,%tid,%clock,%warpid,%ntid,%gridid --export gpuScale,gpuPOLY5,gpuPOLY20,gpuPOLY40,init_array,gpuSAXPY,gpuSGEMM,gpuPOLY10,gpuVADD,gpuGFLOPS,gpuTRIG,gpuBLSC


--------------------------------------------------------------------------------
/oldcode/misc/gpuFunctions.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | from ctypes import *
 3 | 
# Short aliases for the ctypes types used in the argtypes lists below.
cvp = c_void_p
_cf = c_float
_ci = c_int

# Shared library holding the kernels; loaded from the current directory.
lib = CDLL("./libgpuFunctions.so")

# Each binding below looks up the symbol __device_stub_<kernel> in lib,
# declares that it returns nothing and lists its argument types.
# The commented-out C prototype above each binding is the kernel signature
# the argtypes list is meant to mirror.

#__global__ void gpuGFLOPS()
gpuGFLOPS = lib.__device_stub_gpuGFLOPS
gpuGFLOPS.restype = None
gpuGFLOPS.argtypes = [ ]

#__global__ void gpuBLSC(
#float *d_Calls, float *d_Puts,
#float *d_S, float *d_X, float *d_T,
#float R, float V, int OptN)
gpuBLSC = lib.__device_stub_gpuBLSC
gpuBLSC.restype = None
gpuBLSC.argtypes = [ cvp, cvp, cvp, cvp, cvp,
    _cf, _cf, _ci ]

#__global__ void gpuPOLY5(
#float *d_In1, float *d_Out1, int size )
gpuPOLY5 = lib.__device_stub_gpuPOLY5
gpuPOLY5.restype = None
gpuPOLY5.argtypes = [ cvp, cvp, _ci ]

#__global__ void gpuPOLY10(
#float *d_In1, float *d_Out1, int size )
gpuPOLY10 = lib.__device_stub_gpuPOLY10
gpuPOLY10.restype = None
gpuPOLY10.argtypes = [ cvp, cvp, _ci ]

#__global__ void gpuPOLY20(
#float *d_In1, float *d_Out1, int size )
gpuPOLY20 = lib.__device_stub_gpuPOLY20
gpuPOLY20.restype = None
gpuPOLY20.argtypes = [ cvp, cvp, _ci ]

#__global__ void gpuPOLY40(
#float *d_In1, float *d_Out1, int size )
gpuPOLY40 = lib.__device_stub_gpuPOLY40
gpuPOLY40.restype = None
gpuPOLY40.argtypes = [ cvp, cvp, _ci ]

#__global__ void gpuSAXPY(
#float Factor, float *d_In1, float *d_In2, int size )
gpuSAXPY = lib.__device_stub_gpuSAXPY
gpuSAXPY.restype = None
gpuSAXPY.argtypes = [ _cf, cvp, cvp, _ci ]

#__global__ void gpuVADD(
#float *d_In1, float *d_In2, int size )
gpuVADD = lib.__device_stub_gpuVADD
gpuVADD.restype = None
gpuVADD.argtypes = [ cvp, cvp, _ci ]

#__global__ void gpuSGEMM(
#float* C, float* A, float* B, int wA, int wB )
gpuSGEMM = lib.__device_stub_gpuSGEMM
gpuSGEMM.restype = None
gpuSGEMM.argtypes = [ cvp, cvp, cvp, _ci, _ci ]

#__global__ void gpuTRIG(
#float *d_Out1, float *d_Out2, float *d_In1, int size )
gpuTRIG = lib.__device_stub_gpuTRIG
gpuTRIG.restype = None
gpuTRIG.argtypes = [ cvp, cvp, cvp, _ci ]

#__global__ void gpuScale(
#float *d_Out1, _F *d_In1, _F scale, int size )
gpuScale = lib.__device_stub_gpuScale
gpuScale.restype = None
gpuScale.argtypes = [ cvp, cvp, _cf, _ci ]
77 | 
#// for streams example
#__global__ void init_array(
#int *g_data, int *factor){
# Both kernel parameters are pointers, so they are declared as void
# pointers like every other pointer argument in this module. Declaring
# them as c_int would truncate an address wider than a C int; c_void_p
# still accepts plain integer addresses from existing callers.
init_array = lib.__device_stub_init_array
init_array.restype = None
init_array.argtypes = [ cvp, cvp ]
84 | 


--------------------------------------------------------------------------------
/oldcode/misc/kernelGL.cu:
--------------------------------------------------------------------------------
 1 | extern "C" {
 2 | __global__ void kernel1(
 3 | float4* pos, unsigned int width, unsigned int height, float time)
 4 | {
 5 |     unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 6 |     unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
 7 | 
 8 |     // calculate uv coordinates
 9 |     float u = x / (float) width;
10 |     float v = y / (float) height;
11 |     u = u*2.0f - 1.0f;
12 |     v = v*2.0f - 1.0f;
13 | 
14 |     // calculate simple sine wave pattern
15 |     float freq = 4.0f;
16 |     float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f;
17 | 
18 |     // write output vertex
19 |     pos[y*width+x] = make_float4(u, w, v, 1.0f);
20 | }
21 | 
// Variant of kernel1: same vertex layout, but the height w is the sum of a
// sine and a cosine term scaled by an exponential factor that depends on
// u+v and time.
__global__ void kernel2(
float4* pos, unsigned int width, unsigned int height, float time)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // the launch grid may overshoot the mesh when width or height is not a
    // multiple of the block size; such threads must not write, otherwise
    // y*width+x leaves the width x height index range
    if (x >= width || y >= height)
        return;

    // calculate uv coordinates
    float u = x / (float) width;
    float v = y / (float) height;
    u = u*2.0f - 1.0f;
    v = v*2.0f - 1.0f;

    // calculate simple sine wave pattern
    float freq = 4.0f;
    // explicit single-precision calls, matching the sinf/cosf used below
    float efac = .5f*expf(.5f*sinf((u+v)*freq+time));
    float w = (sinf(u*freq + time) + cosf(v*freq + time)) * efac;

    // write output vertex
    pos[y*width+x] = make_float4(u, w, v, 1.0f);
}
42 | }
43 | 


--------------------------------------------------------------------------------
/oldcode/misc/matadd.txt:
--------------------------------------------------------------------------------
  1 | //CPU
  2 | void addMatrix(float *a, float *b,
  3 |                float *c, int N)
  4 | {
  5 |   int i, j, index;
  6 |   for (i = 0; i < N; i++) {
  7 |     for (j = 0; j < N; j++) {
  8 |       index = i + j * N;
  9 |       c[index]=a[index] + b[index];
 10 |     }
 11 |   }
 12 | }
 13 | void main()
 14 | {
 15 |   .....
 16 |   addMatrix(a, b, c, N);
 17 | }
 18 | //GPU
 19 | __global__ void addMatrix(float *a,float *b,
 20 |                           float *c, int N)
 21 | {
 22 |   int i=blockIdx.x*blockDim.x+threadIdx.x;
 23 |   int j=blockIdx.y*blockDim.y+threadIdx.y;
 24 |   int index = i + j * N;
 25 |   if ( i < N && j < N)
 26 |     c[index]= a[index] + b[index];
 27 | }
 28 | void main()
 29 | {
 30 |   ..... // allocate & transfer data to GPU
 31 |   dim3 dimBlk (blocksize, blocksize);
 32 |   dim3 dimGrd (N/dimBlk.x, N/dimBlk.y);
 33 |   addMatrix<<<dimGrd,dimBlk>>>(a, b, c,N);
 34 | }
 35 | //GPU
 36 | // Compute vector sum C = A+B
 37 | // Each thread performs one pair-wise addition
 38 | __global__ void vecAdd(float* A, float* B, float* C)
 39 | {
 40 |     int i = threadIdx.x + blockDim.x * blockIdx.x;
 41 |     C[i] = A[i] + B[i];
 42 | }
 43 | __global__ void vecAdd(float* A, float* B, float* C);
 44 | void main()
 45 | {
 46 |     // Execute on N/256 blocks of 256 threads each
 47 |     vecAdd<<< N/256, 256>>>(d_A, d_B, d_C);
 48 | }
 49 | //GPU
 50 | __global__ void transpose_naive(float *odata, float *idata, int width, int height)
 51 | {
 52 |  unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
 53 |  unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
 54 |  if (xIndex < width && yIndex < height)
 55 |   {
 56 |      unsigned int index_in = xIndex + width * yIndex;
 57 |      unsigned int index_out = yIndex + height * xIndex;
 58 |      odata[index_out] = idata[index_in];
 59 |   }
 60 | }
 61 | __global__ void transpose(float *odata, float *idata, int width, int height)
 62 | {
 63 |  __shared__ float block[(BLOCK_DIM+1)*BLOCK_DIM];
 64 |  unsigned int xBlock = __mul24(blockDim.x, blockIdx.x);
 65 |  unsigned int yBlock = __mul24(blockDim.y, blockIdx.y);
 66 |  unsigned int xIndex = xBlock + threadIdx.x;
 67 |  unsigned int yIndex = yBlock + threadIdx.y;
 68 |  unsigned int index_out, index_transpose;
 69 |  if (xIndex < width && yIndex < height)
 70 |  {
 71 |      unsigned int index_in = __mul24(width, yIndex) + xIndex;
 72 |      unsigned int index_block = __mul24(threadIdx.y, BLOCK_DIM+1) + threadIdx.x;
 73 |      block[index_block] = idata[index_in];
 74 |      index_transpose = __mul24(threadIdx.x, BLOCK_DIM+1) + threadIdx.y;
 75 |      index_out = __mul24(height, xBlock + threadIdx.y) + yBlock + threadIdx.x;
 76 |  }
 77 |  __syncthreads();
 78 |  if (xIndex < width && yIndex < height)
 79 |      odata[index_out] = block[index_transpose];
 80 | }
 81 | //GPU
 82 | template <unsigned int blockSize>
 83 | __global__ void reduce6(int *g_idata, int *g_odata, unsigned int n)
 84 | {
 85 |   extern __shared__ int sdata[];
 86 |   unsigned int tid = threadIdx.x;
 87 |   unsigned int i = blockIdx.x*(blockSize*2) + tid;      //Final Optimized Kernel
 88 |   unsigned int gridSize = blockSize*2*gridDim.x;
 89 |   sdata[tid] = 0;
 90 |   do { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; } while (i < n);
 91 |   __syncthreads();
 92 |   if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
 93 |   if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
 94 |   if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
 95 |   if (tid < 32) {
 96 |       if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
 97 |       if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
 98 |       if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
 99 |       if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
100 |       if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
101 |       if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
102 |   }
103 |   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
104 | }
105 | // textures vs __constant__
106 | 
107 | __constant__ short hash_g[1024];
108 | __constant__ short hash_h[8192];
109 | ...
110 | return hash_h[(hash_g[b] + a) & 0x1fff];
111 | 
112 | texture<short, 1, cudaReadModeElementType> hash_g;
113 | texture<short, 1, cudaReadModeElementType> hash_h;
114 | ...
115 | cudaBindTexture(0, hash_g, hash_g_gpu, sizeof(hash_g_cpu));
116 | cudaBindTexture(0, hash_h, hash_h_gpu, sizeof(hash_h_cpu));
117 | ...
118 | return tex1Dfetch(hash_h, (tex1Dfetch(hash_g, b) + a) & 0x1fff);
119 | 
120 | Constants: method=[ _Z4testPiP11permutation ]
121 |     gputime=[ 60942.465 ] cputime=[ 60972.000 ] occupancy=[ 1.000 ]
122 | Texture: method=[ _Z4testPiP11permutation ]
123 |     gputime=[ 29661.119 ] cputime=[ 29920.000 ] occupancy=[ 1.000 ]
124 | 


--------------------------------------------------------------------------------
/oldcode/misc/mklMath.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | 
 3 | from ctypes import CDLL
 4 | from math import *
 5 | 
# Shared library with the vector math routines; loaded from the current
# directory.
vml = CDLL("./_vector.so")

# Each name below is bound to one single-precision vector routine
# (vs* symbol) of the library and declared as returning nothing.
# No argtypes are set, so ctypes applies its default argument conversion.
vcos = vml.vsCos
vcos.restype = None

vsin = vml.vsSin
vsin.restype = None

vsincos = vml.vsSinCos
vsincos.restype = None

vexp = vml.vsExp
vexp.restype = None

vlog = vml.vsLn
vlog.restype = None

vlog10 = vml.vsLog10
vlog10.restype = None

vsqrt = vml.vsSqrt
vsqrt.restype = None
28 | 
def cpuTRIG(h_Y,h_Z,h_X):
    """Compute cosine and sine of every element of h_X on the CPU.

    Going by the separate-call variant, h_Y receives the cosines and
    h_Z the sines. The number of elements is len(h_X). Returns None.
    """
    # a single vsSinCos call is about 20% faster than vsCos + vsSin
    combined = True
    count = len(h_X)
    if combined:
        vsincos(count,h_X,h_Z,h_Y)
        return
    vcos(count,h_X,h_Y)
    vsin(count,h_X,h_Z)
36 | 
37 | ##////////////////////////////////////////////////////////////////////////////
38 | ## Shared CPU/GPU functions, performing calculations for single option by 
39 | ## Black-Scholes formula.
40 | ##////////////////////////////////////////////////////////////////////////////
# Coefficients of the polynomial used by CND, lowest order first.
A1 =  0.319381530
A2 = -0.356563782
A3 =  1.781477937
A4 = -1.821255978
A5 = 1.3302744290
RSQRT2PI = 0.3989422804

def CND(d):
    """Return a polynomial approximation of the cumulative normal
    distribution function at d."""
    K = 1.0 / (1.0 + 0.2316419 * abs(d))

    # Horner evaluation, innermost coefficient first; the operation order
    # matches the fully nested expression A1 + K*(A2 + K*(...)).
    poly = A5
    for coeff in (A4, A3, A2, A1):
        poly = coeff + K * poly

    tail = RSQRT2PI * exp(-0.5 * d * d) * (K * poly)

    # the polynomial describes the lower tail; mirror it for positive d
    if d > 0:
        return 1.0 - tail
    return tail
59 | 
def BlackScholesBody(
    S, X, T, R, V ):
    """Evaluate the Black-Scholes formula for both a call and a put.

    S -- stock price
    X -- option strike
    T -- option years
    R -- riskless rate
    V -- volatility rate

    Returns the tuple (CallResult, PutResult).
    """
    vol_sqrt_t = V * sqrt(T)
    d1 = (log(S / X) + (R + 0.5 * V * V) * T) / vol_sqrt_t
    d2 = d1 - vol_sqrt_t

    cnd_d1 = CND(d1)
    cnd_d2 = CND(d2)

    # strike discounted to present value, shared by call and put
    discounted_strike = X * exp(- R * T)
    call = S * cnd_d1 - discounted_strike * cnd_d2
    put = discounted_strike * (1.0 - cnd_d2) - S * (1.0 - cnd_d1)

    return call,put
81 | 
def cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,size):
    """Fill h_C and h_P with call and put results for the first size options.

    Element i is computed by BlackScholesBody from h_S[i], h_X[i] and
    h_T[i] together with the shared rate R and volatility V.
    """
    for i in range(size):
        call,put = BlackScholesBody(h_S[i],h_X[i],h_T[i],R,V)
        h_C[i] = call
        h_P[i] = put
85 | 


--------------------------------------------------------------------------------
/oldcode/misc/sgemmN:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npinto/python-cuda/92898059d7a32b261ba0758e50dccfe5c4bd4ac4/oldcode/misc/sgemmN


--------------------------------------------------------------------------------
/oldcode/misc/sgemmN.log:
--------------------------------------------------------------------------------
 1 | 
 2 | testing sgemm( 'N', 'N', n, n, n, ... )
 3 | 
 4 |    n   CUBLAS,Gflop/s   we,Gflop/s   "error"
 5 |    64       22.97          12.61        0
 6 |   128       40.94          51.73        0
 7 |   192      100.26         101.37        0
 8 |   256       97.20          98.01        0
 9 |   320      120.21         120.92        0
10 |   384      126.81         125.59        0
11 |   448      132.74         132.28        0
12 |   512      136.32         132.46        0
13 |   576      135.26         140.52        0
14 |   704      138.64         139.08        0
15 |   832      156.95         138.04        0
16 |   960      139.01         139.58        0
17 |  1088      141.96         172.23        0
18 |  1216      142.39         142.22        0
19 |  1408      142.33         143.02        0
20 |  1600      143.17         143.08        0
21 |  1792      143.69         192.88        0
22 |  1984      143.02         143.77        0
23 |  2240      164.13         143.80        0
24 |  2496      143.48         143.55        0
25 |  2816      161.37         176.05        0
26 |  3136      143.76         165.94        0
27 |  3520      185.63         180.48        0
28 |  3904      189.89         162.13        0
29 | 
30 | testing sgemm( 'N', 'T', n, n, n, ... )
31 | 
32 |    n   CUBLAS,Gflop/s   we,Gflop/s   "error"
33 |    64       22.87          12.82        0
34 |   128       40.74          51.19        0
35 |   192      101.35         102.00        0
36 |   256       98.56          99.11        0
37 |   320      122.89         122.78        0
38 |   384      128.55         127.23        0
39 |   448      134.89         133.69        0
40 |   512      133.56         138.56        0
41 |   576      141.31         136.77        0
42 |   704      150.16         139.81        0
43 |   832      137.56         138.31        0
44 |   960      139.53         139.73        0
45 |  1088      142.25         148.80        0
46 |  1216      142.20         143.25        0
47 |  1408      142.83         142.88        0
48 |  1600      142.78         143.60        0
49 |  1792      142.23         141.73        0
50 |  1984      143.83         143.91        0
51 |  2240      186.56         170.33        0
52 |  2496      143.83         143.89        0
53 |  2816      157.43         173.14        0
54 |  3136      184.27         163.58        0
55 |  3520      180.04         177.53        0
56 |  3904      166.91         193.43        0
57 | 


--------------------------------------------------------------------------------
/oldcode/misc/simple.cu:
--------------------------------------------------------------------------------
 1 | // © Arno Pähler, 2007-08
 2 | extern "C" {
 3 | typedef float _F;
 4 | typedef const float _cF;
 5 | typedef const unsigned int _cI;
 6 | 
 7 | texture<float,1,cudaReadModeElementType> Arg;
 8 | 
 9 | __global__ void TRIG
10 |     (_F *d_Out1, _F *d_Out2, _cF *d_In1, _cI size )
11 | {
12 |     _cI tid = blockDim.x * blockIdx.x + threadIdx.x;
13 |     _cI tsz = blockDim.x * gridDim.x;
14 |     int i;
15 | 
16 |     for (i = tid; i < size; i += tsz)
17 |     {
18 |         d_Out1[i] = cosf(d_In1[i]);
19 |         d_Out2[i] = sinf(d_In1[i]);
20 |     }
21 | }
22 | 
// Same computation as TRIG, but the argument is fetched from the 1D
// texture reference Arg instead of a global-memory pointer:
// d_Out1[i] = cosf(arg), d_Out2[i] = sinf(arg) for every index i below size.
__global__ void TRIGTex
    (_F *d_Out1, _F *d_Out2, _cI size )
{
    _cI tid = blockDim.x * blockIdx.x + threadIdx.x;
    _cI tsz = blockDim.x * gridDim.x;
    int i;
    // per-thread temporary: declared __shared__ it would be one location
    // written by every thread of the block without synchronisation, so a
    // thread could compute cosf/sinf of another thread's argument
    float x;

    for (i = tid; i < size; i += tsz)
    {
        x = tex1Dfetch(Arg,i);
        d_Out1[i] = cosf(x);
        d_Out2[i] = sinf(x);
    }
}
38 | }
39 | 


--------------------------------------------------------------------------------
/oldcode/misc/simple.cubin:
--------------------------------------------------------------------------------
 1 | architecture {sm_11}
 2 | abiversion {0}
 3 | modname {cubin}
 4 | sampler  {
 5 | 	name = Arg
 6 | 	texunit = 0
 7 | }
 8 | code  {
 9 | 	name = TRIG
10 | 	lmem = 0
11 | 	smem = 32
12 | 	reg = 6
13 | 	bar = 0
14 | 	bincode  {
15 | 		0x10000205 0x40004780 0xa0000005 0x04000780 
16 | 		0x60014c05 0x00204780 0x3001cffd 0x6420c7c8 
17 | 		0x30000003 0x00000280 0x10000201 0x40004780 
18 | 		0x3002020d 0xc4100780 0x3002ce05 0xc4300780 
19 | 		0x41002810 0x2103ec00 0x2101ec04 0x2103e808 
20 | 		0x2000ca0d 0x0420c780 0x30020811 0xc4100780 
21 | 		0xd00e0015 0x80c00780 0xb0000a15 0xc0000780 
22 | 		0x90000a15 0xa0000780 0xd00e0415 0xa0c00780 
23 | 		0xd00e0015 0x80c00780 0xb0000a15 0xc0000780 
24 | 		0x90000a15 0x80000780 0x20000001 0x04010780 
25 | 		0xd00e0615 0xa0c00780 0x300101fd 0x640047c8 
26 | 		0x20048408 0x2004860c 0x1000c003 0x00000280 
27 | 		0xf0000001 0xe0000001 
28 | 	}
29 | }
30 | code  {
31 | 	name = TRIGTex
32 | 	lmem = 0
33 | 	smem = 32
34 | 	reg = 9
35 | 	bar = 0
36 | 	bincode  {
37 | 		0x10000205 0x40004780 0xa0000005 0x04000780 
38 | 		0x60014c05 0x00204780 0x3001cdfd 0x6420c7c8 
39 | 		0x30000003 0x00000280 0x10000201 0x40004780 
40 | 		0x30020211 0xc4100780 0xa0017003 0000000000 
41 | 		0x3002cc15 0xc4300780 0x41002808 0x2104e80c 
42 | 		0x2104ea10 0x2105e818 0x30020415 0xc4100780 
43 | 		0x10000201 0x0403c780 0xf3000001 0x00000784 
44 | 		0xb000001d 0xc0000780 0x90000e21 0xa0000780 
45 | 		0xd00e0621 0xa0c00780 0x90000e1d 0x80000780 
46 | 		0x2000060d 0x04014780 0xd00e081d 0xa0c00780 
47 | 		0x300607fd 0x640047c8 0x20028204 0x20058810 
48 | 		0x1000c003 0x00000280 0x00000e01 0xe4200782 
49 | 		0xf0000001 0xe0000001 
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/oldcode/misc/simple.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cuda.cu_defs import *
  7 | from cuda.cu_api import *
  8 | from cuda.cu_utils import *
  9 | 
 10 | from cpuFunctions import checkError,checkTrig,vectorInit
 11 | 
 12 | UseVML = True
 13 | if UseVML:
 14 |     from mklMath import cpuTRIG
 15 | else:
 16 |     from cpuFunctions import cpuTRIG
 17 | 
 18 | BLOCK_SIZE = 320
 19 | GRID_SIZE  = 8
 20 | 
 21 | S4 = sizeof(c_float)
 22 | 
 23 | def main(device,vlength = 128,loops = 1):
 24 |     print "+-----------------------+"
 25 |     print "|   Simple  TRIG Test   |"
 26 |     print "| using CUDA driver API |"
 27 |     print "+-----------------------+"
 28 |     print "params: %2d %5dK %3d\n" % (log2n,vlength >> 10,loops),
 29 | 
 30 |     n2 = vlength ## Vector length
 31 | 
 32 |     # TRIGTex is about 1.5x faster than TRIG
 33 | #    name = "TRIG"
 34 |     name = "TRIGTex"
 35 | 
 36 |     TRIG = device.functions[name]
 37 |     mod0 = device.modules[0]
 38 | 
 39 |     sizeV = S4*n2
 40 |     h_Arg = (c_float*n2)()
 41 |     h_Cos = (c_float*n2)()
 42 |     h_Sin = (c_float*n2)()
 43 | 
 44 |     vectorInit(h_Arg)
 45 | 
 46 |     d_Arg = getMemory(h_Arg)
 47 |     d_Cos = getMemory(n2)
 48 |     d_Sin = getMemory(n2)
 49 | 
 50 |     tex = devMemToTex(mod0,"Arg",d_Arg,sizeV)
 51 | 
 52 |     cuFuncSetBlockShape(TRIG,BLOCK_SIZE,1,1)
 53 |     cuParamSeti(TRIG,0,d_Cos)
 54 |     cuParamSeti(TRIG,4,d_Sin)
 55 |     if name != "TRIGTex":
 56 |         cuParamSeti(TRIG,8,d_Arg)
 57 |         cuParamSeti(TRIG,12,n2)
 58 |         cuParamSetSize(TRIG,16)
 59 |     else:
 60 |         cuParamSetTexRef(TRIG,CU_PARAM_TR_DEFAULT,tex)
 61 |         cuParamSeti(TRIG,8,n2)
 62 |         cuParamSetSize(TRIG,12)
 63 |     cuCtxSynchronize()
 64 | 
 65 |     t0 = time()
 66 |     for i in range(loops):
 67 |         cuLaunchGrid(TRIG,GRID_SIZE,1)
 68 |     cuCtxSynchronize()
 69 |     t0 = time()-t0
 70 | 
 71 |     g_Cos = (c_float*n2)()
 72 |     g_Sin = (c_float*n2)()
 73 |     cuMemcpyDtoH(g_Cos,d_Cos,sizeV)
 74 |     cuMemcpyDtoH(g_Sin,d_Sin,sizeV)
 75 |     cuCtxSynchronize()
 76 | 
 77 |     cuMemFree(d_Arg)
 78 |     cuMemFree(d_Cos)
 79 |     cuMemFree(d_Sin)
 80 | 
 81 |     t1 = time()
 82 |     for i in range(loops):
 83 |         cpuTRIG(h_Cos,h_Sin,h_Arg)
 84 |     t1 = time()-t1
 85 | 
 86 |     flopsg = (2.e-6*n2)*float(loops)
 87 |     flopsc = flopsg
 88 | 
 89 |     t0 *= 1.e3;
 90 |     t1 *= 1.e3;
 91 |     print "\n       time[msec]    GFlops\n"
 92 |     print "GPU: %12.1f%10.2f" % (t0,flopsg/t0)
 93 |     print "CPU: %12.1f%10.2f" % (t1,flopsc/t1)
 94 |     print "     %12.1f" % (t1/t0)
 95 | 
 96 |     x = float(1 << 23)
 97 |     e,m = checkTrig(g_Cos,g_Sin)
 98 |     print "\n",name, "internal check GPU"
 99 |     print "%8.1e %8.1e" % (e,m)
100 |     print "%8.1f %8.1f" % (e*x,m*x)
101 | 
102 |     e,m = checkTrig(h_Cos,h_Sin)
103 |     print "\n",name, "internal check CPU"
104 |     print "%8.1e %8.1e" % (e,m)
105 |     print "%8.1f %8.1f" % (e*x,m*x)
106 | 
107 |     print "\n","check between CPU and GPU"
108 |     err,mxe = checkError(h_Cos,g_Cos)
109 |     print "Avg and max abs error (cos) = %8.1e %8.1e" % (err,mxe)
110 |     print "                              %8.1f %8.1f" % (err*x,mxe*x)
111 |     err,mxe = checkError(h_Sin,g_Sin)
112 |     print "Avg and max abs error (sin) = %8.1e %8.1e" % (err,mxe)
113 |     print "                              %8.1f %8.1f" % (err*x,mxe*x)
114 | 
# Script entry point.
# Usage: simple.py [log2n [loops]]
#   log2n -- log2 of the vector length, clamped to 0..25 (default 15)
#   loops -- number of timing repetitions (default 1)
if __name__ == "__main__":
    import sys
    # load the compiled module and register both kernels on the device object
    device = cu_CUDA()
    device.getSourceModule("simple.cubin")
    device.getFunction("TRIG")
    device.getFunction("TRIGTex")

    log2n,loops = 15,1
    if len(sys.argv) > 1:
        log2n = int(sys.argv[1])
    log2n = max(0,min(log2n,25))
    if len(sys.argv) > 2:
        loops = int(sys.argv[2])
    vlength = 1 << log2n
    main(device,vlength,loops)
    cuCtxDetach(device.context)
131 | 


--------------------------------------------------------------------------------
/oldcode/misc/utilities.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8: © Arno Pähler, 2007-08
  2 | 
  3 | from bz2 import BZ2File
  4 | from gzip import GzipFile
  5 | from collections import defaultdict
  6 | from ctypes import memmove
  7 | from os.path import splitext
  8 | from subprocess import Popen,PIPE
  9 | from sys import platform
 10 | from time import ctime,time
 11 | 
 12 | try:
 13 |     from numpy import empty,float32
 14 |     def c2n_array(a,m,n=1):
 15 |         ##  scipy needs array in Fortran order
 16 |         aa = empty((m,n),dtype=float32,order='F')
 17 |         memmove(aa.ctypes.data,a,4*(m*n))
 18 |         return aa
 19 |     def n2c_array(a,aa):
 20 |         ##  aa should already have been allocated!
 21 |         memmove(aa,a.ctypes.data,4*a.size)
 22 |         return aa
 23 | except ImportError:
 24 |     def c2n_array(a,m,n=1):
 25 |         return a
 26 |     def n2c_array(a,aa):
 27 |         aa = a
 28 |         return aa
 29 | 
 30 | from cpuFunctions import ReadTimestampCounter
 31 | 
 32 | class Timer(object):
 33 | 
 34 |     fakeClock = 1.e-9/2.8 # denominator: clock in GHz
 35 | 
 36 |     def __init__(self,startNow=False,useTSC=False):
 37 |         self.useTSC = useTSC
 38 |         self.wall = 0. ## later
 39 |         self.cpu  = 0. ## later
 40 |         self.running = False
 41 |         self.torg = 0.
 42 |         self.time = 0.
 43 |         self.counters = []
 44 |         self.freqs = None
 45 | 
 46 |         if useTSC and platform == "linux2":
 47 |             self.getFrequency(0)
 48 |         elif useTSC and platform == "win32":
 49 |             self.rtfreq = self.fakeClock
 50 |         elif useTSC:
 51 |             self.rtfreq = self.fakeClock
 52 | 
 53 |         if startNow:
 54 |             self.start()
 55 | 
 56 |     def __str__(self):
 57 |         pf = dict((
 58 |                   ("linux2","Linux"),
 59 |                   ("win32","Windows")))
 60 |         s =["System: %s" % pf[platform]]
 61 |         useTSC = self.useTSC
 62 |         s.append("UseTSC: %s" % bool(useTSC))
 63 |         if useTSC:
 64 |             s.append("Cores : %d" % len(self.freqs))
 65 |             clock = 1.e-9/self.rtfreq
 66 |             s.append("Clock : %.3f GHz" % clock)
 67 |         return "\n".join(s)
 68 | 
 69 |     def getFrequency(self,core=0):
 70 |         if self.freqs is None:
 71 |             cpuinfo = "/proc/cpuinfo"
 72 |             freqs = []
 73 |             self.core = core
 74 |             for line in open(cpuinfo):
 75 |                 if line.startswith("cpu MHz"):
 76 |                     freq = float(line.split(":")[1])
 77 |                     freqs.append(1.e-6/freq)
 78 |             self.freqs = freqs
 79 |             self.rtfreq = freqs[core]
 80 |         else:
 81 |             self.rtfreq = freqs[core]
 82 |         return self.rtfreq
 83 | 
 84 |     def getTime(self):
 85 |         if self.useTSC:
 86 |             t = ReadTimestampCounter()*rfreq
 87 |         else:
 88 |             t = time()
 89 |         return t-self.torg
 90 | 
 91 |     def start(self):
 92 |         if self.useTSC and self.freqs is None:
 93 |             self.getFrequency()
 94 |         self.running = True
 95 |         self.torg = self.getTime()
 96 |         self.time = 0.
 97 |         self.counters = []
 98 | 
 99 |     def split(self):
100 |         t = self.time
101 |         if self.running:
102 |             t = self.getTime()
103 |         self.time = t
104 |         self.counters.append(t)
105 |         return t
106 | 
107 |     def read(self,all = True):
108 |         o = self.torg
109 |         t = self.time
110 |         if self.running:
111 |             t = self.getTime()
112 |         self.time = t
113 |         if all:
114 |             return o,t,self.counters
115 |         else:
116 |             return o,t
117 | 
118 |     def reset(self):
119 |         self.start()
120 | 
121 |     def stop(self):
122 |         if self.running:
123 |             self.running = False
124 |             t = self.getTime()
125 |             self.time = t
126 |             self.torg = 0.
127 |             return t
128 | 
BSZ = 1024

## system execution of 'command' with input 'input' to 'command'
def System(command,input = ""):
    """system execution of 'command' with input 'input' to 'command'
    Returns tuple (status,output to stdout,output to stderr)
    with outputs split on newlines and returned as lists.

    'command' is run through the shell.  A non-empty 'input' is sent to
    the command's stdin followed by a newline; stdin is always closed
    afterwards so commands that read until end-of-file terminate."""
    from sys import version_info

    options = dict(shell = True,bufsize = BSZ,
                   stdin = PIPE,stdout = PIPE,stderr = PIPE)
    if platform != "win32":
        options["close_fds"] = True
    run = Popen(command,**options)

    data = None
    if input != "":
        data = input+"\n"
        ## Python 3 pipes carry bytes; Python 2 str already is bytes
        if version_info[0] >= 3 and isinstance(data,str):
            data = data.encode()
    ## communicate() feeds stdin, closes it and drains stdout and stderr
    ## together, so a full stderr pipe cannot block the stdout read
    runOutput,runErrors = run.communicate(data)
    return run.returncode,runOutput.splitlines(),runErrors.splitlines()
149 | 
150 | ## allow to open ordinary, gzipped, bzipped files
def xOpen(name,mode = "r"):
    """Open 'name' with the opener matching its extension.

    ".gz" files are opened with GzipFile, ".bz2" files with BZ2File and
    everything else with the builtin open, so plain and compressed
    files share one call syntax.  Returns the opened file object."""
    openers = {".gz": GzipFile, ".bz2": BZ2File}
    suffix = splitext(name)[1]
    opener = openers.get(suffix,open)
    return opener(name,mode)
163 | 
164 | ## print timing info: t0 = start time, t1 = final time
def printTiming(t0,t1):
    """ Given start time t0 and end time t1 in seconds,
    as returned by time.time(),
    print a nice representation like

    Started  : Mon Jan  7 23:18:21 2008
    Finished : Mon Jan  7 23:18:22 2008
    Elapsed  : 0.9 (00:00:00.9)

    Elapsed time as given in seconds and as (hh:mm:ss.s)"""

    ## split the elapsed seconds into whole hours, whole minutes
    ## and the remaining (fractional) seconds
    dt = t1-t0
    dh = int(dt/3600.)
    du = dt-float(3600*dh)
    dm = int(du/60.)
    ds = du-float(60*dm)

    ## single-argument print(...) gives the same output as a Python 2
    ## print statement and is also valid as a Python 3 function call
    print("\nStarted  : %s" % ctime(t0))
    print("Finished : %s" % ctime(t1))
    print("Elapsed  : %.1f (%2.2d:%2.2d:%04.1f)\n" % (dt,dh,dm,ds))
185 | 
186 | ## invert a dictionary swapping keys and values
def invertDict(org_dict):
    """Group the keys of org_dict by their value.

    Returns a defaultdict(list) mapping each value to the list of keys
    that carried it.  Every value must be hashable (e.g. a string),
    otherwise using it as a key fails."""
    grouped = defaultdict(list)
    for key,val in org_dict.items():
        grouped[val].append(key)
    return grouped
195 | 


--------------------------------------------------------------------------------
/oldcode/misc/vector.c:
--------------------------------------------------------------------------------
// To force linking in all needed Intel routines
// See compileCX for details (linking against .a)
//
// Dummy() only references the vs* symbols listed below; the calls pass
// no arguments and rely on implicit declarations (no header is
// included here).
// NOTE(review): presumably Dummy() is never meant to be executed --
// confirm before calling it from anywhere.
void Dummy(void)
{
	// arithmetic
	vsAdd();
	vsSub();
	vsDiv();
	vsSqr();
	vsMul();
	vsAbs();
	vsInv();

	// trigonometric
	vsSin();
	vsCos();
	vsSinCos();
	vsTan();
	vsAsin();
	vsAcos();
	vsAtan();
	vsAtan2();

	// hyperbolic
	vsSinh();
	vsCosh();
	vsTanh();
	vsAsinh();
	vsAcosh();
	vsAtanh();

	// powers and roots
	vsPow();
	vsPowx();
	vsSqrt();
	vsCbrt();
	vsInvSqrt();
	vsInvCbrt();
	vsHypot();

	// rounding
	vsFloor();
	vsCeil();
	vsRound();
	vsTrunc();
	vsRint();
	vsNearbyInt();
	vsModf();

	// exponential and logarithm
	vsExp();
	vsLn();
	vsLog10();

	// error function family
	vsErf();
	vsErfc();
	vsErfInv();
}
53 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*- 
 3 | 
 4 | """ setuptools setup.py for python-cuda """
 5 | 
# ez_setup is the bootstrap module shipped next to this file; it must
# run before setuptools is imported below.
from ez_setup import use_setuptools
use_setuptools(version='0.6c9')

from setuptools import setup, find_packages

setup(
    name = 'python-cuda',

    version = '2.1-0.0.1',

    # Packages are listed explicitly (find_packages is imported above
    # but not used), so a new sub-package must be added here by hand.
    packages = ['cuda',
                'cuda.cu',
                'cuda.cuda', 
                'cuda.cublas', 
                'cuda.cufft', 
                'cuda.sugar',
                'cuda.sugar.memory', 
                'cuda.sugar.kernel', 
                'cuda.sugar.fft',
                'cuda.sugar.blas',
                'cuda.sugar.query', 
                'cuda.utils'],

    package_dir = {'cuda':'cuda'},

    # ship the .cu kernel sources that live in cuda/sugar/fft
    package_data = {'cuda.sugar.fft': ['*.cu'] },

    install_requires=[
        "numpy>=1.3.0",
        "scipy>=0.7.0",
    ],


# TODO: project metadata below has not been filled in yet.
#     author='',
#     author_email='',
#     url='',
#     description='Python bindings for CUDA 2.1 with numpy integration',
#     long_description = """ """,
#     download_url='',
#     license='?',
#     package_data = {}

)
49 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_add.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | from ctypes import *
 4 | from time import time
 5 | 
 6 | #from cuda.cu_defs import *
 7 | from cu.cu_defs import *
 8 | #from cuda.cu_api import *
 9 | from cu.cu_api import *
10 | #from cuda.cu_utils import *
11 | from utils.cu_utils import *
12 | 
13 | from cpuFunctions import fixedInit,cpuVADD,checkError
14 | 
BLOCK_SIZE = 64          # threads per block (x dimension)
GRID_SIZE  = 256         # blocks per launch (x dimension)
S4 = sizeof(c_float)     # bytes per single-precision float
checkErrorFlag = False   # set True to compare the CPU and GPU results

def main(device,vlength = 128,loops = 1):
    """Time the gpuVADD kernel against cpuVADD for one vector length.

    device  -- object providing the loaded kernels in device.functions
    vlength -- number of float elements per vector
    loops   -- number of timed repetitions on the GPU and on the CPU

    Prints vlength followed by flops/t1 (CPU) and flops/t0 (GPU), where
    flops counts 1.e-9*vlength per loop."""

    n2 = vlength ## Vector length
    gpuVADD = device.functions["gpuVADD"]

    ## host buffers: input, CPU result, GPU result
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    fixedInit(h_X)

    ## NOTE(review): getMemory presumably allocates device memory and
    ## uploads the host buffer -- confirm in utils.cu_utils
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    ## kernel arguments are laid out at byte offsets 0, 4 and 8;
    ## the total argument size is 12 bytes
    cuFuncSetBlockShape(gpuVADD,BLOCK_SIZE,1,1)
    cuParamSeti(gpuVADD,0,d_X)
    cuParamSeti(gpuVADD,4,d_Y)
    cuParamSeti(gpuVADD,8,n2)
    cuParamSetSize(gpuVADD,12)

    ## synchronize before and after so t0 covers all the launches
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuVADD,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    flops = (1.e-9*n2)*float(loops)
    cuMemcpyDtoH(g_Y,d_Y,n2*S4)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)

    ## same number of repetitions on the CPU
    t1 = time()
    for i in range(loops):
        cpuVADD(h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
63 | 
64 | if __name__ == "__main__":
65 |     import sys
66 | 
67 |     device = cu_CUDA()
68 |     device.getSourceModule("gpuFunctions.cubin")
69 |     device.getFunction("gpuVADD")
70 | 
71 |     lmin,lmax = 7,24
72 |     if len(sys.argv) > 1:
73 |         lmin = lmax = int(sys.argv[1])
74 |     loopx = -1
75 |     if len(sys.argv) > 2:
76 |         loopx = int(sys.argv[2])
77 |     lmax = min(max(0,lmax),24)
78 |     lmin = min(max(0,lmin),lmax)
79 |     for l in range(lmin,lmax+1):
80 |         if l < 10:
81 |             loops = 25000
82 |         elif l < 17:
83 |             loops = 10000
84 |         elif l < 21:
85 |             loops = 250
86 |         else:
87 |             loops = 25
88 |         vlength = 1 << l
89 |         if loopx > 0:
90 |             loops = loopx
91 |         print "%5d %5d" % (l,loops),
92 |         main(device,vlength,loops)
93 |     cuCtxDetach(device.context)
94 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_blsc.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cu.cu_defs import *
  7 | from cu.cu_api import *
  8 | from utils.cu_utils import *
  9 | 
 10 | from cpuFunctions import randInit,checkError
 11 | 
 12 | UseVML = False
 13 | if UseVML:
 14 |     from mklMath import cpuBLSC
 15 | else:
 16 |     from cpuFunctions import cpuBLSC
 17 | 
BLOCK_SIZE = 128         # threads per block (x dimension)
GRID_SIZE  = 192         # blocks per launch (x dimension)
checkErrorFlag = False   # set True to compare the CPU and GPU results

S4 = sizeof(c_float)     # bytes per single-precision float

def main(device,vlength = 128,loops = 1):
    """Time the gpuBLSC kernel against cpuBLSC for one vector length.

    device  -- object providing the loaded kernels in device.functions
    vlength -- number of float elements per vector
    loops   -- number of timed repetitions on the GPU and on the CPU

    Prints vlength followed by flops/t1 (CPU) and flops/t0 (GPU)."""

    n2 = vlength ## Vector length

    gpuBLSC = device.functions["gpuBLSC"]

    ## host buffers: three random inputs (S, X, T) and the two CPU
    ## results (C, P)
    h_S = (c_float*n2)()
    h_X = (c_float*n2)()
    h_T = (c_float*n2)()
    h_C = (c_float*n2)()
    h_P = (c_float*n2)()


    randInit(h_S,5.,30.)
    randInit(h_X,1.,100.)
    randInit(h_T,.25,10.)
    R,V = .03,.3

    ## NOTE(review): getMemory presumably allocates device memory and
    ## uploads the host buffer -- confirm in utils.cu_utils
    d_S = getMemory(h_S)
    d_X = getMemory(h_X)
    d_T = getMemory(h_T)
    d_C = getMemory(h_C)
    d_P = getMemory(h_P)

    ## eight 4-byte kernel arguments at byte offsets 0..28, 32 in total
    cuFuncSetBlockShape(gpuBLSC,BLOCK_SIZE,1,1)
    cuParamSeti(gpuBLSC, 0,d_C)
    cuParamSeti(gpuBLSC, 4,d_P)
    cuParamSeti(gpuBLSC, 8,d_S)
    cuParamSeti(gpuBLSC,12,d_X)
    cuParamSeti(gpuBLSC,16,d_T)
    cuParamSetf(gpuBLSC,20,R)
    cuParamSetf(gpuBLSC,24,V)
    cuParamSeti(gpuBLSC,28,n2)
    cuParamSetSize(gpuBLSC,32)

    ## synchronize before and after so t0 covers all the launches
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuBLSC,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    ## NOTE(review): the scale factor is 2.e-6 here, while the sibling
    ## benchmarks scale by 1.e-9 -- confirm the intended unit
    flops = (2.e-6*n2)*float(loops)
    g_C = (c_float*n2)()
    g_P = (c_float*n2)()
    cuMemcpyDtoH(g_C,d_C,n2*S4)
    cuMemcpyDtoH(g_P,d_P,n2*S4)
    cuCtxSynchronize()

    cuMemFree(d_S)
    cuMemFree(d_X)
    cuMemFree(d_T)
    cuMemFree(d_C)
    cuMemFree(d_P)

    ## same number of repetitions on the CPU
    t1 = time()
    for i in range(loops):
        cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2)
    t1 = time()-t1
    print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_C,g_C)
        print "Avg rel error (call) = %.2e" % (err,)
        err,mxe = checkError(h_P,g_P)
        print "Avg rel error (put)  = %.2e" % (err,)
 90 | 
 91 | if __name__ == "__main__":
 92 |     import sys
 93 | 
 94 |     device = cu_CUDA()
 95 |     device.getSourceModule("gpuFunctions.cubin")
 96 |     device.getFunction("gpuBLSC")
 97 | 
 98 |     lmin,lmax = 7,23
 99 |     if len(sys.argv) > 1:
100 |         lmin = lmax = int(sys.argv[1])
101 |     lmax = min(max(0,lmax),23)
102 |     lmin = min(max(0,lmin),lmax)
103 |     for l in range(lmin,lmax+1):
104 |         if l < 10:
105 |             loops = 1000
106 |         elif l < 13:
107 |             loops = 500
108 |         elif l < 17:
109 |             loops = 100
110 |         elif l < 21:
111 |             loops = 10
112 |         else:
113 |             loops = 5
114 |         loops = 2
115 |         vlength = 1 << l
116 |         print "%5d %5d" % (l,loops),
117 |         main(device,vlength,loops)
118 |     cuCtxDetach(device.context)
119 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_gflops.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | from ctypes import *
 4 | from time import time
 5 | 
 6 | from cu.cu_defs import *
 7 | from cu.cu_api import *
 8 | from utils.cu_utils import *
 9 | 
10 | from cpuFunctions import cpuGFLOPS
11 | 
## work-size constants used for the CPU flop count
BLOCK_SIZE_C = 192
ITERATIONS_C = 512

## GPU launch shape and work-size constants used for the GPU flop count
BLOCK_SIZE_G = 512
GRID_SIZE_G  = 512
ITERATIONS_G = 512

def main(device,loops = 1):
    """Time the gpuGFLOPS kernel and cpuGFLOPS, 'loops' runs each.

    device -- object providing the loaded kernels in device.functions

    Prints CPU time, CPU rate, GPU time, GPU rate and, in brackets, the
    hard-coded peak figure peakg."""

    gpuGFLOPS = device.functions["gpuGFLOPS"]

    cuFuncSetBlockShape(gpuGFLOPS,BLOCK_SIZE_G,1,1)

    ## every launch is bracketed by a synchronize inside the timed loop
    t0 = time()
    for i in range(loops):
        cuCtxSynchronize()
        cuLaunchGrid(gpuGFLOPS,GRID_SIZE_G,1)
        cuCtxSynchronize()
    t0 = time()-t0

    ## NOTE(review): 4096 is presumably the flop count per iteration of
    ## the kernels -- confirm against cpuFunctions.c / the .cu source
    flopsc = 4096.*ITERATIONS_C*BLOCK_SIZE_C
    flopsg = 4096.*ITERATIONS_G*BLOCK_SIZE_G*GRID_SIZE_G

    flopsc *= 1.e-9*float(loops)
    flopsg *= 1.e-9*float(loops)

    t1 = time()
    for i in range(loops):
        cpuGFLOPS()
    t1 = time()-t1
#    peakg = 4.*8.*2.*1.458 # 4MP*8SP/MP*2flops/SP/clock*clock[GHz] (8600GTS)
    peakg = 14.*8.*2.*1.512 # 14MP*8SP/MP*2flops/SP/clock*clock[GHz] (9800GT)
    print "%8.3f%8.2f%8.3f%8.2f [%.2f]" % (
        t1,flopsc/t1,t0,flopsg/t0,peakg)
46 | 
47 | if __name__ == "__main__":
48 |     import sys
49 | 
50 |     device = cu_CUDA()
51 |     device.getSourceModule("gpuFunctions.cubin")
52 |     device.getFunction("gpuGFLOPS")
53 | 
54 |     loops = 1
55 |     if len(sys.argv) > 1:
56 |         loops = int(sys.argv[1])
57 |     print "%5d" % (loops),
58 |     main(device,loops)
59 |     cuCtxDetach(device.context)
60 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_poly.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cu.cu_defs import *
  7 | from cu.cu_api import *
  8 | from utils.cu_utils import *
  9 | 
 10 | from cpuFunctions import cpuPOLY5,cpuPOLY10,cpuPOLY20,cpuPOLY40
 11 | 
 12 | BLOCK_SIZE = 144
 13 | GRID_SIZE  = 192
 14 | ##BLOCK_SIZE = 320
 15 | ##GRID_SIZE  = 8
 16 | checkErrorFlag = False
 17 | 
 18 | S4 = sizeof(c_float)
 19 | psize = 5
 20 | 
 21 | def main(device,vlength = 128,loops = 1,m1 = 1):
 22 |     print "%5d %5d %5d" % (l,loops,m1),
 23 | 
 24 |     alfa = c_float(.5)
 25 |     n2 = vlength ## Vector length
 26 | 
 27 |     mp = 1 << (m1-1)
 28 |     print "%5d" % (mp*psize),
 29 |     fcn = "gpuPOLY%d"%(mp*psize)
 30 |     gpuPOLY = device.functions[fcn]
 31 |     h_X = (c_float*n2)()
 32 |     h_Y = (c_float*n2)()
 33 |     g_Y = (c_float*n2)()
 34 | 
 35 |     vectorInit(h_X)
 36 | 
 37 |     d_X = getMemory(h_X)
 38 |     d_Y = getMemory(h_Y)
 39 | 
 40 |     cuFuncSetBlockShape(gpuPOLY,BLOCK_SIZE,1,1)
 41 |     cuParamSeti(gpuPOLY,0,d_X)
 42 |     cuParamSeti(gpuPOLY,4,d_Y)
 43 |     cuParamSeti(gpuPOLY,8,n2)
 44 |     cuParamSetSize(gpuPOLY,12)
 45 | 
 46 |     cuCtxSynchronize()
 47 |     cuLaunchGrid(gpuPOLY,GRID_SIZE,1)
 48 |     t0 = time()
 49 |     for i in range(loops):
 50 |         cuLaunchGrid(gpuPOLY,GRID_SIZE,1)
 51 |     cuCtxSynchronize()
 52 |     t0 = time()-t0
 53 | 
 54 |     flops = (2.e-9*m1*n2*(psize-1))*float(loops)
 55 |     cuMemcpyDtoH(g_Y,d_Y,n2*S4)
 56 |     cuCtxSynchronize()
 57 | 
 58 |     cuMemFree(d_X)
 59 |     cuMemFree(d_Y)
 60 | 
 61 |     cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
 62 |     t1 = time()
 63 |     for i in range(loops):
 64 |         cpuPOLY(h_X,h_Y)
 65 |     t1 = time()-t1
 66 |     print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)
 67 | 
 68 |     if checkErrorFlag:
 69 |         err,mxe = checkError(h_Y,g_Y)
 70 |         print "Avg and max rel error = %.2e %.2e" % (err,mxe)
 71 | 
 72 | if __name__ == "__main__":
 73 |     import sys
 74 | 
 75 |     lmin,lmax = 7,23
 76 |     if len(sys.argv) > 1:
 77 |         lmin = lmax = int(sys.argv[1])
 78 |     loopx = -1
 79 |     if len(sys.argv) > 2:
 80 |         loopx = int(sys.argv[2])
 81 |     m1 = 4
 82 |     if len(sys.argv) > 3:
 83 |         m1 = min(4,int(sys.argv[3]))
 84 |     lmax = min(max(0,lmax),23)
 85 |     lmin = min(max(0,lmin),lmax)
 86 | 
 87 |     mp = 1 << (m1-1)
 88 |     device = cu_CUDA()
 89 |     device.getSourceModule("gpuFunctions.cubin")
 90 |     fcn = "gpuPOLY%d"%(mp*psize)
 91 |     device.getFunction(fcn)
 92 | 
 93 |     for l in range(lmin,lmax+1):
 94 |         if l < 10:
 95 |             loops = 10000/m1
 96 |         elif l < 13:
 97 |             loops = 5000/m1
 98 |         elif l < 17:
 99 |             loops = 500/m1
100 |         elif l < 21:
101 |             loops = 250/m1
102 |         else:
103 |             loops = 100/m1
104 |         vlength = 1 << l
105 |         if loopx > 0:
106 |             loops = loopx
107 |         main(device,vlength,loops,m1)
108 |     cuCtxDetach(device.context)
109 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_saxpy.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | from ctypes import *
 4 | from time import time
 5 | 
 6 | from cu.cu_defs import *
 7 | from cu.cu_api import *
 8 | from utils.cu_utils import *
 9 | 
10 | from cpuFunctions import fixedInit,cpuSAXPY,checkError
11 | 
BLOCK_SIZE = 64          # threads per block (x dimension)
GRID_SIZE  = 256         # blocks per launch (x dimension)
S4 = sizeof(c_float)     # bytes per single-precision float
checkErrorFlag = False   # set True to compare the CPU and GPU results

def main(device,vlength = 128,loops = 1):
    """Time the gpuSAXPY kernel against cpuSAXPY for one vector length.

    device  -- object providing the loaded kernels in device.functions
    vlength -- number of float elements per vector
    loops   -- number of timed repetitions on the GPU and on the CPU

    Prints vlength followed by flops/t1 (CPU) and flops/t0 (GPU), where
    flops counts 2.e-9*vlength per loop."""

    alfa = c_float(.5)
    n2 = vlength ## Vector length
    gpuSAXPY = device.functions["gpuSAXPY"]

    ## host buffers: input, CPU result, GPU result
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    g_Y = (c_float*n2)()

    fixedInit(h_X)

    ## NOTE(review): getMemory presumably allocates device memory and
    ## uploads the host buffer -- confirm in utils.cu_utils
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)

    ## kernel arguments at byte offsets 0 (float), 4, 8 and 12;
    ## 16 bytes in total
    cuFuncSetBlockShape(gpuSAXPY,BLOCK_SIZE,1,1)
    cuParamSetf(gpuSAXPY,0,alfa)
    cuParamSeti(gpuSAXPY,4,d_X)
    cuParamSeti(gpuSAXPY,8,d_Y)
    cuParamSeti(gpuSAXPY,12,n2)
    cuParamSetSize(gpuSAXPY,16)

    ## synchronize before and after so t0 covers all the launches
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuSAXPY,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    flops = (2.e-9*n2)*float(loops)
    cuMemcpyDtoH(g_Y,d_Y,n2*S4)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)

    ## same number of repetitions on the CPU
    t1 = time()
    for i in range(loops):
        cpuSAXPY(alfa,h_X,h_Y)
    t1 = time()-t1
    print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        err,mxe = checkError(h_Y,g_Y)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
62 | 
63 | if __name__ == "__main__":
64 |     import sys
65 | 
66 |     device = cu_CUDA()
67 |     device.getSourceModule("gpuFunctions.cubin")
68 |     device.getFunction("gpuSAXPY")
69 | 
70 |     lmin,lmax = 7,24
71 |     if len(sys.argv) > 1:
72 |         lmin = lmax = int(sys.argv[1])
73 |     loopx = -1
74 |     if len(sys.argv) > 2:
75 |         loopx = int(sys.argv[2])
76 |     lmax = min(max(0,lmax),24)
77 |     lmin = min(max(0,lmin),lmax)
78 |     for l in range(lmin,lmax+1):
79 |         if l < 10:
80 |             loops = 25000
81 |         elif l < 17:
82 |             loops = 10000
83 |         elif l < 21:
84 |             loops = 250
85 |         else:
86 |             loops = 25
87 |         vlength = 1 << l
88 |         if loopx > 0:
89 |             loops = loopx
90 |         print "%5d %5d" % (l,loops),
91 |         main(device,vlength,loops)
92 |     cuCtxDetach(device.context)
93 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_sgemm.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cu.cu_defs import *
  7 | from cu.cu_api import *
  8 | from utils.cu_utils import *
  9 | 
 10 | from cpuFunctions import arrayInit,cpuSGEMM,checkError
 11 | from ctypes_array import *
 12 | 
 13 | useSciPy = True
 14 | if useSciPy:
 15 |     from scipy.lib.blas.fblas import sgemm as _sgemm
 16 |     # C : A*B (on the GPU)
 17 |     # F : (A*B).T = B.T * A.T (scipy)
 18 |     def sgemm(z,x,y,m,n,k):
 19 |         nx = convert(x,(m,k),"C").T
 20 |         ny = convert(y,(k,n),"C").T
 21 |         nz = _sgemm(1.,ny,nx)
 22 |         convert(nz,out=z)
 23 |         return z
 24 | else:
 25 |     # C : A*B (on the CPU) (in C)
 26 |     sgemm = cpuSGEMM
 27 | 
BLOCK_SIZE  = 1 << 4     # 16: thread block is BLOCK_SIZE x BLOCK_SIZE
S4 = sizeof(c_float)     # bytes per single-precision float

def main(N = 1024,L = 100):
    """Time L launches of gpuSGEMM against L calls of the CPU sgemm.

    N -- base size; the run uses M = N, K = N/2 and N = 2*N
    L -- number of repetitions on the GPU and on the CPU

    Prints the setup overhead, times and rates for both sides, the
    speedup t1/t0 and the result of checkError on the two outputs."""
    M = N
    K = N >> 1
    N = N << 1
    flops = (2.e-9*M*N)*float(K*L)
    print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops)
    ## NOTE(review): none of these five names is read again below
    na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0.

    ## t0 first measures the setup: context, module load, uploads and
    ## kernel argument setup
    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")

    sizeA = M*K
    sizeB = K*N
    sizeC = M*N

    h_A = (c_float*sizeA)()
    h_B = (c_float*sizeB)()

    arrayInit(h_A)
    arrayInit(h_B)

    ## NOTE(review): getMemory is called with host arrays and with a
    ## plain element count; presumably it uploads the former and only
    ## allocates for the latter -- confirm in utils.cu_utils
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    ## five 4-byte kernel arguments at byte offsets 0..16, 20 in total
    cuFuncSetBlockShape(gpuSGEMM,BLOCK_SIZE,BLOCK_SIZE,1)
    cuFuncSetSharedSize(gpuSGEMM,2*BLOCK_SIZE*BLOCK_SIZE*S4)
    cuParamSeti(gpuSGEMM,0,d_C)
    cuParamSeti(gpuSGEMM,4,d_A)
    cuParamSeti(gpuSGEMM,8,d_B)
    cuParamSeti(gpuSGEMM,12,K)
    cuParamSeti(gpuSGEMM,16,N)
    cuParamSetSize(gpuSGEMM,20)
    ## tt accumulates setup overhead plus kernel time
    tt = t0 = time()-t0
    print "Overhead driver API: %.3f sec\n" % t0

    t0 = time()
    cuCtxSynchronize()
    for i in range(L):
        cuLaunchGrid(gpuSGEMM,N/BLOCK_SIZE,M/BLOCK_SIZE)
    cuCtxSynchronize()
    t0 = time()-t0
    tt += t0

    h_C = (c_float*sizeC)()
    cuMemcpyDtoH(h_C,d_C,S4*sizeC)
    cuCtxSynchronize()

    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)

    print "Processing time: %.3g (%.3g) sec" % (t0,tt)
    print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt)

    ref = (c_float*sizeC)()

    ## CPU reference, repeated the same number of times
    t1 = time()
    for i in range(L):
        sgemm(ref,h_A,h_B,M,N,K)
    t1 = time()-t1
    print "\nProcessing time: %.3g sec" % t1
    print "Gigaflops CPU  : %.2f" % (flops/t1)
    print "Speedup GPU/CPU: %.2f" % (t1/t0)

    err,mxe = checkError(ref,h_C)
    print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)
101 | 
102 | if __name__ == "__main__":
103 |     import sys
104 | 
105 |     M, L = 1024, 100
106 |     if len(sys.argv) > 1:
107 |         M = int(sys.argv[1])
108 |     M = (M >> 5) << 5 # multiple of (2*BLOCK_SIZE)
109 |     if len(sys.argv) > 2:
110 |         L = int(sys.argv[2])
111 | 
112 |     print "+-----------------------+"
113 |     print "| Matrix Multiplication |"
114 |     print "| using CUDA driver API |"
115 |     print "+-----------------------+\n"
116 |     main(M,L)
117 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_streams.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | 
  5 | from cu.cu_defs import *
  6 | from cu.cu_api import *
  7 | from utils.cu_utils import *
  8 | 
  9 | from ctypes_array import *
 10 | from numpy import all,int32,zeros
 11 | 
 12 | MB = 1024*1024
 13 | SI = sizeof(c_int)
 14 | 
 15 | def check_results(a,n,c):
 16 |     u = (c_int*n).from_address(a.value)
 17 |     a = convert(u,(n,))
 18 |     c = c.value
 19 |     return all(a==c)
 20 | 
 21 | def main(device):
 22 |     nstreams = 8
 23 |     nreps = 10
 24 |     n = 16*MB
 25 |     nbytes = n*SI
 26 | 
 27 |     count = c_int()
 28 |     cuDeviceGetCount(byref(count))
 29 |     if count == 0:
 30 |         print "no CUDA capable device found"
 31 |         return
 32 | 
 33 |     major = c_int()
 34 |     minor = c_int()
 35 |     cuDeviceComputeCapability(byref(major),byref(minor),device.device)
 36 |     if major.value == 1 and minor.value < 1:
 37 |         print "%s does not support streams" % props.name
 38 |         return
 39 | 
 40 |     init_array = device.functions["init_array"]
 41 |     u = zeros((n,),dtype=int32)+5
 42 |     x = convert(u)
 43 |     c = c_int(x[0])
 44 |     a = c_void_p()
 45 |     cuMemAllocHost(byref(a),nbytes)
 46 | 
 47 |     d_a = getMemory(n)
 48 |     d_c = getMemory(x)
 49 | 
 50 |     streams = (CUstream*nstreams)()
 51 |     for i in range(nstreams):
 52 |         stream = CUstream()
 53 |         cuStreamCreate(byref(stream),0)
 54 |         streams[i] = stream
 55 | 
 56 |     ev_start = CUevent()
 57 |     ev_stop = CUevent()
 58 |     cuEventCreate(byref(ev_start),0)
 59 |     cuEventCreate(byref(ev_stop),0)
 60 | 
 61 |     cuEventRecord(ev_start,streams[0])
 62 |     cuMemcpyDtoHAsync(a,d_a,nbytes,streams[0])
 63 |     cuEventRecord(ev_stop,streams[0])
 64 |     cuEventSynchronize(ev_stop)
 65 |     t_copy = c_float()
 66 |     cuEventElapsedTime(byref(t_copy),ev_start,ev_stop)
 67 |     t_copy = t_copy.value
 68 | 
 69 |     cuFuncSetBlockShape(init_array,512,1,1)
 70 |     cuParamSeti(init_array,0,d_a)
 71 |     cuParamSeti(init_array,4,d_c)
 72 |     cuParamSetSize(init_array,8)
 73 | 
 74 |     cuEventRecord(ev_start,streams[0])
 75 |     cuLaunchGrid(init_array,n/512,1)
 76 |     cuEventRecord(ev_stop,streams[0])
 77 |     cuEventSynchronize(ev_stop)
 78 |     t_kernel = c_float()
 79 |     cuEventElapsedTime(byref(t_kernel),ev_start,ev_stop)
 80 |     t_kernel = t_kernel.value
 81 | 
 82 |     cuFuncSetBlockShape(init_array,512,1,1)
 83 |     cuParamSeti(init_array,0,d_a)
 84 |     cuParamSeti(init_array,4,d_c)
 85 |     cuParamSetSize(init_array,8)
 86 | 
 87 |     cuEventRecord(ev_start,streams[0])
 88 |     for i in range(nreps):
 89 |         cuLaunchGrid(init_array,n/512,1)
 90 |         cuMemcpyDtoH(a,d_a,nbytes)
 91 |     cuEventRecord(ev_stop,streams[0])
 92 |     cuEventSynchronize(ev_stop)
 93 |     elapsed0 = c_float()
 94 |     cuEventElapsedTime(byref(elapsed0),ev_start,ev_stop)
 95 |     elapsed0 = elapsed0.value
 96 | 
 97 |     memset(a,255,nbytes)
 98 |     cuMemsetD32(d_a,0,n)
 99 |     cuEventRecord(ev_start,streams[0])
100 |     a_0 = a.value
101 |     off = n*SI/nstreams
102 |     for k in range(nreps):
103 |         for i in range(nstreams):
104 |             d_ai = d_a+i*n*SI/nstreams
105 |             cuParamSeti(init_array,0,d_ai)
106 |             cuLaunchGridAsync(init_array,n/(nstreams*512),1,streams[i])
107 |         for i in range(nstreams):
108 |             ai = a_0+i*off
109 |             di = d_c+i*off
110 |             cuMemcpyDtoHAsync(ai,di,nbytes/nstreams,streams[i])
111 |     cuEventRecord(ev_stop,streams[0])
112 |     cuEventSynchronize(ev_stop)
113 |     elapsed1 = c_float()
114 |     cuEventElapsedTime(byref(elapsed1),ev_start,ev_stop)
115 |     elapsed1 = elapsed1.value
116 | 
117 |     passed = check_results(a,n,c)
118 | 
119 |     for i in range(nstreams):
120 |         cuStreamDestroy(streams[i])
121 |     cuEventDestroy(ev_start)
122 |     cuEventDestroy(ev_stop)
123 | 
124 |     cuMemFree(d_a)
125 |     cuMemFree(d_c)
126 |     cuMemFreeHost(a)
127 | 
128 |     print "memcopy:\t%.2f" % t_copy
129 |     print "kernel:\t\t%.2f" % t_kernel
130 |     print "non-streamed:\t%.2f (%.2f expected)" % (
131 |         elapsed0/nreps,t_kernel+t_copy)
132 |     print "%d streams:\t%.2f (%.2f expected)" % (
133 |         nstreams,elapsed1/nreps,t_kernel+t_copy/nstreams)
134 | 
135 |     print "-------------------------------"
136 |     if passed:
137 |         print "Test PASSED"
138 |     else:
139 |         print "Test FAILED"
140 | 
if __name__ == "__main__":
    ## load the compiled module, look the kernel up, run the test and
    ## detach from the context afterwards
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    device.getFunction("init_array")
    main(device)
    cuCtxDetach(device.context)
147 | 


--------------------------------------------------------------------------------
/tests/cu/todo/cu_trig.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cu.cu_defs import *
  7 | from cu.cu_api import *
  8 | from utils.cu_utils import *
  9 | 
 10 | from cpuFunctions import vectorInit,checkError
 11 | 
 12 | UseVML = True
 13 | if UseVML:
 14 |     from mklMath import cpuTRIG
 15 | else:
 16 |     from cpuFunctions import cpuTRIG
 17 | 
BLOCK_SIZE = 128         # threads per block (x dimension)
GRID_SIZE  = 192         # blocks per launch (x dimension)
checkErrorFlag = False   # set True to compare the CPU and GPU results

S4 = sizeof(c_float)     # bytes per single-precision float

def main(device,vlength = 128,loops = 1):
    """Time the gpuTRIG kernel against cpuTRIG for one vector length.

    device  -- object providing the loaded kernels in device.functions
    vlength -- number of float elements per vector
    loops   -- number of timed repetitions on the GPU and on the CPU

    Prints vlength followed by flops/t1 (CPU) and flops/t0 (GPU), where
    flops counts 8.e-9*vlength per loop."""

    n2 = vlength ## Vector length
    gpuTRIG = device.functions["gpuTRIG"]

    ## host buffers: input X and the two CPU results Y and Z (labelled
    ## cos and sin in the error report below)
    h_X = (c_float*n2)()
    h_Y = (c_float*n2)()
    h_Z = (c_float*n2)()

    vectorInit(h_X)

    ## NOTE(review): getMemory presumably allocates device memory and
    ## uploads the host buffer -- confirm in utils.cu_utils
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    d_Z = getMemory(h_Z)

    ## four 4-byte kernel arguments at byte offsets 0..12, 16 in total
    cuFuncSetBlockShape(gpuTRIG,BLOCK_SIZE,1,1)
    cuParamSeti(gpuTRIG,0,d_Y)
    cuParamSeti(gpuTRIG,4,d_Z)
    cuParamSeti(gpuTRIG,8,d_X)
    cuParamSeti(gpuTRIG,12,n2)
    cuParamSetSize(gpuTRIG,16)

    ## synchronize before and after so t0 covers all the launches
    cuCtxSynchronize()
    t0 = time()
    for i in range(loops):
        cuLaunchGrid(gpuTRIG,GRID_SIZE,1)
    cuCtxSynchronize()
    t0 = time()-t0

    flops = (8.e-9*n2)*float(loops)
    g_Y = (c_float*n2)()
    g_Z = (c_float*n2)()
    cuMemcpyDtoH(g_Y,d_Y,S4*n2)
    cuMemcpyDtoH(g_Z,d_Z,S4*n2)
    cuCtxSynchronize()

    cuMemFree(d_X)
    cuMemFree(d_Y)
    cuMemFree(d_Z)

    ## same number of repetitions on the CPU
    t1 = time()
    for i in range(loops):
        cpuTRIG(h_Y,h_Z,h_X)
    t1 = time()-t1
    print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0)

    if checkErrorFlag:
        ## NOTE(review): checkError receives a third argument here but
        ## only two in the sibling benchmarks -- confirm its signature
        err,mxe = checkError(h_Y,g_Y,n2)
        print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe)
        err,mxe = checkError(h_Z,g_Z,n2)
        print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
 75 | 
 76 | if __name__ == "__main__":
 77 |     import sys
 78 | 
 79 |     device = cu_CUDA()
 80 |     device.getSourceModule("gpuFunctions.cubin")
 81 |     device.getFunction("gpuTRIG")
 82 | 
 83 |     lmin,lmax = 7,23
 84 |     if len(sys.argv) > 1:
 85 |         lmin = lmax = int(sys.argv[1])
 86 |     lmax = min(max(0,lmax),23)
 87 |     lmin = min(max(0,lmin),lmax)
 88 |     for l in range(lmin,lmax+1):
 89 |         if l < 10:
 90 |             loops = 10000
 91 |         elif l < 13:
 92 |             loops = 2000
 93 |         elif l < 17:
 94 |             loops = 250
 95 |         elif l < 21:
 96 |             loops = 100
 97 |         else:
 98 |             loops = 50
 99 |         vlength = 1 << l
100 |         print "%5d %5d" % (l,loops),
101 |         main(device,vlength,loops)
102 |     cuCtxDetach(device.context)
103 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_GL.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | # GLUT version
  4 | from ctypes import *
  5 | 
  6 | from ogl.gl import *
  7 | from OpenGL.GLUT import *
  8 | 
  9 | from cuda.cuda_defs import *
 10 | from cuda.cuda_api import *
 11 | 
 12 | lib = CDLL("./libkernelGL.so")
 13 | 
 14 | kernel1 = lib.__device_stub_kernel1
 15 | kernel1.restype = None
 16 | kernel1.argtypes = [ c_void_p, c_uint, c_uint, c_float ]
 17 | 
 18 | kernel2 = lib.__device_stub_kernel2
 19 | kernel2.restype = None
 20 | kernel2.argtypes = [ c_void_p, c_uint, c_uint, c_float ]
 21 | 
 22 | window_width = 512
 23 | window_height = 512
 24 | 
 25 | mesh_width = 256
 26 | mesh_height = 256
 27 | 
 28 | anim = 0.0
 29 | mouse_buttons = 0
 30 | rotate_x,rotate_y,translate_z = 0.,0.,-3.0
 31 | global mouse_old_x,mouse_old_y
 32 | 
 33 | vbo = GLuint()
 34 | 
 35 | kernel = kernel1
 36 | 
 37 | def main(argc,argv):
 38 |     global vbo
 39 | 
 40 |     glutInit(argc,argv)
 41 |     glutInitDisplayMode(GLUT_RGBA|GLUT_DOUBLE)
 42 |     glutInitWindowSize(window_width,window_height)
 43 |     glutCreateWindow("Cuda GL Demo")
 44 | 
 45 |     initGL()
 46 | 
 47 |     glutDisplayFunc(display)
 48 |     glutKeyboardFunc(keyboard)
 49 |     glutMouseFunc(mouse)
 50 |     glutMotionFunc(motion)
 51 | 
 52 |     vbo = createVBO()
 53 |     runCuda(vbo)
 54 | 
 55 |     glutMainLoop()
 56 | 
 57 | def runCuda(vbo):
 58 |     vptr = c_void_p()
 59 |     status = cudaGLMapBufferObject(byref(vptr),vbo)
 60 | 
 61 |     block = dim3(8,8,1)
 62 |     grid = dim3(mesh_width/block.x,mesh_height/block.y,1)
 63 |     status = cudaConfigureCall(grid,block,0,0)
 64 |     kernel(vptr,mesh_width,mesh_height,anim)
 65 | 
 66 |     status = cudaGLUnmapBufferObject(vbo)
 67 |     if status != 0:
 68 |         exit()
 69 | 
 70 | def initGL():
 71 |     glClearColor(0.0,0.0,0.0,1.0)
 72 |     glDisable(GL_DEPTH_TEST)
 73 | 
 74 |     glViewport(0,0,window_width,window_height)
 75 |     glMatrixMode(GL_PROJECTION)
 76 |     glLoadIdentity()
 77 |     ratio = float(window_width)/float(window_height)
 78 |     glFrustum(-1.,1.,-1.,1.,2.,10.)
 79 | 
 80 |     return True
 81 | 
 82 | def createVBO():
 83 |     global vbo
 84 |     glGenBuffers(1,byref(vbo))
 85 |     glBindBuffer(GL_ARRAY_BUFFER,vbo)
 86 | 
 87 |     size = mesh_width*mesh_height*4*sizeof(c_float)
 88 |     glBufferData(GL_ARRAY_BUFFER,size,0,GL_DYNAMIC_DRAW)
 89 | 
 90 |     glBindBuffer(GL_ARRAY_BUFFER,0)
 91 | 
 92 |     status = cudaGLRegisterBufferObject(vbo)
 93 |     return vbo
 94 | 
 95 | def deleteVBO():
 96 |     global vbo
 97 |     glBindBuffer(1,vbo)
 98 |     glDeleteBuffers(1,vbo)
 99 | 
100 |     status = cudaGLUnregisterBufferObject(vbo)
101 |     vbo = 0
102 | 
def display():
    """GLUT display callback: run the kernel, draw the mesh, advance `anim`."""
    global anim,vbo
    # Let the kernel rewrite the vertex buffer for this frame.
    runCuda(vbo)

    glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT)

    # View transform from the state that motion() maintains.
    glMatrixMode(GL_MODELVIEW)
    glLoadIdentity()
    glTranslatef(0.0,0.0,translate_z)
    glRotatef(rotate_x,1.0,0.0,0.0)
    glRotatef(rotate_y,0.0,1.0,0.0)

    # Vertices are read from the bound buffer: 4 floats each, offset 0.
    glBindBuffer(GL_ARRAY_BUFFER,vbo)
    glVertexPointer(4,GL_FLOAT,0,0)

    glEnableClientState(GL_VERTEX_ARRAY)
    glColor3f(1.0,0.0,0.0)
    glDrawArrays(GL_POINTS,0,mesh_width*mesh_height)
    glDisableClientState(GL_VERTEX_ARRAY)

    glutSwapBuffers()
    # Request another frame right away so the animation keeps running.
    glutPostRedisplay()

    anim += 0.01
127 | 
def keyboard(key,x,y):
    """GLUT keyboard callback: Escape deletes the vertex buffer and exits.

    x, y -- pointer position supplied with the event; not used
    """
    if key != chr(27):
        return
    deleteVBO()
    exit()
132 | 
def mouse(button,state,x,y):
    """GLUT mouse callback: track pressed buttons and the pointer position.

    A press sets bit `button` in mouse_buttons; any release clears the
    whole mask.  The position is remembered for motion().
    """
    global mouse_buttons
    global mouse_old_x,mouse_old_y
    if state == GLUT_DOWN:
        mouse_buttons = mouse_buttons | (1 << button)
    elif state == GLUT_UP:
        mouse_buttons = 0

    mouse_old_x,mouse_old_y = x,y
    glutPostRedisplay()
144 | 
def motion(x,y):
    """GLUT motion callback: turn pointer movement into rotation or zoom.

    With bit 0 of mouse_buttons set the view rotates; otherwise, with
    bit 2 set, it moves along z.  The position is stored for the next call.
    """
    global mouse_buttons
    global mouse_old_x,mouse_old_y
    global rotate_x,rotate_y,translate_z
    # movement since the previously recorded position
    dx = x-mouse_old_x
    dy = y-mouse_old_y

    rotating = mouse_buttons & 1
    zooming = mouse_buttons & 4
    if rotating:
        rotate_x += dy*0.2
        rotate_y += dx*0.2
    elif zooming:
        translate_z += dy*0.01

    mouse_old_x,mouse_old_y = x,y
160 | 
if __name__ == "__main__":
    # Select device 0, then pass the command line on to main(), which
    # forwards it to glutInit.
    from sys import argv
    argc = len(argv)
    cudaSetDevice(0)
    main(argc,argv)
166 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_GLimg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npinto/python-cuda/92898059d7a32b261ba0758e50dccfe5c4bd4ac4/tests/cuda/todo/cuda_GLimg.png


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_QtGL.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler,2007-08
  3 | # Qt4/PyQt4 version
  4 | from ctypes import *
  5 | import time
  6 | 
  7 | from PyQt4 import QtCore,QtGui,QtOpenGL
  8 | from PyQt4.QtCore import Qt
  9 | from ogl.gl import *
 10 | 
 11 | from cuda.cuda_defs import *
 12 | from cuda.cuda_api import *
 13 | from cuda.cuda_utils import *
 14 | 
# Shared library that exports the host-side launch stubs of both kernels.
lib = CDLL("./libkernelGL.so")

# Each stub takes (pointer, unsigned, unsigned, float) and returns nothing.
kernel1 = lib.__device_stub_kernel1
kernel1.restype = None
kernel1.argtypes = [ c_void_p,c_uint,c_uint,c_float ]

kernel2 = lib.__device_stub_kernel2
kernel2.restype = None
kernel2.argtypes = [ c_void_p,c_uint,c_uint,c_float ]

# Bytes per vertex: four C floats.
Sf4 = 4*sizeof(c_float)

# Kernel stub that CudaGLWidget.runCuda() launches.
kernel = kernel2
 28 | 
 29 | class CudaGLWidget(QtOpenGL.QGLWidget):
 30 |     def __init__(self,parent = None,name = None):
 31 |         QtOpenGL.QGLWidget.__init__(self,parent,name)
 32 |         self.setWindowTitle("Cuda GL Demo")
 33 | 
 34 |         self.device = cuda_CUDA()
 35 | 
 36 | 
 37 |         # self.initializeGL gets called automatically
 38 |         # and implicitly creates the OpenGL context
 39 |         self.reset()
 40 |         self.setGeometry(QtCore.QRect(0,0.,self.width,self.height))
 41 | 
 42 |         self.t0 = time.time()
 43 |         self.frames = 0
 44 |         self.startTimer(0)
 45 | 
 46 |     def reset(self):
 47 |         self.mouse_buttons = -1
 48 |         self.last_x = 0.
 49 |         self.last_y = 0.
 50 |         self.rot_x = 0.
 51 |         self.rot_y = 0.
 52 |         self.trn_z = -3.
 53 |         self.anim = 0.
 54 |         self.scale = 1.
 55 |         self.play = True
 56 | 
 57 |         self.width = 512
 58 |         self.height = 512
 59 | 
 60 |         self.mesh_w = 256
 61 |         self.mesh_h = 256
 62 |         self.m_size = self.mesh_w*self.mesh_h
 63 | 
 64 |     def runCuda(self):
 65 |         vptr = c_void_p()
 66 |         cudaGLMapBufferObject(byref(vptr),self.vbo)
 67 | 
 68 |         block = dim3(16,16,1)
 69 |         grid = dim3(self.mesh_w/block.x,self.mesh_h/block.y,1)
 70 |         cudaConfigureCall(grid,block,0,0)
 71 |         kernel(vptr,self.mesh_w,self.mesh_h,self.anim)
 72 | 
 73 |         cudaGLUnmapBufferObject(self.vbo)
 74 | 
 75 |     def initializeGL(self):
 76 |         glDisable(GL_DEPTH_TEST)
 77 |         self.createVBO()
 78 | 
 79 |     def paintGL(self):
 80 |         self.runCuda()
 81 | 
 82 |         glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT)
 83 | 
 84 |         glMatrixMode(GL_MODELVIEW)
 85 |         glLoadIdentity()
 86 |         glTranslatef(0.,0.,self.trn_z)
 87 |         glRotatef(self.rot_x,1.,0.,0.)
 88 |         glRotatef(self.rot_y,0.,1.,0.)
 89 |         s = self.scale
 90 |         glScalef(s,s,s)
 91 | 
 92 |         glBindBuffer(GL_ARRAY_BUFFER,self.vbo)
 93 |         glVertexPointer(4,GL_FLOAT,0,0)
 94 | 
 95 |         glEnableClientState(GL_VERTEX_ARRAY)
 96 |         glColor3f(1.,1.,.75)
 97 |         glDrawArrays(GL_POINTS,0,self.m_size)
 98 | 
 99 |         glDisableClientState(GL_VERTEX_ARRAY)
100 | 
101 |         if self.play:
102 |             self.anim += 0.01
103 | 
104 |     def resizeGL(self,width,height):
105 |         self.width = width
106 |         self.height = height
107 |         w = float(width)/float(height)
108 |         h = 1.
109 | 
110 |         glViewport(0,0,width,height)
111 |         glMatrixMode(GL_PROJECTION)
112 |         glLoadIdentity()
113 |         glFrustum(-w,w,-h,h,2.,10.)
114 | 
115 |         glMatrixMode(GL_MODELVIEW)
116 |         glLoadIdentity()
117 |         glTranslatef(0.,0.,-3.)
118 | 
119 |     def createVBO(self):
120 |         self.vbo = vbo = GLuint()
121 |         glGenBuffers(1,byref(vbo))
122 |         glBindBuffer(GL_ARRAY_BUFFER,vbo)
123 | 
124 |         size = self.m_size*Sf4
125 |         glBufferData(GL_ARRAY_BUFFER,size,0,GL_DYNAMIC_DRAW)
126 |         b_ptr = glMapBuffer(GL_ARRAY_BUFFER,GL_WRITE_ONLY)
127 |         glUnmapBuffer(GL_ARRAY_BUFFER)
128 |         glBindBuffer(GL_ARRAY_BUFFER,0)
129 |         cudaGLRegisterBufferObject(vbo)
130 | 
131 |     def deleteVBO(self):
132 |         vbo = self.vbo
133 |         glBindBuffer(1,vbo)
134 |         glDeleteBuffers(1,vbo)
135 | 
136 |         cudaGLUnregisterBufferObject(vbo)
137 |         self.vbo = None
138 | 
139 |     def keyPressEvent(self,ev):
140 |         key = ev.key()
141 |         if key == Qt.Key_Escape \
142 |         or key == Qt.Key_Q:
143 |             self.deleteVBO()
144 |             cudaThreadExit()
145 |             exit()
146 |         elif key == Qt.Key_R:
147 |             self.reset()
148 |         elif key == Qt.Key_S:
149 |             self.play = not self.play
150 |         elif key == Qt.Key_P:
151 |             image = self.grabFrameBuffer()
152 |             image.save("cuda_GLimg.png","PNG")
153 | 
154 |     def mousePressEvent(self,ev):
155 |         button = ev.button()
156 |         if button == Qt.LeftButton:
157 |             self.MouseButton = 0
158 |         elif button == Qt.RightButton:
159 |             self.MouseButton = 2
160 |         else:
161 |             self.MouseButton = 1
162 |         self.last_x = ev.x()
163 |         self.last_y = ev.y()
164 | 
165 |     def mouseReleaseEvent(self,ev):
166 |         self.MouseButton = -1
167 | 
168 |     def mouseMoveEvent(self,ev):
169 |         x,y = ev.x(),ev.y()
170 |         dx = x-self.last_x
171 |         dy = y-self.last_y
172 |         mouse = self.MouseButton
173 |         if mouse == 0:
174 |             self.rot_x += .2*dy
175 |             self.rot_y += .2*dx
176 |         elif mouse == 2:
177 |             self.trn_z += .01*dy
178 |         self.last_x = x
179 |         self.last_y = y
180 | 
181 |     def wheelEvent(self,ev):
182 |         delta = ev.delta() # usually +/- 120
183 |         scale = self.scale
184 |         if delta != 0:
185 |             scale += 6./float(delta)
186 |         self.scale = min(2.,max(.2,scale))
187 | 
188 |     def timerEvent(self,ev):
189 |         t = time.time()
190 |         self.frames +=  1
191 |         if (t-self.t0) >=  5.0:
192 |             seconds = t-self.t0
193 |             fps = self.frames/seconds
194 |             print "%d frames in %3.1f seconds = %6.1f fps" % (
195 |                 self.frames,seconds,fps);
196 |             self.frames = 0
197 |             self.t0 = t
198 |         self.updateGL() # needed
199 | 
200 | ##############################################################################
if __name__ == '__main__':
    # Create the Qt application and the demo widget, show the widget and
    # run the event loop until the application exits.
    from sys import argv
    app = QtGui.QApplication(argv)
    widget = CudaGLWidget()
    widget.show()
    app.exec_()
207 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_add.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | # XXX
  5 | # --
  6 | # Code forked from python-cuda-2.0_42 © Arno Pähler, 2007-08
  7 | # -- 
  8 | 
  9 | from ctypes import *
 10 | from time import time
 11 | 
 12 | # NP: simple vector addition example
 13 | # XXX should be easy ;-)
 14 | 
 15 | from cuda import cuda
# Print two attributes of the imported cuda module.
print cuda.cudaConfigureCall
print cuda.cudaThreadSynchronize

# Calls that main() below makes; the names are not imported by this file.
# NOTE(review): presumably a checklist for the port -- confirm.
#fixedInit(h_X)

#d_X = getMemory(h_X)
#d_Y = getMemory(h_Y)

#blockDim  = dim3(BLOCK_SIZE,1,1)
#gridDim   = dim3(GRID_SIZE,1,1)
#cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
#cudaFree(d_Y)
#cudaThreadExit()
#cudaSetDevice(0)
import sys
# The script stops here: nothing below this line is executed.
sys.exit(0)#raise
 32 | 
 33 | # 
 34 | #from cpuFunctions import fixedInit,cpuVADD,checkError
 35 | from gpuFunctions import gpuVADD
 36 | 
# x extent of the block dim3 built in main().
BLOCK_SIZE = 256
# x extent of the grid dim3 built in main().
GRID_SIZE  = 256
# Size in bytes of one C float; sizes the device-to-host copy in main().
S4 = sizeof(c_float)
# When True, main() also prints average and maximum relative errors.
checkErrorFlag = False
 41 | 
 42 | def main(vlength = 128,loops = 1):
 43 | 
 44 |     n2 = vlength ## Vector length
 45 | 
 46 |     h_X = (c_float*n2)()
 47 |     h_Y = (c_float*n2)()
 48 |     g_Y = (c_float*n2)()
 49 | 
 50 |     fixedInit(h_X)
 51 | 
 52 |     d_X = getMemory(h_X)
 53 |     d_Y = getMemory(h_Y)
 54 | 
 55 |     blockDim  = dim3(BLOCK_SIZE,1,1)
 56 |     gridDim   = dim3(GRID_SIZE,1,1)
 57 | 
 58 |     t0 = time()
 59 |     cudaThreadSynchronize()
 60 |     for i in range(loops):
 61 |         cudaConfigureCall(gridDim,blockDim,0,0)
 62 |         gpuVADD(d_X,d_Y,n2)
 63 |     cudaThreadSynchronize()
 64 |     t0 = time()-t0
 65 | 
 66 |     flops = (1.e-9*n2)*float(loops)
 67 |     g_Y = (c_float*n2)()
 68 |     cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
 69 |     cudaThreadSynchronize()
 70 | 
 71 |     cudaFree(d_X)
 72 |     cudaFree(d_Y)
 73 | 
 74 |     cudaThreadExit()
 75 |     t1 = time()
 76 |     for i in range(loops):
 77 |         cpuVADD(h_X,h_Y)
 78 |     t1 = time()-t1
 79 |     print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)
 80 | 
 81 |     if checkErrorFlag:
 82 |         err,mxe = checkError(h_Y,g_Y)
 83 |         print "Avg and max rel error = %.2e %.2e" % (err,mxe)
 84 | 
 85 | if __name__ == "__main__":
 86 |     import sys
 87 | 
 88 |     cudaSetDevice(0)
 89 | 
 90 |     lmin,lmax = 7,24
 91 |     if len(sys.argv) > 1:
 92 |         lmin = lmax = int(sys.argv[1])
 93 |     loopx = -1
 94 |     if len(sys.argv) > 2:
 95 |         loopx = int(sys.argv[2])
 96 |     lmax = min(max(0,lmax),24)
 97 |     lmin = min(max(0,lmin),lmax)
 98 |     for l in range(lmin,lmax+1):
 99 |         if l < 10:
100 |             loops = 25000
101 |         elif l < 17:
102 |             loops = 10000
103 |         elif l < 21:
104 |             loops = 250
105 |         else:
106 |             loops = 25
107 |         vlength = 1 << l
108 |         if loopx > 0:
109 |             loops = loopx
110 |         print "%5d %5d" % (l,loops),
111 |         main(vlength,loops)
112 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_blsc.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cuda.cuda_defs import *
  7 | from cuda.cuda_api import *
  8 | from cuda.cuda_utils import *
  9 | 
 10 | from cpuFunctions import randInit,checkError
 11 | from gpuFunctions import gpuBLSC
 12 | 
 13 | UseVML = False
 14 | if UseVML:
 15 |     from mklMath import cpuBLSC
 16 | else:
 17 |     from cpuFunctions import cpuBLSC
 18 | 
# x extent of the block dim3 built in main().
BLOCK_SIZE = 128
# x extent of the grid dim3 built in main().
GRID_SIZE  = 192
# When True, main() also prints the average relative errors.
checkErrorFlag = False

# Size in bytes of one C float; sizes the device-to-host copies in main().
S4 = sizeof(c_float)
 24 | 
 25 | def main(vlength = 128,loops = 1):
 26 | 
 27 |     n2 = vlength ## Vector length
 28 | 
 29 |     h_S = (c_float*n2)()
 30 |     h_X = (c_float*n2)()
 31 |     h_T = (c_float*n2)()
 32 |     h_C = (c_float*n2)()
 33 |     h_P = (c_float*n2)()
 34 | 
 35 | 
 36 |     randInit(h_S,5.,30.)
 37 |     randInit(h_X,1.,100.)
 38 |     randInit(h_T,.25,10.)
 39 |     R,V = .03,.3
 40 | 
 41 |     d_S = getMemory(h_S)
 42 |     d_X = getMemory(h_X)
 43 |     d_T = getMemory(h_T)
 44 |     d_C = getMemory(h_C)
 45 |     d_P = getMemory(h_P)
 46 | 
 47 |     blockDim  = dim3(BLOCK_SIZE,1,1)
 48 |     gridDim   = dim3(GRID_SIZE,1,1)
 49 | 
 50 |     cudaThreadSynchronize()
 51 |     t0 = time()
 52 |     for i in range(loops):
 53 |         cudaConfigureCall(gridDim,blockDim,0,0)
 54 |         gpuBLSC(d_C,d_P,d_S,d_X,d_T,R,V,n2)
 55 |     cudaThreadSynchronize()
 56 |     t0 = time()-t0
 57 | 
 58 |     flops = (2.e-6*n2)*float(loops)
 59 |     g_C = (c_float*n2)()
 60 |     g_P = (c_float*n2)()
 61 |     cudaMemcpy(g_C,d_C,S4*n2,cudaMemcpyDeviceToHost)
 62 |     cudaMemcpy(g_P,d_P,S4*n2,cudaMemcpyDeviceToHost)
 63 |     cudaThreadSynchronize()
 64 | 
 65 |     cudaFree(d_S)
 66 |     cudaFree(d_X)
 67 |     cudaFree(d_T)
 68 |     cudaFree(d_C)
 69 |     cudaFree(d_P)
 70 | 
 71 |     cudaThreadExit()
 72 |     t1 = time()
 73 |     for i in range(loops):
 74 |         cpuBLSC(h_C,h_P,h_S,h_X,h_T,R,V,n2)
 75 |     t1 = time()-t1
 76 |     print "%10d%10.2f%10.2f" % (vlength,flops/t1,flops/t0)
 77 | 
 78 |     if checkErrorFlag:
 79 |         err,mxe = checkError(h_C,g_C)
 80 |         print "Avg rel error (call) = %.2e" % (err,)
 81 |         err,mxe = checkError(h_P,g_P)
 82 |         print "Avg rel error (put)  = %.2e" % (err,)
 83 | 
 84 | if __name__ == "__main__":
 85 |     import sys
 86 | 
 87 |     cudaSetDevice(0)
 88 | 
 89 |     lmin,lmax = 7,23
 90 |     if len(sys.argv) > 1:
 91 |         lmin = lmax = int(sys.argv[1])
 92 |     lmax = min(max(0,lmax),23)
 93 |     lmin = min(max(0,lmin),lmax)
 94 |     for l in range(lmin,lmax+1):
 95 |         if l < 10:
 96 |             loops = 1000
 97 |         elif l < 13:
 98 |             loops = 500
 99 |         elif l < 17:
100 |             loops = 100
101 |         elif l < 21:
102 |             loops = 10
103 |         else:
104 |             loops = 5
105 |         loops = 2
106 |         vlength = 1 << l
107 |         print "%5d %5d" % (l,loops),
108 |         main(vlength,loops)
109 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_gflops.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | from ctypes import *
 4 | from time import time
 5 | 
 6 | from cuda.cuda_defs import *
 7 | from cuda.cuda_api import *
 8 | 
 9 | from cpuFunctions import cpuGFLOPS
10 | from gpuFunctions import gpuGFLOPS
11 | 
# Factors of the CPU operation count computed in main().
BLOCK_SIZE_C = 192
ITERATIONS_C = 512

# GPU launch shape (block and grid x extents) and, together with
# ITERATIONS_G, the factors of the GPU operation count in main().
BLOCK_SIZE_G = 512
GRID_SIZE_G  = 512
ITERATIONS_G = 512
# Size in bytes of one C float; not referenced elsewhere in this file.
S4 = sizeof(c_float)
19 | 
20 | # This is SUBSTANTIALLY slower than cu_gflops.py. Why?
21 | # Looping about 50 times almost as fast as cu_gflops.py.
22 | 
23 | def main(loops = 1):
24 | 
25 |     blockDim  = dim3(BLOCK_SIZE_G,1,1)
26 |     gridDim   = dim3(GRID_SIZE_G,1,1)
27 | 
28 |     t0 = time()
29 |     cudaThreadSynchronize()
30 |     for i in range(loops):
31 |         cudaConfigureCall(gridDim,blockDim,0,0)
32 |         gpuGFLOPS()
33 |     cudaThreadSynchronize()
34 |     t0 = time()-t0
35 |     cudaThreadExit()
36 | 
37 |     flopsc = 4096.*ITERATIONS_C*BLOCK_SIZE_C
38 |     flopsg = 4096.*ITERATIONS_G*BLOCK_SIZE_G*GRID_SIZE_G
39 |     flopsc *= 1.e-9*float(loops)
40 |     flopsg *= 1.e-9*float(loops)
41 | 
42 |     t1 = time()
43 |     for i in range(loops):
44 |         cpuGFLOPS()
45 |     t1 = time()-t1
46 | #    peakg = 4.*8.*2.*1.458 # 4MP*8SP/MP*2flops/SP/clock*clock[GHz] (8600GTS)
47 |     peakg = 14.*8.*2.*1.512 # 14MP*8SP/MP*2flops/SP/clock*clock[GHz] (9800GT)
48 |     print "%8.3f%8.2f%8.3f%8.2f [%.2f]" % (t1,flopsc/t1,t0,flopsg/t0,peakg)
49 |     print "%8.3f%8.2f" % (flopsc/t1*2.8,flopsg/t0*1.512/112)
50 | 
51 | if __name__ == "__main__":
52 |     import sys
53 | 
54 |     cudaSetDevice(0)
55 | 
56 |     loops = 1
57 |     if len(sys.argv) > 1:
58 |         loops = int(sys.argv[1])
59 |     print "%5d" % (loops),
60 |     main(loops)
61 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_poly.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cuda.cuda_defs import *
  7 | from cuda.cuda_api import *
  8 | from cuda.cuda_utils import *
  9 | 
 10 | from cpuFunctions import vectorInit,checkError
 11 | from cpuFunctions import cpuPOLY5,cpuPOLY10,cpuPOLY20,cpuPOLY40
 12 | from gpuFunctions import gpuPOLY5,gpuPOLY10,gpuPOLY20,gpuPOLY40
 13 | 
# x extent of the block dim3 built in main().
BLOCK_SIZE = 144
# x extent of the grid dim3 built in main().
GRID_SIZE  = 192
# When True, main() also prints average and maximum relative errors.
checkErrorFlag = False

# Size in bytes of one C float; sizes the device-to-host copy in main().
S4 = sizeof(c_float)
# Base polynomial size; main() multiplies it by 1, 2, 4 or 8 to pick
# one of the POLY5/10/20/40 functions.
psize = 5
 20 | 
 21 | def main(vlength = 128,loops = 1,m1 = 1):
 22 |     print "%5d %5d %5d" % (l,loops,m1),
 23 | 
 24 |     alfa = c_float(.5)
 25 |     n2 = vlength ## Vector length
 26 | 
 27 |     mp = 1 << (m1-1)
 28 |     print "%5d" % (mp*psize),
 29 |     gpuPOLY = eval("gpuPOLY%d"%(mp*psize))
 30 |     h_X = (c_float*n2)()
 31 |     h_Y = (c_float*n2)()
 32 |     g_Y = (c_float*n2)()
 33 | 
 34 |     vectorInit(h_X)
 35 | 
 36 |     d_X = getMemory(h_X)
 37 |     d_Y = getMemory(h_Y)
 38 | 
 39 |     blockDim  = dim3(BLOCK_SIZE,1,1)
 40 |     gridDim   = dim3(GRID_SIZE,1,1)
 41 | 
 42 |     t0 = time()
 43 |     cudaThreadSynchronize()
 44 |     for i in range(loops):
 45 |         cudaConfigureCall(gridDim,blockDim,0,0)
 46 |         gpuPOLY(d_X,d_Y,n2)
 47 |     cudaThreadSynchronize()
 48 |     t0 = time()-t0
 49 | 
 50 |     flops = (2.e-9*m1*n2*(psize-1))*float(loops)
 51 |     cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
 52 |     cudaThreadSynchronize()
 53 | 
 54 |     cudaFree(d_X)
 55 |     cudaFree(d_Y)
 56 | 
 57 |     cudaThreadExit()
 58 |     cpuPOLY = eval("cpuPOLY%d" % (mp*psize))
 59 |     t1 = time()
 60 |     for i in range(loops):
 61 |         cpuPOLY(h_X,h_Y)
 62 |     t1 = time()-t1
 63 |     print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)
 64 | 
 65 |     if checkErrorFlag:
 66 |         err,mxe = checkError(h_Y,g_Y)
 67 |         print "Avg and max rel error = %.2e %.2e" % (err,mxe)
 68 | 
 69 | if __name__ == "__main__":
 70 |     import sys
 71 | 
 72 |     cudaSetDevice(0)
 73 | 
 74 |     lmin,lmax = 7,23
 75 |     if len(sys.argv) > 1:
 76 |         lmin = lmax = int(sys.argv[1])
 77 |     loopx = -1
 78 |     if len(sys.argv) > 2:
 79 |         loopx = int(sys.argv[2])
 80 |     m1 = 4
 81 |     if len(sys.argv) > 3:
 82 |         m1 = min(4,int(sys.argv[3]))
 83 |     lmax = min(max(0,lmax),23)
 84 |     lmin = min(max(0,lmin),lmax)
 85 | 
 86 |     for l in range(lmin,lmax+1):
 87 |         if l < 10:
 88 |             loops = 10000/m1
 89 |         elif l < 13:
 90 |             loops = 5000/m1
 91 |         elif l < 17:
 92 |             loops = 500/m1
 93 |         elif l < 21:
 94 |             loops = 250/m1
 95 |         else:
 96 |             loops = 100/m1
 97 |         vlength = 1 << l
 98 |         if loopx > 0:
 99 |             loops = loopx
100 |         main(vlength,loops,m1)
101 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_saxpy.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | from ctypes import *
 4 | from time import time
 5 | 
 6 | from cuda.cuda_defs import *
 7 | from cuda.cuda_api import *
 8 | from cuda.cuda_utils import *
 9 | 
10 | from cpuFunctions import fixedInit,cpuSAXPY,checkError
11 | from gpuFunctions import gpuSAXPY
12 | 
# x extent of the block dim3 built in main().
BLOCK_SIZE = 64
# x extent of the grid dim3 built in main().
GRID_SIZE  = 256
# Size in bytes of one C float; sizes the device-to-host copy in main().
S4 = sizeof(c_float)
# When True, main() also prints average and maximum relative errors.
checkErrorFlag = False
17 | 
18 | def main(vlength = 128,loops = 1):
19 | 
20 |     alfa = c_float(.5)
21 |     n2 = vlength ## Vector length
22 | 
23 |     h_X = (c_float*n2)()
24 |     h_Y = (c_float*n2)()
25 |     g_Y = (c_float*n2)()
26 | 
27 |     fixedInit(h_X)
28 | 
29 |     d_X = getMemory(h_X)
30 |     d_Y = getMemory(h_Y)
31 | 
32 |     blockDim  = dim3(BLOCK_SIZE,1,1)
33 |     gridDim   = dim3(GRID_SIZE,1,1)
34 | 
35 |     t0 = time()
36 |     cudaThreadSynchronize()
37 |     for i in range(loops):
38 |         cudaConfigureCall(gridDim,blockDim,0,0)
39 |         gpuSAXPY(alfa,d_X,d_Y,n2)
40 |     cudaThreadSynchronize()
41 |     t0 = time()-t0
42 | 
43 |     flops = (2.e-9*n2)*float(loops)
44 |     g_Y = (c_float*n2)()
45 |     cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
46 |     cudaThreadSynchronize()
47 | 
48 |     cudaFree(d_X)
49 |     cudaFree(d_Y)
50 | 
51 |     cudaThreadExit()
52 |     t1 = time()
53 |     for i in range(loops):
54 |         cpuSAXPY(alfa,h_X,h_Y)
55 |     t1 = time()-t1
56 |     print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0)
57 | 
58 |     if checkErrorFlag:
59 |         err,mxe = checkError(h_Y,g_Y)
60 |         print "Avg and max rel error = %.2e %.2e" % (err,mxe)
61 | 
62 | if __name__ == "__main__":
63 |     import sys
64 | 
65 |     cudaSetDevice(0)
66 | 
67 |     lmin,lmax = 7,24
68 |     if len(sys.argv) > 1:
69 |         lmin = lmax = int(sys.argv[1])
70 |     loopx = -1
71 |     if len(sys.argv) > 2:
72 |         loopx = int(sys.argv[2])
73 |     lmax = min(max(0,lmax),24)
74 |     lmin = min(max(0,lmin),lmax)
75 |     for l in range(lmin,lmax+1):
76 |         if l < 10:
77 |             loops = 25000
78 |         elif l < 17:
79 |             loops = 10000
80 |         elif l < 21:
81 |             loops = 250
82 |         else:
83 |             loops = 25
84 |         vlength = 1 << l
85 |         if loopx > 0:
86 |             loops = loopx
87 |         print "%5d %5d" % (l,loops),
88 |         main(vlength,loops)
89 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_sgemm.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cuda.cuda_api import *
  7 | from cuda.cuda_defs import *
  8 | from cuda.cuda_utils import *
  9 | 
 10 | from cpuFunctions import arrayInit,cpuSGEMM,checkError
 11 | from gpuFunctions import gpuSGEMM
 12 | from ctypes_array import *
 13 | 
# Selects the CPU reference implementation that main() calls as `sgemm`.
useSciPy = True
if useSciPy:
    from scipy.lib.blas.fblas import sgemm as _sgemm
    # C : A*B (on the GPU)
    # F : (A*B).T = B.T * A.T (scipy)
    def sgemm(z,x,y,m,n,k):
        """Compute the reference product through the scipy sgemm routine.

        z       -- output array; receives the result and is returned
        x, y    -- input arrays, viewed as (m,k) and (k,n) through convert()
        m, n, k -- matrix dimensions

        The operands are transposed and swapped before the call, as the
        identity in the comment above describes.
        NOTE(review): the exact behaviour of convert() is defined in
        ctypes_array and not visible here -- confirm.
        """
        nx = convert(x,(m,k)).T
        ny = convert(y,(k,n)).T
        nz = _sgemm(1.,ny,nx)
        convert(nz,out=z)
        return z
else:
    # C : A*B (on the CPU) (in C)
    sgemm = cpuSGEMM
 28 | 
# Edge of the square thread block (16); main() also divides the matrix
# dimensions by it to get the grid size.
BLOCK_SIZE  = 1 << 4
# Size in bytes of one C float.
S4 = sizeof(c_float)
 31 | 
 32 | def main(N = 1024,L = 100):
 33 |     M = N
 34 |     K = N >> 1
 35 |     N = N << 1
 36 |     flops = (2.e-9*M*N)*float(K*L)
 37 |     print "M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops)
 38 |     na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0.
 39 | 
 40 |     t0 = time()
 41 | 
 42 |     sizeA = M*K
 43 |     sizeB = K*N
 44 |     sizeC = M*N
 45 | 
 46 |     h_A = (c_float*sizeA)()
 47 |     h_B = (c_float*sizeB)()
 48 | 
 49 |     arrayInit(h_A)
 50 |     arrayInit(h_B)
 51 | 
 52 |     d_A = getMemory(h_A)
 53 |     d_B = getMemory(h_B)
 54 |     d_C = getMemory(sizeC)
 55 | 
 56 |     blockDim  = dim3(BLOCK_SIZE,BLOCK_SIZE,1)
 57 |     gridDim   = dim3(N/BLOCK_SIZE,M/BLOCK_SIZE,1)
 58 |     sharedMem = S4*2*BLOCK_SIZE*BLOCK_SIZE
 59 |     tt = t0 = time()-t0
 60 |     print "Overhead runtime API: %.3f sec\n" % t0
 61 | 
 62 |     t0 = time()
 63 |     cudaThreadSynchronize()
 64 |     for i in range(L):
 65 |         cudaConfigureCall(gridDim,blockDim,sharedMem,0)
 66 |         gpuSGEMM(d_C,d_A,d_B,K,N)
 67 |     cudaThreadSynchronize()
 68 |     t0 = time()-t0
 69 |     tt += t0
 70 | 
 71 |     h_C = (c_float*sizeC)()
 72 |     cudaMemcpy(h_C,d_C,S4*sizeC,cudaMemcpyDeviceToHost)
 73 | 
 74 |     cudaThreadSynchronize()
 75 | 
 76 |     cudaFree(d_A)
 77 |     cudaFree(d_B)
 78 |     cudaFree(d_C)
 79 | 
 80 |     cudaThreadExit()
 81 |     print "Processing time: %.3g (%.3g) sec" % (t0,tt)
 82 |     print "Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt)
 83 | 
 84 |     ref = (c_float*sizeC)()
 85 | 
 86 |     t1 = time()
 87 |     for i in range(L):
 88 |         sgemm(ref,h_A,h_B,M,N,K)
 89 |     t1 = time()-t1
 90 |     print "\nProcessing time: %.3g sec" % t1
 91 |     print "Gigaflops CPU  : %.2f" % (flops/t1)
 92 |     print "Speedup GPU/CPU: %.2f" % (t1/t0)
 93 | 
 94 |     err,mxe = checkError(ref,h_C)
 95 |     print "\nAvg and max rel error = %.2e %.2e" % (err,mxe)
 96 | 
 97 | if __name__ == "__main__":
 98 |     import sys
 99 | 
100 |     cudaSetDevice(0)
101 | 
102 |     M, L = 1024, 100
103 |     if len(sys.argv) > 1:
104 |         M = int(sys.argv[1])
105 |     M = (M >> 5) << 5 # multiple of (2*BLOCK_SIZE)
106 |     if len(sys.argv) > 2:
107 |         L = int(sys.argv[2])
108 | 
109 |     print "+-----------------------+"
110 |     print "| Matrix Multiplication |"
111 |     print "|     using CUDA API    |"
112 |     print "+-----------------------+\n"
113 |     main(M,L)
114 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_streams.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | 
  5 | from cuda.cuda_defs import *
  6 | from cuda.cuda_api import *
  7 | from cuda.cuda_utils import *
  8 | 
  9 | from ctypes_array import *
 10 | from numpy import all,int32,zeros
 11 | 
 12 | from gpuFunctions import init_array
 13 | 
 14 | MB = 1024*1024
 15 | SI = sizeof(c_int)
 16 | 
 17 | def check_results(a,n,c):
 18 |     u = (c_int*n).from_address(a.value)
 19 |     a = convert(u,(n,))
 20 |     c = c.value
 21 |     return all(a==c)
 22 | 
 23 | def main():
 24 |     nstreams = 8
 25 |     nreps = 10
 26 |     n = 16*MB
 27 |     nbytes = n*SI
 28 | 
 29 |     count = c_int()
 30 |     cudaGetDeviceCount(byref(count))
 31 |     if count == 0:
 32 |         print "no CUDA capable device found"
 33 |         return
 34 | 
 35 |     props = cudaDeviceProp()
 36 |     cudaGetDeviceProperties(byref(props),0)
 37 |     if props.major == 1 and props.minor < 1:
 38 |         print "%s does not support streams" % props.name
 39 |         return
 40 | 
 41 |     u = zeros((n,),dtype=int32)+5
 42 |     x = convert(u)
 43 |     c = c_int(x[0])
 44 |     a = c_void_p()
 45 |     cudaMallocHost(byref(a),nbytes)
 46 | 
 47 |     d_a = getMemory(n)
 48 |     d_c = getMemory(x)
 49 | 
 50 |     streams = (cudaStream_t*nstreams)()
 51 |     for i in range(nstreams):
 52 |         stream = cudaStream_t()
 53 |         cudaStreamCreate(byref(stream))
 54 |         streams[i] = stream.value
 55 | 
 56 |     ev_start = cudaEvent_t()
 57 |     ev_stop = cudaEvent_t()
 58 |     cudaEventCreate(byref(ev_start))
 59 |     cudaEventCreate(byref(ev_stop))
 60 | 
 61 |     cudaEventRecord(ev_start,0)
 62 |     cudaMemcpyAsync(a,d_a,nbytes,cudaMemcpyDeviceToHost,streams[0])
 63 |     cudaEventRecord(ev_stop,0)
 64 |     cudaEventSynchronize(ev_stop)
 65 |     t_copy = c_float()
 66 |     cudaEventElapsedTime(byref(t_copy),ev_start,ev_stop)
 67 |     t_copy = t_copy.value
 68 | 
 69 |     threads=dim3(512,1,1)
 70 |     blocks=dim3(n/threads.x,1,1)
 71 |     cudaEventRecord(ev_start,0)
 72 |     cudaConfigureCall(blocks,threads,0,streams[0])
 73 |     init_array(d_a,d_c)
 74 |     cudaEventRecord(ev_stop,0)
 75 |     cudaEventSynchronize(ev_stop)
 76 |     t_kernel = c_float()
 77 |     cudaEventElapsedTime(byref(t_kernel),ev_start,ev_stop)
 78 |     t_kernel = t_kernel.value
 79 | 
 80 |     threads=dim3(512,1,1)
 81 |     blocks=dim3(n/threads.x,1,1)
 82 |     cudaEventRecord(ev_start,0)
 83 |     for i in range(nreps):
 84 |         cudaConfigureCall(blocks,threads,0,0)
 85 |         init_array(d_a,d_c)
 86 |         cudaMemcpy(a,d_a,nbytes,cudaMemcpyDeviceToHost)
 87 |     cudaEventRecord(ev_stop,0)
 88 |     cudaEventSynchronize(ev_stop)
 89 |     elapsed0 = c_float()
 90 |     cudaEventElapsedTime(byref(elapsed0),ev_start,ev_stop)
 91 |     elapsed0 = elapsed0.value
 92 | 
 93 |     threads = dim3(512,1,1)
 94 |     blocks = dim3(n/(nstreams*threads.x),1,1)
 95 |     memset(a,255,nbytes)
 96 |     cudaMemset(d_a,0,nbytes)
 97 |     cudaEventRecord(ev_start,0)
 98 |     a_0 = a.value
 99 |     off = n*SI/nstreams
100 |     for k in range(nreps):
101 |         for i in range(nstreams):
102 |             cudaConfigureCall(blocks,threads,0,streams[i])
103 |             init_array(d_a+i*n*SI/nstreams,d_c)
104 |         for i in range(nstreams):
105 |             ai = a_0+i*off
106 |             di = d_c+i*off
107 |             cudaMemcpyAsync(ai,di,nbytes/nstreams,
108 |                 cudaMemcpyDeviceToHost,streams[i])
109 |     cudaEventRecord(ev_stop,0)
110 |     cudaEventSynchronize(ev_stop)
111 |     elapsed1 = c_float()
112 |     cudaEventElapsedTime(byref(elapsed1),ev_start,ev_stop)
113 |     elapsed1 = elapsed1.value
114 | 
115 |     passed = check_results(a,n,c)
116 | 
117 |     for i in range(nstreams):
118 |         cudaStreamDestroy(streams[i])
119 |     cudaEventDestroy(ev_start)
120 |     cudaEventDestroy(ev_stop)
121 | 
122 |     cudaFree(d_a)
123 |     cudaFree(d_c)
124 |     cudaFreeHost(a)
125 |     cudaThreadExit()
126 | 
127 |     print "memcopy:\t%.2f" % t_copy
128 |     print "kernel:\t\t%.2f" % t_kernel
129 |     print "non-streamed:\t%.2f (%.2f expected)" % (
130 |         elapsed0/nreps,t_kernel+t_copy)
131 |     print "%d streams:\t%.2f (%.2f expected)" % (
132 |         nstreams,elapsed1/nreps,t_kernel+t_copy/nstreams)
133 | 
134 |     print "-------------------------------"
135 |     if passed:
136 |         print "Test PASSED"
137 |     else:
138 |         print "Test FAILED"
139 | 
# Script entry point: select CUDA device 0, then run the stream benchmark.
if __name__ == "__main__":
    cudaSetDevice(0)
    main()
143 | 


--------------------------------------------------------------------------------
/tests/cuda/todo/cuda_trig.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | from ctypes import *
  4 | from time import time
  5 | 
  6 | from cuda.cu_defs import *
  7 | from cuda.cu_api import *
  8 | from cuda.cuda_utils import *
  9 | 
 10 | from cpuFunctions import vectorInit,checkError
 11 | from gpuFunctions import gpuTRIG
 12 | 
# Select the CPU reference implementation of cpuTRIG: the mklMath module
# when UseVML is set, otherwise the one from cpuFunctions.
UseVML = True
if UseVML:
    from mklMath import cpuTRIG
else:
    from cpuFunctions import cpuTRIG

# Launch configuration used by main(): block and grid dimensions (x only).
BLOCK_SIZE = 128
GRID_SIZE  = 192
# When True, main() also prints average/maximum relative errors.
checkErrorFlag = False

# Size in bytes of one c_float element.
S4 = sizeof(c_float)
 24 | 
 25 | def main(vlength = 128,loops = 1):
 26 | 
 27 |     n2 = vlength ## Vector length
 28 | 
 29 |     h_X = (c_float*n2)()
 30 |     h_Y = (c_float*n2)()
 31 |     h_Z = (c_float*n2)()
 32 | 
 33 |     vectorInit(h_X)
 34 | 
 35 |     d_X = getMemory(h_X)
 36 |     d_Y = getMemory(h_Y)
 37 |     d_Z = getMemory(h_Z)
 38 | 
 39 |     blockDim  = dim3(BLOCK_SIZE,1,1)
 40 |     gridDim   = dim3(GRID_SIZE,1,1)
 41 | 
 42 |     t0 = time()
 43 |     cudaThreadSynchronize()
 44 |     for i in range(loops):
 45 |         cudaConfigureCall(gridDim,blockDim,0,0)
 46 |         gpuTRIG(d_Y,d_Z,d_X,n2)
 47 |     cudaThreadSynchronize()
 48 |     t0 = time()-t0
 49 | 
 50 |     flops = (2.e-9*n2)*float(loops)
 51 |     g_Y = (c_float*n2)()
 52 |     cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
 53 |     cudaThreadSynchronize()
 54 | 
 55 |     flops = (8.e-9*n2)*float(loops)
 56 |     g_Y = (c_float*n2)()
 57 |     g_Z = (c_float*n2)()
 58 |     cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost)
 59 |     cudaMemcpy(g_Z,d_Z,S4*n2,cudaMemcpyDeviceToHost)
 60 |     cudaThreadSynchronize()
 61 | 
 62 |     cudaFree(d_X)
 63 |     cudaFree(d_Y)
 64 |     cudaFree(d_Z)
 65 | 
 66 |     cudaThreadExit()
 67 |     t1 = time()
 68 |     for i in range(loops):
 69 |         cpuTRIG(h_Y,h_Z,h_X)
 70 |     t1 = time()-t1
 71 |     print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0)
 72 | 
 73 |     if checkErrorFlag:
 74 |         err,mxe = checkError(h_Y,g_Y,n2)
 75 |         print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe)
 76 |         err,mxe = checkError(h_Z,g_Z,n2)
 77 |         print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
 78 | 
 79 | if __name__ == "__main__":
 80 |     import sys
 81 | 
 82 |     cudaSetDevice(0)
 83 | 
 84 |     lmin,lmax = 7,23
 85 |     if len(sys.argv) > 1:
 86 |         lmin = lmax = int(sys.argv[1])
 87 |     lmax = min(max(0,lmax),23)
 88 |     lmin = min(max(0,lmin),lmax)
 89 |     for l in range(lmin,lmax+1):
 90 |         if l < 10:
 91 |             loops = 10000
 92 |         elif l < 13:
 93 |             loops = 2000
 94 |         elif l < 17:
 95 |             loops = 250
 96 |         elif l < 21:
 97 |             loops = 100
 98 |         else:
 99 |             loops = 50
100 |         vlength = 1 << l
101 |         print "%5d %5d" % (l,loops),
102 |         main(vlength,loops)
103 | 


--------------------------------------------------------------------------------
/tests/cufft/cufft_fft.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | import numpy.fft
 4 | from numpy.random import randn
 5 | from numpy import allclose
 6 | import cuda.sugar.fft
 7 |         
 8 | def main():
 9 |     print "-"*55
10 |     print "--                                                   --"
11 |     print "--  python-cuda versions of numpy.fft.{fftn,ifftn}   --"
12 |     print "--                                                   --"
13 |     print "-"*55
14 |     print ">>> Creating host signal..."
15 | 
16 |     try:
17 |         size = int(sys.argv[1])
18 |     except Exception,e:
19 |         size = 10
20 | 
21 |     print ">>> Signal Size = %s" % size
22 | 
23 |     numpy_array = randn(size).astype('complex64')
24 |     numpy_array -= numpy_array.mean()
25 |     numpy_array /= numpy_array.std()
26 | 
27 |     print ">>> Computing ffts on GPU (CUDA) ..."
28 | 
29 |     print "[*] Forward fft on gpu ..."
30 |     fft_res = cuda.sugar.fft.fftn(numpy_array)
31 | 
32 |     print "[*] Inverse fft on gpu ..."
33 |     ifft_res = cuda.sugar.fft.ifftn(fft_res) 
34 | 
35 |     print ">>> Computing references on CPU (numpy) ..."
36 | 
37 |     print "[*] Forward fft on cpu ..."
38 |     forward_ref = numpy.fft.fftn(numpy_array)
39 | 
40 |     print "[*] Inverse fft on cpu ..."
41 |     inverse_ref = numpy.fft.ifftn(forward_ref)
42 |     
43 |     print "l2norm fft: ", numpy.linalg.norm(fft_res - forward_ref)
44 | 
45 |     print "l2norm ifft: ", numpy.linalg.norm(ifft_res - inverse_ref)
46 | 
# Run the GPU-vs-CPU FFT comparison when executed as a script.
if __name__ == "__main__":
    main()
49 | 


--------------------------------------------------------------------------------
/tests/cufft/fftlab.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | #
  3 | # fftlab.py --- Demonstrates 2d ffts and convolutions
  4 | #
  5 | 
  6 | import os,sys
  7 | 
  8 | import logging
  9 | logger = logging.getLogger(os.path.basename(__file__))
 10 | info = logger.info
 11 | 
 12 | from qt import *
 13 | QApplication.setColorSpec(QApplication.NormalColor)
 14 | app = QApplication(sys.argv)
 15 | 
 16 | from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg as FigureCanvas
 17 | from matplotlib.figure import Figure
 18 | 
 19 | import numpy as np
 20 | from scipy import lena
 21 | from scipy.signal import fftconvolve, convolve2d
 22 | from pylab import fftshift
 23 | 
 24 | from cuda.sugar.fft import fftconvolve2d, check_results
 25 | 
 26 | # Required for PyQt
 27 | TRUE  = 1
 28 | FALSE = 0
 29 | 
 30 | PROGNAME = "fftlab"
 31 | PROG_VERSION = "0.1"
 32 | 
 33 | class MplCanvas(FigureCanvas):
 34 |     """Ultimately, this is a QWidget (as well as a FigureCanvasAgg, etc.)."""
 35 |     def __init__(self, parent=None, width=5, height=4, dpi=100):
 36 |         self.fig = Figure(figsize=(width, height), dpi=dpi)
 37 |         self.axes = self.fig.add_subplot(111)
 38 |         # We want the axes cleared every time plot() is called
 39 |         self.axes.hold(False)
 40 | 
 41 |         self.compute_initial_figure()
 42 |         
 43 |         FigureCanvas.__init__(self, self.fig)
 44 |         self.reparent(parent, QPoint(0, 0))
 45 | 
 46 |         FigureCanvas.setSizePolicy(self,
 47 |                                    QSizePolicy.Expanding,
 48 |                                    QSizePolicy.Expanding)
 49 |         FigureCanvas.updateGeometry(self)
 50 | 
 51 |     def sizeHint(self):
 52 |         w, h = self.get_width_height()
 53 |         return QSize(w, h)
 54 | 
 55 |     def minimumSizeHint(self):
 56 |         return QSize(10, 10)
 57 | 
class ImageCanvas(MplCanvas):
    """Simple canvas for matshow'ing a 2d image array"""
    def __init__(self, numpy_array, parent=None, width=5, height=4, dpi=100):
        # Only two-dimensional arrays are accepted.
        assert numpy_array.ndim == 2
        # Must be stored before the base initialiser runs: that initialiser
        # calls compute_initial_figure(), which reads self.data.
        self.data = numpy_array
        super(ImageCanvas,self).__init__(parent, width, height, dpi)

    def compute_initial_figure(self):
        # Draw the stored array on the canvas axes.
        self.axes.matshow(self.data)
 67 | 
class FFTLab(QMainWindow):
    """Main window: shows an image, a kernel and their GPU/CPU convolutions.

    All computation happens in __init__: the test image is convolved with
    a 6x6 kernel of ones both through fftconvolve2d (python-cuda) and
    through scipy.signal.fftconvolve, the two results are compared with
    check_results, and all four arrays are displayed in a 2x2 grid.
    """
    def __init__(self):
        QMainWindow.__init__(self, None,
                             "FFTLab Main Window",
                             Qt.WType_TopLevel | Qt.WDestructiveClose)

        # File menu with a Quit entry bound to Ctrl+Q.
        self.file_menu = QPopupMenu(self)
        self.file_menu.insertItem('&Quit', self.file_quit, Qt.CTRL + Qt.Key_Q)
        self.menuBar().insertItem('&File', self.file_menu)

        # Help menu with an About entry.
        self.help_menu = QPopupMenu(self)
        self.menuBar().insertSeparator()
        self.menuBar().insertItem('&Help', self.help_menu)

        self.help_menu.insertItem('&About', self.about)

        self.main_widget = QWidget(self, "Main widget")

        # Input image scaled by 1/255 and an all-ones 6x6 kernel, both
        # stored as complex64.
        data = ((lena()/255.)).astype("complex64")
        kernel = np.ones((6,6)).astype("complex64")
        #data = np.random.uniform(0,1,(8,8)).astype("complex64")
        #kernel = np.random.uniform(0,1,(7,7)).astype("complex64")
        #power_spec = fftshift(log(abs(signal.fftn(data))))

        # GPU result from python-cuda, CPU reference from scipy ("valid" mode,
        # real parts only).
        gpu_conv = fftconvolve2d(data,kernel)
        cpu_conv = fftconvolve(data.real, kernel.real, mode="valid")

        info("GPU shape = (%s, %s)" % gpu_conv.shape)
        info("CPU shape = (%s, %s)" % cpu_conv.shape)
        
        check_results(cpu_conv, gpu_conv)

        # One canvas per array to display.
        data_c = ImageCanvas(data.real, self.main_widget)
        kernel_c = ImageCanvas(kernel.real, self.main_widget)
        gpu_conv_c = ImageCanvas(gpu_conv, self.main_widget)
        cpu_conv_c = ImageCanvas(cpu_conv, self.main_widget)
        #power_spec = ImageCanvas(power_spec,self.main_widget)

        data_label = QLabel("Input Data (lena)", self.main_widget)
        data_label.setAlignment(QLabel.AlignCenter)
        kernel_label = QLabel("Convolution Kernel", self.main_widget)
        kernel_label.setAlignment(QLabel.AlignCenter)
        gpu_conv_label = QLabel("GPU fftconvolve (CUDA)", self.main_widget)
        gpu_conv_label.setAlignment(QLabel.AlignCenter)
        cpu_conv_label = QLabel("CPU fftconvolve (NumPy)", self.main_widget)
        cpu_conv_label.setAlignment(QLabel.AlignCenter)

        # Grid layout: rows 0 and 2 hold labels, rows 1 and 3 the canvases.
        g = QGridLayout(self.main_widget)
        g.addWidget(data_label,0,0)
        g.addWidget(kernel_label,0,1)
        g.addWidget(data_c,1,0)
        g.addWidget(kernel_c,1,1)
        g.addWidget(gpu_conv_label,2,0)
        g.addWidget(cpu_conv_label,2,1)
        g.addWidget(gpu_conv_c,3,0)
        g.addWidget(cpu_conv_c,3,1)

        self.main_widget.setFocus()
        self.setCentralWidget(self.main_widget)

        # Show program name and version in the status bar (timeout 2000).
        self.statusBar().message("%s - v%s" % (PROGNAME, PROG_VERSION) , 2000)

    def file_quit(self):
        # Leave the Qt event loop with exit code 0.
        qApp.exit(0)

    def about(self):
        # Show the About dialog with program name and version filled in.
        QMessageBox.about(self, "About %s" % PROGNAME,
u"""%(prog)s version %(version)s

This application visualizes 2d ffts and convolutions 
using python-cuda by Justin Riley and Nicolas Pinto @ (MIT)
""" % {"prog": PROGNAME, "version": PROG_VERSION})
140 | 
def main():
    """Create the FFTLab window, show it and run the Qt event loop.

    The process exits with the status returned by the event loop.
    """
    window = FFTLab()
    window.setCaption("%s" % PROGNAME)
    qApp.setMainWidget(window)
    window.show()
    status = qApp.exec_loop()
    sys.exit(status)
147 | 
# Start the GUI when executed as a script.
if __name__ == "__main__": main()
149 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/bfft.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | 
 4 | from math import log
 5 | 
 6 | import sys,time
 7 | from ctypes import c_float
 8 | 
 9 | import xfft
10 | from cpuFunctions import arrayInit,checkError
11 | 
12 | try:
13 |     kr = int(sys.argv[1])
14 |     dims = tuple([int(x) for x in sys.argv[2].split(",")])
15 | except IndexError:
16 |     sys.exit()
17 | 
18 | doComplex = False
19 | if kr < 0:
20 |     kr = -kr
21 |     doComplex = True
22 | 
23 | size = reduce(lambda x,y:x*y,dims)
24 | if doComplex:
25 |     r = (c_float*(size*2))()
26 | else:
27 |     r = (c_float*size)()
28 | arrayInit(r)
29 | 
30 | sz = 1.e6/float(size)
31 | 
32 | fftw_start = time.clock()
33 | wall_start = time.time()
34 | 
35 | 
36 | xr = float(.5 )/float(kr)
37 | 
38 | if doComplex:
39 |     text = "complex"
40 |     rcfftx = xfft.ccfft
41 |     crfftx = xfft.icfft
42 | else:
43 |     text = "   real"
44 |     rcfftx = xfft.rcfft
45 |     crfftx = xfft.crfft
46 | for k in range(0,kr):
47 |     c = rcfftx(r,dims)
48 |     z = crfftx(c,dims)
49 | 
50 | fftw_end = time.clock()
51 | wall_end = time.time()
52 | 
53 | dif = fftw_end - fftw_start
54 | wif = wall_end - wall_start
55 | print "\nfft elapsed real time     : %8.3f seconds" % wif
56 | print "%d-D %s-to-complex fft: %8.3f seconds" % (len(dims),text,dif*xr)
57 | 
58 | flops = 2.*5.e-9*log(size)*size*kr/log(2.)
59 | print "Performance               : %8.3f GFlops" % (flops/wif)
60 | dif = dif * xr * sz
61 | print "%d-D %s-to-complex fft: %8.3f µs/point\n" % (len(dims),text,dif)
62 | 
63 | rz = 1./size
64 | err,mxe = checkError(r,z)
65 | print "avg and max error         : %8.1e %8.1e" %  (err,mxe)
66 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/cu_fft.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | 
  4 | from ctypes import *
  5 | from math import log
  6 | from time import time
  7 | 
  8 | from cuda.cufft_defs import *
  9 | from cuda.cufft_api import *
 10 | from cuda.cu_utils import *
 11 | 
 12 | import xfft as xf
 13 | import gfft_cu as gf
 14 | from cpuFunctions import arrayInit,checkError,scale
 15 | from cpuFunctions import ReadTimestampCounter
 16 | 
def main(check=False,doComplex=False,dims=(128,)):
    """Time a forward+inverse FFT round trip on the GPU (driver API).

    check     : also run the same round trip through xfft on the CPU and
                print timing and error comparisons
    doComplex : use complex-to-complex transforms instead of
                real-to-complex / complex-to-real
    dims      : array dimensions (the format table below covers 1 to 3)

    Prints timings, rates and error statistics, ending with a summary
    line that starts with "##".
    """
    print "+------------------------+"
    print "| Fast Fourier Transform |"
    print "| using CUDA driver API  |"
    print "+------------------------+\n"
    dims = tuple(dims)
    ndim = len(dims)
    v = ("","NX = %d","NX = %d NY = %d","NX = %d NY = %d NZ = %d")
    # SC: number of points; SR: float count with the last dimension
    # replaced by 2*(dims[-1]/2+1).
    SC = reduce(lambda x,y:x*y,dims)
    SR = reduce(lambda x,y:x*y,dims[:-1],1)
    SR *= 2*(dims[-1]/2+1)

    print v[ndim] % dims
    print "< doComplex: %s >\n" % doComplex

    # rz is the factor applied to the round-trip result by scale() below.
    rz = 1./float(SC)
    flops = 2.*5.*SC*log(SC)/log(2.)*1.e-9
    if doComplex:
        SC *= 2
    S4 = sizeof(c_float)

    # sz is computed but not used later in this function.
    if doComplex:
        sz = S4*(SC+SC)/(1024*1024)
    else:
        sz = S4*(SC+SR)/(1024*1024)

    h_A = (c_float*SC)()
    g_A = (c_float*SC)()
    arrayInit(h_A)

    d_A = getMemory(h_A)
    # With allocate set, the output buffers are created here and passed
    # to the transform helpers instead of being allocated by them.
    allocate = True

    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)

    if doComplex:
        plan = gf.makePlan(dims,CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims,CUFFT_R2C)
        plan2 = gf.makePlan(dims,CUFFT_C2R)

    t0 = time()
    x0 = ReadTimestampCounter()
    cuCtxSynchronize()

    if doComplex:
        d_B = gf.ccfft(plan,d_A,None,d_B)
        d_A = gf.icfft(plan,d_B,None,d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1,d_A,None,d_B)
            d_A = gf.crfft(plan2,d_B,None,d_A)
        else:
            d_B = gf.rcfft(plan1,d_A,SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2,d_B,SR)

    cuCtxSynchronize()
    t0 = time()-t0
    x1 = ReadTimestampCounter()
    # NOTE(review): fc looks like a conversion for a 2.8 GHz timestamp
    # counter -- confirm before relying on the printed value.
    fc = 1.e-3/2.8
    print "RDTSC: %.0f µs" % ((x1-x0)*fc)

    cuMemcpyDtoH(g_A,d_A,S4*SC)

    cuMemFree(d_A)
    cuMemFree(d_B)

    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)

    scale(g_A,rz)

    print "\nProcessing time: %.3g sec" % t0
    print "Gigaflops GPU  : %.2f" % (flops/t0)
    gflops = (flops/t0,)

    print "\nError CPU initial vs GPU"
    err,mxe = checkError(h_A,g_A)
    stats = err,mxe
    print "Avg and max rel error = %.2e %.2e\n" % (err,mxe)

    # Placeholder (CPU rate, speedup) reported when no CPU check is run.
    f = (-1.,-1.)
    if check:
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A,dims)
            h_B = xf.icfft(h_B,dims)
        else:
            h_B = xf.rcfft(h_A,dims)
            h_B = xf.crfft(h_B,dims)
        t1 = time()-t1
        print "Processing time: %.3g sec" % t1
        print "Gigaflops CPU  : %.2f" % (flops/t1)
        print "Speedup GPU/CPU: %.2f" % (t1/t0)

        print "\nError CPU final vs CPU initial"
        err,mxe = checkError(h_B,h_A)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)

        print "\nError CPU final vs GPU"
        err,mxe = checkError(h_B,g_A)
        print "Avg and max rel error = %.2e %.2e" % (err,mxe)
        f = (flops/t1,t1/t0,)

    # Summary line: dims, GPU rate, CPU rate, speedup, avg and max error.
    fmt = "\n## "+" ".join(len(dims)*["%4d"])
    fmt += " : %5.1f %5.1f %5.1f: %.2e %.2e"
    print fmt % (dims+gflops+f+stats)
131 | 
if __name__ == "__main__":
    import sys

    device = cu_CUDA()

    check = False
    doComplex = False
    dims = (256,128,256)
    # Only the first two arguments are examined.  "-c" enables the CPU
    # check, "-x" complex transforms, "-cx" both; a comma-separated
    # dimension list is accepted in the first position only.
    for position,arg in enumerate(sys.argv[1:3]):
        if arg in ("-c","-cx"):
            check = True
        if arg in ("-cx","-x"):
            doComplex = True
        if position == 0 and arg not in ("-c","-cx","-x"):
            dims = tuple(int(x) for x in arg.split(","))
    main(check,doComplex,dims)
    cuCtxDetach(device.context)
161 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/cuda_fft.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | # coding:utf-8: © Arno Pähler, 2007-08
  3 | 
  4 | from ctypes import *
  5 | from math import log
  6 | from time import time
  7 | 
  8 | from cuda.cufft_defs import *
  9 | from cuda.cufft_api import *
 10 | from cuda.cuda_utils import *
 11 | 
 12 | import xfft as xf
 13 | import gfft_cuda as gf
 14 | from cpuFunctions import arrayInit,checkError,scale
 15 | from cpuFunctions import ReadTimestampCounter
 16 | 
 17 | def main(check=False,doComplex=False,dims=(128,)):
 18 |     print "+------------------------+"
 19 |     print "| Fast Fourier Transform |"
 20 |     print "| using CUDA runtime API |"
 21 |     print "+------------------------+\n"
 22 |     dims = tuple(dims)
 23 |     ndim = len(dims)
 24 |     v = ("","NX = %d","NX = %d NY = %d","NX = %d NY = %d NZ = %d")
 25 |     SC = reduce(lambda x,y:x*y,dims)
 26 |     SR = reduce(lambda x,y:x*y,dims[:-1],1)
 27 |     SR *= 2*(dims[-1]/2+1)
 28 | 
 29 |     print v[ndim] % dims
 30 |     print "< doComplex: %s >\n" % doComplex
 31 | 
 32 |     rz = 1./float(SC)
 33 |     flops = 2.*5.*SC*log(SC)/log(2.)*1.e-9
 34 |     if doComplex:
 35 |         SC *= 2
 36 |     S4 = sizeof(c_float)
 37 | 
 38 |     if doComplex:
 39 |         sz = S4*(SC+SC)/(1024*1024)
 40 |     else:
 41 |         sz = S4*(SC+SR)/(1024*1024)
 42 | 
 43 |     h_A = (c_float*SC)()
 44 |     g_A = (c_float*SC)()
 45 |     arrayInit(h_A)
 46 | 
 47 |     d_A = getMemory(h_A)
 48 |     allocate = True
 49 | 
 50 |     if doComplex:
 51 |         d_B = getMemory(SC)
 52 |     elif allocate:
 53 |         d_B = getMemory(SR)
 54 | 
 55 |     if doComplex:
 56 |         plan = gf.makePlan(dims,CUFFT_C2C)
 57 |     else:
 58 |         plan1 = gf.makePlan(dims,CUFFT_R2C)
 59 |         plan2 = gf.makePlan(dims,CUFFT_C2R)
 60 | 
 61 |     t0 = time()
 62 |     x0 = ReadTimestampCounter()
 63 |     cudaThreadSynchronize()
 64 | 
 65 |     if doComplex:
 66 |         d_B = gf.ccfft(plan,d_A,None,d_B)
 67 |         d_A = gf.icfft(plan,d_B,None,d_A)
 68 |     else:
 69 |         if allocate:
 70 |             d_B = gf.rcfft(plan1,d_A,None,d_B)
 71 |             d_A = gf.crfft(plan2,d_B,None,d_A)
 72 |         else:
 73 |             d_B = gf.rcfft(plan1,d_A,SR)
 74 |             cuMemFree(d_A)
 75 |             d_A = gf.crfft(plan2,d_B,SR)
 76 | 
 77 |     cudaThreadSynchronize()
 78 |     t0 = time()-t0
 79 |     x1 = ReadTimestampCounter()
 80 |     fc = 1.e-3/2.8
 81 |     print "RDTSC: %.0f µs" % ((x1-x0)*fc)
 82 | 
 83 |     cudaMemcpy(g_A,d_A,S4*SC,cudaMemcpyDeviceToHost)
 84 | 
 85 |     cudaFree(d_A)
 86 |     cudaFree(d_B)
 87 | 
 88 |     if doComplex:
 89 |         cufftDestroy(plan)
 90 |     else:
 91 |         cufftDestroy(plan1)
 92 |         cufftDestroy(plan2)
 93 | 
 94 |     cudaThreadExit()
 95 |     scale(g_A,rz)
 96 | 
 97 |     print "\nProcessing time: %.3g sec" % t0
 98 |     print "Gigaflops GPU  : %.2f" % (flops/t0)
 99 |     gflops = (flops/t0,)
100 | 
101 |     print "\nError CPU initial vs GPU"
102 |     err,mxe = checkError(h_A,g_A)
103 |     stats = err,mxe
104 |     print "Avg and max rel error = %.2e %.2e\n" % (err,mxe)
105 | 
106 |     if check:
107 |         t1 = time()
108 |         if doComplex:
109 |             h_B = xf.ccfft(h_A,dims)
110 |             h_B = xf.icfft(h_B,dims)
111 |         else:
112 |             h_B = xf.rcfft(h_A,dims)
113 |             h_B = xf.crfft(h_B,dims)
114 |         t1 = time()-t1
115 |         print "Processing time: %.3g sec" % t1
116 |         print "Gigaflops CPU  : %.2f" % (flops/t1)
117 |         print "Speedup GPU/CPU: %.2f" % (t1/t0)
118 | 
119 |         print "\nError CPU final vs CPU initial"
120 |         err,mxe = checkError(h_B,h_A)
121 |         print "Avg and max rel error = %.2e %.2e" % (err,mxe)
122 | 
123 |         print "\nError CPU final vs GPU"
124 |         err,mxe = checkError(h_B,g_A)
125 |         print "Avg and max rel error = %.2e %.2e" % (err,mxe)
126 |     f = (-1.,)
127 |     if check:
128 |         f = (t1/t0,)
129 |     fmt = "\n## "+" ".join(len(dims)*["%3d"])+" : %.1f %.1f: %.2e %.2e"
130 |     print fmt % (dims+gflops+f+stats)
131 | 
if __name__ == "__main__":
    import sys

    cudaSetDevice(0)

    check = False
    doComplex = False
    dims = (256,128,256)
    # Only the first two arguments are examined.  "-c" enables the CPU
    # check, "-x" complex transforms, "-cx" both; a comma-separated
    # dimension list is accepted in the first position only.
    for position,arg in enumerate(sys.argv[1:3]):
        if arg in ("-c","-cx"):
            check = True
        if arg in ("-cx","-x"):
            doComplex = True
        if position == 0 and arg not in ("-c","-cx","-x"):
            dims = tuple(int(x) for x in arg.split(","))
    main(check,doComplex,dims)
160 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/dfft.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | from ctypes import c_int,c_void_p
 3 | from ctypes import CDLL,POINTER,RTLD_GLOBAL
 4 | 
 5 | ##  This version is for 64-bit floats
 6 | 
 7 | _ci  = c_int
 8 | _cip = POINTER(c_int)
 9 | _cvp = c_void_p
10 | 
11 | cc = CDLL("/usr/lib/libfftw.so",mode=RTLD_GLOBAL)
12 | cr = CDLL("/usr/lib/librfftw.so")
13 | 
14 | fftw_plan = _ci
15 | 
16 | ##extern [r]fftwnd_plan fftwnd_create_plan(
17 | ##    int rank, const int *n,
18 | ##    fftw_direction dir, int flags);
19 | ##
20 | ##  all plans have the same signature
21 | 
22 | CreatePlan_c = cc.fftwnd_create_plan
23 | CreatePlan_c.restype = fftw_plan
24 | CreatePlan_c.argtypes = [ _ci, _cip, _ci, _ci ]
25 | 
26 | CreatePlan_r = cr.rfftwnd_create_plan
27 | CreatePlan_r.restype = fftw_plan
28 | CreatePlan_r.argtypes = [ _ci, _cip, _ci, _ci ]
29 | 
30 | ##extern void [r]fftwnd_destroy_plan(rfftwnd_plan plan);
31 | DestroyPlan_c = cc.fftwnd_destroy_plan
32 | DestroyPlan_c.restype = None
33 | DestroyPlan_c.argtypes = [ _ci ]
34 | 
35 | DestroyPlan_r = cr.rfftwnd_destroy_plan
36 | DestroyPlan_r.restype = None
37 | DestroyPlan_r.argtypes = [ _ci ]
38 | 
39 | ##extern void fftwnd_one(fftwnd_plan p,
40 | ##            fftw_complex *in, fftw_complex *out);
41 | Execute_c2c = cc.fftwnd_one
42 | Execute_c2c.restype = None
43 | Execute_c2c.argtypes = [ _ci, _cvp, _cvp ]
44 | 
45 | ##extern void rfftwnd_one_real_to_complex(rfftwnd_plan p,
46 | ##                  fftw_real *in, fftw_complex *out);
47 | ##extern void rfftwnd_one_complex_to_real(rfftwnd_plan p,
48 | ##                  fftw_complex *in, fftw_real *out);
49 | Execute_r2c = cr.rfftwnd_one_real_to_complex
50 | Execute_r2c.restype = None
51 | Execute_r2c.argtypes = [ _ci, _cvp, _cvp ]
52 | 
53 | Execute_c2r = cr.rfftwnd_one_complex_to_real
54 | Execute_c2r.restype = None
55 | Execute_c2r.argtypes = [ _ci, _cvp, _cvp ]
56 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/gfft_cu.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | 
 3 | from cuda.cufft_api import *
 4 | from cuda.cu_utils import *
 5 | 
 6 | def makePlan(dims,kind):
 7 |     """
 8 |     dims : tuple of array dimensions (1-3 els.)
 9 |     kind : type of transform desired
10 |            returns plan to be used by transform
11 |     """
12 |     spln = "cufftPlan%dd(*args)"
13 |     ndim = len(dims)
14 |     plan = cufftHandle()
15 |     args = (byref(plan),)+dims+(kind,)
16 |     if ndim == 1:
17 |         args = args+(1,)
18 |     eval(spln % ndim)
19 |     return plan
20 | 
def rcfft(plan,r,SC=None,c=None):
    """
    Real-to-complex transform via cufftExecR2C.

    plan : plan handle, e.g. as returned by makePlan
    r    : real input array
    SC   : size handed to getMemory when no output array is supplied
    c    : output array; allocated here when omitted

    Returns the output array.  If neither c nor SC is given, the plan is
    destroyed and ValueError is raised.
    """
    must_allocate = c is None
    if must_allocate and SC is None:
        cufftDestroy(plan)
        raise ValueError("array size missing")
    if must_allocate:
        c = getMemory(SC)
    cufftExecR2C(plan,r,c)
    return c
35 | 
def crfft(plan,c,SC=None,r=None):
    """
    Complex-to-real transform via cufftExecC2R.

    plan : plan handle, e.g. as returned by makePlan
    c    : complex input array
    SC   : size handed to getMemory when no output array is supplied
    r    : output array; allocated here when omitted

    Returns the output array.  If neither r nor SC is given, the plan is
    destroyed and ValueError is raised.
    """
    must_allocate = r is None
    if must_allocate and SC is None:
        cufftDestroy(plan)
        raise ValueError("array size missing")
    if must_allocate:
        r = getMemory(SC)
    cufftExecC2R(plan,c,r)
    return r
50 | 
def ccfft(plan,c,SC=None,z=None):
    """
    Forward complex-to-complex transform (cufftExecC2C, CUFFT_FORWARD).

    plan : plan handle, e.g. as returned by makePlan
    c    : complex input array
    SC   : size handed to getMemory when no output array is supplied
    z    : output array; allocated here when omitted

    Returns the output array.  If neither z nor SC is given, the plan is
    destroyed and ValueError is raised.
    """
    must_allocate = z is None
    if must_allocate and SC is None:
        cufftDestroy(plan)
        raise ValueError("array size missing")
    if must_allocate:
        z = getMemory(SC)
    cufftExecC2C(plan,c,z,CUFFT_FORWARD)
    return z
65 | 
def icfft(plan,z,SC=None,c=None):
    """
    Inverse complex-to-complex transform (cufftExecC2C, CUFFT_INVERSE).

    plan : cufft plan handle (e.g. as returned by makePlan)
    z    : complex array to be transformed
    SC   : size handed to getMemory when the output has to be allocated
    c    : complex array receiving the result; allocated when None

    Returns the output array. When neither c nor SC is given the plan
    is destroyed and ValueError is raised.
    """
    allocate = c is None
    if allocate and SC is None:
        # no buffer and no size to create one: release the plan and give up
        cufftDestroy(plan)
        raise ValueError("array size missing")
    if allocate:
        c = getMemory(SC)
    cufftExecC2C(plan,z,c,CUFFT_INVERSE)
    return c
80 | 
#shortcuts
# fft / ifft are plain aliases for the forward and inverse
# complex-to-complex transforms defined above

fft  = ccfft
ifft = icfft
85 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/gfft_cuda.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | 
 3 | from cuda.cufft_api import *
 4 | from cuda.cuda_utils import *
 5 | 
 6 | def makePlan(dims,kind):
 7 |     """
 8 |     dims : tuple of array dimensions (1-3 els.)
 9 |     kind : type of transform desired
10 |            returns plan to be used by transform
11 |     """
12 |     spln = "cufftPlan%dd(*args)"
13 |     ndim = len(dims)
14 |     plan = cufftHandle()
15 |     args = (byref(plan),)+dims+(kind,)
16 |     if ndim == 1:
17 |         args = args+(1,)
18 |     eval(spln % ndim)
19 |     return plan
20 | 
def rcfft(plan,r,SC=None,c=None):
    """
    Real-to-complex transform through cufftExecR2C.

    plan : cufft plan handle (e.g. as returned by makePlan)
    r    : real array to be transformed
    SC   : size handed to getMemory when the output has to be allocated
    c    : complex array receiving the result; allocated when None

    Returns the output array. When neither c nor SC is given the plan
    is destroyed and ValueError is raised.
    """
    if c is not None:
        # caller supplied the output buffer, nothing to allocate
        cufftExecR2C(plan,r,c)
        return c
    if SC is None:
        # no buffer and no size to create one: release the plan and give up
        cufftDestroy(plan)
        raise ValueError("array size missing")
    c = getMemory(SC)
    cufftExecR2C(plan,r,c)
    return c
35 | 
def crfft(plan,c,SC=None,r=None):
    """
    Complex-to-real transform through cufftExecC2R.

    plan : cufft plan handle (e.g. as returned by makePlan)
    c    : complex array to be transformed
    SC   : size handed to getMemory when the output has to be allocated
    r    : real array receiving the result; allocated when None

    Returns the output array. When neither r nor SC is given the plan
    is destroyed and ValueError is raised.
    """
    if r is not None:
        # caller supplied the output buffer, nothing to allocate
        cufftExecC2R(plan,c,r)
        return r
    if SC is None:
        # no buffer and no size to create one: release the plan and give up
        cufftDestroy(plan)
        raise ValueError("array size missing")
    r = getMemory(SC)
    cufftExecC2R(plan,c,r)
    return r
50 | 
def ccfft(plan,c,SC=None,z=None):
    """
    Forward complex-to-complex transform (cufftExecC2C, CUFFT_FORWARD).

    plan : cufft plan handle (e.g. as returned by makePlan)
    c    : complex array to be transformed
    SC   : size handed to getMemory when the output has to be allocated
    z    : complex array receiving the result; allocated when None

    Returns the output array. When neither z nor SC is given the plan
    is destroyed and ValueError is raised.
    """
    if z is not None:
        # caller supplied the output buffer, nothing to allocate
        cufftExecC2C(plan,c,z,CUFFT_FORWARD)
        return z
    if SC is None:
        # no buffer and no size to create one: release the plan and give up
        cufftDestroy(plan)
        raise ValueError("array size missing")
    z = getMemory(SC)
    cufftExecC2C(plan,c,z,CUFFT_FORWARD)
    return z
65 | 
def icfft(plan,z,SC=None,c=None):
    """
    Inverse complex-to-complex transform (cufftExecC2C, CUFFT_INVERSE).

    plan : cufft plan handle (e.g. as returned by makePlan)
    z    : complex array to be transformed
    SC   : size handed to getMemory when the output has to be allocated
    c    : complex array receiving the result; allocated when None

    Returns the output array. When neither c nor SC is given the plan
    is destroyed and ValueError is raised.
    """
    if c is not None:
        # caller supplied the output buffer, nothing to allocate
        cufftExecC2C(plan,z,c,CUFFT_INVERSE)
        return c
    if SC is None:
        # no buffer and no size to create one: release the plan and give up
        cufftDestroy(plan)
        raise ValueError("array size missing")
    c = getMemory(SC)
    cufftExecC2C(plan,z,c,CUFFT_INVERSE)
    return c
80 | 
#shortcuts
# fft / ifft are plain aliases for the forward and inverse
# complex-to-complex transforms defined above

fft  = ccfft
ifft = icfft
85 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/manyfft.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | # coding:utf-8: © Arno Pähler, 2007-08
 3 | 
 4 | import sys
 5 | from utilities import System
 6 | 
 7 | cmd = "cu_fft.py %d,%d,%d %s"
 8 | scm = ""
 9 | 
10 | if len(sys.argv) > 1:
11 |     scm = sys.argv[1]
12 | 
13 | vz = (64,128,256)
14 | if "x" not in scm:
15 |     vz = (64,128,256,512)
16 | 
17 | for nx in (128,256):
18 |     for ny in (128,256):
19 |         for nz in vz:
20 |             s,o,e = System(cmd % (nx,ny,nz,scm))
21 |             print o[-1]
22 |         print
23 |     print "---\n"
24 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/sfft.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8: © Arno Pähler, 2007-08
 2 | # NP: remove absolute paths (argnnnnn!)
 3 | 
 4 | from ctypes import c_int,c_void_p
 5 | from ctypes import CDLL,POINTER,RTLD_GLOBAL
 6 | 
 7 | ##  This version is for 32-bit floats
 8 | 
 9 | _ci  = c_int
10 | _cip = POINTER(c_int)
11 | _cvp = c_void_p
12 | 
13 | cc = CDLL("libsfftw.so",mode=RTLD_GLOBAL)
14 | cr = CDLL("libsrfftw.so")
15 | 
16 | fftw_plan = _ci
17 | 
18 | ##extern [r]fftwnd_plan fftwnd_create_plan(
19 | ##    int rank, const int *n,
20 | ##    fftw_direction dir, int flags);
21 | ##
22 | ##  all plans have the same signature
23 | 
24 | CreatePlan_c = cc.fftwnd_create_plan
25 | CreatePlan_c.restype = fftw_plan
26 | CreatePlan_c.argtypes = [ _ci, _cip, _ci, _ci ]
27 | 
28 | CreatePlan_r = cr.rfftwnd_create_plan
29 | CreatePlan_r.restype = fftw_plan
30 | CreatePlan_r.argtypes = [ _ci, _cip, _ci, _ci ]
31 | 
32 | ##extern void [r]fftwnd_destroy_plan(rfftwnd_plan plan);
33 | DestroyPlan_c = cc.fftwnd_destroy_plan
34 | DestroyPlan_c.restype = None
35 | DestroyPlan_c.argtypes = [ _ci ]
36 | 
37 | DestroyPlan_r = cr.rfftwnd_destroy_plan
38 | DestroyPlan_r.restype = None
39 | DestroyPlan_r.argtypes = [ _ci ]
40 | 
41 | ##extern void fftwnd_one(fftwnd_plan p,
42 | ##            fftw_complex *in, fftw_complex *out);
43 | Execute_c2c = cc.fftwnd_one
44 | Execute_c2c.restype = None
45 | Execute_c2c.argtypes = [ _ci, _cvp, _cvp ]
46 | 
47 | ##extern void rfftwnd_one_real_to_complex(rfftwnd_plan p,
48 | ##                  fftw_real *in, fftw_complex *out);
49 | ##extern void rfftwnd_one_complex_to_real(rfftwnd_plan p,
50 | ##                  fftw_complex *in, fftw_real *out);
51 | Execute_r2c = cr.rfftwnd_one_real_to_complex
52 | Execute_r2c.restype = None
53 | Execute_r2c.argtypes = [ _ci, _cvp, _cvp ]
54 | 
55 | Execute_c2r = cr.rfftwnd_one_complex_to_real
56 | Execute_c2r.restype = None
57 | Execute_c2r.argtypes = [ _ci, _cvp, _cvp ]
58 | 


--------------------------------------------------------------------------------
/tests/cufft/todo/xfft.py:
--------------------------------------------------------------------------------
  1 | # coding:utf-8: © Arno Pähler, 2007-08
  2 | 
  3 | from ctypes import c_double,c_float,c_int
  4 | 
  5 | #cannot mix single and double in one file
  6 | #segmentation fault on conflicting loads
  7 | #of both single and double versions of fft
  8 | #import dfft,sfft
  9 | 
 10 | import sfft
 11 | from cpuFunctions import scale
 12 | 
##  short aliases for the ctypes scalar types used below
_cd  = c_double
_cf  = c_float
_ci  = c_int

##  constants
##  NOTE(review): values presumably mirror the FFTW 2.x header --
##  confirm against fftw.h
FORWARD  = -1  ## Forward FFT
BACKWARD =  1  ## Inverse FFT

ESTIMATE = 0  ## for plans
MEASURE  = 1  ## for plans

##  further plan flags; none of them is used by the functions in this file
OUT_OF_PLACE =   0
IN_PLACE     =   8
USE_WISDOM   =  16
THREADSAFE   = 128

##  direction arguments passed to CreatePlan_r by rcfft / crfft
R2C = FORWARD
C2R = BACKWARD

##  not referenced by the functions in this file
fftw_plan = _ci

##  plan cache: maps (transform tag, dims tuple) to a created plan
x_cache = {}
 35 | 
 36 | def getType(item):
 37 |     """gets the type of ctypes item"""
 38 |     itype = item._type_._type_
 39 |     if itype == 'd':
 40 | #        return _cd,dfft
 41 |         return None,None
 42 |     elif itype == 'f':
 43 |         return _cf,sfft
 44 |     else:
 45 |         return None,None
 46 | 
 47 | def rcfft(r,dims):
 48 |     global x_cache
 49 |     if not isinstance(dims,tuple):
 50 |         dims = tuple(dims)
 51 |     dimx = list(dims)
 52 |     dimx[-1] = 2*(dimx[-1]/2+1)
 53 |     size = reduce(lambda x,y:x*y,dimx)
 54 |     ndim = len(dims)
 55 |     x_type,x_fft = getType(r)
 56 |     if x_type is None:
 57 |         return None
 58 |     c = (x_type*size)()
 59 |     try:
 60 |         wsave = x_cache[('rc',dims)]
 61 |     except KeyError:
 62 |         xdim = (_ci*ndim)(*dims)
 63 |         wsave = x_fft.CreatePlan_r(ndim,xdim,
 64 |                 R2C,ESTIMATE)
 65 |         x_cache[('rc',dims)] = wsave
 66 |     x_fft.Execute_r2c(wsave,r,c)
 67 |     return c
 68 | 
 69 | def crfft(c,dims):
 70 |     global x_cache
 71 |     if not isinstance(dims,tuple):
 72 |         dims = tuple(dims)
 73 |     dims = list(dims)
 74 |     dims = tuple(dims)
 75 |     size = reduce(lambda x,y:x*y,dims)
 76 |     ndim = len(dims)
 77 |     x_type,x_fft = getType(c)
 78 |     if x_type is None:
 79 |         return None
 80 |     r = (x_type*size)()
 81 |     try:
 82 |         wsave = x_cache[('cr',dims)]
 83 |     except KeyError:
 84 |         xdim = (_ci*ndim)(*dims)
 85 |         wsave = x_fft.CreatePlan_r(ndim,xdim,
 86 |                 C2R,ESTIMATE)
 87 |         x_cache[('cr',dims)] = wsave
 88 |     x_fft.Execute_c2r(wsave,c,r)
 89 |     sc = 1./float(size)
 90 |     scale(r,sc)
 91 |     return r
 92 | 
 93 | def ccfft(c,dims):
 94 |     global x_cache
 95 |     if not isinstance(dims,tuple):
 96 |         dims = tuple(dims)
 97 |     size = reduce(lambda x,y:x*y,dims)
 98 |     ndim = len(dims)
 99 |     x_type,x_fft = getType(c)
100 |     if x_type is None:
101 |         return None
102 |     z = (x_type*(size*2))()
103 |     try:
104 |         wsave = x_cache[('cc',dims)]
105 |     except KeyError:
106 |         xdim = (_ci*ndim)(*dims)
107 |         wsave = x_fft.CreatePlan_c(ndim,xdim,
108 |                 FORWARD,ESTIMATE)
109 |         x_cache[('cc',dims)] = wsave
110 |     x_fft.Execute_c2c(wsave,c,z)
111 |     return z
112 | 
def icfft(z,dims):
    """
    Inverse complex-to-complex FFT of z, scaled by 1/size.

    z    : ctypes array with the complex input data
    dims : sequence of transform dimensions

    Returns a newly allocated ctypes array of 2*prod(dims) elements,
    or None when the element type of z is not supported. The plan is
    cached in x_cache under ('ic', dims).
    """
    dims = dims if isinstance(dims,tuple) else tuple(dims)
    size = reduce(lambda a,b:a*b,dims)
    x_type,x_fft = getType(z)
    if x_type is None:
        return None
    c = (x_type*(size*2))()
    key = ('ic',dims)
    if key not in x_cache:
        # first request for these dimensions: build and remember the plan
        ndim = len(dims)
        xdim = (_ci*ndim)(*dims)
        x_cache[key] = x_fft.CreatePlan_c(ndim,xdim,BACKWARD,ESTIMATE)
    x_fft.Execute_c2c(x_cache[key],z,c)
    # the result is divided by the element count in place
    scale(c,1./float(size))
    return c
134 | 
#shortcuts
# fft / ifft are plain aliases for the forward and inverse
# complex-to-complex transforms defined above

fft  = ccfft
ifft = icfft
139 | 


--------------------------------------------------------------------------------
/tests/test_cublas.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | 
 3 | import numpy as np
 4 | from cuda.sugar.memory import Linear
 5 | import cuda.sugar.blas as blas 
 6 | 
 7 | class TestCublas:
 8 | 
 9 |     def embed_ipython():
10 |         from IPython.Shell import IPShellEmbed
11 |         ipshell = IPShellEmbed(user_ns = dict())
12 |         ipshell()
13 | 
14 |     def cpu_saxpy(self, a, b, alpha):
15 |         return (alpha*a+b)
16 | 
17 |     def test_saxpy(self):
18 |         vlength = 8192
19 |         alpha = 1
20 |         a = np.random.randn(1,vlength).astype('float32')
21 |         b = np.random.randn(1,vlength).astype('float32')
22 |         cpu_result = self.cpu_saxpy(a,b,alpha)
23 |         gpu_result = blas.gpu_saxpy(a,b,alpha)
24 | 
25 |         print cpu_result   
26 |         print gpu_result
27 | 
28 |         assert np.allclose(cpu_result, gpu_result) == True
29 | 
30 |     def test_sdot(self):
31 |         vlength = 1024
32 |         n2 = vlength*vlength
33 |         a = np.random.randn(1,n2).astype('float32')
34 |         b = np.random.randn(1,n2).astype('float32')
35 |         cpu_result = np.dot(a,b.transpose())[0][0]
36 |         gpu_result = blas.gpu_sdot(a, b.transpose())
37 | 
38 |         print cpu_result
39 |         print gpu_result
40 | 
41 |         assert np.allclose([cpu_result], [gpu_result], atol=1e-1) == True
42 | 
43 |     def test_sgemm(self):
44 |         M=7; N=5; P=3;
45 |         a = np.random.randn(M,N).astype('float32')
46 |         b = np.random.randn(N,P).astype('float32')
47 |         cpu_result = np.dot(a,b)
48 |         gpu_result = blas.gpu_sgemm(a,b)
49 |         print cpu_result
50 |         print gpu_result
51 |         assert np.allclose(cpu_result, gpu_result)
52 |         
if __name__ == "__main__":
    # run the three comparisons in order when executed as a script
    test_cublas = TestCublas()
    for case in ("test_saxpy", "test_sdot", "test_sgemm"):
        getattr(test_cublas, case)()
58 | 


--------------------------------------------------------------------------------
/xml/createbindings.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import re
  4 | import os, sys
  5 | from elementtree import ElementTree
  6 | from optparse import OptionParser
  7 | from subprocess import Popen, PIPE
  8 | 
  9 | def fix_cdll_imports(bindings, lib):
 10 |     cdll_regex = "_libraries['%s'] = CDLL('%s')" % (lib,lib)
 11 |     new_bindings = []
 12 |     libname = os.path.basename(lib).split('.')[0].replace('lib','')
 13 |     for line in bindings:
 14 |         if line.rfind(cdll_regex) != -1:
 15 |             new_bindings += [line.replace(cdll_regex, "_libraries['%s'] = get_lib('%s')" % (libname,libname))]
 16 |         else:
 17 |             new_bindings += [line.replace(lib,libname)]
 18 |     new_bindings.insert(3,'from cuda.utils import get_lib\n')
 19 |     new_bindings.insert(4,'c_longdouble = c_double\n')
 20 |     return new_bindings
 21 | 
 22 | def clean_only_headers(xml_filename, only_headers):
 23 |     """ 
 24 |     removes definitions that are not defined in one of only_headers files
 25 | 
 26 |     fd - file descriptor for xml file
 27 |     only_headers - list of header files to check definitions against
 28 | 
 29 |     """
 30 |     gcc_xml = ElementTree.parse(xml_filename)
 31 |     root = gcc_xml.getroot()
 32 |     fields = root.getchildren()[:]
 33 |     files = root.findall('File')
 34 |     oheaders = {}
 35 | 
 36 |     for file in files:
 37 |         filename = file.get('name')
 38 |         if only_headers.has_key(filename):
 39 |             oheaders[file.get('id')] = file.get('name')
 40 |         else:
 41 |             for ohead in only_headers.keys():
 42 |                 if re.compile(ohead,re.IGNORECASE).search(filename):
 43 |                     oheaders[file.get('id')] = file.get('name')
 44 | 
 45 |     print oheaders
 46 |         
 47 |     for field in fields:
 48 |         file = field.get('file')
 49 |         if file is not None:
 50 |             if not oheaders.has_key(file):
 51 |                 #print "%s %s" % (field.tag, file)
 52 |                 if not field.get('location') == "f8:214":
 53 |                     root.remove(field)
 54 |     gcc_xml.write(xml_filename)
 55 | 
 56 | def main(args=None):
 57 |     """ Autogenerates ctype'd python version of a shared library. 
 58 |         Takes an array of arguments in the same format as sys.argv
 59 | 
 60 |         -H header_file
 61 |         -l library_file
 62 |         -I include_dir
 63 |         -o only_headers
 64 |         -x xml_output_file
 65 |         -p python_output-file
 66 |     """  
 67 |     if args is None:
 68 |         args = sys.argv
 69 | 
 70 |     usage = "usage: %prog [options]"
 71 |     parser = OptionParser(usage)
 72 |     parser.add_option("-H","--header",
 73 |                       action="append",
 74 |                       dest="HEADERS",
 75 |                       help="[REQUIRED] full path to header (e.g /path/to/foo.h)",
 76 |                       default = [])
 77 | 
 78 |     parser.add_option("-l","--library",
 79 |                       action="append",
 80 |                       dest="LIBRARIES",
 81 |                       help="[REQUIRED] full path to library (e.g /path/to/libfoo.so)",
 82 |                       default = [])
 83 | 
 84 |     parser.add_option("-I","--include-dir",
 85 |                       action="append",
 86 |                       dest="INCLUDE_DIRS",
 87 |                       help="[REQUIRED] paths to find dependency headers (e.g /usr/local/cuda/include/)",
 88 |                       default = [])
 89 | 
 90 |     parser.add_option("-o","--only-headers",
 91 |                       action="append",
 92 |                       dest="ONLY_HEADERS",
 93 |                       help="[OPTIONAL] include only definitions found in specified headers (e.g /usr/include/example.h)",
 94 |                       default = [])
 95 | 
 96 |     parser.add_option("-x","--xml-output-file",
 97 |                       dest="XML_FILE",
 98 |                       help="file to store xml output to (e.g. /my/project/xml/cuda.xml)",
 99 |                       default = "output.xml")
100 | 
101 |     parser.add_option("-p","--python-output-file",
102 |                       dest="PYTHON_FILE",
103 |                       help="file to store python bindings to (e.g. /my/project/cuda.py)",
104 |                       default = "output.py")
105 | 
106 |     options, args = parser.parse_args()
107 |         
108 |     if len(options.HEADERS) == 0 or \
109 |             len(options.LIBRARIES) == 0 or \
110 |             len(options.INCLUDE_DIRS) == 0:
111 |         raise ValueError, "You must supply the header files (-H), the library files (-l) and the include dirs (-I)"
112 | 
113 |     headers = " ".join(options.HEADERS)
114 |     libraries = "-l " + " -l ".join(options.LIBRARIES)
115 |     include_dirs = "-I " + " -I ".join(options.INCLUDE_DIRS)
116 |     xml_file = options.XML_FILE
117 |     python_file = options.PYTHON_FILE
118 | 
119 |     # -- h2xml
120 |     h2xmlcmd_l = ["python -m ctypeslib.h2xml"]
121 |     h2xmlcmd_l += [headers]
122 |     h2xmlcmd_l += [include_dirs]
123 |     h2xmlcmd_l += ["-o %s" % xml_file]
124 |     h2xmlcmd = " ".join(h2xmlcmd_l)
125 |     print "h2xmlcmd =", h2xmlcmd
126 |     assert os.system(h2xmlcmd) == 0
127 | 
128 |     # XXX: hack to remove Converter tags from the xml file
129 |     lines = [line for line in open(xml_file).readlines() 
130 |              if not line.startswith('  <Converter id="_')]
131 | 
132 |     fd = open(xml_file, 'w+')
133 |     fd.writelines(lines)
134 |     fd.close()
135 | 
136 |     if len(options.ONLY_HEADERS) != 0:
137 |         only_headers = {}.fromkeys(options.ONLY_HEADERS, 1)
138 |         clean_only_headers(xml_file, only_headers)
139 | 
140 |     # -- xml2py
141 |     xml2pycmd_l = ["python -m ctypeslib.xml2py"]
142 |     xml2pycmd_l += [xml_file]
143 |     xml2pycmd_l += ["-c -d -v"]
144 |     xml2pycmd_l += [libraries]
145 |     xml2pycmd = " ".join(xml2pycmd_l)
146 |     print "xml2pycmd =", xml2pycmd
147 |     p = Popen(xml2pycmd, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
148 |     bindings = p.stdout.readlines()
149 | 
150 |     for lib in options.LIBRARIES:
151 |         bindings = fix_cdll_imports(bindings, lib)
152 | 
153 |     open(python_file, 'w').writelines(bindings)
154 | 
# run the generator when executed as a script; main() falls back to
# sys.argv when called without arguments
if __name__ == "__main__":
    main()
157 | 


--------------------------------------------------------------------------------
/xml/generate-xml.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # NP: do we need a python script for that ?
 4 | 
 5 | INCLUDES="-I ./ -I /usr/local/cuda/include"
 6 | 
 7 | python createbindings.py -o cuda -H my_CUDA2100_vector_types.h -H cuda.h -l /usr/lib/libcuda.so $INCLUDES -x cudadrv.xml -p cudadrv.py && \
 8 | python cudadrv.py && cp -vf cudadrv.py ../cuda/cu/
 9 | #python cudadrv2.py
10 | 
11 | python createbindings.py -o cuda -H my_CUDA2100_vector_types.h -H cuda_runtime.h -l /usr/local/cuda/lib/libcudart.so $INCLUDES -x cudart.xml -p cudart.py && \
12 | python cudart.py && cp -vf cudart.py ../cuda/cuda/
13 | #python cudart2.py
14 | 
15 | python createbindings.py -o cuda -H my_CUDA2100_vector_types.h -H cublas.h -l /usr/local/cuda/lib/libcublas.so $INCLUDES -x cublas.xml -p cublas.py && \
16 | python cublas.py && cp -vf cublas.py ../cuda/cublas/
17 | #python cublas2.py
18 | 
19 | python createbindings.py -o cuda -H my_CUDA2100_vector_types.h -H cufft.h -l /usr/local/cuda/lib/libcufft.so $INCLUDES -x cufft.xml -p cufft.py && \
20 | python cufft.py && cp -vf cufft.py ../cuda/cufft/
21 | #python cufft2.py
22 | 
23 | #rm -vf cudadrv.xml cudadrv.py cudart.xml cudart.py cublas.xml cublas.py cufft.xml cufft.py
24 | 
25 | 


--------------------------------------------------------------------------------
/xml/generate-xml.sh.orig:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
# Generate the four binding modules straight into the package tree.
# CUDA_PATH and LIB_EXT are taken from the environment.
# NOTE(review): no -I option is passed here, while createbindings.py raises
# ValueError when no include dir is given -- confirm whether this .orig copy
# is still expected to run.
python createbindings.py -H $CUDA_PATH/include/cuda.h -l $CUDA_PATH/lib/libcuda.$LIB_EXT -x cu.xml -p ../cuda/cu/cudadrv.py
python createbindings.py -H $CUDA_PATH/include/cuda_runtime.h -l $CUDA_PATH/lib/libcudart.$LIB_EXT -x cudart.xml -p ../cuda/cuda/cudart.py
python createbindings.py -H $CUDA_PATH/include/cublas.h -l $CUDA_PATH/lib/libcublas.$LIB_EXT -x cublas.xml -p ../cuda/cublas/cublas.py
python createbindings.py -H $CUDA_PATH/include/cufft.h -l $CUDA_PATH/lib/libcufft.$LIB_EXT -x cufft.xml -p ../cuda/cufft/cufft.py

# disabled: would run every python file below the current directory
#find . -iname \*.py -exec python {} \;
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/xml/generate-xml_linux.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # NP: do we need a python script for that ?
 4 | 
 5 | INCLUDES="-I ./ -I /usr/local/cuda/include"
 6 | 
 7 | python createbindings.py -H my_CUDA2100_vector_types.h -H cuda.h -l /usr/lib/libcuda.so $INCLUDES -x cudadrv.xml -p cudadrv.py && \
 8 | python cudadrv.py && cp -vf cudadrv.py ../cuda/cu/
 9 | 
10 | python createbindings.py -H my_CUDA2100_vector_types.h -H cuda_runtime.h -l /usr/local/cuda/lib/libcudart.so $INCLUDES -x cudart.xml -p cudart.py && \
11 | python cudart.py && cp -vf cudart.py ../cuda/cuda/
12 | 
13 | python createbindings.py -H my_CUDA2100_vector_types.h -H cublas.h -l /usr/local/cuda/lib/libcublas.so $INCLUDES -x cublas.xml -p cublas.py && \
14 | python cublas.py && cp -vf cublas.py ../cuda/cublas/
15 | 
16 | python createbindings.py -H my_CUDA2100_vector_types.h -H cufft.h -l /usr/local/cuda/lib/libcufft.so $INCLUDES -x cufft.xml -p cufft.py && \
17 | python cufft.py && cp -vf cufft.py ../cuda/cufft/
18 | 
19 | #rm -vf cudadrv.xml cudadrv.py cudart.xml cudart.py cublas.xml cublas.py cufft.xml cufft.py


--------------------------------------------------------------------------------
/xml/generate-xml_macosx.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # NP: do we need a python script for that ?
 4 | 
 5 | python createbindings.py -H /usr/local/cuda/include/cuda.h -l /usr/local/cuda/lib/libcuda.dylib -I /usr/local/cuda/include -x cu.xml -p ../cuda/cu/cudadrv.py
 6 | python createbindings.py -H /usr/local/cuda/include/cuda_runtime.h -l /usr/local/cuda/lib/libcudart.dylib -I /usr/local/cuda/include -x cudart.xml -p ../cuda/cuda/cudart.py
 7 | python createbindings.py -H /usr/local/cuda/include/cublas.h -l /usr/local/cuda/lib/libcublas.dylib -I /usr/local/cuda/include -x cublas.xml -p ../cuda/cublas/cublas.py
 8 | python createbindings.py -H /usr/local/cuda/include/cufft.h -l /usr/local/cuda/lib/libcufft.dylib -I /usr/local/cuda/include -x cufft.xml -p ../cuda/cufft/cufft.py
 9 | 
10 | #find . -iname \*.py -exec python {} \;
11 | 
12 | python ../cuda/cu/cudadrv.py
13 | python ../cuda/cuda/cudart.py
14 | python ../cuda/cublas/cublas.py
15 | python ../cuda/cufft/cufft.py
16 | 


--------------------------------------------------------------------------------