├── dev_notes.rst
├── .gitignore
├── MANIFEST.in
├── old
│   ├── pdfs.py
│   ├── mvnpdf.h
│   ├── build_cython.py
│   ├── kernels.h
│   ├── gpustats.pxd
│   ├── cytest.pyx
│   ├── scratch.py
│   ├── cucommon.h
│   ├── common.c
│   ├── util.py
│   ├── Makefile
│   ├── common.h
│   └── mvnpdf.cu
├── gpustats
│   ├── multigpu.py
│   ├── compat.py
│   ├── cufiles
│   │   ├── support.cu
│   │   ├── cpustub.cu
│   │   ├── transpose.cu
│   │   ├── univcaller.cu
│   │   ├── mvcaller.cu
│   │   ├── sample_discrete.cu
│   │   ├── sample_discrete_logged.cu
│   │   └── sampleFromMeasureMedium.cu
│   ├── kernels.py
│   ├── __init__.py
│   ├── tests
│   │   ├── test_samplers.py
│   │   └── test_pdfs.py
│   ├── sampler.py
│   ├── codegen.py
│   ├── pdfs.py
│   └── util.py
├── LICENSE
├── README.rst
├── setup.py
├── examples
│   └── pymc_test.py
└── scripts
    └── bench.py

--------------------------------------------------------------------------------
/dev_notes.rst:
--------------------------------------------------------------------------------
1 | - EMmvNormalPDF : do nothing with
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.o
3 | *.so
4 | *.cu_o
5 | bin
6 | obj
7 | build
8 | *~
9 | foo.cu
10 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt *.py *.rst
2 | include MANIFEST.in
3 | recursive-include gpustats/cufiles *
4 | 
5 | #exclude build
6 | #exclude dist
7 | 
8 | graft gpustats/tests
9 | global-exclude *~ *.swp *.pyc *.bak
10 | 
--------------------------------------------------------------------------------
/old/pdfs.py:
--------------------------------------------------------------------------------
1 | from numpy.linalg import inv, cholesky as chol
2 | import numpy as np
3 | 
4 | import testmod
5 | import util
6 | 
7 | def mvnpdf(data, means, covs):
8 |     '''
9 |     Compute multivariate normal log pdf
10 | 
11 |     Parameters
12 |     ----------
13 |     data : ndarray (n, k); means : sequence of (k,) arrays; covs : sequence of (k, k) arrays
14 | 
15 |     Returns
16 |     -------
17 |     (n, j) ndarray of log densities, where j = len(means)
18 |     '''
19 |     logdets = [np.log(np.linalg.det(c)) for c in covs]
20 |     ichol_sigmas = [inv(chol(c)) for c in covs]
21 | 
22 |     packed_params = util.pack_params(means, ichol_sigmas, logdets)
23 |     packed_data = util.pad_data(data)
24 |     return testmod.mvn_call(packed_data, packed_params,
25 |                             data.shape[1])
26 | 
--------------------------------------------------------------------------------
/old/mvnpdf.h:
--------------------------------------------------------------------------------
1 | #ifndef __MVNPDF_H__
2 | #define __MVNPDF_H__
3 | 
4 | #ifdef __cplusplus
5 | extern "C" {
6 | #endif
7 | 
8 | #include "common.h"
9 | 
10 | void mvnpdf(float* h_data, /** Data-vector; padded */
11 |             float* h_params, /** Density info; already padded */
12 |             float* h_pdf, /** Resultant PDF */
13 |             int data_dim,
14 |             int total_obs,
15 |             int nparams,
16 |             int param_stride, // with padding
17 |             int data_stride // with padding
18 |             );
19 | 
20 | void cpu_mvnormpdf(float* x, float* density, float * output, int dim,
21 |                    int padded_dim, int N, int T);
22 | 
23 | 
24 | #ifdef __cplusplus
25 | }
26 | #endif
27 | 
28 | #endif
29 | 
--------------------------------------------------------------------------------
/old/build_cython.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from distutils.extension import Extension
4 | from numpy.distutils.core import setup
5 | from Cython.Distutils import build_ext
6 | import numpy
7 | 
8 | def get_cuda_include():
9 |     return '/usr/local/cuda/include'
10 | 
11 | pyx_ext = Extension('testmod', ['cytest.pyx'],
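                    # NOTE: links against the libgpustats.so shared library
                    # produced by the Makefile in this directory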
12 |                     include_dirs=[numpy.get_include(),
13 |                                   get_cuda_include()],
14 |                     library_dirs=['.'],
15 |                     libraries=['gpustats'])
16 | 
17 | setup(name='testmod', description='',
18 |       ext_modules=[pyx_ext],
19 |       cmdclass = {
20 |           'build_ext' : build_ext
21 |       })
22 | 
--------------------------------------------------------------------------------
/old/kernels.h:
--------------------------------------------------------------------------------
1 | #ifndef __KERNELS_H__
2 | #define __KERNELS_H__
3 | 
4 | #include <cuda_runtime.h>
5 | 
6 | #include "common.h"
7 | 
8 | __global__ void mvNormalPDF(
9 |                     REAL* iData, /** Data-vector; padded */
10 |                     REAL* iDensityInfo, /** Density info; already padded */
11 |                     REAL* oMeasure, /** Resultant measure */
12 |                     int iD, /** Not currently necessary, as DIM is hardcoded */
13 |                     int iN,
14 |                     int iTJ,
15 |                     int isLogScaled
16 |                 );
17 | 
18 | cudaError_t gpuMvNormalPDF(
19 |                     REAL* iData, /** Data-vector; padded */
20 |                     REAL* iDensityInfo, /** Density info; already padded */
21 |                     REAL* oMeasure, /** Resultant measure */
22 |                     int iD, /** Not currently necessary, as DIM is hardcoded */
23 |                     int iN,
24 |                     int iTJ
25 |                 );
26 | 
27 | #endif // __KERNELS_H__
28 | 
--------------------------------------------------------------------------------
/old/gpustats.pxd:
--------------------------------------------------------------------------------
1 | cdef extern from "cuda.h":
2 |     struct cudaError_t:
3 |         pass
4 |     char* cudaGetErrorString(cudaError_t err)
5 | 
6 | cdef extern from "common.h":
7 |     struct PMatrix:
8 |         float* data
9 |         int rows
10 |         int cols
11 |         int stride
12 | 
13 |     void PMatrix_init(float* d, int r, int c, int s)
14 | 
15 |     void set_device(int device)
16 | 
17 | cdef extern from "mvnpdf.h":
18 |     void mvnpdf(float* h_data,
19 |                 float* h_params,
20 |                 float* h_pdf,
21 |                 int data_dim,
22 |                 int total_obs,
23 |                 int nparams,
24 |                 int param_stride,
25 |                 int data_stride) nogil
26 | 
27 |     void cpu_mvnpdf(float* x, float* density, float * output, int D,
28 |                     int padded_dim, int N, int T) nogil
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
/gpustats/multigpu.py:
--------------------------------------------------------------------------------
1 | from threading import Thread
2 | 
3 | import testmod
4 | 
5 | class GPUCall(Thread):
6 |     """
7 |     Execute func on a particular GPU device from a worker thread.
8 |     """
9 | 
10 |     def __init__(self, func, device=0):
11 |         Thread.__init__(self)
12 |         self.func = func
13 |         self.device = device
14 | 
15 |     def acquire_device(self):
16 |         testmod.set_device(self.device)
17 | 
18 |     def release_device(self):
19 |         pass
20 | 
21 |     def run(self):
22 |         self.acquire_device()
23 |         self.func()
24 |         self.release_device()
25 | 
26 | def make_calls(func, data, devices=None, splits=None):
27 |     """
28 |     Build one GPUCall per chunk of data, one chunk per device.
29 | 
30 |     Parameters
31 |     ----------
32 |     func : callable; data : ndarray; devices, splits : optional sequences
33 | 
34 |     Returns
35 |     -------
36 | 
37 |     """
38 |     if splits is None:
39 |         pass
40 | 
41 | def _execute_calls(calls):
42 |     """
43 |     Start all calls, then block until each has finished.
44 |     """
45 |     for call in calls:
46 |         call.start()
47 | 
48 |     for call in calls:
49 |         call.join()
50 | 
51 | 
52 | 
53 | 
--------------------------------------------------------------------------------
/gpustats/compat.py:
--------------------------------------------------------------------------------
1 | """
2 | Python versions of functions for testing purposes etc.
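Used by the unit tests in gpustats/tests to check the GPU results.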
3 | """ 4 | import numpy as np 5 | 6 | def python_mvnpdf(data, means, covs): 7 | from pymc import mv_normal_cov_like as pdf_func 8 | 9 | results = [] 10 | for i, datum in enumerate(data): 11 | for j, cov in enumerate(covs): 12 | mean = means[j] 13 | results.append(pdf_func(datum, mean, cov)) 14 | 15 | return np.array(results).reshape((len(data), len(covs))).squeeze() 16 | 17 | def python_sample_discrete(pmfs, draws=None): 18 | T, K = pmfs.shape 19 | output = np.empty(T, dtype=np.int32) 20 | if draws is None: 21 | draws = np.random.rand(T) 22 | 23 | # rescale 24 | pmfs = (pmfs.T / pmfs.sum(1)).T 25 | 26 | for i in xrange(T): 27 | the_sum = 0 28 | draw = draws[i] 29 | for j in xrange(K): 30 | the_sum += pmfs[i, j] 31 | 32 | if the_sum >= draw: 33 | output[i] = j 34 | break 35 | 36 | return output 37 | 38 | if __name__ == '__main__': 39 | pmfs = np.random.randn(20, 5) 40 | pmfs = (pmfs.T - pmfs.min(1)).T 41 | 42 | 43 | -------------------------------------------------------------------------------- /old/cytest.pyx: -------------------------------------------------------------------------------- 1 | cimport numpy as cnp 2 | from numpy cimport ndarray 3 | import numpy as np 4 | 5 | cimport gpustats as gps 6 | 7 | def set_device(device): 8 | ''' 9 | Set the CUDA device 10 | ''' 11 | gps.set_device(device) 12 | 13 | def cpu_mvnpdf(ndarray packed_data, ndarray packed_params, int dim): 14 | n, j = len(packed_data), len(packed_params) 15 | 16 | padded_dim = ( packed_data).shape[1] 17 | 18 | cdef ndarray output = np.empty((n, j), dtype=np.float32) 19 | gps.cpu_mvnpdf( packed_data.data, 20 | packed_params.data, 21 | output.data, 22 | dim, padded_dim, n, j) 23 | 24 | return output 25 | 26 | def mvn_call(ndarray packed_data, ndarray packed_params, int dim): 27 | ''' 28 | Invoke MVN kernel on prepared data 29 | 30 | Releases GIL 31 | ''' 32 | cdef int n, k, pn, pk 33 | 34 | n, k = ( packed_data).shape 35 | pn, pk = ( packed_params).shape 36 | 37 | cdef ndarray output = np.empty((n, pn), np.float32, order='F') 38 | 39 | with nogil: 40 | gps.mvnpdf( packed_data.data, 41 | packed_params.data, 42 | output.data, 43 | dim, n, pn, pk, k) 44 | 45 | return output 46 | -------------------------------------------------------------------------------- /gpustats/cufiles/support.cu: -------------------------------------------------------------------------------- 1 | 2 | #define LOG_2_PI 1.83787706640935f 3 | #define LOG_PI 1.144729885849400f 4 | 5 | __device__ int d_next_multiple(int k, int mult) { 6 | if (k % mult) 7 | return k + (mult - k % mult); 8 | else 9 | return k; 10 | } 11 | 12 | __device__ void copy_chunks(float* in_buf, float* out_buf, 13 | unsigned int tid, unsigned int total) { 14 | for (unsigned int chunk = 0; chunk + tid < total; chunk += blockDim.x) { 15 | out_buf[chunk + tid] = in_buf[chunk + tid]; 16 | } 17 | } 18 | 19 | __device__ void copy_chunks_strided(float* in_buf, float* out_buf, 20 | unsigned int tid, unsigned int ncols, 21 | unsigned int nrows, unsigned int stride) { 22 | unsigned int outind = 0; unsigned int total = ncols*nrows; 23 | for (unsigned int chunk = 0; chunk + tid < total; chunk += blockDim.x) { 24 | outind = ((chunk + tid)/ncols)*stride + (chunk + tid) % ncols; 25 | out_buf[outind] = in_buf[chunk + tid]; 26 | } 27 | } 28 | 29 | 30 | __device__ inline void atomic_add(float* address, float value){ 31 | #if __CUDA_ARCH__ >= 200 // for Fermi, atomicAdd supports floats 32 | atomicAdd(address, value); 33 | #elif __CUDA_ARCH__ >= 110 34 | float old = value; 35 | while ((old = 
36 | #endif
37 | }
38 | 
--------------------------------------------------------------------------------
/gpustats/cufiles/cpustub.cu:
--------------------------------------------------------------------------------
1 | int MAX_BLOCK_PARAMS = 64;
2 | 
3 | cudaError_t invoke_mvnpdf(PMatrix data, PMatrix params, float* d_pdf) {
4 |     // Need to automatically tune block / grid layout to maximize shared memory
5 |     // usage and coalescence, reduce wasted threads!
6 |     BlockDesign design;
7 |     get_tuned_layout(&design, &data, &params, MAX_BLOCK_PARAMS);
8 | 
9 |     int nthreads = design.data_per_block * design.params_per_block;
10 | 
11 |     // Now set up grid layout / block size
12 |     int grid_x = get_boxes(data.rows, design.data_per_block);
13 |     int grid_y = get_boxes(params.rows, design.params_per_block);
14 |     dim3 gridPDF(grid_x, grid_y);
15 |     dim3 blockPDF(nthreads, 1);
16 | 
17 |     int sharedMemSize = compute_shmem(&data, &params,
18 |                                       design.params_per_block,
19 |                                       design.data_per_block);
20 | 
21 | #ifdef DEBUG
22 |     printf("number params: %d, number data points: %d\n",
23 |            design.params_per_block, design.data_per_block);
24 |     printf("sharedMemSize: %d\n", sharedMemSize);
25 |     printf("block: %d x %d, grid: %d x %d\n", blockPDF.x, blockPDF.y,
26 |            gridPDF.x, gridPDF.y);
27 |     printf("design: %d x %d\n", design.data_per_block, design.params_per_block);
28 | 
29 |     printf("nparams: %d\n", params.rows);
30 | #endif
31 | 
32 |     mvnpdf_k<<<gridPDF, blockPDF, sharedMemSize>>>(data, params, design, d_pdf);
33 |     return cudaSuccess;
34 | }
35 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010 Duke University and collaborators
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are
6 | met:
7 | 
8 |     * Redistributions of source code must retain the above copyright
9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of the copyright holder nor the names of any
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ========
2 | GPUStats
3 | ========
4 | 
5 | gpustats is a PyCUDA-based library implementing functionality similar to that
6 | present in scipy.stats. It implements a simple framework for specifying new CUDA
7 | kernels and extending existing ones. Here is a (partial) list of target
8 | functionality:
9 | 
10 | * Probability density functions (pdfs). These are intended to speed up
11 |   likelihood calculations in particular in Bayesian inference applications, such
12 |   as in PyMC
13 | 
14 | * Random variable generation using CURAND
15 | 
16 | Requirements
17 | ------------
18 | 
19 | * NumPy
20 | * SciPy
21 | * Working PyCUDA (http://pypi.python.org/pypi/pycuda) installation
22 | * (optional) PyMC, for test suite
23 | 
24 | Installation and testing
25 | ------------------------
26 | 
27 | To install, run:
28 | 
29 | ::
30 | 
31 |     python setup.py install
32 | 
33 | If you have `nose` installed, you may run the test suite from Python by running:
34 | 
35 | ::
36 | 
37 |     import gpustats
38 |     gpustats.test()
39 | 
40 | Use
41 | ---
42 | 
43 | ::
44 | 
45 |     import gpustats
46 | 
47 | Some development guidelines
48 | ---------------------------
49 | 
50 | * Use spaces (4 per indent), not tabs
51 | * Trim whitespace at the end of lines (most text editors will do this for you)
52 | * PEP8-consistent Python style
53 | 
54 | People
55 | ------
56 | 
57 | Cliburn Chan cliburn.chan (at) duke.edu
58 | Andrew Cron ajc40 (at) stat.duke.edu
59 | Jacob Frelinger jacob.frelinger (at) duke.edu
60 | Wes McKinney wesmckinn (at) gmail.com
61 | Adam Richards adam.richards (at) duke.edu
62 | Marc Suchard msuchard (at) ucla.edu
63 | Quanli Wang quanli (at) stat.duke.edu
64 | Mike West mw (at) stat.duke.edu
65 | 
66 | Notes
67 | -----
68 | Requires working PyCUDA installation
69 | 
--------------------------------------------------------------------------------
/gpustats/kernels.py:
--------------------------------------------------------------------------------
1 | from gpustats.codegen import (MVDensityKernel, DensityKernel, Exp,
2 |                               CUFile)
3 | import gpustats.codegen as cg
4 | 
5 | # TODO: check for name conflicts!
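# Parameter layout assumed by the multivariate kernels below (a sketch based
# on pack_pdf_params in old/util.py, not a formal spec):
#
#   params[0:dim]                 mean vector
#   params[dim:dim*(dim+3)/2]     lower triangle of the (inverse) Cholesky
#                                 factor, packed row by row
#   params[dim*(dim+3)/2]         multiplier (packed as 1.0)
#   params[dim*(dim+3)/2 + 1]     log-determinant of the covariance
#
# so that, illustratively, one packed row could be built as
#   np.r_[mean, ichol[np.tril_indices(dim)], 1.0, logdet]
# before padding out to a multiple of 16 floats.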
6 | 
7 | _log_pdf_mvnormal = """
8 | __device__ float %(name)s(float* data, float* params, int dim) {
9 |   unsigned int LOGDET_OFFSET = dim * (dim + 3) / 2;
10 |   float* mean = params;
11 |   float* sigma = params + dim;
12 |   float mult = params[LOGDET_OFFSET];
13 |   float logdet = params[LOGDET_OFFSET + 1];
14 | 
15 |   float discrim = 0;
16 |   float sum;
17 |   unsigned int i, j;
18 |   for (i = 0; i < dim; ++i)
19 |   {
20 |     sum = 0;
21 |     for(j = 0; j <= i; ++j) {
22 |       sum += *sigma++ * (data[j] - mean[j]);
23 |     }
24 |     discrim += sum * sum;
25 |   }
26 |   return log(mult) - 0.5f * (discrim + logdet + LOG_2_PI * dim);
27 | }
28 | """
29 | log_pdf_mvnormal = MVDensityKernel('log_pdf_mvnormal', _log_pdf_mvnormal)
30 | pdf_mvnormal = Exp('pdf_mvnormal', log_pdf_mvnormal)
31 | 
32 | 
33 | _log_pdf_normal = """
34 | __device__ float %(name)s(float* x, float* params) {
35 |   // mean stored in params[0]
36 |   float std = params[1];
37 | 
38 |   // standardize
39 |   float xstd = (*x - params[0]) / std;
40 |   return - (xstd * xstd) / 2 - 0.5f * LOG_2_PI - log(std);
41 | }
42 | """
43 | log_pdf_normal = DensityKernel('log_pdf_normal', _log_pdf_normal)
44 | pdf_normal = Exp('pdf_normal', log_pdf_normal)
45 | 
46 | sample_discrete_old = CUFile('sample_discrete_old',
47 |                              'sample_discrete.cu')
48 | 
49 | sample_discrete_logged_old = CUFile('sample_discrete_logged_old',
50 |                                     'sample_discrete_logged.cu')
51 | 
52 | sample_discrete = CUFile('sample_discrete',
53 |                          'sampleFromMeasureMedium.cu')
54 | 
--------------------------------------------------------------------------------
/gpustats/cufiles/transpose.cu:
--------------------------------------------------------------------------------
1 | // Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises
2 | 
3 | // Transposition of a matrix
4 | // by Hendrik Riedmann
5 | // Andrew Cron added bounds checks ...
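// Sketch of the approach: each block stages a BLOCK_SIZE x BLOCK_SIZE tile
// of A in shared memory and writes it back transposed. The extra padding
// column in
//     __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];
// staggers rows across shared-memory banks so the strided accesses during
// write-back avoid bank conflicts.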
6 | 
7 | // Andrew Cron added Z grid dimension to X for larger matrices
8 | 
9 | #define BLOCK_SIZE %(block_size)d
10 | #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
11 | #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
12 | 
13 | __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
14 | {
15 |     int bidx = blockIdx.x + blockIdx.z;
16 |     // Base indices in A and A_t
17 |     int base_idx_a = bidx * BLOCK_SIZE +
18 |         blockIdx.y * A_BLOCK_STRIDE;
19 |     int base_idx_a_t = blockIdx.y * BLOCK_SIZE +
20 |         bidx * A_T_BLOCK_STRIDE;
21 | 
22 |     // Global indices in A and A_t
23 |     int glob_idx_a = base_idx_a + threadIdx.x + a_width * threadIdx.y;
24 |     int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;
25 | 
26 |     int a_x_pos = bidx * BLOCK_SIZE + threadIdx.x;
27 |     int a_y_pos = blockIdx.y * BLOCK_SIZE + threadIdx.y;
28 |     int at_x_pos = blockIdx.y * BLOCK_SIZE + threadIdx.x;
29 |     int at_y_pos = bidx * BLOCK_SIZE + threadIdx.y;
30 | 
31 |     __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];
32 | 
33 |     if( a_x_pos < a_width && a_y_pos < a_height ){
34 |         // Store transposed submatrix to shared memory
35 |         A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
36 |     }
37 |     __syncthreads();
38 |     if( at_x_pos < a_height && at_y_pos < a_width ){
39 |         // Write transposed submatrix to global memory
40 |         A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
41 |     }
42 | 
43 | }
44 | 
45 | 
46 | 
47 | 
--------------------------------------------------------------------------------
/old/scratch.py:
--------------------------------------------------------------------------------
1 | from numpy.random import randn
2 | from numpy.linalg import cholesky as chol
3 | import numpy as np
4 | import numpy.linalg as L
5 | import scipy.special as sp
6 | import pymc.flib as flib
7 | import time
8 | import testmod
9 | import util
10 | import pdb
11 | 
12 | def gen_testdata(n=100, k=4):
13 |     # use static data to compare to R
14 |     data = randn(n, k)
15 |     mean = randn(k)
16 | 
17 |     np.savetxt('test_data', data)
18 |     np.savetxt('test_mean', mean)
19 | 
20 | def load_testdata():
21 |     data = np.loadtxt('test_data')
22 |     mean = np.loadtxt('test_mean')
23 |     cov = np.cov(data.T)
24 | 
25 | 
26 |     return data, mean, cov
27 | 
28 | def bench(cpu_func, gpu_func, gruns=50):
29 |     """
30 |     Time gpu_func averaged over gruns runs against a single run of cpu_func.
31 |     """
32 | 
33 |     _s = time.clock()
34 |     for i in xrange(gruns):
35 |         gpu_func()
36 | 
37 |     gpu_speed = (time.clock() - _s) / gruns
38 | 
39 |     _s = time.clock()
40 |     cpu_func()
41 |     cpu_speed = (time.clock() - _s)
42 |     print 'CPU speed: %.3f' % (cpu_speed * 1000)
43 |     print 'GPU speed: %.3f' % (gpu_speed * 1000)
44 |     print cpu_speed / gpu_speed
45 | 
46 | if __name__ == '__main__':
47 |     testmod.set_device(0)
48 | 
49 |     n = int(1e3)
50 |     k = 16
51 | 
52 |     data = randn(n, k).astype(np.float32)
53 |     mean = randn(k)
54 |     cov = np.array(util.random_cov(k), dtype=np.float32)
55 | 
56 |     j = 32
57 | 
58 |     padded_data = util.pad_data(data)
59 | 
60 |     chol_sigma = chol(cov)
61 |     ichol_sigma = L.inv(chol_sigma)
62 |     logdet = np.log(np.linalg.det(cov))
63 | 
64 |     means = (mean,) * j
65 |     covs = (ichol_sigma,) * j
66 |     logdets = (logdet,) * j
67 | 
68 |     packed_params = util.pack_params(means, covs, logdets)
69 | 
70 |     cpu_func = lambda: testmod.cpu_mvnpdf(padded_data, packed_params, k).squeeze()
71 |     gpu_func = lambda: testmod._mvnpdf(padded_data, packed_params, k).squeeze()
72 | 
73 |     print cpu_func()
74 |     print gpu_func()
75 | 
76 |     # bench(cpu_func, gpu_func, gruns=50)
77 | 
--------------------------------------------------------------------------------
/old/cucommon.h:
--------------------------------------------------------------------------------
1 | /*
2 |    Common functions for GPUStats CUDA kernels and interface functions
3 | 
4 |  */
5 | #ifndef __CUCOMMON_H__
6 | #define __CUCOMMON_H__
7 | 
8 | #ifdef __cplusplus
9 | extern "C" {
10 | #endif
11 | 
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | #include <cuda_runtime.h>
15 | 
16 | int smem_size() {
17 |     int dev = 0;
18 |     cudaDeviceProp deviceProp;
19 |     cudaGetDeviceProperties(&deviceProp, dev);
20 |     return deviceProp.sharedMemPerBlock;
21 | }
22 | 
23 | int max_block_threads() {
24 |     int dev = 0;
25 |     cudaDeviceProp deviceProp;
26 |     cudaGetDeviceProperties(&deviceProp, dev);
27 |     return deviceProp.maxThreadsPerBlock;
28 | }
29 | 
30 | // Simple strided matrix data structure, far as I can tell there's little or no
31 | // overhead in the compiled version.
32 | typedef struct PMatrix {
33 |     float* buf; // C-style row-major data
34 |     int rows; // actual number of rows
35 |     int cols; // actual number of columns
36 |     int stride; // data length of row
37 | } PMatrix;
38 | 
39 | void PMatrix_init(PMatrix* mat, float* data, int rows, int cols, int stride){
40 |     mat->buf = data;
41 |     mat->rows = rows;
42 |     mat->cols = cols;
43 |     mat->stride = stride;
44 | }
45 | 
46 | typedef struct {
47 |     int data_per_block;
48 |     int params_per_block;
49 | } BlockDesign;
50 | 
51 | int next_pow2(int k, int pow2) {
52 |     // next highest power of two
53 |     while (k <= pow2 / 2) pow2 /= 2;
54 |     return pow2;
55 | }
56 | 
57 | int get_boxes(int n, int box_size) {
58 |     // how many boxes of size box_size are needed to hold n things
59 |     return (n + box_size - 1) / box_size;
60 | }
61 | 
62 | void inline h_to_d(float* h_ptr, float* d_ptr, size_t n){
63 |     cudaError_t error;
64 |     CATCH_ERR(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float), cudaMemcpyHostToDevice));
65 | }
66 | 
67 | void inline d_to_h(float* d_ptr, float* h_ptr, size_t n){
68 |     cudaError_t error;
69 |     CATCH_ERR(cudaMemcpy(h_ptr, d_ptr, n * sizeof(float), cudaMemcpyDeviceToHost));
70 | }
71 | 
72 | #ifdef __cplusplus
73 | }
74 | #endif
75 | 
76 | #endif // __CUCOMMON_H__
77 | 
--------------------------------------------------------------------------------
/old/common.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <math.h>
4 | #include <cuda_runtime.h>
5 | #include "common.h"
6 | 
7 | #if _WIN32
8 |     #define isnan(x) ((x) != (x))
9 | #endif
10 | 
11 | void set_device(int device) {
12 |     cudaError_t error;
13 |     CATCH_ERR(cudaSetDevice(device));
14 | }
15 | 
16 | REAL *allocateGPURealMemory(int length) {
17 | #ifdef DEBUG
18 |     fprintf(stderr,"Entering ANMA-Real\n");
19 | #endif
20 | 
21 |     REAL *data;
22 |     cudaError_t error;
23 |     SAFE_CUDA(cudaMalloc((void**) &data, SIZE_REAL * length),data);
24 |     if (data == NULL) {
25 |         fprintf(stderr,"Failed to allocate REAL (%d) memory on device!\n",
26 |                 length);
27 |         // TODO clean up and gracefully die
28 |         exit(-1);
29 |     }
30 | 
31 | #ifdef DEBUG
32 |     fprintf(stderr,"Allocated %d to %d.\n",data,(data +length));
33 |     fprintf(stderr,"Leaving ANMA\n");
34 | #endif
35 | 
36 |     return data;
37 | }
38 | 
39 | INT *allocateGPUIntMemory(int length) {
40 | 
41 | #ifdef DEBUG
42 |     fprintf(stderr,"Entering ANMA-Int\n");
43 | #endif
44 | 
45 |     INT *data;
46 |     cudaError_t error;
47 |     SAFE_CUDA(cudaMalloc((void**) &data, SIZE_INT * length),data);
48 |     if (data == NULL) {
49 |         fprintf(stderr,"Failed to allocate INT memory on device!\n");
50 |         exit(-1);
51 |     }
52 | 
53 | #ifdef DEBUG
54 |     fprintf(stderr,"Allocated %d to %d.\n",data,(data+length));
55 |     fprintf(stderr,"Leaving ANMA\n");
56 | #endif
57 | 
58 |     return data;
59 | }
60 | 
61 | void freeGPUMemory(void *ptr) {
62 | 
63 | #ifdef DEBUG
64 |     fprintf(stderr,"Entering FNMA\n");
65 | #endif
66 | 
67 |     if (ptr != 0) {
68 |         cudaFree(ptr);
69 |     }
70 | 
71 | #ifdef DEBUG
72 |     fprintf(stderr,"Leaving FNMA\n");
73 | #endif
74 | }
75 | 
76 | void storeGPURealMemoryArray(REAL *toGPUPtr, REAL *fromGPUPtr, int length) {
77 |     cudaError_t error;
78 |     SAFE_CUDA(cudaMemcpy(toGPUPtr, fromGPUPtr, SIZE_REAL*length, cudaMemcpyDeviceToDevice),toGPUPtr);
79 | }
80 | 
81 | void storeGPUIntMemoryArray(INT *toGPUPtr, INT *fromGPUPtr, int length) {
82 |     cudaError_t error;
83 |     SAFE_CUDA(cudaMemcpy(toGPUPtr, fromGPUPtr, SIZE_INT*length, cudaMemcpyDeviceToDevice),toGPUPtr);
84 | }
85 | 
--------------------------------------------------------------------------------
/gpustats/cufiles/univcaller.cu:
--------------------------------------------------------------------------------
1 | /*
2 |    Automatically-generated kernel for %(name)s
3 | 
4 |    For univariate distributions
5 |  */
6 | 
7 | __global__ void k_%(name)s(float* output,
8 |                            float* data,
9 |                            float* params,
10 |                            int data_per_block,
11 |                            int params_per_block,
12 |                            int nobs,
13 |                            int nparams,
14 |                            int params_stride) {
15 | 
16 |     unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
17 | 
18 |     unsigned int rel_param = tid / data_per_block;
19 |     unsigned int rel_data = tid - rel_param * data_per_block;
20 | 
21 |     unsigned int obs_num = data_per_block * blockIdx.x + rel_data;
22 |     unsigned int param_num = params_per_block * blockIdx.y + rel_param;
23 | 
24 |     // set up shared data
25 |     extern __shared__ float shared_data[];
26 |     float* sh_params = shared_data;
27 |     float* sh_data = sh_params + params_per_block * params_stride;
28 |     float* sh_result = sh_data + data_per_block;
29 | 
30 |     copy_chunks(data + data_per_block * blockIdx.x,
31 |                 sh_data, tid,
32 |                 min(nobs - data_per_block * blockIdx.x,
33 |                     data_per_block));
34 | 
35 |     copy_chunks(params + params_per_block * blockIdx.y * params_stride,
36 |                 sh_params, tid,
37 |                 min(params_per_block,
38 |                     nparams - params_per_block * blockIdx.y) * params_stride);
39 | 
40 |     __syncthreads();
41 | 
42 |     // allocated enough shared memory so that this will not walk out of bounds
43 |     // no matter what, though some of the results will be garbage
44 |     sh_result[tid] = %(name)s(sh_data + rel_data,
45 |                               sh_params + rel_param * params_stride);
46 |     __syncthreads();
47 | 
48 |     unsigned int result_idx = nobs * param_num + obs_num;
49 | 
50 |     // output is column-major, so this will then coalesce
51 |     if (obs_num < nobs & param_num < nparams) {
52 |         // output[result_idx] = obs_num;
53 |         output[result_idx] = sh_result[tid];
54 |     }
55 | }
56 | 
--------------------------------------------------------------------------------
/gpustats/__init__.py:
--------------------------------------------------------------------------------
1 | from pdfs import *
2 | 
3 | from numpy import errstate
4 | from numpy.testing import Tester
5 | class NoseWrapper(Tester):
6 |     '''
7 |     This is simply a monkey patch for numpy.testing.Tester.
8 | 
9 |     It allows extra_argv to be changed from its default None to ['--exe'] so
10 |     that the tests can be run the same across platforms. It also takes kwargs
11 |     that are passed to numpy.errstate to suppress floating point warnings.
12 |     '''
13 |     def test(self, label='fast', verbose=1, extra_argv=['--exe'], doctests=False,
14 |              coverage=False, **kwargs):
15 |         ''' Run tests for module using nose
16 | 
17 |         %(test_header)s
18 |         doctests : boolean
19 |             If True, run doctests in module, default False
20 |         coverage : boolean
21 |             If True, report coverage of NumPy code, default False
22 |             (Requires the coverage module:
23 |              http://nedbatchelder.com/code/modules/coverage.html)
24 |         kwargs
25 |             Passed to numpy.errstate. See its documentation for details.
26 |         '''
27 | 
28 |         # cap verbosity at 3 because nose becomes *very* verbose beyond that
29 |         verbose = min(verbose, 3)
30 | 
31 |         from numpy.testing import utils
32 |         utils.verbose = verbose
33 | 
34 |         if doctests:
35 |             print "Running unit tests and doctests for %s" % self.package_name
36 |         else:
37 |             print "Running unit tests for %s" % self.package_name
38 | 
39 |         self._show_system_info()
40 | 
41 |         # reset doctest state on every run
42 |         import doctest
43 |         doctest.master = None
44 | 
45 |         argv, plugins = self.prepare_test_args(label, verbose, extra_argv,
46 |                                                doctests, coverage)
47 |         from numpy.testing.noseclasses import NumpyTestProgram
48 |         from warnings import simplefilter #, catch_warnings
49 |         with errstate(**kwargs):
50 |             ## with catch_warnings():
51 |             simplefilter('ignore', category=DeprecationWarning)
52 |             t = NumpyTestProgram(argv=argv, exit=False, plugins=plugins)
53 |         return t.result
54 | test = NoseWrapper().test
55 | 
--------------------------------------------------------------------------------
/old/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pymc.distributions as pymc_dist
3 | 
4 | PAD_MULTIPLE = 16
5 | HALF_WARP = 16
6 | 
7 | def random_cov(dim):
8 |     return pymc_dist.rinverse_wishart(dim, np.eye(dim))
9 | 
10 | def unvech(v):
11 |     # quadratic formula, correct fp error
12 |     rows = .5 * (-1 + np.sqrt(1 + 8 * len(v)))
13 |     rows = int(np.round(rows))
14 | 
15 |     result = np.zeros((rows, rows))
16 |     result[np.triu_indices(rows)] = v
17 |     result = result + result.T
18 | 
19 |     # divide diagonal elements by 2
20 |     result[np.diag_indices(rows)] /= 2
21 | 
22 |     return result
23 | 
24 | def next_multiple(k, p):
25 |     if k % p:
26 |         return k + (p - k % p)
27 | 
28 |     return k
29 | 
30 | def pad_data(data):
31 |     """
32 |     Pad data to avoid bank conflicts on the GPU-- dimension should not be a
33 |     multiple of the half-warp size (16)
34 |     """
35 |     n, k = data.shape
36 | 
37 |     if not k % HALF_WARP:
38 |         pad_dim = k + 1
39 |     else:
40 |         pad_dim = k
41 | 
42 |     if k != pad_dim:
43 |         padded_data = np.empty((n, pad_dim), dtype=np.float32)
44 |         padded_data[:, :k] = data
45 | 
46 |         return padded_data
47 |     else:
48 |         return prep_ndarray(data)
49 | 
50 | def prep_ndarray(arr):
51 |     # is float32 and contiguous?
52 |     if not arr.dtype == np.float32 or not arr.flags.contiguous:
53 |         arr = np.array(arr, dtype=np.float32)
54 | 
55 |     return arr
56 | 
57 | def pack_params(means, chol_sigmas, logdets):
58 |     to_pack = []
59 |     for m, ch, ld in zip(means, chol_sigmas, logdets):
60 |         to_pack.append(pack_pdf_params(m, ch, ld))
61 | 
62 |     return np.vstack(to_pack)
63 | 
64 | def pack_pdf_params(mean, chol_sigma, logdet):
65 |     '''
66 |     Pack the parameters of one density into a single padded vector:
67 |     [mean (k) | lower triangle of chol_sigma (k*(k+1)/2) | 1.0 | logdet]
68 |     '''
69 |     k = len(mean)
70 |     mean_len = k
71 |     chol_len = k * (k + 1) / 2
72 |     mch_len = mean_len + chol_len
73 | 
74 |     packed_dim = next_multiple(mch_len + 2, PAD_MULTIPLE)
75 | 
76 |     packed_params = np.empty(packed_dim, dtype=np.float32)
77 |     packed_params[:mean_len] = mean
78 | 
79 |     packed_params[mean_len:mch_len] = chol_sigma[np.tril_indices(k)]
80 |     packed_params[mch_len:mch_len + 2] = 1, logdet
81 | 
82 |     return packed_params
83 | 
--------------------------------------------------------------------------------
/gpustats/cufiles/mvcaller.cu:
--------------------------------------------------------------------------------
1 | /*
2 |    Automatically-generated kernel for %(name)s
3 | 
4 |    For multivariate distributions, coordinates to utilize shared memory
5 | 
6 |    TODO: How to avoid bank conflicts
7 |    TODO: How to ensure coalescence
8 |  */
9 | 
10 | __global__ void k_%(name)s(float* g_output,
11 |                            float* g_data,
12 |                            float* g_params,
13 |                            int data_per_block,
14 |                            int params_per_block,
15 |                            int data_rows,
16 |                            int data_stride,
17 |                            int data_cols,
18 |                            int params_rows,
19 |                            int params_stride) {
20 | 
21 |     unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
22 | 
23 |     unsigned int rel_param = tid / data_per_block;
24 |     unsigned int rel_data = tid - rel_param * data_per_block;
25 | 
26 |     unsigned int obs_num = data_per_block * blockIdx.x + rel_data;
27 |     unsigned int param_num = params_per_block * blockIdx.y + rel_param;
28 | 
29 |     // set up shared data
30 |     extern __shared__ float shared_data[];
31 |     float* sh_params = shared_data;
32 |     float* sh_data = sh_params + params_per_block * params_stride;
33 |     float* sh_result = sh_data + data_per_block * data_stride;
34 | 
35 |     copy_chunks(g_data + data_per_block * blockIdx.x * data_stride,
36 |                 sh_data, tid,
37 |                 min(data_rows - data_per_block * blockIdx.x,
38 |                     data_per_block) * data_stride);
39 | 
40 |     copy_chunks(g_params + params_per_block * blockIdx.y * params_stride,
41 |                 sh_params, tid,
42 |                 min(params_per_block,
43 |                     params_rows - params_per_block * blockIdx.y) * params_stride);
44 | 
45 |     __syncthreads();
46 | 
47 |     // allocated enough shared memory so that this will not walk out of bounds
48 |     // no matter what, though some of the results will be garbage
49 |     sh_result[tid] = %(name)s(sh_data + rel_data * data_stride,
50 |                               sh_params + rel_param * params_stride,
51 |                               data_cols);
52 |     __syncthreads();
53 | 
54 |     unsigned int result_idx = data_rows * param_num + obs_num;
55 |     // unsigned int result_idx = obs_num * data_cols + param_num
56 | 
57 |     // g_output is column-major, so this will then coalesce
58 |     if (obs_num < data_rows & param_num < params_rows) {
59 |         g_output[result_idx] = sh_result[tid];
60 |     }
61 | }
62 | 
63 | // foo
64 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from numpy.distutils.misc_util import Configuration
4 | from numpy.distutils.core import setup
5 | 
6 | DESCRIPTION = "GPU-based statistical functions"
7 | LONG_DESCRIPTION = """
8 | gpustats is a PyCUDA-based library implementing functionality similar to that
9 | present in scipy.stats. It implements a simple framework for specifying new CUDA
10 | kernels and extending existing ones. Here is a (partial) list of target
11 | functionality:
12 | 
13 | * Probability density functions (pdfs). These are intended to speed up
14 |   likelihood calculations in particular in Bayesian inference applications, such
15 |   as in PyMC
16 | 
17 | * Random variable generation using CURAND
18 | 
19 | Notes
20 | -----
21 | Requires working PyCUDA installation
22 | """
23 | 
24 | REQUIRES = ['numpy', 'pycuda >= 0.94rc']
25 | DISTNAME = 'gpustats'
26 | LICENSE = 'BSD'
27 | AUTHOR = "Wes McKinney"
28 | AUTHOR_EMAIL = "wesmckinn@gmail.com"
29 | URL = "https://github.com/dukestats/gpustats"
30 | CLASSIFIERS = [
31 |     'Development Status :: 2 - Pre-Alpha',
32 |     'Environment :: Console',
33 |     'Operating System :: OS Independent',
34 |     'Intended Audience :: Science/Research',
35 |     'Programming Language :: Python',
36 |     'Topic :: Scientific/Engineering',
37 | ]
38 | 
39 | MAJOR = 0
40 | MINOR = 0
41 | MICRO = 1
42 | ISRELEASED = True
43 | VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
44 | 
45 | FULLVERSION = VERSION
46 | if not ISRELEASED:
47 |     FULLVERSION += '.beta'
48 | 
49 | def configuration(parent_package='', top_path=None):
50 |     config = Configuration(None, parent_package, top_path,
51 |                            version=FULLVERSION)
52 |     config.set_options(ignore_setup_xxx_py=True,
53 |                        assume_default_configuration=True,
54 |                        delegate_options_to_subpackages=True,
55 |                        quiet=True)
56 | 
57 |     config.add_subpackage('gpustats')
58 |     config.add_data_dir('gpustats/tests')
59 |     config.add_data_dir('gpustats/cufiles')
60 |     return config
61 | 
62 | if __name__ == '__main__':
63 |     setup(name=DISTNAME,
64 |           author=AUTHOR,
65 |           author_email=AUTHOR_EMAIL,
66 |           description=DESCRIPTION,
67 |           license=LICENSE,
68 |           url=URL,
69 |           long_description=LONG_DESCRIPTION,
70 |           classifiers=CLASSIFIERS,
71 |           platforms='any',
72 |           configuration=configuration)
73 | 
--------------------------------------------------------------------------------
/gpustats/tests/test_samplers.py:
--------------------------------------------------------------------------------
1 | import nose
2 | import sys
3 | import unittest
4 | 
5 | from numpy.random import rand
6 | from numpy.linalg import inv, cholesky as chol
7 | from numpy.testing import assert_almost_equal, assert_equal
8 | import numpy as np
9 | 
10 | import scipy.stats as sp_stats
11 | 
12 | import gpustats as gps
13 | import gpustats.sampler as gpusamp
14 | import gpustats.compat as compat
15 | import gpustats.util as util
16 | 
17 | DECIMAL_6 = 6
18 | DECIMAL_5 = 5
19 | DECIMAL_4 = 4
20 | DECIMAL_3 = 3
21 | DECIMAL_2 = 2
22 | DECIMAL_1 = 1
23 | 
24 | np.set_printoptions(suppress=True)
25 | 
26 | def _make_test_densities(n=10000, k=4):
27 |     dens = rand(k)
28 |     densities = [dens.copy() for _ in range(n)]
29 |     return np.asarray(densities)
30 |     #return (densities.T - densities.sum(1)).T
31 | 
32 | def _compare_discrete(n, k):
33 |     densities = _make_test_densities(n, k)
34 |     dens = densities[0,:].copy() / densities[0,:].sum()
35 |     expected_mu = np.dot(np.arange(k), dens)
36 | 
37 |     labels = gpusamp.sample_discrete(densities, logged=False)
38 |     est_mu = labels.mean()
39 |     return est_mu, expected_mu
40 | 
41 | def _compare_logged(n, k):
42 |     densities = np.log(_make_test_densities(n, k))
43 |     dens = np.exp((densities[0,:] - densities[0,:].max()))
44 |     dens = dens / dens.sum()
45 |     expected_mu = np.dot(np.arange(k), dens)
46 | 
47 |     labels = gpusamp.sample_discrete(densities, logged=True)
48 |     est_mu = labels.mean()
49 |     return est_mu, expected_mu
50 | 
51 | 
52 | class TestDiscreteSampler(unittest.TestCase):
53 |     test_cases = [(100000, 4),
54 |                   (100000, 9),
55 |                   (100000, 16),
56 |                   (100000, 20),
57 |                   (1000000, 35)]
58 | 
59 |     def _check_discrete(self, n, k):
60 |         a, b = _compare_discrete(n, k)
61 |         assert_almost_equal(a, b, DECIMAL_1)
62 | 
63 |     def _check_logged(self, n, k):
64 |         a, b = _compare_logged(n, k)
65 |         assert_almost_equal(a, b, DECIMAL_1)
66 | 
67 |     def test_discrete(self):
68 |         for n, k in self.test_cases:
69 |             self._check_discrete(n, k)
70 | 
71 |     def test_logged(self):
72 |         for n, k in self.test_cases:
73 |             self._check_logged(n, k)
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     print 'starting sampler'
78 |     a, b = _compare_logged(1000000, 35)
79 |     print a
80 |     print b
81 | 
82 | 
83 | 
84 | 
--------------------------------------------------------------------------------
/gpustats/cufiles/sample_discrete.cu:
--------------------------------------------------------------------------------
1 | /*
2 |    Block layout : npmfs x nhelpers
3 |    Grid layout : K x 1
4 |    K is the smallest number such that K * npmfs >= pmf_rows
5 |  */
6 | 
7 | __global__ void
8 | k_%(name)s(float* g_pmf, /** Precomputed pmf */
9 |            float* g_urand, /** Precomputed random number */
10 |            float* g_output, /** Resultant choice */
11 |            int pmf_rows,
12 |            int pmf_cols,
13 |            int pmf_stride,
14 |            int sh_stride
15 |            ) {
16 |     // blockDim.x = number of pmfs sampled from in this block
17 |     // blockDim.y = number of helper threads per pmf
18 |     unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
19 |     unsigned int thidx = threadIdx.x;
20 |     unsigned int npmfs = blockDim.x;
21 | 
22 |     // Make block size flexible ...
23 |     extern __shared__ float shared_data[];
24 | 
25 |     float* sh_pmf = shared_data; // npmfs * sh_stride floats
26 |     float* sh_work = sh_pmf + npmfs * sh_stride; // nmpfs floats
27 | 
28 |     // Move pmf data into shared memory
29 |     copy_chunks_strided(g_pmf + npmfs * pmf_stride * blockIdx.x,
30 |                         sh_pmf, tid, pmf_stride,
31 |                         min(npmfs, pmf_rows - npmfs * blockIdx.x),
32 |                         sh_stride);
33 |     __syncthreads();
34 | 
35 |     // move uniform random draws into shared memory
36 |     copy_chunks(g_urand + npmfs * blockIdx.x,
37 |                 sh_work, tid,
38 |                 min(npmfs, pmf_rows - npmfs * blockIdx.x));
39 |     __syncthreads();
40 | 
41 |     // done copying, now move pointer to start of pmf for this row of threads
42 |     sh_pmf = sh_pmf + thidx * sh_stride;
43 | 
44 |     // compute normalizing constant using atomic operators?
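    // The single y == 0 thread per pmf below implements an inverse-CDF draw:
    // normalize by norm_const, accumulate the running sum
    // c_i = c_{i-1} + p_i / norm_const, and emit the first index i with
    // c_i >= draw.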
45 | 
46 |     // for(int chunk = 0; chunk + thidx < pmf_cols; chunk += blockDim.x) {
47 |     //     atomic_add(sh_work + thidy, sh_pmf[chunk + thidx]);
48 |     // }
49 | 
50 |     if (threadIdx.y == 0 && thidx < pmf_rows - npmfs * blockIdx.x) {
51 |         float norm_const = 0;
52 |         for (int i = 0; i < pmf_cols; ++i) {
53 |             norm_const += sh_pmf[i];
54 |         }
55 | 
56 |         float draw = sh_work[thidx];
57 | 
58 |         // replace with scaled cumulative pdf
59 |         sh_pmf[0] /= norm_const;
60 |         sh_work[thidx] = 0;
61 |         if (sh_pmf[0] < draw) {
62 |             for(int i = 1; i < pmf_cols; i++) {
63 |                 sh_pmf[i] = sh_pmf[i-1] + sh_pmf[i] / norm_const;
64 |                 if (sh_pmf[i] >= draw) {
65 |                     sh_work[thidx] = i;
66 |                     break;
67 |                 }
68 |             }
69 |         }
70 |     }
71 |     __syncthreads();
72 | 
73 |     // this is now coalesced
74 |     unsigned int result_id = blockIdx.x * npmfs + tid;
75 |     if (result_id < pmf_rows && tid < npmfs)
76 |         g_output[result_id] = sh_work[tid];
77 | 
78 |     return;
79 | }
80 | 
--------------------------------------------------------------------------------
/gpustats/cufiles/sample_discrete_logged.cu:
--------------------------------------------------------------------------------
1 | __global__ void
2 | k_%(name)s(float* g_pmf, /** Precomputed logged pmf */
3 |            float* g_urand, /** Precomputed random number */
4 |            float* g_output, /** Resultant choice */
5 |            int pmf_rows,
6 |            int pmf_cols,
7 |            int pmf_stride,
8 |            int sh_stride
9 |            ) {
10 | 
11 |     // blockDim.x = number of pmfs sampled from in this block
12 |     // blockDim.y = number of helper threads per pmf
13 |     unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
14 |     unsigned int thidx = threadIdx.x;
15 |     unsigned int npmfs = blockDim.x;
16 | 
17 |     // Make block size flexible ...
18 |     extern __shared__ float shared_data[];
19 | 
20 |     float* sh_pmf = shared_data; // npmfs * sh_stride floats
21 |     float* sh_work = sh_pmf + npmfs * sh_stride; // nmpfs floats
22 | 
23 |     // Move pmf data into shared memory
24 |     copy_chunks_strided(g_pmf + npmfs * pmf_stride * blockIdx.x,
25 |                         sh_pmf, tid, pmf_stride,
26 |                         min(npmfs, pmf_rows - npmfs * blockIdx.x),
27 |                         sh_stride);
28 |     __syncthreads();
29 | 
30 |     // move uniform random draws into shared memory
31 |     copy_chunks(g_urand + npmfs * blockIdx.x,
32 |                 sh_work, tid,
33 |                 min(npmfs, pmf_rows - npmfs * blockIdx.x));
34 |     __syncthreads();
35 | 
36 |     // done copying, now move pointer to start of pmf for this row of threads
37 |     sh_pmf = sh_pmf + thidx * sh_stride;
38 | 
39 |     if (threadIdx.y == 0 && thidx < pmf_rows - npmfs * blockIdx.x) {
40 |         // get max
41 |         float pmf_max = sh_pmf[0]; float cur_val = 0;
42 |         for (int i = 1; i < pmf_cols; ++i){
43 |             cur_val = sh_pmf[i];
44 |             pmf_max = fmax(pmf_max, cur_val);
45 |             //pmf_max = ((pmf_max < cur_val) : (cur_val) , (pmf_max));
46 |         }
47 | 
48 |         // subtract max and exponentiate
49 |         float norm_const = 0;
50 |         for (int i = 0; i < pmf_cols; ++i) {
51 |             sh_pmf[i] = expf(sh_pmf[i] - pmf_max);
52 |             norm_const += sh_pmf[i];
53 |         }
54 | 
55 |         float draw = sh_work[thidx];
56 | 
57 |         // replace with scaled cumulative pdf
58 |         sh_pmf[0] /= norm_const;
59 |         sh_work[thidx] = 0;
60 |         if (sh_pmf[0] < draw) {
61 |             for(int i = 1; i < pmf_cols; i++) {
62 |                 sh_pmf[i] = sh_pmf[i-1] + sh_pmf[i] / norm_const;
63 |                 if (sh_pmf[i] >= draw) {
64 |                     sh_work[thidx] = i;
65 |                     break;
66 |                 }
67 |             }
68 |         }
69 | 
70 |         // write
71 |         g_output[blockIdx.x*npmfs + thidx] = sh_work[thidx];
72 | 
73 |     }
74 |     // __syncthreads();
75 | 
76 |     // this is now coalesced
77 |     // unsigned int result_id = blockIdx.x * npmfs + tid;
78 |     // if (result_id < pmf_rows && tid < npmfs)
79 |     //     g_output[result_id] = sh_work[tid];
80 | 
81 | }
82 | 
83 | 
84 | 
85 | 
86 | 
--------------------------------------------------------------------------------
/old/Makefile:
--------------------------------------------------------------------------------
1 | CUDA_PATH = /usr/local/cuda
2 | 
3 | # compilers
4 | CC := gcc
5 | CXX = g++
6 | NVCC = $(CUDA_PATH)/bin/nvcc
7 | NVCC_DBG_FLAGS = -Xcompiler -fno-strict-aliasing,-fPIC
8 | 
9 | INCPATH =
10 | # compiler / linker flags
11 | CCFLAGS = -fPIC -g -Wall
12 | 
13 | # linker flags
14 | LINKFLAGS = -L. -lgpustats
15 | 
16 | LIBPATH =
17 | NVCCFLAGS = $(NVCC_DBG_FLAGS)
18 | 
19 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
20 | OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
21 | OSNAME := $(shell uname)
22 | OSARCH = $(shell uname -m)
23 | 
24 | ifeq ($(OSNAME),Linux)
25 |     CUDA_SDK_PATH := $(HOME)/cuda_sdk
26 |     CUDA_LIB = -L$(CUDA_PATH)/lib64
27 | endif
28 | 
29 | # OS X thinks it's i386
30 | # 'linux' is output for Linux system, 'darwin' for OS X
31 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
32 | 
33 | ifeq ($(OSNAME),Darwin)
34 |     CUDA_SDK_PATH := /Developer/CUDA/C
35 |     CUDA_LIB = -L$(CUDA_PATH)/lib
36 | endif
37 | 
38 | CC_ARCH_FLAGS :=
39 | # NVCCFLAGS :=
40 | 
41 | LIB_ARCH = x86_64
42 | CUDPPLIB_SUFFIX = x86_64
43 | NVCCFLAGS += -m64
44 | CXX_ARCH_FLAGS += -m64
45 | 
46 | # ifeq ($(OSNAME),Darwin)
47 | #     NVCCFLAGS += -m32
48 | #     LIB_ARCH = i386
49 | #     CC_ARCH_FLAGS += -arch i386
50 | # else
51 | #     LIB_ARCH = x86_64
52 | #     CUDPPLIB_SUFFIX = x86_64
53 | #     NVCCFLAGS += -m64
54 | #     ifneq ($(DARWIN),)
55 | #         CXX_ARCH_FLAGS += -arch x86_64
56 | #     else
57 | #         CXX_ARCH_FLAGS += -m64
58 | #     endif
59 | # endif
60 | 
61 | CCFLAGS += $(CC_ARCH_FLAGS)
62 | 
63 | CUDA_INC = -I$(CUDA_PATH)/include
64 | CUDA_SDK_COMMONDIR = $(CUDA_SDK_PATH)/common
65 | CUDA_SDK_INC = -I$(CUDA_SDK_COMMONDIR)/inc
66 | CUDA_LIB += -L$(CUDA_SDK_PATH)/lib -L$(CUDA_SDK_COMMONDIR)/lib -lcuda -lcudart -lcublas
67 | 
68 | EXECUTABLE := test
69 | CUFILES = mvnpdf.cu
70 | CU_DEPS = common.h
71 | CFILES := common.c
72 | USECUBLAS := 1
73 | OBJDIR = obj
74 | LIBDIR = lib
75 | TARGETDIR = .
76 | TARGET := $(TARGETDIR)/$(EXECUTABLE)
77 | 
78 | OBJS += $(patsubst %.c,%.o,$(notdir $(CFILES)))
79 | OBJS += $(patsubst %.cu,%.cu_o,$(notdir $(CUFILES)))
80 | 
81 | VERBOSE := -
82 | 
83 | # need to use g++ to link on OS X?
84 | 
85 | libgpustats.so: makedirs $(OBJS)
86 | 	$(CXX) $(CC_ARCH_FLAGS) -shared -Wl,-soname,libgpustats.so -o libgpustats.so $(OBJS) -lc $(CUDA_LIB)
87 | 
88 | runpy: cython
89 | 	LD_LIBRARY_PATH=.:$(LD_LIBRARY_PATH) python scratch.py
90 | 
91 | ipython: cython
92 | 	LD_LIBRARY_PATH=.:$(LD_LIBRARY_PATH) ipython
93 | 
94 | test: libgpustats.so
95 | 	$(VERBOSE)$(CC) $(CC_ARCH_FLAGS) -std=c99 test.c -o test $(CUDA_INC) $(LINKFLAGS)
96 | 
97 | cython: libgpustats.so cytest.pyx build_cython.py
98 | 	-python build_cython.py build_ext --inplace
99 | 
100 | makedirs:
101 | 	$(VERBOSE)mkdir -p $(LIBDIR)
102 | 	$(VERBOSE)mkdir -p $(OBJDIR)
103 | 	$(VERBOSE)mkdir -p $(TARGETDIR)
104 | 
105 | clean:
106 | 	-rm -rf *.so *.o *.cu_o build/
107 | 
108 | #### CUDA files
109 | 
110 | %.o: %.c
111 | 	$(VERBOSE)$(CC) $(CCFLAGS) -c $*.c -o $@ $(INCPATH) $(CUDA_INC) $(CUDA_SDK_INC)
112 | 
113 | %.c_o : %.c
114 | 	$(CC) $(PROFILE) -c $< -o $@
115 | 
116 | %.cu_o : %.cu $(CUDA_HEADERS) $(CU_DEPS)
117 | 	$(VERBOSE)$(NVCC) $(NVCCFLAGS) -c $< -o $@ -I. $(INCPATH) $(CUDA_INC) $(CUDA_SDK_INC) -DUNIX
118 | 
--------------------------------------------------------------------------------
/gpustats/tests/test_pdfs.py:
--------------------------------------------------------------------------------
1 | import nose
2 | import sys
3 | import unittest
4 | 
5 | from numpy.random import randn
6 | from numpy.linalg import inv, cholesky as chol
7 | from numpy.testing import assert_almost_equal, assert_equal
8 | import numpy as np
9 | 
10 | import scipy.stats as sp_stats
11 | 
12 | import gpustats as gps
13 | import gpustats.compat as compat
14 | import gpustats.util as util
15 | 
16 | DECIMAL_6 = 6
17 | DECIMAL_5 = 5
18 | DECIMAL_4 = 4
19 | DECIMAL_3 = 3
20 | DECIMAL_2 = 2
21 | 
22 | np.set_printoptions(suppress=True)
23 | 
24 | def _make_test_case(n=1000, k=4, p=1):
25 |     data = randn(n, k)
26 |     covs = [util.random_cov(k) for _ in range(p)]
27 |     means = [randn(k) for _ in range(p)]
28 |     return data, means, covs
29 | 
30 | # debugging...
31 | 
32 | def _compare_multi(n, k, p):
33 |     data, means, covs = _make_test_case(n, k, p)
34 | 
35 |     # cpu in PyMC
36 |     pyresult = compat.python_mvnpdf(data, means, covs)
37 | 
38 |     # gpu
39 |     result = gps.mvnpdf_multi(data, means, covs)
40 | 
41 |     return result, pyresult
42 | 
43 | def _compare_single(n, k):
44 |     data, means, covs = _make_test_case(n, k, 1)
45 | 
46 |     mean = means[0]
47 |     cov = covs[0]
48 | 
49 |     # cpu in PyMC
50 |     pyresult = compat.python_mvnpdf(data, [mean], [cov]).squeeze()
51 |     # gpu
52 | 
53 |     result = gps.mvnpdf(data, mean, cov)
54 |     return result, pyresult
55 | 
56 | class TestMVN(unittest.TestCase):
57 |     # ndata, dim, ncomponents
58 |     test_cases = [(1000, 4, 1),
59 |                   (1000, 4, 16),
60 |                   (1000, 4, 32),
61 |                   (1000, 4, 64),
62 |                   (1000, 7, 64),
63 |                   (1000, 8, 64),
64 |                   (1000, 14, 32),
65 |                   (1000, 16, 128),
66 |                   (250, 25, 32),
67 |                   (10, 15, 2),
68 |                   (500000, 5, 12)]
69 | 
70 |     def _check_multi(self, n, k, p):
71 |         a, b = _compare_multi(n, k, p)
72 |         assert_almost_equal(a, b, DECIMAL_2)
73 | 
74 |     def _check_single(self, n, k):
75 |         a, b = _compare_single(n, k)
76 |         assert_almost_equal(a, b, DECIMAL_2)
77 | 
78 |     def test_multi(self):
79 |         for n, k, p in self.test_cases:
80 |             self._check_multi(n, k, p)
81 | 
82 |     def test_single(self):
83 |         for n, k, p in self.test_cases:
84 |             self._check_single(n, k)
85 | 
86 | class TestUnivariate(unittest.TestCase):
87 |     def test_normal(self):
88 |         test_cases = [
89 |             (100, 0, 1),
90 |             (100, .5, 2.5),
91 |             (10, 5, 3),
92 |             (2000, 1, 4)
93 |         ]
94 |         for n, mean, std in test_cases:
95 |             data = randn(n)
96 |             pyresult = sp_stats.norm.pdf(data, loc=mean, scale=std)
97 | 
98 |             result = gps.normpdf(data, mean, std, logged=True)
99 |             assert_almost_equal(result, np.log(pyresult), DECIMAL_5)
100 | 
101 |     def test_normal_multi(self):
102 |         means = np.random.randn(5)
103 |         scales = np.ones(5)
104 | 
105 |         data = np.random.randn(10)
106 |         result = gps.normpdf_multi(data, means, scales, logged=True)
107 | 
108 |         pyresult = np.empty_like(result)
109 |         for i, (m, sc) in enumerate(zip(means, scales)):
110 |             pyresult[:, i] = sp_stats.norm.pdf(data, loc=m, scale=sc)
111 |         assert_almost_equal(result, np.log(pyresult), DECIMAL_5)
112 | 
113 | if __name__ == '__main__':
114 |     # nose.runmodule(argv=['', '--pdb', '-v', '--pdb-failure'])
115 |     _compare_multi(500000, 4, 128)
116 |     pass
117 | 
--------------------------------------------------------------------------------
/old/common.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPUSTATS_COMMON__
2 | #define __GPUSTATS_COMMON__
3 | 
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <cuda_runtime.h>
7 | 
8 | /* Dimension specific definitions to ensure coalesced memory transactions */
9 | 
10 | // extern int DIM,MEAN_CHD_DIM,PACK_DIM,CHD_DIM,LOGDET_OFFSET,DATA_PADDED_DIM,NCHUNKSIZE;
11 | 
12 | /*
13 | #define DENSITIES_IN_BLOCK 16 //4 //4 for 27d data, 16 for other data
14 | #define DATA_IN_BLOCK 16 //need >= 16 to be efficient
15 | #define SAMPLE_BLOCK 32
16 | #define SAMPLE_DENSITY_BLOCK 16
17 | 
18 | #define BASE_DATAPADED_DIM 8
19 | 
20 | #define SIGMA_BLOCK_SIZE 128
21 | #define SIGMA_THREAD_SUM_SIZE 25
22 | #define MAX_GPU_COUNT 8
23 | 
24 | 
25 | #define LOGPDF
26 | */
27 | 
28 | //#define CHECK_GPU
29 | 
30 | /*
31 | // For algorithm 2
32 | 
33 | #define PAD_CSR 0 // Little (no?) performance gain on 9400M and complicates algorithm
34 | #define PAD 1 // Removes some bank conflicts (?)
35 | #define BLOCK_SIZE_COL 16 // # of data columns to process per block
36 | #define BLOCK_SIZE_ROW 32 // BLOCK_SIZE_ROW / HALFWARP = # of rows (components) to process per block
37 | #define HALFWARP_LOG2 4
38 | #define HALFWARP (1<<HALFWARP_LOG2)
--------------------------------------------------------------------------------
/gpustats/cufiles/sampleFromMeasureMedium.cu:
--------------------------------------------------------------------------------
46 |                     if (dcurrent > work[tid]) {
47 |                         work[tid] = dcurrent;
48 |                     }
49 |                 }
50 |             }
51 |         }
52 |         __syncthreads();
53 |     }
54 | }
55 | 
56 | 
57 |     //get scaled cumulative pdfs
58 |     for(int chunk = 0; chunk < iT; chunk += sample_density_block) {
59 |         if(pdfIndex + chunk + thidx < iN*iT)
60 |             measure[thidy*stride + thidx] = in_measure[pdfIndex + chunk + thidx];
61 | 
62 |         __syncthreads();
63 | 
64 |         if (tid < sample_block) {
65 |             for(int i=0; i
109 |                     > measure[tid*stride + i]){
110 |                     work[tid] = i + chunk + 1;
111 |                 }
112 |             }
113 |         }
114 |         if ( work[tid] >= iT) {work[tid] = iT-1;}
115 |     }
116 |     __syncthreads();
117 | }
118 | 
119 |     // this is now coalesced
120 |     if (result_id < iN && tid < sample_block)
121 |         out_component[result_id] = (int) work[tid];
122 | 
123 | }
124 | 
125 | 
126 | 
127 | 
--------------------------------------------------------------------------------
/examples/pymc_test.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=E1101
2 | 
3 | import pymc as pm
4 | import pymc.distributions as dist
5 | import numpy as np
6 | from numpy.linalg import inv, cholesky as chol
7 | import numpy.linalg as L
8 | import numpy.random as rand
9 | 
10 | import matplotlib.pyplot as plt
11 | 
12 | #-------------------------------------------------------------------------------
13 | # Generate MV normal mixture
14 | 
15 | gen_mean = {
16 |     0 : [0, 5],
17 |     1 : [-10, 0],
18 |     2 : [-10, 10]
19 | }
20 | 
21 | gen_sd = {
22 |     0 : [0.5, 0.5],
23 |     1 : [.5, 1],
24 |     2 : [1, .25]
25 | }
26 | 
27 | gen_corr = {
28 |     0 : 0.5,
29 |     1 : -0.5,
30 |     2 : 0
31 | }
32 | 
33 | group_weights = [0.6, 0.3, 0.1]
34 | 
35 | def generate_data(n=1e5, k=2, ncomps=3, seed=1):
36 |     rand.seed(seed)
37 |     data_concat = []
38 |     labels_concat = []
39 | 
40 |     for j in range(ncomps):
41 |         mean = gen_mean[j]
42 |         sd = gen_sd[j]
43 |         corr = gen_corr[j]
44 | 
45 |         cov = np.empty((k, k))
46 |         cov.fill(corr)
47 |         cov[np.diag_indices(k)] = 1
48 |         cov *= np.outer(sd, sd)
49 | 
50 |         num = int(n * group_weights[j])
51 |         rvs = pm.rmv_normal_cov(mean, cov, size=num)
52 | 
53 |         data_concat.append(rvs)
54 |         labels_concat.append(np.repeat(j, num))
55 | 
56 |     return (np.concatenate(labels_concat),
57 |             np.concatenate(data_concat, axis=0))
58 | 
59 | N = int(1e5) # n data points per component
60 | K = 2 # ndim
61 | ncomps = 3 # n mixture components
62 | 
63 | true_labels, data = generate_data(n=N, k=K, ncomps=ncomps)
64 | 
65 | def plot_2d_mixture(data, labels):
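    """Scatter plot of the 2-D data, one color per mixture component label."""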
66 |     plt.figure(figsize=(10, 10))
67 |     colors = 'bgr'
68 |     for j in np.unique(labels):
69 |         x, y = data[labels == j].T
70 |         plt.plot(x, y, '%s.' % colors[j], ms=2)
71 | 
72 | 
73 | def plot_thetas(sampler):
74 |     plot_2d_mixture(data, true_labels)
75 | 
76 |     def plot_theta(i):
77 |         x, y = sampler.trace('theta_%d' % i)[:].T
78 |         plt.plot(x, y, 'k.')
79 | 
80 |     for i in range(3):
81 |         plot_theta(i)
82 | 
83 | #-------------------------------------------------------------------------------
84 | # set up PyMC model
85 | 
86 | # priors, fairly vague
87 | prior_mean = data.mean(0)
88 | sigma0 = np.diag([1., 1.])
89 | prior_cov = np.cov(data.T)
90 | 
91 | # shared hyperparameter?
92 | # theta_tau = pm.Wishart('theta_tau', n=4, Tau=L.inv(sigma0))
93 | 
94 | # df = pm.DiscreteUniform('df', 3, 50)
95 | 
96 | thetas = []
97 | taus = []
98 | for j in range(ncomps):
99 |     # need a hyperparameter for degrees of freedom?
100 |     tau = pm.Wishart('C_%d' % j, n=3, Tau=inv(prior_cov))
101 |     theta = pm.MvNormal('theta_%d' % j, mu=prior_mean, tau=inv(2 * prior_cov))
102 | 
103 |     thetas.append(theta)
104 |     taus.append(tau)
105 | 
106 | alpha0 = np.ones(3.) / 3
107 | weights = pm.Dirichlet('weights', theta=alpha0)
108 | # labels = pm.Categorical('labels', p=weights, size=len(data))
109 | 
110 | from pandas.util.testing import set_trace as st
111 | import pdfs
112 | import util
113 | 
114 | def mixture_loglike(data, thetas, covs, labels):
115 | 
116 |     n = len(data)
117 |     likes = pdfs.mvnpdf(data, thetas, covs)
118 |     loglike = likes.ravel('F').take(labels * n + np.arange(n)).sum()
119 | 
120 |     if np.isnan(loglike):
121 |         return -1e300
122 | 
123 |     return loglike
124 | 
125 |     if np.isnan(likes).any():
126 |         loglike = 0.
127 |         for j, (theta, cov) in enumerate(zip(thetas, covs)):
128 |             this_data = data[labels == j]
129 |             ch = chol(cov)
130 |             loglike += pm.mv_normal_chol_like(this_data, theta, ch)
131 | 
132 |         return loglike
133 | 
134 | def mixture_loglike2(data, thetas, taus, weights):
135 | 
136 |     n = len(data)
137 | 
138 |     covs = [inv(tau) for tau in taus]
139 | 
140 |     likes = pdfs.mvnpdf(data, thetas, covs)
141 |     loglike = (likes * weights).sum()
142 | 
143 |     # loglike = likes.ravel('F').take(labels * n + np.arange(n)).sum()
144 | 
145 |     if np.isnan(loglike):
146 |         st()
147 |         return -1e300
148 | 
149 |     return loglike
150 | 
151 |     if np.isnan(likes).any():
152 |         loglike = 0.
153 | for j, (theta, cov) in enumerate(zip(thetas, covs)): 154 | this_data = data[labels == j] 155 | loglike += pm.mv_normal_chol_like(this_data, theta, chol(cov)) 156 | 157 | return loglike 158 | 159 | @pm.deterministic 160 | def adj_weights(weights=weights): 161 | return np.sort(np.r_[weights, 1 - weights.sum()]) 162 | 163 | @pm.stochastic(observed=True) 164 | def mixture(value=data, thetas=thetas, taus=taus, weights=adj_weights): 165 | return mixture_loglike2(value, thetas, taus, weights) 166 | 167 | sampler = pm.MCMC(locals()) 168 | 169 | sampler.sample(iter=3000, burn=100, tune_interval=100, thin=10) 170 | 171 | -------------------------------------------------------------------------------- /gpustats/sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gpustats.kernels as kernels 4 | import gpustats.codegen as codegen 5 | import gpustats.util as util 6 | import pycuda.driver as drv 7 | from pycuda.gpuarray import GPUArray, to_gpu 8 | from pycuda.gpuarray import empty as gpu_empty 9 | from pycuda.curandom import rand as curand 10 | 11 | # reload(kernels) 12 | # reload(codegen) 13 | 14 | cu_module = codegen.get_full_cuda_module() 15 | 16 | def sample_discrete(densities, logged=False, 17 | return_gpuarray=False): 18 | 19 | """ 20 | Takes a categorical sample from the unnormalized univariate 21 | densities defined in the rows of 'densities' 22 | 23 | Parameters 24 | ---------- 25 | densities : ndarray or gpuarray (n, k) 26 | logged: boolean indicating whether the densities are on the 27 | log scale ... 28 | 29 | Returns 30 | ------- 31 | indices : ndarray or gpuarray (if return_gpuarray=True) 32 | of length n and dtype = int32 33 | """ 34 | 35 | from gpustats.util import info 36 | 37 | n, k = densities.shape 38 | # prep data 39 | if isinstance(densities, GPUArray): 40 | if densities.flags.f_contiguous: 41 | gpu_densities = util.transpose(densities) 42 | else: 43 | gpu_densities = densities 44 | else: 45 | densities = util.prep_ndarray(densities) 46 | gpu_densities = to_gpu(densities) 47 | 48 | # get gpu function 49 | cu_func = cu_module.get_function('sample_discrete') 50 | 51 | # setup GPU data 52 | gpu_random = to_gpu(np.asarray(np.random.rand(n), dtype=np.float32)) 53 | gpu_dest = gpu_empty(n, dtype=np.int32) 54 | dims = np.array([n, k, logged], dtype=np.int32) 55 | 56 | if info.max_block_threads < 1024: 57 | x_block_dim = 16 58 | else: 59 | x_block_dim = 32 60 | 61 | y_block_dim = 16 62 | # setup GPU call 63 | block_design = (x_block_dim, y_block_dim, 1) 64 | grid_design = (int(n/y_block_dim) + 1, 1) 65 | 66 | shared_mem = 4 * ( (x_block_dim+1)*y_block_dim + 67 | 2 * y_block_dim ) 68 | 69 | cu_func(gpu_densities, gpu_random, gpu_dest, 70 | dims[0], dims[1], dims[2], 71 | block=block_design, grid=grid_design, shared=shared_mem) 72 | 73 | gpu_random.gpudata.free() 74 | if return_gpuarray: 75 | return gpu_dest 76 | else: 77 | res = gpu_dest.get() 78 | gpu_dest.gpudata.free() 79 | return res 80 | 81 | 82 | ## deprecated 83 | def sample_discrete_old(in_densities, logged=False, pad=False, 84 | return_gpuarray=False): 85 | """ 86 | Takes a categorical sample from the unnormalized univariate 87 | densities defined in the rows of 'densities' 88 | 89 | Parameters 90 | ---------- 91 | densities : ndarray or gpuarray (n, k) 92 | logged: boolean indicating whether the densities are on the 93 | log scale ...
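    pad : boolean, default False
        if True, pad the density matrix out to a multiple of 16 columns
        with util.pad_data_mult16 before sampling (fill value 1 on the
        log scale, 0 otherwise)
    return_gpuarray : boolean, default False
        if True, leave the result on the GPU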
94 | 95 | Returns 96 | ------- 97 | indices : ndarray or gpuarray (if return_gpuarray=True) 98 | of length n and dtype = int32 99 | """ 100 | 101 | if pad: 102 | if logged: 103 | densities = util.pad_data_mult16(in_densities, fill=1) 104 | else: 105 | densities = util.pad_data_mult16(in_densities, fill=0) 106 | 107 | else: 108 | densities = in_densities 109 | 110 | n, k = densities.shape 111 | 112 | if logged: 113 | cu_func = cu_module.get_function('sample_discrete_logged_old') 114 | else: 115 | cu_func = cu_module.get_function('sample_discrete_old') 116 | 117 | if isinstance(densities, GPUArray): 118 | if densities.flags.f_contiguous: 119 | gpu_densities = util.transpose(densities) 120 | else: 121 | gpu_densities = densities 122 | else: 123 | densities = util.prep_ndarray(densities) 124 | gpu_densities = to_gpu(densities) 125 | 126 | # setup GPU data 127 | #gpu_random = curand(n) 128 | gpu_random = to_gpu(np.asarray(np.random.rand(n), dtype=np.float32)) 129 | #gpu_dest = to_gpu(np.zeros(n, dtype=np.float32)) 130 | gpu_dest = gpu_empty(n, dtype=np.float32) 131 | stride = gpu_densities.shape[1] 132 | if stride % 2 == 0: # use an odd stride to sidestep shared-memory bank conflicts 133 | stride += 1 134 | dims = np.array([n, k, gpu_densities.shape[1], stride], dtype=np.int32) 135 | 136 | 137 | # optimize design ... 138 | grid_design, block_design = _tune_sfm(n, stride, cu_func.num_regs) 139 | 140 | shared_mem = 4 * (block_design[0] * stride + 141 | 1 * block_design[0]) 142 | 143 | cu_func(gpu_densities, gpu_random, gpu_dest, 144 | dims[0], dims[1], dims[2], dims[3], 145 | block=block_design, grid=grid_design, shared=shared_mem) 146 | 147 | gpu_random.gpudata.free() 148 | if return_gpuarray: 149 | return gpu_dest 150 | else: 151 | res = gpu_dest.get() 152 | gpu_dest.gpudata.free() 153 | return res 154 | 155 | def _tune_sfm(n, stride, func_regs): 156 | """ 157 | Outputs the 'optimal' block and grid configuration 158 | for the sample discrete kernel.
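    Parameters
    ----------
    n : int
        number of samples to draw (one per row of densities)
    stride : int
        padded row stride, in floats, of the density matrix in shared memory
    func_regs : int
        number of registers per thread used by the compiled kernel

    Returns
    -------
    grid_design, block_design : (nblocks, 1) and (xdim, ydim, 1) tuples
        suitable for the grid/block arguments of the kernel call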
159 | """ 160 | from gpustats.util import info 161 | 162 | #info = DeviceInfo() 163 | comp_cap = info.compute_cap 164 | max_smem = info.shared_mem * 0.8 165 | max_threads = int(info.max_block_threads * 0.5) 166 | max_regs = 0.9 * info.max_registers 167 | 168 | # We want smallest dim possible in x dimsension while 169 | # still reading mem correctly 170 | 171 | if comp_cap[0] == 1: 172 | xdim = 16 173 | else: 174 | xdim = 32 175 | 176 | 177 | def sfm_config_ok(xdim, ydim, stride, func_regs, max_regs, max_smem, max_threads): 178 | ok = 4*(xdim*stride + 1*xdim) < max_smem and func_regs*ydim*xdim < max_regs 179 | return ok and xdim*ydim <= max_threads 180 | 181 | ydim = 2 182 | while sfm_config_ok(xdim, ydim, stride, func_regs, max_regs, max_smem, max_threads): 183 | ydim += 1 184 | 185 | ydim -= 1 186 | 187 | nblocks = int(n/xdim) + 1 188 | 189 | return (nblocks,1), (xdim,ydim,1) 190 | 191 | if __name__ == '__main__': 192 | 193 | n = 100 194 | k = 5 195 | dens = np.log(np.abs(np.random.randn(k))) - 200 196 | densities = [dens.copy() for _ in range(n)] 197 | dens = np.exp(dens + 200) 198 | densities = np.asarray(densities) 199 | 200 | labels = sample_discrete(densities, logged=True) 201 | mu = np.dot(dens / dens.sum(), np.arange(k)) 202 | print mu, labels.mean() 203 | -------------------------------------------------------------------------------- /scripts/bench.py: -------------------------------------------------------------------------------- 1 | from pandas import * 2 | 3 | import numpy as np 4 | 5 | from pycuda.gpuarray import to_gpu 6 | import gpustats 7 | import gpustats.util as util 8 | from scipy.stats import norm 9 | import timeit 10 | 11 | data = np.random.randn(1000000) 12 | mean = 20 13 | std = 5 14 | 15 | univ_setup = """ 16 | import numpy as np 17 | from pycuda.gpuarray import to_gpu 18 | k = 8 19 | means = np.random.randn(k) 20 | stds = np.abs(np.random.randn(k)) 21 | 22 | mean = 20 23 | std = 5 24 | import gpustats 25 | from scipy.stats import norm 26 | cpu_data = np.random.randn(%d) 27 | gpu_data = cpu_data 28 | """ 29 | 30 | univ_setup_gpuarray = univ_setup + """ 31 | gpu_data = to_gpu(cpu_data) 32 | """ 33 | 34 | multivar_setup = """ 35 | # from __main__ import data, mean, std 36 | import gpustats 37 | import gpustats.util as util 38 | import numpy as np 39 | import testmod 40 | from pycuda.gpuarray import to_gpu 41 | import testmod 42 | from numpy.linalg import cholesky as chol 43 | import numpy.linalg as L 44 | 45 | 46 | def next_multiple(k, p): 47 | if k.__mod__(p): 48 | return k + (p - k.__mod__(p)) 49 | 50 | return k 51 | 52 | PAD_MULTIPLE = 16 53 | HALF_WARP = 16 54 | 55 | 56 | def pad_data(data): 57 | n, k = data.shape 58 | 59 | if not k.__mod__(HALF_WARP): 60 | pad_dim = k + 1 61 | else: 62 | pad_dim = k 63 | 64 | if k != pad_dim: 65 | padded_data = np.empty((n, pad_dim), dtype=np.float32) 66 | padded_data[:, :k] = data 67 | 68 | return padded_data 69 | else: 70 | return prep_ndarray(data) 71 | 72 | def prep_ndarray(arr): 73 | # is float32 and contiguous? 
74 | if not arr.dtype == np.float32 or not arr.flags.contiguous: 75 | arr = np.array(arr, dtype=np.float32) 76 | 77 | return arr 78 | 79 | def pack_params(means, chol_sigmas, logdets): 80 | to_pack = [] 81 | for m, ch, ld in zip(means, chol_sigmas, logdets): 82 | to_pack.append(pack_pdf_params(m, ch, ld)) 83 | 84 | return np.vstack(to_pack) 85 | 86 | def pack_pdf_params(mean, chol_sigma, logdet): 87 | k = len(mean) 88 | mean_len = k 89 | chol_len = k * (k + 1) / 2 90 | mch_len = mean_len + chol_len 91 | 92 | packed_dim = next_multiple(mch_len + 2, PAD_MULTIPLE) 93 | 94 | packed_params = np.empty(packed_dim, dtype=np.float32) 95 | packed_params[:mean_len] = mean 96 | 97 | packed_params[mean_len:mch_len] = chol_sigma[np.tril_indices(k)] 98 | packed_params[mch_len:mch_len + 2] = 1, logdet 99 | 100 | return packed_params 101 | 102 | k = %d 103 | 104 | dim = 15 105 | means = np.random.randn(k, dim) 106 | covs = [util.random_cov(dim) for _ in xrange(k)] 107 | 108 | cpu_data = np.random.randn(%d, dim) 109 | gpu_data = cpu_data 110 | """ 111 | 112 | multivar_setup_gpuarray = multivar_setup + """ 113 | gpu_data = to_gpu(cpu_data) 114 | """ 115 | 116 | LOG_2_PI = np.log(2 * np.pi) 117 | 118 | # def mvnpdf(data, mean, cov): 119 | # ichol_sigma = np.asarray(np.linalg.inv(np.linalg.cholesky(cov))) 120 | # # ichol_sigma = np.tril(ichol_sigma) 121 | # logdet = np.log(np.linalg.det(cov)) 122 | # return [_mvnpdf(x, mean, ichol_sigma, logdet) 123 | # for x in data] 124 | 125 | # def _mvnpdf(x, mean, ichol_sigma, logdet): 126 | # demeaned = x - mean 127 | # discrim = ((ichol_sigma * demeaned) ** 2).sum() 128 | # # discrim = np.dot(demeaned, np.dot(ichol_sigma, demeaned)) 129 | # return - 0.5 * (discrim + logdet + LOG_2_PI * dim) 130 | 131 | def get_timeit(stmt, setup, iter=10): 132 | return timeit.Timer(stmt, setup).timeit(number=iter) / iter 133 | 134 | def compare_timings_single(n, setup=univ_setup): 135 | gpu = "gpustats.normpdf(gpu_data, mean, std, logged=False)" 136 | cpu = "norm.pdf(cpu_data, loc=mean, scale=std)" 137 | setup = setup % n 138 | return {'GPU' : get_timeit(gpu, setup, iter=1000), 139 | 'CPU' : get_timeit(cpu, setup)} 140 | 141 | def compare_timings_multi(n, setup=univ_setup): 142 | gpu = "gpustats.normpdf_multi(gpu_data, means, stds, logged=False)" 143 | cpu = """ 144 | for m, s in zip(means, stds): 145 | norm.pdf(cpu_data, loc=m, scale=s) 146 | """ 147 | setup = setup % n 148 | return {'GPU' : get_timeit(gpu, setup, iter=100), 149 | 'CPU' : get_timeit(cpu, setup)} 150 | 151 | 152 | def mvcompare_timings(n, k=1, setup=multivar_setup): 153 | gpu = "gpustats.mvnpdf_multi(gpu_data, means, covs, logged=False)" 154 | cpu = """ 155 | ichol_sigmas = [L.inv(chol(sig)) for sig in covs] 156 | logdets = [np.log(np.linalg.det(sig)) for sig in covs] 157 | params = pack_params(means, covs, logdets) 158 | testmod.cpu_mvnpdf(cpu_data, params, dim) 159 | """ 160 | setup = setup % (k, n) 161 | return {'GPU' : get_timeit(gpu, setup, iter=100), 162 | 'CPU' : get_timeit(cpu, setup)} 163 | 164 | def get_timing_results(timing_f): 165 | lengths = [100, 1000, 10000, 100000, 1000000] 166 | 167 | result = {} 168 | for n in lengths: 169 | print n 170 | result[n] = timing_f(n) 171 | result = DataFrame(result).T 172 | result['Speedup'] = result['CPU'] / result['GPU'] 173 | return result 174 | 175 | # mvsingle = get_timing_results(mvcompare_timings) 176 | # comp_gpu = lambda n: mvcompare_timings(n, setup=multivar_setup_gpuarray) 177 | # mvsingle_gpu = get_timing_results(comp_gpu) 178 | # multi_comp = lambda n: 
mvcompare_timings(n, k=16) 179 | # mvmulti = get_timing_results(multi_comp) 180 | # multi_comp_gpu = lambda n: mvcompare_timings(n, k=16, 181 | # setup=multivar_setup_gpuarray) 182 | # mvmulti_gpu = get_timing_results(multi_comp_gpu) 183 | 184 | single = get_timing_results(compare_timings_single) 185 | comp_gpu = lambda n: compare_timings_single(n, setup=univ_setup_gpuarray) 186 | single_gpu = get_timing_results(comp_gpu) 187 | multi = get_timing_results(compare_timings_multi) 188 | comp_gpu = lambda n: compare_timings_multi(n, setup=univ_setup_gpuarray) 189 | multi_gpu = get_timing_results(comp_gpu) 190 | 191 | data = DataFrame({ 192 | 'Single' : single['Speedup'], 193 | 'Single (GPUArray)' : single_gpu['Speedup'], 194 | 'Multi' : multi['Speedup'], 195 | 'Multi (GPUArray)' : multi_gpu['Speedup'], 196 | }) 197 | 198 | 199 | mvdata = DataFrame({ 200 | 'Single' : mvsingle['Speedup'], 201 | 'Single (GPUArray)' : mvsingle_gpu['Speedup'], 202 | 'Multi' : mvmulti['Speedup'], 203 | 'Multi (GPUArray)' : mvmulti_gpu['Speedup'], 204 | }) 205 | 206 | if __name__ == '__main__': 207 | import gpustats 208 | import numpy as np 209 | from scipy.stats import norm 210 | import testmod 211 | from numpy.linalg import cholesky as chol 212 | import numpy.linalg as L 213 | 214 | # dim = 15 215 | # k = 8 216 | # means = np.random.randn(k, dim) 217 | # covs = [np.asarray(util.random_cov(dim)) for _ in xrange(k)] 218 | 219 | # cpu_data = np.random.randn(100000, dim) 220 | # gpu_data = to_gpu(cpu_data) 221 | 222 | # ichol_sigmas = [L.inv(chol(sig)) for sig in covs] 223 | # logdets = [np.log(np.linalg.det(sig)) for sig in covs] 224 | # packed_params = pack_params(means, covs, logdets) 225 | 226 | # pdfs = gpustats.mvnpdf(cpu_data, means[0], covs[0]) 227 | # pdfs = testmod.cpu_mvnpdf(cpu_data, packed_params, 15) 228 | 229 | -------------------------------------------------------------------------------- /gpustats/codegen.py: -------------------------------------------------------------------------------- 1 | import pycuda.driver as drv 2 | import pycuda.tools 3 | #import pycuda.autoinit 4 | drv.init() 5 | if drv.Context.get_current() is None: 6 | import pycuda.autoinit 7 | 8 | import numpy 9 | import numpy.linalg as la 10 | import os 11 | from pycuda.compiler import SourceModule 12 | from gpustats.util import get_cufiles_path 13 | 14 | class CUDAModule(object): 15 | """ 16 | Interfaces with PyCUDA 17 | 18 | Parameters 19 | ---------- 20 | kernel_dict : 21 | """ 22 | def __init__(self, kernel_dict): 23 | self.kernel_dict = kernel_dict 24 | self.support_code = _get_support_code() 25 | 26 | self.all_code = self._get_full_source() 27 | try: 28 | #self.pycuda_module = SourceModule(self.all_code) 29 | # dictionary mapping contexts to their respective loaded code modules 30 | self.pycuda_modules = { drv.Context.get_current() : SourceModule(self.all_code) } 31 | except Exception: 32 | f = open('foo.cu', 'w') 33 | print >> f, self.all_code 34 | f.close() 35 | raise 36 | #self.curDevice = drv.Context.get_device() 37 | 38 | def _get_full_source(self): 39 | formatted_kernels = [kern.get_code() 40 | for kern in self.kernel_dict.values()] 41 | return '\n'.join([self.support_code] + formatted_kernels) 42 | 43 | def get_function(self, name): 44 | # get the module for this context 45 | context = drv.Context.get_current() 46 | try: 47 | mod = self.pycuda_modules[context] 48 | except KeyError: 49 | # if it's a new context, init the module 50 | self.pycuda_modules[context] = SourceModule(self.all_code) 51 | mod = 
self.pycuda_modules[context] 52 | return mod.get_function('k_%s' % name) 53 | #curDevice = drv.Context.get_device() 54 | #if self.curDevice != curDevice: 55 | # self.pycuda_module = SourceModule(self.all_code) 56 | # self.curDevice = curDevice 57 | #return self.pycuda_module.get_function('k_%s' % name) 58 | 59 | def _get_support_code(): 60 | path = os.path.join(get_cufiles_path(), 'support.cu') 61 | return open(path).read() 62 | 63 | def _get_mvcaller_code(): 64 | # for multivariate pdfs 65 | path = os.path.join(get_cufiles_path(), 'mvcaller.cu') 66 | return open(path).read() 67 | 68 | def _get_univcaller_code(): 69 | # For univariate pdfs 70 | path = os.path.join(get_cufiles_path(), 'univcaller.cu') 71 | return open(path).read() 72 | 73 | class Kernel(object): 74 | 75 | def __init__(self, name): 76 | if name is None: 77 | raise ValueError('Kernel must have a default name') 78 | 79 | self.name = name 80 | 81 | def get_code(self): 82 | logic = self.get_logic() 83 | caller = self.get_caller() 84 | return '\n'.join((logic, caller)) 85 | 86 | def get_logic(self, **kwds): 87 | raise NotImplementedError 88 | 89 | def get_caller(self, **kwds): 90 | raise NotImplementedError 91 | 92 | def get_name(self, name=None): 93 | # can override default name, for transforms. this a hack? 94 | if name is None: 95 | name = self.name 96 | 97 | return name 98 | 99 | class CUFile(Kernel): 100 | """ 101 | Expose kernel contained in .cu file in the cufiles directory to code 102 | generation framework. Kernel need only have a template to be able to change 103 | the name of the generated kernel 104 | """ 105 | def __init__(self, name, filepath): 106 | self.full_path = os.path.join(get_cufiles_path(), 107 | filepath) 108 | 109 | Kernel.__init__(self, name) 110 | 111 | def get_code(self): 112 | code = open(self.full_path).read() 113 | return code % {'name' : self.name} 114 | 115 | class SamplerKernel(Kernel): 116 | """ 117 | Holds info for measure sample kernel. 118 | """ 119 | def __init__(self, name, logic_code): 120 | self.logic_code = logic_code 121 | Kernel.__init__(self, name) 122 | 123 | def get_logic(self, name=None): 124 | return self.logic_code 125 | 126 | def get_caller(self, name=None): 127 | return self._caller % {'name' : self.get_name(name)} 128 | 129 | class DensityKernel(Kernel): 130 | """ 131 | Generate kernel for probability density function 132 | """ 133 | 134 | _caller = _get_univcaller_code() 135 | def __init__(self, name, logic_code): 136 | 137 | self.logic_code = logic_code 138 | 139 | Kernel.__init__(self, name) 140 | 141 | def get_logic(self, name=None): 142 | return self.logic_code % {'name' : self.get_name(name)} 143 | 144 | def get_caller(self, name=None): 145 | return self._caller % {'name' : self.get_name(name)} 146 | 147 | class MVDensityKernel(DensityKernel): 148 | """ 149 | 150 | """ 151 | _caller = _get_mvcaller_code() 152 | 153 | class Transform(Kernel): 154 | """ 155 | Enable simple transforms of kernels to compute modified kernel code stub 156 | """ 157 | def __init__(self, name, kernel): 158 | self.kernel = kernel 159 | Kernel.__init__(self, name) 160 | 161 | # XXX: HACK, not general for non-density kernels 162 | def is_multivariate(self): 163 | return isinstance(self.kernel, MVDensityKernel) 164 | 165 | # flop the right name? 
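# To make the transforms below concrete: a client module (gpustats.kernels in
# this package) typically pairs a logged density with its exponentiated twin.
# A sketch only -- the kernel body and the 'foo' names are illustrative, not
# kernels shipped with the package:
#
#   logic = """
#   __device__ float %(name)s(float* x, float* params) {
#       return -0.5f * (x[0] - params[0]) * (x[0] - params[0]);
#   }
#   """
#   log_pdf_foo = DensityKernel('log_pdf_foo', logic)
#   pdf_foo = Exp('pdf_foo', log_pdf_foo)   # wraps the stub in expf(...)
#
# get_full_cuda_module() below then compiles every Kernel instance found in
# gpustats.kernels, and get_function('pdf_foo') resolves 'k_pdf_foo' from the
# module cached for the current CUDA context.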
166 | 167 | class Flop(Transform): 168 | op = None 169 | 170 | def get_logic(self, name=None): 171 | name = self.get_name(name) 172 | 173 | actual_name = '%s_stub' % name 174 | kernel_logic = self.kernel.get_logic(name=actual_name) 175 | 176 | if self.is_multivariate(): 177 | stub_caller = _mv_stub_caller 178 | else: 179 | stub_caller = _univ_stub_caller 180 | 181 | transform_logic = stub_caller % {'name' : name, 182 | 'actual_kernel' : actual_name, 183 | 'op' : self.op} 184 | 185 | return '\n'.join((kernel_logic, transform_logic)) 186 | 187 | def get_caller(self): 188 | return self.kernel.get_caller(self.name) 189 | 190 | _univ_stub_caller = """ 191 | __device__ float %(name)s(float* x, float* params) { 192 | return %(op)s(%(actual_kernel)s(x, params)); 193 | } 194 | """ 195 | 196 | _mv_stub_caller = """ 197 | __device__ float %(name)s(float* x, float* params, int dim) { 198 | return %(op)s(%(actual_kernel)s(x, params, dim)); 199 | } 200 | """ 201 | 202 | class Exp(Flop): 203 | op = 'expf' 204 | 205 | class Log(Flop): 206 | op = 'logf' 207 | 208 | class Sqrt(Flop): 209 | op = 'sqrtf' 210 | 211 | _cu_module = None 212 | 213 | def get_full_cuda_module(): 214 | import gpustats.kernels as kernels 215 | global _cu_module 216 | 217 | if _cu_module is None: 218 | objects = kernels.__dict__ 219 | 220 | all_kernels = dict((k, v) 221 | for k, v in kernels.__dict__.iteritems() 222 | if isinstance(v, Kernel)) 223 | _cu_module = CUDAModule(all_kernels) 224 | 225 | return _cu_module 226 | 227 | if __name__ == '__main__': 228 | pass 229 | -------------------------------------------------------------------------------- /old/mvnpdf.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Multivariate normal pdf implementation 3 | */ 4 | 5 | #ifndef _INCLUDED_MVNPDF 6 | #define _INCLUDED_MVNPDF 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | #include "mvnpdf.h" 13 | #include "cucommon.h" 14 | 15 | int compute_shmem(PMatrix* data, PMatrix* params, int nparams, int ndata) { 16 | // to hold specified about of data, parameters, and results 17 | int result_space = nparams * ndata; 18 | int param_space = params->stride * nparams; 19 | int data_space = data->stride * ndata; 20 | 21 | return sizeof(float) * (result_space + param_space + data_space); 22 | } 23 | 24 | // Compute "optimal" block size given number of data points / parameters 25 | void get_tuned_layout(BlockDesign* info, PMatrix* data, PMatrix* params, 26 | int max_block_params) { 27 | // query the device for smem / max # of threads 28 | int max_smem = smem_size() / 10 * 9; 29 | int max_threads = max_block_threads(); 30 | 31 | // at most max_block_params sets of density parameters per block 32 | // for low-dimensional data, better to do more? 
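    /* Worked example of the compute_shmem budget above, with illustrative
       sizes: for a data stride of 16 floats, a parameter stride of 144
       floats, and a block covering 16 data points x 4 parameter sets,
       sizeof(float) * (4 * 16 + 144 * 4 + 16 * 16) = 4 * 896 = 3584 bytes,
       comfortably inside a 16 KB (or 48 KB) shared-memory budget. */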
33 | int params_per = max_block_params; 34 | if (params->rows < max_block_params) 35 | params_per = next_pow2(params->rows, max_block_params); 36 | 37 | int data_per = max_threads / params_per; 38 | // at least 16 data points per block 39 | while (data_per < 16 & params_per > 1) { 40 | params_per /= 2; 41 | data_per *= 2; 42 | } 43 | 44 | while (1) { 45 | while (compute_shmem(data, params, params_per, data_per) > max_smem) { 46 | if (data_per <= 1) 47 | break; 48 | if (params_per > 1) 49 | params_per /= 2; 50 | else 51 | data_per /= 2; 52 | } 53 | // can't fit max_block_params sets of parameters into the shared memory 54 | if (data_per == 0) { 55 | data_per = 1; 56 | params_per /= 2; 57 | // start over the tuning 58 | continue; 59 | } 60 | else break; 61 | } 62 | 63 | // possible to squeeze more data? 64 | while (compute_shmem(data, params, params_per, 2 * data_per) <= max_smem) 65 | if (2 * data_per * params_per <= max_threads) 66 | data_per *= 2; 67 | else 68 | break; 69 | 70 | info->data_per_block = data_per; 71 | info->params_per_block = params_per; 72 | } 73 | 74 | __device__ int d_next_multiple(int k, int mult) { 75 | if (k % mult) 76 | return k + (mult - k % mult); 77 | else 78 | return k; 79 | } 80 | 81 | int next_multiple(int k, int mult) { 82 | if (k % mult) 83 | return k + (mult - k % mult); 84 | else 85 | return k; 86 | } 87 | 88 | __device__ float compute_pdf(float* data, float* params, int dim) { 89 | float* mean = params; 90 | float* sigma = params + dim; 91 | float mult = params[dim * (dim + 3) / 2]; 92 | float logdet = params[dim * (dim + 3) / 2 + 1]; 93 | 94 | float discrim = 0; 95 | float sum; 96 | unsigned int i, j; 97 | for (i = 0; i < dim; ++i) 98 | { 99 | sum = 0; 100 | for(j = 0; j <= i; ++j) { 101 | sum += *sigma++ * (data[j] - mean[j]); 102 | } 103 | discrim += sum * sum; 104 | } 105 | return log(mult) - 0.5 * (discrim + logdet + LOG_2_PI * dim); 106 | } 107 | 108 | __device__ void copy_chunks(float* in_buf, float* out_buf, 109 | unsigned int tid, unsigned int total) { 110 | for (unsigned int chunk = 0; chunk + tid < total; chunk += blockDim.x) { 111 | out_buf[chunk + tid] = in_buf[chunk + tid]; 112 | } 113 | } 114 | 115 | __global__ void mvnpdf_k(const PMatrix data, const PMatrix params, 116 | const BlockDesign design, float* output) { 117 | 118 | unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; 119 | 120 | unsigned int rel_param = tid / design.data_per_block; 121 | unsigned int rel_data = tid - rel_param * design.data_per_block; 122 | 123 | unsigned int obs_num = design.data_per_block * blockIdx.x + rel_data; 124 | unsigned int param_num = design.params_per_block * blockIdx.y + rel_param; 125 | 126 | // set up shared data 127 | extern __shared__ float shared_data[]; 128 | float* sh_params = shared_data; 129 | float* sh_data = sh_params + design.params_per_block * params.stride; 130 | float* sh_result = sh_data + design.data_per_block * data.stride; 131 | 132 | copy_chunks(data.buf + design.data_per_block * blockIdx.x * data.stride, 133 | sh_data, tid, 134 | min(data.rows - design.data_per_block * blockIdx.x, 135 | design.data_per_block) * data.stride); 136 | 137 | copy_chunks(params.buf + design.params_per_block * blockIdx.y * params.stride, 138 | sh_params, tid, 139 | min(design.params_per_block, 140 | params.rows - design.params_per_block * blockIdx.y) * params.stride); 141 | 142 | __syncthreads(); 143 | 144 | // allocated enough shared memory so that this will not walk out of bounds 145 | // no matter what, though some of the results will be garbage 
146 | sh_result[tid] = compute_pdf(sh_data + rel_data * data.stride, 147 | sh_params + rel_param * params.stride, 148 | data.cols); 149 | __syncthreads(); 150 | 151 | unsigned int result_idx = data.rows * param_num + obs_num; 152 | 153 | // output is column-major, so this will then coalesce 154 | if (obs_num < data.rows & param_num < params.rows) { 155 | output[result_idx] = sh_result[tid]; 156 | } 157 | } 158 | 159 | // XXX: fix this 160 | int MAX_BLOCK_PARAMS = 64; 161 | 162 | cudaError_t invoke_mvnpdf(PMatrix data, PMatrix params, float* d_pdf) { 163 | // Need to automatically tune block / grid layout to maximize shared memory 164 | // usage and coalescence, reduce wasted threads! 165 | BlockDesign design; 166 | get_tuned_layout(&design, &data, ¶ms, MAX_BLOCK_PARAMS); 167 | 168 | int nthreads = design.data_per_block * design.params_per_block; 169 | 170 | // Now set up grid layout / block size 171 | int grid_x = get_boxes(data.rows, design.data_per_block); 172 | int grid_y = get_boxes(params.rows, design.params_per_block); 173 | dim3 gridPDF(grid_x, grid_y); 174 | dim3 blockPDF(nthreads, 1); 175 | 176 | int sharedMemSize = compute_shmem(&data, ¶ms, 177 | design.params_per_block, 178 | design.data_per_block); 179 | 180 | #ifdef DEBUG 181 | printf("number params: %d, number data points: %d\n", 182 | design.params_per_block, design.data_per_block); 183 | printf("sharedMemSize: %d\n", sharedMemSize); 184 | printf("block: %d x %d, grid: %d x %d\n", blockPDF.x, blockPDF.y, 185 | gridPDF.x, gridPDF.y); 186 | printf("design: %d x %d\n", design.data_per_block, design.params_per_block); 187 | 188 | printf("nparams: %d\n", params.rows); 189 | #endif 190 | 191 | mvnpdf_k<<>>(data, params, design, d_pdf); 192 | return cudaSuccess; 193 | } 194 | 195 | void mvnpdf(float* h_data, /** Data-vector; padded */ 196 | float* h_params, /** Density info; already padded */ 197 | float* h_pdf, /** Resultant PDF */ 198 | int data_dim, 199 | int total_obs, 200 | int nparams, // multiple sets of parameters 201 | int param_stride, // with padding 202 | int data_stride // with padding 203 | ) { 204 | 205 | float* d_data; 206 | float* d_params; 207 | float* d_pdf; 208 | cudaError_t error; 209 | 210 | PMatrix pdata, pparams; 211 | CATCH_ERR(cudaMalloc((void**) &d_pdf, total_obs * nparams * sizeof(float))); 212 | CATCH_ERR(cudaMalloc((void**) &d_data, 213 | data_stride * total_obs * sizeof(float))); 214 | CATCH_ERR(cudaMalloc((void**) &d_params, 215 | param_stride * nparams * sizeof(float))); 216 | 217 | h_to_d(h_data, d_data, total_obs * data_stride); 218 | h_to_d(h_params, d_params, nparams * param_stride); 219 | 220 | PMatrix_init(&pdata, d_data, total_obs, data_dim, data_stride); 221 | PMatrix_init(&pparams, d_params, nparams, 222 | data_dim * (data_dim + 3) / 2 + 2, param_stride); 223 | 224 | invoke_mvnpdf(pdata, pparams, d_pdf); 225 | d_to_h(d_pdf, h_pdf, total_obs * nparams); 226 | 227 | cudaFree(d_data); 228 | cudaFree(d_params); 229 | cudaFree(d_pdf); 230 | } 231 | 232 | void cpu_mvnpdf(float* x, float* density, float * output, int dim, 233 | int padded_dim, int N, int T) { 234 | int LOGDET_OFFSET = dim * (dim + 3) / 2; 235 | int MEAN_CHD_DIM = dim * (dim + 3) / 2 + 2; 236 | 237 | int PACK_DIM = next_multiple(MEAN_CHD_DIM, 16); 238 | 239 | float* xx = (float*) malloc(dim * sizeof(float)); 240 | int obs, component; 241 | 242 | for (obs = 0; obs < N; obs++) { 243 | for (component = 0; component < T; component++) { 244 | float discrim; 245 | float* tData = x + obs * padded_dim; 246 | float* tDensityInfo = density + 
component * PACK_DIM; 247 | float* tMean = tDensityInfo; 248 | float* tSigma = tDensityInfo + dim; 249 | float tP = tDensityInfo[LOGDET_OFFSET]; 250 | float tLogDet = tDensityInfo[LOGDET_OFFSET+1]; 251 | 252 | // Do density calculation 253 | discrim = 0; 254 | for(int i=0; i < dim; i++) { 255 | float sum = 0; 256 | for(int j=0; j <= i; j++) { 257 | sum += *tSigma * (tData[j] - tMean[j]); // xx[j] is always calculated since j <= i 258 | tSigma++; 259 | } 260 | 261 | discrim += sum * sum; 262 | } 263 | output[obs * T + component] = log(tP) - 0.5 * (discrim + tLogDet + (LOG_2_PI*(float) dim)); 264 | } 265 | } 266 | free(xx); 267 | } 268 | 269 | 270 | #ifdef __cplusplus 271 | } 272 | #endif 273 | 274 | #endif // _INCLUDED_MVNPDF 275 | -------------------------------------------------------------------------------- /gpustats/pdfs.py: -------------------------------------------------------------------------------- 1 | from numpy.random import randn 2 | from numpy.linalg import cholesky as chol 3 | import numpy as np 4 | import numpy.linalg as LA 5 | 6 | from pycuda.gpuarray import GPUArray, to_gpu 7 | from pycuda.gpuarray import empty as gpu_empty 8 | import gpustats.kernels as kernels 9 | import gpustats.codegen as codegen 10 | from gpustats.util import transpose as gpu_transpose 11 | reload(codegen) 12 | reload(kernels) 13 | import gpustats.util as util 14 | import pycuda.driver as drv 15 | 16 | __all__ = ['mvnpdf', 'mvnpdf_multi', 'normpdf', 'normpdf_multi'] 17 | 18 | cu_module = codegen.get_full_cuda_module() 19 | 20 | #------------------------------------------------------------------------------- 21 | # Invokers for univariate and multivariate density functions conforming to the 22 | # standard API 23 | 24 | def _multivariate_pdf_call(cu_func, data, packed_params, get, order, 25 | datadim=None): 26 | packed_params = util.prep_ndarray(packed_params) 27 | func_regs = cu_func.num_regs 28 | 29 | # Prep the data. Skip if gpudata ... 30 | if isinstance(data, GPUArray): 31 | padded_data = data 32 | if datadim==None: 33 | ndata, dim = data.shape 34 | else: 35 | ndata, dim = data.shape[0], datadim 36 | 37 | else: 38 | 39 | ndata, dim = data.shape 40 | padded_data = util.pad_data(data) 41 | 42 | nparams = len(packed_params) 43 | data_per, params_per = util.tune_blocksize(padded_data, 44 | packed_params, 45 | func_regs) 46 | 47 | blocksize = data_per * params_per 48 | #print 'the blocksize is ' + str(blocksize) 49 | #print 'data_per ' + str(data_per) + '. 
params_per ' + str(params_per) 50 | shared_mem = util.compute_shmem(padded_data, packed_params, 51 | data_per, params_per) 52 | block_design = (data_per * params_per, 1, 1) 53 | grid_design = (util.get_boxes(ndata, data_per), 54 | util.get_boxes(nparams, params_per)) 55 | 56 | # see cufiles/mvcaller.cu 57 | design = np.array(((data_per, params_per) + # block design 58 | padded_data.shape + # data spec 59 | (dim,) + # non-padded number of data columns 60 | packed_params.shape), # params spec 61 | dtype=np.int32) 62 | 63 | if nparams == 1: 64 | gpu_dest = gpu_empty(ndata, dtype=np.float32) 65 | #gpu_dest = to_gpu(np.zeros(ndata, dtype=np.float32)) 66 | else: 67 | gpu_dest = gpu_empty((ndata, nparams), dtype=np.float32, order='F') 68 | #gpu_dest = to_gpu(np.zeros((ndata, nparams), dtype=np.float32, order='F')) 69 | 70 | # Upload data if not already uploaded 71 | if not isinstance(padded_data, GPUArray): 72 | gpu_padded_data = to_gpu(padded_data) 73 | else: 74 | gpu_padded_data = padded_data 75 | 76 | gpu_packed_params = to_gpu(packed_params) 77 | 78 | params = (gpu_dest, gpu_padded_data, gpu_packed_params) + tuple(design) 79 | kwds = dict(block=block_design, grid=grid_design, shared=shared_mem) 80 | cu_func(*params, **kwds) 81 | 82 | gpu_packed_params.gpudata.free() 83 | if get: 84 | if order=='F': 85 | return gpu_dest.get() 86 | else: 87 | return np.asarray(gpu_dest.get(), dtype=np.float32, order='C') 88 | #output = gpu_dest.get() 89 | #if nparams > 1: 90 | # output = output.reshape((nparams, ndata), order='C').T 91 | #return output 92 | else: 93 | if order=='F' or nparams==1: 94 | return gpu_dest 95 | else: 96 | res = gpu_transpose(util.GPUarray_reshape(gpu_dest, (nparams, ndata), "C")) 97 | gpu_dest.gpudata.free() 98 | return res 99 | #return gpu_transpose(gpu_dest.reshape(nparams, ndata, 'C')) 100 | 101 | def _univariate_pdf_call(cu_func, data, packed_params, get): 102 | ndata = len(data) 103 | nparams = len(packed_params) 104 | 105 | func_regs = cu_func.num_regs 106 | 107 | packed_params = util.prep_ndarray(packed_params) 108 | 109 | data_per, params_per = util.tune_blocksize(data, 110 | packed_params, 111 | func_regs) 112 | 113 | shared_mem = util.compute_shmem(data, packed_params, 114 | data_per, params_per) 115 | 116 | block_design = (data_per * params_per, 1, 1) 117 | grid_design = (util.get_boxes(ndata, data_per), 118 | util.get_boxes(nparams, params_per)) 119 | 120 | # see cufiles/univcaller.cu 121 | 122 | #gpu_dest = to_gpu(np.zeros((ndata, nparams), dtype=np.float32)) 123 | gpu_dest = gpu_empty((ndata, nparams), dtype=np.float32) 124 | gpu_data = data if isinstance(data, GPUArray) else to_gpu(data) 125 | gpu_packed_params = to_gpu(packed_params) 126 | 127 | design = np.array(((data_per, params_per) + # block design 128 | (len(data),) + 129 | packed_params.shape), # params spec 130 | dtype=np.int32) 131 | 132 | cu_func(gpu_dest, 133 | gpu_data, gpu_packed_params, design[0], 134 | design[1], design[2], design[3], design[4], 135 | block=block_design, grid=grid_design, shared=shared_mem) 136 | 137 | if get: 138 | output = gpu_dest.get() 139 | if nparams > 1: 140 | output = output.reshape((nparams, ndata), order='C').T 141 | return output 142 | else: 143 | return gpu_dest 144 | 145 | #------------------------------------------------------------------------------- 146 | # Multivariate normal 147 | 148 | def mvnpdf(data, mean, cov, weight=None, logged=True, get=True, order="F", 149 | datadim=None): 150 | """ 151 | Multivariate normal density 152 | 153 | Parameters 154 | ---------- 155 | 
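    data : ndarray or GPUArray (n, k)
    mean : ndarray (k,)
    cov : ndarray (k, k), positive definite
    weight : float, optional
        multiplier for the density; defaults to 1
    logged : boolean, default True
        if True, return the log density
    get : boolean, default True
        copy the result back to the host; False leaves it on the GPU
    order : {'F', 'C'}, default 'F'
        memory layout of the returned densities
    datadim : int, optional
        original (un-padded) data dimension; required when passing a
        GPUArray that needed padding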
156 | Returns 157 | ------- 158 | """ 159 | return mvnpdf_multi(data, [mean], [cov], 160 | logged=logged, get=get, order=order, 161 | datadim=datadim).squeeze() 162 | 163 | def mvnpdf_multi(data, means, covs, weights=None, logged=True, 164 | get=True, order="F", datadim=None): 165 | """ 166 | Multivariate normal density with multiple sets of parameters 167 | 168 | Parameters 169 | ---------- 170 | data : ndarray (n x k) 171 | covs : sequence of 2d k x k matrices (length j) 172 | weights : ndarray (length j) 173 | Multiplier for component j, usually will sum to 1 174 | 175 | get = False leaves the result on the GPU 176 | without copying back. 177 | 178 | If data has already been padded, the orginal dimension 179 | must be passed in datadim 180 | 181 | It data is of GPUarray type, the data is assumed to be 182 | padded, and datadim will need to be passed if padding 183 | was needed. 184 | 185 | Returns 186 | ------- 187 | densities : n x j 188 | """ 189 | if logged: 190 | cu_func = cu_module.get_function('log_pdf_mvnormal') 191 | else: 192 | cu_func = cu_module.get_function('pdf_mvnormal') 193 | 194 | assert(len(covs) == len(means)) 195 | 196 | ichol_sigmas = [LA.inv(chol(c)) for c in covs] 197 | logdets = [-2.0*np.log(c.diagonal()).sum() for c in ichol_sigmas] 198 | 199 | if weights is None: 200 | weights = np.ones(len(means)) 201 | 202 | packed_params = _pack_mvnpdf_params(means, ichol_sigmas, logdets, weights) 203 | 204 | return _multivariate_pdf_call(cu_func, data, packed_params, 205 | get, order,datadim) 206 | 207 | def _pack_mvnpdf_params(means, ichol_sigmas, logdets, weights): 208 | to_pack = [] 209 | for m, ch, ld, w in zip(means, ichol_sigmas, logdets, weights): 210 | to_pack.append(_pack_mvnpdf_params_single(m, ch, ld, w)) 211 | 212 | return np.vstack(to_pack) 213 | 214 | def _pack_mvnpdf_params_single(mean, ichol_sigma, logdet, weight=1): 215 | PAD_MULTIPLE = 16 216 | k = len(mean) 217 | mean_len = k 218 | ichol_len = k * (k + 1) / 2 219 | mch_len = mean_len + ichol_len 220 | 221 | packed_dim = util.next_multiple(mch_len + 2, PAD_MULTIPLE) 222 | 223 | packed_params = np.empty(packed_dim, dtype=np.float32) 224 | packed_params[:mean_len] = mean 225 | 226 | packed_params[mean_len:mch_len] = ichol_sigma[np.tril_indices(k)] 227 | packed_params[mch_len:mch_len + 2] = weight, logdet 228 | 229 | return packed_params 230 | 231 | #------------------------------------------------------------------------------- 232 | # Univariate normal 233 | 234 | def normpdf(x, mean, std, logged=True, get=True): 235 | """ 236 | Normal (Gaussian) density 237 | 238 | Parameters 239 | ---------- 240 | 241 | Returns 242 | ------- 243 | """ 244 | return normpdf_multi(x, [mean], [std], logged=logged, get=get).squeeze() 245 | 246 | def normpdf_multi(x, means, std, logged=True, get=True): 247 | if logged: 248 | cu_func = cu_module.get_function('log_pdf_normal') 249 | else: 250 | cu_func = cu_module.get_function('pdf_normal') 251 | 252 | packed_params = np.c_[means, std] 253 | 254 | if not isinstance(x, GPUArray): 255 | x = util.prep_ndarray(x) 256 | 257 | return _univariate_pdf_call(cu_func, x, packed_params, get) 258 | 259 | if __name__ == '__main__': 260 | import gpustats.compat as compat 261 | 262 | n = 1e5 263 | k = 8 264 | 265 | np.random.seed(1) 266 | data = randn(n, k).astype(np.float32) 267 | mean = randn(k).astype(np.float32) 268 | cov = util.random_cov(k).astype(np.float32) 269 | 270 | result = mvnpdf_multi(data, [mean, mean], [cov, cov]) 271 | # pyresult = compat.python_mvnpdf(data, [mean], 
[cov]).squeeze() 272 | # print result - pyresult 273 | -------------------------------------------------------------------------------- /gpustats/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.driver as drv 3 | import pycuda.gpuarray as gpuarray 4 | import pycuda 5 | import scipy.linalg as LA 6 | drv.init() 7 | if drv.Context.get_current() is None: 8 | import pycuda.autoinit 9 | from pycuda.compiler import SourceModule 10 | 11 | def threadSafeInit(device = 0): 12 | """ 13 | If gpustats (or any other pycuda work) is used inside a 14 | multiprocessing.Process, this function must be used inside the 15 | thread to clean up invalid contexts and create a new one on the 16 | given device. Assumes one GPU per thread. 17 | """ 18 | 19 | import atexit 20 | drv.init() # just in case 21 | 22 | ## clean up all contexts. most will be invalid from 23 | ## multiprocessing fork 24 | import os; import sys 25 | clean = False 26 | while not clean: 27 | _old_ctx = drv.Context.get_current() 28 | if _old_ctx is None: 29 | clean = True 30 | else: 31 | ## detach: will give warnings to stderr if invalid 32 | _old_cerr = os.dup(sys.stderr.fileno()) 33 | _nl = os.open(os.devnull, os.O_RDWR) 34 | os.dup2(_nl, sys.stderr.fileno()) 35 | _old_ctx.detach() 36 | sys.stderr = os.fdopen(_old_cerr, "wb") 37 | os.close(_nl) 38 | from pycuda.tools import clear_context_caches 39 | clear_context_caches() 40 | 41 | ## init a new device 42 | dev = drv.Device(device) 43 | ctx = dev.make_context() 44 | 45 | ## pycuda.autoinit exitfunc is bad now .. delete it 46 | exit_funcs = atexit._exithandlers 47 | for fn in exit_funcs: 48 | if hasattr(fn[0], 'func_name'): 49 | if fn[0].func_name == '_finish_up': 50 | exit_funcs.remove(fn) 51 | if fn[0].func_name == 'clean_all_contexts': # avoid duplicates 52 | exit_funcs.remove(fn) 53 | 54 | ## make sure we clean again on exit 55 | atexit.register(clean_all_contexts) 56 | 57 | 58 | def clean_all_contexts(): 59 | 60 | ctx = True 61 | while ctx is not None: 62 | ctx = drv.Context.get_current() 63 | if ctx is not None: 64 | ctx.detach() 65 | 66 | from pycuda.tools import clear_context_caches 67 | clear_context_caches() 68 | 69 | 70 | def GPUarray_reshape(garray, shape=None, order="C"): 71 | if shape is None: 72 | shape = garray.shape 73 | return gpuarray.GPUArray( 74 | shape=shape, 75 | dtype=garray.dtype, 76 | allocator=garray.allocator, 77 | base=garray, 78 | gpudata=int(garray.gpudata), 79 | order=order) 80 | 81 | def GPUarray_order(garray, order="F"): 82 | """ 83 | will set the order of garray in place 84 | """ 85 | if order=="F": 86 | if garray.flags.f_contiguous: 87 | exit 88 | else: 89 | garray.strides = gpuarray._f_contiguous_strides( 90 | garray.dtype.itemsize, garray.shape) 91 | garray.flags.f_contiguous = True 92 | garray.flags.c_contiguous = False 93 | elif order=="C": 94 | if garray.flags.c_contiguous: 95 | exit 96 | else: 97 | garray.strides = gpuarray._c_contiguous_strides( 98 | garray.dtype.itemsize, garray.shape) 99 | garray.flags.c_contiguous = True 100 | garray.flags.f_contiguous = False 101 | 102 | 103 | 104 | _dev_attr = drv.device_attribute 105 | ## TO DO: should be different for each device .. 
assumes they are the same 106 | class DeviceInfo(object): 107 | 108 | def __init__(self): 109 | #self._dev = pycuda.autoinit.device 110 | #self._dev = drv.Device(dev) 111 | self._dev = drv.Context.get_device() 112 | self._attr = self._dev.get_attributes() 113 | 114 | self.max_block_threads = self._attr[_dev_attr.MAX_THREADS_PER_BLOCK] 115 | self.shared_mem = self._attr[_dev_attr.MAX_SHARED_MEMORY_PER_BLOCK] 116 | self.warp_size = self._attr[_dev_attr.WARP_SIZE] 117 | self.max_registers = self._attr[_dev_attr.MAX_REGISTERS_PER_BLOCK] 118 | self.compute_cap = self._dev.compute_capability() 119 | self.max_grid_dim = (self._attr[_dev_attr.MAX_GRID_DIM_X], 120 | self._attr[_dev_attr.MAX_GRID_DIM_Y]) 121 | 122 | info = DeviceInfo() 123 | 124 | HALF_WARP = 16 125 | 126 | def random_cov(dim): 127 | from pymc.distributions import rwishart 128 | return LA.inv(rwishart(dim, np.eye(dim))) 129 | 130 | def unvech(v): 131 | # quadratic formula, correct fp error 132 | rows = .5 * (-1 + np.sqrt(1 + 8 * len(v))) 133 | rows = int(np.round(rows)) 134 | 135 | result = np.zeros((rows, rows)) 136 | result[np.triu_indices(rows)] = v 137 | result = result + result.T 138 | 139 | # divide diagonal elements by 2 140 | result[np.diag_indices(rows)] /= 2 141 | 142 | return result 143 | 144 | def pad_data_mult16(data, fill=0): 145 | """ 146 | Pad data to be a multiple of 16 for discrete sampler. 147 | """ 148 | 149 | if type(data) == gpuarray: 150 | data = data.get() 151 | 152 | n, k = data.shape 153 | 154 | km = int(k/16) + 1 155 | 156 | newk = km*16 157 | if newk != k: 158 | padded_data = np.zeros((n, newk), dtype=np.float32) 159 | if fill!=0: 160 | padded_data = padded_data + fill 161 | 162 | padded_data[:,:k] = data 163 | 164 | return padded_data 165 | else: 166 | return prep_ndarray(data) 167 | 168 | def pad_data(data): 169 | """ 170 | Pad data to avoid bank conflicts on the GPU-- dimension should not be a 171 | multiple of the half-warp size (16) 172 | """ 173 | if type(data) == gpuarray: 174 | data = data.get() 175 | 176 | n, k = data.shape 177 | 178 | if not k % HALF_WARP: 179 | pad_dim = k + 1 180 | else: 181 | pad_dim = k 182 | 183 | if k != pad_dim: 184 | padded_data = np.empty((n, pad_dim), dtype=np.float32) 185 | padded_data[:, :k] = data 186 | 187 | return padded_data 188 | else: 189 | return prep_ndarray(data) 190 | 191 | def prep_ndarray(arr): 192 | # is float32 and contiguous? 193 | if not arr.dtype == np.float32 or not arr.flags.contiguous: 194 | arr = np.array(arr, dtype=np.float32, order='C') 195 | 196 | return arr 197 | 198 | 199 | 200 | 201 | def tune_blocksize(data, params, func_regs): 202 | """ 203 | For multivariate distributions-- what's the optimal block size given the 204 | gpu? 
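    Starts from 64 parameter sets per block (or, when len(params) is
    smaller, the smallest power of two that covers it), halves params_per
    and then data_per until the shared-memory, thread-count, and register
    budgets are all satisfied, and finally doubles data_per while
    everything still fits.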
205 | 206 | Parameters 207 | ---------- 208 | data : ndarray 209 | params : ndarray 210 | 211 | Returns 212 | ------- 213 | (data_per, params_per) : (int, int) 214 | """ 215 | #info = DeviceInfo() 216 | 217 | max_smem = info.shared_mem * 0.9 218 | max_threads = int(info.max_block_threads * 0.5) 219 | max_regs = info.max_registers 220 | max_grid = int(info.max_grid_dim[0]) 221 | 222 | params_per = 64#max_threads 223 | if (len(params) < params_per): 224 | params_per = _next_pow2(len(params), info.max_block_threads) 225 | 226 | min_data_per = data.shape[0] / max_grid; 227 | data_per0 = _next_pow2( max( max_threads / params_per, min_data_per ), 512); 228 | data_per = data_per0 229 | 230 | def _can_fit(data_per, params_per): 231 | ok = compute_shmem(data, params, data_per, params_per) <= max_smem 232 | ok = ok and data_per*params_per <= max_threads 233 | return ok and func_regs*data_per*params_per <= max_regs 234 | 235 | while True: 236 | while not _can_fit(data_per, params_per): 237 | if data_per <= min_data_per: 238 | break 239 | 240 | if params_per > 1: 241 | # reduce number of parameters first 242 | params_per /= 2 243 | else: 244 | # can't go any further, have to do less data 245 | data_per /= 2 246 | 247 | if data_per <= min_data_per: 248 | # we failed somehow. start over 249 | data_per = 2 * data_per0 250 | params_per /= 2 251 | continue 252 | else: 253 | break 254 | 255 | while _can_fit(2 * data_per, params_per): 256 | #if 2 * data_per * params_per < max_threads: 257 | data_per *= 2 258 | #else: 259 | # hit block size limit 260 | # break 261 | 262 | #import pdb; pdb.set_trace() 263 | return data_per, params_per 264 | 265 | def get_boxes(n, box_size): 266 | # how many boxes of size box_size are needed to hold n things 267 | return int((n + box_size - 1) / box_size) 268 | 269 | def compute_shmem(data, params, data_per, params_per): 270 | result_space = data_per * params_per 271 | 272 | data_dim = 1 if len(data.shape) == 1 else data.shape[1] 273 | params_dim = len(params) if len(params.shape) == 1 else params.shape[1] 274 | 275 | param_space = params_dim * params_per 276 | data_space = data_dim * data_per 277 | return 4 * (result_space + param_space + data_space) 278 | 279 | def _next_pow2(k, pow2): 280 | while k <= pow2 / 2: 281 | pow2 /= 2 282 | return pow2 283 | 284 | def next_multiple(k, mult): 285 | if k % mult: 286 | return k + (mult - k % mult) 287 | else: 288 | return k 289 | 290 | def get_cufiles_path(): 291 | import os.path as pth 292 | basepath = pth.abspath(pth.split(__file__)[0]) 293 | return pth.join(basepath, 'cufiles') 294 | 295 | 296 | from pycuda.tools import context_dependent_memoize 297 | 298 | @context_dependent_memoize 299 | def _get_transpose_kernel(): 300 | 301 | #info = DeviceInfo() 302 | if info.max_block_threads >= 1024: 303 | t_block_size = 32 304 | else: 305 | t_block_size = 16 306 | 307 | import os.path as pth 308 | mod = SourceModule( 309 | open(pth.join(get_cufiles_path(), "transpose.cu")).read() % { "block_size" : t_block_size }) 310 | 311 | func = mod.get_function("transpose") 312 | func.prepare("PPii") #, block=(t_block_size, t_block_size, 1)) 313 | return t_block_size, func 314 | 315 | 316 | #from pytools import Record 317 | #class TransposeKernelInfo(Record): pass 318 | #return TransposeKernelInfo(func=func, 319 | # block_size=t_block_size, 320 | # granularity=t_block_size) 321 | 322 | 323 | def _transpose(tgt, src): 324 | block_size, func = _get_transpose_kernel() 325 | 326 | 327 | h, w = src.shape 328 | assert tgt.shape == (w, h) 329 | #assert w % 
block_size == 0 330 | #assert h % block_size == 0 331 | 332 | gw = int(np.ceil(float(w) / block_size)) 333 | gh = int(np.ceil(float(h) / block_size)) 334 | gz = int(1) 335 | 336 | ### 3D grids are needed for larger data ... should be coming soon ... 337 | #while gw > info.max_grid_dim[0]: 338 | # gz += 1 339 | # gw = int(np.ceil(float(w) / (gz * block_size) )) 340 | 341 | func.prepared_call( 342 | (gw, gh), 343 | (block_size, block_size, 1), 344 | tgt.gpudata, src.gpudata, w, h) 345 | 346 | 347 | def transpose(src): 348 | h, w = src.shape 349 | 350 | result = gpuarray.empty((w, h), dtype=src.dtype) 351 | _transpose(result, src) 352 | del src # drop our local reference to the source buffer 353 | return result 354 | --------------------------------------------------------------------------------
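As a closing cross-check, here is a NumPy reference for the log density that the multivariate kernels above evaluate; a minimal sketch mirroring compute_pdf in old/mvnpdf.cu and the inverse-Cholesky/logdet packing in gpustats/pdfs.py (mvn_logpdf_ref is an illustrative name, not a package function):

import numpy as np
from numpy.linalg import inv, cholesky

LOG_2_PI = np.log(2 * np.pi)

def mvn_logpdf_ref(x, mean, cov, weight=1.0):
    # discrim = || L^{-1} (x - mean) ||^2 with L the lower Cholesky factor,
    # so that log N(x; mean, cov) = -0.5 * (discrim + log|cov| + k log 2 pi)
    dim = len(mean)
    ichol = inv(cholesky(cov))                        # lower triangular
    demeaned = np.asarray(x, dtype=np.float64) - mean
    discrim = (np.dot(ichol, demeaned) ** 2).sum()
    logdet = np.log(np.linalg.det(cov))
    return np.log(weight) - 0.5 * (discrim + logdet + LOG_2_PI * dim)

Row by row, gpustats.mvnpdf(data, mean, cov, logged=True) should agree with this to single precision.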