├── dev_notes.rst
├── .gitignore
├── MANIFEST.in
├── old
    ├── pdfs.py
    ├── mvnpdf.h
    ├── build_cython.py
    ├── kernels.h
    ├── gpustats.pxd
    ├── cytest.pyx
    ├── scratch.py
    ├── cucommon.h
    ├── common.c
    ├── util.py
    ├── Makefile
    ├── common.h
    └── mvnpdf.cu
├── gpustats
    ├── multigpu.py
    ├── compat.py
    ├── cufiles
    │   ├── support.cu
    │   ├── cpustub.cu
    │   ├── transpose.cu
    │   ├── univcaller.cu
    │   ├── mvcaller.cu
    │   ├── sample_discrete.cu
    │   ├── sample_discrete_logged.cu
    │   └── sampleFromMeasureMedium.cu
    ├── kernels.py
    ├── __init__.py
    ├── tests
    │   ├── test_samplers.py
    │   └── test_pdfs.py
    ├── sampler.py
    ├── codegen.py
    ├── pdfs.py
    └── util.py
├── LICENSE
├── README.rst
├── setup.py
├── examples
    └── pymc_test.py
└── scripts
    └── bench.py


/dev_notes.rst:
--------------------------------------------------------------------------------
1 | - EMmvNormalPDF : do nothing with
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.o
 3 | *.so
 4 | *.cu_o
 5 | bin
 6 | obj
 7 | build
 8 | *~
 9 | foo.cu
10 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include *.txt *.py *.rst
 2 | include MANIFEST.in
 3 | recursive-include gpustats/cufiles *
 4 | 
 5 | #exclude build
 6 | #exclude dist
 7 | 
 8 | graft gpustats/tests
 9 | global-exclude *~ *.swp  *.pyc *.bak
10 | 


--------------------------------------------------------------------------------
/old/pdfs.py:
--------------------------------------------------------------------------------
 1 | from numpy.linalg import inv, cholesky as chol
 2 | import numpy as np
 3 | 
 4 | import testmod
 5 | import util
 6 | 
 7 | def mvnpdf(data, means, covs):
 8 |     '''
 9 |     Compute multivariate normal log pdf
10 | 
11 |     Parameters
12 |     ----------
13 | 
14 |     Returns
15 |     -------
16 | 
17 |     '''
18 |     logdets = [np.log(np.linalg.det(c)) for c in covs]
19 |     ichol_sigmas = [inv(chol(c)) for c in covs]
20 | 
21 |     packed_params = util.pack_params(means, ichol_sigmas, logdets)
22 |     packed_data = util.pad_data(data)
23 |     return testmod.mvn_call(packed_data, packed_params,
24 |                             data.shape[1])
25 | 


--------------------------------------------------------------------------------
/old/mvnpdf.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MVNPDF_H__
 2 | #define __MVNPDF_H__
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | #include "common.h"
 9 | 
10 | void mvnpdf(float* h_data, /** Data-vector; padded */
11 | 			 float* h_params, /** Density info; already padded */
12 | 			 float* h_pdf, /** Resultant PDF */
13 | 			 int data_dim,
14 | 			 int total_obs,
15 | 			 int nparams,
16 | 			 int param_stride, // with padding
17 | 			 int data_stride // with padding
18 |   );
19 | 
20 | void cpu_mvnormpdf(float* x, float* density, float * output, int dim,
21 |                    int padded_dim, int N, int T);
22 | 
23 | 
24 | #ifdef __cplusplus
25 | }
26 | #endif
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/old/build_cython.py:
--------------------------------------------------------------------------------
 1 | #/usr/bin/env python
 2 | 
 3 | from distutils.extension import Extension
 4 | from numpy.distutils.core import setup
 5 | from Cython.Distutils import build_ext
 6 | import numpy
 7 | 
 8 | def get_cuda_include():
 9 |     return '/usr/local/cuda/include'
10 | 
11 | pyx_ext = Extension('testmod', ['cytest.pyx'],
12 |                     include_dirs=[numpy.get_include(),
13 |                                   get_cuda_include()],
14 |                     library_dirs=['.'],
15 |                     libraries=['gpustats'])
16 | 
17 | setup(name='testmod', description='',
18 |       ext_modules=[pyx_ext],
19 |       cmdclass = {
20 |           'build_ext' : build_ext
21 |       })
22 | 


--------------------------------------------------------------------------------
/old/kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef __KERNELS_H__
 2 | #define __KERNELS_H__
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #include "common.h"
 7 | 
 8 | __global__ void mvNormalPDF(
 9 | 					REAL* iData, /** Data-vector; padded */
10 | 					REAL* iDensityInfo, /** Density info; already padded */
11 | 					REAL* oMeasure, /** Resultant measure */
12 | 					int iD, /** Not currently necessary, as DIM is hardcoded */
13 | 					int iN,
14 | 					int iTJ,
15 | 					int isLogScaled
16 | 				);
17 | 
18 | cudaError_t gpuMvNormalPDF(
19 | 					REAL* iData, /** Data-vector; padded */
20 | 					REAL* iDensityInfo, /** Density info; already padded */
21 | 					REAL* oMeasure, /** Resultant measure */
22 | 					int iD, /** Not currently necessary, as DIM is hardcoded */
23 | 					int iN,
24 | 					int iTJ
25 | 				);
26 | 
27 | #endif // __KERNELS_H__
28 | 


--------------------------------------------------------------------------------
/old/gpustats.pxd:
--------------------------------------------------------------------------------
 1 | cdef extern from "cuda.h":
 2 |     struct cudaError_t:
 3 |         pass
 4 |     char* cudaGetErrorString(cudaError_t err)
 5 | 
 6 | cdef extern from "common.h":
 7 |     struct PMatrix:
 8 |         float* data
 9 |         int rows
10 |         int cols
11 |         int stride
12 | 
13 |     void PMatrix_init(float* d, int r, int c, int s)
14 | 
15 |     void set_device(int device)
16 | 
17 | cdef extern from "mvnpdf.h":
18 |     void mvnpdf(float* h_data,
19 |                 float* h_params,
20 |                 float* h_pdf,
21 |                 int data_dim,
22 |                 int total_obs,
23 |                 int nparams,
24 |                 int param_stride,
25 |                 int data_stride) nogil
26 | 
27 |     void cpu_mvnpdf(float* x, float* density, float * output, int D,
28 |                     int padded_dim, int N, int T) nogil
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/gpustats/multigpu.py:
--------------------------------------------------------------------------------
 1 | from threading import Thread
 2 | 
 3 | import testmod
 4 | 
 5 | class GPUCall(Thread):
 6 |     """
 7 | 
 8 |     """
 9 | 
10 |     def __init__(self, func, device=0):
11 |         self.func = func
12 |         self.device = device
13 | 
14 |     def acquire_device(self):
15 |         testmod.set_device(self.device)
16 | 
17 |     def release_device(self):
18 |         pass
19 | 
20 |     def run(self):
21 |         self.acquire_device()
22 |         self.func()
23 |         self.release_device()
24 | 
25 | def make_calls(func, data, devices=None, splits=None):
26 |     """
27 | 
28 |     Parameters
29 |     ----------
30 | 
31 |     Returns
32 |     -------
33 | 
34 |     """
35 |     if splits is None:
36 |         pass
37 | 
38 | def _execute_calls(calls):
39 |     """
40 | 
41 |     """
42 |     for call in calls:
43 |         call.start()
44 | 
45 |     for call in calls:
46 |         call.join()
47 | 
48 | 
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/gpustats/compat.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Python versions of functions for testing purposes etc.
 3 | """
 4 | import numpy as np
 5 | 
 6 | def python_mvnpdf(data, means, covs):
 7 |     from pymc import mv_normal_cov_like as pdf_func
 8 | 
 9 |     results = []
10 |     for i, datum in enumerate(data):
11 |         for j, cov in enumerate(covs):
12 |             mean = means[j]
13 |             results.append(pdf_func(datum, mean, cov))
14 | 
15 |     return np.array(results).reshape((len(data), len(covs))).squeeze()
16 | 
17 | def python_sample_discrete(pmfs, draws=None):
18 |     T, K = pmfs.shape
19 |     output = np.empty(T, dtype=np.int32)
20 |     if draws is None:
21 |         draws = np.random.rand(T)
22 | 
23 |     # rescale
24 |     pmfs = (pmfs.T / pmfs.sum(1)).T
25 | 
26 |     for i in xrange(T):
27 |         the_sum = 0
28 |         draw = draws[i]
29 |         for j in xrange(K):
30 |             the_sum += pmfs[i, j]
31 | 
32 |             if the_sum >= draw:
33 |                 output[i] = j
34 |                 break
35 | 
36 |     return output
37 | 
38 | if __name__ == '__main__':
39 |     pmfs = np.random.randn(20, 5)
40 |     pmfs = (pmfs.T - pmfs.min(1)).T
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/old/cytest.pyx:
--------------------------------------------------------------------------------
 1 | cimport numpy as cnp
 2 | from numpy cimport ndarray
 3 | import numpy as np
 4 | 
 5 | cimport gpustats as gps
 6 | 
 7 | def set_device(device):
 8 |     '''
 9 |     Set the CUDA device
10 |     '''
11 |     gps.set_device(device)
12 | 
13 | def cpu_mvnpdf(ndarray packed_data, ndarray packed_params, int dim):
14 |     n, j = len(packed_data), len(packed_params)
15 | 
16 |     padded_dim = (<object> packed_data).shape[1]
17 | 
18 |     cdef ndarray output = np.empty((n, j), dtype=np.float32)
19 |     gps.cpu_mvnpdf(<float*> packed_data.data,
20 |                     <float*> packed_params.data,
21 |                     <float*> output.data,
22 |                     dim, padded_dim, n, j)
23 | 
24 |     return output
25 | 
26 | def mvn_call(ndarray packed_data, ndarray packed_params, int dim):
27 |     '''
28 |     Invoke MVN kernel on prepared data
29 | 
30 |     Releases GIL
31 |     '''
32 |     cdef int n, k, pn, pk
33 | 
34 |     n, k = (<object> packed_data).shape
35 |     pn, pk = (<object> packed_params).shape
36 | 
37 |     cdef ndarray output = np.empty((n, pn), np.float32, order='F')
38 | 
39 |     with nogil:
40 |         gps.mvnpdf(<float*> packed_data.data,
41 |                     <float*> packed_params.data,
42 |                     <float*> output.data,
43 |                    dim, n, pn, pk, k)
44 | 
45 |     return output
46 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/support.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | #define LOG_2_PI 1.83787706640935f
 3 | #define LOG_PI 1.144729885849400f
 4 | 
 5 | __device__ int d_next_multiple(int k, int mult) {
 6 |   if (k % mult)
 7 |     return k + (mult - k % mult);
 8 |   else
 9 |     return k;
10 | }
11 | 
12 | __device__ void copy_chunks(float* in_buf, float* out_buf,
13 |                             unsigned int tid, unsigned int total) {
14 |   for (unsigned int chunk = 0; chunk + tid < total; chunk += blockDim.x) {
15 |     out_buf[chunk + tid] = in_buf[chunk + tid];
16 |   }
17 | }
18 | 
19 | __device__ void copy_chunks_strided(float* in_buf, float* out_buf,
20 |                             unsigned int tid, unsigned int ncols, 
21 | 			    unsigned int nrows, unsigned int stride) {
22 |   unsigned int outind = 0; unsigned int total = ncols*nrows;
23 |   for (unsigned int chunk = 0; chunk + tid < total; chunk += blockDim.x) {
24 |     outind = ((chunk + tid)/ncols)*stride + (chunk + tid) % ncols;
25 |     out_buf[outind] = in_buf[chunk + tid];
26 |   }
27 | }
28 | 
29 | 
30 | __device__ inline void atomic_add(float* address, float value){
31 | #if __CUDA_ARCH__ >= 200 // for Fermi, atomicAdd supports floats
32 |   atomicAdd(address, value);
33 | #elif __CUDA_ARCH__ >= 110
34 |   float old = value;
35 |   while ((old = atomicExch(address, atomicExch(address, 0.0f)+old))!=0.0f);
36 | #endif
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/cpustub.cu:
--------------------------------------------------------------------------------
 1 | int MAX_BLOCK_PARAMS = 64;
 2 | 
 3 | cudaError_t invoke_mvnpdf(PMatrix data, PMatrix params, float* d_pdf) {
 4 |   // Need to automatically tune block / grid layout to maximize shared memory
 5 |   // usage and coalescence, reduce wasted threads!
 6 |   BlockDesign design;
 7 |   get_tuned_layout(&design, &data, &params, MAX_BLOCK_PARAMS);
 8 | 
 9 |   int nthreads = design.data_per_block * design.params_per_block;
10 | 
11 |   // Now set up grid layout / block size
12 |   int grid_x = get_boxes(data.rows, design.data_per_block);
13 |   int grid_y = get_boxes(params.rows, design.params_per_block);
14 |   dim3 gridPDF(grid_x, grid_y);
15 |   dim3 blockPDF(nthreads, 1);
16 | 
17 |   int sharedMemSize = compute_shmem(&data, &params,
18 |                                     design.params_per_block,
19 |                                     design.data_per_block);
20 | 
21 | #ifdef DEBUG
22 |   printf("number params: %d, number data points: %d\n",
23 |          design.params_per_block, design.data_per_block);
24 |   printf("sharedMemSize: %d\n", sharedMemSize);
25 |   printf("block: %d x %d, grid: %d x %d\n", blockPDF.x, blockPDF.y,
26 |          gridPDF.x, gridPDF.y);
27 |   printf("design: %d x %d\n", design.data_per_block, design.params_per_block);
28 | 
29 |   printf("nparams: %d\n", params.rows);
30 | #endif
31 | 
32 |   mvnpdf_k<<<gridPDF,blockPDF,sharedMemSize>>>(data, params, design, d_pdf);
33 |   return cudaSuccess;
34 | }
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010 Duke University and collaborators
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are
 6 | met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |        notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |        copyright notice, this list of conditions and the following
13 |        disclaimer in the documentation and/or other materials provided
14 |        with the distribution.
15 | 
16 |     * Neither the name of the copyright holder nor the names of any
17 |        contributors may be used to endorse or promote products derived
18 |        from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | ========
 2 | GPUStats
 3 | ========
 4 | 
 5 | gpustats is a PyCUDA-based library implementing functionality similar to that
 6 | present in scipy.stats. It implements a simple framework for specifying new CUDA
 7 | kernels and extending existing ones. Here is a (partial) list of target
 8 | functionality:
 9 | 
10 | * Probability density functions (pdfs). These are intended to speed up
11 |   likelihood calculations in particular in Bayesian inference applications, such
12 |   as in PyMC
13 | 
14 | * Random variable generation using CURAND
15 | 
16 | Requirements
17 | ------------
18 | 
19 | * NumPy
20 | * SciPy
21 | * Working PyCUDA (http://pypi.python.org/pypi/pycuda) installation
22 | * (optional) PyMC, for test suite
23 | 
24 | Installation and testing
25 | ------------------------
26 | 
27 | To install, run:
28 | 
29 | ::
30 | 
31 |     python setup.py install
32 | 
33 | If you have `nose` installed, you may run the test suite by running:
34 | 
35 | ::
36 | 
37 |     import gpustats
38 | 	gpustats.test()
39 | 
40 | Use
41 | ---
42 | 
43 | ::
44 | 
45 |     import gpustats
46 | 
47 | Some development guidelines
48 | ---------------------------
49 | 
50 | * Use spaces (4 per indent), not tabs
51 | * Trim whitespace at the end of lines (most text editors will do this for you)
52 | * PEP8-consistent Python style
53 | 
54 | People
55 | ------
56 | 
57 | Cliburn Chan cliburn.chan (at) duke.edu
58 | Andrew Cron ajc40 (at) stat.duke.edu
59 | Jacob Frelinger jacob.frelinger (at) duke.edu
60 | Wes McKinney wesmckinn (at) gmail.com
61 | Adam Richards adam.richards (at) duke.edu
62 | Marc Suchard msuchard (at) ucla.edu
63 | Quanli Wang quanli (at) stat.duke.edu
64 | Mike West mw (at) stat.duke.edu
65 | 
66 | Notes
67 | -----
68 | Requires working PyCUDA installation
69 | 


--------------------------------------------------------------------------------
/gpustats/kernels.py:
--------------------------------------------------------------------------------
 1 | from gpustats.codegen import (MVDensityKernel, DensityKernel, Exp,
 2 |                               CUFile)
 3 | import gpustats.codegen as cg
 4 | 
 5 | # TODO: check for name conflicts!
 6 | 
 7 | _log_pdf_mvnormal = """
 8 | __device__ float %(name)s(float* data, float* params, int dim) {
 9 |   unsigned int LOGDET_OFFSET = dim * (dim + 3) / 2;
10 |   float* mean = params;
11 |   float* sigma = params + dim;
12 |   float mult = params[LOGDET_OFFSET];
13 |   float logdet = params[LOGDET_OFFSET + 1];
14 | 
15 |   float discrim = 0;
16 |   float sum;
17 |   unsigned int i, j;
18 |   for (i = 0; i < dim; ++i)
19 |   {
20 |     sum = 0;
21 |     for(j = 0; j <= i; ++j) {
22 |       sum += *sigma++ * (data[j] - mean[j]);
23 |     }
24 |     discrim += sum * sum;
25 |   }
26 |   return log(mult) - 0.5f * (discrim + logdet + LOG_2_PI * dim);
27 | }
28 | """
29 | log_pdf_mvnormal = MVDensityKernel('log_pdf_mvnormal', _log_pdf_mvnormal)
30 | pdf_mvnormal = Exp('pdf_mvnormal', log_pdf_mvnormal)
31 | 
32 | 
33 | _log_pdf_normal = """
34 | __device__ float %(name)s(float* x, float* params) {
35 |   // mean stored in params[0]
36 |   float std = params[1];
37 | 
38 |   // standardize
39 |   float xstd = (*x - params[0]) / std;
40 |   return - (xstd * xstd) / 2 - 0.5f * LOG_2_PI - log(std);
41 | }
42 | """
43 | log_pdf_normal = DensityKernel('log_pdf_normal', _log_pdf_normal)
44 | pdf_normal = Exp('pdf_normal', log_pdf_normal)
45 | 
46 | sample_discrete_old = CUFile('sample_discrete_old',
47 |                          'sample_discrete.cu')
48 | 
49 | sample_discrete_logged_old = CUFile('sample_discrete_logged_old',
50 |                                 'sample_discrete_logged.cu')
51 | 
52 | sample_discrete = CUFile('sample_discrete',
53 |                              'sampleFromMeasureMedium.cu')
54 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/transpose.cu:
--------------------------------------------------------------------------------
 1 | // Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises
 2 | 
 3 | // Transposition of a matrix
 4 | // by Hendrik Riedmann <riedmann@dam.brown.edu>
 5 | // Andrew Cron added bounds checks ...
 6 | 
 7 | // Andrew Cron added Z grid dimension to X for larger matrices
 8 | 
 9 | #define BLOCK_SIZE %(block_size)d
10 |     #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
11 |     #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)
12 | 
13 |     __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
14 |     {
15 | 	int bidx = blockIdx.x + blockIdx.z;
16 |         // Base indices in A and A_t
17 |         int base_idx_a   = bidx * BLOCK_SIZE + 
18 | 	blockIdx.y * A_BLOCK_STRIDE;
19 |         int base_idx_a_t = blockIdx.y * BLOCK_SIZE + 
20 | 	bidx * A_T_BLOCK_STRIDE;
21 | 
22 |         // Global indices in A and A_t
23 |         int glob_idx_a = base_idx_a + threadIdx.x + a_width * threadIdx.y;
24 |         int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;
25 | 
26 | 	int a_x_pos = bidx * BLOCK_SIZE + threadIdx.x;
27 | 	int a_y_pos = blockIdx.y * BLOCK_SIZE + threadIdx.y;
28 | 	int at_x_pos = blockIdx.y * BLOCK_SIZE + threadIdx.x;
29 | 	int at_y_pos = bidx * BLOCK_SIZE + threadIdx.y;
30 | 
31 |         __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+1];
32 | 
33 | 	if( a_x_pos < a_width && a_y_pos < a_height ){
34 |             // Store transposed submatrix to shared memory
35 |             A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
36 |         }          
37 |         __syncthreads();
38 |         if( at_x_pos < a_height && at_y_pos < a_width ){
39 |             // Write transposed submatrix to global memory
40 |             A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
41 | 	}
42 | 
43 |     }
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/old/scratch.py:
--------------------------------------------------------------------------------
 1 | from numpy.random import randn
 2 | from numpy.linalg import cholesky as chol
 3 | import numpy as np
 4 | import numpy.linalg as L
 5 | import scipy.special as sp
 6 | import pymc.flib as flib
 7 | import time
 8 | import testmod
 9 | import util
10 | import pdb
11 | 
12 | def gen_testdata(n=100, k=4):
13 |     # use static data to compare to R
14 |     data = randn(n, k)
15 |     mean = randn(k)
16 | 
17 |     np.savetxt('test_data', data)
18 |     np.savetxt('test_mean', mean)
19 | 
20 | def load_testdata():
21 |     data = np.loadtxt('test_data')
22 |     mean = np.loadtxt('test_mean')
23 |     cov = np.cov(data.T)
24 | 
25 | 
26 |     return data, mean, cov
27 | 
28 | def bench(cpu_func, gpu_func, gruns=50):
29 |     """
30 | 
31 |     """
32 | 
33 |     _s = time.clock()
34 |     for i in xrange(gruns):
35 |         gpu_func()
36 | 
37 |     gpu_speed = (time.clock() - _s) / gruns
38 | 
39 |     _s = time.clock()
40 |     cpu_func()
41 |     cpu_speed = (time.clock() - _s)
42 |     print 'CPU speed: %.3f' % (cpu_speed * 1000)
43 |     print 'GPU speed: %.3f' % (gpu_speed * 1000)
44 |     print cpu_speed / gpu_speed
45 | 
46 | if __name__ == '__main__':
47 |     testmod.set_device(0)
48 | 
49 |     n = 1e3
50 |     k = 16
51 | 
52 |     data = randn(n, k).astype(np.float32)
53 |     mean = randn(k)
54 |     cov = np.array(util.random_cov(k), dtype=np.float32)
55 | 
56 |     j = 32
57 | 
58 |     padded_data = util.pad_data(data)
59 | 
60 |     chol_sigma = chol(cov)
61 |     ichol_sigma = L.inv(chol_sigma)
62 |     logdet = np.log(np.linalg.det(cov))
63 | 
64 |     means = (mean,) * j
65 |     covs = (ichol_sigma,) * j
66 |     logdets = (logdet,) * j
67 | 
68 |     packed_params = util.pack_params(means, covs, logdets)
69 | 
70 |     cpu_func = lambda: testmod.cpu_mvnpdf(padded_data, packed_params, k).squeeze()
71 |     gpu_func = lambda: testmod._mvnpdf(padded_data, packed_params, k).squeeze()
72 | 
73 |     print cpu_func()
74 |     print gpu_func()
75 | 
76 |     # bench(cpu_func, gpu_func, gruns=50)
77 | 


--------------------------------------------------------------------------------
/old/cucommon.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Common functions for GPUStats CUDA kernels and interface functions
 3 | 
 4 |  */
 5 | #ifndef __CUCOMMON_H__
 6 | #define __CUCOMMON_H__
 7 | 
 8 | #ifdef __cplusplus
 9 | extern "C" {
10 | #endif
11 | 
12 | #include <stdio.h>
13 | #include <cuda.h>
14 | #include <cuda_runtime_api.h>
15 | 
16 | int smem_size() {
17 |   int dev = 0;
18 |   cudaDeviceProp deviceProp;
19 |   cudaGetDeviceProperties(&deviceProp, dev);
20 |   return deviceProp.sharedMemPerBlock;
21 | }
22 | 
23 | int max_block_threads() {
24 |   int dev = 0;
25 |   cudaDeviceProp deviceProp;
26 |   cudaGetDeviceProperties(&deviceProp, dev);
27 |   return deviceProp.maxThreadsPerBlock;
28 | }
29 | 
30 | // Simple strided matrix data structure, far as I can tell there's little or no
31 | // overhead in the compiled version.
32 | typedef struct PMatrix {
33 |   float* buf; // C-style row-major data
34 |   int rows; // actual number of rows
35 |   int cols; // actual number of columns
36 |   int stride; // data length of row
37 | } PMatrix;
38 | 
39 | void PMatrix_init(PMatrix* mat, float* data, int rows, int cols, int stride){
40 |   mat->buf = data;
41 |   mat->rows = rows;
42 |   mat->cols = cols;
43 |   mat->stride = stride;
44 | }
45 | 
46 | typedef struct {
47 |   int data_per_block;
48 |   int params_per_block;
49 | } BlockDesign;
50 | 
51 | int next_pow2(int k, int pow2) {
52 |   // next highest power of two
53 |   while (k <= pow2 / 2) pow2 /= 2;
54 |   return pow2;
55 | }
56 | 
57 | int get_boxes(int n, int box_size) {
58 |   // how many boxes of size box_size are needed to hold n things
59 |   return (n + box_size - 1) / box_size;
60 | }
61 | 
62 | void inline h_to_d(float* h_ptr, float* d_ptr, size_t n){
63 |   cudaError_t error;
64 |   CATCH_ERR(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float), cudaMemcpyHostToDevice));
65 | }
66 | 
67 | void inline d_to_h(float* d_ptr, float* h_ptr, size_t n){
68 |   cudaError_t error;
69 |   CATCH_ERR(cudaMemcpy(h_ptr, d_ptr, n * sizeof(float), cudaMemcpyDeviceToHost));
70 | }
71 | 
72 | #ifdef __cplusplus
73 | }
74 | #endif
75 | 
76 | #endif // __CUCOMMON_H__
77 | 


--------------------------------------------------------------------------------
/old/common.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <cuda.h>
 5 | #include "common.h"
 6 | 
 7 | #if _WIN32
 8 | 	#define isnan(x) ((x) != (x))
 9 | #endif
10 | 
11 | void set_device(int device) {
12 |   cudaError_t error;
13 |   CATCH_ERR(cudaSetDevice(device));
14 | }
15 | 
16 | REAL *allocateGPURealMemory(int length) {
17 | #ifdef DEBUG
18 | 	fprintf(stderr,"Entering ANMA-Real\n");
19 | #endif
20 | 
21 | 	REAL *data;
22 | 	cudaError_t error;
23 | 	SAFE_CUDA(cudaMalloc((void**) &data, SIZE_REAL * length),data);
24 | 	if (data == NULL) {
25 | 		fprintf(stderr,"Failed to allocate REAL (%d) memory on device!\n",
26 | 				length);
27 | 		// TODO clean up and gracefully die
28 | 		exit(-1);
29 | 	}
30 | 
31 | #ifdef DEBUG
32 | 	fprintf(stderr,"Allocated %d to %d.\n",data,(data +length));
33 | 	fprintf(stderr,"Leaving ANMA\n");
34 | #endif
35 | 
36 | 	return data;
37 | }
38 | 
39 | INT *allocateGPUIntMemory(int length) {
40 | 
41 | #ifdef DEBUG
42 | 	fprintf(stderr,"Entering ANMA-Int\n");
43 | #endif
44 | 
45 | 	INT *data;
46 | 	cudaError_t error;
47 | 	SAFE_CUDA(cudaMalloc((void**) &data, SIZE_INT * length),data);
48 | 	if (data == NULL) {
49 | 		fprintf(stderr,"Failed to allocate INT memory on device!\n");
50 | 		exit(-1);
51 | 	}
52 | 
53 | #ifdef DEBUG
54 | 	fprintf(stderr,"Allocated %d to %d.\n",data,(data+length));
55 | 	fprintf(stderr,"Leaving ANMA\n");
56 | #endif
57 | 
58 | 	return data;
59 | }
60 | 
61 | void freeGPUMemory(void *ptr) {
62 | 
63 | #ifdef DEBUG
64 | 	fprintf(stderr,"Entering FNMA\n");
65 | #endif
66 | 
67 | 	if (ptr != 0) {
68 | 		cudaFree(ptr);
69 | 	}
70 | 
71 | #ifdef DEBUG
72 | 	fprintf(stderr,"Leaving FNMA\n");
73 | #endif
74 | }
75 | 
76 | void storeGPURealMemoryArray(REAL *toGPUPtr, REAL *fromGPUPtr, int length) {
77 | 	cudaError_t error;
78 | 	SAFE_CUDA(cudaMemcpy(toGPUPtr, fromGPUPtr, SIZE_REAL*length, cudaMemcpyDeviceToDevice),toGPUPtr);
79 | }
80 | 
81 | void storeGPUIntMemoryArray(INT *toGPUPtr, INT *fromGPUPtr, int length) {
82 | 	cudaError_t error;
83 | 	SAFE_CUDA(cudaMemcpy(toGPUPtr, fromGPUPtr, SIZE_INT*length, cudaMemcpyDeviceToDevice),toGPUPtr);
84 | }
85 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/univcaller.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Automatically-generated kernel for %(name)s
 3 | 
 4 |   For univariate distributions
 5 |  */
 6 | 
 7 | __global__ void k_%(name)s(float* output,
 8 |                            float* data,
 9 |                            float* params,
10 |                            int data_per_block,
11 |                            int params_per_block,
12 |                            int nobs,
13 |                            int nparams,
14 |                            int params_stride) {
15 | 
16 |   unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
17 | 
18 |   unsigned int rel_param = tid / data_per_block;
19 |   unsigned int rel_data = tid - rel_param * data_per_block;
20 | 
21 |   unsigned int obs_num = data_per_block * blockIdx.x + rel_data;
22 |   unsigned int param_num = params_per_block * blockIdx.y + rel_param;
23 | 
24 |   // set up shared data
25 |   extern __shared__ float shared_data[];
26 |   float* sh_params = shared_data;
27 |   float* sh_data = sh_params + params_per_block * params_stride;
28 |   float* sh_result = sh_data + data_per_block;
29 | 
30 |   copy_chunks(data + data_per_block * blockIdx.x,
31 |               sh_data, tid,
32 |               min(nobs - data_per_block * blockIdx.x,
33 |                   data_per_block));
34 | 
35 |   copy_chunks(params + params_per_block * blockIdx.y * params_stride,
36 |               sh_params, tid,
37 |               min(params_per_block,
38 |                   nparams - params_per_block * blockIdx.y) * params_stride);
39 | 
40 |   __syncthreads();
41 | 
42 |   // allocated enough shared memory so that this will not walk out of bounds
43 |   // no matter what, though some of the results will be garbage
44 |   sh_result[tid] = %(name)s(sh_data + rel_data,
45 |                             sh_params + rel_param * params_stride);
46 |   __syncthreads();
47 | 
48 |   unsigned int result_idx = nobs * param_num + obs_num;
49 | 
50 |   // output is column-major, so this will then coalesce
51 |   if (obs_num < nobs & param_num < nparams) {
52 |     // output[result_idx] = obs_num;
53 |     output[result_idx] = sh_result[tid];
54 |   }
55 | }
56 | 


--------------------------------------------------------------------------------
/gpustats/__init__.py:
--------------------------------------------------------------------------------
 1 | from pdfs import *
 2 | 
 3 | from numpy import errstate
 4 | from numpy.testing import Tester
 5 | class NoseWrapper(Tester):
 6 |     '''
 7 |     This is simply a monkey patch for numpy.testing.Tester.
 8 | 
 9 |     It allows extra_argv to be changed from its default None to ['--exe'] so
10 |     that the tests can be run the same across platforms.  It also takes kwargs
11 |     that are passed to numpy.errstate to suppress floating point warnings.
12 |     '''
13 |     def test(self, label='fast', verbose=1, extra_argv=['--exe'], doctests=False,
14 |             coverage=False, **kwargs):
15 |         ''' Run tests for module using nose
16 | 
17 |         %(test_header)s
18 |         doctests : boolean
19 |             If True, run doctests in module, default False
20 |         coverage : boolean
21 |             If True, report coverage of NumPy code, default False
22 |             (Requires the coverage module:
23 |              http://nedbatchelder.com/code/modules/coverage.html)
24 |         kwargs
25 |             Passed to numpy.errstate.  See its documentation for details.
26 |         '''
27 | 
28 |         # cap verbosity at 3 because nose becomes *very* verbose beyond that
29 |         verbose = min(verbose, 3)
30 | 
31 |         from numpy.testing import utils
32 |         utils.verbose = verbose
33 | 
34 |         if doctests:
35 |             print "Running unit tests and doctests for %s" % self.package_name
36 |         else:
37 |             print "Running unit tests for %s" % self.package_name
38 | 
39 |         self._show_system_info()
40 | 
41 |         # reset doctest state on every run
42 |         import doctest
43 |         doctest.master = None
44 | 
45 |         argv, plugins = self.prepare_test_args(label, verbose, extra_argv,
46 |                                                doctests, coverage)
47 |         from numpy.testing.noseclasses import NumpyTestProgram
48 |         from warnings import simplefilter #, catch_warnings
49 |         with errstate(**kwargs):
50 | ##            with catch_warnings():
51 |             simplefilter('ignore', category=DeprecationWarning)
52 |             t = NumpyTestProgram(argv=argv, exit=False, plugins=plugins)
53 |         return t.result
54 | test = NoseWrapper().test
55 | 


--------------------------------------------------------------------------------
/old/util.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pymc.distributions as pymc_dist
 3 | 
 4 | PAD_MULTIPLE = 16
 5 | HALF_WARP = 16
 6 | 
 7 | def random_cov(dim):
 8 |     return pymc_dist.rinverse_wishart(dim, np.eye(dim))
 9 | 
10 | def unvech(v):
11 |     # quadratic formula, correct fp error
12 |     rows = .5 * (-1 + np.sqrt(1 + 8 * len(v)))
13 |     rows = int(np.round(rows))
14 | 
15 |     result = np.zeros((rows, rows))
16 |     result[np.triu_indices(rows)] = v
17 |     result = result + result.T
18 | 
19 |     # divide diagonal elements by 2
20 |     result[np.diag_indices(rows)] /= 2
21 | 
22 |     return result
23 | 
24 | def next_multiple(k, p):
25 |     if k % p:
26 |         return k + (p - k % p)
27 | 
28 |     return k
29 | 
30 | def pad_data(data):
31 |     """
32 |     Pad data to avoid bank conflicts on the GPU-- dimension should not be a
33 |     multiple of the half-warp size (16)
34 |     """
35 |     n, k = data.shape
36 | 
37 |     if not k % HALF_WARP:
38 |         pad_dim = k + 1
39 |     else:
40 |         pad_dim = k
41 | 
42 |     if k != pad_dim:
43 |         padded_data = np.empty((n, pad_dim), dtype=np.float32)
44 |         padded_data[:, :k] = data
45 | 
46 |         return padded_data
47 |     else:
48 |         return prep_ndarray(data)
49 | 
50 | def prep_ndarray(arr):
51 |     # is float32 and contiguous?
52 |     if not arr.dtype == np.float32 or not arr.flags.contiguous:
53 |         arr = np.array(arr, dtype=np.float32)
54 | 
55 |     return arr
56 | 
57 | def pack_params(means, chol_sigmas, logdets):
58 |     to_pack = []
59 |     for m, ch, ld in zip(means, chol_sigmas, logdets):
60 |         to_pack.append(pack_pdf_params(m, ch, ld))
61 | 
62 |     return np.vstack(to_pack)
63 | 
64 | def pack_pdf_params(mean, chol_sigma, logdet):
65 |     '''
66 | 
67 | 
68 |     '''
69 |     k = len(mean)
70 |     mean_len = k
71 |     chol_len = k * (k + 1) / 2
72 |     mch_len = mean_len + chol_len
73 | 
74 |     packed_dim = next_multiple(mch_len + 2, PAD_MULTIPLE)
75 | 
76 |     packed_params = np.empty(packed_dim, dtype=np.float32)
77 |     packed_params[:mean_len] = mean
78 | 
79 |     packed_params[mean_len:mch_len] = chol_sigma[np.tril_indices(k)]
80 |     packed_params[mch_len:mch_len + 2] = 1, logdet
81 | 
82 |     return packed_params
83 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/mvcaller.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Automatically-generated kernel for %(name)s
 3 | 
 4 |   For multivariate distributions, coordinates to utilize shared memory
 5 | 
 6 |   TODO: How to avoid bank conflicts
 7 |   TODO: How to ensure coalescence
 8 |  */
 9 | 
10 | __global__ void k_%(name)s(float* g_output,
11 | 						   float* g_data,
12 | 						   float* g_params,
13 | 						   int data_per_block,
14 | 						   int params_per_block,
15 | 						   int data_rows,
16 | 						   int data_stride,
17 | 						   int data_cols,
18 | 						   int params_rows,
19 | 						   int params_stride) {
20 | 
21 |   unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
22 | 
23 |   unsigned int rel_param = tid / data_per_block;
24 |   unsigned int rel_data = tid - rel_param * data_per_block;
25 | 
26 |   unsigned int obs_num = data_per_block * blockIdx.x + rel_data;
27 |   unsigned int param_num = params_per_block * blockIdx.y + rel_param;
28 | 
29 |   // set up shared data
30 |   extern __shared__ float shared_data[];
31 |   float* sh_params = shared_data;
32 |   float* sh_data = sh_params + params_per_block * params_stride;
33 |   float* sh_result = sh_data + data_per_block * data_stride;
34 | 
35 |   copy_chunks(g_data + data_per_block * blockIdx.x * data_stride,
36 |               sh_data, tid,
37 |               min(data_rows - data_per_block * blockIdx.x,
38 |                   data_per_block) * data_stride);
39 | 
40 |   copy_chunks(g_params + params_per_block * blockIdx.y * params_stride,
41 |               sh_params, tid,
42 |               min(params_per_block,
43 |                   params_rows - params_per_block * blockIdx.y) * params_stride);
44 | 
45 |   __syncthreads();
46 | 
47 |   // allocated enough shared memory so that this will not walk out of bounds
48 |   // no matter what, though some of the results will be garbage
49 |   sh_result[tid] = %(name)s(sh_data + rel_data * data_stride,
50 |                             sh_params + rel_param * params_stride,
51 |                             data_cols);
52 |   __syncthreads();
53 | 
54 |   unsigned int result_idx = data_rows * param_num + obs_num;
55 |   // unsigned int result_idx = obs_num * data_cols + param_num
56 | 
57 |   // g_output is column-major, so this will then coalesce
58 |   if (obs_num < data_rows & param_num < params_rows) {
59 |     g_output[result_idx] = sh_result[tid];
60 |   }
61 | }
62 | 
63 | // foo
64 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #/usr/bin/env python
 2 | 
 3 | from numpy.distutils.misc_util import Configuration
 4 | from numpy.distutils.core import setup
 5 | 
 6 | DESCRIPTION = "GPU-based statistical functions"
 7 | LONG_DESCRIPTION = """
 8 | gpustats is a PyCUDA-based library implementing functionality similar to that
 9 | present in scipy.stats. It implements a simple framework for specifying new CUDA
10 | kernels and extending existing ones. Here is a (partial) list of target
11 | functionality:
12 | 
13 | * Probability density functions (pdfs). These are intended to speed up
14 |   likelihood calculations in particular in Bayesian inference applications, such
15 |   as in PyMC
16 | 
17 | * Random variable generation using CURAND
18 | 
19 | Notes
20 | -----
21 | Requires working PyCUDA installation
22 | """
23 | 
24 | REQUIRES = ['numpy', 'pycuda >= 0.94rc']
25 | DISTNAME = 'gpustats'
26 | LICENSE = 'BSD'
27 | AUTHOR = "Wes McKinney"
28 | AUTHOR_EMAIL = "wesmckinn@gmail.com"
29 | URL = "https://github.com/dukestats/gpustats"
30 | CLASSIFIERS = [
31 |     'Development Status :: 2 - Pre-Alpha',
32 |     'Environment :: Console',
33 |     'Operating System :: OS Independent',
34 |     'Intended Audience :: Science/Research',
35 |     'Programming Language :: Python',
36 |     'Topic :: Scientific/Engineering',
37 | ]
38 | 
39 | MAJOR = 0
40 | MINOR = 0
41 | MICRO = 1
42 | ISRELEASED = True
43 | VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
44 | 
45 | FULLVERSION = VERSION
46 | if not ISRELEASED:
47 |     FULLVERSION += '.beta'
48 | 
49 | def configuration(parent_package='', top_path=None):
50 |     config = Configuration(None, parent_package, top_path,
51 |                            version=FULLVERSION)
52 |     config.set_options(ignore_setup_xxx_py=True,
53 |                        assume_default_configuration=True,
54 |                        delegate_options_to_subpackages=True,
55 |                        quiet=True)
56 | 
57 |     config.add_subpackage('gpustats')
58 |     config.add_data_dir('gpustats/tests')
59 |     config.add_data_dir('gpustats/cufiles')
60 |     return config
61 | 
62 | if __name__ == '__main__':
63 |     setup(name=DISTNAME,
64 |           author=AUTHOR,
65 |           author_email=AUTHOR_EMAIL,
66 |           description=DESCRIPTION,
67 |           license=LICENSE,
68 |           url=URL,
69 |           long_description=LONG_DESCRIPTION,
70 |           classifiers=CLASSIFIERS,
71 |           platforms='any',
72 |           configuration=configuration)
73 | 


--------------------------------------------------------------------------------
/gpustats/tests/test_samplers.py:
--------------------------------------------------------------------------------
 1 | import nose
 2 | import sys
 3 | import unittest
 4 | 
 5 | from numpy.random import rand
 6 | from numpy.linalg import inv, cholesky as chol
 7 | from numpy.testing import assert_almost_equal, assert_equal
 8 | import numpy as np
 9 | 
10 | import scipy.stats as sp_stats
11 | 
12 | import gpustats as gps
13 | import gpustats.sampler as gpusamp
14 | import gpustats.compat as compat
15 | import gpustats.util as util
16 | 
17 | DECIMAL_6 = 6
18 | DECIMAL_5 = 5
19 | DECIMAL_4 = 4
20 | DECIMAL_3 = 3
21 | DECIMAL_2 = 2
22 | DECIMAL_1 = 1
23 | 
24 | np.set_printoptions(suppress=True)
25 | 
26 | def _make_test_densities(n=10000, k=4):
27 |     dens = rand(k)
28 |     densities = [dens.copy() for _ in range(n)]
29 |     return np.asarray(densities)
30 |     #return (densities.T - densities.sum(1)).T
31 | 
32 | def _compare_discrete(n, k):
33 |     densities = _make_test_densities(n, k)
34 |     dens = densities[0,:].copy() / densities[0,:].sum()
35 |     expected_mu = np.dot(np.arange(k), dens)
36 | 
37 |     labels = gpusamp.sample_discrete(densities, logged=False)
38 |     est_mu = labels.mean()
39 |     return est_mu, expected_mu
40 | 
41 | def _compare_logged(n, k):
42 |     densities = np.log(_make_test_densities(n, k))
43 |     dens = np.exp((densities[0,:] - densities[0,:].max()))
44 |     dens = dens / dens.sum()
45 |     expected_mu = np.dot(np.arange(k), dens)
46 | 
47 |     labels = gpusamp.sample_discrete(densities, logged=True)
48 |     est_mu = labels.mean()
49 |     return est_mu, expected_mu
50 | 
51 | 
52 | class TestDiscreteSampler(unittest.TestCase):
53 |     test_cases = [(100000, 4),
54 |                   (100000, 9),
55 |                   (100000, 16),
56 |                   (100000, 20),
57 |                   (1000000, 35)]
58 | 
59 |     def _check_discrete(self, n, k):
60 |         a, b = _compare_discrete(n, k)
61 |         assert_almost_equal(a, b, DECIMAL_1)
62 | 
63 |     def _check_logged(self, n, k):
64 |         a, b = _compare_logged(n, k)
65 |         assert_almost_equal(a, b, DECIMAL_1)
66 | 
67 |     def test_discrete(self):
68 |         for n, k in self.test_cases:
69 |             self._check_discrete(n, k)
70 | 
71 |     def test_logged(self):
72 |         for n, k in self.test_cases:
73 |             self._check_logged(n, k)
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     print 'starting sampler'
78 |     a, b = _compare_logged(1000000, 35)
79 |     print a
80 |     print b
81 | 
82 |     
83 |     
84 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/sample_discrete.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Block layout : npmfs x nhelpers
 3 |   Grid layout : K x 1
 4 |   K is the smallest number such that K * npmfs >= pmf_rows
 5 |  */
 6 | 
 7 | __global__ void
 8 | k_%(name)s(float* g_pmf, /** Precomputed pmf */
 9 | 		   float* g_urand, /** Precomputed random number */
10 | 		   float* g_output, /** Resultant choice */
11 | 		   int pmf_rows,
12 | 		   int pmf_cols,
13 | 		   int pmf_stride,
14 | 		   int sh_stride
15 |   ) {
16 |   // blockDim.x = number of pmfs sampled from in this block
17 |   // blockDim.y = number of helper threads per pmf
18 |   unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
19 |   unsigned int thidx = threadIdx.x;
20 |   unsigned int npmfs = blockDim.x;
21 | 
22 |   // Make block size flexible ...
23 |   extern __shared__ float shared_data[];
24 | 
25 |   float* sh_pmf = shared_data; // npmfs * sh_stride floats
26 |   float* sh_work = sh_pmf + npmfs * sh_stride; // nmpfs floats
27 | 
28 |   // Move pmf data into shared memory
29 |   copy_chunks_strided(g_pmf + npmfs * pmf_stride * blockIdx.x,
30 | 			  sh_pmf, tid, pmf_stride,
31 | 			  min(npmfs, pmf_rows - npmfs * blockIdx.x), 
32 | 			  sh_stride);
33 |   __syncthreads();
34 | 
35 |   // move uniform random draws into shared memory
36 |   copy_chunks(g_urand + npmfs * blockIdx.x,
37 | 			  sh_work, tid,
38 | 			  min(npmfs, pmf_rows - npmfs * blockIdx.x));
39 |   __syncthreads();
40 | 
41 |   // done copying, now move pointer to start of pmf for this row of threads
42 |   sh_pmf = sh_pmf + thidx * sh_stride;
43 | 
44 |   // compute normalizing constant using atomic operators?
45 | 
46 |   // for(int chunk = 0; chunk + thidx < pmf_cols; chunk += blockDim.x) {
47 |   // 	atomic_add(sh_work + thidy, sh_pmf[chunk + thidx]);
48 |   // }
49 | 
50 |   if (threadIdx.y == 0 && thidx < pmf_rows - npmfs * blockIdx.x) {
51 | 	float norm_const = 0;
52 |   	for (int i = 0; i < pmf_cols; ++i) {
53 |   	  norm_const += sh_pmf[i];
54 |   	}
55 | 
56 | 	float draw = sh_work[thidx];
57 | 
58 | 	// replace with scaled cumulative pdf
59 | 	sh_pmf[0] /= norm_const;
60 | 	sh_work[thidx] = 0;
61 | 	if (sh_pmf[0] < draw) {
62 | 	  for(int i = 1; i < pmf_cols; i++) {
63 | 		sh_pmf[i] = sh_pmf[i-1] + sh_pmf[i] / norm_const;
64 | 		if (sh_pmf[i] >= draw) {
65 | 		  sh_work[thidx] = i;
66 | 		  break;
67 | 		}
68 | 	  }
69 | 	}
70 |   }
71 |   __syncthreads();
72 | 
73 |   // this is now coalesced
74 |   unsigned int result_id = blockIdx.x * npmfs + tid;
75 |   if (result_id < pmf_rows && tid < npmfs)
76 |     g_output[result_id] = sh_work[tid];
77 | 
78 |   return;
79 | }
80 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/sample_discrete_logged.cu:
--------------------------------------------------------------------------------
 1 | __global__ void
 2 | k_%(name)s(float* g_pmf, /** Precomputed logged pmf */
 3 | 		   float* g_urand, /** Precomputed random number */
 4 | 		   float* g_output, /** Resultant choice */
 5 | 		   int pmf_rows,
 6 | 		   int pmf_cols,
 7 | 		   int pmf_stride,
 8 | 		   int sh_stride
 9 |   ) {
10 | 
11 |   // blockDim.x = number of pmfs sampled from in this block
12 |   // blockDim.y = number of helper threads per pmf
13 |   unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
14 |   unsigned int thidx = threadIdx.x;
15 |   unsigned int npmfs = blockDim.x;
16 | 
17 |   // Make block size flexible ...
18 |   extern __shared__ float shared_data[];
19 | 
20 |   float* sh_pmf = shared_data; // npmfs * sh_stride floats
21 |   float* sh_work = sh_pmf + npmfs * sh_stride; // nmpfs floats
22 | 
23 |   // Move pmf data into shared memory
24 |   copy_chunks_strided(g_pmf + npmfs * pmf_stride * blockIdx.x,
25 | 			  sh_pmf, tid, pmf_stride,
26 | 			  min(npmfs, pmf_rows - npmfs * blockIdx.x), 
27 | 			  sh_stride);
28 |   __syncthreads();
29 | 
30 |   // move uniform random draws into shared memory
31 |   copy_chunks(g_urand + npmfs * blockIdx.x,
32 | 			  sh_work, tid,
33 | 			  min(npmfs, pmf_rows - npmfs * blockIdx.x));
34 |   __syncthreads();
35 | 
36 |   // done copying, now move pointer to start of pmf for this row of threads
37 |   sh_pmf = sh_pmf + thidx * sh_stride;
38 | 
39 |   if (threadIdx.y == 0 && thidx < pmf_rows - npmfs * blockIdx.x) {
40 |         // get max
41 |         float pmf_max = sh_pmf[0]; float cur_val = 0;
42 | 	for (int i = 1; i < pmf_cols; ++i){
43 | 	  cur_val = sh_pmf[i];
44 | 	  pmf_max = fmax(pmf_max, cur_val);
45 |           //pmf_max = ((pmf_max < cur_val) : (cur_val) , (pmf_max));
46 | 	}
47 | 
48 | 	// subtract max and exponentiate 
49 | 	float norm_const = 0;
50 |   	for (int i = 0; i < pmf_cols; ++i) {
51 | 	  sh_pmf[i] = expf(sh_pmf[i] - pmf_max);
52 |   	  norm_const += sh_pmf[i];
53 |   	}
54 | 
55 | 	float draw = sh_work[thidx];
56 | 
57 | 	// replace with scaled cumulative pdf
58 | 	sh_pmf[0] /= norm_const;
59 | 	sh_work[thidx] = 0;
60 | 	if (sh_pmf[0] < draw) {
61 | 	  for(int i = 1; i < pmf_cols; i++) {
62 | 		sh_pmf[i] = sh_pmf[i-1] + sh_pmf[i] / norm_const;
63 | 		if (sh_pmf[i] >= draw) {
64 | 		  sh_work[thidx] = i;
65 | 		  break;
66 | 		}
67 | 	  }
68 | 	}
69 | 
70 | 	// write
71 | 	g_output[blockIdx.x*npmfs + thidx] = sh_work[thidx];
72 | 
73 |   }
74 | //  __syncthreads();
75 | 
76 |   // this is now coalesced
77 | //  unsigned int result_id = blockIdx.x * npmfs + tid;
78 | //  if (result_id < pmf_rows && tid < npmfs)
79 | //    g_output[result_id] = sh_work[tid];
80 | 
81 | }
82 | 
83 | 
84 | 
85 | 
86 | 


--------------------------------------------------------------------------------
/old/Makefile:
--------------------------------------------------------------------------------
  1 | CUDA_PATH = /usr/local/cuda
  2 | 
  3 | # compilers
  4 | CC := gcc
  5 | CXX = g++
  6 | NVCC = $(CUDA_PATH)/bin/nvcc
  7 | NVCC_DBG_FLAGS = -Xcompiler -fno-strict-aliasing,-fPIC
  8 | 
  9 | INCPATH =
 10 | # compiler / linker flags
 11 | CCFLAGS = -fPIC -g -Wall
 12 | 
 13 | # linker flags
 14 | LINKFLAGS = -L. -lgpustats
 15 | 
 16 | LIBPATH =
 17 | NVCCFLAGS = $(NVCC_DBG_FLAGS)
 18 | 
 19 | OSUPPER   = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
 20 | OSLOWER   = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:])
 21 | OSNAME := $(shell uname)
 22 | OSARCH = $(shell uname -m)
 23 | 
 24 | ifeq ($(OSNAME),Linux)
 25 | 	CUDA_SDK_PATH   := $(HOME)/cuda_sdk
 26 | 	CUDA_LIB  = -L$(CUDA_PATH)/lib64
 27 | endif
 28 | 
 29 | # OS X thinks it's i386
 30 | # 'linux' is output for Linux system, 'darwin' for OS X
 31 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
 32 | 
 33 | ifeq ($(OSNAME),Darwin)
 34 | 	CUDA_SDK_PATH   := /Developer/CUDA/C
 35 | 	CUDA_LIB  = -L$(CUDA_PATH)/lib
 36 | endif
 37 | 
 38 | CC_ARCH_FLAGS :=
 39 | # NVCCFLAGS :=
 40 | 
 41 | LIB_ARCH        = x86_64
 42 | CUDPPLIB_SUFFIX = x86_64
 43 | NVCCFLAGS      += -m64
 44 | CXX_ARCH_FLAGS += -m64
 45 | 
 46 | # ifeq ($(OSNAME),Darwin)
 47 | # 	NVCCFLAGS += -m32
 48 | # 	LIB_ARCH = i386
 49 | # 	CC_ARCH_FLAGS += -arch i386
 50 | # else
 51 | # 	LIB_ARCH        = x86_64
 52 | # 	CUDPPLIB_SUFFIX = x86_64
 53 | # 	NVCCFLAGS      += -m64
 54 | # 	ifneq ($(DARWIN),)
 55 | # 	   CXX_ARCH_FLAGS += -arch x86_64
 56 | # 	else
 57 | # 	   CXX_ARCH_FLAGS += -m64
 58 | # 	endif
 59 | # endif
 60 | 
 61 | CCFLAGS += $(CC_ARCH_FLAGS)
 62 | 
 63 | CUDA_INC = -I$(CUDA_PATH)/include
 64 | CUDA_SDK_COMMONDIR = $(CUDA_SDK_PATH)/common
 65 | CUDA_SDK_INC = -I$(CUDA_SDK_COMMONDIR)/inc
 66 | CUDA_LIB += -L$(CUDA_SDK_PATH)/lib -L$(CUDA_SDK_COMMONDIR)/lib -lcuda -lcudart -lcublas
 67 | 
 68 | EXECUTABLE := test
 69 | CUFILES = mvnpdf.cu
 70 | CU_DEPS = common.h
 71 | CFILES := common.c
 72 | USECUBLAS        := 1
 73 | OBJDIR = obj
 74 | LIBDIR = lib
 75 | TARGETDIR = .
 76 | TARGET    := $(TARGETDIR)/$(EXECUTABLE)
 77 | 
 78 | OBJS +=  $(patsubst %.c,%.o,$(notdir $(CFILES)))
 79 | OBJS +=  $(patsubst %.cu,%.cu_o,$(notdir $(CUFILES)))
 80 | 
 81 | VERBOSE := -
 82 | 
 83 | # need to use g++ to link on OS X?
 84 | 
 85 | libgpustats.so: makedirs $(OBJS)
 86 | 	$(CXX) $(CC_ARCH_FLAGS) -shared -W1,-soname,libgpustats.so -o libgpustats.so $(OBJS) -lc $(CUDA_LIB)
 87 | 
 88 | runpy: cython
 89 | 	LD_LIBRARY_PATH=.:$(LD_LIBRARY_PATH)  python scratch.py
 90 | 
 91 | ipython: cython
 92 | 	LD_LIBRARY_PATH=.:$(LD_LIBRARY_PATH)  ipython
 93 | 
 94 | test: libgpustats.so
 95 | 	$(VERBOSE)$(CC) $(CC_ARCH_FLAGS) -std=c99 test.c -o test $(CUDA_INC) $(LINKFLAGS)
 96 | 
 97 | cython: libgpustats.so cytest.pyx build_cython.py
 98 | 	-python build_cython.py build_ext --inplace
 99 | 
100 | makedirs:
101 | 	$(VERBOSE)mkdir -p $(LIBDIR)
102 | 	$(VERBOSE)mkdir -p $(OBJDIR)
103 | 	$(VERBOSE)mkdir -p $(TARGETDIR)
104 | 
105 | clean:
106 | 	-rm -rf *.so *.o *.cu_o build/
107 | 
108 | #### CUDA files
109 | 
110 | %.o: %.c
111 | 	$(VERBOSE)$(CC) $(CCFLAGS) -c $*.c -o $@ $(INCPATH) $(CUDA_INC) $(CUDA_SDK_INC)
112 | 
113 | %.c_o : %.c
114 | 	$(CC) $(PROFILE) -c $< -o $@
115 | 
116 | %.cu_o : %.cu $(CUDA_HEADERS) $(CU_DEPS)
117 | 	$(VERBOSE)$(NVCC) $(NVCCFLAGS) -c $< -o $@ -I. $(INCPATH) $(CUDA_INC) $(CUDA_SDK_INC) -DUNIX
118 | 


--------------------------------------------------------------------------------
/gpustats/tests/test_pdfs.py:
--------------------------------------------------------------------------------
  1 | import nose
  2 | import sys
  3 | import unittest
  4 | 
  5 | from numpy.random import randn
  6 | from numpy.linalg import inv, cholesky as chol
  7 | from numpy.testing import assert_almost_equal, assert_equal
  8 | import numpy as np
  9 | 
 10 | import scipy.stats as sp_stats
 11 | 
 12 | import gpustats as gps
 13 | import gpustats.compat as compat
 14 | import gpustats.util as util
 15 | 
 16 | DECIMAL_6 = 6
 17 | DECIMAL_5 = 5
 18 | DECIMAL_4 = 4
 19 | DECIMAL_3 = 3
 20 | DECIMAL_2 = 2
 21 | 
 22 | np.set_printoptions(suppress=True)
 23 | 
 24 | def _make_test_case(n=1000, k=4, p=1):
 25 |     data = randn(n, k)
 26 |     covs = [util.random_cov(k) for _ in range(p)]
 27 |     means = [randn(k) for _ in range(p)]
 28 |     return data, means, covs
 29 | 
 30 | # debugging...
 31 | 
 32 | def _compare_multi(n, k, p):
 33 |     data, means, covs = _make_test_case(n, k, p)
 34 | 
 35 |     # cpu in PyMC
 36 |     pyresult = compat.python_mvnpdf(data, means, covs)
 37 | 
 38 |     # gpu
 39 |     result = gps.mvnpdf_multi(data, means, covs)
 40 | 
 41 |     return result, pyresult
 42 | 
 43 | def _compare_single(n, k):
 44 |     data, means, covs = _make_test_case(n, k, 1)
 45 | 
 46 |     mean = means[0]
 47 |     cov = covs[0]
 48 | 
 49 |     # cpu in PyMC
 50 |     pyresult = compat.python_mvnpdf(data, [mean], [cov]).squeeze()
 51 |     # gpu
 52 | 
 53 |     result = gps.mvnpdf(data, mean, cov)
 54 |     return result, pyresult
 55 | 
 56 | class TestMVN(unittest.TestCase):
 57 |     # ndata, dim, ncomponents
 58 |     test_cases = [(1000, 4, 1),
 59 |                   (1000, 4, 16),
 60 |                   (1000, 4, 32),
 61 |                   (1000, 4, 64),
 62 |                   (1000, 7, 64),
 63 |                   (1000, 8, 64),
 64 |                   (1000, 14, 32),
 65 |                   (1000, 16, 128),
 66 |                   (250, 25, 32),
 67 |                   (10, 15, 2),
 68 |                   (500000, 5, 12)]
 69 | 
 70 |     def _check_multi(self, n, k, p):
 71 |         a, b = _compare_multi(n, k, p)
 72 |         assert_almost_equal(a, b, DECIMAL_2)
 73 | 
 74 |     def _check_single(self, n, k):
 75 |         a, b = _compare_single(n, k)
 76 |         assert_almost_equal(a, b, DECIMAL_2)
 77 | 
 78 |     def test_multi(self):
 79 |         for n, k, p in self.test_cases:
 80 |             self._check_multi(n, k, p)
 81 | 
 82 |     def test_single(self):
 83 |         for n, k, p in self.test_cases:
 84 |             self._check_single(n, k)
 85 | 
 86 | class TestUnivariate(unittest.TestCase):
 87 |     def test_normal(self):
 88 |         test_cases = [
 89 |             (100, 0, 1),
 90 |             (100, .5, 2.5),
 91 |             (10, 5, 3),
 92 |             (2000, 1, 4)
 93 |         ]
 94 |         for n, mean, std in test_cases:
 95 |             data = randn(n)
 96 |             pyresult = sp_stats.norm.pdf(data, loc=mean, scale=std)
 97 | 
 98 |             result = gps.normpdf(data, mean, std, logged=True)
 99 |             assert_almost_equal(result, np.log(pyresult), DECIMAL_5)
100 | 
101 |     def test_normal_multi(self):
102 |         means = np.random.randn(5)
103 |         scales = np.ones(5)
104 | 
105 |         data = np.random.randn(10)
106 |         result = gps.normpdf_multi(data, means, scales, logged=True)
107 | 
108 |         pyresult = np.empty_like(result)
109 |         for i, (m, sc) in enumerate(zip(means, scales)):
110 |             pyresult[:, i] = sp_stats.norm.pdf(data, loc=m, scale=sc)
111 |         assert_almost_equal(result, np.log(pyresult), DECIMAL_5)
112 | 
113 | if __name__ == '__main__':
114 |     # nose.runmodule(argv=['', '--pdb', '-v', '--pdb-failure'])
115 |     _compare_multi(500000, 4, 128)
116 |     pass
117 | 


--------------------------------------------------------------------------------
/old/common.h:
--------------------------------------------------------------------------------
  1 | #ifndef __GPUSTATS_COMMON__
  2 | #define __GPUSTATS_COMMON__
  3 | 
  4 | #include <math.h>
  5 | #include <cuda.h>
  6 | #include <cuda_runtime_api.h>
  7 | 
  8 | /* Dimension specific definitions to ensure coalesced memory transactions */
  9 | 
 10 | // extern int DIM,MEAN_CHD_DIM,PACK_DIM,CHD_DIM,LOGDET_OFFSET,DATA_PADDED_DIM,NCHUNKSIZE;
 11 | 
 12 | /*
 13 | #define DENSITIES_IN_BLOCK		16 //4 //4 for 27d data, 16 for other data
 14 | #define	DATA_IN_BLOCK			16	//need >= 16 to be efficient
 15 | #define SAMPLE_BLOCK			32
 16 | #define SAMPLE_DENSITY_BLOCK	16
 17 | 
 18 | #define BASE_DATAPADED_DIM		8
 19 | 
 20 | #define SIGMA_BLOCK_SIZE		128
 21 | #define SIGMA_THREAD_SUM_SIZE		25
 22 | #define MAX_GPU_COUNT 8
 23 | 
 24 | 
 25 | #define LOGPDF
 26 | */
 27 | 
 28 | //#define CHECK_GPU
 29 | 
 30 | /*
 31 | // For algorithm 2
 32 | 
 33 | #define	PAD_CSR				0		// Little (no?) performance gain on 9400M and complicates algorithm
 34 | #define PAD					1		// Removes some bank conflicts (?)
 35 | #define BLOCK_SIZE_COL		16		// # of data columns to process per block
 36 | #define BLOCK_SIZE_ROW 		32		// BLOCK_SIZE_ROW / HALFWARP = # of rows (components) to process per block
 37 | #define HALFWARP_LOG2		4
 38 | #define HALFWARP 			(1<<HALFWARP_LOG2)
 39 | #define GROW_INDICES		16
 40 | 
 41 | #define COMPACT_BLOCK	256
 42 | 
 43 | */
 44 | 
 45 | /* Definition of REAL can be switched between 'double' and 'float' */
 46 | #ifdef DOUBLE_PRECISION
 47 | 	#define REAL		double
 48 | #else
 49 | 	#define REAL		float
 50 | #endif
 51 | 
 52 | #define SIZE_REAL	sizeof(REAL)
 53 | 
 54 | #define INT			int
 55 | 
 56 | #define SIZE_INT	sizeof(INT)
 57 | 
 58 | 
 59 | /* Error codes */
 60 | #define CUDA_ERROR	1
 61 | #define CUDA_SUCCESS	0
 62 | 
 63 | #define CATCH_ERR(call) error = call; \
 64 |   if( error != 0 ) {												\
 65 | 	fprintf(stderr,"CUDA Error %s\n", cudaGetErrorString(error));	\
 66 | 	exit(-1);														\
 67 |   }
 68 | 
 69 | #define SAFE_CUDA(call,ptr) error = call; \
 70 |   if( error != 0 ) {											\
 71 | 	fprintf(stderr,"Error %s\n", cudaGetErrorString(error));	\
 72 | 	exit(-1);													\
 73 |   }
 74 | 
 75 | // how to suppress gcc warning?
 76 | 	// fprintf(stderr,"Ptr = %d\n", ptr);							\
 77 | 
 78 | #define MEMCPY(to,from,length,toType) { int m; \
 79 | 										for(m=0; m<length; m++) { \
 80 | 											to[m] = (toType) from[m]; \
 81 | 										} }
 82 | 
 83 | 
 84 | 
 85 | #define LOG_2_PI 1.83787706640935
 86 | #define LOG_PI 1.144729885849400
 87 | 
 88 | 
 89 | void set_device(int device);
 90 | 
 91 | int initCUDAContext();
 92 | 
 93 | int migrateContext(CUcontext context);
 94 | 
 95 | REAL *allocateGPURealMemory(int length);
 96 | 
 97 | INT  *allocateGPUIntMemory(int length);
 98 | 
 99 | void checkCUDAError(const char *msg);
100 | 
101 | void freeGPUMemory(void *ptr);
102 | 
103 | void storeGPURealMemoryArray(REAL *toGPUPtr, REAL *fromGPUPtr, int length);
104 | 
105 | void storeGPUIntMemoryArray(INT *toGPUPtr, INT *fromGPUPtr, int length);
106 | 
107 | void printfCudaVector(REAL *dPtr, int length);
108 | 
109 | void printfCudaInt(int *dPtr, int length);
110 | 
111 | void printfVectorD(double *ptr, int length);
112 | 
113 | void printfVectorF(float *ptr, int length);
114 | 
115 | void printfVector(REAL *ptr, int length);
116 | 
117 | void printfInt(int *ptr,int length);
118 | 
119 | REAL sumCudaVector(REAL *dPtr, int length);
120 | 
121 | int checkZeros(REAL* dPtr, int length);
122 | 
123 | void loadTipPartials(int instance);
124 | 
125 | void doStore(int instance);
126 | 
127 | void doRestore(int instance);
128 | 
129 | void handleStoreRestoreQueue(int instance);
130 | 
131 | int smem_size();
132 | 
133 | int max_block_threads();
134 | 
135 | #endif // __GPUSTATS_COMMON__
136 | 


--------------------------------------------------------------------------------
/gpustats/cufiles/sampleFromMeasureMedium.cu:
--------------------------------------------------------------------------------
  1 | // Original written by Marc Suchard 
  2 | // Modified by Andrew Cron 
  3 | 
  4 | __global__ void k_%(name)s(float* in_measure, /** Precomputed measure */
  5 | 	   		   float* in_random, /** Precomputed random number */
  6 | 			   int* out_component, /** Resultant choice */
  7 | 			   int iN, int iT, int logged) {
  8 | 
  9 |   const int sample_density_block = blockDim.x;
 10 |   const int sample_block = blockDim.y;
 11 |   const int thidx = threadIdx.x;
 12 |   const int thidy = threadIdx.y;
 13 |   const int datumIndex = blockIdx.x * sample_block + thidy;
 14 |   const int pdfIndex = datumIndex * iT;
 15 |   const int tid = thidy*sample_density_block + thidx;
 16 |   const int stride = sample_density_block+1;
 17 | 
 18 |   // Make block size flexible ...
 19 |   extern __shared__ float shared_data[];
 20 |   float* measure = shared_data; // sample_block by stride
 21 |   float* sum = measure + sample_block*stride;
 22 |   float* work = sum + sample_block;
 23 | 
 24 |   // use 'work' in multiple places to save on memory
 25 |   if (tid < sample_block) {
 26 |     sum[tid] = 0;
 27 |     if(logged==1){
 28 |         work[tid] = -10000;
 29 |     } else {
 30 |         work[tid] = 0;
 31 |     }
 32 |   }
 33 | 
 34 | 
 35 |   if(logged==1){
 36 |   //get the max values
 37 |   for(int chunk = 0; chunk < iT; chunk += sample_density_block) {
 38 |     if(pdfIndex + chunk + thidx < iN*iT)
 39 |        measure[thidy*stride + thidx] = in_measure[pdfIndex + chunk + thidx];
 40 |     __syncthreads();
 41 | 
 42 |     if (tid < sample_block) {
 43 |       for(int i=0; i<sample_density_block; i++) {
 44 |     if(chunk + i < iT){
 45 |       float dcurrent = measure[tid*stride + i];
 46 |       if (dcurrent > work[tid]) {
 47 |         work[tid] = dcurrent;
 48 |       }
 49 |     }
 50 |       }
 51 |     }
 52 |     __syncthreads();
 53 |   }
 54 |   }
 55 | 
 56 | 
 57 |   //get scaled cummulative pdfs
 58 |   for(int chunk = 0; chunk < iT; chunk += sample_density_block) {
 59 |     if(pdfIndex + chunk + thidx < iN*iT)
 60 |        measure[thidy*stride + thidx] = in_measure[pdfIndex + chunk + thidx];
 61 | 
 62 |     __syncthreads();
 63 | 
 64 |     if (tid < sample_block) {
 65 |       for(int i=0; i<sample_density_block; i++) {
 66 |     if (chunk + i < iT){
 67 |       if(logged==1){
 68 |       //rescale and exp()
 69 |       sum[tid] += expf(measure[tid*stride + i] - work[tid]);
 70 |       } else {
 71 |       sum[tid] += measure[tid*stride + i];
 72 |       }
 73 |       measure[tid*stride + i] = sum[tid];
 74 |     }
 75 |       }
 76 |     }
 77 | 
 78 |     __syncthreads();
 79 | 
 80 |     if(datumIndex < iN && chunk + thidx < iT)
 81 |       in_measure[pdfIndex + chunk + thidx] = measure[thidy*stride + thidx];
 82 |  
 83 |   }
 84 | 
 85 |   __syncthreads();  
 86 | 
 87 |   if (tid < sample_block && logged==1){
 88 |     work[tid] = 0;
 89 |   }
 90 | 
 91 | 
 92 |   float* randomNumber = sum;
 93 |   const int result_id = blockIdx.x * sample_block + tid;
 94 |   if ( result_id < iN && tid < sample_block)
 95 |     randomNumber[tid] = in_random[result_id] * sum[tid];
 96 | 
 97 |   // Find the right bin for the random number ...
 98 |   for(int chunk = 0; chunk < iT; chunk += sample_density_block) {
 99 |     if(pdfIndex + chunk + thidx < iN*iT)
100 |        measure[thidy*stride + thidx] = in_measure[pdfIndex + chunk + thidx];
101 |     __syncthreads();
102 | 
103 |     if (tid < sample_block) {
104 | 
105 |       // storing the index in a float is better because it avoids
106 |       // bank conflicts ...
107 |       for(int i=0; i<sample_density_block; i++) {
108 |     if (chunk + i < iT){
109 |       if (randomNumber[tid] > measure[tid*stride + i]){
110 |         work[tid] = i + chunk + 1;
111 |       }
112 |     }
113 |       }
114 |       if ( work[tid] >= iT) {work[tid] = iT-1;}
115 |     }
116 |     __syncthreads();
117 |   }
118 | 
119 |   // this is now coalesced
120 |   if (result_id < iN && tid < sample_block)
121 |     out_component[result_id] = (int) work[tid];
122 | 
123 | }
124 | 
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/examples/pymc_test.py:
--------------------------------------------------------------------------------
  1 | # pylint: disable=E1101
  2 | 
  3 | import pymc as pm
  4 | import pymc.distributions as dist
  5 | import numpy as np
  6 | from numpy.linalg import inv, cholesky as chol
  7 | import numpy.linalg as L
  8 | import numpy.random as rand
  9 | 
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | #-------------------------------------------------------------------------------
 13 | # Generate MV normal mixture
 14 | 
 15 | gen_mean = {
 16 |     0 : [0, 5],
 17 |     1 : [-10, 0],
 18 |     2 : [-10, 10]
 19 | }
 20 | 
 21 | gen_sd = {
 22 |     0 : [0.5, 0.5],
 23 |     1 : [.5, 1],
 24 |     2 : [1, .25]
 25 | }
 26 | 
 27 | gen_corr = {
 28 |     0 : 0.5,
 29 |     1 : -0.5,
 30 |     2 : 0
 31 | }
 32 | 
 33 | group_weights = [0.6, 0.3, 0.1]
 34 | 
 35 | def generate_data(n=1e5, k=2, ncomps=3, seed=1):
 36 |     rand.seed(seed)
 37 |     data_concat = []
 38 |     labels_concat = []
 39 | 
 40 |     for j in range(ncomps):
 41 |         mean = gen_mean[j]
 42 |         sd = gen_sd[j]
 43 |         corr = gen_corr[j]
 44 | 
 45 |         cov = np.empty((k, k))
 46 |         cov.fill(corr)
 47 |         cov[np.diag_indices(k)] = 1
 48 |         cov *= np.outer(sd, sd)
 49 | 
 50 |         num = int(n * group_weights[j])
 51 |         rvs = pm.rmv_normal_cov(mean, cov, size=num)
 52 | 
 53 |         data_concat.append(rvs)
 54 |         labels_concat.append(np.repeat(j, num))
 55 | 
 56 |     return (np.concatenate(labels_concat),
 57 |             np.concatenate(data_concat, axis=0))
 58 | 
 59 | N = int(1e5) # n data points per component
 60 | K = 2 # ndim
 61 | ncomps = 3 # n mixture components
 62 | 
 63 | true_labels, data = generate_data(n=N, k=K, ncomps=ncomps)
 64 | 
 65 | def plot_2d_mixture(data, labels):
 66 |     plt.figure(figsize=(10, 10))
 67 |     colors = 'bgr'
 68 |     for j in np.unique(labels):
 69 |         x, y = data[labels == j].T
 70 |         plt.plot(x, y, '%s.' % colors[j], ms=2)
 71 | 
 72 | 
 73 | def plot_thetas(sampler):
 74 |     plot_2d_mixture(data, true_labels)
 75 | 
 76 |     def plot_theta(i):
 77 |         x, y = sampler.trace('theta_%d' % i)[:].T
 78 |         plt.plot(x, y, 'k.')
 79 | 
 80 |     for i in range(3):
 81 |         plot_theta(i)
 82 | 
 83 | #-------------------------------------------------------------------------------
 84 | # set up PyMC model
 85 | 
 86 | # priors, fairly vague
 87 | prior_mean = data.mean(0)
 88 | sigma0 = np.diag([1., 1.])
 89 | prior_cov = np.cov(data.T)
 90 | 
 91 | # shared hyperparameter?
 92 | # theta_tau = pm.Wishart('theta_tau', n=4, Tau=L.inv(sigma0))
 93 | 
 94 | # df = pm.DiscreteUniform('df', 3, 50)
 95 | 
 96 | thetas = []
 97 | taus = []
 98 | for j in range(ncomps):
 99 |     # need a hyperparameter for degrees of freedom?
100 |     tau = pm.Wishart('C_%d' % j, n=3, Tau=inv(prior_cov))
101 |     theta = pm.MvNormal('theta_%d' % j, mu=prior_mean, tau=inv(2 * prior_cov))
102 | 
103 |     thetas.append(theta)
104 |     taus.append(tau)
105 | 
106 | alpha0 = np.ones(3.) / 3
107 | weights = pm.Dirichlet('weights', theta=alpha0)
108 | # labels = pm.Categorical('labels', p=weights, size=len(data))
109 | 
110 | from pandas.util.testing import set_trace as st
111 | import pdfs
112 | import util
113 | 
114 | def mixture_loglike(data, thetas, covs, labels):
115 | 
116 |     n = len(data)
117 |     likes = pdfs.mvnpdf(data, thetas, covs)
118 |     loglike = likes.ravel('F').take(labels * n + np.arange(n)).sum()
119 | 
120 |     if np.isnan(loglike):
121 |         return -1e300
122 | 
123 |     return loglike
124 | 
125 |     if np.isnan(likes).any():
126 |         loglike = 0.
127 |         for j, (theta, cov) in enumerate(zip(thetas, covs)):
128 |             this_data = data[labels == j]
129 |             ch = chol(cov)
130 |             loglike += pm.mv_normal_chol_like(this_data, theta, ch)
131 | 
132 |         return loglike
133 | 
134 | def mixture_loglike2(data, thetas, taus, weights):
135 | 
136 |     n = len(data)
137 | 
138 |     covs = [inv(tau) for tau in taus]
139 | 
140 |     likes = pdfs.mvnpdf(data, thetas, covs)
141 |     loglike = (likes * weights).sum()
142 | 
143 |     # loglike = likes.ravel('F').take(labels * n + np.arange(n)).sum()
144 | 
145 |     if np.isnan(loglike):
146 |         st()
147 |         return -1e300
148 | 
149 |     return loglike
150 | 
151 |     if np.isnan(likes).any():
152 |         loglike = 0.
153 |         for j, (theta, cov) in enumerate(zip(thetas, covs)):
154 |             this_data = data[labels == j]
155 |             loglike += pm.mv_normal_chol_like(this_data, theta, ch)
156 | 
157 |         return loglike
158 | 
159 | @pm.deterministic
160 | def adj_weights(weights=weights):
161 |     return np.sort(np.r_[weights, 1 - weights.sum()])
162 | 
163 | @pm.stochastic(observed=True)
164 | def mixture(value=data, thetas=thetas, taus=taus, weights=adj_weights):
165 |     return mixture_loglike2(value, thetas, taus, weights)
166 | 
167 | sampler = pm.MCMC(locals())
168 | 
169 | sampler.sample(iter=3000, burn=100, tune_interval=100, thin=10)
170 | 
171 | 


--------------------------------------------------------------------------------
/gpustats/sampler.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | import gpustats.kernels as kernels
  4 | import gpustats.codegen as codegen
  5 | import gpustats.util as util
  6 | import pycuda.driver as drv
  7 | from pycuda.gpuarray import GPUArray, to_gpu
  8 | from pycuda.gpuarray import empty as gpu_empty
  9 | from pycuda.curandom import rand as curand
 10 | 
 11 | # reload(kernels)
 12 | # reload(codegen)
 13 | 
 14 | cu_module = codegen.get_full_cuda_module()
 15 | 
 16 | def sample_discrete(densities, logged=False,
 17 |                         return_gpuarray=False):
 18 | 
 19 |     """
 20 |     Takes a categorical sample from the unnormalized univariate
 21 |     densities defined in the rows of 'densities'
 22 | 
 23 |     Parameters
 24 |     ---------
 25 |     densities : ndarray or gpuarray (n, k)
 26 |     logged: boolean indicating whether densities is on the
 27 |     log scale ...
 28 | 
 29 |     Returns
 30 |     -------
 31 |     indices : ndarray or gpuarray (if return_gpuarray=True)
 32 |     of length n and dtype = int32
 33 |     """
 34 | 
 35 |     from gpustats.util import info
 36 | 
 37 |     n, k = densities.shape
 38 |     # prep data
 39 |     if isinstance(densities, GPUArray):
 40 |         if densities.flags.f_contiguous:
 41 |             gpu_densities = util.transpose(densities)
 42 |         else:
 43 |             gpu_densities = densities
 44 |     else:
 45 |         densities = util.prep_ndarray(densities)
 46 |         gpu_densities = to_gpu(densities)
 47 | 
 48 |     # get gpu function
 49 |     cu_func = cu_module.get_function('sample_discrete')
 50 | 
 51 |     # setup GPU data
 52 |     gpu_random = to_gpu(np.asarray(np.random.rand(n), dtype=np.float32))
 53 |     gpu_dest = gpu_empty(n, dtype=np.int32)
 54 |     dims = np.array([n,k,logged],dtype=np.int32)
 55 | 
 56 |     if info.max_block_threads<1024:
 57 |         x_block_dim = 16
 58 |     else:
 59 |         x_block_dim = 32
 60 | 
 61 |     y_block_dim = 16
 62 |     # setup GPU call
 63 |     block_design = (x_block_dim, y_block_dim, 1)
 64 |     grid_design = (int(n/y_block_dim) + 1, 1)
 65 | 
 66 |     shared_mem = 4 * ( (x_block_dim+1)*y_block_dim +  
 67 |                      2 * y_block_dim )  
 68 | 
 69 |     cu_func(gpu_densities, gpu_random, gpu_dest, 
 70 |             dims[0], dims[1], dims[2], 
 71 |             block=block_design, grid=grid_design, shared=shared_mem)
 72 | 
 73 |     gpu_random.gpudata.free()
 74 |     if return_gpuarray:
 75 |         return gpu_dest
 76 |     else:
 77 |         res = gpu_dest.get()
 78 |         gpu_dest.gpudata.free()
 79 |         return res
 80 | 
 81 | 
 82 | ## depreciated 
 83 | def sample_discrete_old(in_densities, logged=False, pad=False,
 84 |                     return_gpuarray=False):
 85 |     """
 86 |     Takes a categorical sample from the unnormalized univariate
 87 |     densities defined in the rows of 'densities'
 88 | 
 89 |     Parameters
 90 |     ---------
 91 |     densities : ndarray or gpuarray (n, k)
 92 |     logged: boolean indicating whether densities is on the
 93 |     log scale ...
 94 | 
 95 |     Returns
 96 |     -------
 97 |     indices : ndarray or gpuarray (if return_gpuarray=True)
 98 |     of length n and dtype = int32
 99 |     """
100 | 
101 |     if pad:
102 |         if logged:
103 |             densities = util.pad_data_mult16(in_densities, fill=1)
104 |         else:
105 |             densities = util.pad_data_mult16(in_densities, fill=0)
106 | 
107 |     else:
108 |         densities = in_densities
109 | 
110 |     n, k = densities.shape
111 | 
112 |     if logged:
113 |         cu_func = cu_module.get_function('sample_discrete_logged_old')
114 |     else:
115 |         cu_func = cu_module.get_function('sample_discrete_old')
116 | 
117 |     if isinstance(densities, GPUArray):
118 |         if densities.flags.f_contiguous:
119 |             gpu_densities = util.transpose(densities)
120 |         else:
121 |             gpu_densities = densities
122 |     else:
123 |         densities = util.prep_ndarray(densities)
124 |         gpu_densities = to_gpu(densities)
125 | 
126 |     # setup GPU data
127 |     #gpu_random = curand(n)
128 |     gpu_random = to_gpu(np.asarray(np.random.rand(n), dtype=np.float32))
129 |     #gpu_dest = to_gpu(np.zeros(n, dtype=np.float32))
130 |     gpu_dest = gpu_empty(n, dtype=np.float32)
131 |     stride = gpu_densities.shape[1]
132 |     if stride % 2 == 0:
133 |         stride += 1
134 |     dims = np.array([n,k, gpu_densities.shape[1], stride],dtype=np.int32)
135 | 
136 | 
137 |     # optimize design ...
138 |     grid_design, block_design = _tune_sfm(n, stride, cu_func.num_regs)
139 | 
140 |     shared_mem = 4 * (block_design[0] * stride + 
141 |                      1 * block_design[0])
142 | 
143 |     cu_func(gpu_densities, gpu_random, gpu_dest, 
144 |             dims[0], dims[1], dims[2], dims[3],
145 |             block=block_design, grid=grid_design, shared=shared_mem)
146 | 
147 |     gpu_random.gpudata.free()
148 |     if return_gpuarray:
149 |         return gpu_dest
150 |     else:
151 |         res = gpu_dest.get()
152 |         gpu_dest.gpudata.free()
153 |         return res
154 | 
155 | def _tune_sfm(n, stride, func_regs):
156 |     """
157 |     Outputs the 'opimal' block and grid configuration
158 |     for the sample discrete kernel.
159 |     """
160 |     from gpustats.util import info
161 | 
162 |     #info = DeviceInfo()
163 |     comp_cap = info.compute_cap
164 |     max_smem = info.shared_mem * 0.8
165 |     max_threads = int(info.max_block_threads * 0.5)
166 |     max_regs = 0.9 * info.max_registers
167 | 
168 |     # We want smallest dim possible in x dimsension while
169 |     # still reading mem correctly
170 | 
171 |     if comp_cap[0] == 1:
172 |         xdim = 16
173 |     else:
174 |         xdim = 32
175 | 
176 | 
177 |     def sfm_config_ok(xdim, ydim, stride, func_regs, max_regs, max_smem, max_threads):
178 |         ok = 4*(xdim*stride + 1*xdim) < max_smem and func_regs*ydim*xdim < max_regs
179 |         return ok and xdim*ydim <= max_threads
180 | 
181 |     ydim = 2
182 |     while sfm_config_ok(xdim, ydim, stride, func_regs, max_regs, max_smem, max_threads):
183 |         ydim += 1
184 | 
185 |     ydim -= 1
186 | 
187 |     nblocks = int(n/xdim) + 1
188 | 
189 |     return (nblocks,1), (xdim,ydim,1)
190 | 
191 | if __name__ == '__main__':
192 | 
193 |     n = 100
194 |     k = 5
195 |     dens = np.log(np.abs(np.random.randn(k))) - 200
196 |     densities = [dens.copy() for _ in range(n)]
197 |     dens = np.exp(dens + 200)
198 |     densities = np.asarray(densities)
199 | 
200 |     labels = sample_discrete(densities, logged=True)
201 |     mu = np.dot(dens / dens.sum(), np.arange(k))
202 |     print mu, labels.mean()
203 | 


--------------------------------------------------------------------------------
/scripts/bench.py:
--------------------------------------------------------------------------------
  1 | from pandas import *
  2 | 
  3 | import numpy as np
  4 | 
  5 | from pycuda.gpuarray import to_gpu
  6 | import gpustats
  7 | import gpustats.util as util
  8 | from scipy.stats import norm
  9 | import timeit
 10 | 
 11 | data = np.random.randn(1000000)
 12 | mean = 20
 13 | std = 5
 14 | 
 15 | univ_setup = """
 16 | import numpy as np
 17 | from pycuda.gpuarray import to_gpu
 18 | k = 8
 19 | means = np.random.randn(k)
 20 | stds = np.abs(np.random.randn(k))
 21 | 
 22 | mean = 20
 23 | std = 5
 24 | import gpustats
 25 | from scipy.stats import norm
 26 | cpu_data = np.random.randn(%d)
 27 | gpu_data = cpu_data
 28 | """
 29 | 
 30 | univ_setup_gpuarray = univ_setup + """
 31 | gpu_data = to_gpu(cpu_data)
 32 | """
 33 | 
 34 | multivar_setup = """
 35 | # from __main__ import data, mean, std
 36 | import gpustats
 37 | import gpustats.util as util
 38 | import numpy as np
 39 | import testmod
 40 | from pycuda.gpuarray import to_gpu
 41 | import testmod
 42 | from numpy.linalg import cholesky as chol
 43 | import numpy.linalg as L
 44 | 
 45 | 
 46 | def next_multiple(k, p):
 47 |     if k.__mod__(p):
 48 |         return k + (p - k.__mod__(p))
 49 | 
 50 |     return k
 51 | 
 52 | PAD_MULTIPLE = 16
 53 | HALF_WARP = 16
 54 | 
 55 | 
 56 | def pad_data(data):
 57 |     n, k = data.shape
 58 | 
 59 |     if not k.__mod__(HALF_WARP):
 60 |         pad_dim = k + 1
 61 |     else:
 62 |         pad_dim = k
 63 | 
 64 |     if k != pad_dim:
 65 |         padded_data = np.empty((n, pad_dim), dtype=np.float32)
 66 |         padded_data[:, :k] = data
 67 | 
 68 |         return padded_data
 69 |     else:
 70 |         return prep_ndarray(data)
 71 | 
 72 | def prep_ndarray(arr):
 73 |     # is float32 and contiguous?
 74 |     if not arr.dtype == np.float32 or not arr.flags.contiguous:
 75 |         arr = np.array(arr, dtype=np.float32)
 76 | 
 77 |     return arr
 78 | 
 79 | def pack_params(means, chol_sigmas, logdets):
 80 |     to_pack = []
 81 |     for m, ch, ld in zip(means, chol_sigmas, logdets):
 82 |         to_pack.append(pack_pdf_params(m, ch, ld))
 83 | 
 84 |     return np.vstack(to_pack)
 85 | 
 86 | def pack_pdf_params(mean, chol_sigma, logdet):
 87 |     k = len(mean)
 88 |     mean_len = k
 89 |     chol_len = k * (k + 1) / 2
 90 |     mch_len = mean_len + chol_len
 91 | 
 92 |     packed_dim = next_multiple(mch_len + 2, PAD_MULTIPLE)
 93 | 
 94 |     packed_params = np.empty(packed_dim, dtype=np.float32)
 95 |     packed_params[:mean_len] = mean
 96 | 
 97 |     packed_params[mean_len:mch_len] = chol_sigma[np.tril_indices(k)]
 98 |     packed_params[mch_len:mch_len + 2] = 1, logdet
 99 | 
100 |     return packed_params
101 | 
102 | k = %d
103 | 
104 | dim = 15
105 | means = np.random.randn(k, dim)
106 | covs = [util.random_cov(dim) for _ in xrange(k)]
107 | 
108 | cpu_data = np.random.randn(%d, dim)
109 | gpu_data = cpu_data
110 | """
111 | 
112 | multivar_setup_gpuarray = multivar_setup + """
113 | gpu_data = to_gpu(cpu_data)
114 | """
115 | 
116 | LOG_2_PI = np.log(2 * np.pi)
117 | 
118 | # def mvnpdf(data, mean, cov):
119 | #     ichol_sigma = np.asarray(np.linalg.inv(np.linalg.cholesky(cov)))
120 | #     # ichol_sigma = np.tril(ichol_sigma)
121 | #     logdet = np.log(np.linalg.det(cov))
122 | #     return [_mvnpdf(x, mean, ichol_sigma, logdet)
123 | #             for x in data]
124 | 
125 | # def _mvnpdf(x, mean, ichol_sigma, logdet):
126 | #     demeaned = x - mean
127 | #     discrim = ((ichol_sigma * demeaned) ** 2).sum()
128 | #     # discrim = np.dot(demeaned, np.dot(ichol_sigma, demeaned))
129 | #     return - 0.5 * (discrim + logdet + LOG_2_PI * dim)
130 | 
131 | def get_timeit(stmt, setup, iter=10):
132 |     return timeit.Timer(stmt, setup).timeit(number=iter) / iter
133 | 
134 | def compare_timings_single(n, setup=univ_setup):
135 |     gpu = "gpustats.normpdf(gpu_data, mean, std, logged=False)"
136 |     cpu = "norm.pdf(cpu_data, loc=mean, scale=std)"
137 |     setup = setup % n
138 |     return {'GPU' : get_timeit(gpu, setup, iter=1000),
139 |             'CPU' : get_timeit(cpu, setup)}
140 | 
141 | def compare_timings_multi(n, setup=univ_setup):
142 |     gpu = "gpustats.normpdf_multi(gpu_data, means, stds, logged=False)"
143 |     cpu = """
144 | for m, s in zip(means, stds):
145 |     norm.pdf(cpu_data, loc=m, scale=s)
146 | """
147 |     setup = setup % n
148 |     return {'GPU' : get_timeit(gpu, setup, iter=100),
149 |             'CPU' : get_timeit(cpu, setup)}
150 | 
151 | 
152 | def mvcompare_timings(n, k=1, setup=multivar_setup):
153 |     gpu = "gpustats.mvnpdf_multi(gpu_data, means, covs, logged=False)"
154 |     cpu = """
155 | ichol_sigmas = [L.inv(chol(sig)) for sig in covs]
156 | logdets = [np.log(np.linalg.det(sig)) for sig in covs]
157 | params = pack_params(means, covs, logdets)
158 | testmod.cpu_mvnpdf(cpu_data, params, dim)
159 |     """
160 |     setup = setup % (k, n)
161 |     return {'GPU' : get_timeit(gpu, setup, iter=100),
162 |             'CPU' : get_timeit(cpu, setup)}
163 | 
164 | def get_timing_results(timing_f):
165 |     lengths = [100, 1000, 10000, 100000, 1000000]
166 | 
167 |     result = {}
168 |     for n in lengths:
169 |         print n
170 |         result[n] = timing_f(n)
171 |     result = DataFrame(result).T
172 |     result['Speedup'] = result['CPU'] / result['GPU']
173 |     return result
174 | 
175 | # mvsingle = get_timing_results(mvcompare_timings)
176 | # comp_gpu = lambda n: mvcompare_timings(n, setup=multivar_setup_gpuarray)
177 | # mvsingle_gpu = get_timing_results(comp_gpu)
178 | # multi_comp = lambda n: mvcompare_timings(n, k=16)
179 | # mvmulti = get_timing_results(multi_comp)
180 | # multi_comp_gpu = lambda n: mvcompare_timings(n, k=16,
181 | #                                        setup=multivar_setup_gpuarray)
182 | # mvmulti_gpu = get_timing_results(multi_comp_gpu)
183 | 
184 | single = get_timing_results(compare_timings_single)
185 | comp_gpu = lambda n: compare_timings_single(n, setup=univ_setup_gpuarray)
186 | single_gpu = get_timing_results(comp_gpu)
187 | multi = get_timing_results(compare_timings_multi)
188 | comp_gpu = lambda n: compare_timings_multi(n, setup=univ_setup_gpuarray)
189 | multi_gpu = get_timing_results(comp_gpu)
190 | 
191 | data = DataFrame({
192 |     'Single' : single['Speedup'],
193 |     'Single (GPUArray)' : single_gpu['Speedup'],
194 |     'Multi' : multi['Speedup'],
195 |     'Multi (GPUArray)' : multi_gpu['Speedup'],
196 | })
197 | 
198 | 
199 | mvdata = DataFrame({
200 |     'Single' : mvsingle['Speedup'],
201 |     'Single (GPUArray)' : mvsingle_gpu['Speedup'],
202 |     'Multi' : mvmulti['Speedup'],
203 |     'Multi (GPUArray)' : mvmulti_gpu['Speedup'],
204 | })
205 | 
206 | if __name__ == '__main__':
207 |     import gpustats
208 |     import numpy as np
209 |     from scipy.stats import norm
210 |     import testmod
211 |     from numpy.linalg import cholesky as chol
212 |     import numpy.linalg as L
213 | 
214 |     # dim = 15
215 |     # k = 8
216 |     # means = np.random.randn(k, dim)
217 |     # covs = [np.asarray(util.random_cov(dim)) for _ in xrange(k)]
218 | 
219 |     # cpu_data = np.random.randn(100000, dim)
220 |     # gpu_data = to_gpu(cpu_data)
221 | 
222 |     # ichol_sigmas = [L.inv(chol(sig)) for sig in covs]
223 |     # logdets = [np.log(np.linalg.det(sig)) for sig in covs]
224 |     # packed_params = pack_params(means, covs, logdets)
225 | 
226 |     # pdfs = gpustats.mvnpdf(cpu_data, means[0], covs[0])
227 |     # pdfs = testmod.cpu_mvnpdf(cpu_data, packed_params, 15)
228 | 
229 | 


--------------------------------------------------------------------------------
/gpustats/codegen.py:
--------------------------------------------------------------------------------
  1 | import pycuda.driver as drv
  2 | import pycuda.tools
  3 | #import pycuda.autoinit
  4 | drv.init()
  5 | if drv.Context.get_current() is None:
  6 |     import pycuda.autoinit
  7 | 
  8 | import numpy
  9 | import numpy.linalg as la
 10 | import os
 11 | from pycuda.compiler import SourceModule
 12 | from gpustats.util import get_cufiles_path
 13 | 
 14 | class CUDAModule(object):
 15 |     """
 16 |     Interfaces with PyCUDA
 17 | 
 18 |     Parameters
 19 |     ----------
 20 |     kernel_dict :
 21 |     """
 22 |     def __init__(self, kernel_dict):
 23 |         self.kernel_dict = kernel_dict
 24 |         self.support_code = _get_support_code()
 25 | 
 26 |         self.all_code = self._get_full_source()
 27 |         try:
 28 |             #self.pycuda_module = SourceModule(self.all_code)
 29 |             # dictionary mapping contexts to their respective loaded code modules
 30 |             self.pycuda_modules = { drv.Context.get_current() : SourceModule(self.all_code) }
 31 |         except Exception:
 32 |             f = open('foo.cu', 'w')
 33 |             print >> f, self.all_code
 34 |             f.close()
 35 |             raise
 36 |         #self.curDevice = drv.Context.get_device()
 37 | 
 38 |     def _get_full_source(self):
 39 |         formatted_kernels = [kern.get_code()
 40 |                              for kern in self.kernel_dict.values()]
 41 |         return '\n'.join([self.support_code] + formatted_kernels)
 42 | 
 43 |     def get_function(self, name):
 44 |         # get the module for this context
 45 |         context = drv.Context.get_current()
 46 |         try:
 47 |             mod = self.pycuda_modules[context]
 48 |         except KeyError:
 49 |             # if it's a new context, init the module
 50 |             self.pycuda_modules[context] = SourceModule(self.all_code)
 51 |             mod = self.pycuda_modules[context]
 52 |         return mod.get_function('k_%s' % name)
 53 |         #curDevice = drv.Context.get_device()
 54 |         #if self.curDevice != curDevice:
 55 |         #    self.pycuda_module = SourceModule(self.all_code)
 56 |         #    self.curDevice = curDevice
 57 |         #return self.pycuda_module.get_function('k_%s' % name)
 58 | 
 59 | def _get_support_code():
 60 |     path = os.path.join(get_cufiles_path(), 'support.cu')
 61 |     return open(path).read()
 62 | 
 63 | def _get_mvcaller_code():
 64 |     # for multivariate pdfs
 65 |     path = os.path.join(get_cufiles_path(), 'mvcaller.cu')
 66 |     return open(path).read()
 67 | 
 68 | def _get_univcaller_code():
 69 |     # For univariate pdfs
 70 |     path = os.path.join(get_cufiles_path(), 'univcaller.cu')
 71 |     return open(path).read()
 72 | 
 73 | class Kernel(object):
 74 | 
 75 |     def __init__(self, name):
 76 |         if name is None:
 77 |             raise ValueError('Kernel must have a default name')
 78 | 
 79 |         self.name = name
 80 | 
 81 |     def get_code(self):
 82 |         logic = self.get_logic()
 83 |         caller = self.get_caller()
 84 |         return '\n'.join((logic, caller))
 85 | 
 86 |     def get_logic(self, **kwds):
 87 |         raise NotImplementedError
 88 | 
 89 |     def get_caller(self, **kwds):
 90 |         raise NotImplementedError
 91 | 
 92 |     def get_name(self, name=None):
 93 |         # can override default name, for transforms. this a hack?
 94 |         if name is None:
 95 |             name = self.name
 96 | 
 97 |         return name
 98 | 
 99 | class CUFile(Kernel):
100 |     """
101 |     Expose kernel contained in .cu file in the cufiles directory to code
102 |     generation framework. Kernel need only have a template to be able to change
103 |     the name of the generated kernel
104 |     """
105 |     def __init__(self, name, filepath):
106 |         self.full_path = os.path.join(get_cufiles_path(),
107 |                                       filepath)
108 | 
109 |         Kernel.__init__(self, name)
110 | 
111 |     def get_code(self):
112 |         code = open(self.full_path).read()
113 |         return code % {'name' : self.name}
114 | 
115 | class SamplerKernel(Kernel):
116 |     """
117 |     Holds info for measure sample kernel.
118 |     """
119 |     def __init__(self, name, logic_code):
120 |         self.logic_code = logic_code
121 |         Kernel.__init__(self, name)
122 | 
123 |     def get_logic(self, name=None):
124 |         return self.logic_code
125 | 
126 |     def get_caller(self, name=None):
127 |         return self._caller % {'name' : self.get_name(name)}
128 | 
129 | class DensityKernel(Kernel):
130 |     """
131 |     Generate kernel for probability density function
132 |     """
133 | 
134 |     _caller = _get_univcaller_code()
135 |     def __init__(self, name, logic_code):
136 | 
137 |         self.logic_code = logic_code
138 | 
139 |         Kernel.__init__(self, name)
140 | 
141 |     def get_logic(self, name=None):
142 |         return self.logic_code % {'name' : self.get_name(name)}
143 | 
144 |     def get_caller(self, name=None):
145 |         return self._caller % {'name' : self.get_name(name)}
146 | 
147 | class MVDensityKernel(DensityKernel):
148 |     """
149 | 
150 |     """
151 |     _caller = _get_mvcaller_code()
152 | 
153 | class Transform(Kernel):
154 |     """
155 |     Enable simple transforms of kernels to compute modified kernel code stub
156 |     """
157 |     def __init__(self, name, kernel):
158 |         self.kernel = kernel
159 |         Kernel.__init__(self, name)
160 | 
161 |     # XXX: HACK, not general for non-density kernels
162 |     def is_multivariate(self):
163 |         return isinstance(self.kernel, MVDensityKernel)
164 | 
165 | # flop the right name?
166 | 
167 | class Flop(Transform):
168 |     op = None
169 | 
170 |     def get_logic(self, name=None):
171 |         name = self.get_name(name)
172 | 
173 |         actual_name = '%s_stub' % name
174 |         kernel_logic = self.kernel.get_logic(name=actual_name)
175 | 
176 |         if self.is_multivariate():
177 |             stub_caller = _mv_stub_caller
178 |         else:
179 |             stub_caller = _univ_stub_caller
180 | 
181 |         transform_logic = stub_caller % {'name' : name,
182 |                                          'actual_kernel' : actual_name,
183 |                                          'op' : self.op}
184 | 
185 |         return '\n'.join((kernel_logic, transform_logic))
186 | 
187 |     def get_caller(self):
188 |         return self.kernel.get_caller(self.name)
189 | 
190 | _univ_stub_caller = """
191 | __device__ float %(name)s(float* x, float* params) {
192 |     return %(op)s(%(actual_kernel)s(x, params));
193 | }
194 | """
195 | 
196 | _mv_stub_caller = """
197 | __device__ float %(name)s(float* x, float* params, int dim) {
198 |     return %(op)s(%(actual_kernel)s(x, params, dim));
199 | }
200 | """
201 | 
202 | class Exp(Flop):
203 |     op = 'expf'
204 | 
205 | class Log(Flop):
206 |     op = 'logf'
207 | 
208 | class Sqrt(Flop):
209 |     op = 'sqrtf'
210 | 
211 | _cu_module = None
212 | 
213 | def get_full_cuda_module():
214 |     import gpustats.kernels as kernels
215 |     global _cu_module
216 | 
217 |     if _cu_module is None:
218 |         objects = kernels.__dict__
219 | 
220 |         all_kernels = dict((k, v)
221 |                            for k, v in kernels.__dict__.iteritems()
222 |                            if isinstance(v, Kernel))
223 |         _cu_module = CUDAModule(all_kernels)
224 | 
225 |     return _cu_module
226 | 
227 | if __name__ == '__main__':
228 |     pass
229 | 


--------------------------------------------------------------------------------
/old/mvnpdf.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Multivariate normal pdf implementation
  3 |  */
  4 | 
  5 | #ifndef _INCLUDED_MVNPDF
  6 | #define _INCLUDED_MVNPDF
  7 | 
  8 | #ifdef __cplusplus
  9 | extern "C" {
 10 | #endif
 11 | 
 12 | #include "mvnpdf.h"
 13 | #include "cucommon.h"
 14 | 
 15 | int compute_shmem(PMatrix* data, PMatrix* params, int nparams, int ndata) {
 16 |   // to hold specified about of data, parameters, and results
 17 |   int result_space = nparams * ndata;
 18 |   int param_space = params->stride * nparams;
 19 |   int data_space = data->stride * ndata;
 20 | 
 21 |   return sizeof(float) * (result_space + param_space + data_space);
 22 | }
 23 | 
 24 | // Compute "optimal" block size given number of data points / parameters
 25 | void get_tuned_layout(BlockDesign* info, PMatrix* data, PMatrix* params,
 26 |                       int max_block_params) {
 27 |   // query the device for smem / max # of threads
 28 |   int max_smem = smem_size() / 10 * 9;
 29 |   int max_threads = max_block_threads();
 30 | 
 31 |   // at most max_block_params sets of density parameters per block
 32 |   // for low-dimensional data, better to do more?
 33 |   int params_per = max_block_params;
 34 |   if (params->rows < max_block_params)
 35 |     params_per = next_pow2(params->rows, max_block_params);
 36 | 
 37 |   int data_per = max_threads / params_per;
 38 |   // at least 16 data points per block
 39 |   while (data_per < 16 & params_per > 1) {
 40 |     params_per /= 2;
 41 |     data_per *= 2;
 42 |   }
 43 | 
 44 |   while (1) {
 45 |     while (compute_shmem(data, params, params_per, data_per) > max_smem) {
 46 |       if (data_per <= 1)
 47 |         break;
 48 |       if (params_per > 1)
 49 |         params_per /= 2;
 50 |       else
 51 |         data_per /= 2;
 52 |     }
 53 |     // can't fit max_block_params sets of parameters into the shared memory
 54 |     if (data_per == 0) {
 55 |       data_per = 1;
 56 |       params_per /= 2;
 57 |       // start over the tuning
 58 |       continue;
 59 |     }
 60 |     else break;
 61 |   }
 62 | 
 63 |   // possible to squeeze more data?
 64 |   while (compute_shmem(data, params, params_per, 2 * data_per) <= max_smem)
 65 |     if (2 * data_per * params_per <= max_threads)
 66 |       data_per *= 2;
 67 |     else
 68 |       break;
 69 | 
 70 |   info->data_per_block = data_per;
 71 |   info->params_per_block = params_per;
 72 | }
 73 | 
 74 | __device__ int d_next_multiple(int k, int mult) {
 75 |   if (k % mult)
 76 |     return k + (mult - k % mult);
 77 |   else
 78 |     return k;
 79 | }
 80 | 
 81 | int next_multiple(int k, int mult) {
 82 |   if (k % mult)
 83 |     return k + (mult - k % mult);
 84 |   else
 85 |     return k;
 86 | }
 87 | 
 88 | __device__ float compute_pdf(float* data, float* params, int dim) {
 89 |   float* mean = params;
 90 |   float* sigma = params + dim;
 91 |   float mult = params[dim * (dim + 3) / 2];
 92 |   float logdet = params[dim * (dim + 3) / 2 + 1];
 93 | 
 94 |   float discrim = 0;
 95 |   float sum;
 96 |   unsigned int i, j;
 97 |   for (i = 0; i < dim; ++i)
 98 |   {
 99 |     sum = 0;
100 |     for(j = 0; j <= i; ++j) {
101 |       sum += *sigma++ * (data[j] - mean[j]);
102 |     }
103 |     discrim += sum * sum;
104 |   }
105 |   return log(mult) - 0.5 * (discrim + logdet + LOG_2_PI * dim);
106 | }
107 | 
108 | __device__ void copy_chunks(float* in_buf, float* out_buf,
109 |                             unsigned int tid, unsigned int total) {
110 |   for (unsigned int chunk = 0; chunk + tid < total; chunk += blockDim.x) {
111 |     out_buf[chunk + tid] = in_buf[chunk + tid];
112 |   }
113 | }
114 | 
115 | __global__ void mvnpdf_k(const PMatrix data, const PMatrix params,
116 |                          const BlockDesign design, float* output) {
117 | 
118 |   unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
119 | 
120 |   unsigned int rel_param = tid / design.data_per_block;
121 |   unsigned int rel_data = tid - rel_param * design.data_per_block;
122 | 
123 |   unsigned int obs_num = design.data_per_block * blockIdx.x + rel_data;
124 |   unsigned int param_num = design.params_per_block * blockIdx.y + rel_param;
125 | 
126 |   // set up shared data
127 |   extern __shared__ float shared_data[];
128 |   float* sh_params = shared_data;
129 |   float* sh_data = sh_params + design.params_per_block * params.stride;
130 |   float* sh_result = sh_data + design.data_per_block * data.stride;
131 | 
132 |   copy_chunks(data.buf + design.data_per_block * blockIdx.x * data.stride,
133 |               sh_data, tid,
134 |               min(data.rows - design.data_per_block * blockIdx.x,
135 |                   design.data_per_block) * data.stride);
136 | 
137 |   copy_chunks(params.buf + design.params_per_block * blockIdx.y * params.stride,
138 |               sh_params, tid,
139 |               min(design.params_per_block,
140 |                   params.rows - design.params_per_block * blockIdx.y) * params.stride);
141 | 
142 |   __syncthreads();
143 | 
144 |   // allocated enough shared memory so that this will not walk out of bounds
145 |   // no matter what, though some of the results will be garbage
146 |   sh_result[tid] = compute_pdf(sh_data + rel_data * data.stride,
147 |                                sh_params + rel_param * params.stride,
148 |                                data.cols);
149 |   __syncthreads();
150 | 
151 |   unsigned int result_idx = data.rows * param_num + obs_num;
152 | 
153 |   // output is column-major, so this will then coalesce
154 |   if (obs_num < data.rows & param_num < params.rows) {
155 |     output[result_idx] = sh_result[tid];
156 |   }
157 | }
158 | 
159 | // XXX: fix this
160 | int MAX_BLOCK_PARAMS = 64;
161 | 
162 | cudaError_t invoke_mvnpdf(PMatrix data, PMatrix params, float* d_pdf) {
163 |   // Need to automatically tune block / grid layout to maximize shared memory
164 |   // usage and coalescence, reduce wasted threads!
165 |   BlockDesign design;
166 |   get_tuned_layout(&design, &data, &params, MAX_BLOCK_PARAMS);
167 | 
168 |   int nthreads = design.data_per_block * design.params_per_block;
169 | 
170 |   // Now set up grid layout / block size
171 |   int grid_x = get_boxes(data.rows, design.data_per_block);
172 |   int grid_y = get_boxes(params.rows, design.params_per_block);
173 |   dim3 gridPDF(grid_x, grid_y);
174 |   dim3 blockPDF(nthreads, 1);
175 | 
176 |   int sharedMemSize = compute_shmem(&data, &params,
177 |                                     design.params_per_block,
178 |                                     design.data_per_block);
179 | 
180 | #ifdef DEBUG
181 |   printf("number params: %d, number data points: %d\n",
182 |          design.params_per_block, design.data_per_block);
183 |   printf("sharedMemSize: %d\n", sharedMemSize);
184 |   printf("block: %d x %d, grid: %d x %d\n", blockPDF.x, blockPDF.y,
185 |          gridPDF.x, gridPDF.y);
186 |   printf("design: %d x %d\n", design.data_per_block, design.params_per_block);
187 | 
188 |   printf("nparams: %d\n", params.rows);
189 | #endif
190 | 
191 |   mvnpdf_k<<<gridPDF,blockPDF,sharedMemSize>>>(data, params, design, d_pdf);
192 |   return cudaSuccess;
193 | }
194 | 
195 | void mvnpdf(float* h_data, /** Data-vector; padded */
196 |              float* h_params, /** Density info; already padded */
197 |              float* h_pdf, /** Resultant PDF */
198 |              int data_dim,
199 |              int total_obs,
200 |              int nparams, // multiple sets of parameters
201 |              int param_stride, // with padding
202 |              int data_stride // with padding
203 |   ) {
204 | 
205 |   float* d_data;
206 |   float* d_params;
207 |   float* d_pdf;
208 |   cudaError_t error;
209 | 
210 |   PMatrix pdata, pparams;
211 |   CATCH_ERR(cudaMalloc((void**) &d_pdf, total_obs * nparams * sizeof(float)));
212 |   CATCH_ERR(cudaMalloc((void**) &d_data,
213 |                        data_stride * total_obs * sizeof(float)));
214 |   CATCH_ERR(cudaMalloc((void**) &d_params,
215 |                        param_stride * nparams * sizeof(float)));
216 | 
217 |   h_to_d(h_data, d_data, total_obs * data_stride);
218 |   h_to_d(h_params, d_params, nparams * param_stride);
219 | 
220 |   PMatrix_init(&pdata, d_data, total_obs, data_dim, data_stride);
221 |   PMatrix_init(&pparams, d_params, nparams,
222 |                data_dim * (data_dim + 3) / 2 + 2, param_stride);
223 | 
224 |   invoke_mvnpdf(pdata, pparams, d_pdf);
225 |   d_to_h(d_pdf, h_pdf, total_obs * nparams);
226 | 
227 |   cudaFree(d_data);
228 |   cudaFree(d_params);
229 |   cudaFree(d_pdf);
230 | }
231 | 
232 | void cpu_mvnpdf(float* x, float* density, float * output, int dim,
233 |                 int padded_dim, int N, int T) {
234 |     int LOGDET_OFFSET = dim * (dim + 3) / 2;
235 |     int MEAN_CHD_DIM = dim * (dim + 3) / 2  + 2;
236 | 
237 |     int PACK_DIM = next_multiple(MEAN_CHD_DIM, 16);
238 | 
239 |     float* xx = (float*) malloc(dim * sizeof(float));
240 |     int obs, component;
241 | 
242 |     for (obs = 0; obs < N; obs++) {
243 |         for (component = 0; component < T; component++) {
244 |             float discrim;
245 |             float* tData = x + obs * padded_dim;
246 |             float* tDensityInfo = density + component * PACK_DIM;
247 |             float* tMean = tDensityInfo;
248 |             float* tSigma = tDensityInfo + dim;
249 |             float  tP = tDensityInfo[LOGDET_OFFSET];
250 |             float  tLogDet = tDensityInfo[LOGDET_OFFSET+1];
251 | 
252 |             // Do density calculation
253 |             discrim = 0;
254 |             for(int i=0; i < dim; i++) {
255 |                 float sum = 0;
256 |                 for(int j=0; j <= i; j++) {
257 |                   sum += *tSigma * (tData[j] - tMean[j]); // xx[j] is always calculated since j <= i
258 |                   tSigma++;
259 |                 }
260 | 
261 |                 discrim += sum * sum;
262 |             }
263 |             output[obs * T + component] = log(tP) - 0.5 * (discrim + tLogDet + (LOG_2_PI*(float) dim));
264 |         }
265 |     }
266 |     free(xx);
267 | }
268 | 
269 | 
270 | #ifdef __cplusplus
271 | }
272 | #endif
273 | 
274 | #endif // _INCLUDED_MVNPDF
275 | 


--------------------------------------------------------------------------------
/gpustats/pdfs.py:
--------------------------------------------------------------------------------
  1 | from numpy.random import randn
  2 | from numpy.linalg import cholesky as chol
  3 | import numpy as np
  4 | import numpy.linalg as LA
  5 | 
  6 | from pycuda.gpuarray import GPUArray, to_gpu
  7 | from pycuda.gpuarray import empty as gpu_empty
  8 | import gpustats.kernels as kernels
  9 | import gpustats.codegen as codegen
 10 | from gpustats.util import transpose as gpu_transpose
 11 | reload(codegen)
 12 | reload(kernels)
 13 | import gpustats.util as util
 14 | import pycuda.driver as drv
 15 | 
 16 | __all__ = ['mvnpdf', 'mvnpdf_multi', 'normpdf', 'normpdf_multi']
 17 | 
 18 | cu_module = codegen.get_full_cuda_module()
 19 | 
 20 | #-------------------------------------------------------------------------------
 21 | # Invokers for univariate and multivariate density functions conforming to the
 22 | # standard API
 23 | 
 24 | def _multivariate_pdf_call(cu_func, data, packed_params, get, order,
 25 |                            datadim=None):
 26 |     packed_params = util.prep_ndarray(packed_params)
 27 |     func_regs = cu_func.num_regs
 28 | 
 29 |     # Prep the data. Skip if gpudata ...
 30 |     if isinstance(data, GPUArray):
 31 |         padded_data = data
 32 |         if datadim==None:
 33 |             ndata, dim = data.shape
 34 |         else:
 35 |             ndata, dim = data.shape[0], datadim
 36 | 
 37 |     else:
 38 | 
 39 |         ndata, dim = data.shape
 40 |         padded_data = util.pad_data(data)
 41 | 
 42 |     nparams = len(packed_params)
 43 |     data_per, params_per = util.tune_blocksize(padded_data,
 44 |                                                packed_params,
 45 |                                                func_regs)
 46 | 
 47 |     blocksize = data_per * params_per
 48 |     #print 'the blocksize is ' + str(blocksize)
 49 |     #print 'data_per ' + str(data_per) + '. params_per ' + str(params_per)
 50 |     shared_mem = util.compute_shmem(padded_data, packed_params,
 51 |                                     data_per, params_per)
 52 |     block_design = (data_per * params_per, 1, 1)
 53 |     grid_design = (util.get_boxes(ndata, data_per),
 54 |                    util.get_boxes(nparams, params_per))
 55 | 
 56 |     # see cufiles/mvcaller.cu
 57 |     design = np.array(((data_per, params_per) + # block design
 58 |                        padded_data.shape + # data spec
 59 |                        (dim,) + # non-padded number of data columns
 60 |                        packed_params.shape), # params spec
 61 |                       dtype=np.int32)
 62 | 
 63 |     if nparams == 1:
 64 |         gpu_dest = gpu_empty(ndata, dtype=np.float32)
 65 |         #gpu_dest = to_gpu(np.zeros(ndata, dtype=np.float32))
 66 |     else:
 67 |         gpu_dest = gpu_empty((ndata, nparams), dtype=np.float32, order='F')
 68 |         #gpu_dest = to_gpu(np.zeros((ndata, nparams), dtype=np.float32, order='F'))
 69 | 
 70 |     # Upload data if not already uploaded
 71 |     if not isinstance(padded_data, GPUArray):
 72 |         gpu_padded_data = to_gpu(padded_data)
 73 |     else:
 74 |         gpu_padded_data = padded_data
 75 | 
 76 |     gpu_packed_params = to_gpu(packed_params)
 77 | 
 78 |     params = (gpu_dest, gpu_padded_data, gpu_packed_params) + tuple(design)
 79 |     kwds = dict(block=block_design, grid=grid_design, shared=shared_mem)
 80 |     cu_func(*params, **kwds)
 81 | 
 82 |     gpu_packed_params.gpudata.free()
 83 |     if get:
 84 |         if order=='F':
 85 |             return gpu_dest.get()
 86 |         else:
 87 |             return np.asarray(gpu_dest.get(), dtype=np.float32, order='C')
 88 |         #output = gpu_dest.get()
 89 |         #if nparams > 1:
 90 |         #    output = output.reshape((nparams, ndata), order='C').T
 91 |         #return output
 92 |     else:
 93 |         if order=='F' or nparams==1:
 94 |             return gpu_dest
 95 |         else:
 96 |             res = gpu_transpose(util.GPUarray_reshape(gpu_dest, (nparams, ndata), "C"))
 97 |             gpu_dest.gpudata.free()
 98 |             return res
 99 |             #return gpu_transpose(gpu_dest.reshape(nparams, ndata, 'C'))
100 | 
101 | def _univariate_pdf_call(cu_func, data, packed_params, get):
102 |     ndata = len(data)
103 |     nparams = len(packed_params)
104 | 
105 |     func_regs = cu_func.num_regs
106 | 
107 |     packed_params = util.prep_ndarray(packed_params)
108 | 
109 |     data_per, params_per = util.tune_blocksize(data,
110 |                                                packed_params,
111 |                                                func_regs)
112 | 
113 |     shared_mem = util.compute_shmem(data, packed_params,
114 |                                     data_per, params_per)
115 | 
116 |     block_design = (data_per * params_per, 1, 1)
117 |     grid_design = (util.get_boxes(ndata, data_per),
118 |                    util.get_boxes(nparams, params_per))
119 | 
120 |     # see cufiles/univcaller.cu
121 | 
122 |     #gpu_dest = to_gpu(np.zeros((ndata, nparams), dtype=np.float32))
123 |     gpu_dest = gpu_empty((ndata, nparams), dtype=np.float32)
124 |     gpu_data = data if isinstance(data, GPUArray) else to_gpu(data)
125 |     gpu_packed_params = to_gpu(packed_params)
126 | 
127 |     design = np.array(((data_per, params_per) + # block design
128 |                        (len(data),) +
129 |                        packed_params.shape), # params spec
130 |                       dtype=np.int32)
131 | 
132 |     cu_func(gpu_dest,
133 |             gpu_data, gpu_packed_params, design[0],
134 |             design[1], design[2], design[3], design[4],
135 |             block=block_design, grid=grid_design, shared=shared_mem)
136 | 
137 |     if get:
138 |         output = gpu_dest.get()
139 |         if nparams > 1:
140 |             output = output.reshape((nparams, ndata), order='C').T
141 |         return output
142 |     else:
143 |         return gpu_dest
144 | 
145 | #-------------------------------------------------------------------------------
146 | # Multivariate normal
147 | 
148 | def mvnpdf(data, mean, cov, weight=None, logged=True, get=True, order="F",
149 |            datadim=None):
150 |     """
151 |     Multivariate normal density
152 | 
153 |     Parameters
154 |     ----------
155 | 
156 |     Returns
157 |     -------
158 |     """
159 |     return mvnpdf_multi(data, [mean], [cov],
160 |                         logged=logged, get=get, order=order,
161 |                         datadim=datadim).squeeze()
162 | 
163 | def mvnpdf_multi(data, means, covs, weights=None, logged=True,
164 |                  get=True, order="F", datadim=None):
165 |     """
166 |     Multivariate normal density with multiple sets of parameters
167 | 
168 |     Parameters
169 |     ----------
170 |     data : ndarray (n x k)
171 |     covs : sequence of 2d k x k matrices (length j)
172 |     weights : ndarray (length j)
173 |         Multiplier for component j, usually will sum to 1
174 | 
175 |     get = False leaves the result on the GPU
176 |     without copying back.
177 | 
178 |     If data has already been padded, the orginal dimension
179 |     must be passed in datadim
180 | 
181 |     It data is of GPUarray type, the data is assumed to be
182 |     padded, and datadim will need to be passed if padding
183 |     was needed.
184 | 
185 |     Returns
186 |     -------
187 |     densities : n x j
188 |     """
189 |     if logged:
190 |         cu_func = cu_module.get_function('log_pdf_mvnormal')
191 |     else:
192 |         cu_func = cu_module.get_function('pdf_mvnormal')
193 | 
194 |     assert(len(covs) == len(means))
195 | 
196 |     ichol_sigmas = [LA.inv(chol(c)) for c in covs]
197 |     logdets = [-2.0*np.log(c.diagonal()).sum() for c in ichol_sigmas]
198 | 
199 |     if weights is None:
200 |         weights = np.ones(len(means))
201 | 
202 |     packed_params = _pack_mvnpdf_params(means, ichol_sigmas, logdets, weights)
203 | 
204 |     return _multivariate_pdf_call(cu_func, data, packed_params,
205 |                                   get, order,datadim)
206 | 
207 | def _pack_mvnpdf_params(means, ichol_sigmas, logdets, weights):
208 |     to_pack = []
209 |     for m, ch, ld, w in zip(means, ichol_sigmas, logdets, weights):
210 |         to_pack.append(_pack_mvnpdf_params_single(m, ch, ld, w))
211 | 
212 |     return np.vstack(to_pack)
213 | 
214 | def _pack_mvnpdf_params_single(mean, ichol_sigma, logdet, weight=1):
215 |     PAD_MULTIPLE = 16
216 |     k = len(mean)
217 |     mean_len = k
218 |     ichol_len = k * (k + 1) / 2
219 |     mch_len = mean_len + ichol_len
220 | 
221 |     packed_dim = util.next_multiple(mch_len + 2, PAD_MULTIPLE)
222 | 
223 |     packed_params = np.empty(packed_dim, dtype=np.float32)
224 |     packed_params[:mean_len] = mean
225 | 
226 |     packed_params[mean_len:mch_len] = ichol_sigma[np.tril_indices(k)]
227 |     packed_params[mch_len:mch_len + 2] = weight, logdet
228 | 
229 |     return packed_params
230 | 
231 | #-------------------------------------------------------------------------------
232 | # Univariate normal
233 | 
234 | def normpdf(x, mean, std, logged=True, get=True):
235 |     """
236 |     Normal (Gaussian) density
237 | 
238 |     Parameters
239 |     ----------
240 | 
241 |     Returns
242 |     -------
243 |     """
244 |     return normpdf_multi(x, [mean], [std], logged=logged, get=get).squeeze()
245 | 
246 | def normpdf_multi(x, means, std, logged=True, get=True):
247 |     if logged:
248 |         cu_func = cu_module.get_function('log_pdf_normal')
249 |     else:
250 |         cu_func = cu_module.get_function('pdf_normal')
251 | 
252 |     packed_params = np.c_[means, std]
253 | 
254 |     if not isinstance(x, GPUArray):
255 |         x = util.prep_ndarray(x)
256 | 
257 |     return _univariate_pdf_call(cu_func, x, packed_params, get)
258 | 
259 | if __name__ == '__main__':
260 |     import gpustats.compat as compat
261 | 
262 |     n = 1e5
263 |     k = 8
264 | 
265 |     np.random.seed(1)
266 |     data = randn(n, k).astype(np.float32)
267 |     mean = randn(k).astype(np.float32)
268 |     cov = util.random_cov(k).astype(np.float32)
269 | 
270 |     result = mvnpdf_multi(data, [mean, mean], [cov, cov])
271 |     # pyresult = compat.python_mvnpdf(data, [mean], [cov]).squeeze()
272 |     # print result - pyresult
273 | 


--------------------------------------------------------------------------------
/gpustats/util.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pycuda.driver as drv
  3 | import pycuda.gpuarray as gpuarray
  4 | import pycuda
  5 | import scipy.linalg as LA
  6 | drv.init()
  7 | if drv.Context.get_current() is None:
  8 |     import pycuda.autoinit
  9 | from pycuda.compiler import SourceModule
 10 | 
 11 | def threadSafeInit(device = 0):
 12 |     """
 13 |     If gpustats (or any other pycuda work) is used inside a 
 14 |     multiprocessing.Process, this function must be used inside the
 15 |     thread to clean up invalid contexts and create a new one on the 
 16 |     given device. Assumes one GPU per thread.
 17 |     """
 18 | 
 19 |     import atexit
 20 |     drv.init() # just in case
 21 | 
 22 |     ## clean up all contexts. most will be invalid from
 23 |     ## multiprocessing fork
 24 |     import os; import sys
 25 |     clean = False
 26 |     while not clean:
 27 |         _old_ctx = drv.Context.get_current()
 28 |         if _old_ctx is None:
 29 |             clean = True
 30 |         else:
 31 |             ## detach: will give warnings to stderr if invalid
 32 |             _old_cerr = os.dup(sys.stderr.fileno())
 33 |             _nl = os.open(os.devnull, os.O_RDWR)
 34 |             os.dup2(_nl, sys.stderr.fileno())
 35 |             _old_ctx.detach() 
 36 |             sys.stderr = os.fdopen(_old_cerr, "wb")
 37 |             os.close(_nl)
 38 |     from pycuda.tools import clear_context_caches
 39 |     clear_context_caches()
 40 |         
 41 |     ## init a new device
 42 |     dev = drv.Device(device)
 43 |     ctx = dev.make_context()
 44 | 
 45 |     ## pycuda.autoinit exitfunc is bad now .. delete it
 46 |     exit_funcs = atexit._exithandlers
 47 |     for fn in exit_funcs:
 48 |         if hasattr(fn[0], 'func_name'):
 49 |             if fn[0].func_name == '_finish_up':
 50 |                 exit_funcs.remove(fn)
 51 |             if fn[0].func_name == 'clean_all_contexts': # avoid duplicates
 52 |                 exit_funcs.remove(fn)
 53 | 
 54 |     ## make sure we clean again on exit
 55 |     atexit.register(clean_all_contexts)
 56 | 
 57 | 
 58 | def clean_all_contexts():
 59 | 
 60 |     ctx = True
 61 |     while ctx is not None:
 62 |         ctx = drv.Context.get_current()
 63 |         if ctx is not None:
 64 |             ctx.detach()
 65 | 
 66 |     from pycuda.tools import clear_context_caches
 67 |     clear_context_caches()
 68 |     
 69 | 
 70 | def GPUarray_reshape(garray, shape=None, order="C"):
 71 |     if shape is None:
 72 |         shape = garray.shape
 73 |     return gpuarray.GPUArray(
 74 |         shape=shape,
 75 |         dtype=garray.dtype,
 76 |         allocator=garray.allocator,
 77 |         base=garray,
 78 |         gpudata=int(garray.gpudata),
 79 |         order=order)
 80 | 
 81 | def GPUarray_order(garray, order="F"):
 82 |     """
 83 |     will set the order of garray in place
 84 |     """
 85 |     if order=="F":
 86 |         if garray.flags.f_contiguous:
 87 |             exit
 88 |         else:
 89 |             garray.strides = gpuarray._f_contiguous_strides(
 90 |                 garray.dtype.itemsize, garray.shape)
 91 |             garray.flags.f_contiguous = True
 92 |             garray.flags.c_contiguous = False
 93 |     elif order=="C":
 94 |         if garray.flags.c_contiguous:
 95 |             exit
 96 |         else:
 97 |             garray.strides = gpuarray._c_contiguous_strides(
 98 |                 garray.dtype.itemsize, garray.shape)
 99 |             garray.flags.c_contiguous = True
100 |             garray.flags.f_contiguous = False
101 |             
102 | 
103 | 
104 | _dev_attr = drv.device_attribute
105 | ## TO DO: should be different for each device .. assumes they are the same
106 | class DeviceInfo(object):
107 | 
108 |     def __init__(self):
109 |         #self._dev = pycuda.autoinit.device
110 |         #self._dev = drv.Device(dev)
111 |         self._dev = drv.Context.get_device()
112 |         self._attr = self._dev.get_attributes()
113 | 
114 |         self.max_block_threads = self._attr[_dev_attr.MAX_THREADS_PER_BLOCK]
115 |         self.shared_mem = self._attr[_dev_attr.MAX_SHARED_MEMORY_PER_BLOCK]
116 |         self.warp_size = self._attr[_dev_attr.WARP_SIZE]
117 |         self.max_registers = self._attr[_dev_attr.MAX_REGISTERS_PER_BLOCK]
118 |         self.compute_cap = self._dev.compute_capability()
119 |         self.max_grid_dim = (self._attr[_dev_attr.MAX_GRID_DIM_X],
120 |                              self._attr[_dev_attr.MAX_GRID_DIM_Y])
121 | 
122 | info = DeviceInfo()
123 | 
124 | HALF_WARP = 16
125 | 
126 | def random_cov(dim):
127 |     from pymc.distributions import rwishart
128 |     return LA.inv(rwishart(dim, np.eye(dim)))
129 | 
130 | def unvech(v):
131 |     # quadratic formula, correct fp error
132 |     rows = .5 * (-1 + np.sqrt(1 + 8 * len(v)))
133 |     rows = int(np.round(rows))
134 | 
135 |     result = np.zeros((rows, rows))
136 |     result[np.triu_indices(rows)] = v
137 |     result = result + result.T
138 | 
139 |     # divide diagonal elements by 2
140 |     result[np.diag_indices(rows)] /= 2
141 | 
142 |     return result
143 | 
144 | def pad_data_mult16(data, fill=0):
145 |     """
146 |     Pad data to be a multiple of 16 for discrete sampler.
147 |     """
148 | 
149 |     if type(data) == gpuarray:
150 |         data = data.get()
151 | 
152 |     n, k = data.shape
153 | 
154 |     km = int(k/16) + 1
155 | 
156 |     newk = km*16
157 |     if newk != k:
158 |         padded_data = np.zeros((n, newk), dtype=np.float32)
159 |         if fill!=0:
160 |             padded_data = padded_data + fill
161 | 
162 |         padded_data[:,:k] = data
163 | 
164 |         return padded_data
165 |     else:
166 |         return prep_ndarray(data)
167 | 
168 | def pad_data(data):
169 |     """
170 |     Pad data to avoid bank conflicts on the GPU-- dimension should not be a
171 |     multiple of the half-warp size (16)
172 |     """
173 |     if type(data) == gpuarray:
174 |         data = data.get()
175 | 
176 |     n, k = data.shape
177 | 
178 |     if not k % HALF_WARP:
179 |         pad_dim = k + 1
180 |     else:
181 |         pad_dim = k
182 | 
183 |     if k != pad_dim:
184 |         padded_data = np.empty((n, pad_dim), dtype=np.float32)
185 |         padded_data[:, :k] = data
186 | 
187 |         return padded_data
188 |     else:
189 |         return prep_ndarray(data)
190 | 
191 | def prep_ndarray(arr):
192 |     # is float32 and contiguous?
193 |     if not arr.dtype == np.float32 or not arr.flags.contiguous:
194 |         arr = np.array(arr, dtype=np.float32, order='C')
195 | 
196 |     return arr
197 | 
198 | 
199 | 
200 | 
201 | def tune_blocksize(data, params, func_regs):
202 |     """
203 |     For multivariate distributions-- what's the optimal block size given the
204 |     gpu?
205 | 
206 |     Parameters
207 |     ----------
208 |     data : ndarray
209 |     params : ndarray
210 | 
211 |     Returns
212 |     -------
213 |     (data_per, params_per) : (int, int)
214 |     """
215 |     #info = DeviceInfo()
216 | 
217 |     max_smem = info.shared_mem * 0.9
218 |     max_threads = int(info.max_block_threads * 0.5)
219 |     max_regs = info.max_registers
220 |     max_grid = int(info.max_grid_dim[0])
221 | 
222 |     params_per = 64#max_threads
223 |     if (len(params) < params_per):
224 |         params_per = _next_pow2(len(params), info.max_block_threads)
225 | 
226 |     min_data_per = data.shape[0] / max_grid;
227 |     data_per0 = _next_pow2( max( max_threads / params_per, min_data_per ), 512);
228 |     data_per = data_per0
229 | 
230 |     def _can_fit(data_per, params_per):
231 |         ok = compute_shmem(data, params, data_per, params_per) <= max_smem
232 |         ok = ok and data_per*params_per <= max_threads
233 |         return ok and func_regs*data_per*params_per <= max_regs
234 | 
235 |     while True:
236 |         while not _can_fit(data_per, params_per):
237 |             if data_per <= min_data_per:
238 |                 break
239 | 
240 |             if params_per > 1:
241 |                 # reduce number of parameters first
242 |                 params_per /= 2
243 |             else:
244 |                 # can't go any further, have to do less data
245 |                 data_per /= 2
246 | 
247 |         if data_per <= min_data_per:
248 |             # we failed somehow. start over
249 |             data_per = 2 * data_per0
250 |             params_per /= 2
251 |             continue
252 |         else:
253 |             break
254 | 
255 |     while _can_fit(2 * data_per, params_per):
256 |         #if 2 * data_per * params_per < max_threads:
257 |             data_per *= 2
258 |         #else:
259 |             # hit block size limit
260 |         #    break
261 | 
262 |     #import pdb; pdb.set_trace()
263 |     return data_per, params_per
264 | 
265 | def get_boxes(n, box_size):
266 |     # how many boxes of size box_size are needed to hold n things
267 |     return int((n + box_size - 1) / box_size)
268 | 
269 | def compute_shmem(data, params, data_per, params_per):
270 |     result_space = data_per * params_per
271 | 
272 |     data_dim = 1 if len(data.shape) == 1 else data.shape[1]
273 |     params_dim = len(params) if len(params.shape) == 1 else params.shape[1]
274 | 
275 |     param_space = params_dim * params_per
276 |     data_space = data_dim * data_per
277 |     return 4 * (result_space + param_space + data_space)
278 | 
279 | def _next_pow2(k, pow2):
280 |     while k <= pow2 / 2:
281 |         pow2 /= 2
282 |     return pow2
283 | 
284 | def next_multiple(k, mult):
285 |     if k % mult:
286 |         return k + (mult - k % mult)
287 |     else:
288 |         return k
289 | 
290 | def get_cufiles_path():
291 |     import os.path as pth
292 |     basepath = pth.abspath(pth.split(__file__)[0])
293 |     return pth.join(basepath, 'cufiles')
294 | 
295 | 
296 | from pycuda.tools import context_dependent_memoize
297 | 
298 | @context_dependent_memoize
299 | def _get_transpose_kernel():
300 | 
301 |     #info = DeviceInfo()
302 |     if info.max_block_threads >= 1024:
303 |         t_block_size = 32
304 |     else:
305 |         t_block_size = 16
306 | 
307 |     import os.path as pth
308 |     mod = SourceModule( 
309 |         open(pth.join(get_cufiles_path(), "transpose.cu")).read() % { "block_size" : t_block_size })
310 | 
311 |     func = mod.get_function("transpose")
312 |     func.prepare("PPii") #, block=(t_block_size, t_block_size, 1))
313 |     return t_block_size, func
314 |     
315 | 
316 |     #from pytools import Record
317 |     #class TransposeKernelInfo(Record): pass
318 |     #return TransposeKernelInfo(func=func, 
319 |     #                           block_size=t_block_size,
320 |     #                           granularity=t_block_size)
321 |     
322 | 
323 | def _transpose(tgt, src):
324 |     block_size, func = _get_transpose_kernel()
325 |     
326 | 
327 |     h, w = src.shape
328 |     assert tgt.shape == (w, h)
329 |     #assert w % block_size == 0
330 |     #assert h % block_size == 0
331 |     
332 |     gw = int(np.ceil(float(w) / block_size))
333 |     gh = int(np.ceil(float(h) / block_size))
334 |     gz = int(1)
335 | 
336 |     ### 3D grids are needed for larger data ... should be comming soon ...
337 |     #while gw > info.max_grid_dim[0]:
338 |     #    gz += 1
339 |     #    gw = int(np.ceil(float(w) / (gz * block_size) ))
340 | 
341 |     func.prepared_call(
342 |         (gw, gh),
343 |         (block_size, block_size, 1),
344 |         tgt.gpudata, src.gpudata, w, h)
345 | 
346 | 
347 | def transpose(src):
348 |     h, w = src.shape
349 | 
350 |     result = gpuarray.empty((w, h), dtype=src.dtype)
351 |     _transpose(result, src)
352 |     del src
353 |     return result
354 | 


--------------------------------------------------------------------------------