├── .gitignore ├── DESCRIPTION ├── INSTALL ├── NAMESPACE ├── NEWS ├── R ├── chooseGpu.R ├── getGpuId.R ├── gpuCor.R ├── gpuGranger.R ├── gpuHclust.R ├── gpuLm.R ├── gpuMatMult.R ├── gpuMi.R ├── gpuQr.R ├── gpuSolve.R ├── hooks.R └── zzz.R ├── cleanup ├── configure ├── configure.ac ├── inst └── cuda │ ├── correlation.cu │ ├── distance.cu │ ├── granger.cu │ ├── hcluster.cu │ ├── kendall.cu │ ├── mi.cu │ └── qrdecomp.cu ├── man ├── chooseGpu.Rd ├── cpuMatMult.Rd ├── getGpuId.Rd ├── gpuCor.Rd ├── gpuCrossprod.Rd ├── gpuDist.Rd ├── gpuDistClust.Rd ├── gpuGlm.Rd ├── gpuGranger.Rd ├── gpuHclust.Rd ├── gpuLm.Rd ├── gpuLm.defaultTol.Rd ├── gpuLm.fit.Rd ├── gpuLsfit.Rd ├── gpuMatMult.Rd ├── gpuMi.Rd ├── gpuQr.Rd ├── gpuSolve.Rd ├── gpuTcrossprod.Rd └── gpuTtest.Rd ├── src ├── Makevars.in ├── correlation.cpp ├── correlation.h ├── cudaUtils.cpp ├── cudaUtils.h ├── cuseful.cpp ├── cuseful.h ├── distance.cpp ├── distance.h ├── granger.cpp ├── granger.h ├── hcluster.cpp ├── hcluster.h ├── kendall.cpp ├── kendall.h ├── lsfit.cpp ├── lsfit.h ├── matmult.cpp ├── matmult.h ├── mi.cpp ├── mi.h ├── qrdecomp.cpp ├── qrdecomp.h ├── rinterface.cpp ├── rinterface.h ├── sort.cpp └── sort.h └── tools ├── config.guess ├── config.sub ├── install-sh ├── mdate-sh └── missing /.gitignore: -------------------------------------------------------------------------------- 1 | autom4te.cache 2 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: gputools 2 | Version: 1.2 3 | Title: A Few GPU Enabled Functions 4 | Authors@R: c(person("Josh", "Buckner", email = "nullsatz@gmail.com", 5 | role = c("aut", "cre")), 6 | person("Mark", "Seligman", email = "mseligman@suiji.org", role = "aut"), 7 | person("Fan", "Meng", email = "mengf@umich.edu", role = "aut"), 8 | person("Justin", "Wilson", role = "ctb")) 9 | Depends: R (>= 3.1.2) 10 | SystemRequirements: Nvidia's CUDA 
toolkit (>= release 7.0) 11 | Description: Provides R interfaces to a handful of common 12 | functions implemented using the Nvidia CUDA toolkit. Some of the 13 | functions require at least GPU Compute Capability 1.3. 14 | Thanks to Craig Stark at UC Irvine for donating time on his lab's Mac. 15 | License: GPL-3 16 | URL: https://github.com/nullsatz/gputools/wiki 17 | BugReports: https://github.com/nullsatz/gputools/issues 18 | Repository: CRAN 19 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | gputools 2 | 3 | ========= 4 | 5 | To install the package, you must first install Nvidia's CUDA Toolkit available from 6 | 7 | http://developer.nvidia.com/cuda-downloads 8 | 9 | You will need an Nvidia GPU of compute capability >= 2. 10 | 11 | ========= 12 | 13 | Installation 14 | 15 | If the package installation fails with an error: 16 | 17 | Set the environment variable CUDA_HOME to the root of your CUDA toolkit installation. 18 | 19 | If that doesn't help, there are several 'configure options' that may be set from the command line: 20 | 21 | R CMD INSTALL --configure-args="" gputools_0.5.tar.gz 22 | 23 | include: 24 | --with-nvcc= 25 | --with-cuda= 26 | --with-r= 27 | --with-r-include= 28 | --with-r-lib= 29 | 30 | For example: 31 | R CMD INSTALL --configure-args="--with-nvcc=/usr/local/cuda/bin/nvcc --with-r-lib=/usr/local/share/R/lib64" gputools_0.5.tar.gz 32 | 33 | Enjoy! 
34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(gputools) 2 | 3 | export(gpuCor, 4 | gpuTtest, 5 | gpuQr, gpuSolve, 6 | gpuGranger, 7 | gpuDist, gpuHclust, gpuDistClust, 8 | gpuMatMult, cpuMatMult, gpuCrossprod, gpuTcrossprod, 9 | gpuMi, 10 | gpuLm, gpuLm.fit, gpuLsfit, gpuGlm, gpuGlm.fit, 11 | chooseGpu, getGpuId) 12 | 13 | importFrom("stats", 14 | ".getXlevels", 15 | "as.dist", 16 | "complete.cases", 17 | "gaussian", 18 | "is.empty.model", 19 | "model.extract", 20 | "model.matrix", 21 | "model.offset", 22 | "model.response", 23 | "model.weights") 24 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | gputools changes 2 | ================= 3 | 4 | changes in gputools 1.1 5 | - this is a point release intended to trial cuda run-time compilation (nvrtc) 6 | it is only used for the kendall correlation option of gpuCor 7 | but nvrtc will be used for all cuda kernel code in future releases 8 | 9 | changes in gputools 0.5 10 | - switched back to autotools options 11 | 12 | changes in gputools 0.28 13 | - updated installation procedure 14 | - updated OSX compatibility 15 | - removed CULA dependencies 16 | 17 | changes in gputools 0.27 18 | - removed LICENSE file and updated DESCRIPTION file 19 | - fixed R documentation files 20 | - removed nonfunctional svm code 21 | 22 | changes in gputools 0.26 23 | - fixes to several potential bugs: 24 | There was a variable in the QR code which was never 25 | deallocated. Also, there were several variables freed with 26 | the wrong method. Also, added a missing cublasInit/Shutdown pair. 27 | Thanks to Eunjin Lee's error reporting! 
28 | 29 | changes in gputools 0.24 30 | 31 | - added support for configure argument --with-compiler-bindir and 32 | environment variable COMPILER_BINDIR to support gcc located somewhere 33 | other than the user's path. Use one of these to get the package 34 | installed in flavors of Linux that ship with default gcc version > 4.4. 35 | 36 | changes in gputools 0.24 37 | 38 | - A couple of changes to the Makefile: 39 | * added -I. for compatibility with R 2.12 40 | * switched from CC to NVCC for compat with ~/.R/Makevars 41 | 42 | changes in gputools 0.21 43 | 44 | - fixed another bug in gpuMi 45 | 46 | changes in gputools 0.2 47 | 48 | - added some linear and nonlinear regression functions 49 | 50 | - added some configure args --with-r-lib and --with-r-include (see INSTALL 51 | notes section) 52 | 53 | - fixed a bug in gpuMi 54 | 55 | - added a demo of CULA use: fastICA with CULA's svd plugged in. 56 | 57 | changes in gputools 0.1.3 58 | 59 | - Mac os x and nvidia cards with compute capability less than 1.3 are now 60 | supported. Systems lacking a gpu altogether also receive some support. See the 61 | file INSTALL for details. 62 | 63 | - gpuHclust, gpuDist, and gpuQr now output the same classes as R's base hclust, 64 | dist, and qr functions. 65 | 66 | - The c code now uses r's api. Host memory allocation and error reporting 67 | have received a facelift. 68 | 69 | - gpuGranger now reports the correct p-values and is slightly faster for small 70 | data sets. 71 | 72 | - Some bugs with gpuCor's pearson method have been fixed. 
73 | -------------------------------------------------------------------------------- /R/chooseGpu.R: -------------------------------------------------------------------------------- 1 | chooseGpu <- function(deviceId = 0) 2 | { 3 | deviceId <- as.integer(deviceId) 4 | .C("rsetDevice", deviceId, PACKAGE='gputools') 5 | } 6 | -------------------------------------------------------------------------------- /R/getGpuId.R: -------------------------------------------------------------------------------- 1 | getGpuId <- function() 2 | { 3 | deviceId <- .C("rgetDevice", deviceId = integer(1), 4 | PACKAGE='gputools')$deviceId 5 | return(deviceId) 6 | } 7 | -------------------------------------------------------------------------------- /R/gpuCor.R: -------------------------------------------------------------------------------- 1 | gpuCor <- function(x, y = NULL, use = "everything", method = "pearson") { 2 | x <- as.matrix(x) 3 | nx <- ncol(x) 4 | size <- nrow(x) 5 | 6 | if(is.null(y)) { 7 | y <- x 8 | } else { 9 | y <- as.matrix(y) 10 | } 11 | ny <- ncol(y) 12 | 13 | n <- nx * ny 14 | 15 | methods <- c("pearson", "kendall") 16 | method <- pmatch(method, methods, -1) 17 | if(is.na(method)) { 18 | stop("invalid correlation method") 19 | } 20 | if(method == -1) { 21 | stop("ambiguous correlation method") 22 | } 23 | 24 | uses <- c("everything", "pairwise.complete.obs") 25 | use <- pmatch(use, uses, -1) 26 | if(is.na(use)) { 27 | stop("invalid correlation method") 28 | } 29 | if(use == -1) { 30 | stop("ambiguous correlation method") 31 | } 32 | 33 | if(methods[method] == "pearson") { 34 | answer <- .C("rpmcc", NAOK=TRUE, 35 | as.integer(use - 1), as.single(x), as.integer(nx), 36 | as.single(y), as.integer(ny), as.integer(size), 37 | pairs = single(n), corr = single(n), ts = single(n), 38 | PACKAGE='gputools') 39 | 40 | pairs <- t(matrix(answer$pairs, ny, nx)) 41 | corr <- t(matrix(answer$corr, ny, nx)) 42 | ts <- t(matrix(answer$ts, ny, nx)) 43 | 44 | return(list(coefficients 
= corr, ts = ts, pairs = pairs)) 45 | 46 | } else if(methods[method] == "kendall") { 47 | 48 | if(uses[use] != "everything") { 49 | warning("NA handling for Kendall's is not yet supported. Defaulting to using everything. Sorry for any inconvenience.") 50 | } 51 | 52 | a <- .C("RgpuKendall", 53 | as.single(x), nx, as.single(y), ny, 54 | size, result = double(nx*ny), 55 | PACKAGE = "gputools") 56 | 57 | pairs <- matrix(size, nx, ny) 58 | return(list(coefficients = matrix(a$result, nx, ny), pairs = pairs)) 59 | } else { 60 | stop("This correlation method is not yet supported.") 61 | } 62 | } 63 | 64 | gpuTtest <- function(goodPairs, coeffs) { 65 | goodPairs <- as.single(goodPairs) 66 | coeffs <- as.single(coeffs) 67 | 68 | n <- as.integer(length(goodPairs)) 69 | 70 | .C("rtestT", NAOK = TRUE, 71 | goodPairs, coeffs, n, 72 | results = single(n), 73 | PACKAGE = 'gputools')$results 74 | } 75 | -------------------------------------------------------------------------------- /R/gpuGranger.R: -------------------------------------------------------------------------------- 1 | gpuGranger <- function(x, y=NULL, lag) 2 | { 3 | x <- as.matrix(x) 4 | rows <- nrow(x) 5 | colsx <- ncol(x) 6 | 7 | if(rows - lag <= 2 * lag + 1) { 8 | stop("time sequence too short for lag: use longer sequences or smaller lag") 9 | } 10 | 11 | if(!is.null(y)) { 12 | y <- as.matrix(y) 13 | } 14 | 15 | lag <- as.integer(lag) 16 | 17 | if(is.null(y)) { 18 | colsy <- colsx 19 | cRetVal <- .C("rgpuGranger", 20 | as.integer(rows), as.integer(colsx), as.single(x), lag, 21 | fStats = single(colsx*colsy), 22 | pValues = single(colsx*colsy), 23 | PACKAGE='gputools') 24 | } else { 25 | colsy <- ncol(y) 26 | cRetVal <- .C("rgpuGrangerXY", 27 | as.integer(rows), as.integer(colsx), as.single(x), 28 | as.integer(colsy), as.single(y), lag, 29 | fStats = single(colsx*colsy), 30 | pValues = single(colsx*colsy), 31 | PACKAGE='gputools') 32 | } 33 | fStats <- matrix(cRetVal$fStats, colsx, colsy) 34 | pValues <- 
matrix(cRetVal$pValues, colsx, colsy) 35 | return(list(fStatistics = fStats, pValues = pValues)) 36 | } 37 | -------------------------------------------------------------------------------- /R/gpuHclust.R: -------------------------------------------------------------------------------- 1 | gpuDist <- function(points, method = "euclidean", p = 2.0) 2 | { 3 | if(!is.na(pmatch(method, "euclidian"))) { 4 | method <- "euclidean" 5 | } 6 | 7 | methods <- c("euclidean", "maximum", "manhattan", "canberra", "binary", 8 | "minkowski") 9 | method <- pmatch(method, methods) # hey presto method becomes an int 10 | if(is.na(method)) { 11 | stop("invalid distance method") 12 | } 13 | if(method == -1) { 14 | stop("ambiguous distance method") 15 | } 16 | method <- methods[method] # return method to a meaningful string 17 | 18 | points <- as.matrix(points) 19 | numPoints <- nrow(points) 20 | 21 | a <- .C("Rdistances", 22 | as.single(t(points)), 23 | as.integer(numPoints), 24 | as.integer(ncol(points)), 25 | d = single(numPoints * numPoints), 26 | method, as.single(p), 27 | PACKAGE='gputools') 28 | 29 | d <- as.dist(matrix(a$d, numPoints, numPoints)) 30 | attr(d, "Labels") <- dimnames(points)[[1L]] 31 | attr(d, "method") <- method 32 | attr(d, "call") <- match.call() 33 | 34 | if(!is.na(pmatch(method, "minkowski"))) { 35 | attr(d, "p") <- p 36 | } 37 | 38 | return(d) 39 | } 40 | 41 | gpuHclust <- function(distances, method = "complete") 42 | { 43 | methods <- c("ward", "single", "complete", "average", "mcquitty", 44 | "median", "centroid", "flexible", "flexible group", "wpgma") 45 | method <- pmatch(method, methods) # method is now an integer 46 | if(is.na(method)) { 47 | stop("invalid clustering method") 48 | } 49 | if(method == -1) { 50 | stop("ambiguous clustering method") 51 | } 52 | method <- methods[method] # return method to a meaningful string 53 | 54 | n <- as.integer(attr(distances, "Size")) 55 | if(is.null(n)) { 56 | stop("invalid dissimilarities") 57 | } 58 | if(n < 2) { 
59 | stop("must have n >= 2 objects to cluster") 60 | } 61 | 62 | len <- as.integer(n*(n-1)/2) 63 | if(length(distances) != len) { 64 | if (length(distances) < len) { 65 | stop("dissimilarities of improper length") 66 | } else { 67 | warning("dissimilarities of improper length") 68 | } 69 | } 70 | 71 | numpoints <- n 72 | a <- .C("Rhcluster", 73 | as.single(as.matrix(distances)), 74 | as.integer(numpoints), 75 | merge = integer(2*(numpoints-1)), 76 | order = integer(numpoints), 77 | val = single(numpoints-1), 78 | method, 79 | PACKAGE='gputools') 80 | 81 | merge <- matrix(a$merge, numpoints-1, 2) 82 | 83 | tree <- list(merge = merge, height= a$val, order = a$order, 84 | labels = attr(distances, "Labels"), 85 | method = method, 86 | call = match.call(), 87 | dist.method = attr(distances, "method")) 88 | 89 | class(tree) <- "hclust" 90 | return(tree) 91 | } 92 | 93 | gpuDistClust <- function(points, distmethod = "euclidean", 94 | clustmethod = "complete") 95 | { 96 | if(!is.na(pmatch(distmethod, "euclidian"))) { 97 | method <- "euclidean" 98 | } 99 | 100 | methods <- c("euclidean", "maximum", "manhattan", "canberra", "binary", 101 | "minkowski") 102 | distmethod <- pmatch(distmethod, methods) # hey presto method becomes an int 103 | if(is.na(distmethod)) { 104 | stop("invalid distance method") 105 | } 106 | if(distmethod == -1) { 107 | stop("ambiguous distance method") 108 | } 109 | distmethod <- methods[distmethod] # return method to a meaningful string 110 | 111 | methods <- c("ward", "single", "complete", "average", "mcquitty", 112 | "median", "centroid", "flexible", "flexible group", "wpgma") 113 | clustmethod <- pmatch(clustmethod, methods) # method is now an integer 114 | if(is.na(clustmethod)) { 115 | stop("invalid clustering method") 116 | } 117 | if(clustmethod == -1) { 118 | stop("ambiguous clustering method") 119 | } 120 | clustmethod <- methods[clustmethod] # return method to a meaningful string 121 | 122 | points <- as.matrix(points) 123 | nump <- 
nrow(points) 124 | 125 | a <- .C("Rdistclust", 126 | distmethod, clustmethod, 127 | as.single(t(points)), 128 | as.integer(nump), 129 | as.integer(ncol(points)), 130 | merge = integer(2*(nump-1)), 131 | order = integer(nump), 132 | val = single(nump-1), 133 | PACKAGE='gputools') 134 | 135 | merge <- matrix(a$merge, nump-1, 2) 136 | 137 | tree <- list(merge = merge, height = a$val, order = a$order, 138 | labels = dimnames(points)[[1L]], 139 | method = clustmethod, 140 | call = match.call(), 141 | dist.method = distmethod) 142 | 143 | class(tree) <- "hclust" 144 | 145 | return(tree) 146 | } 147 | -------------------------------------------------------------------------------- /R/gpuMatMult.R: -------------------------------------------------------------------------------- 1 | gpuMatMult <- function(a, b) { 2 | a <- as.matrix(a) 3 | b <- as.matrix(b) 4 | 5 | if (ncol(a) != nrow(b)) 6 | stop("error: matrix dimensions mismatched for matrix multiplication") 7 | 8 | .Call("gpuMatMult", a, b, PACKAGE='gputools') 9 | } 10 | 11 | cpuMatMult <- function(a, b) { 12 | a <- as.matrix(a) 13 | b <- as.matrix(b) 14 | 15 | if (ncol(a) != nrow(b)) 16 | stop("error: matrix dimensions mismatched for matrix multiplication") 17 | 18 | a %*% b 19 | } 20 | 21 | gpuCrossprod <- function(a, b=NULL) { 22 | a <- as.matrix(a) 23 | 24 | if (is.null(b)) b <- as.matrix(a) 25 | else b <- as.matrix(b) 26 | 27 | if (nrow(a) != nrow(b)) 28 | stop("error: matrix dim mismatch for cross-product.") 29 | 30 | results <- .Call("gpuMatMult", t(a), b, PACKAGE='gputools') 31 | return(results) 32 | } 33 | 34 | 35 | gpuTcrossprod <- function(a, b=NULL) 36 | { 37 | a <- as.matrix(a) 38 | 39 | if (is.null(b)) b <- as.matrix(a) 40 | else b <- as.matrix(b) 41 | 42 | if (ncol(a) != ncol(b)) 43 | stop("error: matrix dim mismatch for transposed cross-product") 44 | 45 | results <- .Call("gpuMatMult", a,t(b), PACKAGE='gputools') 46 | return(results) 47 | } 48 | 
-------------------------------------------------------------------------------- /R/gpuMi.R: -------------------------------------------------------------------------------- 1 | gpuMi <- function(x, y = NULL, bins = 2, splineOrder = 1) 2 | { 3 | x <- as.matrix(x) 4 | if(!is.null(y)) { 5 | y <- as.matrix(y) 6 | } 7 | 8 | bins <- as.integer(bins) 9 | splineOrder <- as.integer(splineOrder) 10 | 11 | nsamples <- as.integer(nrow(x)) 12 | na <- as.integer(ncol(x)) 13 | a <- as.single(x) 14 | 15 | b <- a 16 | nb <- na 17 | row_labels <- colnames(x) 18 | 19 | if(is.null(y)) { 20 | nb <- as.integer(ncol(x)) 21 | b <- as.single(x) 22 | row_labels <- colnames(x) 23 | } else { 24 | nb <- as.integer(ncol(y)) 25 | b <- as.single(y) 26 | row_labels <- colnames(y) 27 | } 28 | 29 | mi <- .C("rBSplineMutualInfo", bins, splineOrder, nsamples, na, a, 30 | nb, b, mi = single(nb * na), 31 | PACKAGE='gputools')$mi 32 | mi <- matrix(mi, nb, na) 33 | rownames(mi) <- row_labels 34 | colnames(mi) <- colnames(x) 35 | return(mi) 36 | } 37 | -------------------------------------------------------------------------------- /R/gpuQr.R: -------------------------------------------------------------------------------- 1 | gpuQr <- function(x, tol = 1e-07) { 2 | 3 | x <- as.matrix(x) 4 | if(is.complex(x)) { 5 | stop("complex gpuQR not yet supported") 6 | } 7 | 8 | n <- nrow(x) 9 | p <- ncol(x) 10 | 11 | mode(x) <- 'single' 12 | 13 | res <- .C("rGetQRDecompRR", 14 | as.integer(n), 15 | as.integer(p), 16 | as.double(tol), 17 | qr = x, 18 | pivot = as.integer(0L:(p-1)), 19 | qraux = double(p), 20 | rank = integer(1L), 21 | PACKAGE='gputools' 22 | )[c('qr', 'pivot', 'qraux', 'rank')] 23 | 24 | res$pivot <- res$pivot + 1 25 | 26 | if(!is.null(cn <- colnames(x))) 27 | colnames(res$qr) <- cn[res$pivot] 28 | 29 | class(res) <- "qr" 30 | res 31 | } 32 | -------------------------------------------------------------------------------- /R/gpuSolve.R: 
-------------------------------------------------------------------------------- 1 | gpuSolve <- function(x, y=NULL) { 2 | x <- as.matrix(x) 3 | if(is.complex(x)) { 4 | stop("complex gpuSolve not yet supported") 5 | } 6 | 7 | n <- nrow(x) 8 | p <- ncol(x) 9 | if(p > n) { 10 | stop("x represents an underdetermined system") 11 | } 12 | 13 | x.qr <- qr(x) 14 | x.q <- qr.Q(x.qr) 15 | x.r <- qr.R(x.qr) 16 | 17 | if(is.null(y)) { 18 | myCall <- .C("rGetInverseFromQR", 19 | as.integer(n), as.integer(p), 20 | as.single(x.q), as.single(x.r), 21 | inverse = single(n * p), 22 | PACKAGE='gputools' 23 | ) 24 | x.inverse <- matrix(myCall$inverse, p, n) 25 | return(x.inverse) 26 | } else { 27 | y <- as.single(y) 28 | if(length(y) != n) { 29 | stop("y must have length nrows(x)") 30 | } 31 | myCall <- .C("rSolveFromQR", 32 | as.integer(n), as.integer(p), 33 | as.single(x.q), as.single(x.r), 34 | y, solution = single(p), 35 | PACKAGE='gputools' 36 | ) 37 | return(myCall$solution) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /R/hooks.R: -------------------------------------------------------------------------------- 1 | .onLoad<-function(libname, pkgname){ 2 | e<-emptyenv() 3 | library.dynam('gputools', pkgname, libname) 4 | reg.finalizer(e,function(...){unloadNamespace(pkgname)}, onexit=T) 5 | } 6 | 7 | .onUnload<-function(libpath){ 8 | library.dynam.unload('gputools', libpath) 9 | } 10 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(lib, pkg) { 2 | cuFileNames <- 3 | c( "correlation.cu" 4 | , "distance.cu" 5 | , "granger.cu" 6 | , "hcluster.cu" 7 | , "kendall.cu" 8 | , "mi.cu" 9 | , "qrdecomp.cu" 10 | ) 11 | 12 | cuFileNames <- 13 | sapply(cuFileNames, 14 | function(fn) { 15 | system.file('cuda', fn, package = 'gputools') 16 | }) 17 | 18 | cuSrc <- 19 | sapply(cuFileNames, 20 | 
function(fn) { 21 | readChar(fn, file.info(fn)$size) 22 | }) 23 | 24 | cuFiles <- 25 | c( "correlation" 26 | , "distance" 27 | , "granger" 28 | , "hcluster" 29 | , "kendall" 30 | , "mi" 31 | , "qrdecomp" 32 | ) 33 | 34 | result <- 35 | .C("cuCompile", 36 | length(cuFiles), 37 | cuFiles, 38 | cuSrc) 39 | } 40 | 41 | .unLoad <- function(lib, pkg) { 42 | result <- .C("unloadPackage") 43 | } 44 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm -f src/*o 4 | rm -f src/Makefile 5 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([gputools], 1.2) dnl package name, version 2 | AC_CONFIG_AUX_DIR([tools]) 3 | 4 | dnl Select an optional include path, from a configure option 5 | dnl or from an environment variable. 6 | AC_ARG_WITH([cuda-include], 7 | AC_HELP_STRING([--with-cuda-include=INCLUDE_PATH], 8 | [the location of CUDA header files]), 9 | [cuda_include_path=$withval]) 10 | CUDA_CPPFLAGS="-I." 11 | if test [ -n "${cuda_include_path}" ] ; then 12 | CUDA_CPPFLAGS="-I. -I${cuda_include_path}" 13 | else 14 | if test [ -n "${CUDA_INCLUDE}" ] ; then 15 | CUDA_CPPFLAGS="-I. -I${CUDA_INCLUDE}" 16 | cuda_include_path="${CUDA_INCLUDE}" 17 | else 18 | if test [ -d "/usr/local/cuda/include" ] ; then 19 | CUDA_CPPFLAGS="-I. 
-I/usr/local/cuda/include" 20 | cuda_include_path="/usr/local/cuda/include" 21 | fi 22 | fi 23 | fi 24 | 25 | dnl ditto for a library path 26 | AC_ARG_WITH([cuda-lib], 27 | AC_HELP_STRING([--with-cuda-lib=LIB_PATH], 28 | [the location of CUDA libraries]), 29 | [cuda_lib_path=$withval]) 30 | if test [ -n "${cuda_lib_path}" ] ; then 31 | LIBS="-L${cuda_lib_path} ${LIBS}" 32 | else 33 | if test [ -n "${CUDA_LIBS}" ] ; then 34 | LIBS="-L${CUDA_LIBS} ${LIBS}" 35 | cuda_lib_path="${CUDA_LIBS}" 36 | else 37 | if test [ -d "/usr/local/cuda/lib64" ] ; then 38 | LIBS="-L/usr/local/cuda/lib64 ${LIBS}" 39 | cuda_lib_path="/usr/local/cuda/lib64" 40 | else 41 | if test [ -d "/usr/local/cuda/lib" ] ; then 42 | LIBS="-L/usr/local/cuda/lib ${LIBS}" 43 | cuda_lib_path="/usr/local/cuda/lib" 44 | fi 45 | fi 46 | fi 47 | fi 48 | 49 | dnl Now find the compiler and compiler flags to use 50 | : ${R_HOME=`R RHOME`} 51 | if test -z "${R_HOME}"; then 52 | echo "could not determine R_HOME" 53 | exit 1 54 | fi 55 | CXX=`"${R_HOME}/bin/R" CMD config CXX` 56 | CPP=`"${R_HOME}/bin/R" CMD config CPP` 57 | CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS` 58 | CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS` 59 | AC_PROG_CXX 60 | AC_PROG_CPP 61 | 62 | if test [ -n "${cuda_include_path}" ] ; then 63 | OLD_CPPFLAGS=${CPPFLAGS} 64 | CPPFLAGS="-I${cuda_include_path} ${CPPFLAGS}" 65 | fi 66 | 67 | AC_CHECK_HEADER([cuda.h], , [HAS_CUDA_H=no]) 68 | AC_CHECK_HEADER([nvrtc.h], , [HAS_NVRTC_H=no]) 69 | 70 | if test [ -n "${cuda_include_path}" ] ; then 71 | CPPFLAGS=${OLD_CPPFLAGS} 72 | fi 73 | 74 | if test [ "${HAS_CUDA_H}" = no ] ; then 75 | AC_MSG_ERROR("CUDA header cuda.h not found") 76 | fi 77 | 78 | if test [ "${HAS_NVRTC_H}" = no ] ; then 79 | AC_MSG_ERROR("CUDA header nvrtc.h not found") 80 | fi 81 | 82 | if test [ -n "${cuda_lib_path}" ] ; then 83 | OLD_LDFLAGS=${LDFLAGS} 84 | LDFLAGS="-L${cuda_lib_path} ${LDFLAGS}" 85 | fi 86 | 87 | dnl search for a library containing a CUDA function 88 | 
AC_SEARCH_LIBS([cuGetErrorName], [cuda], , 89 | AC_MSG_ERROR("no cuda driver library found")) 90 | 91 | dnl search for a library containing a CUDA function 92 | AC_SEARCH_LIBS([cudaMalloc], [cudart], , 93 | AC_MSG_ERROR("no cuda runtime api library found")) 94 | 95 | dnl search for a library containing a CUDA function 96 | AC_SEARCH_LIBS([nvrtcCreateProgram], [nvrtc], , 97 | AC_MSG_ERROR("no nvrtc library found")) 98 | 99 | dnl search for a library containing a CUDA function 100 | AC_SEARCH_LIBS([cublasSetMatrix], [cublas], , 101 | AC_MSG_ERROR("no cublas library found")) 102 | 103 | if test [ -n "${cuda_lib_path}" ] ; then 104 | LDFLAGS=${OLD_LDFLAGS} 105 | AC_CANONICAL_HOST 106 | case $host_os in 107 | darwin* ) 108 | LIBS="${LIBS} -Wl,-rpath,${cuda_lib_path}" 109 | ;; 110 | * ) 111 | LIBS="${LIBS} -Wl,-rpath=${cuda_lib_path}" 112 | ;; 113 | esac 114 | fi 115 | 116 | dnl substitute CUDA_CPPFLAGS and LIBS 117 | AC_SUBST(CUDA_CPPFLAGS) 118 | AC_SUBST(LIBS) 119 | dnl and do substitution in the src/Makevars.in 120 | AC_CONFIG_FILES([src/Makevars]) 121 | AC_OUTPUT 122 | -------------------------------------------------------------------------------- /inst/cuda/correlation.cu: -------------------------------------------------------------------------------- 1 | #define FALSE 0 2 | #define TRUE !FALSE 3 | 4 | #define NUMTHREADS 16 5 | #define THREADWORK 32 6 | 7 | __global__ void gpuMeans(const float * vectsA, size_t na, 8 | const float * vectsB, size_t nb, size_t dim, 9 | float * means, float * numPairs) 10 | { 11 | size_t 12 | offset, stride, 13 | bx = blockIdx.x, by = blockIdx.y, tx = threadIdx.x; 14 | float a, b; 15 | 16 | __shared__ float 17 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS], 18 | count[NUMTHREADS]; 19 | 20 | if((bx >= na) || (by >= nb)) 21 | return; 22 | 23 | threadSumsA[tx] = 0.f; 24 | threadSumsB[tx] = 0.f; 25 | count[tx] = 0.f; 26 | 27 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 28 | a = vectsA[bx * dim + offset]; 29 | b = vectsB[by 
* dim + offset]; 30 | if(!(isnan(a) || isnan(b))) { 31 | threadSumsA[tx] += a; 32 | threadSumsB[tx] += b; 33 | count[tx] += 1.f; 34 | } 35 | } 36 | __syncthreads(); 37 | 38 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 39 | if(tx < stride) { 40 | threadSumsA[tx] += threadSumsA[tx + stride]; 41 | threadSumsB[tx] += threadSumsB[tx + stride]; 42 | count[tx] += count[tx+stride]; 43 | } 44 | __syncthreads(); 45 | } 46 | if(tx == 0) { 47 | means[bx*nb*2+by*2] = threadSumsA[0] / count[0]; 48 | means[bx*nb*2+by*2+1] = threadSumsB[0] / count[0]; 49 | numPairs[bx*nb+by] = count[0]; 50 | } 51 | } 52 | 53 | __global__ void gpuSD(const float * vectsA, size_t na, 54 | const float * vectsB, size_t nb, size_t dim, 55 | const float * means, const float * numPairs, float * sds) 56 | { 57 | size_t 58 | offset, stride, 59 | tx = threadIdx.x, 60 | bx = blockIdx.x, by = blockIdx.y; 61 | float 62 | a, b, 63 | termA, termB; 64 | __shared__ float 65 | meanA, meanB, n, 66 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS]; 67 | 68 | if((bx >= na) || (by >= nb)) 69 | return; 70 | 71 | if(tx == 0) { 72 | meanA = means[bx*nb*2+by*2]; 73 | meanB = means[bx*nb*2+by*2+1]; 74 | n = numPairs[bx*nb+by]; 75 | } 76 | __syncthreads(); 77 | 78 | threadSumsA[tx] = 0.f; 79 | threadSumsB[tx] = 0.f; 80 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 81 | a = vectsA[bx * dim + offset]; 82 | b = vectsB[by * dim + offset]; 83 | if(!(isnan(a) || isnan(b))) { 84 | termA = a - meanA; 85 | termB = b - meanB; 86 | threadSumsA[tx] += termA * termA; 87 | threadSumsB[tx] += termB * termB; 88 | } 89 | } 90 | __syncthreads(); 91 | 92 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 93 | if(tx < stride) { 94 | threadSumsA[tx] += threadSumsA[tx + stride]; 95 | threadSumsB[tx] += threadSumsB[tx + stride]; 96 | } 97 | __syncthreads(); 98 | } 99 | if(tx == 0) { 100 | sds[bx*nb*2+by*2] = sqrtf(threadSumsA[0] / (n - 1.f)); 101 | sds[bx*nb*2+by*2+1] = sqrtf(threadSumsB[0] / (n - 1.f)); 102 | } 
103 | } 104 | 105 | __global__ void gpuPMCC(const float * vectsa, size_t na, 106 | const float * vectsb, size_t nb, size_t dim, 107 | const float * numPairs, const float * means, const float * sds, 108 | float * correlations) 109 | { 110 | size_t 111 | offset, stride, 112 | x = blockIdx.x, y = blockIdx.y, 113 | tx = threadIdx.x; 114 | float 115 | a, b, n, scoreA, scoreB; 116 | __shared__ float 117 | meanA, meanB, 118 | sdA, sdB, 119 | threadSums[NUMTHREADS]; 120 | 121 | if((x >= na) || (y >= nb)) 122 | return; 123 | 124 | if(tx == 0) { 125 | meanA = means[x*nb*2+y*2]; 126 | meanB = means[x*nb*2+y*2+1]; 127 | sdA = sds[x*nb*2+y*2]; 128 | sdB = sds[x*nb*2+y*2+1]; 129 | n = numPairs[x*nb+y]; 130 | } 131 | __syncthreads(); 132 | 133 | threadSums[tx] = 0.f; 134 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 135 | a = vectsa[x * dim + offset]; 136 | b = vectsb[y * dim + offset]; 137 | if(!(isnan(a) || isnan(b))) { 138 | scoreA = (a - meanA) / sdA; 139 | scoreB = (b - meanB) / sdB; 140 | threadSums[tx] += scoreA * scoreB; 141 | } 142 | } 143 | __syncthreads(); 144 | 145 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 146 | if(tx < stride) threadSums[tx] += threadSums[tx + stride]; 147 | __syncthreads(); 148 | } 149 | if(tx == 0) correlations[x*nb+y] = threadSums[0] / (n - 1.f); 150 | } 151 | 152 | __global__ void gpuMeansNoTest(const float * vectsA, size_t na, 153 | const float * vectsB, size_t nb, size_t dim, 154 | float * means, float * numPairs) 155 | { 156 | size_t 157 | offset, stride, 158 | bx = blockIdx.x, by = blockIdx.y, tx = threadIdx.x; 159 | float a, b; 160 | 161 | __shared__ float 162 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS], 163 | count[NUMTHREADS]; 164 | 165 | if((bx >= na) || (by >= nb)) 166 | return; 167 | 168 | threadSumsA[tx] = 0.f; 169 | threadSumsB[tx] = 0.f; 170 | count[tx] = 0.f; 171 | 172 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 173 | a = vectsA[bx * dim + offset]; 174 | b = vectsB[by * dim + offset]; 
175 | 176 | threadSumsA[tx] += a; 177 | threadSumsB[tx] += b; 178 | count[tx] += 1.f; 179 | } 180 | __syncthreads(); 181 | 182 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 183 | if(tx < stride) { 184 | threadSumsA[tx] += threadSumsA[tx + stride]; 185 | threadSumsB[tx] += threadSumsB[tx + stride]; 186 | count[tx] += count[tx+stride]; 187 | } 188 | __syncthreads(); 189 | } 190 | if(tx == 0) { 191 | means[bx*nb*2+by*2] = threadSumsA[0] / count[0]; 192 | means[bx*nb*2+by*2+1] = threadSumsB[0] / count[0]; 193 | numPairs[bx*nb+by] = count[0]; 194 | } 195 | } 196 | 197 | __global__ void gpuSDNoTest(const float * vectsA, size_t na, 198 | const float * vectsB, size_t nb, size_t dim, 199 | const float * means, const float * numPairs, float * sds) 200 | { 201 | size_t 202 | offset, stride, 203 | tx = threadIdx.x, 204 | bx = blockIdx.x, by = blockIdx.y; 205 | float 206 | a, b, 207 | termA, termB; 208 | __shared__ float 209 | meanA, meanB, n, 210 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS]; 211 | 212 | if((bx >= na) || (by >= nb)) 213 | return; 214 | 215 | if(tx == 0) { 216 | meanA = means[bx*nb*2+by*2]; 217 | meanB = means[bx*nb*2+by*2+1]; 218 | n = numPairs[bx*nb+by]; 219 | } 220 | __syncthreads(); 221 | 222 | threadSumsA[tx] = 0.f; 223 | threadSumsB[tx] = 0.f; 224 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 225 | a = vectsA[bx * dim + offset]; 226 | b = vectsB[by * dim + offset]; 227 | 228 | termA = a - meanA; 229 | termB = b - meanB; 230 | threadSumsA[tx] += termA * termA; 231 | threadSumsB[tx] += termB * termB; 232 | } 233 | __syncthreads(); 234 | 235 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 236 | if(tx < stride) { 237 | threadSumsA[tx] += threadSumsA[tx + stride]; 238 | threadSumsB[tx] += threadSumsB[tx + stride]; 239 | } 240 | __syncthreads(); 241 | } 242 | if(tx == 0) { 243 | sds[bx*nb*2+by*2] = sqrtf(threadSumsA[0] / (n - 1.f)); 244 | sds[bx*nb*2+by*2+1] = sqrtf(threadSumsB[0] / (n - 1.f)); 245 | } 246 | } 247 | 248 
// Pearson product-moment correlation for each (A-vector, B-vector) pair,
// one pair per (blockIdx.x, blockIdx.y), using precomputed means, sds and
// pair counts.  "NoTest" variant: assumes no missing values.
__global__ void gpuPMCCNoTest(const float * vectsa, size_t na,
	const float * vectsb, size_t nb, size_t dim,
	const float * numPairs, const float * means, const float * sds,
	float * correlations)
{
	size_t
		offset, stride,
		x = blockIdx.x, y = blockIdx.y,
		tx = threadIdx.x;
	float
		a, b, n, scoreA, scoreB;
	__shared__ float
		meanA, meanB,
		sdA, sdB,
		threadSums[NUMTHREADS];

	// Uniform across the block, so the early exit is barrier-safe.
	if((x >= na) || (y >= nb))
		return;

	if(tx == 0) {
		meanA = means[x*nb*2+y*2];
		meanB = means[x*nb*2+y*2+1];
		sdA = sds[x*nb*2+y*2];
		sdB = sds[x*nb*2+y*2+1];
		// n is a register; only thread 0 reads it again below.
		n = numPairs[x*nb+y];
	}
	__syncthreads();

	// Sum of products of z-scores over a strided slice of the dimension.
	threadSums[tx] = 0.f;
	for(offset = tx; offset < dim; offset += NUMTHREADS) {
		a = vectsa[x * dim + offset];
		b = vectsb[y * dim + offset];

		scoreA = (a - meanA) / sdA;
		scoreB = (b - meanB) / sdB;
		threadSums[tx] += scoreA * scoreB;
	}
	__syncthreads();

	for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) {
		if(tx < stride) threadSums[tx] += threadSums[tx + stride];
		__syncthreads();
	}
	// r = sum(zA * zB) / (n - 1)
	if(tx == 0) correlations[x*nb+y] = threadSums[0] / (n - 1.f);
}

// t statistic for each correlation: t = r * sqrt((n - 2) / (1 - r^2)).
// One-dimensional grid; each thread handles THREADWORK consecutive entries.
__global__ void gpuSignif(const float * gpuNumPairs,
	const float * gpuCorrelations, size_t n, float * gpuTScores)
{
	size_t
		i, start,
		bx = blockIdx.x, tx = threadIdx.x;
	float
		radicand, cor, npairs;

	start = bx * NUMTHREADS * THREADWORK + tx * THREADWORK;
	for(i = 0; i < THREADWORK; i++) {
		if(start+i >= n)
			break;

		npairs = gpuNumPairs[start+i];
		cor = gpuCorrelations[start+i];
		radicand = (npairs - 2.f) / (1.f - cor * cor);
		gpuTScores[start+i] = cor * sqrtf(radicand);
	}
}

// Two-tailed significance lookup at alpha = 0.001: returns TRUE when |signif|
// exceeds the critical t value for the given degrees of freedom.
__device__ int dIsSignificant(float signif, int df) {
	float tcutoffs[49] = {
		// cutoffs for degrees of freedom <= 30
		// FIX: df = 3 entry restored to 12.920 (was garbled to 2.920, which
		// broke the strictly decreasing t-table sequence 637 > 31.6 > 12.92 > ...)
		637.000, 31.600, 12.920, 8.610, 6.869, 5.959, 5.408, 5.041, 4.781,
		4.587, 4.437, 4.318, 4.221, 4.140, 4.073, 4.015, 3.965, 3.922,
		3.883, 3.850, 3.819, 3.792, 3.768, 3.745, 3.725, 3.707, 3.690,
		3.674, 3.659, 3.646,
		// cutoffs for even degrees of freedom > 30 but <= 50
		3.622, 3.601, 3.582, 3.566, 3.551, 3.538, 3.526, 3.515, 3.505, 3.496,
		// 55 <= df <= 70 by 5s
		3.476, 3.460, 3.447, 3.435,
		3.416, // 80
		3.390, // 100
		3.357, // 150
		3.340, // 200
		3.290  // > 200
	};

	size_t index = 0;
	if(df <= 0) return 0;
	else if(df <= 30) index = df - 1;
	else if(df <= 50) index = 30 + (df + (df%2) - 32) / 2; // round df up to even
	else if(df <= 70) {
		if(df <= 55) index = 40;
		else if(df <= 60) index = 41;
		else if(df <= 65) index = 42;
		else if(df <= 70) index = 43;
	}
	else if(df <= 80) index = 44;
	else if(df <= 100) index = 45;
	else if(df <= 150) index = 46;
	else if(df <= 200) index = 47;
	else index = 48; // df > 200

	if(fabsf(signif) < tcutoffs[index]) return FALSE;

	return TRUE;
}

// Filter rows of a 5-column result table by t-test significance, expanding
// each surviving row to 6 columns (…, cor, tscore, npairs).  Rejected rows
// are marked with -1 in the first output column.
__global__ void dUpdateSignif(const float * gpuData, size_t n,
	float * gpuResults)
{
	size_t
		i, start, inrow, outrow,
		bx = blockIdx.x, tx = threadIdx.x;
	float
		radicand, cor, npairs, tscore;

	start = bx * NUMTHREADS * THREADWORK + tx * THREADWORK;

	for(i = 0; i < THREADWORK; i++) {
		// FIX: was 'start+i > n', which processed index n and read one row
		// past the end of gpuData (gpuSignif above uses the correct '>=').
		if(start+i >= n) break;

		inrow = (start+i)*5;
		outrow = (start+i)*6;

		cor = gpuData[inrow+3];
		npairs = gpuData[inrow+4];

		// Near-perfect correlation: the radicand blows up, so clamp the
		// t score instead of dividing by ~0.
		if(cor >= 0.999)
			tscore = 10000.0;
		else {
			radicand = (npairs - 2.f) / (1.f - cor * cor);
			tscore = cor * sqrtf(radicand);
		}
		// NOTE(review): df passed is npairs, not npairs - 2 -- confirm intended.
		if(dIsSignificant(tscore, (int)npairs)) {
			gpuResults[outrow] = gpuData[inrow];
			gpuResults[outrow+1] = gpuData[inrow+1];
			gpuResults[outrow+2] = gpuData[inrow+2];
			gpuResults[outrow+3] = cor;
			gpuResults[outrow+4] = tscore;
			gpuResults[outrow+5] = npairs;
		} else {
			gpuResults[outrow] = -1.f;
		}
	}
}
// Column means of a dense (no-NA) matrix `a` (nRows x nCols, column-major).
// blockDim.x columns per block; blockDim.y threads cooperate per column.
__global__ void noNAsPmccMeans(int nRows, int nCols, float * a, float * means)
{
	int
		col = blockDim.x * blockIdx.x + threadIdx.x,
		inOffset = col * nRows,
		outOffset = threadIdx.x * blockDim.y,
		j = outOffset + threadIdx.y;
	float sum = 0.f;

	if(col >= nCols) return;

	__shared__ float threadSums[NUMTHREADS*NUMTHREADS];

	// Strided partial sum down the column.
	for(int i = threadIdx.y; i < nRows; i += blockDim.y)
		sum += a[inOffset + i];

	threadSums[j] = sum;
	__syncthreads();

	// Reduce along the y dimension of the block.
	for(int i = blockDim.y >> 1; i > 0; i >>= 1) {
		if(threadIdx.y < i) {
			threadSums[outOffset+threadIdx.y]
				+= threadSums[outOffset+threadIdx.y + i];
		}
		__syncthreads();
	}
	if(threadIdx.y == 0)
		means[col] = threadSums[outOffset] / (float)nRows;
}
--------------------------------------------------------------------------------
/inst/cuda/granger.cu:
--------------------------------------------------------------------------------
#define max(a, b) ((a > b)?a:b)

#define THREADSPERDIM 16

#define FALSE 0
#define TRUE !FALSE

// Fit the restricted Granger model for each y series (one thread per series):
// QR-decompose the lagged design matrix X via Gram-Schmidt, then solve
// R * B = Q^t * Y by back substitution.
// mX has order rows x cols (column-major); vY holds the response vectors.
__global__ void getRestricted(int countx, int county, int rows, int cols,
	float * mX, int mXdim, float * vY, int vYdim, float * mQ, int mQdim,
	float * mR, int mRdim, float * vectB, int vectBdim) {

	int
		m = blockIdx.x * THREADSPERDIM + threadIdx.x, n,
		i, j, k;
	float
		sum, invnorm,
		* X, * Y, * Q, * R, * B,
		* coli, * colj,
		* colQ, * colX;

	if(m >= county) return;
	// NOTE(review): pairs series m with x-index 0 when m == 1, else 1 --
	// presumably picks any x whose lags share Y's layout; confirm vs caller.
	if(m == 1) n = 0;
	else n = 1;

	X = mX + (m * mXdim);
	// first column of X is the intercept
	for(i = 0; i < rows; i++)
		X[i] = 1.f;

	Y = vY + (m * countx + n) * vYdim;
	B = vectB + m * vectBdim;
	Q = mQ + m * mQdim;
	R = mR + m * mRdim;

	// initialize Q with X
	for(i = 0; i < rows; i++) {
		for(j = 0; j < cols; j++)
			Q[i+j*rows] = X[i+j*rows];
	}

	// Gram-Schmidt orthonormalization of Q's columns
	for(j = 0; j < cols; j++) {
		colj = Q+rows*j;
		for(i = 0; i < j; i++) {
			coli = Q+rows*i;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += coli[k] * colj[k];
			for(k = 0; k < rows; k++)
				colj[k] -= sum * coli[k];
		}
		sum = 0.f;
		for(i = 0; i < rows; i++)
			sum += colj[i] * colj[i];
		invnorm = 1.f / sqrtf(sum);
		for(i = 0; i < rows; i++)
			colj[i] *= invnorm;
	}
	// Build R = Q^t * X row by row (from the bottom up) and back-substitute
	// for B in the same pass.
	for(i = cols-1; i > -1; i--) {
		colQ = Q+i*rows;
		for(j = 0; j < cols; j++) {
			colX = X+j*rows;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += colQ[k] * colX[k];
			R[i+j*cols] = sum;
		}
		sum = 0.f;
		// the vector Q^t * Y
		for(j = 0; j < rows; j++)
			sum += colQ[j] * Y[j];
		// back substitution: solve R x = Q^t Y for row i
		for(j = cols-1; j > i; j--)
			sum -= R[i+j*cols] * B[j];

		B[i] = sum / R[i+i*cols];
	}
}
// Fit the unrestricted Granger model for every (x, y) series pair -- one
// thread per pair -- by the same Gram-Schmidt QR + back substitution used in
// getRestricted.  mX has order rows x cols (column-major).
__global__ void getUnrestricted(int countx, int county, int rows, int cols,
	float * mX, int mXdim, float * vY, int vYdim, float * mQ, int mQdim,
	float * mR, int mRdim, float * vectB, int vectBdim) {

	int
		n = blockIdx.x * THREADSPERDIM + threadIdx.x,
		m = blockIdx.y * THREADSPERDIM + threadIdx.y,
		i, j, k;
	float
		sum, invnorm,
		* X, * Y, * Q, * R, * B,
		* coli, * colj,
		* colQ, * colX;
	if((m >= county) || (n >= countx)) return;

	X = mX + (m * countx + n) * mXdim;
	// first column of X is the intercept
	for(i = 0; i < rows; i++)
		X[i] = 1.f;

	Y = vY + (m*countx+n) * vYdim;
	B = vectB + (m*countx+n) * vectBdim;
	Q = mQ + (m*countx+n) * mQdim;
	R = mR + (m*countx+n) * mRdim;

	// initialize Q with X
	for(i = 0; i < rows; i++) {
		for(j = 0; j < cols; j++)
			Q[i+j*rows] = X[i+j*rows];
	}

	// Gram-Schmidt orthonormalization of Q's columns
	for(j = 0; j < cols; j++) {
		colj = Q+rows*j;
		for(i = 0; i < j; i++) {
			coli = Q+rows*i;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += coli[k] * colj[k];
			for(k = 0; k < rows; k++)
				colj[k] -= sum * coli[k];
		}
		sum = 0.f;
		for(i = 0; i < rows; i++)
			sum += colj[i] * colj[i];
		invnorm = 1.f / sqrtf(sum);
		for(i = 0; i < rows; i++)
			colj[i] *= invnorm;
	}
	// Build R = Q^t * X (bottom-up) and back-substitute for B in one pass.
	for(i = cols-1; i > -1; i--) {
		colQ = Q+i*rows;
		for(j = 0; j < cols; j++) {
			colX = X+j*rows;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += colQ[k] * colX[k];
			R[i+j*cols] = sum;
		}
		sum = 0.f;
		// the vector Q^t * Y
		for(j = 0; j < rows; j++)
			sum += colQ[j] * Y[j];
		// back substitution: solve R x = Q^t Y for row i
		for(j = cols-1; j > i; j--)
			sum -= R[i+j*cols] * B[j];

		B[i] = sum / R[i+i*cols];
	}
}
// Granger F statistic for each (x, y) pair from the residual sums of squares
// of the restricted and unrestricted fits:
//   F = ((rSsq - unrSsq) * (rows - 2p - 1)) / (p * unrSsq)
// Non-finite scores (e.g. unrSsq == 0) are zeroed.  When diagFlag is false
// the diagonal (i == j) is skipped and set to 0.
__global__ void ftest(int diagFlag, int p, int rows, int colsx, int colsy,
	int rCols, int unrCols, float * obs, int obsDim,
	float * rCoeffs, int rCoeffsDim, float * unrCoeffs, int unrCoeffsDim,
	float * rdata, int rdataDim, float * unrdata, int unrdataDim,
	float * dfStats) // float * dpValues)
{
	int
		j = blockIdx.x * THREADSPERDIM + threadIdx.x,
		i = blockIdx.y * THREADSPERDIM + threadIdx.y,
		idx = i*colsx + j, k, m;
	float
		kobs, fp = (float) p, frows = (float) rows,
		rSsq, unrSsq,
		rEst, unrEst,
		score = 0.f,
		* tObs, * tRCoeffs, * tUnrCoeffs,
		* tRdata, * tUnrdata;

	if((i >= colsy) || (j >= colsx)) return;
	if((!diagFlag) && (i == j)) {
		dfStats[idx] = 0.f;
		// dpValues[idx] = 0.f;
		return;
	}

	tObs = obs + (i*colsx+j)*obsDim;

	// restricted model: one fit per y series
	tRCoeffs = rCoeffs + i*rCoeffsDim;
	tRdata = rdata + i*rdataDim;

	// unrestricted model: one fit per (x, y) pair
	tUnrCoeffs = unrCoeffs + (i*colsx+j)*unrCoeffsDim;
	tUnrdata = unrdata + (i*colsx+j)*unrdataDim;

	// Residual sums of squares of both fitted models.
	rSsq = unrSsq = 0.f;
	for(k = 0; k < rows; k++) {
		unrEst = rEst = 0.f;
		kobs = tObs[k];
		for(m = 0; m < rCols; m++)
			rEst += tRCoeffs[m] * tRdata[k+m*rows];
		for(m = 0; m < unrCols; m++)
			unrEst += tUnrCoeffs[m] * tUnrdata[k+m*rows];
		rSsq += (kobs - rEst) * (kobs - rEst);
		unrSsq += (kobs - unrEst) * (kobs - unrEst);
	}
	score = ((rSsq - unrSsq)*(frows-2.f*fp-1.f)) / (fp*unrSsq);

	if(!isfinite(score))
		score = 0.f;

	dfStats[idx] = score;
}
--------------------------------------------------------------------------------
/inst/cuda/hcluster.cu:
--------------------------------------------------------------------------------
#define NUM_THREADS 32
#define NUM_BLOCKS 1024

// Mark the diagonal of the distance matrix as +inf so a point is never
// merged with itself.
__global__ void convert_kernel(float * dist, size_t pitch_dist, size_t n)
{
	for(size_t index = threadIdx.x; index < n; index += NUM_THREADS) {
		dist[index * pitch_dist + index] = CUDART_INF_F;
	}
}

// Stage 1 of the merge search: for each still-active row (one row per block),
// find the minimum distance in the row's lower triangle and its column.
__global__ void find_min1_kernel(const float * dist, const size_t pitch_dist,
	const size_t n, const float * count, float * min_val, size_t * min_col,
	const size_t row_offset)
{
	// Determine which row this block will handle
	const size_t row = row_offset + blockIdx.x;

	// A merged row (count < 0) contributes the identity so stage 2 ignores it.
	if((threadIdx.x == 0) && (row < n) && (count[row] < 0.f)) {
		min_val[row] = CUDART_INF_F;
		min_col[row] = 0;
	}

	// Uniform per block (row and count[row] are the same for every thread),
	// so this early exit does not split a barrier.
	if((row >= n) || (count[row] <= 0.f))
		return;

	__shared__ float vals[NUM_THREADS];
	__shared__ size_t cols[NUM_THREADS];

	// Initialize with identity.
	vals[threadIdx.x] = CUDART_INF_F;
	// FIX: also initialize cols -- if every candidate a thread sees is +inf
	// (e.g. row 0, whose only entry is the diagonal), the reduction would
	// otherwise propagate an uninitialized column index.
	cols[threadIdx.x] = 0;

	// Strided scan of the lower triangle of this row.
	for(size_t col = threadIdx.x; col <= row; col += NUM_THREADS) {
		float t = dist[row * pitch_dist + col];
		if(t < vals[threadIdx.x]) {
			vals[threadIdx.x] = t;
			cols[threadIdx.x] = col;
		}
	}
	__syncthreads();

	// Tree reduction to the row minimum.
	for(size_t stride = NUM_THREADS >> 1; stride > 0; stride >>= 1) {
		if((threadIdx.x < stride)
			&& (vals[threadIdx.x] > vals[threadIdx.x + stride]))
		{
			vals[threadIdx.x] = vals[threadIdx.x + stride];
			cols[threadIdx.x] = cols[threadIdx.x + stride];
		}
		__syncthreads();
	}

	// Write the result
	if(threadIdx.x == 0) {
		min_val[row] = vals[0];
		min_col[row] = cols[0];
	}
}
// Stage 2 of the merge search (single block): reduce the per-row minima from
// find_min1_kernel to the global minimum, record the winning pair for this
// iteration in (sub, sup, val), and update the cluster counts.  A cluster's
// count is negated once it has been absorbed.
__global__ void find_min2_kernel(const float * min_val, const size_t * min_col,
	float * count, int * sub, int * sup, float * val, const size_t n,
	const size_t iter)
{
	__shared__ float vals[NUM_THREADS];
	__shared__ size_t cols[NUM_THREADS];

	// Initialize with identity (index 0 is safe: merged rows report +inf).
	vals[threadIdx.x] = CUDART_INF_F;
	cols[threadIdx.x] = 0;

	// Strided scan over the per-row minima.
	for(size_t row = threadIdx.x; row < n; row += NUM_THREADS) {
		float t = min_val[row];
		if(t < vals[threadIdx.x]) {
			vals[threadIdx.x] = t;
			cols[threadIdx.x] = row;
		}
	}
	__syncthreads();

	// Tree reduction to the global minimum.
	for(size_t stride = NUM_THREADS >> 1; stride > 0; stride >>= 1) {
		if(threadIdx.x < stride) {
			if(vals[threadIdx.x] > vals[threadIdx.x + stride]) {
				vals[threadIdx.x] = vals[threadIdx.x + stride];
				cols[threadIdx.x] = cols[threadIdx.x + stride];
			}
		}
		__syncthreads();
	}

	if(threadIdx.x == 0) {
		// Winning value is vals[0]; winning row is cols[0]; winning column
		// is min_col[cols[0]].
		int row_winner = cols[0];
		int col_winner = min_col[cols[0]];
		val[iter] = vals[0];
		sub[iter] = col_winner;
		sup[iter] = row_winner;

		// Surviving cluster absorbs the other; mark the loser by negation.
		count[row_winner] += count[col_winner];
		count[col_winner] *= -1.f;
	}
}
// Single-linkage update: new distance is the minimum of the two merged
// clusters' distances.  The absorbed cluster's row/column are set to +inf.
__global__ void single_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = min(bot_val, top_val);
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf, survivor the min.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// Complete-linkage update: new distance is the maximum of the two merged
// clusters' distances.
__global__ void complete_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int
			col_winner = sub[iter], row_winner = sup[iter];
		float
			top_val = dist[col_winner * pitch_dist + col],
			bot_val = dist[row_winner * pitch_dist + col];

		bot_val = fmaxf(bot_val, top_val);
		if((col == col_winner) || (col == row_winner))
			bot_val = CUDART_INF_F;

		top_val = CUDART_INF_F;

		// Write both triangles: absorbed cluster gets +inf, survivor the max.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// WPGMA update: new distance is the unweighted average of the two merged
// clusters' distances.
__global__ void wpgma_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val + top_val) / 2.0;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// UPGMA (average-linkage) update: distances weighted by cluster sizes.
// After the stage-2 merge, count[row_winner] = nr (combined size) and
// count[col_winner] = -np (negated absorbed size), hence nq = nr - np.
__global__ void average_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		float nr = count[row_winner];
		float np = -1.0 * count[col_winner];
		float nq = nr - np;
		bot_val = (top_val * np + bot_val * nq) / nr;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Median (WPGMC) update: average of the merged distances minus a quarter of
// the merge distance val[iter].
__global__ void median_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val + top_val) / 2.0 - val[iter] / 4.0;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// McQuitty update: simple average of the two merged clusters' distances.
__global__ void mcquitty_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val + top_val) / 2.0;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Centroid (UPGMC) update:
//   d = (np*top + nq*bot)/nr - (np*nq*val)/(nr*nr)
// where nr is the merged size, np the absorbed cluster's size (stored
// negated in count) and nq = nr - np.
__global__ void centroid_kernel(float * dist, size_t pitch_dist,
	size_t n, const int * sub, const int * sup, const float * count,
	const float * val, size_t iter, size_t col_offset,
	float lambda, float beta)
{
	size_t
		col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) { // don't run off the end of the arrays
		int
			col_winner = sub[iter], row_winner = sup[iter];
		float
			top_val = dist[col_winner * pitch_dist + col],
			bot_val = dist[row_winner * pitch_dist + col],
			nr = count[row_winner], np = -count[col_winner],
			nq = nr - np;

		bot_val = (top_val * np + bot_val * nq)/nr
			- (np * nq * val[iter])/(nr * nr);

		if(col == col_winner || col == row_winner)
			bot_val = CUDART_INF_F;

		top_val = CUDART_INF_F;

		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// Flexible-group update: size-weighted average scaled by (1 - lambda) plus
// beta times the merge distance.
__global__ void flexible_group_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		float nr = count[row_winner];
		float np = -1.0 * count[col_winner];
		float nq = nr - np;
		bot_val = (bot_val * (1.0 - lambda) * np + top_val * (1.0 - lambda) * nq) / nr + beta * val[iter];
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Flexible update (unweighted): plain average scaled by (1 - lambda) plus
// beta times the merge distance.
__global__ void flexible_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val * (1.0 - lambda) + top_val * (1.0 - lambda) ) / 2.0 + beta * val[iter];
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Ward update:
//   d = ((np+nk)*bot + (nq+nk)*top - nk*val) / (nr + nk)
// where nk is the size of the third cluster at `col`.  A fully-merged column
// (nr == -nk) or the winners themselves get +inf.
__global__ void ward_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t
		col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col >= n)
		return;

	int
		col_winner = sub[iter], row_winner = sup[iter];

	float
		top_val = dist[col_winner * pitch_dist + col],
		bot_val = dist[row_winner * pitch_dist + col],
		nr = count[row_winner], np = -count[col_winner],
		nq = nr - np, nk = count[col];

	if((nr == -nk) || (col == col_winner) || (col == row_winner)) {
		bot_val = CUDART_INF_F;
	} else {
		bot_val = (bot_val * (np + nk) + top_val * (nq + nk) - val[iter] * nk);
		bot_val /= (nr + nk);
		if(isinf(bot_val)) {
			bot_val = CUDART_INF_F;
		}
	}
	top_val = CUDART_INF_F;

	// Write both triangles: absorbed cluster gets +inf.
	dist[col_winner * pitch_dist + col] = top_val;
	dist[col * pitch_dist + col_winner] = top_val;
	dist[row_winner * pitch_dist + col] = bot_val;
	dist[col * pitch_dist + row_winner] = bot_val;
}
--------------------------------------------------------------------------------
/inst/cuda/kendall.cu:
--------------------------------------------------------------------------------
#define NUMTHREADS 16
#define THREADWORK 32

// Kendall's tau for each (A-row, B-row) pair; one pair per 2-D block.
// Each (tx, ty) thread counts concordant pairs (ties in both count as
// concordant here) over a strided subset of the index pairs i < j; the
// discordant count is derived from the total number of pairs.
__global__ void gpuKendall(const float * a, size_t na,
	const float * b, size_t nb, size_t sampleSize, double * results)
{
	size_t
		i, j, tests,
		tx = threadIdx.x, ty = threadIdx.y,
		bx = blockIdx.x, by = blockIdx.y,
		rowa = bx * sampleSize, rowb = by * sampleSize;
	float
		discordant, concordant = 0.f,
		numer, denom;

	__shared__ float threadSums[NUMTHREADS*NUMTHREADS];

	for(i = tx; i < sampleSize; i += NUMTHREADS) {
		for(j = i+1+ty; j < sampleSize; j += NUMTHREADS) {
			tests = ((a[rowa+j] > a[rowa+i]) && (b[rowb+j] > b[rowb+i]))
				+ ((a[rowa+j] < a[rowa+i]) && (b[rowb+j] < b[rowb+i]))
				+ ((a[rowa+j] == a[rowa+i]) && (b[rowb+j] == b[rowb+i]));
			concordant = concordant + (float)tests;
		}
	}
	threadSums[tx*NUMTHREADS+ty] = concordant;

	__syncthreads();
	// Reduce along y, then along x.
	for(i = NUMTHREADS >> 1; i > 0; i >>= 1) {
		if(ty < i)
			threadSums[tx*NUMTHREADS+ty] += threadSums[tx*NUMTHREADS+ty+i];
		__syncthreads();
	}
	for(i = NUMTHREADS >> 1; i > 0; i >>= 1) {
		if((tx < i) && (ty == 0))
			threadSums[tx*NUMTHREADS] += threadSums[(tx+i)*NUMTHREADS];
		__syncthreads();
	}

	if((tx == 0) && (ty == 0)) {
		concordant = threadSums[0];
		// total pairs = n(n-1)/2; tau = (concordant - discordant) / total
		denom = (float)sampleSize;
		denom = (denom * (denom - 1.f)) / 2.f;
		discordant = denom - concordant;
		numer = concordant - discordant;
		results[by*na+bx] = ((double)numer)/((double)denom);
	}
}
--------------------------------------------------------------------------------
/inst/cuda/mi.cu:
--------------------------------------------------------------------------------
#define NTHREADS 16

// Rescale each column of x (one column per thread) linearly onto
// [0, knot_max] using the column's own min and max.
__global__ void scale(float knot_max, int nx, int nsamples,
	float * x, int pitch_x)
{
	int
		col_idx = blockDim.x * blockIdx.x + threadIdx.x;

	if(col_idx >= nx) return;

	float
		min, max,
		* col = x + col_idx * pitch_x;

	// find the min and the max of this column
	min = max = col[0];
	for(int i = 1; i < nsamples; i++) {
		if(col[i] < min) min = col[i];
		if(col[i] > max) max = col[i];
	}

	float delta = max - min;
	for(int i = 0; i < nsamples; i++)
		col[i] = (knot_max * (col[i] - min)) / delta;
}

// Guarded division for the B-spline recurrences: 0/0 -> 0, x/x -> 1 (x != 0),
// x/0 -> 0 by convention.
__device__ float do_fraction(float numer, float denom) {
	float result = 0.f;

	if((numer == denom) && (numer != 0.f))
		result = 1.f;
	else if(denom != 0.f)
		result = numer / denom;

	return result;
}
// B-spline bin scores per sample via the Cox-de-Boor recurrence; one data
// column per thread.  bins must be zero-initialized before the call.
__global__ void get_bin_scores(int nbins, int order,
	int nknots, float * knots, int nsamples,
	int nx, float * x, int pitch_x,
	float * bins, int pitch_bins)
{
	int
		col_x = blockDim.x * blockIdx.x + threadIdx.x;

	if(col_x >= nx)
		return;

	float
		ld, rd, z,
		term1, term2,
		* in_col = x + col_x * pitch_x,
		* bin_col = bins + col_x * pitch_bins;
	int i0;

	for(int k = 0; k < nsamples; k++, bin_col += nbins) {
		z = in_col[k];
		i0 = (int)floorf(z) + order - 1;
		if(i0 >= nbins)
			i0 = nbins - 1;

		// order-1 spline: indicator of the containing interval
		bin_col[i0] = 1.f;
		for(int i = 2; i <= order; i++) {
			for(int j = i0 - i + 1; j <= i0; j++) {
				// FIX: removed a dead assignment to rd here -- its value was
				// never read before being recomputed in the term2 branch.
				// NOTE(review): the '(j > nknots)' guard looks like it was
				// meant to be '(j + i - 1 >= nknots)' -- confirm upstream.
				if((j < 0) || (j >= nbins) || (j >= nknots)
						|| (j + i - 1 < 0) || (j > nknots))
					term1 = 0.f;
				else {
					ld = do_fraction(z - knots[j],
						knots[j + i - 1] - knots[j]);
					term1 = ld * bin_col[j];
				}

				if((j + 1 < 0) || (j + 1 >= nbins) || (j + 1 >= nknots)
						|| (j + i < 0) || (j + i >= nknots))
					term2 = 0.f;
				else {
					rd = do_fraction(knots[j + i] - z,
						knots[j + i] - knots[j + 1]);
					term2 = rd * bin_col[j + 1];
				}
				// FIX: guard the store -- j can be negative (j = i0 - i + 1)
				// or >= nbins, and the original wrote bin_col[j]
				// unconditionally, an out-of-bounds write.
				if((j >= 0) && (j < nbins))
					bin_col[j] = term1 + term2;
			}
		}
	}
}

// Marginal (per-column) entropy in bits from the bin scores.
__global__ void get_entropy(int nbins, int nsamples, int nx,
	float * bin_scores, int pitch_bin_scores, float * entropies)
{
	int
		col_x = blockDim.x * blockIdx.x + threadIdx.x;

	if(col_x >= nx)
		return;

	float
		* in_col = bin_scores + col_x * pitch_bin_scores,
		entropy = 0.f, prob, logp;

	for(int i = 0; i < nbins; i++) {
		// estimated probability mass of bin i
		prob = 0.f;
		for(int j = 0; j < nsamples; j++)
			prob += in_col[j * nbins + i];
		prob /= (double) nsamples;

		// convention: 0 * log(0) = 0
		if(prob <= 0.f)
			logp = 0.f;
		else
			logp = __log2f(prob);

		entropy += prob * logp;
	}
	entropies[col_x] = -entropy;
}
// Mutual information for each (x-column, y-column) pair, one pair per thread:
// MI(X, Y) = H(X) + H(Y) - H(X, Y), with the joint entropy estimated from
// the products of the two columns' bin scores.
__global__ void get_mi(int nbins, int nsamples,
	int nx, float * x_bin_scores, int pitch_x_bin_scores,
	float * entropies_x,
	int ny, float * y_bin_scores, int pitch_y_bin_scores,
	float * entropies_y,
	float * mis, int pitch_mis)
{
	int
		col_x = blockDim.x * blockIdx.x + threadIdx.x,
		col_y = blockDim.y * blockIdx.y + threadIdx.y;

	if((col_x >= nx) || (col_y >= ny))
		return;

	float
		prob, logp, mi = 0.f,
		* x_bins = x_bin_scores + col_x * pitch_x_bin_scores,
		* y_bins = y_bin_scores + col_y * pitch_y_bin_scores;

	// mi accumulates -H(X, Y) (sum of p * log2 p over the joint bins)
	for(int i = 0; i < nbins; i++) {
		for(int j = 0; j < nbins; j++) {
			prob = 0.f;
			for(int k = 0; k < nsamples; k++)
				prob += x_bins[k * nbins + i] * y_bins[k * nbins + j];
			prob /= (float)nsamples;

			// convention: 0 * log(0) = 0
			if(prob <= 0.f)
				logp = 0.f;
			else
				logp = __log2f(prob);

			mi += prob * logp;
		}
	}

	// MI = H(X) + H(Y) - H(X, Y)
	mi += entropies_x[col_x] + entropies_y[col_y];
	(mis + col_y * pitch_mis)[col_x] = mi;
}
--------------------------------------------------------------------------------
/inst/cuda/qrdecomp.cu:
--------------------------------------------------------------------------------
#define NTHREADS 512

// Squared Euclidean norm of each column of da (leading dimension lda);
// one column per thread.
__global__ void getColNorms(int rows, int cols, float * da, int lda,
	float * colNorms)
{
	int colIndex = threadIdx.x + blockIdx.x * blockDim.x;
	float
		sum = 0.f, term,
		* col;

	if(colIndex >= cols)
		return;

	col = da + colIndex * lda;

	for(int i = 0; i < rows; i++) {
		term = col[i];
		term *= term;
		sum += term;
	}

	colNorms[colIndex] = sum;
}
// Single-block argmax over data[0..n-1]; each thread scans threadWorkLoad
// consecutive elements, then the block reduces in shared memory.  A slot
// holding -1 means "no candidate".
__global__ void gpuFindMax(int n, float * data, int threadWorkLoad,
	int * maxIndex)
{
	int
		j, k,
		start = threadWorkLoad * threadIdx.x,
		end = start + threadWorkLoad;
	__shared__ int maxIndicies[NTHREADS];

	maxIndicies[threadIdx.x] = -1;

	// FIX: threads with no work previously returned here, before the
	// __syncthreads() below -- a divergent barrier is undefined behavior.
	// They now fall through with their slot left at -1.
	if(start < n) {
		int localMaxIndex = start;
		for(int i = start+1; i < end; i++) {
			if(i >= n)
				break;
			if(data[i] > data[localMaxIndex])
				localMaxIndex = i;
		}
		maxIndicies[threadIdx.x] = localMaxIndex;
	}
	__syncthreads();

	for(int i = blockDim.x >> 1; i > 0; i >>= 1) {
		if(threadIdx.x < i) {
			j = maxIndicies[threadIdx.x];
			k = maxIndicies[i + threadIdx.x];
			// FIX: if this slot has no candidate (j == -1) adopt the
			// partner's; the old '(j != -1) && (k != -1)' guard silently
			// dropped valid candidates paired with empty slots.
			if((k != -1) && ((j == -1) || (data[j] < data[k])))
				maxIndicies[threadIdx.x] = k;
		}
		__syncthreads();
	}
	if(threadIdx.x == 0)
		*maxIndex = maxIndicies[0];
}

// Swap column coli with column coli + *dColj of a rows x ? column-major
// array, and swap the matching pivot entries (done once, by thread (0,0)).
__global__ void gpuSwapCol(int rows, float * dArray, int coli, int * dColj,
	int * dPivot)
{
	int rowIndex = blockIdx.x * blockDim.x + threadIdx.x;

	if(rowIndex >= rows)
		return;

	int colj = coli + (*dColj);
	float fholder;

	fholder = dArray[rowIndex+coli*rows];
	dArray[rowIndex+coli*rows] = dArray[rowIndex+colj*rows];
	dArray[rowIndex+colj*rows] = fholder;

	if((blockIdx.x == 0) && (threadIdx.x == 0)) {
		int iholder = dPivot[coli];
		dPivot[coli] = dPivot[colj];
		dPivot[colj] = iholder;
	}
}

// Build the scaled Householder vector for `input` (length `rows`) in
// `output`: v = x +/- ||x|| * e1, scaled by beta = sqrt(2 / ||v||^2).
// Single-block kernel.
__global__ void makeHVector(int rows, float * input, float * output)
{
	int
		i, j;
	float
		elt, sum;
	__shared__ float
		beta, sums[NTHREADS];

	// FIX: the original returned early for threadIdx.x >= rows, so those
	// threads never reached the __syncthreads() barriers below (undefined
	// behavior).  Idle threads now simply contribute a zero partial sum;
	// their strided loops execute no iterations.
	sum = 0.f;
	for(i = threadIdx.x ; i < rows; i += NTHREADS) {
		// element 0 is handled separately by thread 0 below
		if((threadIdx.x == 0) && (i == 0))
			continue;
		elt = input[i];
		output[i] = elt;
		sum += elt * elt;
	}
	sums[threadIdx.x] = sum;
	__syncthreads();

	for(i = blockDim.x >> 1; i > 0 ; i >>= 1) {
		j = i+threadIdx.x;
		if((threadIdx.x < i) && (j < rows))
			sums[threadIdx.x] += sums[j];
		__syncthreads();
	}

	if(threadIdx.x == 0) {
		elt = input[0];
		float norm = sqrtf(elt * elt + sums[0]);

		// add the norm with the sign of input[0] to avoid cancellation
		if(elt > 0)
			elt += norm;
		else
			elt -= norm;

		output[0] = elt;

		norm = elt * elt + sums[0];
		beta = sqrtf(2.f / norm);
	}
	__syncthreads();

	for(i = threadIdx.x; i < rows; i += NTHREADS)
		output[i] *= beta;
}
float 105 | elt, sum; 106 | __shared__ float 107 | beta, sums[NTHREADS]; 108 | 109 | if(threadIdx.x >= rows) 110 | return; 111 | 112 | sum = 0.f; 113 | for(i = threadIdx.x ; i < rows; i += NTHREADS) { 114 | if((threadIdx.x == 0) && (i == 0)) 115 | continue; 116 | elt = input[i]; 117 | output[i] = elt; 118 | sum += elt * elt; 119 | } 120 | sums[threadIdx.x] = sum; 121 | __syncthreads(); 122 | 123 | for(i = blockDim.x >> 1; i > 0 ; i >>= 1) { 124 | j = i+threadIdx.x; 125 | if((threadIdx.x < i) && (j < rows)) 126 | sums[threadIdx.x] += sums[j]; 127 | __syncthreads(); 128 | } 129 | 130 | if(threadIdx.x == 0) { 131 | elt = input[0]; 132 | float norm = sqrtf(elt * elt + sums[0]); 133 | 134 | if(elt > 0) 135 | elt += norm; 136 | else 137 | elt -= norm; 138 | 139 | output[0] = elt; 140 | 141 | norm = elt * elt + sums[0]; 142 | beta = sqrtf(2.f / norm); 143 | } 144 | __syncthreads(); 145 | 146 | for(i = threadIdx.x; i < rows; i += NTHREADS) 147 | output[i] *= beta; 148 | } 149 | 150 | // Updates the column norms by subtracting the Hadamard-square of the 151 | // Householder vector. 152 | // 153 | // N.B.: Overflow incurred in computing the square should already have 154 | // been detected in the original norm construction. 155 | 156 | __global__ void UpdateHHNorms(int cols, float *dV, float *dNorms) { 157 | // Copyright 2009, Mark Seligman at Rapid Biologics, LLC. All rights 158 | // reserved. 159 | 160 | int colIndex = threadIdx.x + blockIdx.x * blockDim.x; 161 | if (colIndex < cols) { 162 | float val = dV[colIndex]; 163 | dNorms[colIndex] -= val * val; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /man/chooseGpu.Rd: -------------------------------------------------------------------------------- 1 | \name{chooseGpu} 2 | \alias{chooseGpu} 3 | \title{Choose which GPU device to use} 4 | 5 | \description{ 6 | Selects the GPU device to use for computation. 
This is only useful on a 7 | machine equipped with multiple GPU devices. The numbering starts at 0 8 | and is assigned by the CUDA capable driver. 9 | 10 | Choosing a device can only be done before any other GPU operation and 11 | only once per thread. 12 | } 13 | 14 | \usage{ 15 | chooseGpu(deviceId = 0) 16 | } 17 | 18 | \arguments{ 19 | \item{deviceId}{an integer >= 0 designating the GPU to use for computation.} 20 | } 21 | 22 | \value{ 23 | chooseGpu should print out an integer specifying the device id chosen or 24 | an error message. 25 | } 26 | -------------------------------------------------------------------------------- /man/cpuMatMult.Rd: -------------------------------------------------------------------------------- 1 | \name{cpuMatMult} 2 | \alias{cpuMatMult} 3 | \title{Perform Matrix Multiplication} 4 | 5 | \description{ 6 | Performs matrix multiplication using R's BLAS. This function is merely a 7 | wrapper for the BLAS dgemm function. 8 | } 9 | 10 | \usage{ 11 | cpuMatMult(a, b) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a numeric matrix.} 16 | \item{b}{a numeric matrix.} 17 | } 18 | 19 | \value{ 20 | A numeric matrix. 21 | The matrix is just the product of arguments 'a' and 'b'. 22 | } 23 | 24 | \examples{ 25 | matA <- matrix(runif(2*3), 2, 3) 26 | matB <- matrix(runif(3*4), 3, 4) 27 | cpuMatMult(matA, matB) 28 | } 29 | 30 | \keyword{array} 31 | \keyword{algebra} 32 | -------------------------------------------------------------------------------- /man/getGpuId.Rd: -------------------------------------------------------------------------------- 1 | \name{getGpuId} 2 | \alias{getGpuId} 3 | \title{Discover the Id of the current GPU device} 4 | 5 | \description{ 6 | Queries the CUDA driver for the GPU device currently assigned to this 7 | thread. This is the id of the device that will be used for computation. 8 | If you wish to use a different device, use the chooseGpu function. 
9 | } 10 | 11 | \usage{ 12 | getGpuId() 13 | } 14 | 15 | \value{ 16 | The function returns a single integer indicating the id of the 17 | GPU device currently selected to carry out computation according to the 18 | CUDA driver. 19 | } 20 | 21 | \examples{ 22 | getGpuId() 23 | } 24 | -------------------------------------------------------------------------------- /man/gpuCor.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuCor} 2 | \alias{gpuCor} 3 | \title{Calculate Various Correlation Coefficients With a GPU} 4 | 5 | \description{ 6 | The correlation coefficient will be calculated 7 | for each pair $x_i$, $y_j$ where $x_i$ is a column of $x$ and $y_j$ is a 8 | column of $y$. Currently, Pearson's and Kendall's correlation 9 | coefficient are implemented. Pearson's may be calculated for 10 | data sets containing NAs in which case, the implementation behaves 11 | as R-native cor function with use="pairwise.complete". 12 | } 13 | 14 | \usage{ 15 | gpuCor(x, y = NULL, use = "everything", method = "pearson") 16 | } 17 | 18 | \arguments{ 19 | \item{x}{a matrix of floating point values in which each column is a 20 | random variable.} 21 | \item{y}{a matrix of floating point values in which each column is a 22 | random variable.} 23 | \item{use}{a string. A character string giving a method for computing 24 | in the presence of missing values. Options are 25 | "everything" or "pairwise.complete.obs". This currently only 26 | affects the "pearson" method.} 27 | \item{method}{a string. Either "pearson" or "kendall".} 28 | } 29 | 30 | \value{ 31 | For method "pearson", a list with matrices 'pairs', 32 | 'coefficients', and 'ts'. The matrix entry $i$, $j$ for pairs represents 33 | the number of pairs of entries $x_i^k$, $y_j^k$ (the $k$-th entry from 34 | $x_i$ and $y_j$ respectively). These are the 35 | number of entries actually used to calculate the coefficients. 
36 | Entry $i$, $j$ of the coefficients matrix is the correlation coefficient 37 | for $x_i$, $y_j$. Entry $i$, $j$ of the ts matrix is the t-score of the 38 | $i$, $j$ entry of the coefficient matrix. If use="pairwise.complete.obs" 39 | then only the pairs where both entries are not NA are used in the 40 | computations. 41 | 42 | For method "kendall", a list of matrices 'pairs' as above and 43 | 'coefficients' as follows. The matrix 'coefficients' is a matrix of 44 | floating point numbers where entry $i$, $j$ is the correlation coefficient 45 | for $x_i$, $y_j$. Calculation of t-scores for the kendall coefficients is 46 | not yet implemented. 47 | } 48 | 49 | \seealso{ 50 | cor 51 | } 52 | 53 | \examples{ 54 | numAvars <- 5 55 | numBvars <- 10 56 | numSamples <- 30 57 | A <- matrix(runif(numAvars*numSamples), numSamples, numAvars) 58 | B <- matrix(runif(numBvars*numSamples), numSamples, numBvars) 59 | gpuCor(A, B, method="pearson") 60 | gpuCor(A, B, method="kendall") 61 | A[3,2] <- NA 62 | gpuCor(A, B, use="pairwise.complete.obs", method="pearson") 63 | } 64 | -------------------------------------------------------------------------------- /man/gpuCrossprod.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuCrossprod} 2 | \alias{gpuCrossprod} 3 | \title{Perform Matrix Cross-product with a GPU} 4 | 5 | \description{ 6 | Performs matrix cross-product using a GPU. This function is merely a 7 | couple of wrappers for the CUBLAS cublasSgemm function. 8 | } 9 | 10 | \usage{ 11 | gpuCrossprod(a, b=NULL) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a matrix of floating point values.} 16 | \item{b}{a matrix of floating point values. A null value 17 | defaults to 'a'.} 18 | } 19 | 20 | \value{ 21 | A matrix of single precision floating point values. 22 | The matrix is the cross-product of arguments 'a' and 'b', i.e., 23 | t(a) * b. 
24 | } 25 | 26 | \examples{ 27 | matA <- matrix(runif(3*2), 3, 2) 28 | matB <- matrix(runif(3*4), 3, 4) 29 | gpuCrossprod(matA, matB) 30 | } 31 | 32 | \keyword{array} 33 | \keyword{algebra} 34 | 35 | -------------------------------------------------------------------------------- /man/gpuDist.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuDist} 2 | \alias{gpuDist} 3 | \title{Compute Distances Between Vectors on a GPU} 4 | 5 | \description{ 6 | This function computes the distance between each vector of the 7 | 'points' argument using the metric specified by 'method'. 8 | } 9 | 10 | \usage{ 11 | gpuDist(points, method = "euclidean", p = 2.0) 12 | } 13 | 14 | \arguments{ 15 | \item{points}{a matrix of floating point numbers in which each row is a 16 | vector in $R^n$ space where $n$ is ncol(points).} 17 | \item{method}{a string representing the name of the metric to use to 18 | calculate the distance between the vectors of 'points'. Currently 19 | supported values are: "binary", "canberra", "euclidean", "manhattan", 20 | "maximum", and "minkowski".} 21 | \item{p}{a floating point parameter for the Minkowski metric.} 22 | } 23 | 24 | \value{ 25 | a class of type "dist" containing floating point numbers representing the 26 | distances between vectors from the 'points' argument. 
27 | } 28 | 29 | \seealso{ 30 | dist 31 | } 32 | 33 | \examples{ 34 | numVectors <- 5 35 | dimension <- 10 36 | Vectors <- matrix(runif(numVectors*dimension), numVectors, dimension) 37 | gpuDist(Vectors, "euclidean") 38 | gpuDist(Vectors, "maximum") 39 | gpuDist(Vectors, "manhattan") 40 | gpuDist(Vectors, "minkowski", 4) 41 | } 42 | 43 | \keyword{math} 44 | -------------------------------------------------------------------------------- /man/gpuDistClust.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuDistClust} 2 | \alias{gpuDistClust} 3 | \title{Compute Distances and Hierarchical Clustering for Vectors on a GPU} 4 | 5 | \description{This function takes a set of vectors and performs clustering 6 | on them. The function will first calculate the distance between all of the 7 | pairs of vectors and then use the distances to cluster the vectors. Both of 8 | these steps are done on the GPU. 9 | } 10 | 11 | \usage{ 12 | gpuDistClust(points, distmethod = "euclidean", clustmethod = "complete") 13 | } 14 | 15 | \arguments{ 16 | \item{points}{a matrix of floating point numbers in which each row is a 17 | vector in $R^n$ space where $n$ is ncol(points).} 18 | \item{distmethod}{a string representing the name of the metric to use to 19 | calculate the distance between the vectors of 'points'. Currently 20 | supported values are: "binary", "canberra", "euclidean", "manhattan", 21 | "maximum".} 22 | \item{clustmethod}{a string representing the name of the clustering method 23 | to be applied to distances. Currently supported method names include 24 | "average", "centroid", "complete", "flexible", "flexible group", 25 | "mcquitty", "median", "single", "ward", and "wpgma".} 26 | } 27 | 28 | \value{Copied from the native R function 'hclust' documentation. A class of 29 | type "hclust" with the following attributes. 30 | \item{merge}{an n-1 by 2 matrix. 
Row i of 'merge' describes the merging of 31 | clusters at step i of the clustering. If an element j in the 32 | row is negative, then observation -j was merged at this 33 | stage. If j is positive then the merge was with the cluster 34 | formed at the (earlier) stage j of the algorithm. Thus 35 | negative entries in 'merge' indicate agglomerations of 36 | singletons, and positive entries indicate agglomerations of 37 | non-singletons. Copied from the native R function 'hclust' 38 | documentation.} 39 | \item{order}{a vector giving the permutation of the original observations 40 | suitable for plotting, in the sense that a cluster plot using 41 | this ordering and matrix 'merge' will not have crossings of 42 | the branches.} 43 | \item{height}{a set of n-1 non-decreasing real values. The clustering 44 | height: that is, the value of the criterion associated with 45 | the clustering 'method' for the particular agglomeration.} 46 | } 47 | 48 | \seealso{ 49 | \code{\link{gpuDist}}, \code{\link{gpuHclust}}. 50 | } 51 | 52 | \examples{ 53 | numVectors <- 5 54 | dimension <- 10 55 | Vectors <- matrix(runif(numVectors*dimension), numVectors, dimension) 56 | myClust <- gpuDistClust(Vectors, "maximum", "mcquitty") 57 | plot(myClust) 58 | } 59 | 60 | \keyword{cluster} 61 | -------------------------------------------------------------------------------- /man/gpuGranger.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuGranger} 2 | \alias{gpuGranger} 3 | \title{Perform Granger Causality Tests for Vectors on a GPU} 4 | 5 | \description{ 6 | This function performs, with the aid of a GPU, Granger Causality Tests on 7 | permutations of pairs of columns of the input matrices 'x' and 'y'. 8 | } 9 | 10 | \usage{ 11 | gpuGranger(x, y=NULL, lag) 12 | } 13 | 14 | \arguments{ 15 | \item{x}{a matrix of floating point values. 
Each column represents 16 | a sequence of observations for a single random variable.} 17 | \item{y}{an optional matrix of floating point values. Each column 18 | represents a sequence of observations for a single random variable.} 19 | \item{lag}{a positive integer by which to offset the sequence of 20 | observations to calculate the coefficient for Granger causality.} 21 | } 22 | 23 | \value{a list of two single precision floating point matrices both of the same 24 | dimension. The two matrices are fStatistics and pValues. The 25 | fStatistics matrix holds the F-statistics from the Granger causality tests. 26 | Each element of the pValues matrix is the p-value for the corresponding 27 | element of the fStatistics matrix. 28 | 29 | If y is NULL, the test is run on permutations of pairs of columns of x. To 30 | find the Granger causality F-statistic estimating the answer to 31 | "Does variable x[ ,j] Granger-cause variable x[ ,i]?", look at 32 | fStatistics[i, j] and pValues[i, j]. 33 | 34 | If y is not NULL, the test is run on permutations of pairs (x[ ,i], y[ ,j]). 35 | To find the Granger causality F-statistic estimating the answer to 36 | "Does variable y[ ,j] Granger-cause variable x[ ,i]?", look at 37 | fStatistics[i, j] and pValues[i, j]. 
38 | } 39 | 40 | \examples{ 41 | # permutations of pairs of cols of just x 42 | numRandVars <- 5 43 | numSamples <- 20 44 | randVarSequences <- matrix(runif(numRandVars*numSamples), numSamples, 45 | numRandVars) 46 | gpuGranger(randVarSequences, lag = 5) 47 | 48 | # pairs of cols, one from x and one from y 49 | numXRandVars <- 5 50 | numXSamples <- 20 51 | x <- matrix(runif(numXRandVars*numXSamples), numXSamples, numXRandVars) 52 | 53 | numYRandVars <- 3 54 | numYSamples <- 20 55 | y <- matrix(runif(numYRandVars*numYSamples), numYSamples, numYRandVars) 56 | 57 | result <- gpuGranger(x, y, lag = 5) 58 | print(result) 59 | } 60 | -------------------------------------------------------------------------------- /man/gpuHclust.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuHclust} 2 | \alias{gpuHclust} 3 | \title{Perform Hierarchical Clustering for Vectors with a GPU} 4 | 5 | \description{ 6 | This function performs clustering on a set of points. The distance between 7 | each pair of points should be calculated first using a function like 8 | 'gpuDist' or 'dist'. 9 | } 10 | 11 | \usage{ 12 | gpuHclust(distances, method = "complete") 13 | } 14 | 15 | \arguments{ 16 | \item{distances}{a class of type "dist" containing floating point numbers 17 | representing distances between points. R's native dist function and 18 | the gpuDist function produce output of this type.} 19 | \item{method}{a string representing the name of the clustering method to be 20 | applied to distances. Currently supported method names include 21 | "average", "centroid", "complete", "flexible", "flexible group", 22 | "mcquitty", "median", "single", "ward", and "wpgma".} 23 | } 24 | 25 | \value{Copied from the native R function 'hclust' documentation. A class of 26 | type "hclust" with the following attributes. 27 | \item{merge}{an n-1 by 2 matrix. Row i of 'merge' describes the merging of 28 | clusters at step i of the clustering. 
If an element j in the 29 | row is negative, then observation -j was merged at this 30 | stage. If j is positive then the merge was with the cluster 31 | formed at the (earlier) stage j of the algorithm. Thus 32 | negative entries in 'merge' indicate agglomerations of 33 | singletons, and positive entries indicate agglomerations of 34 | non-singletons. Copied from the native R function 'hclust' 35 | documentation.} 36 | \item{order}{a vector giving the permutation of the original observations 37 | suitable for plotting, in the sense that a cluster plot using 38 | this ordering and matrix 'merge' will not have crossings of 39 | the branches.} 40 | \item{height}{a set of n-1 non-decreasing real values. The clustering 41 | height: that is, the value of the criterion associated with 42 | the clustering 'method' for the particular agglomeration.} 43 | } 44 | 45 | \seealso{ 46 | hclust, \code{\link{gpuDistClust}} 47 | } 48 | 49 | \examples{ 50 | numVectors <- 5 51 | dimension <- 10 52 | Vectors <- matrix(runif(numVectors*dimension), numVectors, dimension) 53 | distMat <- gpuDist(Vectors, "euclidean") 54 | myClust <- gpuHclust(distMat, "single") 55 | plot(myClust) 56 | } 57 | 58 | \keyword{cluster} 59 | -------------------------------------------------------------------------------- /man/gpuLm.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLm.Rd 2 | % Part of the gputools package 3 | % mostly copied from 4 | % File src/library/stats/man/lm.Rd 5 | % Part of the R package, http://www.R-project.org 6 | % Copyright 1995-2007 R Core Development Team 7 | % Distributed under GPL 2 or later 8 | 9 | \name{gpuLm} 10 | \alias{gpuLm} 11 | % \alias{print.lm} 12 | \concept{regression} 13 | \title{Fitting Linear Models using a GPU--enabled QR} 14 | \description{ 15 | Most of this documentation is copied from R's documentation for \code{lm}. 16 | \code{gpuLm} is used to fit linear models using a 17 | GPU enabled QR decomposition. 
18 | It can be used to carry out regression, 19 | single stratum analysis of variance and 20 | analysis of covariance (although \code{\link{aov}} may provide a more 21 | convenient interface for these). 22 | 23 | Note: The QR decomposition employed by \code{gpuLm} is optimized for speed 24 | and uses minimal pivoting. If rank-revealing pivot is desired, then the 25 | function \code{gpuQR}, should be used. The most reliable 26 | determination of rank, however, will be obtained with the \code{svd} command. 27 | } 28 | 29 | \usage{ 30 | gpuLm(formula, data, subset, weights, na.action, 31 | method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE, 32 | singular.ok = TRUE, contrasts = NULL, useSingle = TRUE, offset, \dots) 33 | } 34 | 35 | \arguments{ 36 | \item{formula}{an object of class \code{"\link{formula}"} (or one that 37 | can be coerced to that class): a symbolic description of the 38 | model to be fitted. The details of model specification are given 39 | under \sQuote{Details}.} 40 | 41 | \item{data}{an optional data frame, list or environment (or object 42 | coercible by \code{\link{as.data.frame}} to a data frame) containing 43 | the variables in the model. If not found in \code{data}, the 44 | variables are taken from \code{environment(formula)}, 45 | typically the environment from which \code{lm} is called.} 46 | 47 | \item{subset}{an optional vector specifying a subset of observations 48 | to be used in the fitting process.} 49 | 50 | \item{weights}{an optional vector of weights to be used in the fitting 51 | process. Should be \code{NULL} or a numeric vector. 52 | If non-NULL, weighted least squares is used with weights 53 | \code{weights} (that is, minimizing \code{sum(w*e^2)}); otherwise 54 | ordinary least squares is used. See also \sQuote{Details},} 55 | 56 | \item{na.action}{a function which indicates what should happen 57 | when the data contain \code{NA}s. 
The default is set by 58 | the \code{na.action} setting of \code{\link{options}}, and is 59 | \code{\link{na.fail}} if that is unset. The \sQuote{factory-fresh} 60 | default is \code{\link{na.omit}}. Another possible value is 61 | \code{NULL}, no action. Value \code{\link{na.exclude}} can be useful.} 62 | 63 | \item{method}{the method to be used; for fitting, currently only 64 | \code{method = "qr"} is supported; \code{method = "model.frame"} returns 65 | the model frame (the same as with \code{model = TRUE}, see below).} 66 | 67 | \item{model, x, y, qr}{logicals. If \code{TRUE} the corresponding 68 | components of the fit (the model frame, the model matrix, the 69 | response, the qr decomposition) are returned.} 70 | 71 | \item{singular.ok}{logical. If \code{FALSE} (the default in S but 72 | not in \R) a singular fit is an error.} 73 | 74 | \item{contrasts}{an optional list. See the \code{contrasts.arg} 75 | of \code{\link{model.matrix.default}}.} 76 | 77 | \item{useSingle}{an optional logical. In the future, setting this to 78 | \code{FALSE} will result in using double precision arithmetic on the 79 | gpu, but this is not yet implemented} 80 | 81 | \item{offset}{this can be used to specify an \emph{a priori} known 82 | component to be included in the linear predictor during fitting. 83 | This should be \code{NULL} or a numeric vector of length equal to 84 | the number of cases. One or more \code{\link{offset}} terms can be 85 | included in the formula instead or as well, and if more than one are 86 | specified their sum is used. See \code{\link{model.offset}}.} 87 | 88 | \item{\dots}{additional arguments to be passed to the low level 89 | regression fitting functions (see below).} 90 | } 91 | 92 | \details{ 93 | Models for \code{lm} are specified symbolically. 
A typical model has 94 | the form \code{response ~ terms} where \code{response} is the (numeric) 95 | response vector and \code{terms} is a series of terms which specifies a 96 | linear predictor for \code{response}. A terms specification of the form 97 | \code{first + second} indicates all the terms in \code{first} together 98 | with all the terms in \code{second} with duplicates removed. A 99 | specification of the form \code{first:second} indicates the set of 100 | terms obtained by taking the interactions of all terms in \code{first} 101 | with all terms in \code{second}. The specification \code{first*second} 102 | indicates the \emph{cross} of \code{first} and \code{second}. This is 103 | the same as \code{first + second + first:second}. 104 | 105 | If the formula includes an \code{\link{offset}}, this is evaluated and 106 | subtracted from the response. 107 | 108 | If \code{response} is a matrix a linear model is fitted separately by 109 | least-squares to each column of the matrix. 110 | 111 | See \code{\link{model.matrix}} for some further details. The terms in 112 | the formula will be re-ordered so that main effects come first, 113 | followed by the interactions, all second-order, all third-order and so 114 | on: to avoid this pass a \code{terms} object as the formula (see 115 | \code{\link{aov}} and \code{demo(glm.vr)} for an example). 116 | 117 | A formula has an implied intercept term. To remove this use either 118 | \code{y ~ x - 1} or \code{y ~ 0 + x}. See \code{\link{formula}} for 119 | more details of allowed formulae. 
120 | 121 | Non-\code{NULL} \code{weights} can be used to indicate that different 122 | observations have different variances (with the values in 123 | \code{weights} being inversely proportional to the variances); or 124 | equivalently, when the elements of \code{weights} are positive 125 | integers \eqn{w_i}, that each response \eqn{y_i} is the mean of 126 | \eqn{w_i} unit-weight observations (including the case that there are 127 | \eqn{w_i} observations equal to \eqn{y_i} and the data have been 128 | summarized). 129 | 130 | \code{lm} calls the lower level functions \code{\link{lm.fit}}, etc, 131 | see below, for the actual numerical computations. For programming 132 | only, you may consider doing likewise. 133 | 134 | All of \code{weights}, \code{subset} and \code{offset} are evaluated 135 | in the same way as variables in \code{formula}, that is first in 136 | \code{data} and then in the environment of \code{formula}. 137 | } 138 | \value{ 139 | \code{lm} returns an object of \code{\link[base]{class}} \code{"lm"} or for 140 | multiple responses of class \code{c("mlm", "lm")}. 141 | 142 | The functions \code{summary} and \code{\link{anova}} are used to 143 | obtain and print a summary and analysis of variance table of the 144 | results. The generic accessor functions \code{coefficients}, 145 | \code{effects}, \code{fitted.values} and \code{residuals} extract 146 | various useful features of the value returned by \code{lm}. 
147 | 148 | An object of class \code{"lm"} is a list containing at least the 149 | following components: 150 | 151 | \item{coefficients}{a named vector of coefficients} 152 | \item{residuals}{the residuals, that is response minus fitted values.} 153 | \item{fitted.values}{the fitted mean values.} 154 | \item{rank}{the numeric rank of the fitted linear model.} 155 | \item{weights}{(only for weighted fits) the specified weights.} 156 | \item{df.residual}{the residual degrees of freedom.} 157 | \item{call}{the matched call.} 158 | \item{terms}{the \code{\link{terms}} object used.} 159 | \item{contrasts}{(only where relevant) the contrasts used.} 160 | \item{xlevels}{(only where relevant) a record of the levels of the 161 | factors used in fitting.} 162 | \item{offset}{the offset used (missing if none were used).} 163 | \item{y}{if requested, the response used.} 164 | \item{x}{if requested, the model matrix used.} 165 | \item{model}{if requested (the default), the model frame used.} 166 | \item{na.action}{(where relevant) information returned by 167 | \code{\link{model.frame}} on the special handling of \code{NA}s.} 168 | 169 | In addition, non-null fits will have components \code{assign}, 170 | \code{effects} and (unless not requested) \code{qr} relating to the linear 171 | fit, for use by extractor functions such as \code{summary} and 172 | \code{\link{effects}}. 173 | } 174 | \section{Using time series}{ 175 | Considerable care is needed when using \code{lm} with time series. 176 | 177 | Unless \code{na.action = NULL}, the time series attributes are 178 | stripped from the variables before the regression is done. (This is 179 | necessary as omitting \code{NA}s would invalidate the time series 180 | attributes, and if \code{NA}s are omitted in the middle of the series 181 | the result would no longer be a regular time series.) 
182 | 183 | Even if the time series attributes are retained, they are not used to 184 | line up series, so that the time shift of a lagged or differenced 185 | regressor would be ignored. It is good practice to prepare a 186 | \code{data} argument by \code{\link{ts.intersect}(\dots, dframe = TRUE)}, 187 | then apply a suitable \code{na.action} to that data frame and call 188 | \code{gpuLm} with \code{na.action = NULL} so that residuals and fitted 189 | values are time series. 190 | } 191 | \seealso{ 192 | \code{\link{summary.lm}} for summaries and \code{\link{anova.lm}} for 193 | the ANOVA table; \code{\link{aov}} for a different interface. 194 | 195 | The generic functions \code{\link{coef}}, \code{\link{effects}}, 196 | \code{\link{residuals}}, \code{\link{fitted}}, \code{\link{vcov}}. 197 | 198 | \code{\link{predict.lm}} (via \code{\link{predict}}) for prediction, 199 | including confidence and prediction intervals; 200 | \code{\link{confint}} for confidence intervals of \emph{parameters}. 201 | 202 | \code{\link{lm.influence}} for regression diagnostics, and 203 | \code{\link{glm}} for \bold{generalized} linear models. 204 | 205 | The underlying low level functions, 206 | \code{\link{lm.fit}} for plain, and \code{\link{lm.wfit}} for weighted 207 | regression fitting. 208 | 209 | More \code{lm()} examples are available e.g., in 210 | \code{\link[datasets]{anscombe}}, 211 | \code{\link[datasets]{attitude}}, 212 | \code{\link[datasets]{freeny}}, 213 | \code{\link[datasets]{LifeCycleSavings}}, 214 | \code{\link[datasets]{longley}}, 215 | \code{\link[datasets]{stackloss}}, 216 | \code{\link[datasets]{swiss}}. 217 | 218 | \code{biglm} in package \pkg{biglm} for an alternative 219 | way to fit linear models to large datasets (especially those with many 220 | cases). 221 | } 222 | 223 | \references{ 224 | Chambers, J. M. (1992) 225 | Linear models. 226 | Chapter 4 of Statistical Models in S 227 | eds J. M. Chambers and T. J. Hastie, Wadsworth & Brooks/Cole. 
228 | 229 | Wilkinson, G. N. and Rogers, C. E. (1973) 230 | Symbolic descriptions of factorial models for analysis of variance. 231 | Applied Statistics, 22, 392--9. 232 | } 233 | 234 | \author{ 235 | The design was inspired by the S function of the same name described 236 | in Chambers (1992). The implementation of model formula by Ross Ihaka 237 | was based on Wilkinson & Rogers (1973). 238 | 239 | This function was adapted for Nvidia's CUDA--supporting GPGPUs by 240 | Mark Seligman at Rapid Biologics LLC. 241 | \code{http://www.rapidbiologics.com} 242 | 243 | } 244 | 245 | \note{ 246 | Offsets specified by \code{offset} will not be included in predictions 247 | by \code{\link{predict.lm}}, whereas those specified by an offset term 248 | in the formula will be. 249 | } 250 | 251 | \examples{ 252 | # require(graphics) 253 | 254 | ## Annette Dobson (1990) "An Introduction to Generalized Linear Models". 255 | ## Page 9: Plant Weight Data. 256 | ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14) 257 | trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69) 258 | group <- gl(2,10,20, labels=c("Ctl","Trt")) 259 | weight <- c(ctl, trt) 260 | anova(lm.D9 <- gpuLm(weight ~ group)) 261 | summary(lm.D90 <- gpuLm(weight ~ group - 1))# omitting intercept 262 | summary(resid(lm.D9) - resid(lm.D90)) #- residuals almost identical 263 | 264 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0)) 265 | plot(lm.D9, las = 1) # Residuals, Fitted, ... 
266 | par(opar) 267 | 268 | ## model frame : 269 | stopifnot(identical(gpuLm(weight ~ group, method = "model.frame"), 270 | model.frame(lm.D9))) 271 | 272 | ### less simple examples in "See Also" above 273 | } 274 | 275 | \keyword{regression} 276 | 277 | -------------------------------------------------------------------------------- /man/gpuLm.defaultTol.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLm.defaultTol.Rd 2 | % part of the gputools R package 3 | % Distributed under GPL 2 or later 4 | 5 | \name{gpuLm.defaultTol} 6 | \title{Function to switch tolerance depending on precision} 7 | \usage{ 8 | gpuLm.defaultTol(useSingle = TRUE) 9 | } 10 | \alias{gpuLm.defaultTol} 11 | \description{ 12 | This function was written by Mark Seligman at Rapid Biologics, 13 | \code{http://rapidbiologics.com} 14 | 15 | The function \code{\link{gpuLm.fit}} calls this function 16 | to determine a default tolerance. So gpuLm.defaultTol should 17 | \emph{not} need to be used directly. 18 | } 19 | \arguments{ 20 | \item{useSingle}{logical. If TRUE, a tolerance will be returned appropriate 21 | for single precision arithmetic. 
If FALSE, a tolerance will be returned 22 | appropriate for double precision arithmetic.} 23 | } 24 | \value{ 25 | a floating point number representing a tolerance to be used by gpuLm.fit 26 | } 27 | \seealso{ 28 | \code{\link{gpuLm.fit}} \code{\link{gpuLm}} 29 | } 30 | -------------------------------------------------------------------------------- /man/gpuLm.fit.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLm.fit.Rd 2 | % part of the gputools R package 3 | % mostly copied from 4 | % File src/library/stats/man/lmfit.Rd 5 | % Part of the R package, http://www.R-project.org 6 | % Copyright 1995-2009 R Core Development Team 7 | % Distributed under GPL 2 or later 8 | 9 | \name{gpuLm.fit} 10 | \title{Fitter functions for gpu enabled linear models} 11 | \usage{ 12 | gpuLm.fit(x, y, w = NULL, offset = NULL, method = "qr", 13 | useSingle, tol = gpuLm.defaultTol(useSingle), singular.ok = TRUE, ...) 14 | } 15 | \alias{gpuLm.fit} 16 | \description{ 17 | The C code called by this function was written by 18 | Mark Seligman at Rapid Biologics, \code{http://rapidbiologics.com} 19 | 20 | The function \code{\link{gpuLm}} calls this function 21 | to fit linear models. So gpuLm.fit should \emph{not} need to be used 22 | directly. 23 | } 24 | \arguments{ 25 | \item{x}{design matrix of dimension \code{n * p}.} 26 | \item{y}{vector of observations of length \code{n}, or a matrix with 27 | \code{n} rows.} 28 | \item{w}{vector of weights (length \code{n}) to be used in the fitting 29 | process for the \code{wfit} functions. Weighted least squares is 30 | used with weights \code{w}, i.e., \code{sum(w * e^2)} is minimized.} 31 | \item{offset}{numeric of length \code{n}). This can be used to 32 | specify an \emph{a priori} known component to be included in the 33 | linear predictor during fitting.} 34 | 35 | \item{method}{currently, only \code{method="qr"} is supported.} 36 | 37 | \item{useSingle}{logical. 
If TRUE, the gpu will use single precision 38 | arithmetic. In the future, if FALSE the gpu may use double precision 39 | arithmetic, but this is not implemented yet.} 40 | 41 | % Avoid Matrix's grab of qr. 42 | \item{tol}{tolerance for the \code{\link[base]{qr}} decomposition. Default 43 | is 1e-7.} 44 | 45 | \item{singular.ok}{logical. If \code{FALSE}, a singular model is an 46 | error.} 47 | 48 | \item{\dots}{currently disregarded.} 49 | } 50 | \value{ 51 | %% S(-PLUS) returns an object of class "lm" 52 | %% such that print.lm, summary,... work; but that'd need more changes for R. 53 | a list with components 54 | \item{coefficients}{\code{p} vector} 55 | \item{residuals}{\code{n} vector or matrix} 56 | \item{fitted.values}{\code{n} vector or matrix} 57 | \item{effects}{(not null fits)\code{n} vector of orthogonal single-df 58 | effects. The first \code{rank} of them correspond to non-aliased 59 | coefficients, and are named accordingly.} 60 | \item{weights}{\code{n} vector --- \emph{only} for the \code{*wfit*} 61 | functions.} 62 | \item{rank}{integer, giving the rank} 63 | \item{df.residual}{degrees of freedom of residuals} 64 | \item{qr}{(not null fits) the QR decomposition, see \code{\link[base]{qr}}.} 65 | } 66 | \seealso{ 67 | \code{\link{gpuLm}} which should usually be used for linear least squares 68 | regression 69 | } 70 | \examples{ 71 | require(utils) 72 | set.seed(129) 73 | n <- 7 ; p <- 2 74 | X <- matrix(rnorm(n * p), n,p) # no intercept! 
75 | y <- rnorm(n) 76 | w <- rnorm(n)^2 77 | 78 | str(lmw <- gpuLm.fit(x=X, y=y, w=w)) 79 | } 80 | \keyword{regression} 81 | \keyword{array} 82 | -------------------------------------------------------------------------------- /man/gpuLsfit.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLsfit.Rd 2 | % Part of the gputools package 3 | % mostly copied from 4 | % File src/library/stats/man/lsfit.Rd 5 | % Part of the R package, http://www.R-project.org 6 | % Copyright 1995-2007 R Core Development Team 7 | % Distributed under GPL 2 or later 8 | 9 | \name{gpuLsfit} 10 | \alias{gpuLsfit} 11 | 12 | \title{Least squares fit using GPU--enabled QR decomposition} 13 | 14 | \usage{ 15 | gpuLsfit(x, y, 16 | wt = NULL, intercept = TRUE, useSingle = TRUE, 17 | tolerance=gpuLm.defaultTol(useSingle), 18 | yname = NULL) 19 | } 20 | 21 | \description{ 22 | The least squares estimate of \bold{\eqn{\beta}{b}} in the model 23 | \deqn{\bold{Y} = \bold{X \beta} + \bold{\epsilon}}{y = X b + e} 24 | is found. 25 | 26 | Most of this documentation is copied from R's documentation for 27 | \code{lsfit}. The function \code{gpuLsfit} performs a least--squares fit 28 | using a GPU enabled QR decomposition. 29 | 30 | Note: The QR decomposition employed by \code{gpuLm} is optimized for speed 31 | and uses minimal pivoting. If more precise pivoting is desired, then 32 | either the function \code{gpuQR} or, better still, \code{svd} should be used. 33 | } 34 | 35 | \arguments{ 36 | \item{x}{a matrix whose rows correspond to cases and whose columns 37 | correspond to variables.} 38 | \item{y}{the responses, possibly a matrix if you want to fit multiple 39 | left hand sides.} 40 | \item{wt}{an optional vector of weights for performing weighted least squares.} 41 | \item{intercept}{whether or not an intercept term should be used.} 42 | \item{useSingle}{whether to use single precision arithmetic on the gpu. 
Only the 'TRUE' option is implemented so far.} 43 | \item{tolerance}{the tolerance to be used in the matrix decomposition. 44 | This defaults to 1e-04 for single--precision GPU computation.} 45 | \item{yname}{names to be used for the response variables.} 46 | } 47 | 48 | \details{ 49 | If weights are specified then a weighted least squares is performed 50 | with the weight given to the \emph{j}th case specified by the \emph{j}th 51 | entry in \code{wt}. 52 | 53 | If any observation has a missing value in any field, that observation 54 | is removed before the analysis is carried out. 55 | This can be quite inefficient if there is a lot of missing data. 56 | 57 | The implementation is via a modification of the LINPACK subroutines 58 | which allow for multiple left-hand sides. 59 | } 60 | 61 | \value{ 62 | A list with the following named components: 63 | \item{coef}{the least squares estimates of the coefficients in 64 | the model (\bold{\eqn{\beta}{b}} as stated above).} 65 | \item{residuals}{residuals from the fit.} 66 | \item{intercept}{indicates whether an intercept was fitted.} 67 | \item{qr}{the QR decomposition of the design matrix.} 68 | } 69 | 70 | \author{ 71 | This function was adapted for Nvidia's CUDA--supporting GPGPUs 72 | by Mark Seligman at Rapid Biologics LLC. 73 | \code{http://www.rapidbiologics.com} 74 | } 75 | 76 | \references{ 77 | Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) 78 | \emph{The New S Language}. Wadsworth & Brooks/Cole. 79 | } 80 | \seealso{ 81 | \code{\link{lsfit}}, \code{\link{lm}}, 82 | \code{\link{ls.print}}, \code{\link{ls.diag}} 83 | } 84 | \examples{ 85 | \dontshow{utils::example("lm", echo = FALSE)} 86 | ##-- Using the same data as the lm(.) 
example: 87 | lsD9 <- gpuLsfit(x = unclass(gl(2,10)), y = weight) 88 | ls.print(lsD9) 89 | } 90 | \keyword{regression} 91 | -------------------------------------------------------------------------------- /man/gpuMatMult.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuMatMult} 2 | \alias{gpuMatMult} 3 | \title{Perform Matrix Multiplication with a GPU} 4 | 5 | \description{ 6 | Performs matrix multiplication using a GPU. This function is merely a 7 | wrapper for the CUBLAS cublasDgemm function. 8 | } 9 | 10 | \usage{ 11 | gpuMatMult(a, b) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a numeric matrix.} 16 | \item{b}{a numeric matrix.} 17 | } 18 | 19 | \value{ 20 | A numeric matrix. 21 | The matrix is just the product of arguments 'a' and 'b'. 22 | } 23 | 24 | \examples{ 25 | matA <- matrix(runif(2*3), 2, 3) 26 | matB <- matrix(runif(3*4), 3, 4) 27 | gpuMatMult(matA, matB) 28 | } 29 | 30 | \keyword{array} 31 | \keyword{algebra} 32 | -------------------------------------------------------------------------------- /man/gpuMi.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuMi} 2 | \alias{gpuMi} 3 | \title{B spline based mutual information} 4 | 5 | \description{ 6 | This function estimates the mutual information for permutations 7 | of pairs of columns of a matrix using a B spline approach on a 8 | GPU device. Please note, the data must be values from the interval 9 | [0.0, 1.0]. 10 | } 11 | 12 | \usage{ 13 | gpuMi(x, y = NULL, bins = 2, splineOrder = 1) 14 | } 15 | 16 | \arguments{ 17 | \item{x}{a matrix of floating point numbers from the interval [0.0, 1.0]. 18 | Each column represents a list of samples of a random variable. The 19 | mutual information between each column of x and each column of y will 20 | be computed. If y is NULL then each pair of columns of x will be 21 | compared.} 22 | \item{y}{a matrix of floating point numbers from the interval [0.0, 1.0]. 
23 | Each column represents a list of samples of a random variable. The 24 | mutual information between each column of x and each column of y will 25 | be computed. If y is NULL then each pair of columns of x will be 26 | compared.} 27 | \item{bins}{a single integer value representing the number of equal 28 | intervals that [0.0, 1.0] will be divided into in order to determine 29 | the bins in which to place each value of the columns of x and y. In 30 | the case of splineOrder = 1, this determines the histogram for 31 | traditional mutual information. For splineOrder > 1, a single value 32 | may be placed in multiple adjoining bins with varying weights on 33 | membership.} 34 | \item{splineOrder}{a single integer value giving the degree of the spline 35 | polynomials used to define both the number of bins a single value will 36 | be placed in and the weight of membership given to the value.} 37 | } 38 | 39 | \value{ 40 | a matrix of single precision floating point values of order ncol(y) by 41 | ncol(x). Entry $(i, j)$ of this matrix represents the mutual information 42 | calculation for $(y_i, x_j)$. 43 | } 44 | 45 | \references{ 46 | Carsten O. Daub, Ralf Steuer, Joachim Selbig, and Sebastian Kloska. 47 | 2004. Estimating mutual information using B-spline functions -- an 48 | improved similarity measure for analysing gene expression data. 49 | \emph{BMC Bioinformatics}. 5:118.
Available from 50 | \url{http://www.biomedcentral.com/1471-2105/5/118} 51 | } 52 | 53 | \examples{ 54 | # get 3 random variables each with 20 samples 55 | x <- matrix(runif(60), 20, 3) 56 | y <- matrix(runif(60), 20, 3) 57 | # do something interesting 58 | y[,2] <- 3.0 * (x[,1] + x[,3]) 59 | z <- gpuMi(x, y, bins = 10, splineOrder = 3) 60 | print(z) 61 | } 62 | -------------------------------------------------------------------------------- /man/gpuQr.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuQr} 2 | \alias{gpuQr} 3 | \title{Estimate the QR decomposition for a matrix} 4 | 5 | \description{ 6 | gpuQr estimates the QR decomposition for a matrix using 7 | column pivoting and Householder matrices. The work is done on a 8 | GPU. 9 | 10 | Note: a rank-revealing pivoting scheme is employed, potentially 11 | resulting in a pivot distinctly different from ordinary "qr". 12 | } 13 | 14 | \usage{ 15 | gpuQr(x, tol = 1e-07) 16 | } 17 | 18 | \arguments{ 19 | \item{x}{a matrix of floating point numbers. 20 | This is the matrix that will be decomposed into Q and R 21 | factors.} 22 | \item{tol}{a floating point value. It is used for estimating the rank 23 | of matrix x.} 24 | } 25 | 26 | \value{ 27 | an object of class 'qr'. This object has members qr, qraux, pivot, rank. 28 | It is meant to be identical to the output of R's base function 'qr'. 29 | From the documentation for R's 'qr' function: 30 | The attribute qr is a matrix with the same dimension as 'x'. 31 | The upper triangle contains the R of the QR decomposition. 32 | The lower triangle contains partial information to construct Q. 33 | The attribute qraux is a vector of length 'ncol(x)' that contains 34 | more information to construct Q. 35 | The attribute rank is a single integer representing an estimation 36 | of the rank of input matrix x based on the results of the 37 | QR decomposition.
In some cases, this rank can be wildly different from 38 | the actual rank of the matrix x and so is only an estimation. 39 | The attribute pivot contains the permutation applied to columns of x 40 | in the process of calculating the QR decomposition. 41 | } 42 | 43 | \author{ 44 | The low--level implementation of this function for Nvidia's 45 | CUDA--supporting GPGPUs was written 46 | by Mark Seligman at Rapid Biologics LLC. 47 | \code{http://www.rapidbiologics.com} 48 | } 49 | 50 | \references{ 51 | Bischof, C. B. and Van Loan, C. F. (1987) 52 | The WY Representation for Products of Householder Matrices 53 | SIAM J Sci. and Stat. Comp, 8, s2--s13. 54 | 55 | Bjorck, Ake (1996) Numerical methods for least squares 56 | problems. SIAM. 57 | 58 | Golub, Gene H. and Van Loan, C. F. (1996) 59 | Matrix Computations, Ed. 3, ch. 5. 60 | } 61 | 62 | \examples{ 63 | # get some random data of any shape at all 64 | x <- matrix(runif(25), 5, 5) 65 | qr <- gpuQr(x) 66 | print(qr) 67 | } 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /man/gpuSolve.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuSolve} 2 | \alias{gpuSolve} 3 | \title{Estimate the solution to a matrix vector equation} 4 | 5 | \description{ 6 | This function estimates the solution to an equation of the form x * b = y 7 | where x is a matrix, b is an unknown vector, and y is a known vector. It 8 | does much calculation on a GPU. If the y argument is omitted, the 9 | function returns the inverse of x. 10 | 11 | The function uses R's base 'qr' and then applies the gpu to the result 12 | to get the final solution. 13 | } 14 | 15 | \usage{ 16 | gpuSolve(x, y=NULL) 17 | } 18 | 19 | \arguments{ 20 | \item{x}{a matrix of floating point numbers.} 21 | \item{y}{a vector of floating point numbers of length nrow(x).} 22 | } 23 | 24 | \value{ 25 | a vector or matrix of floating point numbers. 
If y is not null, 26 | then the value is an estimate of the vector b of length ncol(x) 27 | where x * b = y. If y is null or omitted, the value is a matrix, 28 | an estimate of a matrix multiplicative pseudo inverse of x. 29 | } 30 | 31 | \examples{ 32 | x <- matrix(runif(100), 10, 10) 33 | y <- runif(10) 34 | b <- gpuSolve(x, y) 35 | cat("Solution:\n") 36 | print(b) 37 | x.inverse <- gpuSolve(x) 38 | cat("an estimate of a pseudo inverse for x:\n") 39 | print(x.inverse) 40 | } 41 | -------------------------------------------------------------------------------- /man/gpuTcrossprod.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuTcrossprod} 2 | \alias{gpuTcrossprod} 3 | \title{Perform Matrix Transposed Cross-product with a GPU} 4 | 5 | \description{ 6 | Performs transposed matrix cross-product using a GPU. This function is merely a 7 | couple of wrappers for the CUBLAS cublasSgemm function. 8 | } 9 | 10 | \usage{ 11 | gpuTcrossprod(a, b) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a matrix of floating point values.} 16 | \item{b}{a matrix of floating point values. If null, defaults to 17 | 'a'.} 18 | } 19 | 20 | \value{ 21 | A matrix of single precision floating point values. 22 | The matrix is the transposed cross-product of arguments 'a' 23 | and 'b', i.e., a * t(b). 24 | } 25 | 26 | \examples{ 27 | matA <- matrix(runif(2*3), 2, 3) 28 | matB <- matrix(runif(4*3), 4, 3) 29 | gpuTcrossprod(matA, matB) 30 | } 31 | 32 | \keyword{array} 33 | \keyword{algebra} 34 | -------------------------------------------------------------------------------- /man/gpuTtest.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuTtest} 2 | \alias{gpuTtest} 3 | \title{T-Test Estimator with a GPU} 4 | 5 | \description{ 6 | Given the number of samples and a Pearson correlation coefficient, this 7 | function estimates the t-score on a GPU.
If an entry in goodPairs is 8 | zero or one then you may get a NaN as the t-test result. 9 | } 10 | 11 | \usage{ 12 | gpuTtest(goodPairs, coeffs) 13 | } 14 | 15 | \arguments{ 16 | \item{goodPairs}{a vector of positive integer values. Value i 17 | represents the number of samples used to calculate the i-th 18 | value of the 'coeffs' argument.} 19 | \item{coeffs}{a vector of floating point values representing 20 | Pearson correlation coefficients.} 21 | } 22 | 23 | \value{a vector of single precision floating point values. The i-th entry 24 | is an estimate of the t-score of the i-th entry of the 'coeffs' argument. 25 | } 26 | 27 | \seealso{ 28 | \code{\link{gpuCor}}. 29 | } 30 | 31 | \examples{ 32 | goodPairs <- rpois(10, lambda=5) 33 | coeffs <- runif(10) 34 | gpuTtest(goodPairs, coeffs) 35 | } 36 | -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = @CUDA_CPPFLAGS@ 2 | PKG_LIBS = @LIBS@ -------------------------------------------------------------------------------- /src/correlation.h: -------------------------------------------------------------------------------- 1 | typedef enum useObs {everything, pairwiseComplete} UseObs; 2 | 3 | void pmcc(UseObs whichObs, const float * vectsa, size_t na, 4 | const float * vectsb, size_t nb, size_t dim, float * numPairs, 5 | float * correlations, float * signifs); 6 | 7 | void setDevice(int device); 8 | void getDevice(int * device); 9 | 10 | void getData(const int * images, 11 | const int * xcoords, const int * ycoords, const int * zcoords, 12 | const int * mins, const int * maxes, 13 | const float * evs, size_t numrows, size_t numimages, float * output); 14 | 15 | size_t parseResults(const int * imageList1, size_t numImages1, 16 | const int * imageList2, size_t numImages2, 17 | int structureid, 18 | double cutCorrelation, int cutPairs, 19 | const double * correlations, const 
double * signifs, const int * numPairs, 20 | double * results); 21 | 22 | int isSignificant(double signif, int df); 23 | void testSignif(const float * goodPairs, const float * coeffs, 24 | size_t n, float * tscores); 25 | 26 | void hostSignif(const float * goodPairs, const float * coeffs, 27 | size_t n, float * tscores); 28 | size_t signifFilter(const double * data, size_t rows, double * results); 29 | size_t gpuSignifFilter(const float * data, size_t rows, float * results); 30 | 31 | void cublasPMCC(const float * sampsa, size_t numSampsA, const float * sampsb, 32 | size_t numSampsB, size_t sampSize, float * res); 33 | 34 | double hostKendall(const float * X, const float * Y, size_t n); 35 | void permHostKendall(const float * a, size_t na, const float * b, size_t nb, 36 | size_t sampleSize, double * results); 37 | 38 | void masterKendall(const float * x, size_t nx, const float * y, size_t ny, 39 | size_t sampleSize, double * results); 40 | -------------------------------------------------------------------------------- /src/cudaUtils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "R.h" 7 | #include "nvrtc.h" 8 | #include "cuda.h" 9 | 10 | #include "cudaUtils.h" 11 | 12 | class CudaKernel { 13 | public: 14 | const char * name; 15 | const char * ptx; 16 | nvrtcProgram * prog; 17 | 18 | CudaKernel(const char * _name, const char * _ptx, nvrtcProgram * _prog) 19 | { 20 | name = _name; 21 | ptx = _ptx; 22 | prog = _prog; 23 | } 24 | }; 25 | 26 | std::map * cudaKernels; 27 | 28 | // Obtain compilation log from the program. 
29 | void printCompileLog(nvrtcProgram &prog) { 30 | size_t logSize; 31 | NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); 32 | char * log = new char[logSize]; 33 | NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); 34 | warning(log); 35 | delete[] log; 36 | } 37 | 38 | static 39 | std::vector & getFileKernels(std::string file) 40 | { 41 | std::vector * kernels; 42 | if (file == "correlation") { 43 | std::string newKernels[] = 44 | { "gpuSignif" 45 | , "gpuMeans" 46 | , "gpuSD" 47 | , "gpuPMCC" 48 | , "gpuMeansNoTest" 49 | , "gpuSDNoTest" 50 | , "gpuPMCCNoTest" 51 | , "dUpdateSignif" 52 | , "noNAsPmccMeans" 53 | }; 54 | kernels = new std::vector(newKernels, newKernels + 9); 55 | } else if (file == "distance") { 56 | std::string newKernels[] = 57 | { "euclidean_kernel_same" 58 | , "maximum_kernel_same" 59 | , "manhattan_kernel_same" 60 | , "canberra_kernel_same" 61 | , "binary_kernel_same" 62 | , "minkowski_kernel_same" 63 | }; 64 | kernels = new std::vector(newKernels, newKernels + 6); 65 | } else if (file == "granger") { 66 | std::string newKernels[] = 67 | { "getRestricted" 68 | , "getUnrestricted" 69 | , "ftest" 70 | , "getRestricted" 71 | , "getUnrestricted" 72 | , "ftest" 73 | }; 74 | kernels = new std::vector(newKernels, newKernels + 6); 75 | } else if (file == "hcluster") { 76 | std::string newKernels[] = 77 | { "complete_kernel" 78 | , "wpgma_kernel" 79 | , "average_kernel" 80 | , "median_kernel" 81 | , "centroid_kernel" 82 | , "flexible_group_kernel" 83 | , "flexible_kernel" 84 | , "ward_kernel" 85 | , "mcquitty_kernel" 86 | , "single_kernel" 87 | , "convert_kernel" 88 | , "find_min1_kernel" 89 | , "find_min2_kernel" 90 | }; 91 | kernels = new std::vector(newKernels, newKernels + 13); 92 | } else if (file == "kendall") { 93 | std::string newKernels[] = { "gpuKendall" }; 94 | kernels = new std::vector(newKernels, newKernels + 1); 95 | } else if (file == "mi") { 96 | std::string newKernels[] = 97 | { "scale" 98 | , "get_bin_scores" 99 | , "get_entropy" 
100 | , "get_mi" 101 | }; 102 | kernels = new std::vector(newKernels, newKernels + 4); 103 | } else if (file == "qrdecomp") { 104 | std::string newKernels[] = 105 | { "getColNorms" 106 | , "gpuFindMax" 107 | , "gpuSwapCol" 108 | , "makeHVector" 109 | , "UpdateHHNorms" 110 | }; 111 | kernels = new std::vector(newKernels, newKernels + 5); 112 | } else { 113 | kernels = new std::vector(); 114 | } 115 | return(*kernels); 116 | } 117 | 118 | extern "C" 119 | void cuCompile(const int * numFiles, 120 | const char ** cuFilenames, 121 | const char ** cuSrc) 122 | { 123 | cudaKernels = new std::map(); 124 | CUDA_SAFE_CALL(cuInit(0)); 125 | 126 | for (int i = 0; i < *numFiles; ++i) { 127 | std::string file = cuFilenames[i]; 128 | const char * src = cuSrc[i]; 129 | 130 | nvrtcProgram * prog = new nvrtcProgram(); 131 | NVRTC_SAFE_CALL( 132 | nvrtcCreateProgram(prog, // prog 133 | src, // buffer 134 | file.c_str(), // name 135 | 0, // numHeaders 136 | NULL, // headers 137 | NULL)); // includeNames 138 | 139 | std::vector kernels = getFileKernels(file); 140 | for(int i = 0; i < kernels.size(); ++i) { 141 | NVRTC_SAFE_CALL(nvrtcAddNameExpression(*prog, kernels[i].c_str())); 142 | } 143 | 144 | const char * options[] = { "--use_fast_math" }; 145 | 146 | nvrtcResult compileResult = nvrtcCompileProgram(*prog, 1, options); 147 | if (compileResult != NVRTC_SUCCESS) { 148 | printCompileLog(*prog); 149 | error("\ncuda kernel compile failed"); 150 | } 151 | 152 | // Obtain PTX from the program. 
153 | size_t ptxSize; 154 | NVRTC_SAFE_CALL(nvrtcGetPTXSize(*prog, &ptxSize)); 155 | 156 | char * ptx = Calloc(ptxSize, char); 157 | NVRTC_SAFE_CALL(nvrtcGetPTX(*prog, ptx)); 158 | 159 | for(int i = 0; i < kernels.size(); ++i) { 160 | const char * name; 161 | NVRTC_SAFE_CALL(nvrtcGetLoweredName(*prog, kernels[i].c_str(), &name)); 162 | (*cudaKernels)[kernels[i]] = new CudaKernel(name, ptx, prog); 163 | } 164 | } 165 | } 166 | 167 | extern "C" 168 | void unloadPackage() 169 | { 170 | std::vector ptxs; 171 | std::vector progs; 172 | 173 | std::map::iterator iter; 174 | for (iter = cudaKernels->begin(); iter != cudaKernels->end(); ++iter) { 175 | ptxs.push_back(iter->second->ptx); 176 | progs.push_back(iter->second->prog); 177 | delete iter->second; 178 | } 179 | 180 | delete cudaKernels; 181 | 182 | std::sort(ptxs.begin(), ptxs.end()); 183 | std::unique(ptxs.begin(), ptxs.end()); 184 | std::vector::iterator ptx_i; 185 | for (ptx_i = ptxs.begin(); ptx_i != ptxs.end(); ++ptx_i) { 186 | Free(*ptx_i); 187 | } 188 | 189 | std::sort(progs.begin(), progs.end()); 190 | std::unique(progs.begin(), progs.end()); 191 | std::vector::iterator prog_i; 192 | for (prog_i = progs.begin(); prog_i != progs.end(); ++prog_i) { 193 | NVRTC_SAFE_CALL(nvrtcDestroyProgram(*prog_i)); 194 | } 195 | } 196 | 197 | void cudaLaunch(std::string kernelName, 198 | void * args[], 199 | const dim3 &gridDim, const dim3 &blockDim, 200 | cudaStream_t stream) 201 | { 202 | const CudaKernel * cudaKernel = (*cudaKernels)[kernelName]; 203 | 204 | CUmodule module; 205 | CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, cudaKernel->ptx, 0, 0, 0)); 206 | 207 | CUfunction kernel; 208 | CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, cudaKernel->name)); 209 | 210 | CUDA_SAFE_CALL( 211 | cuLaunchKernel(kernel, 212 | gridDim.x, gridDim.y, gridDim.z, // grid dim 213 | blockDim.x, blockDim.y, blockDim.z, // block dim 214 | 0, stream, // shared mem and stream 215 | args, 0)); // arguments 216 | // 
CUDA_SAFE_CALL(cuCtxSynchronize()); 217 | CUDA_SAFE_CALL(cuModuleUnload(module)); 218 | } 219 | -------------------------------------------------------------------------------- /src/cudaUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef _CUDAUTILS_H_ 2 | #define _CUDAUTILS_H_ 3 | 4 | #include 5 | 6 | #include "R.h" 7 | #include "nvrtc.h" 8 | #include "cuda_runtime_api.h" 9 | 10 | #define NVRTC_SAFE_CALL(x) \ 11 | do { \ 12 | nvrtcResult result = x; \ 13 | if (result != NVRTC_SUCCESS) { \ 14 | error("\nerror: %d failed with error %s\n", x, \ 15 | nvrtcGetErrorString(result)); \ 16 | } \ 17 | } while(0) 18 | 19 | #define CUDA_SAFE_CALL(x) \ 20 | do { \ 21 | CUresult result = x; \ 22 | if (result != CUDA_SUCCESS) { \ 23 | const char *msg; \ 24 | cuGetErrorName(result, &msg); \ 25 | error("\nerror: %d failed with error %s\n", x, msg); \ 26 | } \ 27 | } while(0) 28 | 29 | void cudaLaunch(std::string kernelName, 30 | void * args[], 31 | const dim3 &gridDim, const dim3 &blockDim, 32 | cudaStream_t stream = NULL); 33 | 34 | #endif /* _CUDAUTILS_H_ */ 35 | -------------------------------------------------------------------------------- /src/cuseful.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include"cuseful.h" 10 | 11 | #include 12 | 13 | #define HALF RAND_MAX/2 14 | 15 | void fatal(const char * msg) 16 | { 17 | error(msg); 18 | } 19 | 20 | void getComputeNumber(int * major, int * minor) 21 | { 22 | int currentDevice = 0; 23 | struct cudaDeviceProp dProps; 24 | 25 | cudaGetDevice(¤tDevice); 26 | cudaGetDeviceProperties(&dProps, currentDevice); 27 | 28 | *major = dProps.major; 29 | *minor = dProps.minor; 30 | } 31 | 32 | void checkDoubleCapable(const char * failMsg) 33 | { 34 | int major, minor; 35 | major = minor = 0; 36 | getComputeNumber(&major, &minor); 37 | if((major < 1) || 
((major == 1) && (minor < 3))) 38 | error(failMsg); 39 | } 40 | 41 | float * getMatFromFile(int rows, int cols, const char * fn) 42 | { 43 | FILE * matFile; 44 | matFile = fopen(fn, "r"); 45 | if(matFile == NULL) 46 | error("unable to open file %s", fn); 47 | float * mat = Calloc(rows*cols, float); 48 | int i, j, err; 49 | for(i = 0; i < rows; i++) { 50 | for(j = 0; j < cols; j++) { 51 | err = fscanf(matFile, " %f ", mat+i+j*rows); 52 | if(err == EOF) 53 | error("file %s incorrect: formatting or size", fn); 54 | } 55 | fscanf(matFile, " \n "); 56 | } 57 | fclose(matFile); 58 | return mat; 59 | } 60 | 61 | char * getTime() { 62 | time_t curtime; 63 | struct tm *loctime; 64 | curtime = time(NULL); 65 | loctime = localtime(&curtime); 66 | 67 | return asctime(loctime); 68 | } 69 | 70 | void printVect(int n, const float * vect, const char * msg) { 71 | if(msg != NULL) Rprintf(msg); 72 | for(int i = 0; i < n; i++) { 73 | Rprintf("%6.4f, ", vect[i]); 74 | if((i+1)%10 == 0) Rprintf("\n"); 75 | } 76 | if(n%10 != 0) Rprintf("\n"); 77 | if(msg != NULL) Rprintf("----------\n"); 78 | } 79 | 80 | void printMat(int rows, int cols, const float * mat, const char * msg) { 81 | int i; 82 | if(msg != NULL) Rprintf(msg); 83 | for(i = 0; i < rows; i++) 84 | printVect(cols, mat+i*cols, NULL); 85 | if(msg != NULL) Rprintf("----------\n"); 86 | } 87 | 88 | int hasCudaError(const char * msg) { 89 | cudaError_t err = cudaGetLastError(); 90 | if(cudaSuccess != err) 91 | error("cuda error : %s : %s\n", msg, cudaGetErrorString(err)); 92 | return 0; 93 | } 94 | 95 | void checkCudaError(const char * msg) { 96 | cudaError_t err = cudaGetLastError(); 97 | if(cudaSuccess != err) { 98 | if(msg != NULL) 99 | warning(msg); 100 | error(cudaGetErrorString(err)); 101 | } 102 | } 103 | 104 | std::string cublasGetErrorString(cublasStatus err) 105 | { 106 | switch(err) { 107 | case CUBLAS_STATUS_SUCCESS : 108 | return "operation completed successfully"; 109 | case CUBLAS_STATUS_NOT_INITIALIZED : 110 | return 
"CUBLAS library not initialized"; 111 | case CUBLAS_STATUS_ALLOC_FAILED : 112 | return "resource allocation failed"; 113 | case CUBLAS_STATUS_INVALID_VALUE : 114 | return "unsupported numerical value was passed to function"; 115 | case CUBLAS_STATUS_ARCH_MISMATCH : 116 | return "function requires an architectural feature absent from \ 117 | the architecture of the device"; 118 | case CUBLAS_STATUS_MAPPING_ERROR : 119 | return "access to GPU memory space failed"; 120 | case CUBLAS_STATUS_EXECUTION_FAILED : 121 | return "GPU program failed to execute"; 122 | case CUBLAS_STATUS_INTERNAL_ERROR : 123 | return "an internal CUBLAS operation failed"; 124 | default : 125 | return "unknown error type"; 126 | } 127 | } 128 | 129 | void checkCublasError(const char * msg) 130 | { 131 | cublasStatus err = cublasGetError(); 132 | if(err != CUBLAS_STATUS_SUCCESS) 133 | error("cublas error : %s : %s\n", msg, cublasGetErrorString(err).c_str()); 134 | } 135 | 136 | int hasCublasError(const char * msg) 137 | { 138 | cublasStatus err = cublasGetError(); 139 | if(err != CUBLAS_STATUS_SUCCESS) 140 | error("cublas error : %s : %s\n", msg, cublasGetErrorString(err).c_str()); 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /src/cuseful.h: -------------------------------------------------------------------------------- 1 | void fatal(const char * msg); 2 | void getComputeNumber(int * major, int * minor); 3 | void checkDoubleCapable(const char * failMsg); 4 | char * getTime(); 5 | void printVect(int n, const float * vect, const char * msg); 6 | void printMat(int rows, int cols, const float * mat, const char * msg); 7 | void checkCudaError(const char * msg); 8 | int hasCudaError(const char * msg); 9 | float * getMatFromFile(int rows, int cols, const char * fn); 10 | void checkCublasError(const char * msg); 11 | int hasCublasError(const char * msg); 12 | 
-------------------------------------------------------------------------------- /src/distance.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "R.h" 6 | #include "Rinternals.h" 7 | 8 | #include "cuda_runtime_api.h" 9 | 10 | #include "cuseful.h" 11 | #include "distance.h" 12 | 13 | #include "cudaUtils.h" 14 | 15 | #define NUM_THREADS 32 16 | 17 | static void euclidean(const float * vg_a, size_t n_a, const float * vg_b, 18 | size_t n_b, size_t dim, float * d) 19 | { 20 | float 21 | sum, component; 22 | 23 | for(size_t y = 0; y < n_b; y++) { 24 | for(size_t x = 0; x < n_a; x++) { 25 | sum = 0.f; 26 | for(size_t i = 0; i < dim; i++) { 27 | component = vg_a[x * dim + i] - vg_b[y * dim + i]; 28 | sum += component * component; 29 | } 30 | d[y * dim + x] = sqrtf(sum); 31 | } 32 | } 33 | } 34 | 35 | static void maximum(const float * vg_a, size_t n_a, const float * vg_b, 36 | size_t n_b, size_t dim, float * d) 37 | { 38 | float 39 | themax, previous, current; 40 | 41 | for(size_t y = 0; y < n_b; y++) { 42 | for(size_t x = 0; x < n_a; x++) { 43 | previous = 0.f; 44 | for(size_t i = 0; i < dim; i++) { 45 | current = fabsf(vg_a[x * dim + i] - vg_b[y * dim + i]); 46 | themax = (previous < current)? 
current : previous; 47 | previous = themax; 48 | } 49 | d[y * dim + x] = themax; 50 | } 51 | } 52 | } 53 | 54 | static void manhattan(const float * vg_a, size_t n_a, const float * vg_b, 55 | size_t n_b, size_t dim, float * d) 56 | { 57 | float sum; 58 | 59 | for(size_t y = 0; y < n_b; y++) { 60 | for(size_t x = 0; x < n_a; x++) { 61 | sum = 0.f; 62 | for(size_t i = 0; i < dim; i++) 63 | sum += fabsf(vg_a[x * dim + i] - vg_b[y * dim + i]); 64 | d[y * dim + x] = sum; 65 | } 66 | } 67 | } 68 | 69 | static void canberra(const float * vg_a, size_t n_a, const float * vg_b, 70 | size_t n_b, size_t dim, float * d) 71 | { 72 | float 73 | componentDiff, componentSum, 74 | acoord, bcoord, 75 | sum; 76 | 77 | for(size_t y = 0; y < n_b; y++) { 78 | for(size_t x = 0; x < n_a; x++) { 79 | sum = 0.f; 80 | for(size_t i = 0; i < dim; i++) { 81 | acoord = vg_a[x * dim + i]; 82 | bcoord = vg_b[y * dim + i]; 83 | 84 | componentDiff = fabsf(acoord - bcoord); 85 | componentSum = fabsf(acoord + bcoord); 86 | 87 | if(componentSum != 0.f) 88 | sum += componentDiff / componentSum; 89 | } 90 | d[y * dim + x] = sum; 91 | } 92 | } 93 | } 94 | 95 | static void binary(const float * vg_a, size_t n_a, const float * vg_b, 96 | size_t n_b, size_t dim, float * d) 97 | { 98 | int 99 | acoord, bcoord, 100 | sharedOnes, ones; 101 | float ratio; 102 | 103 | for(size_t y = 0; y < n_b; ++y) { 104 | for(size_t x = 0; x < n_a; ++x) { 105 | ones = sharedOnes = 0; 106 | for(size_t i = 0; i < dim; ++i) { 107 | acoord = (vg_a[x * dim + i] != 0.f); 108 | bcoord = (vg_b[y * dim + i] != 0.f); 109 | 110 | if(acoord ^ bcoord) sharedOnes += 1; 111 | if(acoord || bcoord) ones += 1; 112 | } 113 | ratio = (ones != 0)? 
((float)sharedOnes / (float)ones) 114 | : (float)sharedOnes; 115 | 116 | d[y * dim + x] = ratio; 117 | } 118 | } 119 | } 120 | 121 | static void minkowski(const float * vg_a, size_t n_a, const float * vg_b, 122 | size_t n_b, size_t dim, float p, float * d) 123 | { 124 | float 125 | component, sum; 126 | 127 | for(size_t y = 0; y < n_b; y++) { 128 | for(size_t x = 0; x < n_a; x++) { 129 | sum = 0.f; 130 | for(size_t i = 0; i < dim; i++) { 131 | component = fabsf(vg_a[x * dim + i] - vg_b[y * dim + i]); 132 | sum += powf(component, p); 133 | } 134 | d[y * dim + x] = powf(sum, (float)(1.f / p)); 135 | } 136 | } 137 | } 138 | 139 | /* 140 | static void dot(const float * vg_a, size_t pitch_a, size_t n_a, 141 | const float * vg_b, size_t pitch_b, size_t n_b, 142 | size_t k, 143 | float * d, size_t pitch_d) 144 | { 145 | // Two different vectors 146 | if(vg_a != vg_b) { 147 | for(size_t y = 0; y < n_b; ++y) { 148 | for(size_t x = 0; x < n_a; ++x) { 149 | float s = 0.0; 150 | for(size_t i = 0; i < k; ++i) { 151 | float t = vg_a[x * pitch_a + i] * vg_b[y * pitch_b + i]; 152 | s += t; 153 | } 154 | d[y * pitch_d + x] = s; 155 | } 156 | } 157 | } else { 158 | // Compute 159 | for(size_t y = 1; y < n_b; ++y) { 160 | for(size_t x = 0; x <= y; ++x) { 161 | float s = 0.0; 162 | for(size_t i = 0; i < k; ++i) { 163 | float t = vg_a[x * pitch_a + i] * vg_b[y * pitch_b + i]; 164 | s += t; 165 | } 166 | d[y * pitch_d + x] = s; 167 | d[x * pitch_d + y] = s; 168 | } 169 | } 170 | } 171 | } 172 | */ 173 | 174 | void distance_host(const float * vg_a, size_t pitch_a, size_t n_a, 175 | const float * vg_b, size_t pitch_b, size_t n_b, 176 | size_t k, float * d, size_t pitch_d, 177 | dist_method method, float p) 178 | { 179 | switch(method) { 180 | case EUCLIDEAN: 181 | euclidean(vg_a, n_a, vg_b, n_b, k, d); 182 | break; 183 | case MAXIMUM: 184 | maximum(vg_a, n_a, vg_b, n_b, k, d); 185 | break; 186 | case MANHATTAN: 187 | manhattan(vg_a, n_a, vg_b, n_b, k, d); 188 | break; 189 | case CANBERRA: 
            // (continuation of distance_host's switch from the previous chunk)
            canberra(vg_a, n_a, vg_b, n_b, k, d);
            break;
        case BINARY:
            binary(vg_a, n_a, vg_b, n_b, k, d);
            break;
        case MINKOWSKI:
            minkowski(vg_a, n_a, vg_b, n_b, k, p, d);
            break;
        default:
            error("unknown distance method");
        /* case DOT:
            dot(vg_a, pitch_a / sizeof(float), n_a,
                vg_b, pitch_b / sizeof(float), n_b,
                k,
                d, pitch_d / sizeof(float));
            break; */
    }
}

// Launches the per-method "<name>_kernel_same" CUDA kernel on data already
// resident on the device.  The grid is n_a x n_b blocks of NUM_THREADS
// threads; all pitches are converted from bytes to float elements before
// being handed to the kernel as arguments.
void distance_device(const float * vg_a_d, size_t pitch_a, size_t n_a,
    const float * vg_b_d, size_t pitch_b, size_t n_b,
    size_t k,
    float * d_d, size_t pitch_d,
    dist_method method, float p)
{
    dim3 block(NUM_THREADS, 1, 1);
    dim3 grid(n_a, n_b, 1);

    size_t fbytes = sizeof(float);

    // Kernels index by element, so convert byte pitches to float strides.
    pitch_a /= fbytes;
    pitch_b /= fbytes;
    pitch_d /= fbytes;

    void * args[] = {
        &vg_a_d, &pitch_a, &n_a,
        &vg_b_d, &pitch_b, &n_b,
        &k,
        &d_d, &pitch_d, &p
    };

    std::string kernelName;

    switch(method) { // Calculate the distance
        case EUCLIDEAN:
            kernelName = "euclidean_kernel_same";
            break;
        case MAXIMUM:
            kernelName = "maximum_kernel_same";
            break;
        case MANHATTAN:
            kernelName = "manhattan_kernel_same";
            break;
        case CANBERRA:
            kernelName = "canberra_kernel_same";
            break;
        case BINARY:
            kernelName = "binary_kernel_same";
            break;
        case MINKOWSKI:
            kernelName = "minkowski_kernel_same";
            break;
        default:
            kernelName = "";
            error("unknown distance method");
    }
    cudaLaunch(kernelName, args, grid, block);
}

// Host entry point: copies the host-side vectors to pitched device storage,
// runs distance_device(), and copies the n_a x n_b result back to d.
// When vg_a and vg_b alias the same host buffer, vg_b is not copied a second
// time and the device copy of vg_a is reused for both arguments.
// NOTE(review): error paths inside distance_device/cudaLaunch do not free
// the device allocations made here -- confirm checkCudaError's behavior.
// (The function body continues in the next chunk of this file.)
void distance(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method, float p)
{
    size_t
        pitch_a_d, pitch_b_d, pitch_d_d;
    int same = (vg_a == vg_b); // are the two sets of vectors the same?

    // Space for the vector data
    float * distance_vg_a_d;
    float * distance_vg_b_d;

    // Space for the resulting distance
    float * distance_d_d;

    // Allocate space for the vectors and distances on the gpu
    cudaMallocPitch((void**)&distance_vg_a_d, &pitch_a_d, k * sizeof(float),
        n_a);
    cudaMemcpy2D(distance_vg_a_d, pitch_a_d, vg_a, pitch_a, k * sizeof(float),
        n_a, cudaMemcpyHostToDevice);
    cudaMallocPitch((void**)&distance_d_d, &pitch_d_d, n_a * sizeof(float),
        n_b);

    checkCudaError("distance function : malloc and memcpy");

    if(same) // don't need to move vg_b to gpu
        distance_device(distance_vg_a_d, pitch_a_d, n_a,
            distance_vg_a_d, pitch_a_d, n_a,
            k,
            distance_d_d, pitch_d_d,
            method, p);
    else { // vg_b is a different set of pnts so store it on gpu too
        cudaMallocPitch((void**)&distance_vg_b_d, &pitch_b_d,
            k * sizeof(float), n_b);
        cudaMemcpy2D(distance_vg_b_d, pitch_b_d, vg_b, pitch_b,
            k * sizeof(float), n_b, cudaMemcpyHostToDevice);

        checkCudaError("distance function : malloc and memcpy");

        distance_device(distance_vg_a_d, pitch_a_d, n_a,
            distance_vg_b_d, pitch_b_d, n_b,
            k,
            distance_d_d, pitch_d_d,
            method, p);
        cudaFree(distance_vg_b_d);
    }
    checkCudaError("distance function : kernel invocation");
    // Copy the result back to cpu land now that gpu work is done
    cudaMemcpy2D(d, pitch_d, distance_d_d, pitch_d_d, n_a * sizeof(float),
        n_b, cudaMemcpyDeviceToHost);
    checkCudaError("distance function : memcpy");

    // Free allocated space
    cudaFree(distance_vg_a_d);
    cudaFree(distance_d_d);
}

// Computes the all-pairs distance matrix of one point set and LEAVES the
// result on the GPU: *gpuDistances / *pitchDistances receive the device
// allocation, which the CALLER must cudaFree.
// (Signature continues in the next chunk of this file.)
void distanceLeaveOnGpu(dist_method method, float p, const float * points,
    size_t dim, size_t numPoints,
    float ** gpuDistances, size_t *
    pitchDistances) // outputs
{
    size_t pitchPoints;
    float * dPoints;

    // prepare the vectors and distance storage on the gpu
    cudaMallocPitch((void**)&dPoints,
        &pitchPoints, dim * sizeof(float), numPoints);
    cudaMemcpy2D(dPoints, pitchPoints, points,
        dim * sizeof(float), dim * sizeof(float), numPoints,
        cudaMemcpyHostToDevice);
    // *gpuDistances is allocated here but intentionally NOT freed: the
    // caller owns it (see hclusterPreparedDistances-style consumers).
    cudaMallocPitch((void**)gpuDistances, pitchDistances,
        numPoints * sizeof(float), numPoints);
    checkCudaError("distance on gpu func : malloc and memcpy");

    // Self-distance: the same device buffer is passed for both point sets.
    distance_device(dPoints, pitchPoints, numPoints,
        dPoints, pitchPoints, numPoints,
        dim,
        *gpuDistances, *pitchDistances,
        method, p);
    checkCudaError("distance on gpu func : kernel invocation");

    // clean up resources
    cudaFree(dPoints); // be kind rewind
}
--------------------------------------------------------------------------------
/src/distance.h:
--------------------------------------------------------------------------------
#ifndef DISTANCE_H
#define DISTANCE_H

// Methods for computing the distance
typedef enum {
    EUCLIDEAN, MAXIMUM, MANHATTAN, CANBERRA,
    BINARY, MINKOWSKI // ,DOT
} dist_method;

/* Calculate the distance matrix for vectors in group a and vectors in group b.
 *
 * The format for vg_a and vg_b is as follows:
 * There are n_* vectors, each of dimensionality k. They are stored in
 * row major order with a row (or pitch) being pitch_* bytes.
 * The calculated distances are stored in d such that the distance between
 * vectors indexed a and b is located in d[b * pitch_d / sizeof(float) + a].
 * The user is responsible for
 * allocating storage for d. It should be at least n_a * n_b * sizeof(float)
 * bytes. The pitch_d argument is the same as for vg_*.
 * The method used to calculate the distance is dist_method.
 *
 * The argument p is optional and used for the Minkowski method.
 *
 * This function may run on the CPU or GPU depending on the size and
 * number of vectors. Good data alignment will increase performance.
 */
void distance(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method,
    float p);

/* This function should be used to calculate a distance matrix when
 * the data is already stored on the GPU. vg_a, vg_b, and
 * d should already be allocated and initialized on the device.
 * This function does not allocate or free any storage on the device.
 */
void distance_device(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method,
    float p);

/* This function is analogous to distance_device except that it
 * will run on a CPU. Storage for d is not allocated in this function.
 */
void distance_host(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method,
    float p = 2.0);

void distanceLeaveOnGpu(dist_method method, float p, const float * points,
    size_t dim, size_t numPoints, float ** gpuDistances,
    size_t * pitchDistances);


#endif // DISTANCE_H
--------------------------------------------------------------------------------
/src/granger.cpp:
--------------------------------------------------------------------------------
// NOTE(review): the four system-include targets below were stripped by the
// text dump (bare "#include" lines) -- restore from the upstream file.
#include
#include
#include
#include

#include "Rmath.h"
#include "cuda_runtime_api.h"

#include "cuseful.h"
#include "cudaUtils.h"
#include "granger.h"

// NOTE(review): a lower-case function-like `max` macro is risky next to
// standard headers (may collide with std::max uses); kept as-is here.
#define max(a, b) ((a > b)?a:b)

#define THREADSPERDIM 16

#define FALSE 0
#define TRUE !FALSE

// Converts the rows x cols matrix of F statistics (column-major) into
// p-values in place-parallel storage, using R's pf() with df1 = p and
// df2 = embedRows - 2*p - 1.
void getPValues(int rows, int cols, const float * fStats, int p, int embedRows,
    float * pValues)
{
    float fscore = 0.f;

    for(int i = 0; i < rows; i++) {
        for(int j = 0; j < cols; j++) {
            fscore = fStats[i + j * rows];
            pValues[i + j * rows] = 1.f - (float) pf((double) fscore,
                (double) p, (double)embedRows - 2.0 * (double) p - 1.0,
                1, 0);
        }
    }
}

// All-pairs Granger causality test with lag p over the cols columns of y
// (each column has `rows` observations).  Writes cols x cols F statistics
// and p-values.  Body continues in the next chunk of this file.
// NOTE(review): early-return error paths below leak the device allocations
// made before the failure -- flagged for upstream cleanup.
void granger(int rows, int cols, const float * y, int p,
    float * fStats, float * pValues)
{
    if(cols < 2) {
        fatal("The Granger test needs at least 2 variables.\n");
        return;
    }
    int
        i, j, k, t = p+1,
        fbytes = sizeof(float),
        embedRows = rows-p, embedCols = t*2;
    float
        * Y, * rQ, * rR,
        * unrQ, * unrR,
        * restricted, * unrestricted,
        * rdata, * unrdata,
        * dfStats; // * dpValues;
    size_t
        size = cols*cols*fbytes, partSize = embedRows*size;

    cudaMalloc((void **)&Y, embedCols*partSize);

    cudaMalloc((void **)&rQ,
        t*embedRows*cols*fbytes);
    cudaMalloc((void **)&rR, t*t*cols*fbytes);
    cudaMalloc((void **)&rdata, t*embedRows*cols*fbytes);
    cudaMalloc((void **)&unrdata, (embedCols-1)*partSize);
    cudaMalloc((void **)&restricted, t*cols*fbytes);
    if( hasCudaError("granger: line 267: gpu memory allocation") ) return;

    int
        Ydim = embedCols * embedRows,
        rQdim = t * embedRows, rRdim = t * t,
        rdataDim = t*embedRows, restrictedDim = t,
        unrQdim = (embedCols-1) * embedRows,
        unrRdim = (embedCols-1) * (embedCols-1),
        unrestrictedDim = embedCols-1, unrdataDim = (embedCols-1)*embedRows;
    float
        * ypos, * rdataPos, * unrdataPos,
        * evenCols;
    int
        skip = 2*embedRows, colBytes = embedRows*fbytes;
    const float
        * vectA, * vectB;

    // Build the lag-embedded design matrices on the device, one (i, j)
    // column pair at a time: Y holds t interleaved lagged copies of columns
    // i and j; unrdata gets the unrestricted design, rdata the restricted.
    for(i = 0; i < cols; i++) {
        rdataPos = rdata+i*rdataDim;
        evenCols = rdataPos+embedRows;
        vectA = y+i*rows;
        for(j = 0; j < cols; j++) {
            if(i == j) continue; // no self-pairing in the all-pairs test

            ypos = Y+(i*cols+j)*Ydim;
            unrdataPos = unrdata+(i*cols+j)*unrdataDim;

            vectB = y+j*rows;

            for(k = 0; k < p+1; k++) { // produce t subcols
                cudaMemcpy(ypos+k*skip, vectA+(p-k), embedRows*fbytes,
                    cudaMemcpyHostToDevice);
                cudaMemcpy(ypos+k*skip+embedRows, vectB+(p-k),
                    embedRows*fbytes, cudaMemcpyHostToDevice);
            }
            cudaMemcpy(unrdataPos+embedRows, ypos+skip,
                (embedCols-2)*embedRows*fbytes, cudaMemcpyDeviceToDevice);
        }
        // build restricted data from last set of unrestricted data
        // only need one per column, not one for each pairing
        for(k = 0; k < embedCols-2; k+=2) {
            cudaMemcpy(evenCols+(k*embedRows)/2, unrdataPos+(1+k)*embedRows,
                colBytes, cudaMemcpyDeviceToDevice);
        }
    }
    if( hasCudaError("granger : mem copy from host to device") ) return;

    // One thread block dimension per THREADSPERDIM columns, rounded up.
    int numBlocks = cols / THREADSPERDIM;
    if(numBlocks * THREADSPERDIM < cols) numBlocks++;

    dim3
        dimRGrid(numBlocks),
        dimRBlock(THREADSPERDIM),
        dimUnrGrid(numBlocks, numBlocks),
        dimUnrBlock(THREADSPERDIM, THREADSPERDIM);

    void * restArgs[] = {
        &cols, &cols,
        &embedRows,
        &t,
        &rdata,
        &rdataDim,
        &Y,
        &Ydim,
        &rQ,
        &rQdim,
        &rR,
        &rRdim,
        &restricted,
        &restrictedDim
    };
    // Fit the restricted (own-lags-only) models via QR on the device.
    cudaLaunch("getRestricted", restArgs, dimRGrid, dimRBlock);
    if( hasCudaError("granger: getRestricted kernel execution") ) return;

    // QR workspaces for the restricted fit are no longer needed.
    cudaFree(rQ);
    cudaFree(rR);

    cudaMalloc((void **)&unrQ, (embedCols-1)*partSize);
    cudaMalloc((void **)&unrR, (embedCols-1)*(embedCols-1)*size);
    cudaMalloc((void **)&unrestricted, (embedCols-1)*size);
    if( hasCudaError("granger: line 336: attemped gpu memory allocation") )
        return;

    size_t unrestT = embedCols - 1;
    void * unrestArgs[] = {
        &cols, &cols,
        &embedRows,
        &unrestT,
        &unrdata,
        &unrdataDim,
        &Y,
        &Ydim,
        &unrQ,
        &unrQdim,
        &unrR,
        &unrRdim,
        &unrestricted,
        &unrestrictedDim
    };
    // Fit the unrestricted (own + other lags) models.
    cudaLaunch("getUnrestricted", unrestArgs, dimUnrGrid, dimUnrBlock);
    if( hasCudaError("granger : getUnRestricted kernel execution") ) return;

    cudaFree(unrQ);
    cudaFree(unrR);

    size_t resultSize = cols*cols*fbytes;
    cudaMalloc((void **)&dfStats, resultSize);
    // cudaMalloc((void **)&dpValues, resultSize);
    if( hasCudaError("granger: line 350: gpu memory allocation") ) return;

    // diagFlag FALSE: the cols x cols grid has an unused diagonal (i == j).
    int diagFlag = FALSE;
    void * ftestArgs[] = {
        &diagFlag,
        &p,
        &embedRows,
        &cols, &cols,
        &t, &unrestT,
        &Y, &Ydim,
        &restricted, &restrictedDim,
        &unrestricted, &unrestrictedDim,
        &rdata, &rdataDim,
        &unrdata, &unrdataDim,
        &dfStats
    };
    // Compare restricted vs unrestricted residuals -> F statistics.
    cudaLaunch("ftest", ftestArgs, dimUnrGrid, dimUnrBlock);
    if( hasCudaError("granger : ftest kernel execution") )
        return;

    cudaMemcpy(fStats, dfStats, resultSize, cudaMemcpyDeviceToHost);
    // cudaMemcpy(pValues, dpValues, resultSize, cudaMemcpyDeviceToHost);
    if( hasCudaError("granger : mem copy device to host") ) return;

    // p-values are computed on the host from the copied-back F statistics.
    getPValues(cols, cols, fStats, p, embedRows, pValues);

    cudaFree(Y);
    cudaFree(restricted);
    cudaFree(unrestricted);
    cudaFree(rdata);
    cudaFree(unrdata);
    cudaFree(dfStats);
    // cudaFree(dpValues);
}

// Granger test of every column of x against every column of y (colsx x colsy
// pairings), lag p.  Same structure as granger() above but with two distinct
// variable sets and no skipped diagonal.  Body continues in the next chunk.
void grangerxy(int rows, int colsx, const float * x, int colsy,
    const float * y, int p, float * fStats, float * pValues)
{

    if((p < 0) || (rows < 1) || (colsx < 1) || (colsy < 1)) {
        fatal("The Granger XY test needs at least a pair variables.\n");
        return;
    }
    int
        i, j, k, t = p+1,
        fbytes = sizeof(float),
        embedRows = rows-p, embedCols = t*2;
    float
        * Y, * rQ, * rR,
        * unrQ, * unrR,
        * restricted, * unrestricted,
        * rdata, * unrdata,
        * dfStats; // * dpValues;
    size_t
        size = colsx*colsy*fbytes, partSize = embedRows*size;

    cudaMalloc((void **)&Y, embedCols*partSize);

    cudaMalloc((void **)&rQ, t*embedRows*colsy*fbytes);
    cudaMalloc((void **)&rR, t*t*colsy*fbytes);
    cudaMalloc((void **)&rdata, t*embedRows*colsy*fbytes);
    cudaMalloc((void **)&restricted, t*colsy*fbytes);

    cudaMalloc((void **)&unrQ, (embedCols-1)*partSize);
    cudaMalloc((void **)&unrR, (embedCols-1)*(embedCols-1)*size);
    cudaMalloc((void **)&unrestricted, (embedCols-1)*size);
    cudaMalloc((void **)&unrdata, (embedCols-1)*partSize);
    checkCudaError("grangerxy : attemped gpu memory allocation");

    int
        Ydim = embedCols * embedRows,
        rQdim = t * embedRows, rRdim = t * t,
        rdataDim = t*embedRows, restrictedDim = t,
        unrQdim = (embedCols-1) * embedRows,
        unrRdim = (embedCols-1) * (embedCols-1),
        unrestrictedDim
        = embedCols-1, unrdataDim = (embedCols-1)*embedRows;
    float
        * ypos, * rdataPos, * unrdataPos;

    int
        skip = 2*embedRows, colBytes = embedRows*fbytes;
    const float * vectA, * vectB;
    float * evenCols;

    // Lag-embed every (y column, x column) pairing on the device, mirroring
    // the construction in granger() but without skipping i == j.
    for(i = 0; i < colsy; i++) {
        rdataPos = rdata+i*rdataDim;
        evenCols = rdataPos+embedRows;
        vectA = y+i*rows;
        for(j = 0; j < colsx; j++) {
            ypos = Y+(i*colsx+j)*Ydim;
            unrdataPos = unrdata+(i*colsx+j)*unrdataDim;

            vectB = x+j*rows;

            for(k = 0; k < p+1; k++) { // produce t subcols
                cudaMemcpy(ypos+k*skip, vectA+(p-k), embedRows*fbytes,
                    cudaMemcpyHostToDevice);
                cudaMemcpy(ypos+k*skip+embedRows, vectB+(p-k),
                    embedRows*fbytes, cudaMemcpyHostToDevice);
            }
            cudaMemcpy(unrdataPos+embedRows, ypos+skip,
                (embedCols-2)*embedRows*fbytes, cudaMemcpyDeviceToDevice);
        }
        // build restricted data from last set of unrestricted data
        // only need one per column, not one for each pairing
        for(k = 0; k < embedCols-2; k+=2) {
            cudaMemcpy(evenCols+(k*embedRows)/2, unrdataPos+(1+k)*embedRows,
                colBytes, cudaMemcpyDeviceToDevice);
        }
        // NOTE(review): errline is 16 bytes; "gxy err : %d\n" plus the NUL
        // fits only while i <= 9999 -- larger column counts would overflow.
        char errline[16];
        sprintf(errline, "gxy err : %d\n", i);
        if( hasCudaError(errline) ) return;
    }
    checkCudaError("grangerxy : mem copy from host to device");

    // Independent block counts per axis since colsx may differ from colsy.
    int
        numBlocksX = colsx / THREADSPERDIM,
        numBlocksY = colsy / THREADSPERDIM;

    if(numBlocksX * THREADSPERDIM < colsx) numBlocksX++;
    if(numBlocksY * THREADSPERDIM < colsy) numBlocksY++;

    dim3
        dimRGrid(numBlocksY),
        dimRBlock(THREADSPERDIM),
        dimUnrGrid(numBlocksX, numBlocksY),
        dimUnrBlock(THREADSPERDIM, THREADSPERDIM);

    void * restArgs[] = {
        &colsx, &colsy,
        &embedRows,
        &t,
        &rdata, &rdataDim,
        &Y, &Ydim,
        &rQ, &rQdim,
        &rR, &rRdim,
        &restricted, &restrictedDim
    };
    // Restricted (own-lags-only) model fits.
    cudaLaunch("getRestricted", restArgs,
        dimRGrid, dimRBlock);

    size_t unrestT = embedCols - 1;
    void * unrestArgs[] = {
        &colsx, &colsy,
        &embedRows,
        &unrestT,
        &unrdata, &unrdataDim,
        &Y, &Ydim,
        &unrQ, &unrQdim,
        &unrR, &unrRdim,
        &unrestricted, &unrestrictedDim
    };
    // Unrestricted model fits.
    cudaLaunch("getUnrestricted", unrestArgs,
        dimUnrGrid, dimUnrBlock);

    checkCudaError("grangerxy : kernel execution get(Un)Restricted");

    cudaFree(rQ);
    cudaFree(unrQ);
    cudaFree(rR);
    cudaFree(unrR);

    size_t resultSize = colsx*colsy*fbytes;
    cudaMalloc((void **)&dfStats, resultSize);
    // cudaMalloc((void **)&dpValues, resultSize);
    checkCudaError("grangerxy : attemped gpu memory allocation");

    // diagFlag TRUE: every cell of the colsx x colsy grid is a valid pairing.
    int diagFlag = TRUE;
    void * ftestArgs[] = {
        &diagFlag,
        &p,
        &embedRows,
        &colsx, &colsy,
        &t, &unrestT,
        &Y, &Ydim,
        &restricted, &restrictedDim,
        &unrestricted, &unrestrictedDim,
        &rdata, &rdataDim,
        &unrdata, &unrdataDim,
        &dfStats
    };
    cudaLaunch("ftest", ftestArgs, dimUnrGrid, dimUnrBlock);
    checkCudaError("grangerxy : kernel execution ftest");

    cudaMemcpy(fStats, dfStats, resultSize, cudaMemcpyDeviceToHost);
    // cudaMemcpy(pValues, dpValues, resultSize, cudaMemcpyDeviceToHost);
    checkCudaError("grangerxy : mem copy from device to host");

    // Host-side conversion of the F statistics to p-values.
    getPValues(colsx, colsy, fStats, p, embedRows, pValues);

    cudaFree(Y);
    cudaFree(restricted);
    cudaFree(unrestricted);
    cudaFree(rdata);
    cudaFree(unrdata);
    cudaFree(dfStats);
    // cudaFree(dpValues);
}
--------------------------------------------------------------------------------
/src/granger.h:
--------------------------------------------------------------------------------
#ifndef _GRANGER_H_
#define _GRANGER_H_

void granger(int rows, int cols,
    const float * y, int p,
    float * fStats, float * pValues);
void grangerxy(int rows,
    int colsx, const float * x,
    int colsy, const float * y,
    int p,
    float * fStats,
    float * pValues);

#endif /* _GRANGER_H_ */
--------------------------------------------------------------------------------
/src/hcluster.cpp:
--------------------------------------------------------------------------------
// NOTE(review): the three system-include targets below were stripped by the
// text dump (bare "#include" lines) -- restore from the upstream file.
#include
#include
#include

#include "R.h"

#include "math_constants.h"
#include "cuseful.h"
#include "hcluster.h"
#include "cudaUtils.h"

#define NUM_THREADS 32
#define NUM_BLOCKS 1024

// Hierarchical clustering of n items given their host-side n x n distance
// matrix: copies the distances to pitched device memory, then delegates the
// actual clustering to hclusterPreparedDistances().
void hcluster(const float * dist, size_t dist_pitch, size_t n,
    int * sub, int * sup, float * val, hc_method method,
    float lambda, float beta)
{
    // Allocate space for the distance matrix
    size_t pitch_dist_d;
    float * hcluster_dist_d; // Distance matrix
    cudaMallocPitch((void**)&hcluster_dist_d, &pitch_dist_d,
        n * sizeof(float), n);

    // Copy the distance matrix
    cudaMemcpy2D(hcluster_dist_d, pitch_dist_d,
        dist, dist_pitch,
        n * sizeof(float), n,
        cudaMemcpyHostToDevice);

    hclusterPreparedDistances(hcluster_dist_d, pitch_dist_d, n,
        sub, sup,
        val,
        method, lambda, beta);

    cudaFree(hcluster_dist_d);
    checkCudaError("hcluster : cudaFree");
}

// Agglomerative clustering over a distance matrix ALREADY on the device
// (gpuDist is modified in place).  Outputs, each of length n - 1:
//   sub/sup : the pair of cluster indices merged at each step
//   val     : the merge height for each step
// Body continues in the next chunk of this file.
void hclusterPreparedDistances(float * gpuDist, size_t pitch_dist_d, size_t n,
    int * sub, int * sup,
    float * val,
    hc_method method,
    float lambda, float beta)
{
    float
        * hcluster_count_d, // Number of elements in each cluster
        * hcluster_min_val_d, // find min of each row and
        * hcluster_merge_val_d; // Array of the values merged at

    // col containing the min of each row
    size_t * hcluster_min_col_d;

    // Arrays telling which cluster merged with which cluster
    int
        *
        hcluster_sub_d,
        * hcluster_sup_d;

    cudaMalloc((void**)&hcluster_count_d, n * sizeof(float));
    cudaMalloc((void**)&hcluster_min_val_d, n * sizeof(float));
    cudaMalloc((void**)&hcluster_min_col_d, n * sizeof(size_t));
    cudaMalloc((void**)&hcluster_sub_d, (n - 1) * sizeof(int));
    cudaMalloc((void**)&hcluster_sup_d, (n - 1) * sizeof(int));
    cudaMalloc((void**)&hcluster_merge_val_d, (n - 1) * sizeof(float));

    // Every element starts in its own cluster
    float * pre_count = Calloc(n, float);
    for(size_t i = 0; i < n; ++i)
        pre_count[i] = 1.0;

    cudaMemcpy(hcluster_count_d, pre_count, n * sizeof(float),
        cudaMemcpyHostToDevice);
    checkCudaError("hcluster : malloc and memcpy");

    Free(pre_count);

    dim3
        grid0(NUM_BLOCKS, 1, 1), block0(NUM_THREADS, 1, 1),
        grid1(1, 1, 1), block1(NUM_THREADS, 1, 1);

    // Kernels index the distance matrix by element, not byte.
    size_t gpuPitch = pitch_dist_d / sizeof(float);
    void * convertArgs[] = {
        &gpuDist,
        &gpuPitch,
        &n
    };

    // Convert 0 on the diagonal to infinity
    cudaLaunch("convert_kernel", convertArgs, grid1, block1);
    checkCudaError("hcluster : convert kernel");

    // Pick the linkage-update kernel name for the requested method.
    std::string func;
    switch(method) {
        case COMPLETE:
            func = "complete_kernel";
            break;
        case WPGMA:
            func = "wpgma_kernel";
            break;
        case AVERAGE:
            func = "average_kernel";
            break;
        case MEDIAN:
            func = "median_kernel";
            break;
        case CENTROID:
            func = "centroid_kernel";
            break;
        case FLEXIBLE_GROUP:
            func = "flexible_group_kernel";
            break;
        case FLEXIBLE:
            func = "flexible_kernel";
            break;
        case WARD:
            func = "ward_kernel";
            break;
        case MCQUITTY:
            func = "mcquitty_kernel";
            break;
        case SINGLE:
        default:
            func = "single_kernel";
            break;
    }

    // The trailing null slots below are filled per-iteration with the
    // addresses of the loop variables before each launch.
    void * findMin1Args[] = {
        &gpuDist,
        &gpuPitch,
        &n,
        &hcluster_count_d,
        &hcluster_min_val_d,
        &hcluster_min_col_d,
        0 // place holder for row_offset (a loop variable)
    };
    void * findMin2Args[] = {
        &hcluster_min_val_d,
        &hcluster_min_col_d,
        &hcluster_count_d,
        &hcluster_sub_d,
        &hcluster_sup_d,
        &hcluster_merge_val_d,
        &n,
        0 // place holder for iter
    };
    void * funcArgs[] = {
        &gpuDist,
        &gpuPitch,
        &n,
        &hcluster_sub_d,
        &hcluster_sup_d,
        &hcluster_count_d,
        &hcluster_merge_val_d,
        0, // place holder for iter
        0, // place holder for col_offset
        &lambda,
        &beta
    };
    size_t skip = NUM_BLOCKS * NUM_THREADS;
    // Merge items n - 1 times
    for(size_t iter = 0; iter < (n - 1); ++iter) {
        // Find the minimum of each column
        for(size_t row_offset = 0; row_offset < n; row_offset += NUM_BLOCKS) {
            findMin1Args[6] = &row_offset;
            cudaLaunch("find_min1_kernel", findMin1Args,
                grid0, block0);
        }

        // Find overall winner; update arrays sub, sup, val, count
        findMin2Args[7] = &iter;
        cudaLaunch("find_min2_kernel", findMin2Args,
            grid1, block1);

        // Update the distance matrix
        funcArgs[7] = &iter;
        for(size_t col_offset = 0; col_offset < n; col_offset += skip) {
            funcArgs[8] = &col_offset;
            cudaLaunch(func, funcArgs, grid0, block0);
        }
    }
    checkCudaError("hcluster : method kernel calls");

    // Copy results
    cudaMemcpy(sub, hcluster_sub_d, (n - 1) * sizeof(int),
        cudaMemcpyDeviceToHost);
    cudaMemcpy(sup, hcluster_sup_d, (n - 1) * sizeof(int),
        cudaMemcpyDeviceToHost);
    cudaMemcpy(val, hcluster_merge_val_d, (n-1)*sizeof(float),
        cudaMemcpyDeviceToHost);

    checkCudaError("hcluster : results memcpy");

    cudaFree(hcluster_count_d);
    cudaFree(hcluster_min_val_d);
    cudaFree(hcluster_min_col_d);
    cudaFree(hcluster_sub_d);
    cudaFree(hcluster_sup_d);
    cudaFree(hcluster_merge_val_d);

    checkCudaError("hcluster : cudaFree");
}
--------------------------------------------------------------------------------
/src/hcluster.h:
--------------------------------------------------------------------------------
#ifndef HCLUSTER_H
#define HCLUSTER_H

/* Methods for hierarchical clustering */
typedef enum {
    SINGLE,
    COMPLETE,
    WPGMA,
    AVERAGE,
    MEDIAN,
    CENTROID,
    FLEXIBLE_GROUP,
    FLEXIBLE,
    WARD,
    MCQUITTY
} hc_method;

void hcluster(const float * dist, size_t dist_pitch, size_t n,
    int * sub, int * sup,
    float * val,
    hc_method method,
    float lambda = 0.5, float beta = 0.5);

void hclusterPreparedDistances(float * gpuDist, size_t pitch_dist_d, size_t n,
    int * sub, int * sup,
    float * val,
    hc_method method,
    float lambda, float beta);

#endif // HCLUSTER_H
--------------------------------------------------------------------------------
/src/kendall.cpp:
--------------------------------------------------------------------------------
#include "nvrtc.h"
#include "cuda.h"

#include "R.h"

#include "cuseful.h"
#include "cudaUtils.h"

#include "kendall.h"

#define NUMTHREADS 16

// Kendall correlation of every column of x (nx columns) against every column
// of y (ny columns), each with sampleSize observations; results is the
// nx * ny output matrix on the host.  Runs the "gpuKendall" kernel on an
// nx x ny grid of NUMTHREADS x NUMTHREADS blocks.
// Body continues in the next chunk of this file.
void masterKendall(const float * x, size_t nx,
    const float * y, size_t ny,
    size_t sampleSize, double * results)
{
    size_t
        outputLength = nx * ny, outputBytes = outputLength*sizeof(double),
        xBytes = nx*sampleSize*sizeof(float),
        yBytes = ny*sampleSize*sizeof(float);
    float
        * gpux, * gpuy;
    double
        * gpuResults;
    dim3
        grid(nx, ny), block(NUMTHREADS, NUMTHREADS);

    cudaMalloc((void **)&gpux, xBytes);
    cudaMalloc((void **)&gpuy, yBytes);
    checkCudaError("input vector space allocation");

    cudaMemcpy(gpux, x, xBytes,
        cudaMemcpyHostToDevice);
    cudaMemcpy(gpuy, y, yBytes, cudaMemcpyHostToDevice);
    checkCudaError("copying input vectors to gpu");

    cudaMalloc((void **)&gpuResults, outputBytes);
    checkCudaError("allocation of space for result matrix");

    void *args[] =
        { &gpux
        , &nx
        , &gpuy
        , &ny
        , &sampleSize
        , &gpuResults
        };
    cudaLaunch("gpuKendall", args,
        grid, block);

    cudaFree(gpux);
    cudaFree(gpuy);
    cudaMemcpy(results, gpuResults, outputBytes, cudaMemcpyDeviceToHost);
    cudaFree(gpuResults);
    checkCudaError("copying results from gpu and cleaning up");
}
--------------------------------------------------------------------------------
/src/kendall.h:
--------------------------------------------------------------------------------
#ifndef _KENDALL_H_
#define _KENDALL_H_

void masterKendall(const float * x, size_t nx,
    const float * y, size_t ny,
    size_t sampleSize,
    double * results);

#endif /* _KENDALL_H_ */
--------------------------------------------------------------------------------
/src/lsfit.cpp:
--------------------------------------------------------------------------------
// NOTE(review): the four system-include targets below were stripped by the
// text dump (bare "#include" lines); <algorithm> must be among them since
// getCRE uses std::min -- restore from the upstream file.
#include
#include
#include
#include

#include "cublas.h"

#include "R.h"

#include "cuseful.h"
#include "lsfit.h"
#include "qrdecomp.h"

// Copyright 2009, Mark Seligman at Rapid Biologics, LLC.  All rights
// reserved.
//

// Rounds "length" up to the next multiple of the block length.
//
int alignBlock(int length, unsigned blockExp) {
    // blockSize = 2^blockExp; the mask clears the low blockExp bits after
    // adding blockSize - 1, i.e. the classic round-up-to-power-of-two trick.
    int blockSize = 1 << blockExp;
    return (length + blockSize - 1) & (((unsigned) -1) << blockExp);
}

// Single-precision least-squares fit of Y on X via a blocked, pivoted QR
// decomposition on the GPU.  On return X holds the packed QR factors;
// coeffs, resids and effects are filled via getCRE() when the rank is
// positive, otherwise the residuals are just a copy of Y.
void gpuLSFitF(float * X, int rows, int cols, float * Y, int yCols,
    double tol, float * coeffs, float * resids, float * effects,
    int * rank, int * pivot, double * qrAux)
{
    const int
        fbytes = sizeof(float);

    // Should be >= 4, to satisfy alignment criterea for memory
    // coalescence.  For larger arrays (> 1000 rows), best performance
    // has been observed at 7.
    //
    const unsigned blockExp = 7; // Gives blockSize = 2^7 = 128.

    float *dQR;

    // Rows are padded up to a multiple of the block size.
    int stride = alignBlock(rows, blockExp);

    cublasInit();
    cublasAlloc(stride * cols, fbytes, (void **)&dQR);

    // This is overkill: just need to zero the padding.
    //
    cudaMemset2D(dQR, cols * fbytes, 0.f, cols * fbytes, stride);
    cublasSetMatrix(rows, cols, fbytes, X, rows, dQR, stride);

    // On return we have dQR in pivoted, packed QR form.

    getQRDecompBlocked(rows, cols, tol, dQR, 1 << blockExp,
        stride, pivot, qrAux, rank);
    cublasGetMatrix(rows, cols, fbytes, dQR, stride, X, rows);

    if(*rank > 0)
        getCRE(dQR, rows, cols, stride, *rank, qrAux, yCols, coeffs, resids, effects);
    else // Residuals copied from Y.
        memcpy(resids, Y, rows * yCols * fbytes);

    cublasFree(dQR);
    cublasShutdown();
}

// Double-precision variant of gpuLSFitF; not yet implemented.
void gpuLSFitD(double *X, int n, int p, double *Y, int nY,
    double tol, double *coeffs, double *resids, double *effects,
    int *rank, int *pivot, double * qrAux)
{
    // NYI
}


// Fills in the coefficients, residuals and effects matrices.
74 | // 75 | void getCRE(float *dQR, int rows, int cols, int stride, int rank, double *qrAux, 76 | int yCols, float *coeffs, float *resids, float *effects) 77 | { 78 | const int 79 | fbytes = sizeof(float); 80 | // Used by effects, residual computations. 81 | // 82 | int maxIdx = std::min(rank, rows - 1); 83 | 84 | float 85 | * diags = Calloc(rank * fbytes, float), 86 | * dDiags, *dResids, *dCoeffs, *dEffects; 87 | 88 | cublasAlloc(rank, fbytes, (void **) &dDiags); 89 | 90 | cublasAlloc(cols * yCols, fbytes, (void **) &dCoeffs); 91 | cublasAlloc(rows * yCols, fbytes, (void **) &dResids); 92 | cublasAlloc(rows * yCols, fbytes, (void **) &dEffects); 93 | 94 | // Temporarily swaps diagonals with qrAux. 95 | 96 | cublasScopy(rank, dQR, stride + 1, dDiags, 1); 97 | cublasGetVector(rank, fbytes, dDiags, 1, diags, 1); 98 | 99 | float *qrAuxFloat = Calloc(maxIdx * fbytes, float); 100 | for (int i = 0; i < maxIdx; i++) 101 | qrAuxFloat[i] = qrAux[i]; 102 | cublasSetVector(maxIdx, fbytes, qrAuxFloat, 1, dQR, stride + 1); 103 | Free(qrAuxFloat); 104 | 105 | cublasSetMatrix(cols, yCols, fbytes, coeffs, cols, dCoeffs, cols); 106 | cublasSetMatrix(rows, yCols, fbytes, effects, rows, dEffects, rows); 107 | cublasSetMatrix(rows, yCols, fbytes, resids, rows, dResids, rows); 108 | 109 | // Computes the effects matrix, intialized by caller to Y. 110 | 111 | float 112 | * pEffects = dEffects; 113 | 114 | for (int i = 0; i < yCols; i++, pEffects += rows) { 115 | float 116 | * pQR = dQR; 117 | 118 | for (int k = 0; k < maxIdx; k++, pQR += (stride + 1)) { 119 | double 120 | t = cublasSdot(rows - k, pQR, 1, pEffects + k, 1); 121 | 122 | t *= -1.0 / qrAux[k]; 123 | cublasSaxpy(rows - k, t, pQR, 1, pEffects + k, 1); 124 | } 125 | } 126 | 127 | // Computes the residuals matrix, initialized by caller to zero. 128 | // If not of full row rank, presets the remaining rows to those from 129 | // effects. 
130 | 131 | if(rank < rows) { 132 | for(int i = 0; i < yCols; i++) { 133 | cublasScopy(rows - rank, dEffects + i*rows + rank, 1, 134 | dResids + i*rows + rank, 1); 135 | } 136 | } 137 | 138 | float 139 | * pResids = dResids; 140 | 141 | for (int i = 0; i < yCols; i++, pResids += rows) { 142 | for (int k = maxIdx - 1; k >= 0; k--) { 143 | double 144 | t = -(1.0 / qrAux[k]) 145 | * cublasSdot(rows - k, dQR + k*stride + k, 1, pResids + k, 1); 146 | 147 | cublasSaxpy(rows -k, t, dQR + k*stride + k, 1, pResids + k, 1); 148 | } 149 | } 150 | cublasScopy(maxIdx, dDiags, 1, dQR, stride + 1); 151 | 152 | // Computes the coefficients matrix, initialized by caller to zero. 153 | 154 | float 155 | * pCoeffs = dCoeffs; 156 | 157 | for(int i = 0; i < yCols; i++, pCoeffs += cols) { 158 | cublasScopy(rank, dEffects + i*rows, 1, pCoeffs, 1); 159 | 160 | float t; 161 | for(int k = rank - 1; k > 0; k--) { 162 | cublasSscal(1, 1.f / diags[k], pCoeffs + k, 1); 163 | cublasGetVector(1, fbytes, pCoeffs + k, 1, &t, 1); 164 | cublasSaxpy(k, -t, dQR + k*stride, 1, pCoeffs, 1); 165 | } 166 | cublasSscal(1, 1.f / diags[0], pCoeffs, 1); 167 | } 168 | Free(diags); 169 | 170 | cublasGetMatrix(cols, yCols, fbytes, dCoeffs, cols, coeffs, cols); 171 | cublasGetMatrix(rows, yCols, fbytes, dResids, rows, resids, rows); 172 | cublasGetMatrix(rows, yCols, fbytes, dEffects, rows, effects, rows); 173 | 174 | cublasFree(dDiags); 175 | cublasFree(dCoeffs); 176 | cublasFree(dResids); 177 | cublasFree(dEffects); 178 | } 179 | -------------------------------------------------------------------------------- /src/lsfit.h: -------------------------------------------------------------------------------- 1 | #ifndef _LSFIT_H_ 2 | #define _LSFIT_H_ 3 | 4 | void getCRE(float *dQR, int rows, int cols, int stride, int rank, 5 | double *qrAux, int yCols, 6 | float *coeffs, float *resids, float *effects); 7 | void gpuLSFitF(float *X, int n, int p, float *Y, int nY, 8 | double tol, float *coeffs, 9 | float *resids, float 
*effects, 10 | int *rank, int *pivot, double * qrAux); 11 | void gpuLSFitD(double *X, int n, int p, double *Y, int nY, 12 | double tol, double *coeffs, 13 | double *resids, double *effects, 14 | int *rank, int *pivot, double * qrAux); 15 | 16 | #endif /* _LSFIT_H_ */ 17 | -------------------------------------------------------------------------------- /src/matmult.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | SEXP gpuMatMult(SEXP a, SEXP b) { 17 | double 18 | * xa = REAL(a), * xb = REAL(b), 19 | * gpua, * gpub, * gpuc; 20 | 21 | SEXP 22 | dima = getAttrib(a, R_DimSymbol), 23 | dimb = getAttrib(b, R_DimSymbol); 24 | 25 | int 26 | rowsa = INTEGER(dima)[0], colsa = INTEGER(dima)[1], 27 | rowsb = INTEGER(dimb)[0], colsb = INTEGER(dimb)[1]; 28 | 29 | cublasStatus_t stat; 30 | cublasHandle_t handle; 31 | 32 | cudaError_t cudaStat; 33 | 34 | cudaStat = cudaMalloc((void**) &gpua, rowsa * colsa * sizeof(double)); 35 | if (cudaStat != cudaSuccess) error("device memory allocation failed"); 36 | 37 | cudaStat = cudaMalloc((void**) &gpub, rowsb * colsb * sizeof(double)); 38 | if (cudaStat != cudaSuccess) error("device memory allocation failed"); 39 | 40 | int 41 | rowsOpA = rowsa, colsOpA = colsa, colsOpB = colsb; 42 | 43 | cudaStat = cudaMalloc((void**) &gpuc, rowsOpA * colsOpB * sizeof(double)); 44 | if (cudaStat != cudaSuccess) error("device memory allocation failed"); 45 | 46 | stat = cublasCreate(&handle); 47 | if(stat != CUBLAS_STATUS_SUCCESS) error("CUBLAS initialization failed\n"); 48 | 49 | stat = cublasSetMatrix(rowsa, colsa, sizeof(double), xa, rowsa, 50 | gpua, rowsa); 51 | if(stat != CUBLAS_STATUS_SUCCESS) { 52 | cudaFree(gpuc); 53 | cudaFree(gpub); 54 | cudaFree(gpua); 55 | cublasDestroy(handle); 56 | error("data download failed\n"); 57 | } 58 | 59 | stat = 
cublasSetMatrix(rowsb, colsb, sizeof(double), xb, rowsb, 60 | gpub, rowsb); 61 | if(stat != CUBLAS_STATUS_SUCCESS) { 62 | cudaFree(gpuc); 63 | cudaFree(gpub); 64 | cudaFree(gpua); 65 | cublasDestroy(handle); 66 | error("data download failed\n"); 67 | } 68 | 69 | const double alpha = 1.0, beta = 0.0; 70 | cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, rowsOpA, colsOpB, colsOpA, 71 | &alpha, (const double *) gpua, rowsa, (const double *) gpub, rowsb, 72 | &beta, gpuc, rowsOpA); 73 | 74 | SEXP ab, dimab; 75 | PROTECT(ab = allocVector(REALSXP, rowsOpA * colsOpB)); 76 | PROTECT(dimab = allocVector(INTSXP, 2)); 77 | INTEGER(dimab)[0] = rowsOpA; INTEGER(dimab)[1] = colsOpB; 78 | setAttrib(ab, R_DimSymbol, dimab); 79 | 80 | double * xab = REAL(ab); 81 | stat = cublasGetMatrix(rowsOpA, colsOpB, sizeof(double), gpuc, rowsOpA, 82 | xab, rowsOpA); 83 | if(stat != CUBLAS_STATUS_SUCCESS) { 84 | cudaFree(gpuc); 85 | cudaFree(gpub); 86 | cudaFree(gpua); 87 | cublasDestroy(handle); 88 | error("data upload failed\n"); 89 | } 90 | 91 | cudaFree(gpua); 92 | cudaFree(gpub); 93 | cudaFree(gpuc); 94 | 95 | cublasDestroy(handle); 96 | UNPROTECT(2); 97 | return ab; 98 | } 99 | -------------------------------------------------------------------------------- /src/matmult.h: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | SEXP gpuMatMult(SEXP a, SEXP b); 3 | } 4 | -------------------------------------------------------------------------------- /src/mi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "cudaUtils.h" 10 | 11 | #define NTHREADS 16 12 | 13 | static int initKnots(int nbins, int order, float ** knots) { 14 | int 15 | om1 = order - 1, 16 | degree = nbins - 1, 17 | dpo = degree + order, 18 | nknots = dpo + 1; 19 | 20 | *knots = Calloc(nknots, float); 21 | for(int i = 0; i < nknots; i++) { 22 
| if(i <= om1) 23 | (*knots)[i] = 0.f; 24 | else if(i <= degree) 25 | (*knots)[i] = (*knots)[i-1] + 1.f; 26 | else 27 | (*knots)[i] = (*knots)[degree] + 1.f; 28 | } 29 | return nknots; 30 | } 31 | 32 | 33 | void bSplineMutualInfo(int nbins, int order, int nsamples, 34 | int nx, const float * x, 35 | int ny, const float * y, 36 | float * out_mi) 37 | { 38 | size_t 39 | pitch[2], pitch_bins[2], 40 | col_bytes = (size_t)nsamples * sizeof(float); 41 | int 42 | nknots, nblocks[2], size[2] = { nx, ny }; 43 | float 44 | * knots, * dknots, 45 | * stage[2], * dx[2], * dentropy[2], * dbins[2]; 46 | const float 47 | * data[2] = { x, y }; 48 | 49 | nknots = initKnots(nbins, order, &knots); 50 | float knot_max = knots[nknots - 1]; 51 | cudaMalloc((void **)&dknots, nknots * sizeof(float)); 52 | cudaMemcpy(dknots, knots, nknots * sizeof(float), cudaMemcpyHostToDevice); 53 | Free(knots); 54 | 55 | checkCudaError("bSplineMutualInfoSingle: 1"); 56 | 57 | for(int i = 0; i < 2; i++) { 58 | cudaMallocPitch((void **)&(dx[i]), pitch + i, col_bytes, size[i]); 59 | cudaMallocHost((void **)&(stage[i]), size[i] * col_bytes); 60 | cudaMalloc((void **)&(dentropy[i]), size[i] * sizeof(float)); 61 | cudaMallocPitch((void **)&(dbins[i]), pitch_bins + i, 62 | nbins * col_bytes, size[i]); 63 | cudaMemset2D(dbins[i], pitch_bins[i], 0, nbins * col_bytes, size[i]); 64 | 65 | nblocks[i] = size[i] / NTHREADS; 66 | if(nblocks[i] * NTHREADS < size[i]) 67 | nblocks[i]++; 68 | } 69 | 70 | checkCudaError("bSplineMutualInfoSingle: 2"); 71 | 72 | cudaStream_t stream[2]; 73 | for(int i = 0; i < 2; i++) 74 | cudaStreamCreate(stream + i); 75 | for(int i = 0; i < 2; i++) { 76 | cudaMemcpyAsync(stage[i], data[i], size[i] * col_bytes, 77 | cudaMemcpyHostToHost, stream[i]); 78 | cudaMemcpy2DAsync(dx[i], pitch[i], stage[i], col_bytes, col_bytes, 79 | size[i], cudaMemcpyHostToDevice, stream[i]); 80 | } 81 | for(int i = 0; i < 2; i++) { 82 | dim3 83 | grid(nblocks[i]), 84 | block(NTHREADS); 85 | 86 | size_t xpitch = 
pitch[i] / sizeof(float); 87 | 88 | void * scaleArgs[] = { 89 | &knot_max, 90 | size + i, 91 | &nsamples, 92 | dx + i, 93 | &xpitch 94 | }; 95 | cudaLaunch("scale", scaleArgs, 96 | grid, block, stream[i]); 97 | 98 | size_t pitch_bins_i = pitch_bins[i] / sizeof(float); 99 | void * gbsArgs[] = { 100 | &nbins, &order, 101 | &nknots, &dknots, 102 | &nsamples, size + i, 103 | dx + i, &xpitch, 104 | dbins + i, 105 | &pitch_bins_i 106 | }; 107 | cudaLaunch("get_bin_scores", gbsArgs, 108 | grid, block, stream[i]); 109 | 110 | void * entropyArgs[] = { 111 | &nbins, 112 | &nsamples, 113 | size + i, 114 | dbins + i, 115 | &pitch_bins_i, 116 | dentropy + i 117 | }; 118 | cudaLaunch("get_entropy", entropyArgs, 119 | grid, block, stream[i]); 120 | } 121 | checkCudaError("bSplineMutualInfoSingle: 3"); 122 | 123 | cudaFree(dknots); 124 | for(int i = 0; i < 2; i++) { 125 | cudaFreeHost(stage[i]); 126 | cudaFree(dx[i]); 127 | } 128 | 129 | size_t pitch_mi; 130 | float * dmi; 131 | cudaMallocPitch((void **)&dmi, &pitch_mi, ny * sizeof(float), nx); 132 | 133 | dim3 134 | gridDim(nblocks[0], nblocks[1]), blockDim(NTHREADS, NTHREADS); 135 | 136 | pitch_bins[0] /= sizeof(float); 137 | pitch_bins[1] /= sizeof(float); 138 | 139 | size_t dpitch_mi = pitch_mi / sizeof(float); 140 | 141 | void * miArgs[] = { 142 | &nbins, &nsamples, 143 | &nx, dbins, pitch_bins, dentropy, 144 | &ny, dbins + 1, pitch_bins + 1, dentropy + 1, 145 | &dmi, &dpitch_mi 146 | }; 147 | cudaLaunch("get_mi", miArgs, gridDim, blockDim); 148 | checkCudaError("bSplineMutualInfoSingle: 4"); 149 | 150 | for(int i = 0; i < 2; i++) 151 | cudaFree(dbins[i]); 152 | 153 | float * mi_stage; 154 | cudaMallocHost((void **)&mi_stage, nx * ny * sizeof(float)); 155 | 156 | cudaMemcpy2D(mi_stage, ny * sizeof(float), dmi, pitch_mi, 157 | ny * sizeof(float), nx, cudaMemcpyDeviceToHost); 158 | checkCudaError("bSplineMutualInfoSingle: 5"); 159 | cudaFree(dmi); 160 | 161 | memcpy(out_mi, mi_stage, nx * ny * sizeof(float)); 162 | 
cudaFreeHost(mi_stage); 163 | checkCudaError("bSplineMutualInfoSingle: 6"); 164 | } 165 | -------------------------------------------------------------------------------- /src/mi.h: -------------------------------------------------------------------------------- 1 | #ifndef _MI_H_ 2 | #define _MI_H_ 3 | 4 | void bSplineMutualInfo(int nbins, int order, int nsamples, 5 | int nx, const float * x, 6 | int ny, const float * y, 7 | float * out_mi); 8 | 9 | #endif /* _MI_H_ */ 10 | -------------------------------------------------------------------------------- /src/qrdecomp.h: -------------------------------------------------------------------------------- 1 | #ifndef _QRDECOMP_H_ 2 | #define _QRDECOMP_H_ 3 | 4 | void qrdecompMGS(int rows, int cols, float * da, float * dq, float * dr, 5 | int * pivots); 6 | void getQRDecomp(int rows, int cols, float * dq, float * da, int * pivot); 7 | void qrSolver(int rows, int cols, float * matX, float * vectY, float * vectB); 8 | void getQRDecompRR(int rows, int cols, double tol, float * dQR, 9 | int * pivot, double * qrAux, int * rank); 10 | void getQRDecompBlocked(int rows, int cols, double tol, float * dQR, 11 | int blockSize, int rowsUnblocked, int * pivot, 12 | double * qrAux, int * rank); 13 | 14 | void getInverseFromQR(int rows, int cols, const float * dQ, const float * dR, 15 | float * dInverse); 16 | void solveFromQR(int rows, int cols, const float * matQ, const float * matR, 17 | const float * vectY, float * vectB); 18 | 19 | #endif /* _QRDECOMP_H_ */ 20 | -------------------------------------------------------------------------------- /src/rinterface.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "cublas.h" 5 | 6 | #include "R.h" 7 | #include "Rinternals.h" 8 | #include "R_ext/Rdynload.h" 9 | 10 | #include "correlation.h" 11 | #include "cuseful.h" 12 | #include "distance.h" 13 | #include "granger.h" 14 | #include "hcluster.h" 15 | #include 
"kendall.h" 16 | #include "lsfit.h" 17 | #include "matmult.h" 18 | #include "mi.h" 19 | #include "qrdecomp.h" 20 | 21 | #include "rinterface.h" 22 | 23 | void R_init_mylib(DllInfo *info) { 24 | R_CallMethodDef callMethods[] = { 25 | {"gpuMatMult", (DL_FUNC) &gpuMatMult, 2}, 26 | {NULL, NULL, 0} 27 | }; 28 | R_registerRoutines(info, NULL, callMethods, NULL, NULL); 29 | } 30 | 31 | // whichObs = 0 means everything 32 | // whichObs = 1 means pairwiseComplete 33 | void rpmcc(const int * whichObs, 34 | const float * samplesA, const int * numSamplesA, 35 | const float * samplesB, const int * numSamplesB, 36 | const int * sampleSize, 37 | float * numPairs, float * correlations, float * signifs) 38 | { 39 | UseObs myObs; 40 | switch(*whichObs) { 41 | case 0: 42 | myObs = everything; 43 | break; 44 | case 1: 45 | myObs = pairwiseComplete; 46 | break; 47 | default: 48 | fatal("unknown use method"); 49 | } 50 | pmcc(myObs, 51 | samplesA, *numSamplesA, 52 | samplesB, *numSamplesB, 53 | *sampleSize, 54 | numPairs, correlations, signifs); 55 | } 56 | 57 | void rformatInput(const int * images, 58 | const int * xcoords, const int * ycoords, const int * zcoords, 59 | const int * mins, const int * maxes, 60 | const float * evs, const int * numrows, const int * numimages, 61 | float * output) 62 | { 63 | getData(images, xcoords, ycoords, zcoords, mins, maxes, evs, 64 | *numrows, *numimages, output); 65 | } 66 | 67 | void rformatOutput(const int * imageList1, const int * numImages1, 68 | const int * imageList2, const int * numImages2, 69 | const int * structureid, 70 | const double * cutCorrelation, const int * cutPairs, 71 | const double * correlations, const double * signifs, const int * numPairs, 72 | double * results, int * nrows) 73 | { 74 | *nrows = (int) parseResults(imageList1, *numImages1, imageList2, 75 | *numImages2, *structureid, *cutCorrelation, *cutPairs, 76 | correlations, signifs, numPairs, results); 77 | } 78 | 79 | void rsetDevice(const int * device) { 80 | 
setDevice(*device); 81 | } 82 | 83 | void rgetDevice(int * device) { 84 | getDevice(device); 85 | } 86 | 87 | void rtestT(const float * pairs, const float * coeffs, const int * n, 88 | float * ts) 89 | { 90 | testSignif(pairs, coeffs, (size_t) *n, ts); 91 | } 92 | 93 | void rhostT(const float * pairs, const float * coeffs, const int * n, 94 | float * ts) 95 | { 96 | hostSignif(pairs, coeffs, (size_t) *n, ts); 97 | } 98 | 99 | void rSignifFilter(const double * data, int * rows, double * results) { 100 | *rows = signifFilter(data, (size_t) *rows, results); 101 | } 102 | 103 | void gSignifFilter(const float * data, int * rows, float * results) { 104 | *rows = gpuSignifFilter(data, (size_t) *rows, results); 105 | } 106 | 107 | void RcublasPMCC(const float * samplesA, const int * numSamplesA, 108 | const float * samplesB, const int * numSamplesB, 109 | const int * sampleSize, 110 | float * correlations) 111 | { 112 | cublasPMCC(samplesA, *numSamplesA, samplesB, *numSamplesB, *sampleSize, 113 | correlations); 114 | } 115 | 116 | void RhostKendall(const float * X, const float * Y, const int * n, 117 | double * answer) 118 | { 119 | *answer = hostKendall(X, Y, *n); 120 | } 121 | 122 | void RpermHostKendall(const float * X, const int * nx, const float * Y, 123 | const int * ny, const int * sampleSize, double * answers) 124 | { 125 | permHostKendall(X, *nx, Y, *ny, *sampleSize, answers); 126 | } 127 | 128 | void RgpuKendall(const float * X, const int * nx, const float * Y, 129 | const int * ny, const int * sampleSize, double * answers) 130 | { 131 | masterKendall(X, *nx, Y, *ny, *sampleSize, answers); 132 | } 133 | 134 | void rgpuGranger(const int * rows, const int * cols, const float * y, 135 | const int * p, float * fStats, float * pValues) 136 | { 137 | granger(*rows, *cols, y, *p, fStats, pValues); 138 | } 139 | 140 | void rgpuGrangerXY(const int * rows, const int * colsx, const float * x, 141 | const int * colsy, const float * y, const int * p, 142 | float * fStats, 
float * pValues) 143 | { 144 | grangerxy(*rows, *colsx, x, *colsy, y, *p, fStats, pValues); 145 | } 146 | 147 | dist_method getDistEnum(const char * methodStr) 148 | { 149 | if(0 == strcmp(methodStr,"maximum")) return MAXIMUM; 150 | if(0 == strcmp(methodStr,"manhattan")) return MANHATTAN; 151 | if(0 == strcmp(methodStr,"canberra")) return CANBERRA; 152 | if(0 == strcmp(methodStr,"binary")) return BINARY; 153 | if(0 == strcmp(methodStr,"minkowski")) return MINKOWSKI; 154 | // if(0 == strcmp(methodStr,"dot")) return DOT; 155 | return EUCLIDEAN; 156 | } 157 | 158 | hc_method getClusterEnum(const char * methodStr) 159 | { 160 | if(0 == strcmp(methodStr,"complete")) return COMPLETE; 161 | if(0 == strcmp(methodStr,"wpgma")) return WPGMA; 162 | if(0 == strcmp(methodStr,"average")) return AVERAGE; 163 | if(0 == strcmp(methodStr,"median")) return MEDIAN; 164 | if(0 == strcmp(methodStr,"centroid")) return CENTROID; 165 | if(0 == strcmp(methodStr,"flexible_group")) return FLEXIBLE_GROUP; 166 | if(0 == strcmp(methodStr,"flexible")) return FLEXIBLE; 167 | if(0 == strcmp(methodStr,"ward")) return WARD; 168 | if(0 == strcmp(methodStr,"mcquitty")) return MCQUITTY; 169 | return SINGLE; 170 | } 171 | 172 | void Rdistclust(const char ** distmethod, const char ** clustmethod, 173 | const float * points, const int * numPoints, const int * dim, 174 | int * merge, int * order, float * val) 175 | { 176 | dist_method dmeth = getDistEnum(*distmethod); 177 | hc_method hcmeth = getClusterEnum(*clustmethod); 178 | 179 | size_t dpitch = 0; 180 | float * gpuDistances = NULL; 181 | 182 | distanceLeaveOnGpu(dmeth, 2.f, points, *dim, *numPoints, 183 | &gpuDistances, &dpitch); 184 | 185 | size_t len = (*numPoints) - 1; 186 | float 187 | lambda = 0.5f, beta = 0.5f; 188 | int 189 | * presub, * presup; 190 | 191 | presub = Calloc(len, int); 192 | presup = Calloc(len, int); 193 | 194 | hclusterPreparedDistances(gpuDistances, dpitch, *numPoints, 195 | presub, presup, 196 | val, 197 | hcmeth, 198 | 
lambda, beta); 199 | 200 | formatClustering(len, presub, presup, merge, order); 201 | 202 | Free(presub); 203 | Free(presup); 204 | } 205 | 206 | void Rdistances(const float * points, const int * numPoints, const int * dim, 207 | float * distances, const char ** method, const float *p) 208 | { 209 | dist_method nummethod = getDistEnum(*method); 210 | 211 | distance(points, (*dim)*sizeof(float), *numPoints, points, 212 | (*dim)*sizeof(float), *numPoints, *dim, distances, 213 | (*numPoints)*sizeof(float), nummethod, *p); 214 | } 215 | 216 | void Rhcluster(const float * distMat, const int * numPoints, 217 | int * merge, int * order, float * val, 218 | const char ** method) 219 | { 220 | hc_method nummethod = getClusterEnum(*method); 221 | 222 | size_t len = (*numPoints) - 1; 223 | size_t pitch = (*numPoints) * sizeof(float); 224 | float lambda = 0.5; 225 | float beta = 0.5; 226 | int 227 | * presub, * presup; 228 | 229 | presub = Calloc(len, int); 230 | presup = Calloc(len, int); 231 | 232 | hcluster(distMat, pitch, *numPoints, presub, presup, val, nummethod, 233 | lambda, beta); 234 | 235 | formatClustering(len, presub, presup, merge, order); 236 | 237 | Free(presub); 238 | Free(presup); 239 | } 240 | 241 | void formatClustering(const int len, const int * sub, const int * sup, 242 | int * merge, int * order) 243 | { 244 | for(size_t i = 0; i < len; i++) { 245 | merge[i] = -(sub[i] + 1); 246 | merge[i+len] = -(sup[i] + 1); 247 | } 248 | 249 | for(size_t i = 0; i < len; i++) { 250 | for(size_t j = i+1; j < len; j++) { 251 | if((merge[j] == merge[i]) || (merge[j] == merge[i+len])) 252 | merge[j] = i + 1; 253 | if((merge[j+len] == merge[i]) || (merge[j+len] == merge[i+len])) 254 | merge[j+len] = i + 1; 255 | if(((merge[j+len] < 0) && (merge[j] > 0)) 256 | || ((merge[j] > 0) && (merge[j+len] > 0) 257 | && (merge[j] > merge[j+len]))) { 258 | int holder = merge[j]; 259 | merge[j] = merge[j+len]; 260 | merge[j+len] = holder; 261 | } 262 | } 263 | } 264 | getPrintOrder(len, 
merge, order); 265 | } 266 | 267 | void getPrintOrder(const int len, const int * merge, int * order) 268 | { 269 | int 270 | level = len-1, otop = len; 271 | 272 | depthFirst(len, merge, level, &otop, order); 273 | } 274 | 275 | void depthFirst(const int len, const int * merge, int level, int * otop, 276 | int * order) 277 | { 278 | int 279 | left = level, right = level + len; 280 | 281 | if(merge[right] < 0) { 282 | order[*otop] = -merge[right]; 283 | (*otop)--; 284 | } else 285 | depthFirst(len, merge, merge[right]-1, otop, order); 286 | 287 | if(merge[left] < 0) { 288 | order[*otop] = -merge[left]; 289 | (*otop)--; 290 | } else 291 | depthFirst(len, merge, merge[left]-1, otop, order); 292 | } 293 | 294 | void RgetQRDecomp(int * rows, int * cols, float * a, float * q, int * pivot, 295 | int * rank) 296 | { 297 | 298 | int 299 | fbytes = sizeof(float), 300 | m = *rows, n = *cols; 301 | float 302 | * da, * dq; 303 | 304 | cublasAlloc(m*n, fbytes, (void **)&da); 305 | cublasAlloc(m*m, fbytes, (void **)&dq); 306 | cublasSetMatrix(m, n, fbytes, a, m, da, m); 307 | 308 | getQRDecomp(m, n, dq, da, pivot); 309 | 310 | cublasGetMatrix(m, n, fbytes, da, m, a, m); 311 | cublasGetMatrix(m, m, fbytes, dq, m, q, m); 312 | cublasFree(da); 313 | cublasFree(dq); 314 | 315 | int foundZero = 0; 316 | for(int i = 0; (i < m) && (i < n); i++) { 317 | if((a[i+i*m] < 0.0001f) && (a[i+i*m] > -0.0001f)) { 318 | foundZero = 1; 319 | *rank = i+1; 320 | break; 321 | } 322 | } 323 | if(!foundZero) { 324 | if(m > n) *rank = n; 325 | else *rank = m; 326 | } 327 | } 328 | 329 | // solve for B: XB=Y where B and Y are vectors and X is a matrix of 330 | // dimension rows x cols 331 | void RqrSolver(int * rows, int * cols, float * matX, float * vectY, 332 | float * vectB) 333 | { 334 | int 335 | fbytes = sizeof(float), 336 | m = *rows, n = *cols; 337 | float 338 | * dX, * dY, * dB; 339 | 340 | cublasAlloc(m*n, fbytes, (void **)&dX); 341 | cublasAlloc(n, fbytes, (void **)&dB); 342 | cublasAlloc(m, 
fbytes, (void **)&dY); 343 | checkCublasError("RqrSolver: line 80"); 344 | 345 | cublasSetMatrix(m, n, fbytes, matX, m, dX, m); 346 | cublasSetVector(m, fbytes, vectY, 1, dY, 1); 347 | checkCublasError("RqrSolver: line 84"); 348 | 349 | qrSolver(m, n, dX, dY, dB); 350 | 351 | cublasFree(dX); 352 | cublasFree(dY); 353 | 354 | cublasGetVector(n, fbytes, dB, 1, vectB, 1); 355 | checkCublasError("RqrSolver: line 93"); 356 | 357 | cublasFree(dB); 358 | } 359 | 360 | void rGetQRDecompRR(const int * rows, const int * cols, 361 | const double * tol, float * x, int * pivot, 362 | double * qraux, int * rank) 363 | { 364 | float * dQR; 365 | cudaMalloc((void **) &dQR, (*rows) * (*cols) * sizeof(float)); 366 | checkCudaError("rGetQRDecompRR:"); 367 | 368 | cudaMemcpy(dQR, x, (*rows) * (*cols) * sizeof(float), 369 | cudaMemcpyHostToDevice); 370 | 371 | getQRDecompRR(*rows, *cols, *tol, dQR, pivot, qraux, rank); 372 | 373 | cudaMemcpy(x, dQR, (*rows) * (*cols) * sizeof(float), 374 | cudaMemcpyDeviceToHost); 375 | checkCudaError("rGetQRDecompRR:"); 376 | cudaFree(dQR); 377 | checkCudaError("rGetQRDecompRR:"); 378 | } 379 | 380 | void rGetInverseFromQR(const int * rows, const int * cols, 381 | const float * q, const float * r, 382 | float * inverse) 383 | { 384 | float 385 | * dQ, * dR, * dInverse; 386 | 387 | cudaMalloc((void **) &dQ, (*rows) * (*cols) * sizeof(float)); 388 | cudaMalloc((void **) &dR, (*cols) * (*cols) * sizeof(float)); 389 | cudaMalloc((void **) &dInverse, (*rows) * (*cols) * sizeof(float)); 390 | checkCudaError("rGetInverseFromQR:"); 391 | 392 | cudaMemcpy(dQ, q, (*rows) * (*cols) * sizeof(float), 393 | cudaMemcpyHostToDevice); 394 | cudaMemcpy(dR, r, (*cols) * (*cols) * sizeof(float), 395 | cudaMemcpyHostToDevice); 396 | 397 | getInverseFromQR(*rows, *cols, dQ, dR, dInverse); 398 | 399 | cudaFree(dQ); 400 | cudaFree(dR); 401 | 402 | cudaMemcpy(inverse, dInverse, (*rows) * (*cols) * sizeof(float), 403 | cudaMemcpyDeviceToHost); 404 | 
checkCudaError("rGetInverseFromQR:"); 405 | 406 | cudaFree(dInverse); 407 | } 408 | 409 | void rSolveFromQR(const int * rows, const int * cols, const float * q, const float * r, 410 | const float * y, float * b) 411 | { 412 | solveFromQR(*rows, *cols, q, r, y, b); 413 | } 414 | 415 | void rBSplineMutualInfo(int * nBins, int * splineOrder, int * nsamples, 416 | int * rowsA, const float * A, 417 | int * rowsB, const float * B, 418 | float * mutualInfo) 419 | { 420 | bSplineMutualInfo(*nBins, *splineOrder, *nsamples, *rowsA, A, *rowsB, B, 421 | mutualInfo); 422 | } 423 | 424 | // Interface for R functions requiring least-squares computations. 425 | // 426 | void RgpuLSFit(float *X, int *n, int *p, float *Y, int *nY, 427 | double *tol, float *coeffs, float *resids, float *effects, 428 | int *rank, int *pivot, double * qrAux, int useSingle) 429 | { 430 | if (useSingle) { 431 | gpuLSFitF(X, *n, *p, Y, *nY, *tol, coeffs, resids, effects, 432 | rank, pivot, qrAux); 433 | } 434 | else { 435 | // gpuLSFitD(X, *n, *p, Y, *nY, *tol, coeffs, resids, effects, rank, pivot, qrAux); 436 | } 437 | } 438 | -------------------------------------------------------------------------------- /src/rinterface.h: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | // whichObs = 0 means everything 3 | // whichObs = 1 means pairwiseComplete 4 | void rpmcc(const int * whichObs, 5 | const float * samplesA, const int * numSamplesA, 6 | const float * samplesB, const int * numSamplesB, 7 | const int * sampleSize, float * numPairs, 8 | float * correlations, float * signifs); 9 | 10 | void rformatInput(const int * images, 11 | const int * xcoords, const int * ycoords, const int * zcoords, 12 | const int * mins, const int * maxes, 13 | const float * evs, const int * numrows, const int * numimages, 14 | float * output); 15 | 16 | void rformatOutput(const int * imageList1, const int * numImages1, 17 | const int * imageList2, const int * numImages2, 18 | 
const int * structureid, 19 | const double * cutCorrelation, const int * cutPairs, 20 | const double * correlations, const double * signifs, 21 | const int * numPairs, double * results, int * nrows); 22 | 23 | void rsetDevice(const int * device); 24 | void rgetDevice(int * device); 25 | 26 | void rtestT(const float * pairs, const float * coeffs, const int * n, 27 | float * ts, const char ** kernelSrc); 28 | void rhostT(const float * pairs, const float * coeffs, const int * n, 29 | float * ts); 30 | void rSignifFilter(const double * data, int * rows, double * results); 31 | void gSignifFilter(const float * data, int * rows, float * results); 32 | 33 | void RcublasPMCC(const float * samplesA, const int * numSamplesA, 34 | const float * samplesB, const int * numSamplesB, 35 | const int * sampleSize, float * correlations); 36 | 37 | void RhostKendall(const float * X, const float * Y, const int * n, 38 | double * answer); 39 | void RpermHostKendall(const float * X, const int * nx, const float * Y, 40 | const int * ny, const int * sampleSize, double * answers); 41 | void RgpuKendall(const float * X, const int * nx, const float * Y, 42 | const int * ny, const int * sampleSize, double * answers); 43 | 44 | void rgpuGranger(const int * rows, const int * colsy, const float * y, 45 | const int * p, float * fStats, float * pValues); 46 | void rgpuGrangerXY(const int * rows, const int * colsx, const float * x, 47 | const int * colsy, const float * y, const int * p, 48 | float * fStats, float * pValues); 49 | 50 | void Rdistclust(const char ** distmethod, const char ** clustmethod, 51 | const float * points, const int * numPoints, const int * dim, 52 | int * merge, int * order, float * val); 53 | void Rdistances(const float * points, const int * numPoints, 54 | const int * dim, float * distances, const char ** method, 55 | const float * p); 56 | void Rhcluster(const float * distMat, const int * numPoints, 57 | int * merge, int * order, float * val, const char ** method); 58 | 59 
| void RgetQRDecomp(int * rows, int * cols, float * a, float * q, int * pivot, 60 | int * rank); 61 | void RqrSolver(int * rows, int * cols, float * matX, float * vectY, 62 | float * vectB); 63 | 64 | void rGetQRDecompRR(const int * rows, const int * cols, 65 | const double * tol, float * x, int * pivot, 66 | double * qraux, int * rank); 67 | 68 | void rGetInverseFromQR(const int * rows, const int * cols, const float * q, 69 | const float * r, float * inverse); 70 | void rSolveFromQR(const int * rows, const int * cols, const float * q, 71 | const float * r, const float * y, float * b); 72 | 73 | void rBSplineMutualInfo(int * nBins, int * splineOrder, int * nsamples, 74 | int * rowsA, const float * A, 75 | int * rowsB, const float * B, 76 | float * mutualInfo); 77 | 78 | void RgpuLSFit(float *X, int *n, int *p, float *Y, int *nY, 79 | double *tol, float *coeffs, float *resids, float *effects, 80 | int *rank, int *pivot, double * qrAux, int useSingle); 81 | 82 | void setDevice(int * device); 83 | } 84 | 85 | void formatClustering(const int len, const int * sub, const int * sup, 86 | int * merge, int * order); 87 | void getPrintOrder(const int len, const int * merge, int * order); 88 | void depthFirst(const int len, const int * merge, int level, int * otop, 89 | int * order); 90 | -------------------------------------------------------------------------------- /src/sort.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include"cuseful.h" 5 | #include"sort.h" 6 | 7 | void copyVect(int n, double * from, int incFrom, double * to, int incTo) 8 | { 9 | for(int i = 0; i < n; i++) 10 | to[i*incTo] = from[i*incFrom]; 11 | } 12 | 13 | //Swap rows in a col major array 14 | void swapRows(int rows, int cols, double * array, int rowA, int rowB) 15 | { 16 | double * tempRow = Calloc(cols, double); 17 | 18 | copyVect(cols, array+rowA, rows, tempRow, 1); 19 | copyVect(cols, array+rowB, rows, array+rowA, rows); 20 
| copyVect(cols, tempRow, 1, array+rowB, rows); 21 | Free(tempRow); 22 | } 23 | 24 | //Find the index of the Median of the elements 25 | //of array that occur at every "shift" positions. 26 | int findMedianIndex(int rows, int cols, int colToSortOn, double * array, 27 | int left, int right, int shift) 28 | { 29 | int 30 | i, 31 | groups = (right - left)/shift + 1, 32 | k = left + groups/2*shift; 33 | 34 | double 35 | * colToSort = array+colToSortOn*rows; 36 | 37 | for(i = left; i <= k; i += shift) { 38 | 39 | int 40 | minRow = i; 41 | double 42 | minValue = colToSort[minRow]; 43 | 44 | for(int j = i; j <= right; j +=shift) { 45 | if(colToSort[j] < minValue) { 46 | minRow = j; 47 | minValue = colToSort[minRow]; 48 | } 49 | } 50 | swapRows(rows, cols, array, i, minRow); 51 | } 52 | return k; 53 | } 54 | 55 | //Computes the median of each group of 5 elements and stores 56 | //it as the first element of the group. Recursively does this 57 | //till there is only one group and hence only one Median 58 | double findMedianOfMedians(int rows, int cols, int colToSortOn, double * array, 59 | int left, int right) 60 | { 61 | double * colToSort = array+colToSortOn*rows; 62 | if(left == right) 63 | return colToSort[left]; 64 | 65 | int i, shift = 1; 66 | while(shift <= (right - left)) { 67 | for(i = left; i <= right; i+=shift*5) { 68 | int endIndex = (i + shift*5 - 1 < right) ? 
i + shift*5 - 1 : right; 69 | 70 | int medianIndex = findMedianIndex(rows, cols, colToSortOn, array, 71 | i, endIndex, shift); 72 | 73 | swapRows(rows, cols, array, i, medianIndex); 74 | } 75 | shift *= 5; 76 | } 77 | return colToSort[left]; 78 | } 79 | 80 | //Partition the array into two halves and return the 81 | //index about which the array is partitioned 82 | int partition(int rows, int cols, int colToSortOn, double * array, 83 | int left, int right) 84 | { 85 | //Makes the leftmost element a good pivot, 86 | //specifically the median of medians 87 | findMedianOfMedians(rows, cols, colToSortOn, array, left, right); 88 | 89 | int 90 | pivotIndex = left, index = left, 91 | i; 92 | double 93 | * colToSort = array+colToSortOn*rows, 94 | pivotValue = colToSort[pivotIndex]; 95 | 96 | swapRows(rows, cols, array, pivotIndex, right); 97 | 98 | for(i = left; i < right; i++) { 99 | if(colToSort[i] < pivotValue) { 100 | swapRows(rows, cols, array, i, index); 101 | index += 1; 102 | } 103 | } 104 | swapRows(rows, cols, array, right, index); 105 | return index; 106 | } 107 | 108 | //Quicksort the array 109 | void quicksort(int rows, int cols, int colToSortOn, double * array, 110 | int left, int right) 111 | { 112 | if(left >= right) 113 | return; 114 | 115 | int index = partition(rows, cols, colToSortOn, array, left, right); 116 | quicksort(rows, cols, colToSortOn, array, left, index - 1); 117 | quicksort(rows, cols, colToSortOn, array, index + 1, right); 118 | } 119 | -------------------------------------------------------------------------------- /src/sort.h: -------------------------------------------------------------------------------- 1 | void quicksort(int rows, int cols, int colToSortOn, double * array, 2 | int left, int right); 3 | -------------------------------------------------------------------------------- /tools/mdate-sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Get modification time of a file or 
directory and pretty-print it. 3 | 4 | scriptversion=2007-03-30.02 5 | 6 | # Copyright (C) 1995, 1996, 1997, 2003, 2004, 2005, 2007 Free Software 7 | # Foundation, Inc. 8 | # written by Ulrich Drepper , June 1995 9 | # 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program; if not, write to the Free Software Foundation, 22 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 23 | 24 | # As a special exception to the GNU General Public License, if you 25 | # distribute this file as part of a program that contains a 26 | # configuration script generated by Autoconf, you may include it under 27 | # the same distribution terms that you use for the rest of that program. 28 | 29 | # This file is maintained in Automake, please report 30 | # bugs to or send patches to 31 | # . 32 | 33 | case $1 in 34 | '') 35 | echo "$0: No file. Try \`$0 --help' for more information." 1>&2 36 | exit 1; 37 | ;; 38 | -h | --h*) 39 | cat <<\EOF 40 | Usage: mdate-sh [--help] [--version] FILE 41 | 42 | Pretty-print the modification time of FILE. 43 | 44 | Report bugs to . 45 | EOF 46 | exit $? 47 | ;; 48 | -v | --v*) 49 | echo "mdate-sh $scriptversion" 50 | exit $? 51 | ;; 52 | esac 53 | 54 | # Prevent date giving response in another language. 
55 | LANG=C 56 | export LANG 57 | LC_ALL=C 58 | export LC_ALL 59 | LC_TIME=C 60 | export LC_TIME 61 | 62 | # GNU ls changes its time format in response to the TIME_STYLE 63 | # variable. Since we cannot assume `unset' works, revert this 64 | # variable to its documented default. 65 | if test "${TIME_STYLE+set}" = set; then 66 | TIME_STYLE=posix-long-iso 67 | export TIME_STYLE 68 | fi 69 | 70 | save_arg1=$1 71 | 72 | # Find out how to get the extended ls output of a file or directory. 73 | if ls -L /dev/null 1>/dev/null 2>&1; then 74 | ls_command='ls -L -l -d' 75 | else 76 | ls_command='ls -l -d' 77 | fi 78 | # Avoid user/group names that might have spaces, when possible. 79 | if ls -n /dev/null 1>/dev/null 2>&1; then 80 | ls_command="$ls_command -n" 81 | fi 82 | 83 | # A `ls -l' line looks as follows on OS/2. 84 | # drwxrwx--- 0 Aug 11 2001 foo 85 | # This differs from Unix, which adds ownership information. 86 | # drwxrwx--- 2 root root 4096 Aug 11 2001 foo 87 | # 88 | # To find the date, we split the line on spaces and iterate on words 89 | # until we find a month. This cannot work with files whose owner is a 90 | # user named `Jan', or `Feb', etc. However, it's unlikely that `/' 91 | # will be owned by a user whose name is a month. So we first look at 92 | # the extended ls output of the root directory to decide how many 93 | # words should be skipped to get the date. 94 | 95 | # On HPUX /bin/sh, "set" interprets "-rw-r--r--" as options, so the "x" below. 96 | set x`$ls_command /` 97 | 98 | # Find which argument is the month. 99 | month= 100 | command= 101 | until test $month 102 | do 103 | shift 104 | # Add another shift to the command. 
105 | command="$command shift;" 106 | case $1 in 107 | Jan) month=January; nummonth=1;; 108 | Feb) month=February; nummonth=2;; 109 | Mar) month=March; nummonth=3;; 110 | Apr) month=April; nummonth=4;; 111 | May) month=May; nummonth=5;; 112 | Jun) month=June; nummonth=6;; 113 | Jul) month=July; nummonth=7;; 114 | Aug) month=August; nummonth=8;; 115 | Sep) month=September; nummonth=9;; 116 | Oct) month=October; nummonth=10;; 117 | Nov) month=November; nummonth=11;; 118 | Dec) month=December; nummonth=12;; 119 | esac 120 | done 121 | 122 | # Get the extended ls output of the file or directory. 123 | set dummy x`eval "$ls_command \"\$save_arg1\""` 124 | 125 | # Remove all preceding arguments 126 | eval $command 127 | 128 | # Because of the dummy argument above, month is in $2. 129 | # 130 | # On a POSIX system, we should have 131 | # 132 | # $# = 5 133 | # $1 = file size 134 | # $2 = month 135 | # $3 = day 136 | # $4 = year or time 137 | # $5 = filename 138 | # 139 | # On Darwin 7.7.0 and 7.6.0, we have 140 | # 141 | # $# = 4 142 | # $1 = day 143 | # $2 = month 144 | # $3 = year or time 145 | # $4 = filename 146 | 147 | # Get the month. 148 | case $2 in 149 | Jan) month=January; nummonth=1;; 150 | Feb) month=February; nummonth=2;; 151 | Mar) month=March; nummonth=3;; 152 | Apr) month=April; nummonth=4;; 153 | May) month=May; nummonth=5;; 154 | Jun) month=June; nummonth=6;; 155 | Jul) month=July; nummonth=7;; 156 | Aug) month=August; nummonth=8;; 157 | Sep) month=September; nummonth=9;; 158 | Oct) month=October; nummonth=10;; 159 | Nov) month=November; nummonth=11;; 160 | Dec) month=December; nummonth=12;; 161 | esac 162 | 163 | case $3 in 164 | ???*) day=$1;; 165 | *) day=$3; shift;; 166 | esac 167 | 168 | # Here we have to deal with the problem that the ls output gives either 169 | # the time of day or the year. 
170 | case $3 in 171 | *:*) set `date`; eval year=\$$# 172 | case $2 in 173 | Jan) nummonthtod=1;; 174 | Feb) nummonthtod=2;; 175 | Mar) nummonthtod=3;; 176 | Apr) nummonthtod=4;; 177 | May) nummonthtod=5;; 178 | Jun) nummonthtod=6;; 179 | Jul) nummonthtod=7;; 180 | Aug) nummonthtod=8;; 181 | Sep) nummonthtod=9;; 182 | Oct) nummonthtod=10;; 183 | Nov) nummonthtod=11;; 184 | Dec) nummonthtod=12;; 185 | esac 186 | # For the first six month of the year the time notation can also 187 | # be used for files modified in the last year. 188 | if (expr $nummonth \> $nummonthtod) > /dev/null; 189 | then 190 | year=`expr $year - 1` 191 | fi;; 192 | *) year=$3;; 193 | esac 194 | 195 | # The result. 196 | echo $day $month $year 197 | 198 | # Local Variables: 199 | # mode: shell-script 200 | # sh-indentation: 2 201 | # eval: (add-hook 'write-file-hooks 'time-stamp) 202 | # time-stamp-start: "scriptversion=" 203 | # time-stamp-format: "%:y-%02m-%02d.%02H" 204 | # time-stamp-end: "$" 205 | # End: 206 | -------------------------------------------------------------------------------- /tools/missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common stub for a few missing GNU programs while installing. 3 | 4 | scriptversion=2006-05-10.23 5 | 6 | # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006 7 | # Free Software Foundation, Inc. 8 | # Originally by Fran,cois Pinard , 1996. 9 | 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 
19 | 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program; if not, write to the Free Software 22 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 23 | # 02110-1301, USA. 24 | 25 | # As a special exception to the GNU General Public License, if you 26 | # distribute this file as part of a program that contains a 27 | # configuration script generated by Autoconf, you may include it under 28 | # the same distribution terms that you use for the rest of that program. 29 | 30 | if test $# -eq 0; then 31 | echo 1>&2 "Try \`$0 --help' for more information" 32 | exit 1 33 | fi 34 | 35 | run=: 36 | sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' 37 | sed_minuso='s/.* -o \([^ ]*\).*/\1/p' 38 | 39 | # In the cases where this matters, `missing' is being run in the 40 | # srcdir already. 41 | if test -f configure.ac; then 42 | configure_ac=configure.ac 43 | else 44 | configure_ac=configure.in 45 | fi 46 | 47 | msg="missing on your system" 48 | 49 | case $1 in 50 | --run) 51 | # Try to run requested program, and just exit if it succeeds. 52 | run= 53 | shift 54 | "$@" && exit 0 55 | # Exit code 63 means version mismatch. This often happens 56 | # when the user try to use an ancient version of a tool on 57 | # a file that requires a minimum version. In this case we 58 | # we should proceed has if the program had been absent, or 59 | # if --run hadn't been passed. 60 | if test $? = 63; then 61 | run=: 62 | msg="probably too old" 63 | fi 64 | ;; 65 | 66 | -h|--h|--he|--hel|--help) 67 | echo "\ 68 | $0 [OPTION]... PROGRAM [ARGUMENT]... 69 | 70 | Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an 71 | error status if there is no known handling for PROGRAM. 
72 | 73 | Options: 74 | -h, --help display this help and exit 75 | -v, --version output version information and exit 76 | --run try to run the given command, and emulate it if it fails 77 | 78 | Supported PROGRAM values: 79 | aclocal touch file \`aclocal.m4' 80 | autoconf touch file \`configure' 81 | autoheader touch file \`config.h.in' 82 | autom4te touch the output file, or create a stub one 83 | automake touch all \`Makefile.in' files 84 | bison create \`y.tab.[ch]', if possible, from existing .[ch] 85 | flex create \`lex.yy.c', if possible, from existing .c 86 | help2man touch the output file 87 | lex create \`lex.yy.c', if possible, from existing .c 88 | makeinfo touch the output file 89 | tar try tar, gnutar, gtar, then tar without non-portable flags 90 | yacc create \`y.tab.[ch]', if possible, from existing .[ch] 91 | 92 | Send bug reports to ." 93 | exit $? 94 | ;; 95 | 96 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 97 | echo "missing $scriptversion (GNU Automake)" 98 | exit $? 99 | ;; 100 | 101 | -*) 102 | echo 1>&2 "$0: Unknown \`$1' option" 103 | echo 1>&2 "Try \`$0 --help' for more information" 104 | exit 1 105 | ;; 106 | 107 | esac 108 | 109 | # Now exit if we have it, but it failed. Also exit now if we 110 | # don't have it and --version was passed (most likely to detect 111 | # the program). 112 | case $1 in 113 | lex|yacc) 114 | # Not GNU programs, they don't have --version. 115 | ;; 116 | 117 | tar) 118 | if test -n "$run"; then 119 | echo 1>&2 "ERROR: \`tar' requires --run" 120 | exit 1 121 | elif test "x$2" = "x--version" || test "x$2" = "x--help"; then 122 | exit 1 123 | fi 124 | ;; 125 | 126 | *) 127 | if test -z "$run" && ($1 --version) > /dev/null 2>&1; then 128 | # We have it, but it failed. 129 | exit 1 130 | elif test "x$2" = "x--version" || test "x$2" = "x--help"; then 131 | # Could not run --version or --help. 
This is probably someone 132 | # running `$TOOL --version' or `$TOOL --help' to check whether 133 | # $TOOL exists and not knowing $TOOL uses missing. 134 | exit 1 135 | fi 136 | ;; 137 | esac 138 | 139 | # If it does not exist, or fails to run (possibly an outdated version), 140 | # try to emulate it. 141 | case $1 in 142 | aclocal*) 143 | echo 1>&2 "\ 144 | WARNING: \`$1' is $msg. You should only need it if 145 | you modified \`acinclude.m4' or \`${configure_ac}'. You might want 146 | to install the \`Automake' and \`Perl' packages. Grab them from 147 | any GNU archive site." 148 | touch aclocal.m4 149 | ;; 150 | 151 | autoconf) 152 | echo 1>&2 "\ 153 | WARNING: \`$1' is $msg. You should only need it if 154 | you modified \`${configure_ac}'. You might want to install the 155 | \`Autoconf' and \`GNU m4' packages. Grab them from any GNU 156 | archive site." 157 | touch configure 158 | ;; 159 | 160 | autoheader) 161 | echo 1>&2 "\ 162 | WARNING: \`$1' is $msg. You should only need it if 163 | you modified \`acconfig.h' or \`${configure_ac}'. You might want 164 | to install the \`Autoconf' and \`GNU m4' packages. Grab them 165 | from any GNU archive site." 166 | files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` 167 | test -z "$files" && files="config.h" 168 | touch_files= 169 | for f in $files; do 170 | case $f in 171 | *:*) touch_files="$touch_files "`echo "$f" | 172 | sed -e 's/^[^:]*://' -e 's/:.*//'`;; 173 | *) touch_files="$touch_files $f.in";; 174 | esac 175 | done 176 | touch $touch_files 177 | ;; 178 | 179 | automake*) 180 | echo 1>&2 "\ 181 | WARNING: \`$1' is $msg. You should only need it if 182 | you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. 183 | You might want to install the \`Automake' and \`Perl' packages. 184 | Grab them from any GNU archive site." 185 | find . 
-type f -name Makefile.am -print | 186 | sed 's/\.am$/.in/' | 187 | while read f; do touch "$f"; done 188 | ;; 189 | 190 | autom4te) 191 | echo 1>&2 "\ 192 | WARNING: \`$1' is needed, but is $msg. 193 | You might have modified some files without having the 194 | proper tools for further handling them. 195 | You can get \`$1' as part of \`Autoconf' from any GNU 196 | archive site." 197 | 198 | file=`echo "$*" | sed -n "$sed_output"` 199 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 200 | if test -f "$file"; then 201 | touch $file 202 | else 203 | test -z "$file" || exec >$file 204 | echo "#! /bin/sh" 205 | echo "# Created by GNU Automake missing as a replacement of" 206 | echo "# $ $@" 207 | echo "exit 0" 208 | chmod +x $file 209 | exit 1 210 | fi 211 | ;; 212 | 213 | bison|yacc) 214 | echo 1>&2 "\ 215 | WARNING: \`$1' $msg. You should only need it if 216 | you modified a \`.y' file. You may need the \`Bison' package 217 | in order for those modifications to take effect. You can get 218 | \`Bison' from any GNU archive site." 219 | rm -f y.tab.c y.tab.h 220 | if test $# -ne 1; then 221 | eval LASTARG="\${$#}" 222 | case $LASTARG in 223 | *.y) 224 | SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` 225 | if test -f "$SRCFILE"; then 226 | cp "$SRCFILE" y.tab.c 227 | fi 228 | SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` 229 | if test -f "$SRCFILE"; then 230 | cp "$SRCFILE" y.tab.h 231 | fi 232 | ;; 233 | esac 234 | fi 235 | if test ! -f y.tab.h; then 236 | echo >y.tab.h 237 | fi 238 | if test ! -f y.tab.c; then 239 | echo 'main() { return 0; }' >y.tab.c 240 | fi 241 | ;; 242 | 243 | lex|flex) 244 | echo 1>&2 "\ 245 | WARNING: \`$1' is $msg. You should only need it if 246 | you modified a \`.l' file. You may need the \`Flex' package 247 | in order for those modifications to take effect. You can get 248 | \`Flex' from any GNU archive site." 
249 | rm -f lex.yy.c 250 | if test $# -ne 1; then 251 | eval LASTARG="\${$#}" 252 | case $LASTARG in 253 | *.l) 254 | SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` 255 | if test -f "$SRCFILE"; then 256 | cp "$SRCFILE" lex.yy.c 257 | fi 258 | ;; 259 | esac 260 | fi 261 | if test ! -f lex.yy.c; then 262 | echo 'main() { return 0; }' >lex.yy.c 263 | fi 264 | ;; 265 | 266 | help2man) 267 | echo 1>&2 "\ 268 | WARNING: \`$1' is $msg. You should only need it if 269 | you modified a dependency of a manual page. You may need the 270 | \`Help2man' package in order for those modifications to take 271 | effect. You can get \`Help2man' from any GNU archive site." 272 | 273 | file=`echo "$*" | sed -n "$sed_output"` 274 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 275 | if test -f "$file"; then 276 | touch $file 277 | else 278 | test -z "$file" || exec >$file 279 | echo ".ab help2man is required to generate this page" 280 | exit 1 281 | fi 282 | ;; 283 | 284 | makeinfo) 285 | echo 1>&2 "\ 286 | WARNING: \`$1' is $msg. You should only need it if 287 | you modified a \`.texi' or \`.texinfo' file, or any other file 288 | indirectly affecting the aspect of the manual. The spurious 289 | call might also be the consequence of using a buggy \`make' (AIX, 290 | DU, IRIX). You might want to install the \`Texinfo' package or 291 | the \`GNU make' package. Grab either from any GNU archive site." 292 | # The file to touch is that specified with -o ... 293 | file=`echo "$*" | sed -n "$sed_output"` 294 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 295 | if test -z "$file"; then 296 | # ... or it is the one specified with @setfilename ... 297 | infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` 298 | file=`sed -n ' 299 | /^@setfilename/{ 300 | s/.* \([^ ]*\) *$/\1/ 301 | p 302 | q 303 | }' $infile` 304 | # ... 
or it is derived from the source name (dir/f.texi becomes f.info) 305 | test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info 306 | fi 307 | # If the file does not exist, the user really needs makeinfo; 308 | # let's fail without touching anything. 309 | test -f $file || exit 1 310 | touch $file 311 | ;; 312 | 313 | tar) 314 | shift 315 | 316 | # We have already tried tar in the generic part. 317 | # Look for gnutar/gtar before invocation to avoid ugly error 318 | # messages. 319 | if (gnutar --version > /dev/null 2>&1); then 320 | gnutar "$@" && exit 0 321 | fi 322 | if (gtar --version > /dev/null 2>&1); then 323 | gtar "$@" && exit 0 324 | fi 325 | firstarg="$1" 326 | if shift; then 327 | case $firstarg in 328 | *o*) 329 | firstarg=`echo "$firstarg" | sed s/o//` 330 | tar "$firstarg" "$@" && exit 0 331 | ;; 332 | esac 333 | case $firstarg in 334 | *h*) 335 | firstarg=`echo "$firstarg" | sed s/h//` 336 | tar "$firstarg" "$@" && exit 0 337 | ;; 338 | esac 339 | fi 340 | 341 | echo 1>&2 "\ 342 | WARNING: I can't seem to be able to run \`tar' with the given arguments. 343 | You may want to install GNU tar or Free paxutils, or check the 344 | command line arguments." 345 | exit 1 346 | ;; 347 | 348 | *) 349 | echo 1>&2 "\ 350 | WARNING: \`$1' is needed, and is $msg. 351 | You might have modified some files without having the 352 | proper tools for further handling them. Check the \`README' file, 353 | it often tells you about the needed prerequisites for installing 354 | this package. You may also peek at any GNU archive site, in case 355 | some other package would contain this missing \`$1' program." 
356 | exit 1 357 | ;; 358 | esac 359 | 360 | exit 0 361 | 362 | # Local variables: 363 | # eval: (add-hook 'write-file-hooks 'time-stamp) 364 | # time-stamp-start: "scriptversion=" 365 | # time-stamp-format: "%:y-%02m-%02d.%02H" 366 | # time-stamp-end: "$" 367 | # End: 368 | --------------------------------------------------------------------------------