├── .gitignore ├── DESCRIPTION ├── INSTALL ├── NAMESPACE ├── NEWS ├── R ├── chooseGpu.R ├── getGpuId.R ├── gpuCor.R ├── gpuGranger.R ├── gpuHclust.R ├── gpuLm.R ├── gpuMatMult.R ├── gpuMi.R ├── gpuQr.R ├── gpuSolve.R ├── hooks.R └── zzz.R ├── cleanup ├── configure ├── configure.ac ├── inst └── cuda │ ├── correlation.cu │ ├── distance.cu │ ├── granger.cu │ ├── hcluster.cu │ ├── kendall.cu │ ├── mi.cu │ └── qrdecomp.cu ├── man ├── chooseGpu.Rd ├── cpuMatMult.Rd ├── getGpuId.Rd ├── gpuCor.Rd ├── gpuCrossprod.Rd ├── gpuDist.Rd ├── gpuDistClust.Rd ├── gpuGlm.Rd ├── gpuGranger.Rd ├── gpuHclust.Rd ├── gpuLm.Rd ├── gpuLm.defaultTol.Rd ├── gpuLm.fit.Rd ├── gpuLsfit.Rd ├── gpuMatMult.Rd ├── gpuMi.Rd ├── gpuQr.Rd ├── gpuSolve.Rd ├── gpuTcrossprod.Rd └── gpuTtest.Rd ├── src ├── Makevars.in ├── correlation.cpp ├── correlation.h ├── cudaUtils.cpp ├── cudaUtils.h ├── cuseful.cpp ├── cuseful.h ├── distance.cpp ├── distance.h ├── granger.cpp ├── granger.h ├── hcluster.cpp ├── hcluster.h ├── kendall.cpp ├── kendall.h ├── lsfit.cpp ├── lsfit.h ├── matmult.cpp ├── matmult.h ├── mi.cpp ├── mi.h ├── qrdecomp.cpp ├── qrdecomp.h ├── rinterface.cpp ├── rinterface.h ├── sort.cpp └── sort.h └── tools ├── config.guess ├── config.sub ├── install-sh ├── mdate-sh └── missing /.gitignore: -------------------------------------------------------------------------------- 1 | autom4te.cache 2 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: gputools 2 | Version: 1.2 3 | Title: A Few GPU Enabled Functions 4 | Authors@R: c(person("Josh", "Buckner", email = "nullsatz@gmail.com", 5 | role = c("aut", "cre")), 6 | person("Mark", "Seligman", email = "mseligman@suiji.org", role = "aut"), 7 | person("Fan", "Meng", email = "mengf@umich.edu", role = "aut"), 8 | person("Justin", "Wilson", role = "ctb")) 9 | Depends: R (>= 3.1.2) 10 | SystemRequirements: Nvidia's CUDA 
toolkit (>= release 7.0) 11 | Description: Provides R interfaces to a handful of common 12 | functions implemented using the Nvidia CUDA toolkit. Some of the 13 | functions require at least GPU Compute Capability 1.3. 14 | Thanks to Craig Stark at UC Irvine for donating time on his lab's Mac. 15 | License: GPL-3 16 | URL: https://github.com/nullsatz/gputools/wiki 17 | BugReports: https://github.com/nullsatz/gputools/issues 18 | Repository: CRAN 19 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | gputools 2 | 3 | ========= 4 | 5 | To install the package, you must first install Nvidia's CUDA Toolkit available from 6 | 7 | http://developer.nvidia.com/cuda-downloads 8 | 9 | You will need an Nvidia GPU of compute capability >= 2. 10 | 11 | ========= 12 | 13 | Installation 14 | 15 | If the package installation fails with an error: 16 | 17 | Set the environment variable CUDA_HOME to the root of your CUDA toolkit installation. 18 | 19 | If that doesn't help, there are several 'configure options' that may be set from the command line: 20 | 21 | R CMD INSTALL --configure-args="" gputools_0.5.tar.gz 22 | 23 | include: 24 | --with-nvcc= 25 | --with-cuda= 26 | --with-r= 27 | --with-r-include= 28 | --with-r-lib= 29 | 30 | For example: 31 | R CMD INSTALL --configure-args="--with-nvcc=/usr/local/cuda/bin/nvcc --with-r-lib=/usr/local/share/R/lib64" gputools_0.5.tar.gz 32 | 33 | Enjoy! 
34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(gputools) 2 | 3 | export(gpuCor, 4 | gpuTtest, 5 | gpuQr, gpuSolve, 6 | gpuGranger, 7 | gpuDist, gpuHclust, gpuDistClust, 8 | gpuMatMult, cpuMatMult, gpuCrossprod, gpuTcrossprod, 9 | gpuMi, 10 | gpuLm, gpuLm.fit, gpuLsfit, gpuGlm, gpuGlm.fit, 11 | chooseGpu, getGpuId) 12 | 13 | importFrom("stats", 14 | ".getXlevels", 15 | "as.dist", 16 | "complete.cases", 17 | "gaussian", 18 | "is.empty.model", 19 | "model.extract", 20 | "model.matrix", 21 | "model.offset", 22 | "model.response", 23 | "model.weights") 24 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | gputools changes 2 | ================= 3 | 4 | changes in gputools 1.1 5 | - this is a point release intended to trial cuda run-time compilation (nvrtc) 6 | it is only used for the kendall correlation option of gpuCor 7 | but nvrtc will be used for all cuda kernel code in future releases 8 | 9 | changes in gputools 0.5 10 | - switched back to autotools options 11 | 12 | changes in gputools 0.28 13 | - updated installation procedure 14 | - updated OSX compatibility 15 | - removed CULA dependencies 16 | 17 | changes in gputools 0.27 18 | - removed LICENSE file and updated DESCRIPTION file 19 | - fixed R documentation files 20 | - removed nonfunctional svm code 21 | 22 | changes in gputools 0.26 23 | - fixes to several potential bugs: 24 | There was a variable in the QR code which was never 25 | deallocated. Also, there were several variables freed with 26 | the wrong method. Also, added a missing cublasInit/Shutdown pair. 27 | Thanks to Eunjin Lee's error reporting! 
28 | 29 | changes in gputools 0.24 30 | 31 | - added support for configure argument --with-compiler-bindir and 32 | environment variable COMPILER_BINDIR to support gcc located somewhere 33 | other than the user's path. Use one of these to get the package 34 | installed in flavors of Linux that ship with default gcc version > 4.4. 35 | 36 | changes in gputools 0.24 37 | 38 | - A couple of changes to the Makefile: 39 | * added -I. for compatibility with R 2.12 40 | * switched from CC to NVCC for compat with ~/.R/Makevars 41 | 42 | changes in gputools 0.21 43 | 44 | - fixed another bug in gpuMi 45 | 46 | changes in gputools 0.2 47 | 48 | - added some linear and nonlinear regression functions 49 | 50 | - added some configure args --with-r-lib and --with-r-include (see INSTALL 51 | notes section) 52 | 53 | - fixed a bug in gpuMi 54 | 55 | - added a demo of CULA use: fastICA with CULA's svd plugged in. 56 | 57 | changes in gputools 0.1.3 58 | 59 | - Mac os x and nvidia cards with compute capability less than 1.3 are now 60 | supported. Systems lacking a gpu altogether also receive some support. See the 61 | file INSTALL for details. 62 | 63 | - gpuHclust, gpuDist, and gpuQr now output the same classes as R's base hclust, 64 | dist, and qr functions. 65 | 66 | - The c code now uses r's api. Host memory allocation and error reporting 67 | have received a facelift. 68 | 69 | - gpuGranger now reports the correct p-values and is slightly faster for small 70 | data sets. 71 | 72 | - Some bugs with gpuCor's pearson method have been fixed. 
73 | -------------------------------------------------------------------------------- /R/chooseGpu.R: -------------------------------------------------------------------------------- 1 | chooseGpu <- function(deviceId = 0) 2 | { 3 | deviceId <- as.integer(deviceId) 4 | .C("rsetDevice", deviceId, PACKAGE='gputools') 5 | } 6 | -------------------------------------------------------------------------------- /R/getGpuId.R: -------------------------------------------------------------------------------- 1 | getGpuId <- function() 2 | { 3 | deviceId <- .C("rgetDevice", deviceId = integer(1), 4 | PACKAGE='gputools')$deviceId 5 | return(deviceId) 6 | } 7 | -------------------------------------------------------------------------------- /R/gpuCor.R: -------------------------------------------------------------------------------- 1 | gpuCor <- function(x, y = NULL, use = "everything", method = "pearson") { 2 | x <- as.matrix(x) 3 | nx <- ncol(x) 4 | size <- nrow(x) 5 | 6 | if(is.null(y)) { 7 | y <- x 8 | } else { 9 | y <- as.matrix(y) 10 | } 11 | ny <- ncol(y) 12 | 13 | n <- nx * ny 14 | 15 | methods <- c("pearson", "kendall") 16 | method <- pmatch(method, methods, -1) 17 | if(is.na(method)) { 18 | stop("invalid correlation method") 19 | } 20 | if(method == -1) { 21 | stop("ambiguous correlation method") 22 | } 23 | 24 | uses <- c("everything", "pairwise.complete.obs") 25 | use <- pmatch(use, uses, -1) 26 | if(is.na(use)) { 27 | stop("invalid correlation method") 28 | } 29 | if(use == -1) { 30 | stop("ambiguous correlation method") 31 | } 32 | 33 | if(methods[method] == "pearson") { 34 | answer <- .C("rpmcc", NAOK=TRUE, 35 | as.integer(use - 1), as.single(x), as.integer(nx), 36 | as.single(y), as.integer(ny), as.integer(size), 37 | pairs = single(n), corr = single(n), ts = single(n), 38 | PACKAGE='gputools') 39 | 40 | pairs <- t(matrix(answer$pairs, ny, nx)) 41 | corr <- t(matrix(answer$corr, ny, nx)) 42 | ts <- t(matrix(answer$ts, ny, nx)) 43 | 44 | return(list(coefficients 
= corr, ts = ts, pairs = pairs)) 45 | 46 | } else if(methods[method] == "kendall") { 47 | 48 | if(uses[use] != "everything") { 49 | warning("NA handling for Kendall's is not yet supported. Defaulting to using everything. Sorry for any inconvenience.") 50 | } 51 | 52 | a <- .C("RgpuKendall", 53 | as.single(x), nx, as.single(y), ny, 54 | size, result = double(nx*ny), 55 | PACKAGE = "gputools") 56 | 57 | pairs <- matrix(size, nx, ny) 58 | return(list(coefficients = matrix(a$result, nx, ny), pairs = pairs)) 59 | } else { 60 | stop("This correlation method is not yet supported.") 61 | } 62 | } 63 | 64 | gpuTtest <- function(goodPairs, coeffs) { 65 | goodPairs <- as.single(goodPairs) 66 | coeffs <- as.single(coeffs) 67 | 68 | n <- as.integer(length(goodPairs)) 69 | 70 | .C("rtestT", NAOK = TRUE, 71 | goodPairs, coeffs, n, 72 | results = single(n), 73 | PACKAGE = 'gputools')$results 74 | } 75 | -------------------------------------------------------------------------------- /R/gpuGranger.R: -------------------------------------------------------------------------------- 1 | gpuGranger <- function(x, y=NULL, lag) 2 | { 3 | x <- as.matrix(x) 4 | rows <- nrow(x) 5 | colsx <- ncol(x) 6 | 7 | if(rows - lag <= 2 * lag + 1) { 8 | stop("time sequence too short for lag: use longer sequences or smaller lag") 9 | } 10 | 11 | if(!is.null(y)) { 12 | y <- as.matrix(y) 13 | } 14 | 15 | lag <- as.integer(lag) 16 | 17 | if(is.null(y)) { 18 | colsy <- colsx 19 | cRetVal <- .C("rgpuGranger", 20 | as.integer(rows), as.integer(colsx), as.single(x), lag, 21 | fStats = single(colsx*colsy), 22 | pValues = single(colsx*colsy), 23 | PACKAGE='gputools') 24 | } else { 25 | colsy <- ncol(y) 26 | cRetVal <- .C("rgpuGrangerXY", 27 | as.integer(rows), as.integer(colsx), as.single(x), 28 | as.integer(colsy), as.single(y), lag, 29 | fStats = single(colsx*colsy), 30 | pValues = single(colsx*colsy), 31 | PACKAGE='gputools') 32 | } 33 | fStats <- matrix(cRetVal$fStats, colsx, colsy) 34 | pValues <- 
matrix(cRetVal$pValues, colsx, colsy) 35 | return(list(fStatistics = fStats, pValues = pValues)) 36 | } 37 | -------------------------------------------------------------------------------- /R/gpuHclust.R: -------------------------------------------------------------------------------- 1 | gpuDist <- function(points, method = "euclidean", p = 2.0) 2 | { 3 | if(!is.na(pmatch(method, "euclidian"))) { 4 | method <- "euclidean" 5 | } 6 | 7 | methods <- c("euclidean", "maximum", "manhattan", "canberra", "binary", 8 | "minkowski") 9 | method <- pmatch(method, methods) # hey presto method becomes an int 10 | if(is.na(method)) { 11 | stop("invalid distance method") 12 | } 13 | if(method == -1) { 14 | stop("ambiguous distance method") 15 | } 16 | method <- methods[method] # return method to a meaningful string 17 | 18 | points <- as.matrix(points) 19 | numPoints <- nrow(points) 20 | 21 | a <- .C("Rdistances", 22 | as.single(t(points)), 23 | as.integer(numPoints), 24 | as.integer(ncol(points)), 25 | d = single(numPoints * numPoints), 26 | method, as.single(p), 27 | PACKAGE='gputools') 28 | 29 | d <- as.dist(matrix(a$d, numPoints, numPoints)) 30 | attr(d, "Labels") <- dimnames(points)[[1L]] 31 | attr(d, "method") <- method 32 | attr(d, "call") <- match.call() 33 | 34 | if(!is.na(pmatch(method, "minkowski"))) { 35 | attr(d, "p") <- p 36 | } 37 | 38 | return(d) 39 | } 40 | 41 | gpuHclust <- function(distances, method = "complete") 42 | { 43 | methods <- c("ward", "single", "complete", "average", "mcquitty", 44 | "median", "centroid", "flexible", "flexible group", "wpgma") 45 | method <- pmatch(method, methods) # method is now an integer 46 | if(is.na(method)) { 47 | stop("invalid clustering method") 48 | } 49 | if(method == -1) { 50 | stop("ambiguous clustering method") 51 | } 52 | method <- methods[method] # return method to a meaningful string 53 | 54 | n <- as.integer(attr(distances, "Size")) 55 | if(is.null(n)) { 56 | stop("invalid dissimilarities") 57 | } 58 | if(n < 2) { 
59 | stop("must have n >= 2 objects to cluster") 60 | } 61 | 62 | len <- as.integer(n*(n-1)/2) 63 | if(length(distances) != len) { 64 | if (length(distances) < len) { 65 | stop("dissimilarities of improper length") 66 | } else { 67 | warning("dissimilarities of improper length") 68 | } 69 | } 70 | 71 | numpoints <- n 72 | a <- .C("Rhcluster", 73 | as.single(as.matrix(distances)), 74 | as.integer(numpoints), 75 | merge = integer(2*(numpoints-1)), 76 | order = integer(numpoints), 77 | val = single(numpoints-1), 78 | method, 79 | PACKAGE='gputools') 80 | 81 | merge <- matrix(a$merge, numpoints-1, 2) 82 | 83 | tree <- list(merge = merge, height= a$val, order = a$order, 84 | labels = attr(distances, "Labels"), 85 | method = method, 86 | call = match.call(), 87 | dist.method = attr(distances, "method")) 88 | 89 | class(tree) <- "hclust" 90 | return(tree) 91 | } 92 | 93 | gpuDistClust <- function(points, distmethod = "euclidean", 94 | clustmethod = "complete") 95 | { 96 | if(!is.na(pmatch(distmethod, "euclidian"))) { 97 | method <- "euclidean" 98 | } 99 | 100 | methods <- c("euclidean", "maximum", "manhattan", "canberra", "binary", 101 | "minkowski") 102 | distmethod <- pmatch(distmethod, methods) # hey presto method becomes an int 103 | if(is.na(distmethod)) { 104 | stop("invalid distance method") 105 | } 106 | if(distmethod == -1) { 107 | stop("ambiguous distance method") 108 | } 109 | distmethod <- methods[distmethod] # return method to a meaningful string 110 | 111 | methods <- c("ward", "single", "complete", "average", "mcquitty", 112 | "median", "centroid", "flexible", "flexible group", "wpgma") 113 | clustmethod <- pmatch(clustmethod, methods) # method is now an integer 114 | if(is.na(clustmethod)) { 115 | stop("invalid clustering method") 116 | } 117 | if(clustmethod == -1) { 118 | stop("ambiguous clustering method") 119 | } 120 | clustmethod <- methods[clustmethod] # return method to a meaningful string 121 | 122 | points <- as.matrix(points) 123 | nump <- 
nrow(points) 124 | 125 | a <- .C("Rdistclust", 126 | distmethod, clustmethod, 127 | as.single(t(points)), 128 | as.integer(nump), 129 | as.integer(ncol(points)), 130 | merge = integer(2*(nump-1)), 131 | order = integer(nump), 132 | val = single(nump-1), 133 | PACKAGE='gputools') 134 | 135 | merge <- matrix(a$merge, nump-1, 2) 136 | 137 | tree <- list(merge = merge, height = a$val, order = a$order, 138 | labels = dimnames(points)[[1L]], 139 | method = clustmethod, 140 | call = match.call(), 141 | dist.method = distmethod) 142 | 143 | class(tree) <- "hclust" 144 | 145 | return(tree) 146 | } 147 | -------------------------------------------------------------------------------- /R/gpuMatMult.R: -------------------------------------------------------------------------------- 1 | gpuMatMult <- function(a, b) { 2 | a <- as.matrix(a) 3 | b <- as.matrix(b) 4 | 5 | if (ncol(a) != nrow(b)) 6 | stop("error: matrix dimensions mismatched for matrix multiplication") 7 | 8 | .Call("gpuMatMult", a, b, PACKAGE='gputools') 9 | } 10 | 11 | cpuMatMult <- function(a, b) { 12 | a <- as.matrix(a) 13 | b <- as.matrix(b) 14 | 15 | if (ncol(a) != nrow(b)) 16 | stop("error: matrix dimensions mismatched for matrix multiplication") 17 | 18 | a %*% b 19 | } 20 | 21 | gpuCrossprod <- function(a, b=NULL) { 22 | a <- as.matrix(a) 23 | 24 | if (is.null(b)) b <- as.matrix(a) 25 | else b <- as.matrix(b) 26 | 27 | if (nrow(a) != nrow(b)) 28 | stop("error: matrix dim mismatch for cross-product.") 29 | 30 | results <- .Call("gpuMatMult", t(a), b, PACKAGE='gputools') 31 | return(results) 32 | } 33 | 34 | 35 | gpuTcrossprod <- function(a, b=NULL) 36 | { 37 | a <- as.matrix(a) 38 | 39 | if (is.null(b)) b <- as.matrix(a) 40 | else b <- as.matrix(b) 41 | 42 | if (ncol(a) != ncol(b)) 43 | stop("error: matrix dim mismatch for transposed cross-product") 44 | 45 | results <- .Call("gpuMatMult", a,t(b), PACKAGE='gputools') 46 | return(results) 47 | } 48 | 
-------------------------------------------------------------------------------- /R/gpuMi.R: -------------------------------------------------------------------------------- 1 | gpuMi <- function(x, y = NULL, bins = 2, splineOrder = 1) 2 | { 3 | x <- as.matrix(x) 4 | if(!is.null(y)) { 5 | y <- as.matrix(y) 6 | } 7 | 8 | bins <- as.integer(bins) 9 | splineOrder <- as.integer(splineOrder) 10 | 11 | nsamples <- as.integer(nrow(x)) 12 | na <- as.integer(ncol(x)) 13 | a <- as.single(x) 14 | 15 | b <- a 16 | nb <- na 17 | row_labels <- colnames(x) 18 | 19 | if(is.null(y)) { 20 | nb <- as.integer(ncol(x)) 21 | b <- as.single(x) 22 | row_labels <- colnames(x) 23 | } else { 24 | nb <- as.integer(ncol(y)) 25 | b <- as.single(y) 26 | row_labels <- colnames(y) 27 | } 28 | 29 | mi <- .C("rBSplineMutualInfo", bins, splineOrder, nsamples, na, a, 30 | nb, b, mi = single(nb * na), 31 | PACKAGE='gputools')$mi 32 | mi <- matrix(mi, nb, na) 33 | rownames(mi) <- row_labels 34 | colnames(mi) <- colnames(x) 35 | return(mi) 36 | } 37 | -------------------------------------------------------------------------------- /R/gpuQr.R: -------------------------------------------------------------------------------- 1 | gpuQr <- function(x, tol = 1e-07) { 2 | 3 | x <- as.matrix(x) 4 | if(is.complex(x)) { 5 | stop("complex gpuQR not yet supported") 6 | } 7 | 8 | n <- nrow(x) 9 | p <- ncol(x) 10 | 11 | mode(x) <- 'single' 12 | 13 | res <- .C("rGetQRDecompRR", 14 | as.integer(n), 15 | as.integer(p), 16 | as.double(tol), 17 | qr = x, 18 | pivot = as.integer(0L:(p-1)), 19 | qraux = double(p), 20 | rank = integer(1L), 21 | PACKAGE='gputools' 22 | )[c('qr', 'pivot', 'qraux', 'rank')] 23 | 24 | res$pivot <- res$pivot + 1 25 | 26 | if(!is.null(cn <- colnames(x))) 27 | colnames(res$qr) <- cn[res$pivot] 28 | 29 | class(res) <- "qr" 30 | res 31 | } 32 | -------------------------------------------------------------------------------- /R/gpuSolve.R: 
-------------------------------------------------------------------------------- 1 | gpuSolve <- function(x, y=NULL) { 2 | x <- as.matrix(x) 3 | if(is.complex(x)) { 4 | stop("complex gpuSolve not yet supported") 5 | } 6 | 7 | n <- nrow(x) 8 | p <- ncol(x) 9 | if(p > n) { 10 | stop("x represents an underdetermined system") 11 | } 12 | 13 | x.qr <- qr(x) 14 | x.q <- qr.Q(x.qr) 15 | x.r <- qr.R(x.qr) 16 | 17 | if(is.null(y)) { 18 | myCall <- .C("rGetInverseFromQR", 19 | as.integer(n), as.integer(p), 20 | as.single(x.q), as.single(x.r), 21 | inverse = single(n * p), 22 | PACKAGE='gputools' 23 | ) 24 | x.inverse <- matrix(myCall$inverse, p, n) 25 | return(x.inverse) 26 | } else { 27 | y <- as.single(y) 28 | if(length(y) != n) { 29 | stop("y must have length nrows(x)") 30 | } 31 | myCall <- .C("rSolveFromQR", 32 | as.integer(n), as.integer(p), 33 | as.single(x.q), as.single(x.r), 34 | y, solution = single(p), 35 | PACKAGE='gputools' 36 | ) 37 | return(myCall$solution) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /R/hooks.R: -------------------------------------------------------------------------------- 1 | .onLoad<-function(libname, pkgname){ 2 | e<-emptyenv() 3 | library.dynam('gputools', pkgname, libname) 4 | reg.finalizer(e,function(...){unloadNamespace(pkgname)}, onexit=T) 5 | } 6 | 7 | .onUnload<-function(libpath){ 8 | library.dynam.unload('gputools', libpath) 9 | } 10 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(lib, pkg) { 2 | cuFileNames <- 3 | c( "correlation.cu" 4 | , "distance.cu" 5 | , "granger.cu" 6 | , "hcluster.cu" 7 | , "kendall.cu" 8 | , "mi.cu" 9 | , "qrdecomp.cu" 10 | ) 11 | 12 | cuFileNames <- 13 | sapply(cuFileNames, 14 | function(fn) { 15 | system.file('cuda', fn, package = 'gputools') 16 | }) 17 | 18 | cuSrc <- 19 | sapply(cuFileNames, 20 | 
function(fn) { 21 | readChar(fn, file.info(fn)$size) 22 | }) 23 | 24 | cuFiles <- 25 | c( "correlation" 26 | , "distance" 27 | , "granger" 28 | , "hcluster" 29 | , "kendall" 30 | , "mi" 31 | , "qrdecomp" 32 | ) 33 | 34 | result <- 35 | .C("cuCompile", 36 | length(cuFiles), 37 | cuFiles, 38 | cuSrc) 39 | } 40 | 41 | .unLoad <- function(lib, pkg) { 42 | result <- .C("unloadPackage") 43 | } 44 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm -f src/*o 4 | rm -f src/Makefile 5 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([gputools], 1.2) dnl package name, version 2 | AC_CONFIG_AUX_DIR([tools]) 3 | 4 | dnl Select an optional include path, from a configure option 5 | dnl or from an environment variable. 6 | AC_ARG_WITH([cuda-include], 7 | AC_HELP_STRING([--with-cuda-include=INCLUDE_PATH], 8 | [the location of CUDA header files]), 9 | [cuda_include_path=$withval]) 10 | CUDA_CPPFLAGS="-I." 11 | if test [ -n "${cuda_include_path}" ] ; then 12 | CUDA_CPPFLAGS="-I. -I${cuda_include_path}" 13 | else 14 | if test [ -n "${CUDA_INCLUDE}" ] ; then 15 | CUDA_CPPFLAGS="-I. -I${CUDA_INCLUDE}" 16 | cuda_include_path="${CUDA_INCLUDE}" 17 | else 18 | if test [ -d "/usr/local/cuda/include" ] ; then 19 | CUDA_CPPFLAGS="-I. 
-I/usr/local/cuda/include" 20 | cuda_include_path="/usr/local/cuda/include" 21 | fi 22 | fi 23 | fi 24 | 25 | dnl ditto for a library path 26 | AC_ARG_WITH([cuda-lib], 27 | AC_HELP_STRING([--with-cuda-lib=LIB_PATH], 28 | [the location of CUDA libraries]), 29 | [cuda_lib_path=$withval]) 30 | if test [ -n "${cuda_lib_path}" ] ; then 31 | LIBS="-L${cuda_lib_path} ${LIBS}" 32 | else 33 | if test [ -n "${CUDA_LIBS}" ] ; then 34 | LIBS="-L${CUDA_LIBS} ${LIBS}" 35 | cuda_lib_path="${CUDA_LIBS}" 36 | else 37 | if test [ -d "/usr/local/cuda/lib64" ] ; then 38 | LIBS="-L/usr/local/cuda/lib64 ${LIBS}" 39 | cuda_lib_path="/usr/local/cuda/lib64" 40 | else 41 | if test [ -d "/usr/local/cuda/lib" ] ; then 42 | LIBS="-L/usr/local/cuda/lib ${LIBS}" 43 | cuda_lib_path="/usr/local/cuda/lib" 44 | fi 45 | fi 46 | fi 47 | fi 48 | 49 | dnl Now find the compiler and compiler flags to use 50 | : ${R_HOME=`R RHOME`} 51 | if test -z "${R_HOME}"; then 52 | echo "could not determine R_HOME" 53 | exit 1 54 | fi 55 | CXX=`"${R_HOME}/bin/R" CMD config CXX` 56 | CPP=`"${R_HOME}/bin/R" CMD config CPP` 57 | CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS` 58 | CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS` 59 | AC_PROG_CXX 60 | AC_PROG_CPP 61 | 62 | if test [ -n "${cuda_include_path}" ] ; then 63 | OLD_CPPFLAGS=${CPPFLAGS} 64 | CPPFLAGS="-I${cuda_include_path} ${CPPFLAGS}" 65 | fi 66 | 67 | AC_CHECK_HEADER([cuda.h], , [HAS_CUDA_H=no]) 68 | AC_CHECK_HEADER([nvrtc.h], , [HAS_NVRTC_H=no]) 69 | 70 | if test [ -n "${cuda_include_path}" ] ; then 71 | CPPFLAGS=${OLD_CPPFLAGS} 72 | fi 73 | 74 | if test [ "${HAS_CUDA_H}" = no ] ; then 75 | AC_MSG_ERROR("CUDA header cuda.h not found") 76 | fi 77 | 78 | if test [ "${HAS_NVRTC_H}" = no ] ; then 79 | AC_MSG_ERROR("CUDA header nvrtc.h not found") 80 | fi 81 | 82 | if test [ -n "${cuda_lib_path}" ] ; then 83 | OLD_LDFLAGS=${LDFLAGS} 84 | LDFLAGS="-L${cuda_lib_path} ${LDFLAGS}" 85 | fi 86 | 87 | dnl search for a library containing a CUDA function 88 | 
AC_SEARCH_LIBS([cuGetErrorName], [cuda], , 89 | AC_MSG_ERROR("no cuda driver library found")) 90 | 91 | dnl search for a library containing a CUDA function 92 | AC_SEARCH_LIBS([cudaMalloc], [cudart], , 93 | AC_MSG_ERROR("no cuda runtime api library found")) 94 | 95 | dnl search for a library containing a CUDA function 96 | AC_SEARCH_LIBS([nvrtcCreateProgram], [nvrtc], , 97 | AC_MSG_ERROR("no nvrtc library found")) 98 | 99 | dnl search for a library containing a CUDA function 100 | AC_SEARCH_LIBS([cublasSetMatrix], [cublas], , 101 | AC_MSG_ERROR("no cublas library found")) 102 | 103 | if test [ -n "${cuda_lib_path}" ] ; then 104 | LDFLAGS=${OLD_LDFLAGS} 105 | AC_CANONICAL_HOST 106 | case $host_os in 107 | darwin* ) 108 | LIBS="${LIBS} -Wl,-rpath,${cuda_lib_path}" 109 | ;; 110 | * ) 111 | LIBS="${LIBS} -Wl,-rpath=${cuda_lib_path}" 112 | ;; 113 | esac 114 | fi 115 | 116 | dnl substitute CUDA_CPPFLAGS and LIBS 117 | AC_SUBST(CUDA_CPPFLAGS) 118 | AC_SUBST(LIBS) 119 | dnl and do substitution in the src/Makevars.in 120 | AC_CONFIG_FILES([src/Makevars]) 121 | AC_OUTPUT 122 | -------------------------------------------------------------------------------- /inst/cuda/correlation.cu: -------------------------------------------------------------------------------- 1 | #define FALSE 0 2 | #define TRUE !FALSE 3 | 4 | #define NUMTHREADS 16 5 | #define THREADWORK 32 6 | 7 | __global__ void gpuMeans(const float * vectsA, size_t na, 8 | const float * vectsB, size_t nb, size_t dim, 9 | float * means, float * numPairs) 10 | { 11 | size_t 12 | offset, stride, 13 | bx = blockIdx.x, by = blockIdx.y, tx = threadIdx.x; 14 | float a, b; 15 | 16 | __shared__ float 17 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS], 18 | count[NUMTHREADS]; 19 | 20 | if((bx >= na) || (by >= nb)) 21 | return; 22 | 23 | threadSumsA[tx] = 0.f; 24 | threadSumsB[tx] = 0.f; 25 | count[tx] = 0.f; 26 | 27 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 28 | a = vectsA[bx * dim + offset]; 29 | b = vectsB[by 
* dim + offset]; 30 | if(!(isnan(a) || isnan(b))) { 31 | threadSumsA[tx] += a; 32 | threadSumsB[tx] += b; 33 | count[tx] += 1.f; 34 | } 35 | } 36 | __syncthreads(); 37 | 38 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 39 | if(tx < stride) { 40 | threadSumsA[tx] += threadSumsA[tx + stride]; 41 | threadSumsB[tx] += threadSumsB[tx + stride]; 42 | count[tx] += count[tx+stride]; 43 | } 44 | __syncthreads(); 45 | } 46 | if(tx == 0) { 47 | means[bx*nb*2+by*2] = threadSumsA[0] / count[0]; 48 | means[bx*nb*2+by*2+1] = threadSumsB[0] / count[0]; 49 | numPairs[bx*nb+by] = count[0]; 50 | } 51 | } 52 | 53 | __global__ void gpuSD(const float * vectsA, size_t na, 54 | const float * vectsB, size_t nb, size_t dim, 55 | const float * means, const float * numPairs, float * sds) 56 | { 57 | size_t 58 | offset, stride, 59 | tx = threadIdx.x, 60 | bx = blockIdx.x, by = blockIdx.y; 61 | float 62 | a, b, 63 | termA, termB; 64 | __shared__ float 65 | meanA, meanB, n, 66 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS]; 67 | 68 | if((bx >= na) || (by >= nb)) 69 | return; 70 | 71 | if(tx == 0) { 72 | meanA = means[bx*nb*2+by*2]; 73 | meanB = means[bx*nb*2+by*2+1]; 74 | n = numPairs[bx*nb+by]; 75 | } 76 | __syncthreads(); 77 | 78 | threadSumsA[tx] = 0.f; 79 | threadSumsB[tx] = 0.f; 80 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 81 | a = vectsA[bx * dim + offset]; 82 | b = vectsB[by * dim + offset]; 83 | if(!(isnan(a) || isnan(b))) { 84 | termA = a - meanA; 85 | termB = b - meanB; 86 | threadSumsA[tx] += termA * termA; 87 | threadSumsB[tx] += termB * termB; 88 | } 89 | } 90 | __syncthreads(); 91 | 92 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 93 | if(tx < stride) { 94 | threadSumsA[tx] += threadSumsA[tx + stride]; 95 | threadSumsB[tx] += threadSumsB[tx + stride]; 96 | } 97 | __syncthreads(); 98 | } 99 | if(tx == 0) { 100 | sds[bx*nb*2+by*2] = sqrtf(threadSumsA[0] / (n - 1.f)); 101 | sds[bx*nb*2+by*2+1] = sqrtf(threadSumsB[0] / (n - 1.f)); 102 | } 
103 | } 104 | 105 | __global__ void gpuPMCC(const float * vectsa, size_t na, 106 | const float * vectsb, size_t nb, size_t dim, 107 | const float * numPairs, const float * means, const float * sds, 108 | float * correlations) 109 | { 110 | size_t 111 | offset, stride, 112 | x = blockIdx.x, y = blockIdx.y, 113 | tx = threadIdx.x; 114 | float 115 | a, b, n, scoreA, scoreB; 116 | __shared__ float 117 | meanA, meanB, 118 | sdA, sdB, 119 | threadSums[NUMTHREADS]; 120 | 121 | if((x >= na) || (y >= nb)) 122 | return; 123 | 124 | if(tx == 0) { 125 | meanA = means[x*nb*2+y*2]; 126 | meanB = means[x*nb*2+y*2+1]; 127 | sdA = sds[x*nb*2+y*2]; 128 | sdB = sds[x*nb*2+y*2+1]; 129 | n = numPairs[x*nb+y]; 130 | } 131 | __syncthreads(); 132 | 133 | threadSums[tx] = 0.f; 134 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 135 | a = vectsa[x * dim + offset]; 136 | b = vectsb[y * dim + offset]; 137 | if(!(isnan(a) || isnan(b))) { 138 | scoreA = (a - meanA) / sdA; 139 | scoreB = (b - meanB) / sdB; 140 | threadSums[tx] += scoreA * scoreB; 141 | } 142 | } 143 | __syncthreads(); 144 | 145 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 146 | if(tx < stride) threadSums[tx] += threadSums[tx + stride]; 147 | __syncthreads(); 148 | } 149 | if(tx == 0) correlations[x*nb+y] = threadSums[0] / (n - 1.f); 150 | } 151 | 152 | __global__ void gpuMeansNoTest(const float * vectsA, size_t na, 153 | const float * vectsB, size_t nb, size_t dim, 154 | float * means, float * numPairs) 155 | { 156 | size_t 157 | offset, stride, 158 | bx = blockIdx.x, by = blockIdx.y, tx = threadIdx.x; 159 | float a, b; 160 | 161 | __shared__ float 162 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS], 163 | count[NUMTHREADS]; 164 | 165 | if((bx >= na) || (by >= nb)) 166 | return; 167 | 168 | threadSumsA[tx] = 0.f; 169 | threadSumsB[tx] = 0.f; 170 | count[tx] = 0.f; 171 | 172 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 173 | a = vectsA[bx * dim + offset]; 174 | b = vectsB[by * dim + offset]; 
175 | 176 | threadSumsA[tx] += a; 177 | threadSumsB[tx] += b; 178 | count[tx] += 1.f; 179 | } 180 | __syncthreads(); 181 | 182 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 183 | if(tx < stride) { 184 | threadSumsA[tx] += threadSumsA[tx + stride]; 185 | threadSumsB[tx] += threadSumsB[tx + stride]; 186 | count[tx] += count[tx+stride]; 187 | } 188 | __syncthreads(); 189 | } 190 | if(tx == 0) { 191 | means[bx*nb*2+by*2] = threadSumsA[0] / count[0]; 192 | means[bx*nb*2+by*2+1] = threadSumsB[0] / count[0]; 193 | numPairs[bx*nb+by] = count[0]; 194 | } 195 | } 196 | 197 | __global__ void gpuSDNoTest(const float * vectsA, size_t na, 198 | const float * vectsB, size_t nb, size_t dim, 199 | const float * means, const float * numPairs, float * sds) 200 | { 201 | size_t 202 | offset, stride, 203 | tx = threadIdx.x, 204 | bx = blockIdx.x, by = blockIdx.y; 205 | float 206 | a, b, 207 | termA, termB; 208 | __shared__ float 209 | meanA, meanB, n, 210 | threadSumsA[NUMTHREADS], threadSumsB[NUMTHREADS]; 211 | 212 | if((bx >= na) || (by >= nb)) 213 | return; 214 | 215 | if(tx == 0) { 216 | meanA = means[bx*nb*2+by*2]; 217 | meanB = means[bx*nb*2+by*2+1]; 218 | n = numPairs[bx*nb+by]; 219 | } 220 | __syncthreads(); 221 | 222 | threadSumsA[tx] = 0.f; 223 | threadSumsB[tx] = 0.f; 224 | for(offset = tx; offset < dim; offset += NUMTHREADS) { 225 | a = vectsA[bx * dim + offset]; 226 | b = vectsB[by * dim + offset]; 227 | 228 | termA = a - meanA; 229 | termB = b - meanB; 230 | threadSumsA[tx] += termA * termA; 231 | threadSumsB[tx] += termB * termB; 232 | } 233 | __syncthreads(); 234 | 235 | for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) { 236 | if(tx < stride) { 237 | threadSumsA[tx] += threadSumsA[tx + stride]; 238 | threadSumsB[tx] += threadSumsB[tx + stride]; 239 | } 240 | __syncthreads(); 241 | } 242 | if(tx == 0) { 243 | sds[bx*nb*2+by*2] = sqrtf(threadSumsA[0] / (n - 1.f)); 244 | sds[bx*nb*2+by*2+1] = sqrtf(threadSumsB[0] / (n - 1.f)); 245 | } 246 | } 247 | 248 
// Pearson product-moment correlation for each (A-vector, B-vector) pair,
// one pair per (blockIdx.x, blockIdx.y), using precomputed means, sds and
// pair counts.  "NoTest" variant: assumes no missing values.
__global__ void gpuPMCCNoTest(const float * vectsa, size_t na,
	const float * vectsb, size_t nb, size_t dim,
	const float * numPairs, const float * means, const float * sds,
	float * correlations)
{
	size_t
		offset, stride,
		x = blockIdx.x, y = blockIdx.y,
		tx = threadIdx.x;
	float
		a, b, n, scoreA, scoreB;
	__shared__ float
		meanA, meanB,
		sdA, sdB,
		threadSums[NUMTHREADS];

	// Uniform across the block, so the early exit is barrier-safe.
	if((x >= na) || (y >= nb))
		return;

	if(tx == 0) {
		meanA = means[x*nb*2+y*2];
		meanB = means[x*nb*2+y*2+1];
		sdA = sds[x*nb*2+y*2];
		sdB = sds[x*nb*2+y*2+1];
		// n is a register; only thread 0 reads it again below.
		n = numPairs[x*nb+y];
	}
	__syncthreads();

	// Sum of products of z-scores over a strided slice of the dimension.
	threadSums[tx] = 0.f;
	for(offset = tx; offset < dim; offset += NUMTHREADS) {
		a = vectsa[x * dim + offset];
		b = vectsb[y * dim + offset];

		scoreA = (a - meanA) / sdA;
		scoreB = (b - meanB) / sdB;
		threadSums[tx] += scoreA * scoreB;
	}
	__syncthreads();

	for(stride = NUMTHREADS >> 1; stride > 0; stride >>= 1) {
		if(tx < stride) threadSums[tx] += threadSums[tx + stride];
		__syncthreads();
	}
	// r = sum(zA * zB) / (n - 1)
	if(tx == 0) correlations[x*nb+y] = threadSums[0] / (n - 1.f);
}

// t statistic for each correlation: t = r * sqrt((n - 2) / (1 - r^2)).
// One-dimensional grid; each thread handles THREADWORK consecutive entries.
__global__ void gpuSignif(const float * gpuNumPairs,
	const float * gpuCorrelations, size_t n, float * gpuTScores)
{
	size_t
		i, start,
		bx = blockIdx.x, tx = threadIdx.x;
	float
		radicand, cor, npairs;

	start = bx * NUMTHREADS * THREADWORK + tx * THREADWORK;
	for(i = 0; i < THREADWORK; i++) {
		if(start+i >= n)
			break;

		npairs = gpuNumPairs[start+i];
		cor = gpuCorrelations[start+i];
		radicand = (npairs - 2.f) / (1.f - cor * cor);
		gpuTScores[start+i] = cor * sqrtf(radicand);
	}
}

// Two-tailed significance lookup at alpha = 0.001: returns TRUE when |signif|
// exceeds the critical t value for the given degrees of freedom.
__device__ int dIsSignificant(float signif, int df) {
	float tcutoffs[49] = {
		// cutoffs for degrees of freedom <= 30
		// FIX: df = 3 entry restored to 12.920 (was garbled to 2.920, which
		// broke the strictly decreasing t-table sequence 637 > 31.6 > 12.92 > ...)
		637.000, 31.600, 12.920, 8.610, 6.869, 5.959, 5.408, 5.041, 4.781,
		4.587, 4.437, 4.318, 4.221, 4.140, 4.073, 4.015, 3.965, 3.922,
		3.883, 3.850, 3.819, 3.792, 3.768, 3.745, 3.725, 3.707, 3.690,
		3.674, 3.659, 3.646,
		// cutoffs for even degrees of freedom > 30 but <= 50
		3.622, 3.601, 3.582, 3.566, 3.551, 3.538, 3.526, 3.515, 3.505, 3.496,
		// 55 <= df <= 70 by 5s
		3.476, 3.460, 3.447, 3.435,
		3.416, // 80
		3.390, // 100
		3.357, // 150
		3.340, // 200
		3.290  // > 200
	};

	size_t index = 0;
	if(df <= 0) return 0;
	else if(df <= 30) index = df - 1;
	else if(df <= 50) index = 30 + (df + (df%2) - 32) / 2; // round df up to even
	else if(df <= 70) {
		if(df <= 55) index = 40;
		else if(df <= 60) index = 41;
		else if(df <= 65) index = 42;
		else if(df <= 70) index = 43;
	}
	else if(df <= 80) index = 44;
	else if(df <= 100) index = 45;
	else if(df <= 150) index = 46;
	else if(df <= 200) index = 47;
	else index = 48; // df > 200

	if(fabsf(signif) < tcutoffs[index]) return FALSE;

	return TRUE;
}

// Filter rows of a 5-column result table by t-test significance, expanding
// each surviving row to 6 columns (…, cor, tscore, npairs).  Rejected rows
// are marked with -1 in the first output column.
__global__ void dUpdateSignif(const float * gpuData, size_t n,
	float * gpuResults)
{
	size_t
		i, start, inrow, outrow,
		bx = blockIdx.x, tx = threadIdx.x;
	float
		radicand, cor, npairs, tscore;

	start = bx * NUMTHREADS * THREADWORK + tx * THREADWORK;

	for(i = 0; i < THREADWORK; i++) {
		// FIX: was 'start+i > n', which processed index n and read one row
		// past the end of gpuData (gpuSignif above uses the correct '>=').
		if(start+i >= n) break;

		inrow = (start+i)*5;
		outrow = (start+i)*6;

		cor = gpuData[inrow+3];
		npairs = gpuData[inrow+4];

		// Near-perfect correlation: the radicand blows up, so clamp the
		// t score instead of dividing by ~0.
		if(cor >= 0.999)
			tscore = 10000.0;
		else {
			radicand = (npairs - 2.f) / (1.f - cor * cor);
			tscore = cor * sqrtf(radicand);
		}
		// NOTE(review): df passed is npairs, not npairs - 2 -- confirm intended.
		if(dIsSignificant(tscore, (int)npairs)) {
			gpuResults[outrow] = gpuData[inrow];
			gpuResults[outrow+1] = gpuData[inrow+1];
			gpuResults[outrow+2] = gpuData[inrow+2];
			gpuResults[outrow+3] = cor;
			gpuResults[outrow+4] = tscore;
			gpuResults[outrow+5] = npairs;
		} else {
			gpuResults[outrow] = -1.f;
		}
	}
}
// Column means of a dense (no-NA) matrix `a` (nRows x nCols, column-major).
// blockDim.x columns per block; blockDim.y threads cooperate per column.
__global__ void noNAsPmccMeans(int nRows, int nCols, float * a, float * means)
{
	int
		col = blockDim.x * blockIdx.x + threadIdx.x,
		inOffset = col * nRows,
		outOffset = threadIdx.x * blockDim.y,
		j = outOffset + threadIdx.y;
	float sum = 0.f;

	if(col >= nCols) return;

	__shared__ float threadSums[NUMTHREADS*NUMTHREADS];

	// Strided partial sum down the column.
	for(int i = threadIdx.y; i < nRows; i += blockDim.y)
		sum += a[inOffset + i];

	threadSums[j] = sum;
	__syncthreads();

	// Reduce along the y dimension of the block.
	for(int i = blockDim.y >> 1; i > 0; i >>= 1) {
		if(threadIdx.y < i) {
			threadSums[outOffset+threadIdx.y]
				+= threadSums[outOffset+threadIdx.y + i];
		}
		__syncthreads();
	}
	if(threadIdx.y == 0)
		means[col] = threadSums[outOffset] / (float)nRows;
}
--------------------------------------------------------------------------------
/inst/cuda/granger.cu:
--------------------------------------------------------------------------------
#define max(a, b) ((a > b)?a:b)

#define THREADSPERDIM 16

#define FALSE 0
#define TRUE !FALSE

// Fit the restricted Granger model for each y series (one thread per series):
// QR-decompose the lagged design matrix X via Gram-Schmidt, then solve
// R * B = Q^t * Y by back substitution.
// mX has order rows x cols (column-major); vY holds the response vectors.
__global__ void getRestricted(int countx, int county, int rows, int cols,
	float * mX, int mXdim, float * vY, int vYdim, float * mQ, int mQdim,
	float * mR, int mRdim, float * vectB, int vectBdim) {

	int
		m = blockIdx.x * THREADSPERDIM + threadIdx.x, n,
		i, j, k;
	float
		sum, invnorm,
		* X, * Y, * Q, * R, * B,
		* coli, * colj,
		* colQ, * colX;

	if(m >= county) return;
	// NOTE(review): pairs series m with x-index 0 when m == 1, else 1 --
	// presumably picks any x whose lags share Y's layout; confirm vs caller.
	if(m == 1) n = 0;
	else n = 1;

	X = mX + (m * mXdim);
	// first column of X is the intercept
	for(i = 0; i < rows; i++)
		X[i] = 1.f;

	Y = vY + (m * countx + n) * vYdim;
	B = vectB + m * vectBdim;
	Q = mQ + m * mQdim;
	R = mR + m * mRdim;

	// initialize Q with X
	for(i = 0; i < rows; i++) {
		for(j = 0; j < cols; j++)
			Q[i+j*rows] = X[i+j*rows];
	}

	// Gram-Schmidt orthonormalization of Q's columns
	for(j = 0; j < cols; j++) {
		colj = Q+rows*j;
		for(i = 0; i < j; i++) {
			coli = Q+rows*i;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += coli[k] * colj[k];
			for(k = 0; k < rows; k++)
				colj[k] -= sum * coli[k];
		}
		sum = 0.f;
		for(i = 0; i < rows; i++)
			sum += colj[i] * colj[i];
		invnorm = 1.f / sqrtf(sum);
		for(i = 0; i < rows; i++)
			colj[i] *= invnorm;
	}
	// Build R = Q^t * X row by row (from the bottom up) and back-substitute
	// for B in the same pass.
	for(i = cols-1; i > -1; i--) {
		colQ = Q+i*rows;
		for(j = 0; j < cols; j++) {
			colX = X+j*rows;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += colQ[k] * colX[k];
			R[i+j*cols] = sum;
		}
		sum = 0.f;
		// the vector Q^t * Y
		for(j = 0; j < rows; j++)
			sum += colQ[j] * Y[j];
		// back substitution: solve R x = Q^t Y for row i
		for(j = cols-1; j > i; j--)
			sum -= R[i+j*cols] * B[j];

		B[i] = sum / R[i+i*cols];
	}
}
// Fit the unrestricted Granger model for every (x, y) series pair -- one
// thread per pair -- by the same Gram-Schmidt QR + back substitution used in
// getRestricted.  mX has order rows x cols (column-major).
__global__ void getUnrestricted(int countx, int county, int rows, int cols,
	float * mX, int mXdim, float * vY, int vYdim, float * mQ, int mQdim,
	float * mR, int mRdim, float * vectB, int vectBdim) {

	int
		n = blockIdx.x * THREADSPERDIM + threadIdx.x,
		m = blockIdx.y * THREADSPERDIM + threadIdx.y,
		i, j, k;
	float
		sum, invnorm,
		* X, * Y, * Q, * R, * B,
		* coli, * colj,
		* colQ, * colX;
	if((m >= county) || (n >= countx)) return;

	X = mX + (m * countx + n) * mXdim;
	// first column of X is the intercept
	for(i = 0; i < rows; i++)
		X[i] = 1.f;

	Y = vY + (m*countx+n) * vYdim;
	B = vectB + (m*countx+n) * vectBdim;
	Q = mQ + (m*countx+n) * mQdim;
	R = mR + (m*countx+n) * mRdim;

	// initialize Q with X
	for(i = 0; i < rows; i++) {
		for(j = 0; j < cols; j++)
			Q[i+j*rows] = X[i+j*rows];
	}

	// Gram-Schmidt orthonormalization of Q's columns
	for(j = 0; j < cols; j++) {
		colj = Q+rows*j;
		for(i = 0; i < j; i++) {
			coli = Q+rows*i;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += coli[k] * colj[k];
			for(k = 0; k < rows; k++)
				colj[k] -= sum * coli[k];
		}
		sum = 0.f;
		for(i = 0; i < rows; i++)
			sum += colj[i] * colj[i];
		invnorm = 1.f / sqrtf(sum);
		for(i = 0; i < rows; i++)
			colj[i] *= invnorm;
	}
	// Build R = Q^t * X (bottom-up) and back-substitute for B in one pass.
	for(i = cols-1; i > -1; i--) {
		colQ = Q+i*rows;
		for(j = 0; j < cols; j++) {
			colX = X+j*rows;
			sum = 0.f;
			for(k = 0; k < rows; k++)
				sum += colQ[k] * colX[k];
			R[i+j*cols] = sum;
		}
		sum = 0.f;
		// the vector Q^t * Y
		for(j = 0; j < rows; j++)
			sum += colQ[j] * Y[j];
		// back substitution: solve R x = Q^t Y for row i
		for(j = cols-1; j > i; j--)
			sum -= R[i+j*cols] * B[j];

		B[i] = sum / R[i+i*cols];
	}
}
// Granger F statistic for each (x, y) pair from the residual sums of squares
// of the restricted and unrestricted fits:
//   F = ((rSsq - unrSsq) * (rows - 2p - 1)) / (p * unrSsq)
// Non-finite scores (e.g. unrSsq == 0) are zeroed.  When diagFlag is false
// the diagonal (i == j) is skipped and set to 0.
__global__ void ftest(int diagFlag, int p, int rows, int colsx, int colsy,
	int rCols, int unrCols, float * obs, int obsDim,
	float * rCoeffs, int rCoeffsDim, float * unrCoeffs, int unrCoeffsDim,
	float * rdata, int rdataDim, float * unrdata, int unrdataDim,
	float * dfStats) // float * dpValues)
{
	int
		j = blockIdx.x * THREADSPERDIM + threadIdx.x,
		i = blockIdx.y * THREADSPERDIM + threadIdx.y,
		idx = i*colsx + j, k, m;
	float
		kobs, fp = (float) p, frows = (float) rows,
		rSsq, unrSsq,
		rEst, unrEst,
		score = 0.f,
		* tObs, * tRCoeffs, * tUnrCoeffs,
		* tRdata, * tUnrdata;

	if((i >= colsy) || (j >= colsx)) return;
	if((!diagFlag) && (i == j)) {
		dfStats[idx] = 0.f;
		// dpValues[idx] = 0.f;
		return;
	}

	tObs = obs + (i*colsx+j)*obsDim;

	// restricted model: one fit per y series
	tRCoeffs = rCoeffs + i*rCoeffsDim;
	tRdata = rdata + i*rdataDim;

	// unrestricted model: one fit per (x, y) pair
	tUnrCoeffs = unrCoeffs + (i*colsx+j)*unrCoeffsDim;
	tUnrdata = unrdata + (i*colsx+j)*unrdataDim;

	// Residual sums of squares of both fitted models.
	rSsq = unrSsq = 0.f;
	for(k = 0; k < rows; k++) {
		unrEst = rEst = 0.f;
		kobs = tObs[k];
		for(m = 0; m < rCols; m++)
			rEst += tRCoeffs[m] * tRdata[k+m*rows];
		for(m = 0; m < unrCols; m++)
			unrEst += tUnrCoeffs[m] * tUnrdata[k+m*rows];
		rSsq += (kobs - rEst) * (kobs - rEst);
		unrSsq += (kobs - unrEst) * (kobs - unrEst);
	}
	score = ((rSsq - unrSsq)*(frows-2.f*fp-1.f)) / (fp*unrSsq);

	if(!isfinite(score))
		score = 0.f;

	dfStats[idx] = score;
}
--------------------------------------------------------------------------------
/inst/cuda/hcluster.cu:
--------------------------------------------------------------------------------
#define NUM_THREADS 32
#define NUM_BLOCKS 1024

// Mark the diagonal of the distance matrix as +inf so a point is never
// merged with itself.
__global__ void convert_kernel(float * dist, size_t pitch_dist, size_t n)
{
	for(size_t index = threadIdx.x; index < n; index += NUM_THREADS) {
		dist[index * pitch_dist + index] = CUDART_INF_F;
	}
}

// Stage 1 of the merge search: for each still-active row (one row per block),
// find the minimum distance in the row's lower triangle and its column.
__global__ void find_min1_kernel(const float * dist, const size_t pitch_dist,
	const size_t n, const float * count, float * min_val, size_t * min_col,
	const size_t row_offset)
{
	// Determine which row this block will handle
	const size_t row = row_offset + blockIdx.x;

	// A merged row (count < 0) contributes the identity so stage 2 ignores it.
	if((threadIdx.x == 0) && (row < n) && (count[row] < 0.f)) {
		min_val[row] = CUDART_INF_F;
		min_col[row] = 0;
	}

	// Uniform per block (row and count[row] are the same for every thread),
	// so this early exit does not split a barrier.
	if((row >= n) || (count[row] <= 0.f))
		return;

	__shared__ float vals[NUM_THREADS];
	__shared__ size_t cols[NUM_THREADS];

	// Initialize with identity.
	vals[threadIdx.x] = CUDART_INF_F;
	// FIX: also initialize cols -- if every candidate a thread sees is +inf
	// (e.g. row 0, whose only entry is the diagonal), the reduction would
	// otherwise propagate an uninitialized column index.
	cols[threadIdx.x] = 0;

	// Strided scan of the lower triangle of this row.
	for(size_t col = threadIdx.x; col <= row; col += NUM_THREADS) {
		float t = dist[row * pitch_dist + col];
		if(t < vals[threadIdx.x]) {
			vals[threadIdx.x] = t;
			cols[threadIdx.x] = col;
		}
	}
	__syncthreads();

	// Tree reduction to the row minimum.
	for(size_t stride = NUM_THREADS >> 1; stride > 0; stride >>= 1) {
		if((threadIdx.x < stride)
			&& (vals[threadIdx.x] > vals[threadIdx.x + stride]))
		{
			vals[threadIdx.x] = vals[threadIdx.x + stride];
			cols[threadIdx.x] = cols[threadIdx.x + stride];
		}
		__syncthreads();
	}

	// Write the result
	if(threadIdx.x == 0) {
		min_val[row] = vals[0];
		min_col[row] = cols[0];
	}
}
// Stage 2 of the merge search (single block): reduce the per-row minima from
// find_min1_kernel to the global minimum, record the winning pair for this
// iteration in (sub, sup, val), and update the cluster counts.  A cluster's
// count is negated once it has been absorbed.
__global__ void find_min2_kernel(const float * min_val, const size_t * min_col,
	float * count, int * sub, int * sup, float * val, const size_t n,
	const size_t iter)
{
	__shared__ float vals[NUM_THREADS];
	__shared__ size_t cols[NUM_THREADS];

	// Initialize with identity (index 0 is safe: merged rows report +inf).
	vals[threadIdx.x] = CUDART_INF_F;
	cols[threadIdx.x] = 0;

	// Strided scan over the per-row minima.
	for(size_t row = threadIdx.x; row < n; row += NUM_THREADS) {
		float t = min_val[row];
		if(t < vals[threadIdx.x]) {
			vals[threadIdx.x] = t;
			cols[threadIdx.x] = row;
		}
	}
	__syncthreads();

	// Tree reduction to the global minimum.
	for(size_t stride = NUM_THREADS >> 1; stride > 0; stride >>= 1) {
		if(threadIdx.x < stride) {
			if(vals[threadIdx.x] > vals[threadIdx.x + stride]) {
				vals[threadIdx.x] = vals[threadIdx.x + stride];
				cols[threadIdx.x] = cols[threadIdx.x + stride];
			}
		}
		__syncthreads();
	}

	if(threadIdx.x == 0) {
		// Winning value is vals[0]; winning row is cols[0]; winning column
		// is min_col[cols[0]].
		int row_winner = cols[0];
		int col_winner = min_col[cols[0]];
		val[iter] = vals[0];
		sub[iter] = col_winner;
		sup[iter] = row_winner;

		// Surviving cluster absorbs the other; mark the loser by negation.
		count[row_winner] += count[col_winner];
		count[col_winner] *= -1.f;
	}
}
// Single-linkage update: new distance is the minimum of the two merged
// clusters' distances.  The absorbed cluster's row/column are set to +inf.
__global__ void single_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = min(bot_val, top_val);
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf, survivor the min.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// Complete-linkage update: new distance is the maximum of the two merged
// clusters' distances.
__global__ void complete_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int
			col_winner = sub[iter], row_winner = sup[iter];
		float
			top_val = dist[col_winner * pitch_dist + col],
			bot_val = dist[row_winner * pitch_dist + col];

		bot_val = fmaxf(bot_val, top_val);
		if((col == col_winner) || (col == row_winner))
			bot_val = CUDART_INF_F;

		top_val = CUDART_INF_F;

		// Write both triangles: absorbed cluster gets +inf, survivor the max.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// WPGMA update: new distance is the unweighted average of the two merged
// clusters' distances.
__global__ void wpgma_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val + top_val) / 2.0;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// UPGMA (average-linkage) update: distances weighted by cluster sizes.
// After the stage-2 merge, count[row_winner] = nr (combined size) and
// count[col_winner] = -np (negated absorbed size), hence nq = nr - np.
__global__ void average_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		float nr = count[row_winner];
		float np = -1.0 * count[col_winner];
		float nq = nr - np;
		bot_val = (top_val * np + bot_val * nq) / nr;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Median (WPGMC) update: average of the merged distances minus a quarter of
// the merge distance val[iter].
__global__ void median_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val + top_val) / 2.0 - val[iter] / 4.0;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// McQuitty update: simple average of the two merged clusters' distances.
__global__ void mcquitty_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val + top_val) / 2.0;
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Centroid (UPGMC) update:
//   d = (np*top + nq*bot)/nr - (np*nq*val)/(nr*nr)
// where nr is the merged size, np the absorbed cluster's size (stored
// negated in count) and nq = nr - np.
__global__ void centroid_kernel(float * dist, size_t pitch_dist,
	size_t n, const int * sub, const int * sup, const float * count,
	const float * val, size_t iter, size_t col_offset,
	float lambda, float beta)
{
	size_t
		col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) { // don't run off the end of the arrays
		int
			col_winner = sub[iter], row_winner = sup[iter];
		float
			top_val = dist[col_winner * pitch_dist + col],
			bot_val = dist[row_winner * pitch_dist + col],
			nr = count[row_winner], np = -count[col_winner],
			nq = nr - np;

		bot_val = (top_val * np + bot_val * nq)/nr
			- (np * nq * val[iter])/(nr * nr);

		if(col == col_winner || col == row_winner)
			bot_val = CUDART_INF_F;

		top_val = CUDART_INF_F;

		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}

// Flexible-group update: size-weighted average scaled by (1 - lambda) plus
// beta times the merge distance.
__global__ void flexible_group_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		float nr = count[row_winner];
		float np = -1.0 * count[col_winner];
		float nq = nr - np;
		bot_val = (bot_val * (1.0 - lambda) * np + top_val * (1.0 - lambda) * nq) / nr + beta * val[iter];
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Flexible update (unweighted): plain average scaled by (1 - lambda) plus
// beta times the merge distance.
__global__ void flexible_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col < n) {
		int col_winner = sub[iter];
		int row_winner = sup[iter];
		float top_val = dist[col_winner * pitch_dist + col];
		float bot_val = dist[row_winner * pitch_dist + col];
		bot_val = (bot_val * (1.0 - lambda) + top_val * (1.0 - lambda) ) / 2.0 + beta * val[iter];
		if(col == col_winner || col == row_winner) {
			bot_val = CUDART_INF_F;
		}
		top_val = CUDART_INF_F;
		// Write both triangles: absorbed cluster gets +inf.
		dist[col_winner * pitch_dist + col] = top_val;
		dist[col * pitch_dist + col_winner] = top_val;
		dist[row_winner * pitch_dist + col] = bot_val;
		dist[col * pitch_dist + row_winner] = bot_val;
	}
}
// Ward update:
//   d = ((np+nk)*bot + (nq+nk)*top - nk*val) / (nr + nk)
// where nk is the size of the third cluster at `col`.  A fully-merged column
// (nr == -nk) or the winners themselves get +inf.
__global__ void ward_kernel(float * dist, const size_t pitch_dist,
	const size_t n, const int * sub, const int * sup, const float * count,
	const float * val, const size_t iter, const size_t col_offset,
	const float lambda, const float beta)
{
	const size_t
		col = col_offset + NUM_THREADS * blockIdx.x + threadIdx.x;

	if(col >= n)
		return;

	int
		col_winner = sub[iter], row_winner = sup[iter];

	float
		top_val = dist[col_winner * pitch_dist + col],
		bot_val = dist[row_winner * pitch_dist + col],
		nr = count[row_winner], np = -count[col_winner],
		nq = nr - np, nk = count[col];

	if((nr == -nk) || (col == col_winner) || (col == row_winner)) {
		bot_val = CUDART_INF_F;
	} else {
		bot_val = (bot_val * (np + nk) + top_val * (nq + nk) - val[iter] * nk);
		bot_val /= (nr + nk);
		if(isinf(bot_val)) {
			bot_val = CUDART_INF_F;
		}
	}
	top_val = CUDART_INF_F;

	// Write both triangles: absorbed cluster gets +inf.
	dist[col_winner * pitch_dist + col] = top_val;
	dist[col * pitch_dist + col_winner] = top_val;
	dist[row_winner * pitch_dist + col] = bot_val;
	dist[col * pitch_dist + row_winner] = bot_val;
}
--------------------------------------------------------------------------------
/inst/cuda/kendall.cu:
--------------------------------------------------------------------------------
#define NUMTHREADS 16
#define THREADWORK 32

// Kendall's tau for each (A-row, B-row) pair; one pair per 2-D block.
// Each (tx, ty) thread counts concordant pairs (ties in both count as
// concordant here) over a strided subset of the index pairs i < j; the
// discordant count is derived from the total number of pairs.
__global__ void gpuKendall(const float * a, size_t na,
	const float * b, size_t nb, size_t sampleSize, double * results)
{
	size_t
		i, j, tests,
		tx = threadIdx.x, ty = threadIdx.y,
		bx = blockIdx.x, by = blockIdx.y,
		rowa = bx * sampleSize, rowb = by * sampleSize;
	float
		discordant, concordant = 0.f,
		numer, denom;

	__shared__ float threadSums[NUMTHREADS*NUMTHREADS];

	for(i = tx; i < sampleSize; i += NUMTHREADS) {
		for(j = i+1+ty; j < sampleSize; j += NUMTHREADS) {
			tests = ((a[rowa+j] > a[rowa+i]) && (b[rowb+j] > b[rowb+i]))
				+ ((a[rowa+j] < a[rowa+i]) && (b[rowb+j] < b[rowb+i]))
				+ ((a[rowa+j] == a[rowa+i]) && (b[rowb+j] == b[rowb+i]));
			concordant = concordant + (float)tests;
		}
	}
	threadSums[tx*NUMTHREADS+ty] = concordant;

	__syncthreads();
	// Reduce along y, then along x.
	for(i = NUMTHREADS >> 1; i > 0; i >>= 1) {
		if(ty < i)
			threadSums[tx*NUMTHREADS+ty] += threadSums[tx*NUMTHREADS+ty+i];
		__syncthreads();
	}
	for(i = NUMTHREADS >> 1; i > 0; i >>= 1) {
		if((tx < i) && (ty == 0))
			threadSums[tx*NUMTHREADS] += threadSums[(tx+i)*NUMTHREADS];
		__syncthreads();
	}

	if((tx == 0) && (ty == 0)) {
		concordant = threadSums[0];
		// total pairs = n(n-1)/2; tau = (concordant - discordant) / total
		denom = (float)sampleSize;
		denom = (denom * (denom - 1.f)) / 2.f;
		discordant = denom - concordant;
		numer = concordant - discordant;
		results[by*na+bx] = ((double)numer)/((double)denom);
	}
}
--------------------------------------------------------------------------------
/inst/cuda/mi.cu:
--------------------------------------------------------------------------------
#define NTHREADS 16

// Rescale each column of x (one column per thread) linearly onto
// [0, knot_max] using the column's own min and max.
__global__ void scale(float knot_max, int nx, int nsamples,
	float * x, int pitch_x)
{
	int
		col_idx = blockDim.x * blockIdx.x + threadIdx.x;

	if(col_idx >= nx) return;

	float
		min, max,
		* col = x + col_idx * pitch_x;

	// find the min and the max of this column
	min = max = col[0];
	for(int i = 1; i < nsamples; i++) {
		if(col[i] < min) min = col[i];
		if(col[i] > max) max = col[i];
	}

	float delta = max - min;
	for(int i = 0; i < nsamples; i++)
		col[i] = (knot_max * (col[i] - min)) / delta;
}

// Guarded division for the B-spline recurrences: 0/0 -> 0, x/x -> 1 (x != 0),
// x/0 -> 0 by convention.
__device__ float do_fraction(float numer, float denom) {
	float result = 0.f;

	if((numer == denom) && (numer != 0.f))
		result = 1.f;
	else if(denom != 0.f)
		result = numer / denom;

	return result;
}
// B-spline bin scores per sample via the Cox-de-Boor recurrence; one data
// column per thread.  bins must be zero-initialized before the call.
__global__ void get_bin_scores(int nbins, int order,
	int nknots, float * knots, int nsamples,
	int nx, float * x, int pitch_x,
	float * bins, int pitch_bins)
{
	int
		col_x = blockDim.x * blockIdx.x + threadIdx.x;

	if(col_x >= nx)
		return;

	float
		ld, rd, z,
		term1, term2,
		* in_col = x + col_x * pitch_x,
		* bin_col = bins + col_x * pitch_bins;
	int i0;

	for(int k = 0; k < nsamples; k++, bin_col += nbins) {
		z = in_col[k];
		i0 = (int)floorf(z) + order - 1;
		if(i0 >= nbins)
			i0 = nbins - 1;

		// order-1 spline: indicator of the containing interval
		bin_col[i0] = 1.f;
		for(int i = 2; i <= order; i++) {
			for(int j = i0 - i + 1; j <= i0; j++) {
				// FIX: removed a dead assignment to rd here -- its value was
				// never read before being recomputed in the term2 branch.
				// NOTE(review): the '(j > nknots)' guard looks like it was
				// meant to be '(j + i - 1 >= nknots)' -- confirm upstream.
				if((j < 0) || (j >= nbins) || (j >= nknots)
						|| (j + i - 1 < 0) || (j > nknots))
					term1 = 0.f;
				else {
					ld = do_fraction(z - knots[j],
						knots[j + i - 1] - knots[j]);
					term1 = ld * bin_col[j];
				}

				if((j + 1 < 0) || (j + 1 >= nbins) || (j + 1 >= nknots)
						|| (j + i < 0) || (j + i >= nknots))
					term2 = 0.f;
				else {
					rd = do_fraction(knots[j + i] - z,
						knots[j + i] - knots[j + 1]);
					term2 = rd * bin_col[j + 1];
				}
				// FIX: guard the store -- j can be negative (j = i0 - i + 1)
				// or >= nbins, and the original wrote bin_col[j]
				// unconditionally, an out-of-bounds write.
				if((j >= 0) && (j < nbins))
					bin_col[j] = term1 + term2;
			}
		}
	}
}

// Marginal (per-column) entropy in bits from the bin scores.
__global__ void get_entropy(int nbins, int nsamples, int nx,
	float * bin_scores, int pitch_bin_scores, float * entropies)
{
	int
		col_x = blockDim.x * blockIdx.x + threadIdx.x;

	if(col_x >= nx)
		return;

	float
		* in_col = bin_scores + col_x * pitch_bin_scores,
		entropy = 0.f, prob, logp;

	for(int i = 0; i < nbins; i++) {
		// estimated probability mass of bin i
		prob = 0.f;
		for(int j = 0; j < nsamples; j++)
			prob += in_col[j * nbins + i];
		prob /= (double) nsamples;

		// convention: 0 * log(0) = 0
		if(prob <= 0.f)
			logp = 0.f;
		else
			logp = __log2f(prob);

		entropy += prob * logp;
	}
	entropies[col_x] = -entropy;
}
// Mutual information for each (x-column, y-column) pair, one pair per thread:
// MI(X, Y) = H(X) + H(Y) - H(X, Y), with the joint entropy estimated from
// the products of the two columns' bin scores.
__global__ void get_mi(int nbins, int nsamples,
	int nx, float * x_bin_scores, int pitch_x_bin_scores,
	float * entropies_x,
	int ny, float * y_bin_scores, int pitch_y_bin_scores,
	float * entropies_y,
	float * mis, int pitch_mis)
{
	int
		col_x = blockDim.x * blockIdx.x + threadIdx.x,
		col_y = blockDim.y * blockIdx.y + threadIdx.y;

	if((col_x >= nx) || (col_y >= ny))
		return;

	float
		prob, logp, mi = 0.f,
		* x_bins = x_bin_scores + col_x * pitch_x_bin_scores,
		* y_bins = y_bin_scores + col_y * pitch_y_bin_scores;

	// mi accumulates -H(X, Y) (sum of p * log2 p over the joint bins)
	for(int i = 0; i < nbins; i++) {
		for(int j = 0; j < nbins; j++) {
			prob = 0.f;
			for(int k = 0; k < nsamples; k++)
				prob += x_bins[k * nbins + i] * y_bins[k * nbins + j];
			prob /= (float)nsamples;

			// convention: 0 * log(0) = 0
			if(prob <= 0.f)
				logp = 0.f;
			else
				logp = __log2f(prob);

			mi += prob * logp;
		}
	}

	// MI = H(X) + H(Y) - H(X, Y)
	mi += entropies_x[col_x] + entropies_y[col_y];
	(mis + col_y * pitch_mis)[col_x] = mi;
}
--------------------------------------------------------------------------------
/inst/cuda/qrdecomp.cu:
--------------------------------------------------------------------------------
#define NTHREADS 512

// Squared Euclidean norm of each column of da (leading dimension lda);
// one column per thread.
__global__ void getColNorms(int rows, int cols, float * da, int lda,
	float * colNorms)
{
	int colIndex = threadIdx.x + blockIdx.x * blockDim.x;
	float
		sum = 0.f, term,
		* col;

	if(colIndex >= cols)
		return;

	col = da + colIndex * lda;

	for(int i = 0; i < rows; i++) {
		term = col[i];
		term *= term;
		sum += term;
	}

	colNorms[colIndex] = sum;
}
// Single-block argmax over data[0..n-1]; each thread scans threadWorkLoad
// consecutive elements, then the block reduces in shared memory.  A slot
// holding -1 means "no candidate".
__global__ void gpuFindMax(int n, float * data, int threadWorkLoad,
	int * maxIndex)
{
	int
		j, k,
		start = threadWorkLoad * threadIdx.x,
		end = start + threadWorkLoad;
	__shared__ int maxIndicies[NTHREADS];

	maxIndicies[threadIdx.x] = -1;

	// FIX: threads with no work previously returned here, before the
	// __syncthreads() below -- a divergent barrier is undefined behavior.
	// They now fall through with their slot left at -1.
	if(start < n) {
		int localMaxIndex = start;
		for(int i = start+1; i < end; i++) {
			if(i >= n)
				break;
			if(data[i] > data[localMaxIndex])
				localMaxIndex = i;
		}
		maxIndicies[threadIdx.x] = localMaxIndex;
	}
	__syncthreads();

	for(int i = blockDim.x >> 1; i > 0; i >>= 1) {
		if(threadIdx.x < i) {
			j = maxIndicies[threadIdx.x];
			k = maxIndicies[i + threadIdx.x];
			// FIX: if this slot has no candidate (j == -1) adopt the
			// partner's; the old '(j != -1) && (k != -1)' guard silently
			// dropped valid candidates paired with empty slots.
			if((k != -1) && ((j == -1) || (data[j] < data[k])))
				maxIndicies[threadIdx.x] = k;
		}
		__syncthreads();
	}
	if(threadIdx.x == 0)
		*maxIndex = maxIndicies[0];
}

// Swap column coli with column coli + *dColj of a rows x ? column-major
// array, and swap the matching pivot entries (done once, by thread (0,0)).
__global__ void gpuSwapCol(int rows, float * dArray, int coli, int * dColj,
	int * dPivot)
{
	int rowIndex = blockIdx.x * blockDim.x + threadIdx.x;

	if(rowIndex >= rows)
		return;

	int colj = coli + (*dColj);
	float fholder;

	fholder = dArray[rowIndex+coli*rows];
	dArray[rowIndex+coli*rows] = dArray[rowIndex+colj*rows];
	dArray[rowIndex+colj*rows] = fholder;

	if((blockIdx.x == 0) && (threadIdx.x == 0)) {
		int iholder = dPivot[coli];
		dPivot[coli] = dPivot[colj];
		dPivot[colj] = iholder;
	}
}

// Build the scaled Householder vector for `input` (length `rows`) in
// `output`: v = x +/- ||x|| * e1, scaled by beta = sqrt(2 / ||v||^2).
// Single-block kernel.
__global__ void makeHVector(int rows, float * input, float * output)
{
	int
		i, j;
	float
		elt, sum;
	__shared__ float
		beta, sums[NTHREADS];

	// FIX: the original returned early for threadIdx.x >= rows, so those
	// threads never reached the __syncthreads() barriers below (undefined
	// behavior).  Idle threads now simply contribute a zero partial sum;
	// their strided loops execute no iterations.
	sum = 0.f;
	for(i = threadIdx.x ; i < rows; i += NTHREADS) {
		// element 0 is handled separately by thread 0 below
		if((threadIdx.x == 0) && (i == 0))
			continue;
		elt = input[i];
		output[i] = elt;
		sum += elt * elt;
	}
	sums[threadIdx.x] = sum;
	__syncthreads();

	for(i = blockDim.x >> 1; i > 0 ; i >>= 1) {
		j = i+threadIdx.x;
		if((threadIdx.x < i) && (j < rows))
			sums[threadIdx.x] += sums[j];
		__syncthreads();
	}

	if(threadIdx.x == 0) {
		elt = input[0];
		float norm = sqrtf(elt * elt + sums[0]);

		// add the norm with the sign of input[0] to avoid cancellation
		if(elt > 0)
			elt += norm;
		else
			elt -= norm;

		output[0] = elt;

		norm = elt * elt + sums[0];
		beta = sqrtf(2.f / norm);
	}
	__syncthreads();

	for(i = threadIdx.x; i < rows; i += NTHREADS)
		output[i] *= beta;
}
float 105 | elt, sum; 106 | __shared__ float 107 | beta, sums[NTHREADS]; 108 | 109 | if(threadIdx.x >= rows) 110 | return; 111 | 112 | sum = 0.f; 113 | for(i = threadIdx.x ; i < rows; i += NTHREADS) { 114 | if((threadIdx.x == 0) && (i == 0)) 115 | continue; 116 | elt = input[i]; 117 | output[i] = elt; 118 | sum += elt * elt; 119 | } 120 | sums[threadIdx.x] = sum; 121 | __syncthreads(); 122 | 123 | for(i = blockDim.x >> 1; i > 0 ; i >>= 1) { 124 | j = i+threadIdx.x; 125 | if((threadIdx.x < i) && (j < rows)) 126 | sums[threadIdx.x] += sums[j]; 127 | __syncthreads(); 128 | } 129 | 130 | if(threadIdx.x == 0) { 131 | elt = input[0]; 132 | float norm = sqrtf(elt * elt + sums[0]); 133 | 134 | if(elt > 0) 135 | elt += norm; 136 | else 137 | elt -= norm; 138 | 139 | output[0] = elt; 140 | 141 | norm = elt * elt + sums[0]; 142 | beta = sqrtf(2.f / norm); 143 | } 144 | __syncthreads(); 145 | 146 | for(i = threadIdx.x; i < rows; i += NTHREADS) 147 | output[i] *= beta; 148 | } 149 | 150 | // Updates the column norms by subtracting the Hadamard-square of the 151 | // Householder vector. 152 | // 153 | // N.B.: Overflow incurred in computing the square should already have 154 | // been detected in the original norm construction. 155 | 156 | __global__ void UpdateHHNorms(int cols, float *dV, float *dNorms) { 157 | // Copyright 2009, Mark Seligman at Rapid Biologics, LLC. All rights 158 | // reserved. 159 | 160 | int colIndex = threadIdx.x + blockIdx.x * blockDim.x; 161 | if (colIndex < cols) { 162 | float val = dV[colIndex]; 163 | dNorms[colIndex] -= val * val; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /man/chooseGpu.Rd: -------------------------------------------------------------------------------- 1 | \name{chooseGpu} 2 | \alias{chooseGpu} 3 | \title{Choose which GPU device to use} 4 | 5 | \description{ 6 | Selects the GPU device to use for computation. 
This is only useful on a 7 | machine equipped with multiple GPU devices. The numbering starts at 0 8 | and is assigned by the CUDA capable driver. 9 | 10 | Choosing a device can only be done before any other GPU operation and 11 | only once per thread. 12 | } 13 | 14 | \usage{ 15 | chooseGpu(deviceId = 0) 16 | } 17 | 18 | \arguments{ 19 | \item{deviceId}{an integer >= 0 designating the GPU to use for computation.} 20 | } 21 | 22 | \value{ 23 | chooseGpu should print out an integer specifying the device id chosen or 24 | an error message. 25 | } 26 | -------------------------------------------------------------------------------- /man/cpuMatMult.Rd: -------------------------------------------------------------------------------- 1 | \name{cpuMatMult} 2 | \alias{cpuMatMult} 3 | \title{Perform Matrix Multiplication} 4 | 5 | \description{ 6 | Performs matrix multiplication using R's BLAS. This function is merely a 7 | wrapper for the BLAS dgemm function. 8 | } 9 | 10 | \usage{ 11 | cpuMatMult(a, b) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a numeric matrix.} 16 | \item{b}{a numeric matrix.} 17 | } 18 | 19 | \value{ 20 | A numeric matrix. 21 | The matrix is just the product of arguments 'a' and 'b'. 22 | } 23 | 24 | \examples{ 25 | matA <- matrix(runif(2*3), 2, 3) 26 | matB <- matrix(runif(3*4), 3, 4) 27 | cpuMatMult(matA, matB) 28 | } 29 | 30 | \keyword{array} 31 | \keyword{algebra} 32 | -------------------------------------------------------------------------------- /man/getGpuId.Rd: -------------------------------------------------------------------------------- 1 | \name{getGpuId} 2 | \alias{getGpuId} 3 | \title{Discover the Id of the current GPU device} 4 | 5 | \description{ 6 | Queries the CUDA driver for the GPU device currently assigned to this 7 | thread. This is the id of the device that will be used for computation. 8 | If you wish to use a different device, use the chooseGpu function. 
9 | } 10 | 11 | \usage{ 12 | getGpuId() 13 | } 14 | 15 | \value{ 16 | The function returns a single integer indicating the id of the 17 | GPU device currently selected to carry out computation according to the 18 | CUDA driver. 19 | } 20 | 21 | \examples{ 22 | getGpuId() 23 | } 24 | -------------------------------------------------------------------------------- /man/gpuCor.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuCor} 2 | \alias{gpuCor} 3 | \title{Calculate Various Correlation Coefficients With a GPU} 4 | 5 | \description{ 6 | The correlation coefficient will be calculated 7 | for each pair $x_i$, $y_j$ where $x_i$ is a column of $x$ and $y_j$ is a 8 | column of $y$. Currently, Pearson's and Kendall's correlation 9 | coefficient are implemented. Pearson's may be calculated for 10 | data sets containing NAs in which case, the implementation behaves 11 | as R-native cor function with use="pairwise.complete". 12 | } 13 | 14 | \usage{ 15 | gpuCor(x, y = NULL, use = "everything", method = "pearson") 16 | } 17 | 18 | \arguments{ 19 | \item{x}{a matrix of floating point values in which each column is a 20 | random variable.} 21 | \item{y}{a matrix of floating point values in which each column is a 22 | random variable.} 23 | \item{use}{a string. A character string giving a method for computing 24 | in the presence of missing values. Options are 25 | "everything" or "pairwise.complete.obs". This currently only 26 | affects the "pearson" method.} 27 | \item{method}{a string. Either "pearson" or "kendall".} 28 | } 29 | 30 | \value{ 31 | For method "pearson", a list with matrices 'pairs', 32 | 'coefficients', and 'ts'. The matrix entry $i$, $j$ for pairs represents 33 | the number of pairs of entries $x_i^k$, $y_j^k$ (the $k$-th entry from 34 | $x_i$ and $y_j$ respectively). These are the 35 | number of entries actually used to calculate the coefficients. 
36 | Entry $i$, $j$ of the coefficients matrix is the correlation coefficient 37 | for $x_i$, $y_j$. Entry $i$, $j$ of the ts matrix is the t-score of the 38 | $i$, $j$ entry of the coefficient matrix. If use="pairwise.complete.obs" 39 | then only the pairs where both entries are not NA are used in the 40 | computations. 41 | 42 | For method "kendall", a list of matrices 'pairs' as above and 43 | 'coefficients' as follows. The matrix 'coefficients' is a matrix of 44 | floating point numbers where entry $i$, $j$ is the correlation coefficient 45 | for $x_i$, $y_j$. Calculation of t-scores for the kendall coefficients is 46 | not yet implemented. 47 | } 48 | 49 | \seealso{ 50 | cor 51 | } 52 | 53 | \examples{ 54 | numAvars <- 5 55 | numBvars <- 10 56 | numSamples <- 30 57 | A <- matrix(runif(numAvars*numSamples), numSamples, numAvars) 58 | B <- matrix(runif(numBvars*numSamples), numSamples, numBvars) 59 | gpuCor(A, B, method="pearson") 60 | gpuCor(A, B, method="kendall") 61 | A[3,2] <- NA 62 | gpuCor(A, B, use="pairwise.complete.obs", method="pearson") 63 | } 64 | -------------------------------------------------------------------------------- /man/gpuCrossprod.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuCrossprod} 2 | \alias{gpuCrossprod} 3 | \title{Perform Matrix Cross-product with a GPU} 4 | 5 | \description{ 6 | Performs matrix cross-product using a GPU. This function is merely a 7 | couple of wrappers for the CUBLAS cublasSgemm function. 8 | } 9 | 10 | \usage{ 11 | gpuCrossprod(a, b=NULL) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a matrix of floating point values.} 16 | \item{b}{a matrix of floating point values. A null value 17 | defaults to 'a'.} 18 | } 19 | 20 | \value{ 21 | A matrix of single precision floating point values. 22 | The matrix is the cross-product of arguments 'a' and 'b', i.e., 23 | t(a) * b. 
24 | } 25 | 26 | \examples{ 27 | matA <- matrix(runif(3*2), 3, 2) 28 | matB <- matrix(runif(3*4), 3, 4) 29 | gpuCrossprod(matA, matB) 30 | } 31 | 32 | \keyword{array} 33 | \keyword{algebra} 34 | 35 | -------------------------------------------------------------------------------- /man/gpuDist.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuDist} 2 | \alias{gpuDist} 3 | \title{Compute Distances Between Vectors on a GPU} 4 | 5 | \description{ 6 | This function computes the distance between each vector of the 7 | 'points' argument using the metric specified by 'method'. 8 | } 9 | 10 | \usage{ 11 | gpuDist(points, method = "euclidean", p = 2.0) 12 | } 13 | 14 | \arguments{ 15 | \item{points}{a matrix of floating point numbers in which each row is a 16 | vector in $R^n$ space where $n$ is ncol(points).} 17 | \item{method}{a string representing the name of the metric to use to 18 | calculate the distance between the vectors of 'points'. Currently 19 | supported values are: "binary", "canberra", "euclidean", "manhattan", 20 | "maximum", and "minkowski".} 21 | \item{p}{a floating point parameter for the Minkowski metric.} 22 | } 23 | 24 | \value{ 25 | a class of type "dist" containing floating point numbers representing the 26 | distances between vectors from the 'points' argument. 
27 | } 28 | 29 | \seealso{ 30 | dist 31 | } 32 | 33 | \examples{ 34 | numVectors <- 5 35 | dimension <- 10 36 | Vectors <- matrix(runif(numVectors*dimension), numVectors, dimension) 37 | gpuDist(Vectors, "euclidean") 38 | gpuDist(Vectors, "maximum") 39 | gpuDist(Vectors, "manhattan") 40 | gpuDist(Vectors, "minkowski", 4) 41 | } 42 | 43 | \keyword{math} 44 | -------------------------------------------------------------------------------- /man/gpuDistClust.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuDistClust} 2 | \alias{gpuDistClust} 3 | \title{Compute Distances and Hierarchical Clustering for Vectors on a GPU} 4 | 5 | \description{This function takes a set of vectors and performs clustering 6 | on them. The function will first calculate the distance between all of the 7 | pairs of vectors and then use the distances to cluster the vectors. Both of 8 | these steps are done on the GPU. 9 | } 10 | 11 | \usage{ 12 | gpuDistClust(points, distmethod = "euclidean", clustmethod = "complete") 13 | } 14 | 15 | \arguments{ 16 | \item{points}{a matrix of floating point numbers in which each row is a 17 | vector in $R^n$ space where $n$ is ncol(points).} 18 | \item{distmethod}{a string representing the name of the metric to use to 19 | calculate the distance between the vectors of 'points'. Currently 20 | supported values are: "binary", "canberra", "euclidean", "manhattan", 21 | "maximum".} 22 | \item{clustmethod}{a string representing the name of the clustering method 23 | to be applied to distances. Currently supported method names include 24 | "average", "centroid", "complete", "flexible", "flexible group", 25 | "mcquitty", "median", "single", "ward", and "wpgma".} 26 | } 27 | 28 | \value{Copied from the native R function 'hclust' documentation. A class of 29 | type "hclust" with the following attributes. 30 | \item{merge}{an n-1 by 2 matrix. 
Row i of 'merge' describes the merging of 31 | clusters at step i of the clustering. If an element j in the 32 | row is negative, then observation -j was merged at this 33 | stage. If j is positive then the merge was with the cluster 34 | formed at the (earlier) stage j of the algorithm. Thus 35 | negative entries in 'merge' indicate agglomerations of 36 | singletons, and positive entries indicate agglomerations of 37 | non-singletons. Copied from the native R function 'hclust' 38 | documentation.} 39 | \item{order}{a vector giving the permutation of the original observations 40 | suitable for plotting, in the sense that a cluster plot using 41 | this ordering and matrix 'merge' will not have crossings of 42 | the branches.} 43 | \item{height}{a set of n-1 non-decreasing real values. The clustering 44 | height: that is, the value of the criterion associated with 45 | the clustering 'method' for the particular agglomeration.} 46 | } 47 | 48 | \seealso{ 49 | \code{\link{gpuDist}}, \code{\link{gpuHclust}}. 50 | } 51 | 52 | \examples{ 53 | numVectors <- 5 54 | dimension <- 10 55 | Vectors <- matrix(runif(numVectors*dimension), numVectors, dimension) 56 | myClust <- gpuDistClust(Vectors, "maximum", "mcquitty") 57 | plot(myClust) 58 | } 59 | 60 | \keyword{cluster} 61 | -------------------------------------------------------------------------------- /man/gpuGranger.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuGranger} 2 | \alias{gpuGranger} 3 | \title{Perform Granger Causality Tests for Vectors on a GPU} 4 | 5 | \description{ 6 | This function performs, with the aid of a GPU, Granger Causality Tests on 7 | permutations of pairs of columns of the input matrices 'x' and 'y'. 8 | } 9 | 10 | \usage{ 11 | gpuGranger(x, y=NULL, lag) 12 | } 13 | 14 | \arguments{ 15 | \item{x}{a matrix of floating point values. 
Each column represents 16 | a sequence of observations for a single random variable.} 17 | \item{y}{an optional matrix of floating point values. Each column 18 | represents a sequence of observations for a single random variable.} 19 | \item{lag}{a positive integer by which to offset the sequence of 20 | observations to calculate the coefficient for Granger causality.} 21 | } 22 | 23 | \value{a list of two single precision floating point matrices both of the same 24 | dimension. The two matrices are fStatistics and pValues. The 25 | fStatistics matrix holds the F-statistics from the Granger causality tests. 26 | Each element of the pValues matrix is the p-value for the corresponding 27 | element of the fStatistics matrix. 28 | 29 | If y is NULL, the test is run on permutations of pairs of columns of x. To 30 | find the Granger causality F-statistic estimating the answer to 31 | "Does variable x[ ,j] Granger-cause variable x[ ,i]?", look at 32 | fStatistics[i, j] and pValues[i, j]. 33 | 34 | If y is not NULL, the test is run on permutations of pairs (x[ ,i], y[ ,j]). 35 | To find the Granger causality F-statistic estimating the answer to 36 | "Does variable y[ ,j] Granger-cause variable x[ ,i]?", look at 37 | fStatistics[i, j] and pValues[i, j]. 
38 | } 39 | 40 | \examples{ 41 | # permutations of pairs of cols of just x 42 | numRandVars <- 5 43 | numSamples <- 20 44 | randVarSequences <- matrix(runif(numRandVars*numSamples), numSamples, 45 | numRandVars) 46 | gpuGranger(randVarSequences, lag = 5) 47 | 48 | # pairs of cols, one from x and one from y 49 | numXRandVars <- 5 50 | numXSamples <- 20 51 | x <- matrix(runif(numXRandVars*numXSamples), numXSamples, numXRandVars) 52 | 53 | numYRandVars <- 3 54 | numYSamples <- 20 55 | y <- matrix(runif(numYRandVars*numYSamples), numYSamples, numYRandVars) 56 | 57 | result <- gpuGranger(x, y, lag = 5) 58 | print(result) 59 | } 60 | -------------------------------------------------------------------------------- /man/gpuHclust.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuHclust} 2 | \alias{gpuHclust} 3 | \title{Perform Hierarchical Clustering for Vectors with a GPU} 4 | 5 | \description{ 6 | This function performs clustering on a set of points. The distance between 7 | each pair of points should be calculated first using a function like 8 | 'gpuDist' or 'dist'. 9 | } 10 | 11 | \usage{ 12 | gpuHclust(distances, method = "complete") 13 | } 14 | 15 | \arguments{ 16 | \item{distances}{a class of type "dist" containing floating point numbers 17 | representing distances between points. R's native dist function and 18 | the gpuDist function produce output of this type.} 19 | \item{method}{a string representing the name of the clustering method to be 20 | applied to distances. Currently supported method names include 21 | "average", "centroid", "complete", "flexible", "flexible group", 22 | "mcquitty", "median", "single", "ward", and "wpgma".} 23 | } 24 | 25 | \value{Copied from the native R function 'hclust' documentation. A class of 26 | type "hclust" with the following attributes. 27 | \item{merge}{an n-1 by 2 matrix. Row i of 'merge' describes the merging of 28 | clusters at step i of the clustering. 
If an element j in the 29 | row is negative, then observation -j was merged at this 30 | stage. If j is positive then the merge was with the cluster 31 | formed at the (earlier) stage j of the algorithm. Thus 32 | negative entries in 'merge' indicate agglomerations of 33 | singletons, and positive entries indicate agglomerations of 34 | non-singletons. Copied from the native R function 'hclust' 35 | documentation.} 36 | \item{order}{a vector giving the permutation of the original observations 37 | suitable for plotting, in the sense that a cluster plot using 38 | this ordering and matrix 'merge' will not have crossings of 39 | the branches.} 40 | \item{height}{a set of n-1 non-decreasing real values. The clustering 41 | height: that is, the value of the criterion associated with 42 | the clustering 'method' for the particular agglomeration.} 43 | } 44 | 45 | \seealso{ 46 | hclust, \code{\link{gpuDistClust}} 47 | } 48 | 49 | \examples{ 50 | numVectors <- 5 51 | dimension <- 10 52 | Vectors <- matrix(runif(numVectors*dimension), numVectors, dimension) 53 | distMat <- gpuDist(Vectors, "euclidean") 54 | myClust <- gpuHclust(distMat, "single") 55 | plot(myClust) 56 | } 57 | 58 | \keyword{cluster} 59 | -------------------------------------------------------------------------------- /man/gpuLm.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLm.Rd 2 | % Part of the gputools package 3 | % mostly copied from 4 | % File src/library/stats/man/lm.Rd 5 | % Part of the R package, http://www.R-project.org 6 | % Copyright 1995-2007 R Core Development Team 7 | % Distributed under GPL 2 or later 8 | 9 | \name{gpuLm} 10 | \alias{gpuLm} 11 | % \alias{print.lm} 12 | \concept{regression} 13 | \title{Fitting Linear Models using a GPU--enabled QR} 14 | \description{ 15 | Most of this documentation is copied from R's documentation for \code{lm}. 16 | \code{gpuLm} is used to fit linear models using a 17 | GPU enabled QR decomposition. 
18 | It can be used to carry out regression, 19 | single stratum analysis of variance and 20 | analysis of covariance (although \code{\link{aov}} may provide a more 21 | convenient interface for these). 22 | 23 | Note: The QR decomposition employed by \code{gpuLm} is optimized for speed 24 | and uses minimal pivoting. If rank-revealing pivot is desired, then the 25 | function \code{gpuQR}, should be used. The most reliable 26 | determination of rank, however, will be obtained with the \code{svd} command. 27 | } 28 | 29 | \usage{ 30 | gpuLm(formula, data, subset, weights, na.action, 31 | method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE, 32 | singular.ok = TRUE, contrasts = NULL, useSingle = TRUE, offset, \dots) 33 | } 34 | 35 | \arguments{ 36 | \item{formula}{an object of class \code{"\link{formula}"} (or one that 37 | can be coerced to that class): a symbolic description of the 38 | model to be fitted. The details of model specification are given 39 | under \sQuote{Details}.} 40 | 41 | \item{data}{an optional data frame, list or environment (or object 42 | coercible by \code{\link{as.data.frame}} to a data frame) containing 43 | the variables in the model. If not found in \code{data}, the 44 | variables are taken from \code{environment(formula)}, 45 | typically the environment from which \code{lm} is called.} 46 | 47 | \item{subset}{an optional vector specifying a subset of observations 48 | to be used in the fitting process.} 49 | 50 | \item{weights}{an optional vector of weights to be used in the fitting 51 | process. Should be \code{NULL} or a numeric vector. 52 | If non-NULL, weighted least squares is used with weights 53 | \code{weights} (that is, minimizing \code{sum(w*e^2)}); otherwise 54 | ordinary least squares is used. See also \sQuote{Details},} 55 | 56 | \item{na.action}{a function which indicates what should happen 57 | when the data contain \code{NA}s. 
The default is set by 58 | the \code{na.action} setting of \code{\link{options}}, and is 59 | \code{\link{na.fail}} if that is unset. The \sQuote{factory-fresh} 60 | default is \code{\link{na.omit}}. Another possible value is 61 | \code{NULL}, no action. Value \code{\link{na.exclude}} can be useful.} 62 | 63 | \item{method}{the method to be used; for fitting, currently only 64 | \code{method = "qr"} is supported; \code{method = "model.frame"} returns 65 | the model frame (the same as with \code{model = TRUE}, see below).} 66 | 67 | \item{model, x, y, qr}{logicals. If \code{TRUE} the corresponding 68 | components of the fit (the model frame, the model matrix, the 69 | response, the qr decomposition) are returned.} 70 | 71 | \item{singular.ok}{logical. If \code{FALSE} (the default in S but 72 | not in \R) a singular fit is an error.} 73 | 74 | \item{contrasts}{an optional list. See the \code{contrasts.arg} 75 | of \code{\link{model.matrix.default}}.} 76 | 77 | \item{useSingle}{an optional logical. In the future, setting this to 78 | \code{FALSE} will result in using double precision arithmetic on the 79 | gpu, but this is not yet implemented} 80 | 81 | \item{offset}{this can be used to specify an \emph{a priori} known 82 | component to be included in the linear predictor during fitting. 83 | This should be \code{NULL} or a numeric vector of length equal to 84 | the number of cases. One or more \code{\link{offset}} terms can be 85 | included in the formula instead or as well, and if more than one are 86 | specified their sum is used. See \code{\link{model.offset}}.} 87 | 88 | \item{\dots}{additional arguments to be passed to the low level 89 | regression fitting functions (see below).} 90 | } 91 | 92 | \details{ 93 | Models for \code{lm} are specified symbolically. 
A typical model has 94 | the form \code{response ~ terms} where \code{response} is the (numeric) 95 | response vector and \code{terms} is a series of terms which specifies a 96 | linear predictor for \code{response}. A terms specification of the form 97 | \code{first + second} indicates all the terms in \code{first} together 98 | with all the terms in \code{second} with duplicates removed. A 99 | specification of the form \code{first:second} indicates the set of 100 | terms obtained by taking the interactions of all terms in \code{first} 101 | with all terms in \code{second}. The specification \code{first*second} 102 | indicates the \emph{cross} of \code{first} and \code{second}. This is 103 | the same as \code{first + second + first:second}. 104 | 105 | If the formula includes an \code{\link{offset}}, this is evaluated and 106 | subtracted from the response. 107 | 108 | If \code{response} is a matrix a linear model is fitted separately by 109 | least-squares to each column of the matrix. 110 | 111 | See \code{\link{model.matrix}} for some further details. The terms in 112 | the formula will be re-ordered so that main effects come first, 113 | followed by the interactions, all second-order, all third-order and so 114 | on: to avoid this pass a \code{terms} object as the formula (see 115 | \code{\link{aov}} and \code{demo(glm.vr)} for an example). 116 | 117 | A formula has an implied intercept term. To remove this use either 118 | \code{y ~ x - 1} or \code{y ~ 0 + x}. See \code{\link{formula}} for 119 | more details of allowed formulae. 
120 | 121 | Non-\code{NULL} \code{weights} can be used to indicate that different 122 | observations have different variances (with the values in 123 | \code{weights} being inversely proportional to the variances); or 124 | equivalently, when the elements of \code{weights} are positive 125 | integers \eqn{w_i}, that each response \eqn{y_i} is the mean of 126 | \eqn{w_i} unit-weight observations (including the case that there are 127 | \eqn{w_i} observations equal to \eqn{y_i} and the data have been 128 | summarized). 129 | 130 | \code{lm} calls the lower level functions \code{\link{lm.fit}}, etc, 131 | see below, for the actual numerical computations. For programming 132 | only, you may consider doing likewise. 133 | 134 | All of \code{weights}, \code{subset} and \code{offset} are evaluated 135 | in the same way as variables in \code{formula}, that is first in 136 | \code{data} and then in the environment of \code{formula}. 137 | } 138 | \value{ 139 | \code{lm} returns an object of \code{\link[base]{class}} \code{"lm"} or for 140 | multiple responses of class \code{c("mlm", "lm")}. 141 | 142 | The functions \code{summary} and \code{\link{anova}} are used to 143 | obtain and print a summary and analysis of variance table of the 144 | results. The generic accessor functions \code{coefficients}, 145 | \code{effects}, \code{fitted.values} and \code{residuals} extract 146 | various useful features of the value returned by \code{lm}. 
147 | 148 | An object of class \code{"lm"} is a list containing at least the 149 | following components: 150 | 151 | \item{coefficients}{a named vector of coefficients} 152 | \item{residuals}{the residuals, that is response minus fitted values.} 153 | \item{fitted.values}{the fitted mean values.} 154 | \item{rank}{the numeric rank of the fitted linear model.} 155 | \item{weights}{(only for weighted fits) the specified weights.} 156 | \item{df.residual}{the residual degrees of freedom.} 157 | \item{call}{the matched call.} 158 | \item{terms}{the \code{\link{terms}} object used.} 159 | \item{contrasts}{(only where relevant) the contrasts used.} 160 | \item{xlevels}{(only where relevant) a record of the levels of the 161 | factors used in fitting.} 162 | \item{offset}{the offset used (missing if none were used).} 163 | \item{y}{if requested, the response used.} 164 | \item{x}{if requested, the model matrix used.} 165 | \item{model}{if requested (the default), the model frame used.} 166 | \item{na.action}{(where relevant) information returned by 167 | \code{\link{model.frame}} on the special handling of \code{NA}s.} 168 | 169 | In addition, non-null fits will have components \code{assign}, 170 | \code{effects} and (unless not requested) \code{qr} relating to the linear 171 | fit, for use by extractor functions such as \code{summary} and 172 | \code{\link{effects}}. 173 | } 174 | \section{Using time series}{ 175 | Considerable care is needed when using \code{lm} with time series. 176 | 177 | Unless \code{na.action = NULL}, the time series attributes are 178 | stripped from the variables before the regression is done. (This is 179 | necessary as omitting \code{NA}s would invalidate the time series 180 | attributes, and if \code{NA}s are omitted in the middle of the series 181 | the result would no longer be a regular time series.) 
182 | 183 | Even if the time series attributes are retained, they are not used to 184 | line up series, so that the time shift of a lagged or differenced 185 | regressor would be ignored. It is good practice to prepare a 186 | \code{data} argument by \code{\link{ts.intersect}(\dots, dframe = TRUE)}, 187 | then apply a suitable \code{na.action} to that data frame and call 188 | \code{gpuLm} with \code{na.action = NULL} so that residuals and fitted 189 | values are time series. 190 | } 191 | \seealso{ 192 | \code{\link{summary.lm}} for summaries and \code{\link{anova.lm}} for 193 | the ANOVA table; \code{\link{aov}} for a different interface. 194 | 195 | The generic functions \code{\link{coef}}, \code{\link{effects}}, 196 | \code{\link{residuals}}, \code{\link{fitted}}, \code{\link{vcov}}. 197 | 198 | \code{\link{predict.lm}} (via \code{\link{predict}}) for prediction, 199 | including confidence and prediction intervals; 200 | \code{\link{confint}} for confidence intervals of \emph{parameters}. 201 | 202 | \code{\link{lm.influence}} for regression diagnostics, and 203 | \code{\link{glm}} for \bold{generalized} linear models. 204 | 205 | The underlying low level functions, 206 | \code{\link{lm.fit}} for plain, and \code{\link{lm.wfit}} for weighted 207 | regression fitting. 208 | 209 | More \code{lm()} examples are available e.g., in 210 | \code{\link[datasets]{anscombe}}, 211 | \code{\link[datasets]{attitude}}, 212 | \code{\link[datasets]{freeny}}, 213 | \code{\link[datasets]{LifeCycleSavings}}, 214 | \code{\link[datasets]{longley}}, 215 | \code{\link[datasets]{stackloss}}, 216 | \code{\link[datasets]{swiss}}. 217 | 218 | \code{biglm} in package \pkg{biglm} for an alternative 219 | way to fit linear models to large datasets (especially those with many 220 | cases). 221 | } 222 | 223 | \references{ 224 | Chambers, J. M. (1992) 225 | Linear models. 226 | Chapter 4 of Statistical Models in S 227 | eds J. M. Chambers and T. J. Hastie, Wadsworth & Brooks/Cole. 
228 | 229 | Wilkinson, G. N. and Rogers, C. E. (1973) 230 | Symbolic descriptions of factorial models for analysis of variance. 231 | Applied Statistics, 22, 392--9. 232 | } 233 | 234 | \author{ 235 | The design was inspired by the S function of the same name described 236 | in Chambers (1992). The implementation of model formula by Ross Ihaka 237 | was based on Wilkinson & Rogers (1973). 238 | 239 | This function was adapted for Nvidia's CUDA--supporting GPGPUs by 240 | Mark Seligman at Rapid Biologics LLC. 241 | \code{http://www.rapidbiologics.com} 242 | 243 | } 244 | 245 | \note{ 246 | Offsets specified by \code{offset} will not be included in predictions 247 | by \code{\link{predict.lm}}, whereas those specified by an offset term 248 | in the formula will be. 249 | } 250 | 251 | \examples{ 252 | # require(graphics) 253 | 254 | ## Annette Dobson (1990) "An Introduction to Generalized Linear Models". 255 | ## Page 9: Plant Weight Data. 256 | ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14) 257 | trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69) 258 | group <- gl(2,10,20, labels=c("Ctl","Trt")) 259 | weight <- c(ctl, trt) 260 | anova(lm.D9 <- gpuLm(weight ~ group)) 261 | summary(lm.D90 <- gpuLm(weight ~ group - 1))# omitting intercept 262 | summary(resid(lm.D9) - resid(lm.D90)) #- residuals almost identical 263 | 264 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0)) 265 | plot(lm.D9, las = 1) # Residuals, Fitted, ... 
266 | par(opar) 267 | 268 | ## model frame : 269 | stopifnot(identical(gpuLm(weight ~ group, method = "model.frame"), 270 | model.frame(lm.D9))) 271 | 272 | ### less simple examples in "See Also" above 273 | } 274 | 275 | \keyword{regression} 276 | 277 | -------------------------------------------------------------------------------- /man/gpuLm.defaultTol.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLm.defaultTol.Rd 2 | % part of the gputools R package 3 | % Distributed under GPL 2 or later 4 | 5 | \name{gpuLm.defaultTol} 6 | \title{Function to switch tolerance depending on precision} 7 | \usage{ 8 | gpuLm.defaultTol(useSingle = TRUE) 9 | } 10 | \alias{gpuLm.defaultTol} 11 | \description{ 12 | This function was written by Mark Seligman at Rapid Biologics, 13 | \code{http://rapidbiologics.com} 14 | 15 | The function \code{\link{gpuLm.fit}} calls this function 16 | to determine a default tolerance. So gpuLm.defaultTol should 17 | \emph{not} need to be used directly. 18 | } 19 | \arguments{ 20 | \item{useSingle}{logical. If TRUE, a tolerance will be returned appropriate 21 | for single precision arithmetic. 
If FALSE, a tolerance will be returned 22 | appropriate for double precision arithmetic.} 23 | } 24 | \value{ 25 | a floating point number representing a tolerance to be used by gpuLm.fit 26 | } 27 | \seealso{ 28 | \code{\link{gpuLm.fit}} \code{\link{gpuLm}} 29 | } 30 | -------------------------------------------------------------------------------- /man/gpuLm.fit.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLm.fit.Rd 2 | % part of the gputools R package 3 | % mostly copied from 4 | % File src/library/stats/man/lmfit.Rd 5 | % Part of the R package, http://www.R-project.org 6 | % Copyright 1995-2009 R Core Development Team 7 | % Distributed under GPL 2 or later 8 | 9 | \name{gpuLm.fit} 10 | \title{Fitter functions for gpu enabled linear models} 11 | \usage{ 12 | gpuLm.fit(x, y, w = NULL, offset = NULL, method = "qr", 13 | useSingle, tol = gpuLm.defaultTol(useSingle), singular.ok = TRUE, ...) 14 | } 15 | \alias{gpuLm.fit} 16 | \description{ 17 | The C code called by this function was written by 18 | Mark Seligman at Rapid Biologics, \code{http://rapidbiologics.com} 19 | 20 | The function \code{\link{gpuLm}} calls this function 21 | to fit linear models. So gpuLm.fit should \emph{not} need to be used 22 | directly. 23 | } 24 | \arguments{ 25 | \item{x}{design matrix of dimension \code{n * p}.} 26 | \item{y}{vector of observations of length \code{n}, or a matrix with 27 | \code{n} rows.} 28 | \item{w}{vector of weights (length \code{n}) to be used in the fitting 29 | process for the \code{wfit} functions. Weighted least squares is 30 | used with weights \code{w}, i.e., \code{sum(w * e^2)} is minimized.} 31 | \item{offset}{numeric of length \code{n}). This can be used to 32 | specify an \emph{a priori} known component to be included in the 33 | linear predictor during fitting.} 34 | 35 | \item{method}{currently, only \code{method="qr"} is supported.} 36 | 37 | \item{useSingle}{logical. 
If TRUE, the gpu will use single precision 38 | arithmetic. In the future, if FALSE the gpu may use double precision 39 | arithmetic, but this is not implemented yet.} 40 | 41 | % Avoid Matrix's grab of qr. 42 | \item{tol}{tolerance for the \code{\link[base]{qr}} decomposition. Default 43 | is 1e-7.} 44 | 45 | \item{singular.ok}{logical. If \code{FALSE}, a singular model is an 46 | error.} 47 | 48 | \item{\dots}{currently disregarded.} 49 | } 50 | \value{ 51 | %% S(-PLUS) returns an object of class "lm" 52 | %% such that print.lm, summary,... work; but that'd need more changes for R. 53 | a list with components 54 | \item{coefficients}{\code{p} vector} 55 | \item{residuals}{\code{n} vector or matrix} 56 | \item{fitted.values}{\code{n} vector or matrix} 57 | \item{effects}{(not null fits)\code{n} vector of orthogonal single-df 58 | effects. The first \code{rank} of them correspond to non-aliased 59 | coefficients, and are named accordingly.} 60 | \item{weights}{\code{n} vector --- \emph{only} for the \code{*wfit*} 61 | functions.} 62 | \item{rank}{integer, giving the rank} 63 | \item{df.residual}{degrees of freedom of residuals} 64 | \item{qr}{(not null fits) the QR decomposition, see \code{\link[base]{qr}}.} 65 | } 66 | \seealso{ 67 | \code{\link{gpuLm}} which should usually be used for linear least squares 68 | regression 69 | } 70 | \examples{ 71 | require(utils) 72 | set.seed(129) 73 | n <- 7 ; p <- 2 74 | X <- matrix(rnorm(n * p), n,p) # no intercept! 
75 | y <- rnorm(n) 76 | w <- rnorm(n)^2 77 | 78 | str(lmw <- gpuLm.fit(x=X, y=y, w=w)) 79 | } 80 | \keyword{regression} 81 | \keyword{array} 82 | -------------------------------------------------------------------------------- /man/gpuLsfit.Rd: -------------------------------------------------------------------------------- 1 | % File man/gpuLsfit.Rd 2 | % Part of the gputools package 3 | % mostly copied from 4 | % File src/library/stats/man/lsfit.Rd 5 | % Part of the R package, http://www.R-project.org 6 | % Copyright 1995-2007 R Core Development Team 7 | % Distributed under GPL 2 or later 8 | 9 | \name{gpuLsfit} 10 | \alias{gpuLsfit} 11 | 12 | \title{Least squares fit using GPU--enabled QR decomposition} 13 | 14 | \usage{ 15 | gpuLsfit(x, y, 16 | wt = NULL, intercept = TRUE, useSingle = TRUE, 17 | tolerance=gpuLm.defaultTol(useSingle), 18 | yname = NULL) 19 | } 20 | 21 | \description{ 22 | The least squares estimate of \bold{\eqn{\beta}{b}} in the model 23 | \deqn{\bold{Y} = \bold{X \beta} + \bold{\epsilon}}{y = X b + e} 24 | is found. 25 | 26 | Most of this documentation is copied from R's documentation for 27 | \code{lsfit}. The function \code{gpuLsfit} performs a least--squares fit 28 | using a GPU enabled QR decomposition. 29 | 30 | Note: The QR decomposition employed by \code{gpuLm} is optimized for speed 31 | and uses minimal pivoting. If more precise pivoting is desired, then 32 | either the function \code{gpuQR} or, better still, \code{svd} should be used. 33 | } 34 | 35 | \arguments{ 36 | \item{x}{a matrix whose rows correspond to cases and whose columns 37 | correspond to variables.} 38 | \item{y}{the responses, possibly a matrix if you want to fit multiple 39 | left hand sides.} 40 | \item{wt}{an optional vector of weights for performing weighted least squares.} 41 | \item{intercept}{whether or not an intercept term should be used.} 42 | \item{useSingle}{whether to use single precision arithmetic on the gpu. 
Only the 'TRUE' option is implemented so far.} 43 | \item{tolerance}{the tolerance to be used in the matrix decomposition. 44 | This defaults to 1e-04 for single--precision GPU computation.} 45 | \item{yname}{names to be used for the response variables.} 46 | } 47 | 48 | \details{ 49 | If weights are specified then a weighted least squares is performed 50 | with the weight given to the \emph{j}th case specified by the \emph{j}th 51 | entry in \code{wt}. 52 | 53 | If any observation has a missing value in any field, that observation 54 | is removed before the analysis is carried out. 55 | This can be quite inefficient if there is a lot of missing data. 56 | 57 | The implementation is via a modification of the LINPACK subroutines 58 | which allow for multiple left-hand sides. 59 | } 60 | 61 | \value{ 62 | A list with the following named components: 63 | \item{coef}{the least squares estimates of the coefficients in 64 | the model (\bold{\eqn{\beta}{b}} as stated above).} 65 | \item{residuals}{residuals from the fit.} 66 | \item{intercept}{indicates whether an intercept was fitted.} 67 | \item{qr}{the QR decomposition of the design matrix.} 68 | } 69 | 70 | \author{ 71 | This function was adapted for Nvidia's CUDA--supporting GPGPUs 72 | by Mark Seligman at Rapid Biologics LLC. 73 | \code{http://www.rapidbiologics.com} 74 | } 75 | 76 | \references{ 77 | Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) 78 | \emph{The New S Language}. Wadsworth & Brooks/Cole. 79 | } 80 | \seealso{ 81 | \code{\link{lsfit}}, \code{\link{lm}}, 82 | \code{\link{ls.print}}, \code{\link{ls.diag}} 83 | } 84 | \examples{ 85 | \dontshow{utils::example("lm", echo = FALSE)} 86 | ##-- Using the same data as the lm(.) 
example: 87 | lsD9 <- gpuLsfit(x = unclass(gl(2,10)), y = weight) 88 | ls.print(lsD9) 89 | } 90 | \keyword{regression} 91 | -------------------------------------------------------------------------------- /man/gpuMatMult.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuMatMult} 2 | \alias{gpuMatMult} 3 | \title{Perform Matrix Multiplication with a GPU} 4 | 5 | \description{ 6 | Performs matrix multiplication using a GPU. This function is merely a 7 | wrapper for the CUBLAS cublasDgemm function. 8 | } 9 | 10 | \usage{ 11 | gpuMatMult(a, b) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a numeric matrix.} 16 | \item{b}{a numeric matrix.} 17 | } 18 | 19 | \value{ 20 | A numeric matrix. 21 | The matrix is just the product of arguments 'a' and 'b'. 22 | } 23 | 24 | \examples{ 25 | matA <- matrix(runif(2*3), 2, 3) 26 | matB <- matrix(runif(3*4), 3, 4) 27 | gpuMatMult(matA, matB) 28 | } 29 | 30 | \keyword{array} 31 | \keyword{algebra} 32 | -------------------------------------------------------------------------------- /man/gpuMi.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuMi} 2 | \alias{gpuMi} 3 | \title{B spline based mutual information} 4 | 5 | \description{ 6 | This function estimates the mutual information for permutations 7 | of pairs of columns of a matrix using a B spline approach on a 8 | GPU device. Please note, the data must be values from the interval 9 | [0.0, 1.0]. 10 | } 11 | 12 | \usage{ 13 | gpuMi(x, y = NULL, bins = 2, splineOrder = 1) 14 | } 15 | 16 | \arguments{ 17 | \item{x}{a matrix of floating point numbers from the interval [0.0, 1.0]. 18 | Each column represents a list of samples of a random variable. The 19 | mutual information between each column of x and each column of y will 20 | be computed. If y is NULL then each pair of columns of x will be 21 | compared.} 22 | \item{y}{a matrix of floating point numbers from the interval [0.0, 1.0]. 
23 | Each column represents a list of samples of a random variable. The 24 | mutual information between each column of x and each column of y will 25 | be computed. If y is NULL then each pair of columns of x will be 26 | compared.} 27 | \item{bins}{a single integer value representing the number of equal 28 | intervals that [0.0, 1.0] will be divided into in order to determine 29 | the bins in which to place each value of the columns of x and y. In 30 | the case of splineOrder = 1, this determines the histogram for 31 | traditional mutual information. For splineOrder > 1, a single value 32 | may be placed in multiple adjoining bins with varying weights on 33 | membership.} 34 | \item{splineOrder}{a single integer value giving the degree of the spline 35 | polynomials used to define both the number of bins a single value will 36 | be placed in and the weight of membership given to the value.} 37 | } 38 | 39 | \value{ 40 | a matrix of single precision floating point values of order ncol(y) by 41 | ncol(x). Entry $(i, j)$ of this matrix represents the mutual information 42 | calculation for $(y_i, x_j)$. 43 | } 44 | 45 | \references{ 46 | Carsten O. Daub, Ralf Steuer, Joachim Selbig, and Sebastian Kloska. 47 | 2004. Estimating mutual information using B-spline functions -- an 48 | improved similarity measure for analysing gene expression data. 49 | \emph{BMC Bioinformatics}. 5:118.
Available from 50 | \url{http://www.biomedcentral.com/1471-2105/5/118} 51 | } 52 | 53 | \examples{ 54 | # get 3 random variables each with 20 samples 55 | x <- matrix(runif(60), 20, 3) 56 | y <- matrix(runif(60), 20, 3) 57 | # do something interesting 58 | y[,2] <- 3.0 * (x[,1] + x[,3]) 59 | z <- gpuMi(x, y, bins = 10, splineOrder = 3) 60 | print(z) 61 | } 62 | -------------------------------------------------------------------------------- /man/gpuQr.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuQr} 2 | \alias{gpuQr} 3 | \title{Estimate the QR decomposition for a matrix} 4 | 5 | \description{ 6 | gpuQr estimates the QR decomposition for a matrix using 7 | column pivoting and Householder matrices. The work is done on a 8 | GPU. 9 | 10 | Note: a rank-revealing pivoting scheme is employed, potentially 11 | resulting in a pivot distinctly different from ordinary "qr". 12 | } 13 | 14 | \usage{ 15 | gpuQr(x, tol = 1e-07) 16 | } 17 | 18 | \arguments{ 19 | \item{x}{a matrix of floating point numbers. 20 | This is the matrix that will be decomposed into Q and R 21 | factors.} 22 | \item{tol}{a floating point value. It is used for estimating the rank 23 | of matrix x.} 24 | } 25 | 26 | \value{ 27 | an object of class 'qr'. This object has members qr, qraux, pivot, rank. 28 | It is meant to be identical to the output of R's base function 'qr'. 29 | From the documentation for R's 'qr' function: 30 | The attribute qr is a matrix with the same dimension as 'x'. 31 | The upper triangle contains the R of the QR decomposition. 32 | The lower triangle contains partial information to construct Q. 33 | The attribute qraux is a vector of length 'ncol(x)' that contains 34 | more information to construct Q. 35 | The attribute rank is a single integer representing an estimation 36 | of the rank of input matrix x based on the results of the 37 | QR decomposition.
In some cases, this rank can be wildly different from 38 | the actual rank of the matrix x and so is only an estimation. 39 | The attribute pivot contains the permutation applied to columns of x 40 | in the process of calculating the QR decomposition. 41 | } 42 | 43 | \author{ 44 | The low--level implementation of this function for Nvidia's 45 | CUDA--supporting GPGPUs was written 46 | by Mark Seligman at Rapid Biologics LLC. 47 | \code{http://www.rapidbiologics.com} 48 | } 49 | 50 | \references{ 51 | Bischof, C. B. and Van Loan, C. F. (1987) 52 | The WY Representation for Products of Householder Matrices 53 | SIAM J Sci. and Stat. Comp, 8, s2--s13. 54 | 55 | Bjorck, Ake (1996) Numerical methods for least squares 56 | problems. SIAM. 57 | 58 | Golub, Gene H. and Van Loan, C. F. (1996) 59 | Matrix Computations, Ed. 3, ch. 5. 60 | } 61 | 62 | \examples{ 63 | # get some random data of any shape at all 64 | x <- matrix(runif(25), 5, 5) 65 | qr <- gpuQr(x) 66 | print(qr) 67 | } 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /man/gpuSolve.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuSolve} 2 | \alias{gpuSolve} 3 | \title{Estimate the solution to a matrix vector equation} 4 | 5 | \description{ 6 | This function estimates the solution to an equation of the form x * b = y 7 | where x is a matrix, b is an unknown vector, and y is a known vector. It 8 | does much calculation on a GPU. If the y argument is omitted, the 9 | function returns the inverse of x. 10 | 11 | The function uses R's base 'qr' and then applies the gpu to the result 12 | to get the final solution. 13 | } 14 | 15 | \usage{ 16 | gpuSolve(x, y=NULL) 17 | } 18 | 19 | \arguments{ 20 | \item{x}{a matrix of floating point numbers.} 21 | \item{y}{a vector of floating point numbers of length nrow(x).} 22 | } 23 | 24 | \value{ 25 | a vector or matrix of floating point numbers. 
If y is not null, 26 | then the value is an estimate of the vector b of length ncol(x) 27 | where x * b = y. If y is null or omitted, the value is a matrix, 28 | an estimate of a matrix multiplicative pseudo inverse of x. 29 | } 30 | 31 | \examples{ 32 | x <- matrix(runif(100), 10, 10) 33 | y <- runif(10) 34 | b <- gpuSolve(x, y) 35 | cat("Solution:\n") 36 | print(b) 37 | x.inverse <- gpuSolve(x) 38 | cat("an estimate of a pseudo inverse for x:\n") 39 | print(x.inverse) 40 | } 41 | -------------------------------------------------------------------------------- /man/gpuTcrossprod.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuTcrossprod} 2 | \alias{gpuTcrossprod} 3 | \title{Perform Matrix Transposed Cross-product with a GPU} 4 | 5 | \description{ 6 | Performs transposed matrix cross-product using a GPU. This function is merely a 7 | couple of wrappers for the CUBLAS cublasSgemm function. 8 | } 9 | 10 | \usage{ 11 | gpuTcrossprod(a, b) 12 | } 13 | 14 | \arguments{ 15 | \item{a}{a matrix of floating point values.} 16 | \item{b}{a matrix of floating point values. If null, defaults to 17 | 'a'.} 18 | } 19 | 20 | \value{ 21 | A matrix of single precision floating point values. 22 | The matrix is the transposed cross-product of arguments 'a' 23 | and 'b', i.e., a * t(b). 24 | } 25 | 26 | \examples{ 27 | matA <- matrix(runif(2*3), 2, 3) 28 | matB <- matrix(runif(4*3), 4, 3) 29 | gpuTcrossprod(matA, matB) 30 | } 31 | 32 | \keyword{array} 33 | \keyword{algebra} 34 | -------------------------------------------------------------------------------- /man/gpuTtest.Rd: -------------------------------------------------------------------------------- 1 | \name{gpuTtest} 2 | \alias{gpuTtest} 3 | \title{T-Test Estimator with a GPU} 4 | 5 | \description{ 6 | Given the number of samples and a Pearson correlation coefficient, this 7 | function estimates the t-score on a GPU.
If an entry in goodPairs is 8 | zero or one then you may get a NaN as the t-test result. 9 | } 10 | 11 | \usage{ 12 | gpuTtest(goodPairs, coeffs) 13 | } 14 | 15 | \arguments{ 16 | \item{goodPairs}{a vector of positive integer values. Value i 17 | represents the number of samples used to calculate the i-th 18 | value of the 'coeffs' argument.} 19 | \item{coeffs}{a vector of floating point values representing 20 | Pearson correlation coefficients.} 21 | } 22 | 23 | \value{a vector of single precision floating point values. The i-th entry 24 | is an estimate of the t-score of the i-th entry of the 'coeffs' argument. 25 | } 26 | 27 | \seealso{ 28 | \code{\link{gpuCor}}. 29 | } 30 | 31 | \examples{ 32 | goodPairs <- rpois(10, lambda=5) 33 | coeffs <- runif(10) 34 | gpuTtest(goodPairs, coeffs) 35 | } 36 | -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = @CUDA_CPPFLAGS@ 2 | PKG_LIBS = @LIBS@ -------------------------------------------------------------------------------- /src/correlation.h: -------------------------------------------------------------------------------- 1 | typedef enum useObs {everything, pairwiseComplete} UseObs; 2 | 3 | void pmcc(UseObs whichObs, const float * vectsa, size_t na, 4 | const float * vectsb, size_t nb, size_t dim, float * numPairs, 5 | float * correlations, float * signifs); 6 | 7 | void setDevice(int device); 8 | void getDevice(int * device); 9 | 10 | void getData(const int * images, 11 | const int * xcoords, const int * ycoords, const int * zcoords, 12 | const int * mins, const int * maxes, 13 | const float * evs, size_t numrows, size_t numimages, float * output); 14 | 15 | size_t parseResults(const int * imageList1, size_t numImages1, 16 | const int * imageList2, size_t numImages2, 17 | int structureid, 18 | double cutCorrelation, int cutPairs, 19 | const double * correlations, const 
double * signifs, const int * numPairs, 20 | double * results); 21 | 22 | int isSignificant(double signif, int df); 23 | void testSignif(const float * goodPairs, const float * coeffs, 24 | size_t n, float * tscores); 25 | 26 | void hostSignif(const float * goodPairs, const float * coeffs, 27 | size_t n, float * tscores); 28 | size_t signifFilter(const double * data, size_t rows, double * results); 29 | size_t gpuSignifFilter(const float * data, size_t rows, float * results); 30 | 31 | void cublasPMCC(const float * sampsa, size_t numSampsA, const float * sampsb, 32 | size_t numSampsB, size_t sampSize, float * res); 33 | 34 | double hostKendall(const float * X, const float * Y, size_t n); 35 | void permHostKendall(const float * a, size_t na, const float * b, size_t nb, 36 | size_t sampleSize, double * results); 37 | 38 | void masterKendall(const float * x, size_t nx, const float * y, size_t ny, 39 | size_t sampleSize, double * results); 40 | -------------------------------------------------------------------------------- /src/cudaUtils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "R.h" 7 | #include "nvrtc.h" 8 | #include "cuda.h" 9 | 10 | #include "cudaUtils.h" 11 | 12 | class CudaKernel { 13 | public: 14 | const char * name; 15 | const char * ptx; 16 | nvrtcProgram * prog; 17 | 18 | CudaKernel(const char * _name, const char * _ptx, nvrtcProgram * _prog) 19 | { 20 | name = _name; 21 | ptx = _ptx; 22 | prog = _prog; 23 | } 24 | }; 25 | 26 | std::map * cudaKernels; 27 | 28 | // Obtain compilation log from the program. 
29 | void printCompileLog(nvrtcProgram &prog) { 30 | size_t logSize; 31 | NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); 32 | char * log = new char[logSize]; 33 | NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); 34 | warning(log); 35 | delete[] log; 36 | } 37 | 38 | static 39 | std::vector & getFileKernels(std::string file) 40 | { 41 | std::vector * kernels; 42 | if (file == "correlation") { 43 | std::string newKernels[] = 44 | { "gpuSignif" 45 | , "gpuMeans" 46 | , "gpuSD" 47 | , "gpuPMCC" 48 | , "gpuMeansNoTest" 49 | , "gpuSDNoTest" 50 | , "gpuPMCCNoTest" 51 | , "dUpdateSignif" 52 | , "noNAsPmccMeans" 53 | }; 54 | kernels = new std::vector(newKernels, newKernels + 9); 55 | } else if (file == "distance") { 56 | std::string newKernels[] = 57 | { "euclidean_kernel_same" 58 | , "maximum_kernel_same" 59 | , "manhattan_kernel_same" 60 | , "canberra_kernel_same" 61 | , "binary_kernel_same" 62 | , "minkowski_kernel_same" 63 | }; 64 | kernels = new std::vector(newKernels, newKernels + 6); 65 | } else if (file == "granger") { 66 | std::string newKernels[] = 67 | { "getRestricted" 68 | , "getUnrestricted" 69 | , "ftest" 70 | , "getRestricted" 71 | , "getUnrestricted" 72 | , "ftest" 73 | }; 74 | kernels = new std::vector(newKernels, newKernels + 6); 75 | } else if (file == "hcluster") { 76 | std::string newKernels[] = 77 | { "complete_kernel" 78 | , "wpgma_kernel" 79 | , "average_kernel" 80 | , "median_kernel" 81 | , "centroid_kernel" 82 | , "flexible_group_kernel" 83 | , "flexible_kernel" 84 | , "ward_kernel" 85 | , "mcquitty_kernel" 86 | , "single_kernel" 87 | , "convert_kernel" 88 | , "find_min1_kernel" 89 | , "find_min2_kernel" 90 | }; 91 | kernels = new std::vector(newKernels, newKernels + 13); 92 | } else if (file == "kendall") { 93 | std::string newKernels[] = { "gpuKendall" }; 94 | kernels = new std::vector(newKernels, newKernels + 1); 95 | } else if (file == "mi") { 96 | std::string newKernels[] = 97 | { "scale" 98 | , "get_bin_scores" 99 | , "get_entropy" 
100 | , "get_mi" 101 | }; 102 | kernels = new std::vector(newKernels, newKernels + 4); 103 | } else if (file == "qrdecomp") { 104 | std::string newKernels[] = 105 | { "getColNorms" 106 | , "gpuFindMax" 107 | , "gpuSwapCol" 108 | , "makeHVector" 109 | , "UpdateHHNorms" 110 | }; 111 | kernels = new std::vector(newKernels, newKernels + 5); 112 | } else { 113 | kernels = new std::vector(); 114 | } 115 | return(*kernels); 116 | } 117 | 118 | extern "C" 119 | void cuCompile(const int * numFiles, 120 | const char ** cuFilenames, 121 | const char ** cuSrc) 122 | { 123 | cudaKernels = new std::map(); 124 | CUDA_SAFE_CALL(cuInit(0)); 125 | 126 | for (int i = 0; i < *numFiles; ++i) { 127 | std::string file = cuFilenames[i]; 128 | const char * src = cuSrc[i]; 129 | 130 | nvrtcProgram * prog = new nvrtcProgram(); 131 | NVRTC_SAFE_CALL( 132 | nvrtcCreateProgram(prog, // prog 133 | src, // buffer 134 | file.c_str(), // name 135 | 0, // numHeaders 136 | NULL, // headers 137 | NULL)); // includeNames 138 | 139 | std::vector kernels = getFileKernels(file); 140 | for(int i = 0; i < kernels.size(); ++i) { 141 | NVRTC_SAFE_CALL(nvrtcAddNameExpression(*prog, kernels[i].c_str())); 142 | } 143 | 144 | const char * options[] = { "--use_fast_math" }; 145 | 146 | nvrtcResult compileResult = nvrtcCompileProgram(*prog, 1, options); 147 | if (compileResult != NVRTC_SUCCESS) { 148 | printCompileLog(*prog); 149 | error("\ncuda kernel compile failed"); 150 | } 151 | 152 | // Obtain PTX from the program. 
153 | size_t ptxSize; 154 | NVRTC_SAFE_CALL(nvrtcGetPTXSize(*prog, &ptxSize)); 155 | 156 | char * ptx = Calloc(ptxSize, char); 157 | NVRTC_SAFE_CALL(nvrtcGetPTX(*prog, ptx)); 158 | 159 | for(int i = 0; i < kernels.size(); ++i) { 160 | const char * name; 161 | NVRTC_SAFE_CALL(nvrtcGetLoweredName(*prog, kernels[i].c_str(), &name)); 162 | (*cudaKernels)[kernels[i]] = new CudaKernel(name, ptx, prog); 163 | } 164 | } 165 | } 166 | 167 | extern "C" 168 | void unloadPackage() 169 | { 170 | std::vector ptxs; 171 | std::vector progs; 172 | 173 | std::map::iterator iter; 174 | for (iter = cudaKernels->begin(); iter != cudaKernels->end(); ++iter) { 175 | ptxs.push_back(iter->second->ptx); 176 | progs.push_back(iter->second->prog); 177 | delete iter->second; 178 | } 179 | 180 | delete cudaKernels; 181 | 182 | std::sort(ptxs.begin(), ptxs.end()); 183 | std::unique(ptxs.begin(), ptxs.end()); 184 | std::vector::iterator ptx_i; 185 | for (ptx_i = ptxs.begin(); ptx_i != ptxs.end(); ++ptx_i) { 186 | Free(*ptx_i); 187 | } 188 | 189 | std::sort(progs.begin(), progs.end()); 190 | std::unique(progs.begin(), progs.end()); 191 | std::vector::iterator prog_i; 192 | for (prog_i = progs.begin(); prog_i != progs.end(); ++prog_i) { 193 | NVRTC_SAFE_CALL(nvrtcDestroyProgram(*prog_i)); 194 | } 195 | } 196 | 197 | void cudaLaunch(std::string kernelName, 198 | void * args[], 199 | const dim3 &gridDim, const dim3 &blockDim, 200 | cudaStream_t stream) 201 | { 202 | const CudaKernel * cudaKernel = (*cudaKernels)[kernelName]; 203 | 204 | CUmodule module; 205 | CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, cudaKernel->ptx, 0, 0, 0)); 206 | 207 | CUfunction kernel; 208 | CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, cudaKernel->name)); 209 | 210 | CUDA_SAFE_CALL( 211 | cuLaunchKernel(kernel, 212 | gridDim.x, gridDim.y, gridDim.z, // grid dim 213 | blockDim.x, blockDim.y, blockDim.z, // block dim 214 | 0, stream, // shared mem and stream 215 | args, 0)); // arguments 216 | // 
CUDA_SAFE_CALL(cuCtxSynchronize()); 217 | CUDA_SAFE_CALL(cuModuleUnload(module)); 218 | } 219 | -------------------------------------------------------------------------------- /src/cudaUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef _CUDAUTILS_H_ 2 | #define _CUDAUTILS_H_ 3 | 4 | #include 5 | 6 | #include "R.h" 7 | #include "nvrtc.h" 8 | #include "cuda_runtime_api.h" 9 | 10 | #define NVRTC_SAFE_CALL(x) \ 11 | do { \ 12 | nvrtcResult result = x; \ 13 | if (result != NVRTC_SUCCESS) { \ 14 | error("\nerror: %d failed with error %s\n", x, \ 15 | nvrtcGetErrorString(result)); \ 16 | } \ 17 | } while(0) 18 | 19 | #define CUDA_SAFE_CALL(x) \ 20 | do { \ 21 | CUresult result = x; \ 22 | if (result != CUDA_SUCCESS) { \ 23 | const char *msg; \ 24 | cuGetErrorName(result, &msg); \ 25 | error("\nerror: %d failed with error %s\n", x, msg); \ 26 | } \ 27 | } while(0) 28 | 29 | void cudaLaunch(std::string kernelName, 30 | void * args[], 31 | const dim3 &gridDim, const dim3 &blockDim, 32 | cudaStream_t stream = NULL); 33 | 34 | #endif /* _CUDAUTILS_H_ */ 35 | -------------------------------------------------------------------------------- /src/cuseful.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include"cuseful.h" 10 | 11 | #include 12 | 13 | #define HALF RAND_MAX/2 14 | 15 | void fatal(const char * msg) 16 | { 17 | error(msg); 18 | } 19 | 20 | void getComputeNumber(int * major, int * minor) 21 | { 22 | int currentDevice = 0; 23 | struct cudaDeviceProp dProps; 24 | 25 | cudaGetDevice(¤tDevice); 26 | cudaGetDeviceProperties(&dProps, currentDevice); 27 | 28 | *major = dProps.major; 29 | *minor = dProps.minor; 30 | } 31 | 32 | void checkDoubleCapable(const char * failMsg) 33 | { 34 | int major, minor; 35 | major = minor = 0; 36 | getComputeNumber(&major, &minor); 37 | if((major < 1) || 
((major == 1) && (minor < 3))) 38 | error(failMsg); 39 | } 40 | 41 | float * getMatFromFile(int rows, int cols, const char * fn) 42 | { 43 | FILE * matFile; 44 | matFile = fopen(fn, "r"); 45 | if(matFile == NULL) 46 | error("unable to open file %s", fn); 47 | float * mat = Calloc(rows*cols, float); 48 | int i, j, err; 49 | for(i = 0; i < rows; i++) { 50 | for(j = 0; j < cols; j++) { 51 | err = fscanf(matFile, " %f ", mat+i+j*rows); 52 | if(err == EOF) 53 | error("file %s incorrect: formatting or size", fn); 54 | } 55 | fscanf(matFile, " \n "); 56 | } 57 | fclose(matFile); 58 | return mat; 59 | } 60 | 61 | char * getTime() { 62 | time_t curtime; 63 | struct tm *loctime; 64 | curtime = time(NULL); 65 | loctime = localtime(&curtime); 66 | 67 | return asctime(loctime); 68 | } 69 | 70 | void printVect(int n, const float * vect, const char * msg) { 71 | if(msg != NULL) Rprintf(msg); 72 | for(int i = 0; i < n; i++) { 73 | Rprintf("%6.4f, ", vect[i]); 74 | if((i+1)%10 == 0) Rprintf("\n"); 75 | } 76 | if(n%10 != 0) Rprintf("\n"); 77 | if(msg != NULL) Rprintf("----------\n"); 78 | } 79 | 80 | void printMat(int rows, int cols, const float * mat, const char * msg) { 81 | int i; 82 | if(msg != NULL) Rprintf(msg); 83 | for(i = 0; i < rows; i++) 84 | printVect(cols, mat+i*cols, NULL); 85 | if(msg != NULL) Rprintf("----------\n"); 86 | } 87 | 88 | int hasCudaError(const char * msg) { 89 | cudaError_t err = cudaGetLastError(); 90 | if(cudaSuccess != err) 91 | error("cuda error : %s : %s\n", msg, cudaGetErrorString(err)); 92 | return 0; 93 | } 94 | 95 | void checkCudaError(const char * msg) { 96 | cudaError_t err = cudaGetLastError(); 97 | if(cudaSuccess != err) { 98 | if(msg != NULL) 99 | warning(msg); 100 | error(cudaGetErrorString(err)); 101 | } 102 | } 103 | 104 | std::string cublasGetErrorString(cublasStatus err) 105 | { 106 | switch(err) { 107 | case CUBLAS_STATUS_SUCCESS : 108 | return "operation completed successfully"; 109 | case CUBLAS_STATUS_NOT_INITIALIZED : 110 | return 
"CUBLAS library not initialized"; 111 | case CUBLAS_STATUS_ALLOC_FAILED : 112 | return "resource allocation failed"; 113 | case CUBLAS_STATUS_INVALID_VALUE : 114 | return "unsupported numerical value was passed to function"; 115 | case CUBLAS_STATUS_ARCH_MISMATCH : 116 | return "function requires an architectural feature absent from \ 117 | the architecture of the device"; 118 | case CUBLAS_STATUS_MAPPING_ERROR : 119 | return "access to GPU memory space failed"; 120 | case CUBLAS_STATUS_EXECUTION_FAILED : 121 | return "GPU program failed to execute"; 122 | case CUBLAS_STATUS_INTERNAL_ERROR : 123 | return "an internal CUBLAS operation failed"; 124 | default : 125 | return "unknown error type"; 126 | } 127 | } 128 | 129 | void checkCublasError(const char * msg) 130 | { 131 | cublasStatus err = cublasGetError(); 132 | if(err != CUBLAS_STATUS_SUCCESS) 133 | error("cublas error : %s : %s\n", msg, cublasGetErrorString(err).c_str()); 134 | } 135 | 136 | int hasCublasError(const char * msg) 137 | { 138 | cublasStatus err = cublasGetError(); 139 | if(err != CUBLAS_STATUS_SUCCESS) 140 | error("cublas error : %s : %s\n", msg, cublasGetErrorString(err).c_str()); 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /src/cuseful.h: -------------------------------------------------------------------------------- 1 | void fatal(const char * msg); 2 | void getComputeNumber(int * major, int * minor); 3 | void checkDoubleCapable(const char * failMsg); 4 | char * getTime(); 5 | void printVect(int n, const float * vect, const char * msg); 6 | void printMat(int rows, int cols, const float * mat, const char * msg); 7 | void checkCudaError(const char * msg); 8 | int hasCudaError(const char * msg); 9 | float * getMatFromFile(int rows, int cols, const char * fn); 10 | void checkCublasError(const char * msg); 11 | int hasCublasError(const char * msg); 12 | 
-------------------------------------------------------------------------------- /src/distance.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "R.h" 6 | #include "Rinternals.h" 7 | 8 | #include "cuda_runtime_api.h" 9 | 10 | #include "cuseful.h" 11 | #include "distance.h" 12 | 13 | #include "cudaUtils.h" 14 | 15 | #define NUM_THREADS 32 16 | 17 | static void euclidean(const float * vg_a, size_t n_a, const float * vg_b, 18 | size_t n_b, size_t dim, float * d) 19 | { 20 | float 21 | sum, component; 22 | 23 | for(size_t y = 0; y < n_b; y++) { 24 | for(size_t x = 0; x < n_a; x++) { 25 | sum = 0.f; 26 | for(size_t i = 0; i < dim; i++) { 27 | component = vg_a[x * dim + i] - vg_b[y * dim + i]; 28 | sum += component * component; 29 | } 30 | d[y * dim + x] = sqrtf(sum); 31 | } 32 | } 33 | } 34 | 35 | static void maximum(const float * vg_a, size_t n_a, const float * vg_b, 36 | size_t n_b, size_t dim, float * d) 37 | { 38 | float 39 | themax, previous, current; 40 | 41 | for(size_t y = 0; y < n_b; y++) { 42 | for(size_t x = 0; x < n_a; x++) { 43 | previous = 0.f; 44 | for(size_t i = 0; i < dim; i++) { 45 | current = fabsf(vg_a[x * dim + i] - vg_b[y * dim + i]); 46 | themax = (previous < current)? 
current : previous; 47 | previous = themax; 48 | } 49 | d[y * dim + x] = themax; 50 | } 51 | } 52 | } 53 | 54 | static void manhattan(const float * vg_a, size_t n_a, const float * vg_b, 55 | size_t n_b, size_t dim, float * d) 56 | { 57 | float sum; 58 | 59 | for(size_t y = 0; y < n_b; y++) { 60 | for(size_t x = 0; x < n_a; x++) { 61 | sum = 0.f; 62 | for(size_t i = 0; i < dim; i++) 63 | sum += fabsf(vg_a[x * dim + i] - vg_b[y * dim + i]); 64 | d[y * dim + x] = sum; 65 | } 66 | } 67 | } 68 | 69 | static void canberra(const float * vg_a, size_t n_a, const float * vg_b, 70 | size_t n_b, size_t dim, float * d) 71 | { 72 | float 73 | componentDiff, componentSum, 74 | acoord, bcoord, 75 | sum; 76 | 77 | for(size_t y = 0; y < n_b; y++) { 78 | for(size_t x = 0; x < n_a; x++) { 79 | sum = 0.f; 80 | for(size_t i = 0; i < dim; i++) { 81 | acoord = vg_a[x * dim + i]; 82 | bcoord = vg_b[y * dim + i]; 83 | 84 | componentDiff = fabsf(acoord - bcoord); 85 | componentSum = fabsf(acoord + bcoord); 86 | 87 | if(componentSum != 0.f) 88 | sum += componentDiff / componentSum; 89 | } 90 | d[y * dim + x] = sum; 91 | } 92 | } 93 | } 94 | 95 | static void binary(const float * vg_a, size_t n_a, const float * vg_b, 96 | size_t n_b, size_t dim, float * d) 97 | { 98 | int 99 | acoord, bcoord, 100 | sharedOnes, ones; 101 | float ratio; 102 | 103 | for(size_t y = 0; y < n_b; ++y) { 104 | for(size_t x = 0; x < n_a; ++x) { 105 | ones = sharedOnes = 0; 106 | for(size_t i = 0; i < dim; ++i) { 107 | acoord = (vg_a[x * dim + i] != 0.f); 108 | bcoord = (vg_b[y * dim + i] != 0.f); 109 | 110 | if(acoord ^ bcoord) sharedOnes += 1; 111 | if(acoord || bcoord) ones += 1; 112 | } 113 | ratio = (ones != 0)? 
((float)sharedOnes / (float)ones) 114 | : (float)sharedOnes; 115 | 116 | d[y * dim + x] = ratio; 117 | } 118 | } 119 | } 120 | 121 | static void minkowski(const float * vg_a, size_t n_a, const float * vg_b, 122 | size_t n_b, size_t dim, float p, float * d) 123 | { 124 | float 125 | component, sum; 126 | 127 | for(size_t y = 0; y < n_b; y++) { 128 | for(size_t x = 0; x < n_a; x++) { 129 | sum = 0.f; 130 | for(size_t i = 0; i < dim; i++) { 131 | component = fabsf(vg_a[x * dim + i] - vg_b[y * dim + i]); 132 | sum += powf(component, p); 133 | } 134 | d[y * dim + x] = powf(sum, (float)(1.f / p)); 135 | } 136 | } 137 | } 138 | 139 | /* 140 | static void dot(const float * vg_a, size_t pitch_a, size_t n_a, 141 | const float * vg_b, size_t pitch_b, size_t n_b, 142 | size_t k, 143 | float * d, size_t pitch_d) 144 | { 145 | // Two different vectors 146 | if(vg_a != vg_b) { 147 | for(size_t y = 0; y < n_b; ++y) { 148 | for(size_t x = 0; x < n_a; ++x) { 149 | float s = 0.0; 150 | for(size_t i = 0; i < k; ++i) { 151 | float t = vg_a[x * pitch_a + i] * vg_b[y * pitch_b + i]; 152 | s += t; 153 | } 154 | d[y * pitch_d + x] = s; 155 | } 156 | } 157 | } else { 158 | // Compute 159 | for(size_t y = 1; y < n_b; ++y) { 160 | for(size_t x = 0; x <= y; ++x) { 161 | float s = 0.0; 162 | for(size_t i = 0; i < k; ++i) { 163 | float t = vg_a[x * pitch_a + i] * vg_b[y * pitch_b + i]; 164 | s += t; 165 | } 166 | d[y * pitch_d + x] = s; 167 | d[x * pitch_d + y] = s; 168 | } 169 | } 170 | } 171 | } 172 | */ 173 | 174 | void distance_host(const float * vg_a, size_t pitch_a, size_t n_a, 175 | const float * vg_b, size_t pitch_b, size_t n_b, 176 | size_t k, float * d, size_t pitch_d, 177 | dist_method method, float p) 178 | { 179 | switch(method) { 180 | case EUCLIDEAN: 181 | euclidean(vg_a, n_a, vg_b, n_b, k, d); 182 | break; 183 | case MAXIMUM: 184 | maximum(vg_a, n_a, vg_b, n_b, k, d); 185 | break; 186 | case MANHATTAN: 187 | manhattan(vg_a, n_a, vg_b, n_b, k, d); 188 | break; 189 | case CANBERRA: 
            // (continuation of distance_host's switch from the previous chunk)
            canberra(vg_a, n_a, vg_b, n_b, k, d);
            break;
        case BINARY:
            binary(vg_a, n_a, vg_b, n_b, k, d);
            break;
        case MINKOWSKI:
            minkowski(vg_a, n_a, vg_b, n_b, k, p, d);
            break;
        default:
            error("unknown distance method");
        /* case DOT:
            dot(vg_a, pitch_a / sizeof(float), n_a,
                vg_b, pitch_b / sizeof(float), n_b,
                k,
                d, pitch_d / sizeof(float));
            break; */
    }
}

// Launches the per-method "<name>_kernel_same" CUDA kernel on data already
// resident on the device.  The grid is n_a x n_b blocks of NUM_THREADS
// threads; all pitches are converted from bytes to float elements before
// being handed to the kernel as arguments.
void distance_device(const float * vg_a_d, size_t pitch_a, size_t n_a,
    const float * vg_b_d, size_t pitch_b, size_t n_b,
    size_t k,
    float * d_d, size_t pitch_d,
    dist_method method, float p)
{
    dim3 block(NUM_THREADS, 1, 1);
    dim3 grid(n_a, n_b, 1);

    size_t fbytes = sizeof(float);

    // Kernels index by element, so convert byte pitches to float strides.
    pitch_a /= fbytes;
    pitch_b /= fbytes;
    pitch_d /= fbytes;

    void * args[] = {
        &vg_a_d, &pitch_a, &n_a,
        &vg_b_d, &pitch_b, &n_b,
        &k,
        &d_d, &pitch_d, &p
    };

    std::string kernelName;

    switch(method) { // Calculate the distance
        case EUCLIDEAN:
            kernelName = "euclidean_kernel_same";
            break;
        case MAXIMUM:
            kernelName = "maximum_kernel_same";
            break;
        case MANHATTAN:
            kernelName = "manhattan_kernel_same";
            break;
        case CANBERRA:
            kernelName = "canberra_kernel_same";
            break;
        case BINARY:
            kernelName = "binary_kernel_same";
            break;
        case MINKOWSKI:
            kernelName = "minkowski_kernel_same";
            break;
        default:
            kernelName = "";
            error("unknown distance method");
    }
    cudaLaunch(kernelName, args, grid, block);
}

// Host entry point: copies the host-side vectors to pitched device storage,
// runs distance_device(), and copies the n_a x n_b result back to d.
// When vg_a and vg_b alias the same host buffer, vg_b is not copied a second
// time and the device copy of vg_a is reused for both arguments.
// NOTE(review): error paths inside distance_device/cudaLaunch do not free
// the device allocations made here -- confirm checkCudaError's behavior.
// (The function body continues in the next chunk of this file.)
void distance(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method, float p)
{
    size_t
        pitch_a_d, pitch_b_d, pitch_d_d;
    int same = (vg_a == vg_b); // are the two sets of vectors the same?

    // Space for the vector data
    float * distance_vg_a_d;
    float * distance_vg_b_d;

    // Space for the resulting distance
    float * distance_d_d;

    // Allocate space for the vectors and distances on the gpu
    cudaMallocPitch((void**)&distance_vg_a_d, &pitch_a_d, k * sizeof(float),
        n_a);
    cudaMemcpy2D(distance_vg_a_d, pitch_a_d, vg_a, pitch_a, k * sizeof(float),
        n_a, cudaMemcpyHostToDevice);
    cudaMallocPitch((void**)&distance_d_d, &pitch_d_d, n_a * sizeof(float),
        n_b);

    checkCudaError("distance function : malloc and memcpy");

    if(same) // don't need to move vg_b to gpu
        distance_device(distance_vg_a_d, pitch_a_d, n_a,
            distance_vg_a_d, pitch_a_d, n_a,
            k,
            distance_d_d, pitch_d_d,
            method, p);
    else { // vg_b is a different set of pnts so store it on gpu too
        cudaMallocPitch((void**)&distance_vg_b_d, &pitch_b_d,
            k * sizeof(float), n_b);
        cudaMemcpy2D(distance_vg_b_d, pitch_b_d, vg_b, pitch_b,
            k * sizeof(float), n_b, cudaMemcpyHostToDevice);

        checkCudaError("distance function : malloc and memcpy");

        distance_device(distance_vg_a_d, pitch_a_d, n_a,
            distance_vg_b_d, pitch_b_d, n_b,
            k,
            distance_d_d, pitch_d_d,
            method, p);
        cudaFree(distance_vg_b_d);
    }
    checkCudaError("distance function : kernel invocation");
    // Copy the result back to cpu land now that gpu work is done
    cudaMemcpy2D(d, pitch_d, distance_d_d, pitch_d_d, n_a * sizeof(float),
        n_b, cudaMemcpyDeviceToHost);
    checkCudaError("distance function : memcpy");

    // Free allocated space
    cudaFree(distance_vg_a_d);
    cudaFree(distance_d_d);
}

// Computes the all-pairs distance matrix of one point set and LEAVES the
// result on the GPU: *gpuDistances / *pitchDistances receive the device
// allocation, which the CALLER must cudaFree.
// (Signature continues in the next chunk of this file.)
void distanceLeaveOnGpu(dist_method method, float p, const float * points,
    size_t dim, size_t numPoints,
    float ** gpuDistances, size_t *
    pitchDistances) // outputs
{
    size_t pitchPoints;
    float * dPoints;

    // prepare the vectors and distance storage on the gpu
    cudaMallocPitch((void**)&dPoints,
        &pitchPoints, dim * sizeof(float), numPoints);
    cudaMemcpy2D(dPoints, pitchPoints, points,
        dim * sizeof(float), dim * sizeof(float), numPoints,
        cudaMemcpyHostToDevice);
    // *gpuDistances is allocated here but intentionally NOT freed: the
    // caller owns it (see hclusterPreparedDistances-style consumers).
    cudaMallocPitch((void**)gpuDistances, pitchDistances,
        numPoints * sizeof(float), numPoints);
    checkCudaError("distance on gpu func : malloc and memcpy");

    // Self-distance: the same device buffer is passed for both point sets.
    distance_device(dPoints, pitchPoints, numPoints,
        dPoints, pitchPoints, numPoints,
        dim,
        *gpuDistances, *pitchDistances,
        method, p);
    checkCudaError("distance on gpu func : kernel invocation");

    // clean up resources
    cudaFree(dPoints); // be kind rewind
}
--------------------------------------------------------------------------------
/src/distance.h:
--------------------------------------------------------------------------------
#ifndef DISTANCE_H
#define DISTANCE_H

// Methods for computing the distance
typedef enum {
    EUCLIDEAN, MAXIMUM, MANHATTAN, CANBERRA,
    BINARY, MINKOWSKI // ,DOT
} dist_method;

/* Calculate the distance matrix for vectors in group a and vectors in group b.
 *
 * The format for vg_a and vg_b is as follows:
 * There are n_* vectors, each of dimensionality k. They are stored in
 * row major order with a row (or pitch) being pitch_* bytes.
 * The calculated distances are stored in d such that the distance between
 * vectors indexed a and b is located in d[b * pitch_d / sizeof(float) + a].
 * The user is responsible for
 * allocating storage for d. It should be at least n_a * n_b * sizeof(float)
 * bytes. The pitch_d argument is the same as for vg_*.
 * The method used to calculate the distance is dist_method.
 *
 * The argument p is optional and used for the Minkowski method.
 *
 * This function may run on the CPU or GPU depending on the size and
 * number of vectors. Good data alignment will increase performance.
 */
void distance(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method,
    float p);

/* This function should be used to calculate a distance matrix when
 * the data is already stored on the GPU. vg_a, vg_b, and
 * d should already be allocated and initialized on the device.
 * This function does not allocate or free any storage on the device.
 */
void distance_device(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method,
    float p);

/* This function is analogous to distance_device except that it
 * will run on a CPU. Storage for d is not allocated in this function.
 */
void distance_host(const float * vg_a, size_t pitch_a, size_t n_a,
    const float * vg_b, size_t pitch_b, size_t n_b,
    size_t k,
    float * d, size_t pitch_d,
    dist_method method,
    float p = 2.0);

void distanceLeaveOnGpu(dist_method method, float p, const float * points,
    size_t dim, size_t numPoints, float ** gpuDistances,
    size_t * pitchDistances);


#endif // DISTANCE_H
--------------------------------------------------------------------------------
/src/granger.cpp:
--------------------------------------------------------------------------------
// NOTE(review): the four system-include targets below were stripped by the
// text dump (bare "#include" lines) -- restore from the upstream file.
#include
#include
#include
#include

#include "Rmath.h"
#include "cuda_runtime_api.h"

#include "cuseful.h"
#include "cudaUtils.h"
#include "granger.h"

// NOTE(review): a lower-case function-like `max` macro is risky next to
// standard headers (may collide with std::max uses); kept as-is here.
#define max(a, b) ((a > b)?a:b)

#define THREADSPERDIM 16

#define FALSE 0
#define TRUE !FALSE

// Converts the rows x cols matrix of F statistics (column-major) into
// p-values in place-parallel storage, using R's pf() with df1 = p and
// df2 = embedRows - 2*p - 1.
void getPValues(int rows, int cols, const float * fStats, int p, int embedRows,
    float * pValues)
{
    float fscore = 0.f;

    for(int i = 0; i < rows; i++) {
        for(int j = 0; j < cols; j++) {
            fscore = fStats[i + j * rows];
            pValues[i + j * rows] = 1.f - (float) pf((double) fscore,
                (double) p, (double)embedRows - 2.0 * (double) p - 1.0,
                1, 0);
        }
    }
}

// All-pairs Granger causality test with lag p over the cols columns of y
// (each column has `rows` observations).  Writes cols x cols F statistics
// and p-values.  Body continues in the next chunk of this file.
// NOTE(review): early-return error paths below leak the device allocations
// made before the failure -- flagged for upstream cleanup.
void granger(int rows, int cols, const float * y, int p,
    float * fStats, float * pValues)
{
    if(cols < 2) {
        fatal("The Granger test needs at least 2 variables.\n");
        return;
    }
    int
        i, j, k, t = p+1,
        fbytes = sizeof(float),
        embedRows = rows-p, embedCols = t*2;
    float
        * Y, * rQ, * rR,
        * unrQ, * unrR,
        * restricted, * unrestricted,
        * rdata, * unrdata,
        * dfStats; // * dpValues;
    size_t
        size = cols*cols*fbytes, partSize = embedRows*size;

    cudaMalloc((void **)&Y, embedCols*partSize);

    cudaMalloc((void **)&rQ,
        t*embedRows*cols*fbytes);
    cudaMalloc((void **)&rR, t*t*cols*fbytes);
    cudaMalloc((void **)&rdata, t*embedRows*cols*fbytes);
    cudaMalloc((void **)&unrdata, (embedCols-1)*partSize);
    cudaMalloc((void **)&restricted, t*cols*fbytes);
    if( hasCudaError("granger: line 267: gpu memory allocation") ) return;

    int
        Ydim = embedCols * embedRows,
        rQdim = t * embedRows, rRdim = t * t,
        rdataDim = t*embedRows, restrictedDim = t,
        unrQdim = (embedCols-1) * embedRows,
        unrRdim = (embedCols-1) * (embedCols-1),
        unrestrictedDim = embedCols-1, unrdataDim = (embedCols-1)*embedRows;
    float
        * ypos, * rdataPos, * unrdataPos,
        * evenCols;
    int
        skip = 2*embedRows, colBytes = embedRows*fbytes;
    const float
        * vectA, * vectB;

    // Build the lag-embedded design matrices on the device, one (i, j)
    // column pair at a time: Y holds t interleaved lagged copies of columns
    // i and j; unrdata gets the unrestricted design, rdata the restricted.
    for(i = 0; i < cols; i++) {
        rdataPos = rdata+i*rdataDim;
        evenCols = rdataPos+embedRows;
        vectA = y+i*rows;
        for(j = 0; j < cols; j++) {
            if(i == j) continue; // no self-pairing in the all-pairs test

            ypos = Y+(i*cols+j)*Ydim;
            unrdataPos = unrdata+(i*cols+j)*unrdataDim;

            vectB = y+j*rows;

            for(k = 0; k < p+1; k++) { // produce t subcols
                cudaMemcpy(ypos+k*skip, vectA+(p-k), embedRows*fbytes,
                    cudaMemcpyHostToDevice);
                cudaMemcpy(ypos+k*skip+embedRows, vectB+(p-k),
                    embedRows*fbytes, cudaMemcpyHostToDevice);
            }
            cudaMemcpy(unrdataPos+embedRows, ypos+skip,
                (embedCols-2)*embedRows*fbytes, cudaMemcpyDeviceToDevice);
        }
        // build restricted data from last set of unrestricted data
        // only need one per column, not one for each pairing
        for(k = 0; k < embedCols-2; k+=2) {
            cudaMemcpy(evenCols+(k*embedRows)/2, unrdataPos+(1+k)*embedRows,
                colBytes, cudaMemcpyDeviceToDevice);
        }
    }
    if( hasCudaError("granger : mem copy from host to device") ) return;

    // One thread block dimension per THREADSPERDIM columns, rounded up.
    int numBlocks = cols / THREADSPERDIM;
    if(numBlocks * THREADSPERDIM < cols) numBlocks++;

    dim3
        dimRGrid(numBlocks),
        dimRBlock(THREADSPERDIM),
        dimUnrGrid(numBlocks, numBlocks),
        dimUnrBlock(THREADSPERDIM, THREADSPERDIM);

    void * restArgs[] = {
        &cols, &cols,
        &embedRows,
        &t,
        &rdata,
        &rdataDim,
        &Y,
        &Ydim,
        &rQ,
        &rQdim,
        &rR,
        &rRdim,
        &restricted,
        &restrictedDim
    };
    // Fit the restricted (own-lags-only) models via QR on the device.
    cudaLaunch("getRestricted", restArgs, dimRGrid, dimRBlock);
    if( hasCudaError("granger: getRestricted kernel execution") ) return;

    // QR workspaces for the restricted fit are no longer needed.
    cudaFree(rQ);
    cudaFree(rR);

    cudaMalloc((void **)&unrQ, (embedCols-1)*partSize);
    cudaMalloc((void **)&unrR, (embedCols-1)*(embedCols-1)*size);
    cudaMalloc((void **)&unrestricted, (embedCols-1)*size);
    if( hasCudaError("granger: line 336: attemped gpu memory allocation") )
        return;

    size_t unrestT = embedCols - 1;
    void * unrestArgs[] = {
        &cols, &cols,
        &embedRows,
        &unrestT,
        &unrdata,
        &unrdataDim,
        &Y,
        &Ydim,
        &unrQ,
        &unrQdim,
        &unrR,
        &unrRdim,
        &unrestricted,
        &unrestrictedDim
    };
    // Fit the unrestricted (own + other lags) models.
    cudaLaunch("getUnrestricted", unrestArgs, dimUnrGrid, dimUnrBlock);
    if( hasCudaError("granger : getUnRestricted kernel execution") ) return;

    cudaFree(unrQ);
    cudaFree(unrR);

    size_t resultSize = cols*cols*fbytes;
    cudaMalloc((void **)&dfStats, resultSize);
    // cudaMalloc((void **)&dpValues, resultSize);
    if( hasCudaError("granger: line 350: gpu memory allocation") ) return;

    // diagFlag FALSE: the cols x cols grid has an unused diagonal (i == j).
    int diagFlag = FALSE;
    void * ftestArgs[] = {
        &diagFlag,
        &p,
        &embedRows,
        &cols, &cols,
        &t, &unrestT,
        &Y, &Ydim,
        &restricted, &restrictedDim,
        &unrestricted, &unrestrictedDim,
        &rdata, &rdataDim,
        &unrdata, &unrdataDim,
        &dfStats
    };
    // Compare restricted vs unrestricted residuals -> F statistics.
    cudaLaunch("ftest", ftestArgs, dimUnrGrid, dimUnrBlock);
    if( hasCudaError("granger : ftest kernel execution") )
        return;

    cudaMemcpy(fStats, dfStats, resultSize, cudaMemcpyDeviceToHost);
    // cudaMemcpy(pValues, dpValues, resultSize, cudaMemcpyDeviceToHost);
    if( hasCudaError("granger : mem copy device to host") ) return;

    // p-values are computed on the host from the copied-back F statistics.
    getPValues(cols, cols, fStats, p, embedRows, pValues);

    cudaFree(Y);
    cudaFree(restricted);
    cudaFree(unrestricted);
    cudaFree(rdata);
    cudaFree(unrdata);
    cudaFree(dfStats);
    // cudaFree(dpValues);
}

// Granger test of every column of x against every column of y (colsx x colsy
// pairings), lag p.  Same structure as granger() above but with two distinct
// variable sets and no skipped diagonal.  Body continues in the next chunk.
void grangerxy(int rows, int colsx, const float * x, int colsy,
    const float * y, int p, float * fStats, float * pValues)
{

    if((p < 0) || (rows < 1) || (colsx < 1) || (colsy < 1)) {
        fatal("The Granger XY test needs at least a pair variables.\n");
        return;
    }
    int
        i, j, k, t = p+1,
        fbytes = sizeof(float),
        embedRows = rows-p, embedCols = t*2;
    float
        * Y, * rQ, * rR,
        * unrQ, * unrR,
        * restricted, * unrestricted,
        * rdata, * unrdata,
        * dfStats; // * dpValues;
    size_t
        size = colsx*colsy*fbytes, partSize = embedRows*size;

    cudaMalloc((void **)&Y, embedCols*partSize);

    cudaMalloc((void **)&rQ, t*embedRows*colsy*fbytes);
    cudaMalloc((void **)&rR, t*t*colsy*fbytes);
    cudaMalloc((void **)&rdata, t*embedRows*colsy*fbytes);
    cudaMalloc((void **)&restricted, t*colsy*fbytes);

    cudaMalloc((void **)&unrQ, (embedCols-1)*partSize);
    cudaMalloc((void **)&unrR, (embedCols-1)*(embedCols-1)*size);
    cudaMalloc((void **)&unrestricted, (embedCols-1)*size);
    cudaMalloc((void **)&unrdata, (embedCols-1)*partSize);
    checkCudaError("grangerxy : attemped gpu memory allocation");

    int
        Ydim = embedCols * embedRows,
        rQdim = t * embedRows, rRdim = t * t,
        rdataDim = t*embedRows, restrictedDim = t,
        unrQdim = (embedCols-1) * embedRows,
        unrRdim = (embedCols-1) * (embedCols-1),
        unrestrictedDim
        = embedCols-1, unrdataDim = (embedCols-1)*embedRows;
    float
        * ypos, * rdataPos, * unrdataPos;

    int
        skip = 2*embedRows, colBytes = embedRows*fbytes;
    const float * vectA, * vectB;
    float * evenCols;

    // Lag-embed every (y column, x column) pairing on the device, mirroring
    // the construction in granger() but without skipping i == j.
    for(i = 0; i < colsy; i++) {
        rdataPos = rdata+i*rdataDim;
        evenCols = rdataPos+embedRows;
        vectA = y+i*rows;
        for(j = 0; j < colsx; j++) {
            ypos = Y+(i*colsx+j)*Ydim;
            unrdataPos = unrdata+(i*colsx+j)*unrdataDim;

            vectB = x+j*rows;

            for(k = 0; k < p+1; k++) { // produce t subcols
                cudaMemcpy(ypos+k*skip, vectA+(p-k), embedRows*fbytes,
                    cudaMemcpyHostToDevice);
                cudaMemcpy(ypos+k*skip+embedRows, vectB+(p-k),
                    embedRows*fbytes, cudaMemcpyHostToDevice);
            }
            cudaMemcpy(unrdataPos+embedRows, ypos+skip,
                (embedCols-2)*embedRows*fbytes, cudaMemcpyDeviceToDevice);
        }
        // build restricted data from last set of unrestricted data
        // only need one per column, not one for each pairing
        for(k = 0; k < embedCols-2; k+=2) {
            cudaMemcpy(evenCols+(k*embedRows)/2, unrdataPos+(1+k)*embedRows,
                colBytes, cudaMemcpyDeviceToDevice);
        }
        // NOTE(review): errline is 16 bytes; "gxy err : %d\n" plus the NUL
        // fits only while i <= 9999 -- larger column counts would overflow.
        char errline[16];
        sprintf(errline, "gxy err : %d\n", i);
        if( hasCudaError(errline) ) return;
    }
    checkCudaError("grangerxy : mem copy from host to device");

    // Independent block counts per axis since colsx may differ from colsy.
    int
        numBlocksX = colsx / THREADSPERDIM,
        numBlocksY = colsy / THREADSPERDIM;

    if(numBlocksX * THREADSPERDIM < colsx) numBlocksX++;
    if(numBlocksY * THREADSPERDIM < colsy) numBlocksY++;

    dim3
        dimRGrid(numBlocksY),
        dimRBlock(THREADSPERDIM),
        dimUnrGrid(numBlocksX, numBlocksY),
        dimUnrBlock(THREADSPERDIM, THREADSPERDIM);

    void * restArgs[] = {
        &colsx, &colsy,
        &embedRows,
        &t,
        &rdata, &rdataDim,
        &Y, &Ydim,
        &rQ, &rQdim,
        &rR, &rRdim,
        &restricted, &restrictedDim
    };
    // Restricted (own-lags-only) model fits.
    cudaLaunch("getRestricted", restArgs,
        dimRGrid, dimRBlock);

    size_t unrestT = embedCols - 1;
    void * unrestArgs[] = {
        &colsx, &colsy,
        &embedRows,
        &unrestT,
        &unrdata, &unrdataDim,
        &Y, &Ydim,
        &unrQ, &unrQdim,
        &unrR, &unrRdim,
        &unrestricted, &unrestrictedDim
    };
    // Unrestricted model fits.
    cudaLaunch("getUnrestricted", unrestArgs,
        dimUnrGrid, dimUnrBlock);

    checkCudaError("grangerxy : kernel execution get(Un)Restricted");

    cudaFree(rQ);
    cudaFree(unrQ);
    cudaFree(rR);
    cudaFree(unrR);

    size_t resultSize = colsx*colsy*fbytes;
    cudaMalloc((void **)&dfStats, resultSize);
    // cudaMalloc((void **)&dpValues, resultSize);
    checkCudaError("grangerxy : attemped gpu memory allocation");

    // diagFlag TRUE: every cell of the colsx x colsy grid is a valid pairing.
    int diagFlag = TRUE;
    void * ftestArgs[] = {
        &diagFlag,
        &p,
        &embedRows,
        &colsx, &colsy,
        &t, &unrestT,
        &Y, &Ydim,
        &restricted, &restrictedDim,
        &unrestricted, &unrestrictedDim,
        &rdata, &rdataDim,
        &unrdata, &unrdataDim,
        &dfStats
    };
    cudaLaunch("ftest", ftestArgs, dimUnrGrid, dimUnrBlock);
    checkCudaError("grangerxy : kernel execution ftest");

    cudaMemcpy(fStats, dfStats, resultSize, cudaMemcpyDeviceToHost);
    // cudaMemcpy(pValues, dpValues, resultSize, cudaMemcpyDeviceToHost);
    checkCudaError("grangerxy : mem copy from device to host");

    // Host-side conversion of the F statistics to p-values.
    getPValues(colsx, colsy, fStats, p, embedRows, pValues);

    cudaFree(Y);
    cudaFree(restricted);
    cudaFree(unrestricted);
    cudaFree(rdata);
    cudaFree(unrdata);
    cudaFree(dfStats);
    // cudaFree(dpValues);
}
--------------------------------------------------------------------------------
/src/granger.h:
--------------------------------------------------------------------------------
#ifndef _GRANGER_H_
#define _GRANGER_H_

void granger(int rows, int cols,
    const float * y, int p,
    float * fStats, float * pValues);
void grangerxy(int rows,
    int colsx, const float * x,
    int colsy, const float * y,
    int p,
    float * fStats,
    float * pValues);

#endif /* _GRANGER_H_ */
--------------------------------------------------------------------------------
/src/hcluster.cpp:
--------------------------------------------------------------------------------
// NOTE(review): the three system-include targets below were stripped by the
// text dump (bare "#include" lines) -- restore from the upstream file.
#include
#include
#include

#include "R.h"

#include "math_constants.h"
#include "cuseful.h"
#include "hcluster.h"
#include "cudaUtils.h"

#define NUM_THREADS 32
#define NUM_BLOCKS 1024

// Hierarchical clustering of n items given their host-side n x n distance
// matrix: copies the distances to pitched device memory, then delegates the
// actual clustering to hclusterPreparedDistances().
void hcluster(const float * dist, size_t dist_pitch, size_t n,
    int * sub, int * sup, float * val, hc_method method,
    float lambda, float beta)
{
    // Allocate space for the distance matrix
    size_t pitch_dist_d;
    float * hcluster_dist_d; // Distance matrix
    cudaMallocPitch((void**)&hcluster_dist_d, &pitch_dist_d,
        n * sizeof(float), n);

    // Copy the distance matrix
    cudaMemcpy2D(hcluster_dist_d, pitch_dist_d,
        dist, dist_pitch,
        n * sizeof(float), n,
        cudaMemcpyHostToDevice);

    hclusterPreparedDistances(hcluster_dist_d, pitch_dist_d, n,
        sub, sup,
        val,
        method, lambda, beta);

    cudaFree(hcluster_dist_d);
    checkCudaError("hcluster : cudaFree");
}

// Agglomerative clustering over a distance matrix ALREADY on the device
// (gpuDist is modified in place).  Outputs, each of length n - 1:
//   sub/sup : the pair of cluster indices merged at each step
//   val     : the merge height for each step
// Body continues in the next chunk of this file.
void hclusterPreparedDistances(float * gpuDist, size_t pitch_dist_d, size_t n,
    int * sub, int * sup,
    float * val,
    hc_method method,
    float lambda, float beta)
{
    float
        * hcluster_count_d, // Number of elements in each cluster
        * hcluster_min_val_d, // find min of each row and
        * hcluster_merge_val_d; // Array of the values merged at

    // col containing the min of each row
    size_t * hcluster_min_col_d;

    // Arrays telling which cluster merged with which cluster
    int
        *
        hcluster_sub_d,
        * hcluster_sup_d;

    cudaMalloc((void**)&hcluster_count_d, n * sizeof(float));
    cudaMalloc((void**)&hcluster_min_val_d, n * sizeof(float));
    cudaMalloc((void**)&hcluster_min_col_d, n * sizeof(size_t));
    cudaMalloc((void**)&hcluster_sub_d, (n - 1) * sizeof(int));
    cudaMalloc((void**)&hcluster_sup_d, (n - 1) * sizeof(int));
    cudaMalloc((void**)&hcluster_merge_val_d, (n - 1) * sizeof(float));

    // Every element starts in its own cluster
    float * pre_count = Calloc(n, float);
    for(size_t i = 0; i < n; ++i)
        pre_count[i] = 1.0;

    cudaMemcpy(hcluster_count_d, pre_count, n * sizeof(float),
        cudaMemcpyHostToDevice);
    checkCudaError("hcluster : malloc and memcpy");

    Free(pre_count);

    dim3
        grid0(NUM_BLOCKS, 1, 1), block0(NUM_THREADS, 1, 1),
        grid1(1, 1, 1), block1(NUM_THREADS, 1, 1);

    // Kernels index the distance matrix by element, not byte.
    size_t gpuPitch = pitch_dist_d / sizeof(float);
    void * convertArgs[] = {
        &gpuDist,
        &gpuPitch,
        &n
    };

    // Convert 0 on the diagonal to infinity
    cudaLaunch("convert_kernel", convertArgs, grid1, block1);
    checkCudaError("hcluster : convert kernel");

    // Pick the linkage-update kernel name for the requested method.
    std::string func;
    switch(method) {
        case COMPLETE:
            func = "complete_kernel";
            break;
        case WPGMA:
            func = "wpgma_kernel";
            break;
        case AVERAGE:
            func = "average_kernel";
            break;
        case MEDIAN:
            func = "median_kernel";
            break;
        case CENTROID:
            func = "centroid_kernel";
            break;
        case FLEXIBLE_GROUP:
            func = "flexible_group_kernel";
            break;
        case FLEXIBLE:
            func = "flexible_kernel";
            break;
        case WARD:
            func = "ward_kernel";
            break;
        case MCQUITTY:
            func = "mcquitty_kernel";
            break;
        case SINGLE:
        default:
            func = "single_kernel";
            break;
    }

    // The trailing null slots below are filled per-iteration with the
    // addresses of the loop variables before each launch.
    void * findMin1Args[] = {
        &gpuDist,
        &gpuPitch,
        &n,
        &hcluster_count_d,
        &hcluster_min_val_d,
        &hcluster_min_col_d,
        0 // place holder for row_offset (a loop variable)
    };
    void * findMin2Args[] = {
        &hcluster_min_val_d,
        &hcluster_min_col_d,
        &hcluster_count_d,
        &hcluster_sub_d,
        &hcluster_sup_d,
        &hcluster_merge_val_d,
        &n,
        0 // place holder for iter
    };
    void * funcArgs[] = {
        &gpuDist,
        &gpuPitch,
        &n,
        &hcluster_sub_d,
        &hcluster_sup_d,
        &hcluster_count_d,
        &hcluster_merge_val_d,
        0, // place holder for iter
        0, // place holder for col_offset
        &lambda,
        &beta
    };
    size_t skip = NUM_BLOCKS * NUM_THREADS;
    // Merge items n - 1 times
    for(size_t iter = 0; iter < (n - 1); ++iter) {
        // Find the minimum of each column
        for(size_t row_offset = 0; row_offset < n; row_offset += NUM_BLOCKS) {
            findMin1Args[6] = &row_offset;
            cudaLaunch("find_min1_kernel", findMin1Args,
                grid0, block0);
        }

        // Find overall winner; update arrays sub, sup, val, count
        findMin2Args[7] = &iter;
        cudaLaunch("find_min2_kernel", findMin2Args,
            grid1, block1);

        // Update the distance matrix
        funcArgs[7] = &iter;
        for(size_t col_offset = 0; col_offset < n; col_offset += skip) {
            funcArgs[8] = &col_offset;
            cudaLaunch(func, funcArgs, grid0, block0);
        }
    }
    checkCudaError("hcluster : method kernel calls");

    // Copy results
    cudaMemcpy(sub, hcluster_sub_d, (n - 1) * sizeof(int),
        cudaMemcpyDeviceToHost);
    cudaMemcpy(sup, hcluster_sup_d, (n - 1) * sizeof(int),
        cudaMemcpyDeviceToHost);
    cudaMemcpy(val, hcluster_merge_val_d, (n-1)*sizeof(float),
        cudaMemcpyDeviceToHost);

    checkCudaError("hcluster : results memcpy");

    cudaFree(hcluster_count_d);
    cudaFree(hcluster_min_val_d);
    cudaFree(hcluster_min_col_d);
    cudaFree(hcluster_sub_d);
    cudaFree(hcluster_sup_d);
    cudaFree(hcluster_merge_val_d);

    checkCudaError("hcluster : cudaFree");
}
--------------------------------------------------------------------------------
/src/hcluster.h:
--------------------------------------------------------------------------------
#ifndef HCLUSTER_H
#define HCLUSTER_H

/* Methods for hierarchical clustering */
typedef enum {
    SINGLE,
    COMPLETE,
    WPGMA,
    AVERAGE,
    MEDIAN,
    CENTROID,
    FLEXIBLE_GROUP,
    FLEXIBLE,
    WARD,
    MCQUITTY
} hc_method;

void hcluster(const float * dist, size_t dist_pitch, size_t n,
    int * sub, int * sup,
    float * val,
    hc_method method,
    float lambda = 0.5, float beta = 0.5);

void hclusterPreparedDistances(float * gpuDist, size_t pitch_dist_d, size_t n,
    int * sub, int * sup,
    float * val,
    hc_method method,
    float lambda, float beta);

#endif // HCLUSTER_H
--------------------------------------------------------------------------------
/src/kendall.cpp:
--------------------------------------------------------------------------------
#include "nvrtc.h"
#include "cuda.h"

#include "R.h"

#include "cuseful.h"
#include "cudaUtils.h"

#include "kendall.h"

#define NUMTHREADS 16

// Kendall correlation of every column of x (nx columns) against every column
// of y (ny columns), each with sampleSize observations; results is the
// nx * ny output matrix on the host.  Runs the "gpuKendall" kernel on an
// nx x ny grid of NUMTHREADS x NUMTHREADS blocks.
// Body continues in the next chunk of this file.
void masterKendall(const float * x, size_t nx,
    const float * y, size_t ny,
    size_t sampleSize, double * results)
{
    size_t
        outputLength = nx * ny, outputBytes = outputLength*sizeof(double),
        xBytes = nx*sampleSize*sizeof(float),
        yBytes = ny*sampleSize*sizeof(float);
    float
        * gpux, * gpuy;
    double
        * gpuResults;
    dim3
        grid(nx, ny), block(NUMTHREADS, NUMTHREADS);

    cudaMalloc((void **)&gpux, xBytes);
    cudaMalloc((void **)&gpuy, yBytes);
    checkCudaError("input vector space allocation");

    cudaMemcpy(gpux, x, xBytes,
        cudaMemcpyHostToDevice);
    cudaMemcpy(gpuy, y, yBytes, cudaMemcpyHostToDevice);
    checkCudaError("copying input vectors to gpu");

    cudaMalloc((void **)&gpuResults, outputBytes);
    checkCudaError("allocation of space for result matrix");

    void *args[] =
        { &gpux
        , &nx
        , &gpuy
        , &ny
        , &sampleSize
        , &gpuResults
        };
    cudaLaunch("gpuKendall", args,
        grid, block);

    cudaFree(gpux);
    cudaFree(gpuy);
    cudaMemcpy(results, gpuResults, outputBytes, cudaMemcpyDeviceToHost);
    cudaFree(gpuResults);
    checkCudaError("copying results from gpu and cleaning up");
}
--------------------------------------------------------------------------------
/src/kendall.h:
--------------------------------------------------------------------------------
#ifndef _KENDALL_H_
#define _KENDALL_H_

void masterKendall(const float * x, size_t nx,
    const float * y, size_t ny,
    size_t sampleSize,
    double * results);

#endif /* _KENDALL_H_ */
--------------------------------------------------------------------------------
/src/lsfit.cpp:
--------------------------------------------------------------------------------
// NOTE(review): the four system-include targets below were stripped by the
// text dump (bare "#include" lines); <algorithm> must be among them since
// getCRE uses std::min -- restore from the upstream file.
#include
#include
#include
#include

#include "cublas.h"

#include "R.h"

#include "cuseful.h"
#include "lsfit.h"
#include "qrdecomp.h"

// Copyright 2009, Mark Seligman at Rapid Biologics, LLC.  All rights
// reserved.
//

// Rounds "length" up to the next multiple of the block length.
//
int alignBlock(int length, unsigned blockExp) {
    // blockSize = 2^blockExp; the mask clears the low blockExp bits after
    // adding blockSize - 1, i.e. the classic round-up-to-power-of-two trick.
    int blockSize = 1 << blockExp;
    return (length + blockSize - 1) & (((unsigned) -1) << blockExp);
}

// Single-precision least-squares fit of Y on X via a blocked, pivoted QR
// decomposition on the GPU.  On return X holds the packed QR factors;
// coeffs, resids and effects are filled via getCRE() when the rank is
// positive, otherwise the residuals are just a copy of Y.
void gpuLSFitF(float * X, int rows, int cols, float * Y, int yCols,
    double tol, float * coeffs, float * resids, float * effects,
    int * rank, int * pivot, double * qrAux)
{
    const int
        fbytes = sizeof(float);

    // Should be >= 4, to satisfy alignment criterea for memory
    // coalescence.  For larger arrays (> 1000 rows), best performance
    // has been observed at 7.
    //
    const unsigned blockExp = 7; // Gives blockSize = 2^7 = 128.

    float *dQR;

    // Rows are padded up to a multiple of the block size.
    int stride = alignBlock(rows, blockExp);

    cublasInit();
    cublasAlloc(stride * cols, fbytes, (void **)&dQR);

    // This is overkill: just need to zero the padding.
    //
    cudaMemset2D(dQR, cols * fbytes, 0.f, cols * fbytes, stride);
    cublasSetMatrix(rows, cols, fbytes, X, rows, dQR, stride);

    // On return we have dQR in pivoted, packed QR form.

    getQRDecompBlocked(rows, cols, tol, dQR, 1 << blockExp,
        stride, pivot, qrAux, rank);
    cublasGetMatrix(rows, cols, fbytes, dQR, stride, X, rows);

    if(*rank > 0)
        getCRE(dQR, rows, cols, stride, *rank, qrAux, yCols, coeffs, resids, effects);
    else // Residuals copied from Y.
        memcpy(resids, Y, rows * yCols * fbytes);

    cublasFree(dQR);
    cublasShutdown();
}

// Double-precision variant of gpuLSFitF; not yet implemented.
void gpuLSFitD(double *X, int n, int p, double *Y, int nY,
    double tol, double *coeffs, double *resids, double *effects,
    int *rank, int *pivot, double * qrAux)
{
    // NYI
}


// Fills in the coefficients, residuals and effects matrices.
74 | // 75 | void getCRE(float *dQR, int rows, int cols, int stride, int rank, double *qrAux, 76 | int yCols, float *coeffs, float *resids, float *effects) 77 | { 78 | const int 79 | fbytes = sizeof(float); 80 | // Used by effects, residual computations. 81 | // 82 | int maxIdx = std::min(rank, rows - 1); 83 | 84 | float 85 | * diags = Calloc(rank * fbytes, float), 86 | * dDiags, *dResids, *dCoeffs, *dEffects; 87 | 88 | cublasAlloc(rank, fbytes, (void **) &dDiags); 89 | 90 | cublasAlloc(cols * yCols, fbytes, (void **) &dCoeffs); 91 | cublasAlloc(rows * yCols, fbytes, (void **) &dResids); 92 | cublasAlloc(rows * yCols, fbytes, (void **) &dEffects); 93 | 94 | // Temporarily swaps diagonals with qrAux. 95 | 96 | cublasScopy(rank, dQR, stride + 1, dDiags, 1); 97 | cublasGetVector(rank, fbytes, dDiags, 1, diags, 1); 98 | 99 | float *qrAuxFloat = Calloc(maxIdx * fbytes, float); 100 | for (int i = 0; i < maxIdx; i++) 101 | qrAuxFloat[i] = qrAux[i]; 102 | cublasSetVector(maxIdx, fbytes, qrAuxFloat, 1, dQR, stride + 1); 103 | Free(qrAuxFloat); 104 | 105 | cublasSetMatrix(cols, yCols, fbytes, coeffs, cols, dCoeffs, cols); 106 | cublasSetMatrix(rows, yCols, fbytes, effects, rows, dEffects, rows); 107 | cublasSetMatrix(rows, yCols, fbytes, resids, rows, dResids, rows); 108 | 109 | // Computes the effects matrix, intialized by caller to Y. 110 | 111 | float 112 | * pEffects = dEffects; 113 | 114 | for (int i = 0; i < yCols; i++, pEffects += rows) { 115 | float 116 | * pQR = dQR; 117 | 118 | for (int k = 0; k < maxIdx; k++, pQR += (stride + 1)) { 119 | double 120 | t = cublasSdot(rows - k, pQR, 1, pEffects + k, 1); 121 | 122 | t *= -1.0 / qrAux[k]; 123 | cublasSaxpy(rows - k, t, pQR, 1, pEffects + k, 1); 124 | } 125 | } 126 | 127 | // Computes the residuals matrix, initialized by caller to zero. 128 | // If not of full row rank, presets the remaining rows to those from 129 | // effects. 
130 | 131 | if(rank < rows) { 132 | for(int i = 0; i < yCols; i++) { 133 | cublasScopy(rows - rank, dEffects + i*rows + rank, 1, 134 | dResids + i*rows + rank, 1); 135 | } 136 | } 137 | 138 | float 139 | * pResids = dResids; 140 | 141 | for (int i = 0; i < yCols; i++, pResids += rows) { 142 | for (int k = maxIdx - 1; k >= 0; k--) { 143 | double 144 | t = -(1.0 / qrAux[k]) 145 | * cublasSdot(rows - k, dQR + k*stride + k, 1, pResids + k, 1); 146 | 147 | cublasSaxpy(rows -k, t, dQR + k*stride + k, 1, pResids + k, 1); 148 | } 149 | } 150 | cublasScopy(maxIdx, dDiags, 1, dQR, stride + 1); 151 | 152 | // Computes the coefficients matrix, initialized by caller to zero. 153 | 154 | float 155 | * pCoeffs = dCoeffs; 156 | 157 | for(int i = 0; i < yCols; i++, pCoeffs += cols) { 158 | cublasScopy(rank, dEffects + i*rows, 1, pCoeffs, 1); 159 | 160 | float t; 161 | for(int k = rank - 1; k > 0; k--) { 162 | cublasSscal(1, 1.f / diags[k], pCoeffs + k, 1); 163 | cublasGetVector(1, fbytes, pCoeffs + k, 1, &t, 1); 164 | cublasSaxpy(k, -t, dQR + k*stride, 1, pCoeffs, 1); 165 | } 166 | cublasSscal(1, 1.f / diags[0], pCoeffs, 1); 167 | } 168 | Free(diags); 169 | 170 | cublasGetMatrix(cols, yCols, fbytes, dCoeffs, cols, coeffs, cols); 171 | cublasGetMatrix(rows, yCols, fbytes, dResids, rows, resids, rows); 172 | cublasGetMatrix(rows, yCols, fbytes, dEffects, rows, effects, rows); 173 | 174 | cublasFree(dDiags); 175 | cublasFree(dCoeffs); 176 | cublasFree(dResids); 177 | cublasFree(dEffects); 178 | } 179 | -------------------------------------------------------------------------------- /src/lsfit.h: -------------------------------------------------------------------------------- 1 | #ifndef _LSFIT_H_ 2 | #define _LSFIT_H_ 3 | 4 | void getCRE(float *dQR, int rows, int cols, int stride, int rank, 5 | double *qrAux, int yCols, 6 | float *coeffs, float *resids, float *effects); 7 | void gpuLSFitF(float *X, int n, int p, float *Y, int nY, 8 | double tol, float *coeffs, 9 | float *resids, float 
*effects, 10 | int *rank, int *pivot, double * qrAux); 11 | void gpuLSFitD(double *X, int n, int p, double *Y, int nY, 12 | double tol, double *coeffs, 13 | double *resids, double *effects, 14 | int *rank, int *pivot, double * qrAux); 15 | 16 | #endif /* _LSFIT_H_ */ 17 | -------------------------------------------------------------------------------- /src/matmult.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | SEXP gpuMatMult(SEXP a, SEXP b) { 17 | double 18 | * xa = REAL(a), * xb = REAL(b), 19 | * gpua, * gpub, * gpuc; 20 | 21 | SEXP 22 | dima = getAttrib(a, R_DimSymbol), 23 | dimb = getAttrib(b, R_DimSymbol); 24 | 25 | int 26 | rowsa = INTEGER(dima)[0], colsa = INTEGER(dima)[1], 27 | rowsb = INTEGER(dimb)[0], colsb = INTEGER(dimb)[1]; 28 | 29 | cublasStatus_t stat; 30 | cublasHandle_t handle; 31 | 32 | cudaError_t cudaStat; 33 | 34 | cudaStat = cudaMalloc((void**) &gpua, rowsa * colsa * sizeof(double)); 35 | if (cudaStat != cudaSuccess) error("device memory allocation failed"); 36 | 37 | cudaStat = cudaMalloc((void**) &gpub, rowsb * colsb * sizeof(double)); 38 | if (cudaStat != cudaSuccess) error("device memory allocation failed"); 39 | 40 | int 41 | rowsOpA = rowsa, colsOpA = colsa, colsOpB = colsb; 42 | 43 | cudaStat = cudaMalloc((void**) &gpuc, rowsOpA * colsOpB * sizeof(double)); 44 | if (cudaStat != cudaSuccess) error("device memory allocation failed"); 45 | 46 | stat = cublasCreate(&handle); 47 | if(stat != CUBLAS_STATUS_SUCCESS) error("CUBLAS initialization failed\n"); 48 | 49 | stat = cublasSetMatrix(rowsa, colsa, sizeof(double), xa, rowsa, 50 | gpua, rowsa); 51 | if(stat != CUBLAS_STATUS_SUCCESS) { 52 | cudaFree(gpuc); 53 | cudaFree(gpub); 54 | cudaFree(gpua); 55 | cublasDestroy(handle); 56 | error("data download failed\n"); 57 | } 58 | 59 | stat = 
cublasSetMatrix(rowsb, colsb, sizeof(double), xb, rowsb, 60 | gpub, rowsb); 61 | if(stat != CUBLAS_STATUS_SUCCESS) { 62 | cudaFree(gpuc); 63 | cudaFree(gpub); 64 | cudaFree(gpua); 65 | cublasDestroy(handle); 66 | error("data download failed\n"); 67 | } 68 | 69 | const double alpha = 1.0, beta = 0.0; 70 | cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, rowsOpA, colsOpB, colsOpA, 71 | &alpha, (const double *) gpua, rowsa, (const double *) gpub, rowsb, 72 | &beta, gpuc, rowsOpA); 73 | 74 | SEXP ab, dimab; 75 | PROTECT(ab = allocVector(REALSXP, rowsOpA * colsOpB)); 76 | PROTECT(dimab = allocVector(INTSXP, 2)); 77 | INTEGER(dimab)[0] = rowsOpA; INTEGER(dimab)[1] = colsOpB; 78 | setAttrib(ab, R_DimSymbol, dimab); 79 | 80 | double * xab = REAL(ab); 81 | stat = cublasGetMatrix(rowsOpA, colsOpB, sizeof(double), gpuc, rowsOpA, 82 | xab, rowsOpA); 83 | if(stat != CUBLAS_STATUS_SUCCESS) { 84 | cudaFree(gpuc); 85 | cudaFree(gpub); 86 | cudaFree(gpua); 87 | cublasDestroy(handle); 88 | error("data upload failed\n"); 89 | } 90 | 91 | cudaFree(gpua); 92 | cudaFree(gpub); 93 | cudaFree(gpuc); 94 | 95 | cublasDestroy(handle); 96 | UNPROTECT(2); 97 | return ab; 98 | } 99 | -------------------------------------------------------------------------------- /src/matmult.h: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | SEXP gpuMatMult(SEXP a, SEXP b); 3 | } 4 | -------------------------------------------------------------------------------- /src/mi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "cudaUtils.h" 10 | 11 | #define NTHREADS 16 12 | 13 | static int initKnots(int nbins, int order, float ** knots) { 14 | int 15 | om1 = order - 1, 16 | degree = nbins - 1, 17 | dpo = degree + order, 18 | nknots = dpo + 1; 19 | 20 | *knots = Calloc(nknots, float); 21 | for(int i = 0; i < nknots; i++) { 22 
| if(i <= om1) 23 | (*knots)[i] = 0.f; 24 | else if(i <= degree) 25 | (*knots)[i] = (*knots)[i-1] + 1.f; 26 | else 27 | (*knots)[i] = (*knots)[degree] + 1.f; 28 | } 29 | return nknots; 30 | } 31 | 32 | 33 | void bSplineMutualInfo(int nbins, int order, int nsamples, 34 | int nx, const float * x, 35 | int ny, const float * y, 36 | float * out_mi) 37 | { 38 | size_t 39 | pitch[2], pitch_bins[2], 40 | col_bytes = (size_t)nsamples * sizeof(float); 41 | int 42 | nknots, nblocks[2], size[2] = { nx, ny }; 43 | float 44 | * knots, * dknots, 45 | * stage[2], * dx[2], * dentropy[2], * dbins[2]; 46 | const float 47 | * data[2] = { x, y }; 48 | 49 | nknots = initKnots(nbins, order, &knots); 50 | float knot_max = knots[nknots - 1]; 51 | cudaMalloc((void **)&dknots, nknots * sizeof(float)); 52 | cudaMemcpy(dknots, knots, nknots * sizeof(float), cudaMemcpyHostToDevice); 53 | Free(knots); 54 | 55 | checkCudaError("bSplineMutualInfoSingle: 1"); 56 | 57 | for(int i = 0; i < 2; i++) { 58 | cudaMallocPitch((void **)&(dx[i]), pitch + i, col_bytes, size[i]); 59 | cudaMallocHost((void **)&(stage[i]), size[i] * col_bytes); 60 | cudaMalloc((void **)&(dentropy[i]), size[i] * sizeof(float)); 61 | cudaMallocPitch((void **)&(dbins[i]), pitch_bins + i, 62 | nbins * col_bytes, size[i]); 63 | cudaMemset2D(dbins[i], pitch_bins[i], 0, nbins * col_bytes, size[i]); 64 | 65 | nblocks[i] = size[i] / NTHREADS; 66 | if(nblocks[i] * NTHREADS < size[i]) 67 | nblocks[i]++; 68 | } 69 | 70 | checkCudaError("bSplineMutualInfoSingle: 2"); 71 | 72 | cudaStream_t stream[2]; 73 | for(int i = 0; i < 2; i++) 74 | cudaStreamCreate(stream + i); 75 | for(int i = 0; i < 2; i++) { 76 | cudaMemcpyAsync(stage[i], data[i], size[i] * col_bytes, 77 | cudaMemcpyHostToHost, stream[i]); 78 | cudaMemcpy2DAsync(dx[i], pitch[i], stage[i], col_bytes, col_bytes, 79 | size[i], cudaMemcpyHostToDevice, stream[i]); 80 | } 81 | for(int i = 0; i < 2; i++) { 82 | dim3 83 | grid(nblocks[i]), 84 | block(NTHREADS); 85 | 86 | size_t xpitch = 
pitch[i] / sizeof(float); 87 | 88 | void * scaleArgs[] = { 89 | &knot_max, 90 | size + i, 91 | &nsamples, 92 | dx + i, 93 | &xpitch 94 | }; 95 | cudaLaunch("scale", scaleArgs, 96 | grid, block, stream[i]); 97 | 98 | size_t pitch_bins_i = pitch_bins[i] / sizeof(float); 99 | void * gbsArgs[] = { 100 | &nbins, &order, 101 | &nknots, &dknots, 102 | &nsamples, size + i, 103 | dx + i, &xpitch, 104 | dbins + i, 105 | &pitch_bins_i 106 | }; 107 | cudaLaunch("get_bin_scores", gbsArgs, 108 | grid, block, stream[i]); 109 | 110 | void * entropyArgs[] = { 111 | &nbins, 112 | &nsamples, 113 | size + i, 114 | dbins + i, 115 | &pitch_bins_i, 116 | dentropy + i 117 | }; 118 | cudaLaunch("get_entropy", entropyArgs, 119 | grid, block, stream[i]); 120 | } 121 | checkCudaError("bSplineMutualInfoSingle: 3"); 122 | 123 | cudaFree(dknots); 124 | for(int i = 0; i < 2; i++) { 125 | cudaFreeHost(stage[i]); 126 | cudaFree(dx[i]); 127 | } 128 | 129 | size_t pitch_mi; 130 | float * dmi; 131 | cudaMallocPitch((void **)&dmi, &pitch_mi, ny * sizeof(float), nx); 132 | 133 | dim3 134 | gridDim(nblocks[0], nblocks[1]), blockDim(NTHREADS, NTHREADS); 135 | 136 | pitch_bins[0] /= sizeof(float); 137 | pitch_bins[1] /= sizeof(float); 138 | 139 | size_t dpitch_mi = pitch_mi / sizeof(float); 140 | 141 | void * miArgs[] = { 142 | &nbins, &nsamples, 143 | &nx, dbins, pitch_bins, dentropy, 144 | &ny, dbins + 1, pitch_bins + 1, dentropy + 1, 145 | &dmi, &dpitch_mi 146 | }; 147 | cudaLaunch("get_mi", miArgs, gridDim, blockDim); 148 | checkCudaError("bSplineMutualInfoSingle: 4"); 149 | 150 | for(int i = 0; i < 2; i++) 151 | cudaFree(dbins[i]); 152 | 153 | float * mi_stage; 154 | cudaMallocHost((void **)&mi_stage, nx * ny * sizeof(float)); 155 | 156 | cudaMemcpy2D(mi_stage, ny * sizeof(float), dmi, pitch_mi, 157 | ny * sizeof(float), nx, cudaMemcpyDeviceToHost); 158 | checkCudaError("bSplineMutualInfoSingle: 5"); 159 | cudaFree(dmi); 160 | 161 | memcpy(out_mi, mi_stage, nx * ny * sizeof(float)); 162 | 
cudaFreeHost(mi_stage); 163 | checkCudaError("bSplineMutualInfoSingle: 6"); 164 | } 165 | -------------------------------------------------------------------------------- /src/mi.h: -------------------------------------------------------------------------------- 1 | #ifndef _MI_H_ 2 | #define _MI_H_ 3 | 4 | void bSplineMutualInfo(int nbins, int order, int nsamples, 5 | int nx, const float * x, 6 | int ny, const float * y, 7 | float * out_mi); 8 | 9 | #endif /* _MI_H_ */ 10 | -------------------------------------------------------------------------------- /src/qrdecomp.h: -------------------------------------------------------------------------------- 1 | #ifndef _QRDECOMP_H_ 2 | #define _QRDECOMP_H_ 3 | 4 | void qrdecompMGS(int rows, int cols, float * da, float * dq, float * dr, 5 | int * pivots); 6 | void getQRDecomp(int rows, int cols, float * dq, float * da, int * pivot); 7 | void qrSolver(int rows, int cols, float * matX, float * vectY, float * vectB); 8 | void getQRDecompRR(int rows, int cols, double tol, float * dQR, 9 | int * pivot, double * qrAux, int * rank); 10 | void getQRDecompBlocked(int rows, int cols, double tol, float * dQR, 11 | int blockSize, int rowsUnblocked, int * pivot, 12 | double * qrAux, int * rank); 13 | 14 | void getInverseFromQR(int rows, int cols, const float * dQ, const float * dR, 15 | float * dInverse); 16 | void solveFromQR(int rows, int cols, const float * matQ, const float * matR, 17 | const float * vectY, float * vectB); 18 | 19 | #endif /* _QRDECOMP_H_ */ 20 | -------------------------------------------------------------------------------- /src/rinterface.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "cublas.h" 5 | 6 | #include "R.h" 7 | #include "Rinternals.h" 8 | #include "R_ext/Rdynload.h" 9 | 10 | #include "correlation.h" 11 | #include "cuseful.h" 12 | #include "distance.h" 13 | #include "granger.h" 14 | #include "hcluster.h" 15 | #include 
"kendall.h" 16 | #include "lsfit.h" 17 | #include "matmult.h" 18 | #include "mi.h" 19 | #include "qrdecomp.h" 20 | 21 | #include "rinterface.h" 22 | 23 | void R_init_mylib(DllInfo *info) { 24 | R_CallMethodDef callMethods[] = { 25 | {"gpuMatMult", (DL_FUNC) &gpuMatMult, 2}, 26 | {NULL, NULL, 0} 27 | }; 28 | R_registerRoutines(info, NULL, callMethods, NULL, NULL); 29 | } 30 | 31 | // whichObs = 0 means everything 32 | // whichObs = 1 means pairwiseComplete 33 | void rpmcc(const int * whichObs, 34 | const float * samplesA, const int * numSamplesA, 35 | const float * samplesB, const int * numSamplesB, 36 | const int * sampleSize, 37 | float * numPairs, float * correlations, float * signifs) 38 | { 39 | UseObs myObs; 40 | switch(*whichObs) { 41 | case 0: 42 | myObs = everything; 43 | break; 44 | case 1: 45 | myObs = pairwiseComplete; 46 | break; 47 | default: 48 | fatal("unknown use method"); 49 | } 50 | pmcc(myObs, 51 | samplesA, *numSamplesA, 52 | samplesB, *numSamplesB, 53 | *sampleSize, 54 | numPairs, correlations, signifs); 55 | } 56 | 57 | void rformatInput(const int * images, 58 | const int * xcoords, const int * ycoords, const int * zcoords, 59 | const int * mins, const int * maxes, 60 | const float * evs, const int * numrows, const int * numimages, 61 | float * output) 62 | { 63 | getData(images, xcoords, ycoords, zcoords, mins, maxes, evs, 64 | *numrows, *numimages, output); 65 | } 66 | 67 | void rformatOutput(const int * imageList1, const int * numImages1, 68 | const int * imageList2, const int * numImages2, 69 | const int * structureid, 70 | const double * cutCorrelation, const int * cutPairs, 71 | const double * correlations, const double * signifs, const int * numPairs, 72 | double * results, int * nrows) 73 | { 74 | *nrows = (int) parseResults(imageList1, *numImages1, imageList2, 75 | *numImages2, *structureid, *cutCorrelation, *cutPairs, 76 | correlations, signifs, numPairs, results); 77 | } 78 | 79 | void rsetDevice(const int * device) { 80 | 
setDevice(*device); 81 | } 82 | 83 | void rgetDevice(int * device) { 84 | getDevice(device); 85 | } 86 | 87 | void rtestT(const float * pairs, const float * coeffs, const int * n, 88 | float * ts) 89 | { 90 | testSignif(pairs, coeffs, (size_t) *n, ts); 91 | } 92 | 93 | void rhostT(const float * pairs, const float * coeffs, const int * n, 94 | float * ts) 95 | { 96 | hostSignif(pairs, coeffs, (size_t) *n, ts); 97 | } 98 | 99 | void rSignifFilter(const double * data, int * rows, double * results) { 100 | *rows = signifFilter(data, (size_t) *rows, results); 101 | } 102 | 103 | void gSignifFilter(const float * data, int * rows, float * results) { 104 | *rows = gpuSignifFilter(data, (size_t) *rows, results); 105 | } 106 | 107 | void RcublasPMCC(const float * samplesA, const int * numSamplesA, 108 | const float * samplesB, const int * numSamplesB, 109 | const int * sampleSize, 110 | float * correlations) 111 | { 112 | cublasPMCC(samplesA, *numSamplesA, samplesB, *numSamplesB, *sampleSize, 113 | correlations); 114 | } 115 | 116 | void RhostKendall(const float * X, const float * Y, const int * n, 117 | double * answer) 118 | { 119 | *answer = hostKendall(X, Y, *n); 120 | } 121 | 122 | void RpermHostKendall(const float * X, const int * nx, const float * Y, 123 | const int * ny, const int * sampleSize, double * answers) 124 | { 125 | permHostKendall(X, *nx, Y, *ny, *sampleSize, answers); 126 | } 127 | 128 | void RgpuKendall(const float * X, const int * nx, const float * Y, 129 | const int * ny, const int * sampleSize, double * answers) 130 | { 131 | masterKendall(X, *nx, Y, *ny, *sampleSize, answers); 132 | } 133 | 134 | void rgpuGranger(const int * rows, const int * cols, const float * y, 135 | const int * p, float * fStats, float * pValues) 136 | { 137 | granger(*rows, *cols, y, *p, fStats, pValues); 138 | } 139 | 140 | void rgpuGrangerXY(const int * rows, const int * colsx, const float * x, 141 | const int * colsy, const float * y, const int * p, 142 | float * fStats, 
float * pValues) 143 | { 144 | grangerxy(*rows, *colsx, x, *colsy, y, *p, fStats, pValues); 145 | } 146 | 147 | dist_method getDistEnum(const char * methodStr) 148 | { 149 | if(0 == strcmp(methodStr,"maximum")) return MAXIMUM; 150 | if(0 == strcmp(methodStr,"manhattan")) return MANHATTAN; 151 | if(0 == strcmp(methodStr,"canberra")) return CANBERRA; 152 | if(0 == strcmp(methodStr,"binary")) return BINARY; 153 | if(0 == strcmp(methodStr,"minkowski")) return MINKOWSKI; 154 | // if(0 == strcmp(methodStr,"dot")) return DOT; 155 | return EUCLIDEAN; 156 | } 157 | 158 | hc_method getClusterEnum(const char * methodStr) 159 | { 160 | if(0 == strcmp(methodStr,"complete")) return COMPLETE; 161 | if(0 == strcmp(methodStr,"wpgma")) return WPGMA; 162 | if(0 == strcmp(methodStr,"average")) return AVERAGE; 163 | if(0 == strcmp(methodStr,"median")) return MEDIAN; 164 | if(0 == strcmp(methodStr,"centroid")) return CENTROID; 165 | if(0 == strcmp(methodStr,"flexible_group")) return FLEXIBLE_GROUP; 166 | if(0 == strcmp(methodStr,"flexible")) return FLEXIBLE; 167 | if(0 == strcmp(methodStr,"ward")) return WARD; 168 | if(0 == strcmp(methodStr,"mcquitty")) return MCQUITTY; 169 | return SINGLE; 170 | } 171 | 172 | void Rdistclust(const char ** distmethod, const char ** clustmethod, 173 | const float * points, const int * numPoints, const int * dim, 174 | int * merge, int * order, float * val) 175 | { 176 | dist_method dmeth = getDistEnum(*distmethod); 177 | hc_method hcmeth = getClusterEnum(*clustmethod); 178 | 179 | size_t dpitch = 0; 180 | float * gpuDistances = NULL; 181 | 182 | distanceLeaveOnGpu(dmeth, 2.f, points, *dim, *numPoints, 183 | &gpuDistances, &dpitch); 184 | 185 | size_t len = (*numPoints) - 1; 186 | float 187 | lambda = 0.5f, beta = 0.5f; 188 | int 189 | * presub, * presup; 190 | 191 | presub = Calloc(len, int); 192 | presup = Calloc(len, int); 193 | 194 | hclusterPreparedDistances(gpuDistances, dpitch, *numPoints, 195 | presub, presup, 196 | val, 197 | hcmeth, 198 | 
lambda, beta); 199 | 200 | formatClustering(len, presub, presup, merge, order); 201 | 202 | Free(presub); 203 | Free(presup); 204 | } 205 | 206 | void Rdistances(const float * points, const int * numPoints, const int * dim, 207 | float * distances, const char ** method, const float *p) 208 | { 209 | dist_method nummethod = getDistEnum(*method); 210 | 211 | distance(points, (*dim)*sizeof(float), *numPoints, points, 212 | (*dim)*sizeof(float), *numPoints, *dim, distances, 213 | (*numPoints)*sizeof(float), nummethod, *p); 214 | } 215 | 216 | void Rhcluster(const float * distMat, const int * numPoints, 217 | int * merge, int * order, float * val, 218 | const char ** method) 219 | { 220 | hc_method nummethod = getClusterEnum(*method); 221 | 222 | size_t len = (*numPoints) - 1; 223 | size_t pitch = (*numPoints) * sizeof(float); 224 | float lambda = 0.5; 225 | float beta = 0.5; 226 | int 227 | * presub, * presup; 228 | 229 | presub = Calloc(len, int); 230 | presup = Calloc(len, int); 231 | 232 | hcluster(distMat, pitch, *numPoints, presub, presup, val, nummethod, 233 | lambda, beta); 234 | 235 | formatClustering(len, presub, presup, merge, order); 236 | 237 | Free(presub); 238 | Free(presup); 239 | } 240 | 241 | void formatClustering(const int len, const int * sub, const int * sup, 242 | int * merge, int * order) 243 | { 244 | for(size_t i = 0; i < len; i++) { 245 | merge[i] = -(sub[i] + 1); 246 | merge[i+len] = -(sup[i] + 1); 247 | } 248 | 249 | for(size_t i = 0; i < len; i++) { 250 | for(size_t j = i+1; j < len; j++) { 251 | if((merge[j] == merge[i]) || (merge[j] == merge[i+len])) 252 | merge[j] = i + 1; 253 | if((merge[j+len] == merge[i]) || (merge[j+len] == merge[i+len])) 254 | merge[j+len] = i + 1; 255 | if(((merge[j+len] < 0) && (merge[j] > 0)) 256 | || ((merge[j] > 0) && (merge[j+len] > 0) 257 | && (merge[j] > merge[j+len]))) { 258 | int holder = merge[j]; 259 | merge[j] = merge[j+len]; 260 | merge[j+len] = holder; 261 | } 262 | } 263 | } 264 | getPrintOrder(len, 
merge, order); 265 | } 266 | 267 | void getPrintOrder(const int len, const int * merge, int * order) 268 | { 269 | int 270 | level = len-1, otop = len; 271 | 272 | depthFirst(len, merge, level, &otop, order); 273 | } 274 | 275 | void depthFirst(const int len, const int * merge, int level, int * otop, 276 | int * order) 277 | { 278 | int 279 | left = level, right = level + len; 280 | 281 | if(merge[right] < 0) { 282 | order[*otop] = -merge[right]; 283 | (*otop)--; 284 | } else 285 | depthFirst(len, merge, merge[right]-1, otop, order); 286 | 287 | if(merge[left] < 0) { 288 | order[*otop] = -merge[left]; 289 | (*otop)--; 290 | } else 291 | depthFirst(len, merge, merge[left]-1, otop, order); 292 | } 293 | 294 | void RgetQRDecomp(int * rows, int * cols, float * a, float * q, int * pivot, 295 | int * rank) 296 | { 297 | 298 | int 299 | fbytes = sizeof(float), 300 | m = *rows, n = *cols; 301 | float 302 | * da, * dq; 303 | 304 | cublasAlloc(m*n, fbytes, (void **)&da); 305 | cublasAlloc(m*m, fbytes, (void **)&dq); 306 | cublasSetMatrix(m, n, fbytes, a, m, da, m); 307 | 308 | getQRDecomp(m, n, dq, da, pivot); 309 | 310 | cublasGetMatrix(m, n, fbytes, da, m, a, m); 311 | cublasGetMatrix(m, m, fbytes, dq, m, q, m); 312 | cublasFree(da); 313 | cublasFree(dq); 314 | 315 | int foundZero = 0; 316 | for(int i = 0; (i < m) && (i < n); i++) { 317 | if((a[i+i*m] < 0.0001f) && (a[i+i*m] > -0.0001f)) { 318 | foundZero = 1; 319 | *rank = i+1; 320 | break; 321 | } 322 | } 323 | if(!foundZero) { 324 | if(m > n) *rank = n; 325 | else *rank = m; 326 | } 327 | } 328 | 329 | // solve for B: XB=Y where B and Y are vectors and X is a matrix of 330 | // dimension rows x cols 331 | void RqrSolver(int * rows, int * cols, float * matX, float * vectY, 332 | float * vectB) 333 | { 334 | int 335 | fbytes = sizeof(float), 336 | m = *rows, n = *cols; 337 | float 338 | * dX, * dY, * dB; 339 | 340 | cublasAlloc(m*n, fbytes, (void **)&dX); 341 | cublasAlloc(n, fbytes, (void **)&dB); 342 | cublasAlloc(m, 
fbytes, (void **)&dY); 343 | checkCublasError("RqrSolver: line 80"); 344 | 345 | cublasSetMatrix(m, n, fbytes, matX, m, dX, m); 346 | cublasSetVector(m, fbytes, vectY, 1, dY, 1); 347 | checkCublasError("RqrSolver: line 84"); 348 | 349 | qrSolver(m, n, dX, dY, dB); 350 | 351 | cublasFree(dX); 352 | cublasFree(dY); 353 | 354 | cublasGetVector(n, fbytes, dB, 1, vectB, 1); 355 | checkCublasError("RqrSolver: line 93"); 356 | 357 | cublasFree(dB); 358 | } 359 | 360 | void rGetQRDecompRR(const int * rows, const int * cols, 361 | const double * tol, float * x, int * pivot, 362 | double * qraux, int * rank) 363 | { 364 | float * dQR; 365 | cudaMalloc((void **) &dQR, (*rows) * (*cols) * sizeof(float)); 366 | checkCudaError("rGetQRDecompRR:"); 367 | 368 | cudaMemcpy(dQR, x, (*rows) * (*cols) * sizeof(float), 369 | cudaMemcpyHostToDevice); 370 | 371 | getQRDecompRR(*rows, *cols, *tol, dQR, pivot, qraux, rank); 372 | 373 | cudaMemcpy(x, dQR, (*rows) * (*cols) * sizeof(float), 374 | cudaMemcpyDeviceToHost); 375 | checkCudaError("rGetQRDecompRR:"); 376 | cudaFree(dQR); 377 | checkCudaError("rGetQRDecompRR:"); 378 | } 379 | 380 | void rGetInverseFromQR(const int * rows, const int * cols, 381 | const float * q, const float * r, 382 | float * inverse) 383 | { 384 | float 385 | * dQ, * dR, * dInverse; 386 | 387 | cudaMalloc((void **) &dQ, (*rows) * (*cols) * sizeof(float)); 388 | cudaMalloc((void **) &dR, (*cols) * (*cols) * sizeof(float)); 389 | cudaMalloc((void **) &dInverse, (*rows) * (*cols) * sizeof(float)); 390 | checkCudaError("rGetInverseFromQR:"); 391 | 392 | cudaMemcpy(dQ, q, (*rows) * (*cols) * sizeof(float), 393 | cudaMemcpyHostToDevice); 394 | cudaMemcpy(dR, r, (*cols) * (*cols) * sizeof(float), 395 | cudaMemcpyHostToDevice); 396 | 397 | getInverseFromQR(*rows, *cols, dQ, dR, dInverse); 398 | 399 | cudaFree(dQ); 400 | cudaFree(dR); 401 | 402 | cudaMemcpy(inverse, dInverse, (*rows) * (*cols) * sizeof(float), 403 | cudaMemcpyDeviceToHost); 404 | 
checkCudaError("rGetInverseFromQR:"); 405 | 406 | cudaFree(dInverse); 407 | } 408 | 409 | void rSolveFromQR(const int * rows, const int * cols, const float * q, const float * r, 410 | const float * y, float * b) 411 | { 412 | solveFromQR(*rows, *cols, q, r, y, b); 413 | } 414 | 415 | void rBSplineMutualInfo(int * nBins, int * splineOrder, int * nsamples, 416 | int * rowsA, const float * A, 417 | int * rowsB, const float * B, 418 | float * mutualInfo) 419 | { 420 | bSplineMutualInfo(*nBins, *splineOrder, *nsamples, *rowsA, A, *rowsB, B, 421 | mutualInfo); 422 | } 423 | 424 | // Interface for R functions requiring least-squares computations. 425 | // 426 | void RgpuLSFit(float *X, int *n, int *p, float *Y, int *nY, 427 | double *tol, float *coeffs, float *resids, float *effects, 428 | int *rank, int *pivot, double * qrAux, int useSingle) 429 | { 430 | if (useSingle) { 431 | gpuLSFitF(X, *n, *p, Y, *nY, *tol, coeffs, resids, effects, 432 | rank, pivot, qrAux); 433 | } 434 | else { 435 | // gpuLSFitD(X, *n, *p, Y, *nY, *tol, coeffs, resids, effects, rank, pivot, qrAux); 436 | } 437 | } 438 | -------------------------------------------------------------------------------- /src/rinterface.h: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | // whichObs = 0 means everything 3 | // whichObs = 1 means pairwiseComplete 4 | void rpmcc(const int * whichObs, 5 | const float * samplesA, const int * numSamplesA, 6 | const float * samplesB, const int * numSamplesB, 7 | const int * sampleSize, float * numPairs, 8 | float * correlations, float * signifs); 9 | 10 | void rformatInput(const int * images, 11 | const int * xcoords, const int * ycoords, const int * zcoords, 12 | const int * mins, const int * maxes, 13 | const float * evs, const int * numrows, const int * numimages, 14 | float * output); 15 | 16 | void rformatOutput(const int * imageList1, const int * numImages1, 17 | const int * imageList2, const int * numImages2, 18 | 
const int * structureid, 19 | const double * cutCorrelation, const int * cutPairs, 20 | const double * correlations, const double * signifs, 21 | const int * numPairs, double * results, int * nrows); 22 | 23 | void rsetDevice(const int * device); 24 | void rgetDevice(int * device); 25 | 26 | void rtestT(const float * pairs, const float * coeffs, const int * n, 27 | float * ts, const char ** kernelSrc); 28 | void rhostT(const float * pairs, const float * coeffs, const int * n, 29 | float * ts); 30 | void rSignifFilter(const double * data, int * rows, double * results); 31 | void gSignifFilter(const float * data, int * rows, float * results); 32 | 33 | void RcublasPMCC(const float * samplesA, const int * numSamplesA, 34 | const float * samplesB, const int * numSamplesB, 35 | const int * sampleSize, float * correlations); 36 | 37 | void RhostKendall(const float * X, const float * Y, const int * n, 38 | double * answer); 39 | void RpermHostKendall(const float * X, const int * nx, const float * Y, 40 | const int * ny, const int * sampleSize, double * answers); 41 | void RgpuKendall(const float * X, const int * nx, const float * Y, 42 | const int * ny, const int * sampleSize, double * answers); 43 | 44 | void rgpuGranger(const int * rows, const int * colsy, const float * y, 45 | const int * p, float * fStats, float * pValues); 46 | void rgpuGrangerXY(const int * rows, const int * colsx, const float * x, 47 | const int * colsy, const float * y, const int * p, 48 | float * fStats, float * pValues); 49 | 50 | void Rdistclust(const char ** distmethod, const char ** clustmethod, 51 | const float * points, const int * numPoints, const int * dim, 52 | int * merge, int * order, float * val); 53 | void Rdistances(const float * points, const int * numPoints, 54 | const int * dim, float * distances, const char ** method, 55 | const float * p); 56 | void Rhcluster(const float * distMat, const int * numPoints, 57 | int * merge, int * order, float * val, const char ** method); 58 | 59 
| void RgetQRDecomp(int * rows, int * cols, float * a, float * q, int * pivot, 60 | int * rank); 61 | void RqrSolver(int * rows, int * cols, float * matX, float * vectY, 62 | float * vectB); 63 | 64 | void rGetQRDecompRR(const int * rows, const int * cols, 65 | const double * tol, float * x, int * pivot, 66 | double * qraux, int * rank); 67 | 68 | void rGetInverseFromQR(const int * rows, const int * cols, const float * q, 69 | const float * r, float * inverse); 70 | void rSolveFromQR(const int * rows, const int * cols, const float * q, 71 | const float * r, const float * y, float * b); 72 | 73 | void rBSplineMutualInfo(int * nBins, int * splineOrder, int * nsamples, 74 | int * rowsA, const float * A, 75 | int * rowsB, const float * B, 76 | float * mutualInfo); 77 | 78 | void RgpuLSFit(float *X, int *n, int *p, float *Y, int *nY, 79 | double *tol, float *coeffs, float *resids, float *effects, 80 | int *rank, int *pivot, double * qrAux, int useSingle); 81 | 82 | void setDevice(int * device); 83 | } 84 | 85 | void formatClustering(const int len, const int * sub, const int * sup, 86 | int * merge, int * order); 87 | void getPrintOrder(const int len, const int * merge, int * order); 88 | void depthFirst(const int len, const int * merge, int level, int * otop, 89 | int * order); 90 | -------------------------------------------------------------------------------- /src/sort.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include"cuseful.h" 5 | #include"sort.h" 6 | 7 | void copyVect(int n, double * from, int incFrom, double * to, int incTo) 8 | { 9 | for(int i = 0; i < n; i++) 10 | to[i*incTo] = from[i*incFrom]; 11 | } 12 | 13 | //Swap rows in a col major array 14 | void swapRows(int rows, int cols, double * array, int rowA, int rowB) 15 | { 16 | double * tempRow = Calloc(cols, double); 17 | 18 | copyVect(cols, array+rowA, rows, tempRow, 1); 19 | copyVect(cols, array+rowB, rows, array+rowA, rows); 20 
| copyVect(cols, tempRow, 1, array+rowB, rows); 21 | Free(tempRow); 22 | } 23 | 24 | //Find the index of the Median of the elements 25 | //of array that occur at every "shift" positions. 26 | int findMedianIndex(int rows, int cols, int colToSortOn, double * array, 27 | int left, int right, int shift) 28 | { 29 | int 30 | i, 31 | groups = (right - left)/shift + 1, 32 | k = left + groups/2*shift; 33 | 34 | double 35 | * colToSort = array+colToSortOn*rows; 36 | 37 | for(i = left; i <= k; i += shift) { 38 | 39 | int 40 | minRow = i; 41 | double 42 | minValue = colToSort[minRow]; 43 | 44 | for(int j = i; j <= right; j +=shift) { 45 | if(colToSort[j] < minValue) { 46 | minRow = j; 47 | minValue = colToSort[minRow]; 48 | } 49 | } 50 | swapRows(rows, cols, array, i, minRow); 51 | } 52 | return k; 53 | } 54 | 55 | //Computes the median of each group of 5 elements and stores 56 | //it as the first element of the group. Recursively does this 57 | //till there is only one group and hence only one Median 58 | double findMedianOfMedians(int rows, int cols, int colToSortOn, double * array, 59 | int left, int right) 60 | { 61 | double * colToSort = array+colToSortOn*rows; 62 | if(left == right) 63 | return colToSort[left]; 64 | 65 | int i, shift = 1; 66 | while(shift <= (right - left)) { 67 | for(i = left; i <= right; i+=shift*5) { 68 | int endIndex = (i + shift*5 - 1 < right) ? 
i + shift*5 - 1 : right; 69 | 70 | int medianIndex = findMedianIndex(rows, cols, colToSortOn, array, 71 | i, endIndex, shift); 72 | 73 | swapRows(rows, cols, array, i, medianIndex); 74 | } 75 | shift *= 5; 76 | } 77 | return colToSort[left]; 78 | } 79 | 80 | //Partition the array into two halves and return the 81 | //index about which the array is partitioned 82 | int partition(int rows, int cols, int colToSortOn, double * array, 83 | int left, int right) 84 | { 85 | //Makes the leftmost element a good pivot, 86 | //specifically the median of medians 87 | findMedianOfMedians(rows, cols, colToSortOn, array, left, right); 88 | 89 | int 90 | pivotIndex = left, index = left, 91 | i; 92 | double 93 | * colToSort = array+colToSortOn*rows, 94 | pivotValue = colToSort[pivotIndex]; 95 | 96 | swapRows(rows, cols, array, pivotIndex, right); 97 | 98 | for(i = left; i < right; i++) { 99 | if(colToSort[i] < pivotValue) { 100 | swapRows(rows, cols, array, i, index); 101 | index += 1; 102 | } 103 | } 104 | swapRows(rows, cols, array, right, index); 105 | return index; 106 | } 107 | 108 | //Quicksort the array 109 | void quicksort(int rows, int cols, int colToSortOn, double * array, 110 | int left, int right) 111 | { 112 | if(left >= right) 113 | return; 114 | 115 | int index = partition(rows, cols, colToSortOn, array, left, right); 116 | quicksort(rows, cols, colToSortOn, array, left, index - 1); 117 | quicksort(rows, cols, colToSortOn, array, index + 1, right); 118 | } 119 | -------------------------------------------------------------------------------- /src/sort.h: -------------------------------------------------------------------------------- 1 | void quicksort(int rows, int cols, int colToSortOn, double * array, 2 | int left, int right); 3 | -------------------------------------------------------------------------------- /tools/mdate-sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Get modification time of a file or 
directory and pretty-print it. 3 | 4 | scriptversion=2007-03-30.02 5 | 6 | # Copyright (C) 1995, 1996, 1997, 2003, 2004, 2005, 2007 Free Software 7 | # Foundation, Inc. 8 | # written by Ulrich Drepper , June 1995 9 | # 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program; if not, write to the Free Software Foundation, 22 | # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 23 | 24 | # As a special exception to the GNU General Public License, if you 25 | # distribute this file as part of a program that contains a 26 | # configuration script generated by Autoconf, you may include it under 27 | # the same distribution terms that you use for the rest of that program. 28 | 29 | # This file is maintained in Automake, please report 30 | # bugs to or send patches to 31 | # . 32 | 33 | case $1 in 34 | '') 35 | echo "$0: No file. Try \`$0 --help' for more information." 1>&2 36 | exit 1; 37 | ;; 38 | -h | --h*) 39 | cat <<\EOF 40 | Usage: mdate-sh [--help] [--version] FILE 41 | 42 | Pretty-print the modification time of FILE. 43 | 44 | Report bugs to . 45 | EOF 46 | exit $? 47 | ;; 48 | -v | --v*) 49 | echo "mdate-sh $scriptversion" 50 | exit $? 51 | ;; 52 | esac 53 | 54 | # Prevent date giving response in another language. 
55 | LANG=C 56 | export LANG 57 | LC_ALL=C 58 | export LC_ALL 59 | LC_TIME=C 60 | export LC_TIME 61 | 62 | # GNU ls changes its time format in response to the TIME_STYLE 63 | # variable. Since we cannot assume `unset' works, revert this 64 | # variable to its documented default. 65 | if test "${TIME_STYLE+set}" = set; then 66 | TIME_STYLE=posix-long-iso 67 | export TIME_STYLE 68 | fi 69 | 70 | save_arg1=$1 71 | 72 | # Find out how to get the extended ls output of a file or directory. 73 | if ls -L /dev/null 1>/dev/null 2>&1; then 74 | ls_command='ls -L -l -d' 75 | else 76 | ls_command='ls -l -d' 77 | fi 78 | # Avoid user/group names that might have spaces, when possible. 79 | if ls -n /dev/null 1>/dev/null 2>&1; then 80 | ls_command="$ls_command -n" 81 | fi 82 | 83 | # A `ls -l' line looks as follows on OS/2. 84 | # drwxrwx--- 0 Aug 11 2001 foo 85 | # This differs from Unix, which adds ownership information. 86 | # drwxrwx--- 2 root root 4096 Aug 11 2001 foo 87 | # 88 | # To find the date, we split the line on spaces and iterate on words 89 | # until we find a month. This cannot work with files whose owner is a 90 | # user named `Jan', or `Feb', etc. However, it's unlikely that `/' 91 | # will be owned by a user whose name is a month. So we first look at 92 | # the extended ls output of the root directory to decide how many 93 | # words should be skipped to get the date. 94 | 95 | # On HPUX /bin/sh, "set" interprets "-rw-r--r--" as options, so the "x" below. 96 | set x`$ls_command /` 97 | 98 | # Find which argument is the month. 99 | month= 100 | command= 101 | until test $month 102 | do 103 | shift 104 | # Add another shift to the command. 
105 | command="$command shift;" 106 | case $1 in 107 | Jan) month=January; nummonth=1;; 108 | Feb) month=February; nummonth=2;; 109 | Mar) month=March; nummonth=3;; 110 | Apr) month=April; nummonth=4;; 111 | May) month=May; nummonth=5;; 112 | Jun) month=June; nummonth=6;; 113 | Jul) month=July; nummonth=7;; 114 | Aug) month=August; nummonth=8;; 115 | Sep) month=September; nummonth=9;; 116 | Oct) month=October; nummonth=10;; 117 | Nov) month=November; nummonth=11;; 118 | Dec) month=December; nummonth=12;; 119 | esac 120 | done 121 | 122 | # Get the extended ls output of the file or directory. 123 | set dummy x`eval "$ls_command \"\$save_arg1\""` 124 | 125 | # Remove all preceding arguments 126 | eval $command 127 | 128 | # Because of the dummy argument above, month is in $2. 129 | # 130 | # On a POSIX system, we should have 131 | # 132 | # $# = 5 133 | # $1 = file size 134 | # $2 = month 135 | # $3 = day 136 | # $4 = year or time 137 | # $5 = filename 138 | # 139 | # On Darwin 7.7.0 and 7.6.0, we have 140 | # 141 | # $# = 4 142 | # $1 = day 143 | # $2 = month 144 | # $3 = year or time 145 | # $4 = filename 146 | 147 | # Get the month. 148 | case $2 in 149 | Jan) month=January; nummonth=1;; 150 | Feb) month=February; nummonth=2;; 151 | Mar) month=March; nummonth=3;; 152 | Apr) month=April; nummonth=4;; 153 | May) month=May; nummonth=5;; 154 | Jun) month=June; nummonth=6;; 155 | Jul) month=July; nummonth=7;; 156 | Aug) month=August; nummonth=8;; 157 | Sep) month=September; nummonth=9;; 158 | Oct) month=October; nummonth=10;; 159 | Nov) month=November; nummonth=11;; 160 | Dec) month=December; nummonth=12;; 161 | esac 162 | 163 | case $3 in 164 | ???*) day=$1;; 165 | *) day=$3; shift;; 166 | esac 167 | 168 | # Here we have to deal with the problem that the ls output gives either 169 | # the time of day or the year. 
170 | case $3 in 171 | *:*) set `date`; eval year=\$$# 172 | case $2 in 173 | Jan) nummonthtod=1;; 174 | Feb) nummonthtod=2;; 175 | Mar) nummonthtod=3;; 176 | Apr) nummonthtod=4;; 177 | May) nummonthtod=5;; 178 | Jun) nummonthtod=6;; 179 | Jul) nummonthtod=7;; 180 | Aug) nummonthtod=8;; 181 | Sep) nummonthtod=9;; 182 | Oct) nummonthtod=10;; 183 | Nov) nummonthtod=11;; 184 | Dec) nummonthtod=12;; 185 | esac 186 | # For the first six month of the year the time notation can also 187 | # be used for files modified in the last year. 188 | if (expr $nummonth \> $nummonthtod) > /dev/null; 189 | then 190 | year=`expr $year - 1` 191 | fi;; 192 | *) year=$3;; 193 | esac 194 | 195 | # The result. 196 | echo $day $month $year 197 | 198 | # Local Variables: 199 | # mode: shell-script 200 | # sh-indentation: 2 201 | # eval: (add-hook 'write-file-hooks 'time-stamp) 202 | # time-stamp-start: "scriptversion=" 203 | # time-stamp-format: "%:y-%02m-%02d.%02H" 204 | # time-stamp-end: "$" 205 | # End: 206 | -------------------------------------------------------------------------------- /tools/missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common stub for a few missing GNU programs while installing. 3 | 4 | scriptversion=2006-05-10.23 5 | 6 | # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006 7 | # Free Software Foundation, Inc. 8 | # Originally by Fran,cois Pinard , 1996. 9 | 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 
19 | 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program; if not, write to the Free Software 22 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 23 | # 02110-1301, USA. 24 | 25 | # As a special exception to the GNU General Public License, if you 26 | # distribute this file as part of a program that contains a 27 | # configuration script generated by Autoconf, you may include it under 28 | # the same distribution terms that you use for the rest of that program. 29 | 30 | if test $# -eq 0; then 31 | echo 1>&2 "Try \`$0 --help' for more information" 32 | exit 1 33 | fi 34 | 35 | run=: 36 | sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' 37 | sed_minuso='s/.* -o \([^ ]*\).*/\1/p' 38 | 39 | # In the cases where this matters, `missing' is being run in the 40 | # srcdir already. 41 | if test -f configure.ac; then 42 | configure_ac=configure.ac 43 | else 44 | configure_ac=configure.in 45 | fi 46 | 47 | msg="missing on your system" 48 | 49 | case $1 in 50 | --run) 51 | # Try to run requested program, and just exit if it succeeds. 52 | run= 53 | shift 54 | "$@" && exit 0 55 | # Exit code 63 means version mismatch. This often happens 56 | # when the user try to use an ancient version of a tool on 57 | # a file that requires a minimum version. In this case we 58 | # we should proceed has if the program had been absent, or 59 | # if --run hadn't been passed. 60 | if test $? = 63; then 61 | run=: 62 | msg="probably too old" 63 | fi 64 | ;; 65 | 66 | -h|--h|--he|--hel|--help) 67 | echo "\ 68 | $0 [OPTION]... PROGRAM [ARGUMENT]... 69 | 70 | Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an 71 | error status if there is no known handling for PROGRAM. 
72 | 73 | Options: 74 | -h, --help display this help and exit 75 | -v, --version output version information and exit 76 | --run try to run the given command, and emulate it if it fails 77 | 78 | Supported PROGRAM values: 79 | aclocal touch file \`aclocal.m4' 80 | autoconf touch file \`configure' 81 | autoheader touch file \`config.h.in' 82 | autom4te touch the output file, or create a stub one 83 | automake touch all \`Makefile.in' files 84 | bison create \`y.tab.[ch]', if possible, from existing .[ch] 85 | flex create \`lex.yy.c', if possible, from existing .c 86 | help2man touch the output file 87 | lex create \`lex.yy.c', if possible, from existing .c 88 | makeinfo touch the output file 89 | tar try tar, gnutar, gtar, then tar without non-portable flags 90 | yacc create \`y.tab.[ch]', if possible, from existing .[ch] 91 | 92 | Send bug reports to ." 93 | exit $? 94 | ;; 95 | 96 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 97 | echo "missing $scriptversion (GNU Automake)" 98 | exit $? 99 | ;; 100 | 101 | -*) 102 | echo 1>&2 "$0: Unknown \`$1' option" 103 | echo 1>&2 "Try \`$0 --help' for more information" 104 | exit 1 105 | ;; 106 | 107 | esac 108 | 109 | # Now exit if we have it, but it failed. Also exit now if we 110 | # don't have it and --version was passed (most likely to detect 111 | # the program). 112 | case $1 in 113 | lex|yacc) 114 | # Not GNU programs, they don't have --version. 115 | ;; 116 | 117 | tar) 118 | if test -n "$run"; then 119 | echo 1>&2 "ERROR: \`tar' requires --run" 120 | exit 1 121 | elif test "x$2" = "x--version" || test "x$2" = "x--help"; then 122 | exit 1 123 | fi 124 | ;; 125 | 126 | *) 127 | if test -z "$run" && ($1 --version) > /dev/null 2>&1; then 128 | # We have it, but it failed. 129 | exit 1 130 | elif test "x$2" = "x--version" || test "x$2" = "x--help"; then 131 | # Could not run --version or --help. 
This is probably someone 132 | # running `$TOOL --version' or `$TOOL --help' to check whether 133 | # $TOOL exists and not knowing $TOOL uses missing. 134 | exit 1 135 | fi 136 | ;; 137 | esac 138 | 139 | # If it does not exist, or fails to run (possibly an outdated version), 140 | # try to emulate it. 141 | case $1 in 142 | aclocal*) 143 | echo 1>&2 "\ 144 | WARNING: \`$1' is $msg. You should only need it if 145 | you modified \`acinclude.m4' or \`${configure_ac}'. You might want 146 | to install the \`Automake' and \`Perl' packages. Grab them from 147 | any GNU archive site." 148 | touch aclocal.m4 149 | ;; 150 | 151 | autoconf) 152 | echo 1>&2 "\ 153 | WARNING: \`$1' is $msg. You should only need it if 154 | you modified \`${configure_ac}'. You might want to install the 155 | \`Autoconf' and \`GNU m4' packages. Grab them from any GNU 156 | archive site." 157 | touch configure 158 | ;; 159 | 160 | autoheader) 161 | echo 1>&2 "\ 162 | WARNING: \`$1' is $msg. You should only need it if 163 | you modified \`acconfig.h' or \`${configure_ac}'. You might want 164 | to install the \`Autoconf' and \`GNU m4' packages. Grab them 165 | from any GNU archive site." 166 | files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` 167 | test -z "$files" && files="config.h" 168 | touch_files= 169 | for f in $files; do 170 | case $f in 171 | *:*) touch_files="$touch_files "`echo "$f" | 172 | sed -e 's/^[^:]*://' -e 's/:.*//'`;; 173 | *) touch_files="$touch_files $f.in";; 174 | esac 175 | done 176 | touch $touch_files 177 | ;; 178 | 179 | automake*) 180 | echo 1>&2 "\ 181 | WARNING: \`$1' is $msg. You should only need it if 182 | you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. 183 | You might want to install the \`Automake' and \`Perl' packages. 184 | Grab them from any GNU archive site." 185 | find . 
-type f -name Makefile.am -print | 186 | sed 's/\.am$/.in/' | 187 | while read f; do touch "$f"; done 188 | ;; 189 | 190 | autom4te) 191 | echo 1>&2 "\ 192 | WARNING: \`$1' is needed, but is $msg. 193 | You might have modified some files without having the 194 | proper tools for further handling them. 195 | You can get \`$1' as part of \`Autoconf' from any GNU 196 | archive site." 197 | 198 | file=`echo "$*" | sed -n "$sed_output"` 199 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 200 | if test -f "$file"; then 201 | touch $file 202 | else 203 | test -z "$file" || exec >$file 204 | echo "#! /bin/sh" 205 | echo "# Created by GNU Automake missing as a replacement of" 206 | echo "# $ $@" 207 | echo "exit 0" 208 | chmod +x $file 209 | exit 1 210 | fi 211 | ;; 212 | 213 | bison|yacc) 214 | echo 1>&2 "\ 215 | WARNING: \`$1' $msg. You should only need it if 216 | you modified a \`.y' file. You may need the \`Bison' package 217 | in order for those modifications to take effect. You can get 218 | \`Bison' from any GNU archive site." 219 | rm -f y.tab.c y.tab.h 220 | if test $# -ne 1; then 221 | eval LASTARG="\${$#}" 222 | case $LASTARG in 223 | *.y) 224 | SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` 225 | if test -f "$SRCFILE"; then 226 | cp "$SRCFILE" y.tab.c 227 | fi 228 | SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` 229 | if test -f "$SRCFILE"; then 230 | cp "$SRCFILE" y.tab.h 231 | fi 232 | ;; 233 | esac 234 | fi 235 | if test ! -f y.tab.h; then 236 | echo >y.tab.h 237 | fi 238 | if test ! -f y.tab.c; then 239 | echo 'main() { return 0; }' >y.tab.c 240 | fi 241 | ;; 242 | 243 | lex|flex) 244 | echo 1>&2 "\ 245 | WARNING: \`$1' is $msg. You should only need it if 246 | you modified a \`.l' file. You may need the \`Flex' package 247 | in order for those modifications to take effect. You can get 248 | \`Flex' from any GNU archive site." 
249 | rm -f lex.yy.c 250 | if test $# -ne 1; then 251 | eval LASTARG="\${$#}" 252 | case $LASTARG in 253 | *.l) 254 | SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` 255 | if test -f "$SRCFILE"; then 256 | cp "$SRCFILE" lex.yy.c 257 | fi 258 | ;; 259 | esac 260 | fi 261 | if test ! -f lex.yy.c; then 262 | echo 'main() { return 0; }' >lex.yy.c 263 | fi 264 | ;; 265 | 266 | help2man) 267 | echo 1>&2 "\ 268 | WARNING: \`$1' is $msg. You should only need it if 269 | you modified a dependency of a manual page. You may need the 270 | \`Help2man' package in order for those modifications to take 271 | effect. You can get \`Help2man' from any GNU archive site." 272 | 273 | file=`echo "$*" | sed -n "$sed_output"` 274 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 275 | if test -f "$file"; then 276 | touch $file 277 | else 278 | test -z "$file" || exec >$file 279 | echo ".ab help2man is required to generate this page" 280 | exit 1 281 | fi 282 | ;; 283 | 284 | makeinfo) 285 | echo 1>&2 "\ 286 | WARNING: \`$1' is $msg. You should only need it if 287 | you modified a \`.texi' or \`.texinfo' file, or any other file 288 | indirectly affecting the aspect of the manual. The spurious 289 | call might also be the consequence of using a buggy \`make' (AIX, 290 | DU, IRIX). You might want to install the \`Texinfo' package or 291 | the \`GNU make' package. Grab either from any GNU archive site." 292 | # The file to touch is that specified with -o ... 293 | file=`echo "$*" | sed -n "$sed_output"` 294 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 295 | if test -z "$file"; then 296 | # ... or it is the one specified with @setfilename ... 297 | infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` 298 | file=`sed -n ' 299 | /^@setfilename/{ 300 | s/.* \([^ ]*\) *$/\1/ 301 | p 302 | q 303 | }' $infile` 304 | # ... 
or it is derived from the source name (dir/f.texi becomes f.info) 305 | test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info 306 | fi 307 | # If the file does not exist, the user really needs makeinfo; 308 | # let's fail without touching anything. 309 | test -f $file || exit 1 310 | touch $file 311 | ;; 312 | 313 | tar) 314 | shift 315 | 316 | # We have already tried tar in the generic part. 317 | # Look for gnutar/gtar before invocation to avoid ugly error 318 | # messages. 319 | if (gnutar --version > /dev/null 2>&1); then 320 | gnutar "$@" && exit 0 321 | fi 322 | if (gtar --version > /dev/null 2>&1); then 323 | gtar "$@" && exit 0 324 | fi 325 | firstarg="$1" 326 | if shift; then 327 | case $firstarg in 328 | *o*) 329 | firstarg=`echo "$firstarg" | sed s/o//` 330 | tar "$firstarg" "$@" && exit 0 331 | ;; 332 | esac 333 | case $firstarg in 334 | *h*) 335 | firstarg=`echo "$firstarg" | sed s/h//` 336 | tar "$firstarg" "$@" && exit 0 337 | ;; 338 | esac 339 | fi 340 | 341 | echo 1>&2 "\ 342 | WARNING: I can't seem to be able to run \`tar' with the given arguments. 343 | You may want to install GNU tar or Free paxutils, or check the 344 | command line arguments." 345 | exit 1 346 | ;; 347 | 348 | *) 349 | echo 1>&2 "\ 350 | WARNING: \`$1' is needed, and is $msg. 351 | You might have modified some files without having the 352 | proper tools for further handling them. Check the \`README' file, 353 | it often tells you about the needed prerequisites for installing 354 | this package. You may also peek at any GNU archive site, in case 355 | some other package would contain this missing \`$1' program." 
356 | exit 1 357 | ;; 358 | esac 359 | 360 | exit 0 361 | 362 | # Local variables: 363 | # eval: (add-hook 'write-file-hooks 'time-stamp) 364 | # time-stamp-start: "scriptversion=" 365 | # time-stamp-format: "%:y-%02m-%02d.%02H" 366 | # time-stamp-end: "$" 367 | # End: 368 | --------------------------------------------------------------------------------