├── .Rbuildignore
├── .lintr
├── tests
│   ├── pair.R
│   └── tests.R
├── NAMESPACE
├── man
│   ├── longrun.Rd
│   ├── two_seven.Rd
│   ├── tdist.Rd
│   └── tcor.Rd
├── DESCRIPTION
├── README.md
├── .travis.yml
├── R
│   ├── tdist.R
│   ├── tcor.R
│   └── common.R
└── vignettes
    └── brca.Rmd
/.Rbuildignore: -------------------------------------------------------------------------------- 1 | .travis.yml 2 | .lintr 3 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | linters: with_defaults(open_curly_linter=NULL, line_length_linter=line_length_linter(130), assignment_linter=NULL) 2 | -------------------------------------------------------------------------------- /tests/pair.R: -------------------------------------------------------------------------------- 1 | library(tcor) 2 | set.seed(1) 3 | x = matrix(rnorm(300), 10) 4 | y = matrix(rnorm(200), 10) 5 | x[, 30] = y[, 1] + 1e-1 * runif(10) 6 | ans1 = tcor(x, y, t=0.7, p=3, filter="local") 7 | ans2 = tcor(x, y, t=0.7, p=3) 8 | z = cor(x, y) 9 | 10 | stopifnot(nrow(ans1$indices) == nrow(ans2$indices), nrow(ans2$indices) == sum(z > 0.7)) 11 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(tcor) 4 | export(tdist) 5 | importFrom(Matrix,colMeans) 6 | importFrom(foreach,"%do%") 7 | importFrom(foreach,"%dopar%") 8 | importFrom(foreach,foreach) 9 | importFrom(irlba,irlba) 10 | importFrom(stats,cor) 11 | importFrom(stats,dist) 12 | importFrom(stats,quantile) 13 | -------------------------------------------------------------------------------- /man/longrun.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/common.R 3 | \name{longrun} 4 | 
\alias{longrun} 5 | \title{linear time longest run search: find the longest run of values in the 6 | ordered vector within the specified limit} 7 | \usage{ 8 | longrun(v, limit, group = NULL) 9 | } 10 | \arguments{ 11 | \item{v}{a vector with entries ordered in increasing order} 12 | 13 | \item{limit}{distance interval} 14 | 15 | \item{group}{optional vector with entries -1 and 1 corresponding to the group membership of each element in \code{v} (two groups)} 16 | } 17 | \value{ 18 | run length 19 | } 20 | \description{ 21 | linear time longest run search: find the longest run of values in the 22 | ordered vector within the specified limit 23 | } 24 | \keyword{internal} 25 | 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tcor 2 | Type: Package 3 | Title: Fast Thresholded Correlation and Distance Matrices 4 | Version: 0.1.0 5 | Date: 2016-04-03 6 | Authors@R: c( 7 | person("B. W.", "Lewis", role=c("cre","aut"), email="blewis@illposed.net"), 8 | person("James", "Baglama", role=c("aut"), email="jbaglama@math.uri.edu"), 9 | person("Michael", "Kane", role=c("aut"), email="michael.kane@yale.edu"), 10 | person("Alex", "Poliakov", role=c("aut"), email="apoliakov@paradigm4.com") 11 | ) 12 | URL: https://github.com/bwlewis/tcor, http://arxiv.org/abs/1512.07246 13 | VignetteBuilder: knitr 14 | BugReports: https://github.com/bwlewis/tcor/issues 15 | Description: Fast and memory efficient computation of thresholded correlation 16 | matrices. 
17 | Imports: 18 | irlba (>= 2.0.0), 19 | foreach, 20 | stats 21 | Suggests: 22 | knitr (>= 1.8) 23 | License: GPL-3 24 | RoxygenNote: 5.0.0 25 | -------------------------------------------------------------------------------- /tests/tests.R: -------------------------------------------------------------------------------- 1 | library(tcor) 2 | 3 | # Thresholded correlation (from the tcor examples, but adding a check for 4 | # equivalence with the cor result) 5 | 6 | set.seed(1) 7 | s <- svd(matrix(rnorm(100 * 1000), nrow=100)) 8 | A <- s$u %*% (1 /( 1:100) * t(s$v)) 9 | C <- cor(A) 10 | C <- C * upper.tri(C) 11 | y <- which(C >= 0.98, arr.ind=TRUE) 12 | x <- tcor(A, t=0.98)$indices[, 1:2] 13 | # order x and y conformably for comparison 14 | swap <- x[, 1] > x[, 2] 15 | x2 <- x[, 2] 16 | x[swap, 2] <- x[swap, 1] 17 | x[swap, 1] <- x2[swap] 18 | x <- x[order(x[, 2]), ] 19 | y <- y[order(y[, 2]), ] 20 | stopifnot(all.equal(x, y, check.attributes=FALSE)) 21 | 22 | # Thresholded distance (from the tdist example plus a comparison) 23 | x <- matrix(rnorm(100 * 20), nrow=100) 24 | td <- tdist(x, 10, rank=TRUE) 25 | d <- dist(x) 26 | stopifnot(all.equal(td$indices[1:10, "val"], sort(d)[1:10])) 27 | 28 | # non-rank version of tdist 29 | td2 <- tdist(x, t=td$indices[10, "val"]) 30 | stopifnot(all.equal(td$indices[1:10, "val"], td2$indices[, "val"])) 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tcor 2 | An R package for fast and memory-efficient computation of thresholded correlation matrices 3 | 4 | A preprint of the companion note is available from http://arxiv.org/abs/1512.07246. 5 | 6 | 7 | ## Installation 8 | 9 | The package depends on the 10 | `irlba` (https://cran.r-project.org/web/packages/irlba/) 11 | and `foreach` (https://cran.r-project.org/web/packages/foreach/) 12 | packages, each available on CRAN. 
13 | You can install `tcor` using the `devtools` package 14 | (https://cran.r-project.org/web/packages/devtools/) with: 15 | ```r 16 | devtools::install_github("bwlewis/tcor") 17 | ``` 18 | 19 | The algorithm can optionally make use of `foreach` package "back-ends" to run 20 | in parallel. Many are available, including `doMC`, `doParallel`, and `doRedis`. 21 | See the CRAN high performance computing task view for more info: 22 | https://cran.r-project.org/web/views/HighPerformanceComputing.html. 23 | 24 | ## Example 25 | 26 | See the vignette https://github.com/bwlewis/tcor/blob/master/vignettes/brca.Rmd 27 | for an example that uses tcor to compute the most correlated gene expression 28 | vectors from TCGA RNASeq data. 29 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: false 3 | 4 | warnings_are_errors: true 5 | 6 | addons: 7 | apt: 8 | sources: 9 | - r-packages-precise 10 | packages: 11 | - r-base-dev 12 | - r-recommended 13 | - pandoc 14 | - texinfo 15 | - texlive 16 | - texlive-latex-base 17 | - texlive-latex-extra 18 | - texlive-latex-recommended 19 | - texlive-fonts-recommended 20 | - texlive-fonts-extra 21 | - texlive-math-extra 22 | - qpdf 23 | 24 | env: 25 | global: 26 | - R_LIBS_USER=~/R/library 27 | 28 | cache: 29 | directories: 30 | $R_LIBS_USER 31 | 32 | before_script: 33 | - mkdir -p "$R_LIBS_USER" 34 | - Rscript -e 'if (length(find.package("devtools", quiet = TRUE)) == 0L) { install.packages("devtools", repos = "http://cran.rstudio.com") }' 35 | - Rscript -e 'devtools::update_packages("devtools", repos = "http://cran.rstudio.com")' 36 | - Rscript -e 'devtools::install_deps(repos = "http://cran.rstudio.com", dependencies = TRUE)' 37 | - Rscript -e 'devtools::install_github("jimhester/covr")' 38 | - Rscript -e 'devtools::install_github("jimhester/lintr")' 39 | 40 | script: 41 | - Rscript -e 
'devtools::check(document=FALSE)' 42 | 43 | after_success: 44 | - Rscript -e 'library(tcor); covr::codecov()' 45 | - Rscript -e 'lintr::lint_package()' 46 | -------------------------------------------------------------------------------- /man/two_seven.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/common.R 3 | \name{two_seven} 4 | \alias{two_seven} 5 | \title{Steps 2--7 of Algorithm 2.1, factored into a common function that can be used by a variety of distance metrics} 6 | \usage{ 7 | two_seven(A, L, t, filter = c("distributed", "local"), normlim = 2 * (1 - 8 | t), full_dist_fun = function(idx) vapply(1:nrow(idx), function(k) cor(A[, 9 | idx[k, 1]], A[, idx[k, 2]]), 1), filter_fun = function(v, t) v >= t, 10 | dry_run = FALSE, anti = FALSE, group = NULL) 11 | } 12 | \arguments{ 13 | \item{A}{data matrix} 14 | 15 | \item{L}{truncated SVD of A} 16 | 17 | \item{t}{scalar threshold value} 18 | 19 | \item{filter}{"distributed" for full threshold evaluation of pruned set on parallel workers, 20 | "local" for sequential evaluation of full threshold of pruned set to avoid copying data matrix.} 21 | 22 | \item{normlim}{the squared norm limit in step 4, default value is for correlation} 23 | 24 | \item{full_dist_fun}{non-projected distance function of a two-column matrix of rows of column 25 | indices that needs scoped access to A (step 7), default function is for correlation} 26 | 27 | \item{filter_fun}{filter function of a vector and scalar that thresholds vector values 28 | from full_dist_fun, returning a logical vector of same length as v (step 7), default function is for correlation} 29 | 30 | \item{dry_run}{a logical value, if \code{TRUE} quickly return statistics useful for tuning \code{p}} 31 | 32 | \item{anti}{a logical value, if \code{TRUE} also include anti-correlated vectors} 33 | 34 | \item{group}{either \code{NULL} for no grouping, or a 
vector of length \code{ncol(A)} consisting of \code{-1, 1} values 35 | indicating group membership of the columns.} 36 | } 37 | \value{ 38 | a list with indices, ell, tot, and longest_run entries, unless dry_run=\code{TRUE} in which case 39 | a list with ell and tot is returned 40 | } 41 | \description{ 42 | Steps 2--7 of Algorithm 2.1, factored into a common function that can be used by a variety of distance metrics 43 | } 44 | \keyword{internal} 45 | 46 | -------------------------------------------------------------------------------- /man/tdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tdist.R 3 | \name{tdist} 4 | \alias{tdist} 5 | \title{Compute thresholded distances between rows or columns of a matrix} 6 | \usage{ 7 | tdist(A, t, p = 10, filter = c("distributed", "local"), 8 | method = c("euclidean", "manhattan", "maximum"), rank = FALSE, 9 | dry_run = FALSE, max_iter = 4, columns = FALSE, restart, ...) 10 | } 11 | \arguments{ 12 | \item{A}{an m by n real-valued dense or sparse matrix} 13 | 14 | \item{t}{a threshold distance value either in absolute distance (the default) or rank order (see \code{rank} below); 15 | if missing, an estimate derived from a 1-d SVD projection will be used} 16 | 17 | \item{p}{projected subspace dimension} 18 | 19 | \item{filter}{"local" filters candidate set sequentially, 20 | "distributed" computes thresholded distances in a parallel code section which can be 21 | faster but requires that the data matrix is available (see notes).} 22 | 23 | \item{method}{the distance measure to be used, one of 24 | "euclidean", "manhattan", or "maximum". 
25 | Any unambiguous substring can be given.} 26 | 27 | \item{rank}{when \code{TRUE}, the threshold \code{t} represents the top \code{t} 28 | closest vectors, otherwise the threshold \code{t} specifies absolute distance; when 29 | \code{rank=TRUE} then \code{t} must also be specified} 30 | 31 | \item{dry_run}{set \code{TRUE} to return statistics and truncated SVD for tuning 32 | \code{p} (see notes)} 33 | 34 | \item{max_iter}{when \code{rank=TRUE}, a portion of the algorithm may iterate; this 35 | number sets the maximum number of such iterations} 36 | 37 | \item{columns}{set to \code{TRUE} to compute distances between matrix columns instead 38 | of rows, saving the expense of a matrix transpose (which can be significant if \code{A} is large)} 39 | 40 | \item{restart}{either output from a previous run of \code{tdist} with \code{dry_run=TRUE}, 41 | or direct output from \code{\link{irlba}} used to restart the \code{irlba} 42 | algorithm when tuning \code{p} (see notes)} 43 | 44 | \item{...}{additional arguments passed to \code{\link{irlba}}} 45 | } 46 | \value{ 47 | A list with elements: 48 | \enumerate{ 49 | \item \code{indices} A three-column matrix. The first two columns contain 50 | indices of rows meeting the distance threshold \code{t}, 51 | the third column contains the corresponding distance value (not returned 52 | when \code{dry_run=TRUE}). 53 | \item \code{restart} A truncated SVD returned by the IRLBA used to restart the 54 | algorithm (only returned when \code{dry_run=TRUE}). 55 | \item \code{tot} The total number of _possible_ vectors that meet 56 | the distance threshold identified by the algorithm. 57 | \item \code{longest_run} The largest number of successive entries in the 58 | ordered first singular vector within a projected distance defined by the 59 | distance threshold; equivalently, the number of \code{n * p} matrix 60 | vector products employed in the algorithm, not counting the truncated SVD step. 
61 | \item \code{t} The threshold value. 62 | \item \code{svd_time} Time to compute truncated SVD. 63 | \item \code{total_time} Total run time. 64 | } 65 | } 66 | \description{ 67 | Compute and return distances and indices of rows or columns within a specified distance threshold 68 | with respect to a specified distance metric. The algorithm works best for Euclidean 69 | distance (the default option). 70 | Alternatively compute the \code{t} closest rows when \code{rank=TRUE}. Or use 71 | \code{columns=TRUE} to compute distances between columns instead, which is somewhat 72 | cheaper for this algorithm than computing row distances. 73 | Increase p to cut down the total number of candidate pairs evaluated, 74 | at the expense of costlier truncated SVDs. 75 | } 76 | \note{ 77 | When \code{rank=TRUE} the method returns at least, and perhaps more than, the top \code{t} closest 78 | indices and their distances, unless they could not be found within the iteration 79 | limit \code{max_iter}. 80 | } 81 | \examples{ 82 | x <- matrix(rnorm(100 * 20), nrow=100) 83 | # Find the top 10 closest vectors with respect to Euclidean distance: 84 | td <- tdist(x, 10, rank=TRUE) 85 | print(td$indices[1:10, ]) 86 | 87 | # Compare with distances from `dist`: 88 | d <- dist(x) 89 | print(sort(d)[1:10]) 90 | 91 | } 92 | \references{ 93 | \url{http://arxiv.org/abs/1512.07246} (preprint) 94 | } 95 | \seealso{ 96 | \code{\link{dist}}, \code{\link{tcor}} 97 | } 98 | 99 | -------------------------------------------------------------------------------- /man/tcor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tcor.R 3 | \name{tcor} 4 | \alias{tcor} 5 | \title{Thresholded Correlation} 6 | \usage{ 7 | tcor(x, y = NULL, t = 0.99, p = 10, include_anti = FALSE, 8 | filter = c("distributed", "local"), dry_run = FALSE, rank = FALSE, 9 | max_iter = 4, restart, ...) 
10 | } 11 | \arguments{ 12 | \item{x}{an m by n real-valued dense or sparse matrix} 13 | 14 | \item{y}{\code{NULL} (default) or a matrix with compatible dimensions to \code{x} (same number of rows). The default 15 | is equivalent to \code{y=x} but more efficient.} 16 | 17 | \item{t}{a threshold value for correlation, -1 < t < 1, but usually t is near 1 (see \code{include_anti} below).} 18 | 19 | \item{p}{projected subspace dimension, p << n (if p >= n it will be reduced) 20 | (Increase \code{p} to cut down the total number of candidate pairs evaluated, 21 | at the expense of costlier matrix-vector products. See the notes on tuning \code{p}.)} 22 | 23 | \item{include_anti}{logical value, if \code{TRUE} then return both correlated 24 | and anti-correlated values that meet the threshold in absolute value. NB Can be much more expensive when \code{TRUE}.} 25 | 26 | \item{filter}{"local" filters candidate set sequentially, 27 | "distributed" computes thresholded correlations in a parallel code section which can be 28 | faster but requires the data matrix (see notes).} 29 | 30 | \item{dry_run}{set \code{TRUE} to return statistics and truncated SVD for tuning 31 | \code{p} (see notes).} 32 | 33 | \item{rank}{when \code{TRUE}, the threshold \code{t} represents the top \code{t} 34 | closest vectors, otherwise the threshold \code{t} specifies absolute correlation value.} 35 | 36 | \item{max_iter}{when \code{rank=TRUE}, a portion of the algorithm may iterate; this 37 | number sets the maximum number of such iterations.} 38 | 39 | \item{restart}{either output from a previous run of \code{tcor} with \code{dry_run=TRUE}, 40 | or direct output from \code{\link{irlba}} used to restart the \code{irlba} 41 | algorithm when tuning \code{p} (see notes).} 42 | 43 | \item{...}{additional arguments passed to \code{\link{irlba}}.} 44 | } 45 | \value{ 46 | A list with elements: 47 | \enumerate{ 48 | \item \code{indices} A three-column matrix. 
The first two columns contain 49 | indices of vectors meeting the correlation threshold \code{t}, 50 | the third column contains the corresponding correlation value 51 | (not returned when \code{dry_run=TRUE}). 52 | \item \code{restart} The truncated SVD from \code{\link{irlba}}, used to restart 53 | the \code{irlba} algorithm (only returned when \code{dry_run=TRUE}). 54 | \item \code{longest_run} The largest number of successive entries in the 55 | ordered first singular vector within a projected distance defined by the 56 | correlation threshold. This is the minimum number of \code{n * p} matrix-vector 57 | products required by the algorithm. 58 | \item \code{tot} The total number of _candidate_ vectors that met 59 | the correlation threshold identified by the algorithm, subsequently filtered 60 | down to just those indices corresponding to values meeting the threshold. 61 | \item \code{t} The threshold value. 62 | \item \code{svd_time} Time spent computing truncated SVD. 63 | \item \code{total_time} Total run time. 64 | } 65 | } 66 | \description{ 67 | Compute a thresholded correlation matrix, returning vector indices 68 | and correlation values that exceed the specified threshold \code{t}. 69 | If \code{y} is a matrix then the thresholded correlations 70 | between the columns of \code{x} and the columns of \code{y} are computed, 71 | otherwise the correlation matrix defined by the columns of \code{x} is computed. 72 | } 73 | \note{ 74 | Register a parallel backend with \code{\link{foreach}} before invoking \code{\link{tcor}} 75 | to run in parallel, otherwise it runs sequentially. 76 | When \code{A} is large, use \code{filter="local"} to avoid copying \code{A} to the 77 | parallel R worker processes (unless the \code{doMC} parallel backend is used with 78 | \code{\link{foreach}}). 
79 | 80 | Specify \code{dry_run=TRUE} to compute and return a truncated SVD of rank \code{p}, 81 | a lower bound on the number of \code{n*p} matrix vector products required by the full algorithm, and a lower-bound 82 | estimate on the number of unpruned candidate vector pairs to be evaluated by the algorithm. You 83 | can pass the returned value back in as input using the \code{restart} parameter to avoid 84 | fully recomputing a truncated SVD. Use these options to tune \code{p} for a balance between 85 | the matrix-vector product work and pruning efficiency. 86 | 87 | When \code{rank=TRUE}, the method returns at least, and perhaps more than, the top \code{t} most correlated 88 | indices, unless they couldn't be found within \code{max_iter} iterations. 89 | } 90 | \examples{ 91 | # Construct a 100 x 2,000 example matrix A: 92 | set.seed(1) 93 | s <- svd(matrix(rnorm(100 * 2000), nrow=100)) 94 | A <- s$u \%*\% (1 /( 1:100) * t(s$v)) 95 | 96 | C <- cor(A) 97 | C <- C * upper.tri(C) 98 | # Compare i with x$indices below: 99 | (i <- which(C >= 0.98, arr.ind=TRUE)) 100 | (x <- tcor(A, t=0.98)) 101 | 102 | # Same example with thresholded correlation _and_ anticorrelation 103 | (i <- which(abs(C) >= 0.98, arr.ind=TRUE)) 104 | (x <- tcor(A, t=0.98, include_anti=TRUE)) 105 | 106 | # Example of tuning p with dry_run=TRUE: 107 | x1 <- tcor(A, t=0.98, p=3, dry_run=TRUE) 108 | print(x1$tot) 109 | # 211, see how much we can reduce this without increasing p too much... 110 | x1 <- tcor(A, t=0.98, p=5, dry_run=TRUE, restart=x1) 111 | print(x1$tot) 112 | # 39, much better... 113 | x1 <- tcor(A, t=0.98, p=10, dry_run=TRUE, restart=x1) 114 | print(x1$tot) 115 | # 3, even better! 
116 | 117 | # Once tuned, compute the full thresholded correlation: 118 | x <- tcor(A, t=0.98, p=10, restart=x1) 119 | 120 | \dontrun{ 121 | # Optionally, register a parallel backend first: 122 | library(doMC) 123 | registerDoMC() 124 | x <- tcor(A, t=0.98) # Should now run faster on a multicore machine 125 | } 126 | 127 | } 128 | \references{ 129 | \url{http://arxiv.org/abs/1512.07246} (preprint) 130 | } 131 | \seealso{ 132 | \code{\link{cor}}, \code{\link{tdist}} 133 | } 134 | 135 | -------------------------------------------------------------------------------- /R/tdist.R: -------------------------------------------------------------------------------- 1 | #' Compute thresholded distances between rows or columns of a matrix 2 | #' 3 | #' Compute and return distances and indices of rows or columns within a specified distance threshold 4 | #' with respect to a specified distance metric. The algorithm works best for Euclidean 5 | #' distance (the default option). 6 | #' Alternatively compute the \code{t} closest rows when \code{rank=TRUE}. Or use 7 | #' \code{columns=TRUE} to compute distances between columns instead, which is somewhat 8 | #' cheaper for this algorithm than computing row distances. 9 | #' Increase p to cut down the total number of candidate pairs evaluated, 10 | #' at the expense of costlier truncated SVDs. 11 | #' 12 | #' @param A an m by n real-valued dense or sparse matrix 13 | #' @param t a threshold distance value either in absolute distance (the default) or rank order (see \code{rank} below); 14 | #' if missing, an estimate derived from a 1-d SVD projection will be used 15 | #' @param p projected subspace dimension 16 | #' @param filter "local" filters candidate set sequentially, 17 | #' "distributed" computes thresholded distances in a parallel code section which can be 18 | #' faster but requires that the data matrix is available (see notes). 
19 | #' @param method the distance measure to be used, one of 20 | #' "euclidean", "manhattan", or "maximum". 21 | #' Any unambiguous substring can be given. 22 | #' @param rank when \code{TRUE}, the threshold \code{t} represents the top \code{t} 23 | #' closest vectors, otherwise the threshold \code{t} specifies absolute distance; when 24 | #' \code{rank=TRUE} then \code{t} must also be specified 25 | #' @param dry_run set \code{TRUE} to return statistics and truncated SVD for tuning 26 | #' \code{p} (see notes) 27 | #' @param max_iter when \code{rank=TRUE}, a portion of the algorithm may iterate; this 28 | #' number sets the maximum number of such iterations 29 | #' @param columns set to \code{TRUE} to compute distances between matrix columns instead 30 | #' of rows, saving the expense of a matrix transpose (which can be significant if \code{A} is large) 31 | #' @param restart either output from a previous run of \code{tdist} with \code{dry_run=TRUE}, 32 | #' or direct output from \code{\link{irlba}} used to restart the \code{irlba} 33 | #' algorithm when tuning \code{p} (see notes) 34 | #' @param ... additional arguments passed to \code{\link{irlba}} 35 | #' 36 | #' @return A list with elements: 37 | #' \enumerate{ 38 | #' \item \code{indices} A three-column matrix. The first two columns contain 39 | #' indices of rows meeting the distance threshold \code{t}, 40 | #' the third column contains the corresponding distance value (not returned 41 | #' when \code{dry_run=TRUE}). 42 | #' \item \code{restart} A truncated SVD returned by the IRLBA used to restart the 43 | #' algorithm (only returned when \code{dry_run=TRUE}). 44 | #' \item \code{tot} The total number of _possible_ vectors that meet 45 | #' the distance threshold identified by the algorithm. 
46 | #' \item \code{longest_run} The largest number of successive entries in the 47 | #' ordered first singular vector within a projected distance defined by the 48 | #' distance threshold; equivalently, the number of \code{n * p} matrix 49 | #' vector products employed in the algorithm, not counting the truncated SVD step. 50 | #' \item \code{t} The threshold value. 51 | #' \item \code{svd_time} Time to compute truncated SVD. 52 | #' \item \code{total_time} Total run time. 53 | #' } 54 | #' 55 | #' @note When \code{rank=TRUE} the method returns at least, and perhaps more than, the top \code{t} closest 56 | #' indices and their distances, unless they could not be found within the iteration 57 | #' limit \code{max_iter}. 58 | #' @seealso \code{\link{dist}}, \code{\link{tcor}} 59 | #' @references \url{http://arxiv.org/abs/1512.07246} (preprint) 60 | #' @examples 61 | #' x <- matrix(rnorm(100 * 20), nrow=100) 62 | #' # Find the top 10 closest vectors with respect to Euclidean distance: 63 | #' td <- tdist(x, 10, rank=TRUE) 64 | #' print(td$indices[1:10, ]) 65 | #' 66 | #' # Compare with distances from `dist`: 67 | #' d <- dist(x) 68 | #' print(sort(d)[1:10]) 69 | #' 70 | #' @importFrom irlba irlba 71 | #' @importFrom stats dist quantile 72 | #' @export 73 | tdist = function(A, t, p=10, 74 | filter=c("distributed", "local"), 75 | method=c("euclidean", "manhattan", "maximum"), rank=FALSE, 76 | dry_run=FALSE, max_iter=4, columns=FALSE, restart, ...) 77 | { 78 | filter = match.arg(filter) 79 | method = match.arg(method) 80 | if(!columns) A = base::t(A) # XXX expensive, find a better approach... 81 | if(ncol(A) < p) p = max(1, floor(ncol(A) / 2 - 1)) 82 | 83 | nlim = function(t) 84 | switch(method, 85 | euclidean = t ^ 2, 86 | maximum = nrow(A) * t ^ 2, # XXX unlikely to be a good bound? 
87 | manhattan = t ^ 2) # just bound by 2-norm, not so great either 88 | 89 | t0 = proc.time() 90 | if(p == ncol(A) || p == nrow(A)) L = svd(A) 91 | else 92 | { 93 | if(missing(restart)) L = irlba(A, p, ...) 94 | else 95 | { 96 | # Handle either output from tcor(..., dry_run=TRUE), or direct output from irlba: 97 | if("restart" %in% names(restart)) restart = restart$restart 98 | L = irlba(A, p, v=restart, ...) 99 | } 100 | } 101 | t1 = (proc.time() - t0)[[3]] 102 | if(missing(t) && rank) stop("t must be specified when rank=TRUE") 103 | N = 1 104 | if(rank) N = t 105 | if(missing(t) || rank) 106 | { 107 | # Estimate a threshold based on a 1-d projection, with crude tuning over a range of values. 108 | v = L$v[order(L$v[,1]), 1] 109 | ts = sort(c(quantile(L$d[1] * (v[-1] - v[1]), probs=c(0.001, 0.01, 0.1)), 110 | quantile(L$d[1] * (v[length(v)] - v[-length(v)]), probs=c(0.001, 0.01, 0.1)))) 111 | # (that last expression considers two possible SVD bases of different sign) 112 | as = lapply(ts, function(t) two_seven(A, L, t, filter, normlim=nlim(t), dry_run=TRUE)$tot) 113 | i = which(as > 0) 114 | if(length(i) > 0) t = ts[min(i)] 115 | else t = ts[3] 116 | attr(t, "names") = c() 117 | if(rank && method == "maximum") t = t / 4 # just a fudge factor, these bounds are not great 118 | if(rank && method == "manhattan") t = t * 4 119 | } 120 | 121 | full_dist_fun = 122 | switch(method, 123 | euclidean = function(idx) vapply(1:nrow(idx), function(k) sqrt(crossprod(A[, idx[k,1]] - A[, idx[k, 2]])), 1), 124 | manhattan = function(idx) vapply(1:nrow(idx), function(k) sum(abs(A[, idx[k,1]] - A[, idx[k, 2]])), 1), 125 | maximum = function(idx) vapply(1:nrow(idx), function(k) max(abs(A[, idx[k,1]] - A[, idx[k, 2]])), 1) 126 | ) 127 | filter_fun = function(v, t) v <= t 128 | 129 | iter = 1 130 | while(iter <= max_iter) 131 | { 132 | ans = two_seven(A, L, t, filter, normlim=nlim(t), full_dist_fun=full_dist_fun, filter_fun=filter_fun, dry_run=dry_run) 133 | if(dry_run) 
return(list(restart=L, longest_run=ans$longest_run, tot=ans$tot, t=t, svd_time=t1)) 134 | if(!rank || (nrow(ans$indices) >= N)) break 135 | iter = iter + 1 136 | # back off faster as we get closer to avoid too much filtering, at the expense of maybe more iterations 137 | t = t * (2 - (nrow(ans$indices)/N)^(1/4)) 138 | } 139 | ans$indices = ans$indices[order(ans$indices[,"val"]),] 140 | c(ans, svd_time=t1, total_time=(proc.time() - t0)[[3]]) 141 | } 142 | -------------------------------------------------------------------------------- /R/tcor.R: -------------------------------------------------------------------------------- 1 | #' Thresholded Correlation 2 | #' 3 | #' Compute a thresholded correlation matrix, returning vector indices 4 | #' and correlation values that exceed the specified threshold \code{t}. 5 | #' If \code{y} is a matrix then the thresholded correlations 6 | #' between the columns of \code{x} and the columns of \code{y} are computed, 7 | #' otherwise the correlation matrix defined by the columns of \code{x} is computed. 8 | #' 9 | #' @param x an m by n real-valued dense or sparse matrix 10 | #' @param y \code{NULL} (default) or a matrix with compatible dimensions to \code{x} (same number of rows). The default 11 | #' is equivalent to \code{y=x} but more efficient. 12 | #' @param t a threshold value for correlation, -1 < t < 1, but usually t is near 1 (see \code{include_anti} below). 13 | #' @param p projected subspace dimension, p << n (if p >= n it will be reduced) 14 | #' (Increase \code{p} to cut down the total number of candidate pairs evaluated, 15 | #' at the expense of costlier matrix-vector products. See the notes on tuning \code{p}.) 16 | #' @param include_anti logical value, if \code{TRUE} then return both correlated 17 | #' and anti-correlated values that meet the threshold in absolute value. NB Can be much more expensive when \code{TRUE}. 
18 | #' @param filter "local" filters candidate set sequentially, 19 | #' "distributed" computes thresholded correlations in a parallel code section which can be 20 | #' faster but requires the data matrix (see notes). 21 | #' @param dry_run set \code{TRUE} to return statistics and truncated SVD for tuning 22 | #' \code{p} (see notes). 23 | #' @param rank when \code{TRUE}, the threshold \code{t} represents the top \code{t} 24 | #' closest vectors, otherwise the threshold \code{t} specifies absolute correlation value. 25 | #' @param max_iter when \code{rank=TRUE}, a portion of the algorithm may iterate; this 26 | #' number sets the maximum number of such iterations. 27 | #' @param restart either output from a previous run of \code{tcor} with \code{dry_run=TRUE}, 28 | #' or direct output from \code{\link{irlba}} used to restart the \code{irlba} 29 | #' algorithm when tuning \code{p} (see notes). 30 | #' @param ... additional arguments passed to \code{\link{irlba}}. 31 | #' 32 | #' @return A list with elements: 33 | #' \enumerate{ 34 | #' \item \code{indices} A three-column matrix. The first two columns contain 35 | #' indices of vectors meeting the correlation threshold \code{t}, 36 | #' the third column contains the corresponding correlation value 37 | #' (not returned when \code{dry_run=TRUE}). 38 | #' \item \code{restart} The truncated SVD from \code{\link{irlba}}, used to restart 39 | #' the \code{irlba} algorithm (only returned when \code{dry_run=TRUE}). 40 | #' \item \code{longest_run} The largest number of successive entries in the 41 | #' ordered first singular vector within a projected distance defined by the 42 | #' correlation threshold. This is the minimum number of \code{n * p} matrix-vector 43 | #' products required by the algorithm. 
44 | #' \item \code{tot} The total number of _candidate_ vectors that met 45 | #' the correlation threshold identified by the algorithm, subsequently filtered 46 | #' down to just those indices corresponding to values meeting the threshold. 47 | #' \item \code{t} The threshold value. 48 | #' \item \code{svd_time} Time spent computing truncated SVD. 49 | #' \item \code{total_time} Total run time. 50 | #' } 51 | #' 52 | #' @note Register a parallel backend with \code{\link{foreach}} before invoking \code{\link{tcor}} 53 | #' to run in parallel, otherwise it runs sequentially. 54 | #' When \code{A} is large, use \code{filter="local"} to avoid copying \code{A} to the 55 | #' parallel R worker processes (unless the \code{doMC} parallel backend is used with 56 | #' \code{\link{foreach}}). 57 | #' 58 | #' Specify \code{dry_run=TRUE} to compute and return a truncated SVD of rank \code{p}, 59 | #' a lower bound on the number of \code{n*p} matrix vector products required by the full algorithm, and a lower-bound 60 | #' estimate on the number of unpruned candidate vector pairs to be evaluated by the algorithm. You 61 | #' can pass the returned value back in as input using the \code{restart} parameter to avoid 62 | #' fully recomputing a truncated SVD. Use these options to tune \code{p} for a balance between 63 | #' the matrix-vector product work and pruning efficiency. 64 | #' 65 | #' When \code{rank=TRUE}, the method returns at least, and perhaps more than, the top \code{t} most correlated 66 | #' indices, unless they couldn't be found within \code{max_iter} iterations. 
67 | #' 68 | #' @seealso \code{\link{cor}}, \code{\link{tdist}} 69 | #' @references \url{http://arxiv.org/abs/1512.07246} (preprint) 70 | #' @examples 71 | #' # Construct a 100 x 2,000 example matrix A: 72 | #' set.seed(1) 73 | #' s <- svd(matrix(rnorm(100 * 2000), nrow=100)) 74 | #' A <- s$u %*% (1 /( 1:100) * t(s$v)) 75 | #' 76 | #' C <- cor(A) 77 | #' C <- C * upper.tri(C) 78 | #' # Compare i with x$indices below: 79 | #' (i <- which(C >= 0.98, arr.ind=TRUE)) 80 | #' (x <- tcor(A, t=0.98)) 81 | #' 82 | #' # Same example with thresholded correlation _and_ anticorrelation 83 | #' (i <- which(abs(C) >= 0.98, arr.ind=TRUE)) 84 | #' (x <- tcor(A, t=0.98, include_anti=TRUE)) 85 | #' 86 | #' # Example of tuning p with dry_run=TRUE: 87 | #' x1 <- tcor(A, t=0.98, p=3, dry_run=TRUE) 88 | #' print(x1$tot) 89 | #' # 211, see how much we can reduce this without increasing p too much... 90 | #' x1 <- tcor(A, t=0.98, p=5, dry_run=TRUE, restart=x1) 91 | #' print(x1$tot) 92 | #' # 39, much better... 93 | #' x1 <- tcor(A, t=0.98, p=10, dry_run=TRUE, restart=x1) 94 | #' print(x1$tot) 95 | #' # 3, even better! 96 | #' 97 | #' # Once tuned, compute the full thresholded correlation: 98 | #' x <- tcor(A, t=0.98, p=10, restart=x1) 99 | #' 100 | #' \dontrun{ 101 | #' # Optionally, register a parallel backend first: 102 | #' library(doMC) 103 | #' registerDoMC() 104 | #' x <- tcor(A, t=0.98) # Should now run faster on a multicore machine 105 | #' } 106 | #' 107 | #' @importFrom irlba irlba 108 | #' @importFrom stats cor 109 | #' @importFrom Matrix colMeans 110 | #' @export 111 | tcor = function(x, y=NULL, t=0.99, p=10, include_anti=FALSE, filter=c("distributed", "local"), 112 | dry_run=FALSE, rank=FALSE, max_iter=4, restart, ...) 113 | { 114 | filter = match.arg(filter) 115 | group = NULL 116 | if(!is.null(y)) 117 | { 118 | group = c(rep(1L, ncol(x)), rep(-1L, ncol(y))) 119 | x = cbind(x, y) # XXX Future version: custom irlba matrix product instead for large matrices? 
120 | } 121 | if(ncol(x) < p) p = max(1, floor(ncol(x) / 2 - 1)) 122 | t0 = proc.time() 123 | mu = colMeans(x) 124 | s = sqrt(apply(x, 2, crossprod) - nrow(x) * mu ^ 2) # col norms of centered matrix 125 | if(include_anti) filter_fun = function(v, t) abs(v) >= t 126 | else filter_fun = function(v, t) v >= t 127 | if(any(s < 10 * .Machine$double.eps)) stop("the standard deviation is zero for some columns") 128 | if(missing(restart)) L = irlba(x, p, center=mu, scale=s, ...) 129 | else 130 | { 131 | # Handle either output from tcor(..., dry_run=TRUE), or direct output from irlba: 132 | if("restart" %in% names(restart)) restart = restart$restart 133 | L = irlba(x, p, center=mu, scale=s, v=restart, ...) 134 | } 135 | t1 = (proc.time() - t0)[[3]] 136 | 137 | if(rank) 138 | { 139 | N = t 140 | t = 0.99 141 | } 142 | iter = 1 143 | old_n = 0 144 | while(iter <= max_iter) 145 | { 146 | # steps 2--7 of algorithm 2.1 147 | ans = two_seven(x, L, t, filter, dry_run=dry_run, filter_fun=filter_fun, anti=include_anti, group=group) 148 | ans$tot = old_n + ans$tot 149 | old_n = ans$tot 150 | if(dry_run) return(list(restart=L, longest_run=ans$longest_run, tot=ans$tot, t=t, svd_time=t1)) 151 | if(!rank || (nrow(ans$indices) >= N)) break 152 | iter = iter + 1 153 | t = max(t - 0.02, -1) 154 | } 155 | ans$indices = ans$indices[order(ans$indices[,"val"], decreasing=TRUE),] 156 | c(ans, svd_time=t1, total_time=(proc.time() - t0)[[3]]) 157 | } 158 | -------------------------------------------------------------------------------- /R/common.R: -------------------------------------------------------------------------------- 1 | #' linear time longest run search: find the longest run of values in the 2 | #' ordered vector within the specified limit 3 | #' @param v a vector with entries ordered in increasing order 4 | #' @param limit distance interval 5 | #' @param group optional vector with entries -1 and 1 corresponding to the group membership of each element in \code{v} (two groups) 6 | #' 
@return run length 7 | #' @keywords internal 8 | longrun = function(v, limit, group=NULL) 9 | { 10 | lower <- 1 11 | ell = 1 12 | for(upper in 2:length(v)) 13 | { 14 | if(v[upper] - v[lower] <= limit) 15 | { 16 | ell = max(ell, upper - lower + 1) 17 | } else 18 | { 19 | if(is.null(group)) while (lower < upper && v[upper] - v[lower] > limit) lower = lower + 1 20 | else while (lower < upper && v[upper] - v[lower] > limit && prod(group[lower:upper]) < 0) lower = lower + 1 21 | } 22 | } 23 | ell 24 | } 25 | 26 | #' Steps 2--7 of Algorithm 2.1, factored into a common function that can be used by a variety of distance metrics 27 | #' @param A data matrix 28 | #' @param L truncated SVD of A 29 | #' @param t scalar threshold value 30 | #' @param filter "distributed" for full threshold evaluation of pruned set on parallel workers, 31 | #' "local" for sequential evaluation of full threshold of pruned set to avoid copying data matrix. 32 | #' @param normlim the squared norm limit in step 4, default value is for correlation 33 | #' @param full_dist_fun non-projected distance function of a two-column matrix of rows of column 34 | #' indices that needs scoped access to A (step 7), default function is for correlation 35 | #' @param filter_fun filter function of a vector and scalar that thresholds vector values 36 | #' from full_dist_fun, returning a logical vector of same length as v (step 7), default function is for correlation 37 | #' @param dry_run a logical value, if \code{TRUE} quickly return statistics useful for tuning \code{p} 38 | #' @param anti a logical value, if \code{TRUE} also include anti-correlated vectors 39 | #' @param group either \code{NULL} for no grouping, or a vector of length \code{ncol(A)} consisting of \code{-1, 1} values 40 | #' indicating group membership of the columns. 
41 | #' @return a list with \code{indices}, \code{tot}, \code{longest_run}, and \code{t} entries, unless \code{dry_run=TRUE}, in which case
42 | #' a list with \code{longest_run}, \code{tot}, and \code{t} entries is returned
43 | #' @importFrom foreach foreach %dopar% %do%
44 | #' @keywords internal
45 | two_seven = function(A, L, t, filter=c("distributed", "local"), normlim=2 * (1 - t),
46 |   full_dist_fun=function(idx) vapply(1:nrow(idx), function(k) cor(A[, idx[k,1]], A[, idx[k, 2]]), 1),
47 |   filter_fun=function(v, t) v >= t, dry_run=FALSE, anti=FALSE, group=NULL)
48 | {
49 |   filter = match.arg(filter)
50 |   nx = ncol(A)
51 |   if(!is.null(group)) nx = sum(group == group[1]) # number of columns in 1st array
52 |   grouped = ! is.null(group)
53 |   # Find the projection among the first few basis vectors with the shortest
54 |   # maximum run length to minimize work in the next step. This is a cheap but
55 |   # usually not very significant optimization.
56 |   p = length(L$d)
57 |   ells = lapply(1:min(2, p), function(N)
58 |   {
59 |     P = order(L$v[, N])
60 |     limit = sqrt(normlim) / L$d[N]
61 |     ell = longrun(L$v[order(L$v[, N]), N], limit, group[P])
62 |     list(P=P, limit=limit, ell=ell, N=N)
63 |   })
64 |   ellmin = which.min(vapply(ells, function(x) x$ell, 1))
65 |   P = ells[[ellmin]]$P
66 |   ell = min(ells[[ellmin]]$ell, ncol(A) - 1)
67 |   eN = ells[[ellmin]]$N
68 |   elim = ells[[ellmin]]$limit
69 | 
70 |   # When anticorrelated vectors are included, we can use the same permutation
71 |   # but ell will likely increase.
72 |   if(anti)
73 |   {
74 |     v2 = c(L$v[, eN], -L$v[, eN])
75 |     v2p = order(v2)
76 |     v2 = v2[v2p]
77 |     ell = longrun(v2, sqrt(normlim) / L$d[eN], c(group, group)[v2p])
78 |   }
79 | 
80 |   if(dry_run)
81 |   {
82 |     d = diff(L$v[P, 1:p, drop=FALSE], lag=1) ^ 2 %*% L$d[1:p] ^ 2
83 |     return(list(longest_run=ell, tot=sum(d <= normlim), t=t))
84 |   }
85 | 
86 |   # The big union in step 4 of algorithm 2.1 follows, combined with step 6 to
87 |   # convert back to original indices, and step 7 to evaluate the candidates.
88 |   # Each step from 1 to ell is independent of the others; the steps can run
89 |   # in parallel.
90 |   combine = function(x, y)
91 |   {
92 |     list(indices=rbind(x$indices, y$indices), tot=x$tot + y$tot)
93 |   }
94 | 
95 |   # codetools has trouble detecting the foreach variable i below. We define
96 |   # it here to suppress CRAN NOTEs and lintr warnings (is this really a
97 |   # problem in foreach or codetools?).
98 |   i = 1
99 | 
100 |   if(filter == "distributed")
101 |   {
102 |     indices = foreach(i=1:ell, .combine=combine, .inorder=FALSE, .packages=c("tcor", "Matrix")) %dopar%
103 |     {
104 |       d2 = Inf
105 |       # restrict focus to candidates from each group (if specified)
106 |       if(grouped)
107 |       {
108 |         # (all this does is make the matrix vector product cheaper)
109 |         gidx = which(diff(group[P], lag=i) != 0)
110 |         D = L$v[P[gidx + i], 1:p] - L$v[P[gidx], 1:p]
111 |         if(anti) D2 = L$v[P[gidx + i], 1:p] + L$v[P[gidx], 1:p]
112 |       } else
113 |       {
114 |         D = diff(L$v[P, 1:p, drop=FALSE], lag=i)
115 |         if(anti) D2 = diffint(L$v[P, 1:p, drop=FALSE], lag=i, sign=1)
116 |       }
117 |       d = D ^ 2 %*% L$d[1:p] ^ 2
118 |       if(anti) d2 = D2 ^ 2 %*% L$d[1:p] ^ 2
119 |       # These ordered indices meet the projected threshold:
120 |       j = c(which(d <= normlim), which(d2 <= normlim))
121 |       if(grouped) j = gidx[j]
122 |       n = length(j)
123 |       # return original un-permuted column indices that meet true threshold
124 |       # (step 7), and the number of possible candidates:
125 |       if(n == 0)
126 |       {
127 |         ans = vector("list", 2)
128 |         names(ans) = c("indices", "tot")
129 |         ans$indices = cbind(i=integer(0), j=integer(0), val=double(0))
130 |         ans$tot = n
131 |         return(ans)
132 |       }
133 |       v = full_dist_fun(cbind(P[j], P[j + i]))
134 |       h = filter_fun(v, t)
135 |       j = j[h]
136 |       v = v[h]
137 |       ans_i = P[j]
138 |       ans_j = P[j + i]
139 |       if(grouped)
140 |       {
141 |         idx = which(ans_i > nx)
142 |         if(length(idx) > 0)
143 |         {
144 |           j2 = ans_i[idx]
145 |           ans_i[idx] = ans_j[idx]
146 |           ans_j[idx] = j2
147 |         }
148 |         ans_i = ans_i %% nx
149 |         ans_j = 
ans_j %% nx 150 | ans_i[ans_i == 0] = nx 151 | ans_j[ans_j == 0] = nx 152 | } 153 | list(indices=cbind(i=ans_i , j=ans_j, val=v), tot=n) 154 | } 155 | return(c(indices, longest_run=ell, t=t)) 156 | } 157 | 158 | # The filter == "local" case, preventing copy of the data matrix to the workers 159 | indices = foreach(i=1:ell, .combine=combine, .inorder=FALSE, .noexport="A") %dopar% 160 | { 161 | d2 = Inf 162 | # restrict focus to candidates from each group (if specified) 163 | if(grouped) 164 | { 165 | # (all this does is make the matrix vector product cheaper) 166 | gidx = which(diff(group[P], lag=i) != 0) 167 | D = L$v[P[gidx + i], 1:p] - L$v[P[gidx], 1:p] 168 | if(anti) D2 = L$v[P[gidx + i], 1:p] + L$v[P[gidx], 1:p] 169 | } else 170 | { 171 | D = diff(L$v[P, 1:p, drop=FALSE], lag=i) 172 | if(anti) D2 = diffint(L$v[P, 1:p, drop=FALSE], lag=i, sign=1) 173 | } 174 | d = D ^ 2 %*% L$d[1:p] ^ 2 175 | if(anti) d2 = D2 ^ 2 %*% L$d[1:p] ^ 2 176 | # These ordered indices meet the projected threshold: 177 | j = c(which(d <= normlim), which(d2 <= normlim)) 178 | if(grouped) j = gidx[j] 179 | n = length(j) 180 | # return original un-permuted column indices that meet true threshold 181 | # (step 7), including the number of possible candidates for info.: 182 | if(n == 0) 183 | { 184 | ans = vector("list", 2) 185 | names(ans) = c("indices", "tot") 186 | ans$tot = n 187 | return(ans) 188 | } 189 | ans_i = P[j] 190 | ans_j = P[j + i] 191 | if(grouped) 192 | { 193 | idx = which(ans_i > nx) 194 | if(length(idx) > 0) 195 | { 196 | j2 = ans_i[idx] 197 | ans_i[idx] = ans_j[idx] 198 | ans_j[idx] = j2 199 | } 200 | } 201 | list(indices=cbind(i=ans_i, j=ans_j), tot=n) 202 | } 203 | v = full_dist_fun(indices$indices) 204 | h = filter_fun(v, t) 205 | indices$indices = cbind(indices$indices[h,], val=v[h]) 206 | if(grouped) 207 | { 208 | indices$indices[, 1:2] = indices$indices[, 1:2] %% nx 209 | indices$indices[indices$indices[, 1] == 0, 1] = nx 210 | indices$indices[indices$indices[, 2] == 
0, 2] = nx
211 |   }
212 |   c(indices, longest_run=ell, t=t)
213 | }
214 | 
215 | # replacement for diff that also supports sums
216 | diffint = function(x, lag=1, sign=-1)
217 | {
218 |   tail(x, nrow(x) - lag) + sign * head(x, nrow(x) - lag)
219 | }
220 | 
--------------------------------------------------------------------------------
/vignettes/brca.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Thresholded Correlation of RNASeq Gene Expression Data"
3 | date: "`r Sys.Date()`"
4 | output:
5 |   html_document:
6 |     highlight: kate
7 |     toc: false
8 |     toc_depth: 4
9 |     mathjax: null
10 | vignette: >
11 |   %\VignetteIndexEntry{Correlation}
12 |   %\VignetteEngine{knitr::rmarkdown}
13 |   \usepackage[utf8]{inputenc}
14 | ---
15 | 
16 | # Thresholded Correlation of RNASeq Gene Expression Data
17 | 
18 | We illustrate the use of the `tcor` package to compute a thresholded
19 | correlation matrix of gene expression data from the
20 | Cancer Genome Atlas (TCGA).
21 | 
22 | TCGA (http://cancergenome.nih.gov/) is a joint effort of the National Cancer
23 | Institute and the National Human Genome Research Institute. TCGA provides
24 | curated data and analyses, including large-scale genome sequencing, for
25 | many cancer tumor types.
26 | 
27 | The example proceeds in two parts:
28 | 
29 | 1. Downloading RNASeq gene expression data from TCGA and tricks for efficiently reading the data into R
30 | 2. Computing a thresholded gene-by-gene correlation matrix with `tcor`
31 | 
32 | ## Obtaining and reading the gene expression data
33 | 
34 | The data for this example are obtained from the Broad Institute's GDAC Firehose
35 | http://firebrowse.org/?cohort=BRCA&download_dialog=true. The GDAC provides a
36 | convenient way to download versioned and standardized TCGA data organized as
37 | sample-by-measurement tables in tab-delimited text form.
38 | 
39 | Some of the following steps use Unix-like pipeline processing with shell
40 | utilities and R's `pipe` function. Windows users may need to install the Rtools
41 | suite (https://cran.r-project.org/bin/windows/Rtools/).
42 | 
43 | ### Download and decompress the data
44 | 
45 | We select breast invasive carcinoma gene expression data, one of the larger
46 | available datasets. The GDAC dashboard is available at
47 | http://gdac.broadinstitute.org/. Data may be browsed and manually downloaded
48 | directly from the dashboard, or downloaded and uncompressed using the
49 | `download.file` and `untar` lines in the script below.
50 | 
51 | ```{r, eval=FALSE}
52 | url = "http://gdac.broadinstitute.org/runs/stddata__2015_11_01/data/BRCA/20151101/gdac.broadinstitute.org_BRCA.Merge_rnaseq__illuminahiseq_rnaseq__unc_edu__Level_3__gene_expression__data.Level_3.2015110100.0.0.tar.gz"
53 | destfile = "gdac.broadinstitute.org_BRCA.Merge_rnaseq__illuminahiseq_rnaseq__unc_edu__Level_3__gene_expression__data.Level_3.2015110100.0.0.tar.gz"
54 | 
55 | download.file(url, destfile) # (about 300 MB)
56 | untar(destfile)              # (about 600 MB)
57 | 
58 | # data file name:
59 | fn = dir(gsub("\\.tar\\.gz", "", destfile), pattern="*.data.txt", full.names=TRUE)
60 | ```
61 | 
62 | ### Efficiently reading the data into R
63 | 
64 | The data file is a tab-delimited text file with two header lines followed by
65 | 20,532 data lines. Each data line specifies a gene followed by sample
66 | measurements for that gene in the columns. The measurements include raw counts,
67 | median-length-normalized values, and reads per kilobase of transcript per million mapped reads
68 | (RPKM) values. We'll use the RPKM-normalized values in this example.
69 | 
70 | The header lines look like:
71 | ```
72 | # Hybridiz...  ID1         ID1                       ID1   ID2         ID2                       ID2   ...
73 | # gene         raw_counts  median_length_normalized  RPKM  raw_counts  median_length_normalized  RPKM  ...
74 | ```
75 | where `IDn` indicates the nth TCGA barcode ID.
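Given this layout, the RPKM values occupy every third column, starting at column 4. A quick self-contained check of that column arithmetic (the sample count of 878 comes from the file description further below; nothing here reads the file):

```{r}
# One gene-name column plus three measurement columns per sample: with the
# file's 878 samples that is 1 + 3 * 878 = 2635 columns in total, and the
# RPKM columns are 4, 7, 10, ..., 2635.
rpkm_cols = seq(from=4, by=3, length.out=878)
length(rpkm_cols) # 878
range(rpkm_cols)  # 4 2635
```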
76 | 
77 | We _could_ simply read these data into R using `read.table`, for example with:
78 | ```{r, eval=FALSE}
79 | # Simple but not particularly efficient way to read the data...
80 | # (Don't run this, continue reading instead...)
81 | id = unlist(read.table(fn, sep="\t", stringsAsFactors=FALSE, header=FALSE, nrows=1))
82 | brca = read.table(fn, sep="\t", stringsAsFactors=FALSE, header=FALSE, skip=2)
83 | # ... now filter out just the gene and RPKM columns...
84 | ```
85 | But since we're only interested in the RPKM-normalized values, that approach
86 | reads too much from the file. It is more efficient to skip the columns
87 | we don't need and read in just the gene and RPKM columns.
88 | 
89 | Instead we can use the simple but effective shell tool `cut` and the idea
90 | of pipelines to remove all but the gene and RPKM
91 | columns on the fly as we read the data file into R. There are two distinct advantages to
92 | this approach: we read in only what we're interested in (cutting processing
93 | time and memory consumption), and we employ pipeline-style parallel processing
94 | to further speed things up, running the column-skipping `cut` process in
95 | parallel with the data-parsing R `read.table` function.
96 | 
97 | The pipelined processing described in the last paragraph can use at most two
98 | CPU cores to process the data (in practice somewhat less on average due to I/O
99 | and other overhead). Many PCs today, even cheap ones, have more than two cores,
100 | and often quite fast storage systems (for example, solid state disk drives).
101 | We can wring even more performance out of such systems by combining the
102 | pipeline parallelism with explicit parallel processing using R's many
103 | available parallel processing functions.
104 | 
105 | The example code below uses Steve Weston's elegant `foreach` framework for
106 | explicit parallel processing to read the data file in chunks.
Chunks are
107 | processed concurrently using the pipelined parallelism described above: on
108 | a four-CPU computer this yields four R worker processes and their associated
109 | shell pipeline processes, plus the controlling R process.
110 | 
111 | Any `foreach` parallel back end can be used for this task. We use the
112 | Unix-specific `doMC` backend below but Windows users can equivalently use the
113 | `doParallel` backend. The work can even be distributed across more than one
114 | computer with `doSNOW`, `doMPI`, or `doRedis` backends (without changing the
115 | code). The example below runs in under 30 seconds on my inexpensive quad-core
116 | Athlon home PC.
117 | 
118 | There are 2,635 columns (878 unique samples) of tab-separated data. If
119 | we're interested only in RPKM values, then we want columns
120 | 1, 4, 7, 10, ..., 2635.
121 | One efficient way to get just the columns of interest uses an external
122 | shell pipeline.
123 | 
124 | ```{r, eval=FALSE}
125 | h = 2     # total header lines in the data file
126 | N = 20534 # total number of lines in the data file
127 | 
128 | # cut's default field delimiter is already TAB, so the -d option can be
129 | # omitted entirely. (Specifying a literal TAB with -d is error-prone:
130 | # the TAB character, typed as CTRL+V followed by TAB, often does not
131 | # copy and paste correctly, coming over as spaces instead.)
132 | command = sprintf("cat %s | cut -f %s", fn, paste(c(1, seq(from=4, to=2635, by=3)), collapse=","))
133 | 
134 | # Read the first header line of sample IDs:
135 | f = pipe(sprintf("%s | head -n 1", command), open="r")
136 | id = unlist(read.table(f, sep="\t", stringsAsFactors=FALSE, header=FALSE, nrows=1))
137 | id[1] = "gene"
138 | close(f)
139 | 
140 | # Read the rest of the file in parallel.
141 | library(doMC) 142 | cores = 4 143 | registerDoMC(cores) 144 | block = floor((N - h)/cores) 145 | 146 | brca = foreach(j=1:cores, .combine=rbind) %dopar% 147 | { 148 | skip = block * (j - 1) + h + 1 149 | nrows = ifelse(j == cores, -1, block) 150 | f = pipe(sprintf("%s | tail -n +%.0f", command, skip), open="r") 151 | on.exit(close(f)) 152 | read.table(f, sep="\t", stringsAsFactors=FALSE, header=FALSE, nrows=nrows) 153 | } 154 | 155 | # Finally, label the variables we've just read in using the TCGA sample IDs. 156 | names(brca) = id 157 | ``` 158 | 159 | Once finished, we have a data frame named `brca` with 20,532 rows and 879 160 | columns. The first column contains gene names, the rest contain sample 161 | RPKM values. The data frame column names include the TCGA barcode sample 162 | IDs. 163 | 164 | See https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode for help understanding 165 | the TCGA barcode, a sequence of dash separated identifiers. In particular, 166 | the fourth identifier (sample/vial) indicates if the sample comes from normal 167 | tissue, solid tumor, or elsewhere, as described in 168 | https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm?codeTable=sample%20type. 169 | We can identify columns associated with tumor, metastatic, and normal tissue 170 | samples by: 171 | ```{r, eval=FALSE} 172 | tumor = grep("^....-..-....-01.-...-....-..", names(brca)) 173 | normal = grep("^....-..-....-11.-...-....-..", names(brca)) 174 | metastatic = grep("^....-..-....-06.-...-....-..", names(brca)) 175 | ``` 176 | 177 | The next step of our example computes thresholded gene correlation matrices 178 | and works with data in matrix form, not data frames. 
The final step in this
179 | section assembles two matrices corresponding to tumor and normal samples:
180 | ```{r, eval=FALSE}
181 | brca_tumor = t(as.matrix(brca[, tumor]))
182 | brca_normal = t(as.matrix(brca[, normal]))
183 | colnames(brca_tumor) = brca$gene  # gene names for reference
184 | colnames(brca_normal) = brca$gene # gene names for reference
185 | 
186 | print(dim(brca_tumor))
187 | print(dim(brca_normal))
188 | ```
189 | ```
190 | [1]   775 20532
191 | [1]   100 20532
192 | ```
193 | 
194 | 
195 | ## Efficient computation of thresholded correlation matrices with tcor
196 | 
197 | The tcor package (https://github.com/bwlewis/tcor, and companion preprint paper
198 | http://arxiv.org/abs/1512.07246) provides an implementation of a new
199 | algorithm for fast and efficient thresholded correlation.
200 | You can install the development version of the R package directly from GitHub with:
201 | ```{r, eval=FALSE}
202 | devtools::install_github("bwlewis/tcor")
203 | library(tcor)
204 | ```
205 | 
206 | Because we're interested in correlation among the columns (gene expression), we need to
207 | filter out constant-valued columns (including, for example, columns of all zeros):
208 | ```{r, eval=FALSE}
209 | brca_tumor_filtered = brca_tumor[, apply(brca_tumor, 2, sd) > 0]
210 | brca_normal_filtered = brca_normal[, apply(brca_normal, 2, sd) > 0]
211 | ```
212 | 
213 | Let's find all pairs of gene expression vectors among the filtered tumor data
214 | with correlation values of at least 0.99:
215 | ```{r, eval=FALSE}
216 | tumor_cor = tcor(brca_tumor_filtered, t=0.99)
217 | str(tumor_cor)
218 | ```
219 | ```
220 | List of 6
221 |  $ indices    : num [1:529, 1:3] 16749 8316 4320 4319 4320 ...
222 |   ..- attr(*, "dimnames")=List of 2
223 |   .. ..$ : NULL
224 |   .. 
..$ : chr [1:3] "i" "j" "val"
225 |  $ n          : num 195369
226 |  $ longest_run: num 5467
227 |  $ t          : num 0.99
228 |  $ svd_time   : num 5.48
229 |  $ total_time : num 34.3
230 | ```
231 | 
232 | The `tcor` function found 529 such correlated gene expression vector pairs (out of
233 | 20522 * 20521 / 2 = 210,565,981 possible unique gene pairs) in about 35 seconds on my
234 | quad-core home PC. We can translate the listed matrix column indices into more
235 | easily readable gene names with, for example:
236 | ```{r, eval=FALSE}
237 | tumor = data.frame(i=colnames(brca_tumor_filtered)[tumor_cor$indices[,1]],
238 |                    j=colnames(brca_tumor_filtered)[tumor_cor$indices[,2]],
239 |                    val=tumor_cor$indices[,3])
240 | head(tumor)
241 | ```
242 | ```
243 |                      i                j       val
244 | 1 SNORD115-2|100033437 KRTAP20-3|337985 1.0000000
245 | 2      INS-IGF2|723961        IGF2|3481 0.9999759
246 | 3       CSN1S2A|286828        CSN2|1447 0.9999411
247 | 4              GC|2638   CSN1S2A|286828 0.9998493
248 | 5              GC|2638        CSN2|1447 0.9997900
249 | 6            NRAP|4892       CSRP3|8048 0.9997411
250 | ```
251 | We can verify the result by explicitly computing a full correlation matrix,
252 | but this takes a lot longer and uses much more memory (270 seconds and 6 GB
253 | on my PC):
254 | ```{r, eval=FALSE}
255 | # Uncomment the following lines if you want to run the full correlation
256 | # for comparison...
257 | 
258 | # C = cor(brca_tumor_filtered)
259 | # sum(C[upper.tri(C)] >= 0.99)
260 | # (You will get 529, same as computed above with tcor.)
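
# A small self-contained stand-in for this check (hypothetical synthetic
# data, no TCGA download needed): plant one nearly identical column pair
# in random data and count pairs meeting the 0.99 threshold the same way.
set.seed(1)
m = matrix(rnorm(50 * 20), nrow=50)
m = cbind(m, m[, 1] + rnorm(50, sd=0.01)) # column 21 nearly duplicates column 1
C2 = cor(m)
sum(C2[upper.tri(C2)] >= 0.99) # counts just the planted pair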
261 | ``` 262 | 263 | 264 | We can similarly identify the 20,331 pairs of correlated gene expression 265 | vectors for the normal samples (in about 17 seconds on my PC): 266 | ```{r, eval=FALSE} 267 | normal_cor = tcor(brca_normal_filtered, t=0.99) 268 | str(normal_cor) 269 | normal = data.frame(i=colnames(brca_normal_filtered)[normal_cor$indices[,1]], 270 | j=colnames(brca_normal_filtered)[normal_cor$indices[,2]], 271 | val=normal_cor$indices[,3]) 272 | head(normal) 273 | ``` 274 | ``` 275 | i j val 276 | 1 OR1S1|219959 LELP1|149018 1 277 | 2 LELP1|149018 OR9K2|441639 1 278 | 3 OR5M3|219482 SNORD115-10|100033447 1 279 | 4 SNORD115-10|100033447 OR4C46|119749 1 280 | 5 OR5T2|219464 OR8D4|338662 1 281 | 6 OR8D4|338662 OR2M5|127059 1 282 | ``` 283 | --------------------------------------------------------------------------------