├── .Rbuildignore ├── LICENSE ├── vignettes ├── pics │ ├── tweet1.png │ ├── tweet2.png │ ├── flatland.png │ ├── svd_sdev.png │ └── svd_clusters.png ├── include │ ├── uch_small.png │ ├── pbdML.bib │ ├── 00-acknowledgement.tex │ └── settings.tex ├── build_pdf.sh └── pbdML.Rnw ├── .gitignore ├── cleanup ├── inst └── benchmarks │ └── rpca.r ├── ChangeLog ├── tests ├── fld.R ├── decomp_recomp.R └── rpca.R ├── NAMESPACE ├── man ├── pbdML-package.Rd ├── fld.Rd ├── robpca.Rd ├── rsvd.Rd ├── rpca.Rd └── decomp_recomp.Rd ├── src ├── pbdML_native.c ├── checks.c └── robust_pca.c ├── R ├── pbdML-package.r ├── checks.r ├── assert.r ├── decomp_recomp.r ├── rpca.r ├── fld.r ├── rsvd.r └── robpca.r ├── DESCRIPTION └── README.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | redocument 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2014-2016 2 | COPYRIGHT HOLDER: Drew Schmidt and George Ostrouchov 3 | -------------------------------------------------------------------------------- /vignettes/pics/tweet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RBigData/pbdML/HEAD/vignettes/pics/tweet1.png -------------------------------------------------------------------------------- /vignettes/pics/tweet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RBigData/pbdML/HEAD/vignettes/pics/tweet2.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.in 2 | *.o 3 | *.so 4 | *.log 5 | *.status 6 | *~ 7 | *.swp 8 | 9 | inst/doc 10 | -------------------------------------------------------------------------------- /vignettes/pics/flatland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RBigData/pbdML/HEAD/vignettes/pics/flatland.png -------------------------------------------------------------------------------- /vignettes/pics/svd_sdev.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RBigData/pbdML/HEAD/vignettes/pics/svd_sdev.png -------------------------------------------------------------------------------- /vignettes/include/uch_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RBigData/pbdML/HEAD/vignettes/include/uch_small.png -------------------------------------------------------------------------------- /vignettes/pics/svd_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RBigData/pbdML/HEAD/vignettes/pics/svd_clusters.png -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | rm -rf ./src/*.so 4 | rm -rf ./src/*.o 5 | rm -rf ./src/*.d 6 | rm -rf ./src/*.dll 7 | 8 | -------------------------------------------------------------------------------- /inst/benchmarks/rpca.r: -------------------------------------------------------------------------------- 1 | library(rbenchmark) 2 | suppressPackageStartupMessages(library(pbdML)) 3 | 4 | ### non-mpi test only 5 | m <- 1000 6 | n <- 100 7 | x <- rnorm(m*n) 8 | dim(x) <- c(m, n) 9 | 10 | 11 | benchmark(rpca(x, k=1)) 12 | 13 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | Release 0.1-1 (//): 2 | * Added robsvd(). 3 | * Added vignette. 4 | * Fixed broken tests. 5 | 6 | Release 0.1-0 (2/29/2016): 7 | * Added fld method. 8 | * Added "decomp/recomp" function. 9 | * Added random random svd and random pca. 10 | -------------------------------------------------------------------------------- /tests/fld.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library(pbdML)) 2 | comm.set.seed(12345) 3 | 4 | ### non-mpi test only 5 | m <- 100 6 | n <- 10 7 | 8 | x <- matrix(rnorm(m*n), m, n) 9 | g <- sample(0:1, size=m, replace=TRUE, prob=c(.25, .75)) 10 | 11 | truth <- 0.224449140419224 12 | test <- fld(x, g)$c 13 | 14 | stopifnot(all.equal(truth, test)) 15 | 16 | 17 | #dx = as.ddmatrix(x) 18 | #dg = as.ddmatrix(g) 19 | 20 | #test <- fld(dx, dg)$c 21 | #stopifnot(all.equal(truth, test)) 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,fld) 4 | export(decomp_recomp) 5 | export(fld) 6 | export(robpca) 7 | export(rpca) 8 | export(rsvd) 9 | import(pbdDMAT) 10 | importFrom(pbdMPI,allreduce) 11 | importFrom(pbdMPI,comm.all) 12 | importFrom(pbdMPI,comm.cat) 13 | importFrom(pbdMPI,comm.print) 14 | importFrom(pbdMPI,comm.stop) 15 | importFrom(stats,runif) 16 | useDynLib(pbdML,R_check_badvals) 17 | useDynLib(pbdML,R_check_groupvar) 18 | useDynLib(pbdML,R_one_norm) 19 | useDynLib(pbdML,R_shrink_op) 20 | -------------------------------------------------------------------------------- /tests/decomp_recomp.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library(pbdML)) 2 | 3 | 4 | ### non-mpi test only 5 | m <- 100 6 | n <- 10 7 | 8 | exclude <- 1 9 | 10 | x <- matrix(rnorm(m*n), m, n) 11 | x_dr <- decomp_recomp(x, exclude, center=FALSE, scale=FALSE) 12 | 13 | pca.x <- prcomp(x) 14 | pca.x_dr <- prcomp(x_dr) 15 | 16 | u <- pca.x$u 17 | d <- pca.x$d 18 | vt <- pca.x$vt 19 | 20 | u_dr <- pca.x_dr$u 21 | d_dr <- pca.x_dr$d 22 | vt_dr <- pca.x_dr$vt 23 | 24 | 25 | stopifnot(all.equal(u_dr, u[, -exclude])) 26 | stopifnot(all.equal(d_dr, d[-exclude])) 27 | stopifnot(all.equal(vt_dr, vt[-exclude, ])) 28 | -------------------------------------------------------------------------------- /tests/rpca.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library(pbdML)) 2 | 3 | 4 | ### non-mpi test only 5 | x <- matrix(1:30, 10) 6 | 7 | pca_est <- rpca(x) 8 | 9 | # truncate full 10 | pca_full <- prcomp(x) 11 | pca_full$sdev <- pca_full$sdev[1] 12 | pca_full$rotation <- pca_full$rotation[, 1, drop=FALSE] 13 | pca_full$center <- pca_full$center[1] 14 | pca_full$x <- pca_full$x[, 1, drop=FALSE] 15 | 16 | # account for sign 17 | if (sign(pca_full$rotation[1,1]) != sign(pca_est$rotation[1,1])) 18 | { 19 | pca_full$rotation <- -pca_full$rotation 20 | pca_full$x <- -pca_full$x 21 | } 22 | 23 | 24 | 25 | stopifnot(all.equal(pca_est, pca_full)) 26 | -------------------------------------------------------------------------------- /vignettes/build_pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | fixVersion(){ 4 | PKGVER=`grep "Version:" ../DESCRIPTION | sed -e "s/Version: //"` 5 | sed -i -e "s/myversion{.*}/myversion{${PKGVER}}/" $1 6 | } 7 | 8 | cleanVignette(){ 9 | rm -f *.aux *.bbl *.blg *.log *.out *.toc *.dvi 10 | } 11 | 12 | buildVignette(){ 13 | fixVersion $1 14 | pdflatex $1 15 | bibname=`echo "$1" | sed -e 's/\..*//'` 16 | bibtex $bibname 17 | pdflatex $1 18 | pdflatex $1 19 | Rscript -e "tools::compactPDF('$1', gs_quality='ebook')" 20 | } 21 | 22 | 23 | cleanVignette 24 | buildVignette pbdML.Rnw 25 | cleanVignette 26 | 27 | 28 | mv -f *.pdf ../inst/doc/ 29 | cp -f *.Rnw ../inst/doc/ 30 | -------------------------------------------------------------------------------- /man/pbdML-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pbdML-package.r 3 | \docType{package} 4 | \name{pbdML-package} 5 | \alias{pbdML-package} 6 | \title{Machine Learning} 7 | \description{ 8 | TODO 9 | } 10 | \details{ 11 | \tabular{ll}{ 12 | Package: \tab pbdML \cr 13 | Type: \tab Package \cr 14 | License: \tab BSD 2-clause \cr 15 | LazyLoad: \tab yes \cr 16 | } 17 | 18 | This package requires an MPI library (OpenMPI, MPICH2, or LAM/MPI). 19 | } 20 | \references{ 21 | Programming with Big Data in R Website: \url{http://r-pbd.org/} 22 | } 23 | \author{ 24 | Drew Schmidt \email{schmidt AT math.utk.edu}, George Ostrouchov, and Wei-Chen Chen. 25 | } 26 | \keyword{Package} 27 | -------------------------------------------------------------------------------- /src/pbdML_native.c: -------------------------------------------------------------------------------- 1 | /* Automatically generated. Do not edit by hand. */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | extern SEXP R_check_badvals(SEXP x_); 9 | extern SEXP R_check_groupvar(SEXP g_); 10 | extern SEXP R_one_norm(SEXP x_); 11 | extern SEXP R_shrink_op(SEXP x_, SEXP tau_); 12 | 13 | static const R_CallMethodDef CallEntries[] = { 14 | {"R_check_badvals", (DL_FUNC) &R_check_badvals, 1}, 15 | {"R_check_groupvar", (DL_FUNC) &R_check_groupvar, 1}, 16 | {"R_one_norm", (DL_FUNC) &R_one_norm, 1}, 17 | {"R_shrink_op", (DL_FUNC) &R_shrink_op, 2}, 18 | {NULL, NULL, 0} 19 | }; 20 | 21 | void R_init_pbdML(DllInfo *dll) 22 | { 23 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 24 | R_useDynamicSymbols(dll, FALSE); 25 | } 26 | -------------------------------------------------------------------------------- /R/pbdML-package.r: -------------------------------------------------------------------------------- 1 | #' Machine Learning 2 | #' 3 | #' TODO 4 | #' 5 | #' \tabular{ll}{ 6 | #' Package: \tab pbdML \cr 7 | #' Type: \tab Package \cr 8 | #' License: \tab BSD 2-clause \cr 9 | #' LazyLoad: \tab yes \cr 10 | #' } 11 | #' 12 | #' This package requires an MPI library (OpenMPI, MPICH2, or LAM/MPI). 13 | #' 14 | #' @importFrom pbdMPI allreduce comm.stop comm.print comm.cat comm.all 15 | #' @importFrom stats runif 16 | #' @import pbdDMAT 17 | #' 18 | #' @useDynLib pbdML R_check_badvals R_check_groupvar R_one_norm R_shrink_op 19 | #' 20 | #' @name pbdML-package 21 | #' @docType package 22 | #' @author Drew Schmidt \email{schmidt AT math.utk.edu}, George Ostrouchov, and Wei-Chen Chen. 23 | #' @references Programming with Big Data in R Website: \url{http://r-pbd.org/} 24 | #' @keywords Package 25 | NULL 26 | -------------------------------------------------------------------------------- /src/checks.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | // Check for NA, Inf, and NaN 6 | SEXP R_check_badvals(SEXP x_) 7 | { 8 | SEXP ret; 9 | int check = 0; 10 | double *x = REAL(x_); 11 | 12 | for (int i=0; i 4 | #include 5 | #include 6 | 7 | #define SIGN(x) ((x)>0?1:-1) 8 | 9 | #define MAX(a,b) ((a)>(b)?(a):(b)) 10 | 11 | 12 | SEXP R_one_norm(SEXP x_) 13 | { 14 | SEXP ret; 15 | double norm = 0.0; 16 | int i, j; 17 | const int m = nrows(x_); 18 | const int n = ncols(x_); 19 | const double *const restrict x = REAL(x_); 20 | 21 | // sum(abs(x)) 22 | for (j=0; j= 3.0.0), 15 | pbdMPI (>= 0.3-0), 16 | pbdDMAT (>= 0.4-0) 17 | Imports: 18 | stats 19 | NeedsCompilation: yes 20 | ByteCompile: yes 21 | Authors@R: c(person("Drew", "Schmidt", role = c("aut", "cre"), email = 22 | "wrathematics@gmail.com"), 23 | person("George", "Ostrouchov", role = "aut"), 24 | person("Wei-Chen", "Chen", role = "aut"), 25 | person("Ahmed", "Moustafa", role="ctb", comment="described decomp/recomp problem")) 26 | URL: http://r-pbd.org/ 27 | BugReports: http://group.r-pbd.org/ 28 | MailingList: Please send questions and comments regarding pbdR to 29 | RBigData@gmail.com 30 | Maintainer: Drew Schmidt 31 | RoxygenNote: 5.0.1 32 | -------------------------------------------------------------------------------- /man/decomp_recomp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decomp_recomp.r 3 | \name{decomp_recomp} 4 | \alias{decomp_recomp} 5 | \title{Decompose/Recompose} 6 | \usage{ 7 | decomp_recomp(x, exclude, center = TRUE, scale = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{A matrix or ddmatrix of numeric values.} 11 | 12 | \item{exclude}{A vector of positive integer values which are the principal components 13 | whose variance contribution should be removed.} 14 | 15 | \item{center}{Logical; determines if the matrix should be centered first.} 16 | 17 | \item{scale}{Logical; determines if the matrix should be scaled first.} 18 | } 19 | \value{ 20 | A matrix or ddmatrix, matching the same type as the input \code{x}. 21 | } 22 | \description{ 23 | Exclude the variance contributed by the principal components 24 | enumerated in the vector \code{exclude} from the data matrix. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | ## TODO add mpirun message boilerplate 29 | library(pbdDMAT) 30 | init.grid() 31 | 32 | x <- ddmatrix("rnorm", nrow=100, ncol=10, bldim=c(2,2)) 33 | # Remove the contribution of PC 1's variance 34 | decomp_recomp(x, 1) 35 | 36 | finalize() 37 | } 38 | 39 | } 40 | \references{ 41 | Inspired by a question from Ahmed Moustafa: \url{https://twitter.com/AhmedMoustafa/status/646310686725812224} 42 | } 43 | \author{ 44 | Drew Schmidt 45 | } 46 | -------------------------------------------------------------------------------- /vignettes/include/pbdML.bib: -------------------------------------------------------------------------------- 1 | @MISC{pbdR2012, 2 | author = {Ostrouchov, G. and Chen, W.-C. and Schmidt, D. and Patel, P.}, 3 | title = {{Programming with Big Data in R}}, 4 | year = {2012}, 5 | url = {http://r-pbd.org/} 6 | } 7 | 8 | @Misc{pbdDMAT, 9 | title = {{pbdDMAT}: Programming with Big Data -- Distributed Matrix Algebra Computation}, 10 | author = {Drew Schmidt and Wei-Chen Chen and George Ostrouchov and Pragneshkumar Patel}, 11 | year = {2012}, 12 | note = {{R} Package}, 13 | url = {http://cran.r-project.org/package=pbdDMAT}, 14 | } 15 | 16 | @Misc{pbdMPI, 17 | title = {{pbdMPI}: Programming with Big Data -- Interface to {MPI}}, 18 | author = {Wei-Chen Chen and George Ostrouchov and Drew Schmidt and Pragneshkumar Patel and Hao Yu}, 19 | year = {2012}, 20 | note = {{R} Package, URL http://cran.r-project.org/package=pbdMPI}, 21 | } 22 | 23 | @article{halko2011finding, 24 | title={Finding structure with randomness: Probabilistic algorithms for 25 | constructing approximate matrix decompositions}, 26 | author={Halko, Nathan and Martinsson, Per-Gunnar and Tropp, Joel A}, 27 | journal={SIAM review}, 28 | volume={53}, 29 | number={2}, 30 | pages={217--288}, 31 | year={2011}, 32 | publisher={SIAM} 33 | } 34 | 35 | @book{rencher, 36 | title={Methods of Multivariate Analysis}, 37 | author={Rencher, A.C.}, 38 | isbn={9780471418894}, 39 | lccn={2001046735}, 40 | series={Wiley series in probability and mathematical statistics. Probability and mathematical statistics}, 41 | url={http://books.google.bg/books?id=SpvBd7IUCxkC}, 42 | year={2002}, 43 | publisher={John Wiley \& Sons} 44 | } 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pbdML 2 | 3 | * **Version:** 0.1-2 4 | * **URL**: https://github.com/RBigData/pbdML 5 | * **License:** [![License](http://img.shields.io/badge/license-BSD%202--Clause-orange.svg?style=flat)](http://opensource.org/licenses/BSD-2-Clause) 6 | * **Author:** See section below. 7 | 8 | 9 | **pbdML** is an R package containing a collection of machine learning utilities. These functions can be used in serial with native R objects (matrices and vectors) or in parallel with distributed matrices from the **pbdDMAT** package. Here, we focus on ease of coding and understanding rather than performance (i.e., all code is written in R), and as such this package should primarily be thought of as a demonstration of the capabilities of the **pbdDMAT** package. 10 | 11 | 12 | 13 | ## Usage 14 | 15 | Functions have the same dispatch whether working with a regular 16 | R matrix or with a `ddmatrix` from the pbdDMAT package. 17 | 18 | So for example, to compute a randomized PCA, we can run 19 | 20 | ```r 21 | rpca(x) 22 | ``` 23 | 24 | where `x` is either a matrix or a `ddmatrix`. 25 | 26 | 27 | 28 | ## Installation 29 | **pbdML** requires: 30 | 31 | * R version 3.0.0 or higher 32 | * A system installation of MPI 33 | * The **pbdMPI** and **pbdDMAT** packages, as well as their dependencies. 34 | 35 | 39 | 40 | #### Development Version 41 | ```r 42 | remotes::install_github("RBigData/pbdML") 43 | ``` 44 | 45 | 46 | 47 | 48 | ## Authors 49 | 50 | pbdML is authored and maintained by members of the pbdR core team: 51 | * Drew Schmidt 52 | * George Ostrouchov 53 | * Wei-Chen Chen 54 | -------------------------------------------------------------------------------- /vignettes/include/00-acknowledgement.tex: -------------------------------------------------------------------------------- 1 | \section*{Acknowledgements and Disclaimer} 2 | Work for the \textbf{remoter} package is supported in part by the project 3 | *Harnessing Scalable Libraries for Statistical Computing on Modern Architectures 4 | and Bringing Statistics to Large Scale Computing* funded by the National 5 | Science Foundation Division of Mathematical Sciences under Grant No. 1418195. 6 | 7 | Any opinions, findings, and conclusions or recommendations expressed in this 8 | material are those of the authors and do not necessarily reflect the views of 9 | the National Science Foundation. The findings and conclusions in this article 10 | have not been formally disseminated by the U.S. Department of Health \& Human 11 | Services nor by the U.S. Department of Energy, and should not be construed to 12 | represent any determination or policy of University, Agency, Administration and 13 | National Laboratory. 14 | 15 | The \textbf{remoter} logo comes from the image 16 | ``\href{https://commons.wikimedia.org/wiki/File:Tr\%C3\%A5dtelefon-illustration.png\#/media/File:Tr\%C3\%A5dtelefon-illustration.png}{Tradtelefon-illustration}''. 17 | Licensed under Public Domain via Commons. 18 | 19 | This manual may be incorrect or out-of-date. The author(s) assume 20 | no responsibility for errors or omissions, or for damages resulting 21 | from the use of the information contained herein. 22 | 23 | This publication was typeset using \LaTeX. 24 | 25 | \vfill 26 | 27 | \null 28 | \vfill 29 | \copyright\ 2015--2016 Drew Schmidt. 30 | 31 | Permission is granted to make and distribute verbatim copies of 32 | this vignette and its source provided the copyright notice and 33 | this permission notice are preserved on all copies. 34 | -------------------------------------------------------------------------------- /R/assert.r: -------------------------------------------------------------------------------- 1 | assert.type <- function(x, type, nm=deparse(substitute(x))) 2 | { 3 | Rstuff <- c("character", "numeric", "integer", "double", "logical", "matrix", "data.frame", "vector") 4 | type <- match.arg(type, Rstuff) 5 | 6 | fun <- eval(parse(text=paste("is.", type, sep=""))) 7 | 8 | if (!fun(x)) 9 | pbdMPI::comm.stop(paste0("argument '", nm, "' must be of type ", type), call.=FALSE) 10 | 11 | return(invisible(TRUE)) 12 | } 13 | 14 | 15 | 16 | assert.nonneg <- function(x, nm=deparse(substitute(x))) 17 | { 18 | if (x < 0) 19 | pbdMPI::comm.stop(paste0("argument '", nm, "' must be >= 0; have ", nm, "=", x), call.=FALSE) 20 | 21 | return(invisible(TRUE)) 22 | } 23 | 24 | 25 | 26 | assert.positive <- function(x, nm=deparse(substitute(x))) 27 | { 28 | if (x <= 0) 29 | pbdMPI::comm.stop(paste0("argument '", nm, "' must be > 0; have ", nm, "=", x), call.=FALSE) 30 | 31 | return(invisible(TRUE)) 32 | } 33 | 34 | 35 | 36 | isint <- function(x) 37 | { 38 | epsilon <- 1e-8 39 | 40 | return(abs(x - round(x)) < epsilon) 41 | } 42 | 43 | 44 | 45 | assert.wholenum <- function(x, nm=deparse(substitute(x))) 46 | { 47 | if (!isint(x)) 48 | pbdMPI::comm.stop(paste0("argument '", nm, "' must be an integer; have ", nm, "=", x), call.=FALSE) 49 | 50 | return(invisible(TRUE)) 51 | } 52 | 53 | 54 | 55 | assert.natnum <- function(x) 56 | { 57 | nm <- deparse(substitute(x)) 58 | assert.wholenum(x, nm=nm) 59 | assert.nonneg(x, nm=nm) 60 | 61 | return(invisible(TRUE)) 62 | } 63 | 64 | 65 | 66 | assert.posint <- function(x) 67 | { 68 | nm <- deparse(substitute(x)) 69 | assert.wholenum(x, nm=nm) 70 | assert.positive(x, nm=nm) 71 | 72 | return(invisible(TRUE)) 73 | } 74 | -------------------------------------------------------------------------------- /R/decomp_recomp.r: -------------------------------------------------------------------------------- 1 | #' Decompose/Recompose 2 | #' 3 | #' Exclude the variance contributed by the principal components 4 | #' enumerated in the vector \code{exclude} from the data matrix. 5 | #' 6 | #' @param x 7 | #' A matrix or ddmatrix of numeric values. 8 | #' @param exclude 9 | #' A vector of positive integer values which are the principal components 10 | #' whose variance contribution should be removed. 11 | #' @param center 12 | #' Logical; determines if the matrix should be centered first. 13 | #' @param scale 14 | #' Logical; determines if the matrix should be scaled first. 15 | #' 16 | #' @return 17 | #' A matrix or ddmatrix, matching the same type as the input \code{x}. 18 | #' 19 | #' @references 20 | #' Inspired by a question from Ahmed Moustafa: \url{https://twitter.com/AhmedMoustafa/status/646310686725812224} 21 | #' 22 | #' @author 23 | #' Drew Schmidt 24 | #' 25 | #' @examples 26 | #' \dontrun{ 27 | #' ## TODO add mpirun message boilerplate 28 | #' library(pbdDMAT) 29 | #' init.grid() 30 | #' 31 | #' x <- ddmatrix("rnorm", nrow=100, ncol=10, bldim=c(2,2)) 32 | #' # Remove the contribution of PC 1's variance 33 | #' decomp_recomp(x, 1) 34 | #' 35 | #' finalize() 36 | #' } 37 | #' 38 | #' @export 39 | decomp_recomp <- function(x, exclude, center=TRUE, scale=FALSE) 40 | { 41 | if (class(x) != "matrix" && class(x) != "ddmatrix") 42 | comm.stop("Argument 'x' must be of class 'matrix' or 'ddmatrix'") 43 | 44 | if (any(exclude < 1)) 45 | comm.stop("positive") 46 | 47 | assert.type(center, "logical") 48 | assert.type(scale, "logical") 49 | 50 | 51 | if (center || scale) 52 | x <- scale(x, center=center, scale=scale) 53 | 54 | svd <- La.svd(x) 55 | 56 | u <- svd$u 57 | d <- svd$d 58 | vt <- svd$vt 59 | 60 | ud <- sweep(u[, -exclude], MARGIN=2, FUN="*", STATS=d[-exclude]) 61 | ud %*% vt[-exclude, ] 62 | } 63 | 64 | -------------------------------------------------------------------------------- /R/rpca.r: -------------------------------------------------------------------------------- 1 | #' Random PCA 2 | #' 3 | #' @param x 4 | #' The input data matrix. 5 | #' @param k 6 | #' The number of singular values and/or left/right singular vectors 7 | #' to estimate. 8 | #' @param q 9 | #' An integer exponent, say 1, 2, or 3. See the paper for details. 10 | #' @param retx 11 | #' Logical; determines if the rotated data should be returned. 12 | #' @param center,scale 13 | #' Logical; determines if the data should be centered/scaled first. 14 | #' 15 | #' @return 16 | #' An object of class \code{prcomp}. 17 | #' 18 | #' @references 19 | #' Halko, Martinsson, and Tropp. 2011. Finding structure with 20 | #' randomness: probabilistic algorithms for constructing approximate 21 | #' matrix decompositions. SIAM Review 53 217-288. 22 | #' 23 | #' Duda, R. O., Hart, P. E., & Stork, D. G. (2012). Pattern classification, 24 | #' chapter 10. John Wiley & Sons. 25 | #' 26 | #' @author 27 | #' Drew Schmidt 28 | #' 29 | #' @examples 30 | #' \dontrun{ 31 | #' x <- matrix(rnorm(30), 10) 32 | #' 33 | #' rpca(x) 34 | #' } 35 | #' 36 | #' @keywords SVD PCA 37 | #' @name rpca 38 | #' @rdname rpca 39 | #' @export 40 | rpca <- function(x, k=1, q=3, retx=TRUE, center=TRUE, scale=FALSE) 41 | { 42 | if (class(x) != "ddmatrix") 43 | x <- as.matrix(x) 44 | 45 | assert.type(retx, "logical") 46 | assert.type(center, "logical") 47 | assert.type(scale, "logical") 48 | 49 | # Check needs to be here so we don't do anything demanding before possibly needing to error out 50 | rsvd.checkargs(x=x, k=k, q=q, retu=FALSE, retvt=TRUE) 51 | 52 | assert.type(x, "numeric") 53 | 54 | if (center || scale) 55 | x <- scale(x, center=center, scale=scale) 56 | 57 | svd <- rsvd(x=x, k=k, q=q, retu=FALSE, retvt=TRUE) 58 | svd$d <- svd$d / sqrt(nrow(x) - 1L) 59 | 60 | if (center) 61 | center <- attr(x, "scaled:center")[1:k] 62 | if (scale) 63 | scale <- attr(x, "scaled:scale")[1:k] 64 | 65 | pca <- list(sdev=svd$d, rotation=t(svd$vt), center=center, scale=scale) 66 | 67 | if (is.matrix(x)) 68 | colnames(pca$rotation) <- paste0("PC", 1:ncol(pca$rotation)) 69 | # else #FIXME 70 | 71 | if (retx) 72 | pca$x <- x %*% pca$rotation 73 | class(pca) <- "prcomp" 74 | 75 | return(pca) 76 | } 77 | 78 | -------------------------------------------------------------------------------- /R/fld.r: -------------------------------------------------------------------------------- 1 | #' Fisher's Linear Discriminant 2 | #' 3 | #' Compute the 2-class Fisher's linear discriminant either in 4 | #' serial or parallel. 5 | #' 6 | #' @param x 7 | #' The data in the form of a matrix or ddmatrix. 8 | #' @param g 9 | #' The group variable in the form of a matrix/vector or a ddmatrix. 10 | #' The values should be 0 and 1 exclusively. 11 | #' 12 | #' @return 13 | #' A list of class 'fld' containing the prior probabilities, group means, 14 | #' w vector, and c scalar. In the distributed case, the priors and c scalar 15 | #' are both global, while the other values are distributed. 16 | #' 17 | #' @references 18 | #' Duda, R. O., Hart, P. E., & Stork, D. G. (2012). Pattern classification, 19 | #' chapter 5. John Wiley & Sons. 20 | #' 21 | #' @author 22 | #' Drew Schmidt 23 | #' 24 | #' @examples 25 | #' \dontrun{ 26 | #' x <- matrix(rnorm(30), 10) 27 | #' g <- sample(0:1, size=10, replace=TRUE) 28 | #' 29 | #' fld(x, g) 30 | #' } 31 | #' 32 | #' @name fld 33 | #' @rdname fld 34 | #' @export 35 | fld <- function(x, g) 36 | { 37 | if (!is.ddmatrix(x)) 38 | x <- as.matrix(x) 39 | 40 | # if (!all.sametype(x, g)) 41 | # comm.stop("arguments 'x' and 'g' must either both be of type 'matrix', or both of type 'ddmatrix'") 42 | 43 | n <- NROW(x) 44 | if (n != NROW(g)) 45 | comm.stop("argument 'g' must be the same length as 'x'") 46 | 47 | if (!comm.all(check_groupvar(g))) 48 | comm.stop("argument 'g' must be a vector of only 0's and 1's") 49 | 50 | 51 | ### Get group indices/priors 52 | if (is.ddmatrix(g)) 53 | g <- as.vector(g) ### FIXME 54 | 55 | ind0 <- which(submatrix(g) == 0) 56 | ind1 <- setdiff(1:n, ind0) 57 | 58 | prior0 <- length(ind0)/n 59 | prior1 <- length(ind1)/n 60 | 61 | ### Get group covariances and means 62 | x0 <- x[ind0, , drop=FALSE] 63 | x1 <- x[ind1, , drop=FALSE] 64 | 65 | cov0 <- cov(x0) 66 | cov1 <- cov(x1) 67 | 68 | mu0 <- colMeans(x0) 69 | mu1 <- colMeans(x1) 70 | 71 | ### fld 72 | mu_sum <- mu0 + mu1 73 | if (is.ddmatrix(mu_sum)) 74 | mu_sum <- t(mu_sum) 75 | 76 | w <- solve(cov0 + cov1, mu_sum) 77 | c <- as.vector(0.5 * crossprod(w, mu_sum)) 78 | 79 | ### wrangle return 80 | means <- list(mu0=mu0, mu1=mu1) 81 | prior <- c("0"=prior0, "1"=prior1) 82 | 83 | ret <- list(prior=prior, means=means, w=w, c=c) 84 | class(ret) <- "fld" 85 | 86 | return(ret) 87 | } 88 | 89 | 90 | 91 | #' @method print fld 92 | #' @export 93 | print.fld <- function(x, ...) 94 | { 95 | comm.cat("Prior probabilities of groups:\n", quiet=TRUE) 96 | comm.print(x$prior, quiet=TRUE) 97 | 98 | comm.cat("\nc =", x$c, "\n", quiet=TRUE) 99 | } 100 | -------------------------------------------------------------------------------- /R/rsvd.r: -------------------------------------------------------------------------------- 1 | rsvd.checkargs <- function(x, k, q, retu, retvt) 2 | { 3 | assert.type(retu, "logical") 4 | assert.type(retvt, "logical") 5 | 6 | assert.natnum(k) 7 | if (k > nrow(x)) 8 | comm.stop("'k' must be no greater than nrow(x)") 9 | 10 | assert.natnum(q) 11 | 12 | ### TODO check for NA, NaN, Inf 13 | 14 | invisible(TRUE) 15 | } 16 | 17 | 18 | 19 | #' Random SVD 20 | #' 21 | #' @param x 22 | #' The input data matrix. 23 | #' @param k 24 | #' The number of singular values and/or left/right singular vectors 25 | #' to estimate. 26 | #' @param q 27 | #' An integer exponent, say 1, 2, or 3. See the paper for details. 28 | #' @param retu 29 | #' Logical; should the left singular vectors ("U") be returned? 30 | #' @param retvt 31 | #' Logical; should the transposed right singular vectors ("VT") be returned? 32 | #' 33 | #' @return 34 | #' A list cotaining the singular values, and, if requested, the 35 | #' left and/or right singular vectors. 36 | #' 37 | #' @references 38 | #' Halko, Martinsson, and Tropp. 2011. Finding structure with 39 | #' randomness: probabilistic algorithms for constructing approximate 40 | #' matrix decompositions. SIAM Review 53 217-288. 41 | #' 42 | #' @author 43 | #' George Ostrouchov and Drew Schmidt 44 | #' 45 | #' @examples 46 | #' \dontrun{ 47 | #' x <- matrix(rnorm(30), 10) 48 | #' 49 | #' rsvd(x) 50 | #' } 51 | #' 52 | #' @keywords SVD PCA 53 | #' @name rsvd 54 | #' @rdname rsvd 55 | #' @export 56 | rsvd <- function(x, k=1, q=3, retu=TRUE, retvt=TRUE) 57 | { 58 | rsvd.checkargs(x=x, k=k, q=q, retu=retu, retvt=retvt) 59 | 60 | if (class(x) != "ddmatrix") 61 | x <- as.matrix(x) 62 | 63 | k <- as.integer(k) 64 | q <- as.integer(q) 65 | 66 | 67 | ### Stage A from the paper 68 | n <- ncol(x) 69 | 70 | if (class(x) == "matrix") 71 | Omega <- matrix(runif(n*2L*k), nrow=n, ncol=2L*k) 72 | else if (class(x) == "ddmatrix") 73 | Omega <- ddmatrix("runif", nrow=n, ncol=2L*k, bldim=x@bldim, ICTXT=x@ICTXT) 74 | 75 | Y <- x %*% Omega 76 | Q <- qr.Q(qr(Y)) 77 | 78 | for (i in 1:q) 79 | { 80 | Y <- crossprod(x, Q) 81 | Q <- qr.Q(qr(Y)) 82 | Y <- x %*% Q 83 | Q <- qr.Q(qr(Y)) 84 | } 85 | 86 | 87 | ### Stage B 88 | B <- crossprod(Q, x) 89 | 90 | if (!retu) 91 | nu <- 0 92 | else 93 | nu <- min(nrow(B), ncol(B)) 94 | 95 | if (!retvt) 96 | nv <- 0 97 | else 98 | nv <- min(nrow(B), ncol(B)) 99 | 100 | svd.B <- La.svd(x=B, nu=nu, nv=nv) 101 | 102 | d <- svd.B$d 103 | d <- d[1L:k] 104 | 105 | 106 | # Produce u/vt as desired 107 | if (retu) 108 | { 109 | u <- svd.B$u 110 | u <- Q %*% u 111 | 112 | u <- u[, 1L:k, drop=FALSE] 113 | } 114 | 115 | if (retvt) 116 | { 117 | vt <- svd.B$vt[1L:k, , drop=FALSE] 118 | } 119 | 120 | # wrangle return 121 | if (retu) 122 | { 123 | if (retvt) 124 | svd <- list(d=d, u=u, vt=vt) 125 | else 126 | svd <- list(d=d, u=u) 127 | } 128 | else 129 | { 130 | if (retvt) 131 | svd <- list(d=d, vt=vt) 132 | else 133 | svd <- list(d=d) 134 | } 135 | 136 | return( svd ) 137 | } 138 | -------------------------------------------------------------------------------- /R/robpca.r: -------------------------------------------------------------------------------- 1 | # Reference: "Robust Principal Component Analysis?" https://arxiv.org/pdf/0912.3599.pdf 2 | 3 | # sum(abs(X)) 4 | one_norm = function(X) 5 | { 6 | if (is.ddmatrix(X)) 7 | { 8 | ret = .Call(R_one_norm, X@Data) 9 | allreduce(ret) 10 | } 11 | else 12 | .Call(R_one_norm, X) 13 | } 14 | 15 | 16 | 17 | ## \mathcal{S} from the paper - sign(X) * pmax(abs(X) - tau, 0) 18 | shrink_op = function(X, tau) 19 | { 20 | # NOTE: this modifies the memory in place, which is potentially very dangerous 21 | if (is.ddmatrix(X)) 22 | .Call(R_shrink_op, X@Data, tau) 23 | else 24 | .Call(R_shrink_op, X, tau) 25 | } 26 | 27 | 28 | 29 | ## \mathcal{D} from the paper 30 | sv_thresh = function(X, tau) 31 | { 32 | decomp = La.svd(X) 33 | 34 | sigma = decomp$d 35 | shrink_op(sigma, tau) 36 | U = decomp$u 37 | Vt = decomp$vt 38 | 39 | U %*% (sigma * Vt) 40 | } 41 | 42 | 43 | 44 | #' robpca 45 | #' 46 | #' Implementation of the robust pca algorithm. 47 | #' 48 | #' @description 49 | #' The optimization problem is solved by an alternating directions technique. 50 | #' 51 | #' @param M 52 | #' The input data, stored as a numeric matrix or ddmatrix. 53 | #' @param delta 54 | #' Numeric termination criteria. A smaller (closer to 0) value will require more 55 | #' iterations. See the summary following the Algorithm 1 listing in the 56 | #' referenced paper for details. 57 | #' @param maxiter 58 | #' Maximum number of iterations. Should at least be a few hundred. 59 | #' 60 | #' @references 61 | #' Candes, E.J., Li, X., Ma, Y. and Wright, J., 2011. Robust principal component 62 | #' analysis?. Journal of the ACM (JACM), 58(3), p.11. 63 | #' 64 | #' @examples 65 | #' \dontrun{ 66 | #' m = 10 67 | #' n = 3 68 | #' M = matrix(rnorm(m*n), m) 69 | #' robsvd(M) 70 | #' } 71 | #' 72 | #' @author 73 | #' Drew Schmidt 74 | #' 75 | #' @export 76 | robpca = function(M, delta=1e-7, maxiter=1000) 77 | { 78 | ### I love dynamic typing 79 | assert.type(delta, "numeric") 80 | assert.posint(maxiter) 81 | 82 | if (class(M) != "ddmatrix") 83 | { 84 | M <- as.matrix(M) 85 | 86 | if (!is.double(M)) 87 | storage.mode(M) <- "double" 88 | } 89 | 90 | 91 | ### the actual work 92 | n1 = nrow(M) 93 | n2 = ncol(M) 94 | 95 | lambda = 1/sqrt(max(n1, n2)) 96 | 97 | mu = 0.25 * n1*n2 / one_norm(M) 98 | 99 | if (is.ddmatrix(M)) 100 | { 101 | ictxt = ICTXT(M) 102 | S = pbdDMAT::ddmatrix(0, n1, n2, ICTXT=ictxt) 103 | Y = pbdDMAT::ddmatrix(0, n1, n2, ICTXT=ictxt) 104 | } 105 | else 106 | { 107 | S = matrix(0, n1, n2) 108 | Y = matrix(0, n1, n2) 109 | } 110 | 111 | conv = FALSE 112 | iter = 0L 113 | 114 | ub = delta * norm(M, "F") 115 | 116 | while (!conv && iter < maxiter) 117 | { 118 | if (iter == 0) 119 | L = sv_thresh(M, 1/mu) 120 | else 121 | L = sv_thresh(M - S + Y, 1/mu) 122 | 123 | tmp = M - L 124 | S = tmp + Y 125 | shrink_op(S, lambda/mu) 126 | 127 | tmp = tmp - S 128 | Y = Y + tmp 129 | 130 | term = norm(tmp, "F") 131 | conv = (term <= ub) 132 | iter = iter + 1L 133 | } 134 | 135 | info = list(iterations=iter, converged=iter