├── .Rbuildignore ├── LICENSE ├── tests ├── testthat.R └── testthat │ ├── test-parital.R │ ├── test-count-basics.R │ ├── test-binary-basics.R │ ├── test-solveM-counts.R │ ├── test-multinomial-basics.R │ └── test-continuous-basics.R ├── .gitignore ├── man ├── generalizedPCA-package.Rd ├── log_like_Bernoulli.Rd ├── inv.logit.mat.Rd ├── exp_fam_deviance.Rd ├── plot.cv.gpca.Rd ├── project.Fantope.Rd ├── plot.efh.Rd ├── plot.gmf.Rd ├── plot.gpca.Rd ├── plot.cgpca.Rd ├── fitted.gpca.Rd ├── cv.gmf.Rd ├── cv.cgpca.Rd ├── predict.gpca.Rd ├── predict.efh.Rd ├── predict.cgpca.Rd ├── cv.gpca.Rd ├── predict.gmf.Rd ├── generalizedMF.Rd ├── exponential_family_harmonium.Rd ├── generalizedPCA.Rd └── convexGeneralizedPCA.Rd ├── R ├── generalizedPCA-package.R ├── exponential_family_functions.R ├── exponential_family_harmonium.R ├── generalizedMF.R ├── convexGeneralizedPCA.R └── generalizedPCA.R ├── generalizedPCA.Rproj ├── DESCRIPTION ├── NAMESPACE └── README.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2015 2 | COPYRIGHT HOLDER: Andrew J. 
Landgraf 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(generalizedPCA) 3 | 4 | test_check("generalizedPCA") 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | 5 | # files created by c 6 | src/*.o 7 | src/*.so 8 | src/*.dll 9 | 10 | # folders created when building the package 11 | src-i386 12 | src-x64 13 | -------------------------------------------------------------------------------- /man/generalizedPCA-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA-package.R 3 | \docType{package} 4 | \name{generalizedPCA-package} 5 | \alias{generalizedPCA-package} 6 | \title{generalizedPCA-package} 7 | \description{ 8 | Generalized dimensionality reduction by extending PCA 9 | } 10 | \author{ 11 | Andrew J. Landgraf 12 | } 13 | \keyword{package} 14 | 15 | -------------------------------------------------------------------------------- /R/generalizedPCA-package.R: -------------------------------------------------------------------------------- 1 | #' Generalized dimensionality reduction by extending PCA 2 | #' 3 | #' @name generalizedPCA-package 4 | #' @docType package 5 | #' @title generalizedPCA-package 6 | #' @author Andrew J. 
Landgraf 7 | #' @keywords package 8 | #' 9 | #' @importFrom stats fitted predict rnorm runif 10 | #' @importFrom logisticPCA inv.logit.mat log_like_Bernoulli 11 | NULL 12 | 13 | 14 | 15 | # @useDynLib generalizedPCA, inv_logit_mat, compute_loglik 16 | -------------------------------------------------------------------------------- /generalizedPCA.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /man/log_like_Bernoulli.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_functions.R 3 | \name{log_like_Bernoulli} 4 | \alias{log_like_Bernoulli} 5 | \title{Bernoulli Log Likelihood} 6 | \usage{ 7 | log_like_Bernoulli(x, theta, q) 8 | } 9 | \arguments{ 10 | \item{x}{matrix with all binary entries} 11 | 12 | \item{theta}{estimated natural parameters with 13 | same dimensions as x} 14 | 15 | \item{q}{instead of x, you can input matrix q which is 16 | -1 if \code{x = 0}, 1 if \code{x = 1}, and 0 if \code{is.na(x)}} 17 | } 18 | \description{ 19 | Calculate the Bernoulli log likelihood of matrix 20 | } 21 | 22 | -------------------------------------------------------------------------------- /man/inv.logit.mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/exponential_family_functions.R 3 | \name{inv.logit.mat} 4 | \alias{inv.logit.mat} 5 | \title{Inverse logit for matrices} 6 | \usage{ 7 | inv.logit.mat(x, min = 0, max = 1) 8 | } 9 | \arguments{ 10 | \item{x}{matrix} 11 | 12 | \item{min}{Lower end of logit interval} 13 | 14 | \item{max}{Upper end of logit interval} 15 | } 16 | \description{ 17 | Apply the inverse logit function to a matrix, element-wise. It 18 | generalizes the \code{inv.logit} function from the \code{gtools} 19 | library to matrices 20 | } 21 | \examples{ 22 | (mat = matrix(rnorm(10 * 5), nrow = 10, ncol = 5)) 23 | inv.logit.mat(mat) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/exp_fam_deviance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_functions.R 3 | \name{exp_fam_deviance} 4 | \alias{exp_fam_deviance} 5 | \title{Calculate exponential family deviance} 6 | \usage{ 7 | exp_fam_deviance(x, theta, family, weights = 1) 8 | } 9 | \arguments{ 10 | \item{x}{a vector or matrix of data} 11 | 12 | \item{theta}{natural parameters. Must be same dimensions as \code{x}} 13 | 14 | \item{family}{exponential family distribution of data} 15 | 16 | \item{weights}{weights of datapoints. Must be a scalar or the same dimensions as the data} 17 | } 18 | \description{ 19 | Calculates the deviance of data assuming it comes from an exponential family 20 | } 21 | 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: generalizedPCA 2 | Type: Package 3 | Title: Generalized PCA 4 | Version: 0.3 5 | Date: 2018-02-24 6 | NeedsCompilation: yes 7 | ByteCompile: yes 8 | Author: Andrew J. Landgraf 9 | Maintainer: Andrew J. 
Landgraf 10 | Description: Generalized dimensionality reduction by extending PCA to exponential 11 | family data. This package includes three methods: Generalized PCA by Landgraf 12 | and Lee (2015), Exponential Family PCA (Generalized MF) by Collins et al. (2001), 13 | and Exponential Family Harmoniums by Welling et al. (2005). 14 | License: MIT + file LICENSE 15 | LazyData: TRUE 16 | URL: https://github.com/andland/generalizedPCA 17 | Imports: 18 | ggplot2, 19 | logisticPCA, 20 | RSpectra (>= 0.10-0) 21 | Suggests: 22 | testthat (>= 0.11.0) 23 | RoxygenNote: 5.0.1 24 | -------------------------------------------------------------------------------- /man/plot.cv.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{plot.cv.gpca} 4 | \alias{plot.cv.gpca} 5 | \title{Plot CV for generalized PCA} 6 | \usage{ 7 | \method{plot}{cv.gpca}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{a \code{cv.gpca} object} 11 | 12 | \item{...}{Additional arguments} 13 | } 14 | \description{ 15 | Plot cross validation results generalized PCA 16 | } 17 | \examples{ 18 | # construct a low rank matrix in the natural parameter space 19 | rows = 100 20 | cols = 10 21 | set.seed(1) 22 | mat_np = outer(rnorm(rows), rnorm(cols)) 23 | 24 | # generate a count matrix 25 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 26 | 27 | \dontrun{ 28 | loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson") 29 | plot(loglikes) 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /man/project.Fantope.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{project.Fantope} 4 | \alias{project.Fantope} 5 | \title{Project onto the Fantope} 6 | \usage{ 
7 | project.Fantope(x, k, partial_decomp = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{a symmetric matrix} 11 | 12 | \item{k}{the rank of the Fantope desired} 13 | 14 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 15 | to more quickly calculate the eigen-decomposition. When the number of columns is small, 16 | the approximation may be less accurate and slower} 17 | } 18 | \value{ 19 | \item{H}{a rank \code{k} Fantope matrix} 20 | \item{U}{a \code{k}-dimentional orthonormal matrix with the first \code{k} eigenvectors of \code{H}} 21 | } 22 | \description{ 23 | Project a symmetric matrix onto the convex set of the rank k Fantope 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/plot.efh.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_harmonium.R 3 | \name{plot.efh} 4 | \alias{plot.efh} 5 | \title{Plot exponential family harmonium} 6 | \usage{ 7 | \method{plot}{efh}(x, type = c("trace"), ...) 
8 | } 9 | \arguments{ 10 | \item{x}{EFH object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration} 14 | 15 | \item{...}{Additional arguments} 16 | } 17 | \description{ 18 | Plots the results of a EFH 19 | } 20 | \examples{ 21 | # construct a low rank matrix in the logit scale 22 | rows = 100 23 | cols = 10 24 | set.seed(1) 25 | mat_logit = outer(rnorm(rows), rnorm(cols)) 26 | 27 | # generate a binary matrix 28 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 29 | 30 | # run logistic SVD on it 31 | efh = exponential_family_harmonium(mat, k = 2, family = "binomial", family_hidden = "binomial") 32 | 33 | \dontrun{ 34 | plot(efh) 35 | } 36 | } 37 | 38 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(fitted,gpca) 4 | S3method(plot,cgpca) 5 | S3method(plot,cv.gpca) 6 | S3method(plot,efh) 7 | S3method(plot,gmf) 8 | S3method(plot,gpca) 9 | S3method(predict,cgpca) 10 | S3method(predict,efh) 11 | S3method(predict,gmf) 12 | S3method(predict,gpca) 13 | S3method(print,cgpca) 14 | S3method(print,efh) 15 | S3method(print,gmf) 16 | S3method(print,gpca) 17 | export(convexGeneralizedPCA) 18 | export(cv.cgpca) 19 | export(cv.gmf) 20 | export(cv.gpca) 21 | export(exp_fam_deviance) 22 | export(exponential_family_harmonium) 23 | export(generalizedMF) 24 | export(generalizedPCA) 25 | export(inv.logit.mat) 26 | export(project.Fantope) 27 | importFrom(RSpectra,svds) 28 | importFrom(logisticPCA,inv.logit.mat) 29 | importFrom(logisticPCA,log_like_Bernoulli) 30 | importFrom(stats,fitted) 31 | importFrom(stats,predict) 32 | importFrom(stats,rbinom) 33 | importFrom(stats,rnorm) 34 | importFrom(stats,rpois) 35 | importFrom(stats,runif) 36 | importFrom(stats,var) 37 | importFrom(utils,type.convert) 38 | 
-------------------------------------------------------------------------------- /man/plot.gmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{plot.gmf} 4 | \alias{plot.gmf} 5 | \title{Plot generalized MF} 6 | \usage{ 7 | \method{plot}{gmf}(x, type = c("trace", "loadings", "scores"), ...) 8 | } 9 | \arguments{ 10 | \item{x}{generalized MF object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration, \code{type = "loadings"} plots the first 2 principal component 14 | loadings, \code{type = "scores"} plots the loadings first 2 principal component scores} 15 | 16 | \item{...}{Additional arguments} 17 | } 18 | \description{ 19 | Plots the results of a generalized MF 20 | } 21 | \examples{ 22 | # construct a low rank matrix in the logit scale 23 | rows = 100 24 | cols = 10 25 | set.seed(1) 26 | mat_logit = outer(rnorm(rows), rnorm(cols)) 27 | 28 | # generate a binary matrix 29 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 30 | 31 | # run logistic SVD on it 32 | gmf = generalizedMF(mat, k = 2, family = "binomial") 33 | 34 | \dontrun{ 35 | plot(gmf) 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/plot.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{plot.gpca} 4 | \alias{plot.gpca} 5 | \title{Plot generalized PCA} 6 | \usage{ 7 | \method{plot}{gpca}(x, type = c("trace", "loadings", "scores"), ...) 
8 | } 9 | \arguments{ 10 | \item{x}{generalized PCA object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration, \code{type = "loadings"} plots the first 2 principal component 14 | loadings, \code{type = "scores"} plots the loadings first 2 principal component scores} 15 | 16 | \item{...}{Additional arguments} 17 | } 18 | \description{ 19 | Plots the results of a generalized PCA 20 | } 21 | \examples{ 22 | # construct a low rank matrix in the natural parameter space 23 | rows = 100 24 | cols = 10 25 | set.seed(1) 26 | mat_np = outer(rnorm(rows), rnorm(cols)) 27 | 28 | # generate a count matrix 29 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 30 | 31 | # run logistic PCA on it 32 | gpca = generalizedPCA(mat, k = 2, M = 4, family = "poisson") 33 | 34 | \dontrun{ 35 | plot(gpca) 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/plot.cgpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{plot.cgpca} 4 | \alias{plot.cgpca} 5 | \title{Plot convex generalized PCA} 6 | \usage{ 7 | \method{plot}{cgpca}(x, type = c("trace", "loadings", "scores"), ...) 
8 | } 9 | \arguments{ 10 | \item{x}{convex generalized PCA object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration, \code{type = "loadings"} plots the first 2 PC loadings, 14 | \code{type = "scores"} plots the first 2 PC scores} 15 | 16 | \item{...}{Additional arguments} 17 | } 18 | \description{ 19 | Plots the results of a convex generalized PCA 20 | } 21 | \examples{ 22 | # construct a low rank matrix in the logit scale 23 | rows = 100 24 | cols = 10 25 | set.seed(1) 26 | mat_logit = outer(rnorm(rows), rnorm(cols)) 27 | 28 | # generate a binary matrix 29 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 30 | 31 | # run convex generalized PCA on it 32 | cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial") 33 | 34 | \dontrun{ 35 | plot(cgpca) 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/fitted.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{fitted.gpca} 4 | \alias{fitted.gpca} 5 | \title{Fitted values using generalized PCA} 6 | \usage{ 7 | \method{fitted}{gpca}(object, type = c("link", "response"), ...) 8 | } 9 | \arguments{ 10 | \item{object}{generalized PCA object} 11 | 12 | \item{type}{the type of fitting required. 
\code{type = "link"} gives output on the natural 13 | parameter scale and \code{type = "response"} gives output on the response scale} 14 | 15 | \item{...}{Additional arguments} 16 | } 17 | \description{ 18 | Fit a lower dimentional representation of the exponential family matrix using generalized PCA 19 | } 20 | \examples{ 21 | # construct a low rank matrix in the natural parameter space 22 | rows = 100 23 | cols = 10 24 | set.seed(1) 25 | mat_np = outer(rnorm(rows), rnorm(cols)) 26 | 27 | # generate a count matrix 28 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 29 | 30 | # run Poisson PCA on it 31 | gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 32 | 33 | # construct fitted expected value of counts matrix 34 | fit = fitted(gpca, type = "response") 35 | } 36 | 37 | -------------------------------------------------------------------------------- /man/cv.gmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{cv.gmf} 4 | \alias{cv.gmf} 5 | \title{CV for generalized MF} 6 | \usage{ 7 | cv.gmf(x, ks, family = c("gaussian", "binomial", "poisson", "multinomial"), 8 | folds = 5, quiet = TRUE, ...) 9 | } 10 | \arguments{ 11 | \item{x}{matrix of either binary, count, or continuous data} 12 | 13 | \item{ks}{the different dimensions \code{k} to try} 14 | 15 | \item{family}{exponential family distribution of data} 16 | 17 | \item{folds}{if \code{folds} is a scalar, then it is the number of folds. 
If 18 | it is a vector, it should be the same length as the number of rows in \code{x}} 19 | 20 | \item{quiet}{logical; whether the function should display progress} 21 | 22 | \item{...}{Additional arguments passed to generalizedMF} 23 | } 24 | \value{ 25 | A matrix of the CV deviance with \code{k} in rows 26 | } 27 | \description{ 28 | Run cross validation on dimension for generalized MF 29 | } 30 | \examples{ 31 | # construct a low rank matrix in the logit scale 32 | rows = 100 33 | cols = 10 34 | set.seed(1) 35 | mat_logit = outer(rnorm(rows), rnorm(cols)) 36 | 37 | # generate a binary matrix 38 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 39 | 40 | \dontrun{ 41 | deviances = cv.gmf(mat, ks = 1:9, family = "binomial") 42 | plot(deviances) 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /man/cv.cgpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{cv.cgpca} 4 | \alias{cv.cgpca} 5 | \title{CV for convex generalized PCA} 6 | \usage{ 7 | cv.cgpca(x, ks, Ms = seq(2, 10, by = 2), folds = 5, quiet = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{x}{matrix with all binary entries} 11 | 12 | \item{ks}{the different dimensions \code{k} to try} 13 | 14 | \item{Ms}{the different approximations to the saturated model \code{M} to try} 15 | 16 | \item{folds}{if \code{folds} is a scalar, then it is the number of folds. 
If 17 | it is a vector, it should be the same length as the number of rows in \code{x}} 18 | 19 | \item{quiet}{logical; whether the function should display progress} 20 | 21 | \item{...}{Additional arguments passed to convexGeneralizedPCA} 22 | } 23 | \value{ 24 | A matrix of the CV log likelihood with \code{k} in rows and 25 | \code{M} in columns 26 | } 27 | \description{ 28 | Run cross validation on dimension and \code{M} for convex generalized PCA 29 | } 30 | \examples{ 31 | # construct a low rank matrix in the logit scale 32 | rows = 100 33 | cols = 10 34 | set.seed(1) 35 | mat_logit = outer(rnorm(rows), rnorm(cols)) 36 | 37 | # generate a binary matrix 38 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 39 | 40 | \dontrun{ 41 | loglikes = cv.cgpca(mat, ks = 1:9, Ms = 3:6) 42 | plot(loglikes) 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /man/predict.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{predict.gpca} 4 | \alias{predict.gpca} 5 | \title{Predict generalized PCA scores or reconstruction on new data} 6 | \usage{ 7 | \method{predict}{gpca}(object, newdata, type = c("PCs", "link", "response"), 8 | ...) 9 | } 10 | \arguments{ 11 | \item{object}{generalized PCA object} 12 | 13 | \item{newdata}{matrix of the same exponential family as in \code{object}. 14 | If missing, will use the data that \code{object} was fit on} 15 | 16 | \item{type}{the type of fitting required. 
\code{type = "PCs"} gives the PC scores, 17 | \code{type = "link"} gives matrix on the natural parameter scale and 18 | \code{type = "response"} gives matrix on the response scale} 19 | 20 | \item{...}{Additional arguments} 21 | } 22 | \description{ 23 | Predict generalized PCA scores or reconstruction on new data 24 | } 25 | \examples{ 26 | # construct a low rank matrices in the natural parameter space 27 | rows = 100 28 | cols = 10 29 | set.seed(1) 30 | loadings = rnorm(cols) 31 | mat_np = outer(rnorm(rows), rnorm(cols)) 32 | mat_np_new = outer(rnorm(rows), loadings) 33 | 34 | # generate a count matrices 35 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 36 | mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols) 37 | 38 | # run Poisson PCA on it 39 | gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 40 | 41 | PCs = predict(gpca, mat_new) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/predict.efh.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_harmonium.R 3 | \name{predict.efh} 4 | \alias{predict.efh} 5 | \title{Predict exponential family harmonium reconstruction on new data} 6 | \usage{ 7 | \method{predict}{efh}(object, newdata, type = c("hidden", "link", "response"), 8 | ...) 9 | } 10 | \arguments{ 11 | \item{object}{EFH object} 12 | 13 | \item{newdata}{matrix of the same exponential family as covariates in \code{object}.} 14 | 15 | \item{type}{the type of fitting required. 
16 | \code{type = "hidden"} gives matrix of hidden mean parameters of \code{x}, 17 | \code{type = "link"} gives a matrix on the natural parameter scale, and 18 | \code{type = "response"} gives a matrix on the response scale} 19 | 20 | \item{...}{Additional arguments} 21 | } 22 | \description{ 23 | Predict exponential family harmonium reconstruction on new data 24 | } 25 | \examples{ 26 | # construct a low rank matrices in the natural parameter space 27 | rows = 100 28 | cols = 10 29 | set.seed(1) 30 | loadings = rnorm(cols) 31 | mat_np = outer(rnorm(rows), rnorm(cols)) 32 | mat_np_new = outer(rnorm(rows), loadings) 33 | 34 | # generate a count matrices 35 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 36 | mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols) 37 | 38 | modp = exponential_family_harmonium(mat, k = 9, family = "poisson", max_iters = 1000) 39 | 40 | pred = predict(modp, mat_new, type = "response") 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/predict.cgpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{predict.cgpca} 4 | \alias{predict.cgpca} 5 | \title{Predict Convex Generalized PCA scores or reconstruction on new data} 6 | \usage{ 7 | \method{predict}{cgpca}(object, newdata, type = c("PCs", "link", "response"), 8 | ...) 9 | } 10 | \arguments{ 11 | \item{object}{convex generalized PCA object} 12 | 13 | \item{newdata}{matrix with all binary entries. If missing, will use the 14 | data that \code{object} was fit on} 15 | 16 | \item{type}{the type of fitting required. 
\code{type = "PCs"} gives the PC scores, 17 | \code{type = "link"} gives matrix on the logit scale and \code{type = "response"} 18 | gives matrix on the probability scale} 19 | 20 | \item{...}{Additional arguments} 21 | } 22 | \description{ 23 | Predict Convex Generalized PCA scores or reconstruction on new data 24 | } 25 | \examples{ 26 | # construct a low rank matrices in the logit scale 27 | rows = 100 28 | cols = 10 29 | set.seed(1) 30 | loadings = rnorm(cols) 31 | mat_logit = outer(rnorm(rows), loadings) 32 | mat_logit_new = outer(rnorm(rows), loadings) 33 | 34 | # convert to a binary matrix 35 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 36 | mat_new = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit_new)) * 1.0 37 | 38 | # run generalized PCA on it 39 | cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial") 40 | 41 | PCs = predict(cgpca, mat_new) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/cv.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{cv.gpca} 4 | \alias{cv.gpca} 5 | \title{CV for generalized PCA} 6 | \usage{ 7 | cv.gpca(x, ks, Ms = seq(2, 10, by = 2), family = c("gaussian", "binomial", 8 | "poisson", "multinomial"), weights, folds = 5, quiet = TRUE, ...) 9 | } 10 | \arguments{ 11 | \item{x}{matrix of either binary, count, or continuous data} 12 | 13 | \item{ks}{the different dimensions \code{k} to try} 14 | 15 | \item{Ms}{the different approximations to the saturated model \code{M} to try} 16 | 17 | \item{family}{exponential family distribution of data} 18 | 19 | \item{weights}{an optional matrix of the same size as the \code{x} with data weights} 20 | 21 | \item{folds}{if \code{folds} is a scalar, then it is the number of folds. 
If 22 | it is a vector, it should be the same length as the number of rows in \code{x}} 23 | 24 | \item{quiet}{logical; whether the function should display progress} 25 | 26 | \item{...}{Additional arguments passed to \code{generalizedPCA}} 27 | } 28 | \value{ 29 | A matrix of the CV log likelihood with \code{k} in rows and 30 | \code{M} in columns 31 | } 32 | \description{ 33 | Run cross validation on dimension and \code{M} for generalized PCA 34 | } 35 | \examples{ 36 | # construct a low rank matrix in the natural parameter space 37 | rows = 100 38 | cols = 10 39 | set.seed(1) 40 | mat_np = outer(rnorm(rows), rnorm(cols)) 41 | 42 | # generate a count matrix 43 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 44 | 45 | \dontrun{ 46 | loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson", quiet = FALSE) 47 | plot(loglikes) 48 | } 49 | } 50 | 51 | -------------------------------------------------------------------------------- /tests/testthat/test-parital.R: -------------------------------------------------------------------------------- 1 | context("Parital Decomp") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), rnorm(cols)) 9 | 10 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_np)) * 1.0 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE, partial_decomp = FALSE) 13 | gmf = generalizedMF(mat, k = k, family = "binomial", main_effects = TRUE, partial_decomp = FALSE, method = "svd") 14 | 15 | ppca = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE, partial_decomp = FALSE) 16 | pmf = generalizedMF(mat, k = k, family = "poisson", main_effects = TRUE, partial_decomp = FALSE, method = "svd") 17 | 18 | gpca_part = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE, partial_decomp = TRUE) 19 | gmf_part = generalizedMF(mat, k = k, 
family = "binomial", main_effects = TRUE, partial_decomp = TRUE, method = "svd") 20 | 21 | ppca_part = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE, partial_decomp = TRUE) 22 | pmf_part = generalizedMF(mat, k = k, family = "poisson", main_effects = TRUE, partial_decomp = TRUE, method = "svd") 23 | 24 | test_that("partial_decomp = full decomp", { 25 | expect_equal(gpca$iters, gpca_part$iters) 26 | expect_equal(gmf$iters, gmf_part$iters) 27 | expect_equal(ppca$iters, ppca_part$iters) 28 | expect_equal(pmf$iters, pmf_part$iters) 29 | 30 | expect_equal(gpca$loss_trace, gpca_part$loss_trace) 31 | expect_equal(gmf$loss_trace, gmf_part$loss_trace) 32 | expect_equal(ppca$loss_trace, ppca_part$loss_trace) 33 | expect_equal(pmf$loss_trace, pmf_part$loss_trace) 34 | }) 35 | -------------------------------------------------------------------------------- /man/predict.gmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{predict.gmf} 4 | \alias{predict.gmf} 5 | \title{Predict generalized PCA scores or reconstruction on new data} 6 | \usage{ 7 | \method{predict}{gmf}(object, newdata, type = c("PCs", "link", "response"), 8 | quiet = TRUE, max_iters = 1000, conv_criteria = 1e-05, start_A, ...) 9 | } 10 | \arguments{ 11 | \item{object}{generalized MF object} 12 | 13 | \item{newdata}{matrix of the same exponential family as covariates in \code{object}. 14 | If missing, will use the data that \code{object} was fit on} 15 | 16 | \item{type}{the type of fitting required. 
17 | \code{type = "PCs"} gives matrix of principal components of \code{x}, 18 | \code{type = "link"} gives a matrix on the natural parameter scale, and 19 | \code{type = "response"} gives a matrix on the response scale} 20 | 21 | \item{quiet}{logical; whether the calculation should show progress} 22 | 23 | \item{max_iters}{maximum number of iterations} 24 | 25 | \item{conv_criteria}{convergence criteria} 26 | 27 | \item{start_A}{initial value for \code{A}} 28 | 29 | \item{...}{Additional arguments} 30 | } 31 | \description{ 32 | Predict generalized PCA scores or reconstruction on new data 33 | } 34 | \examples{ 35 | # construct a low rank matrices in the natural parameter space 36 | rows = 100 37 | cols = 10 38 | set.seed(1) 39 | loadings = rnorm(cols) 40 | mat_np = outer(rnorm(rows), rnorm(cols)) 41 | mat_np_new = outer(rnorm(rows), loadings) 42 | 43 | # generate a count matrices 44 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 45 | mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols) 46 | 47 | # run Poisson PCA on it 48 | gmf = generalizedMF(mat, k = 1, family = "poisson") 49 | 50 | A = predict(gmf, mat_new) 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /tests/testthat/test-count-basics.R: -------------------------------------------------------------------------------- 1 | context("Counts") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), rnorm(cols)) 9 | 10 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE) 13 | gmf = generalizedMF(mat, k = k, family = "poisson", method = "als") 14 | 15 | pred1 = predict(gpca, mat) 16 | pred1l = predict(gpca, mat, type = "link") 17 | pred1r = predict(gpca, mat, type = "response") 18 | fit1l = fitted(gpca, type = "link") 19 | fit1r = 
fitted(gpca, type = "response") 20 | 21 | test_that("correct classes", { 22 | expect_is(gpca, "gpca") 23 | 24 | expect_is(pred1, "matrix") 25 | expect_is(pred1l, "matrix") 26 | expect_is(pred1r, "matrix") 27 | expect_is(fit1l, "matrix") 28 | expect_is(fit1r, "matrix") 29 | 30 | }) 31 | 32 | test_that("k = 1 dimensions", { 33 | expect_equal(dim(gpca$U), c(cols, 1)) 34 | expect_equal(dim(gpca$PCs), c(rows, 1)) 35 | expect_equal(length(gpca$mu), cols) 36 | 37 | expect_equal(dim(pred1), c(rows, 1)) 38 | expect_equal(dim(pred1l), c(rows, cols)) 39 | expect_equal(dim(pred1r), c(rows, cols)) 40 | expect_equal(dim(fit1l), c(rows, cols)) 41 | expect_equal(dim(fit1r), c(rows, cols)) 42 | }) 43 | 44 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 45 | 46 | k = 2 47 | gpca = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE) 48 | 49 | pred1 = predict(gpca, mat) 50 | pred1l = predict(gpca, mat, type = "link") 51 | pred1r = predict(gpca, mat, type = "response") 52 | fit1l = fitted(gpca, type = "link") 53 | fit1r = fitted(gpca, type = "response") 54 | 55 | test_that("k = 2 dimensions", { 56 | expect_equal(dim(gpca$U), c(cols, 2)) 57 | expect_equal(dim(gpca$PCs), c(rows, 2)) 58 | expect_equal(length(gpca$mu), cols) 59 | 60 | expect_equal(dim(pred1), c(rows, 2)) 61 | expect_equal(dim(pred1l), c(rows, cols)) 62 | expect_equal(dim(pred1r), c(rows, cols)) 63 | expect_equal(dim(fit1l), c(rows, cols)) 64 | expect_equal(dim(fit1r), c(rows, cols)) 65 | }) 66 | 67 | test_that("response between non-negative", { 68 | expect_gte(min(pred1r), 0) 69 | expect_gte(min(fit1r), 0) 70 | }) 71 | -------------------------------------------------------------------------------- /tests/testthat/test-binary-basics.R: -------------------------------------------------------------------------------- 1 | context("Binary") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), 
rnorm(cols)) 9 | 10 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_np)) * 1.0 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE) 13 | 14 | pred1 = predict(gpca, mat) 15 | pred1l = predict(gpca, mat, type = "link") 16 | pred1r = predict(gpca, mat, type = "response") 17 | fit1l = fitted(gpca, type = "link") 18 | fit1r = fitted(gpca, type = "response") 19 | 20 | test_that("correct classes", { 21 | expect_is(gpca, "gpca") 22 | 23 | expect_is(pred1, "matrix") 24 | expect_is(pred1l, "matrix") 25 | expect_is(pred1r, "matrix") 26 | expect_is(fit1l, "matrix") 27 | expect_is(fit1r, "matrix") 28 | 29 | }) 30 | 31 | test_that("k = 1 dimensions", { 32 | expect_equal(dim(gpca$U), c(cols, 1)) 33 | expect_equal(dim(gpca$PCs), c(rows, 1)) 34 | expect_equal(length(gpca$mu), cols) 35 | 36 | expect_equal(dim(pred1), c(rows, 1)) 37 | expect_equal(dim(pred1l), c(rows, cols)) 38 | expect_equal(dim(pred1r), c(rows, cols)) 39 | expect_equal(dim(fit1l), c(rows, cols)) 40 | expect_equal(dim(fit1r), c(rows, cols)) 41 | }) 42 | 43 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 44 | 45 | k = 2 46 | gpca = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE) 47 | 48 | pred1 = predict(gpca, mat) 49 | pred1l = predict(gpca, mat, type = "link") 50 | pred1r = predict(gpca, mat, type = "response") 51 | fit1l = fitted(gpca, type = "link") 52 | fit1r = fitted(gpca, type = "response") 53 | 54 | test_that("k = 2 dimensions", { 55 | expect_equal(dim(gpca$U), c(cols, 2)) 56 | expect_equal(dim(gpca$PCs), c(rows, 2)) 57 | expect_equal(length(gpca$mu), cols) 58 | 59 | expect_equal(dim(pred1), c(rows, 2)) 60 | expect_equal(dim(pred1l), c(rows, cols)) 61 | expect_equal(dim(pred1r), c(rows, cols)) 62 | expect_equal(dim(fit1l), c(rows, cols)) 63 | expect_equal(dim(fit1r), c(rows, cols)) 64 | }) 65 | 66 | test_that("response between 0 and 1", { 67 | expect_gte(min(pred1r), 0) 68 | expect_gte(min(fit1r), 0) 69 | 70 | 
expect_lte(max(pred1r), 1) 71 | expect_lte(max(fit1r), 1) 72 | }) 73 | -------------------------------------------------------------------------------- /tests/testthat/test-solveM-counts.R: -------------------------------------------------------------------------------- 1 | context("Solve M") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 2 7 | set.seed(1) 8 | 9 | loadings = rnorm(cols) 10 | mat_np1 = outer(rnorm(rows), loadings) 11 | mat_np2 = outer(rnorm(rows), loadings) 12 | 13 | mat = matrix(rpois(rows * cols, c(exp(mat_np1))), rows, cols) 14 | mat_test = matrix(rpois(rows * cols, c(exp(mat_np1))), rows, cols) 15 | 16 | gpca = generalizedPCA(mat, k = k, M = 0, family = "poisson", main_effects = TRUE) 17 | gpca_val = generalizedPCA(mat, k = k, M = 0, family = "poisson", main_effects = TRUE, validation = mat_test) 18 | 19 | pred1 = predict(gpca, mat) 20 | pred1l = predict(gpca, mat, type = "link") 21 | pred1r = predict(gpca, mat, type = "response") 22 | fit1l = fitted(gpca, type = "link") 23 | fit1r = fitted(gpca, type = "response") 24 | 25 | test_that("k = 2 poisson", { 26 | expect_equal(dim(gpca$U), c(cols, 2)) 27 | expect_equal(dim(gpca$PCs), c(rows, 2)) 28 | expect_equal(length(gpca$mu), cols) 29 | 30 | expect_equal(dim(pred1), c(rows, 2)) 31 | expect_equal(dim(pred1l), c(rows, cols)) 32 | expect_equal(dim(pred1r), c(rows, cols)) 33 | expect_equal(dim(fit1l), c(rows, cols)) 34 | expect_equal(dim(fit1r), c(rows, cols)) 35 | }) 36 | 37 | rm(gpca, gpca_val) 38 | 39 | tot = rowSums(mat) + rbinom(rows, 1, 0.5) 40 | tot_test = rowSums(mat_test) + rbinom(rows, 1, 0.5) 41 | matp = sweep(mat, 1, tot, "/") 42 | matp_test = sweep(mat_test, 1, tot_test, "/") 43 | wghts = outer(tot, rep(1, cols)) 44 | wghts_test = outer(tot_test, rep(1, cols)) 45 | 46 | gpca = generalizedPCA(matp, k = k, M = 0, family = "multinomial", weights = wghts) 47 | gpca_val = generalizedPCA(matp, k = k, M = 0, family = "multinomial", 
weights = wghts, 48 | validation = matp_test, val_weights = wghts_test) 49 | 50 | pred1 = predict(gpca, matp) 51 | pred1l = predict(gpca, matp, type = "link") 52 | pred1r = predict(gpca, matp, type = "response") 53 | fit1l = fitted(gpca, type = "link") 54 | fit1r = fitted(gpca, type = "response") 55 | 56 | test_that("k = 2 multinomial", { 57 | expect_equal(dim(gpca$U), c(cols, 2)) 58 | expect_equal(dim(gpca$PCs), c(rows, 2)) 59 | expect_equal(length(gpca$mu), cols) 60 | 61 | expect_equal(dim(pred1), c(rows, 2)) 62 | expect_equal(dim(pred1l), c(rows, cols)) 63 | expect_equal(dim(pred1r), c(rows, cols)) 64 | expect_equal(dim(fit1l), c(rows, cols)) 65 | expect_equal(dim(fit1r), c(rows, cols)) 66 | }) 67 | -------------------------------------------------------------------------------- /tests/testthat/test-multinomial-basics.R: -------------------------------------------------------------------------------- 1 | context("Multinomial") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | 9 | mat = model.matrix(~ factor(sample(1:(cols + 1), rows, replace = TRUE)) - 1)[, 1:cols] 10 | colnames(mat) <- NULL 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "multinomial", main_effects = TRUE) 13 | 14 | pred1 = predict(gpca, mat) 15 | pred1l = predict(gpca, mat, type = "link") 16 | pred1r = predict(gpca, mat, type = "response") 17 | fit1l = fitted(gpca, type = "link") 18 | fit1r = fitted(gpca, type = "response") 19 | 20 | test_that("correct classes", { 21 | expect_is(gpca, "gpca") 22 | 23 | expect_is(pred1, "matrix") 24 | expect_is(pred1l, "matrix") 25 | expect_is(pred1r, "matrix") 26 | expect_is(fit1l, "matrix") 27 | expect_is(fit1r, "matrix") 28 | 29 | }) 30 | 31 | test_that("k = 1 dimensions", { 32 | expect_equal(dim(gpca$U), c(cols, 1)) 33 | expect_equal(dim(gpca$PCs), c(rows, 1)) 34 | expect_equal(length(gpca$mu), cols) 35 | 36 | expect_equal(dim(pred1), c(rows, 1)) 37 | 
expect_equal(dim(pred1l), c(rows, cols)) 38 | expect_equal(dim(pred1r), c(rows, cols)) 39 | expect_equal(dim(fit1l), c(rows, cols)) 40 | expect_equal(dim(fit1r), c(rows, cols)) 41 | }) 42 | 43 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 44 | 45 | k = 2 46 | gpca = generalizedPCA(mat, k = k, M = 4, family = "multinomial", main_effects = TRUE) 47 | 48 | pred1 = predict(gpca, mat) 49 | pred1l = predict(gpca, mat, type = "link") 50 | pred1r = predict(gpca, mat, type = "response") 51 | fit1l = fitted(gpca, type = "link") 52 | fit1r = fitted(gpca, type = "response") 53 | 54 | test_that("k = 2 dimensions", { 55 | expect_equal(dim(gpca$U), c(cols, 2)) 56 | expect_equal(dim(gpca$PCs), c(rows, 2)) 57 | expect_equal(length(gpca$mu), cols) 58 | 59 | expect_equal(dim(pred1), c(rows, 2)) 60 | expect_equal(dim(pred1l), c(rows, cols)) 61 | expect_equal(dim(pred1r), c(rows, cols)) 62 | expect_equal(dim(fit1l), c(rows, cols)) 63 | expect_equal(dim(fit1r), c(rows, cols)) 64 | }) 65 | 66 | test_that("response between 0 and 1", { 67 | expect_gte(min(pred1r), 0) 68 | expect_gte(min(fit1r), 0) 69 | 70 | expect_lte(max(pred1r), 1) 71 | expect_lte(max(fit1r), 1) 72 | }) 73 | 74 | test_that("fitted row sums = 1", { 75 | expect_gte(min(rowSums(pred1r)), 0) 76 | expect_gte(min(rowSums(fit1r)), 0) 77 | 78 | expect_lte(max(rowSums(pred1r)), 1) 79 | expect_lte(max(rowSums(fit1r)), 1) 80 | }) 81 | -------------------------------------------------------------------------------- /tests/testthat/test-continuous-basics.R: -------------------------------------------------------------------------------- 1 | context("Continuous") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), rnorm(cols)) 9 | 10 | mat = matrix(rnorm(rows * cols, c(mat_np)), rows, cols) 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "gaussian", main_effects = TRUE) 13 | 14 | pca_mu = colMeans(mat) 15 | pca = 
svd(scale(mat, center = pca_mu, scale = FALSE)) 16 | gpca2 = gpca 17 | gpca$mu = pca_mu 18 | gpca$U = matrix(pca$v[, 1:k], cols, k) 19 | 20 | 21 | pred1 = predict(gpca, mat) 22 | pred1l = predict(gpca, mat, type = "link") 23 | pred1r = predict(gpca, mat, type = "response") 24 | fit1l = fitted(gpca, type = "link") 25 | fit1r = fitted(gpca, type = "response") 26 | 27 | pred2l = predict(gpca2, mat, type = "link") 28 | pred2r = predict(gpca2, mat, type = "response") 29 | 30 | test_that("correct classes", { 31 | expect_is(gpca, "gpca") 32 | 33 | expect_is(pred1, "matrix") 34 | expect_is(pred1l, "matrix") 35 | expect_is(pred1r, "matrix") 36 | expect_is(fit1l, "matrix") 37 | expect_is(fit1r, "matrix") 38 | }) 39 | 40 | test_that("k = 1 dimensions", { 41 | expect_equal(dim(gpca$U), c(cols, 1)) 42 | expect_equal(dim(gpca$PCs), c(rows, 1)) 43 | expect_equal(length(gpca$mu), cols) 44 | 45 | expect_equal(dim(pred1), c(rows, 1)) 46 | expect_equal(dim(pred1l), c(rows, cols)) 47 | expect_equal(dim(pred1r), c(rows, cols)) 48 | expect_equal(dim(fit1l), c(rows, cols)) 49 | expect_equal(dim(fit1r), c(rows, cols)) 50 | }) 51 | 52 | test_that("k = 1 same fits as standard PCA", { 53 | expect_equal(pred1r, pred2r) 54 | expect_equal(pred1l, pred2l) 55 | # Don't compare fitted, because they rely on $PCs 56 | expect_equal(gpca$mu, pca_mu) 57 | expect_equal(pca$v[, 1], gpca$U[, 1]) 58 | expect_equal(pca$u[, 1] * pca$d[1], gpca$PCs[, 1]) 59 | }) 60 | 61 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 62 | 63 | k = 2 64 | gpca = generalizedPCA(mat, k = k, M = 4, family = "gaussian", main_effects = TRUE) 65 | 66 | pred1 = predict(gpca, mat) 67 | pred1l = predict(gpca, mat, type = "link") 68 | pred1r = predict(gpca, mat, type = "response") 69 | fit1l = fitted(gpca, type = "link") 70 | fit1r = fitted(gpca, type = "response") 71 | 72 | test_that("k = 2 dimensions", { 73 | expect_equal(dim(gpca$U), c(cols, 2)) 74 | expect_equal(dim(gpca$PCs), c(rows, 2)) 75 | expect_equal(length(gpca$mu), cols) 76 
| 77 | expect_equal(dim(pred1), c(rows, 2)) 78 | expect_equal(dim(pred1l), c(rows, cols)) 79 | expect_equal(dim(pred1r), c(rows, cols)) 80 | expect_equal(dim(fit1l), c(rows, cols)) 81 | expect_equal(dim(fit1r), c(rows, cols)) 82 | }) 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Generalized PCA 2 | 3 | `generalizedPCA` is an R package which extends principal component analysis to other types of data, much like [generalized linear models](http://en.wikipedia.org/wiki/Generalized_linear_model) extends linear regression. The package [logisticPCA](https://github.com/andland/logisticPCA) contains the extension to binary data, among other methods, and this package intends to generalize it to all exponential family distributions. Please note that it is still in the very early stages of development and the conventions will possibly change in the future. 4 | 5 | ## Installation 6 | 7 | To install R, visit [r-project.org/](http://www.r-project.org/). 8 | 9 | To install the package, first install `devtools` from CRAN. Then run the following commands. 10 | ```R 11 | # install.packages("devtools") 12 | library("devtools") 13 | install_github("andland/generalizedPCA") 14 | ``` 15 | 16 | ## Use 17 | The main function is `generalizedPCA()`. Like in generalized linear models, you must specify the distribution of your data. `generalizedPCA()` currently supports `"gaussian"`, `"binomial"`, `"poisson"`, or `"multinomial"` data. Unlike standard PCA, it can incorporate weights and missing data. If your data are proportions, you can use `family = "binomial"` with `weights` being a matrix of the number of opportunities. If your data is a multinomial variable with `d` levels, the input matrix should have `d - 1` columns. 18 | 19 | The function returns `mu`, the variable main effects vector of length `d`, and `U`, the `d x k` loadings matrix. 
20 | 21 | ## Details 22 | `generalizedPCA()` estimates the natural parameters of an exponential family distribution in a lower dimensional space. This is done by projecting the natural parameters from the saturated model. A rank-`k` projection matrix, or equivalently a `d x k` orthogonal matrix, is solved for to minimize the deviance. 23 | 24 | For some distributions, the natural parameters from the saturated model are either negative or positive infinity, and an additional tuning parameter `M` is needed to approximate them. This occurs when `family = "binomial"` and your data include `0`'s or `1`'s or when `family = "poisson"` and your data include `0`'s. You can use `cv.gpca()` to select `M` by cross validation. Typical values are in the range of `3` to `10`. 25 | 26 | A manuscript describing generalized PCA applied to binary data can be found [here](http://www.stat.osu.edu/~yklee/mss/tr890.pdf). 27 | 28 | ## Methods 29 | The generalizedPCA class, `gpca`, has several methods to make data analysis easier. 30 | 31 | * `print()`: Prints a summary of the fitted model. 32 | * `fitted()`: Fits the low dimensional matrix of either natural parameters or response. 33 | * `predict()`: Predicts the PCs on new data. Can also predict the low dimensional matrix of natural parameters or response on new data. 34 | * `plot()`: Plots either the deviance trace by the number of iterations, the first two PC loadings, or the first two PC scores using the package `ggplot2`. 35 | 36 | In addition, there are functions for performing cross-validation. 37 | 38 | * `cv.gpca()`: Run cross-validation over the rows of the matrix to assess the fit of `M` and/or `k`. 39 | * `plot.cv()`: Plots the results of the `cv.gpca()` function using the package `ggplot2`. 
40 | -------------------------------------------------------------------------------- /man/generalizedMF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{generalizedMF} 4 | \alias{generalizedMF} 5 | \title{Exponential Family Matrix Factorization} 6 | \usage{ 7 | generalizedMF(x, k = 2, family = c("gaussian", "binomial", "poisson"), 8 | weights, quiet = TRUE, max_iters = 1000, conv_criteria = 1e-05, 9 | partial_decomp = FALSE, random_start = FALSE, start_A, start_B, mu, 10 | main_effects = TRUE, method = c("als", "svd")) 11 | } 12 | \arguments{ 13 | \item{x}{matrix of either binary, proportions, count, or continuous data} 14 | 15 | \item{k}{dimension} 16 | 17 | \item{family}{exponential family distribution of data} 18 | 19 | \item{weights}{an optional matrix of the same size as the \code{x} with data weights} 20 | 21 | \item{quiet}{logical; whether the calculation should give feedback} 22 | 23 | \item{max_iters}{maximum number of iterations} 24 | 25 | \item{conv_criteria}{convergence criteria} 26 | 27 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 28 | to more quickly calculate the SVD. When the number of columns is small, 29 | the approximation may be less accurate and slower} 30 | 31 | \item{random_start}{whether to randomly initialize \code{A} and \code{B}} 32 | 33 | \item{start_A}{initial value for \code{A}} 34 | 35 | \item{start_B}{initial value for \code{B}} 36 | 37 | \item{mu}{specific value for \code{mu}, the mean vector of \code{x}} 38 | 39 | \item{main_effects}{logical; whether to include main effects in the model} 40 | 41 | \item{method}{which algorithm to use. \code{"als"} uses alternating least squares. 42 | It has the benefit of majozing row-wise and column-wise for each of the updates. 43 | \code{"svd"} uses singular value decomposition (similar to de Leeuw, 2006). 
It leads to 44 | a more general majorization, which may not work well for heterogeneous matrices.} 45 | } 46 | \value{ 47 | An S3 object of class \code{gmf} which is a list with the 48 | following components: 49 | \item{mu}{the main effects for dimensionality reduction} 50 | \item{A}{the \code{n}x\code{k}-dimensional matrix with the scores} 51 | \item{B}{the \code{d}x\code{k}-dimensional matrix with the loadings} 52 | \item{family}{the exponential family of the data} 53 | \item{iters}{number of iterations required for convergence} 54 | \item{loss_trace}{the trace of the average deviance of the algorithm. 55 | Should be non-increasing} 56 | \item{prop_deviance_expl}{the proportion of deviance explained by this model. 57 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 58 | the null model estimates 0 for all natural parameters.} 59 | } 60 | \description{ 61 | Collins et al. (2001)'s Exponential Family PCA 62 | } 63 | \examples{ 64 | rows = 100 65 | cols = 10 66 | set.seed(1) 67 | mat_np = outer(rnorm(rows), rnorm(cols)) 68 | 69 | # generate a count matrix and binary response 70 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 71 | 72 | mod = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE) 73 | 74 | } 75 | \references{ 76 | de Leeuw, Jan, 2006. Principal component analysis of binary data 77 | by iterated singular value decomposition. Computational Statistics & Data Analysis 78 | 50 (1), 21--39. 79 | 80 | Collins, M., Dasgupta, S., & Schapire, R. E., 2001. A generalization of principal 81 | components analysis to the exponential family. In NIPS, 617--624.
82 | } 83 | 84 | -------------------------------------------------------------------------------- /man/exponential_family_harmonium.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_harmonium.R 3 | \name{exponential_family_harmonium} 4 | \alias{exponential_family_harmonium} 5 | \title{Exponential Family Harmoniums} 6 | \usage{ 7 | exponential_family_harmonium(x, k = 2, family = c("gaussian", "binomial", 8 | "poisson"), family_hidden = c("gaussian", "binomial", "poisson"), 9 | cd_iters = 1, learning_rate = 0.001, max_iters = 100, 10 | rms_prop = FALSE, quiet = TRUE, random_start = TRUE, start_W, mu, 11 | main_effects = TRUE) 12 | } 13 | \arguments{ 14 | \item{x}{matrix of either binary, count, or continuous data} 15 | 16 | \item{k}{dimension (number of hidden units)} 17 | 18 | \item{family}{exponential family distribution of data} 19 | 20 | \item{family_hidden}{exponential family distribution of hidden units} 21 | 22 | \item{cd_iters}{number of iterations for contrastive divergence (CD) at each iteration. 23 | It should be a vector of two integers, which is the range of CD interations that the 24 | algorithm will perform from the beginning until the end, linearly interpolated} 25 | 26 | \item{learning_rate}{learning rate used for gradient descent} 27 | 28 | \item{max_iters}{maximum number of iterations} 29 | 30 | \item{rms_prop}{logical; whether to use RMS prop for optimization. 
Default is \code{FALSE}.} 31 | 32 | \item{quiet}{logical; whether the calculation should give feedback} 33 | 34 | \item{random_start}{whether to randomly initialize \code{W}} 35 | 36 | \item{start_W}{initial value for \code{W}} 37 | 38 | \item{mu}{specific value for \code{mu}, the mean (bias) vector of \code{x}} 39 | 40 | \item{main_effects}{logical; whether to include main effects (bias terms) in the model} 41 | } 42 | \value{ 43 | An S3 object of class \code{efh} which is a list with the 44 | following components: 45 | \item{mu}{the main effects (bias terms) for dimensionality reduction} 46 | \item{hidden_bias}{the bias for the hidden units (currently hard coded to 0)} 47 | \item{W}{the \code{d}x\code{k}-dimentional matrix with the loadings} 48 | \item{family}{the exponential family of the data} 49 | \item{family_hidden}{the exponential family of the hidden units} 50 | \item{iters}{number of iterations required for convergence} 51 | \item{loss_trace}{the trace of the average deviance of the algorithm. 52 | Should be non-increasing} 53 | \item{prop_deviance_expl}{the proportion of deviance explained by this model. 54 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 55 | the null model estimates 0 for all natural parameters.} 56 | } 57 | \description{ 58 | Welling et al. (2005)'s Exponential Family Harmoniums 59 | } 60 | \examples{ 61 | rows = 100 62 | cols = 10 63 | set.seed(1) 64 | mat_np = outer(rnorm(rows), rnorm(cols)) 65 | 66 | # generate a count matrix and binary response 67 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 68 | mat[1, 1] <- NA 69 | 70 | modp = exponential_family_harmonium(mat, k = 2, family = "poisson", quiet = FALSE, 71 | learning_rate = 0.001, rms_prop = FALSE, max_iters = 250) 72 | gmf = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE) 73 | 74 | 75 | } 76 | \references{ 77 | Welling, Max, Michal Rosen-Zvi, and Geoffrey E. Hinton. 
"Exponential family 78 | harmoniums with an application to information retrieval." Advances in neural 79 | information processing systems. 2005. 80 | } 81 | 82 | -------------------------------------------------------------------------------- /man/generalizedPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{generalizedPCA} 4 | \alias{generalizedPCA} 5 | \title{Generalized Principal Component Analysis} 6 | \usage{ 7 | generalizedPCA(x, k = 2, M = 4, family = c("gaussian", "binomial", 8 | "poisson", "multinomial"), weights, quiet = TRUE, majorizer = c("row", 9 | "all"), partial_decomp = FALSE, max_iters = 1000, conv_criteria = 1e-05, 10 | random_start = FALSE, start_U, start_mu, main_effects = TRUE, 11 | normalize = FALSE, validation, val_weights) 12 | } 13 | \arguments{ 14 | \item{x}{matrix of either binary, proportions, count, or continuous data} 15 | 16 | \item{k}{number of principal components to return} 17 | 18 | \item{M}{value to approximate the saturated model} 19 | 20 | \item{family}{exponential family distribution of data} 21 | 22 | \item{weights}{an optional matrix of the same size as the \code{x} with data weights} 23 | 24 | \item{quiet}{logical; whether the calculation should give feedback} 25 | 26 | \item{majorizer}{how to majorize the deviance. \code{"row"} gives 27 | tighter majorization, but may take longer to calculate each iteration. 28 | \code{"all"} may be faster per iteration, but take more iterations} 29 | 30 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 31 | to more quickly calculate the SVD. When the number of columns is small, 32 | the approximation may be less accurate and slower} 33 | 34 | \item{max_iters}{number of maximum iterations} 35 | 36 | \item{conv_criteria}{convergence criteria. 
The difference between average deviance 37 | in successive iterations} 38 | 39 | \item{random_start}{logical; whether to randomly initialize the parameters. If \code{FALSE}, 40 | function will use an eigen-decomposition as starting value} 41 | 42 | \item{start_U}{starting value for the orthogonal matrix} 43 | 44 | \item{start_mu}{starting value for mu. Only used if \code{main_effects = TRUE}} 45 | 46 | \item{main_effects}{logical; whether to include main effects in the model} 47 | 48 | \item{normalize}{logical; whether to weight the variables so they all have equal influence} 49 | 50 | \item{validation}{a validation dataset to select \code{M} with} 51 | 52 | \item{val_weights}{weights associated with validation data} 53 | } 54 | \value{ 55 | An S3 object of class \code{gpca} which is a list with the 56 | following components: 57 | \item{mu}{the main effects} 58 | \item{U}{a \code{k}-dimensional orthonormal matrix with the loadings} 59 | \item{PCs}{the principal component scores} 60 | \item{M}{the parameter inputted} 61 | \item{family}{the exponential family used} 62 | \item{iters}{number of iterations required for convergence} 63 | \item{loss_trace}{the trace of the average deviance of the algorithm. 64 | Should be non-increasing} 65 | \item{prop_deviance_expl}{the proportion of deviance explained by this model.
66 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 67 | the null model estimates 0 for all natural parameters.} 68 | } 69 | \description{ 70 | Dimension reduction for exponential family data by extending Pearson's 71 | PCA formulation 72 | } 73 | \examples{ 74 | # construct a low rank matrix in the natural parameter space 75 | rows = 100 76 | cols = 10 77 | set.seed(1) 78 | mat_np = outer(rnorm(rows), rnorm(cols)) 79 | 80 | # generate a count matrix 81 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 82 | 83 | # run Poisson PCA on it 84 | gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 85 | } 86 | 87 | -------------------------------------------------------------------------------- /man/convexGeneralizedPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{convexGeneralizedPCA} 4 | \alias{convexGeneralizedPCA} 5 | \title{Convex Generalized Principal Component Analysis} 6 | \usage{ 7 | convexGeneralizedPCA(x, k = 2, M = 4, family = c("gaussian", "binomial", 8 | "poisson", "multinomial"), weights, quiet = TRUE, partial_decomp = FALSE, 9 | max_iters = 1000, conv_criteria = 1e-06, random_start = FALSE, start_H, 10 | mu, main_effects = TRUE, normalize = FALSE, ss_factor = 1) 11 | } 12 | \arguments{ 13 | \item{x}{matrix of either binary, proportions, count, or continuous data} 14 | 15 | \item{k}{number of principal components to return} 16 | 17 | \item{M}{value to approximate the saturated model} 18 | 19 | \item{family}{exponential family distribution of data} 20 | 21 | \item{weights}{an optional matrix of the same size as the \code{x} with non-negative weights} 22 | 23 | \item{quiet}{logical; whether the calculation should give feedback} 24 | 25 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 26 | to more quickly 
calculate the eigen-decomposition. When the number of columns is small, 27 | the approximation may be less accurate and slower} 28 | 29 | \item{max_iters}{number of maximum iterations} 30 | 31 | \item{conv_criteria}{convergence criteria. The difference between average deviance 32 | in successive iterations} 33 | 34 | \item{random_start}{logical; whether to randomly initialize the parameters. If \code{FALSE}, 35 | function will use an eigen-decomposition as starting value} 36 | 37 | \item{start_H}{starting value for the Fantope matrix} 38 | 39 | \item{mu}{main effects vector. Only used if \code{main_effects = TRUE}} 40 | 41 | \item{main_effects}{logical; whether to include main effects in the model} 42 | 43 | \item{normalize}{logical; whether to weight the variables so they all have equal influence} 44 | 45 | \item{ss_factor}{step size multiplier. Amount by which to multiply the step size. Quadratic 46 | convergence rate can be proven for \code{ss_factor = 1}, but I have found higher values 47 | sometimes work better. The default is \code{ss_factor = 1}. 48 | If it is not converging quickly enough, try a higher value such as \code{ss_factor = 4}.} 49 | } 50 | \value{ 51 | An S3 object of class \code{cgpca} which is a list with the 52 | following components: 53 | \item{mu}{the main effects} 54 | \item{H}{a rank \code{k} Fantope matrix} 55 | \item{U}{a \code{ceiling(k)}-dimensional orthonormal matrix with the loadings} 56 | \item{PCs}{the principal component scores} 57 | \item{M}{the parameter inputted} 58 | \item{iters}{number of iterations required for convergence} 59 | \item{loss_trace}{the trace of the average deviance using the Fantope matrix} 60 | \item{proj_loss_trace}{the trace of the average deviance using the projection matrix} 61 | \item{prop_deviance_expl}{the proportion of deviance explained by this model.
62 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 63 | the null model estimates 0 for all natural parameters.} 64 | } 65 | \description{ 66 | Dimensionality reduction for exponential family data by extending Pearson's 67 | PCA formulation to minimize deviance. The convex relaxation 68 | to projection matrices, the Fantope, is used. 69 | } 70 | \examples{ 71 | # construct a low rank matrix in the logit scale 72 | rows = 100 73 | cols = 10 74 | set.seed(1) 75 | mat_logit = outer(rnorm(rows), rnorm(cols)) 76 | 77 | # generate a binary matrix 78 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 79 | 80 | # run convex generalized PCA on it 81 | cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial") 82 | } 83 | 84 | -------------------------------------------------------------------------------- /R/exponential_family_functions.R: -------------------------------------------------------------------------------- 1 | #' @title Inverse logit for matrices 2 | #' 3 | #' @description 4 | #' Apply the inverse logit function to a matrix, element-wise. 
It 5 | #' generalizes the \code{inv.logit} function from the \code{gtools} 6 | #' library to matrices 7 | #' 8 | #' @param x matrix 9 | #' @param min Lower end of logit interval 10 | #' @param max Upper end of logit interval 11 | #' @examples 12 | #' (mat = matrix(rnorm(10 * 5), nrow = 10, ncol = 5)) 13 | #' inv.logit.mat(mat) 14 | #' @export 15 | inv.logit.mat <- function(x, min = 0, max = 1) { 16 | # .Call("inv_logit_mat", x, min, max, PACKAGE = "generalizedPCA") 17 | logisticPCA::inv.logit.mat(x, min, max) 18 | } 19 | 20 | #' @title Bernoulli Log Likelihood 21 | #' 22 | #' @description 23 | #' Calculate the Bernoulli log likelihood of matrix 24 | #' 25 | #' @param x matrix with all binary entries 26 | #' @param theta estimated natural parameters with 27 | #' same dimensions as x 28 | #' @param q instead of x, you can input matrix q which is 29 | #' -1 if \code{x = 0}, 1 if \code{x = 1}, and 0 if \code{is.na(x)} 30 | # @export 31 | log_like_Bernoulli <- function(x, theta, q) { 32 | logisticPCA::log_like_Bernoulli(x, theta, q) 33 | } 34 | 35 | # Validate that the data in x is consistent with the requested exponential family; 35 | # stops on impossible data, messages on likely family mismatches. 35 | check_family <- function(x, family) { 36 | distinct_vals = unique(c(x[!is.na(x)])) 37 | if (family %in% c("binomial", "multinomial") && 38 | any(distinct_vals < 0 | distinct_vals > 1)) { 39 | stop(paste0(family, " family with data outside [0, 1]")) 40 | } 41 | if (family == "poisson" && any(distinct_vals < 0)) { # && is scalar-only; any() must wrap the vectorized comparison 42 | stop("Negative data with poisson family") 43 | } 44 | if (family == "multinomial" && any(rowSums(x) > 1)) { 45 | stop(paste0(family, " family must have row sums <= 1")) 46 | } 47 | if (all(distinct_vals %in% c(0, 1))) { 48 | if (!(family %in% c("binomial", "multinomial"))) { 49 | message("All entries are binary. Are you sure you didn't mean binomial?") 50 | } 51 | } else if (all(distinct_vals >= 0 & distinct_vals %% 1 == 0)) { 52 | if (family != "poisson") { 53 | message("All entries are counts. 
Are you sure you didn't mean poisson?") 54 | } 55 | } 56 | if (any(distinct_vals %% 1 != 0) & family == "poisson") { 57 | message("Non-integer data with poisson data. Did you want family = gaussian?") 58 | } 59 | } 60 | 61 | exp_fam_guess <- function(x) { 62 | distinct_vals = unique(c(x[!is.na(x)])) 63 | if (all(distinct_vals %in% c(0, 1))) { 64 | family = "binomial" 65 | message("Guessing data comes from a binomial distribution because it consists of all 0's and 1's") 66 | } else if (all(distinct_vals >= 0 & distinct_vals %% 1 == 0)) { 67 | family = "poisson" 68 | message("Guessing data comes from a poisson distribution because it consists of all non-negative integers") 69 | } else if (all(distinct_vals >=0 & distinct_vals <= 1) & all(rowSums(x, na.rm = TRUE) <= 1)) { 70 | family = "multinomial" 71 | message("Guessing data comes from a multinomial distribution because all the rows sum to less than 1") 72 | } else { 73 | family = "gaussian" 74 | } 75 | return(family) 76 | } 77 | 78 | saturated_natural_parameters <- function(x, family, M) { 79 | if (family == "gaussian") { 80 | eta = x 81 | } else if (family == "binomial") { 82 | eta = abs(M) * (2 * x - 1) 83 | non_binary = (x != 0 & x != 1 & !is.na(x)) 84 | if (sum(non_binary) > 0) { 85 | logitvals = log(x) - log(1 - x) 86 | eta[non_binary] = logitvals[non_binary] 87 | } 88 | } else if (family == "poisson") { 89 | eta = log(x) 90 | eta[x==0] = -abs(M) 91 | } else if (family == "multinomial") { 92 | # TODO: check this! Correct if 0/1s.
93 | eta = abs(M) * (2 * x - 1) 94 | non_binary = (x != 0 & x != 1 & !is.na(x)) 95 | if (sum(non_binary) > 0) { 96 | # TODO: give warning first time if any(rowSums(x) == 1) 97 | # TODO: doesn't deal with NAs, 98 | # although typically a whole categorical variable would be NA 99 | 100 | # last_cat_prob = ifelse(rowSums(x) > 1 - as.numeric(inv.logit.mat(-as.numeric(M))), 101 | # as.numeric(inv.logit.mat(-as.numeric(M))), 1 - rowSums(x)) 102 | # logitvals = sweep(log(x), 1, log(last_cat_prob), "-") 103 | 104 | # Below is in line with what is in tech report 105 | last_cat_prob = 1 - rowSums(x) 106 | last_cat_prob[last_cat_prob < exp(-as.numeric(M))] = exp(-as.numeric(M)) 107 | logitvals = sweep(log(x), 1, log(last_cat_prob), "-") 108 | 109 | eta[non_binary] = logitvals[non_binary] 110 | } 111 | } 112 | eta[is.na(x)] <- 0 113 | return(eta) 114 | } 115 | 116 | # @export 117 | exp_fam_mean <- function(theta, family) { 118 | if (family == "gaussian") { 119 | mean_mat = theta 120 | } else if (family == "binomial") { 121 | mean_mat = inv.logit.mat(theta) 122 | } else if (family == "poisson") { 123 | mean_mat = exp(theta) 124 | } else if (family == "multinomial") { 125 | exp_theta = exp(theta) 126 | mean_mat = sweep(exp_theta, 1, 1 + rowSums(exp_theta), "/") 127 | } 128 | return(mean_mat) 129 | } 130 | 131 | exp_fam_variance <- function(theta, family, weights = 1.0) { 132 | if (family == "gaussian") { 133 | var_mat = matrix(1, nrow(theta), ncol(theta)) * weights 134 | } else if (family %in% c("binomial", "multinomial")) { 135 | mean_mat = exp_fam_mean(theta, family) 136 | var_mat = mean_mat * (1 - mean_mat) * weights 137 | } else if (family == "poisson") { 138 | var_mat = exp(theta) * weights 139 | } 140 | return(var_mat) 141 | } 142 | 143 | # @export 144 | exp_fam_log_like <- function(x, theta, family, weights = 1.0) { 145 | if (family == "gaussian") { 146 | return(-0.5 * sum(weights * (x - theta)^2, na.rm = TRUE)) 147 | } else if (family == "binomial") { 148 | 
return(sum((weights * log(inv.logit.mat((2 * x - 1) * theta))), na.rm = TRUE)) 149 | # the below does not work with x == 1 and theta > 750 150 | # return(sum(weights * (x * theta - log(1 + exp(theta))), na.rm = TRUE)) 151 | } else if (family == "poisson") { 152 | return(sum(weights * (x * theta - exp(theta) - lfactorial(x)), na.rm = TRUE)) 153 | } else if (family == "multinomial") { 154 | if (length(weights) > 1 && any(apply(weights, 1, stats::var) > 0)) { 155 | stop("weights should be the same for every variable withing each row") 156 | } 157 | return(sum(weights * (x * theta) - weights / ncol(x) * 158 | outer(log(1 + rowSums(exp(theta))), rep(1, ncol(x))), na.rm = TRUE)) 159 | } 160 | } 161 | 162 | #' Calculate exponential family deviance 163 | #' 164 | #' Calculates the deviance of data assuming it comes from an exponential family 165 | #' 166 | #' @param x a vector or matrix of data 167 | #' @param theta natural parameters. Must be same dimensions as \code{x} 168 | #' @param family exponential family distribution of data 169 | #' @param weights weights of datapoints. 
Must be a scalar or the same dimensions as the data 170 | #' 171 | #' @export 172 | exp_fam_deviance <- function(x, theta, family, weights = 1.0) { 173 | eta_sat_nat = saturated_natural_parameters(x, family, M = Inf) 174 | sat_loglike = exp_fam_log_like(x, eta_sat_nat, family, weights) 175 | 176 | -2 * (exp_fam_log_like(x, theta, family, weights) - sat_loglike) 177 | } 178 | 179 | # for solving for m 180 | exp_fam_sat_ind_mat <- function(x, family) { 181 | if (family == "gaussian") { 182 | return(matrix(0, nrow(x), ncol(x))) 183 | } else if (family == "binomial") { 184 | # set 1's to 1, 0's to -1, and everything else to 0 185 | return((x == 1) - (x == 0)) 186 | } else if (family == "poisson") { 187 | return((x == 0) * -1.0) 188 | } else if (family == "multinomial") { 189 | # first see if last cat == 0 and x between 0 and 1 190 | last_cat_prob_zero = rowSums(x) > (1 - 1e-3) 191 | q = outer(last_cat_prob_zero, rep(TRUE, ncol(x))) & x > 0 & x < 1 192 | # then add in exact 0's and 1's 193 | q = q + (x == 1) - (x == 0) 194 | } 195 | } 196 | 197 | # for exponential family harmoniums 198 | exp_fam_sufficient_stat <- function(x, family) { 199 | if (family %in% c("gaussian", "binomial", "poisson")) { 200 | return(x) 201 | } 202 | } 203 | 204 | # for exponential family harmoniums 205 | #' @importFrom stats rbinom rpois rnorm 206 | exp_fam_sample <- function(theta, family) { 207 | mean_mat = exp_fam_mean(theta = theta, family = family) 208 | 209 | if (family == "gaussian") { 210 | return(matrix(rnorm(prod(dim(theta)), mean = c(mean_mat), sd = 1), nrow(theta), ncol(theta))) 211 | } else if (family == "binomial") { 212 | return(matrix(rbinom(prod(dim(theta)), size = 1, prob = c(mean_mat)), nrow(theta), ncol(theta))) 213 | } else if (family == "poisson") { 214 | return(matrix(rpois(prod(dim(theta)), lambda = c(mean_mat)), nrow(theta), ncol(theta))) 215 | } 216 | } 217 | -------------------------------------------------------------------------------- 
# -----------------------------------------------------------------------------
# R/exponential_family_harmonium.R
# -----------------------------------------------------------------------------

#' Exponential Family Harmoniums
#'
#' @description Welling et al. (2005)'s Exponential Family Harmoniums
#'
#' @param x matrix of either binary, count, or continuous data
#' @param k dimension (number of hidden units)
#' @param family exponential family distribution of data
#' @param family_hidden exponential family distribution of hidden units
#' @param cd_iters number of iterations for contrastive divergence (CD) at each iteration.
#'   It should be a vector of two integers, which is the range of CD iterations that the
#'   algorithm will perform from the beginning until the end, linearly interpolated
#' @param max_iters maximum number of iterations
#' @param learning_rate learning rate used for gradient descent
#' @param rms_prop logical; whether to use RMS prop for optimization. Default is \code{FALSE}.
#' @param quiet logical; whether the calculation should give feedback
#' @param random_start whether to randomly initialize \code{W}
#' @param start_W initial value for \code{W}
#' @param mu specific value for \code{mu}, the mean (bias) vector of \code{x}
#' @param main_effects logical; whether to include main effects (bias terms) in the model
#'
#' @return An S3 object of class \code{efh} which is a list with the
#' following components:
#' \item{mu}{the main effects (bias terms) for dimensionality reduction}
#' \item{hidden_bias}{the bias for the hidden units (currently hard coded to 0)}
#' \item{W}{the \code{d}x\code{k}-dimensional matrix with the loadings}
#' \item{family}{the exponential family of the data}
#' \item{family_hidden}{the exponential family of the hidden units}
#' \item{iters}{number of iterations required for convergence}
#' \item{loss_trace}{the trace of the average deviance of the algorithm.
#'   Should be non-increasing}
#' \item{prop_deviance_expl}{the proportion of deviance explained by this model.
#'   If \code{main_effects = TRUE}, the null model is just the main effects, otherwise
#'   the null model estimates 0 for all natural parameters.}
#'
#' @export
#'
#' @references
#' Welling, Max, Michal Rosen-Zvi, and Geoffrey E. Hinton. "Exponential family
#' harmoniums with an application to information retrieval." Advances in neural
#' information processing systems. 2005.
#'
#' @examples
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix and binary response
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat[1, 1] <- NA
#'
#' modp = exponential_family_harmonium(mat, k = 2, family = "poisson", quiet = FALSE,
#'                                     learning_rate = 0.001, rms_prop = FALSE, max_iters = 250)
#' gmf = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE)
#'
#'
exponential_family_harmonium <- function(x, k = 2,
                                         family = c("gaussian", "binomial", "poisson"),
                                         family_hidden = c("gaussian", "binomial", "poisson"),
                                         cd_iters = 1, learning_rate = 0.001, max_iters = 100, rms_prop = FALSE,
                                         quiet = TRUE, random_start = TRUE, start_W, mu, main_effects = TRUE) {
  family = match.arg(family)
  family_hidden = match.arg(family_hidden)
  check_family(x, family)

  # cd_iters can be a scalar, a (low, high) range linearly interpolated over
  # the iterations, or a full per-iteration schedule
  stopifnot(cd_iters > 0.5, length(cd_iters) %in% c(1, 2, max_iters))
  if (length(cd_iters) == 1) {
    cd_iters = rep(round(cd_iters), max_iters)
  } else if (length(cd_iters) == 2) {
    cd_iters = round(cd_iters)
    cd_iters = pmax(min(cd_iters),
                    pmin(max(cd_iters),
                         round(seq(cd_iters[1] - 0.5, cd_iters[2] + 0.5, length.out = max_iters))))
  } else {
    cd_iters = round(cd_iters)
  }

  x = as.matrix(x)
  missing_mat = is.na(x)
  sum_weights = sum(!missing_mat)

  # missing values are ignored, which is equivalent to setting them to 0
  x_imputed = x
  x_imputed[missing_mat] <- 0
  # scaled to take the avg with different number of missing values
  x_imputed_scaled = scale(x_imputed, FALSE, colSums(!missing_mat))

  n = nrow(x)
  d = ncol(x)
  ones = rep(1, n)

  # calculate the null log likelihood for % deviance explained and normalization
  if (main_effects) {
    weighted_col_means = colMeans(x, na.rm = TRUE)
    null_theta = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, 1), family, M = Inf))
  } else {
    null_theta = rep(0, d)
  }
  null_deviance = exp_fam_deviance(x, outer(ones, null_theta), family) / sum_weights

  # Initialize #
  ##############
  if (main_effects) {
    if (missing(mu)) {
      mu = saturated_natural_parameters(weighted_col_means, family, Inf)
    } else {
      mu = as.numeric(mu)
      stopifnot(length(mu) == d)
    }
  } else {
    mu = rep(0, d)
  }

  # hard code to 0 for now
  hidden_bias = rep(0, k)

  if (!missing(start_W)) {
    stopifnot(dim(start_W) == c(d, k))
    W = as.matrix(start_W)
  } else if (random_start) {
    # small random init; glorot_uniform was also considered:
    # W = matrix(runif(d * k, -sqrt(6 / (k + d)), sqrt(6 / (k + d))), d, k)
    W = matrix(rnorm(d * k, 0, 0.001), d, k)
  } else {
    udv = svd(scale(saturated_natural_parameters(x_imputed, family, 4), TRUE, FALSE))
    W = udv$v[, 1:k, drop = FALSE]
  }

  loss_trace = numeric(max_iters + 1)
  theta = outer(ones, mu) + exp_fam_mean(outer(ones, hidden_bias) + x_imputed %*% W, family_hidden) %*% t(W)
  loss_trace[1] = exp_fam_deviance(x, theta, family) / sum_weights

  ptm <- proc.time()

  if (!quiet) {
    cat(0, " ", loss_trace[1], "")
    cat("0 hours elapsed\n")
  }

  # squared-gradient accumulator for RMSProp; stays at 1 when rms_prop = FALSE
  # so the update reduces to plain gradient ascent
  # (the original if/else here had two identical branches)
  W_grad_sq = matrix(1, nrow(W), ncol(W))

  W_lag = W
  for (ii in seq_len(max_iters)) {
    # positive phase: sufficient statistics under the data
    hidden_mean_0 = exp_fam_mean(outer(ones, hidden_bias) + x_imputed %*% W, family_hidden)
    visible_hidden_0 = t(x_imputed_scaled) %*% hidden_mean_0

    # negative phase: sufficient statistics under the CD samples
    visible_cd = contrastive_divergence(x = x_imputed, W = W, mu = mu, hidden_bias = hidden_bias,
                                        family = family, family_hidden = family_hidden,
                                        num_iter = cd_iters[ii])
    hidden_mean_cd = exp_fam_mean(outer(ones, hidden_bias) + visible_cd %*% W, family_hidden)
    visible_hidden_cd = t(visible_cd) %*% hidden_mean_cd / n

    # the generated data can explode with gaussian hidden and poisson visible;
    # bail out and return the last stable estimate
    # (the gradient-deflation retry that used to follow this return was
    # unreachable and has been removed)
    if (any(is.na(visible_hidden_cd))) {
      if (!quiet) {
        cat("Contrastive Divergence resulted in NA's\n")
      }
      # BUG FIX: was cd_iters[1:(ii - 1)] and loss_trace[2:ii], both of which
      # misbehave when ii == 1 (1:0 and 2:1 count backwards)
      return(
        structure(
          list(
            mu = mu,
            W = W_lag,
            hidden_bias = hidden_bias,
            family = family,
            family_hidden = family_hidden,
            iters = ii - 1,
            cd_iters = cd_iters[seq_len(ii - 1)],
            loss_trace = loss_trace[seq_len(ii - 1) + 1],
            prop_deviance_expl = 1 - loss_trace[ii] / null_deviance,
            W_grad_sq = W_grad_sq
          ),
          class = "efh"
        )
      )
    }

    W_grad = visible_hidden_0 - visible_hidden_cd
    W = W + learning_rate * W_grad / sqrt(W_grad_sq)
    if (rms_prop) {
      if (ii == 1) {
        W_grad_sq = W_grad^2
      } else {
        W_grad_sq = 0.1 * W_grad^2 + 0.9 * W_grad_sq
      }
    }

    # Calc Deviance
    theta = outer(ones, mu) + exp_fam_mean(outer(ones, hidden_bias) + x_imputed %*% W, family_hidden) %*% t(W)
    loss_trace[ii + 1] <- exp_fam_deviance(x, theta, family) / sum_weights

    if (!quiet) {
      time_elapsed = as.numeric(proc.time() - ptm)[3]
      tot_time = max_iters / ii * time_elapsed
      time_remain = tot_time - time_elapsed
      cat(ii, " ", loss_trace[ii + 1], "")
      cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
    }

    # keep the previous estimate in case CD blows up on the next iteration
    # (the former `if (ii > max_iters) break` guard could never trigger inside
    # seq_len(max_iters) and has been removed)
    W_lag = W
  }

  object <- list(
    mu = mu,
    W = W,
    hidden_bias = hidden_bias,
    family = family,
    family_hidden = family_hidden,
    iters = ii,
    cd_iters = cd_iters[1:ii],
    loss_trace = loss_trace[2:(ii + 1)],
    prop_deviance_expl = 1 - loss_trace[ii + 1] / null_deviance,
    W_grad_sq = W_grad_sq
  )
  class(object) <- "efh"
  object
}

# One or more rounds of Gibbs sampling starting from the data: sample hidden
# units given the visibles, then visibles given the hiddens.
contrastive_divergence <- function(x, W, mu, hidden_bias, family, family_hidden, num_iter) {
  visible = x
  ones = rep(1, nrow(x))
  for (cd_ii in seq_len(num_iter)) {
    hidden = exp_fam_sample(outer(ones, hidden_bias) + visible %*% W, family_hidden)
    visible = exp_fam_sample(outer(ones, mu) + hidden %*% t(W), family)
  }
  return(visible)
}

# Gibbs-sample from a fitted efh model, starting the chain at x.
simulate_efh <- function(model, x, num_iter) {
  visible = x
  ones = rep(1, nrow(x))
  for (cd_ii in seq_len(num_iter)) {
    hidden = exp_fam_sample(outer(ones, model$hidden_bias) + visible %*% model$W, model$family_hidden)
    visible = exp_fam_sample(outer(ones, model$mu) + hidden %*% t(model$W), model$family)
  }
  return(visible)
}

#' @title Predict exponential family harmonium reconstruction on new data
#'
#' @description Predict exponential family harmonium reconstruction on new data
#'
#' @param object EFH object
#' @param newdata matrix of the same exponential family as covariates in \code{object}.
#'   Required; unlike other predict methods there is no default
#' @param type the type of fitting required.
#'   \code{type = "hidden"} gives matrix of hidden mean parameters of \code{x},
#'   \code{type = "link"} gives a matrix on the natural parameter scale, and
#'   \code{type = "response"} gives a matrix on the response scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrices in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#' mat_np_new = outer(rnorm(rows), loadings)
#'
#' # generate a count matrices
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols)
#'
#' modp = exponential_family_harmonium(mat, k = 9, family = "poisson", max_iters = 1000)
#'
#' pred = predict(modp, mat_new, type = "response")
#'
#' @export
predict.efh <- function(object, newdata, type = c("hidden", "link", "response"), ...) {
  type = match.arg(type)

  if (missing(newdata)) {
    stop("Need to supply data, even if it is the same used to fit the model!")
  } else {
    check_family(newdata, object$family)

    # missing values are set to 0, consistent with how the model was fit
    newdata[is.na(newdata)] <- 0
    hidden = exp_fam_mean(outer(rep(1, nrow(newdata)), object$hidden_bias) +
                            newdata %*% object$W, object$family_hidden)
  }

  if (type == "hidden") {
    hidden
  } else {
    theta = outer(rep(1, nrow(newdata)), object$mu) + hidden %*% t(object$W)
    if (type == "link") {
      theta
    } else if (type == "response") {
      exp_fam_mean(theta, object$family)
    }
  }
}

#' @title Plot exponential family harmonium
#'
#' @description
#' Plots the results of a EFH
#'
#' @param x EFH object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#'   iteration
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run logistic SVD on it
#' efh = exponential_family_harmonium(mat, k = 2, family = "binomial", family_hidden = "binomial")
#'
#' \dontrun{
#' plot(efh)
#' }
#' @export
plot.efh <- function(x, type = c("trace"), ...) {
  type = match.arg(type)

  if (type == "trace") {
    df = data.frame(Iteration = 1:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  }

  return(p)
}

#' @export
print.efh <- function(x, ...) {
  # an efh has no score matrix, so only the number of columns is reported
  cat(nrow(x$W), "columns\n")
  cat("Rank", ncol(x$W), "solution\n")
  cat("\n")
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}

# -----------------------------------------------------------------------------
# R/generalizedMF.R
# -----------------------------------------------------------------------------

#' Exponential Family Matrix Factorization
#'
#' @description Collins et al. (2001)'s Exponential Family PCA
#'
#' @param x matrix of either binary, proportions, count, or continuous data
#' @param k dimension
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with data weights
#' @param quiet logical; whether the calculation should give feedback
#' @param max_iters maximum number of iterations
#' @param conv_criteria convergence criteria
#' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package
#'   to more quickly calculate the SVD. When the number of columns is small,
#'   the approximation may be less accurate and slower
#' @param random_start whether to randomly initialize \code{A} and \code{B}
#' @param start_A initial value for \code{A}
#' @param start_B initial value for \code{B}
#' @param mu specific value for \code{mu}, the mean vector of \code{x}
#' @param main_effects logical; whether to include main effects in the model
#' @param method which algorithm to use. \code{"als"} uses alternating least squares.
#'   It has the benefit of majorizing row-wise and column-wise for each of the updates.
#'   \code{"svd"} uses singular value decomposition (similar to de Leeuw, 2006). It uses
#'   a more general majorization, which may not work well for heterogeneous matrices.
#'
#' @return An S3 object of class \code{gmf} which is a list with the
#' following components:
#' \item{mu}{the main effects for dimensionality reduction}
#' \item{A}{the \code{n}x\code{k}-dimensional matrix with the scores}
#' \item{B}{the \code{d}x\code{k}-dimensional matrix with the loadings}
#' \item{family}{the exponential family of the data}
#' \item{iters}{number of iterations required for convergence}
#' \item{loss_trace}{the trace of the average deviance of the algorithm.
#'   Should be non-increasing}
#' \item{prop_deviance_expl}{the proportion of deviance explained by this model.
#'   If \code{main_effects = TRUE}, the null model is just the main effects, otherwise
#'   the null model estimates 0 for all natural parameters.}
#' @export
#' @importFrom RSpectra svds
#'
#' @references
#' de Leeuw, Jan, 2006. Principal component analysis of binary data
#' by iterated singular value decomposition. Computational Statistics & Data Analysis
#' 50 (1), 21--39.
#'
#' Collins, M., Dasgupta, S., & Schapire, R. E., 2001. A generalization of principal
#' components analysis to the exponential family. In NIPS, 617--624.
#'
#' @examples
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix and binary response
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' mod = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE)
#'
generalizedMF <- function(x, k = 2, family = c("gaussian", "binomial", "poisson"),
                          weights, quiet = TRUE, max_iters = 1000, conv_criteria = 1e-5,
                          partial_decomp = FALSE,
                          random_start = FALSE, start_A, start_B, mu, main_effects = TRUE,
                          method = c("als", "svd")) {
  family = match.arg(family)
  check_family(x, family)
  method = match.arg(method)

  if (method == "als") {
    # used in ALS to make sure the estimates do not diverge;
    # a penalization like this was recommended in Rish et al. (2008), ICML
    L2_eps = 1e-5
  }

  x = as.matrix(x)
  missing_mat = is.na(x)
  n = nrow(x)
  d = ncol(x)
  ones = rep(1, n)

  if (missing(weights)) {
    weights = 1.0
    sum_weights = sum(!is.na(x))
  } else {
    # weights at missing entries are zeroed before validation so that NAs
    # lining up with missing data are tolerated
    weights[is.na(x)] <- 0
    if (any(is.na(weights))) {
      stop("Can't have NA in weights")
    }
    if (any(weights < 0)) {
      stop("weights must be non-negative")
    }
    if (!all(dim(weights) == dim(x))) {
      stop("x and weights are not the same dimension")
    }
    sum_weights = sum(weights)
  }

  # calculate the null log likelihood for % deviance explained and normalization
  if (main_effects) {
    if (length(weights) == 1) {
      weighted_col_means = colMeans(x, na.rm = TRUE)
    } else {
      weighted_col_means = colSums(x * weights, na.rm = TRUE) / colSums(weights)
    }
    null_theta = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, 1), family, M = Inf))
  } else {
    null_theta = rep(0, d)
  }
  null_deviance = exp_fam_deviance(x, outer(ones, null_theta), family, weights) / sum_weights

  # Initialize #
  ##############
  if (main_effects) {
    if (missing(mu)) {
      mu = saturated_natural_parameters(weighted_col_means, family, Inf)
    } else {
      mu = as.numeric(mu)
      stopifnot(length(mu) == d)
    }
  } else {
    mu = rep(0, d)
  }

  if (!missing(start_B)) {
    stopifnot(dim(start_B) == c(d, k))
    B = as.matrix(start_B)
  } else if (random_start) {
    B = matrix(rnorm(d * k), d, k)
  } else {
    if (partial_decomp) {
      udv = RSpectra::svds(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE), min(k + 1, d))
    } else {
      udv = svd(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE))
    }

    B = udv$v[, 1:k, drop = FALSE]
  }

  if (!missing(start_A)) {
    stopifnot(dim(start_A) == c(n, k))
    A = as.matrix(start_A)
  } else if (random_start) {
    A = matrix(rnorm(n * k), n, k)
  } else {
    # udv was only computed above when start_B was missing; redo it here
    if (!missing(start_B)) {
      if (partial_decomp) {
        udv = RSpectra::svds(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE), min(k + 1, d))
      } else {
        udv = svd(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE))
      }
    }
    A = udv$u[, 1:k, drop = FALSE] %*% diag(udv$d, k, k)
  }

  loss_trace = numeric(max_iters)
  theta = outer(ones, mu) + tcrossprod(A, B)
  # not really the first iteration; overwritten at ii = 1
  loss_trace[1] = exp_fam_deviance(x, theta, family, weights) / sum_weights

  ptm <- proc.time()

  if (!quiet) {
    cat(0, " ", loss_trace[1], "")
    cat("0 hours elapsed\n")
  }

  for (ii in seq_len(max_iters)) {
    if (method == "als") {
      # update A, majorizing the deviance column-wise
      theta = outer(ones, mu) + tcrossprod(A, B)
      first_dir = exp_fam_mean(theta, family)
      second_dir = exp_fam_variance(theta, family, weights)

      W = apply(second_dir, 2, max)
      Z = as.matrix(theta + weights * (x - first_dir) / outer(ones, W))
      Z[is.na(x)] <- theta[is.na(x)]

      A = t(solve(t(B) %*% diag(W) %*% B + diag(L2_eps, k, k),
                  t(B) %*% diag(W) %*% t(scale(Z, center = mu, scale = FALSE))))

      # update B, majorizing the deviance row-wise
      theta = outer(ones, mu) + tcrossprod(A, B)
      first_dir = exp_fam_mean(theta, family)
      second_dir = exp_fam_variance(theta, family, weights)

      W = apply(second_dir, 1, max)
      Z = as.matrix(theta + weights * (x - first_dir) / outer(W, rep(1, d)))
      Z[is.na(x)] <- theta[is.na(x)]

      B = t(solve(t(A) %*% diag(W, n, n) %*% A + diag(L2_eps, k, k),
                  t(A) %*% diag(W, n, n) %*% scale(Z, center = mu, scale = FALSE)))
    } else if (method == "svd") {
      # single global majorization constant, then one SVD step
      theta = outer(ones, mu) + tcrossprod(A, B)
      first_dir = exp_fam_mean(theta, family)
      second_dir = exp_fam_variance(theta, family, weights)

      W = max(second_dir)
      Z = as.matrix(theta + weights * (x - first_dir) / W)
      Z[is.na(x)] <- theta[is.na(x)]

      if (partial_decomp) {
        udv = RSpectra::svds(scale(Z, center = mu, scale = FALSE), min(k + 1, d))
      } else {
        udv = svd(scale(Z, center = mu, scale = FALSE))
      }

      A = udv$u[, 1:k, drop = FALSE] %*% diag(udv$d, k, k)
      B = udv$v[, 1:k, drop = FALSE]
    }

    # Calc Deviance
    theta = outer(ones, mu) + tcrossprod(A, B)
    loss_trace[ii] <- exp_fam_deviance(x, theta, family, weights) / sum_weights

    if (!quiet) {
      time_elapsed = as.numeric(proc.time() - ptm)[3]
      tot_time = max_iters / ii * time_elapsed
      time_remain = tot_time - time_elapsed
      cat(ii, " ", loss_trace[ii], "")
      cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
    }

    # (a dead `B_lag = B` assignment in the else branch was removed;
    # B_lag was never read anywhere in this function)
    if (ii > 1 && abs(loss_trace[ii] - loss_trace[ii - 1]) < 2 * conv_criteria) {
      break
    }
  }

  object <- list(
    mu = mu,
    A = A,
    B = B,
    family = family,
    iters = ii,
    loss_trace = loss_trace[1:ii],
    prop_deviance_expl = 1 - loss_trace[ii] / null_deviance
  )
  class(object) <- "gmf"
  object
}

#' @title Predict generalized PCA scores or reconstruction on new data
#'
#' @description Predict generalized PCA scores or reconstruction on new data
#'
#' @param object generalized MF object
#' @param newdata matrix of the same exponential family as covariates in \code{object}.
#'   If missing, will use the data that \code{object} was fit on
#' @param type the type of fitting required.
#'   \code{type = "PCs"} gives matrix of principal components of \code{x},
#'   \code{type = "link"} gives a matrix on the natural parameter scale, and
#'   \code{type = "response"} gives a matrix on the response scale
#' @param quiet logical; whether the calculation should show progress
#' @param max_iters maximum number of iterations
#' @param conv_criteria convergence criteria
#' @param start_A initial value for \code{A}
#' @param ...
#' Additional arguments
#' @examples
#' # construct a low rank matrices in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#' mat_np_new = outer(rnorm(rows), loadings)
#'
#' # generate a count matrices
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols)
#'
#' # run Poisson PCA on it
#' gmf = generalizedMF(mat, k = 1, family = "poisson")
#'
#' A = predict(gmf, mat_new)
#'
#' @export
predict.gmf <- function(object, newdata, type = c("PCs", "link", "response"), quiet = TRUE,
                        max_iters = 1000, conv_criteria = 1e-5, start_A, ...) {
  type = match.arg(type)

  if (missing(newdata)) {
    # no new data: reuse the scores estimated during fitting
    A = object$A
  } else {
    n = nrow(newdata)
    d = ncol(newdata)
    k = ncol(object$B)
    stopifnot(d == nrow(object$B))
    check_family(newdata, object$family)
    # small ridge penalty keeps the normal equations well conditioned
    L2_eps = 1e-5

    ones = rep(1, n)

    # initialize natural parameters at the main effects (plus start_A if given)
    if (missing(start_A)) {
      theta = outer(ones, object$mu)
    } else {
      stopifnot(all(dim(start_A) == c(n, k)))
      theta = outer(ones, object$mu) + tcrossprod(start_A, object$B)
    }

    last_deviance = exp_fam_deviance(newdata, theta, object$family) / sum(!is.na(newdata))
    if (!quiet) {
      cat(0, " ", last_deviance, "\n")
    }

    # majorization-minimization: solve a ridge-penalized weighted least squares
    # problem for the scores A at each iteration, holding mu and B fixed
    for (ii in seq_len(max_iters)) {
      first_dir = exp_fam_mean(theta, object$family)
      second_dir = exp_fam_variance(theta, object$family)

      # majorize the deviance with a quadratic using the largest variance
      W = rep(max(second_dir), d)
      # working response; missing entries are imputed with the current theta
      Z = as.matrix(theta + (newdata - first_dir) / outer(ones, W))
      Z[is.na(newdata)] <- theta[is.na(newdata)]

      A = t(solve(t(object$B) %*% diag(W, d, d) %*% object$B + diag(L2_eps, k, k),
                  t(object$B) %*% diag(W, d, d) %*% t(scale(Z, object$mu, FALSE))))

      theta = outer(ones, object$mu) + tcrossprod(A, object$B)
      this_deviance = exp_fam_deviance(newdata, theta, object$family) / sum(!is.na(newdata))

      if (!quiet) {
        cat(ii, " ", this_deviance, "\n")
      }

      if (abs(last_deviance - this_deviance) < 2 * conv_criteria) {
        break
      }
      last_deviance = this_deviance
    }
  }

  if (type == "PCs") {
    return(A)
  } else {
    theta = outer(rep(1, nrow(A)), object$mu) + tcrossprod(A, object$B)

    if (type == "link") {
      return(theta)
    } else if (type == "response") {
      return(exp_fam_mean(theta, object$family))
    }
  }
}

#' @title Plot generalized MF
#'
#' @description
#' Plots the results of a generalized MF
#'
#' @param x generalized MF object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#' iteration, \code{type = "loadings"} plots the first 2 principal component
#' loadings, \code{type = "scores"} plots the first 2 principal component scores
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run logistic SVD on it
#' gmf = generalizedMF(mat, k = 2, family = "binomial")
#'
#' \dontrun{
#' plot(gmf)
#' }
#' @export
plot.gmf <- function(x, type = c("trace", "loadings", "scores"), ...) {
  type = match.arg(type)

  if (type == "trace") {
    df = data.frame(Iteration = 1:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  } else {
    # loadings (columns of B) and scores (rows of A) share the same layout
    df = data.frame(if (type == "loadings") x$B else x$A)
    colnames(df) <- paste0("PC", 1:ncol(df))
    if (ncol(df) == 1) {
      # pad a rank-1 solution with a zero second coordinate so it can be plotted
      df$PC2 = 0
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point() +
        ggplot2::labs(y = NULL)
    } else {
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point()
    }
  }

  return(p)
}

#' @export
print.gmf <- function(x, ...) {
  # brief text summary of a fitted gmf object; returns x invisibly so it pipes
  cat(nrow(x$A), "rows and ")
  cat(nrow(x$B), "columns\n")
  cat("Rank", ncol(x$B), "solution\n")
  cat("\n")
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}


#' @title CV for generalized MF
#'
#' @description
#' Run cross validation on dimension for generalized MF
#'
#' @param x matrix of either binary, count, or continuous data
#' @param ks the different dimensions \code{k} to try
#' @param family exponential family distribution of data
#' @param folds if \code{folds} is a scalar, then it is the number of folds.
#' If it is a vector, it should be the same length as the number of rows in \code{x}
#' @param quiet logical; whether the function should display progress
#' @param ... Additional arguments passed to generalizedMF
#'
#' @return A matrix of the CV deviance with \code{k} in rows
#'
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' \dontrun{
#' deviances = cv.gmf(mat, ks = 1:9, family = "binomial")
#' plot(deviances)
#' }
#' @export
cv.gmf <- function(x, ks, family = c("gaussian", "binomial", "poisson", "multinomial"),
                   folds = 5, quiet = TRUE, ...) {
  family = match.arg(family)
  check_family(x, family)

  if (length(folds) > 1) {
    # does this work if factor?
    if (length(unique(folds)) <= 1) {
      stop("If inputing CV split, must be more than one level")
    }
    if (length(folds) != nrow(x)) {
      stop("if folds is a vector, it should be of same length as nrow(x)")
    }
    cv = folds
  } else {
    # BUG FIX: was nrow(q); q is never defined in this function
    # (copy-paste from cv.cgpca, which builds a matrix q first)
    cv = sample(1:folds, nrow(x), replace = TRUE)
  }

  # accumulated held-out deviance for each k (smaller is better)
  cv_dev = matrix(0, length(ks), 1,
                  dimnames = list(k = ks, M = "GMF"))
  for (k in ks) {
    if (!quiet) {
      cat("k =", k, "\n")
    }
    for (c in unique(cv)) {
      gmf = generalizedMF(x[c != cv, ], k = k, family = family, ...)
      # fixed: was newdat =, which relied on partial argument matching
      pred_theta = predict(gmf, newdata = x[c == cv, ], type = "link")
      cv_dev[k == ks] = cv_dev[k == ks] +
        exp_fam_deviance(x[c == cv, ], theta = pred_theta, family = family)
    }
  }
  class(cv_dev) <- c("matrix", "cv.gpca")
  # BUG FIX: deviance is accumulated, so the best k minimizes it
  # (was which.max, which reported the worst k)
  which_best = which.min(cv_dev)
  if (!quiet) {
    cat("Best: k =", ks[which_best], "\n")
  }

  # negated so that, like cv.cgpca's log likelihood, larger is better
  return(-cv_dev)
}


#' @title Convex Generalized Principal Component Analysis
#'
#' @description
#' Dimensionality reduction for exponential family data by extending Pearson's
#' PCA formulation to minimize deviance. The convex relaxation
#' to projection matrices, the Fantope, is used.
#'
#' @param x matrix of either binary, proportions, count, or continuous data
#' @param k number of principal components to return
#' @param M value to approximate the saturated model
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with non-negative weights
#' @param quiet logical; whether the calculation should give feedback
#' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package
#' to more quickly calculate the eigen-decomposition. When the number of columns is small,
#' the approximation may be less accurate and slower
#' @param max_iters number of maximum iterations
#' @param conv_criteria convergence criteria. The difference between average deviance
#' in successive iterations
#' @param random_start logical; whether to randomly inititalize the parameters.
If \code{FALSE}, 21 | #' function will use an eigen-decomposition as starting value 22 | #' @param start_H starting value for the Fantope matrix 23 | #' @param mu main effects vector. Only used if \code{main_effects = TRUE} 24 | #' @param main_effects logical; whether to include main effects in the model 25 | #' @param normalize logical; whether to weight the variables to they all have equal influence 26 | #' @param ss_factor step size multiplier. Amount by which to multiply the step size. Quadratic 27 | #' convergence rate can be proven for \code{ss_factor = 1}, but I have found higher values 28 | #' sometimes work better. The default is \code{ss_factor = 4}. 29 | #' If it is not converging, try \code{ss_factor = 1}. 30 | #' 31 | #' @return An S3 object of class \code{cgpca} which is a list with the 32 | #' following components: 33 | #' \item{mu}{the main effects} 34 | #' \item{H}{a rank \code{k} Fantope matrix} 35 | #' \item{U}{a \code{ceiling(k)}-dimentional orthonormal matrix with the loadings} 36 | #' \item{PCs}{the princial component scores} 37 | #' \item{M}{the parameter inputed} 38 | #' \item{iters}{number of iterations required for convergence} 39 | #' \item{loss_trace}{the trace of the average deviance using the Fantope matrix} 40 | #' \item{proj_loss_trace}{the trace of the average deviance using the projection matrix} 41 | #' \item{prop_deviance_expl}{the proportion of deviance explained by this model. 
#' If \code{main_effects = TRUE}, the null model is just the main effects, otherwise
#' the null model estimates 0 for all natural parameters.}
#'
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run convex generalized PCA on it
#' cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial")
#' @export
#' @importFrom stats var
convexGeneralizedPCA <- function(x, k = 2, M = 4, family = c("gaussian", "binomial", "poisson", "multinomial"),
                                 weights, quiet = TRUE, partial_decomp = FALSE, max_iters = 1000,
                                 conv_criteria = 1e-6, random_start = FALSE, start_H, mu,
                                 main_effects = TRUE, normalize = FALSE, ss_factor = 1) {

  family = match.arg(family)
  check_family(x, family)

  x = as.matrix(x)
  missing_mat = is.na(x)
  n = nrow(x)
  d = ncol(x)
  ones = rep(1, n)

  # set up (optional) non-negative observation weights;
  # missing cells get weight 0 so they do not contribute to the loss
  if (missing(weights) || length(weights) == 1) {
    weights = 1.0
    sum_weights = sum(!is.na(x))
  } else {
    if (!all(dim(weights) == dim(x))) {
      stop("x and weights are not the same dimension")
    }
    weights[is.na(x)] <- 0
    if (any(is.na(weights))) {
      stop("Can't have NA in weights")
    }
    if (any(weights < 0)) {
      stop("weights must be non-negative")
    }
    sum_weights = sum(weights)
  }

  # main effects mu: weighted column means mapped to the natural parameter scale
  if (main_effects) {
    if (missing(mu)) {
      if (length(weights) == 1) {
        weighted_col_means = colMeans(x, na.rm = TRUE)
      } else {
        weighted_col_means = colSums(x * weights, na.rm = TRUE) / colSums(weights)
      }
      if (any(apply(x, 2, stats::var, na.rm = TRUE) == 0)) {
        stop("At least one variable has variance of 0")
      }
      if (family == "multinomial") {
        mu = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, nrow = 1), family, M))
      } else {
        mu = as.numeric(saturated_natural_parameters(weighted_col_means, family, M))
      }
    }
  } else {
    mu = rep(0, d)
  }

  # approximate saturated natural parameters; missing cells are filled with mu
  eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu)
  eta_centered = scale(eta, center = mu, scale = FALSE)
  mu_mat = outer(rep(1, n), mu)

  # initialize the Fantope matrix H (and its leading eigenvectors U)
  if (!missing(start_H)) {
    HU = project.Fantope(start_H, k, partial_decomp = partial_decomp)
    H = HU$H
  } else if (random_start) {
    U = matrix(rnorm(d * d), d, d)
    U = qr.Q(qr(U))
    HU = project.Fantope(U %*% t(U), k, partial_decomp = partial_decomp)
    H = HU$H
  } else {
    if (partial_decomp) {
      udv = RSpectra::svds(scale(eta, center = mu, scale = normalize), k, nu = k, nv = k)
    } else {
      udv = svd(scale(eta, center = mu, scale = normalize), nu = k, nv = k)
    }
    HU = project.Fantope(udv$v[, 1:k] %*% t(udv$v[, 1:k]), k, partial_decomp = partial_decomp)
    H = HU$H
  }

  eta_sat_nat = saturated_natural_parameters(x, family, M = Inf)
  sat_loglike = exp_fam_log_like(x, eta_sat_nat, family, weights)

  # when x is missing, eta = mu, so eta_centered is 0
  eta_centered[missing_mat] <- 0

  theta = mu_mat + eta_centered %*% H

  # initial step size: inverse of a Lipschitz-type bound on the gradient,
  # which depends on the family's variance function
  if (family %in% c("binomial", "multinomial")) {
    # TODO: I think this works with proportions, but need to check
    init_ss = 2 / (sum(eta_centered^2) * max(weights))
  } else if (family == "gaussian") {
    init_ss = 0.5 / (sum(eta_centered^2) * max(weights))
  } else {
    second_dir = exp_fam_variance(theta, family, weights)
    init_ss = mean((t(second_dir) %*% (eta_centered^2))^(-1)) / 2
  }
  init_ss = init_ss * ss_factor

  # only sum over non-missing x; equivalent to replacing missing x with 0
  x_zeros = x
  x_zeros[missing_mat] <- 0

  etatX = t(eta_centered) %*% (weights * x_zeros)

  loglike = exp_fam_log_like(x, theta, family, weights)
  min_loss = 2 * (sat_loglike - loglike) / sum_weights
  best_HU = HU
  best_loglike = loglike
  if (!quiet) {
    cat(0, " ", min_loss, "\n")
  }

  loss_trace <- proj_loss_trace <- numeric(max_iters + 1)
  loss_trace[1] <- proj_loss_trace[1] <- min_loss

  # projected gradient ascent on the Fantope
  for (m in 1:max_iters) {
    # decaying step size for poisson (unbounded variance), constant otherwise
    if (family == "poisson") {
      step = init_ss / m
    } else {
      step = init_ss
    }

    first_dir = exp_fam_mean(theta, family)

    first_dir[missing_mat] <- 0
    etat_dir = t(eta_centered) %*% (first_dir * weights)
    deriv = etatX - etat_dir
    # symmetrize the gradient (keep the diagonal counted once)
    deriv = deriv + t(deriv) - diag(diag(deriv))

    H = H + step * deriv
    HU = project.Fantope(H, k, partial_decomp = partial_decomp)
    H = HU$H

    theta = mu_mat + eta_centered %*% H
    loglike = exp_fam_log_like(x, theta, family, weights)
    loss_trace[m + 1] = 2 * (sat_loglike - loglike) / sum_weights

    # also track the loss of the rank-k projection implied by H's eigenvectors
    proj_theta = mu_mat + eta_centered %*% tcrossprod(HU$U)
    proj_loglike = exp_fam_log_like(x, proj_theta, family, weights)
    proj_loss_trace[m + 1] = 2 * (sat_loglike - proj_loglike) / sum_weights

    if (!quiet) {
      cat(m, " ", loss_trace[m + 1], " ", proj_loss_trace[m + 1], "\n")
    }
    # the Fantope iterates are not monotone, so keep the best solution seen
    if (loss_trace[m + 1] < min_loss) {
      min_loss = loss_trace[m + 1]
      best_HU = HU
      best_loglike = loglike
    }
    # scalar condition: use || (was the vectorized |)
    if (abs(loss_trace[m + 1] - loss_trace[m]) < 2 * conv_criteria || min_loss == 0) {
      break
    }
  }

  # null log likelihood (main effects only) for % deviance explained
  # assumes no missing data
  null_loglike = exp_fam_log_like(x, mu_mat, family, weights)

  object = list(mu = mu,
                H = best_HU$H,
                U = best_HU$U,
                PCs = eta_centered %*% best_HU$U,
                M = M,
                family = family,
                iters = m,
                loss_trace = loss_trace[1:(m + 1)],
                proj_loss_trace = proj_loss_trace[1:(m + 1)],
                prop_deviance_expl = 1 - (best_loglike - sat_loglike) / (null_loglike - sat_loglike)
  )
  class(object) <- "cgpca"
  return(object)
}

#' @title Project onto the Fantope
#'
#' @description
#' Project a symmetric matrix onto the convex set of the rank k Fantope
#'
#' @param x a symmetric matrix
#' @param k the rank of the Fantope desired
#' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package
#' to more quickly calculate the eigen-decomposition.
#' When the number of columns is small,
#' the approximation may be less accurate and slower

#' @return
#' \item{H}{a rank \code{k} Fantope matrix}
#' \item{U}{a \code{k}-dimentional orthonormal matrix with the first \code{k} eigenvectors of \code{H}}
#' @export
project.Fantope <- function(x, k, partial_decomp = FALSE) {
  # eigen-decomposition of the symmetric input
  if (partial_decomp) {
    eig = RSpectra::eigs_sym(x, k = min(k + 2, ncol(x)))
  } else {
    eig = eigen(x, symmetric = TRUE)
  }
  vals = eig$values
  # bisect on the soft threshold theta so the eigenvalues, clipped to [0, 1],
  # sum to k (the Fantope constraint: 0 <= H <= I, trace(H) = k)
  lower = vals[length(vals)] - k / length(vals)
  upper = max(vals)
  while (TRUE) {
    theta = (lower + upper) / 2
    sum.eig.vals = sum(pmin(pmax(vals - theta, 0), 1))
    if (abs(sum.eig.vals - k) < 1e-10) {
      break
    } else if (sum.eig.vals > k) {
      lower = theta
    } else {
      upper = theta
    }
  }
  vals = pmin(pmax(vals - theta, 0), 1)
  # ROBUSTNESS FIX: diag(vals) with a scalar builds an identity matrix of that
  # size; give explicit dimensions so a single eigenvalue is handled correctly
  return(list(H = eig$vectors %*% diag(vals, length(vals), length(vals)) %*% t(eig$vectors),
              U = matrix(eig$vectors[, 1:ceiling(k)], nrow(x), ceiling(k))))
}

#' @title Predict Convex Generalized PCA scores or reconstruction on new data
#'
#' @description Predict Convex Generalized PCA scores or reconstruction on new data
#'
#' @param object convex generalized PCA object
#' @param newdata matrix with all binary entries. If missing, will use the
#' data that \code{object} was fit on (only possible for \code{type = "PCs"})
#' @param type the type of fitting required. \code{type = "PCs"} gives the PC scores,
#' \code{type = "link"} gives matrix on the logit scale and \code{type = "response"}
#' gives matrix on the probability scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrices in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_logit = outer(rnorm(rows), loadings)
#' mat_logit_new = outer(rnorm(rows), loadings)
#'
#' # convert to a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#' mat_new = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit_new)) * 1.0
#'
#' # run generalized PCA on it
#' cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial")
#'
#' PCs = predict(cgpca, mat_new)
#' @export
predict.cgpca <- function(object, newdata, type = c("PCs", "link", "response"), ...) {
  type = match.arg(type)

  if (!missing(newdata)) {
    # map binary data to the natural parameter scale (+/- M), center by mu,
    # and zero out missing entries so they do not contribute to the scores
    eta = ((as.matrix(newdata) * 2) - 1) * object$M
    eta_centered = scale(eta, center = object$mu, scale = FALSE)
    eta_centered[is.na(newdata)] <- 0
  } else if (type != "PCs") {
    # the fitted object stores only the scores, not the training data, so a
    # reconstruction needs newdata (was an obscure missing-argument error)
    stop("newdata must be supplied when type is \"link\" or \"response\"")
  }

  if (type == "PCs") {
    if (missing(newdata)) {
      return(object$PCs)
    }
    return(eta_centered %*% object$U)
  }

  theta = outer(rep(1, nrow(eta_centered)), object$mu) + eta_centered %*% object$H
  if (type == "link") {
    return(theta)
  }
  return(inv.logit.mat(theta))
}

#' @title Plot convex generalized PCA
#'
#' @description
#' Plots the results of a convex generalized PCA
#'
#' @param x convex generalized PCA object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#' iteration, \code{type = "loadings"} plots the first 2 PC loadings,
#' \code{type = "scores"} plots the first 2 PC scores
#' @param ...
#' Additional arguments
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run convex generalized PCA on it
#' cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial")
#'
#' \dontrun{
#' plot(cgpca)
#' }
#' @export
plot.cgpca <- function(x, type = c("trace", "loadings", "scores"), ...) {
  type = match.arg(type)

  if (type == "trace") {
    df = data.frame(Iteration = 0:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  } else {
    # loadings (columns of U) and scores (PCs) share the same layout
    df = data.frame(if (type == "loadings") x$U else x$PCs)
    colnames(df) <- paste0("PC", 1:ncol(df))
    if (ncol(df) == 1) {
      # pad a rank-1 solution with a zero second coordinate so it can be plotted
      df$PC2 = 0
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point() +
        ggplot2::labs(y = NULL)
    } else {
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point()
    }
  }

  return(p)
}

#' @export
print.cgpca <- function(x, ...) {
  # brief text summary of a fitted cgpca object; returns x invisibly so it pipes
  cat(nrow(x$PCs), "rows and ")
  cat(nrow(x$H), "columns\n")
  cat("Rank", ncol(x$U), "Fantope solution with M =", x$M, "\n")
  cat("\n")
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}

#' @title CV for convex generalized PCA
#'
#' @description
#' Run cross validation on dimension and \code{M} for convex generalized PCA
#'
#' @param x matrix with all binary entries
#' @param ks the different dimensions \code{k} to try
#' @param Ms the different approximations to the saturated model \code{M} to try
#' @param folds if \code{folds} is a scalar, then it is the number of folds. If
#' it is a vector, it should be the same length as the number of rows in \code{x}
#' @param quiet logical; whether the function should display progress
#' @param ... Additional arguments passed to convexGeneralizedPCA
#'
#' @return A matrix of the CV log likelihood with \code{k} in rows and
#' \code{M} in columns
#'
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' \dontrun{
#' loglikes = cv.cgpca(mat, ks = 1:9, Ms = 3:6)
#' plot(loglikes)
#' }
#' @export
cv.cgpca <- function(x, ks, Ms = seq(2, 10, by = 2), folds = 5, quiet = TRUE, ...) {
  # TODO: does not support weights
  # code x as -1/+1 with 0 marking missing entries
  q = 2 * as.matrix(x) - 1
  q[is.na(q)] <- 0

  if (length(folds) > 1) {
    # does this work if factor?
    if (length(unique(folds)) <= 1) {
      stop("If inputing CV split, must be more than one level")
    }
    if (length(folds) != nrow(x)) {
      stop("if folds is a vector, it should be of same length as nrow(x)")
    }
    cv = folds
  } else {
    cv = sample(1:folds, nrow(q), replace = TRUE)
  }

  # accumulated held-out Bernoulli log likelihood for each (k, M) pair
  log_likes = matrix(0, length(ks), length(Ms),
                     dimnames = list(k = ks, M = Ms))
  for (k in ks) {
    for (M in Ms) {
      if (!quiet) {
        cat("k =", k, "M =", M, "")
      }
      for (c in unique(cv)) {
        if (!quiet) {
          cat(".")
        }
        clpca = convexGeneralizedPCA(x[c != cv, ], k = k, M = M, ...)
        # fixed: was newdat =, which relied on partial argument matching
        pred_theta = predict(clpca, newdata = x[c == cv, ], type = "link")
        log_likes[k == ks, M == Ms] = log_likes[k == ks, M == Ms] +
          log_like_Bernoulli(q = q[c == cv, ], theta = pred_theta)
      }
      if (!quiet) {
        cat("", log_likes[k == ks, M == Ms], "\n")
      }
    }
  }
  class(log_likes) <- c("matrix", "cv.gpca")
  # log likelihood: larger is better (renamed from the misleading which_min)
  which_best = which(log_likes == max(log_likes), arr.ind = TRUE)
  if (!quiet) {
    cat("Best: k =", ks[which_best[1]], "M =", Ms[which_best[2]], "\n")
  }

  return(log_likes)
}

#' @title Generalized Principal Component Analysis
#'
#' @description
#' Dimension reduction for exponential family data by extending Pearson's
#' PCA formulation
#'
#' @param x matrix of either binary, proportions, count, or continuous data
#' @param k number of principal components to return
#' @param M value to approximate the saturated model
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with data weights
#' @param quiet logical; whether the
calculation should give feedback 13 | #' @param majorizer how to majorize the deviance. \code{"row"} gives 14 | #' tighter majorization, but may take longer to calculate each iteration. 15 | #' \code{"all"} may be faster per iteration, but take more iterations 16 | #' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package 17 | #' to more quickly calculate the SVD. When the number of columns is small, 18 | #' the approximation may be less accurate and slower 19 | #' @param max_iters number of maximum iterations 20 | #' @param conv_criteria convergence criteria. The difference between average deviance 21 | #' in successive iterations 22 | #' @param random_start logical; whether to randomly inititalize the parameters. If \code{FALSE}, 23 | #' function will use an eigen-decomposition as starting value 24 | #' @param start_U starting value for the orthogonal matrix 25 | #' @param start_mu starting value for mu. Only used if \code{main_effects = TRUE} 26 | #' @param main_effects logical; whether to include main effects in the model 27 | #' @param normalize logical; whether to weight the variables to they all have equal influence 28 | #' @param validation a validation dataset to select \code{m} with 29 | #' @param val_weights weights associated with validation data 30 | #' 31 | #' @return An S3 object of class \code{gpca} which is a list with the 32 | #' following components: 33 | #' \item{mu}{the main effects} 34 | #' \item{U}{a \code{k}-dimentional orthonormal matrix with the loadings} 35 | #' \item{PCs}{the princial component scores} 36 | #' \item{M}{the parameter inputed} 37 | #' \item{family}{the exponential family used} 38 | #' \item{iters}{number of iterations required for convergence} 39 | #' \item{loss_trace}{the trace of the average deviance of the algorithm. 40 | #' Should be non-increasing} 41 | #' \item{prop_deviance_expl}{the proportion of deviance explained by this model. 
42 | #' If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 43 | #' the null model estimates 0 for all natural parameters.} 44 | #' @examples 45 | #' # construct a low rank matrix in the natural parameter space 46 | #' rows = 100 47 | #' cols = 10 48 | #' set.seed(1) 49 | #' mat_np = outer(rnorm(rows), rnorm(cols)) 50 | #' 51 | #' # generate a count matrix 52 | #' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 53 | #' 54 | #' # run Poisson PCA on it 55 | #' gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 56 | #' @export 57 | #' @importFrom stats var 58 | generalizedPCA <- function(x, k = 2, M = 4, family = c("gaussian", "binomial", "poisson", "multinomial"), 59 | weights, quiet = TRUE, majorizer = c("row", "all"), 60 | partial_decomp = FALSE, max_iters = 1000, conv_criteria = 1e-5, 61 | random_start = FALSE, start_U, start_mu, main_effects = TRUE, 62 | normalize = FALSE, validation, val_weights) { 63 | family = match.arg(family) 64 | check_family(x, family) 65 | 66 | majorizer = match.arg(majorizer) 67 | 68 | x = as.matrix(x) 69 | missing_mat = is.na(x) 70 | n = nrow(x) 71 | d = ncol(x) 72 | ones = rep(1, n) 73 | 74 | if (missing(weights)) { 75 | weights = 1.0 76 | sum_weights = sum(!is.na(x)) 77 | } else { 78 | weights[is.na(x)] <- 0 79 | if (any(is.na(weights))) { 80 | stop("Can't have NA in weights") 81 | } 82 | if (any(weights < 0)) { 83 | stop("weights must be non-negative") 84 | } 85 | if (!all(dim(weights) == dim(x))) { 86 | stop("x and weights are not the same dimension") 87 | } 88 | sum_weights = sum(weights) 89 | } 90 | 91 | # calculate the null log likelihood for % deviance explained and normalization 92 | if (main_effects) { 93 | if (length(weights) == 1) { 94 | weighted_col_means = colMeans(x, na.rm = TRUE) 95 | } else { 96 | weighted_col_means = colSums(x * weights, na.rm = TRUE) / colSums(weights) 97 | } 98 | null_theta = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, 1), 
family, M)) 99 | } else { 100 | null_theta = rep(0, d) 101 | } 102 | 103 | if (normalize) { 104 | if (any(apply(x, 2, stats::var, na.rm = TRUE) == 0)) { 105 | stop("At least one variable has variance of 0. Cannot normalize") 106 | } 107 | 108 | eta_sat_nat = saturated_natural_parameters(x, family, M = Inf) 109 | 110 | norms = sapply(1:d, function(j) { 111 | 2 * (exp_fam_log_like(x[, j], eta_sat_nat[, j], family, weights) - 112 | exp_fam_log_like(x[, j], rep(null_theta[j], n), family, weights)) 113 | }) / n 114 | if (any(norms <= 0)) { 115 | stop("Normalization caused weights to be <= 0") 116 | } 117 | 118 | if (length(weights) == 1) { 119 | weights = outer(ones, 1 / norms) 120 | } else { 121 | weights = sweep(weights, 2, 1 / norms, "*") 122 | } 123 | } 124 | 125 | if (M == 0) { 126 | if (any(is.na(x))) { 127 | stop("Cannot solve for M with missing weights") 128 | } 129 | M = 4 130 | solve_M = TRUE 131 | if (!missing(validation)) { 132 | if (ncol(validation) != ncol(x)) { 133 | stop("validation does not have the same variables as x") 134 | } 135 | if (missing(val_weights)) { 136 | val_weights = 1.0 137 | } else { 138 | if (!all(dim(val_weights) == dim(validation))) { 139 | stop("validation and val_weights are not the same dimension") 140 | } 141 | } 142 | validation = as.matrix(validation) 143 | M_mat = exp_fam_sat_ind_mat(validation, family) 144 | } else { 145 | M_mat = exp_fam_sat_ind_mat(x, family) 146 | } 147 | } else { 148 | solve_M = FALSE 149 | } 150 | 151 | # if it is standard PCA, only need 1 iteration 152 | if (family == "gaussian" & all(weights == 1) & sum(missing_mat) == 0) { 153 | max_iters = 0 154 | } 155 | 156 | # Initialize # 157 | ################## 158 | if (main_effects) { 159 | if (!missing(start_mu)) { 160 | mu = start_mu 161 | } else { 162 | eta = saturated_natural_parameters(x, family, M = M) 163 | is.na(eta[is.na(x)]) <- TRUE 164 | mu = colMeans(eta, na.rm = TRUE) 165 | # mu = saturated_natural_parameters(colMeans(x, na.rm = TRUE), family, M) 
166 | } 167 | } else { 168 | mu = rep(0, d) 169 | } 170 | 171 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 172 | eta_centered = scale(eta, center = mu, scale = FALSE) 173 | 174 | if (!missing(start_U)) { 175 | U = sweep(start_U, 2, sqrt(colSums(start_U^2)), "/") 176 | } else if (random_start) { 177 | U = matrix(rnorm(d * k), d, k) 178 | U = qr.Q(qr(U)) 179 | } else { 180 | if (partial_decomp) { 181 | udv = RSpectra::svds(scale(eta, center = mu, scale = normalize), k, nu = k, nv = k) 182 | } else { 183 | udv = svd(scale(eta, center = mu, scale = normalize)) 184 | } 185 | U = matrix(udv$v[, 1:k], d, k) 186 | } 187 | 188 | eta_sat_nat = saturated_natural_parameters(x, family, M = Inf) 189 | sat_loglike = exp_fam_log_like(x, eta_sat_nat, family, weights) 190 | 191 | loss_trace = numeric(max_iters + 1) 192 | theta = outer(ones, mu) + eta_centered %*% tcrossprod(U) 193 | loglike <- exp_fam_log_like(x, theta, family, weights) 194 | loss_trace[1] = 2 * (sat_loglike - loglike) / sum_weights 195 | ptm <- proc.time() 196 | 197 | if (!quiet) { 198 | cat(0, " ", loss_trace[1], "") 199 | cat("0 hours elapsed\n") 200 | } 201 | 202 | for (m in seq_len(max_iters)) { 203 | last_U = U 204 | last_M = M 205 | last_mu = mu 206 | 207 | # TODO: incorporate missing data 208 | if (solve_M) { 209 | gpca_obj = structure(list(mu = mu, U = U, M = M, family = family), 210 | class = "gpca") 211 | if (missing(validation)) { 212 | fitted_theta = predict(gpca_obj, newdata = x, type = "link") 213 | } else { 214 | fitted_theta = predict(gpca_obj, newdata = validation, type = "link") 215 | } 216 | fitted_mean = exp_fam_mean(fitted_theta, family) 217 | 218 | if (missing(validation)) { 219 | M_slope = sum(((fitted_mean - x) * weights * (M_mat %*% tcrossprod(U)))[!is.na(M_mat)]) 220 | fitted_var = exp_fam_variance(fitted_theta, family, weights) 221 | } else { 222 | M_slope = sum(((fitted_mean - validation) * val_weights * (M_mat %*% tcrossprod(U)))[!is.na(M_mat)]) 
223 | fitted_var = exp_fam_variance(fitted_theta, family, val_weights) 224 | } 225 | M_curve = sum((fitted_var * (M_mat %*% tcrossprod(U))^2)[!is.na(M_mat)]) 226 | 227 | M = max(M - M_slope / M_curve, 0) 228 | 229 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 230 | eta_centered = scale(eta, center = mu, scale = FALSE) 231 | theta = outer(ones, mu) + eta_centered %*% tcrossprod(U) 232 | } 233 | 234 | first_dir = exp_fam_mean(theta, family) 235 | second_dir = exp_fam_variance(theta, family, weights) 236 | if (majorizer == "row") { 237 | W = apply(second_dir, 1, max) 238 | } else if (majorizer == "all") { 239 | W = rep(max(second_dir), n) 240 | } 241 | 242 | # EM style estimate of Z with theta when missing data 243 | Z = as.matrix(theta + weights * (x - first_dir) / outer(W, rep(1, d))) 244 | Z[is.na(x)] <- theta[is.na(x)] 245 | if (main_effects) { 246 | mu = as.numeric(colSums((Z - eta %*% tcrossprod(U)) * W) / sum(W)) 247 | } 248 | 249 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 250 | eta_centered = scale(eta, center = mu, scale = FALSE) 251 | 252 | mat_temp = t(eta_centered * W) %*% scale(Z, center = mu, scale = FALSE) 253 | mat_temp = mat_temp + t(mat_temp) - 254 | t(eta_centered * W) %*% eta_centered 255 | 256 | # RSpectra could give poor estimates of e-vectors 257 | # so I switch to standard eigen if it does 258 | repeat { 259 | if (partial_decomp) { 260 | eig = RSpectra::eigs_sym(mat_temp, k = min(k + 2, d)) 261 | } else { 262 | eig = eigen(mat_temp, symmetric = TRUE) 263 | } 264 | U = matrix(eig$vectors[, 1:k], d, k) 265 | 266 | theta = outer(ones, mu) + eta_centered %*% tcrossprod(U) 267 | this_loglike <- exp_fam_log_like(x, theta, family, weights) 268 | 269 | if (!partial_decomp | this_loglike>=loglike) { 270 | loglike = this_loglike 271 | break 272 | } else { 273 | partial_decomp = FALSE 274 | if (!quiet) { 275 | cat("RSpectra::eigs_sym was too inaccurate in iteration ", m , 
276 | ". Switched to base::eigen") 277 | } 278 | } 279 | } 280 | 281 | loss_trace[m + 1] = 2 * (sat_loglike - loglike) / sum_weights 282 | 283 | if (!quiet) { 284 | time_elapsed = as.numeric(proc.time() - ptm)[3] 285 | tot_time = max_iters / m * time_elapsed 286 | time_remain = tot_time - time_elapsed 287 | cat(m, " ", loss_trace[m + 1], "") 288 | cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n") 289 | } 290 | if (m > 4) { 291 | if (abs(loss_trace[m] - loss_trace[m+1]) < 2 * conv_criteria) { 292 | break 293 | } 294 | } 295 | } 296 | 297 | # test if loss function increases 298 | if (max_iters > 0 && (loss_trace[m + 1] - loss_trace[m]) > (1e-10)) { 299 | U = last_U 300 | mu = last_mu 301 | M = last_M 302 | m = m - 1 303 | 304 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 305 | eta_centered = scale(eta, center = mu, scale = FALSE) 306 | 307 | if (family != "poisson") { 308 | # maybe possible with missing data? TODO: look into 309 | warning("Deviance increased in last iteration.\nThis should not happen!") 310 | } else { 311 | message("Deviance increased in last iteration.") 312 | } 313 | } else if (max_iters == 0) { 314 | m = 0 315 | } 316 | 317 | null_loglike = exp_fam_log_like(x, outer(ones, null_theta), family, weights) 318 | 319 | object <- list(mu = mu, 320 | U = U, 321 | PCs = eta_centered %*% U, 322 | M = M, 323 | family = family, 324 | iters = m, 325 | loss_trace = loss_trace[1:(m + 1)], 326 | prop_deviance_expl = 1 - (loglike - sat_loglike) / (null_loglike - sat_loglike) 327 | ) 328 | class(object) <- "gpca" 329 | object 330 | } 331 | 332 | 333 | #' @title Predict generalized PCA scores or reconstruction on new data 334 | #' 335 | #' @description Predict generalized PCA scores or reconstruction on new data 336 | #' 337 | #' @param object generalized PCA object 338 | #' @param newdata matrix of the same exponential family as in \code{object}. 
#' @title Predict generalized PCA scores or reconstruction on new data
#'
#' @description Predict generalized PCA scores or reconstruction on new data
#'
#' @param object generalized PCA object
#' @param newdata matrix of the same exponential family as in \code{object}.
#'  If missing, will use the data that \code{object} was fit on
#' @param type the type of fitting required. \code{type = "PCs"} gives the PC scores,
#'  \code{type = "link"} gives matrix on the natural parameter scale and
#'  \code{type = "response"} gives matrix on the response scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrices in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#' mat_np_new = outer(rnorm(rows), loadings)
#'
#' # generate a count matrices
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols)
#'
#' # run Poisson PCA on it
#' gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson")
#'
#' PCs = predict(gpca, mat_new)
#' @export
predict.gpca <- function(object, newdata, type = c("PCs", "link", "response"), ...) {
  type <- match.arg(type)

  if (!missing(newdata)) {
    check_family(newdata, object$family)

    # project the new data's (shrunken) saturated natural parameters onto U,
    # substituting mu for any missing entries before centering
    eta_new <- saturated_natural_parameters(newdata, object$family, object$M) +
      is.na(newdata) * outer(rep(1, nrow(newdata)), object$mu)
    scores <- scale(eta_new, center = object$mu, scale = FALSE) %*% object$U
  } else {
    # no new data: reuse the scores computed at fit time
    scores <- object$PCs
  }

  if (type != "PCs") {
    # delegate link/response reconstruction to fitted() on a copy of the object
    object$PCs <- scores
    return(fitted(object, type, ...))
  }
  scores
}
#' @title Fitted values using generalized PCA
#'
#' @description
#' Fit a lower dimensional representation of the exponential family matrix using generalized PCA
#'
#' @param object generalized PCA object
#' @param type the type of fitting required. \code{type = "link"} gives output on the natural
#'  parameter scale and \code{type = "response"} gives output on the response scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' # run Poisson PCA on it
#' gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson")
#'
#' # construct fitted expected value of counts matrix
#' fit = fitted(gpca, type = "response")
#' @export
fitted.gpca <- function(object, type = c("link", "response"), ...) {
  type <- match.arg(type)

  # reconstruct the natural parameters: mu broadcast over rows plus PCs %*% t(U)
  ones <- rep(1, nrow(object$PCs))
  theta <- outer(ones, object$mu) + tcrossprod(object$PCs, object$U)

  switch(type,
         link = theta,
         response = exp_fam_mean(theta, object$family))
}
#' @title Plot generalized PCA
#'
#' @description
#' Plots the results of a generalized PCA
#'
#' @param x generalized PCA object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#'  iteration, \code{type = "loadings"} plots the first 2 principal component
#'  loadings, \code{type = "scores"} plots the first 2 principal component scores
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' # run logistic PCA on it
#' gpca = generalizedPCA(mat, k = 2, M = 4, family = "poisson")
#'
#' \dontrun{
#' plot(gpca)
#' }
#' @export
plot.gpca <- function(x, type = c("trace", "loadings", "scores"), ...) {
  type = match.arg(type)

  # Scatter the first two columns of a loadings/scores matrix.
  # A rank-1 fit has only one column, so pad with a zero second
  # coordinate and blank the y-axis label.
  # (The "loadings" and "scores" branches previously duplicated this
  # logic verbatim.)
  scatter_first_two = function(mat) {
    df = data.frame(mat)
    colnames(df) <- paste0("PC", 1:ncol(df))
    if (ncol(df) == 1) {
      df$PC2 = 0
      ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) +
        ggplot2::geom_point() +
        ggplot2::labs(y = NULL)
    } else {
      ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) +
        ggplot2::geom_point()
    }
  }

  if (type == "trace") {
    # deviance by iteration, starting at iteration 0 (the initial fit)
    df = data.frame(Iteration = 0:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  } else if (type == "loadings") {
    p <- scatter_first_two(x$U)
  } else if (type == "scores") {
    p <- scatter_first_two(x$PCs)
  }

  return(p)
}
#' @export
print.gpca <- function(x, ...)
{
  # data dimensions and family; PCs has one row per observation,
  # U has one row per original column
  cat(nrow(x$PCs), "rows and", nrow(x$U), "columns of", x$family, "data\n")
  cat("Rank", ncol(x$U), "solution with M =", x$M, "\n")
  cat("\n")
  # fit quality and convergence summary
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}
#' @title CV for generalized PCA
#'
#' @description
#' Run cross validation on dimension and \code{M} for generalized PCA
#'
#' @param x matrix of either binary, count, or continuous data
#' @param ks the different dimensions \code{k} to try
#' @param Ms the different approximations to the saturated model \code{M} to try
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with data weights
#' @param folds if \code{folds} is a scalar, then it is the number of folds. If
#'  it is a vector, it should be the same length as the number of rows in \code{x}
#' @param quiet logical; whether the function should display progress
#' @param ... Additional arguments passed to \code{generalizedPCA}
#'
#' @return A matrix of the CV log likelihood with \code{k} in rows and
#'  \code{M} in columns
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' \dontrun{
#' loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson", quiet = FALSE)
#' plot(loglikes)
#' }
#' @export
cv.gpca <- function(x, ks, Ms = seq(2, 10, by = 2), family = c("gaussian", "binomial", "poisson", "multinomial"),
                    weights, folds = 5, quiet = TRUE, ...)
{
  family = match.arg(family)
  check_family(x, family)

  if (length(folds) > 1) {
    # TODO: confirm this also behaves correctly when folds is a factor
    if (length(unique(folds)) <= 1) {
      stop("If inputing CV split, must be more than one level")
    }
    if (length(folds) != nrow(x)) {
      stop("if folds is a vector, it should be of same length as nrow(x)")
    }
    cv = folds
  } else {
    # randomly assign each row to one of `folds` folds
    cv = sample(1:folds, nrow(x), replace = TRUE)
  }

  if (missing(weights) || length(weights) == 1) {
    weights = matrix(1.0, nrow(x), ncol(x))
  }

  log_likes = matrix(0, length(ks), length(Ms),
                     dimnames = list(k = ks, M = Ms))
  for (k in ks) {
    for (M in Ms) {
      if (!quiet) {
        cat("k =", k, "M =", M, "")
      }
      # `fold` (not `c`) so base::c is not shadowed inside the loop
      for (fold in unique(cv)) {
        if (!quiet) {
          cat(".")
        }
        train = fold != cv
        test = fold == cv
        gpca = generalizedPCA(x[train, ], k = k, M = M, family = family, weights = weights[train, ], ...)
        # fixed partial argument match: `newdat` -> `newdata`
        pred_theta = predict(gpca, newdata = x[test, ], type = "link")
        log_likes[k == ks, M == Ms] = log_likes[k == ks, M == Ms] +
          exp_fam_log_like(x = x[test, ], theta = pred_theta, family = family, weights = weights[test, ])
      }
      if (!quiet) {
        cat("", log_likes[k == ks, M == Ms], "\n")
      }
    }
  }
  class(log_likes) <- c("matrix", "cv.gpca")
  # the best (k, M) maximizes the held-out log likelihood
  which_max = which(log_likes == max(log_likes), arr.ind = TRUE)
  if (!quiet) {
    cat("Best: k =", ks[which_max[1]], "M =", Ms[which_max[2]], "\n")
  }

  return(log_likes)
}
#' @title Plot CV for generalized PCA
#'
#' @description
#' Plot cross validation results generalized PCA
#'
#' @param x a \code{cv.gpca} object
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' \dontrun{
#' loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson")
#' plot(loglikes)
#' }
#' @export
#' @importFrom utils type.convert
plot.cv.gpca <- function(x, ...) {
  # replaces reshape2::melt(-x, value.name = "NegLogLikelihood")
  # as.is = TRUE keeps character dimnames as-is instead of converting to
  # factor, and silences the "as.is should be specified" warning (R >= 3.6)
  Ms = utils::type.convert(colnames(x), as.is = TRUE)
  ks = utils::type.convert(rownames(x), as.is = TRUE)
  df = data.frame(k = rep(ks, times = length(Ms)),
                  m = rep(Ms, each = length(ks)),
                  NegLogLikelihood = as.vector(-x))

  if (ncol(x) == 1) {
    # only one M tried: plot against k, colouring by the (single) M.
    # bug fix: the column is named "m"; `df$M = factor(df$M)` referenced a
    # nonexistent column and errored ("replacement has 0 rows")
    df$m = factor(df$m)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("k", "NegLogLikelihood", colour = "m")) +
      ggplot2::geom_line()
  } else {
    df$k = factor(df$k)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("m", "NegLogLikelihood", colour = "k")) +
      ggplot2::geom_line()
  }
  return(p)
}