├── .Rbuildignore ├── LICENSE ├── tests ├── testthat.R └── testthat │ ├── test-parital.R │ ├── test-count-basics.R │ ├── test-binary-basics.R │ ├── test-solveM-counts.R │ ├── test-multinomial-basics.R │ └── test-continuous-basics.R ├── .gitignore ├── man ├── generalizedPCA-package.Rd ├── log_like_Bernoulli.Rd ├── inv.logit.mat.Rd ├── exp_fam_deviance.Rd ├── plot.cv.gpca.Rd ├── project.Fantope.Rd ├── plot.efh.Rd ├── plot.gmf.Rd ├── plot.gpca.Rd ├── plot.cgpca.Rd ├── fitted.gpca.Rd ├── cv.gmf.Rd ├── cv.cgpca.Rd ├── predict.gpca.Rd ├── predict.efh.Rd ├── predict.cgpca.Rd ├── cv.gpca.Rd ├── predict.gmf.Rd ├── generalizedMF.Rd ├── exponential_family_harmonium.Rd ├── generalizedPCA.Rd └── convexGeneralizedPCA.Rd ├── R ├── generalizedPCA-package.R ├── exponential_family_functions.R ├── exponential_family_harmonium.R ├── generalizedMF.R ├── convexGeneralizedPCA.R └── generalizedPCA.R ├── generalizedPCA.Rproj ├── DESCRIPTION ├── NAMESPACE └── README.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2015 2 | COPYRIGHT HOLDER: Andrew J. 
Landgraf 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(generalizedPCA) 3 | 4 | test_check("generalizedPCA") 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | 5 | # files created by c 6 | src/*.o 7 | src/*.so 8 | src/*.dll 9 | 10 | # folders created when building the package 11 | src-i386 12 | src-x64 13 | -------------------------------------------------------------------------------- /man/generalizedPCA-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA-package.R 3 | \docType{package} 4 | \name{generalizedPCA-package} 5 | \alias{generalizedPCA-package} 6 | \title{generalizedPCA-package} 7 | \description{ 8 | Generalized dimensionality reduction by extending PCA 9 | } 10 | \author{ 11 | Andrew J. Landgraf 12 | } 13 | \keyword{package} 14 | 15 | -------------------------------------------------------------------------------- /R/generalizedPCA-package.R: -------------------------------------------------------------------------------- 1 | #' Generalized dimensionality reduction by extending PCA 2 | #' 3 | #' @name generalizedPCA-package 4 | #' @docType package 5 | #' @title generalizedPCA-package 6 | #' @author Andrew J. 
Landgraf 7 | #' @keywords package 8 | #' 9 | #' @importFrom stats fitted predict rnorm runif 10 | #' @importFrom logisticPCA inv.logit.mat log_like_Bernoulli 11 | NULL 12 | 13 | 14 | 15 | # @useDynLib generalizedPCA, inv_logit_mat, compute_loglik 16 | -------------------------------------------------------------------------------- /generalizedPCA.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /man/log_like_Bernoulli.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_functions.R 3 | \name{log_like_Bernoulli} 4 | \alias{log_like_Bernoulli} 5 | \title{Bernoulli Log Likelihood} 6 | \usage{ 7 | log_like_Bernoulli(x, theta, q) 8 | } 9 | \arguments{ 10 | \item{x}{matrix with all binary entries} 11 | 12 | \item{theta}{estimated natural parameters with 13 | same dimensions as x} 14 | 15 | \item{q}{instead of x, you can input matrix q which is 16 | -1 if \code{x = 0}, 1 if \code{x = 1}, and 0 if \code{is.na(x)}} 17 | } 18 | \description{ 19 | Calculate the Bernoulli log likelihood of matrix 20 | } 21 | 22 | -------------------------------------------------------------------------------- /man/inv.logit.mat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/exponential_family_functions.R 3 | \name{inv.logit.mat} 4 | \alias{inv.logit.mat} 5 | \title{Inverse logit for matrices} 6 | \usage{ 7 | inv.logit.mat(x, min = 0, max = 1) 8 | } 9 | \arguments{ 10 | \item{x}{matrix} 11 | 12 | \item{min}{Lower end of logit interval} 13 | 14 | \item{max}{Upper end of logit interval} 15 | } 16 | \description{ 17 | Apply the inverse logit function to a matrix, element-wise. It 18 | generalizes the \code{inv.logit} function from the \code{gtools} 19 | library to matrices 20 | } 21 | \examples{ 22 | (mat = matrix(rnorm(10 * 5), nrow = 10, ncol = 5)) 23 | inv.logit.mat(mat) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/exp_fam_deviance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_functions.R 3 | \name{exp_fam_deviance} 4 | \alias{exp_fam_deviance} 5 | \title{Calculate exponential family deviance} 6 | \usage{ 7 | exp_fam_deviance(x, theta, family, weights = 1) 8 | } 9 | \arguments{ 10 | \item{x}{a vector or matrix of data} 11 | 12 | \item{theta}{natural parameters. Must be same dimensions as \code{x}} 13 | 14 | \item{family}{exponential family distribution of data} 15 | 16 | \item{weights}{weights of datapoints. Must be a scalar or the same dimensions as the data} 17 | } 18 | \description{ 19 | Calculates the deviance of data assuming it comes from an exponential family 20 | } 21 | 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: generalizedPCA 2 | Type: Package 3 | Title: Generalized PCA 4 | Version: 0.3 5 | Date: 2018-02-24 6 | NeedsCompilation: yes 7 | ByteCompile: yes 8 | Author: Andrew J. Landgraf 9 | Maintainer: Andrew J. 
Landgraf 10 | Description: Generalized dimensionality reduction by extending PCA to exponential 11 | family data. This package includes three methods: Generalized PCA by Landgraf 12 | and Lee (2015), Exponential Family PCA (Generalized MF) by Collins et al. (2001), 13 | and Exponential Family Harmoniums by Welling et al. (2005). 14 | License: MIT + file LICENSE 15 | LazyData: TRUE 16 | URL: https://github.com/andland/generalizedPCA 17 | Imports: 18 | ggplot2, 19 | logisticPCA, 20 | RSpectra (>= 0.10-0) 21 | Suggests: 22 | testthat (>= 0.11.0) 23 | RoxygenNote: 5.0.1 24 | -------------------------------------------------------------------------------- /man/plot.cv.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{plot.cv.gpca} 4 | \alias{plot.cv.gpca} 5 | \title{Plot CV for generalized PCA} 6 | \usage{ 7 | \method{plot}{cv.gpca}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{a \code{cv.gpca} object} 11 | 12 | \item{...}{Additional arguments} 13 | } 14 | \description{ 15 | Plot cross validation results generalized PCA 16 | } 17 | \examples{ 18 | # construct a low rank matrix in the natural parameter space 19 | rows = 100 20 | cols = 10 21 | set.seed(1) 22 | mat_np = outer(rnorm(rows), rnorm(cols)) 23 | 24 | # generate a count matrix 25 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 26 | 27 | \dontrun{ 28 | loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson") 29 | plot(loglikes) 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /man/project.Fantope.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{project.Fantope} 4 | \alias{project.Fantope} 5 | \title{Project onto the Fantope} 6 | \usage{ 
7 | project.Fantope(x, k, partial_decomp = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{a symmetric matrix} 11 | 12 | \item{k}{the rank of the Fantope desired} 13 | 14 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 15 | to more quickly calculate the eigen-decomposition. When the number of columns is small, 16 | the approximation may be less accurate and slower} 17 | } 18 | \value{ 19 | \item{H}{a rank \code{k} Fantope matrix} 20 | \item{U}{a \code{k}-dimentional orthonormal matrix with the first \code{k} eigenvectors of \code{H}} 21 | } 22 | \description{ 23 | Project a symmetric matrix onto the convex set of the rank k Fantope 24 | } 25 | 26 | -------------------------------------------------------------------------------- /man/plot.efh.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_harmonium.R 3 | \name{plot.efh} 4 | \alias{plot.efh} 5 | \title{Plot exponential family harmonium} 6 | \usage{ 7 | \method{plot}{efh}(x, type = c("trace"), ...) 
8 | } 9 | \arguments{ 10 | \item{x}{EFH object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration} 14 | 15 | \item{...}{Additional arguments} 16 | } 17 | \description{ 18 | Plots the results of a EFH 19 | } 20 | \examples{ 21 | # construct a low rank matrix in the logit scale 22 | rows = 100 23 | cols = 10 24 | set.seed(1) 25 | mat_logit = outer(rnorm(rows), rnorm(cols)) 26 | 27 | # generate a binary matrix 28 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 29 | 30 | # run logistic SVD on it 31 | efh = exponential_family_harmonium(mat, k = 2, family = "binomial", family_hidden = "binomial") 32 | 33 | \dontrun{ 34 | plot(efh) 35 | } 36 | } 37 | 38 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(fitted,gpca) 4 | S3method(plot,cgpca) 5 | S3method(plot,cv.gpca) 6 | S3method(plot,efh) 7 | S3method(plot,gmf) 8 | S3method(plot,gpca) 9 | S3method(predict,cgpca) 10 | S3method(predict,efh) 11 | S3method(predict,gmf) 12 | S3method(predict,gpca) 13 | S3method(print,cgpca) 14 | S3method(print,efh) 15 | S3method(print,gmf) 16 | S3method(print,gpca) 17 | export(convexGeneralizedPCA) 18 | export(cv.cgpca) 19 | export(cv.gmf) 20 | export(cv.gpca) 21 | export(exp_fam_deviance) 22 | export(exponential_family_harmonium) 23 | export(generalizedMF) 24 | export(generalizedPCA) 25 | export(inv.logit.mat) 26 | export(project.Fantope) 27 | importFrom(RSpectra,svds) 28 | importFrom(logisticPCA,inv.logit.mat) 29 | importFrom(logisticPCA,log_like_Bernoulli) 30 | importFrom(stats,fitted) 31 | importFrom(stats,predict) 32 | importFrom(stats,rbinom) 33 | importFrom(stats,rnorm) 34 | importFrom(stats,rpois) 35 | importFrom(stats,runif) 36 | importFrom(stats,var) 37 | importFrom(utils,type.convert) 38 | 
-------------------------------------------------------------------------------- /man/plot.gmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{plot.gmf} 4 | \alias{plot.gmf} 5 | \title{Plot generalized MF} 6 | \usage{ 7 | \method{plot}{gmf}(x, type = c("trace", "loadings", "scores"), ...) 8 | } 9 | \arguments{ 10 | \item{x}{generalized MF object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration, \code{type = "loadings"} plots the first 2 principal component 14 | loadings, \code{type = "scores"} plots the loadings first 2 principal component scores} 15 | 16 | \item{...}{Additional arguments} 17 | } 18 | \description{ 19 | Plots the results of a generalized MF 20 | } 21 | \examples{ 22 | # construct a low rank matrix in the logit scale 23 | rows = 100 24 | cols = 10 25 | set.seed(1) 26 | mat_logit = outer(rnorm(rows), rnorm(cols)) 27 | 28 | # generate a binary matrix 29 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 30 | 31 | # run logistic SVD on it 32 | gmf = generalizedMF(mat, k = 2, family = "binomial") 33 | 34 | \dontrun{ 35 | plot(gmf) 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/plot.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{plot.gpca} 4 | \alias{plot.gpca} 5 | \title{Plot generalized PCA} 6 | \usage{ 7 | \method{plot}{gpca}(x, type = c("trace", "loadings", "scores"), ...) 
8 | } 9 | \arguments{ 10 | \item{x}{generalized PCA object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration, \code{type = "loadings"} plots the first 2 principal component 14 | loadings, \code{type = "scores"} plots the loadings first 2 principal component scores} 15 | 16 | \item{...}{Additional arguments} 17 | } 18 | \description{ 19 | Plots the results of a generalized PCA 20 | } 21 | \examples{ 22 | # construct a low rank matrix in the natural parameter space 23 | rows = 100 24 | cols = 10 25 | set.seed(1) 26 | mat_np = outer(rnorm(rows), rnorm(cols)) 27 | 28 | # generate a count matrix 29 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 30 | 31 | # run logistic PCA on it 32 | gpca = generalizedPCA(mat, k = 2, M = 4, family = "poisson") 33 | 34 | \dontrun{ 35 | plot(gpca) 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/plot.cgpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{plot.cgpca} 4 | \alias{plot.cgpca} 5 | \title{Plot convex generalized PCA} 6 | \usage{ 7 | \method{plot}{cgpca}(x, type = c("trace", "loadings", "scores"), ...) 
8 | } 9 | \arguments{ 10 | \item{x}{convex generalized PCA object} 11 | 12 | \item{type}{the type of plot \code{type = "trace"} plots the algorithms progress by 13 | iteration, \code{type = "loadings"} plots the first 2 PC loadings, 14 | \code{type = "scores"} plots the first 2 PC scores} 15 | 16 | \item{...}{Additional arguments} 17 | } 18 | \description{ 19 | Plots the results of a convex generalized PCA 20 | } 21 | \examples{ 22 | # construct a low rank matrix in the logit scale 23 | rows = 100 24 | cols = 10 25 | set.seed(1) 26 | mat_logit = outer(rnorm(rows), rnorm(cols)) 27 | 28 | # generate a binary matrix 29 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 30 | 31 | # run convex generalized PCA on it 32 | cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial") 33 | 34 | \dontrun{ 35 | plot(cgpca) 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /man/fitted.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{fitted.gpca} 4 | \alias{fitted.gpca} 5 | \title{Fitted values using generalized PCA} 6 | \usage{ 7 | \method{fitted}{gpca}(object, type = c("link", "response"), ...) 8 | } 9 | \arguments{ 10 | \item{object}{generalized PCA object} 11 | 12 | \item{type}{the type of fitting required. 
\code{type = "link"} gives output on the natural 13 | parameter scale and \code{type = "response"} gives output on the response scale} 14 | 15 | \item{...}{Additional arguments} 16 | } 17 | \description{ 18 | Fit a lower dimentional representation of the exponential family matrix using generalized PCA 19 | } 20 | \examples{ 21 | # construct a low rank matrix in the natural parameter space 22 | rows = 100 23 | cols = 10 24 | set.seed(1) 25 | mat_np = outer(rnorm(rows), rnorm(cols)) 26 | 27 | # generate a count matrix 28 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 29 | 30 | # run Poisson PCA on it 31 | gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 32 | 33 | # construct fitted expected value of counts matrix 34 | fit = fitted(gpca, type = "response") 35 | } 36 | 37 | -------------------------------------------------------------------------------- /man/cv.gmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{cv.gmf} 4 | \alias{cv.gmf} 5 | \title{CV for generalized MF} 6 | \usage{ 7 | cv.gmf(x, ks, family = c("gaussian", "binomial", "poisson", "multinomial"), 8 | folds = 5, quiet = TRUE, ...) 9 | } 10 | \arguments{ 11 | \item{x}{matrix of either binary, count, or continuous data} 12 | 13 | \item{ks}{the different dimensions \code{k} to try} 14 | 15 | \item{family}{exponential family distribution of data} 16 | 17 | \item{folds}{if \code{folds} is a scalar, then it is the number of folds. 
If 18 | it is a vector, it should be the same length as the number of rows in \code{x}} 19 | 20 | \item{quiet}{logical; whether the function should display progress} 21 | 22 | \item{...}{Additional arguments passed to generalizedMF} 23 | } 24 | \value{ 25 | A matrix of the CV deviance with \code{k} in rows 26 | } 27 | \description{ 28 | Run cross validation on dimension for generalized MF 29 | } 30 | \examples{ 31 | # construct a low rank matrix in the logit scale 32 | rows = 100 33 | cols = 10 34 | set.seed(1) 35 | mat_logit = outer(rnorm(rows), rnorm(cols)) 36 | 37 | # generate a binary matrix 38 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 39 | 40 | \dontrun{ 41 | deviances = cv.gmf(mat, ks = 1:9, family = "binomial") 42 | plot(deviances) 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /man/cv.cgpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{cv.cgpca} 4 | \alias{cv.cgpca} 5 | \title{CV for convex generalized PCA} 6 | \usage{ 7 | cv.cgpca(x, ks, Ms = seq(2, 10, by = 2), folds = 5, quiet = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{x}{matrix with all binary entries} 11 | 12 | \item{ks}{the different dimensions \code{k} to try} 13 | 14 | \item{Ms}{the different approximations to the saturated model \code{M} to try} 15 | 16 | \item{folds}{if \code{folds} is a scalar, then it is the number of folds. 
If 17 | it is a vector, it should be the same length as the number of rows in \code{x}} 18 | 19 | \item{quiet}{logical; whether the function should display progress} 20 | 21 | \item{...}{Additional arguments passed to convexGeneralizedPCA} 22 | } 23 | \value{ 24 | A matrix of the CV log likelihood with \code{k} in rows and 25 | \code{M} in columns 26 | } 27 | \description{ 28 | Run cross validation on dimension and \code{M} for convex generalized PCA 29 | } 30 | \examples{ 31 | # construct a low rank matrix in the logit scale 32 | rows = 100 33 | cols = 10 34 | set.seed(1) 35 | mat_logit = outer(rnorm(rows), rnorm(cols)) 36 | 37 | # generate a binary matrix 38 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 39 | 40 | \dontrun{ 41 | loglikes = cv.cgpca(mat, ks = 1:9, Ms = 3:6) 42 | plot(loglikes) 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /man/predict.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{predict.gpca} 4 | \alias{predict.gpca} 5 | \title{Predict generalized PCA scores or reconstruction on new data} 6 | \usage{ 7 | \method{predict}{gpca}(object, newdata, type = c("PCs", "link", "response"), 8 | ...) 9 | } 10 | \arguments{ 11 | \item{object}{generalized PCA object} 12 | 13 | \item{newdata}{matrix of the same exponential family as in \code{object}. 14 | If missing, will use the data that \code{object} was fit on} 15 | 16 | \item{type}{the type of fitting required. 
\code{type = "PCs"} gives the PC scores, 17 | \code{type = "link"} gives matrix on the natural parameter scale and 18 | \code{type = "response"} gives matrix on the response scale} 19 | 20 | \item{...}{Additional arguments} 21 | } 22 | \description{ 23 | Predict generalized PCA scores or reconstruction on new data 24 | } 25 | \examples{ 26 | # construct a low rank matrices in the natural parameter space 27 | rows = 100 28 | cols = 10 29 | set.seed(1) 30 | loadings = rnorm(cols) 31 | mat_np = outer(rnorm(rows), rnorm(cols)) 32 | mat_np_new = outer(rnorm(rows), loadings) 33 | 34 | # generate a count matrices 35 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 36 | mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols) 37 | 38 | # run Poisson PCA on it 39 | gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 40 | 41 | PCs = predict(gpca, mat_new) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/predict.efh.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_harmonium.R 3 | \name{predict.efh} 4 | \alias{predict.efh} 5 | \title{Predict exponential family harmonium reconstruction on new data} 6 | \usage{ 7 | \method{predict}{efh}(object, newdata, type = c("hidden", "link", "response"), 8 | ...) 9 | } 10 | \arguments{ 11 | \item{object}{EFH object} 12 | 13 | \item{newdata}{matrix of the same exponential family as covariates in \code{object}.} 14 | 15 | \item{type}{the type of fitting required. 
16 | \code{type = "hidden"} gives matrix of hidden mean parameters of \code{x}, 17 | \code{type = "link"} gives a matrix on the natural parameter scale, and 18 | \code{type = "response"} gives a matrix on the response scale} 19 | 20 | \item{...}{Additional arguments} 21 | } 22 | \description{ 23 | Predict exponential family harmonium reconstruction on new data 24 | } 25 | \examples{ 26 | # construct a low rank matrices in the natural parameter space 27 | rows = 100 28 | cols = 10 29 | set.seed(1) 30 | loadings = rnorm(cols) 31 | mat_np = outer(rnorm(rows), rnorm(cols)) 32 | mat_np_new = outer(rnorm(rows), loadings) 33 | 34 | # generate a count matrices 35 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 36 | mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols) 37 | 38 | modp = exponential_family_harmonium(mat, k = 9, family = "poisson", max_iters = 1000) 39 | 40 | pred = predict(modp, mat_new, type = "response") 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/predict.cgpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{predict.cgpca} 4 | \alias{predict.cgpca} 5 | \title{Predict Convex Generalized PCA scores or reconstruction on new data} 6 | \usage{ 7 | \method{predict}{cgpca}(object, newdata, type = c("PCs", "link", "response"), 8 | ...) 9 | } 10 | \arguments{ 11 | \item{object}{convex generalized PCA object} 12 | 13 | \item{newdata}{matrix with all binary entries. If missing, will use the 14 | data that \code{object} was fit on} 15 | 16 | \item{type}{the type of fitting required. 
\code{type = "PCs"} gives the PC scores, 17 | \code{type = "link"} gives matrix on the logit scale and \code{type = "response"} 18 | gives matrix on the probability scale} 19 | 20 | \item{...}{Additional arguments} 21 | } 22 | \description{ 23 | Predict Convex Generalized PCA scores or reconstruction on new data 24 | } 25 | \examples{ 26 | # construct a low rank matrices in the logit scale 27 | rows = 100 28 | cols = 10 29 | set.seed(1) 30 | loadings = rnorm(cols) 31 | mat_logit = outer(rnorm(rows), loadings) 32 | mat_logit_new = outer(rnorm(rows), loadings) 33 | 34 | # convert to a binary matrix 35 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 36 | mat_new = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit_new)) * 1.0 37 | 38 | # run generalized PCA on it 39 | cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial") 40 | 41 | PCs = predict(cgpca, mat_new) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /man/cv.gpca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{cv.gpca} 4 | \alias{cv.gpca} 5 | \title{CV for generalized PCA} 6 | \usage{ 7 | cv.gpca(x, ks, Ms = seq(2, 10, by = 2), family = c("gaussian", "binomial", 8 | "poisson", "multinomial"), weights, folds = 5, quiet = TRUE, ...) 9 | } 10 | \arguments{ 11 | \item{x}{matrix of either binary, count, or continuous data} 12 | 13 | \item{ks}{the different dimensions \code{k} to try} 14 | 15 | \item{Ms}{the different approximations to the saturated model \code{M} to try} 16 | 17 | \item{family}{exponential family distribution of data} 18 | 19 | \item{weights}{an optional matrix of the same size as the \code{x} with data weights} 20 | 21 | \item{folds}{if \code{folds} is a scalar, then it is the number of folds. 
If 22 | it is a vector, it should be the same length as the number of rows in \code{x}} 23 | 24 | \item{quiet}{logical; whether the function should display progress} 25 | 26 | \item{...}{Additional arguments passed to \code{generalizedPCA}} 27 | } 28 | \value{ 29 | A matrix of the CV log likelihood with \code{k} in rows and 30 | \code{M} in columns 31 | } 32 | \description{ 33 | Run cross validation on dimension and \code{M} for generalized PCA 34 | } 35 | \examples{ 36 | # construct a low rank matrix in the natural parameter space 37 | rows = 100 38 | cols = 10 39 | set.seed(1) 40 | mat_np = outer(rnorm(rows), rnorm(cols)) 41 | 42 | # generate a count matrix 43 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 44 | 45 | \dontrun{ 46 | loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson", quiet = FALSE) 47 | plot(loglikes) 48 | } 49 | } 50 | 51 | -------------------------------------------------------------------------------- /tests/testthat/test-parital.R: -------------------------------------------------------------------------------- 1 | context("Parital Decomp") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), rnorm(cols)) 9 | 10 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_np)) * 1.0 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE, partial_decomp = FALSE) 13 | gmf = generalizedMF(mat, k = k, family = "binomial", main_effects = TRUE, partial_decomp = FALSE, method = "svd") 14 | 15 | ppca = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE, partial_decomp = FALSE) 16 | pmf = generalizedMF(mat, k = k, family = "poisson", main_effects = TRUE, partial_decomp = FALSE, method = "svd") 17 | 18 | gpca_part = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE, partial_decomp = TRUE) 19 | gmf_part = generalizedMF(mat, k = k, 
family = "binomial", main_effects = TRUE, partial_decomp = TRUE, method = "svd") 20 | 21 | ppca_part = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE, partial_decomp = TRUE) 22 | pmf_part = generalizedMF(mat, k = k, family = "poisson", main_effects = TRUE, partial_decomp = TRUE, method = "svd") 23 | 24 | test_that("partial_decomp = full decomp", { 25 | expect_equal(gpca$iters, gpca_part$iters) 26 | expect_equal(gmf$iters, gmf_part$iters) 27 | expect_equal(ppca$iters, ppca_part$iters) 28 | expect_equal(pmf$iters, pmf_part$iters) 29 | 30 | expect_equal(gpca$loss_trace, gpca_part$loss_trace) 31 | expect_equal(gmf$loss_trace, gmf_part$loss_trace) 32 | expect_equal(ppca$loss_trace, ppca_part$loss_trace) 33 | expect_equal(pmf$loss_trace, pmf_part$loss_trace) 34 | }) 35 | -------------------------------------------------------------------------------- /man/predict.gmf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{predict.gmf} 4 | \alias{predict.gmf} 5 | \title{Predict generalized PCA scores or reconstruction on new data} 6 | \usage{ 7 | \method{predict}{gmf}(object, newdata, type = c("PCs", "link", "response"), 8 | quiet = TRUE, max_iters = 1000, conv_criteria = 1e-05, start_A, ...) 9 | } 10 | \arguments{ 11 | \item{object}{generalized MF object} 12 | 13 | \item{newdata}{matrix of the same exponential family as covariates in \code{object}. 14 | If missing, will use the data that \code{object} was fit on} 15 | 16 | \item{type}{the type of fitting required. 
17 | \code{type = "PCs"} gives matrix of principal components of \code{x}, 18 | \code{type = "link"} gives a matrix on the natural parameter scale, and 19 | \code{type = "response"} gives a matrix on the response scale} 20 | 21 | \item{quiet}{logical; whether the calculation should show progress} 22 | 23 | \item{max_iters}{maximum number of iterations} 24 | 25 | \item{conv_criteria}{convergence criteria} 26 | 27 | \item{start_A}{initial value for \code{A}} 28 | 29 | \item{...}{Additional arguments} 30 | } 31 | \description{ 32 | Predict generalized PCA scores or reconstruction on new data 33 | } 34 | \examples{ 35 | # construct a low rank matrices in the natural parameter space 36 | rows = 100 37 | cols = 10 38 | set.seed(1) 39 | loadings = rnorm(cols) 40 | mat_np = outer(rnorm(rows), rnorm(cols)) 41 | mat_np_new = outer(rnorm(rows), loadings) 42 | 43 | # generate a count matrices 44 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 45 | mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols) 46 | 47 | # run Poisson PCA on it 48 | gmf = generalizedMF(mat, k = 1, family = "poisson") 49 | 50 | A = predict(gmf, mat_new) 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /tests/testthat/test-count-basics.R: -------------------------------------------------------------------------------- 1 | context("Counts") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), rnorm(cols)) 9 | 10 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE) 13 | gmf = generalizedMF(mat, k = k, family = "poisson", method = "als") 14 | 15 | pred1 = predict(gpca, mat) 16 | pred1l = predict(gpca, mat, type = "link") 17 | pred1r = predict(gpca, mat, type = "response") 18 | fit1l = fitted(gpca, type = "link") 19 | fit1r = 
fitted(gpca, type = "response") 20 | 21 | test_that("correct classes", { 22 | expect_is(gpca, "gpca") 23 | 24 | expect_is(pred1, "matrix") 25 | expect_is(pred1l, "matrix") 26 | expect_is(pred1r, "matrix") 27 | expect_is(fit1l, "matrix") 28 | expect_is(fit1r, "matrix") 29 | 30 | }) 31 | 32 | test_that("k = 1 dimensions", { 33 | expect_equal(dim(gpca$U), c(cols, 1)) 34 | expect_equal(dim(gpca$PCs), c(rows, 1)) 35 | expect_equal(length(gpca$mu), cols) 36 | 37 | expect_equal(dim(pred1), c(rows, 1)) 38 | expect_equal(dim(pred1l), c(rows, cols)) 39 | expect_equal(dim(pred1r), c(rows, cols)) 40 | expect_equal(dim(fit1l), c(rows, cols)) 41 | expect_equal(dim(fit1r), c(rows, cols)) 42 | }) 43 | 44 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 45 | 46 | k = 2 47 | gpca = generalizedPCA(mat, k = k, M = 4, family = "poisson", main_effects = TRUE) 48 | 49 | pred1 = predict(gpca, mat) 50 | pred1l = predict(gpca, mat, type = "link") 51 | pred1r = predict(gpca, mat, type = "response") 52 | fit1l = fitted(gpca, type = "link") 53 | fit1r = fitted(gpca, type = "response") 54 | 55 | test_that("k = 2 dimensions", { 56 | expect_equal(dim(gpca$U), c(cols, 2)) 57 | expect_equal(dim(gpca$PCs), c(rows, 2)) 58 | expect_equal(length(gpca$mu), cols) 59 | 60 | expect_equal(dim(pred1), c(rows, 2)) 61 | expect_equal(dim(pred1l), c(rows, cols)) 62 | expect_equal(dim(pred1r), c(rows, cols)) 63 | expect_equal(dim(fit1l), c(rows, cols)) 64 | expect_equal(dim(fit1r), c(rows, cols)) 65 | }) 66 | 67 | test_that("response between non-negative", { 68 | expect_gte(min(pred1r), 0) 69 | expect_gte(min(fit1r), 0) 70 | }) 71 | -------------------------------------------------------------------------------- /tests/testthat/test-binary-basics.R: -------------------------------------------------------------------------------- 1 | context("Binary") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), 
rnorm(cols)) 9 | 10 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_np)) * 1.0 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE) 13 | 14 | pred1 = predict(gpca, mat) 15 | pred1l = predict(gpca, mat, type = "link") 16 | pred1r = predict(gpca, mat, type = "response") 17 | fit1l = fitted(gpca, type = "link") 18 | fit1r = fitted(gpca, type = "response") 19 | 20 | test_that("correct classes", { 21 | expect_is(gpca, "gpca") 22 | 23 | expect_is(pred1, "matrix") 24 | expect_is(pred1l, "matrix") 25 | expect_is(pred1r, "matrix") 26 | expect_is(fit1l, "matrix") 27 | expect_is(fit1r, "matrix") 28 | 29 | }) 30 | 31 | test_that("k = 1 dimensions", { 32 | expect_equal(dim(gpca$U), c(cols, 1)) 33 | expect_equal(dim(gpca$PCs), c(rows, 1)) 34 | expect_equal(length(gpca$mu), cols) 35 | 36 | expect_equal(dim(pred1), c(rows, 1)) 37 | expect_equal(dim(pred1l), c(rows, cols)) 38 | expect_equal(dim(pred1r), c(rows, cols)) 39 | expect_equal(dim(fit1l), c(rows, cols)) 40 | expect_equal(dim(fit1r), c(rows, cols)) 41 | }) 42 | 43 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 44 | 45 | k = 2 46 | gpca = generalizedPCA(mat, k = k, M = 4, family = "binomial", main_effects = TRUE) 47 | 48 | pred1 = predict(gpca, mat) 49 | pred1l = predict(gpca, mat, type = "link") 50 | pred1r = predict(gpca, mat, type = "response") 51 | fit1l = fitted(gpca, type = "link") 52 | fit1r = fitted(gpca, type = "response") 53 | 54 | test_that("k = 2 dimensions", { 55 | expect_equal(dim(gpca$U), c(cols, 2)) 56 | expect_equal(dim(gpca$PCs), c(rows, 2)) 57 | expect_equal(length(gpca$mu), cols) 58 | 59 | expect_equal(dim(pred1), c(rows, 2)) 60 | expect_equal(dim(pred1l), c(rows, cols)) 61 | expect_equal(dim(pred1r), c(rows, cols)) 62 | expect_equal(dim(fit1l), c(rows, cols)) 63 | expect_equal(dim(fit1r), c(rows, cols)) 64 | }) 65 | 66 | test_that("response between 0 and 1", { 67 | expect_gte(min(pred1r), 0) 68 | expect_gte(min(fit1r), 0) 69 | 70 | 
expect_lte(max(pred1r), 1) 71 | expect_lte(max(fit1r), 1) 72 | }) 73 | -------------------------------------------------------------------------------- /tests/testthat/test-solveM-counts.R: -------------------------------------------------------------------------------- 1 | context("Solve M") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 2 7 | set.seed(1) 8 | 9 | loadings = rnorm(cols) 10 | mat_np1 = outer(rnorm(rows), loadings) 11 | mat_np2 = outer(rnorm(rows), loadings) 12 | 13 | mat = matrix(rpois(rows * cols, c(exp(mat_np1))), rows, cols) 14 | mat_test = matrix(rpois(rows * cols, c(exp(mat_np1))), rows, cols) 15 | 16 | gpca = generalizedPCA(mat, k = k, M = 0, family = "poisson", main_effects = TRUE) 17 | gpca_val = generalizedPCA(mat, k = k, M = 0, family = "poisson", main_effects = TRUE, validation = mat_test) 18 | 19 | pred1 = predict(gpca, mat) 20 | pred1l = predict(gpca, mat, type = "link") 21 | pred1r = predict(gpca, mat, type = "response") 22 | fit1l = fitted(gpca, type = "link") 23 | fit1r = fitted(gpca, type = "response") 24 | 25 | test_that("k = 2 poisson", { 26 | expect_equal(dim(gpca$U), c(cols, 2)) 27 | expect_equal(dim(gpca$PCs), c(rows, 2)) 28 | expect_equal(length(gpca$mu), cols) 29 | 30 | expect_equal(dim(pred1), c(rows, 2)) 31 | expect_equal(dim(pred1l), c(rows, cols)) 32 | expect_equal(dim(pred1r), c(rows, cols)) 33 | expect_equal(dim(fit1l), c(rows, cols)) 34 | expect_equal(dim(fit1r), c(rows, cols)) 35 | }) 36 | 37 | rm(gpca, gpca_val) 38 | 39 | tot = rowSums(mat) + rbinom(rows, 1, 0.5) 40 | tot_test = rowSums(mat_test) + rbinom(rows, 1, 0.5) 41 | matp = sweep(mat, 1, tot, "/") 42 | matp_test = sweep(mat_test, 1, tot_test, "/") 43 | wghts = outer(tot, rep(1, cols)) 44 | wghts_test = outer(tot_test, rep(1, cols)) 45 | 46 | gpca = generalizedPCA(matp, k = k, M = 0, family = "multinomial", weights = wghts) 47 | gpca_val = generalizedPCA(matp, k = k, M = 0, family = "multinomial", 
weights = wghts, 48 | validation = matp_test, val_weights = wghts_test) 49 | 50 | pred1 = predict(gpca, matp) 51 | pred1l = predict(gpca, matp, type = "link") 52 | pred1r = predict(gpca, matp, type = "response") 53 | fit1l = fitted(gpca, type = "link") 54 | fit1r = fitted(gpca, type = "response") 55 | 56 | test_that("k = 2 multinomial", { 57 | expect_equal(dim(gpca$U), c(cols, 2)) 58 | expect_equal(dim(gpca$PCs), c(rows, 2)) 59 | expect_equal(length(gpca$mu), cols) 60 | 61 | expect_equal(dim(pred1), c(rows, 2)) 62 | expect_equal(dim(pred1l), c(rows, cols)) 63 | expect_equal(dim(pred1r), c(rows, cols)) 64 | expect_equal(dim(fit1l), c(rows, cols)) 65 | expect_equal(dim(fit1r), c(rows, cols)) 66 | }) 67 | -------------------------------------------------------------------------------- /tests/testthat/test-multinomial-basics.R: -------------------------------------------------------------------------------- 1 | context("Multinomial") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | 9 | mat = model.matrix(~ factor(sample(1:(cols + 1), rows, replace = TRUE)) - 1)[, 1:cols] 10 | colnames(mat) <- NULL 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "multinomial", main_effects = TRUE) 13 | 14 | pred1 = predict(gpca, mat) 15 | pred1l = predict(gpca, mat, type = "link") 16 | pred1r = predict(gpca, mat, type = "response") 17 | fit1l = fitted(gpca, type = "link") 18 | fit1r = fitted(gpca, type = "response") 19 | 20 | test_that("correct classes", { 21 | expect_is(gpca, "gpca") 22 | 23 | expect_is(pred1, "matrix") 24 | expect_is(pred1l, "matrix") 25 | expect_is(pred1r, "matrix") 26 | expect_is(fit1l, "matrix") 27 | expect_is(fit1r, "matrix") 28 | 29 | }) 30 | 31 | test_that("k = 1 dimensions", { 32 | expect_equal(dim(gpca$U), c(cols, 1)) 33 | expect_equal(dim(gpca$PCs), c(rows, 1)) 34 | expect_equal(length(gpca$mu), cols) 35 | 36 | expect_equal(dim(pred1), c(rows, 1)) 37 | 
expect_equal(dim(pred1l), c(rows, cols)) 38 | expect_equal(dim(pred1r), c(rows, cols)) 39 | expect_equal(dim(fit1l), c(rows, cols)) 40 | expect_equal(dim(fit1r), c(rows, cols)) 41 | }) 42 | 43 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 44 | 45 | k = 2 46 | gpca = generalizedPCA(mat, k = k, M = 4, family = "multinomial", main_effects = TRUE) 47 | 48 | pred1 = predict(gpca, mat) 49 | pred1l = predict(gpca, mat, type = "link") 50 | pred1r = predict(gpca, mat, type = "response") 51 | fit1l = fitted(gpca, type = "link") 52 | fit1r = fitted(gpca, type = "response") 53 | 54 | test_that("k = 2 dimensions", { 55 | expect_equal(dim(gpca$U), c(cols, 2)) 56 | expect_equal(dim(gpca$PCs), c(rows, 2)) 57 | expect_equal(length(gpca$mu), cols) 58 | 59 | expect_equal(dim(pred1), c(rows, 2)) 60 | expect_equal(dim(pred1l), c(rows, cols)) 61 | expect_equal(dim(pred1r), c(rows, cols)) 62 | expect_equal(dim(fit1l), c(rows, cols)) 63 | expect_equal(dim(fit1r), c(rows, cols)) 64 | }) 65 | 66 | test_that("response between 0 and 1", { 67 | expect_gte(min(pred1r), 0) 68 | expect_gte(min(fit1r), 0) 69 | 70 | expect_lte(max(pred1r), 1) 71 | expect_lte(max(fit1r), 1) 72 | }) 73 | 74 | test_that("fitted row sums = 1", { 75 | expect_gte(min(rowSums(pred1r)), 0) 76 | expect_gte(min(rowSums(fit1r)), 0) 77 | 78 | expect_lte(max(rowSums(pred1r)), 1) 79 | expect_lte(max(rowSums(fit1r)), 1) 80 | }) 81 | -------------------------------------------------------------------------------- /tests/testthat/test-continuous-basics.R: -------------------------------------------------------------------------------- 1 | context("Continuous") 2 | 3 | # construct a low rank matrix in the natural parameter scale 4 | rows = 100 5 | cols = 10 6 | k = 1 7 | set.seed(1) 8 | mat_np = outer(rnorm(rows), rnorm(cols)) 9 | 10 | mat = matrix(rnorm(rows * cols, c(mat_np)), rows, cols) 11 | 12 | gpca = generalizedPCA(mat, k = k, M = 4, family = "gaussian", main_effects = TRUE) 13 | 14 | pca_mu = colMeans(mat) 15 | pca = 
svd(scale(mat, center = pca_mu, scale = FALSE)) 16 | gpca2 = gpca 17 | gpca$mu = pca_mu 18 | gpca$U = matrix(pca$v[, 1:k], cols, k) 19 | 20 | 21 | pred1 = predict(gpca, mat) 22 | pred1l = predict(gpca, mat, type = "link") 23 | pred1r = predict(gpca, mat, type = "response") 24 | fit1l = fitted(gpca, type = "link") 25 | fit1r = fitted(gpca, type = "response") 26 | 27 | pred2l = predict(gpca2, mat, type = "link") 28 | pred2r = predict(gpca2, mat, type = "response") 29 | 30 | test_that("correct classes", { 31 | expect_is(gpca, "gpca") 32 | 33 | expect_is(pred1, "matrix") 34 | expect_is(pred1l, "matrix") 35 | expect_is(pred1r, "matrix") 36 | expect_is(fit1l, "matrix") 37 | expect_is(fit1r, "matrix") 38 | }) 39 | 40 | test_that("k = 1 dimensions", { 41 | expect_equal(dim(gpca$U), c(cols, 1)) 42 | expect_equal(dim(gpca$PCs), c(rows, 1)) 43 | expect_equal(length(gpca$mu), cols) 44 | 45 | expect_equal(dim(pred1), c(rows, 1)) 46 | expect_equal(dim(pred1l), c(rows, cols)) 47 | expect_equal(dim(pred1r), c(rows, cols)) 48 | expect_equal(dim(fit1l), c(rows, cols)) 49 | expect_equal(dim(fit1r), c(rows, cols)) 50 | }) 51 | 52 | test_that("k = 1 same fits as standard PCA", { 53 | expect_equal(pred1r, pred2r) 54 | expect_equal(pred1l, pred2l) 55 | # Don't compare fitted, because they rely on $PCs 56 | expect_equal(gpca$mu, pca_mu) 57 | expect_equal(pca$v[, 1], gpca$U[, 1]) 58 | expect_equal(pca$u[, 1] * pca$d[1], gpca$PCs[, 1]) 59 | }) 60 | 61 | rm(gpca, pred1, pred1l, pred1r, fit1l, fit1r) 62 | 63 | k = 2 64 | gpca = generalizedPCA(mat, k = k, M = 4, family = "gaussian", main_effects = TRUE) 65 | 66 | pred1 = predict(gpca, mat) 67 | pred1l = predict(gpca, mat, type = "link") 68 | pred1r = predict(gpca, mat, type = "response") 69 | fit1l = fitted(gpca, type = "link") 70 | fit1r = fitted(gpca, type = "response") 71 | 72 | test_that("k = 2 dimensions", { 73 | expect_equal(dim(gpca$U), c(cols, 2)) 74 | expect_equal(dim(gpca$PCs), c(rows, 2)) 75 | expect_equal(length(gpca$mu), cols) 76 
| 77 | expect_equal(dim(pred1), c(rows, 2)) 78 | expect_equal(dim(pred1l), c(rows, cols)) 79 | expect_equal(dim(pred1r), c(rows, cols)) 80 | expect_equal(dim(fit1l), c(rows, cols)) 81 | expect_equal(dim(fit1r), c(rows, cols)) 82 | }) 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Generalized PCA 2 | 3 | `generalizedPCA` is an R package which extends principal component analysis to other types of data, much like [generalized linear models](http://en.wikipedia.org/wiki/Generalized_linear_model) extends linear regression. The package [logisticPCA](https://github.com/andland/logisticPCA) contains the extension to binary data, among other methods, and this package intends to generalize it to all exponential family distributions. Please note that it is still in the very early stages of development and the conventions will possibly change in the future. 4 | 5 | ## Installation 6 | 7 | To install R, visit [r-project.org/](http://www.r-project.org/). 8 | 9 | To install the package, first install `devtools` from CRAN. Then run the following commands. 10 | ```R 11 | # install.packages("devtools") 12 | library("devtools") 13 | install_github("andland/generalizedPCA") 14 | ``` 15 | 16 | ## Use 17 | The main function is `generalizedPCA()`. Like in generalized linear models, you must specify the distribution of your data. `generalizedPCA()` currently supports `"gaussian"`, `"binomial"`, `"poisson"`, or `"multinomial"` data. Unlike standard PCA, it can incorporate weights and missing data. If your data are proportions, you can use `family = "binomial"` with `weights` being a matrix of the number of opportunities. If your data is a multinomial variable with `d` levels, the input matrix should have `d - 1` columns. 18 | 19 | The function returns `mu`, the variable main effects vector of length `d`, and `U`, the `d x k` loadings matrix. 
20 | 21 | ## Details 22 | `generalizedPCA()` estimates the natural parameters of an exponential family distribution in a lower dimensional space. This is done by projecting the natural parameters from the saturated model. A rank-`k` projection matrix, or equivalently a `d x k` orthogonal matrix, is solved for to minimize the deviance. 23 | 24 | For some distributions, the natural parameters from the saturated model are either negative or positive infinity, and an additional tuning parameter `M` is needed to approximate them. This occurs when `family = "binomial"` and your data include `0`'s or `1`'s or when `family = "poisson"` and your data include `0`'s. You can use `cv.gpca()` to select `M` by cross validation. Typical values are in the range of `3` to `10`. 25 | 26 | A manuscript describing generalized PCA applied to binary data can be found [here](http://www.stat.osu.edu/~yklee/mss/tr890.pdf). 27 | 28 | ## Methods 29 | The generalizedPCA class, `gpca`, has several methods to make data analysis easier. 30 | 31 | * `print()`: Prints a summary of the fitted model. 32 | * `fitted()`: Fits the low dimensional matrix of either natural parameters or response. 33 | * `predict()`: Predicts the PCs on new data. Can also predict the low dimensional matrix of natural parameters or response on new data. 34 | * `plot()`: Plots either the deviance trace by the number of iterations, the first two PC loadings, or the first two PC scores using the package `ggplot2`. 35 | 36 | In addition, there are functions for performing cross-validation. 37 | 38 | * `cv.gpca()`: Run cross-validation over the rows of the matrix to assess the fit of `M` and/or `k`. 39 | * `plot.cv()`: Plots the results of the `cv.gpca()` function using the package `ggplot2`. 
40 | -------------------------------------------------------------------------------- /man/generalizedMF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedMF.R 3 | \name{generalizedMF} 4 | \alias{generalizedMF} 5 | \title{Exponential Family Matrix Factorization} 6 | \usage{ 7 | generalizedMF(x, k = 2, family = c("gaussian", "binomial", "poisson"), 8 | weights, quiet = TRUE, max_iters = 1000, conv_criteria = 1e-05, 9 | partial_decomp = FALSE, random_start = FALSE, start_A, start_B, mu, 10 | main_effects = TRUE, method = c("als", "svd")) 11 | } 12 | \arguments{ 13 | \item{x}{matrix of either binary, proportions, count, or continuous data} 14 | 15 | \item{k}{dimension} 16 | 17 | \item{family}{exponential family distribution of data} 18 | 19 | \item{weights}{an optional matrix of the same size as the \code{x} with data weights} 20 | 21 | \item{quiet}{logical; whether the calculation should give feedback} 22 | 23 | \item{max_iters}{maximum number of iterations} 24 | 25 | \item{conv_criteria}{convergence criteria} 26 | 27 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 28 | to more quickly calculate the SVD. When the number of columns is small, 29 | the approximation may be less accurate and slower} 30 | 31 | \item{random_start}{whether to randomly initialize \code{A} and \code{B}} 32 | 33 | \item{start_A}{initial value for \code{A}} 34 | 35 | \item{start_B}{initial value for \code{B}} 36 | 37 | \item{mu}{specific value for \code{mu}, the mean vector of \code{x}} 38 | 39 | \item{main_effects}{logical; whether to include main effects in the model} 40 | 41 | \item{method}{which algorithm to use. \code{"als"} uses alternating least squares. 42 | It has the benefit of majozing row-wise and column-wise for each of the updates. 43 | \code{"svd"} uses singular value decomposition (similar to de Leeuw, 2006). 
It leads to 44 | a more general majorization, which may not work well for heterogeneous matrices.} 45 | } 46 | \value{ 47 | An S3 object of class \code{gmf} which is a list with the 48 | following components: 49 | \item{mu}{the main effects for dimensionality reduction} 50 | \item{A}{the \code{n}x\code{k}-dimensional matrix with the scores} 51 | \item{B}{the \code{d}x\code{k}-dimensional matrix with the loadings} 52 | \item{family}{the exponential family of the data} 53 | \item{iters}{number of iterations required for convergence} 54 | \item{loss_trace}{the trace of the average deviance of the algorithm. 55 | Should be non-increasing} 56 | \item{prop_deviance_expl}{the proportion of deviance explained by this model. 57 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 58 | the null model estimates 0 for all natural parameters.} 59 | } 60 | \description{ 61 | Collins et al. (2001)'s Exponential Family PCA 62 | } 63 | \examples{ 64 | rows = 100 65 | cols = 10 66 | set.seed(1) 67 | mat_np = outer(rnorm(rows), rnorm(cols)) 68 | 69 | # generate a count matrix and binary response 70 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 71 | 72 | mod = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE) 73 | 74 | } 75 | \references{ 76 | de Leeuw, Jan, 2006. Principal component analysis of binary data 77 | by iterated singular value decomposition. Computational Statistics & Data Analysis 78 | 50 (1), 21--39. 79 | 80 | Collins, M., Dasgupta, S., & Schapire, R. E., 2001. A generalization of principal 81 | components analysis to the exponential family. In NIPS, 617--624.
82 | } 83 | 84 | -------------------------------------------------------------------------------- /man/exponential_family_harmonium.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exponential_family_harmonium.R 3 | \name{exponential_family_harmonium} 4 | \alias{exponential_family_harmonium} 5 | \title{Exponential Family Harmoniums} 6 | \usage{ 7 | exponential_family_harmonium(x, k = 2, family = c("gaussian", "binomial", 8 | "poisson"), family_hidden = c("gaussian", "binomial", "poisson"), 9 | cd_iters = 1, learning_rate = 0.001, max_iters = 100, 10 | rms_prop = FALSE, quiet = TRUE, random_start = TRUE, start_W, mu, 11 | main_effects = TRUE) 12 | } 13 | \arguments{ 14 | \item{x}{matrix of either binary, count, or continuous data} 15 | 16 | \item{k}{dimension (number of hidden units)} 17 | 18 | \item{family}{exponential family distribution of data} 19 | 20 | \item{family_hidden}{exponential family distribution of hidden units} 21 | 22 | \item{cd_iters}{number of iterations for contrastive divergence (CD) at each iteration. 23 | It should be a vector of two integers, which is the range of CD interations that the 24 | algorithm will perform from the beginning until the end, linearly interpolated} 25 | 26 | \item{learning_rate}{learning rate used for gradient descent} 27 | 28 | \item{max_iters}{maximum number of iterations} 29 | 30 | \item{rms_prop}{logical; whether to use RMS prop for optimization. 
Default is \code{FALSE}.} 31 | 32 | \item{quiet}{logical; whether the calculation should give feedback} 33 | 34 | \item{random_start}{whether to randomly initialize \code{W}} 35 | 36 | \item{start_W}{initial value for \code{W}} 37 | 38 | \item{mu}{specific value for \code{mu}, the mean (bias) vector of \code{x}} 39 | 40 | \item{main_effects}{logical; whether to include main effects (bias terms) in the model} 41 | } 42 | \value{ 43 | An S3 object of class \code{efh} which is a list with the 44 | following components: 45 | \item{mu}{the main effects (bias terms) for dimensionality reduction} 46 | \item{hidden_bias}{the bias for the hidden units (currently hard coded to 0)} 47 | \item{W}{the \code{d}x\code{k}-dimentional matrix with the loadings} 48 | \item{family}{the exponential family of the data} 49 | \item{family_hidden}{the exponential family of the hidden units} 50 | \item{iters}{number of iterations required for convergence} 51 | \item{loss_trace}{the trace of the average deviance of the algorithm. 52 | Should be non-increasing} 53 | \item{prop_deviance_expl}{the proportion of deviance explained by this model. 54 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 55 | the null model estimates 0 for all natural parameters.} 56 | } 57 | \description{ 58 | Welling et al. (2005)'s Exponential Family Harmoniums 59 | } 60 | \examples{ 61 | rows = 100 62 | cols = 10 63 | set.seed(1) 64 | mat_np = outer(rnorm(rows), rnorm(cols)) 65 | 66 | # generate a count matrix and binary response 67 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 68 | mat[1, 1] <- NA 69 | 70 | modp = exponential_family_harmonium(mat, k = 2, family = "poisson", quiet = FALSE, 71 | learning_rate = 0.001, rms_prop = FALSE, max_iters = 250) 72 | gmf = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE) 73 | 74 | 75 | } 76 | \references{ 77 | Welling, Max, Michal Rosen-Zvi, and Geoffrey E. Hinton. 
"Exponential family 78 | harmoniums with an application to information retrieval." Advances in neural 79 | information processing systems. 2005. 80 | } 81 | 82 | -------------------------------------------------------------------------------- /man/generalizedPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/generalizedPCA.R 3 | \name{generalizedPCA} 4 | \alias{generalizedPCA} 5 | \title{Generalized Principal Component Analysis} 6 | \usage{ 7 | generalizedPCA(x, k = 2, M = 4, family = c("gaussian", "binomial", 8 | "poisson", "multinomial"), weights, quiet = TRUE, majorizer = c("row", 9 | "all"), partial_decomp = FALSE, max_iters = 1000, conv_criteria = 1e-05, 10 | random_start = FALSE, start_U, start_mu, main_effects = TRUE, 11 | normalize = FALSE, validation, val_weights) 12 | } 13 | \arguments{ 14 | \item{x}{matrix of either binary, proportions, count, or continuous data} 15 | 16 | \item{k}{number of principal components to return} 17 | 18 | \item{M}{value to approximate the saturated model} 19 | 20 | \item{family}{exponential family distribution of data} 21 | 22 | \item{weights}{an optional matrix of the same size as the \code{x} with data weights} 23 | 24 | \item{quiet}{logical; whether the calculation should give feedback} 25 | 26 | \item{majorizer}{how to majorize the deviance. \code{"row"} gives 27 | tighter majorization, but may take longer to calculate each iteration. 28 | \code{"all"} may be faster per iteration, but take more iterations} 29 | 30 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 31 | to more quickly calculate the SVD. When the number of columns is small, 32 | the approximation may be less accurate and slower} 33 | 34 | \item{max_iters}{number of maximum iterations} 35 | 36 | \item{conv_criteria}{convergence criteria. 
The difference between average deviance 37 | in successive iterations} 38 | 39 | \item{random_start}{logical; whether to randomly initialize the parameters. If \code{FALSE}, 40 | function will use an eigen-decomposition as starting value} 41 | 42 | \item{start_U}{starting value for the orthogonal matrix} 43 | 44 | \item{start_mu}{starting value for mu. Only used if \code{main_effects = TRUE}} 45 | 46 | \item{main_effects}{logical; whether to include main effects in the model} 47 | 48 | \item{normalize}{logical; whether to weight the variables so they all have equal influence} 49 | 50 | \item{validation}{a validation dataset to select \code{M} with} 51 | 52 | \item{val_weights}{weights associated with validation data} 53 | } 54 | \value{ 55 | An S3 object of class \code{gpca} which is a list with the 56 | following components: 57 | \item{mu}{the main effects} 58 | \item{U}{a \code{k}-dimensional orthonormal matrix with the loadings} 59 | \item{PCs}{the principal component scores} 60 | \item{M}{the parameter inputted} 61 | \item{family}{the exponential family used} 62 | \item{iters}{number of iterations required for convergence} 63 | \item{loss_trace}{the trace of the average deviance of the algorithm. 64 | Should be non-increasing} 65 | \item{prop_deviance_expl}{the proportion of deviance explained by this model.
66 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 67 | the null model estimates 0 for all natural parameters.} 68 | } 69 | \description{ 70 | Dimension reduction for exponential family data by extending Pearson's 71 | PCA formulation 72 | } 73 | \examples{ 74 | # construct a low rank matrix in the natural parameter space 75 | rows = 100 76 | cols = 10 77 | set.seed(1) 78 | mat_np = outer(rnorm(rows), rnorm(cols)) 79 | 80 | # generate a count matrix 81 | mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 82 | 83 | # run Poisson PCA on it 84 | gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 85 | } 86 | 87 | -------------------------------------------------------------------------------- /man/convexGeneralizedPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/convexGeneralizedPCA.R 3 | \name{convexGeneralizedPCA} 4 | \alias{convexGeneralizedPCA} 5 | \title{Convex Generalized Principal Component Analysis} 6 | \usage{ 7 | convexGeneralizedPCA(x, k = 2, M = 4, family = c("gaussian", "binomial", 8 | "poisson", "multinomial"), weights, quiet = TRUE, partial_decomp = FALSE, 9 | max_iters = 1000, conv_criteria = 1e-06, random_start = FALSE, start_H, 10 | mu, main_effects = TRUE, normalize = FALSE, ss_factor = 1) 11 | } 12 | \arguments{ 13 | \item{x}{matrix of either binary, proportions, count, or continuous data} 14 | 15 | \item{k}{number of principal components to return} 16 | 17 | \item{M}{value to approximate the saturated model} 18 | 19 | \item{family}{exponential family distribution of data} 20 | 21 | \item{weights}{an optional matrix of the same size as the \code{x} with non-negative weights} 22 | 23 | \item{quiet}{logical; whether the calculation should give feedback} 24 | 25 | \item{partial_decomp}{logical; if \code{TRUE}, the function uses the RSpectra package 26 | to more quickly 
calculate the eigen-decomposition. When the number of columns is small, 27 | the approximation may be less accurate and slower} 28 | 29 | \item{max_iters}{number of maximum iterations} 30 | 31 | \item{conv_criteria}{convergence criteria. The difference between average deviance 32 | in successive iterations} 33 | 34 | \item{random_start}{logical; whether to randomly initialize the parameters. If \code{FALSE}, 35 | function will use an eigen-decomposition as starting value} 36 | 37 | \item{start_H}{starting value for the Fantope matrix} 38 | 39 | \item{mu}{main effects vector. Only used if \code{main_effects = TRUE}} 40 | 41 | \item{main_effects}{logical; whether to include main effects in the model} 42 | 43 | \item{normalize}{logical; whether to weight the variables so they all have equal influence} 44 | 45 | \item{ss_factor}{step size multiplier. Amount by which to multiply the step size. Quadratic 46 | convergence rate can be proven for \code{ss_factor = 1}, but I have found higher values 47 | sometimes work better. The default is \code{ss_factor = 1}. 48 | If it is not converging quickly enough, try a higher value such as \code{ss_factor = 4}.} 49 | } 50 | \value{ 51 | An S3 object of class \code{cgpca} which is a list with the 52 | following components: 53 | \item{mu}{the main effects} 54 | \item{H}{a rank \code{k} Fantope matrix} 55 | \item{U}{a \code{ceiling(k)}-dimensional orthonormal matrix with the loadings} 56 | \item{PCs}{the principal component scores} 57 | \item{M}{the parameter inputted} 58 | \item{iters}{number of iterations required for convergence} 59 | \item{loss_trace}{the trace of the average deviance using the Fantope matrix} 60 | \item{proj_loss_trace}{the trace of the average deviance using the projection matrix} 61 | \item{prop_deviance_expl}{the proportion of deviance explained by this model.
62 | If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 63 | the null model estimates 0 for all natural parameters.} 64 | } 65 | \description{ 66 | Dimensionality reduction for exponential family data by extending Pearson's 67 | PCA formulation to minimize deviance. The convex relaxation 68 | to projection matrices, the Fantope, is used. 69 | } 70 | \examples{ 71 | # construct a low rank matrix in the logit scale 72 | rows = 100 73 | cols = 10 74 | set.seed(1) 75 | mat_logit = outer(rnorm(rows), rnorm(cols)) 76 | 77 | # generate a binary matrix 78 | mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0 79 | 80 | # run convex generalized PCA on it 81 | cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial") 82 | } 83 | 84 | -------------------------------------------------------------------------------- /R/exponential_family_functions.R: -------------------------------------------------------------------------------- 1 | #' @title Inverse logit for matrices 2 | #' 3 | #' @description 4 | #' Apply the inverse logit function to a matrix, element-wise. 
It 5 | #' generalizes the \code{inv.logit} function from the \code{gtools} 6 | #' library to matrices 7 | #' 8 | #' @param x matrix 9 | #' @param min Lower end of logit interval 10 | #' @param max Upper end of logit interval 11 | #' @examples 12 | #' (mat = matrix(rnorm(10 * 5), nrow = 10, ncol = 5)) 13 | #' inv.logit.mat(mat) 14 | #' @export 15 | inv.logit.mat <- function(x, min = 0, max = 1) { 16 | # .Call("inv_logit_mat", x, min, max, PACKAGE = "generalizedPCA") 17 | logisticPCA::inv.logit.mat(x, min, max) 18 | } 19 | 20 | #' @title Bernoulli Log Likelihood 21 | #' 22 | #' @description 23 | #' Calculate the Bernoulli log likelihood of matrix 24 | #' 25 | #' @param x matrix with all binary entries 26 | #' @param theta estimated natural parameters with 27 | #' same dimensions as x 28 | #' @param q instead of x, you can input matrix q which is 29 | #' -1 if \code{x = 0}, 1 if \code{x = 1}, and 0 if \code{is.na(x)} 30 | # @export 31 | log_like_Bernoulli <- function(x, theta, q) { 32 | logisticPCA::log_like_Bernoulli(x, theta, q) 33 | } 34 | 35 | # Validate that the data in x is consistent with the requested exponential family; 35 | # stops on impossible data, messages on likely family mismatches. 35 | check_family <- function(x, family) { 36 | distinct_vals = unique(c(x[!is.na(x)])) 37 | if (family %in% c("binomial", "multinomial") && 38 | any(distinct_vals < 0 | distinct_vals > 1)) { 39 | stop(paste0(family, " family with data outside [0, 1]")) 40 | } 41 | if (family == "poisson" && any(distinct_vals < 0)) { # && is scalar-only; any() must wrap the vectorized comparison 42 | stop("Negative data with poisson family") 43 | } 44 | if (family == "multinomial" && any(rowSums(x) > 1)) { 45 | stop(paste0(family, " family must have row sums <= 1")) 46 | } 47 | if (all(distinct_vals %in% c(0, 1))) { 48 | if (!(family %in% c("binomial", "multinomial"))) { 49 | message("All entries are binary. Are you sure you didn't mean binomial?") 50 | } 51 | } else if (all(distinct_vals >= 0 & distinct_vals %% 1 == 0)) { 52 | if (family != "poisson") { 53 | message("All entries are counts. 
Are you sure you didn't mean poisson?") 54 | } 55 | } 56 | if (any(distinct_vals %% 1 != 0) & family == "poisson") { 57 | message("Non-integer data with poisson data. Did you want family = gaussian?") 58 | } 59 | } 60 | 61 | exp_fam_guess <- function(x) { 62 | distinct_vals = unique(c(x[!is.na(x)])) 63 | if (all(distinct_vals %in% c(0, 1))) { 64 | family = "binomial" 65 | message("Guessing data comes from a binomial distribution because it consists of all 0's and 1's") 66 | } else if (all(distinct_vals >= 0 & distinct_vals %% 1 == 0)) { 67 | family = "poisson" 68 | message("Guessing data comes from a poisson distribution because it consists of all non-negative integers") 69 | } else if (all(distinct_vals >=0 & distinct_vals <= 1) & all(rowSums(x, na.rm = TRUE) <= 1)) { 70 | family = "multinomial" 71 | message("Guessing data comes from a multinomial distribution because all the rows sum to less than 1") 72 | } else { 73 | family = "gaussian" 74 | } 75 | return(family) 76 | } 77 | 78 | saturated_natural_parameters <- function(x, family, M) { 79 | if (family == "gaussian") { 80 | eta = x 81 | } else if (family == "binomial") { 82 | eta = abs(M) * (2 * x - 1) 83 | non_binary = (x != 0 & x != 1 & !is.na(x)) 84 | if (sum(non_binary) > 0) { 85 | logitvals = log(x) - log(1 - x) 86 | eta[non_binary] = logitvals[non_binary] 87 | } 88 | } else if (family == "poisson") { 89 | eta = log(x) 90 | eta[x==0] = -abs(M) 91 | } else if (family == "multinomial") { 92 | # TODO: check this! Correct if 0/1s.
93 | eta = abs(M) * (2 * x - 1) 94 | non_binary = (x != 0 & x != 1 & !is.na(x)) 95 | if (sum(non_binary) > 0) { 96 | # TODO: give warning first time if any(rowSums(x) == 1) 97 | # TODO: doesn't deal with NAs, 98 | # although typically a whole categorical variable would be NA 99 | 100 | # last_cat_prob = ifelse(rowSums(x) > 1 - as.numeric(inv.logit.mat(-as.numeric(M))), 101 | # as.numeric(inv.logit.mat(-as.numeric(M))), 1 - rowSums(x)) 102 | # logitvals = sweep(log(x), 1, log(last_cat_prob), "-") 103 | 104 | # Below is in line with what is in tech report 105 | last_cat_prob = 1 - rowSums(x) 106 | last_cat_prob[last_cat_prob < exp(-as.numeric(M))] = exp(-as.numeric(M)) 107 | logitvals = sweep(log(x), 1, log(last_cat_prob), "-") 108 | 109 | eta[non_binary] = logitvals[non_binary] 110 | } 111 | } 112 | eta[is.na(x)] <- 0 113 | return(eta) 114 | } 115 | 116 | # @export 117 | exp_fam_mean <- function(theta, family) { 118 | if (family == "gaussian") { 119 | mean_mat = theta 120 | } else if (family == "binomial") { 121 | mean_mat = inv.logit.mat(theta) 122 | } else if (family == "poisson") { 123 | mean_mat = exp(theta) 124 | } else if (family == "multinomial") { 125 | exp_theta = exp(theta) 126 | mean_mat = sweep(exp_theta, 1, 1 + rowSums(exp_theta), "/") 127 | } 128 | return(mean_mat) 129 | } 130 | 131 | exp_fam_variance <- function(theta, family, weights = 1.0) { 132 | if (family == "gaussian") { 133 | var_mat = matrix(1, nrow(theta), ncol(theta)) * weights 134 | } else if (family %in% c("binomial", "multinomial")) { 135 | mean_mat = exp_fam_mean(theta, family) 136 | var_mat = mean_mat * (1 - mean_mat) * weights 137 | } else if (family == "poisson") { 138 | var_mat = exp(theta) * weights 139 | } 140 | return(var_mat) 141 | } 142 | 143 | # @export 144 | exp_fam_log_like <- function(x, theta, family, weights = 1.0) { 145 | if (family == "gaussian") { 146 | return(-0.5 * sum(weights * (x - theta)^2, na.rm = TRUE)) 147 | } else if (family == "binomial") { 148 | 
return(sum((weights * log(inv.logit.mat((2 * x - 1) * theta))), na.rm = TRUE)) 149 | # the below does not work with x == 1 and theta > 750 150 | # return(sum(weights * (x * theta - log(1 + exp(theta))), na.rm = TRUE)) 151 | } else if (family == "poisson") { 152 | return(sum(weights * (x * theta - exp(theta) - lfactorial(x)), na.rm = TRUE)) 153 | } else if (family == "multinomial") { 154 | if (length(weights) > 1 && any(apply(weights, 1, stats::var) > 0)) { 155 | stop("weights should be the same for every variable withing each row") 156 | } 157 | return(sum(weights * (x * theta) - weights / ncol(x) * 158 | outer(log(1 + rowSums(exp(theta))), rep(1, ncol(x))), na.rm = TRUE)) 159 | } 160 | } 161 | 162 | #' Calculate exponential family deviance 163 | #' 164 | #' Calculates the deviance of data assuming it comes from an exponential family 165 | #' 166 | #' @param x a vector or matrix of data 167 | #' @param theta natural parameters. Must be same dimensions as \code{x} 168 | #' @param family exponential family distribution of data 169 | #' @param weights weights of datapoints. 
Must be a scalar or the same dimensions as the data 170 | #' 171 | #' @export 172 | exp_fam_deviance <- function(x, theta, family, weights = 1.0) { 173 | eta_sat_nat = saturated_natural_parameters(x, family, M = Inf) 174 | sat_loglike = exp_fam_log_like(x, eta_sat_nat, family, weights) 175 | 176 | -2 * (exp_fam_log_like(x, theta, family, weights) - sat_loglike) 177 | } 178 | 179 | # for solving for m 180 | exp_fam_sat_ind_mat <- function(x, family) { 181 | if (family == "gaussian") { 182 | return(matrix(0, nrow(x), ncol(x))) 183 | } else if (family == "binomial") { 184 | # set 1's to 1, 0's to -1, and everything else to 0 185 | return((x == 1) - (x == 0)) 186 | } else if (family == "poisson") { 187 | return((x == 0) * -1.0) 188 | } else if (family == "multinomial") { 189 | # first see if last cat == 0 and x between 0 and 1 190 | last_cat_prob_zero = rowSums(x) > (1 - 1e-3) 191 | q = outer(last_cat_prob_zero, rep(TRUE, ncol(x))) & x > 0 & x < 1 192 | # then add in exact 0's and 1's 193 | q = q + (x == 1) - (x == 0) 194 | } 195 | } 196 | 197 | # for exponential family harmoniums 198 | exp_fam_sufficient_stat <- function(x, family) { 199 | if (family %in% c("gaussian", "binomial", "poisson")) { 200 | return(x) 201 | } 202 | } 203 | 204 | # for exponential family harmoniums 205 | #' @importFrom stats rbinom rpois rnorm 206 | exp_fam_sample <- function(theta, family) { 207 | mean_mat = exp_fam_mean(theta = theta, family = family) 208 | 209 | if (family == "gaussian") { 210 | return(matrix(rnorm(prod(dim(theta)), mean = c(mean_mat), sd = 1), nrow(theta), ncol(theta))) 211 | } else if (family == "binomial") { 212 | return(matrix(rbinom(prod(dim(theta)), size = 1, prob = c(mean_mat)), nrow(theta), ncol(theta))) 213 | } else if (family == "poisson") { 214 | return(matrix(rpois(prod(dim(theta)), lambda = c(mean_mat)), nrow(theta), ncol(theta))) 215 | } 216 | } 217 | -------------------------------------------------------------------------------- 
# -----------------------------------------------------------------------------
# R/exponential_family_harmonium.R
# -----------------------------------------------------------------------------

#' Exponential Family Harmoniums
#'
#' @description Welling et al. (2005)'s Exponential Family Harmoniums
#'
#' @param x matrix of either binary, count, or continuous data
#' @param k dimension (number of hidden units)
#' @param family exponential family distribution of data
#' @param family_hidden exponential family distribution of hidden units
#' @param cd_iters number of iterations for contrastive divergence (CD) at each iteration.
#'   It should be a vector of two integers, which is the range of CD iterations that the
#'   algorithm will perform from the beginning until the end, linearly interpolated
#' @param max_iters maximum number of iterations
#' @param learning_rate learning rate used for gradient descent
#' @param rms_prop logical; whether to use RMS prop for optimization. Default is \code{FALSE}.
#' @param quiet logical; whether the calculation should give feedback
#' @param random_start whether to randomly initialize \code{W}
#' @param start_W initial value for \code{W}
#' @param mu specific value for \code{mu}, the mean (bias) vector of \code{x}
#' @param main_effects logical; whether to include main effects (bias terms) in the model
#'
#' @return An S3 object of class \code{efh} which is a list with the
#' following components:
#' \item{mu}{the main effects (bias terms) for dimensionality reduction}
#' \item{hidden_bias}{the bias for the hidden units (currently hard coded to 0)}
#' \item{W}{the \code{d}x\code{k}-dimensional matrix with the loadings}
#' \item{family}{the exponential family of the data}
#' \item{family_hidden}{the exponential family of the hidden units}
#' \item{iters}{number of iterations required for convergence}
#' \item{loss_trace}{the trace of the average deviance of the algorithm.
#'   Should be non-increasing}
#' \item{prop_deviance_expl}{the proportion of deviance explained by this model.
#'   If \code{main_effects = TRUE}, the null model is just the main effects, otherwise
#'   the null model estimates 0 for all natural parameters.}
#'
#' @export
#'
#' @references
#' Welling, Max, Michal Rosen-Zvi, and Geoffrey E. Hinton. "Exponential family
#' harmoniums with an application to information retrieval." Advances in neural
#' information processing systems. 2005.
#'
#' @examples
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix and binary response
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat[1, 1] <- NA
#'
#' modp = exponential_family_harmonium(mat, k = 2, family = "poisson", quiet = FALSE,
#'                                     learning_rate = 0.001, rms_prop = FALSE, max_iters = 250)
#' gmf = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE)
#'
#'
exponential_family_harmonium <- function(x, k = 2,
                                         family = c("gaussian", "binomial", "poisson"),
                                         family_hidden = c("gaussian", "binomial", "poisson"),
                                         cd_iters = 1, learning_rate = 0.001, max_iters = 100, rms_prop = FALSE,
                                         quiet = TRUE, random_start = TRUE, start_W, mu, main_effects = TRUE) {
  family = match.arg(family)
  family_hidden = match.arg(family_hidden)
  check_family(x, family)

  # cd_iters can be a scalar, a (low, high) range linearly interpolated over
  # the iterations, or a full per-iteration schedule
  stopifnot(cd_iters > 0.5, length(cd_iters) %in% c(1, 2, max_iters))
  if (length(cd_iters) == 1) {
    cd_iters = rep(round(cd_iters), max_iters)
  } else if (length(cd_iters) == 2) {
    cd_iters = round(cd_iters)
    cd_iters = pmax(min(cd_iters),
                    pmin(max(cd_iters),
                         round(seq(cd_iters[1] - 0.5, cd_iters[2] + 0.5, length.out = max_iters))))
  } else {
    cd_iters = round(cd_iters)
  }

  x = as.matrix(x)
  missing_mat = is.na(x)
  sum_weights = sum(!missing_mat)

  # missing values are ignored, which is equivalent to setting them to 0
  x_imputed = x
  x_imputed[missing_mat] <- 0
  # scaled to take the avg with different number of missing values
  x_imputed_scaled = scale(x_imputed, FALSE, colSums(!missing_mat))

  n = nrow(x)
  d = ncol(x)
  ones = rep(1, n)

  # calculate the null log likelihood for % deviance explained and normalization
  if (main_effects) {
    weighted_col_means = colMeans(x, na.rm = TRUE)
    null_theta = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, 1), family, M = Inf))
  } else {
    null_theta = rep(0, d)
  }
  null_deviance = exp_fam_deviance(x, outer(ones, null_theta), family) / sum_weights

  # Initialize #
  ##############
  if (main_effects) {
    if (missing(mu)) {
      mu = saturated_natural_parameters(weighted_col_means, family, Inf)
    } else {
      mu = as.numeric(mu)
      stopifnot(length(mu) == d)
    }
  } else {
    mu = rep(0, d)
  }

  # hard code to 0 for now
  hidden_bias = rep(0, k)

  if (!missing(start_W)) {
    stopifnot(dim(start_W) == c(d, k))
    W = as.matrix(start_W)
  } else if (random_start) {
    # small random init; glorot_uniform was also considered:
    # W = matrix(runif(d * k, -sqrt(6 / (k + d)), sqrt(6 / (k + d))), d, k)
    W = matrix(rnorm(d * k, 0, 0.001), d, k)
  } else {
    udv = svd(scale(saturated_natural_parameters(x_imputed, family, 4), TRUE, FALSE))
    W = udv$v[, 1:k, drop = FALSE]
  }

  loss_trace = numeric(max_iters + 1)
  theta = outer(ones, mu) + exp_fam_mean(outer(ones, hidden_bias) + x_imputed %*% W, family_hidden) %*% t(W)
  loss_trace[1] = exp_fam_deviance(x, theta, family) / sum_weights

  ptm <- proc.time()

  if (!quiet) {
    cat(0, " ", loss_trace[1], "")
    cat("0 hours elapsed\n")
  }

  # squared-gradient accumulator for RMSProp; stays at 1 when rms_prop = FALSE
  # so the update reduces to plain gradient ascent
  # (the original if/else here had two identical branches)
  W_grad_sq = matrix(1, nrow(W), ncol(W))

  W_lag = W
  for (ii in seq_len(max_iters)) {
    # positive phase: sufficient statistics under the data
    hidden_mean_0 = exp_fam_mean(outer(ones, hidden_bias) + x_imputed %*% W, family_hidden)
    visible_hidden_0 = t(x_imputed_scaled) %*% hidden_mean_0

    # negative phase: sufficient statistics under the CD samples
    visible_cd = contrastive_divergence(x = x_imputed, W = W, mu = mu, hidden_bias = hidden_bias,
                                        family = family, family_hidden = family_hidden,
                                        num_iter = cd_iters[ii])
    hidden_mean_cd = exp_fam_mean(outer(ones, hidden_bias) + visible_cd %*% W, family_hidden)
    visible_hidden_cd = t(visible_cd) %*% hidden_mean_cd / n

    # the generated data can explode with gaussian hidden and poisson visible;
    # bail out and return the last stable estimate
    # (the gradient-deflation retry that used to follow this return was
    # unreachable and has been removed)
    if (any(is.na(visible_hidden_cd))) {
      if (!quiet) {
        cat("Contrastive Divergence resulted in NA's\n")
      }
      # BUG FIX: was cd_iters[1:(ii - 1)] and loss_trace[2:ii], both of which
      # misbehave when ii == 1 (1:0 and 2:1 count backwards)
      return(
        structure(
          list(
            mu = mu,
            W = W_lag,
            hidden_bias = hidden_bias,
            family = family,
            family_hidden = family_hidden,
            iters = ii - 1,
            cd_iters = cd_iters[seq_len(ii - 1)],
            loss_trace = loss_trace[seq_len(ii - 1) + 1],
            prop_deviance_expl = 1 - loss_trace[ii] / null_deviance,
            W_grad_sq = W_grad_sq
          ),
          class = "efh"
        )
      )
    }

    W_grad = visible_hidden_0 - visible_hidden_cd
    W = W + learning_rate * W_grad / sqrt(W_grad_sq)
    if (rms_prop) {
      if (ii == 1) {
        W_grad_sq = W_grad^2
      } else {
        W_grad_sq = 0.1 * W_grad^2 + 0.9 * W_grad_sq
      }
    }

    # Calc Deviance
    theta = outer(ones, mu) + exp_fam_mean(outer(ones, hidden_bias) + x_imputed %*% W, family_hidden) %*% t(W)
    loss_trace[ii + 1] <- exp_fam_deviance(x, theta, family) / sum_weights

    if (!quiet) {
      time_elapsed = as.numeric(proc.time() - ptm)[3]
      tot_time = max_iters / ii * time_elapsed
      time_remain = tot_time - time_elapsed
      cat(ii, " ", loss_trace[ii + 1], "")
      cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
    }

    # keep the previous estimate in case CD blows up on the next iteration
    # (the former `if (ii > max_iters) break` guard could never trigger inside
    # seq_len(max_iters) and has been removed)
    W_lag = W
  }

  object <- list(
    mu = mu,
    W = W,
    hidden_bias = hidden_bias,
    family = family,
    family_hidden = family_hidden,
    iters = ii,
    cd_iters = cd_iters[1:ii],
    loss_trace = loss_trace[2:(ii + 1)],
    prop_deviance_expl = 1 - loss_trace[ii + 1] / null_deviance,
    W_grad_sq = W_grad_sq
  )
  class(object) <- "efh"
  object
}

# One or more rounds of Gibbs sampling starting from the data: sample hidden
# units given the visibles, then visibles given the hiddens.
contrastive_divergence <- function(x, W, mu, hidden_bias, family, family_hidden, num_iter) {
  visible = x
  ones = rep(1, nrow(x))
  for (cd_ii in seq_len(num_iter)) {
    hidden = exp_fam_sample(outer(ones, hidden_bias) + visible %*% W, family_hidden)
    visible = exp_fam_sample(outer(ones, mu) + hidden %*% t(W), family)
  }
  return(visible)
}

# Gibbs-sample from a fitted efh model, starting the chain at x.
simulate_efh <- function(model, x, num_iter) {
  visible = x
  ones = rep(1, nrow(x))
  for (cd_ii in seq_len(num_iter)) {
    hidden = exp_fam_sample(outer(ones, model$hidden_bias) + visible %*% model$W, model$family_hidden)
    visible = exp_fam_sample(outer(ones, model$mu) + hidden %*% t(model$W), model$family)
  }
  return(visible)
}

#' @title Predict exponential family harmonium reconstruction on new data
#'
#' @description Predict exponential family harmonium reconstruction on new data
#'
#' @param object EFH object
#' @param newdata matrix of the same exponential family as covariates in \code{object}.
#'   Required; unlike other predict methods there is no default
#' @param type the type of fitting required.
#'   \code{type = "hidden"} gives matrix of hidden mean parameters of \code{x},
#'   \code{type = "link"} gives a matrix on the natural parameter scale, and
#'   \code{type = "response"} gives a matrix on the response scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrices in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#' mat_np_new = outer(rnorm(rows), loadings)
#'
#' # generate a count matrices
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols)
#'
#' modp = exponential_family_harmonium(mat, k = 9, family = "poisson", max_iters = 1000)
#'
#' pred = predict(modp, mat_new, type = "response")
#'
#' @export
predict.efh <- function(object, newdata, type = c("hidden", "link", "response"), ...) {
  type = match.arg(type)

  if (missing(newdata)) {
    stop("Need to supply data, even if it is the same used to fit the model!")
  } else {
    check_family(newdata, object$family)

    # missing values are set to 0, consistent with how the model was fit
    newdata[is.na(newdata)] <- 0
    hidden = exp_fam_mean(outer(rep(1, nrow(newdata)), object$hidden_bias) +
                            newdata %*% object$W, object$family_hidden)
  }

  if (type == "hidden") {
    hidden
  } else {
    theta = outer(rep(1, nrow(newdata)), object$mu) + hidden %*% t(object$W)
    if (type == "link") {
      theta
    } else if (type == "response") {
      exp_fam_mean(theta, object$family)
    }
  }
}

#' @title Plot exponential family harmonium
#'
#' @description
#' Plots the results of a EFH
#'
#' @param x EFH object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#'   iteration
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run logistic SVD on it
#' efh = exponential_family_harmonium(mat, k = 2, family = "binomial", family_hidden = "binomial")
#'
#' \dontrun{
#' plot(efh)
#' }
#' @export
plot.efh <- function(x, type = c("trace"), ...) {
  type = match.arg(type)

  if (type == "trace") {
    df = data.frame(Iteration = 1:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  }

  return(p)
}

#' @export
print.efh <- function(x, ...) {
  # an efh has no score matrix, so only the number of columns is reported
  cat(nrow(x$W), "columns\n")
  cat("Rank", ncol(x$W), "solution\n")
  cat("\n")
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}

# -----------------------------------------------------------------------------
# R/generalizedMF.R
# -----------------------------------------------------------------------------

#' Exponential Family Matrix Factorization
#'
#' @description Collins et al. (2001)'s Exponential Family PCA
#'
#' @param x matrix of either binary, proportions, count, or continuous data
#' @param k dimension
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with data weights
#' @param quiet logical; whether the calculation should give feedback
#' @param max_iters maximum number of iterations
#' @param conv_criteria convergence criteria
#' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package
#'   to more quickly calculate the SVD. When the number of columns is small,
#'   the approximation may be less accurate and slower
#' @param random_start whether to randomly initialize \code{A} and \code{B}
#' @param start_A initial value for \code{A}
#' @param start_B initial value for \code{B}
#' @param mu specific value for \code{mu}, the mean vector of \code{x}
#' @param main_effects logical; whether to include main effects in the model
#' @param method which algorithm to use. \code{"als"} uses alternating least squares.
#'   It has the benefit of majorizing row-wise and column-wise for each of the updates.
#'   \code{"svd"} uses singular value decomposition (similar to de Leeuw, 2006). It uses
#'   a more general majorization, which may not work well for heterogeneous matrices.
#'
#' @return An S3 object of class \code{gmf} which is a list with the
#' following components:
#' \item{mu}{the main effects for dimensionality reduction}
#' \item{A}{the \code{n}x\code{k}-dimensional matrix with the scores}
#' \item{B}{the \code{d}x\code{k}-dimensional matrix with the loadings}
#' \item{family}{the exponential family of the data}
#' \item{iters}{number of iterations required for convergence}
#' \item{loss_trace}{the trace of the average deviance of the algorithm.
#'   Should be non-increasing}
#' \item{prop_deviance_expl}{the proportion of deviance explained by this model.
#'   If \code{main_effects = TRUE}, the null model is just the main effects, otherwise
#'   the null model estimates 0 for all natural parameters.}
#' @export
#' @importFrom RSpectra svds
#'
#' @references
#' de Leeuw, Jan, 2006. Principal component analysis of binary data
#' by iterated singular value decomposition. Computational Statistics & Data Analysis
#' 50 (1), 21--39.
#'
#' Collins, M., Dasgupta, S., & Schapire, R. E., 2001. A generalization of principal
#' components analysis to the exponential family. In NIPS, 617--624.
#'
#' @examples
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix and binary response
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' mod = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE)
#'
generalizedMF <- function(x, k = 2, family = c("gaussian", "binomial", "poisson"),
                          weights, quiet = TRUE, max_iters = 1000, conv_criteria = 1e-5,
                          partial_decomp = FALSE,
                          random_start = FALSE, start_A, start_B, mu, main_effects = TRUE,
                          method = c("als", "svd")) {
  family = match.arg(family)
  check_family(x, family)
  method = match.arg(method)

  if (method == "als") {
    # used in ALS to make sure the estimates do not diverge;
    # a penalization like this was recommended in Rish et al. (2008), ICML
    L2_eps = 1e-5
  }

  x = as.matrix(x)
  missing_mat = is.na(x)
  n = nrow(x)
  d = ncol(x)
  ones = rep(1, n)

  if (missing(weights)) {
    weights = 1.0
    sum_weights = sum(!is.na(x))
  } else {
    # weights at missing entries are zeroed before validation so that NAs
    # lining up with missing data are tolerated
    weights[is.na(x)] <- 0
    if (any(is.na(weights))) {
      stop("Can't have NA in weights")
    }
    if (any(weights < 0)) {
      stop("weights must be non-negative")
    }
    if (!all(dim(weights) == dim(x))) {
      stop("x and weights are not the same dimension")
    }
    sum_weights = sum(weights)
  }

  # calculate the null log likelihood for % deviance explained and normalization
  if (main_effects) {
    if (length(weights) == 1) {
      weighted_col_means = colMeans(x, na.rm = TRUE)
    } else {
      weighted_col_means = colSums(x * weights, na.rm = TRUE) / colSums(weights)
    }
    null_theta = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, 1), family, M = Inf))
  } else {
    null_theta = rep(0, d)
  }
  null_deviance = exp_fam_deviance(x, outer(ones, null_theta), family, weights) / sum_weights

  # Initialize #
  ##############
  if (main_effects) {
    if (missing(mu)) {
      mu = saturated_natural_parameters(weighted_col_means, family, Inf)
    } else {
      mu = as.numeric(mu)
      stopifnot(length(mu) == d)
    }
  } else {
    mu = rep(0, d)
  }

  if (!missing(start_B)) {
    stopifnot(dim(start_B) == c(d, k))
    B = as.matrix(start_B)
  } else if (random_start) {
    B = matrix(rnorm(d * k), d, k)
  } else {
    if (partial_decomp) {
      udv = RSpectra::svds(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE), min(k + 1, d))
    } else {
      udv = svd(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE))
    }

    B = udv$v[, 1:k, drop = FALSE]
  }

  if (!missing(start_A)) {
    stopifnot(dim(start_A) == c(n, k))
    A = as.matrix(start_A)
  } else if (random_start) {
    A = matrix(rnorm(n * k), n, k)
  } else {
    # udv was only computed above when start_B was missing; redo it here
    if (!missing(start_B)) {
      if (partial_decomp) {
        udv = RSpectra::svds(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE), min(k + 1, d))
      } else {
        udv = svd(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE))
      }
    }
    A = udv$u[, 1:k, drop = FALSE] %*% diag(udv$d, k, k)
  }

  loss_trace = numeric(max_iters)
  theta = outer(ones, mu) + tcrossprod(A, B)
  # not really the first iteration; overwritten at ii = 1
  loss_trace[1] = exp_fam_deviance(x, theta, family, weights) / sum_weights

  ptm <- proc.time()

  if (!quiet) {
    cat(0, " ", loss_trace[1], "")
    cat("0 hours elapsed\n")
  }

  for (ii in seq_len(max_iters)) {
    if (method == "als") {
      # update A, majorizing the deviance column-wise
      theta = outer(ones, mu) + tcrossprod(A, B)
      first_dir = exp_fam_mean(theta, family)
      second_dir = exp_fam_variance(theta, family, weights)

      W = apply(second_dir, 2, max)
      Z = as.matrix(theta + weights * (x - first_dir) / outer(ones, W))
      Z[is.na(x)] <- theta[is.na(x)]

      A = t(solve(t(B) %*% diag(W) %*% B + diag(L2_eps, k, k),
                  t(B) %*% diag(W) %*% t(scale(Z, center = mu, scale = FALSE))))

      # update B, majorizing the deviance row-wise
      theta = outer(ones, mu) + tcrossprod(A, B)
      first_dir = exp_fam_mean(theta, family)
      second_dir = exp_fam_variance(theta, family, weights)

      W = apply(second_dir, 1, max)
      Z = as.matrix(theta + weights * (x - first_dir) / outer(W, rep(1, d)))
      Z[is.na(x)] <- theta[is.na(x)]

      B = t(solve(t(A) %*% diag(W, n, n) %*% A + diag(L2_eps, k, k),
                  t(A) %*% diag(W, n, n) %*% scale(Z, center = mu, scale = FALSE)))
    } else if (method == "svd") {
      # single global majorization constant, then one SVD step
      theta = outer(ones, mu) + tcrossprod(A, B)
      first_dir = exp_fam_mean(theta, family)
      second_dir = exp_fam_variance(theta, family, weights)

      W = max(second_dir)
      Z = as.matrix(theta + weights * (x - first_dir) / W)
      Z[is.na(x)] <- theta[is.na(x)]

      if (partial_decomp) {
        udv = RSpectra::svds(scale(Z, center = mu, scale = FALSE), min(k + 1, d))
      } else {
        udv = svd(scale(Z, center = mu, scale = FALSE))
      }

      A = udv$u[, 1:k, drop = FALSE] %*% diag(udv$d, k, k)
      B = udv$v[, 1:k, drop = FALSE]
    }

    # Calc Deviance
    theta = outer(ones, mu) + tcrossprod(A, B)
    loss_trace[ii] <- exp_fam_deviance(x, theta, family, weights) / sum_weights

    if (!quiet) {
      time_elapsed = as.numeric(proc.time() - ptm)[3]
      tot_time = max_iters / ii * time_elapsed
      time_remain = tot_time - time_elapsed
      cat(ii, " ", loss_trace[ii], "")
      cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
    }

    # (a dead `B_lag = B` assignment in the else branch was removed;
    # B_lag was never read anywhere in this function)
    if (ii > 1 && abs(loss_trace[ii] - loss_trace[ii - 1]) < 2 * conv_criteria) {
      break
    }
  }

  object <- list(
    mu = mu,
    A = A,
    B = B,
    family = family,
    iters = ii,
    loss_trace = loss_trace[1:ii],
    prop_deviance_expl = 1 - loss_trace[ii] / null_deviance
  )
  class(object) <- "gmf"
  object
}

#' @title Predict generalized PCA scores or reconstruction on new data
#'
#' @description Predict generalized PCA scores or reconstruction on new data
#'
#' @param object generalized MF object
#' @param newdata matrix of the same exponential family as covariates in \code{object}.
#'   If missing, will use the data that \code{object} was fit on
#' @param type the type of fitting required.
#'   \code{type = "PCs"} gives matrix of principal components of \code{x},
#'   \code{type = "link"} gives a matrix on the natural parameter scale, and
#'   \code{type = "response"} gives a matrix on the response scale
#' @param quiet logical; whether the calculation should show progress
#' @param max_iters maximum number of iterations
#' @param conv_criteria convergence criteria
#' @param start_A initial value for \code{A}
#' @param ...
#' Additional arguments
#' @examples
#' # construct a low rank matrices in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#' mat_np_new = outer(rnorm(rows), loadings)
#'
#' # generate a count matrices
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols)
#'
#' # run Poisson PCA on it
#' gmf = generalizedMF(mat, k = 1, family = "poisson")
#'
#' A = predict(gmf, mat_new)
#'
#' @export
predict.gmf <- function(object, newdata, type = c("PCs", "link", "response"), quiet = TRUE,
                        max_iters = 1000, conv_criteria = 1e-5, start_A, ...) {
  type = match.arg(type)

  if (missing(newdata)) {
    # no new data: reuse the scores estimated during fitting
    A = object$A
  } else {
    n = nrow(newdata)
    d = ncol(newdata)
    k = ncol(object$B)
    stopifnot(d == nrow(object$B))
    check_family(newdata, object$family)
    # small ridge penalty keeps the normal equations well conditioned
    L2_eps = 1e-5

    ones = rep(1, n)

    # initialize natural parameters at the main effects (plus start_A if given)
    if (missing(start_A)) {
      theta = outer(ones, object$mu)
    } else {
      stopifnot(all(dim(start_A) == c(n, k)))
      theta = outer(ones, object$mu) + tcrossprod(start_A, object$B)
    }

    last_deviance = exp_fam_deviance(newdata, theta, object$family) / sum(!is.na(newdata))
    if (!quiet) {
      cat(0, " ", last_deviance, "\n")
    }

    # majorization-minimization: solve a ridge-penalized weighted least squares
    # problem for the scores A at each iteration, holding mu and B fixed
    for (ii in seq_len(max_iters)) {
      first_dir = exp_fam_mean(theta, object$family)
      second_dir = exp_fam_variance(theta, object$family)

      # majorize the deviance with a quadratic using the largest variance
      W = rep(max(second_dir), d)
      # working response; missing entries are imputed with the current theta
      Z = as.matrix(theta + (newdata - first_dir) / outer(ones, W))
      Z[is.na(newdata)] <- theta[is.na(newdata)]

      A = t(solve(t(object$B) %*% diag(W, d, d) %*% object$B + diag(L2_eps, k, k),
                  t(object$B) %*% diag(W, d, d) %*% t(scale(Z, object$mu, FALSE))))

      theta = outer(ones, object$mu) + tcrossprod(A, object$B)
      this_deviance = exp_fam_deviance(newdata, theta, object$family) / sum(!is.na(newdata))

      if (!quiet) {
        cat(ii, " ", this_deviance, "\n")
      }

      if (abs(last_deviance - this_deviance) < 2 * conv_criteria) {
        break
      }
      last_deviance = this_deviance
    }
  }

  if (type == "PCs") {
    return(A)
  } else {
    theta = outer(rep(1, nrow(A)), object$mu) + tcrossprod(A, object$B)

    if (type == "link") {
      return(theta)
    } else if (type == "response") {
      return(exp_fam_mean(theta, object$family))
    }
  }
}

#' @title Plot generalized MF
#'
#' @description
#' Plots the results of a generalized MF
#'
#' @param x generalized MF object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#' iteration, \code{type = "loadings"} plots the first 2 principal component
#' loadings, \code{type = "scores"} plots the first 2 principal component scores
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run logistic SVD on it
#' gmf = generalizedMF(mat, k = 2, family = "binomial")
#'
#' \dontrun{
#' plot(gmf)
#' }
#' @export
plot.gmf <- function(x, type = c("trace", "loadings", "scores"), ...) {
  type = match.arg(type)

  if (type == "trace") {
    df = data.frame(Iteration = 1:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  } else {
    # loadings (columns of B) and scores (rows of A) share the same layout
    df = data.frame(if (type == "loadings") x$B else x$A)
    colnames(df) <- paste0("PC", 1:ncol(df))
    if (ncol(df) == 1) {
      # pad a rank-1 solution with a zero second coordinate so it can be plotted
      df$PC2 = 0
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point() +
        ggplot2::labs(y = NULL)
    } else {
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point()
    }
  }

  return(p)
}

#' @export
print.gmf <- function(x, ...) {
  # brief text summary of a fitted gmf object; returns x invisibly so it pipes
  cat(nrow(x$A), "rows and ")
  cat(nrow(x$B), "columns\n")
  cat("Rank", ncol(x$B), "solution\n")
  cat("\n")
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}


#' @title CV for generalized MF
#'
#' @description
#' Run cross validation on dimension for generalized MF
#'
#' @param x matrix of either binary, count, or continuous data
#' @param ks the different dimensions \code{k} to try
#' @param family exponential family distribution of data
#' @param folds if \code{folds} is a scalar, then it is the number of folds.
#' If it is a vector, it should be the same length as the number of rows in \code{x}
#' @param quiet logical; whether the function should display progress
#' @param ... Additional arguments passed to generalizedMF
#'
#' @return A matrix of the CV deviance with \code{k} in rows
#'
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' \dontrun{
#' deviances = cv.gmf(mat, ks = 1:9, family = "binomial")
#' plot(deviances)
#' }
#' @export
cv.gmf <- function(x, ks, family = c("gaussian", "binomial", "poisson", "multinomial"),
                   folds = 5, quiet = TRUE, ...) {
  family = match.arg(family)
  check_family(x, family)

  if (length(folds) > 1) {
    # does this work if factor?
    if (length(unique(folds)) <= 1) {
      stop("If inputing CV split, must be more than one level")
    }
    if (length(folds) != nrow(x)) {
      stop("if folds is a vector, it should be of same length as nrow(x)")
    }
    cv = folds
  } else {
    # BUG FIX: was nrow(q); q is never defined in this function
    # (copy-paste from cv.cgpca, which builds a matrix q first)
    cv = sample(1:folds, nrow(x), replace = TRUE)
  }

  # accumulated held-out deviance for each k (smaller is better)
  cv_dev = matrix(0, length(ks), 1,
                  dimnames = list(k = ks, M = "GMF"))
  for (k in ks) {
    if (!quiet) {
      cat("k =", k, "\n")
    }
    for (c in unique(cv)) {
      gmf = generalizedMF(x[c != cv, ], k = k, family = family, ...)
      # fixed: was newdat =, which relied on partial argument matching
      pred_theta = predict(gmf, newdata = x[c == cv, ], type = "link")
      cv_dev[k == ks] = cv_dev[k == ks] +
        exp_fam_deviance(x[c == cv, ], theta = pred_theta, family = family)
    }
  }
  class(cv_dev) <- c("matrix", "cv.gpca")
  # BUG FIX: deviance is accumulated, so the best k minimizes it
  # (was which.max, which reported the worst k)
  which_best = which.min(cv_dev)
  if (!quiet) {
    cat("Best: k =", ks[which_best], "\n")
  }

  # negated so that, like cv.cgpca's log likelihood, larger is better
  return(-cv_dev)
}


#' @title Convex Generalized Principal Component Analysis
#'
#' @description
#' Dimensionality reduction for exponential family data by extending Pearson's
#' PCA formulation to minimize deviance. The convex relaxation
#' to projection matrices, the Fantope, is used.
#'
#' @param x matrix of either binary, proportions, count, or continuous data
#' @param k number of principal components to return
#' @param M value to approximate the saturated model
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with non-negative weights
#' @param quiet logical; whether the calculation should give feedback
#' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package
#' to more quickly calculate the eigen-decomposition. When the number of columns is small,
#' the approximation may be less accurate and slower
#' @param max_iters number of maximum iterations
#' @param conv_criteria convergence criteria. The difference between average deviance
#' in successive iterations
#' @param random_start logical; whether to randomly inititalize the parameters.
If \code{FALSE}, 21 | #' function will use an eigen-decomposition as starting value 22 | #' @param start_H starting value for the Fantope matrix 23 | #' @param mu main effects vector. Only used if \code{main_effects = TRUE} 24 | #' @param main_effects logical; whether to include main effects in the model 25 | #' @param normalize logical; whether to weight the variables to they all have equal influence 26 | #' @param ss_factor step size multiplier. Amount by which to multiply the step size. Quadratic 27 | #' convergence rate can be proven for \code{ss_factor = 1}, but I have found higher values 28 | #' sometimes work better. The default is \code{ss_factor = 4}. 29 | #' If it is not converging, try \code{ss_factor = 1}. 30 | #' 31 | #' @return An S3 object of class \code{cgpca} which is a list with the 32 | #' following components: 33 | #' \item{mu}{the main effects} 34 | #' \item{H}{a rank \code{k} Fantope matrix} 35 | #' \item{U}{a \code{ceiling(k)}-dimentional orthonormal matrix with the loadings} 36 | #' \item{PCs}{the princial component scores} 37 | #' \item{M}{the parameter inputed} 38 | #' \item{iters}{number of iterations required for convergence} 39 | #' \item{loss_trace}{the trace of the average deviance using the Fantope matrix} 40 | #' \item{proj_loss_trace}{the trace of the average deviance using the projection matrix} 41 | #' \item{prop_deviance_expl}{the proportion of deviance explained by this model. 
#' If \code{main_effects = TRUE}, the null model is just the main effects, otherwise
#' the null model estimates 0 for all natural parameters.}
#'
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run convex generalized PCA on it
#' cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial")
#' @export
#' @importFrom stats var
convexGeneralizedPCA <- function(x, k = 2, M = 4, family = c("gaussian", "binomial", "poisson", "multinomial"),
                                 weights, quiet = TRUE, partial_decomp = FALSE, max_iters = 1000,
                                 conv_criteria = 1e-6, random_start = FALSE, start_H, mu,
                                 main_effects = TRUE, normalize = FALSE, ss_factor = 1) {

  family = match.arg(family)
  check_family(x, family)

  x = as.matrix(x)
  missing_mat = is.na(x)
  n = nrow(x)
  d = ncol(x)
  ones = rep(1, n)

  # set up (optional) non-negative observation weights;
  # missing cells get weight 0 so they do not contribute to the loss
  if (missing(weights) || length(weights) == 1) {
    weights = 1.0
    sum_weights = sum(!is.na(x))
  } else {
    if (!all(dim(weights) == dim(x))) {
      stop("x and weights are not the same dimension")
    }
    weights[is.na(x)] <- 0
    if (any(is.na(weights))) {
      stop("Can't have NA in weights")
    }
    if (any(weights < 0)) {
      stop("weights must be non-negative")
    }
    sum_weights = sum(weights)
  }

  # main effects mu: weighted column means mapped to the natural parameter scale
  if (main_effects) {
    if (missing(mu)) {
      if (length(weights) == 1) {
        weighted_col_means = colMeans(x, na.rm = TRUE)
      } else {
        weighted_col_means = colSums(x * weights, na.rm = TRUE) / colSums(weights)
      }
      if (any(apply(x, 2, stats::var, na.rm = TRUE) == 0)) {
        stop("At least one variable has variance of 0")
      }
      if (family == "multinomial") {
        mu = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, nrow = 1), family, M))
      } else {
        mu = as.numeric(saturated_natural_parameters(weighted_col_means, family, M))
      }
    }
  } else {
    mu = rep(0, d)
  }

  # approximate saturated natural parameters; missing cells are filled with mu
  eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu)
  eta_centered = scale(eta, center = mu, scale = FALSE)
  mu_mat = outer(rep(1, n), mu)

  # initialize the Fantope matrix H (and its leading eigenvectors U)
  if (!missing(start_H)) {
    HU = project.Fantope(start_H, k, partial_decomp = partial_decomp)
    H = HU$H
  } else if (random_start) {
    U = matrix(rnorm(d * d), d, d)
    U = qr.Q(qr(U))
    HU = project.Fantope(U %*% t(U), k, partial_decomp = partial_decomp)
    H = HU$H
  } else {
    if (partial_decomp) {
      udv = RSpectra::svds(scale(eta, center = mu, scale = normalize), k, nu = k, nv = k)
    } else {
      udv = svd(scale(eta, center = mu, scale = normalize), nu = k, nv = k)
    }
    HU = project.Fantope(udv$v[, 1:k] %*% t(udv$v[, 1:k]), k, partial_decomp = partial_decomp)
    H = HU$H
  }

  eta_sat_nat = saturated_natural_parameters(x, family, M = Inf)
  sat_loglike = exp_fam_log_like(x, eta_sat_nat, family, weights)

  # when x is missing, eta = mu, so eta_centered is 0
  eta_centered[missing_mat] <- 0

  theta = mu_mat + eta_centered %*% H

  # initial step size: inverse of a Lipschitz-type bound on the gradient,
  # which depends on the family's variance function
  if (family %in% c("binomial", "multinomial")) {
    # TODO: I think this works with proportions, but need to check
    init_ss = 2 / (sum(eta_centered^2) * max(weights))
  } else if (family == "gaussian") {
    init_ss = 0.5 / (sum(eta_centered^2) * max(weights))
  } else {
    second_dir = exp_fam_variance(theta, family, weights)
    init_ss = mean((t(second_dir) %*% (eta_centered^2))^(-1)) / 2
  }
  init_ss = init_ss * ss_factor

  # only sum over non-missing x; equivalent to replacing missing x with 0
  x_zeros = x
  x_zeros[missing_mat] <- 0

  etatX = t(eta_centered) %*% (weights * x_zeros)

  loglike = exp_fam_log_like(x, theta, family, weights)
  min_loss = 2 * (sat_loglike - loglike) / sum_weights
  best_HU = HU
  best_loglike = loglike
  if (!quiet) {
    cat(0, " ", min_loss, "\n")
  }

  loss_trace <- proj_loss_trace <- numeric(max_iters + 1)
  loss_trace[1] <- proj_loss_trace[1] <- min_loss

  # projected gradient ascent on the Fantope
  for (m in 1:max_iters) {
    # decaying step size for poisson (unbounded variance), constant otherwise
    if (family == "poisson") {
      step = init_ss / m
    } else {
      step = init_ss
    }

    first_dir = exp_fam_mean(theta, family)

    first_dir[missing_mat] <- 0
    etat_dir = t(eta_centered) %*% (first_dir * weights)
    deriv = etatX - etat_dir
    # symmetrize the gradient (keep the diagonal counted once)
    deriv = deriv + t(deriv) - diag(diag(deriv))

    H = H + step * deriv
    HU = project.Fantope(H, k, partial_decomp = partial_decomp)
    H = HU$H

    theta = mu_mat + eta_centered %*% H
    loglike = exp_fam_log_like(x, theta, family, weights)
    loss_trace[m + 1] = 2 * (sat_loglike - loglike) / sum_weights

    # also track the loss of the rank-k projection implied by H's eigenvectors
    proj_theta = mu_mat + eta_centered %*% tcrossprod(HU$U)
    proj_loglike = exp_fam_log_like(x, proj_theta, family, weights)
    proj_loss_trace[m + 1] = 2 * (sat_loglike - proj_loglike) / sum_weights

    if (!quiet) {
      cat(m, " ", loss_trace[m + 1], " ", proj_loss_trace[m + 1], "\n")
    }
    # the Fantope iterates are not monotone, so keep the best solution seen
    if (loss_trace[m + 1] < min_loss) {
      min_loss = loss_trace[m + 1]
      best_HU = HU
      best_loglike = loglike
    }
    # scalar condition: use || (was the vectorized |)
    if (abs(loss_trace[m + 1] - loss_trace[m]) < 2 * conv_criteria || min_loss == 0) {
      break
    }
  }

  # null log likelihood (main effects only) for % deviance explained
  # assumes no missing data
  null_loglike = exp_fam_log_like(x, mu_mat, family, weights)

  object = list(mu = mu,
                H = best_HU$H,
                U = best_HU$U,
                PCs = eta_centered %*% best_HU$U,
                M = M,
                family = family,
                iters = m,
                loss_trace = loss_trace[1:(m + 1)],
                proj_loss_trace = proj_loss_trace[1:(m + 1)],
                prop_deviance_expl = 1 - (best_loglike - sat_loglike) / (null_loglike - sat_loglike)
  )
  class(object) <- "cgpca"
  return(object)
}

#' @title Project onto the Fantope
#'
#' @description
#' Project a symmetric matrix onto the convex set of the rank k Fantope
#'
#' @param x a symmetric matrix
#' @param k the rank of the Fantope desired
#' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package
#' to more quickly calculate the eigen-decomposition.
#' When the number of columns is small,
#' the approximation may be less accurate and slower

#' @return
#' \item{H}{a rank \code{k} Fantope matrix}
#' \item{U}{a \code{k}-dimentional orthonormal matrix with the first \code{k} eigenvectors of \code{H}}
#' @export
project.Fantope <- function(x, k, partial_decomp = FALSE) {
  # eigen-decomposition of the symmetric input
  if (partial_decomp) {
    eig = RSpectra::eigs_sym(x, k = min(k + 2, ncol(x)))
  } else {
    eig = eigen(x, symmetric = TRUE)
  }
  vals = eig$values
  # bisect on the soft threshold theta so the eigenvalues, clipped to [0, 1],
  # sum to k (the Fantope constraint: 0 <= H <= I, trace(H) = k)
  lower = vals[length(vals)] - k / length(vals)
  upper = max(vals)
  while (TRUE) {
    theta = (lower + upper) / 2
    sum.eig.vals = sum(pmin(pmax(vals - theta, 0), 1))
    if (abs(sum.eig.vals - k) < 1e-10) {
      break
    } else if (sum.eig.vals > k) {
      lower = theta
    } else {
      upper = theta
    }
  }
  vals = pmin(pmax(vals - theta, 0), 1)
  # ROBUSTNESS FIX: diag(vals) with a scalar builds an identity matrix of that
  # size; give explicit dimensions so a single eigenvalue is handled correctly
  return(list(H = eig$vectors %*% diag(vals, length(vals), length(vals)) %*% t(eig$vectors),
              U = matrix(eig$vectors[, 1:ceiling(k)], nrow(x), ceiling(k))))
}

#' @title Predict Convex Generalized PCA scores or reconstruction on new data
#'
#' @description Predict Convex Generalized PCA scores or reconstruction on new data
#'
#' @param object convex generalized PCA object
#' @param newdata matrix with all binary entries. If missing, will use the
#' data that \code{object} was fit on (only possible for \code{type = "PCs"})
#' @param type the type of fitting required. \code{type = "PCs"} gives the PC scores,
#' \code{type = "link"} gives matrix on the logit scale and \code{type = "response"}
#' gives matrix on the probability scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrices in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_logit = outer(rnorm(rows), loadings)
#' mat_logit_new = outer(rnorm(rows), loadings)
#'
#' # convert to a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#' mat_new = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit_new)) * 1.0
#'
#' # run generalized PCA on it
#' cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial")
#'
#' PCs = predict(cgpca, mat_new)
#' @export
predict.cgpca <- function(object, newdata, type = c("PCs", "link", "response"), ...) {
  type = match.arg(type)

  if (!missing(newdata)) {
    # map binary data to the natural parameter scale (+/- M), center by mu,
    # and zero out missing entries so they do not contribute to the scores
    eta = ((as.matrix(newdata) * 2) - 1) * object$M
    eta_centered = scale(eta, center = object$mu, scale = FALSE)
    eta_centered[is.na(newdata)] <- 0
  } else if (type != "PCs") {
    # the fitted object stores only the scores, not the training data, so a
    # reconstruction needs newdata (was an obscure missing-argument error)
    stop("newdata must be supplied when type is \"link\" or \"response\"")
  }

  if (type == "PCs") {
    if (missing(newdata)) {
      return(object$PCs)
    }
    return(eta_centered %*% object$U)
  }

  theta = outer(rep(1, nrow(eta_centered)), object$mu) + eta_centered %*% object$H
  if (type == "link") {
    return(theta)
  }
  return(inv.logit.mat(theta))
}

#' @title Plot convex generalized PCA
#'
#' @description
#' Plots the results of a convex generalized PCA
#'
#' @param x convex generalized PCA object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#' iteration, \code{type = "loadings"} plots the first 2 PC loadings,
#' \code{type = "scores"} plots the first 2 PC scores
#' @param ...
#' Additional arguments
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run convex generalized PCA on it
#' cgpca = convexGeneralizedPCA(mat, k = 1, M = 4, family = "binomial")
#'
#' \dontrun{
#' plot(cgpca)
#' }
#' @export
plot.cgpca <- function(x, type = c("trace", "loadings", "scores"), ...) {
  type = match.arg(type)

  if (type == "trace") {
    df = data.frame(Iteration = 0:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  } else {
    # loadings (columns of U) and scores (PCs) share the same layout
    df = data.frame(if (type == "loadings") x$U else x$PCs)
    colnames(df) <- paste0("PC", 1:ncol(df))
    if (ncol(df) == 1) {
      # pad a rank-1 solution with a zero second coordinate so it can be plotted
      df$PC2 = 0
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point() +
        ggplot2::labs(y = NULL)
    } else {
      p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point()
    }
  }

  return(p)
}

#' @export
print.cgpca <- function(x, ...) {
  # brief text summary of a fitted cgpca object; returns x invisibly so it pipes
  cat(nrow(x$PCs), "rows and ")
  cat(nrow(x$H), "columns\n")
  cat("Rank", ncol(x$U), "Fantope solution with M =", x$M, "\n")
  cat("\n")
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}

#' @title CV for convex generalized PCA
#'
#' @description
#' Run cross validation on dimension and \code{M} for convex generalized PCA
#'
#' @param x matrix with all binary entries
#' @param ks the different dimensions \code{k} to try
#' @param Ms the different approximations to the saturated model \code{M} to try
#' @param folds if \code{folds} is a scalar, then it is the number of folds. If
#' it is a vector, it should be the same length as the number of rows in \code{x}
#' @param quiet logical; whether the function should display progress
#' @param ... Additional arguments passed to convexGeneralizedPCA
#'
#' @return A matrix of the CV log likelihood with \code{k} in rows and
#' \code{M} in columns
#'
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' \dontrun{
#' loglikes = cv.cgpca(mat, ks = 1:9, Ms = 3:6)
#' plot(loglikes)
#' }
#' @export
cv.cgpca <- function(x, ks, Ms = seq(2, 10, by = 2), folds = 5, quiet = TRUE, ...) {
  # TODO: does not support weights
  # code x as -1/+1 with 0 marking missing entries
  q = 2 * as.matrix(x) - 1
  q[is.na(q)] <- 0

  if (length(folds) > 1) {
    # does this work if factor?
    if (length(unique(folds)) <= 1) {
      stop("If inputing CV split, must be more than one level")
    }
    if (length(folds) != nrow(x)) {
      stop("if folds is a vector, it should be of same length as nrow(x)")
    }
    cv = folds
  } else {
    cv = sample(1:folds, nrow(q), replace = TRUE)
  }

  # accumulated held-out Bernoulli log likelihood for each (k, M) pair
  log_likes = matrix(0, length(ks), length(Ms),
                     dimnames = list(k = ks, M = Ms))
  for (k in ks) {
    for (M in Ms) {
      if (!quiet) {
        cat("k =", k, "M =", M, "")
      }
      for (c in unique(cv)) {
        if (!quiet) {
          cat(".")
        }
        clpca = convexGeneralizedPCA(x[c != cv, ], k = k, M = M, ...)
        # fixed: was newdat =, which relied on partial argument matching
        pred_theta = predict(clpca, newdata = x[c == cv, ], type = "link")
        log_likes[k == ks, M == Ms] = log_likes[k == ks, M == Ms] +
          log_like_Bernoulli(q = q[c == cv, ], theta = pred_theta)
      }
      if (!quiet) {
        cat("", log_likes[k == ks, M == Ms], "\n")
      }
    }
  }
  class(log_likes) <- c("matrix", "cv.gpca")
  # log likelihood: larger is better (renamed from the misleading which_min)
  which_best = which(log_likes == max(log_likes), arr.ind = TRUE)
  if (!quiet) {
    cat("Best: k =", ks[which_best[1]], "M =", Ms[which_best[2]], "\n")
  }

  return(log_likes)
}

#' @title Generalized Principal Component Analysis
#'
#' @description
#' Dimension reduction for exponential family data by extending Pearson's
#' PCA formulation
#'
#' @param x matrix of either binary, proportions, count, or continuous data
#' @param k number of principal components to return
#' @param M value to approximate the saturated model
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with data weights
#' @param quiet logical; whether the
calculation should give feedback 13 | #' @param majorizer how to majorize the deviance. \code{"row"} gives 14 | #' tighter majorization, but may take longer to calculate each iteration. 15 | #' \code{"all"} may be faster per iteration, but take more iterations 16 | #' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package 17 | #' to more quickly calculate the SVD. When the number of columns is small, 18 | #' the approximation may be less accurate and slower 19 | #' @param max_iters number of maximum iterations 20 | #' @param conv_criteria convergence criteria. The difference between average deviance 21 | #' in successive iterations 22 | #' @param random_start logical; whether to randomly inititalize the parameters. If \code{FALSE}, 23 | #' function will use an eigen-decomposition as starting value 24 | #' @param start_U starting value for the orthogonal matrix 25 | #' @param start_mu starting value for mu. Only used if \code{main_effects = TRUE} 26 | #' @param main_effects logical; whether to include main effects in the model 27 | #' @param normalize logical; whether to weight the variables to they all have equal influence 28 | #' @param validation a validation dataset to select \code{m} with 29 | #' @param val_weights weights associated with validation data 30 | #' 31 | #' @return An S3 object of class \code{gpca} which is a list with the 32 | #' following components: 33 | #' \item{mu}{the main effects} 34 | #' \item{U}{a \code{k}-dimentional orthonormal matrix with the loadings} 35 | #' \item{PCs}{the princial component scores} 36 | #' \item{M}{the parameter inputed} 37 | #' \item{family}{the exponential family used} 38 | #' \item{iters}{number of iterations required for convergence} 39 | #' \item{loss_trace}{the trace of the average deviance of the algorithm. 40 | #' Should be non-increasing} 41 | #' \item{prop_deviance_expl}{the proportion of deviance explained by this model. 
42 | #' If \code{main_effects = TRUE}, the null model is just the main effects, otherwise 43 | #' the null model estimates 0 for all natural parameters.} 44 | #' @examples 45 | #' # construct a low rank matrix in the natural parameter space 46 | #' rows = 100 47 | #' cols = 10 48 | #' set.seed(1) 49 | #' mat_np = outer(rnorm(rows), rnorm(cols)) 50 | #' 51 | #' # generate a count matrix 52 | #' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols) 53 | #' 54 | #' # run Poisson PCA on it 55 | #' gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson") 56 | #' @export 57 | #' @importFrom stats var 58 | generalizedPCA <- function(x, k = 2, M = 4, family = c("gaussian", "binomial", "poisson", "multinomial"), 59 | weights, quiet = TRUE, majorizer = c("row", "all"), 60 | partial_decomp = FALSE, max_iters = 1000, conv_criteria = 1e-5, 61 | random_start = FALSE, start_U, start_mu, main_effects = TRUE, 62 | normalize = FALSE, validation, val_weights) { 63 | family = match.arg(family) 64 | check_family(x, family) 65 | 66 | majorizer = match.arg(majorizer) 67 | 68 | x = as.matrix(x) 69 | missing_mat = is.na(x) 70 | n = nrow(x) 71 | d = ncol(x) 72 | ones = rep(1, n) 73 | 74 | if (missing(weights)) { 75 | weights = 1.0 76 | sum_weights = sum(!is.na(x)) 77 | } else { 78 | weights[is.na(x)] <- 0 79 | if (any(is.na(weights))) { 80 | stop("Can't have NA in weights") 81 | } 82 | if (any(weights < 0)) { 83 | stop("weights must be non-negative") 84 | } 85 | if (!all(dim(weights) == dim(x))) { 86 | stop("x and weights are not the same dimension") 87 | } 88 | sum_weights = sum(weights) 89 | } 90 | 91 | # calculate the null log likelihood for % deviance explained and normalization 92 | if (main_effects) { 93 | if (length(weights) == 1) { 94 | weighted_col_means = colMeans(x, na.rm = TRUE) 95 | } else { 96 | weighted_col_means = colSums(x * weights, na.rm = TRUE) / colSums(weights) 97 | } 98 | null_theta = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, 1), 
family, M)) 99 | } else { 100 | null_theta = rep(0, d) 101 | } 102 | 103 | if (normalize) { 104 | if (any(apply(x, 2, stats::var, na.rm = TRUE) == 0)) { 105 | stop("At least one variable has variance of 0. Cannot normalize") 106 | } 107 | 108 | eta_sat_nat = saturated_natural_parameters(x, family, M = Inf) 109 | 110 | norms = sapply(1:d, function(j) { 111 | 2 * (exp_fam_log_like(x[, j], eta_sat_nat[, j], family, weights) - 112 | exp_fam_log_like(x[, j], rep(null_theta[j], n), family, weights)) 113 | }) / n 114 | if (any(norms <= 0)) { 115 | stop("Normalization caused weights to be <= 0") 116 | } 117 | 118 | if (length(weights) == 1) { 119 | weights = outer(ones, 1 / norms) 120 | } else { 121 | weights = sweep(weights, 2, 1 / norms, "*") 122 | } 123 | } 124 | 125 | if (M == 0) { 126 | if (any(is.na(x))) { 127 | stop("Cannot solve for M with missing weights") 128 | } 129 | M = 4 130 | solve_M = TRUE 131 | if (!missing(validation)) { 132 | if (ncol(validation) != ncol(x)) { 133 | stop("validation does not have the same variables as x") 134 | } 135 | if (missing(val_weights)) { 136 | val_weights = 1.0 137 | } else { 138 | if (!all(dim(val_weights) == dim(validation))) { 139 | stop("validation and val_weights are not the same dimension") 140 | } 141 | } 142 | validation = as.matrix(validation) 143 | M_mat = exp_fam_sat_ind_mat(validation, family) 144 | } else { 145 | M_mat = exp_fam_sat_ind_mat(x, family) 146 | } 147 | } else { 148 | solve_M = FALSE 149 | } 150 | 151 | # if it is standard PCA, only need 1 iteration 152 | if (family == "gaussian" & all(weights == 1) & sum(missing_mat) == 0) { 153 | max_iters = 0 154 | } 155 | 156 | # Initialize # 157 | ################## 158 | if (main_effects) { 159 | if (!missing(start_mu)) { 160 | mu = start_mu 161 | } else { 162 | eta = saturated_natural_parameters(x, family, M = M) 163 | is.na(eta[is.na(x)]) <- TRUE 164 | mu = colMeans(eta, na.rm = TRUE) 165 | # mu = saturated_natural_parameters(colMeans(x, na.rm = TRUE), family, M) 
166 | } 167 | } else { 168 | mu = rep(0, d) 169 | } 170 | 171 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 172 | eta_centered = scale(eta, center = mu, scale = FALSE) 173 | 174 | if (!missing(start_U)) { 175 | U = sweep(start_U, 2, sqrt(colSums(start_U^2)), "/") 176 | } else if (random_start) { 177 | U = matrix(rnorm(d * k), d, k) 178 | U = qr.Q(qr(U)) 179 | } else { 180 | if (partial_decomp) { 181 | udv = RSpectra::svds(scale(eta, center = mu, scale = normalize), k, nu = k, nv = k) 182 | } else { 183 | udv = svd(scale(eta, center = mu, scale = normalize)) 184 | } 185 | U = matrix(udv$v[, 1:k], d, k) 186 | } 187 | 188 | eta_sat_nat = saturated_natural_parameters(x, family, M = Inf) 189 | sat_loglike = exp_fam_log_like(x, eta_sat_nat, family, weights) 190 | 191 | loss_trace = numeric(max_iters + 1) 192 | theta = outer(ones, mu) + eta_centered %*% tcrossprod(U) 193 | loglike <- exp_fam_log_like(x, theta, family, weights) 194 | loss_trace[1] = 2 * (sat_loglike - loglike) / sum_weights 195 | ptm <- proc.time() 196 | 197 | if (!quiet) { 198 | cat(0, " ", loss_trace[1], "") 199 | cat("0 hours elapsed\n") 200 | } 201 | 202 | for (m in seq_len(max_iters)) { 203 | last_U = U 204 | last_M = M 205 | last_mu = mu 206 | 207 | # TODO: incorporate missing data 208 | if (solve_M) { 209 | gpca_obj = structure(list(mu = mu, U = U, M = M, family = family), 210 | class = "gpca") 211 | if (missing(validation)) { 212 | fitted_theta = predict(gpca_obj, newdata = x, type = "link") 213 | } else { 214 | fitted_theta = predict(gpca_obj, newdata = validation, type = "link") 215 | } 216 | fitted_mean = exp_fam_mean(fitted_theta, family) 217 | 218 | if (missing(validation)) { 219 | M_slope = sum(((fitted_mean - x) * weights * (M_mat %*% tcrossprod(U)))[!is.na(M_mat)]) 220 | fitted_var = exp_fam_variance(fitted_theta, family, weights) 221 | } else { 222 | M_slope = sum(((fitted_mean - validation) * val_weights * (M_mat %*% tcrossprod(U)))[!is.na(M_mat)]) 
223 | fitted_var = exp_fam_variance(fitted_theta, family, val_weights) 224 | } 225 | M_curve = sum((fitted_var * (M_mat %*% tcrossprod(U))^2)[!is.na(M_mat)]) 226 | 227 | M = max(M - M_slope / M_curve, 0) 228 | 229 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 230 | eta_centered = scale(eta, center = mu, scale = FALSE) 231 | theta = outer(ones, mu) + eta_centered %*% tcrossprod(U) 232 | } 233 | 234 | first_dir = exp_fam_mean(theta, family) 235 | second_dir = exp_fam_variance(theta, family, weights) 236 | if (majorizer == "row") { 237 | W = apply(second_dir, 1, max) 238 | } else if (majorizer == "all") { 239 | W = rep(max(second_dir), n) 240 | } 241 | 242 | # EM style estimate of Z with theta when missing data 243 | Z = as.matrix(theta + weights * (x - first_dir) / outer(W, rep(1, d))) 244 | Z[is.na(x)] <- theta[is.na(x)] 245 | if (main_effects) { 246 | mu = as.numeric(colSums((Z - eta %*% tcrossprod(U)) * W) / sum(W)) 247 | } 248 | 249 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 250 | eta_centered = scale(eta, center = mu, scale = FALSE) 251 | 252 | mat_temp = t(eta_centered * W) %*% scale(Z, center = mu, scale = FALSE) 253 | mat_temp = mat_temp + t(mat_temp) - 254 | t(eta_centered * W) %*% eta_centered 255 | 256 | # RSpectra could give poor estimates of e-vectors 257 | # so I switch to standard eigen if it does 258 | repeat { 259 | if (partial_decomp) { 260 | eig = RSpectra::eigs_sym(mat_temp, k = min(k + 2, d)) 261 | } else { 262 | eig = eigen(mat_temp, symmetric = TRUE) 263 | } 264 | U = matrix(eig$vectors[, 1:k], d, k) 265 | 266 | theta = outer(ones, mu) + eta_centered %*% tcrossprod(U) 267 | this_loglike <- exp_fam_log_like(x, theta, family, weights) 268 | 269 | if (!partial_decomp | this_loglike>=loglike) { 270 | loglike = this_loglike 271 | break 272 | } else { 273 | partial_decomp = FALSE 274 | if (!quiet) { 275 | cat("RSpectra::eigs_sym was too inaccurate in iteration ", m , 
276 | ". Switched to base::eigen") 277 | } 278 | } 279 | } 280 | 281 | loss_trace[m + 1] = 2 * (sat_loglike - loglike) / sum_weights 282 | 283 | if (!quiet) { 284 | time_elapsed = as.numeric(proc.time() - ptm)[3] 285 | tot_time = max_iters / m * time_elapsed 286 | time_remain = tot_time - time_elapsed 287 | cat(m, " ", loss_trace[m + 1], "") 288 | cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n") 289 | } 290 | if (m > 4) { 291 | if (abs(loss_trace[m] - loss_trace[m+1]) < 2 * conv_criteria) { 292 | break 293 | } 294 | } 295 | } 296 | 297 | # test if loss function increases 298 | if (max_iters > 0 && (loss_trace[m + 1] - loss_trace[m]) > (1e-10)) { 299 | U = last_U 300 | mu = last_mu 301 | M = last_M 302 | m = m - 1 303 | 304 | eta = saturated_natural_parameters(x, family, M = M) + missing_mat * outer(ones, mu) 305 | eta_centered = scale(eta, center = mu, scale = FALSE) 306 | 307 | if (family != "poisson") { 308 | # maybe possible with missing data? TODO: look into 309 | warning("Deviance increased in last iteration.\nThis should not happen!") 310 | } else { 311 | message("Deviance increased in last iteration.") 312 | } 313 | } else if (max_iters == 0) { 314 | m = 0 315 | } 316 | 317 | null_loglike = exp_fam_log_like(x, outer(ones, null_theta), family, weights) 318 | 319 | object <- list(mu = mu, 320 | U = U, 321 | PCs = eta_centered %*% U, 322 | M = M, 323 | family = family, 324 | iters = m, 325 | loss_trace = loss_trace[1:(m + 1)], 326 | prop_deviance_expl = 1 - (loglike - sat_loglike) / (null_loglike - sat_loglike) 327 | ) 328 | class(object) <- "gpca" 329 | object 330 | } 331 | 332 | 333 | #' @title Predict generalized PCA scores or reconstruction on new data 334 | #' 335 | #' @description Predict generalized PCA scores or reconstruction on new data 336 | #' 337 | #' @param object generalized PCA object 338 | #' @param newdata matrix of the same exponential family as in \code{object}. 
#' @title Predict generalized PCA scores or reconstruction on new data
#'
#' @description Predict generalized PCA scores or reconstruction on new data
#'
#' @param object generalized PCA object
#' @param newdata matrix of the same exponential family as in \code{object}.
#'  If missing, will use the data that \code{object} was fit on
#' @param type the type of fitting required. \code{type = "PCs"} gives the PC scores,
#'  \code{type = "link"} gives matrix on the natural parameter scale and
#'  \code{type = "response"} gives matrix on the response scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrices in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#' mat_np_new = outer(rnorm(rows), loadings)
#'
#' # generate a count matrices
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols)
#'
#' # run Poisson PCA on it
#' gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson")
#'
#' PCs = predict(gpca, mat_new)
#' @export
predict.gpca <- function(object, newdata, type = c("PCs", "link", "response"), ...) {
  type <- match.arg(type)

  if (!missing(newdata)) {
    check_family(newdata, object$family)

    # project the new data's (shrunken) saturated natural parameters onto U,
    # substituting mu for any missing entries before centering
    eta_new <- saturated_natural_parameters(newdata, object$family, object$M) +
      is.na(newdata) * outer(rep(1, nrow(newdata)), object$mu)
    scores <- scale(eta_new, center = object$mu, scale = FALSE) %*% object$U
  } else {
    # no new data: reuse the scores computed at fit time
    scores <- object$PCs
  }

  if (type != "PCs") {
    # delegate link/response reconstruction to fitted() on a copy of the object
    object$PCs <- scores
    return(fitted(object, type, ...))
  }
  scores
}
#' @title Fitted values using generalized PCA
#'
#' @description
#' Fit a lower dimensional representation of the exponential family matrix using generalized PCA
#'
#' @param object generalized PCA object
#' @param type the type of fitting required. \code{type = "link"} gives output on the natural
#'  parameter scale and \code{type = "response"} gives output on the response scale
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' # run Poisson PCA on it
#' gpca = generalizedPCA(mat, k = 1, M = 4, family = "poisson")
#'
#' # construct fitted expected value of counts matrix
#' fit = fitted(gpca, type = "response")
#' @export
fitted.gpca <- function(object, type = c("link", "response"), ...) {
  type <- match.arg(type)

  # reconstruct the natural parameters: mu broadcast over rows plus PCs %*% t(U)
  ones <- rep(1, nrow(object$PCs))
  theta <- outer(ones, object$mu) + tcrossprod(object$PCs, object$U)

  switch(type,
         link = theta,
         response = exp_fam_mean(theta, object$family))
}
#' @title Plot generalized PCA
#'
#' @description
#' Plots the results of a generalized PCA
#'
#' @param x generalized PCA object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#'  iteration, \code{type = "loadings"} plots the first 2 principal component
#'  loadings, \code{type = "scores"} plots the first 2 principal component scores
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' # run logistic PCA on it
#' gpca = generalizedPCA(mat, k = 2, M = 4, family = "poisson")
#'
#' \dontrun{
#' plot(gpca)
#' }
#' @export
plot.gpca <- function(x, type = c("trace", "loadings", "scores"), ...) {
  type = match.arg(type)

  # Scatter the first two columns of a loadings/scores matrix.
  # A rank-1 fit has only one column, so pad with a zero second
  # coordinate and blank the y-axis label.
  # (The "loadings" and "scores" branches previously duplicated this
  # logic verbatim.)
  scatter_first_two = function(mat) {
    df = data.frame(mat)
    colnames(df) <- paste0("PC", 1:ncol(df))
    if (ncol(df) == 1) {
      df$PC2 = 0
      ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) +
        ggplot2::geom_point() +
        ggplot2::labs(y = NULL)
    } else {
      ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) +
        ggplot2::geom_point()
    }
  }

  if (type == "trace") {
    # deviance by iteration, starting at iteration 0 (the initial fit)
    df = data.frame(Iteration = 0:x$iters,
                    Deviance = x$loss_trace)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
      ggplot2::geom_line()
  } else if (type == "loadings") {
    p <- scatter_first_two(x$U)
  } else if (type == "scores") {
    p <- scatter_first_two(x$PCs)
  }

  return(p)
}
#' @export
print.gpca <- function(x, ...)
{
  # data dimensions and family; PCs has one row per observation,
  # U has one row per original column
  cat(nrow(x$PCs), "rows and", nrow(x$U), "columns of", x$family, "data\n")
  cat("Rank", ncol(x$U), "solution with M =", x$M, "\n")
  cat("\n")
  # fit quality and convergence summary
  cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
  cat(x$iters, "iterations to converge\n")

  invisible(x)
}
#' @title CV for generalized PCA
#'
#' @description
#' Run cross validation on dimension and \code{M} for generalized PCA
#'
#' @param x matrix of either binary, count, or continuous data
#' @param ks the different dimensions \code{k} to try
#' @param Ms the different approximations to the saturated model \code{M} to try
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with data weights
#' @param folds if \code{folds} is a scalar, then it is the number of folds. If
#'  it is a vector, it should be the same length as the number of rows in \code{x}
#' @param quiet logical; whether the function should display progress
#' @param ... Additional arguments passed to \code{generalizedPCA}
#'
#' @return A matrix of the CV log likelihood with \code{k} in rows and
#'  \code{M} in columns
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' \dontrun{
#' loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson", quiet = FALSE)
#' plot(loglikes)
#' }
#' @export
cv.gpca <- function(x, ks, Ms = seq(2, 10, by = 2), family = c("gaussian", "binomial", "poisson", "multinomial"),
                    weights, folds = 5, quiet = TRUE, ...)
{
  family = match.arg(family)
  check_family(x, family)

  if (length(folds) > 1) {
    # TODO: confirm this also behaves correctly when folds is a factor
    if (length(unique(folds)) <= 1) {
      stop("If inputing CV split, must be more than one level")
    }
    if (length(folds) != nrow(x)) {
      stop("if folds is a vector, it should be of same length as nrow(x)")
    }
    cv = folds
  } else {
    # randomly assign each row to one of `folds` folds
    cv = sample(1:folds, nrow(x), replace = TRUE)
  }

  if (missing(weights) || length(weights) == 1) {
    weights = matrix(1.0, nrow(x), ncol(x))
  }

  log_likes = matrix(0, length(ks), length(Ms),
                     dimnames = list(k = ks, M = Ms))
  for (k in ks) {
    for (M in Ms) {
      if (!quiet) {
        cat("k =", k, "M =", M, "")
      }
      # `fold` (not `c`) so base::c is not shadowed inside the loop
      for (fold in unique(cv)) {
        if (!quiet) {
          cat(".")
        }
        train = fold != cv
        test = fold == cv
        gpca = generalizedPCA(x[train, ], k = k, M = M, family = family, weights = weights[train, ], ...)
        # fixed partial argument match: `newdat` -> `newdata`
        pred_theta = predict(gpca, newdata = x[test, ], type = "link")
        log_likes[k == ks, M == Ms] = log_likes[k == ks, M == Ms] +
          exp_fam_log_like(x = x[test, ], theta = pred_theta, family = family, weights = weights[test, ])
      }
      if (!quiet) {
        cat("", log_likes[k == ks, M == Ms], "\n")
      }
    }
  }
  class(log_likes) <- c("matrix", "cv.gpca")
  # the best (k, M) maximizes the held-out log likelihood
  which_max = which(log_likes == max(log_likes), arr.ind = TRUE)
  if (!quiet) {
    cat("Best: k =", ks[which_max[1]], "M =", Ms[which_max[2]], "\n")
  }

  return(log_likes)
}
#' @title Plot CV for generalized PCA
#'
#' @description
#' Plot cross validation results generalized PCA
#'
#' @param x a \code{cv.gpca} object
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' \dontrun{
#' loglikes = cv.gpca(mat, ks = 1:9, Ms = 3:6, family = "poisson")
#' plot(loglikes)
#' }
#' @export
#' @importFrom utils type.convert
plot.cv.gpca <- function(x, ...) {
  # replaces reshape2::melt(-x, value.name = "NegLogLikelihood")
  # as.is = TRUE keeps character dimnames as-is instead of converting to
  # factor, and silences the "as.is should be specified" warning (R >= 3.6)
  Ms = utils::type.convert(colnames(x), as.is = TRUE)
  ks = utils::type.convert(rownames(x), as.is = TRUE)
  df = data.frame(k = rep(ks, times = length(Ms)),
                  m = rep(Ms, each = length(ks)),
                  NegLogLikelihood = as.vector(-x))

  if (ncol(x) == 1) {
    # only one M tried: plot against k, colouring by the (single) M.
    # bug fix: the column is named "m"; `df$M = factor(df$M)` referenced a
    # nonexistent column and errored ("replacement has 0 rows")
    df$m = factor(df$m)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("k", "NegLogLikelihood", colour = "m")) +
      ggplot2::geom_line()
  } else {
    df$k = factor(df$k)
    p <- ggplot2::ggplot(df, ggplot2::aes_string("m", "NegLogLikelihood", colour = "k")) +
      ggplot2::geom_line()
  }
  return(p)
}