├── NAMESPACE ├── inst └── CITATION ├── MD5 ├── DESCRIPTION ├── man ├── rrBLUP-package.Rd ├── kinship.BLUP.Rd ├── A.mat.Rd ├── mixed.solve.Rd ├── GWAS.Rd └── kin.blup.Rd ├── R ├── kinship.BLUP.R ├── kin.blup.R ├── mixed.solve.R ├── A.mat.R └── GWAS.R └── NEWS /NAMESPACE: -------------------------------------------------------------------------------- 1 | importFrom(parallel,clusterExport) 2 | importFrom(parallel,makeCluster) 3 | importFrom(parallel,parLapply) 4 | importFrom(parallel,stopCluster) 5 | importFrom("grDevices", "dev.cur", "dev.new", "dev.next", "dev.set") 6 | importFrom("graphics", "axis", "lines", "par", "points", "title") 7 | importFrom("stats", "cov", "dist", "median", "model.matrix", "optimize", "pbeta", "ppoints", "predict", "smooth.spline","var") 8 | export(mixed.solve,GWAS,kinship.BLUP,A.mat,kin.blup) 9 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite the rrBLUP package in publications use:") 2 | 3 | bibentry( 4 | bibtype="Article", 5 | title="Ridge regression and other kernels for genomic selection with R package rrBLUP", 6 | author=as.person("J. B. Endelman"), 7 | journal="Plant Genome", 8 | year=2011, 9 | volume=4, 10 | pages="250-255", 11 | textVersion=paste("Endelman, J.B. 2011. Ridge regression and other kernels", 12 | "for genomic selection with R package rrBLUP. Plant Genome 4:250-255.") 13 | ) -------------------------------------------------------------------------------- /MD5: -------------------------------------------------------------------------------- 1 | 344385ca7db0830dc9a5c1f11db5f478 *DESCRIPTION 2 | 6d39635a89ceefd16d5eee86b2e0551c *NAMESPACE 3 | 2d546dd39382e9b747f331c96cde4ddc *NEWS 4 | 50ccb3ca85692abc5aa87eee8833fbf6 *R/A.mat.R 5 | db7b0e1640f03742d0a44ca5829558d9 *R/GWAS.R 6 | f42e2c8db0a184e47c684627a457a542 *R/kin.blup.R 7 | 8337be307fa8255206b4c505abb2a2e5 *R/kinship.BLUP.R 8 | f0b2d35594029936eff56e0b05cac09b *R/mixed.solve.R 9 | c79901b854bc8471e6ac0db523041139 *inst/CITATION 10 | 0f4883b4930ced50c23c212e55ae7b2b *man/A.mat.Rd 11 | 8873b1be71d10607ffd207d29d03415a *man/GWAS.Rd 12 | ae1576b1fedf894752b72d4635f76086 *man/kin.blup.Rd 13 | aeaff872c68712be50b3941de3873b50 *man/kinship.BLUP.Rd 14 | c2d59e14f73fae955f3cd17eecd6145e *man/mixed.solve.Rd 15 | 7ad01f9d6f2b401ae298065f8ad03e21 *man/rrBLUP-package.Rd 16 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rrBLUP 2 | Title: Ridge Regression and Other Kernels for Genomic Selection 3 | Version: 4.6.3 4 | Author: Jeffrey Endelman 5 | Maintainer: Jeffrey Endelman 6 | Depends: R (>= 4.0) 7 | Imports: stats, graphics, grDevices, parallel 8 | Description: Software for genomic prediction with the RR-BLUP mixed model (Endelman 2011, ). One application is to estimate marker effects by ridge regression; alternatively, BLUPs can be calculated based on an additive relationship matrix or a Gaussian kernel. 
9 | License: GPL-3 10 | URL: 11 | NeedsCompilation: no 12 | Packaged: 2023-12-09 20:44:08 UTC; endelman 13 | Repository: CRAN 14 | Date/Publication: 2023-12-10 17:10:06 UTC 15 | -------------------------------------------------------------------------------- /man/rrBLUP-package.Rd: -------------------------------------------------------------------------------- 1 | \name{rrBLUP-package} 2 | \alias{rrBLUP-package} 3 | \docType{package} 4 | \title{Ridge regression and other kernels for genomic selection} 5 | \description{This package has been developed primarily for genomic prediction with mixed models (but it can also do genome-wide association mapping with \code{\link{GWAS}}). The heart of the package is the function \code{\link{mixed.solve}}, which is a general-purpose solver for mixed models with a single variance component other than the error. Genomic predictions can be made by estimating marker effects (RR-BLUP) or by estimating line effects (G-BLUP). In Endelman (2011) I made the poor choice of using the letter G to denote the genotype or marker data. To be consistent with Endelman (2011) I have retained this notation in \code{\link{kinship.BLUP}}. However, that function has now been superseded by \code{\link{kin.blup}} and \code{\link{A.mat}}, the latter being a utility for estimating the additive relationship matrix (A) from markers. In these newer functions I adopt the usual convention that G is the genetic covariance (not the marker data), which is also consistent with the notation in Endelman and Jannink (2012). 6 | 7 | Vignettes illustrating some of the features of this package can be found at \url{https://potatobreeding.cals.wisc.edu/software/}. 8 | } 9 | 10 | \references{ 11 | Endelman, J.B. 2011. Ridge regression and other kernels for genomic selection with R package rrBLUP. Plant Genome 4:250-255. 12 | 13 | Endelman, J.B., and J.-L. Jannink. 2012. Shrinkage estimation of the realized relationship matrix. G3:Genes, Genomes, Genetics 2:1405-1413. 
14 | } 15 | -------------------------------------------------------------------------------- /R/kinship.BLUP.R: -------------------------------------------------------------------------------- 1 | kinship.BLUP <- function(y,G.train,G.pred=NULL,X=NULL,Z.train=NULL,K.method="RR",n.profile=10,mixed.method="REML",n.core=1) { 2 | #assumes genotypes coded on [-1,1] scale 3 | #continuous values OK 4 | 5 | K.method <- toupper(K.method) 6 | 7 | n.obs <- length(y) 8 | y <- matrix(y,n.obs,1) 9 | 10 | if (is.null(X)) { 11 | p <- 1 12 | X <- matrix(rep(1,n.obs),n.obs,1) 13 | } 14 | p <- ncol(X) 15 | if (is.null(p)) { 16 | p <- 1 17 | X <- matrix(X,length(X),1) 18 | } 19 | 20 | stopifnot(nrow(X)==n.obs) 21 | 22 | if (is.null(Z.train)) { 23 | Z.train <- diag(n.obs) 24 | } 25 | 26 | m <- ncol(G.train) 27 | n.train <- nrow(G.train) 28 | 29 | stopifnot(ncol(Z.train)==n.train) 30 | stopifnot(nrow(Z.train)==n.obs) 31 | 32 | if (!is.null(G.pred)) { 33 | stopifnot(ncol(G.pred)==m) 34 | n.pred <- nrow(G.pred) 35 | } else { 36 | n.pred <- 0 37 | } 38 | 39 | Z <- cbind(Z.train,matrix(rep(0,n.obs*n.pred),n.obs,n.pred)) 40 | G <- rbind(G.train,G.pred) 41 | 42 | if (K.method == "RR") { 43 | K <- A.mat(G,n.core=n.core) 44 | soln <- mixed.solve(y=y,X=X,Z=Z,K=K,method=mixed.method) 45 | if (n.pred > 0) { 46 | return(list(g.train=soln$u[1:n.train],g.pred=soln$u[n.train+1:n.pred],beta=soln$beta)) 47 | } else { 48 | return(list(g.train=soln$u[1:n.train],beta=soln$beta)) 49 | } 50 | } else { 51 | if ((K.method != "EXP")&(K.method != "GAUSS")) {stop("Invalid K.method")} 52 | # "exp" or "gauss" 53 | theta <- setdiff(seq(0,1,length.out=n.profile+1),0) 54 | D <- as.matrix(dist(G))/2/sqrt(m) 55 | 56 | ms.fun <- function(theta) { 57 | soln <- list() 58 | n.t <- length(theta) 59 | for (i in 1:n.t) { 60 | if (K.method == "EXP") {K <- exp(-D/theta[i])} 61 | if (K.method == "GAUSS") {K <- exp(-(D/theta[i])^2) } 62 | soln[[i]] <- mixed.solve(y=y,X=X,Z=Z,K=K,method=mixed.method) 63 | } 64 | return(soln) 65 | } 66 | 67 | if ((n.core > 1) & requireNamespace("parallel",quietly=TRUE)) { 68 | it <- split(theta,factor(cut(theta,n.core,labels=FALSE))) 69 | soln <- unlist(parallel::mclapply(it,ms.fun,mc.cores=n.core),recursive=FALSE) 70 | } else { 71 | soln <- ms.fun(theta) 72 | } 73 | 74 | LL <- rep(0,n.profile) 75 | for (i in 1:n.profile) {LL[i] <- soln[[i]]$LL} 76 | 77 | #find maximum LL soln 78 | max.LL <- which.max(LL) 79 | g.train <- soln[[max.LL]]$u[1:n.train] 80 | if (n.pred > 0) { 81 | g.pred <- soln[[max.LL]]$u[n.train+1:n.pred] 82 | return(list(profile=cbind(theta,LL),g.train=g.train,g.pred=g.pred,beta=soln[[max.LL]]$beta)) 83 | } else { 84 | return(list(profile=cbind(theta,LL),g.train=g.train,beta=soln[[max.LL]]$beta)) 85 | } 86 | 87 | } #if K.method 88 | } #function 89 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Changes in 4.6.3: 2 | * Header in package overview help file 3 | 4 | Changes in 4.6.2: 5 | * Changed parallel execution in A.mat to use cluster instead of forking 6 | 7 | Changes in 4.6.1: 8 | * Updated code to address changes in behavior of "class" in base R 9 | 10 | Changes in 4.6: 11 | * Fixed error in reference manual regarding shrinkage option for A.mat 12 | * Changed normalization of A in Yang/Mueller shrinkage method 13 | 14 | Changes in 4.5: 15 | * Two shrinkage options now in A.mat 16 | * Removed "reduce" and heterogeneous error options 17 | 18 | Changes in 4.4: 19 | * Modified GWAS 
plot functionality to play nice with RStudio 20 | * kin.blup returns predicted values, averaged over the fixed effects 21 | * kin.blup allows specification of heterogeneous error variance 22 | 23 | Changes in 4.3: 24 | * Old-style vignettes are no longer allowed on CRAN, so they have been moved to http://potatobreeding.cals.wisc.edu/software. 25 | 26 | Changes in 4.2: 27 | * shrink=FALSE is now the default in A.mat 28 | * Only return imputed markers in A.mat when impute=TRUE 29 | 30 | Changes in 4.1: 31 | * Replaced function GWA with GWAS for association mapping. Improvements include (1) P3D is now optional, (2) multiple phenotypes handled, (3) Manhattan and qq plots are generated, (4) uses data frame for input. 32 | * Changed default to reduce=FALSE in kin.blup 33 | 34 | Changes in 4.0: 35 | 36 | * kinship.BLUP is no longer being improved. It has been replaced by kin.blup, which has a more intuitive interface. 37 | * Shrinkage estimation functionality has been added to A.mat, which is beneficial at low marker density. 38 | 39 | Changes in 3.8: 40 | 41 | * Fixed error regarding imputation of monomorphic markers in A.mat. 42 | 43 | Changes in 3.7: 44 | 45 | * Improved handling of positive semi-definite kinship matrix in mixed.solve. 46 | 47 | Changes in 3.6: 48 | 49 | * Improved error handling in mixed.solve. 50 | 51 | Changes in 3.5: 52 | 53 | * Labels now transfer from input to output with mixed.solve. 54 | 55 | Changes in 3.4: 56 | 57 | * Additional features for A.mat. 58 | 59 | Changes in 3.3: 60 | 61 | * Improved handling of missing genotype data in A.mat. 62 | * Removed impute function. Imputation is now done through A.mat. 63 | 64 | Changes in 3.2: 65 | 66 | * Parallel computation enabled for GWA, A.mat, and kinship.BLUP. 67 | * Missing genotypic data permitted for GWA. 68 | 69 | Changes in 3.1: 70 | 71 | * Fixed error in regression coefficient for adjusted A.mat. 72 | 73 | Changes in 3.0: 74 | 75 | * Major changes to A.mat. Read the help manual. 76 | 77 | Changes in 2.8: 78 | 79 | * Added NEWS file 80 | * Updated A.mat to calculate both the raw and adjusted UAR models. 81 | 82 | Changes in 2.7: 83 | 84 | * Updated mixed.solve to handle missing observations. 85 | * Updated A.mat to handle missing alleles. 86 | 87 | Changes in 2.5: 88 | 89 | * rrBLUP manuscript is published. Included CITATION file. 90 | 91 | -------------------------------------------------------------------------------- /man/kinship.BLUP.Rd: -------------------------------------------------------------------------------- 1 | \name{kinship.BLUP} 2 | \alias{kinship.BLUP} 3 | 4 | \title{ 5 | Genomic prediction by kinship-BLUP (deprecated) 6 | } 7 | \description{ 8 | ***This function has been superseded by \code{\link{kin.blup}}; please refer to its help page. 9 | } 10 | \usage{ 11 | kinship.BLUP(y, G.train, G.pred=NULL, X=NULL, Z.train=NULL, 12 | K.method="RR", n.profile=10, mixed.method="REML", n.core=1) 13 | } 14 | 15 | \arguments{ 16 | \item{y}{ 17 | Vector (\eqn{n.obs \times 1}) of observations. Missing values (NA) are omitted. 18 | } 19 | \item{G.train}{ 20 | Matrix (\eqn{n.train \times m}) of unphased genotypes for the training population: \eqn{n.train} lines with \eqn{m} bi-allelic markers. 21 | Genotypes should be coded as \{-1,0,1\}; fractional (imputed) and missing (NA) alleles are allowed. 22 | } 23 | \item{G.pred}{ 24 | Matrix (\eqn{n.pred \times m}) of unphased genotypes for the prediction population: \eqn{n.pred} lines with \eqn{m} bi-allelic markers. 
25 | Genotypes should be coded as \{-1,0,1\}; fractional (imputed) and missing (NA) alleles are allowed. 26 | } 27 | \item{X}{ 28 | Design matrix (\eqn{n.obs \times p}) of fixed effects. If not passed, a vector of 1's is used 29 | to model the intercept. 30 | } 31 | \item{Z.train}{ 32 | 0-1 matrix (\eqn{n.obs \times n.train}) relating observations to lines in the training set. If not passed 33 | the identity matrix is used. 34 | } 35 | \item{K.method}{ 36 | "RR" (default) is ridge regression, for which K is the realized additive relationship matrix computed with \code{\link{A.mat}}. The option "GAUSS" is a Gaussian kernel (\eqn{K = e^{-D^2/\theta^2}}) and "EXP" is an exponential kernel (\eqn{K = e^{-D/\theta}}), where Euclidean distances \eqn{D} are computed with \code{\link{dist}}. 37 | } 38 | 39 | \item{n.profile}{ 40 | For K.method = "GAUSS" or "EXP", the number of points to use in the log-likelihood profile for the scale parameter \eqn{\theta}. 41 | } 42 | \item{mixed.method}{ 43 | Either "REML" (default) or "ML". 44 | } 45 | \item{n.core}{ 46 | Setting n.core > 1 will enable parallel execution of the Gaussian kernel computation (use only at UNIX command line). 47 | } 48 | } 49 | \value{ 50 | \describe{ 51 | \item{$g.train}{BLUP solution for the training set} 52 | \item{$g.pred}{BLUP solution for the prediction set (when G.pred != NULL)} 53 | \item{$beta}{ML estimate of fixed effects} 54 | } 55 | For GAUSS or EXP, function also returns 56 | \describe{ 57 | \item{$profile}{log-likelihood profile for the scale parameter} 58 | } 59 | } 60 | \references{ 61 | Endelman, J.B. 2011. Ridge regression and other kernels for genomic selection with R package rrBLUP. Plant Genome 4:250-255. 62 | } 63 | 64 | \examples{ 65 | #random population of 200 lines with 1000 markers 66 | G <- matrix(rep(0,200*1000),200,1000) 67 | for (i in 1:200) { 68 | G[i,] <- ifelse(runif(1000)<0.5,-1,1) 69 | } 70 | 71 | #random phenotypes 72 | g <- as.vector(crossprod(t(G),rnorm(1000))) 73 | h2 <- 0.5 74 | y <- g + rnorm(200,mean=0,sd=sqrt((1-h2)/h2*var(g))) 75 | 76 | #split in half for training and prediction 77 | train <- 1:100 78 | pred <- 101:200 79 | ans <- kinship.BLUP(y=y[train],G.train=G[train,],G.pred=G[pred,],K.method="GAUSS") 80 | 81 | #correlation accuracy 82 | r.gy <- cor(ans$g.pred,y[pred]) 83 | } -------------------------------------------------------------------------------- /man/A.mat.Rd: -------------------------------------------------------------------------------- 1 | \name{A.mat} 2 | \alias{A.mat} 3 | \title{Additive relationship matrix} 4 | \usage{ 5 | A.mat( 6 | X, 7 | min.MAF = NULL, 8 | max.missing = NULL, 9 | impute.method = "mean", 10 | tol = 0.02, 11 | n.core = 1, 12 | shrink = FALSE, 13 | return.imputed = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{matrix (\eqn{n \times m}) of unphased genotypes for \eqn{n} lines and \eqn{m} biallelic markers, coded as \{-1,0,1\}. Fractional (imputed) and missing values (NA) are allowed.} 18 | 19 | \item{min.MAF}{Minimum minor allele frequency. The A matrix is not sensitive to rare alleles, so by default only monomorphic markers are removed.} 20 | 21 | \item{max.missing}{Maximum proportion of missing data; default removes completely missing markers.} 22 | 23 | \item{impute.method}{There are two options. The default is "mean", which imputes with the mean for each marker. 
The "EM" option imputes with an EM algorithm (see details).} 24 | 25 | \item{tol}{Specifies the convergence criterion for the EM algorithm (see details).} 26 | 27 | \item{n.core}{Specifies the number of cores to use for parallel execution of the EM algorithm} 28 | 29 | \item{shrink}{set shrink=FALSE to disable shrinkage estimation. See Details for how to enable shrinkage estimation.} 30 | 31 | \item{return.imputed}{When TRUE, the imputed marker matrix is returned.} 32 | } 33 | \value{ 34 | If return.imputed = FALSE, the \eqn{n \times n} additive relationship matrix is returned. If return.imputed = TRUE, the function returns a list containing 35 | \describe{ 36 | \item{$A}{the A matrix} 37 | \item{$imputed}{the imputed marker matrix} 38 | } 39 | } 40 | \description{ 41 | Calculates the realized additive relationship matrix 42 | } 43 | \details{ 44 | At high marker density, the relationship matrix is estimated as \eqn{A=W W'/c}, where \eqn{W_{ik} = X_{ik} + 1 - 2 p_k} and \eqn{p_k} is the frequency of the 1 allele at marker k. By using a normalization constant of \eqn{c = 2 \sum_k {p_k (1-p_k)}}, the mean of the diagonal elements is \eqn{1 + f} (Endelman and Jannink 2012). 45 | The EM imputation algorithm is based on the multivariate normal distribution and was designed for use with GBS (genotyping-by-sequencing) markers, which tend to be high density but with lots of missing data. Details are given in Poland et al. (2012). The EM algorithm stops at iteration \eqn{t} when the RMS error = \eqn{n^{-1} \|A_{t} - A_{t-1}\|_2} < tol. 46 | Shrinkage estimation can improve the accuracy of genome-wide marker-assisted selection, particularly at low marker density (Endelman and Jannink 2012). The shrinkage intensity ranges from 0 (no shrinkage) to 1 (\eqn{A=(1+f)I}). Two algorithms for estimating the shrinkage intensity are available. The first is the method described in Endelman and Jannink (2012) and is specified by \code{shrink=list(method="EJ")}. The second involves designating a random sample of the markers as simulated QTL and then regressing the A matrix based on the QTL against the A matrix based on the remaining markers (Yang et al. 2010; Mueller et al. 2015). The regression method is specified by \code{shrink=list(method="REG",n.qtl=100,n.iter=5)}, where the parameters \code{n.qtl} and \code{n.iter} can be varied to adjust the number of simulated QTL and number of iterations, respectively. 47 | The shrinkage and EM-imputation options are designed for opposite scenarios (low vs. high density) and cannot be used simultaneously. 48 | When the EM algorithm is used, the imputed alleles can lie outside the interval [-1,1]. Polymorphic markers that do not meet the min.MAF and max.missing criteria are not imputed. 49 | } 50 | \references{ 51 | Endelman, J.B., and J.-L. Jannink. 2012. Shrinkage estimation of the realized relationship matrix. G3:Genes, Genomes, Genetics. 2:1405-1413. 52 | 53 | Mueller et al. 2015. Shrinkage estimation of the genomic relationship matrix can improve genomic estimated breeding values in the training set. Theor Appl Genet 128:693-703. 54 | 55 | Poland, J., J. Endelman et al. 2012. Genomic selection in wheat breeding using genotyping-by-sequencing. Plant Genome 5:103-113. 56 | 57 | Yang et al. 2010. Common SNPs explain a large proportion of the heritability for human height. Nat. Genetics 42:565-569. 
58 | } 59 | -------------------------------------------------------------------------------- /R/kin.blup.R: -------------------------------------------------------------------------------- 1 | kin.blup <- function(data,geno,pheno,GAUSS=FALSE,K=NULL,fixed=NULL,covariate=NULL,PEV=FALSE,n.core=1,theta.seq=NULL) { 2 | 3 | make.full <- function(X) { 4 | svd.X <- svd(X) 5 | r <- max(which(svd.X$d>1e-8)) 6 | return(as.matrix(svd.X$u[,1:r])) 7 | } 8 | 9 | names <- colnames(data) 10 | ypos <- match(pheno,names) 11 | if (is.na(ypos)) { 12 | stop("Phenotype name does not appear in data.") 13 | } else { 14 | y <- as.numeric(data[,ypos]) 15 | } 16 | 17 | not.miss <- which(!is.na(y)) 18 | resid <- vector("numeric",length(y))*NA 19 | if (length(not.miss) 1) {X <- cbind(X,model.matrix(~x-1,data.frame(x=xx)))} 32 | } 33 | } 34 | if (!is.null(covariate)) { 35 | p <- length(covariate) 36 | for (i in 1:p) { 37 | xpos <- match(covariate[i],names) 38 | X <- cbind(X,data[,xpos]) 39 | } 40 | } 41 | 42 | gid.pos <- match(geno,names) 43 | if (is.na(gid.pos)) {stop("Genotype name does not appear in data.")} 44 | 45 | not.miss.gid <- as.character(unique(data[,gid.pos])) 46 | if (is.null(K)) { 47 | gid <- not.miss.gid 48 | v <- length(gid) 49 | Z <- matrix(0,n,v) 50 | colnames(Z) <- gid 51 | Z[cbind(1:n,match(data[,gid.pos],gid))] <- 1 52 | X2 <- make.full(X) 53 | ans <- mixed.solve(y=y,X=X2,Z=Z,SE=PEV) 54 | resid[not.miss] <- y-X2%*%ans$beta-Z%*%ans$u 55 | if (PEV) { 56 | return(list(Vg=ans$Vu,Ve=ans$Ve,g=ans$u,PEV=ans$u.SE^2,resid=resid,pred=ans$u+as.numeric(colMeans(X2)%*%ans$beta))) 57 | } else { 58 | return(list(Vg=ans$Vu,Ve=ans$Ve,g=ans$u,resid=resid,pred=ans$u+as.numeric(colMeans(X2)%*%ans$beta))) 59 | } 60 | } else { 61 | 62 | if (inherits(K,what="dist")) {K <- as.matrix(K)} 63 | gid <- rownames(K) 64 | ix.pheno <- match(not.miss.gid,gid) 65 | miss.pheno.gid <- which(is.na(ix.pheno)) 66 | if (length(miss.pheno.gid)>0) { 67 | stop(paste("The following lines have phenotypes but no genotypes:",paste(not.miss.gid[miss.pheno.gid],collapse=" "))) 68 | } 69 | miss.gid <- setdiff(gid,not.miss.gid) 70 | ix <- c(ix.pheno,match(miss.gid,gid)) 71 | K <- K[ix,ix] 72 | v <- length(not.miss.gid) 73 | Z <- matrix(0,n,v) 74 | Z[cbind(1:n,match(data[,gid.pos],not.miss.gid))] <- 1 75 | 76 | X2 <- make.full(X) 77 | Z2 <- cbind(Z,matrix(0,n,nrow(K)-v)) 78 | 79 | if (!GAUSS) { 80 | ans <- mixed.solve(y=y,X=X2,Z=Z2,K=K,SE=PEV) 81 | ix <- match(gid,rownames(ans$u)) 82 | resid[not.miss] <- y-X2%*%ans$beta-Z2%*%ans$u 83 | if (PEV) { 84 | return(list(Vg=ans$Vu,Ve=ans$Ve,g=ans$u[ix],PEV=ans$u.SE[ix]^2,resid=resid,pred=ans$u[ix]+as.numeric(colMeans(X2)%*%ans$beta))) 85 | } else { 86 | return(list(Vg=ans$Vu,Ve=ans$Ve,g=ans$u[ix],resid=resid,pred=ans$u[ix]+as.numeric(colMeans(X2)%*%ans$beta))) 87 | } 88 | 89 | } else { 90 | 91 | if (is.null(theta.seq)) { 92 | theta <- setdiff(seq(0,max(K),length.out=11),0) 93 | } else { 94 | theta <- theta.seq 95 | } 96 | n.profile <- length(theta) 97 | ms.fun <- function(theta) { 98 | soln <- list() 99 | n.t <- length(theta) 100 | for (i in 1:n.t) { 101 | soln[[i]] <- mixed.solve(y=y,X=X2,Z=Z2,K=exp(-(K/theta[i])^2),SE=PEV) 102 | } 103 | return(soln) 104 | } 105 | 106 | if ((n.core > 1) & requireNamespace("parallel",quietly=TRUE)) { 107 | it <- split(theta,factor(cut(theta,n.core,labels=FALSE))) 108 | soln <- unlist(parallel::mclapply(it,ms.fun,mc.cores=n.core),recursive=FALSE) 109 | } else { 110 | soln <- ms.fun(theta) 111 | } 112 | 113 | LL <- rep(0,n.profile) 114 | for (i in 1:n.profile) {LL[i] <- 
soln[[i]]$LL} 115 | ans <- soln[[which.max(LL)]] 116 | profile <- cbind(theta,LL) 117 | ix <- match(gid,rownames(ans$u)) 118 | resid[not.miss] <- y-X2%*%ans$beta-Z2%*%ans$u 119 | 120 | if (PEV) { 121 | return(list(Vg=ans$Vu,Ve=ans$Ve,profile=profile,g=ans$u[ix],PEV=ans$u.SE[ix]^2,resid=resid,pred=ans$u[ix]+as.numeric(colMeans(X2)%*%ans$beta))) 122 | } else { 123 | return(list(Vg=ans$Vu,Ve=ans$Ve,profile=profile,g=ans$u[ix],resid=resid,pred=ans$u[ix]+as.numeric(colMeans(X2)%*%ans$beta))) 124 | } 125 | 126 | } #else GAUSS 127 | } #else is.null(K) 128 | } #kin.blup -------------------------------------------------------------------------------- /R/mixed.solve.R: -------------------------------------------------------------------------------- 1 | mixed.solve <- function (y, Z = NULL, K = NULL, X = NULL, method = "REML", bounds = c(1e-09,1e+09), SE = FALSE, return.Hinv = FALSE) { 2 | pi <- 3.14159 3 | n <- length(y) 4 | y <- matrix(y,n,1) 5 | 6 | not.NA <- which(!is.na(y)) 7 | 8 | if (is.null(X)) { 9 | p <- 1 10 | X <- matrix(rep(1,n),n,1) 11 | } 12 | p <- ncol(X) 13 | if (is.null(p)) { 14 | p <- 1 15 | X <- matrix(X,length(X),1) 16 | } 17 | if (is.null(Z)) { 18 | Z <- diag(n) 19 | } 20 | m <- ncol(Z) 21 | if (is.null(m)) { 22 | m <- 1 23 | Z <- matrix(Z,length(Z),1) 24 | } 25 | stopifnot(nrow(Z) == n) 26 | stopifnot(nrow(X) == n) 27 | if (!is.null(K)) { 28 | stopifnot(nrow(K) == m) 29 | stopifnot(ncol(K) == m) 30 | } 31 | 32 | Z <- as.matrix(Z[not.NA,]) 33 | X <- as.matrix(X[not.NA,]) 34 | n <- length(not.NA) 35 | y <- matrix(y[not.NA],n,1) 36 | 37 | XtX <- crossprod(X, X) 38 | rank.X <- qr(XtX)$rank 39 | if (rank.X < p) {stop("X not full rank")} 40 | XtXinv <- solve(XtX) 41 | S <- diag(n) - tcrossprod(X%*%XtXinv,X) 42 | if (n <= m + p) { 43 | spectral.method <- "eigen" 44 | } else { 45 | spectral.method <- "cholesky" 46 | if (!is.null(K)) { 47 | diag(K) <- diag(K) + 1e-6 48 | B <- try(chol(K),silent=TRUE) 49 | if (inherits(B,what="try-error")) {stop("K not positive semi-definite.")} 50 | } # if is.null 51 | } 52 | 53 | if (spectral.method=="cholesky") { 54 | if (is.null(K)) { 55 | ZBt <- Z 56 | } else { 57 | ZBt <- tcrossprod(Z,B) 58 | } 59 | svd.ZBt <- svd(ZBt,nu=n) 60 | U <- svd.ZBt$u 61 | phi <- c(svd.ZBt$d^2,rep(0,n-m)) 62 | SZBt <- S %*% ZBt 63 | svd.SZBt <- try(svd(SZBt),silent=TRUE) 64 | if (inherits(svd.SZBt,what="try-error")) {svd.SZBt <- svd(SZBt+matrix(1e-10,nrow=nrow(SZBt),ncol=ncol(SZBt)))} 65 | QR <- qr(cbind(X,svd.SZBt$u)) 66 | Q <- qr.Q(QR,complete=TRUE)[,(p+1):n] 67 | R <- qr.R(QR)[p+1:m,p+1:m] 68 | ans <- try(solve(t(R^2), svd.SZBt$d^2),silent=TRUE) 69 | if (inherits(ans,what="try-error")) { 70 | spectral.method <- "eigen" 71 | } else { 72 | theta <- c(ans,rep(0, n - p - m)) 73 | } 74 | } 75 | 76 | if (spectral.method=="eigen") { 77 | offset <- sqrt(n) 78 | if (is.null(K)) { 79 | Hb <- tcrossprod(Z,Z) + offset*diag(n) 80 | } else { 81 | Hb <- tcrossprod(Z%*%K,Z) + offset*diag(n) 82 | } 83 | Hb.system <- eigen(Hb, symmetric = TRUE) 84 | phi <- Hb.system$values - offset 85 | if (min(phi) < -1e-6) {stop("K not positive semi-definite.")} 86 | U <- Hb.system$vectors 87 | SHbS <- S %*% Hb %*% S 88 | SHbS.system <- eigen(SHbS, symmetric = TRUE) 89 | theta <- SHbS.system$values[1:(n - p)] - offset 90 | Q <- SHbS.system$vectors[, 1:(n - p)] 91 | } 92 | 93 | omega <- crossprod(Q, y) 94 | omega.sq <- omega^2 95 | if (method == "ML") { 96 | f.ML <- function(lambda, n, theta, omega.sq, phi) { 97 | n * log(sum(omega.sq/(theta + lambda))) + sum(log(phi + lambda)) 98 | } 99 | soln <- 
optimize(f.ML, interval = bounds, n, theta, omega.sq, phi) 100 | lambda.opt <- soln$minimum 101 | df <- n 102 | } else { 103 | f.REML <- function(lambda, n.p, theta, omega.sq) { 104 | n.p * log(sum(omega.sq/(theta + lambda))) + sum(log(theta + lambda)) 105 | } 106 | soln <- optimize(f.REML, interval = bounds, n - p, theta, omega.sq) 107 | lambda.opt <- soln$minimum 108 | df <- n - p 109 | } #if method 110 | Vu.opt <- sum(omega.sq/(theta + lambda.opt))/df 111 | Ve.opt <- lambda.opt * Vu.opt 112 | Hinv <- U %*% (t(U)/(phi+lambda.opt)) 113 | W <- crossprod(X,Hinv%*%X) 114 | beta <- array(solve(W,crossprod(X,Hinv%*%y))) 115 | rownames(beta) <- colnames(X) 116 | 117 | if (is.null(K)) { 118 | KZt <- t(Z) 119 | } else { 120 | KZt <- tcrossprod(K,Z) 121 | } 122 | KZt.Hinv <- KZt %*% Hinv 123 | u <- array(KZt.Hinv %*% (y - X%*%beta)) 124 | 125 | if (is.null(K)) { 126 | rownames(u) <- colnames(Z) 127 | } else { 128 | rownames(u) <- rownames(K) 129 | } 130 | 131 | LL = -0.5 * (soln$objective + df + df * log(2 * pi/df)) 132 | if (!SE) { 133 | if (return.Hinv) { 134 | return(list(Vu = Vu.opt, Ve = Ve.opt, beta = beta, u = u, LL = LL, Hinv = Hinv)) 135 | } else { 136 | return(list(Vu = Vu.opt, Ve = Ve.opt, beta = beta, u = u, LL = LL)) 137 | } 138 | } else { 139 | Winv <- solve(W) 140 | beta.SE <- array(sqrt(Vu.opt*diag(Winv))) 141 | rownames(beta.SE) <- rownames(beta) 142 | WW <- tcrossprod(KZt.Hinv,KZt) 143 | WWW <- KZt.Hinv%*%X 144 | if (is.null(K)) { 145 | u.SE <- array(sqrt(Vu.opt * (rep(1,m) - diag(WW) + diag(tcrossprod(WWW%*%Winv,WWW))))) 146 | } else { 147 | u.SE <- array(sqrt(Vu.opt * (diag(K) - diag(WW) + diag(tcrossprod(WWW%*%Winv,WWW))))) 148 | } 149 | rownames(u.SE) <- rownames(u) 150 | 151 | if (return.Hinv) { 152 | return(list(Vu = Vu.opt, Ve = Ve.opt, beta = beta, beta.SE = beta.SE, u = u, u.SE = u.SE, LL = LL, Hinv = Hinv)) 153 | } else { 154 | return(list(Vu = Vu.opt, Ve = Ve.opt, beta = beta, beta.SE = beta.SE, u = u, u.SE = u.SE, LL = LL)) 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /man/mixed.solve.Rd: -------------------------------------------------------------------------------- 1 | \name{mixed.solve} 2 | \alias{mixed.solve} 3 | 4 | \title{ 5 | Mixed-model solver 6 | } 7 | \description{ 8 | Calculates maximum-likelihood (ML/REML) solutions for mixed models of the form 9 | 10 | \deqn{y = X \beta + Z u + \varepsilon} 11 | 12 | where \eqn{\beta} is a vector of fixed effects and \eqn{u} is a vector of random effects with 13 | \eqn{Var[u] = K \sigma^2_u}. The residual variance is \eqn{Var[\varepsilon] = I \sigma^2_e}. This class 14 | of mixed models, in which there is a single variance component other than the residual error, 15 | has a close relationship with ridge regression (ridge parameter \eqn{\lambda = \sigma_e^2 / \sigma^2_u}). 16 | } 17 | \usage{ 18 | mixed.solve(y, Z=NULL, K=NULL, X=NULL, method="REML", 19 | bounds=c(1e-09, 1e+09), SE=FALSE, return.Hinv=FALSE) 20 | } 21 | 22 | \arguments{ 23 | \item{y}{ 24 | Vector (\eqn{n \times 1}) of observations. Missing values (NA) are omitted, along with the corresponding rows of X and Z. 25 | } 26 | \item{Z}{ 27 | Design matrix (\eqn{n \times m}) for the random effects. If not passed, assumed to be the identity matrix. 28 | } 29 | \item{K}{ 30 | Covariance matrix (\eqn{m \times m}) for random effects; must be positive semi-definite. If not passed, assumed to 31 | be the identity matrix. 32 | } 33 | \item{X}{ 34 | Design matrix (\eqn{n \times p}) for the fixed effects. 
If not passed, a vector of 1's is used 35 | to model the intercept. X must be full column rank (implies \eqn{\beta} is estimable). 36 | } 37 | \item{method}{ 38 | Specifies whether the full ("ML") or restricted ("REML") maximum-likelihood method is used. 39 | } 40 | \item{bounds}{ 41 | Array with two elements specifying the lower and upper bound for the ridge parameter. 42 | } 43 | \item{SE}{ 44 | If TRUE, standard errors are calculated. 45 | } 46 | \item{return.Hinv}{ 47 | If TRUE, the function returns the inverse of \eqn{H = Z K Z' + \lambda I}. This is useful for \code{\link{GWAS}}. 48 | } 49 | } 50 | \details{ 51 | This function can be used to predict marker effects or breeding values (see examples). The numerical method 52 | is based on the spectral decomposition of \eqn{Z K Z'} and \eqn{S Z K Z' S}, where \eqn{S = I - X (X' X)^{-1} X'} is 53 | the projection operator for the nullspace of \eqn{X} (Kang et al., 2008). This algorithm generates the inverse phenotypic covariance matrix \eqn{V^{-1}}, which can then be used to calculate the BLUE and BLUP solutions for the fixed and random effects, respectively, using standard formulas (Searle et al. 1992): 54 | 55 | \deqn{BLUE(\beta) = \beta^* = (X'V^{-1}X)^{-1}X'V^{-1}y} 56 | \deqn{BLUP(u) = u^* = \sigma^2_u KZ'V^{-1}(y-X\beta^*)} 57 | 58 | The standard errors are calculated as the square root of the diagonal elements of the following matrices (Searle et al. 1992): 59 | \deqn{Var[\beta^*] = (X'V^{-1}X)^{-1}} 60 | \deqn{Var[u^*-u] = K \sigma^2_u - \sigma^4_u KZ'V^{-1}ZK + \sigma^4_u KZ'V^{-1}XVar[\beta^*]X'V^{-1}ZK} 61 | 62 | For marker effects where K = I, the function will run faster if K is not passed than if the user passes the identity matrix. 63 | } 64 | \value{ 65 | If SE=FALSE, the function returns a list containing 66 | \describe{ 67 | \item{$Vu}{estimator for \eqn{\sigma^2_u}} 68 | \item{$Ve}{estimator for \eqn{\sigma^2_e}} 69 | \item{$beta}{BLUE(\eqn{\beta})} 70 | \item{$u}{BLUP(\eqn{u})} 71 | \item{$LL}{maximized log-likelihood (full or restricted, depending on method)} 72 | } 73 | 74 | If SE=TRUE, the list also contains 75 | \describe{ 76 | \item{$beta.SE}{standard error for \eqn{\beta}} 77 | \item{$u.SE}{standard error for \eqn{u^*-u}} 78 | } 79 | 80 | If return.Hinv=TRUE, the list also contains 81 | \describe{ 82 | \item{$Hinv}{the inverse of \eqn{H}} 83 | } 84 | } 85 | \references{ 86 | Kang et al. 2008. Efficient control of population structure in model organism association mapping. 87 | Genetics 178:1709-1723. 88 | 89 | Endelman, J.B. 2011. Ridge regression and other kernels for genomic selection with R package rrBLUP. Plant Genome 4:250-255. 90 | 91 | Searle, S.R., G. Casella and C.E. McCulloch. 1992. Variance Components. John Wiley, Hoboken. 
92 | } 93 | 94 | \examples{ 95 | #random population of 200 lines with 1000 markers 96 | M <- matrix(rep(0,200*1000),200,1000) 97 | for (i in 1:200) { 98 | M[i,] <- ifelse(runif(1000)<0.5,-1,1) 99 | } 100 | 101 | #random phenotypes 102 | u <- rnorm(1000) 103 | g <- as.vector(crossprod(t(M),u)) 104 | h2 <- 0.5 #heritability 105 | y <- g + rnorm(200,mean=0,sd=sqrt((1-h2)/h2*var(g))) 106 | 107 | #predict marker effects 108 | ans <- mixed.solve(y,Z=M) #By default K = I 109 | accuracy <- cor(u,ans$u) 110 | 111 | #predict breeding values 112 | ans <- mixed.solve(y,K=A.mat(M)) 113 | accuracy <- cor(g,ans$u) 114 | 115 | } 116 | 117 | -------------------------------------------------------------------------------- /man/GWAS.Rd: -------------------------------------------------------------------------------- 1 | \name{GWAS} 2 | \alias{GWAS} 3 | 4 | \title{ 5 | Genome-wide association analysis 6 | } 7 | \description{ 8 | Performs genome-wide association analysis based on the mixed model (Yu et al. 2006): 9 | 10 | \deqn{y = X \beta + Z g + S \tau + \varepsilon} 11 | 12 | where \eqn{\beta} is a vector of fixed effects that can model both environmental factors and population structure. 13 | The variable \eqn{g} models the genetic background of each line as a random effect with \eqn{Var[g] = K \sigma^2}. 14 | The variable \eqn{\tau} models the additive SNP effect as a fixed effect. The residual variance is \eqn{Var[\varepsilon] = I \sigma_e^2}. 15 | } 16 | \usage{ 17 | GWAS(pheno, geno, fixed=NULL, K=NULL, n.PC=0, 18 | min.MAF=0.05, n.core=1, P3D=TRUE, plot=TRUE) 19 | } 20 | 21 | \arguments{ 22 | \item{pheno}{ 23 | Data frame where the first column is the line name (gid). The remaining columns can be either a phenotype or the levels of a fixed effect. Any column not designated as a fixed effect is assumed to be a phenotype. 24 | } 25 | \item{geno}{ 26 | Data frame with the marker names in the first column. The second and third columns contain the chromosome and map position (either bp or cM), respectively, which are used only when plot=TRUE to make Manhattan plots. If the markers are unmapped, just use a placeholder for those two columns. Columns 4 and higher contain the marker scores for each line, coded as \{-1,0,1\} = \{aa,Aa,AA\}. Fractional (imputed) and missing (NA) values are allowed. The column names must match the line names in the "pheno" data frame. 27 | } 28 | \item{fixed}{ 29 | An array of strings containing the names of the columns that should be included as (categorical) fixed effects in the mixed model. 30 | } 31 | \item{K}{ 32 | Kinship matrix for the covariance between lines due to a polygenic effect. If not passed, it is calculated from the markers using \code{\link{A.mat}}. 33 | } 34 | \item{n.PC}{ 35 | Number of principal components to include as fixed effects. Default is 0 (equals K model). 36 | } 37 | \item{min.MAF}{ 38 | Specifies the minimum minor allele frequency (MAF). If a marker has a MAF less than min.MAF, it is assigned a zero score. 39 | } 40 | \item{n.core}{ 41 | Setting n.core > 1 will enable parallel execution on a machine with multiple cores (use only at UNIX command line). 42 | } 43 | \item{P3D}{ 44 | When P3D=TRUE, variance components are estimated by REML only once, without any markers in the model. When P3D=FALSE, variance components are estimated by REML for each marker separately. 45 | } 46 | \item{plot}{ 47 | When plot=TRUE, qq and Manhattan plots are generated. 
48 | } 49 | } 50 | \details{ 51 | For unbalanced designs where phenotypes come from different environments, the environment mean can be modeled using the fixed option (e.g., fixed="env" if the column in the pheno data.frame is called "env"). When principal components are included (P+K model), the loadings are determined from an eigenvalue decomposition of the K matrix. 52 | 53 | The terminology "P3D" (population parameters previously determined) was introduced by Zhang et al. (2010). When P3D=FALSE, this function is equivalent to EMMA with REML (Kang et al. 2008). When P3D=TRUE, it is equivalent to EMMAX (Kang et al. 2010). The P3D=TRUE option is faster but can underestimate significance compared to P3D=FALSE. 54 | 55 | The dashed line in the Manhattan plots corresponds to an FDR rate of 0.05 and is calculated using the qvalue package (Storey and Tibshirani 2003). The p-value corresponding to a q-value of 0.05 is determined by interpolation. When there are no q-values less than 0.05, the dashed line is omitted. 56 | } 57 | \value{ 58 | Returns a data frame where the first three columns are the marker name, chromosome, and position, and subsequent columns are the marker scores \eqn{(-log_{10}p)} for the traits. 59 | } 60 | \references{ 61 | Kang et al. 2008. Efficient control of population structure in model organism association mapping. Genetics 178:1709-1723. 62 | 63 | Kang et al. 2010. Variance component model to account for sample structure in genome-wide association studies. 64 | Nat. Genet. 42:348-354. 65 | 66 | Storey and Tibshirani. 2003. Statistical significance for genome-wide studies. PNAS 100:9440-9445. 67 | 68 | Yu et al. 2006. A unified mixed-model method for association mapping that accounts for multiple levels of relatedness. Nat. Genet. 38:203-208. 69 | 70 | Zhang et al. 2010. Mixed linear model approach adapted for genome-wide association studies. Nat. Genet. 42:355-360. 71 | } 72 | 73 | \examples{ 74 | #random population of 200 lines with 1000 markers 75 | M <- matrix(rep(0,200*1000),1000,200) 76 | for (i in 1:200) { 77 | M[,i] <- ifelse(runif(1000)<0.5,-1,1) 78 | } 79 | colnames(M) <- 1:200 80 | geno <- data.frame(marker=1:1000,chrom=rep(1,1000),pos=1:1000,M,check.names=FALSE) 81 | 82 | QTL <- 100*(1:5) #pick 5 QTL 83 | u <- rep(0,1000) #marker effects 84 | u[QTL] <- 1 85 | g <- as.vector(crossprod(M,u)) 86 | h2 <- 0.5 87 | y <- g + rnorm(200,mean=0,sd=sqrt((1-h2)/h2*var(g))) 88 | 89 | pheno <- data.frame(line=1:200,y=y) 90 | scores <- GWAS(pheno,geno,plot=FALSE) 91 | } 92 | -------------------------------------------------------------------------------- /man/kin.blup.Rd: -------------------------------------------------------------------------------- 1 | \name{kin.blup} 2 | \alias{kin.blup} 3 | 4 | \title{ 5 | Genotypic value prediction based on kinship 6 | } 7 | \description{ 8 | Genotypic value prediction by G-BLUP, where the genotypic covariance G can be additive or based on a Gaussian kernel. 9 | } 10 | \usage{ 11 | kin.blup(data,geno,pheno,GAUSS=FALSE,K=NULL,fixed=NULL,covariate=NULL, 12 | PEV=FALSE,n.core=1,theta.seq=NULL) 13 | } 14 | 15 | \arguments{ 16 | \item{data}{ 17 | Data frame with columns for the phenotype, the genotype identifier, and any environmental variables. 18 | } 19 | \item{geno}{ 20 | Character string for the name of the column in the data frame that contains the genotype identifier. 21 | } 22 | \item{pheno}{ 23 | Character string for the name of the column in the data frame that contains the phenotype. 
24 | } 25 | \item{GAUSS}{ 26 | To model genetic covariance with a Gaussian kernel, set GAUSS=TRUE and pass the Euclidean distance for K (see below). 27 | } 28 | \item{K}{ 29 | There are three options for specifying kinship: 30 | (1) If K=NULL, genotypes are assumed to be independent \eqn{(G=I \: V_g)}. 31 | (2) For breeding value prediction, set GAUSS=FALSE and use an additive relationship matrix for K to create the model \eqn{(G=K \: V_g)}. 32 | (3) For the Gaussian kernel, set GAUSS=TRUE and pass the Euclidean distance matrix for K to create the model \eqn{G_{ij}=e^{-(K_{ij}/\theta)^2} \: V_g}. 33 | } 34 | \item{fixed}{ 35 | An array of strings containing the names of columns that should be included as (categorical) fixed effects in the mixed model. 36 | } 37 | \item{covariate}{ 38 | An array of strings containing the names of columns that should be included as covariates in the mixed model. 39 | } 40 | \item{PEV}{ 41 | When PEV=TRUE, the function returns the prediction error variance for the genotypic values (\eqn{PEV_i = Var[g^*_i-g_i]}). 42 | } 43 | \item{n.core}{ 44 | Specifies the number of cores to use for parallel execution of the Gaussian kernel method (use only at UNIX command line). 45 | } 46 | \item{theta.seq}{ 47 | The scale parameter for the Gaussian kernel is set by maximizing the restricted log-likelihood over a grid of values. By default, the grid is constructed by dividing the interval (0,max(K)] into 10 points. Passing a numeric array to this variable (theta.seq = "theta sequence") will specify a different set of grid points (e.g., for large problems you might want fewer than 10). 48 | } 49 | } 50 | \details{ 51 | This function is a wrapper for \code{\link{mixed.solve}} and thus solves mixed models of the form: 52 | \deqn{y = X \beta + [Z \: 0] g + \varepsilon} 53 | where \eqn{\beta} is a vector of fixed effects, \eqn{g} is a vector of random genotypic values with covariance 54 | \eqn{G = Var[g]}, and the residuals follow \eqn{Var[\varepsilon_i] = R_i \sigma^2_e}, with \eqn{R_i = 1} by default. The design matrix for the genetic values has been partitioned to illustrate that not all lines need phenotypes (i.e., for genomic selection). Unlike \code{\link{mixed.solve}}, this function does not return estimates of the fixed effects, only the BLUP solution for the genotypic values. It was designed to replace \code{\link{kinship.BLUP}} and to relieve the user of having to explicitly construct design matrices. Variance components are estimated by REML and BLUP values are returned for every entry in K, regardless of whether it has been phenotyped. The rownames of K must match the genotype labels in the data frame for phenotyped lines; missing phenotypes (NA) are simply omitted. 55 | 56 | Unlike its predecessor, this function does not handle marker data directly. For breeding value prediction, the user must supply a relationship matrix, which can be calculated from markers with \code{\link{A.mat}}. For Gaussian kernel predictions, pass the Euclidean distance matrix for K, which can be calculated with \code{\link{dist}}. 57 | 58 | In the terminology of mixed models, both the "fixed" and "covariate" variables are fixed effects (\eqn{\beta} in the above equation): the former are treated as factors with distinct levels while the latter are continuous with one coefficient per variable. The population mean is automatically included as a fixed effect. 
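For example, assuming the data frame also contained a categorical column named "env" and a numeric column named "cov1" (hypothetical names used only for illustration, not part of the examples below), both could be supplied in a single call:
\preformatted{
## "env" and "cov1" are hypothetical column names for illustration;
## data, A, "gid" and "y" follow the examples at the end of this help page
ans <- kin.blup(data=data, geno="gid", pheno="y", K=A,
                fixed="env", covariate="cov1")
}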
59 | 60 | The prediction error variance (PEV) is the square of the SE of the BLUPs (see \code{\link{mixed.solve}}) and can be used to estimate the expected accuracy of BLUP predictions according to \eqn{r^2_i = 1 - \frac{PEV_i}{V_g K_{ii}}}. 61 | 62 | } 63 | \value{ 64 | The function always returns 65 | \describe{ 66 | \item{$Vg}{REML estimate of the genetic variance} 67 | \item{$Ve}{REML estimate of the error variance} 68 | \item{$g}{BLUP solution for the genetic values} 69 | \item{$resid}{residuals} 70 | \item{$pred}{predicted genetic values, averaged over the fixed effects} 71 | } 72 | If PEV = TRUE, the list also includes 73 | \describe{ 74 | \item{$PEV}{Prediction error variance for the genetic values} 75 | } 76 | If GAUSS = TRUE, the list also includes 77 | \describe{ 78 | \item{$profile}{the log-likelihood profile for the scale parameter in the Gaussian kernel} 79 | } 80 | } 81 | 82 | \references{ 83 | Endelman, J.B. 2011. Ridge regression and other kernels for genomic selection with R package rrBLUP. Plant Genome 4:250-255. 84 | } 85 | 86 | \examples{ 87 | #random population of 200 lines with 1000 markers 88 | M <- matrix(rep(0,200*1000),200,1000) 89 | for (i in 1:200) { 90 | M[i,] <- ifelse(runif(1000)<0.5,-1,1) 91 | } 92 | rownames(M) <- 1:200 93 | A <- A.mat(M) 94 | 95 | #random phenotypes 96 | u <- rnorm(1000) 97 | g <- as.vector(crossprod(t(M),u)) 98 | h2 <- 0.5 #heritability 99 | y <- g + rnorm(200,mean=0,sd=sqrt((1-h2)/h2*var(g))) 100 | 101 | data <- data.frame(y=y,gid=1:200) 102 | 103 | #predict breeding values 104 | ans <- kin.blup(data=data,geno="gid",pheno="y",K=A) 105 | accuracy <- cor(g,ans$g) 106 | 107 | } 108 | -------------------------------------------------------------------------------- /R/A.mat.R: -------------------------------------------------------------------------------- 1 | A.mat <- function(X,min.MAF=NULL,max.missing=NULL, 2 | impute.method="mean",tol=0.02,n.core=1, 3 | shrink=FALSE,return.imputed=FALSE){ 4 | 5 | if (mode(shrink)=="list") { 6 | shrink.method <- shrink$method 7 | if (!is.element(shrink.method,c("EJ","REG"))) {stop("Invalid shrinkage method.")} 8 | shrink.iter <- shrink$n.iter 9 | n.qtl <- shrink$n.qtl 10 | shrink <- TRUE 11 | } else { 12 | if (shrink) { #included for backwards compatibility 13 | shrink.method <- "EJ" 14 | } 15 | } 16 | 17 | shrink.coeff <- function(i,W,n.qtl,p){ 18 | m <- ncol(W) 19 | n <- nrow(W) 20 | qtl <- sample(1:m,n.qtl) 21 | A.mark <- tcrossprod(W[,-qtl])/sum(2*p[-qtl]*(1-p[-qtl])) 22 | A.qtl <- tcrossprod(W[,qtl])/sum(2*p[qtl]*(1-p[qtl])) 23 | x <- as.vector(A.mark - mean(diag(A.mark))*diag(n)) 24 | y <- as.vector(A.qtl - mean(diag(A.qtl))*diag(n)) 25 | return(1-cov(y,x)/var(x)) 26 | } 27 | 28 | impute.EM <- function(W, cov.mat, mean.vec) { 29 | n <- nrow(W) 30 | m <- ncol(W) 31 | S <- matrix(0,n,n) 32 | for (i in 1:m) { 33 | Wi <- matrix(W[,i],n,1) 34 | missing <- which(is.na(Wi)) 35 | if (length(missing) > 0) { 36 | not.NA <- setdiff(1:n,missing) 37 | Bt <- solve(cov.mat[not.NA,not.NA],cov.mat[not.NA,missing]) 38 | Wi[missing] <- mean.vec[missing] + crossprod(Bt,Wi[not.NA]-mean.vec[not.NA]) 39 | C <- cov.mat[missing,missing] - crossprod(cov.mat[not.NA,missing],Bt) 40 | D <- tcrossprod(Wi) 41 | D[missing,missing] <- D[missing,missing] + C 42 | W[,i] <- Wi 43 | } else {D <- tcrossprod(Wi)} 44 | S <- S + D 45 | } 46 | return(list(S=S,W.imp=W)) 47 | } 48 | 49 | cov.W.shrink <- function(W) { 50 | m <- ncol(W) 51 | n <- nrow(W) 52 | Z <- t(scale(t(W),scale=FALSE)) 53 | Z2 <- Z^2 54 | S <- tcrossprod(Z)/m 55 | target 
<- mean(diag(S))*diag(n) 56 | var.S <- tcrossprod(Z2)/m^2-S^2/m 57 | b2 <- sum(var.S) 58 | d2 <- sum((S-target)^2) 59 | delta <- max(0,min(1,b2/d2)) 60 | print(paste("Shrinkage intensity:",round(delta,2))) 61 | return(target*delta + (1-delta)*S) 62 | } 63 | 64 | X <- as.matrix(X) 65 | n <- nrow(X) 66 | frac.missing <- apply(X,2,function(x){length(which(is.na(x)))/n}) 67 | missing <- max(frac.missing) > 0 68 | freq <- apply(X + 1, 2, function(x) {mean(x, na.rm = missing)})/2 69 | MAF <- apply(rbind(freq,1-freq),2,min) 70 | if (is.null(min.MAF)) {min.MAF <- 1/(2*n)} 71 | if (is.null(max.missing)) {max.missing <- 1 - 1/(2*n)} 72 | markers <- which((MAF >= min.MAF)&(frac.missing <= max.missing)) 73 | m <- length(markers) 74 | var.A <- 2 * mean(freq[markers] * (1 - freq[markers])) 75 | one <- matrix(1, n, 1) 76 | 77 | mono <- which(freq*(1-freq)==0) 78 | X[,mono] <- 2*tcrossprod(one,matrix(freq[mono],length(mono),1))-1 79 | 80 | freq.mat <- tcrossprod(one, matrix(freq[markers], m, 1)) 81 | W <- X[, markers] + 1 - 2 *freq.mat 82 | 83 | if (!missing) { 84 | if (shrink) { 85 | if (shrink.method=="EJ") { 86 | W.mean <- rowMeans(W) 87 | cov.W <- cov.W.shrink(W) 88 | A <- (cov.W+tcrossprod(W.mean))/var.A 89 | } else { 90 | if (n.core > 1) { 91 | cl <- makeCluster(n.core) 92 | clusterExport(cl=cl,varlist=NULL) 93 | it <- split(1:shrink.iter,factor(cut(1:shrink.iter,n.core,labels=FALSE))) 94 | 95 | delta <- unlist(parLapply(cl,X=it, 96 | fun=function(ix,W,n.qtl,p){apply(array(ix),1,shrink.coeff,W=W,n.qtl=n.qtl,p=p)}, 97 | W=W,n.qtl=n.qtl,p=freq.mat[1,])) 98 | stopCluster(cl) 99 | } else { 100 | delta <- apply(array(1:shrink.iter),1,shrink.coeff,W=W,n.qtl=n.qtl,p=freq.mat[1,]) 101 | } 102 | delta <- mean(delta,na.rm=T) 103 | print(paste("Shrinkage intensity:",round(delta,2))) 104 | A <- tcrossprod(W)/var.A/m 105 | A <- (1-delta)*A + delta*mean(diag(A))*diag(n) 106 | } 107 | } else { 108 | A <- tcrossprod(W)/var.A/m 109 | } 110 | rownames(A) <- rownames(X) 111 | colnames(A) <- rownames(A) 112 | if (return.imputed) { 113 | return(list(A=A,imputed=X)) 114 | } else { 115 | return(A) 116 | } 117 | } else { 118 | #impute 119 | isna <- which(is.na(W)) 120 | W[isna] <- 0 121 | 122 | if (toupper(impute.method)=="EM") { 123 | if (m < n) { 124 | print("Linear dependency among the lines: imputing with mean instead of EM algorithm.") 125 | } else { 126 | mean.vec.new <- matrix(rowMeans(W),n,1) 127 | cov.mat.new <- cov(t(W)) 128 | if (qr(cov.mat.new)$rank < nrow(cov.mat.new)-1) { 129 | print("Linear dependency among the lines: imputing with mean instead of EM algorithm.") 130 | } else { 131 | 132 | #do EM algorithm 133 | W[isna] <- NA 134 | A.new <- (cov.mat.new + tcrossprod(mean.vec.new))/var.A 135 | err <- tol+1 136 | print("A.mat converging:") 137 | if (n.core > 1) { 138 | cl <- makeCluster(n.core) 139 | clusterExport(cl=cl,varlist=NULL) 140 | } 141 | 142 | while (err >= tol) { 143 | A.old <- A.new 144 | cov.mat.old <- cov.mat.new 145 | mean.vec.old <- mean.vec.new 146 | if (n.core > 1) { 147 | it <- split(1:m,factor(cut(1:m,n.core,labels=FALSE))) 148 | pieces <- parLapply(cl,it,function(mark2){impute.EM(W[,mark2],cov.mat.old,mean.vec.old)}) 149 | } else { 150 | pieces <- list() 151 | pieces[[1]] <- impute.EM(W,cov.mat.old,mean.vec.old) 152 | } 153 | n.pieces <- length(pieces) 154 | S <- matrix(0,n,n) 155 | W.imp <- numeric(0) 156 | for (i in 1:n.pieces) { 157 | S <- S + pieces[[i]]$S 158 | W.imp <- cbind(W.imp,pieces[[i]]$W.imp) 159 | } 160 | mean.vec.new <- matrix(rowMeans(W.imp),n,1) 161 | cov.mat.new <- 
(S-tcrossprod(mean.vec.new)*m)/(m-1) 162 | A.new <- (cov.mat.new + tcrossprod(mean.vec.new))/var.A 163 | err <- norm(A.old-A.new,type="F")/n 164 | print(err,digits=3) 165 | } 166 | rownames(A.new) <- rownames(X) 167 | colnames(A.new) <- rownames(A.new) 168 | if (n.core > 1) 169 | stopCluster(cl) 170 | 171 | if (return.imputed) { 172 | Ximp <- W.imp - 1 + 2*freq.mat 173 | colnames(Ximp) <- colnames(X)[markers] 174 | rownames(Ximp) <- rownames(X) 175 | return(list(A=A.new,imputed=Ximp)) 176 | } else { 177 | return(A.new) 178 | } 179 | } #else EM 180 | } #else EM 181 | } #else EM 182 | 183 | #imputing with mean 184 | if (shrink) { 185 | if (shrink.method=="EJ") { 186 | W.mean <- rowMeans(W) 187 | cov.W <- cov.W.shrink(W) 188 | A <- (cov.W+tcrossprod(W.mean))/var.A 189 | } else { 190 | if (n.core > 1) { 191 | cl <- makeCluster(n.core) 192 | clusterExport(cl=cl,varlist=NULL) 193 | it <- split(1:shrink.iter,factor(cut(1:shrink.iter,n.core,labels=FALSE))) 194 | delta <- unlist(parLapply(cl,X=it,fun=function(ix,W,n.qtl,p){apply(array(ix),1,shrink.coeff,W=W,n.qtl=n.qtl,p=p)},W=W,n.qtl=n.qtl,p=freq.mat[1,])) 195 | stopCluster(cl) 196 | } else { 197 | delta <- apply(array(1:shrink.iter),1,shrink.coeff,W=W,n.qtl=n.qtl,p=freq.mat[1,]) 198 | } 199 | delta <- mean(delta,na.rm=T) 200 | print(paste("Shrinkage intensity:",round(delta,2))) 201 | A <- tcrossprod(W)/var.A/m 202 | A <- (1-delta)*A + delta*mean(diag(A))*diag(n) 203 | } 204 | } else { 205 | A <- tcrossprod(W)/var.A/m 206 | } 207 | rownames(A) <- rownames(X) 208 | colnames(A) <- rownames(A) 209 | 210 | if (return.imputed) { 211 | Ximp <- W - 1 + 2*freq.mat 212 | colnames(Ximp) <- colnames(X)[markers] 213 | rownames(Ximp) <- rownames(X) 214 | return(list(A=A,imputed=Ximp)) 215 | } else { 216 | return(A) 217 | } 218 | } #else missing 219 | 220 | } #A.mat 221 | 222 | -------------------------------------------------------------------------------- /R/GWAS.R: -------------------------------------------------------------------------------- 1 | GWAS <- function(pheno,geno,fixed=NULL,K=NULL,n.PC=0,min.MAF=0.05,n.core=1,P3D=TRUE,plot=TRUE) { 2 | 3 | qvalue <- function(p) { 4 | smooth.df = 3 5 | 6 | if(min(p)<0 || max(p)>1) { 7 | print("ERROR: p-values not in valid range.") 8 | return(0) 9 | } 10 | 11 | lambda=seq(0,0.90,0.05) 12 | m <- length(p) 13 | 14 | pi0 <- rep(0,length(lambda)) 15 | for(i in 1:length(lambda)) {pi0[i] <- mean(p >= lambda[i])/(1-lambda[i])} 16 | 17 | spi0 <- smooth.spline(lambda,pi0,df=smooth.df) 18 | pi0 <- predict(spi0,x=max(lambda))$y 19 | pi0 <- min(pi0,1) 20 | 21 | if(pi0 <= 0) { 22 | print("ERROR: The estimated pi0 <= 0. 
Check that you have valid p-values.") 23 | return(0) 24 | } 25 | 26 | #The estimated q-values calculated here 27 | u <- order(p) 28 | 29 | # ranking function which returns number of observations less than or equal 30 | qvalue.rank <- function(x) { 31 | idx <- sort.list(x) 32 | 33 | fc <- factor(x) 34 | nl <- length(levels(fc)) 35 | bin <- as.integer(fc) 36 | tbl <- tabulate(bin) 37 | cs <- cumsum(tbl) 38 | 39 | tbl <- rep(cs, tbl) 40 | tbl[idx] <- tbl 41 | 42 | return(tbl) 43 | } 44 | 45 | v <- qvalue.rank(p) 46 | 47 | qvalue <- pi0*m*p/v 48 | qvalue[u[m]] <- min(qvalue[u[m]],1) 49 | for(i in (m-1):1) {qvalue[u[i]] <- min(qvalue[u[i]],qvalue[u[i+1]],1)} 50 | 51 | return(qvalue) 52 | } 53 | 54 | manhattan <- function(input,fdr.level=0.05) { 55 | #first column is marker name 56 | #second is chromosome 57 | #third is map position 58 | #fourth is score 59 | input <- input[order(input[,2],input[,3]),] 60 | 61 | chroms <- unique(input[,2]) 62 | n.chrom <- length(chroms) 63 | chrom.start <- rep(0,n.chrom) 64 | chrom.mid <- rep(0,n.chrom) 65 | 66 | if (n.chrom > 1) { 67 | for (i in 1:(n.chrom-1)) {chrom.start[i+1] <- chrom.start[i]+max(input[which(input[,2]==chroms[i]),3])+1} 68 | } 69 | x.max <- chrom.start[n.chrom]+max(input[which(input[,2]==chroms[n.chrom]),3]) 70 | plot(0,0,type="n",xlim=c(0,x.max),ylim=c(0,max(input[,4])+1),ylab="-log(p)",xlab="Chromosome",xaxt="n") 71 | 72 | for (i in seq(1,n.chrom,by=2)) { 73 | ix <- which(input[,2]==chroms[i]) 74 | chrom.mid[i] <- median(chrom.start[i]+input[ix,3]) 75 | points(chrom.start[i]+input[ix,3],input[ix,4],col="dark blue",pch=16) 76 | } 77 | 78 | if (n.chrom > 1){ 79 | for (i in seq(2,n.chrom,by=2)) { 80 | ix <- which(input[,2]==chroms[i]) 81 | chrom.mid[i] <- median(chrom.start[i]+input[ix,3]) 82 | points(chrom.start[i]+input[ix,3],input[ix,4],col="cornflowerblue",pch=16) 83 | } 84 | } 85 | 86 | q.ans <- qvalue(10^-input[,4]) 87 | temp <- cbind(q.ans,input[,4]) 88 | temp <- temp[order(temp[,1]),] 89 | if (temp[1,1]0) { 105 | x <- sort(scores[-remove],decreasing=TRUE) 106 | } else { 107 | x <- sort(scores,decreasing=TRUE) 108 | } 109 | n <- length(x) 110 | unif.p <- -log10(ppoints(n)) 111 | plot(unif.p,x,pch=16,xlab="Expected -log(p)",ylab="Observed -log(p)") 112 | lines(c(0,max(unif.p)),c(0,max(unif.p)),lty=2) 113 | } 114 | 115 | score.calc <- function(M) { 116 | m <- ncol(M) 117 | scores <- array(0,m) 118 | 119 | for (i in 1:m) { 120 | 121 | Mi <- M[,i] 122 | freq <- mean(Mi+1,na.rm=TRUE)/2 123 | MAF <- min(freq,1-freq) 124 | if (MAF >= min.MAF) { 125 | not.NA.gid <- which(!is.na(Mi)) 126 | temp <- rep(1,length(Mi)) 127 | temp[not.NA.gid] <- 0 128 | not.NA.obs <- which(Z%*%temp!=1) 129 | n2 <- length(not.NA.obs) 130 | y2 <- matrix(y[not.NA.obs],n2,1) 131 | Z2 <- Z[not.NA.obs,not.NA.gid] 132 | X3 <- cbind(X2[not.NA.obs,],Z2%*%Mi[not.NA.gid]) 133 | p <- ncol(X3) 134 | v1 <- 1 135 | v2 <- n2-p 136 | 137 | if (!P3D) { 138 | H2inv <- mixed.solve(y=y2,X=X3,Z=Z2,K=K2[not.NA.gid,not.NA.gid],return.Hinv=TRUE)$Hinv 139 | } else { 140 | H2inv <- Hinv[not.NA.obs,not.NA.obs] 141 | } 142 | 143 | W <- crossprod(X3,H2inv%*%X3) 144 | Winv <- try(solve(W),silent=TRUE) 145 | if (!inherits(Winv,what="try-error")) { 146 | beta <- Winv %*% crossprod(X3,H2inv%*%y2) 147 | resid <- y2 - X3 %*% beta 148 | s2 <- as.double(crossprod(resid,H2inv%*%resid))/v2 149 | CovBeta <- s2*Winv 150 | Fstat <- beta[p]^2/CovBeta[p,p] 151 | x <- v2/(v2+v1*Fstat) 152 | scores[i] <- -log10(pbeta(x,v2/2,v1/2)) 153 | } #if try 154 | } #MAF 155 | } #for i 156 | 157 | return(scores) 158 | } 
#end score.calc 159 | 160 | make.full <- function(X) { 161 | svd.X <- svd(X) 162 | r <- max(which(svd.X$d>1e-8)) 163 | return(as.matrix(svd.X$u[,1:r])) 164 | } 165 | 166 | 167 | n <- nrow(pheno) 168 | pheno.ix <- 2:ncol(pheno) 169 | names <- colnames(pheno) 170 | X <- matrix(1,n,1) 171 | if (!is.null(fixed)) { 172 | p <- length(fixed) 173 | for (i in 1:p) { 174 | xpos <- match(fixed[i],names) 175 | pheno.ix <- setdiff(pheno.ix,xpos) 176 | xx <- factor(pheno[,xpos]) 177 | if (length(unique(xx)) > 1) {X <- cbind(X,model.matrix(~x-1,data.frame(x=xx)))} 178 | } 179 | } 180 | 181 | geno <- geno[order(geno[,2],geno[,3]),] 182 | 183 | M <- t(geno[,-c(1:3)]) #first column is marker name, second is chrom, third is map position 184 | map <- geno[,1:3] 185 | m <- ncol(M) # number of markers 186 | 187 | geno.gid <- colnames(geno)[-c(1:3)] 188 | rownames(M) <- geno.gid 189 | 190 | if (is.null(K)) { 191 | K <- A.mat(M,shrink=FALSE) 192 | } 193 | 194 | if (n.PC > 0) { 195 | eig.vec <- eigen(K)$vectors 196 | } 197 | 198 | if (length(which(rownames(K)!=geno.gid))>0) { 199 | stop("Line names in K and genotype file do not match.") 200 | } 201 | 202 | n.phenos <- length(pheno.ix) 203 | all.scores <- matrix(0,m,n.phenos) 204 | trait.names <- colnames(pheno)[pheno.ix] 205 | colnames(all.scores) <- trait.names 206 | 207 | if (plot) { 208 | p1 <- floor(sqrt(n.phenos)) 209 | p2 <- ceiling(n.phenos/p1) 210 | par(mfrow=c(p1,p2)) 211 | } 212 | 213 | if (n.phenos==0) { 214 | stop("No phenotypes.") 215 | } 216 | for (i in 1:n.phenos) { 217 | print(paste("GWAS for trait:",trait.names[i])) 218 | y <- pheno[,pheno.ix[i]] 219 | not.miss <- which(!is.na(y)) 220 | y <- y[not.miss] 221 | n <- length(y) 222 | 223 | pheno.gid <- unique(pheno[not.miss,1]) 224 | n.gid <- length(pheno.gid) 225 | ix.pheno <- match(pheno.gid,geno.gid) 226 | miss.pheno.gid <- which(is.na(ix.pheno)) 227 | if (length(miss.pheno.gid)>0) { 228 | stop(paste("The following lines have phenotypes but no genotypes:",paste(unique(pheno.gid[miss.pheno.gid]),collapse=" "))) 229 | } 230 | 231 | Z <- matrix(0,n,length(pheno.gid)) 232 | Z[cbind(1:n,match(pheno[not.miss,1],pheno.gid))] <- 1 233 | K2 <- K[ix.pheno,ix.pheno] 234 | if (n.PC > 0) { 235 | X2 <- make.full(cbind(X[not.miss,],Z%*%eig.vec[ix.pheno,1:n.PC])) 236 | } else { 237 | X2 <- make.full(X[not.miss,]) 238 | } 239 | 240 | if (P3D) { 241 | Hinv <- mixed.solve(y,X=X2,Z=Z,K=K2,return.Hinv=TRUE)$Hinv 242 | print("Variance components estimated. Testing markers.") 243 | } 244 | 245 | if ((n.core > 1) & requireNamespace("parallel",quietly=TRUE)) { 246 | it <- split(1:m,factor(cut(1:m,n.core,labels=FALSE))) 247 | scores <- unlist(parallel::mclapply(it,function(markers){score.calc(M[ix.pheno,markers])},mc.cores=n.core)) 248 | } else { 249 | scores <- score.calc(M[ix.pheno,]) 250 | } 251 | if (plot) { 252 | qq(scores) 253 | title(main=trait.names[i]) 254 | } 255 | all.scores[,i] <- scores 256 | } 257 | 258 | if (plot) { 259 | if (length(grep("RStudio",names(dev.cur())))==0) { 260 | if (dev.cur()==dev.next()) { 261 | dev.new() 262 | } else { 263 | dev.set(dev.next()) 264 | } 265 | } 266 | par(mfrow=c(p1,p2)) 267 | for (j in 1:n.phenos) { 268 | manhattan(cbind(map,all.scores[,j])) 269 | title(main=trait.names[j]) 270 | } 271 | } 272 | 273 | return(data.frame(map,all.scores)) 274 | 275 | } #end function 276 | 277 | --------------------------------------------------------------------------------
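The package overview (rrBLUP-package.Rd) and mixed.solve.Rd describe two equivalent routes to genomic prediction: estimating marker effects by ridge regression (RR-BLUP, K = I) or estimating line effects with the additive relationship matrix from A.mat (G-BLUP). A minimal sketch of that equivalence, assuming simulated data in the style of the help-file examples (the seed, dimensions, and object names here are illustrative, not taken from the package):

library(rrBLUP)
## illustrative check: RR-BLUP marker effects and G-BLUP line effects give
## the same predicted genetic values, up to optimizer tolerance
set.seed(123)                                     # arbitrary seed
M <- matrix(sample(c(-1,1), 200*1000, replace=TRUE), 200, 1000)
rownames(M) <- 1:200
u <- rnorm(1000)                                  # simulated marker effects
g <- as.vector(M %*% u)                           # true genetic values
y <- g + rnorm(200, sd=sqrt(var(g)))              # phenotypes with h2 = 0.5
p <- colMeans(M + 1)/2                            # frequency of the "1" allele
W <- M + 1 - matrix(2*p, 200, 1000, byrow=TRUE)   # centered markers, as in A.mat
ans.RR <- mixed.solve(y, Z=W)                     # marker effects (K = I)
ans.G  <- mixed.solve(y, K=A.mat(M))              # line effects (G-BLUP)
cor(as.vector(W %*% ans.RR$u), ans.G$u)           # expected to be essentially 1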
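A.mat.Rd states that, with the normalization constant c = 2*sum(p_k*(1-p_k)), the mean of the diagonal elements of A equals 1 + f. A quick check with the lines simulated above: these simulated genotypes are fully homozygous (coded only -1 or 1), so f is approximately 1 and the mean diagonal should be near 2; for a non-inbred population in Hardy-Weinberg equilibrium it would be near 1.

mean(diag(A.mat(M)))     # expected to be close to 2 for these fully inbred lines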
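kin.blup.Rd gives the expected reliability of each genotypic-value BLUP as r2_i = 1 - PEV_i/(Vg*K_ii). A sketch of that calculation, reusing M and y from the sketch above (the column names "gid" and "y" follow the style of the help-file examples):

A <- A.mat(M)
dat <- data.frame(y=y, gid=rownames(M))
ans <- kin.blup(data=dat, geno="gid", pheno="y", K=A, PEV=TRUE)
r2 <- 1 - ans$PEV/(ans$Vg * diag(A))     # expected reliability of each BLUP
summary(r2)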
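In GWAS.R, score.calc converts each marker's F statistic to a p-value through the Beta distribution: x <- v2/(v2+v1*Fstat) followed by -log10(pbeta(x, v2/2, v1/2)). That is the standard identity P(F >= f) = pbeta(v2/(v2 + v1*f), v2/2, v1/2) for F ~ F(v1, v2); a two-line check with arbitrary values:

v1 <- 1; v2 <- 193; f <- 7.3                 # arbitrary degrees of freedom and F value
pf(f, v1, v2, lower.tail=FALSE)              # upper-tail F p-value
pbeta(v2/(v2 + v1*f), v2/2, v1/2)            # same number, as used in score.calc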