├── varComp.png ├── boxplot1.png ├── CV1_2_scheme.png ├── Accuracy_distn_CV1_multiEnv.png ├── Accuracy_distn_CV2_multiEnv.png ├── get_CV1_partitions.R ├── run_jobs_multi.sh ├── prepareData_multi.R ├── get_CV2_partitions.R ├── get_VarComps_multi.R ├── fitModels_multi.R ├── single_environment.md ├── README.md └── multi_environment.md /varComp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/varComp.png -------------------------------------------------------------------------------- /boxplot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/boxplot1.png -------------------------------------------------------------------------------- /CV1_2_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/CV1_2_scheme.png -------------------------------------------------------------------------------- /Accuracy_distn_CV1_multiEnv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/Accuracy_distn_CV1_multiEnv.png -------------------------------------------------------------------------------- /Accuracy_distn_CV2_multiEnv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/Accuracy_distn_CV2_multiEnv.png -------------------------------------------------------------------------------- /get_CV1_partitions.R: -------------------------------------------------------------------------------- 1 | setwd("/mnt/home/lopezcru/GS") 2 | rm(list=ls()) 3 | 4 | #========================================================= 5 | # User specifications 6 | #========================================================= 7 | # Number of replicates 8 | m <- 100 9 | 10 | # Percentage of the data assigned to Testing set 11 | percTST <- 0.3 12 | #========================================================= 13 | 14 | # Load data 15 | load("multiEnvironment/prepData_multi.RData") 16 | n <- nrow(Y) 17 | 18 | # Creation of seed for repeated randomizations 19 | set.seed(123) 20 | seeds <- round(seq(1E3,1E6,length=m)) 21 | 22 | nTST <- round(percTST*n) 23 | YNA <- vector("list",m) 24 | 25 | for(k in 1:m) 26 | { 27 | set.seed(seeds[k]) 28 | indexTST <- sample(1:n,size=nTST,replace=FALSE) 29 | YNA0 <- Y 30 | YNA0[indexTST,] <- NA 31 | YNA[[k]] <- YNA0 32 | } 33 | 34 | # Save YNA matrix 35 | save(YNA,file="multiEnvironment/YNA_CV1_multiEnv.RData") 36 | -------------------------------------------------------------------------------- /run_jobs_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Number of jobs in each block 3 | nb=10 4 | seq1=1 5 | seq2=1 6 | seq3=50 7 | 8 | # Create a 'wait' sentence between chunks 9 | waittext="wait " 10 | for i in $(seq 1 $nb) 11 | do 12 | waittext=("${waittext[*]}" "%$i") 13 | done 14 | 15 | # Create job_submit file 16 | cat > job_submit.sh <> job_submit.sh <> job_submit.sh <=n){ 34 | nRep <- floor(nNA/n) 35 | remain <- sample(1:n,nNA%%n,replace=FALSE) 36 | a0 <- sample(1:n,n,replace=FALSE) 37 | indexNA <- rep(a0,nRep) 38 | if(length(remain)>0){ 39 | a1 <- floor(length(indexNA)/nTST)*nTST 40 | a2 <- nNA - a1 - length(remain) 41 | bb <- 
sample(a0[!a0%in%remain],a2,replace=FALSE) 42 | noInIndexNA <- c(rep(a0,nRep-1),a0[!a0%in%bb]) 43 | indexNA <- c(noInIndexNA,bb,remain) 44 | } 45 | } 46 | indexEnv <- rep(1:nEnv,each=nTST) 47 | for(j in 1:nEnv) YNA0[indexNA[indexEnv==j],j] <- NA 48 | YNA[[k]] <- YNA0 49 | } 50 | 51 | # Save YNA matrix 52 | save(YNA,file="multiEnvironment/YNA_CV2_multiEnv.RData") 53 | -------------------------------------------------------------------------------- /get_VarComps_multi.R: -------------------------------------------------------------------------------- 1 | setwd("/mnt/home/lopezcru/GS") 2 | rm(list=ls()) 3 | library(BGLR) 4 | 5 | load("multiEnvironment/prepData_multi.RData") 6 | n <- nrow(Y) 7 | nEnv <- ncol(Y) 8 | y <- as.vector(Y) 9 | 10 | set.seed(123) 11 | 12 | # Matrix to store results. It will save variance components for each model 13 | outVAR <- matrix(NA,ncol=4,nrow=1+2*nEnv) 14 | dimnames(outVAR) <- list(c("Main",rep(paste0("Env ",colnames(Y)),2)),c("Single","Across","MxE","R-Norm")) 15 | 16 | # Number of iterations and burn-in for Bayesian models 17 | nIter <- 30000; burnIn <- 2000 18 | 19 | #-------------------------------------------------------- 20 | # 1. Single environment (within-environment) model 21 | #-------------------------------------------------------- 22 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 23 | for(env in 1:nEnv){ 24 | fm <-BGLR(y=Y[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 25 | outVAR[env+1,1] <- fm$ETA[[1]]$varU 26 | outVAR[env+4,1] <- fm$varE 27 | } 28 | 29 | #-------------------------------------------------------- 30 | # 2. Across-environments model 31 | #-------------------------------------------------------- 32 | ETA <- list(list(~envID-1,model="FIXED")) 33 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 34 | 35 | # Model Fitting 36 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 37 | outVAR[1,2] <- fm$ETA[[2]]$varU 38 | outVAR[(1:nEnv)+4,2] <- fm$varE 39 | 40 | #-------------------------------------------------------- 41 | # 3. MxE interaction model 42 | #-------------------------------------------------------- 43 | ETA <- list(list(~envID-1,model="FIXED")) 44 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 45 | 46 | # Adding interaction terms 47 | for(env in 1:nEnv){ 48 | eigen_G1 <- MxE_eigen[[env]] 49 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 50 | } 51 | 52 | # Model Fitting 53 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 54 | outVAR[1,3] <- fm$ETA[[2]]$varU 55 | for(env in 1:nEnv) outVAR[env+1,3] <- fm$ETA[[env+2]]$varU 56 | outVAR[(1:nEnv)+4,3] <- fm$varE 57 | 58 | #-------------------------------------------------------- 59 | # 4. 
Reaction-Norm model 60 | #-------------------------------------------------------- 61 | ETA <- list(list(~envID-1,model="FIXED")) 62 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 63 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 64 | 65 | # Model Fitting 66 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 67 | outVAR[1,4] <- fm$ETA[[2]]$varU 68 | outVAR[(1:nEnv)+1,4] <- fm$ETA[[3]]$varU 69 | outVAR[(1:nEnv)+4,4] <- fm$varE 70 | outVAR 71 | 72 | # Save results 73 | write.table(outVAR,file="multiEnvironment/varComps.csv",sep=",",row.names=F) 74 | -------------------------------------------------------------------------------- /fitModels_multi.R: -------------------------------------------------------------------------------- 1 | setwd("/mnt/home/lopezcru/GS") 2 | rm(list=ls()) 3 | library(BGLR) 4 | 5 | #========================================================= 6 | # User specifications 7 | #========================================================= 8 | # Choose one model. 1: single; 2:across; 3:MxE; 4:R-Norm 9 | mod <- 4 10 | 11 | # Type of CV. 1:CV1; 2:CV2 12 | CV <- 1 13 | 14 | # Partition number 15 | part <- 1 16 | #========================================================= 17 | 18 | # Read arguments passed from command line 19 | args=(commandArgs(TRUE)) 20 | if(length(args)==0){ 21 | cat('No args provided',"\n") 22 | }else{ 23 | for(i in 1:length(args)){ 24 | eval(parse(text=args[[i]])) 25 | } 26 | } 27 | 28 | # Load data 29 | load("multiEnvironment/prepData_multi.RData") 30 | load(paste0("multiEnvironment/YNA_CV",CV,"_multiEnv.RData")) 31 | n <- nrow(Y); nEnv <- ncol(Y) 32 | 33 | # Models 34 | models <- c("Single","Across","MxE","R-Norm") 35 | model <- models[mod] 36 | 37 | # Number of iterations and burn-in for Bayesian models 38 | nIter <- 30000; burnIn <- 2000 39 | 40 | YNA0 <- YNA[[part]] 41 | yNA <- as.vector(YNA0) 42 | 43 | #-------------------------------------------------------- 44 | # 1. Single environment (within-environment) model 45 | #-------------------------------------------------------- 46 | if(model=="Single") 47 | { 48 | YHat <- matrix(NA,nrow=nrow(Y),ncol=ncol(Y)) 49 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 50 | for(env in 1:nEnv){ 51 | fm <-BGLR(y=YNA0[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 52 | YHat[,env] <- fm$yHat 53 | } 54 | } 55 | 56 | #-------------------------------------------------------- 57 | # 2. Across-environments model 58 | #-------------------------------------------------------- 59 | if(model=="Across") 60 | { 61 | ETA <- list(list(~envID-1,model="FIXED")) 62 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 63 | 64 | # Model Fitting 65 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 66 | YHat <- matrix(fm$yHat,ncol=nEnv) 67 | } 68 | 69 | #-------------------------------------------------------- 70 | # 3. MxE interaction model 71 | #-------------------------------------------------------- 72 | if(model=="MxE") 73 | { 74 | ETA <- list(list(~envID-1,model="FIXED")) 75 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 76 | 77 | # Adding interaction terms 78 | for(env in 1:nEnv){ 79 | eigen_G1 <- MxE_eigen[[env]] 80 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 81 | } 82 | 83 | # Model Fitting 84 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 85 | YHat <- matrix(fm$yHat,ncol=nEnv) 86 | } 87 | 88 | #-------------------------------------------------------- 89 | # 4. 
Reaction-Norm model 90 | #-------------------------------------------------------- 91 | if(model=="R-Norm") 92 | { 93 | ETA <- list(list(~envID-1,model="FIXED")) 94 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 95 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 96 | 97 | # Model Fitting 98 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 99 | YHat <- matrix(fm$yHat,ncol=nEnv) 100 | } 101 | 102 | # Save results 103 | outfolder <- paste0("multiEnvironment/CV",CV,"/",model) 104 | if(!file.exists(outfolder)) dir.create(outfolder,recursive=T) 105 | save(YHat,file=paste0(outfolder,"/outPRED_multiEnv_partition_",part,".RData")) 106 | -------------------------------------------------------------------------------- /single_environment.md: -------------------------------------------------------------------------------- 1 | 2 | ## Model assessment 3 | ### Training-Testing random partitions 4 | The prediction power of the model will be assessed using the training-testing (TRN-TST) random partitions approach. Data is randomly splitted into training and testing sets. Model parameters are estimated in training set and model is tested in TST set. 5 | 6 | ## Data preparation 7 | ### Load data, generate G-matrix and create objects to store results 8 | ```r 9 | # Load libraries 10 | library(BGLR) 11 | library(rrBLUP) 12 | 13 | # Load data 14 | data(wheat) 15 | X <- wheat.X 16 | Y <- wheat.Y 17 | 18 | # Select a single environment. For instance, environment 2 19 | y <- Y[,2] 20 | 21 | n <- nrow(Y) 22 | p <- ncol(X) 23 | 24 | # Genomic relationship matrix 25 | M <- scale(X) 26 | G <- tcrossprod(M)/p 27 | 28 | # Design matrix for individuals. In this case is a diagonal since there are no replicates 29 | GID <- factor(rownames(Y),levels=rownames(Y)) 30 | Zg <- model.matrix(~GID-1) 31 | ``` 32 | 33 | ## Running models 34 | 35 | ### 1. Variance components estimation 36 | After running the 'data preparation' part, the code following code can be used to fit all models and to extract variance components and other parameters. 37 | 38 | ```r 39 | set.seed(123) 40 | 41 | # Matrix to store results. It will save the main parameters for each model 42 | outVAR <- matrix(NA,nrow=6,ncol=5) 43 | dimnames(outVAR) <- list(c("varU","varE","lambda","dfb","Sb","H2"),c("GBLUP1","GBLUP2","BRR","LASSO","BayesB")) 44 | 45 | # Number of iterations and burn-in for Bayesian models 46 | nIter <- 30000 47 | burnIn <- 5000 48 | 49 | # G-BLUP model using 'rrBLUP' package 50 | fm <- mixed.solve(y=y,Z=Zg,K=G) 51 | outVAR[1,1] <- fm$Vu 52 | outVAR[2,1] <- fm$Ve 53 | outVAR[6,1] <- fm$Vu/(fm$Vu+fm$Ve) # Heritability 54 | 55 | # G-BLUP model using 'BGLR' package. 
Model RKHS with K=G 56 | fm <- BGLR(y,ETA=list(list(K=G,model="RKHS")),nIter=nIter,burnIn=burnIn) 57 | outVAR[1,2] <- fm$ETA[[1]]$varU 58 | outVAR[2,2] <- fm$varE 59 | outVAR[6,2] <- outVAR[1,2]/(outVAR[1,2] + outVAR[2,2]) # Heritability 60 | 61 | # Bayesian Ridge Regression (BRR) using 'BGLR' package 62 | fm <- BGLR(y,ETA=list(list(X=M,model="BRR")),nIter=nIter,burnIn=burnIn) 63 | outVAR[1,3] <- fm$ETA[[1]]$varB*p # Multiply by p to obtain the right varU as in G-BLUP 64 | outVAR[2,3] <- fm$varE 65 | outVAR[6,3] <- outVAR[1,3]/(outVAR[1,3] + outVAR[2,3]) # Heritability 66 | 67 | # Bayesian LASSO model using 'BGLR' package 68 | fm <- BGLR(y,ETA=list(list(X=M,model="BL")),nIter=nIter,burnIn=burnIn) 69 | outVAR[2,4] <- fm$varE 70 | outVAR[3,4] <- fm$ETA[[1]]$lambda 71 | 72 | # Bayes B model using 'BGLR' package 73 | fm <- BGLR(y,ETA=list(list(X=M,model="BayesB")),nIter=nIter,burnIn=burnIn) 74 | outVAR[2,5] <- fm$varE 75 | outVAR[4,5] <- fm$ETA[[1]]$df0 76 | outVAR[5,5] <- fm$ETA[[1]]$S0 77 | 78 | print(outVAR) 79 | ``` 80 | 81 | #### Results 82 | 83 | | |GBLUP 1 | GBLUP 2 | BRR | LASSO | Bayes B | 84 | |-------|------|------|------|------|------| 85 | |![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_u) |0.468 |0.490|0.491| - | - | 86 | |![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%5Cvarepsilon) |0.574|0.576|0.575|0.589|0.572| 87 | |![](https://latex.codecogs.com/gif.latex?%5Clambda) | - | - | - |58.805| - | 88 | |![](https://latex.codecogs.com/gif.latex?df_%5Cbeta) | - | - | - | - | 5 | 89 | |![](https://latex.codecogs.com/gif.latex?S_%5Cbeta) | - | - | - | - |0.033| 90 | |![](https://latex.codecogs.com/gif.latex?H%5E2_g) |0.449|0.460|0.461| - | - | 91 | 92 | 1: using 'rrBLUP'; 2: using 'BGLR' package 93 | 94 | # 95 | ### 2. Replicates of partitions to obtain standard deviations of predictions 96 | After running the 'data preparation' part, the code below runs repeated partitions to obtain mean and standard deviations of accuracies for all models. All the models will be run using 'BGLR' package. 97 | 98 | ```r 99 | # Models 100 | models <- c("GBLUP","BRR","LASSO","BayesB") 101 | 102 | #================================================== 103 | # User specifications 104 | #================================================== 105 | # Choose one model. 1:GBLUP; 2:BRR; 3:LASSO; 4:BayesB 106 | mod <- 1 107 | 108 | # Percentage of the data assigned to Testing set 109 | percTST <- 0.3 110 | 111 | # Number of replicates 112 | m <- 10 113 | #================================================== 114 | 115 | # Creation of seed for repeated randomizations 116 | set.seed(123) 117 | seeds <- round(seq(1E3,1E6,length=m)) 118 | 119 | # Matrix to store results. 
It will save the corelation for each partition 120 | outCOR <- matrix(NA,nrow=m,ncol=1) 121 | colnames(outCOR) <- models[mod] 122 | 123 | # Number of iterations and burn-in for Bayesian models 124 | nIter <- 1200 125 | burnIn <- 200 126 | 127 | model <- models[mod] 128 | nTST <- round(percTST*n) 129 | 130 | for(k in 1:m) # Loop for the replicates 131 | { 132 | set.seed(seeds[k]) 133 | indexTST <- sample(1:n,size=nTST,replace=FALSE) 134 | yNA <- y 135 | yNA[indexTST] <- NA 136 | 137 | if(model=="GBLUP") ETA <- list(list(K=G,model="RKHS")) 138 | if(model=="BRR") ETA <- list(list(X=M,model="BRR")) 139 | if(model=="LASSO") ETA <- list(list(X=M,model="BL")) 140 | if(model=="BayesB") ETA <- list(list(X=M,model="BayesB")) 141 | 142 | fm <- BGLR(yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 143 | outCOR[k,1] <- cor(fm$yHat[indexTST],y[indexTST]) 144 | } 145 | 146 | # Save results 147 | save(outCOR,file=paste0("outCOR_",model,".RData")) 148 | ``` 149 | 150 | #### 2.1 Results 151 | The code below will retrieve results for all models fitted previously 152 | 153 | ```r 154 | models <- c("GBLUP","BRR","LASSO","BayesB") 155 | 156 | OUT <- c() 157 | for(mod in seq_along(models)) 158 | { 159 | filename <- paste0("outCOR_",models[mod],".RData") 160 | if(file.exists(filename)){ 161 | load(filename,verbose=T) 162 | OUT <- cbind(OUT,outCOR) 163 | } 164 | } 165 | 166 | round(rbind(Mean=apply(OUT,2,mean),SD=apply(OUT,2,sd)),4) 167 | boxplot(outCOR,ylab="Accuracy",xlab="Model") 168 | ``` 169 | 170 | | |GBLUP | BRR | LASSO | Bayes B | 171 | |-------|-------|--------|------|-------| 172 | |Mean | 0.475 | 0.476 | 0.474 | 0.465 | 173 | |SD | 0.049 | 0.050 | 0.049 | 0.048 | 174 | 175 | Boxplot of distribution of the accuracies by model 176 | 177 | ![](https://github.com/MarcooLopez/Genomic-Selection/blob/master/boxplot1.png) 178 | 179 | # 180 | * **[back](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/README.md)** 181 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Genomic Selection Demo 2 | 3 | The standard genetic model assumes that phenotype is the sum of a genetic component and a non-genetic component (residual), ![](https://latex.codecogs.com/gif.latex?y_i%3Dg_i+%5Cvarepsilon_i). Genomic Selection uses genetic markers covering the whole genome and potentially explaining all the genetic variance. These markers are asumed to be in Linkage Disequilibrium (LD) with the QTL thus models including all markers can estimate breeding values ![](https://latex.codecogs.com/gif.latex?g_i) as combinatons of these QTL's. 4 | 5 | ## Model 6 | Response variable *y* for the *i*-th individual (*i=1,...,n*) is regressed on a function of *p* marker genotypes ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bx%7D_i%3D%5Bx_%7Bi1%7D%2C...%2Cx_%7Bip%7D%5D%27) that seeks to aproximate to the true genetic value of the individual, this is 7 | 8 |
y_i = f(**x**_i) + ε_i
9 | 10 | 11 | 12 | where function ![](https://latex.codecogs.com/gif.latex?f%28%5Ctextbf%7Bx%7D_i%29) can be a parametric or non-parametric and ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cvarepsilon%7D%3D%5B%5Cvarepsilon_1%2C...%2C%5Cvarepsilon_n%5D%27) are the residuals which are usually assumed to be distributed Normal with constant variance ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cvarepsilon%7D%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%5Cvarepsilon%5Ctextbf%7BI%7D%29). 13 | 14 | ### Parametric regression 15 | The genotypic value of an individual is estimated using a **linear model** in which a linear combination of the marker genotypes are used, that is 16 |
y_i = μ + x_i1 β_1 + x_i2 β_2 + ... + x_ip β_p + ε_i
17 | 18 | 19 | 20 | where ![](https://latex.codecogs.com/gif.latex?%5Cmu) is the intercept, ![](https://latex.codecogs.com/gif.latex?x_%7Bij%7D) is the genotype of the *i*-th individual at the *j*-th marker, ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bj%7D) is the corresponding marker effect. 21 | 22 | Model above presents some estimation difficulties when *p* is much bigger than *n* so penalization ans regularization aproaches are used to overcome this problem. Penalization and regularization solutions can be seen as posterior solutions in the Bayesian context. 23 | 24 | #### 1. Bayesian Ridge Regression (BRR). 25 | Is a penalization regression that assumes that the regression coefficients follow independently a Gaussian (Normal) prior distribution, this is ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bj%7D%5Csim%20N%280%2C%5Csigma%5E2_%5Cbeta%29). 26 | This prior induces shrinkage of estimates toward zero. 27 | 28 | #### 2. Bayesian LASSO. 29 | It assumes that the regression coefficients have a prior distribution double-exponential (*DE*, or Laplace) with parameters ![](https://latex.codecogs.com/gif.latex?%5Clambda%5E2) and ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%5Cvarepsilon). This prior is a thick-tailed prior that can be represented as a infinite mixture of normal densities scaled by exponential (![](https://latex.codecogs.com/gif.latex?Exp)) densities, this is 30 | 31 |
β_j | σ²_ε, τ²_j ~ N(0, τ²_j σ²_ε),   with   τ²_j ~ Exp(λ²/2)
32 | 33 | 34 | 35 | #### 3. Bayes A 36 | The regression effects are assumed another thick-tailed prior, a scaled *t* distribution with degree of freedom ![](https://latex.codecogs.com/gif.latex?df_%5Cbeta) and scale ![](https://latex.codecogs.com/gif.latex?S_%5Cbeta) parameters. Similar as for doble-exponential, the scaled *t* distribution is represented as mixture of normal densities scaled with a scaled-inverse Chi-squared (![](https://latex.codecogs.com/gif.latex?%5Cchi%5E%7B-1%7D)) density, this is 37 | 38 |
β_j | σ²_{β_j} ~ N(0, σ²_{β_j}),   with   σ²_{β_j} ~ χ⁻²(df_β, S_β)
39 | 40 | 41 | 42 | #### 4. Bayes B 43 | Markers effects are asummed to be equal to zero with probability ![](https://latex.codecogs.com/gif.latex?%5Cpi) and with probability 1-![](https://latex.codecogs.com/gif.latex?%5Cpi) are assumed to follow a scaled *t* distribution as in Bayes A model. 44 | 45 | #### 5. Bayes C 46 | Similar to Bayes B, markers effects are asummed to be equal to zero with probability ![](https://latex.codecogs.com/gif.latex?%5Cpi) and with probability 1-![](https://latex.codecogs.com/gif.latex?%5Cpi) are assumed to follow a Gaussian distribution as in BRR model. 47 | 48 | #### 6. G-BLUP model (RR-BLUP) 49 | The response is modeled as ![](https://latex.codecogs.com/gif.latex?y_i%3D%5Cmu+g_i+%5Cvarepsilon_i) and its solution is equivalent to that of the BRR model arised when in the model above we make the sustitution 50 |
g_i = x_i1 β_1 + x_i2 β_2 + ... + x_ip β_p
51 | 52 | 53 | 54 | It can be shown that the random vector ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D%3D%5Bg_1%2C...%2Cg_n%5D%27) follows a Normal distribution ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_g%5Ctextbf%7BG%7D%29), where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D%3D%5Ctextbf%7BX%7D%5Ctextbf%7BX%7D%27/p) with ***X*** is the matrix of centered and standardized marker genotypes and it is called genomic relationship matrix. 55 | 56 | 57 | ### Semi-parametric regression 58 | 59 | #### 7. RKHS regression. 60 | The genomic function ![](https://latex.codecogs.com/gif.latex?f%28%5Ctextbf%7Bx%7D_i%29) is expressed as a linear combination of some positive semi-definite basis functions called Reproducing Kernels (RK), ![](https://latex.codecogs.com/gif.latex?K%28%5Ctextbf%7Bx%7D_i%2C%5Ctextbf%7Bx%7D_%7Bi%27%7D%29), as follows 61 |
f(**x**_i) = α_1 K(**x**_i,**x**_1) + α_2 K(**x**_i,**x**_2) + ... + α_n K(**x**_i,**x**_n)
62 | 63 | 64 | 65 | This model can be rewritten as ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7By%7D%3D%5Ctextbf%7BK%7D%5Cboldsymbol%7B%5Calpha%7D%20+%5Cboldsymbol%7B%5Cvarepsilon%7D) where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BK%7D%3D%5C%7BK%28%5Ctextbf%7Bx%7D_i%2C%5Ctextbf%7Bx%7D_%7Bi%27%7D%29%5C%7D) is a ![](https://latex.codecogs.com/gif.latex?n%5Ctimes%20n) matrix containing all the evaluations of the RK function at the point (*i*,*i'*) and ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Calpha%7D%20%3D%5B%5Calpha_1%2C...%2C%5Calpha_n%5D%27). 66 | 67 | This problem can be solved in a Bayesian fashion by assuming a prior ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Calpha%7D%20%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%5Calpha%5Ctextbf%7BK%7D%5E%7B-1%7D%29). 68 | 69 | **Note:** The Ridge Regression (and consequently, G-BLUP) can be represented as a RKHS model by setting **K**=**G**. 70 | 71 | 72 | ## Implementation of models 73 | Models previously above described will be implemented in R software using R-packages 'BGLR' and 'rrBLUP'. Using public data, it will be shown how to run the models for the single-environment case and then how to perform a multi-environment analysis with the G-BLUP model using a marker-by-environment (MxE) and a Reaction Norm approaches that account for GxE interaction. 74 | 75 | ### Data 76 | Data from CIMMYT’s Global Wheat Program. Lines were evaluated for grain yield (each entry corresponds to an average of two plot records) at four different environments; phenotypes (*wheat.Y* object) were centered and standardized to a unit variance within environment. Each of the lines were genotyped for 1279 diversity array technology (DArT) markers. At each marker two homozygous genotypes were possible and these were coded as 0/1. Marker genotypes are given in the object *wheat.X*. Finally a matrix *wheat.A* provides the pedigree relationships between lines computed from the pedigree records. 77 | Data is available for download in the R-package 'BGLR'. 78 | 79 | ### R-packages installation 80 | ```r 81 | if(!"BGLR"%in%rownames(installed.packages())) install.packages("BGLR") 82 | if(!"rrBLUP"%in%rownames(installed.packages())) install.packages("rrBLUP") 83 | library(BGLR) 84 | library(rrBLUP) 85 | ``` 86 | 87 | ### Download data 88 | ```r 89 | data(wheat) 90 | X <- wheat.X 91 | Y <- wheat.Y 92 | A <- wheat.A 93 | 94 | # Visualize data 95 | head(Y) 96 | X[1:10,1:5] 97 | ``` 98 | 99 | ### Type of analyses 100 | * **[Single-environment](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/single_environment.md)** 101 | * **[Multi-environment](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/multi_environment.md)** 102 | 103 | ## 104 | 105 | # References 106 | * de los Campos, G., Gianola, D., Rosa, G. J. M., Weigel, K. A., & Crossa, J. (2010). **Semi-parametric genomic-enabled prediction of genetic values using reproducing kernel Hilbert spaces methods**. Genetics Research, 92(4), 295–308. 107 | * de los Campos, G., Hickey, J. M., Pong-Wong, R., Daetwyler, H. D., & Calus, M. P. L. (2013). **Whole-genome regression and prediction methods applied to plant and animal breeding**. Genetics, 193(2), 327–345. 108 | * Endelman, J. B. (2011). **Ridge Regression and Other Kernels for Genomic Selection with R Package rrBLUP**. The Plant Genome Journal, 4(3), 250–255. 109 | * Habier, D., Fernando, R. L., Kizilkaya, K., & Garrick, D. J. (2011). **Extension of the bayesian alphabet for genomic selection**. 
BMC Bioinformatics, 12(186), 1-12. 110 | * Jarquín, D., Crossa, J., Lacaze, X., Du Cheyron, P., Daucourt, J., Lorgeou, J., … de los Campos, G. (2014). **A reaction norm model for genomic selection using high-dimensional genomic and environmental data**. Theoretical and Applied Genetics, 127(3), 595–607. 111 | * Lopez-Cruz, M., Crossa, J., Bonnett, D., Dreisigacker, S., Poland, J., Jannink, J.-L., … de los Campos, G. (2015). **Increased prediction accuracy in wheat breeding trials using a marker × environment interaction genomic selection model**. G3: Genes, Genomes, Genetics, 5(4), 569–582. 112 | * Meuwissen, T. H. E., Hayes, B. J., & Goddard, M. E. (2001). **Prediction of total genetic value using genome-wide dense marker maps**. Genetics, 157(4), 1819–1829. 113 | * Park, T., & Casella, G. (2008). **The Bayesian Lasso**. Journal of the American Statistical Association, 103(482), 681–686. 114 | * Perez, P., & de los Campos, G. (2014). **Genome-wide regression and prediction with the BGLR statistical package**. Genetics, 198(2), 483–495. 115 | * R Development Core Team. (2015). **R: A Language and Environment for Statistical Computing**. Vienna, Austria: R Foundation for Statistical Computing. 116 | 117 | 118 | -------------------------------------------------------------------------------- /multi_environment.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-environment models 3 | 4 | * **Single-environment model** 5 | 6 | This model is obtained by regressing the phenotype vector containing the *n* records available in the *k*th environment (*k=1,2,...,s* environments), ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7By%7D_k%3D%5By_%7B1k%7D%2C...%2Cy_%7Bnk%7D%5D%27), where *i=1,2,...,n* indexes lines (individuals), on *p* markers using a linear model in the form 7 | 8 |

9 | **y**_k = **1**μ_k + **X**_k **β**_k + **ε**_k

11 | 12 | where ![](https://latex.codecogs.com/gif.latex?%5Cmu_k) is the intercept, ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7B1%7D%3D%5B1%2C...%2C1%5D%27) is a *n*-vector of ones, ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BX%7D_k%3D%5C%7Bx_%7Bijk%7D%5C%7D) is the ![](https://latex.codecogs.com/gif.latex?n%5Ctimes%20p) matrix of centered and standardized markers available in the *k*th environment, ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cbeta%7D_k%3D%5B%5Cbeta_%7B1k%7D%2C...%2C%5Cbeta_%7Bpk%7D%5D%27) is a *p*-vector of marker effects, and ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cvarepsilon%7D_k%3D%5B%5Cvarepsilon_%7Bik%7D%2C...%2C%5Cvarepsilon_%7Bnk%7D%5D%27) is the *n*-vector of residuals. 13 | 14 | This model can be represented as a G-BLUP model by setting ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%3D%5Ctextbf%7BX%7D_k%5Cboldsymbol%7B%5Cbeta%7D_k), this is: 15 | 16 |

17 | **y**_k = **1**μ_k + **g**_k + **ε**_k

19 | 20 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%3D%5Bg_%7B1k%7D%2C...%2Cg_%7Bnk%7D%5D%27) is a random effect assumed ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%7Bg_k%7D%5Ctextbf%7BG%7D_k%29) with the genomic relationship matrix estimated as ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_k%3D%5Ctextbf%7BX%7D_k%5Ctextbf%7BX%7D_k%27/p). 21 | 22 | * **Across-environments model** 23 | 24 | This model assumes that effects of markers are the same across environments, this is ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cbeta%7D_1%3D%5Cboldsymbol%7B%5Cbeta%7D_2%3D...%3D%5Cboldsymbol%7B%5Cbeta%7D_s%3D%5Cboldsymbol%7B%5Cbeta%7D). The model above can be simultaneously fitted for all environments as (assume *s=3* environments) 25 | 26 |

27 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + [**X**_1; **X**_2; **X**_3] **β** + [**ε**_1; **ε**_2; **ε**_3]

29 | 30 | Similarly, it can be represented also as a G-BLUP model, by making ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%3D%5Ctextbf%7BX%7D_k%5Cboldsymbol%7B%5Cbeta%7D), as: 31 | 32 |

33 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + **g**_0 + [**ε**_1; **ε**_2; **ε**_3]

35 | 36 | where the random effect ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_0%3D%5B%5Ctextbf%7Bg%7D%27_1%2C%5Ctextbf%7Bg%7D%27_2%2C%5Ctextbf%7Bg%7D%27_3%5D%27) is assumed ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_0%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%7Bg_0%7D%5Ctextbf%7BG%7D_0%29) with ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_0) being the marker-derived genomic relationship calculated as 37 | 38 |

39 | **G**_0 = **Z**_g **G** **Z**′_g = [**G** **G** **G**; **G** **G** **G**; **G** **G** **G**], where **Z**_g is the incidence matrix connecting records with lines

41 | 42 | * **MxE model** 43 | 44 | It models GxE interaction using a marker-by-environment (MxE) approach in which the effect of the *j*th marker on the *k*th environment, ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bjk%7D), is descomposed into an effect that is common to all environments (*b*j0) and an effect that is specific to each environment (*b*jk), this is ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bjk%7D%3Db_%7Bj0%7D+b_%7Bjk%7D). Thus, the multi-environmental model is 45 | 46 |

47 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + [**X**_1; **X**_2; **X**_3] **b**_0 + [**X**_1**b**_1; **X**_2**b**_2; **X**_3**b**_3] + [**ε**_1; **ε**_2; **ε**_3]

49 | 50 | Likewise, by making ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_%7B0,k%7D%3D%5Ctextbf%7BX%7D_k%5Ctextbf%7Bb%7D_0) and ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_%7B1,k%7D%3D%5Ctextbf%7BX%7D_k%5Ctextbf%7Bb%7D_k), the G-BLUP representation is 51 | 52 |

53 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + **g**_0 + **g**_1 + [**ε**_1; **ε**_2; **ε**_3]

55 | 56 | where the random effects **g**_0 = [**g**_{0,1}; **g**_{0,2}; **g**_{0,3}] and **g**_1 = [**g**_{1,1}; **g**_{1,2}; **g**_{1,3}]

58 | 59 | 60 | 61 | are assumed ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_0%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%7Bg_0%7D%5Ctextbf%7BG%7D_0%29) and ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_1%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Ctextbf%7BG%7D_1%29), respectively. Here, ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_0) is described previously and 62 | 63 |
**G**_1 = [σ²_{g_1}**G** **0** **0**; **0** σ²_{g_2}**G** **0**; **0** **0** σ²_{g_3}**G**] (block-diagonal, with an environment-specific variance for each block)
64 | 65 | 66 | 67 | 68 | * **Reaction Norm model** 69 | 70 | The G-BLUP model presented in the across-environment approach can be extended to incorporate GxE by introducing covariance structures as a funcion of the marker information. This model, assumes the environment (*E*k) as a random effect as ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BE%7D%3D%5BE_1%2C...%2CE_s%5D%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_e%5Ctextbf%7BI%7D%29). These random effects are conected with individuals through the design matrix **Z**e. Thus, the main effect of environments, **e** = **Z**e**E**, is 71 | 72 | 73 | 74 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Be%7D_k%3D%5Be_%7B1k%7D%2C...%2Ce_%7Bnk%7D%5D%27) are the effects of the environment *k* for the *i*th individual (*i=1,...,n*). The reaction norm model, incorporates GxE by introducing the interaction terms ![](https://latex.codecogs.com/gif.latex?g_%7Bik%7De_%7Bik%7D), as 75 | 76 |

77 | [**y**_1; **y**_2; **y**_3] = **1**μ + [**e**_1; **e**_2; **e**_3] + [**g**_1; **g**_2; **g**_3] + [**g**_1∘**e**_1; **g**_2∘**e**_2; **g**_3∘**e**_3] + [**ε**_1; **ε**_2; **ε**_3]

79 | 80 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%5Ctextbf%7Be%7D_k) is the interaction term between the random effects of markers and environments. It can be proved that, approximately, 81 |
Cov([**g**_1∘**e**_1; **g**_2∘**e**_2; **g**_3∘**e**_3]) ≈ σ²_{ge} [**G**_0 ∘ (**Z**_E**Z**′_E)]
83 | 84 | 85 | 86 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_0) was described above and ![](https://latex.codecogs.com/gif.latex?%5Ccirc) denotes the element by element product (known as Hadamard product). 87 | 88 | **Note**. The Hadamard product above will yield a matrix similar to ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_1) above described but with ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%7Bg_1%7D%3D%5Csigma%5E2_%7Bg_2%7D%3D%5Csigma%5E2_%7Bg_3%7D%3D1). This is, reaction norm model estimates environment-specific effects with constant variance ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%7Bge%7D) in contrast wth MxE model that accomodates environment-specific variance, ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%7Bg_k%7D), *k=1,...,s*. 89 | 90 | ## Model assessment 91 | Performance of the *Reaction Norm* and *MxE model* will be compared with that of the *across-environments* model that ignores GxE modeling and the *single-environment* model which is fitted within each environment. 92 | 93 | ### Training-Testing random partitions. 94 | The prediction power of the model will be assessed using the training-testing (TRN-TST) random partitions approach. 95 | Data is randomly splitted into training and testing sets. Model parameters are estimated in training set and model is tested in TST set. Two main estimations problems are addressed using the multi-environments models. 96 | 97 | * **Cross Validation 1 (CV1)**. Represent a scheme of prediction of lines that have not been evaluated in any field 98 | trials. 99 | 100 | * **Cross Validation 2 (CV2)**. Represent a scheme of prediction of lines that have been evaluated in some but all target environments. Thus, prediction of non-evaluated lines benefits from borrowing of information from lines that were evaluated in other environments. 101 | 102 |
![](https://github.com/MarcooLopez/Genomic-Selection/blob/master/CV1_2_scheme.png)
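As a rough illustration of the two schemes (a minimal sketch, not taken from the repository scripts; the toy matrix `Y_toy` and object names are made up for this example), the snippet below masks a small phenotype matrix: under CV1 the selected lines are set to `NA` in every environment, whereas under CV2 each selected line is missing in a single environment only.

```r
# Minimal sketch of the CV1 and CV2 masking schemes on a toy 6 x 3 phenotype matrix
set.seed(1)
Y_toy <- matrix(rnorm(18), nrow=6, ncol=3,
                dimnames=list(paste0("line",1:6), paste0("env",1:3)))

# CV1: the same lines are unobserved in ALL environments
tst <- sample(1:6, size=2)
Y_cv1 <- Y_toy
Y_cv1[tst,] <- NA

# CV2: a line is unobserved in ONE environment but observed in the others
Y_cv2 <- Y_toy
Y_cv2[cbind(sample(1:6, size=3), 1:3)] <- NA

Y_cv1
Y_cv2
```

The full-scale partitions used in the analyses are generated by the scripts below.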
103 | 104 | 105 | 106 | In our case, we will use 70% of the data for training set and the remaining 30% for the testing set. 107 | For CV1, we will create a scheme in which 30% of the lines are missing in all environments. 108 | CV2 scheme is created by having 30% of the entries missing in one environment but present in all the rest of environments. 109 | 110 | This procedure of TRN-TST can be repeated many times to allow for estimation of standard errors (SE). 111 | 112 | ## Data preparation 113 | ### Load data, generate G-matrix 114 | The following R code, [prepareData_multi.R](https://github.com/MarcooLopez/Genomic-Selection/blob/master/prepareData_multi.R), can be used to prepare the data for analizes of the multi-environmental models. 115 | 116 | ```r 117 | setwd("/mnt/home/lopezcru/GS") 118 | rm(list=ls()) 119 | library(BGLR) 120 | 121 | # Load data 122 | data(wheat) 123 | X <- wheat.X 124 | Y <- wheat.Y 125 | 126 | # Select environments. For instance, environments 2,4, and 5 127 | Y <- Y[,c(2,3,4)] 128 | 129 | # Genomic relationship matrix 130 | M <- scale(X) 131 | G <- tcrossprod(M)/ncol(X) 132 | 133 | # Design matrix for individuals. It connects individuals with environments 134 | GID <- factor(rep(rownames(Y),ncol(Y)),levels=rownames(Y)) 135 | Zg <- model.matrix(~GID-1) 136 | 137 | # Design matrix for environments. Used in the multi-environment R-Norm model 138 | envID <- factor(rep(colnames(Y),each=nrow(Y)),levels=colnames(Y)) 139 | ZE <- model.matrix(~envID-1) 140 | 141 | # Covariance structure for effects 142 | ZgGZgt <- Zg%*%G%*%t(Zg) # Genetic effect 143 | ZEZEt <- tcrossprod(ZE) # Environmental effect 144 | GE <- ZgGZgt*ZEZEt # GxE interaction term (R-Norm model) 145 | 146 | # Eigen decomposition (to speed computational time) 147 | eigen_G <- eigen(G) 148 | eigen_G0 <- eigen(ZgGZgt) 149 | eigen_GE <- eigen(GE) 150 | 151 | # Interaction terms (MxE model) 152 | MxE_eigen <- vector("list",ncol(Y)) 153 | for(env in 1:ncol(Y)){ 154 | tmp <- rep(0,ncol(Y)) ; tmp[env] <- 1; G1 <- kronecker(diag(tmp),G) 155 | MxE_eigen[[env]] <- eigen(G1) 156 | } 157 | 158 | # Save prepared data 159 | dir.create("multiEnvironment") 160 | save(Y,envID,eigen_G,eigen_G0,eigen_GE,MxE_eigen,file="multiEnvironment/prepData_multi.RData") 161 | ``` 162 | 163 | ## Running models 164 | 165 | ### 1. Variance components estimation 166 | Code below, [get_VarComps_multi.R](https://github.com/MarcooLopez/Genomic-Selection/blob/master/get_VarComps_multi.R) script, can be used after 'data preparation' part to fit all the models and to extract variance components. 167 | 168 | ```r 169 | setwd("/mnt/home/lopezcru/GS") 170 | rm(list=ls()) 171 | library(BGLR) 172 | 173 | load("multiEnvironment/prepData_multi.RData") 174 | n <- nrow(Y) 175 | nEnv <- ncol(Y) 176 | y <- as.vector(Y) 177 | 178 | set.seed(123) 179 | 180 | # Matrix to store results. It will save variance components for each model 181 | outVAR <- matrix(NA,ncol=4,nrow=1+2*nEnv) 182 | dimnames(outVAR) <- list(c("Main",rep(paste0("Env ",colnames(Y)),2)),c("Single","Across","MxE","R-Norm")) 183 | 184 | # Number of iterations and burn-in for Bayesian models 185 | nIter <- 30000; burnIn <- 2000 186 | 187 | #-------------------------------------------------------- 188 | # 1. 
Single environment (within-environment) model 189 | #-------------------------------------------------------- 190 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 191 | for(env in 1:nEnv){ 192 | fm <-BGLR(y=Y[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 193 | outVAR[env+1,1] <- fm$ETA[[1]]$varU 194 | outVAR[env+4,1] <- fm$varE 195 | } 196 | 197 | #-------------------------------------------------------- 198 | # 2. Across-environments model 199 | #-------------------------------------------------------- 200 | ETA <- list(list(~envID-1,model="FIXED")) 201 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 202 | 203 | # Model Fitting 204 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 205 | outVAR[1,2] <- fm$ETA[[2]]$varU 206 | outVAR[(1:nEnv)+4,2] <- fm$varE 207 | 208 | #-------------------------------------------------------- 209 | # 3. MxE interaction model 210 | #-------------------------------------------------------- 211 | ETA <- list(list(~envID-1,model="FIXED")) 212 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 213 | 214 | # Adding interaction terms 215 | for(env in 1:nEnv){ 216 | eigen_G1 <- MxE_eigen[[env]] 217 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 218 | } 219 | 220 | # Model Fitting 221 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 222 | outVAR[1,3] <- fm$ETA[[2]]$varU 223 | for(env in 1:nEnv) outVAR[env+1,3] <- fm$ETA[[env+2]]$varU 224 | outVAR[(1:nEnv)+4,3] <- fm$varE 225 | 226 | #-------------------------------------------------------- 227 | # 4. Reaction-Norm model 228 | #-------------------------------------------------------- 229 | ETA <- list(list(~envID-1,model="FIXED")) 230 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 231 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 232 | 233 | # Model Fitting 234 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 235 | outVAR[1,4] <- fm$ETA[[2]]$varU 236 | outVAR[(1:nEnv)+1,4] <- fm$ETA[[3]]$varU 237 | outVAR[(1:nEnv)+4,4] <- fm$varE 238 | outVAR 239 | 240 | # Save results 241 | write.table(outVAR,file="multiEnvironment/varComps.csv",sep=",",row.names=F) 242 | ``` 243 | 244 | #### Results 245 | The following table is the output of the code above for `nIter=30000` and `burnIn=2000`. 246 | 247 | 248 | ## 249 | ### 2. Replicates of partitions to obtain standard deviations of predictions 250 | Using a GBLUP approach, the prediction power of the multi-environment models (MxE and Reaction Norm) will be compared with that that ignores GxE (across-environment) and with the GBLUP model fitted within environment. 251 | 252 | #### 2.1. Training-Testing partitions 253 | After running the 'data preparation' part, it can be chosen either to perform CV1 or CV2 aproaches 254 | 255 | * **Cross Validation 1 (CV1)** 256 | 257 | Code below will generate a matrix YNA containing "NA" values for the entries corresponding to the TST set mimicing the CV1 prediction problem. 
It generates a 'list' with 'm' matrices containing the TRN-TST partitions 258 | 259 | ```r 260 | setwd("/mnt/home/lopezcru/GS") 261 | rm(list=ls()) 262 | 263 | #========================================================= 264 | # User specifications 265 | #========================================================= 266 | # Number of replicates 267 | m <- 100 268 | 269 | # Percentage of the data assigned to Testing set 270 | percTST <- 0.3 271 | #========================================================= 272 | 273 | # Load data 274 | load("multiEnvironment/prepData_multi.RData") 275 | n <- nrow(Y) 276 | 277 | # Creation of seed for repeated randomizations 278 | set.seed(123) 279 | seeds <- round(seq(1E3,1E6,length=m)) 280 | 281 | nTST <- round(percTST*n) 282 | YNA <- vector("list",m) 283 | 284 | for(k in 1:m) 285 | { 286 | set.seed(seeds[k]) 287 | indexTST <- sample(1:n,size=nTST,replace=FALSE) 288 | YNA0 <- Y 289 | YNA0[indexTST,] <- NA 290 | YNA[[k]] <- YNA0 291 | } 292 | 293 | # Save YNA matrix 294 | save(YNA,file="multiEnvironment/YNA_CV1_multiEnv.RData") 295 | ``` 296 | 297 | * **Cross Validation 2 (CV2)** 298 | 299 | Code below will generate a matrix YNA containing "NA" values for the entries corresponding to the TST set mimicing the CV2 prediction problem. It generates a 'list' with 'm' matrices containing the TRN-TST partitions 300 | 301 | ```r 302 | setwd("/mnt/home/lopezcru/GS") 303 | rm(list=ls()) 304 | 305 | #========================================================= 306 | # User specifications 307 | #========================================================= 308 | # Number of replicates 309 | m <- 100 310 | 311 | # Percentage of the data assigned to Testing set 312 | percTST <- 0.3 313 | #========================================================= 314 | 315 | # Load data 316 | load("multiEnvironment/prepData_multi.RData") 317 | n <- nrow(Y) 318 | nEnv <- ncol(Y) 319 | 320 | # Creation of seed for repeated randomizations 321 | set.seed(123) 322 | seeds <- round(seq(1E3,1E6,length=m)) 323 | 324 | nTST <- round(percTST*n) 325 | YNA <- vector("list",m) 326 | nNA <- nEnv*nTST 327 | 328 | for(k in 1:m) 329 | { 330 | set.seed(seeds[k]) 331 | YNA0 <- Y 332 | 333 | if(nNA=n){ 335 | nRep <- floor(nNA/n) 336 | remain <- sample(1:n,nNA%%n,replace=FALSE) 337 | a0 <- sample(1:n,n,replace=FALSE) 338 | indexNA <- rep(a0,nRep) 339 | if(length(remain)>0){ 340 | a1 <- floor(length(indexNA)/nTST)*nTST 341 | a2 <- nNA - a1 - length(remain) 342 | bb <- sample(a0[!a0%in%remain],a2,replace=FALSE) 343 | noInIndexNA <- c(rep(a0,nRep-1),a0[!a0%in%bb]) 344 | indexNA <- c(noInIndexNA,bb,remain) 345 | } 346 | } 347 | indexEnv <- rep(1:nEnv,each=nTST) 348 | for(j in 1:nEnv) YNA0[indexNA[indexEnv==j],j] <- NA 349 | YNA[[k]] <- YNA0 350 | } 351 | 352 | # Save YNA matrix 353 | save(YNA,file="multiEnvironment/YNA_CV2_multiEnv.RData") 354 | ``` 355 | 356 | After running the code to generate partitions for either CV1 or CV2 scenarios, the following script ([fitModels_multi.R](https://github.com/MarcooLopez/Genomic-Selection/blob/master/fitModels_multi.R)) can be run to fit the models repeatealy for all partitions. In all multi-environment models, main effect of 'environment' will be regarded as fixed effect. 357 | 358 | The code runs a single partition for each model either for CV1 or CV2. These specifications need to be passed in variables `mod`, `CV`, and `part`. 
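As a quick check of how these settings reach the script, the sketch below is an assumption about usage (not part of the repository): if a job is launched with something like `R CMD BATCH '--args mod=3 CV=1 part=7' fitModels_multi.R`, then `commandArgs(TRUE)` returns the strings shown here, and each one is evaluated so that it overwrites the defaults set in the 'User specifications' block.

```r
# Hypothetical example of the argument handling used in fitModels_multi.R
args <- c("mod=3", "CV=1", "part=7")   # what commandArgs(TRUE) would return for this call
for(i in seq_along(args)) eval(parse(text=args[[i]]))
c(mod=mod, CV=CV, part=part)           # mod=3, CV=1, part=7
```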
359 | 360 | ```r 361 | setwd("/mnt/home/lopezcru/GS") 362 | rm(list=ls()) 363 | library(BGLR) 364 | 365 | #========================================================= 366 | # User specifications 367 | #========================================================= 368 | # Choose one model. 1: single; 2:across; 3:MxE; 4:R-Norm 369 | mod <- 4 370 | 371 | # Type of CV. 1:CV1; 2:CV2 372 | CV <- 1 373 | 374 | # Partition number 375 | part <- 1 376 | #========================================================= 377 | 378 | # Read arguments passed from command line 379 | args=(commandArgs(TRUE)) 380 | if(length(args)==0){ 381 | cat('No args provided',"\n") 382 | }else{ 383 | for(i in 1:length(args)){ 384 | eval(parse(text=args[[i]])) 385 | } 386 | } 387 | 388 | # Load data 389 | load("multiEnvironment/prepData_multi.RData") 390 | load(paste0("multiEnvironment/YNA_CV",CV,"_multiEnv.RData")) 391 | n <- nrow(Y); nEnv <- ncol(Y) 392 | 393 | # Models 394 | models <- c("Single","Across","MxE","R-Norm") 395 | model <- models[mod] 396 | 397 | # Number of iterations and burn-in for Bayesian models 398 | nIter <- 30000; burnIn <- 2000 399 | 400 | YNA0 <- YNA[[part]] 401 | yNA <- as.vector(YNA0) 402 | 403 | #-------------------------------------------------------- 404 | # 1. Single environment (within-environment) model 405 | #-------------------------------------------------------- 406 | if(model=="Single") 407 | { 408 | YHat <- matrix(NA,nrow=nrow(Y),ncol=ncol(Y)) 409 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 410 | for(env in 1:nEnv){ 411 | fm <-BGLR(y=YNA0[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 412 | YHat[,env] <- fm$yHat 413 | } 414 | } 415 | 416 | #-------------------------------------------------------- 417 | # 2. Across-environments model 418 | #-------------------------------------------------------- 419 | if(model=="Across") 420 | { 421 | ETA <- list(list(~envID-1,model="FIXED")) 422 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 423 | 424 | # Model Fitting 425 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 426 | YHat <- matrix(fm$yHat,ncol=nEnv) 427 | } 428 | 429 | #-------------------------------------------------------- 430 | # 3. MxE interaction model 431 | #-------------------------------------------------------- 432 | if(model=="MxE") 433 | { 434 | ETA <- list(list(~envID-1,model="FIXED")) 435 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 436 | 437 | # Adding interaction terms 438 | for(env in 1:nEnv){ 439 | eigen_G1 <- MxE_eigen[[env]] 440 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 441 | } 442 | 443 | # Model Fitting 444 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 445 | YHat <- matrix(fm$yHat,ncol=nEnv) 446 | } 447 | 448 | #-------------------------------------------------------- 449 | # 4. 
Reaction-Norm model 450 | #-------------------------------------------------------- 451 | if(model=="R-Norm") 452 | { 453 | ETA <- list(list(~envID-1,model="FIXED")) 454 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 455 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 456 | 457 | # Model Fitting 458 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 459 | YHat <- matrix(fm$yHat,ncol=nEnv) 460 | } 461 | 462 | # Save results 463 | outfolder <- paste0("multiEnvironment/CV",CV,"/",model) 464 | if(!file.exists(outfolder)) dir.create(outfolder,recursive=T) 465 | save(YHat,file=paste0(outfolder,"/outPRED_multiEnv_partition_",part,".RData")) 466 | ``` 467 | 468 | #### 2.2 Running in parallel many jobs 469 | Code above will run a single combination of partition-model-CV, thus when running, for instance, several models for both CV1 and CV2, some parallelzation of jobs is needed for speeding of computation. The bash code called *[run_jobs_multi.sh](https://github.com/MarcooLopez/Genomic-Selection/blob/master/run_jobs_multi.sh)* will submit many jobs depending of the core capacity of the computer. Jobs will be sent by chunks whose size is specified in variable `nb` (for instance, `nb=10` will run 10 jobs at the time). After jobs in the chunk are done, another chunk will be submited to be run. Variables `seq1=2`, `seq2=4`, and `seq3=100` specify the 2 CV types, 4 models, and 100 partitions, respectiely. 470 | 471 | The bash shell script needs to be saved in the same directory as the R script 'fitModels_multi.R' and it can be run from command line as 472 | ``` 473 | sh run_jobs_multi.sh & 474 | ``` 475 | 476 | #### 2.3 Retrieving results 477 | 478 | The code below will retrieve results for all models fitted previously showing the within-environment correlation for all fitted models 479 | 480 | ```r 481 | setwd("/mnt/home/lopezcru/GS") 482 | rm(list=ls()) 483 | library(ggplot2) 484 | library(reshape) 485 | 486 | #========================================================= 487 | # User specifications 488 | #========================================================= 489 | # Type of CV. 
1:CV1; 2:CV2 490 | CV <- 2 491 | #========================================================= 492 | 493 | models <- c("Single","Across","MxE","R-Norm") 494 | 495 | # Load data 496 | load("multiEnvironment/prepData_multi.RData") 497 | load(paste0("multiEnvironment/YNA_CV",CV,"_multiEnv.RData")) 498 | 499 | # Calculate within-environment correlation 500 | outCOR <- vector("list",length(models)) 501 | names(outCOR) <- models 502 | for(mod in seq_along(models)) 503 | { 504 | outcor <- c() 505 | for(part in 1:length(YNA)){ 506 | filename <- paste0("multiEnvironment/CV",CV,"/",models[mod],"/outPRED_multiEnv_partition_",part,".RData") 507 | if(file.exists(filename)) 508 | { 509 | load(filename) 510 | YNA0 <- YNA[[part]] 511 | tmp <- rep(NA,ncol(YNA0)) 512 | for(env in 1:ncol(YNA0)){ 513 | indexTST <- which(is.na(YNA0[,env])) 514 | tmp[env] <- cor(Y[indexTST,env],YHat[indexTST,env]) 515 | } 516 | outcor <- rbind(outcor,tmp) 517 | } 518 | } 519 | colnames(outcor) <- paste0("Env ",colnames(YNA0)) 520 | rownames(outcor) <- NULL 521 | outcor <- data.frame(model=models[mod],outcor) 522 | outCOR[[mod]] <- outcor 523 | } 524 | outCOR <- outCOR[!sapply(outCOR,is.null)] 525 | 526 | # Calculate means and SD's 527 | (means <- t(do.call("rbind",lapply(outCOR,function(x)apply(x[,-1],2,mean))))) 528 | (sds <- t(do.call("rbind",lapply(outCOR,function(x)apply(x[,-1],2,sd))))) 529 | 530 | write.csv(rbind(means,colnames(sds),sds),file=paste0("multiEnvironment/Accuracy_avg_CV",CV,"_multiEnv.csv")) 531 | 532 | toplot <- do.call("rbind",lapply(outCOR,function(x)melt(x,id="model"))) 533 | png(paste0("multiEnvironment/Accuracy_distn_CV",CV,"_multiEnv.png"),height=350) 534 | ggplot(toplot,aes(x=model,y=value,fill=variable)) + geom_boxplot()+ 535 | labs(fill="Env",y="Accuracy",title=paste0("Correlation between observed and predicted values. CV",CV)) 536 | dev.off() 537 | ``` 538 | 539 | Tables below are the results of running 100 partitions with `nIter=30000` and `burnIn=2000`. The mean and standard deviation (in parenthesis) across partitions are presented. 540 | 541 | **Cross Validation 1. CV1** 542 | 543 | | |Single-Env |Across-Env | MxE | RNorm | 544 | |-------|-------|--------|------|------| 545 | |Env 2 | 0.485(0.049) | 0.441(0.052) | 0.460(0.050) | 0.461(0.049) | 546 | |Env 4 | 0.377(0.055) | 0.395(0.053) | 0.382(0.055) | 0.382(0.055) | 547 | |Env 5 | 0.441(0.056) | 0.382(0.057) | 0.412(0.054) | 0.409(0.055) | 548 | 549 |   550 | 551 | 552 | 553 | ### 554 | **Cross Validation 2. CV2** 555 | 556 | | |Single-Env |Across-Env | MxE | RNorm | 557 | |-------|-------|--------|------|-----| 558 | |Env 2 | 0.485(0.049) | 0.629(0.032) | 0.647(0.033) | 0.642(0.033) | 559 | |Env 4 | 0.375(0.058) | 0.602(0.042) | 0.591(0.043) | 0.585(0.044) | 560 | |Env 5 | 0.442(0.048) | 0.493(0.045) | 0.529(0.042) | 0.528(0.043) | 561 | 562 |   563 | 564 | 565 | 566 | # 567 | * **[back](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/README.md)** 568 | 569 | # 570 | # References 571 | * Jarquín, D., Crossa, J., Lacaze, X., Du Cheyron, P., Daucourt, J., Lorgeou, J., … de los Campos, G. (2014). **A reaction norm model for genomic selection using high-dimensional genomic and environmental data**. Theoretical and Applied Genetics, 127(3), 595–607. 572 | * Lopez-Cruz, M., Crossa, J., Bonnett, D., Dreisigacker, S., Poland, J., Jannink, J.-L., … de los Campos, G. (2015). **Increased Prediction Accuracy in Wheat Breeding Trials Using a Marker × Environment Interaction Genomic Selection Model**. 
G3: Genes, Genomes, Genetics, 5(4), 569–582. 573 | 574 | 575 | 576 | --------------------------------------------------------------------------------