├── varComp.png ├── boxplot1.png ├── CV1_2_scheme.png ├── Accuracy_distn_CV1_multiEnv.png ├── Accuracy_distn_CV2_multiEnv.png ├── get_CV1_partitions.R ├── run_jobs_multi.sh ├── prepareData_multi.R ├── get_CV2_partitions.R ├── get_VarComps_multi.R ├── fitModels_multi.R ├── single_environment.md ├── README.md └── multi_environment.md /varComp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/varComp.png -------------------------------------------------------------------------------- /boxplot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/boxplot1.png -------------------------------------------------------------------------------- /CV1_2_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/CV1_2_scheme.png -------------------------------------------------------------------------------- /Accuracy_distn_CV1_multiEnv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/Accuracy_distn_CV1_multiEnv.png -------------------------------------------------------------------------------- /Accuracy_distn_CV2_multiEnv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcooLopez/Genomic-Selection/HEAD/Accuracy_distn_CV2_multiEnv.png -------------------------------------------------------------------------------- /get_CV1_partitions.R: -------------------------------------------------------------------------------- 1 | setwd("/mnt/home/lopezcru/GS") 2 | rm(list=ls()) 3 | 4 | #========================================================= 5 | # User specifications 6 | #========================================================= 7 | # Number of replicates 8 | m <- 100 9 | 10 | # Percentage of the data assigned to Testing set 11 | percTST <- 0.3 12 | #========================================================= 13 | 14 | # Load data 15 | load("multiEnvironment/prepData_multi.RData") 16 | n <- nrow(Y) 17 | 18 | # Creation of seed for repeated randomizations 19 | set.seed(123) 20 | seeds <- round(seq(1E3,1E6,length=m)) 21 | 22 | nTST <- round(percTST*n) 23 | YNA <- vector("list",m) 24 | 25 | for(k in 1:m) 26 | { 27 | set.seed(seeds[k]) 28 | indexTST <- sample(1:n,size=nTST,replace=FALSE) 29 | YNA0 <- Y 30 | YNA0[indexTST,] <- NA 31 | YNA[[k]] <- YNA0 32 | } 33 | 34 | # Save YNA matrix 35 | save(YNA,file="multiEnvironment/YNA_CV1_multiEnv.RData") 36 | -------------------------------------------------------------------------------- /run_jobs_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Number of jobs in each block 3 | nb=10 4 | seq1=1 5 | seq2=1 6 | seq3=50 7 | 8 | # Create a 'wait' sentence between chunks 9 | waittext="wait " 10 | for i in $(seq 1 $nb) 11 | do 12 | waittext=("${waittext[*]}" "%$i") 13 | done 14 | 15 | # Create job_submit file 16 | cat > job_submit.sh <> job_submit.sh <> job_submit.sh <=n){ 34 | nRep <- floor(nNA/n) 35 | remain <- sample(1:n,nNA%%n,replace=FALSE) 36 | a0 <- sample(1:n,n,replace=FALSE) 37 | indexNA <- rep(a0,nRep) 38 | if(length(remain)>0){ 39 | a1 <- floor(length(indexNA)/nTST)*nTST 40 | a2 <- nNA - a1 - length(remain) 41 | bb <- 
sample(a0[!a0%in%remain],a2,replace=FALSE) 42 | noInIndexNA <- c(rep(a0,nRep-1),a0[!a0%in%bb]) 43 | indexNA <- c(noInIndexNA,bb,remain) 44 | } 45 | } 46 | indexEnv <- rep(1:nEnv,each=nTST) 47 | for(j in 1:nEnv) YNA0[indexNA[indexEnv==j],j] <- NA 48 | YNA[[k]] <- YNA0 49 | } 50 | 51 | # Save YNA matrix 52 | save(YNA,file="multiEnvironment/YNA_CV2_multiEnv.RData") 53 | -------------------------------------------------------------------------------- /get_VarComps_multi.R: -------------------------------------------------------------------------------- 1 | setwd("/mnt/home/lopezcru/GS") 2 | rm(list=ls()) 3 | library(BGLR) 4 | 5 | load("multiEnvironment/prepData_multi.RData") 6 | n <- nrow(Y) 7 | nEnv <- ncol(Y) 8 | y <- as.vector(Y) 9 | 10 | set.seed(123) 11 | 12 | # Matrix to store results. It will save variance components for each model 13 | outVAR <- matrix(NA,ncol=4,nrow=1+2*nEnv) 14 | dimnames(outVAR) <- list(c("Main",rep(paste0("Env ",colnames(Y)),2)),c("Single","Across","MxE","R-Norm")) 15 | 16 | # Number of iterations and burn-in for Bayesian models 17 | nIter <- 30000; burnIn <- 2000 18 | 19 | #-------------------------------------------------------- 20 | # 1. Single environment (within-environment) model 21 | #-------------------------------------------------------- 22 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 23 | for(env in 1:nEnv){ 24 | fm <-BGLR(y=Y[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 25 | outVAR[env+1,1] <- fm$ETA[[1]]$varU 26 | outVAR[env+4,1] <- fm$varE 27 | } 28 | 29 | #-------------------------------------------------------- 30 | # 2. Across-environments model 31 | #-------------------------------------------------------- 32 | ETA <- list(list(~envID-1,model="FIXED")) 33 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 34 | 35 | # Model Fitting 36 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 37 | outVAR[1,2] <- fm$ETA[[2]]$varU 38 | outVAR[(1:nEnv)+4,2] <- fm$varE 39 | 40 | #-------------------------------------------------------- 41 | # 3. MxE interaction model 42 | #-------------------------------------------------------- 43 | ETA <- list(list(~envID-1,model="FIXED")) 44 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 45 | 46 | # Adding interaction terms 47 | for(env in 1:nEnv){ 48 | eigen_G1 <- MxE_eigen[[env]] 49 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 50 | } 51 | 52 | # Model Fitting 53 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 54 | outVAR[1,3] <- fm$ETA[[2]]$varU 55 | for(env in 1:nEnv) outVAR[env+1,3] <- fm$ETA[[env+2]]$varU 56 | outVAR[(1:nEnv)+4,3] <- fm$varE 57 | 58 | #-------------------------------------------------------- 59 | # 4. 
Reaction-Norm model 60 | #-------------------------------------------------------- 61 | ETA <- list(list(~envID-1,model="FIXED")) 62 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 63 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 64 | 65 | # Model Fitting 66 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 67 | outVAR[1,4] <- fm$ETA[[2]]$varU 68 | outVAR[(1:nEnv)+1,4] <- fm$ETA[[3]]$varU 69 | outVAR[(1:nEnv)+4,4] <- fm$varE 70 | outVAR 71 | 72 | # Save results 73 | write.table(outVAR,file="multiEnvironment/varComps.csv",sep=",",row.names=F) 74 | -------------------------------------------------------------------------------- /fitModels_multi.R: -------------------------------------------------------------------------------- 1 | setwd("/mnt/home/lopezcru/GS") 2 | rm(list=ls()) 3 | library(BGLR) 4 | 5 | #========================================================= 6 | # User specifications 7 | #========================================================= 8 | # Choose one model. 1: single; 2:across; 3:MxE; 4:R-Norm 9 | mod <- 4 10 | 11 | # Type of CV. 1:CV1; 2:CV2 12 | CV <- 1 13 | 14 | # Partition number 15 | part <- 1 16 | #========================================================= 17 | 18 | # Read arguments passed from command line 19 | args=(commandArgs(TRUE)) 20 | if(length(args)==0){ 21 | cat('No args provided',"\n") 22 | }else{ 23 | for(i in 1:length(args)){ 24 | eval(parse(text=args[[i]])) 25 | } 26 | } 27 | 28 | # Load data 29 | load("multiEnvironment/prepData_multi.RData") 30 | load(paste0("multiEnvironment/YNA_CV",CV,"_multiEnv.RData")) 31 | n <- nrow(Y); nEnv <- ncol(Y) 32 | 33 | # Models 34 | models <- c("Single","Across","MxE","R-Norm") 35 | model <- models[mod] 36 | 37 | # Number of iterations and burn-in for Bayesian models 38 | nIter <- 30000; burnIn <- 2000 39 | 40 | YNA0 <- YNA[[part]] 41 | yNA <- as.vector(YNA0) 42 | 43 | #-------------------------------------------------------- 44 | # 1. Single environment (within-environment) model 45 | #-------------------------------------------------------- 46 | if(model=="Single") 47 | { 48 | YHat <- matrix(NA,nrow=nrow(Y),ncol=ncol(Y)) 49 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 50 | for(env in 1:nEnv){ 51 | fm <-BGLR(y=YNA0[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 52 | YHat[,env] <- fm$yHat 53 | } 54 | } 55 | 56 | #-------------------------------------------------------- 57 | # 2. Across-environments model 58 | #-------------------------------------------------------- 59 | if(model=="Across") 60 | { 61 | ETA <- list(list(~envID-1,model="FIXED")) 62 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 63 | 64 | # Model Fitting 65 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 66 | YHat <- matrix(fm$yHat,ncol=nEnv) 67 | } 68 | 69 | #-------------------------------------------------------- 70 | # 3. MxE interaction model 71 | #-------------------------------------------------------- 72 | if(model=="MxE") 73 | { 74 | ETA <- list(list(~envID-1,model="FIXED")) 75 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 76 | 77 | # Adding interaction terms 78 | for(env in 1:nEnv){ 79 | eigen_G1 <- MxE_eigen[[env]] 80 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 81 | } 82 | 83 | # Model Fitting 84 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 85 | YHat <- matrix(fm$yHat,ncol=nEnv) 86 | } 87 | 88 | #-------------------------------------------------------- 89 | # 4. 
Reaction-Norm model 90 | #-------------------------------------------------------- 91 | if(model=="R-Norm") 92 | { 93 | ETA <- list(list(~envID-1,model="FIXED")) 94 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 95 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 96 | 97 | # Model Fitting 98 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 99 | YHat <- matrix(fm$yHat,ncol=nEnv) 100 | } 101 | 102 | # Save results 103 | outfolder <- paste0("multiEnvironment/CV",CV,"/",model) 104 | if(!file.exists(outfolder)) dir.create(outfolder,recursive=T) 105 | save(YHat,file=paste0(outfolder,"/outPRED_multiEnv_partition_",part,".RData")) 106 | -------------------------------------------------------------------------------- /single_environment.md: -------------------------------------------------------------------------------- 1 | 2 | ## Model assessment 3 | ### Training-Testing random partitions 4 | The prediction power of the model will be assessed using the training-testing (TRN-TST) random partitions approach. Data is randomly splitted into training and testing sets. Model parameters are estimated in training set and model is tested in TST set. 5 | 6 | ## Data preparation 7 | ### Load data, generate G-matrix and create objects to store results 8 | ```r 9 | # Load libraries 10 | library(BGLR) 11 | library(rrBLUP) 12 | 13 | # Load data 14 | data(wheat) 15 | X <- wheat.X 16 | Y <- wheat.Y 17 | 18 | # Select a single environment. For instance, environment 2 19 | y <- Y[,2] 20 | 21 | n <- nrow(Y) 22 | p <- ncol(X) 23 | 24 | # Genomic relationship matrix 25 | M <- scale(X) 26 | G <- tcrossprod(M)/p 27 | 28 | # Design matrix for individuals. In this case is a diagonal since there are no replicates 29 | GID <- factor(rownames(Y),levels=rownames(Y)) 30 | Zg <- model.matrix(~GID-1) 31 | ``` 32 | 33 | ## Running models 34 | 35 | ### 1. Variance components estimation 36 | After running the 'data preparation' part, the code following code can be used to fit all models and to extract variance components and other parameters. 37 | 38 | ```r 39 | set.seed(123) 40 | 41 | # Matrix to store results. It will save the main parameters for each model 42 | outVAR <- matrix(NA,nrow=6,ncol=5) 43 | dimnames(outVAR) <- list(c("varU","varE","lambda","dfb","Sb","H2"),c("GBLUP1","GBLUP2","BRR","LASSO","BayesB")) 44 | 45 | # Number of iterations and burn-in for Bayesian models 46 | nIter <- 30000 47 | burnIn <- 5000 48 | 49 | # G-BLUP model using 'rrBLUP' package 50 | fm <- mixed.solve(y=y,Z=Zg,K=G) 51 | outVAR[1,1] <- fm$Vu 52 | outVAR[2,1] <- fm$Ve 53 | outVAR[6,1] <- fm$Vu/(fm$Vu+fm$Ve) # Heritability 54 | 55 | # G-BLUP model using 'BGLR' package. 
Model RKHS with K=G 56 | fm <- BGLR(y,ETA=list(list(K=G,model="RKHS")),nIter=nIter,burnIn=burnIn) 57 | outVAR[1,2] <- fm$ETA[[1]]$varU 58 | outVAR[2,2] <- fm$varE 59 | outVAR[6,2] <- outVAR[1,2]/(outVAR[1,2] + outVAR[2,2]) # Heritability 60 | 61 | # Bayesian Ridge Regression (BRR) using 'BGLR' package 62 | fm <- BGLR(y,ETA=list(list(X=M,model="BRR")),nIter=nIter,burnIn=burnIn) 63 | outVAR[1,3] <- fm$ETA[[1]]$varB*p # Multiply by p to obtain the right varU as in G-BLUP 64 | outVAR[2,3] <- fm$varE 65 | outVAR[6,3] <- outVAR[1,3]/(outVAR[1,3] + outVAR[2,3]) # Heritability 66 | 67 | # Bayesian LASSO model using 'BGLR' package 68 | fm <- BGLR(y,ETA=list(list(X=M,model="BL")),nIter=nIter,burnIn=burnIn) 69 | outVAR[2,4] <- fm$varE 70 | outVAR[3,4] <- fm$ETA[[1]]$lambda 71 | 72 | # Bayes B model using 'BGLR' package 73 | fm <- BGLR(y,ETA=list(list(X=M,model="BayesB")),nIter=nIter,burnIn=burnIn) 74 | outVAR[2,5] <- fm$varE 75 | outVAR[4,5] <- fm$ETA[[1]]$df0 76 | outVAR[5,5] <- fm$ETA[[1]]$S0 77 | 78 | print(outVAR) 79 | ``` 80 | 81 | #### Results 82 | 83 | | |GBLUP 1 | GBLUP 2 | BRR | LASSO | Bayes B | 84 | |-------|------|------|------|------|------| 85 | |![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_u) |0.468 |0.490|0.491| - | - | 86 | |![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%5Cvarepsilon) |0.574|0.576|0.575|0.589|0.572| 87 | |![](https://latex.codecogs.com/gif.latex?%5Clambda) | - | - | - |58.805| - | 88 | |![](https://latex.codecogs.com/gif.latex?df_%5Cbeta) | - | - | - | - | 5 | 89 | |![](https://latex.codecogs.com/gif.latex?S_%5Cbeta) | - | - | - | - |0.033| 90 | |![](https://latex.codecogs.com/gif.latex?H%5E2_g) |0.449|0.460|0.461| - | - | 91 | 92 | 1: using 'rrBLUP'; 2: using 'BGLR' package 93 | 94 | # 95 | ### 2. Replicates of partitions to obtain standard deviations of predictions 96 | After running the 'data preparation' part, the code below runs repeated partitions to obtain mean and standard deviations of accuracies for all models. All the models will be run using 'BGLR' package. 97 | 98 | ```r 99 | # Models 100 | models <- c("GBLUP","BRR","LASSO","BayesB") 101 | 102 | #================================================== 103 | # User specifications 104 | #================================================== 105 | # Choose one model. 1:GBLUP; 2:BRR; 3:LASSO; 4:BayesB 106 | mod <- 1 107 | 108 | # Percentage of the data assigned to Testing set 109 | percTST <- 0.3 110 | 111 | # Number of replicates 112 | m <- 10 113 | #================================================== 114 | 115 | # Creation of seed for repeated randomizations 116 | set.seed(123) 117 | seeds <- round(seq(1E3,1E6,length=m)) 118 | 119 | # Matrix to store results. 
It will save the corelation for each partition 120 | outCOR <- matrix(NA,nrow=m,ncol=1) 121 | colnames(outCOR) <- models[mod] 122 | 123 | # Number of iterations and burn-in for Bayesian models 124 | nIter <- 1200 125 | burnIn <- 200 126 | 127 | model <- models[mod] 128 | nTST <- round(percTST*n) 129 | 130 | for(k in 1:m) # Loop for the replicates 131 | { 132 | set.seed(seeds[k]) 133 | indexTST <- sample(1:n,size=nTST,replace=FALSE) 134 | yNA <- y 135 | yNA[indexTST] <- NA 136 | 137 | if(model=="GBLUP") ETA <- list(list(K=G,model="RKHS")) 138 | if(model=="BRR") ETA <- list(list(X=M,model="BRR")) 139 | if(model=="LASSO") ETA <- list(list(X=M,model="BL")) 140 | if(model=="BayesB") ETA <- list(list(X=M,model="BayesB")) 141 | 142 | fm <- BGLR(yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 143 | outCOR[k,1] <- cor(fm$yHat[indexTST],y[indexTST]) 144 | } 145 | 146 | # Save results 147 | save(outCOR,file=paste0("outCOR_",model,".RData")) 148 | ``` 149 | 150 | #### 2.1 Results 151 | The code below will retrieve results for all models fitted previously 152 | 153 | ```r 154 | models <- c("GBLUP","BRR","LASSO","BayesB") 155 | 156 | OUT <- c() 157 | for(mod in seq_along(models)) 158 | { 159 | filename <- paste0("outCOR_",models[mod],".RData") 160 | if(file.exists(filename)){ 161 | load(filename,verbose=T) 162 | OUT <- cbind(OUT,outCOR) 163 | } 164 | } 165 | 166 | round(rbind(Mean=apply(OUT,2,mean),SD=apply(OUT,2,sd)),4) 167 | boxplot(outCOR,ylab="Accuracy",xlab="Model") 168 | ``` 169 | 170 | | |GBLUP | BRR | LASSO | Bayes B | 171 | |-------|-------|--------|------|-------| 172 | |Mean | 0.475 | 0.476 | 0.474 | 0.465 | 173 | |SD | 0.049 | 0.050 | 0.049 | 0.048 | 174 | 175 | Boxplot of distribution of the accuracies by model 176 | 177 | ![](https://github.com/MarcooLopez/Genomic-Selection/blob/master/boxplot1.png) 178 | 179 | # 180 | * **[back](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/README.md)** 181 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Genomic Selection Demo 2 | 3 | The standard genetic model assumes that phenotype is the sum of a genetic component and a non-genetic component (residual), ![](https://latex.codecogs.com/gif.latex?y_i%3Dg_i+%5Cvarepsilon_i). Genomic Selection uses genetic markers covering the whole genome and potentially explaining all the genetic variance. These markers are asumed to be in Linkage Disequilibrium (LD) with the QTL thus models including all markers can estimate breeding values ![](https://latex.codecogs.com/gif.latex?g_i) as combinatons of these QTL's. 4 | 5 | ## Model 6 | Response variable *y* for the *i*-th individual (*i=1,...,n*) is regressed on a function of *p* marker genotypes ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bx%7D_i%3D%5Bx_%7Bi1%7D%2C...%2Cx_%7Bip%7D%5D%27) that seeks to aproximate to the true genetic value of the individual, this is 7 | 8 |
y_i = f(**x**_i) + ε_i
9 | 10 | 11 | 12 | where function ![](https://latex.codecogs.com/gif.latex?f%28%5Ctextbf%7Bx%7D_i%29) can be a parametric or non-parametric and ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cvarepsilon%7D%3D%5B%5Cvarepsilon_1%2C...%2C%5Cvarepsilon_n%5D%27) are the residuals which are usually assumed to be distributed Normal with constant variance ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cvarepsilon%7D%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%5Cvarepsilon%5Ctextbf%7BI%7D%29). 13 | 14 | ### Parametric regression 15 | The genotypic value of an individual is estimated using a **linear model** in which a linear combination of the marker genotypes are used, that is 16 |
y_i = μ + x_i1 β_1 + x_i2 β_2 + ... + x_ip β_p + ε_i
17 | 18 | 19 | 20 | where ![](https://latex.codecogs.com/gif.latex?%5Cmu) is the intercept, ![](https://latex.codecogs.com/gif.latex?x_%7Bij%7D) is the genotype of the *i*-th individual at the *j*-th marker, ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bj%7D) is the corresponding marker effect. 21 | 22 | Model above presents some estimation difficulties when *p* is much bigger than *n* so penalization ans regularization aproaches are used to overcome this problem. Penalization and regularization solutions can be seen as posterior solutions in the Bayesian context. 23 | 24 | #### 1. Bayesian Ridge Regression (BRR). 25 | Is a penalization regression that assumes that the regression coefficients follow independently a Gaussian (Normal) prior distribution, this is ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bj%7D%5Csim%20N%280%2C%5Csigma%5E2_%5Cbeta%29). 26 | This prior induces shrinkage of estimates toward zero. 27 | 28 | #### 2. Bayesian LASSO. 29 | It assumes that the regression coefficients have a prior distribution double-exponential (*DE*, or Laplace) with parameters ![](https://latex.codecogs.com/gif.latex?%5Clambda%5E2) and ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%5Cvarepsilon). This prior is a thick-tailed prior that can be represented as a infinite mixture of normal densities scaled by exponential (![](https://latex.codecogs.com/gif.latex?Exp)) densities, this is 30 | 31 |
β_j | σ²_ε, τ²_j ~ N(0, τ²_j σ²_ε),   with   τ²_j ~ Exp(λ²/2)
32 | 33 | 34 | 35 | #### 3. Bayes A 36 | The regression effects are assumed another thick-tailed prior, a scaled *t* distribution with degree of freedom ![](https://latex.codecogs.com/gif.latex?df_%5Cbeta) and scale ![](https://latex.codecogs.com/gif.latex?S_%5Cbeta) parameters. Similar as for doble-exponential, the scaled *t* distribution is represented as mixture of normal densities scaled with a scaled-inverse Chi-squared (![](https://latex.codecogs.com/gif.latex?%5Cchi%5E%7B-1%7D)) density, this is 37 | 38 |
β_j | σ²_{β_j} ~ N(0, σ²_{β_j}),   with   σ²_{β_j} ~ χ⁻²(df_β, S_β)
39 | 40 | 41 | 42 | #### 4. Bayes B 43 | Markers effects are asummed to be equal to zero with probability ![](https://latex.codecogs.com/gif.latex?%5Cpi) and with probability 1-![](https://latex.codecogs.com/gif.latex?%5Cpi) are assumed to follow a scaled *t* distribution as in Bayes A model. 44 | 45 | #### 5. Bayes C 46 | Similar to Bayes B, markers effects are asummed to be equal to zero with probability ![](https://latex.codecogs.com/gif.latex?%5Cpi) and with probability 1-![](https://latex.codecogs.com/gif.latex?%5Cpi) are assumed to follow a Gaussian distribution as in BRR model. 47 | 48 | #### 6. G-BLUP model (RR-BLUP) 49 | The response is modeled as ![](https://latex.codecogs.com/gif.latex?y_i%3D%5Cmu+g_i+%5Cvarepsilon_i) and its solution is equivalent to that of the BRR model arised when in the model above we make the sustitution 50 |
g_i = x_i1 β_1 + x_i2 β_2 + ... + x_ip β_p
51 | 52 | 53 | 54 | It can be shown that the random vector ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D%3D%5Bg_1%2C...%2Cg_n%5D%27) follows a Normal distribution ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_g%5Ctextbf%7BG%7D%29), where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D%3D%5Ctextbf%7BX%7D%5Ctextbf%7BX%7D%27/p) with ***X*** is the matrix of centered and standardized marker genotypes and it is called genomic relationship matrix. 55 | 56 | 57 | ### Semi-parametric regression 58 | 59 | #### 7. RKHS regression. 60 | The genomic function ![](https://latex.codecogs.com/gif.latex?f%28%5Ctextbf%7Bx%7D_i%29) is expressed as a linear combination of some positive semi-definite basis functions called Reproducing Kernels (RK), ![](https://latex.codecogs.com/gif.latex?K%28%5Ctextbf%7Bx%7D_i%2C%5Ctextbf%7Bx%7D_%7Bi%27%7D%29), as follows 61 |
f(**x**_i) = α_1 K(**x**_i,**x**_1) + α_2 K(**x**_i,**x**_2) + ... + α_n K(**x**_i,**x**_n)
62 | 63 | 64 | 65 | This model can be rewritten as ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7By%7D%3D%5Ctextbf%7BK%7D%5Cboldsymbol%7B%5Calpha%7D%20+%5Cboldsymbol%7B%5Cvarepsilon%7D) where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BK%7D%3D%5C%7BK%28%5Ctextbf%7Bx%7D_i%2C%5Ctextbf%7Bx%7D_%7Bi%27%7D%29%5C%7D) is a ![](https://latex.codecogs.com/gif.latex?n%5Ctimes%20n) matrix containing all the evaluations of the RK function at the point (*i*,*i'*) and ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Calpha%7D%20%3D%5B%5Calpha_1%2C...%2C%5Calpha_n%5D%27). 66 | 67 | This problem can be solved in a Bayesian fashion by assuming a prior ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Calpha%7D%20%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%5Calpha%5Ctextbf%7BK%7D%5E%7B-1%7D%29). 68 | 69 | **Note:** The Ridge Regression (and consequently, G-BLUP) can be represented as a RKHS model by setting **K**=**G**. 70 | 71 | 72 | ## Implementation of models 73 | Models previously above described will be implemented in R software using R-packages 'BGLR' and 'rrBLUP'. Using public data, it will be shown how to run the models for the single-environment case and then how to perform a multi-environment analysis with the G-BLUP model using a marker-by-environment (MxE) and a Reaction Norm approaches that account for GxE interaction. 74 | 75 | ### Data 76 | Data from CIMMYT’s Global Wheat Program. Lines were evaluated for grain yield (each entry corresponds to an average of two plot records) at four different environments; phenotypes (*wheat.Y* object) were centered and standardized to a unit variance within environment. Each of the lines were genotyped for 1279 diversity array technology (DArT) markers. At each marker two homozygous genotypes were possible and these were coded as 0/1. Marker genotypes are given in the object *wheat.X*. Finally a matrix *wheat.A* provides the pedigree relationships between lines computed from the pedigree records. 77 | Data is available for download in the R-package 'BGLR'. 78 | 79 | ### R-packages installation 80 | ```r 81 | if(!"BGLR"%in%rownames(installed.packages())) install.packages("BGLR") 82 | if(!"rrBLUP"%in%rownames(installed.packages())) install.packages("rrBLUP") 83 | library(BGLR) 84 | library(rrBLUP) 85 | ``` 86 | 87 | ### Download data 88 | ```r 89 | data(wheat) 90 | X <- wheat.X 91 | Y <- wheat.Y 92 | A <- wheat.A 93 | 94 | # Visualize data 95 | head(Y) 96 | X[1:10,1:5] 97 | ``` 98 | 99 | ### Type of analyses 100 | * **[Single-environment](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/single_environment.md)** 101 | * **[Multi-environment](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/multi_environment.md)** 102 | 103 | ## 104 | 105 | # References 106 | * de los Campos, G., Gianola, D., Rosa, G. J. M., Weigel, K. A., & Crossa, J. (2010). **Semi-parametric genomic-enabled prediction of genetic values using reproducing kernel Hilbert spaces methods**. Genetics Research, 92(4), 295–308. 107 | * de los Campos, G., Hickey, J. M., Pong-Wong, R., Daetwyler, H. D., & Calus, M. P. L. (2013). **Whole-genome regression and prediction methods applied to plant and animal breeding**. Genetics, 193(2), 327–345. 108 | * Endelman, J. B. (2011). **Ridge Regression and Other Kernels for Genomic Selection with R Package rrBLUP**. The Plant Genome Journal, 4(3), 250–255. 109 | * Habier, D., Fernando, R. L., Kizilkaya, K., & Garrick, D. J. (2011). **Extension of the bayesian alphabet for genomic selection**. 
BMC Bioinformatics, 12(186), 1-12. 110 | * Jarquín, D., Crossa, J., Lacaze, X., Du Cheyron, P., Daucourt, J., Lorgeou, J., … de los Campos, G. (2014). **A reaction norm model for genomic selection using high-dimensional genomic and environmental data**. Theoretical and Applied Genetics, 127(3), 595–607. 111 | * Lopez-Cruz, M., Crossa, J., Bonnett, D., Dreisigacker, S., Poland, J., Jannink, J.-L., … de los Campos, G. (2015). **Increased prediction accuracy in wheat breeding trials using a marker × environment interaction genomic selection model**. G3: Genes, Genomes, Genetics, 5(4), 569–582. 112 | * Meuwissen, T. H. E., Hayes, B. J., & Goddard, M. E. (2001). **Prediction of total genetic value using genome-wide dense marker maps**. Genetics, 157(4), 1819–1829. 113 | * Park, T., & Casella, G. (2008). **The Bayesian Lasso**. Journal of the American Statistical Association, 103(482), 681–686. 114 | * Perez, P., & de los Campos, G. (2014). **Genome-wide regression and prediction with the BGLR statistical package**. Genetics, 198(2), 483–495. 115 | * R Development Core Team. (2015). **R: A Language and Environment for Statistical Computing**. Vienna, Austria: R Foundation for Statistical Computing. 116 | 117 | 118 | -------------------------------------------------------------------------------- /multi_environment.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-environment models 3 | 4 | * **Single-environment model** 5 | 6 | This model is obtained by regressing the phenotype vector containing the *n* records available in the *k*th environment (*k=1,2,...,s* environments), ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7By%7D_k%3D%5By_%7B1k%7D%2C...%2Cy_%7Bnk%7D%5D%27), where *i=1,2,...,n* indexes lines (individuals), on *p* markers using a linear model in the form 7 | 8 |

9 | **y**_k = **1**μ_k + **X**_k **β**_k + **ε**_k

11 | 12 | where ![](https://latex.codecogs.com/gif.latex?%5Cmu_k) is the intercept, ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7B1%7D%3D%5B1%2C...%2C1%5D%27) is a *n*-vector of ones, ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BX%7D_k%3D%5C%7Bx_%7Bijk%7D%5C%7D) is the ![](https://latex.codecogs.com/gif.latex?n%5Ctimes%20p) matrix of centered and standardized markers available in the *k*th environment, ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cbeta%7D_k%3D%5B%5Cbeta_%7B1k%7D%2C...%2C%5Cbeta_%7Bpk%7D%5D%27) is a *p*-vector of marker effects, and ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cvarepsilon%7D_k%3D%5B%5Cvarepsilon_%7Bik%7D%2C...%2C%5Cvarepsilon_%7Bnk%7D%5D%27) is the *n*-vector of residuals. 13 | 14 | This model can be represented as a G-BLUP model by setting ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%3D%5Ctextbf%7BX%7D_k%5Cboldsymbol%7B%5Cbeta%7D_k), this is: 15 | 16 |

17 | **y**_k = **1**μ_k + **g**_k + **ε**_k

19 | 20 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%3D%5Bg_%7B1k%7D%2C...%2Cg_%7Bnk%7D%5D%27) is a random effect assumed ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%7Bg_k%7D%5Ctextbf%7BG%7D_k%29) with the genomic relationship matrix estimated as ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_k%3D%5Ctextbf%7BX%7D_k%5Ctextbf%7BX%7D_k%27/p). 21 | 22 | * **Across-environments model** 23 | 24 | This model assumes that effects of markers are the same across environments, this is ![](https://latex.codecogs.com/gif.latex?%5Cboldsymbol%7B%5Cbeta%7D_1%3D%5Cboldsymbol%7B%5Cbeta%7D_2%3D...%3D%5Cboldsymbol%7B%5Cbeta%7D_s%3D%5Cboldsymbol%7B%5Cbeta%7D). The model above can be simultaneously fitted for all environments as (assume *s=3* environments) 25 | 26 |

27 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + [**X**_1; **X**_2; **X**_3] **β** + [**ε**_1; **ε**_2; **ε**_3]

29 | 30 | Similarly, it can be represented also as a G-BLUP model, by making ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%3D%5Ctextbf%7BX%7D_k%5Cboldsymbol%7B%5Cbeta%7D), as: 31 | 32 |

33 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + **g**_0 + [**ε**_1; **ε**_2; **ε**_3]

35 | 36 | where the random effect ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_0%3D%5B%5Ctextbf%7Bg%7D%27_1%2C%5Ctextbf%7Bg%7D%27_2%2C%5Ctextbf%7Bg%7D%27_3%5D%27) is assumed ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_0%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%7Bg_0%7D%5Ctextbf%7BG%7D_0%29) with ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_0) being the marker-derived genomic relationship calculated as 37 | 38 |

39 | **G**_0 = **Z**_g **G** **Z**′_g = [**G** **G** **G**; **G** **G** **G**; **G** **G** **G**], where **Z**_g is the incidence matrix connecting records with lines

41 | 42 | * **MxE model** 43 | 44 | It models GxE interaction using a marker-by-environment (MxE) approach in which the effect of the *j*th marker on the *k*th environment, ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bjk%7D), is descomposed into an effect that is common to all environments (*b*j0) and an effect that is specific to each environment (*b*jk), this is ![](https://latex.codecogs.com/gif.latex?%5Cbeta_%7Bjk%7D%3Db_%7Bj0%7D+b_%7Bjk%7D). Thus, the multi-environmental model is 45 | 46 |

47 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + [**X**_1; **X**_2; **X**_3] **b**_0 + [**X**_1**b**_1; **X**_2**b**_2; **X**_3**b**_3] + [**ε**_1; **ε**_2; **ε**_3]

49 | 50 | Likewise, by making ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_%7B0,k%7D%3D%5Ctextbf%7BX%7D_k%5Ctextbf%7Bb%7D_0) and ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_%7B1,k%7D%3D%5Ctextbf%7BX%7D_k%5Ctextbf%7Bb%7D_k), the G-BLUP representation is 51 | 52 |

53 | [**y**_1; **y**_2; **y**_3] = [**1**μ_1; **1**μ_2; **1**μ_3] + **g**_0 + **g**_1 + [**ε**_1; **ε**_2; **ε**_3]

55 | 56 | where the random effects **g**_0 = [**g**_{0,1}; **g**_{0,2}; **g**_{0,3}] and **g**_1 = [**g**_{1,1}; **g**_{1,2}; **g**_{1,3}]

58 | 59 | 60 | 61 | are assumed ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_0%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_%7Bg_0%7D%5Ctextbf%7BG%7D_0%29) and ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_1%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Ctextbf%7BG%7D_1%29), respectively. Here, ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_0) is described previously and 62 | 63 |
**G**_1 = [σ²_{g_1}**G** **0** **0**; **0** σ²_{g_2}**G** **0**; **0** **0** σ²_{g_3}**G**] (block-diagonal, with an environment-specific variance for each block)
64 | 65 | 66 | 67 | 68 | * **Reaction Norm model** 69 | 70 | The G-BLUP model presented in the across-environment approach can be extended to incorporate GxE by introducing covariance structures as a funcion of the marker information. This model, assumes the environment (*E*k) as a random effect as ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BE%7D%3D%5BE_1%2C...%2CE_s%5D%5Csim%20N%28%5Ctextbf%7B0%7D%2C%5Csigma%5E2_e%5Ctextbf%7BI%7D%29). These random effects are conected with individuals through the design matrix **Z**e. Thus, the main effect of environments, **e** = **Z**e**E**, is 71 | 72 | 73 | 74 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Be%7D_k%3D%5Be_%7B1k%7D%2C...%2Ce_%7Bnk%7D%5D%27) are the effects of the environment *k* for the *i*th individual (*i=1,...,n*). The reaction norm model, incorporates GxE by introducing the interaction terms ![](https://latex.codecogs.com/gif.latex?g_%7Bik%7De_%7Bik%7D), as 75 | 76 |

77 | [**y**_1; **y**_2; **y**_3] = **1**μ + [**e**_1; **e**_2; **e**_3] + [**g**_1; **g**_2; **g**_3] + [**g**_1∘**e**_1; **g**_2∘**e**_2; **g**_3∘**e**_3] + [**ε**_1; **ε**_2; **ε**_3]

79 | 80 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7Bg%7D_k%5Ctextbf%7Be%7D_k) is the interaction term between the random effects of markers and environments. It can be proved that, approximately, 81 |
Cov([**g**_1∘**e**_1; **g**_2∘**e**_2; **g**_3∘**e**_3]) ≈ σ²_{ge} [**G**_0 ∘ (**Z**_E**Z**′_E)]
83 | 84 | 85 | 86 | where ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_0) was described above and ![](https://latex.codecogs.com/gif.latex?%5Ccirc) denotes the element by element product (known as Hadamard product). 87 | 88 | **Note**. The Hadamard product above will yield a matrix similar to ![](https://latex.codecogs.com/gif.latex?%5Ctextbf%7BG%7D_1) above described but with ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%7Bg_1%7D%3D%5Csigma%5E2_%7Bg_2%7D%3D%5Csigma%5E2_%7Bg_3%7D%3D1). This is, reaction norm model estimates environment-specific effects with constant variance ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%7Bge%7D) in contrast wth MxE model that accomodates environment-specific variance, ![](https://latex.codecogs.com/gif.latex?%5Csigma%5E2_%7Bg_k%7D), *k=1,...,s*. 89 | 90 | ## Model assessment 91 | Performance of the *Reaction Norm* and *MxE model* will be compared with that of the *across-environments* model that ignores GxE modeling and the *single-environment* model which is fitted within each environment. 92 | 93 | ### Training-Testing random partitions. 94 | The prediction power of the model will be assessed using the training-testing (TRN-TST) random partitions approach. 95 | Data is randomly splitted into training and testing sets. Model parameters are estimated in training set and model is tested in TST set. Two main estimations problems are addressed using the multi-environments models. 96 | 97 | * **Cross Validation 1 (CV1)**. Represent a scheme of prediction of lines that have not been evaluated in any field 98 | trials. 99 | 100 | * **Cross Validation 2 (CV2)**. Represent a scheme of prediction of lines that have been evaluated in some but all target environments. Thus, prediction of non-evaluated lines benefits from borrowing of information from lines that were evaluated in other environments. 101 | 102 |
![](https://github.com/MarcooLopez/Genomic-Selection/blob/master/CV1_2_scheme.png)
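As a rough illustration of the two schemes (a minimal sketch, not taken from the repository scripts; the toy matrix `Y_toy` and object names are made up for this example), the snippet below masks a small phenotype matrix: under CV1 the selected lines are set to `NA` in every environment, whereas under CV2 each selected line is missing in a single environment only.

```r
# Minimal sketch of the CV1 and CV2 masking schemes on a toy 6 x 3 phenotype matrix
set.seed(1)
Y_toy <- matrix(rnorm(18), nrow=6, ncol=3,
                dimnames=list(paste0("line",1:6), paste0("env",1:3)))

# CV1: the same lines are unobserved in ALL environments
tst <- sample(1:6, size=2)
Y_cv1 <- Y_toy
Y_cv1[tst,] <- NA

# CV2: a line is unobserved in ONE environment but observed in the others
Y_cv2 <- Y_toy
Y_cv2[cbind(sample(1:6, size=3), 1:3)] <- NA

Y_cv1
Y_cv2
```

The full-scale partitions used in the analyses are generated by the scripts below.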
103 | 104 | 105 | 106 | In our case, we will use 70% of the data for training set and the remaining 30% for the testing set. 107 | For CV1, we will create a scheme in which 30% of the lines are missing in all environments. 108 | CV2 scheme is created by having 30% of the entries missing in one environment but present in all the rest of environments. 109 | 110 | This procedure of TRN-TST can be repeated many times to allow for estimation of standard errors (SE). 111 | 112 | ## Data preparation 113 | ### Load data, generate G-matrix 114 | The following R code, [prepareData_multi.R](https://github.com/MarcooLopez/Genomic-Selection/blob/master/prepareData_multi.R), can be used to prepare the data for analizes of the multi-environmental models. 115 | 116 | ```r 117 | setwd("/mnt/home/lopezcru/GS") 118 | rm(list=ls()) 119 | library(BGLR) 120 | 121 | # Load data 122 | data(wheat) 123 | X <- wheat.X 124 | Y <- wheat.Y 125 | 126 | # Select environments. For instance, environments 2,4, and 5 127 | Y <- Y[,c(2,3,4)] 128 | 129 | # Genomic relationship matrix 130 | M <- scale(X) 131 | G <- tcrossprod(M)/ncol(X) 132 | 133 | # Design matrix for individuals. It connects individuals with environments 134 | GID <- factor(rep(rownames(Y),ncol(Y)),levels=rownames(Y)) 135 | Zg <- model.matrix(~GID-1) 136 | 137 | # Design matrix for environments. Used in the multi-environment R-Norm model 138 | envID <- factor(rep(colnames(Y),each=nrow(Y)),levels=colnames(Y)) 139 | ZE <- model.matrix(~envID-1) 140 | 141 | # Covariance structure for effects 142 | ZgGZgt <- Zg%*%G%*%t(Zg) # Genetic effect 143 | ZEZEt <- tcrossprod(ZE) # Environmental effect 144 | GE <- ZgGZgt*ZEZEt # GxE interaction term (R-Norm model) 145 | 146 | # Eigen decomposition (to speed computational time) 147 | eigen_G <- eigen(G) 148 | eigen_G0 <- eigen(ZgGZgt) 149 | eigen_GE <- eigen(GE) 150 | 151 | # Interaction terms (MxE model) 152 | MxE_eigen <- vector("list",ncol(Y)) 153 | for(env in 1:ncol(Y)){ 154 | tmp <- rep(0,ncol(Y)) ; tmp[env] <- 1; G1 <- kronecker(diag(tmp),G) 155 | MxE_eigen[[env]] <- eigen(G1) 156 | } 157 | 158 | # Save prepared data 159 | dir.create("multiEnvironment") 160 | save(Y,envID,eigen_G,eigen_G0,eigen_GE,MxE_eigen,file="multiEnvironment/prepData_multi.RData") 161 | ``` 162 | 163 | ## Running models 164 | 165 | ### 1. Variance components estimation 166 | Code below, [get_VarComps_multi.R](https://github.com/MarcooLopez/Genomic-Selection/blob/master/get_VarComps_multi.R) script, can be used after 'data preparation' part to fit all the models and to extract variance components. 167 | 168 | ```r 169 | setwd("/mnt/home/lopezcru/GS") 170 | rm(list=ls()) 171 | library(BGLR) 172 | 173 | load("multiEnvironment/prepData_multi.RData") 174 | n <- nrow(Y) 175 | nEnv <- ncol(Y) 176 | y <- as.vector(Y) 177 | 178 | set.seed(123) 179 | 180 | # Matrix to store results. It will save variance components for each model 181 | outVAR <- matrix(NA,ncol=4,nrow=1+2*nEnv) 182 | dimnames(outVAR) <- list(c("Main",rep(paste0("Env ",colnames(Y)),2)),c("Single","Across","MxE","R-Norm")) 183 | 184 | # Number of iterations and burn-in for Bayesian models 185 | nIter <- 30000; burnIn <- 2000 186 | 187 | #-------------------------------------------------------- 188 | # 1. 
Single environment (within-environment) model 189 | #-------------------------------------------------------- 190 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 191 | for(env in 1:nEnv){ 192 | fm <-BGLR(y=Y[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 193 | outVAR[env+1,1] <- fm$ETA[[1]]$varU 194 | outVAR[env+4,1] <- fm$varE 195 | } 196 | 197 | #-------------------------------------------------------- 198 | # 2. Across-environments model 199 | #-------------------------------------------------------- 200 | ETA <- list(list(~envID-1,model="FIXED")) 201 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 202 | 203 | # Model Fitting 204 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 205 | outVAR[1,2] <- fm$ETA[[2]]$varU 206 | outVAR[(1:nEnv)+4,2] <- fm$varE 207 | 208 | #-------------------------------------------------------- 209 | # 3. MxE interaction model 210 | #-------------------------------------------------------- 211 | ETA <- list(list(~envID-1,model="FIXED")) 212 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 213 | 214 | # Adding interaction terms 215 | for(env in 1:nEnv){ 216 | eigen_G1 <- MxE_eigen[[env]] 217 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 218 | } 219 | 220 | # Model Fitting 221 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 222 | outVAR[1,3] <- fm$ETA[[2]]$varU 223 | for(env in 1:nEnv) outVAR[env+1,3] <- fm$ETA[[env+2]]$varU 224 | outVAR[(1:nEnv)+4,3] <- fm$varE 225 | 226 | #-------------------------------------------------------- 227 | # 4. Reaction-Norm model 228 | #-------------------------------------------------------- 229 | ETA <- list(list(~envID-1,model="FIXED")) 230 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 231 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 232 | 233 | # Model Fitting 234 | fm <- BGLR(y=y,ETA=ETA,nIter=nIter,burnIn=burnIn) 235 | outVAR[1,4] <- fm$ETA[[2]]$varU 236 | outVAR[(1:nEnv)+1,4] <- fm$ETA[[3]]$varU 237 | outVAR[(1:nEnv)+4,4] <- fm$varE 238 | outVAR 239 | 240 | # Save results 241 | write.table(outVAR,file="multiEnvironment/varComps.csv",sep=",",row.names=F) 242 | ``` 243 | 244 | #### Results 245 | The following table is the output of the code above for `nIter=30000` and `burnIn=2000`. 246 | 247 | 248 | ## 249 | ### 2. Replicates of partitions to obtain standard deviations of predictions 250 | Using a GBLUP approach, the prediction power of the multi-environment models (MxE and Reaction Norm) will be compared with that that ignores GxE (across-environment) and with the GBLUP model fitted within environment. 251 | 252 | #### 2.1. Training-Testing partitions 253 | After running the 'data preparation' part, it can be chosen either to perform CV1 or CV2 aproaches 254 | 255 | * **Cross Validation 1 (CV1)** 256 | 257 | Code below will generate a matrix YNA containing "NA" values for the entries corresponding to the TST set mimicing the CV1 prediction problem. 
It generates a 'list' with 'm' matrices containing the TRN-TST partitions 258 | 259 | ```r 260 | setwd("/mnt/home/lopezcru/GS") 261 | rm(list=ls()) 262 | 263 | #========================================================= 264 | # User specifications 265 | #========================================================= 266 | # Number of replicates 267 | m <- 100 268 | 269 | # Percentage of the data assigned to Testing set 270 | percTST <- 0.3 271 | #========================================================= 272 | 273 | # Load data 274 | load("multiEnvironment/prepData_multi.RData") 275 | n <- nrow(Y) 276 | 277 | # Creation of seed for repeated randomizations 278 | set.seed(123) 279 | seeds <- round(seq(1E3,1E6,length=m)) 280 | 281 | nTST <- round(percTST*n) 282 | YNA <- vector("list",m) 283 | 284 | for(k in 1:m) 285 | { 286 | set.seed(seeds[k]) 287 | indexTST <- sample(1:n,size=nTST,replace=FALSE) 288 | YNA0 <- Y 289 | YNA0[indexTST,] <- NA 290 | YNA[[k]] <- YNA0 291 | } 292 | 293 | # Save YNA matrix 294 | save(YNA,file="multiEnvironment/YNA_CV1_multiEnv.RData") 295 | ``` 296 | 297 | * **Cross Validation 2 (CV2)** 298 | 299 | Code below will generate a matrix YNA containing "NA" values for the entries corresponding to the TST set mimicing the CV2 prediction problem. It generates a 'list' with 'm' matrices containing the TRN-TST partitions 300 | 301 | ```r 302 | setwd("/mnt/home/lopezcru/GS") 303 | rm(list=ls()) 304 | 305 | #========================================================= 306 | # User specifications 307 | #========================================================= 308 | # Number of replicates 309 | m <- 100 310 | 311 | # Percentage of the data assigned to Testing set 312 | percTST <- 0.3 313 | #========================================================= 314 | 315 | # Load data 316 | load("multiEnvironment/prepData_multi.RData") 317 | n <- nrow(Y) 318 | nEnv <- ncol(Y) 319 | 320 | # Creation of seed for repeated randomizations 321 | set.seed(123) 322 | seeds <- round(seq(1E3,1E6,length=m)) 323 | 324 | nTST <- round(percTST*n) 325 | YNA <- vector("list",m) 326 | nNA <- nEnv*nTST 327 | 328 | for(k in 1:m) 329 | { 330 | set.seed(seeds[k]) 331 | YNA0 <- Y 332 | 333 | if(nNA=n){ 335 | nRep <- floor(nNA/n) 336 | remain <- sample(1:n,nNA%%n,replace=FALSE) 337 | a0 <- sample(1:n,n,replace=FALSE) 338 | indexNA <- rep(a0,nRep) 339 | if(length(remain)>0){ 340 | a1 <- floor(length(indexNA)/nTST)*nTST 341 | a2 <- nNA - a1 - length(remain) 342 | bb <- sample(a0[!a0%in%remain],a2,replace=FALSE) 343 | noInIndexNA <- c(rep(a0,nRep-1),a0[!a0%in%bb]) 344 | indexNA <- c(noInIndexNA,bb,remain) 345 | } 346 | } 347 | indexEnv <- rep(1:nEnv,each=nTST) 348 | for(j in 1:nEnv) YNA0[indexNA[indexEnv==j],j] <- NA 349 | YNA[[k]] <- YNA0 350 | } 351 | 352 | # Save YNA matrix 353 | save(YNA,file="multiEnvironment/YNA_CV2_multiEnv.RData") 354 | ``` 355 | 356 | After running the code to generate partitions for either CV1 or CV2 scenarios, the following script ([fitModels_multi.R](https://github.com/MarcooLopez/Genomic-Selection/blob/master/fitModels_multi.R)) can be run to fit the models repeatealy for all partitions. In all multi-environment models, main effect of 'environment' will be regarded as fixed effect. 357 | 358 | The code runs a single partition for each model either for CV1 or CV2. These specifications need to be passed in variables `mod`, `CV`, and `part`. 
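As a quick check of how these settings reach the script, the sketch below is an assumption about usage (not part of the repository): if a job is launched with something like `R CMD BATCH '--args mod=3 CV=1 part=7' fitModels_multi.R`, then `commandArgs(TRUE)` returns the strings shown here, and each one is evaluated so that it overwrites the defaults set in the 'User specifications' block.

```r
# Hypothetical example of the argument handling used in fitModels_multi.R
args <- c("mod=3", "CV=1", "part=7")   # what commandArgs(TRUE) would return for this call
for(i in seq_along(args)) eval(parse(text=args[[i]]))
c(mod=mod, CV=CV, part=part)           # mod=3, CV=1, part=7
```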
359 | 360 | ```r 361 | setwd("/mnt/home/lopezcru/GS") 362 | rm(list=ls()) 363 | library(BGLR) 364 | 365 | #========================================================= 366 | # User specifications 367 | #========================================================= 368 | # Choose one model. 1: single; 2:across; 3:MxE; 4:R-Norm 369 | mod <- 4 370 | 371 | # Type of CV. 1:CV1; 2:CV2 372 | CV <- 1 373 | 374 | # Partition number 375 | part <- 1 376 | #========================================================= 377 | 378 | # Read arguments passed from command line 379 | args=(commandArgs(TRUE)) 380 | if(length(args)==0){ 381 | cat('No args provided',"\n") 382 | }else{ 383 | for(i in 1:length(args)){ 384 | eval(parse(text=args[[i]])) 385 | } 386 | } 387 | 388 | # Load data 389 | load("multiEnvironment/prepData_multi.RData") 390 | load(paste0("multiEnvironment/YNA_CV",CV,"_multiEnv.RData")) 391 | n <- nrow(Y); nEnv <- ncol(Y) 392 | 393 | # Models 394 | models <- c("Single","Across","MxE","R-Norm") 395 | model <- models[mod] 396 | 397 | # Number of iterations and burn-in for Bayesian models 398 | nIter <- 30000; burnIn <- 2000 399 | 400 | YNA0 <- YNA[[part]] 401 | yNA <- as.vector(YNA0) 402 | 403 | #-------------------------------------------------------- 404 | # 1. Single environment (within-environment) model 405 | #-------------------------------------------------------- 406 | if(model=="Single") 407 | { 408 | YHat <- matrix(NA,nrow=nrow(Y),ncol=ncol(Y)) 409 | ETA <- list(G=list(V=eigen_G$vectors,d=eigen_G$values,model='RKHS')) 410 | for(env in 1:nEnv){ 411 | fm <-BGLR(y=YNA0[,env],ETA=ETA,nIter=nIter,burnIn=burnIn) 412 | YHat[,env] <- fm$yHat 413 | } 414 | } 415 | 416 | #-------------------------------------------------------- 417 | # 2. Across-environments model 418 | #-------------------------------------------------------- 419 | if(model=="Across") 420 | { 421 | ETA <- list(list(~envID-1,model="FIXED")) 422 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 423 | 424 | # Model Fitting 425 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 426 | YHat <- matrix(fm$yHat,ncol=nEnv) 427 | } 428 | 429 | #-------------------------------------------------------- 430 | # 3. MxE interaction model 431 | #-------------------------------------------------------- 432 | if(model=="MxE") 433 | { 434 | ETA <- list(list(~envID-1,model="FIXED")) 435 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 436 | 437 | # Adding interaction terms 438 | for(env in 1:nEnv){ 439 | eigen_G1 <- MxE_eigen[[env]] 440 | ETA[[(env+2)]] <- list(V=eigen_G1$vectors,d=eigen_G1$values,model='RKHS') 441 | } 442 | 443 | # Model Fitting 444 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 445 | YHat <- matrix(fm$yHat,ncol=nEnv) 446 | } 447 | 448 | #-------------------------------------------------------- 449 | # 4. 
Reaction-Norm model 450 | #-------------------------------------------------------- 451 | if(model=="R-Norm") 452 | { 453 | ETA <- list(list(~envID-1,model="FIXED")) 454 | ETA[[2]] <- list(V=eigen_G0$vectors,d=eigen_G0$values,model='RKHS') 455 | ETA[[3]] <- list(V=eigen_GE$vectors,d=eigen_GE$values,model="RKHS") 456 | 457 | # Model Fitting 458 | fm <- BGLR(y=yNA,ETA=ETA,nIter=nIter,burnIn=burnIn) 459 | YHat <- matrix(fm$yHat,ncol=nEnv) 460 | } 461 | 462 | # Save results 463 | outfolder <- paste0("multiEnvironment/CV",CV,"/",model) 464 | if(!file.exists(outfolder)) dir.create(outfolder,recursive=T) 465 | save(YHat,file=paste0(outfolder,"/outPRED_multiEnv_partition_",part,".RData")) 466 | ``` 467 | 468 | #### 2.2 Running in parallel many jobs 469 | Code above will run a single combination of partition-model-CV, thus when running, for instance, several models for both CV1 and CV2, some parallelzation of jobs is needed for speeding of computation. The bash code called *[run_jobs_multi.sh](https://github.com/MarcooLopez/Genomic-Selection/blob/master/run_jobs_multi.sh)* will submit many jobs depending of the core capacity of the computer. Jobs will be sent by chunks whose size is specified in variable `nb` (for instance, `nb=10` will run 10 jobs at the time). After jobs in the chunk are done, another chunk will be submited to be run. Variables `seq1=2`, `seq2=4`, and `seq3=100` specify the 2 CV types, 4 models, and 100 partitions, respectiely. 470 | 471 | The bash shell script needs to be saved in the same directory as the R script 'fitModels_multi.R' and it can be run from command line as 472 | ``` 473 | sh run_jobs_multi.sh & 474 | ``` 475 | 476 | #### 2.3 Retrieving results 477 | 478 | The code below will retrieve results for all models fitted previously showing the within-environment correlation for all fitted models 479 | 480 | ```r 481 | setwd("/mnt/home/lopezcru/GS") 482 | rm(list=ls()) 483 | library(ggplot2) 484 | library(reshape) 485 | 486 | #========================================================= 487 | # User specifications 488 | #========================================================= 489 | # Type of CV. 
1:CV1; 2:CV2 490 | CV <- 2 491 | #========================================================= 492 | 493 | models <- c("Single","Across","MxE","R-Norm") 494 | 495 | # Load data 496 | load("multiEnvironment/prepData_multi.RData") 497 | load(paste0("multiEnvironment/YNA_CV",CV,"_multiEnv.RData")) 498 | 499 | # Calculate within-environment correlation 500 | outCOR <- vector("list",length(models)) 501 | names(outCOR) <- models 502 | for(mod in seq_along(models)) 503 | { 504 | outcor <- c() 505 | for(part in 1:length(YNA)){ 506 | filename <- paste0("multiEnvironment/CV",CV,"/",models[mod],"/outPRED_multiEnv_partition_",part,".RData") 507 | if(file.exists(filename)) 508 | { 509 | load(filename) 510 | YNA0 <- YNA[[part]] 511 | tmp <- rep(NA,ncol(YNA0)) 512 | for(env in 1:ncol(YNA0)){ 513 | indexTST <- which(is.na(YNA0[,env])) 514 | tmp[env] <- cor(Y[indexTST,env],YHat[indexTST,env]) 515 | } 516 | outcor <- rbind(outcor,tmp) 517 | } 518 | } 519 | colnames(outcor) <- paste0("Env ",colnames(YNA0)) 520 | rownames(outcor) <- NULL 521 | outcor <- data.frame(model=models[mod],outcor) 522 | outCOR[[mod]] <- outcor 523 | } 524 | outCOR <- outCOR[!sapply(outCOR,is.null)] 525 | 526 | # Calculate means and SD's 527 | (means <- t(do.call("rbind",lapply(outCOR,function(x)apply(x[,-1],2,mean))))) 528 | (sds <- t(do.call("rbind",lapply(outCOR,function(x)apply(x[,-1],2,sd))))) 529 | 530 | write.csv(rbind(means,colnames(sds),sds),file=paste0("multiEnvironment/Accuracy_avg_CV",CV,"_multiEnv.csv")) 531 | 532 | toplot <- do.call("rbind",lapply(outCOR,function(x)melt(x,id="model"))) 533 | png(paste0("multiEnvironment/Accuracy_distn_CV",CV,"_multiEnv.png"),height=350) 534 | ggplot(toplot,aes(x=model,y=value,fill=variable)) + geom_boxplot()+ 535 | labs(fill="Env",y="Accuracy",title=paste0("Correlation between observed and predicted values. CV",CV)) 536 | dev.off() 537 | ``` 538 | 539 | Tables below are the results of running 100 partitions with `nIter=30000` and `burnIn=2000`. The mean and standard deviation (in parenthesis) across partitions are presented. 540 | 541 | **Cross Validation 1. CV1** 542 | 543 | | |Single-Env |Across-Env | MxE | RNorm | 544 | |-------|-------|--------|------|------| 545 | |Env 2 | 0.485(0.049) | 0.441(0.052) | 0.460(0.050) | 0.461(0.049) | 546 | |Env 4 | 0.377(0.055) | 0.395(0.053) | 0.382(0.055) | 0.382(0.055) | 547 | |Env 5 | 0.441(0.056) | 0.382(0.057) | 0.412(0.054) | 0.409(0.055) | 548 | 549 |   550 | 551 | 552 | 553 | ### 554 | **Cross Validation 2. CV2** 555 | 556 | | |Single-Env |Across-Env | MxE | RNorm | 557 | |-------|-------|--------|------|-----| 558 | |Env 2 | 0.485(0.049) | 0.629(0.032) | 0.647(0.033) | 0.642(0.033) | 559 | |Env 4 | 0.375(0.058) | 0.602(0.042) | 0.591(0.043) | 0.585(0.044) | 560 | |Env 5 | 0.442(0.048) | 0.493(0.045) | 0.529(0.042) | 0.528(0.043) | 561 | 562 |   563 | 564 | 565 | 566 | # 567 | * **[back](https://github.com/MarcooLopez/Genomic-Selection-Demo/blob/master/README.md)** 568 | 569 | # 570 | # References 571 | * Jarquín, D., Crossa, J., Lacaze, X., Du Cheyron, P., Daucourt, J., Lorgeou, J., … de los Campos, G. (2014). **A reaction norm model for genomic selection using high-dimensional genomic and environmental data**. Theoretical and Applied Genetics, 127(3), 595–607. 572 | * Lopez-Cruz, M., Crossa, J., Bonnett, D., Dreisigacker, S., Poland, J., Jannink, J.-L., … de los Campos, G. (2015). **Increased Prediction Accuracy in Wheat Breeding Trials Using a Marker × Environment Interaction Genomic Selection Model**. 
G3: Genes, Genomes, Genetics, 5(4), 569–582. 573 | 574 | 575 | 576 | --------------------------------------------------------------------------------