├── tissue_pair_work_v6.RData
├── code
    ├── disease_prediction.r
    ├── regression_model_articleoutput.r
    ├── regression_model_M1_M2.r
    └── function.r
└── README.md


/tissue_pair_work_v6.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mbasugit/Imputation/HEAD/tissue_pair_work_v6.RData


--------------------------------------------------------------------------------
/code/disease_prediction.r:
--------------------------------------------------------------------------------
  1 | 
  2 | library(data.table)
  3 | args<-commandArgs(TRUE);
  4 | #Usage: Rscript disease_prediction.r <nfold> <ENS>
  5 | nfold=as.numeric(args[1]); #fold: number of partitions for cross-validation
  6 | ENS=as.numeric(args[2]); #ENS: number of independent runs
  7 | if(is.na(nfold) || is.na(ENS)) stop("usage: Rscript disease_prediction.r <nfold> <ENS>")
  8 | #e.g. nfold=5; ENS=50;
  9 | 
 10 | dir="/home/Desktop" #example of dir that needs to be given 
 11 | workdir=paste0(dir,"/imputation"); 
 12 | setwd(workdir)
 13 | #files for predicted expression 
 14 | folder.GSp=paste0(workdir,"/output/given"); #if user do not have predicted expression of target tissue use the one provided here
 15 | 
 16 | source(paste0(workdir,'/code/function.r'));
 17 | 
 18 | #gtexdata_v6.RData is the file saved by format_gtexdatav6() in function.r;
 19 | #gtex.pc, expc.nt, pheno.dt and pheno.subj used below come from it.
 20 | #load gtex data
 21 | load(paste0(workdir,"/input/gtexdata_v6.RData"));
 22 | gene=gtex.pc$Name
 23 | 
 24 | #load disease tissue and sample count
 25 | load(paste0(workdir,"/input/disease_blood_32tissue_common_sample.Rdata"))
 26 | 
 27 | disease=names(comm_sample); dis.tissue=list();samplecut=25;
 28 | #per disease, keep the tissues whose counts in columns 2 and 3 both reach samplecut
 29 | for(dis in disease){
 30 | 	tmp=comm_sample[[dis]]; print(dis);
 31 | 	k=which(as.numeric(tmp[,2])>=samplecut & as.numeric(tmp[,3])>=samplecut)
 32 | 	dis.tissue[[dis]]=tmp[k,1];
 33 | }
 34 | disease.study=names(dis.tissue)[lapply(dis.tissue,length)>0]
 35 | 
 36 | 
 37 | tss1="Whole Blood"
 38 | files=list.files(path=folder.GSp,pattern=".Rdata");
 39 | 
 40 | preddisllr1=list();
 41 | 
 42 | #To run every disease/tissue pair, uncomment the next two lines and the
 43 | #"disease loop ends" brace at the bottom, and comment out the dis="MHHTN" line.
 44 | #for(dis in disease.study){
 45 | #tissue=dis.tissue[[dis]];
 46 | 
 47 | 
 48 | #default: one example disease and two target tissues
 49 | dis="MHHTN"; tissue=c("Artery - Tibial","Adipose - Subcutaneous");
 50 | for(tss in tissue){ #one prediction per target tissue
 51 | fl=files[grep(tss,files,fixed=TRUE)]
 52 | tss2=tss; print(tss2);
 53 | if(length(fl)!=1){ warning("expected one prediction file for ",tss,", found ",length(fl)); next }
 54 | #load predicted expression
 55 | load(paste0(folder.GSp,"/",fl))
 56 | mat.pred=gene.regress.glmnet[["prediction"]]; 
 57 | #fewer columns than genes: the column names are read as indices into gene
 58 | if(ncol(mat.pred)<length(gene)){   
 59 | nn=as.numeric(colnames(mat.pred)); colnames(mat.pred)=gene[nn]; patients=rownames(mat.pred); geneid=nn;
 60 | } else { colnames(mat.pred)=gene; patients=rownames(mat.pred); geneid=seq(length(gene))}
 61 | 
 62 | j1=c(); j2=c(); pat.com=c();
 63 | 	for (ipat in patients){
 64 | 		i1=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss1);
 65 | 		i2=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss2);
 66 | 		if (length(i1)>0 && length(i2)>0){ j1=append(j1,i1[1]); j2=append(j2,i2[1]); pat.com=append(pat.com,ipat);}
 67 | 				 }
 68 | 
 69 | #j1/j2: for each common patient, the first matching sample, used as a column of gtex.pc
 70 | if(length(pat.com)==0){ warning("no common patients for ",tss); next }
 71 | gtex.tss1=gtex.pc[,j1, with=FALSE]; #expr of blood
 72 | gtex.tss2=gtex.pc[,j2, with=FALSE]; #expr of target tissue
 73 | 
 74 | pat=pat.com; mhh2=pheno.dt[,dis,with=FALSE]; subj=pheno.subj
 75 | #status of each common patient; NA when the patient is absent from pheno.subj
 76 | istat=as.numeric(mhh2[[1]][match(pat,subj)])
 77 | #recode 0 -> -1 and 99 -> 0; only 1 and -1 are kept below (presumably case/control - confirm)
 78 | istat[which(istat==0)]=-1; istat[which(istat==99)]=0;
 79 | 
 80 | k=which(istat==1|istat==-1)
 81 | temp.p=t(mat.pred[pat.com,,drop=FALSE]) #prediction rows taken in pat.com order
 82 | temp.o=as.matrix(gtex.tss2); 
 83 | temp.p=temp.p[,k]; temp.o=temp.o[,k]; istat=istat[k]; blood.oo=as.matrix(gtex.tss1)[,k]; 
 84 | 
 85 | #LLR gene
 86 | llrfdr=0.05
 87 | fdr=p.adjust(gene.select.LLR[['pval']],method="BH")
 88 | x2=gene.regress.glmnet[["Mean-end"]]; predPCC=rowMeans(x2[["cor"]]);
 89 | llrgene=which(fdr<=llrfdr & predPCC>0.3);
 90 | 
 91 | #print(c(dis,tss,length(llrgene)))
 92 | 
 93 | #perform prediction only when there are more than 20 LLR genes
 94 | if(length(llrgene)>20){ 
 95 | 	temp.p1=temp.p[llrgene,]; temp.o1=temp.o[llrgene,]; blood.oo1=blood.oo[llrgene,];
 96 | 
 97 | 	ptm <- proc.time()
 98 | 	z=prediction_dis_CVfeature(temp.p1,temp.o1,blood.oo1,istat,nfold,ENS,geneid) #using age,race,gender
 99 | 	print(proc.time() - ptm)
100 | 	preddisllr1[[dis]][[tss2]][[toString(paste0(llrfdr))]]=z;
101 | 	} else { 
102 |   	preddisllr1[[dis]][[tss2]][[toString(paste0(llrfdr))]]=NA;
103 |   	}
104 | #saved after every tissue; the file name is fixed and does not follow ENS
105 | save(preddisllr1,file=paste0(workdir,"/output/given/dispred_ens50.Rdata"))
106 | 
107 | } #tissue loop ends
108 | #} #disease loop ends
109 | 
110 | 
111 | 
112 | 
113 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # TEEBoT 
 2 | ## (Tissue Expression Estimation using Blood Transcriptome)
 3 | 
 4 | TEEBot is a tool to predict tissue-specific gene expression (TSGE) from an individual’s blood transcriptome and demographic 
 5 | information (age, gender and race), with clinical implications. If the individual’s genotype information is available, it can boost the performance of TEEBot. 
 6 | We trained TEEBot on GTEx version 6 and evaluated its performance by cross-validation. For each gene in each target tissue, we first evaluate its predictability with a log-likelihood ratio (LLR) test and then fit a lasso regression model to estimate its TSGE. Our models require Whole Blood gene expression (WBGE), Whole Blood splicing (WBSp) information, and three demographic ‘confounding’ factors (Age, Race, and Sex), with genotype information as an additional option. 
 7 | 
 8 | ## Data download
 9 | Download data files from GTEx Portal and dbGaP. The data include gene expression, transcript expression, phenotype and genotype information (optional).
10 | 
11 | ## Main script
12 | The code to predict the gene expression of a target tissue consists of the following steps:
13 | 
14 | ### Model building 
15 | For each gene, the top PCs of the Whole Blood transcriptome (top 10 PCs for gene expression and top 20 PCs for splicing profile) are used as features to build a gene-specific model that predicts its expression in the target tissue. Five-fold cross-validation is performed for all genes across tissues. The lasso regression model is implemented using the cv.glmnet() function from the glmnet R package.
16 | 
17 | ### Measuring prediction accuracy 
18 | The prediction accuracy for each gene is evaluated using a Pearson correlation test between the predicted expression and the ground truth. A likelihood ratio test 
19 | is also performed for each gene to assess the independent contribution of the blood transcriptome beyond the 
20 | confounders (age, race and sex). For each gene we provide both its predictability score (in terms of the Pearson correlation coefficient)
21 | and the FDR of the LLR p-value. We only report the prediction accuracies of genes which pass the likelihood ratio 
22 | test (FDR<=0.05) and also have Pearson correlation coefficient values above a predefined threshold. 
23 | 
24 | 
25 | ## Code running instructions
26 | ### For prediction of target tissue
27 | Code: regression_model_articleoutput.r 
28 | 
29 | #### Input (workdir/input)
30 | The inputs are the gene expression, phenotype and genotype files downloaded from GTEx Portal and dbGaP. \
31 | i) Gene expression data: All_Tissue_Site_Details_Analysis.combined.rpkm.gct \
32 | ii) Phenotype data: phs000424.v6.pht002742.v6.p1.c1.GTEx_Subject_Phenotypes.GRU.txt \
33 | iii) Sample attribute: GTEx_Data_V6_Annotations_SampleAttributesDS.txt \
34 | iv) Gene property file from ncbi: Homo_sapiens.gene_info (downloaded from https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/  on march2016) \
35 | v) Genotype data: genotype.v6.chr1_22.Rdata \
36 | These files are read in the code within function "format_gtexdatav6()".
37 |  We also have an input file "tissue_pair_work_v6.RData" which contains a table with whole blood in the first column, 
38 |  the target tissue in the second column and the number of common samples in the third column. 
39 | All these files need to be kept within the "workdir/input" folder. The scripts source function.r from the "workdir/code" folder, so function.r needs to be kept there.
40 | 
41 | The input parameters to run the code are index, run, fold, ENS. 
42 | 1) "index" is the tissue id; the ids of the 32 target tissues are 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,32,34,37,39. 
43 | 2) "run" indicates which model to compute, M1 or M2. To compute model M1 use run=100, and for M2 use run=110.
44 | 3) "fold": the number of cross-validation folds.
45 | 4) "ENS": the number of independent prediction runs.
46 | 
47 | #### Output
48 | Each target tissue gets its own prediction file, for example "Whole Blood_xx_lasso_regress_G_Spl_10_20.Rdata", 
49 |         where xx is replaced by the target tissue name. 
50 | This Rdata file contains two objects, gene.regress.glmnet and gene.select.LLR.
51 | 1) gene.regress.glmnet : a list; gene.regress.glmnet[["Mean-first"]] contains our desired correlation coefficients, 
52 | that is, the predictability scores.
53 | 2) gene.select.LLR : a list with the likelihood ratio test results and their p-values
54 | 
55 | note: Download the code file (regression_model_articleoutput.r) and function.r into the same folder. An example path to store the
56 | codes is "/home/Desktop/Imputation". Within Imputation make two folders named "input" and "output". The output of the codes is automatically saved in the "output" folder.
57 | 
58 | #### Required Packages
59 | The following R packages need to be installed to run the code: 
60 | "glmnet", "data.table", "foreach", "doMC", "ROCR", "lmtest", "pracma".
61 | 


--------------------------------------------------------------------------------
/code/regression_model_articleoutput.r:
--------------------------------------------------------------------------------
  1 | #code to find predicted expression in target tissue
  2 | #Usage: Rscript regression_model_articleoutput.r <index> <run> <fold> <ENS>
  3 | dir="/home/Desktop" #example of dir that needs to be given 
  4 | workdir=paste0(dir,"/imputation"); 
  5 | 
  6 | setwd(workdir)
  7 | 
  8 | 
  9 | library(data.table)
 10 | args<-commandArgs(TRUE);
 11 | 
 12 | 
 13 | index=as.numeric(args[1]) #target tissue
 14 | run=toString(args[2]) #category
 15 | fold=as.numeric(args[3]); #fold
 16 | ENS=as.numeric(args[4]); #ENS
 17 | 
 18 | #Number of PCs used as features: 10 for gene expression (GE), 20 for splicing (Sp).
 19 | #They are fixed here because args[3] and args[4] already carry fold and ENS.
 20 | PC1=10; PC2=20;
 21 | PC3=as.numeric(args[5]); #Snp (not used in this script)
 22 | 
 23 | #workdir="/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6"
 24 | 
 25 | 
 26 | 
 27 | #loading functions
 28 | #source('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/function_regression.r')
 29 | source(paste0(workdir,'/code/function.r'));
 30 | 
 31 | #load gtex data 
 32 | format_gtexdatav6(paste0(workdir,"/input/")); 
 33 | #In this function we directly read gtex v6 phenotype data and gene expression data and merge them 
 34 | #into format convenient to be used in the code and create a gtexdata_v6.RData file in workdir/input/ folder.
 35 | load(paste0(workdir,"/input/gtexdata_v6.RData"));
 36 | 
 37 | #Common samples between blood and target tissue
 38 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/tissue_pair_work_v6.RData")
 39 | load(paste0(workdir,"/input/tissue_pair_work_v6.RData"));
 40 | 
 41 | #finding patient ids that have snps information
 42 | load(paste0(workdir,"/input/genotype.v6.chr1_22.Rdata")) #loading genotype data obtained from dbGaP
 43 | pat.snps=sapply(strsplit(colnames(genotype.v6[["chr2"]]), split="-"), '[[', 2)
 44 | 
 45 | 
 46 | gene=gtex.pc$Name
 47 | tss1="Whole Blood"
 48 | tss2=as.character(unlist(strsplit(tss_pair[index,],"Whole Blood"))[2]); #tss2: target tissue
 49 | allgeneid=as.numeric(seq(length(gene)))
 50 | 
 51 | comm.patients=patients[which(patients %in% pat.snps)]
 52 | 
 53 | #for common individuals seperate the gene expression for tss1 and tss2 
 54 | j1=c(); j2=c();
 55 | pat.com=c(); #common individuals between Blood (tss1) and target tissue (tss2)
 56 | for (ipat in comm.patients){
 57 | 	i1=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss1);
 58 | 	i2=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss2);
 59 | 	if (length(i1)>0 && length(i2)>0){ j1=append(j1,i1[1]); j2=append(j2,i2[1]); pat.com=append(pat.com,ipat);}}
 60 | #j1/j2: for each common individual, the first matching sample, used as a column of gtex.pc
 61 | if(length(pat.com)==0) stop("no individuals with both ",tss1," and ",tss2," samples")
 62 | gtex.tss1=gtex.pc[,j1, with=FALSE]; 
 63 | gtex.tss2=gtex.pc[,j2, with=FALSE]; 
 64 | 
 65 | 
 66 | #PCA for tss1, whole Blood gtex
 67 | #classical MDS on Euclidean distances between samples, keeping (number of samples - 1) dimensions
 68 | d = dist(t(gtex.tss1), method="euclidean"); mdsk=dim(t(gtex.tss1))[1]-1;
 69 | pcfit = cmdscale(d, eig=TRUE, k=mdsk);
 70 | gtex.tss1.pc=t(pcfit$points); #row:PCs; col: samples
 71 | colnames(gtex.tss1.pc)=pat.com;
 72 | 
 73 | #loading splicing PCs
 74 | #load('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/PC_splicing/PCcmd_splicing_blood.Rdata' )
 75 | load(paste0(workdir,"/input/PCcmd_splicing_blood.Rdata"))
 76 | splice.mat.pc=splicing.pc[[tss2]]
 77 | 
 78 | #confounders
 79 | x=match(pat.com,pheno.subj) #row of each common individual in pheno.dt
 80 | age=as.numeric(pheno.dt$AGE[x]);
 81 | race=as.numeric(pheno.dt$RACE[x]); race[race==99]=0;
 82 | gender=as.numeric(pheno.dt$GENDER[x]);
 83 | confund=cbind(age,race,gender)
 84 | 
 85 | 
 86 |  #row:PCs; col: samples
 87 | 
 88 | if(run=="110"){
 89 | #Gene + Splicing --------------------------------------------
 90 | ptm <- proc.time()
 91 | 
 92 | #outdir=paste0(workdir,"/output")
 93 | #outdir="/cbcb/project2-scratch/mbasu/gtex_v6/blood_cross_talk/gtex-v6/regress_result/lasso_geneexpr_snps_splicing_confound/data/lasso_cv_lambda/G_Sp_CF"
 94 | outdir=paste0(workdir,"/output/G_Sp_CF")
 95 | if(!file.exists(outdir))dir.create(outdir, recursive=TRUE) #also creates workdir/output when missing
 96 | str=sprintf("%s/%s_%s_lasso_regress_G_Spl_%d_%d.Rdata",outdir,tss1,tss2,PC1,PC2)
 97 | 
 98 | gene.regress.glmnet=regression_glmnet_gene_splicing(fold,ENS,PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,allgeneid,pat.com,"lasso",confund)
 99 | 
100 | gene.select.LLR=regression_lm_loglik_selectgene_2(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,"llr",confund)
101 | gene.select.LLR.Sp=regression_lm_loglik_selectgene_2Sp(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,"llr",confund)
102 | gene.select.LLR.GE=regression_lm_loglik_selectgene_2GE(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,"llr",confund)
103 | 
104 | save(gene.regress.glmnet,gene.select.LLR,gene.select.LLR.Sp,gene.select.LLR.GE,file=str)
105 | print(proc.time() - ptm)
106 | #------------------------------------------------------------
107 | }
108 | 
109 | 
110 | if(run=="100"){
111 | #Gene  ------------------------------------------------------
112 | #outdir="/cbcb/project2-scratch/mbasu/gtex_v6/blood_cross_talk/gtex-v6/regress_result/lasso_geneexpr_snps_splicing_confound/data/lasso_cv_lambda/G_CF"
113 | outdir=paste0(workdir,"/output/G_CF")
114 | if(!file.exists(outdir))dir.create(outdir, recursive=TRUE)
115 | str=sprintf("%s/%s_%s_lasso_regress_G_%d_%dfold_%dENS.Rdata",outdir,tss1,tss2,PC1,fold,ENS)
116 | 
117 | ptm <- proc.time()
118 | gene.regress.glmnet=regression_glmnet_gene(fold,ENS,PC1,gtex.tss1.pc,gtex.tss2,allgeneid,pat.com,"lasso",confund)
119 | 
120 | gene.select.LLR=regression_lm_loglik_selectgene_1(PC1,gtex.tss1.pc,gtex.tss2,"llr",confund)
121 | save(gene.regress.glmnet,gene.select.LLR,file=str)
122 | print(proc.time() - ptm)
123 | #------------------------------------------------------------
124 | }
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/code/regression_model_M1_M2.r:
--------------------------------------------------------------------------------
  1 | #code to find predicted expression in target tissue
  2 | 
  3 | 
  4 | regression_glmnet_gene_splicing_M2<-function(nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,genes,method,confund.x){
  5 | 	#Fit one glmnet model per gene on all samples, using the leading rows of
  6 | 	#temp.tss1 and temp.snps plus the confounders as features.
  7 | 	#  nPC1, nPC2 : number of leading rows taken from temp.tss1 and temp.snps
  8 | 	#  temp.tss1, temp.snps : feature matrices with samples in columns
  9 | 	#  temp.tss2  : target expression, one row per gene, samples in columns
 10 | 	#  genes      : names given to the returned models; model i uses row i of temp.tss2
 11 | 	#  method     : "lasso" (alpha=1) or "ridge" (alpha=0)
 12 | 	#  confund.x  : confounder matrix with samples in rows
 13 | 	#Returns a list of glmnet fits named by genes.
 14 | 
 15 | library(glmnet)
 16 | 
 17 | #stop on an unknown method rather than falling back to a global variable named index
 18 | index=switch(method,lasso=1,ridge=0,stop("method must be \"lasso\" or \"ridge\""))
 19 | 
 20 | 			gtex.train.tss2=as.matrix(temp.tss2); 
 21 | 			#transposed once so that samples are in rows, as glmnet expects
 22 | 			xmat=t(as.matrix(rbind(temp.tss1[1:nPC1,],temp.snps[1:nPC2,],t(confund.x))));
 23 | 
 24 | 			eachgene=lapply(seq_along(genes),function(geneid){
 25 | 				#pick lambda by 5-fold cross-validation, then refit at that single lambda
 26 | 				fit.cv=cv.glmnet(xmat,gtex.train.tss2[geneid,], alpha=index,nfolds=5)
 27 | 				glmnet(xmat,gtex.train.tss2[geneid,], alpha=index, lambda=fit.cv$lambda.min)
 28 | 				}) #gene loop ends
 29 | 				names(eachgene)=genes
 30 | return(eachgene)
 31 | }
 32 | 
 33 | regression_glmnet_gene_M1<-function(nPC,temp.tss1,temp.tss2,genes,method,confund.x){
 34 | 	#Fit one glmnet model per gene on all samples, using the leading rows of
 35 | 	#temp.tss1 plus the confounders as features.
 36 | 	#  nPC       : number of leading rows taken from temp.tss1 (samples in columns)
 37 | 	#  temp.tss2 : target expression, one row per gene, samples in columns
 38 | 	#  genes     : names given to the returned models; model i uses row i of temp.tss2
 39 | 	#  method    : "lasso" (alpha=1) or "ridge" (alpha=0)
 40 | 	#  confund.x : confounder matrix with samples in rows
 41 | 	#Returns a list of glmnet fits named by genes.
 42 | 
 43 | library(glmnet)
 44 | #stop on an unknown method rather than falling back to a global variable named index
 45 | index=switch(method,lasso=1,ridge=0,stop("method must be \"lasso\" or \"ridge\""))
 46 | 
 47 | 			gtex.train.tss2=as.matrix(temp.tss2); 
 48 | 			xmat=t(as.matrix(rbind(temp.tss1[1:nPC,],t(confund.x)))); #samples in rows, as glmnet expects
 49 | 
 50 | 			eachgene=lapply(seq_along(genes),function(geneid){
 51 | 				#pick lambda by 5-fold cross-validation, then refit at that single lambda
 52 | 				fit.cv=cv.glmnet(xmat,gtex.train.tss2[geneid,], alpha=index,nfolds=5)
 53 | 				glmnet(xmat,gtex.train.tss2[geneid,], alpha=index, lambda=fit.cv$lambda.min)
 54 | 				}) #gene loop ends
 55 | 				names(eachgene)=genes
 56 | return(eachgene)
 57 | }
 58 | 
 59 | 
 60 | 
 61 | 
 62 | 
 63 | dir="/home/Desktop" #example of dir that needs to be given 
 64 | workdir=paste0(dir,"/imputation"); 
 65 | 
 66 | setwd(workdir)
 67 | 
 68 | 
 69 | library(data.table)
 70 | args<-commandArgs(TRUE);
 71 | 
 72 | 
 73 | index=as.numeric(args[1]) #target tissue; overwritten by the tissue loop below
 74 | run=toString(args[2]) #category
 75 | #fold=as.numeric(args[3]); #fold
 76 | #ENS=as.numeric(args[4]); #ENS
 77 | 
 78 | #PC1=as.numeric(args[3]); #GE
 79 | #PC2=as.numeric(args[4]); #Sp
 80 | #PC3=as.numeric(args[5]); #Snp
 81 | PC1=10; PC2=20; 
 82 | 
 83 | #workdir="/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6"
 84 | 
 85 | 
 86 | 
 87 | #loading functions
 88 | #source('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/function_regression.r')
 89 | #source(paste0(workdir,'/code/function.r'));
 90 | 
 91 | #load gtex data 
 92 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/data/gtexdata_v6.RData")
 93 | load(paste0(workdir,"/input/gtexdata_v6.RData"));
 94 | 
 95 | #Common samples between blood and target tissue
 96 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/tissue_pair_work_v6.RData")
 97 | load(paste0(workdir,"/input/tissue_pair_work_v6.RData"));
 98 | 
 99 | #loading patient ids that have snps information
100 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/PC_SNPs/snps_sample_v6.RData")
101 | load(paste0(workdir,"/input/snps_sample_v6.RData"));
102 | 
103 | 
104 | gene=gtex.pc$Name
105 | tss1="Whole Blood"
106 | for(index in c(1,2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 32, 34, 37, 39)){
107 | tss2=as.character(unlist(strsplit(tss_pair[index,],"Whole Blood"))[2]); #tss2: target tissue
108 | allgeneid=as.numeric(seq(length(gene)))
109 | print(tss2);
110 | comm.patients=patients[which(patients %in% pat.snps)]
111 | 
112 | #for common individuals seperate the gene expression for tss1 and tss2 
113 | j1=c(); j2=c();
114 | pat.com=c(); #common individuals between Blood (tss1) and target tissue (tss2)
115 | for (ipat in comm.patients){
116 | 	i1=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss1);
117 | 	i2=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss2);
118 | 	if (length(i1)>0 && length(i2)>0){ j1=append(j1,i1[1]); j2=append(j2,i2[1]); pat.com=append(pat.com,ipat);}}
119 | #j1/j2: for each common individual, the first matching sample, used as a column of gtex.pc
120 | if(length(pat.com)==0){ warning("no common individuals for ",tss2); next }
121 | gtex.tss1=gtex.pc[,j1, with=FALSE]; 
122 | gtex.tss2=gtex.pc[,j2, with=FALSE]; 
123 | 
124 | 
125 | #PCA for tss1, whole Blood gtex
126 | #classical MDS on Euclidean distances between samples, keeping (number of samples - 1) dimensions
127 | d = dist(t(gtex.tss1), method="euclidean"); mdsk=dim(t(gtex.tss1))[1]-1;
128 | pcfit = cmdscale(d, eig=TRUE, k=mdsk);
129 | gtex.tss1.pc=t(pcfit$points); #row:PCs; col: samples
130 | colnames(gtex.tss1.pc)=pat.com;
131 | 
132 | #loading splicing PCs (the file is read on the first tissue only; later tissues reuse splicing.pc)
133 | #load('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/PC_splicing/PCcmd_splicing_blood.Rdata' )
134 | if(!exists("splicing.pc")) load(paste0(workdir,"/input/PCcmd_splicing_blood.Rdata"))
135 | splice.mat.pc=splicing.pc[[tss2]]
136 | 
137 | #confounders
138 | x=match(pat.com,pheno.subj) #row of each common individual in pheno.dt
139 | age=as.numeric(pheno.dt$AGE[x]);
140 | race=as.numeric(pheno.dt$RACE[x]); race[race==99]=0;
141 | gender=as.numeric(pheno.dt$GENDER[x]);
142 | confund=cbind(age,race,gender)
143 | 
144 | 
145 |  #row:PCs; col: samples
146 | 
147 | if(run=="110"){
148 | #Gene + Splicing --------------------------------------------
149 | outdir=paste0(workdir,"/output/Model/M2")
150 | if(!file.exists(outdir))dir.create(outdir, recursive = TRUE)
151 | str=sprintf("%s/%s_lasso_regress_G_Spl_%d_%d.Rdata",outdir,tss2,PC1,PC2)
152 | ptm <- proc.time()
153 | 
154 | modelM2=regression_glmnet_gene_splicing_M2(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,gene,"lasso",confund)
155 | 
156 | save(modelM2,file=str)
157 | print(proc.time() - ptm)
158 | #------------------------------------------------------------
159 | }
160 | 
161 | 
162 | if(run=="100"){
163 | #Gene  ------------------------------------------------------
164 | outdir=paste0(workdir,"/output/Model/M1")
165 | if(!file.exists(outdir))dir.create(outdir, recursive = TRUE)
166 | str=sprintf("%s/%s_lasso_regress_G_%d.Rdata",outdir,tss2,PC1)
167 | 
168 | ptm <- proc.time()
169 | modelM1=regression_glmnet_gene_M1(PC1,gtex.tss1.pc,gtex.tss2,gene,"lasso",confund)
170 | save(modelM1,file=str)
171 | 
172 | print(proc.time() - ptm)
173 | #------------------------------------------------------------
174 | }
175 | }
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 


--------------------------------------------------------------------------------
/code/function.r:
--------------------------------------------------------------------------------
  1 | format_gtexdatav6<-function(dirx){
  2 | #Read the raw GTEx v6 input files found in dirx, merge them into the objects used by
  3 | #the regression scripts and save them as gtexdata_v6.RData inside dirx.
  4 | #dirx: path of the input folder, ending with "/" (file names are appended with paste0).
  5 | #Saved objects: gtex.pc, expc.nt, pheno.dt, pheno.subj, patients.
  6 | library(data.table)
  7 | 	
  8 | dt=fread(paste0(dirx,"phs000424.v6.pht002742.v6.p1.c1.GTEx_Subject_Phenotypes.GRU.txt")) #phenotype data 
  9 | pheno.dt=dt[-1,]
 10 | setnames(pheno.dt,as.character(dt[1,])) #the first row read by fread carries the column names
 11 | #subject id: last "-"-separated field of SUBJID
 12 | pheno.subj=sapply(1:nrow(pheno.dt), function(x){ y=unlist(strsplit(pheno.dt$SUBJID[x], split="-")); l=length(y); y[l] })
 13 | 
 14 | #gene expression
 15 | rpkm=fread(paste0(dirx,"All_Tissue_Site_Details_Analysis.combined.rpkm.gct")) #gene expression data
 16 | rpkm$Name = sapply(strsplit(rpkm$Name, '[.]'), '[[', 1) #keep the part of Name before the first "."
 17 | 
 18 | rpkm.col = colnames(rpkm); #gtex.row = rownames(gtex);
 19 | patients = unique(sapply(strsplit(rpkm.col[-(1:2)], split="-"), '[[', 2))
 20 | 
 21 | #sample annotation, restricted to the samples present in rpkm and kept in rpkm column order
 22 | annt=fread(paste0(dirx,"GTEx_Data_V6_Annotations_SampleAttributesDS.txt"))
 23 | tmp = data.table(SAMPID=rpkm.col[-(1:2)], myid = 1:(length(rpkm.col) -2)); 
 24 | setkey(tmp, SAMPID); 
 25 | setkey(annt, SAMPID);
 26 | annt = merge(x=tmp, y=annt, by="SAMPID"); 
 27 | annt$patient = sapply(strsplit(annt$SAMPID, split="-"), '[[', 2)
 28 | annt1=annt[order(annt$myid),]
 29 | expc.nt=annt1
 30 | #========================================================================
 31 | 
 32 | #=======================================================================
 33 | #### gene information to identify protein coding genes
 34 | require(pracma)
 35 | gene=fread(paste0(dirx,"Homo_sapiens.gene_info")) 
 36 | gene.symbol=gene$V3
 37 | 
 38 | gene.IDs=gene$V6 
 39 | ensmbl=array("", c(length(gene.IDs),1))
 40 | #take the 15 characters that follow "Ensembl:" in column V6; entries without it stay ""
 41 | for (i in seq(length(gene.IDs))){
 42 | 	gIDs=gene.IDs[i]
 43 | 	tt=strfind(gIDs, 'Ensembl:', overlap = TRUE)
 44 | 	if (!is.null(tt)){
 45 | 		ensmbl[i]=substr(gIDs, tt+8, tt+22)
 46 | 	}
 47 | }
 48 | 
 49 | gene[, c("Name") := ensmbl]
 50 | 
 51 | tmp = data.table(Name=gene$Name, myid = 1:dim(gene)[1]); setkey(tmp, Name); 
 52 | newrpkm = merge(x=tmp, y=rpkm, by="Name") 
 53 | 
 54 | tmp = data.table(Name=rpkm$Name, myid = 1:dim(rpkm)[1]); setkey(tmp, Name);
 55 | newgene = merge(x=tmp, y=gene, by="Name") #contains the newgene with only those which are in gtex
 56 |   
 57 | newrpkm$myid = newgene$myid; 
 58 | #the order of genes in newrpkm and newgene are same, checked using which(newrpkm$myid!=newgene$myid)
 59 | newrpkm$geneSym = newgene$V3 #gene name
 60 | newrpkm$geneFun = newgene$V9; #gene description
 61 | newrpkm$geneKnd = newgene$V10; #gene type
 62 | newrpkm$chromosome = newgene$V7; #chromosome
 63 | 
 64 | newrpkm.pc = newrpkm[!is.na(newrpkm$geneKnd) & newrpkm$geneKnd=="protein-coding" ] #only protein coding genes
 65 | #keep only the first row for every gene Name (a no-op when no Name is duplicated)
 66 | newrpkm.pc=newrpkm.pc[!duplicated(newrpkm.pc$Name)]
 67 | 
 68 | #split off columns 1-3 and the last four, then re-append them after the remaining columns
 69 | l=dim(newrpkm.pc)[2]; 
 70 | gtex.pc1=newrpkm.pc[,c(-1,-2,-3,-(l-3),-(l-2),-(l-1),-l),with=FALSE]
 71 | extr.gtex.pc=newrpkm.pc[,c(1,2,3,(l-3),(l-2),(l-1),l),with=FALSE]
 72 | gtex.pc=cbind(gtex.pc1,extr.gtex.pc)
 73 | 
 74 | #dirx already is the input folder, so the file name is appended directly
 75 | save(gtex.pc,expc.nt,pheno.dt,pheno.subj,patients,file=paste0(dirx,"gtexdata_v6.RData"))
 76 | }
 77 | 
 78 | 
 79 | 
 80 | regression_glmnet_gene<-function(nfold,ENS,nPC,temp.tss1,temp.tss2,genes,pat.com,method,confund.x){
 81 | #Cross-validated glmnet prediction of each gene in temp.tss2 from the leading rows of temp.tss1 and confounders.
 82 | #nfold: number of CV folds; ENS: number of repeated random partitions; nPC: leading rows of temp.tss1 used
 83 | #temp.tss1: features x samples; temp.tss2: data.table, genes x samples; genes: row indices into temp.tss2
 84 | #pat.com: sample ids; method: "lasso" (alpha=1) or "ridge" (alpha=0); confund.x: samples x confounders
 85 | #Returns a list: "Mean-first" (cor/pval of the ENS-averaged prediction), "Mean-end" (per-run cor and pval),
 86 | #"prediction" (samples x genes), "sampling" (one permutation per run; last row = fold labels), "lambda".
 87 | require(foreach)
 88 | require(glmnet)
 89 | index=switch(method,lasso=1,ridge=0,stop("method must be \"lasso\" or \"ridge\"")) #no fallback to a global index
 90 | 	k=floor(length(pat.com)/nfold); #fold size; round(x-.5) rounds half to even and can come out one too small
 91 | 	folds=rep(c(1:nfold),c(rep(k,(nfold-1)),(length(pat.com)-(nfold-1)*k)))
 92 | 	avg.pred=matrix(0,length(pat.com),length(genes))
 93 | 	corr.perENS=NULL; pval.perENS=NULL;
 94 | 	sample.matrix=matrix(0,ENS+1,length(pat.com))
 95 | 	colnames(sample.matrix)=pat.com; sample.matrix[ENS+1,]=folds
 96 | 	lambda.matrix=matrix(0,length(genes),ENS*nfold)
 97 | 	jj=0;
 98 | for(ee in seq(ENS)){
 99 | 	sam=sample(1:length(pat.com),length(pat.com),replace=FALSE); sample.matrix[ee,]=sam;
100 | 	temp.pred=matrix(0,length(pat.com),length(genes))	
101 | 	print(ee)
102 | 
103 | for(ifold in seq(nfold)){ print(paste0("e",ee,"f",ifold)); jj=jj+1;
104 | 	test=sam[which(folds==ifold)]
105 | 	train=sam[which(folds!=ifold)]
106 | 	gtex.train.tss2=as.matrix(temp.tss2[,train,with=FALSE]); gtex.test.tss2=as.matrix(temp.tss2[,test,with=FALSE]);
107 | 	gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC,train],t(confund.x[train,]))); 
108 |         gtex.test.tss1.pc =as.matrix(rbind(temp.tss1[1:nPC,test],t(confund.x[test,])));
109 | 
110 | #per gene: choose lambda by inner 5-fold CV on the training samples, refit, predict the held-out samples
111 | #eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='cbind') %dopar% {
112 | 	eachgene=foreach(itr=1:length(genes),.inorder=TRUE,.combine='cbind') %do% {
113 | 
114 | 		geneid=genes[itr];
115 | 		fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5); 
116 | 		lam=fit.cv$lambda.min; 
117 | 		fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, lambda=lam)
118 | 		pred=predict(fit1,newx=t(gtex.test.tss1.pc))
119 | 		return(c(pred,lam))
120 | 		} #gene loop ends
121 | temp.pred[test,]=eachgene[1:length(test),]; lambda.matrix[,jj]=eachgene[length(test)+1,];
122 | } #fold loop ends
123 | 
124 | avg.pred=avg.pred+temp.pred; 
125 | 
126 | temp.tss22=temp.tss2[genes,]
127 | #temp=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %dopar% {
128 | 	temp=foreach(itr=1:nrow(temp.tss22),.inorder=TRUE,.combine='rbind') %do% {
129 | 
130 | x=cor.test(as.numeric(temp.tss22[itr,]),temp.pred[,itr])
131 | c(x$estimate,x$p.value) }
132 | 
133 | corr.perENS=cbind(corr.perENS,temp[,1])
134 | pval.perENS=cbind(pval.perENS,temp[,2])
135 | 
136 | } #ENS  loop ends
137 | avg.pred=avg.pred/ENS
138 | 
139 | temp.tss22=temp.tss2[genes,]
140 | #corr=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %dopar% {
141 | 	corr=foreach(itr=1:nrow(temp.tss22),.inorder=TRUE,.combine='rbind') %do% {
142 | 
143 | x=cor.test(as.numeric(temp.tss22[itr,]),avg.pred[,itr])
144 | c(x$estimate,x$p.value)
145 | }
146 | 
147 | rownames(avg.pred)=pat.com
148 | colnames(corr)=c("cor","pval");
149 | 
150 | result=list()
151 | result[["Mean-first"]]=corr
152 | result[["Mean-end"]][["cor"]]=corr.perENS
153 | result[["Mean-end"]][["pval"]]=pval.perENS #per-run p-values, not a second copy of the correlations
154 | result[["prediction"]]=avg.pred
155 | result[["sampling"]]=sample.matrix
156 | result[["lambda"]]=lambda.matrix
157 | return(result)
158 | 
159 | }
160 | 
161 | 
162 | regression_glmnet_gene_splicing<-function(nfold,ENS,nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,genes,pat.com,method,confund.x,cores=25){
163 | #Cross-validated glmnet prediction of each gene in temp.tss2 from the leading rows of temp.tss1 and temp.snps plus confounders.
164 | #nfold: number of CV folds; ENS: number of repeated random partitions; nPC1/nPC2: leading rows of temp.tss1/temp.snps used
165 | #temp.tss1, temp.snps: features x samples; temp.tss2: data.table, genes x samples; genes: row indices into temp.tss2
166 | #pat.com: sample ids; method: "lasso" (alpha=1) or "ridge" (alpha=0); confund.x: samples x confounders
167 | #cores: number of workers registered with doMC for the %dopar% loops (default 25, the former fixed value)
168 | #Returns a list: "Mean-first", "Mean-end" (per-run cor and pval), "prediction", "sampling", "lambda".
169 | require(foreach); require(glmnet)
170 | library(doMC); registerDoMC(cores=cores)
171 | index=switch(method,lasso=1,ridge=0,stop("method must be \"lasso\" or \"ridge\"")) #no fallback to a global index
172 | 	k=floor(length(pat.com)/nfold); #fold size; round(x-.5) rounds half to even and can come out one too small
173 | 	folds=rep(c(1:nfold),c(rep(k,(nfold-1)),(length(pat.com)-(nfold-1)*k)))
174 | 	avg.pred=matrix(0,length(pat.com),length(genes))
175 | 	corr.perENS=NULL; pval.perENS=NULL;
176 | 	sample.matrix=matrix(0,ENS+1,length(pat.com))
177 | 	colnames(sample.matrix)=pat.com; sample.matrix[ENS+1,]=folds
178 | 	lambda.matrix=matrix(0,length(genes),ENS*nfold)
179 | 	jj=0;
180 | 
181 | for(ee in seq(ENS)){
182 | 	sam=sample(1:length(pat.com),length(pat.com),replace=FALSE);sample.matrix[ee,]=sam;
183 | 	temp.pred=matrix(0,length(pat.com),length(genes))	
184 | 	#print(ee)
185 | 
186 | 		for(ifold in seq(nfold)){ print(paste0("e",ee,"f",ifold));
187 | 	    	jj=jj+1;
188 | 			test=sam[which(folds==ifold)];
189 | 			train=sam[which(folds!=ifold)];
190 | 			gtex.train.tss2=as.matrix(temp.tss2[,train,with=FALSE]); gtex.test.tss2=as.matrix(temp.tss2[,test,with=FALSE]);
191 | 			gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC1,train],temp.snps[1:nPC2,train],t(confund.x[train,]))); 
192 | 	        gtex.test.tss1.pc =as.matrix(rbind(temp.tss1[1:nPC1,test],temp.snps[1:nPC2,test],t(confund.x[test,])));
193 | 			#per gene: choose lambda by inner 5-fold CV on the training samples, refit, predict the held-out samples
194 | 
195 | 			eachgene=foreach(itr=1:length(genes),.inorder=TRUE,.combine='cbind') %dopar% {
196 | 				geneid=genes[itr];
197 | 				fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5)
198 | 				lam=fit.cv$lambda.min
199 | 				fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, lambda=lam)
200 | 				pred=predict(fit1,newx=t(gtex.test.tss1.pc))
201 | 				return(c(pred,lam))
202 | 				} #gene loop ends
203 | 		temp.pred[test,]=eachgene[1:length(test),]; lambda.matrix[,jj]=eachgene[length(test)+1,];
204 | 		} #fold loop ends
205 | 
206 | 	avg.pred=avg.pred+temp.pred; 
207 | 	temp.tss22=temp.tss2[genes,];
208 | 	temp=foreach(itr=1:nrow(temp.tss22),.inorder=TRUE,.combine='rbind') %dopar% {
209 | 	x=cor.test(as.numeric(temp.tss22[itr,]),temp.pred[,itr]);
210 | 	c(x$estimate,x$p.value) }
211 | 
212 | 	corr.perENS=cbind(corr.perENS,temp[,1])
213 | 	pval.perENS=cbind(pval.perENS,temp[,2])
214 | 
215 | } #ENS  loop ends
216 | 
217 | avg.pred=avg.pred/ENS
218 | temp.tss22=temp.tss2[genes,]
219 | corr=foreach(itr=1:nrow(temp.tss22),.inorder=TRUE,.combine='rbind') %dopar% {
220 | x=cor.test(as.numeric(temp.tss22[itr,]),avg.pred[,itr])
221 | c(x$estimate,x$p.value)
222 | }
223 | 
224 | rownames(avg.pred)=pat.com
225 | colnames(corr)=c("cor","pval");
226 | 
227 | result=list()
228 | result[["Mean-first"]]=corr
229 | result[["Mean-end"]][["cor"]]=corr.perENS
230 | result[["Mean-end"]][["pval"]]=pval.perENS #per-run p-values, not a second copy of the correlations
231 | result[["prediction"]]=avg.pred
232 | result[["sampling"]]=sample.matrix
233 | result[["lambda"]]=lambda.matrix
234 | return(result)
235 | 
236 | } #function ends
237 | 
238 | 
# Likelihood-ratio test of two nested lm fits per response row:
#   H1: response ~ GE + Sp + CF     versus     H0: response ~ CF
# i.e. the joint contribution of the two predictor blocks beyond the
# confounders.
#
# Arguments:
#   nPC1       number of leading rows of temp.tss1 used as predictors (GE)
#   nPC2       number of leading rows of temp.snps used as predictors (Sp)
#   temp.tss1  predictor block 1; rows = components, columns = observations
#   temp.snps  predictor block 2; rows = components, columns = observations
#   temp.tss2  responses; one response per row, columns = observations
#   method     not used by this function; kept for call compatibility
#   confund.x  confounders (CF); rows = observations
#
# Returns a list with two entries:
#   "loglikelihood-ratio"  list of lmtest::lrtest() tables, one per response
#   "pval"                 numeric vector with the "Pr(>Chisq)" value of each
#                          test, in the row order of temp.tss2
regression_lm_loglik_selectgene_2 <- function(nPC1, nPC2, temp.tss1, temp.snps,
                                              temp.tss2, method, confund.x) {
  # Fail early with a clear message instead of a later "could not find
  # function" error when the package is missing.
  if (!requireNamespace("lmtest", quietly = TRUE)) {
    stop("package 'lmtest' is required for the likelihood-ratio test",
         call. = FALSE)
  }

  # Leading n rows of a block, transposed to observations x components.
  # drop = FALSE keeps a single component as a one-column matrix, and
  # seq_len() makes n == 0 contribute no columns.
  leading_rows_t <- function(block, n) {
    t(as.matrix(block)[seq_len(n), , drop = FALSE])
  }

  xH1 <- as.matrix(cbind(leading_rows_t(temp.tss1, nPC1),
                         leading_rows_t(temp.snps, nPC2),
                         confund.x))
  xH0 <- as.matrix(confund.x)
  y <- as.matrix(t(temp.tss2))

  n_resp <- ncol(y)
  LR <- vector("list", n_resp)
  pval <- numeric(n_resp)

  for (itr in seq_len(n_resp)) {
    fit1 <- lm(y[, itr] ~ xH1)
    fit0 <- lm(y[, itr] ~ xH0)
    LR[[itr]] <- lmtest::lrtest(fit1, fit0)
    # Row 2 of the table holds the comparison of fit0 against fit1.
    pval[itr] <- LR[[itr]]$"Pr(>Chisq)"[2]
  }

  list("loglikelihood-ratio" = LR, pval = pval)
}
258 | 
# Likelihood-ratio test of two nested lm fits per response row:
#   H1: response ~ GE + Sp + CF     versus     H0: response ~ GE + CF
# i.e. the contribution of the second predictor block (Sp, splicing) once
# the first block and the confounders are already in the model.
#
# Arguments:
#   nPC1       number of leading rows of temp.tss1 used as predictors (GE)
#   nPC2       number of leading rows of temp.snps used as predictors (Sp)
#   temp.tss1  predictor block 1; rows = components, columns = observations
#   temp.snps  predictor block 2; rows = components, columns = observations
#   temp.tss2  responses; one response per row, columns = observations
#   method     not used by this function; kept for call compatibility
#   confund.x  confounders (CF); rows = observations
#
# Returns a list with two entries:
#   "loglikelihood-ratio"  list of lmtest::lrtest() tables, one per response
#   "pval"                 numeric vector with the "Pr(>Chisq)" value of each
#                          test, in the row order of temp.tss2
regression_lm_loglik_selectgene_2Sp <- function(nPC1, nPC2, temp.tss1,
                                                temp.snps, temp.tss2, method,
                                                confund.x) {
  # Fail early with a clear message instead of a later "could not find
  # function" error when the package is missing.
  if (!requireNamespace("lmtest", quietly = TRUE)) {
    stop("package 'lmtest' is required for the likelihood-ratio test",
         call. = FALSE)
  }

  # Leading n rows of a block, transposed to observations x components.
  # drop = FALSE keeps a single component as a one-column matrix, and
  # seq_len() makes n == 0 contribute no columns.
  leading_rows_t <- function(block, n) {
    t(as.matrix(block)[seq_len(n), , drop = FALSE])
  }

  ge <- leading_rows_t(temp.tss1, nPC1)
  sp <- leading_rows_t(temp.snps, nPC2)

  xH1 <- as.matrix(cbind(ge, sp, confund.x))
  xH0 <- as.matrix(cbind(ge, confund.x))
  y <- as.matrix(t(temp.tss2))

  n_resp <- ncol(y)
  LR <- vector("list", n_resp)
  pval <- numeric(n_resp)

  for (itr in seq_len(n_resp)) {
    fit1 <- lm(y[, itr] ~ xH1)
    fit0 <- lm(y[, itr] ~ xH0)
    LR[[itr]] <- lmtest::lrtest(fit1, fit0)
    # Row 2 of the table holds the comparison of fit0 against fit1.
    pval[itr] <- LR[[itr]]$"Pr(>Chisq)"[2]
  }

  list("loglikelihood-ratio" = LR, pval = pval)
}
279 | 
280 | 
# Likelihood-ratio test of two nested lm fits per response row:
#   H1: response ~ GE + Sp + CF     versus     H0: response ~ Sp + CF
# i.e. the contribution of the first predictor block (GE, gene expression)
# once the second block and the confounders are already in the model.
#
# Arguments:
#   nPC1       number of leading rows of temp.tss1 used as predictors (GE)
#   nPC2       number of leading rows of temp.snps used as predictors (Sp)
#   temp.tss1  predictor block 1; rows = components, columns = observations
#   temp.snps  predictor block 2; rows = components, columns = observations
#   temp.tss2  responses; one response per row, columns = observations
#   method     not used by this function; kept for call compatibility
#   confund.x  confounders (CF); rows = observations
#
# Returns a list with two entries:
#   "loglikelihood-ratio"  list of lmtest::lrtest() tables, one per response
#   "pval"                 numeric vector with the "Pr(>Chisq)" value of each
#                          test, in the row order of temp.tss2
regression_lm_loglik_selectgene_2GE <- function(nPC1, nPC2, temp.tss1,
                                                temp.snps, temp.tss2, method,
                                                confund.x) {
  # Fail early with a clear message instead of a later "could not find
  # function" error when the package is missing.
  if (!requireNamespace("lmtest", quietly = TRUE)) {
    stop("package 'lmtest' is required for the likelihood-ratio test",
         call. = FALSE)
  }

  # Leading n rows of a block, transposed to observations x components.
  # drop = FALSE keeps a single component as a one-column matrix, and
  # seq_len() makes n == 0 contribute no columns.
  leading_rows_t <- function(block, n) {
    t(as.matrix(block)[seq_len(n), , drop = FALSE])
  }

  ge <- leading_rows_t(temp.tss1, nPC1)
  sp <- leading_rows_t(temp.snps, nPC2)

  xH1 <- as.matrix(cbind(ge, sp, confund.x))
  xH0 <- as.matrix(cbind(sp, confund.x))
  y <- as.matrix(t(temp.tss2))

  n_resp <- ncol(y)
  LR <- vector("list", n_resp)
  pval <- numeric(n_resp)

  for (itr in seq_len(n_resp)) {
    fit1 <- lm(y[, itr] ~ xH1)
    fit0 <- lm(y[, itr] ~ xH0)
    LR[[itr]] <- lmtest::lrtest(fit1, fit0)
    # Row 2 of the table holds the comparison of fit0 against fit1.
    pval[itr] <- LR[[itr]]$"Pr(>Chisq)"[2]
  }

  list("loglikelihood-ratio" = LR, pval = pval)
}
301 | 
302 | 
# Likelihood-ratio test of two nested lm fits per response row:
#   H1: response ~ X + CF     versus     H0: response ~ CF
# where X is a single predictor block (GE, Sp or SNP components), i.e. the
# contribution of that block beyond the confounders.
#
# Arguments:
#   nPC        number of leading rows of temp.tss1 used as predictors
#   temp.tss1  predictor block; rows = components, columns = observations
#   temp.tss2  responses; one response per row, columns = observations
#   method     not used by this function; kept for call compatibility
#   confund.x  confounders (CF); rows = observations
#
# Returns a list with two entries:
#   "loglikelihood-ratio"  list of lmtest::lrtest() tables, one per response
#   "pval"                 numeric vector with the "Pr(>Chisq)" value of each
#                          test, in the row order of temp.tss2
regression_lm_loglik_selectgene_1 <- function(nPC, temp.tss1, temp.tss2,
                                              method, confund.x) {
  # Fail early with a clear message instead of a later "could not find
  # function" error when the package is missing.
  if (!requireNamespace("lmtest", quietly = TRUE)) {
    stop("package 'lmtest' is required for the likelihood-ratio test",
         call. = FALSE)
  }

  # Leading nPC rows, transposed to observations x components.
  # drop = FALSE keeps nPC == 1 as a one-column matrix, and seq_len()
  # makes nPC == 0 contribute no columns.
  block <- t(as.matrix(temp.tss1)[seq_len(nPC), , drop = FALSE])

  xH1 <- as.matrix(cbind(block, confund.x))
  xH0 <- as.matrix(confund.x)
  y <- as.matrix(t(temp.tss2))

  n_resp <- ncol(y)
  LR <- vector("list", n_resp)
  pval <- numeric(n_resp)

  for (itr in seq_len(n_resp)) {
    fit1 <- lm(y[, itr] ~ xH1)
    fit0 <- lm(y[, itr] ~ xH0)
    LR[[itr]] <- lmtest::lrtest(fit1, fit0)
    # Row 2 of the table holds the comparison of fit0 against fit1.
    pval[itr] <- LR[[itr]]$"Pr(>Chisq)"[2]
  }

  list("loglikelihood-ratio" = LR, pval = pval)
}
323 | 
324 | 
325 | 
# Compare how well three feature matrices separate cases from controls.
#
# For each of ENS independent runs the case and control samples are split at
# random into nfold folds, separately per class. For every held-out fold a
# lasso model (lambda chosen by 4-fold cv.glmnet) is trained on the remaining
# samples and its AUC on the held-out samples is computed, once for each of
# expr.pre, expr.org and blood.expr.
#
# Arguments:
#   expr.pre    feature matrix 1 ("predicted"); rows = features,
#               columns = samples in the order of istat.x
#   expr.org    feature matrix 2 ("original"); same layout
#   blood.expr  feature matrix 3 ("blood"); same layout
#   istat.x     sample labels: 1 = case, -1 = control; other values are
#               left out of both training and testing
#   nfold       number of cross-validation folds (>= 2)
#   ENS         number of independent runs (>= 1)
#   geneid.x    not used by this function; kept for call compatibility
#
# Returns a list with
#   "predicted-AUC", "original-AUC", "Blood-AUC"
#               numeric vectors of length nfold * ENS, ordered run by run
#               and, within a run, fold by fold
#   "ensfold"   matrix with one column per run holding the fold label of
#               every sample; rows list all cases first (in the order of
#               which(istat.x == 1)), then all controls
#
# Stops with an error when either class has fewer than 5 * nfold samples,
# because no fold assignment can then satisfy the per-fold minimum.
prediction_dis_CVfeature <- function(expr.pre, expr.org, blood.expr, istat.x,
                                     nfold, ENS, geneid.x) {
  # Every fold must contain more than 4 samples of each class.
  min_per_fold <- 5

  ind3_d <- which(istat.x == 1)
  ind3_n <- which(istat.x == -1)

  if (length(nfold) != 1L || is.na(nfold) || nfold < 2) {
    stop("nfold must be a single number >= 2", call. = FALSE)
  }
  if (length(ENS) != 1L || is.na(ENS) || ENS < 1) {
    stop("ENS must be a single number >= 1", call. = FALSE)
  }
  if (min(length(ind3_d), length(ind3_n)) < min_per_fold * nfold) {
    # Without this check the fold-drawing loop below would never terminate.
    stop("need at least ", min_per_fold * nfold,
         " case and control samples each for ", nfold, " folds (found ",
         length(ind3_d), " cases and ", length(ind3_n), " controls)",
         call. = FALSE)
  }

  library(data.table)
  library(ROCR)
  library(glmnet)
  library(foreach)

  expr.pre <- as.matrix(expr.pre)
  expr.org <- as.matrix(expr.org)
  blood.expr <- as.matrix(blood.expr)

  # Train a lasso on the training samples and return its AUC on the test
  # samples; lambda is the cv.glmnet minimiser.
  cvlasso <- function(x_train, x_test, y_train, y_test) {
    fit.cv <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 4)
    fit <- glmnet(x_train, y_train, alpha = 1, lambda = fit.cv$lambda.min)
    score <- predict(fit, newx = x_test)
    auc <- performance(prediction(score[, 1], y_test), "auc")
    auc@y.values[[1]]
  }

  # Redraw random fold labels until every fold has at least min_per_fold
  # cases and min_per_fold controls. Prints the attempt counter and a 0/1
  # success flag after every draw.
  draw_folds <- function() {
    itag <- 0
    repeat {
      folds_d <- sample(1:nfold, length(ind3_d), replace = TRUE)
      folds_n <- sample(1:nfold, length(ind3_n), replace = TRUE)
      ok <- all(tabulate(folds_d, nbins = nfold) >= min_per_fold) &&
        all(tabulate(folds_n, nbins = nfold) >= min_per_fold)
      itag <- itag + 1
      print(c(itag, as.numeric(ok)))
      if (ok) {
        return(list(d = folds_d, n = folds_n))
      }
    }
  }

  # One complete run: draw folds, then score the three matrices per fold.
  # Wrapped in a function so every run starts from empty accumulators;
  # %do% evaluates its body in the calling frame, so variables assigned
  # directly inside the loop body would carry over from run to run.
  run_once <- function() {
    folds <- draw_folds()
    auc <- matrix(NA_real_, nrow = 3, ncol = nfold,
                  dimnames = list(c("x.pred", "x.raw", "x.blood"), NULL))

    for (ifold in seq_len(nfold)) {
      held_d <- folds$d == ifold
      held_n <- folds$n == ifold

      # train/test hold positions into istat.x (= columns of the matrices)
      train <- c(ind3_d[!held_d], ind3_n[!held_n])
      test <- c(ind3_d[held_d], ind3_n[held_n])
      istat_trn <- istat.x[train]
      istat_val <- istat.x[test]

      auc["x.pred", ifold] <- cvlasso(t(expr.pre[, train, drop = FALSE]),
                                      t(expr.pre[, test, drop = FALSE]),
                                      istat_trn, istat_val)
      auc["x.raw", ifold] <- cvlasso(t(expr.org[, train, drop = FALSE]),
                                     t(expr.org[, test, drop = FALSE]),
                                     istat_trn, istat_val)
      auc["x.blood", ifold] <- cvlasso(t(blood.expr[, train, drop = FALSE]),
                                       t(blood.expr[, test, drop = FALSE]),
                                       istat_trn, istat_val)
    }

    print("step2")
    list(auc, c(folds$d, folds$n))
  }

  eachens.x <- foreach(itr = seq_len(ENS), .inorder = TRUE) %do% run_once()

  eachens <- do.call(cbind, lapply(eachens.x, function(run) run[[1]]))
  foldsamples.x <- do.call(cbind, lapply(eachens.x, function(run) run[[2]]))

  list("predicted-AUC" = eachens[1, ],
       "original-AUC" = eachens[2, ],
       "Blood-AUC" = eachens[3, ],
       "ensfold" = foldsamples.x)
}
442 | 
443 | 
444 | 
# Fit one penalised regression (glmnet) per gene on the full data, using two
# predictor blocks plus confounders as features.
#
# For every gene, lambda is chosen with 5-fold cv.glmnet (lambda.min) and the
# model is then refitted at that single lambda.
#
# Arguments:
#   nPC1       number of leading rows of temp.tss1 used as features
#   nPC2       number of leading rows of temp.snps used as features
#   temp.tss1  feature block 1; rows = components, columns = samples
#   temp.snps  feature block 2; rows = components, columns = samples
#   temp.tss2  responses; one row per gene, columns = samples
#   genes      gene identifiers; used for the number of fits and their names
#   method     "lasso" (alpha = 1) or "ridge" (alpha = 0)
#   confund.x  confounders; rows = samples
#
# Returns a list of glmnet fits named by `genes` (an empty list when `genes`
# is empty).
#
# Example inputs left by the original author:
#   nPC1=PC1; nPC2=PC2; temp.tss1=gtex.tss1.pc; temp.snps=splice.mat.pc;
#   temp.tss2=gtex.tss2; genes=gene[1:10]; method="lasso"; confund.x=confund
regression_glmnet_gene_splicing_M2 <- function(nPC1, nPC2, temp.tss1, temp.snps,
                                               temp.tss2, genes, method,
                                               confund.x) {
  # Reject unknown methods up front; otherwise alpha would be undefined and
  # the failure would only surface inside cv.glmnet.
  if (!is.character(method) || length(method) != 1L ||
      !method %in% c("lasso", "ridge")) {
    stop('method must be "lasso" or "ridge"', call. = FALSE)
  }
  index <- if (method == "lasso") 1 else 0

  library(foreach)
  library(glmnet)
  # A parallel backend can be registered by the caller if desired, e.g.
  # library(doMC); registerDoMC(cores=25)

  if (length(genes) == 0L) {
    return(list())
  }

  gtex.train.tss2 <- as.matrix(temp.tss2)
  # samples x features; built once because it is identical for every gene
  features <- t(as.matrix(rbind(temp.tss1[1:nPC1, ], temp.snps[1:nPC2, ],
                                t(confund.x))))

  eachgene <- foreach(itr = seq_along(genes), .inorder = TRUE,
                      .combine = "c") %do% {
    # NOTE(review): the response is taken by row POSITION itr, not by
    # genes[itr]; this assumes the rows of temp.tss2 are in the same order
    # as `genes` -- confirm against the callers.
    response <- gtex.train.tss2[itr, ]
    fit.cv <- cv.glmnet(features, response, alpha = index, nfolds = 5)
    list(glmnet(features, response, alpha = index, lambda = fit.cv$lambda.min))
  }

  names(eachgene) <- genes
  eachgene
}
473 | 
# Fit one penalised regression (glmnet) per gene on the full data, using a
# single predictor block plus confounders as features.
#
# For every gene, lambda is chosen with 5-fold cv.glmnet (lambda.min) and the
# model is then refitted at that single lambda.
#
# Arguments:
#   nPC        number of leading rows of temp.tss1 used as features
#   temp.tss1  feature block; rows = components, columns = samples
#   temp.tss2  responses; one row per gene, columns = samples
#   genes      gene identifiers; used for the number of fits and their names
#   method     "lasso" (alpha = 1) or "ridge" (alpha = 0)
#   confund.x  confounders; rows = samples
#
# Returns a list of glmnet fits named by `genes` (an empty list when `genes`
# is empty).
regression_glmnet_gene_M1 <- function(nPC, temp.tss1, temp.tss2, genes, method,
                                      confund.x) {
  # Reject unknown methods up front; otherwise alpha would be undefined and
  # the failure would only surface inside cv.glmnet.
  if (!is.character(method) || length(method) != 1L ||
      !method %in% c("lasso", "ridge")) {
    stop('method must be "lasso" or "ridge"', call. = FALSE)
  }
  index <- if (method == "lasso") 1 else 0

  library(foreach)
  library(glmnet)
  # A parallel backend can be registered by the caller if desired, e.g.
  # library(doMC); registerDoMC(cores=25)

  if (length(genes) == 0L) {
    return(list())
  }

  gtex.train.tss2 <- as.matrix(temp.tss2)
  # samples x features; built once because it is identical for every gene
  features <- t(as.matrix(rbind(temp.tss1[1:nPC, ], t(confund.x))))

  eachgene <- foreach(itr = seq_along(genes), .inorder = TRUE,
                      .combine = "c") %do% {
    # NOTE(review): the response is taken by row POSITION itr, not by
    # genes[itr]; this assumes the rows of temp.tss2 are in the same order
    # as `genes` -- confirm against the callers.
    response <- gtex.train.tss2[itr, ]
    fit.cv <- cv.glmnet(features, response, alpha = index, nfolds = 5)
    list(glmnet(features, response, alpha = index, lambda = fit.cv$lambda.min))
  }

  names(eachgene) <- genes
  eachgene
}
499 | 
500 | 
501 | 
502 | 


--------------------------------------------------------------------------------