├── tissue_pair_work_v6.RData ├── code ├── disease_prediction.r ├── regression_model_articleoutput.r ├── regression_model_M1_M2.r └── function.r └── README.md /tissue_pair_work_v6.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbasugit/Imputation/HEAD/tissue_pair_work_v6.RData -------------------------------------------------------------------------------- /code/disease_prediction.r: -------------------------------------------------------------------------------- 1 | 2 | library(data.table) 3 | args<-commandArgs(TRUE); 4 | 5 | nfold=as.numeric(args[1]); #fold 6 | ENS=as.numeric(args[2]); #ENS 7 | 8 | #nfold=5; ENS=50; #one can change this parameters, nfold is partision for CV and ENS is number of independent runs 9 | 10 | dir="/home/Desktop" #example of dir that needs to be given 11 | workdir=paste0(dir,"/imputation"); 12 | setwd(workdir) 13 | #files for predicted expression 14 | folder.GSp=paste0(workdir,"/output/given"); #if user do not have predicted expression of target tissue use the one provided here 15 | 16 | source(paste0(workdir,'/code/function.r')); 17 | 18 | 19 | 20 | #load gtex data 21 | load(paste0(workdir,"/input/gtexdata_v6.RData")); 22 | gene=gtex.pc$Name 23 | 24 | #load disease tissue and sample count 25 | load(paste0(workdir,"/input/disease_blood_32tissue_common_sample.Rdata")) 26 | 27 | disease=names(comm_sample); dis.tissue=list();samplecut=25; 28 | 29 | for(dis in disease){ 30 | tmp=comm_sample[[dis]]; print(dis); 31 | k=which(as.numeric(tmp[,2])>=samplecut & as.numeric(tmp[,3])>=samplecut) 32 | dis.tissue[[dis]]=tmp[k,1]; 33 | } 34 | disease.study=names(dis.tissue)[lapply(dis.tissue,length)>0] 35 | 36 | 37 | tss1="Whole Blood" 38 | files=list.files(path=folder.GSp,pattern=".Rdata"); 39 | 40 | preddisllr1=list(); 41 | 42 | #for(dis in disease.study){ 43 | #tissue=dis.tissue[[dis]]; 44 | #for(tss in tissue){ 45 | 46 | 47 | #For all disease and tissue disease prediction 
uncomment above 3 lines, and tissue disease loop end line 48 | #and comment the following line 49 | dis="MHHTN"; tissue=c("Artery - Tibial","Adipose - Subcutaneous"); 50 | 51 | fl=files[grep(tss,files,fixed=TRUE)] 52 | tss2=tss; print(tss2); 53 | 54 | #load predicted expression 55 | load(paste0(folder.GSp,"/",fl)) 56 | mat.pred=gene.regress.glmnet[["prediction"]]; 57 | 58 | if(ncol(mat.pred)0 & length(i2)>0){ j1=append(j1,i1[1]); j2=append(j2,i2[1]); pat.com=append(pat.com,ipat);} 67 | } 68 | 69 | nn1=length(j1); nn2=length(j2); it1=matrix(0, 1, nn1); it2=matrix(0, 1, nn2); 70 | it1[1,]=j1; it2[1,]=j2; 71 | gtex.tss1=gtex.pc[,c(it1[1,]), with=F]; #expr of blood 72 | gtex.tss2=gtex.pc[,c(it2[1,]), with=F]; #expr of target tissue 73 | 74 | pat=pat.com; mhh2=pheno.dt[,dis,with=F]; subj=pheno.subj 75 | istat=matrix(-1, 1, length(pat)) 76 | 77 | for (i in seq(length(pat))){ ii=which(pat[i]==subj); istat[i]=mhh2[ii]; } 78 | istat[istat==0]=-1; istat[istat==99]=0; istat=as.numeric(istat); 79 | 80 | k=which(istat==1|istat==-1) 81 | temp.p=(t(mat.pred)) 82 | temp.o=as.matrix(gtex.tss2); 83 | temp.p=temp.p[,k]; temp.o=temp.o[,k]; istat=istat[k]; blood.oo=as.matrix(gtex.tss1)[,k]; 84 | 85 | #LLR gene 86 | llrfdr=0.05 87 | fdr=p.adjust(gene.select.LLR[['pval']],method="BH") 88 | x2=gene.regress.glmnet[["Mean-end"]]; predPCC=rowMeans(x2[["cor"]]); 89 | llrgene=which(fdr<=llrfdr & predPCC>0.3); 90 | 91 | #print(c(dis,tss,length(llrgene))) 92 | 93 | #perform prediciton if tere is minimum of 20 llr genes 94 | if(length(llrgene)>20){ 95 | temp.p1=temp.p[llrgene,]; temp.o1=temp.o[llrgene,]; blood.oo1=blood.oo[llrgene,]; 96 | 97 | ptm <- proc.time() 98 | z=prediction_dis_CVfeature(temp.p1,temp.o1,blood.oo1,istat,nfold,ENS,geneid) #using age,race,gender 99 | print(proc.time() - ptm) 100 | preddisllr1[[dis]][[tss2]][[toString(paste0(llrfdr))]]=z; 101 | } else { 102 | preddisllr1[[dis]][[tss2]][[toString(paste0(llrfdr))]]=NA; 103 | } 104 | 105 | 
save(preddisllr1,file=paste0(workdir,"/output/given/dispred_ens50.Rdata")) 106 | 107 | #} #tissue loop ends 108 | #} #disease loop ends 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TEEBoT 2 | ## (Tissue Expression Estimation using Blood Transcriptome) 3 | 4 | TEEBoT is a tool to predict tissue-specific gene expression (TSGE) from an individual’s blood transcriptome and demographic 5 | information (age, gender and race), with clinical implications. If the individual's genotype information is available, it could boost the performance of TEEBoT. 6 | We trained TEEBoT on GTEx version 6 and evaluated its performance in a cross-validation manner. For each gene in each target tissue, we first evaluate its predictability based on a likelihood ratio (LLR) test and then fit a lasso regression model to estimate its TSGE. Our models require Whole Blood gene expression (WBGE), Whole Blood splicing (WBSp) information, and three demographic ‘confounding’ factors (Age, Race, and Sex), with genotype information as one additional option. 7 | 8 | ## Data download 9 | Download data files from GTEx Portal and dbGaP. The data include gene expression, transcript expression, phenotype and genotype information (optional). 10 | 11 | ## Main script 12 | The code to predict the gene expression of a target tissue consists of the following steps: 13 | 14 | ### Model building 15 | For each gene, the top PCs of the Whole Blood transcriptome (top 10 PCs for gene expression and top 20 PCs for splicing profile) are used as features to build a gene-specific model to predict its expression in the target tissue. Five-fold cross-validation is performed for all the genes across tissues. The model, based on lasso regression, is implemented using the cv.glmnet() function from the glmnet R package. 
16 | 17 | ### Measuring prediction accuracy 18 | The prediction accuracy for each gene is evaluated using a Pearson correlation test between the predicted expression and the ground truth. A likelihood ratio test 19 | is also performed for each gene to assess the independent contribution of the blood transcriptome beyond the 20 | confounders (age, race and sex). For each gene we provide both its predictability score (in terms of Pearson correlation coefficient) 21 | and the FDR of the LLR p-value. We only report the prediction accuracies of genes which pass the likelihood ratio 22 | test (FDR<=0.05) and also have Pearson correlation coefficient values above a predefined threshold. 23 | 24 | 25 | ## Code running instructions 26 | ### For prediction of target tissue 27 | Code: regression_model_articleoutput.r 28 | 29 | #### Input (workdir/input) 30 | The inputs are the gene expression, phenotype and genotype files downloaded from GTEx Portal and dbGaP. \ 31 | i) Gene expression data: All_Tissue_Site_Details_Analysis.combined.rpkm.gct \ 32 | ii) Phenotype data: phs000424.v6.pht002742.v6.p1.c1.GTEx_Subject_Phenotypes.GRU.txt \ 33 | iii) Sample attribute: GTEx_Data_V6_Annotations_SampleAttributesDS.txt \ 34 | iv) Gene property file from NCBI: Homo_sapiens.gene_info (downloaded from https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/ in March 2016) \ 35 | v) Genotype data: genotype.v6.chr1_22.Rdata \ 36 | These files are read in the code within the function "format_gtexdatav6()". 37 | We also have an input file "tissue_pair_work_v6.RData" which contains a table with the first column 38 | as whole blood, the second column as the target tissue and the third column as the number of common samples. 39 | All these files need to be kept within the "workdir/input" folder. The scripts source function.r from the "workdir/code" folder, so function.r needs to be kept there. 40 | 41 | The input parameters to run the code are index, run, fold, ENS. 
42 | 1) "index" is the tissue id; the ids for the 32 target tissues are as follows: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,32,34,37,39. 43 | 2) "run" indicates which model to compute, M1 or M2. To compute model M1 use run=100, and for M2 use run=110. 44 | 3) fold : the number of cross-validation folds. 45 | 4) ENS : the number of independent prediction runs. 46 | 47 | #### Output 48 | One prediction file per target tissue, for example "Whole Blood_xx_lasso_regress_G_Spl_10_20.Rdata", 49 | where xx will be replaced by the target tissue name. 50 | This Rdata file contains two objects, gene.regress.glmnet and gene.select.LLR. 51 | 1) gene.regress.glmnet : is a list, gene.regress.glmnet[["Mean-first"]] contains our desired correlation coefficient, 52 | that is, the predictability score. 53 | 2) gene.select.LLR : is a list with the LLR ratio and p-value. 54 | 55 | note: Download the code file (regression_model_articleoutput.r) and the function.r in the same folder. Example path to store the 56 | codes is "/home/Desktop/Imputation". Within Imputation make two folders named "input" and "output". The output of the codes is automatically saved in the "output" folder. 57 | 58 | #### Required Packages 59 | For the code the following R packages need to be installed: 60 | "glmnet", "data.table", "foreach", "doMC", "ROCR", "lmtest", "pracma". 
61 | -------------------------------------------------------------------------------- /code/regression_model_articleoutput.r: -------------------------------------------------------------------------------- 1 | #code to find predicted expression in target tissue 2 | 3 | dir="/home/Desktop" #example of dir that needs to be given 4 | workdir=paste0(dir,"/imputation"); 5 | 6 | setwd(workdir) 7 | 8 | 9 | library(data.table) 10 | args<-commandArgs(TRUE); 11 | 12 | 13 | index=as.numeric(args[1]) #target tissue 14 | run=toString(args[2]) #category 15 | fold=as.numeric(args[3]); #fold 16 | ENS=as.numeric(args[4]); #ENS 17 | 18 | PC1=10; #GE: number of blood gene-expression PCs; fixed because args[3] and args[4] already carry fold and ENS 19 | PC2=20; #Sp: number of blood splicing PCs; same values as regression_model_M1_M2.r 20 | PC3=as.numeric(args[5]); #Snp 21 | #PC1=10; PC2=20; 22 | 23 | #workdir="/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6" 24 | 25 | 26 | 27 | #loading functions 28 | #source('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/function_regression.r') 29 | source(paste0(workdir,'/code/function.r')); 30 | 31 | #load gtex data 32 | format_gtexdatav6(paste0(workdir,"/input/")); 33 | #In this function we directly read gtex v6 phenotype data and gene expression data and merge them 34 | #into format convenient to be used in the code and create a gtexdata_v6.RData file in workdir/input/ folder. 
35 | load(paste0(workdir,"/input/gtexdata_v6.RData")); 36 | 37 | #Common samples between blood and target tissue 38 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/tissue_pair_work_v6.RData") 39 | load(paste0(workdir,"/input/tissue_pair_work_v6.RData")); 40 | 41 | #finding patient ids that have snps information 42 | load(paste0(workdir,"/input/genotype.v6.chr1_22.Rdata")) #loading genotype data obtained from dbGaP 43 | pat.snps=sapply(strsplit(colnames(genotype.v6[["chr2"]]), split="-"), '[[', 2) 44 | 45 | 46 | gene=gtex.pc$Name 47 | tss1="Whole Blood" 48 | tss2=as.character(unlist(strsplit(tss_pair[index,],"Whole Blood"))[2]); #tss2: target tissue 49 | allgeneid=as.numeric(seq(length(gene))) 50 | 51 | comm.patients=patients[which(patients %in% pat.snps)] 52 | 53 | #for common individuals separate the gene expression for tss1 and tss2 54 | j1=c(); j2=c(); 55 | pat.com=c(); #common individuals between Blood (tss1) and target tissue (tss2) 56 | for (ipat in comm.patients){ 57 | i1=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss1); 58 | i2=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss2); 59 | if (length(i1)>0 && length(i2)>0){ j1=append(j1,i1[1]); j2=append(j2,i2[1]); pat.com=append(pat.com,ipat);}} 60 | nn1=length(j1); nn2=length(j2); it1=matrix(0, 1, nn1); it2=matrix(0, 1, nn2); 61 | it1[1,]=j1; it2[1,]=j2; 62 | gtex.tss1=gtex.pc[,c(it1[1,]), with=F]; 63 | gtex.tss2=gtex.pc[,c(it2[1,]), with=F]; 64 | 65 | 66 | #PCA for tss1, whole Blood gtex 67 | mdsk=100; 68 | d = dist(t(gtex.tss1), method="euclidean"); mdsk=dim(t(gtex.tss1))[1]-1; 69 | pcfit = cmdscale(d, eig=TRUE, k=mdsk); 70 | gtex.tss1.pc=t(pcfit$points); #row:PCs; col: samples 71 | colnames(gtex.tss1.pc)=pat.com; 72 | 73 | #loading splicing PCs 74 | #load('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/PC_splicing/PCcmd_splicing_blood.Rdata' ) 75 | load(paste0(workdir,"/input/PCcmd_splicing_blood.Rdata")) 76 | 
splice.mat.pc=splicing.pc[[tss2]] 77 | 78 | #confounders 79 | x=sapply(1:length(pat.com),function(i){which(pheno.subj %in% pat.com[i])}) 80 | age=as.numeric(pheno.dt$AGE[x]); 81 | race=as.numeric(pheno.dt$RACE[x]); race[race==99]=0; 82 | gender=as.numeric(pheno.dt$GENDER[x]); 83 | confund=cbind(age,race,gender) 84 | 85 | 86 | #row:PCs; col: samples 87 | 88 | if(run=="110"){ 89 | #Gene + Splicing -------------------------------------------- 90 | ptm <- proc.time() 91 | 92 | #outdir=paste0(workdir,"/output") 93 | #outdir="/cbcb/project2-scratch/mbasu/gtex_v6/blood_cross_talk/gtex-v6/regress_result/lasso_geneexpr_snps_splicing_confound/data/lasso_cv_lambda/G_Sp_CF" 94 | outdir=paste0(workdir,"/output/G_Sp_CF") 95 | if(!file.exists(outdir))dir.create(outdir) 96 | str=sprintf("%s/%s_%s_lasso_regress_G_Spl_%d_%d.Rdata",outdir,tss1,tss2,PC1,PC2) 97 | 98 | gene.regress.glmnet=regression_glmnet_gene_splicing(fold,ENS,PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,allgeneid,pat.com,"lasso",confund) 99 | 100 | gene.select.LLR=regression_lm_loglik_selectgene_2(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,"llr",confund) 101 | gene.select.LLR.Sp=regression_lm_loglik_selectgene_2Sp(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,"llr",confund) 102 | gene.select.LLR.GE=regression_lm_loglik_selectgene_2GE(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,"llr",confund) 103 | 104 | save(gene.regress.glmnet,gene.select.LLR,gene.select.LLR.Sp,gene.select.LLR.GE,file=str) 105 | print(proc.time() - ptm) 106 | #------------------------------------------------------------ 107 | } 108 | 109 | 110 | if(run=="100"){ 111 | #Gene ------------------------------------------------------ 112 | #outdir="/cbcb/project2-scratch/mbasu/gtex_v6/blood_cross_talk/gtex-v6/regress_result/lasso_geneexpr_snps_splicing_confound/data/lasso_cv_lambda/G_CF" 113 | outdir=paste0(workdir,"/output/G_CF") 114 | if(!file.exists(outdir))dir.create(outdir) 115 | 
str=sprintf("%s/%s_%s_lasso_regress_G_%d_%dfold_%dENS.Rdata",outdir,tss1,tss2,PC1,fold,ENS) #write into the G_CF output folder (outdir) set up just before this line 116 | 117 | ptm <- proc.time() 118 | gene.regress.glmnet=regression_glmnet_gene(fold,ENS,PC1,gtex.tss1.pc,gtex.tss2,allgeneid,pat.com,"lasso",confund) 119 | 120 | gene.select.LLR=regression_lm_loglik_selectgene_1(PC1,gtex.tss1.pc,gtex.tss2,"llr",confund) 121 | save(gene.regress.glmnet,gene.select.LLR,file=str) 122 | print(proc.time() - ptm) 123 | #------------------------------------------------------------ 124 | } 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /code/regression_model_M1_M2.r: -------------------------------------------------------------------------------- 1 | #code to find predicted expression in target tissue 2 | 3 | 4 | regression_glmnet_gene_splicing_M2<-function(nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,genes,method,confund.x){ 5 | 6 | #nPC1=PC1; nPC2=PC2;temp.tss1=gtex.tss1.pc; temp.snps=splice.mat.pc; temp.tss2=gtex.tss2;genes=gene[1:10];method="lasso";confund.x=confund; 7 | 8 | require(foreach) 9 | require(glmnet) 10 | #library(doMC) 11 | #registerDoMC(cores=25) 12 | if(method=="lasso"){index=1} 13 | if(method=="ridge"){index=0} 14 | 15 | gtex.train.tss2=as.matrix(temp.tss2); 16 | gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC1,],temp.snps[1:nPC2,],t(confund.x))); 17 | 18 | eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='c') %do% { 19 | geneid=itr; 20 | fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5) 21 | lam=fit.cv$lambda.min 22 | fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, lambda=lam) 23 | #pred=predict(fit1,newx=t(gtex.test.tss1.pc)) 24 | 25 | temp=list(fit1) 26 | return(temp) 27 | } #gene loop ends 28 | names(eachgene)=genes 29 | return(eachgene) 30 | 31 | } 32 | 33 | regression_glmnet_gene_M1<-function(nPC,temp.tss1,temp.tss2,genes,method,confund.x){ 34 | require(foreach) 35 | 
require(glmnet) 36 | #library(doMC) 37 | #registerDoMC(cores=25) 38 | if(method=="lasso"){index=1} 39 | if(method=="ridge"){index=0} 40 | 41 | gtex.train.tss2=as.matrix(temp.tss2); 42 | gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC,],t(confund.x))); 43 | 44 | eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='c') %do% { 45 | geneid=itr; 46 | fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5) 47 | lam=fit.cv$lambda.min 48 | fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, lambda=lam) 49 | #pred=predict(fit1,newx=t(gtex.test.tss1.pc)) 50 | 51 | temp=list(fit1) 52 | return(temp) 53 | } #gene loop ends 54 | names(eachgene)=genes 55 | return(eachgene) 56 | 57 | } 58 | 59 | 60 | 61 | 62 | 63 | dir="/home/Desktop" #example of dir that needs to be given 64 | workdir=paste0(dir,"/imputation"); 65 | 66 | setwd(workdir) 67 | 68 | 69 | library(data.table) 70 | args<-commandArgs(TRUE); 71 | 72 | 73 | index=as.numeric(args[1]) #target tissue 74 | run=toString(args[2]) #category 75 | #fold=as.numeric(args[3]); #fold 76 | #ENS=as.numeric(args[4]); #ENS 77 | 78 | #PC1=as.numeric(args[3]); #GE 79 | #PC2=as.numeric(args[4]); #Sp 80 | #PC3=as.numeric(args[5]); #Snp 81 | PC1=10; PC2=20; 82 | 83 | #workdir="/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6" 84 | 85 | 86 | 87 | #loading functions 88 | #source('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/function_regression.r') 89 | #source(paste0(workdir,'/code/function.r')); 90 | 91 | #load gtex data 92 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/data/gtexdata_v6.RData") 93 | load(paste0(workdir,"/input/gtexdata_v6.RData")); 94 | 95 | #Common samples between blood and target tissue 96 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/tissue_pair_work_v6.RData") 97 | load(paste0(workdir,"/input/tissue_pair_work_v6.RData")); 98 | 99 | 
#loading patient ids that have snps information 100 | #load("/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/PC_SNPs/snps_sample_v6.RData") 101 | load(paste0(workdir,"/input/snps_sample_v6.RData")); 102 | 103 | 104 | gene=gtex.pc$Name 105 | tss1="Whole Blood" 106 | for(index in c(1,2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 32, 34, 37, 39)){ 107 | tss2=as.character(unlist(strsplit(tss_pair[index,],"Whole Blood"))[2]); #tss2: target tissue 108 | allgeneid=as.numeric(seq(length(gene))) 109 | print(tss2); 110 | comm.patients=patients[which(patients %in% pat.snps)] 111 | 112 | #for common individuals seperate the gene expression for tss1 and tss2 113 | j1=c(); j2=c(); 114 | pat.com=c(); #common individuals between Blood (tss1) and target tissue (tss2) 115 | for (ipat in comm.patients){ 116 | i1=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss1); 117 | i2=which(expc.nt$patient==ipat & expc.nt$SMTSD==tss2); 118 | if (length(i1)>0 & length(i2)>0){ j1=append(j1,i1[1]); j2=append(j2,i2[1]); pat.com=append(pat.com,ipat);}} 119 | nn1=length(j1); nn2=length(j2); it1=matrix(0, 1, nn1); it2=matrix(0, 1, nn2); 120 | it1[1,]=j1; it2[1,]=j2; 121 | gtex.tss1=gtex.pc[,c(it1[1,]), with=F]; 122 | gtex.tss2=gtex.pc[,c(it2[1,]), with=F]; 123 | 124 | 125 | #PCA for tss1, whole Blood gtex 126 | mdsk=100; 127 | d = dist(t(gtex.tss1), method="euclidean"); mdsk=dim(t(gtex.tss1))[1]-1; 128 | pcfit = cmdscale(d, eig=TRUE, k=mdsk); 129 | gtex.tss1.pc=t(pcfit$points); #row:PCs; col: samples 130 | colnames(gtex.tss1.pc)=pat.com; 131 | 132 | #loading splicing PCs 133 | #load('/Volumes/5TBbackup/UMD2019/project2_scratch_mbasu/gtex_v6/blood_cross_talk/gtex-v6/PC_splicing/PCcmd_splicing_blood.Rdata' ) 134 | load(paste0(workdir,"/input/PCcmd_splicing_blood.Rdata")) 135 | splice.mat.pc=splicing.pc[[tss2]] 136 | 137 | #confounders 138 | x=sapply(1:length(pat.com),function(i){which(pheno.subj %in% pat.com[i])}) 139 
| age=as.numeric(pheno.dt$AGE[x]); 140 | race=as.numeric(pheno.dt$RACE[x]); race[race==99]=0; 141 | gender=as.numeric(pheno.dt$GENDER[x]); 142 | confund=cbind(age,race,gender) 143 | 144 | 145 | #row:PCs; col: samples 146 | 147 | if(run=="110"){ 148 | #Gene + Splicing -------------------------------------------- 149 | outdir=paste0(workdir,"/output/Model/M2") 150 | if(!file.exists(outdir))dir.create(outdir, recursive = TRUE) 151 | str=sprintf("%s/%s_lasso_regress_G_Spl_%d_%d.Rdata",outdir,tss2,PC1,PC2) 152 | ptm <- proc.time() 153 | 154 | modelM2=regression_glmnet_gene_splicing_M2(PC1,PC2,gtex.tss1.pc,splice.mat.pc,gtex.tss2,gene,"lasso",confund) 155 | 156 | save(modelM2,file=str) 157 | print(proc.time() - ptm) 158 | #------------------------------------------------------------ 159 | } 160 | 161 | 162 | if(run=="100"){ 163 | #Gene ------------------------------------------------------ 164 | outdir=paste0(workdir,"/output/Model/M1") 165 | if(!file.exists(outdir))dir.create(outdir, recursive = TRUE) 166 | str=sprintf("%s/%s_lasso_regress_G_%d.Rdata",outdir,tss2,PC1) 167 | 168 | ptm <- proc.time() 169 | modelM1=regression_glmnet_gene_M1(PC1,gtex.tss1.pc,gtex.tss2,gene,"lasso",confund) 170 | save(modelM1,file=str) 171 | 172 | print(proc.time() - ptm) 173 | #------------------------------------------------------------ 174 | } 175 | } 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /code/function.r: -------------------------------------------------------------------------------- 1 | format_gtexdatav6<-function(dirx){ 2 | library(data.table) 3 | 4 | dt=fread(paste0(dirx,"phs000424.v6.pht002742.v6.p1.c1.GTEx_Subject_Phenotypes.GRU.txt")) #phenotype data 5 | pheno.dt=dt[-1,] 6 | setnames(pheno.dt,as.character(dt[1,])) 7 | #subj=unlist(lapply(phe.dt$SUBJID, function(tt) {substr(tt,6,9)})) 8 | pheno.subj=sapply(1:nrow(pheno.dt), function(x){ y=unlist(strsplit(pheno.dt$SUBJID[x], 
split="-")); l=length(y); y[l] }) 9 | 10 | #gene expression 11 | rpkm=fread(paste0(dirx,"All_Tissue_Site_Details_Analysis.combined.rpkm.gct")) #gene expression data 12 | rpkm$Name = sapply(strsplit(rpkm$Name, '[.]'), '[[', 1) 13 | 14 | rpkm1=unique(rpkm) 15 | 16 | rpkm.col = colnames(rpkm); #gtex.row = rownames(gtex); 17 | patients = unique(sapply(strsplit(rpkm.col[-(1:2)], split="-"), '[[', 2)) 18 | 19 | 20 | annt=fread(paste0(dirx,"GTEx_Data_V6_Annotations_SampleAttributesDS.txt")) 21 | tmp = data.table(SAMPID=rpkm.col[-(1:2)], myid = 1:(length(rpkm.col) -2)); 22 | setkey(tmp, SAMPID); 23 | setkey(annt, SAMPID); 24 | annt = merge(x=tmp, y=annt, by="SAMPID"); 25 | annt$patient = sapply(strsplit(annt$SAMPID, split="-"), '[[', 2) 26 | annt1=annt[order(annt$myid),] 27 | expc.nt=annt1 28 | #======================================================================== 29 | 30 | #======================================================================= 31 | #### gene information to identify protein coding genes 32 | require(pracma) 33 | gene=fread(paste0(dirx,"Homo_sapiens.gene_info")) 34 | gene.symbol=gene$V3 35 | 36 | gene.IDs=gene$V6 37 | ensmbl=array("", c(length(gene.IDs),1)) 38 | for (i in seq(length(gene.IDs))){ 39 | gIDs=gene.IDs[i] 40 | tt=strfind(gIDs, 'Ensembl:', overlap = TRUE) 41 | if (!is.null(tt)){ 42 | ensmbl[i]=substr(gIDs, tt+8, tt+22) 43 | } 44 | } 45 | 46 | gene[, c("Name") := ensmbl] 47 | 48 | tmp = data.table(Name=gene$Name, myid = 1:dim(gene)[1]); setkey(tmp, Name); 49 | newrpkm = merge(x=tmp, y=rpkm, by="Name") 50 | 51 | tmp = data.table(Name=rpkm$Name, myid = 1:dim(rpkm)[1]); setkey(tmp, Name); 52 | newgene = merge(x=tmp, y=gene, by="Name") #contains the newgene with only those which are in gtex 53 | 54 | newrpkm$myid = newgene$myid; 55 | #the order of genes in newrpkm and newgene are same, checked using which(newrpkm$myid!=newgene$myid) 56 | newrpkm$geneSym = newgene$V3 #gene name 57 | newrpkm$geneFun = newgene$V9; #gene description 58 | 
newrpkm$geneKnd = newgene$V10; #gene type 59 | newrpkm$chromosome = newgene$V7; #chromosome 60 | 61 | newrpkm.pc = newrpkm[!is.na(newrpkm$geneKnd) & newrpkm$geneKnd=="protein-coding" ] #only protein coding genes 62 | g=newrpkm.pc$Name 63 | x=as.data.table(table(g)) 64 | xg=x$g[which(x$N>1)] 65 | remove=which(duplicated(g)) #rows repeating a gene id after its first occurrence; integer(0) when all ids are unique 66 | newrpkm.pc1=if(length(remove)>0) newrpkm.pc[-c(remove),] else newrpkm.pc; #guard: a zero-length negative index would drop every row 67 | 68 | newrpkm.pc2=newrpkm.pc 69 | newrpkm.pc=newrpkm.pc1 70 | l=dim(newrpkm.pc)[2]; 71 | gtex.pc1=newrpkm.pc[,c(-1,-2,-3,-(l-3),-(l-2),-(l-1),-l),with=F] 72 | zero.l=sum(gtex.pc1==0) 73 | extr.gtex.pc=newrpkm.pc[,c(1,2,3,(l-3),(l-2),(l-1),l),with=F] 74 | gtex.pc=cbind(gtex.pc1,extr.gtex.pc) 75 | save(gtex.pc,expc.nt,pheno.dt,pheno.subj,patients,file=paste0(dirx,"gtexdata_v6.RData")) #dirx is the input folder itself (same prefix as the fread calls), so no extra "/input/" 76 | } 77 | 78 | 79 | 80 | regression_glmnet_gene<-function(nfold,ENS,nPC,temp.tss1,temp.tss2,genes,pat.com,method,confund.x){ 81 | require(foreach) 82 | require(glmnet) 83 | if(method=="lasso"){index=1} 84 | if(method=="ridge"){index=0} 85 | 86 | #library(doMC) 87 | #registerDoMC(cores=25) 88 | 89 | k=round(length(pat.com)/nfold-.5); 90 | folds=rep(c(1:nfold),c(rep(k,(nfold-1)),(length(pat.com)-(nfold-1)*k))) 91 | avg.pred=matrix(0,length(pat.com),length(genes)) 92 | corr.perENS=NULL; pval.perENS=NULL; 93 | sample.matrix=matrix(0,ENS+1,length(pat.com)) 94 | colnames(sample.matrix)=pat.com; 95 | sample.matrix[ENS+1,]=folds 96 | lambda.matrix=matrix(0,length(genes),ENS*nfold) 97 | jj=0; 98 | for(ee in seq(ENS)){ 99 | sam=sample(1:length(pat.com),length(pat.com),replace=F); sample.matrix[ee,]=sam; 100 | temp.pred=matrix(0,length(pat.com),length(genes)) 101 | print(ee) 102 | 103 | for(ifold in seq(nfold)){ print(paste0("e",ee,"f",ifold)); jj=jj+1; 104 | test=sam[which(folds==ifold)] 105 | train=sam[which(folds!=ifold)] 106 | gtex.train.tss2=as.matrix(temp.tss2[,train,with=F]); gtex.test.tss2=as.matrix(temp.tss2[,test,with=F]); 107 | 
gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC,train],t(confund.x[train,]))); 108 | gtex.test.tss1.pc =as.matrix(rbind(temp.tss1[1:nPC,test],t(confund.x[test,]))); 109 | 110 | 111 | #eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='cbind') %dopar% { 112 | eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='cbind') %do% { 113 | 114 | geneid=genes[itr]; 115 | fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5); 116 | lam=fit.cv$lambda.min; 117 | fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, lambda=lam) 118 | pred=predict(fit1,newx=t(gtex.test.tss1.pc)) 119 | return(c(pred,lam)) 120 | } #gene loop ends 121 | temp.pred[test,]=eachgene[1:length(test),]; lambda.matrix[,jj]=eachgene[length(test)+1,]; 122 | } #fold loop ends 123 | 124 | avg.pred=avg.pred+temp.pred; 125 | 126 | temp.tss22=temp.tss2[genes,] 127 | #temp=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %dopar% { 128 | temp=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %do% { 129 | 130 | x=cor.test(as.numeric(temp.tss22[itr,]),temp.pred[,itr]) 131 | c(x$estimate,x$p.value) } 132 | 133 | corr.perENS=cbind(corr.perENS,temp[,1]) 134 | pval.perENS=cbind(pval.perENS,temp[,2]) 135 | 136 | } #ENS loop ends 137 | avg.pred=avg.pred/ENS 138 | 139 | temp.tss22=temp.tss2[genes,] 140 | #corr=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %dopar% { 141 | corr=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %do% { 142 | 143 | x=cor.test(as.numeric(temp.tss22[itr,]),avg.pred[,itr]) 144 | c(x$estimate,x$p.value) 145 | } 146 | 147 | rownames(avg.pred)=pat.com 148 | colnames(corr)=c("cor","pval"); 149 | 150 | result=list() 151 | result[["Mean-first"]]=corr #correlation of the ENS-averaged prediction with the observed expression 152 | result[["Mean-end"]][["cor"]]=corr.perENS #genes x ENS: correlation of each independent run 153 | result[["Mean-end"]][["pval"]]=pval.perENS #genes x ENS: cor.test p-value of each independent run 154 | result[["prediction"]]=avg.pred 155 | result[["sampling"]]=sample.matrix 156 | result[["lambda"]]=lambda.matrix 157 | return(result) 158 | 159 | 
} 160 | 161 | 162 | regression_glmnet_gene_splicing<-function(nfold,ENS,nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,genes,pat.com,method,confund.x){ 163 | require(foreach) 164 | require(glmnet) 165 | library(doMC) 166 | registerDoMC(cores=25) 167 | if(method=="lasso"){index=1} 168 | if(method=="ridge"){index=0} 169 | 170 | 171 | k=round(length(pat.com)/nfold-.5); 172 | folds=rep(c(1:nfold),c(rep(k,(nfold-1)),(length(pat.com)-(nfold-1)*k))) 173 | avg.pred=matrix(0,length(pat.com),length(genes)) 174 | corr.perENS=NULL; pval.perENS=NULL; 175 | sample.matrix=matrix(0,ENS+1,length(pat.com)) 176 | colnames(sample.matrix)=pat.com; 177 | sample.matrix[ENS+1,]=folds 178 | lambda.matrix=matrix(0,length(genes),ENS*nfold) 179 | jj=0; 180 | 181 | for(ee in seq(ENS)){ 182 | sam=sample(1:length(pat.com),length(pat.com),replace=F);sample.matrix[ee,]=sam; 183 | temp.pred=matrix(0,length(pat.com),length(genes)) 184 | #print(ee) 185 | 186 | for(ifold in seq(nfold)){ print(paste0("e",ee,"f",ifold)); 187 | jj=jj+1; 188 | test=sam[which(folds==ifold)]; 189 | train=sam[which(folds!=ifold)]; 190 | gtex.train.tss2=as.matrix(temp.tss2[,train,with=F]); gtex.test.tss2=as.matrix(temp.tss2[,test,with=F]); 191 | gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC1,train],temp.snps[1:nPC2,train],t(confund.x[train,]))); 192 | gtex.test.tss1.pc =as.matrix(rbind(temp.tss1[1:nPC1,test],temp.snps[1:nPC2,test],t(confund.x[test,]))); 193 | #print("step1") 194 | 195 | eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='cbind') %dopar% { 196 | geneid=genes[itr]; 197 | fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5) 198 | lam=fit.cv$lambda.min 199 | fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, lambda=lam) 200 | pred=predict(fit1,newx=t(gtex.test.tss1.pc)) 201 | return(c(pred,lam)) 202 | } #gene loop ends 203 | temp.pred[test,]=eachgene[1:length(test),]; lambda.matrix[,jj]=eachgene[length(test)+1,]; 204 | } #fold loop ends 205 | 206 | 
avg.pred=avg.pred+temp.pred; 207 | temp.tss22=temp.tss2[genes,]; 208 | temp=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %dopar% { 209 | x=cor.test(as.numeric(temp.tss22[itr,]),temp.pred[,itr]); 210 | c(x$estimate,x$p.value) } 211 | 212 | corr.perENS=cbind(corr.perENS,temp[,1]) 213 | pval.perENS=cbind(pval.perENS,temp[,2]) 214 | 215 | } #ENS loop ends 216 | 217 | avg.pred=avg.pred/ENS 218 | temp.tss22=temp.tss2[genes,] 219 | corr=foreach(itr=1:nrow(temp.tss22),.inorder=T,.combine='rbind') %dopar% { 220 | x=cor.test(as.numeric(temp.tss22[itr,]),avg.pred[,itr]) 221 | c(x$estimate,x$p.value) 222 | } 223 | 224 | rownames(avg.pred)=pat.com 225 | colnames(corr)=c("cor","pval"); 226 | 227 | result=list() 228 | result[["Mean-first"]]=corr #correlation of the ENS-averaged prediction with the observed expression 229 | result[["Mean-end"]][["cor"]]=corr.perENS #genes x ENS: correlation of each independent run 230 | result[["Mean-end"]][["pval"]]=pval.perENS #genes x ENS: cor.test p-value of each independent run 231 | result[["prediction"]]=avg.pred 232 | result[["sampling"]]=sample.matrix 233 | result[["lambda"]]=lambda.matrix 234 | return(result) 235 | 236 | } #function ends 237 | 238 | 239 | #using loglikelihood of lm test (GE+Sp+CF~CF): contribution from GE+Sp 240 | regression_lm_loglik_selectgene_2<-function(nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,method,confund.x) 241 | { 242 | require('lmtest') 243 | xH1=as.matrix(cbind(t(temp.tss1[1:nPC1,]),t(temp.snps[1:nPC2,]),confund.x)); xH0=as.matrix(confund.x); 244 | y=as.matrix(t(temp.tss2)); 245 | 246 | LR=list();pval=c(); 247 | 248 | for(itr in seq(ncol(y))){ 249 | fit1=lm(y[,itr]~xH1); fit0=lm(y[,itr]~xH0); LR[[itr]]=lrtest(fit1,fit0); 250 | pval=append(pval,LR[[itr]]$"Pr(>Chisq)"[2]); 251 | } 252 | 253 | loglike.Sgenes=list(); 254 | loglike.Sgenes[["loglikelihood-ratio"]]=LR 255 | loglike.Sgenes[["pval"]]=pval 256 | return(loglike.Sgenes) 257 | } 258 | 259 | #using loglikelihood of lm test contribution from splicing (GE+Sp+CF~GE+CF): contribution from Sp 260 | regression_lm_loglik_selectgene_2Sp<-function(nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,method,confund.x) 261 | { 262 | 
require('lmtest') 263 | xH1=as.matrix(cbind(t(temp.tss1[1:nPC1,]),t(temp.snps[1:nPC2,]),confund.x)); 264 | xH0=as.matrix(cbind(t(temp.tss1[1:nPC1,]),confund.x)); 265 | y=as.matrix(t(temp.tss2)); 266 | 267 | LR=list();pval=c(); 268 | 269 | for(itr in seq(ncol(y))){ 270 | fit1=lm(y[,itr]~xH1); fit0=lm(y[,itr]~xH0); LR[[itr]]=lrtest(fit1,fit0); 271 | pval=append(pval,LR[[itr]]$"Pr(>Chisq)"[2]); 272 | } 273 | 274 | loglike.Sgenes=list(); 275 | loglike.Sgenes[["loglikelihood-ratio"]]=LR 276 | loglike.Sgenes[["pval"]]=pval 277 | return(loglike.Sgenes) 278 | } 279 | 280 | 281 | #using loglikelihood of lm test contribution from gene (GE+Sp+CF~Sp+CF): contribution from GE 282 | regression_lm_loglik_selectgene_2GE<-function(nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,method,confund.x) 283 | { 284 | require('lmtest') 285 | xH1=as.matrix(cbind(t(temp.tss1[1:nPC1,]),t(temp.snps[1:nPC2,]),confund.x)); 286 | xH0=as.matrix(cbind(t(temp.snps[1:nPC2,]),confund.x)); 287 | y=as.matrix(t(temp.tss2)); 288 | 289 | LR=list();pval=c(); 290 | 291 | for(itr in seq(ncol(y))){ 292 | fit1=lm(y[,itr]~xH1); fit0=lm(y[,itr]~xH0); LR[[itr]]=lrtest(fit1,fit0); 293 | pval=append(pval,LR[[itr]]$"Pr(>Chisq)"[2]); 294 | } 295 | 296 | loglike.Sgenes=list(); 297 | loglike.Sgenes[["loglikelihood-ratio"]]=LR 298 | loglike.Sgenes[["pval"]]=pval 299 | return(loglike.Sgenes) 300 | } 301 | 302 | 303 | #using loglikelihood of lm test (GE/Sp/SNP+CF~CF): contribution from GE/Sp/SNP 304 | regression_lm_loglik_selectgene_1<-function(nPC,temp.tss1,temp.tss2,method,confund.x) 305 | { 306 | require('lmtest') 307 | xH1=as.matrix(cbind(t(temp.tss1[1:nPC,]),confund.x)); xH0=as.matrix(confund.x); 308 | y=as.matrix(t(temp.tss2)); 309 | 310 | LR=list();pval=c(); 311 | 312 | for(itr in seq(ncol(y))){ 313 | fit1=lm(y[,itr]~xH1); fit0=lm(y[,itr]~xH0); LR[[itr]]=lrtest(fit1,fit0); 314 | #g=append(g,((LR[[itr]]$"Pr(>Chisq)"[2]<=0.05)*1)); 315 | pval=append(pval,LR[[itr]]$"Pr(>Chisq)"[2]); 316 | } 317 | 318 | loglike.Sgenes=list(); 
319 | loglike.Sgenes[["loglikelihood-ratio"]]=LR 320 | loglike.Sgenes[["pval"]]=pval 321 | return(loglike.Sgenes) 322 | } 323 | 324 | 325 | 326 | prediction_dis_CVfeature<-function(expr.pre,expr.org,blood.expr,istat.x,nfold,ENS,geneid.x){ 327 | 328 | library(data.table) #cntrl.ox,cntrl.op,cntrl.ob, 329 | require(ROCR) 330 | require(glmnet) 331 | require(foreach) 332 | require('lmtest') 333 | 334 | cvlasso<-function(xinput.train.x,xinput.test.x,yinput.train.x,yinput.test.x){ 335 | fit.cv=cv.glmnet(xinput.train.x,yinput.train.x, alpha=1, nfolds=4); 336 | lam=fit.cv$lambda.min; 337 | fit = glmnet(xinput.train.x,yinput.train.x, alpha=1, lambda=lam); 338 | pre=predict(fit,newx=xinput.test.x); 339 | pred<-prediction(pre[,1],yinput.test.x);aucval<-performance(pred,"auc"); 340 | return(aucval@y.values[[1]]) 341 | } 342 | 343 | 344 | DEgene_disease_wilcox<-function(expr.x,istat.xx,confound.x,genes){ 345 | 346 | library(data.table) 347 | case=which(istat.xx==1); 348 | cntrl=which(istat.xx==-1); 349 | pval=c(); 350 | for(g in seq(length(genes))){ 351 | geneid.x=genes[g]; 352 | pval=append(pval,wilcox.test(as.numeric(expr.x[geneid.x,case]),as.numeric(expr.x[geneid.x,cntrl]))$"p.value");} 353 | return(pval) 354 | } 355 | 356 | 357 | DEgene_disease<-function(expr.x,istat.xx,confound.x,genes){ 358 | 359 | library(data.table) 360 | 361 | confound.x=t(confound.x) 362 | 363 | pval=foreach(itr=1:length(genes),.inorder=T,.combine='cbind') %do% { #par 364 | geneid.x=genes[itr]; 365 | yinput=istat.xx 366 | xinput=t(as.matrix(rbind(expr.x[geneid.x,],confound.x))); xinput0=t(as.matrix(confound.x)); 367 | colnames(xinput)=c("expr",colnames(xinput)[-c(1)]); 368 | H1=lm(yinput~xinput); H0=lm(yinput~xinput0); LR=lrtest(H1,H0); 369 | #pval=append(pval,LR[[itr]]$"Pr(>Chisq)"[2]); 370 | return(LR$"Pr(>Chisq)"[2]) 371 | } 372 | return(pval) 373 | } 374 | 375 | 376 | 377 | result=list(); 378 | 379 | expr.pre=as.matrix(expr.pre); expr.org=as.matrix(expr.org); blood.expr=as.matrix(blood.expr); 380 | 
381 | pat.com.x=colnames(expr.pre) 382 | 383 | ind3_d=which(istat.x==1);ind3_n=which(istat.x==-1); 384 | 385 | x.pred=c(); x.raw=c();x.blood=c(); 386 | 387 | #eachens.x=foreach(itr=1:ENS,.inorder=T,.combine='cbind') %dopar% { 388 | eachens.x=foreach(itr=1:ENS,.inorder=T) %do% { 389 | tag=0 390 | itag=0 391 | while(!tag){ 392 | folds_d=sample(1:nfold,length(ind3_d),replace=T); folds_n=sample(1:nfold,length(ind3_n),replace=T); 393 | tnn=data.frame(table(folds_n)); td=data.frame(table(folds_d)); 394 | u=(length(unique(folds_d))==nfold)*1; v=(length(unique(folds_n))==nfold)*1; 395 | tag=u*v*((sum((tnn$Freq>4)*1)==nfold)*1)*((sum((td$Freq>4)*1)==nfold)*1); 396 | itag=itag+1; print(c(itag,tag)); } 397 | 398 | foldsamples=c(folds_d,folds_n); 399 | 400 | for(ifold in 1:nfold) { 401 | 402 | val1_d=which(folds_d==ifold); val1_n=which(folds_n==ifold); 403 | trn1_d=which(folds_d!=ifold); trn1_n=which(folds_n!=ifold); 404 | 405 | train=c(ind3_d[trn1_d],ind3_n[trn1_n]) 406 | test=c(ind3_d[val1_d],ind3_n[val1_n]) # val has indices of istat 407 | 408 | istat_trn=istat.x[train]; istat_val=istat.x[test]; 409 | 410 | expr.TEST.im=expr.pre[,test] 411 | expr.TRAIN.im=expr.pre[,train] 412 | 413 | expr.TEST.original=expr.org[,test] 414 | expr.TRAIN.original=expr.org[,train] 415 | 416 | expr.TRAIN.blood=blood.expr[,train] 417 | expr.TEST.blood=blood.expr[,test] 418 | 419 | x.pred=append(x.pred,cvlasso(t(expr.TRAIN.im),t(expr.TEST.im),istat_trn,istat_val)) 420 | x.raw=append(x.raw,cvlasso(t(expr.TRAIN.original),t(expr.TEST.original),istat_trn,istat_val)) 421 | x.blood=append(x.blood,cvlasso(t(expr.TRAIN.blood),t(expr.TEST.blood),istat_trn,istat_val)) 422 | 423 | 424 | } #nfold loop ends 425 | 426 | print("step2") 427 | asd=rbind(x.pred,x.raw,x.blood); 428 | return(list(asd,foldsamples)) 429 | } #---ENS loop 430 | 431 | eachens=NULL;foldsamples.x=NULL; 432 | for(i in seq(ENS)){eachens=cbind(eachens,eachens.x[[i]][[1]]); 433 | foldsamples.x=cbind(foldsamples.x,eachens.x[[i]][[2]]);} 434 | 
435 | 436 | result[["predicted-AUC"]]=eachens[1,]; 437 | result[["original-AUC"]]=eachens[2,]; 438 | result[["Blood-AUC"]]=eachens[3,]; 439 | result[["ensfold"]]=foldsamples.x; 440 | return(result) 441 | } 442 | 443 | 444 | 445 | regression_glmnet_gene_splicing_M2<-function(nPC1,nPC2,temp.tss1,temp.snps,temp.tss2,genes,method,confund.x){ 446 | 447 | #nPC1=PC1; nPC2=PC2;temp.tss1=gtex.tss1.pc; temp.snps=splice.mat.pc; temp.tss2=gtex.tss2;genes=gene[1:10];method="lasso";confund.x=confund; 448 | 449 | require(foreach) 450 | require(glmnet) 451 | #library(doMC) 452 | #registerDoMC(cores=25) 453 | if(method=="lasso"){index=1} 454 | if(method=="ridge"){index=0} 455 | 456 | gtex.train.tss2=as.matrix(temp.tss2); 457 | gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC1,],temp.snps[1:nPC2,],t(confund.x))); 458 | 459 | eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='c') %do% { 460 | geneid=itr; 461 | fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5) 462 | lam=fit.cv$lambda.min 463 | fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, lambda=lam) 464 | #pred=predict(fit1,newx=t(gtex.test.tss1.pc)) 465 | 466 | temp=list(fit1) 467 | return(temp) 468 | } #gene loop ends 469 | names(eachgene)=genes 470 | return(eachgene) 471 | 472 | } 473 | 474 | regression_glmnet_gene_M1<-function(nPC,temp.tss1,temp.tss2,genes,method,confund.x){ 475 | require(foreach) 476 | require(glmnet) 477 | #library(doMC) 478 | #registerDoMC(cores=25) 479 | if(method=="lasso"){index=1} 480 | if(method=="ridge"){index=0} 481 | 482 | gtex.train.tss2=as.matrix(temp.tss2); 483 | gtex.train.tss1.pc=as.matrix(rbind(temp.tss1[1:nPC,],t(confund.x))); 484 | 485 | eachgene=foreach(itr=1:length(genes),.inorder=T,.combine='c') %do% { 486 | geneid=itr; 487 | fit.cv=cv.glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index,nfolds=5) 488 | lam=fit.cv$lambda.min 489 | fit1 = glmnet(t(gtex.train.tss1.pc),gtex.train.tss2[geneid,], alpha=index, 
lambda=lam) 490 | #pred=predict(fit1,newx=t(gtex.test.tss1.pc)) 491 | 492 | temp=list(fit1) 493 | return(temp) 494 | } #gene loop ends 495 | names(eachgene)=genes 496 | return(eachgene) 497 | 498 | } 499 | 500 | 501 | 502 | --------------------------------------------------------------------------------