├── src
│   ├── readme
│   ├── extract_SNP_location.py
│   ├── genotype_2_bed.py
│   ├── extract_3UTR_location.py
│   ├── merge_finemap_results.R
│   ├── QTL_plot.R
│   ├── extract_read_depth.py
│   ├── merge_apa_quant_res_by_chr.R
│   ├── finemapping.R
│   ├── generate_configure_for_Dapars2.py
│   ├── prepare_susieR_uniqGene_location.py
│   ├── recode_with_012.py
│   ├── run_fine_mapping.sh
│   ├── run_3aQTL_mapping.R
│   ├── prepare_inputs_for_finemapping.sh
│   ├── DaPars_Extract_Anno.py
│   ├── curate_pheno_geno_covariates.R
│   ├── prepare_inputs_for_apa_quant.sh
│   ├── prepare_inputs_for_3aQTL_mapping.sh
│   ├── Dapars2_Multi_Sample.py
│   └── DaPars2_Multi_Sample_Multi_Chr.py
├── 3aQTL-pipe_Test_Dataset.zip
├── LICENSE
└── README.md
/src/readme:
--------------------------------------------------------------------------------
1 | Updated source code of 3aQTL-pipe
2 |
--------------------------------------------------------------------------------
/3aQTL-pipe_Test_Dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/3UTR/3aQTL-pipe/HEAD/3aQTL-pipe_Test_Dataset.zip
--------------------------------------------------------------------------------
/src/extract_SNP_location.py:
--------------------------------------------------------------------------------
1 | '''
2 | SNP locations can be extracted directly from the processed genotype file (they are embedded in the SNP IDs)
3 | run in a python3 environment
4 | usage: python extract_SNP_location.py --genotype_bed /path/to/genotype_matrix.bed --output /path/to/output/snp_location.txt
5 | '''
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser(description='')
9 | parser.add_argument('--genotype_bed',type=str,help="provide the transformed genotype matrix")
10 | parser.add_argument('--output',type=str, default="snp_location.txt",help="specify the SNP location file")
11 |
12 | args = parser.parse_args()
13 |
14 |
15 | fh = open(args.genotype_bed,'r')
16 | fho = open(args.output,'w')
17 | header = fh.readline()
18 | print("SNP\tChr\tPos",file=fho)
19 | for line in fh.readlines():
20 | line = line.strip()
21 | snp = line.split("\t")[0]
22 | w = snp.split("_")
23 | if len(w)>=2:
24 | # SNP IDs are encoded as chr_pos_ref_alt, so the chromosome and position are the first two fields
25 | chrom,pos = snp.split("_")[0:2]
26 | print("%s\t%s\t%s" % (snp,chrom,pos), file=fho)
27 | else:
28 | print("Error:",snp)
29 | fh.close()
30 | fho.close()
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Xudong Zou
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/genotype_2_bed.py:
--------------------------------------------------------------------------------
1 | # Transform the 012 genotype matrix into a bed file
2 | # by adding three columns (chr start end) as the first three columns
3 | # output the header line of the genotype matrix for further use
4 |
5 | import argparse
6 | # - Main
7 | if __name__ == "__main__":
8 | parser = argparse.ArgumentParser(description="")
9 | parser.add_argument('--genotype', help="provide the genotype matrix used in 3'aQTL mapping")
10 | parser.add_argument('--out_bed', help="specify the output bed file name")
11 | parser.add_argument('--out_header', help="specify a file to store the genotype header")
12 |
13 | args = parser.parse_args()
14 |
15 | fh = open(args.genotype,'r')
16 | fho = open(args.out_bed,'w')
17 | fho_header = open(args.out_header,'w')
18 | header = fh.readline().strip()
19 | print(header, file=fho_header)
20 | fho_header.close()
21 | i = 0
22 | for line in fh.readlines():
23 | i += 1
24 | line = line.strip()
25 | snp = line.split("\t")[0]
26 | chrom,pos,ref,alt = snp.split("_")
27 | pos = int(pos)
28 | print("%s\t%d\t%d\t%s" % (chrom,pos-1,pos,line), file=fho)
29 | # print(i)
30 |
31 | print(i,"SNPs have been processed!")
32 | fh.close()
33 | fho.close()
--------------------------------------------------------------------------------
/src/extract_3UTR_location.py:
--------------------------------------------------------------------------------
1 | '''
2 | @ the 3' UTR location of each transcript can be extracted from the DaPars2 results file,
3 | @ which reports the 3'UTR location in its output (the fourth column)
4 | run in a python3 environment
5 | usage: python extract_3UTR_location.py --dapars_res /path/to/Dapars2_res.all_chromosomes.txt --output /path/to/3utr_location.txt
6 | '''
7 | import argparse
8 |
9 | parser = argparse.ArgumentParser(description='')
10 | parser.add_argument('--dapars_res',type=str,help="provide the output file generated by dapars2; the merged all-chromosomes file is required if dapars2 was run on each chromosome separately")
11 | parser.add_argument('--output',type=str, default="3utr_location.txt",help="specify the 3' UTR location file")
12 |
13 | args = parser.parse_args()
14 |
15 | gene2loc = {} # a dict to record the location
16 | gene_order = [] # store every gene in this list to keep order for output
17 |
18 | # -- load dapars result file and extract gene and location info
19 | fh = open(args.dapars_res,'r')
20 | header = fh.readline()
21 | for line in fh.readlines():
22 | line = line.strip()
23 | w = line.split("\t")
24 | chrom,loc_code = w[3].split(":")
25 | start,end = loc_code.split("-")
26 |
27 | gene2loc[w[0]] = (chrom,start,end)
28 | gene_order.append(w[0])
29 | fh.close()
30 | print(len(gene_order),"genes have been processed!")
31 |
32 | # -- output
33 | fho = open(args.output,'w')
34 | print("Gene\tChr\tStart\tEnd", file=fho)
35 | for gene in gene_order:
36 | if gene in gene2loc:
37 | print("%s\t%s\t%s\t%s" % (gene,gene2loc[gene][0],gene2loc[gene][1],gene2loc[gene][2]), file=fho)
38 | fho.close()
39 |
40 | print("Done!")
--------------------------------------------------------------------------------
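Taken together, the utilities above produce the location tables that Matrix-eQTL consumes. A minimal usage sketch, assuming the default file layout used by the other scripts in this repository:

```bash
# SNP coordinates are parsed out of the chr_pos_ref_alt SNP IDs in column 1
python src/extract_SNP_location.py \
    --genotype_bed ./Matrix_eQTL/Genotype_matrix.txt \
    --output ./Matrix_eQTL/snp_location.txt

# 3'UTR coordinates come from column 4 (Loci) of the merged DaPars2 table
python src/extract_3UTR_location.py \
    --dapars_res Dapars2_res.all_chromosomes.txt \
    --output ./Matrix_eQTL/3UTR_location.txt
```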
/src/merge_finemap_results.R:
--------------------------------------------------------------------------------
1 | library(optparse)
2 |
3 | option_list <- list(
4 | make_option(c("-d","--directory_finemap"),type="character",default="./FineMapping",action="store",help="the base directory of the fine-mapping run, default is ./FineMapping"),
5 | make_option(c("-g","--gene_list"),type="character",default="picked_asso_list.loc_1000000.txt",action="store",help="the file containing the gene list used for fine-mapping, default is picked_asso_list.loc_1000000.txt"))
6 |
7 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
8 |
9 | basedir <- opt$directory_finemap # the base path of the susieR analysis
10 | aGene_file <- opt$gene_list
11 | setwd(basedir)
12 | cat('Options:\n','basedir:',basedir,'\naGene_file:',aGene_file,'\n')
13 | gene_list <- read.table(paste0("./input/",aGene_file),header=F,sep="\t")
14 | gene_list <- gene_list[,1]
15 | independent_snp_count <- c()
16 |
17 | susie_df <- data.frame(locus_id=c(),variant_id=c(),pip=c(),cs=c(),cs_size=c(),cs_purity=c())
18 | for(idx in 1:length(gene_list)){
19 | file_name <- paste0("./output/",gene_list[idx],"/3aQTL.SuSiE.txt")
20 | cat(file_name,"\n")
21 | if (file.exists(file_name)){
22 | df <- read.table(file_name,header=T,sep=" ")
23 | independent_snp_count[idx] <- dim(df)[1]
24 |
25 | if (dim(df)[1]>0){
26 | df$locus_id <- gene_list[idx]
27 | susie_df <- rbind(susie_df,df)
28 | }
29 | }else{
30 | independent_snp_count[idx] <- NA
31 | }
32 | }
33 |
34 | summary_susie <- data.frame(Gene=gene_list,Count=independent_snp_count)
35 |
36 | write.table(susie_df,file="susieR_res.all_genes.txt",quote=F,row.names=F,sep="\t")
37 | write.table(summary_susie,file="susieR_res.stat.txt",quote=F,row.names=F,sep="\t")
--------------------------------------------------------------------------------
/src/QTL_plot.R:
--------------------------------------------------------------------------------
1 | library(optparse)
2 |
3 | option_list <- list(
4 | make_option(c("-s","--snp"),type="character",default="NA",action="store",help="specify a SNP"),
5 | make_option(c("-g","--gene"),type="character",default="NA",action="store",help="specify a gene"),
6 | make_option(c("-G","--genotype"),type="character",default="./Matrix_eQTL/Genotype_matrix.txt",action="store",help="specify the genotype matrix used in 3'aQTL mapping, default is ./Matrix_eQTL/Genotype_matrix.txt"),
7 | make_option(c("-P","--phenotype"),type="character",default="./Matrix_eQTL/Phenotype_matrix.txt",action="store",help="specify the phenotype matrix used in 3'aQTL mapping, default is ./Matrix_eQTL/Phenotype_matrix.txt")
8 | )
9 |
10 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
11 |
12 | # load the genotype matrix and the phenotype matrix
13 | gt <- read.table(opt$genotype,header=T,sep="\t", check.names=FALSE)
14 | pt <- read.table(opt$phenotype,header=T,sep="\t", check.names=FALSE)
15 |
16 | rownames(gt) <- gt[,1]
17 | rownames(pt) <- pt[,1]
18 | gt <- gt[,-1]
19 | pt <- pt[,-1]
20 |
21 | snp <- as.character(opt$snp)
22 | gene <- as.character(opt$gene)
23 | # gene IDs look like RefSeqID|GeneSymbol|chr|strand; use the gene symbol in the output file name
24 | geneName <- strsplit(gene,split="|",fixed=T)[[1]][2]
25 |
26 |
27 | e1 = as.numeric(pt[which(rownames(pt)==gene),])
28 | s1 = as.numeric(gt[which(rownames(gt)==snp),])
29 |
30 | lm1 = lm(e1 ~ s1)
31 | pdf(paste(snp, geneName,"pdf", sep="."))
32 | boxplot(e1 ~ s1, lwd = 2, xaxt="n",xlab="Genotype",ylab="Normalized PDUI",main=paste(snp,gene,sep=" || "))
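# The genotype vector s1 holds 0/1/2 ALT-allele counts (see recode_with_012.py),
# so the three boxes are labelled REF, HET and ALT below.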
33 | axis(1,at=c(1:3),labels=c("REF","HET","ALT"))
34 | stripchart(e1 ~ s1, vertical = TRUE, method = "jitter", add = TRUE, pch = 20, col = c(rgb(102,194,165,max=255),rgb(252,141,98,max=255),rgb(141,160,203,max=255)))
35 | dev.off()
--------------------------------------------------------------------------------
/src/extract_read_depth.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path
3 |
4 | # -- Functions
5 | def load_sample_list(input_sample_list_file):
6 | sample_list = []
7 | for line in open(input_sample_list_file,'r'):
8 | line = line.strip()
9 | sample_id = line.split("\t")[0]
10 | sample_list.append(sample_id)
11 |
12 | return sample_list
13 |
14 | def extract_total_reads(input_flagstat_file):
15 | num_line = 0
16 | total_reads = '-1'
17 | #print input_flagstat_file
18 | # assumes the samtools (<1.13) flagstat layout, in which the 5th line reports the mapped reads
19 | for line in open(input_flagstat_file,'r'):
20 | num_line += 1
21 | if num_line == 5:
22 | total_reads = line.strip().split(' ')[0]
23 | break
24 | return total_reads
25 |
26 |
27 | # -- Main
28 |
29 | if __name__ == '__main__':
30 | parser = argparse.ArgumentParser(description='')
31 | parser.add_argument('--sample_list',help="the file containing all samples")
32 | parser.add_argument('--path_flagstat',default="./tmp",help="the location of the flagstat files")
33 | parser.add_argument('--path_wig',default="./wig",help="the location of the wig files")
34 | parser.add_argument('--output',help="the final output file with the read depth of each sample")
35 |
36 | args = parser.parse_args()
37 |
38 | selected_samples = load_sample_list(os.path.abspath(args.sample_list))
39 | path_wig = os.path.abspath(args.path_wig)
40 | path_flagstat = os.path.abspath(args.path_flagstat)
41 |
42 | if path_wig[-1] != "/":
43 | path_wig += "/"
44 | else:
45 | pass
46 |
47 | if path_flagstat[-1] != "/":
48 | path_flagstat += "/"
49 | else:
50 | pass
51 |
52 | fho = open(args.output,'w')
53 | for sample in selected_samples:
54 | filename_sample = sample + ".flagstat"
55 | read_depth = extract_total_reads(path_flagstat + filename_sample)
56 | wig_location = path_wig + sample + ".wig"
57 | print("%s\t%s" % (wig_location,read_depth),file=fho)
58 |
59 | fho.close()
--------------------------------------------------------------------------------
/src/merge_apa_quant_res_by_chr.R:
--------------------------------------------------------------------------------
1 | # merge Dapars2 output by chromosome
2 | library(optparse)
3 | # -- global variable
4 | option_list <- list(
5 | make_option(c("-d","--dir_prefix"), type = "character", default = "Dapars2_out",
6 | action = "store", help = "Specify the directory prefix of the DaPars2 output"),
7 | make_option(c("-f", "--file_prefix"), type = "character", default = "Dapars2",
8 | action = "store", help = "Specify the file prefix of the DaPars2 output"),
9 | make_option(c("-s", "--sample_list"), type = "character", default = "sample_list.txt",
10 | action = "store", help = "A file containing the sample list"),
11 | make_option(c("-c", "--chr_list"), type = "character", default = "chrList.txt",
12 | action = "store", help = "A file containing the chromosome list"),
13 | make_option(c("-o", "--output"), type = "character", default = "Dapars2_res.all_chromosomes.txt",
14 | action = "store", help = "Specify the output file name")
15 | )
16 |
17 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
18 | dir_pre <- opt$dir_prefix
19 | file_pre <- opt$file_prefix
20 | bamList <- opt$sample_list
21 | chromList <-
opt$chr_list 22 | outFile <- opt$output 23 | cat("dir_pre:",dir_pre,"\nfile_pre:",file_pre,"\nchromList:",chromList,"\n") 24 | # -- functions 25 | load_dapars2_res <- function(chromosome,new_header){ 26 | input_file <- paste0(dir_pre,"_",chromosome,"/",file_pre,"_result_temp.",chromosome,".txt") 27 | dap_res <- read.table(input_file,header=T, sep="\t") 28 | names(dap_res) <- new_header 29 | 30 | return(dap_res) 31 | } 32 | 33 | 34 | # -- main 35 | 36 | # load samples 37 | dat <- read.table(bamList,header=F) 38 | sample_list <- as.character(dat$V1) 39 | col_names <- c("Gene","fit_value","Predicted_Proximal_APA","Loci",sample_list) 40 | chrs_list <- read.table(chromList,header=F) 41 | chrs_vec <- as.character(chrs_list$V1) 42 | rm(chrs_list) 43 | if(substr(chrs_vec[1],1,3)!="chr"){ 44 | chrs_vec <- paste0("chr",chrs_vec) 45 | } 46 | 47 | chrs_vec 48 | res.df <- data.frame() 49 | 50 | for(chr in chrs_vec){ 51 | temp.df <- load_dapars2_res(chr,col_names) 52 | print(paste(chr,dim(temp.df)[1],sep=":")) 53 | res.df <- rbind(res.df,temp.df) 54 | } 55 | 56 | dim(res.df) 57 | write.table(res.df,file=outFile,quote=F,sep="\t",row.names=F) 58 | 59 | 60 | -------------------------------------------------------------------------------- /src/finemapping.R: -------------------------------------------------------------------------------- 1 | # perform fine mapping on one gene 2 | args <- commandArgs(trailingOnly=TRUE) 3 | dir <- args[1] 4 | Lvalue <- as.integer(args[2]) 5 | sp_var <- as.numeric(args[3]) 6 | mPIP <- as.numeric(args[4]) 7 | 8 | setwd(dir) 9 | 10 | cat('Running environment:',getwd(),'\nOptions:\n','Lvalue:',Lvalue,'\nsp_var:',sp_var,'\nmPIP:',mPIP,'\n') 11 | genotype = '3aQTL.vcf' 12 | phenotype = 'expr.phen' 13 | 14 | # output file prefix 15 | prefix = tools::file_path_sans_ext(genotype) 16 | 17 | X = t(read.table(genotype, head=T, row.names=1, quote="'", check.names=FALSE)) 18 | # fill missing values in X with mean 19 | # because susieR does not deal with missing data explicitly for now 20 | for(i in 1:ncol(X)){ 21 | X[is.na(X[,i]), i] <- mean(X[,i], na.rm = TRUE) 22 | } 23 | y = read.table(phenotype, head=F)[,-1] 24 | # Adjust row names for phenotype data convention 25 | #rownames(y) = gsub("-", ".", y[,1]) 26 | rownames(y) = y[,1] 27 | # Obtain intersect of X and y data, and reorder X to match y ordering 28 | x_idx = match(rownames(y), rownames(X)) 29 | y_idx = which(!is.na(x_idx)) 30 | x_idx = x_idx[!is.na(x_idx)] 31 | X = X[x_idx, ] 32 | y = y[y_idx,] 33 | if (!all(rownames(X) == rownames(y))) stop("X and y rownames mismatch") 34 | # Run SuSiE 35 | res = susieR::susie(X, y[,2], L=Lvalue, scaled_prior_variance=sp_var) 36 | # Visualize result 37 | pdf(paste0(prefix, '.SuSiE.pdf'), width=10,height=5) 38 | susieR::susie_plot(res, y = 'PIP') 39 | dev.off() 40 | # Format results focusing only on signals 41 | res$var_names = colnames(X) 42 | get_susie_output = function(unit, res, pip_cutoff = mPIP) { 43 | cs_id = cs_size = cs_purity = rep(NA, length(res$var_names)) 44 | num_cs = length(res$sets$cs) 45 | for(id in 1:num_cs){ 46 | idx = res$sets$cs[[id]] 47 | cs_id[idx] = names(res$sets$cs)[id] 48 | cs_size[idx] = length(res$sets$cs[[id]]) 49 | cs_purity[idx] = res$sets$purity[id,1] 50 | } 51 | out = cbind.data.frame(rep(unit, length(res$var_names)), 52 | res$var_names, 53 | res$pip, cs_id, cs_size, cs_purity) 54 | colnames(out) = c("locus_id", "variant_id", "pip", "cs", "cs_size", "cs_purity") 55 | out[which(out[,3] >= pip_cutoff | !is.na(out[,4])), ] 56 | } 57 | text_output = 
get_susie_output(genotype, res)
58 | # Output to files
59 | write.table(text_output, paste0(prefix, '.SuSiE.txt'), quote=FALSE, row.names=FALSE)
60 | saveRDS(res, paste0(prefix, '.SuSiE.rds'))
--------------------------------------------------------------------------------
/src/generate_configure_for_Dapars2.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path
3 |
4 | # -- Functions
5 | def extract_all_wigs(input_seq_depth_file):
6 | wig_files = []
7 | for line in open(input_seq_depth_file,'r'):
8 | line = line.strip()
9 | w = line.split("\t")
10 | wig_files.append(w[0])
11 |
12 | return wig_files
13 |
14 |
15 | # -- Main
16 |
17 | if __name__ == '__main__':
18 | parser = argparse.ArgumentParser(description='')
19 | parser.add_argument('--annotation_3utr',help="the location of the reference 3'UTR bed file")
20 | parser.add_argument('--wigFile_depth',help="the index file listing all wig files and their read depth")
21 | parser.add_argument('--coverage_threshold',type=str,default="10",help="specify the coverage threshold, default=10")
22 | parser.add_argument('--threads',type=str,default="1",help="specify the number of threads used, default=1")
23 | parser.add_argument('--out_dir_prefix',type=str, default="Dapars2_out",help="specify the directory prefix of the dapars2 output, default='Dapars2_out'")
24 | parser.add_argument('--out_file_prefix',type=str,default="Dapars2",help="specify the result file prefix of the dapars2 output, default='Dapars2'")
25 | parser.add_argument('--out_config_name',type=str,default="Dapars2_running_configure.txt",help="specify the configure file name, default='Dapars2_running_configure.txt'")
26 |
27 | args = parser.parse_args()
28 |
29 | configure_file_name = args.out_config_name
30 | fho = open(configure_file_name,'w')
31 | # print Annotated_3UTR
32 | print("# Specify the reference of 3'UTR region", file=fho)
33 | print("\nAnnotated_3UTR=" + os.path.abspath(args.annotation_3utr), file=fho)
34 |
35 | # print wig files
36 | all_wig_files = extract_all_wigs(os.path.abspath(args.wigFile_depth))
37 | print("\n# A comma separated list of wig files of all samples", file=fho)
38 | print("\nAligned_Wig_files=" + ",".join(all_wig_files), file=fho)
39 |
40 | # specify Output_directory and Output_result_file
41 | print("\nOutput_directory=" + args.out_dir_prefix, file=fho)
42 | print("\nOutput_result_file=" + args.out_file_prefix, file=fho)
43 |
44 | # specify Coverage_threshold
45 | print("\n# Specify Coverage threshold", file=fho)
46 | print("\nCoverage_threshold=" + args.coverage_threshold, file=fho)
47 |
48 | # specify the Num_Threads to process the analysis
49 | print("\n# Specify the number of threads to process the analysis", file=fho)
50 | print("\nNum_Threads=" + args.threads, file=fho)
51 |
52 | # Provide sequencing_depth_file for normalization
53 | print("\n# Provide sequencing depth file for normalization", file=fho)
54 | print("\nsequencing_depth_file=" + os.path.abspath(args.wigFile_depth), file=fho)
55 | fho.close()
--------------------------------------------------------------------------------
/src/prepare_susieR_uniqGene_location.py:
--------------------------------------------------------------------------------
1 | '''
2 | Desc:
3 | Obtain unique and significant aGenes and their extended location (specified by --extend_size, default: 1Mb)
4 | Input files: 3utr_location.txt, Cis_aQTL_all_control_gene_exprs.txt
5 | Output format: Gene\tchr:start-end
6 | '''
7 |
8 | import argparse
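# Expected input layouts, as parsed below: --utr_loc_file has a header and the
# columns Gene/Chr/Start/End; --aQTL_map has a header, with the gene ID in
# column 2 and the FDR in column 6 (the Matrix-eQTL cis output format).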
9 | import os.path
10 | import time
11 |
12 | # - Main
13 | if __name__ == "__main__":
14 | parser = argparse.ArgumentParser(description="")
15 | parser.add_argument('--utr_loc_file',help="input the reference 3'UTR location file, e.g. 3utr_location.txt")
16 | parser.add_argument('--aQTL_map',help="specify the aQTL mapping file")
17 | parser.add_argument('--Max_FDR',type=float,default=0.05,help="specify the maximum FDR for selecting significant aQTL associations")
18 | parser.add_argument('--extend_size',type=int, default=1000000,help="Int, extend N bp (default N=1e6) at both sides")
19 | parser.add_argument('--outdir',help="specify the output dir")
20 | parser.add_argument('--output',help="specify the output file name")
21 |
22 | args = parser.parse_args()
23 |
24 | # -- extract gene and location
25 | print("Start extracting 3'UTR location file...")
26 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
27 |
28 | fh = open(args.utr_loc_file,'r')
29 | gene2loc = {}
30 | head = fh.readline()
31 | ext_size = int(args.extend_size)
32 |
33 | for line in fh.readlines():
34 | line = line.strip()
35 | w = line.split("\t")
36 | start = max([0,int(w[2]) - ext_size])
37 | end = int(w[3]) + ext_size
38 | loc = w[1] + ":" + str(start) + "-" + str(end)
39 | gene2loc[w[0]] = loc
40 | fh.close()
41 |
42 | # -- processing aQTL mapping files
43 | print("Start processing aQTL mapping file...")
44 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
45 | fdr_cutoff = float(args.Max_FDR)
46 | fh = open(args.aQTL_map,'r')
47 | outdict = {}
48 | for line in fh.readlines()[1:]:
49 | line = line.strip()
50 | w = line.split("\t")
51 | gene = w[1]
52 | fdr = float(w[5])
53 | if fdr < fdr_cutoff:
54 | if gene not in outdict:
55 | outdict[gene] = gene2loc[gene]
56 | else:
57 | continue
58 | else:
59 | continue
60 |
61 | fh.close()
62 |
63 | print(len(outdict),"unique aGenes were processed.")
64 |
65 |
66 | output_path = os.path.abspath(args.outdir)
67 | fho = open(output_path + "/" + args.output,'w')
68 | i = 0
69 | for gene in outdict:
70 | i += 1
71 | print(gene + "\t" + outdict[gene], file=fho)
72 | print(i)
73 | fho.close()
74 | print("Done!")
75 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
--------------------------------------------------------------------------------
/src/recode_with_012.py:
--------------------------------------------------------------------------------
1 | '''
2 | after getting the *.frq and *.FORMAT files from vcftools,
3 | this script extracts the allelic genotype info from *.frq and recodes the GT calls in *.FORMAT into 012 format
4 | '''
5 |
6 | import argparse
7 | import time
8 |
9 |
10 | # - Functions
11 | # extracting allelic genotype from frq
12 | def extract_gt_from_frq(frq_file):
13 | snp2gt = {}
14 | fh = open(frq_file,'r')
15 | for line in fh.readlines()[1:]:
16 | line = line.strip()
17 | w = line.split("\t")
18 | if "chr" not in w[0]:
19 | snp = "chr" + w[0] + "_" + w[1]
20 | else:
21 | snp = w[0] + "_" + w[1]
22 | allele_ref = w[4].split(":")[0]
23 | allele_alt = w[5].split(":")[0]
24 |
25 | if snp not in snp2gt:
26 | snp2gt[snp] = allele_ref + "_" + allele_alt
27 | else:
28 | continue
29 | fh.close()
30 | return snp2gt
31 |
32 |
33 |
34 | # recode genotype to 012 code
35 | def recode_with_012(gt):
36 | if "." in gt:
37 | return "NA"
38 | else:
39 | allele_1 = int(gt[0])
40 | allele_2 = int(gt[-1])
41 | return str(allele_1+allele_2)
42 |
43 | # - Main
44 | if __name__ == "__main__":
45 | parser = argparse.ArgumentParser(description="")
46 | parser.add_argument('--frq',help="input the frq file generated by vcftools")
47 | parser.add_argument('--GT',help="specify the GT format file generated by vcftools")
48 | parser.add_argument('--output',help="specify the output file with 012-recoded genotypes")
49 |
50 | args = parser.parse_args()
51 |
52 | # -- extract allelic gt from frq file
53 | print("Start processing frq file...")
54 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
55 |
56 | snv_gt = extract_gt_from_frq(args.frq)
57 | print("Obtained genotypes of %d SNPs" % (len(snv_gt)))
58 |
59 | # -- recode gt in GT file
60 | print("Start processing GT file...")
61 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
62 |
63 | fh = open(args.GT,'r')
64 | fho = open(args.output,'w')
65 | header = fh.readline().strip().split("\t")
66 | print("%s\t%s" % ("id","\t".join(header[2:])), file=fho)
67 | for line in fh.readlines():
68 | line = line.strip()
69 | w = line.split("\t")
70 | if "chr" not in w[0]:
71 | snp = "chr" + w[0] + "_" + w[1]
72 | else:
73 | snp = w[0] + "_" + w[1]
74 |
75 | if snp in snv_gt:
76 | snp = snp + "_" + snv_gt[snp]
77 | else:
78 | snp = snp
79 |
80 | gt_012 = list(map(recode_with_012,w[2:]))
81 |
82 | print("%s\t%s" % (snp,"\t".join(gt_012)), file=fho)
83 |
84 | fh.close()
85 | fho.close()
86 |
87 | print("The recoded gt file has been written to",args.output)
88 | print("Done!")
89 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 3'aQTL-pipe
2 |
3 | [![Github Release](https://img.shields.io/badge/release-v1.1-brightgreen)](https://github.com/3UTR/3aQTL-pipe)
4 | [![python Release](https://img.shields.io/badge/python-3.8-brightgreen)](https://www.python.org/downloads/)
5 | [![R Release](https://img.shields.io/badge/R-3.6.3-brightgreen)](https://cran.r-project.org/)
6 | [![DOI](https://zenodo.org/badge/480019097.svg)](https://zenodo.org/badge/latestdoi/480019097)
7 |
8 | **Abbreviations**
9 | * APA: alternative polyadenylation
10 | * 3'aQTL: 3′UTR alternative polyadenylation quantitative trait loci
11 |
12 | This pipeline describes step-by-step methods for analyzing dynamic alternative polyadenylation events across population-scale samples and for performing association analysis between common genetic variants and APA usage, yielding a map of the genetic regulation of APA.
13 |
14 | The scripts in this repository have been tested on 89 samples from the Geuvadis RNA-seq Project and the GTEx Project.
15 | For conditions on reuse of these scripts, please refer to the LICENSE file.
16 |
17 | ## Using this pipeline
18 | Details on how to prepare the environment and use the scripts can be found on the [GitHub wiki](https://github.com/3UTR/3aQTL-pipe/wiki) pages for this repository.
19 |
20 | This pipeline relies on [DaPars2](https://github.com/3UTR/DaPars2) for APA quantification, [Matrix-eQTL](http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/) for association mapping, and [susieR](https://github.com/stephenslab/susieR) for fine-mapping.
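A typical end-to-end run chains the drivers in `src/` roughly as follows (an illustrative sketch; the input file names here are hypothetical, and the wiki walkthrough is authoritative):

```bash
# 1. Prepare wig files, read depths, the 3'UTR reference and the DaPars2 configure file
bash src/prepare_inputs_for_apa_quant.sh -s sample_list.txt -g refseq_anno.bed -r refID2symbol.txt -t 8 -c 15
# 2. Quantify APA with DaPars2, then merge the per-chromosome results
python src/DaPars2_Multi_Sample_Multi_Chr.py Dapars2_running_configure.txt chrList.txt
Rscript src/merge_apa_quant_res_by_chr.R -s sample_list.txt -c chrList.txt
# 3. Build the phenotype/genotype/covariate matrices and map 3'aQTLs with Matrix-eQTL
bash src/prepare_inputs_for_3aQTL_mapping.sh
Rscript src/run_3aQTL_mapping.R
# 4. Prepare per-gene inputs and fine-map significant loci with susieR
bash src/prepare_inputs_for_finemapping.sh
bash src/run_fine_mapping.sh
```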
22 |
23 | ## Authors
24 |
25 | Xudong Zou, Ruofan Ding, Wenyan Chen, Gao Wang, Shumin Cheng, Wei Li, Lei Li
26 |
27 | Institute of Systems and Physical Biology, Shenzhen Bay Laboratory, Shenzhen 518055, China
28 |
29 | ## Citation
30 | * Code and Execution:
31 |
32 | **Using population-scale transcriptomic and genomic data to map 3' UTR alternative polyadenylation quantitative trait loci**
33 |
34 | Xudong Zou, Ruofan Ding, Wenyan Chen, Gao Wang, Shumin Cheng, Qin Wang, Wei Li, Lei Li. ***STAR Protocols***, 3(3):101566 **(2022)**.
35 | DOI: https://doi.org/10.1016/j.xpro.2022.101566
36 | https://www.sciencedirect.com/science/article/pii/S2666166722004464?via%3Dihub
37 |
38 | * The first 3'aQTL atlas of human tissues:
39 |
40 | **An atlas of alternative polyadenylation quantitative trait loci contributing to complex trait and disease heritability**
41 |
42 | Lei Li, Kai-Lieh Huang, Yipeng Gao, Ya Cui, Gao Wang, Nathan D. Elrod, Yumei Li, Yiling Elaine Chen, Ping Ji, Fanglue Peng, William K. Russell, Eric J. Wagner & Wei Li. ***Nature Genetics***, 53, 994-1005 **(2021)**. DOI: https://doi.org/10.1038/s41588-021-00864-5
43 |
44 | https://www.nature.com/articles/s41588-021-00864-5
45 |
46 | ## Contact
47 | For any issues, please create a GitHub Issue.
48 |
49 | ## Funding
50 | This work was supported by the National Natural Science Foundation of China (no. 32100533) and startup funds from Shenzhen Bay Laboratory to L.L.
--------------------------------------------------------------------------------
/src/run_fine_mapping.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This is a sub-pipe of 3'aQTL-pipe; this script performs fine-mapping of the 3'aQTLs detected by Matrix-eQTL, using susieR
3 | # @Xudong Zou, zouxd@szbl.ac.cn
4 | # 2022-03-30
5 |
6 | # -- Usage function
7 | script_name=$0
8 | function usage(){
9 | echo "#=============================="
10 | echo "Default usage:"
11 | echo "#=============================="
12 | echo "bash $script_name"
13 | echo "Options:"
14 | echo " -w integer, set the window size around the genes for fine-mapping"
15 | echo " -p float, specify the minimum PIP for filtering fine-mapped 3'aQTLs"
16 | echo " -L integer, specify the L value for susieR, default 10"
17 | echo " -V float, specify the scaled prior variance used in susieR, default 0.2"
18 | echo " -t integer, set the number of threads to run susieR in parallel, default 1"
19 | echo " -h print the help information"
20 | exit 1
21 | }
22 |
23 | # define global variables from command parameters
24 | currDir=`pwd`
25 | sourceDir="./src"
26 | PIP="0.1"
27 | Variance="0.2"
28 | L="10"
29 | Threads="1"
30 | window=`echo "1e6"|awk '{printf("%d",$0)}'`
31 |
32 | while getopts :w:p:L:V:t:h opt
33 | do
34 | case $opt in
35 | w)
36 | window=`echo "$OPTARG"|awk '{printf("%d",$0)}'`
37 | ;;
38 | p)
39 | PIP="$OPTARG"
40 | ;;
41 | L)
42 | L="$OPTARG"
43 | ;;
44 | V)
45 | Variance="$OPTARG"
46 | ;;
47 | t)
48 | Threads="$OPTARG"
49 | ;;
50 | h)
51 | echo "Help message:"
52 | usage
53 | ;;
54 | :)
55 | echo "The option -$OPTARG requires an argument."
56 | exit 1
57 | ;;
58 | ?)
59 | echo "Invalid option: $OPTARG"
60 | usage
61 | exit 2
62 | ;;
63 | esac
64 | done
65 |
66 | # -- Basic settings
67 | if [ ! -d "${currDir}/FineMapping/input" ]
68 | then
69 | echo "${currDir}/FineMapping/input not found!"
70 | exit
71 | fi
72 |
73 | if [ ! -d "${currDir}/FineMapping/output" ]
74 | then
75 | echo "${currDir}/FineMapping/output not found!"
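# Both FineMapping/input and FineMapping/output are created by
# prepare_inputs_for_finemapping.sh, which should be run first.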
76 | exit
77 | fi
78 |
79 | # -- Main function --
80 | function main(){
81 | echo "Running $script_name with the following parameters:"
82 | echo "*************************************************"
83 | echo "-w: $window"
84 | echo "-p: $PIP"
85 | echo "-L: $L"
86 | echo "-V: $Variance"
87 | echo "-t: $Threads"
88 | echo "*************************************************"
89 | echo "Start 3'aQTL fine mapping ..."
90 | date
91 | echo "Run fine-mapping analysis by susieR"
92 | run_fine_mapping $window $PIP $L $Variance $Threads
93 | echo "Done!"
94 | date
95 | }
96 |
97 |
98 | # -- Other functions --
99 | function run_fine_mapping(){
100 | w=$1
101 | min_PIP=$2
102 | L=$3
103 | Var=$4
104 | threads=$5
105 | if [ ! -f "${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt" ]
106 | then
107 | echo "File ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt not found!"
108 | exit
109 | fi
110 |
111 | if [ $threads -eq 1 ]
112 | then
113 | for gene in `cat ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt|cut -f1`
114 | do
115 | echo "Analyzing $gene"
116 | if [ -d "${currDir}/FineMapping/output/$gene" ]
117 | then
118 | GeneDir=${currDir}/FineMapping/output/$gene
119 | if [ -f "${GeneDir}/3aQTL.vcf" -a -f "${GeneDir}/expr.phen" ]
120 | then
121 | Rscript ${sourceDir}/finemapping.R ${GeneDir} $L $Var $min_PIP &
122 | wait
123 | else
124 | echo "${gene}: files 3aQTL.vcf and expr.phen not found!"
125 | continue
126 | fi
127 | else
128 | echo "${gene} does not exist!"
129 | continue
130 | fi
131 | done
132 | cd $currDir
133 | else
134 | if [ ! -d "${currDir}/FineMapping/output/tmp" ]
135 | then
136 | mkdir -p ${currDir}/FineMapping/output/tmp
137 | fi
138 | split -l $threads -d ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt ${currDir}/FineMapping/output/tmp/finemap_task_ &
139 | wait
140 | for task in `ls ${currDir}/FineMapping/output/tmp/finemap_task_*`
141 | do
142 | for gene in `cat $task |cut -f1`
143 | do
144 | echo "Analyzing $gene"
145 | if [ -d "${currDir}/FineMapping/output/$gene" ]
146 | then
147 | GeneDir=${currDir}/FineMapping/output/$gene
148 | if [ -f "${GeneDir}/3aQTL.vcf" -a -f "${GeneDir}/expr.phen" ]
149 | then
150 | Rscript ${sourceDir}/finemapping.R ${GeneDir} $L $Var $min_PIP &
151 | else
152 | echo "${gene}: files 3aQTL.vcf and expr.phen not found!"
153 | continue
154 | fi
155 | else
156 | echo "${gene} does not exist!"
157 | continue
158 | fi
159 | done
160 | wait
161 | done &
162 | wait
163 | cd $currDir
164 | fi
165 |
166 | }
167 |
168 | # - main
169 | main
--------------------------------------------------------------------------------
/src/run_3aQTL_mapping.R:
--------------------------------------------------------------------------------
1 | #!/opt/app/languages/R-3.6.3/bin/Rscript
2 | library(optparse)
3 | library(MatrixEQTL)
4 |
5 | option_list <- list(
6 | make_option(c("-p","--phenotype"),type="character",default="./Matrix_eQTL/Phenotype_matrix.txt",action="store",help="APA expression data for MatrixEQTL"),
7 | make_option(c("-g","--genotype"),type="character",default="./Matrix_eQTL/Genotype_matrix.txt",action="store",help="Genotype data for MatrixEQTL"),
8 | make_option(c("-c","--covariate"),type="character",default="./Matrix_eQTL/Covariate_matrix.txt",action="store",help="Covariates for MatrixEQTL"),
9 | make_option(c("-s","--snp_location"),type="character",default="./Matrix_eQTL/snp_location.txt",action="store",help="SNP locations"),
10 | make_option(c("-u","--utr_location"),type="character",default="./Matrix_eQTL/3UTR_location.txt",action="store",help="3UTR locations"),
11 | make_option(c("-w","--window"),type="numeric",default=1e6,action="store",help="window size"),
12 | make_option(c("-q","--cis_pvalue"),type="numeric",default=1e-2,action="store",help="p-value threshold for cis-3'aQTLs"),
13 | make_option(c("-Q","--trans_pvalue"),type="numeric",default=1e-5,action="store",help="p-value threshold for trans-3'aQTLs")
14 | )
15 |
16 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
17 |
18 | PHENO <- opt$phenotype
19 | GENO <- opt$genotype
20 | COVARIATE <- opt$covariate
21 | SNPLOC <- opt$snp_location
22 | UTRLOC <- opt$utr_location
23 |
24 | CIS_DISTANCE <- as.numeric(opt$window)
25 | CIS_P_CUTOFF <- as.numeric(opt$cis_pvalue)
26 | TRANS_P_CUTOFF <- as.numeric(opt$trans_pvalue)
27 |
28 | cat('Options:\n','Phenotype:',PHENO,'\n','Genotype:',GENO,'\n','Covariates:',COVARIATE,'\n','CIS_DISTANCE:',CIS_DISTANCE,'\n',
29 | 'CIS_P_CUTOFF:',CIS_P_CUTOFF,'\n','TRANS_P_CUTOFF:',TRANS_P_CUTOFF,'\n')
30 |
31 | # - Use linear model
32 | useModel = modelLINEAR # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
33 |
34 | # - Genotype file name
35 | SNP_file_name = GENO
36 | snps_location_file_name = SNPLOC
37 |
38 | # - APA expression file name
39 | expression_file_name = PHENO
40 | gene_location_file_name = UTRLOC
41 |
42 | # - Covariates file name
43 | covariates_file_name = COVARIATE
44 |
45 | # - output file names
46 | output_file_name_cis = "./Matrix_eQTL/Cis_3aQTL_all_control_gene_exprs.txt"
47 | output_file_name_tra = "./Matrix_eQTL/Trans_3aQTL_all_control_gene_exprs.txt"
48 | output_figure_name_cis = "./Matrix_eQTL/Cis_3aQTL_genotype_info_control_gene_exprs.pdf"
49 | pdf(output_figure_name_cis)
50 |
51 | # - thresholds
52 | pvOutputThreshold_cis = CIS_P_CUTOFF;
53 | pvOutputThreshold_tra = TRANS_P_CUTOFF;
54 |
55 | # - Error covariance matrix
56 | # set to numeric() for identity
57 | errorCovariance = numeric();
58 |
59 | # - Distance for local gene-SNP pairs
60 | cisDist = CIS_DISTANCE;
61 |
62 | # -- load genotype data
63 | snps = SlicedData$new();
64 | snps$fileDelimiter = "\t";
65 | snps$fileOmitCharacters = "NA";
66 | snps$fileSkipRows = 1; # one row of column labels
67 | snps$fileSkipColumns = 1; # one column of row labels
68 | snps$fileSliceSize = 2000; # read file in slices of 2,000 rows
69 | snps$LoadFile(SNP_file_name);
70 |
71 | # -- load apa expression data
72 | gene = SlicedData$new();
73 | gene$fileDelimiter = "\t"; # the TAB character
74 | gene$fileOmitCharacters = "NA"; # denote missing values;
75 | gene$fileSkipRows = 1; # one row of column labels
76 | gene$fileSkipColumns = 1; # one column of row labels
77 | gene$fileSliceSize = 2000; # read file in slices of 2,000 rows
78 | gene$LoadFile(expression_file_name);
79 |
80 | # -- load covariates data
81 | cvrt = SlicedData$new();
82 | cvrt$fileDelimiter = "\t"; # the TAB character
83 | cvrt$fileOmitCharacters = "NA"; # denote missing values;
84 | cvrt$fileSkipRows = 1; # one row of column labels
85 | cvrt$fileSkipColumns = 1; # one column of row labels
86 | if(length(covariates_file_name)>0) {
87 | cvrt$LoadFile(covariates_file_name);
88 | }
89 |
90 |
91 | ## Run the analysis
92 | snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
93 | genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
94 |
95 | me = Matrix_eQTL_main(
96 | snps = snps,
97 | gene = gene,
98 | cvrt = cvrt,
99 | output_file_name = output_file_name_tra,
100 | pvOutputThreshold = pvOutputThreshold_tra,
101 | useModel = useModel,
102 | errorCovariance = errorCovariance,
103 | verbose = TRUE,
104 | output_file_name.cis = output_file_name_cis,
105 | pvOutputThreshold.cis = pvOutputThreshold_cis,
106 | snpspos = snpspos,
107 | genepos = genepos,
108 | cisDist = cisDist,
109 | pvalue.hist = "qqplot",
110 | min.pv.by.genesnp = TRUE,
111 | noFDRsaveMemory = FALSE);
112 |
113 | gz1 <- "./Matrix_eQTL/data.RData"
114 | save.image(gz1)
115 | #unlink(output_file_name_tra);
116 | #unlink(output_file_name_cis);
117 |
118 | # -- Results
119 | cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n')
120 | cat('Detected local aQTLs:', '\n');
121 | show(me$cis$eqtls)
122 | cat('Detected distant aQTLs:', '\n');
123 | show(me$trans$eqtls)
124 |
125 | write.table(me$cis$min.pv.gene, "./Matrix_eQTL/cis.min.pv.gene.txt")
126 | save(gene,snps,file="./Matrix_eQTL/Gene_SNP.RData")
127 | save(snps,genepos,file="./Matrix_eQTL/permutation.RData")
128 |
129 | # -- plot the Q-Q plot of local and distant p-values
130 | plot(me)
131 | dev.off()
--------------------------------------------------------------------------------
/src/prepare_inputs_for_finemapping.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Prepare input data for fine-mapping by susieR
3 | # @Xudong Zou, zouxd@szbl.ac.cn
4 | # 2022-03-30
5 |
6 | # -- Usage function
7 | script_name=$0
8 | function usage(){
9 | echo "#=============================="
10 | echo "Default usage:"
11 | echo "#=============================="
12 | echo "bash $script_name"
13 | echo "Options:"
14 | echo " -g text file, the genotype matrix used in Matrix_eQTL, default is ./Matrix_eQTL/Genotype_matrix.txt"
15 | echo " -p text file, the phenotype matrix used in Matrix_eQTL, default is ./Matrix_eQTL/Phenotype_matrix.txt"
16 | echo " -a text file, the cis association list returned by Matrix-eQTL, default is ./Matrix_eQTL/Cis_3aQTL_all_control_gene_exprs.txt"
17 | echo " -u text file, the 3'UTR location file, the one also used by Matrix-eQTL, default is ./Matrix_eQTL/3UTR_location.txt"
18 | echo " -w integer, set the window size around aGenes for fine-mapping"
19 | echo " -q float, specify the maximum FDR for filtering significant 3'aQTL associations, default 0.05"
20 | echo " -h print the help information"
21 | exit 1
22 | }
23 |
24 | # define global variables from command parameters
25 | currDir=`pwd`
26 | genotype="./Matrix_eQTL/Genotype_matrix.txt"
27 | phenotype="./Matrix_eQTL/Phenotype_matrix.txt"
28 | qtl_res="./Matrix_eQTL/Cis_3aQTL_all_control_gene_exprs.txt"
29 | utr_loc="./Matrix_eQTL/3UTR_location.txt"
30 | window=`echo "1e6"|awk '{printf("%d",$0)}'`
31 | FDR="0.05"
32 | sourceDir="./src"
33 | while getopts :g:p:a:u:w:q:h opt
34 | do
35 | case $opt in
36 | g)
37 | genotype="$OPTARG"
38 | ;;
39 | p)
40 | phenotype="$OPTARG"
41 | ;;
42 | a)
43 | qtl_res="$OPTARG"
44 | ;;
45 | u)
46 | utr_loc="$OPTARG"
47 | ;;
48 | w)
49 | window=`echo "$OPTARG"|awk '{printf("%d",$0)}'`
50 | ;;
51 | q)
52 | FDR="$OPTARG"
53 | ;;
54 | h)
55 | echo "Help message:"
56 | usage
57 | ;;
58 | :)
59 | echo "The option -$OPTARG requires an argument."
60 | exit 1
61 | ;;
62 | ?)
63 | echo "Invalid option: $OPTARG"
64 | usage
65 | exit 2
66 | ;;
67 | esac
68 | done
69 |
70 | # -- Basic settings
71 | if [ ! -d "${currDir}/FineMapping/input" ]
72 | then
73 | mkdir -p ${currDir}/FineMapping/input
74 | fi
75 |
76 | if [ ! -d "${currDir}/FineMapping/output" ]
77 | then
78 | mkdir -p ${currDir}/FineMapping/output
79 | fi
80 |
81 | if [ ! -d "${currDir}/Matrix_eQTL" ]
82 | then
83 | echo "No Matrix-eQTL output found!"
84 | exit
85 | fi
86 |
87 | # -- Main function --
88 | function main(){
89 | echo "Running $script_name with the following parameters:"
90 | echo "*************************************************"
91 | echo "-g: $genotype"
92 | echo "-p: $phenotype"
93 | echo "-a: $qtl_res"
94 | echo "-u: $utr_loc"
95 | echo "-w: $window"
96 | echo "-q: $FDR"
97 | echo "*************************************************"
98 | date
99 | echo "Prepare input for susieR"
100 | prepare_input $genotype $phenotype $qtl_res $utr_loc $window $FDR
101 | echo "Done!"
102 | date
103 | }
104 |
105 |
106 | # -- Other functions --
107 | function prepare_input(){
108 | geno=$1
109 | pheno=$2
110 | aQTL=$3
111 | utrLoc=$4
112 | w=$5
113 | fdr=$6
114 | if [ ! -f "$utrLoc" ]
115 | then
116 | echo "File ${utrLoc} not found!"
117 | exit
118 | fi
119 |
120 | if [ ! -f "${geno}" ]
121 | then
122 | echo "File ${geno} not found!"
123 | exit
124 | fi
125 |
126 | echo "Prepare unique aGenes set..."
127 | python ${sourceDir}/prepare_susieR_uniqGene_location.py --utr_loc_file ${utrLoc} \
128 | --aQTL_map ${aQTL} \
129 | --extend_size $w \
130 | --Max_FDR $fdr \
131 | --outdir ${currDir}/FineMapping/input \
132 | --output picked_asso_list.loc_${w}.txt &
133 | wait
134 |
135 | echo "Prepare SNP files in bed format..."
136 | python ${sourceDir}/genotype_2_bed.py --genotype ${geno} \
137 | --out_bed ${currDir}/FineMapping/input/Genotype_matrix.bed \
138 | --out_header ${currDir}/FineMapping/input/Header.txt &
139 | wait
140 |
141 | if [ -f "${currDir}/FineMapping/input/Genotype_matrix.bed" ]
142 | then
143 | sort -k1,1 -k2,2n ${currDir}/FineMapping/input/Genotype_matrix.bed > tmp.bed &
144 | wait
145 | mv tmp.bed ${currDir}/FineMapping/input/Genotype_matrix.bed &
146 | wait
147 | else
148 | echo "File ${currDir}/FineMapping/input/Genotype_matrix.bed not found!"
149 | exit
150 | fi
151 | # create a unique workspace for each gene in "picked_asso_list.loc_${w}.txt", and generate an "expr.phen" for each gene
152 | if [ ! -f "${pheno}" ]
153 | then
154 | echo "File ${pheno} not found!"
155 | exit
156 | fi
157 | echo "Make a directory and generate an expr.phen for each gene"
158 | # expr.phen is written in plink phenotype style (FID IID value), the layout finemapping.R expects
159 | for gene in `cat ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt| cut -f1`
160 | do
161 | mkdir -p ${currDir}/FineMapping/output/$gene
162 | cat ${pheno} | awk -v aGENE=$gene -F"\t" 'BEGIN{OFS="\t"} {if(NR==1){for(i=2;i<=NF;i++) id[i]=$i} else if($1==aGENE){for(i=2;i<=NF;i++) print id[i],id[i],$i}}' > ${currDir}/FineMapping/output/$gene/expr.phen &
163 | wait
164 | done
165 |
166 | echo "Select SNPs within a window of ${w}bp around the gene and generate 3aQTL.vcf in the gene's directory"
167 | while read line
168 | do
169 | gene=`echo $line|awk '{print $1}'`
170 | loc=`echo $line|awk '{print $2}'`
171 | cd ${currDir}/FineMapping/output/$gene
172 | CHR=${loc%:*}
173 | COORD=${loc#*:}
174 | S=${COORD%-*}
175 | E=${COORD#*-}
176 | echo -e "$CHR\t$S\t$E" > gene_loc.bed
177 | cat ${currDir}/FineMapping/input/Header.txt > 3aQTL.vcf
178 | bedtools intersect -a ${currDir}/FineMapping/input/Genotype_matrix.bed -b gene_loc.bed -wa |cut -f4- >> 3aQTL.vcf &
179 | wait
180 | rm gene_loc.bed
181 | done < ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt
182 |
183 | cd ${currDir}
184 | }
185 |
186 | # - main
187 | main
--------------------------------------------------------------------------------
/src/DaPars_Extract_Anno.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import os
4 | import sys, getopt
5 | import os.path
6 |
7 |
8 | def Annotation_prepar_3UTR_extraction(gene_bed_file, gene_symbol_map_kfXref_file, output_utr_file):
9 |
10 | output_write = open(output_utr_file,'w')
11 |
12 | refseq_trapt_gene_symbol_dict = {}
13 | num_line = 0
14 | for line in open(gene_symbol_map_kfXref_file, 'r'):
15 | if num_line > 0:
16 | fields = line.strip('\n').strip('\r').split('\t')
17 | gene_symbol = fields[1]
18 | refseq_transcript_id = fields[0]
19 | refseq_trapt_gene_symbol_dict[refseq_transcript_id] = gene_symbol
20 | else:
21 | num_line += 1
22 |
23 | scanned_3UTR_list = []
24 | num_saved = 0
25 | for line in open(gene_bed_file,'r'):
26 | fields = line.strip('\n').split('\t')
27 | refseq_id = fields[3]
28 | if '_' not in fields[0]:
29 |
30 | if refseq_id not in refseq_trapt_gene_symbol_dict:
31 | gene_symbol = "NA"
32 | else:
33 | gene_symbol = refseq_trapt_gene_symbol_dict[refseq_id]
34 |
35 | UTR_id = [refseq_id, gene_symbol,fields[0], fields[5]]
36 | UTR_id_new = '|'.join(UTR_id)
37 | curr_strand = fields[5]
38 | if curr_strand == "+":
39 | UTR_end = fields[2]
40 | gene_start = int(fields[1])
41 | UTR_start = str(gene_start + int(fields[-1].strip(',').split(',')[-1])+1) #1-based
42 | elif curr_strand == "-":
43 | gene_start = int(fields[1])
44 | UTR_start = str(gene_start + 1) #1-based
45 | UTR_end = str(gene_start + int(fields[10].split(',')[0])) #1-based, included
46 |
47 | this_UTR = fields[0]+UTR_start+UTR_end+curr_strand
48 | if this_UTR not in scanned_3UTR_list:
49 | write_line = [fields[0], UTR_start, UTR_end,UTR_id_new, '0', curr_strand]
50 | output_write.writelines('\t'.join(write_line) + '\n')
51 | scanned_3UTR_list.append(this_UTR)
52 | num_saved += 1
53 |
54 |
55 | output_write.close()
56 | print("Total extracted 3' UTRs: " + str(num_saved))
57 |
58 |
59 |
60 | def Subtract_different_strand_overlap(input_gene_bed_file,output_utr_file):
61 | def UTRs_subtract_refine(UTRs_all):
62 | strand_info = UTRs_all[0].strip('\n').split('\t')[-1]
63 | if strand_info == '+':
64 | all_pos = []
65 | for curr_line in UTRs_all:
66 | left_pos = curr_line.strip('\n').split('\t')[1]
67 | all_pos.append(int(left_pos))
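# for '+'-strand transcripts, keep the UTR with the smallest left coordinate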
68 | selected_UTR_index = all_pos.index(min(all_pos))
69 | selected_UTR = UTRs_all[selected_UTR_index]
70 | else:
71 | all_pos = []
72 | for curr_line in UTRs_all:
73 | # for '-'-strand transcripts, keep the UTR with the largest right coordinate
74 | right_pos = curr_line.strip('\n').split('\t')[2]
75 | all_pos.append(int(right_pos))
76 | selected_UTR_index = all_pos.index(max(all_pos))
77 | selected_UTR = UTRs_all[selected_UTR_index]
78 | return selected_UTR
79 | temp_file = "overlap_opposite_strand_subtract.bed"
80 | cmd = 'subtractBed -a %s -b %s -S > %s' % (input_gene_bed_file, input_gene_bed_file, temp_file)
81 | os.system(cmd)
82 |
83 | read_subtract_result_dict = {}
84 | for line in open(temp_file,'r'):
85 | transcript_id = line.split('\t')[3].split('|')[0]
86 | if transcript_id not in read_subtract_result_dict:
87 | read_subtract_result_dict[transcript_id] = []
88 | read_subtract_result_dict[transcript_id].append(line)
89 |
90 | output_utr_write = open(output_utr_file,'w')
91 | for curr_trans_id in read_subtract_result_dict:
92 | curr_3UTRs = read_subtract_result_dict[curr_trans_id]
93 | num_3UTRs = len(curr_3UTRs)
94 | if num_3UTRs == 1:
95 | output_utr_write.writelines(curr_3UTRs[0])
96 | else:
97 | selected_UTR = UTRs_subtract_refine(curr_3UTRs)
98 | output_utr_write.writelines(selected_UTR)
99 | output_utr_write.close()
100 |
101 | try:
102 | os.remove(temp_file)
103 | except OSError:
104 | pass
105 |
106 | def Extract_Anno_main(argv):
107 | gene_bed_file = ''
108 | gene_symbol_annotation_file = ''
109 | output_extract_file = 'temp_anno_extracted.bed'
110 | output_final_extract_file = ''
111 |
112 | try:
113 | opts, args = getopt.getopt(argv,"hb:s:o:",["bed=","symbol=","ofile="])
114 | except getopt.GetoptError:
115 | print('python DaPars_Extract_Anno.py -b <gene_bed_file> -s <gene_symbol_file> -o <output_file>')
116 | sys.exit(2)
117 | for opt, arg in opts:
118 | if opt == '-h':
119 | print('python DaPars_Extract_Anno.py -b <gene_bed_file> -s <gene_symbol_file> -o <output_file>')
120 | sys.exit()
121 | elif opt in ("-b", "--bed"):
122 | gene_bed_file = arg
123 | elif opt in ("-s", "--symbol"):
124 | gene_symbol_annotation_file = arg
125 | elif opt in ("-o", "--ofile"):
126 | output_final_extract_file = arg
127 |
128 | if gene_bed_file=='':
129 | print("Error: No gene bed file!", file=sys.stderr)
130 | exit(1)
131 | if gene_symbol_annotation_file=='':
132 | print("Error: No gene symbol file!", file=sys.stderr)
133 | exit(1)
134 |
135 | if output_final_extract_file=='':
136 | print("Error: No output file!", file=sys.stderr)
137 | exit(1)
138 |
139 | print("Generating regions ...")
140 | Annotation_prepar_3UTR_extraction(gene_bed_file, gene_symbol_annotation_file,output_extract_file)
141 | Subtract_different_strand_overlap(output_extract_file,output_final_extract_file)
142 |
143 | try:
144 | os.remove(output_extract_file)
145 | except OSError:
146 | pass
147 |
148 |
149 | print("Finished")
150 |
151 | if __name__ == '__main__':
152 | Extract_Anno_main(sys.argv[1:])
153 |
--------------------------------------------------------------------------------
/src/curate_pheno_geno_covariates.R:
--------------------------------------------------------------------------------
1 | #!/opt/app/languages/R-3.6.3/bin/Rscript
2 | # 2022-01-08
3 | library(optparse)
4 |
5 | # -- global variable
6 | option_list <- list(
7 | make_option(c("-p", "--pheno_data"),type = "character", default = "Dapars2_res.all_chromosomes.txt", action = "store", help = "Provide the merged output from DaPars2, Dapars2_res.all_chromosomes.txt by default"),
8 | make_option(c("-g","--geno_pca"),type = "character", default = "./Matrix_eQTL/genotype_pca.eigenvec", action = "store", help = "the eigenvectors of the genotype PCA, ./Matrix_eQTL/genotype_pca.eigenvec by default"),
9 | make_option(c("-c","--known_covs"),type = "character", default = "NA", action = "store", help = "input a text file containing known covariates if available, NA by default"),
10 | make_option(c("-n","--top_N_pca"),type = "integer", default = "5", action = "store", help = "specify how many of the top genotype PCs to use, default value = 5")
11 | )
12 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
13 | apa_res_file <- opt$pheno_data
14 | gtPCA_file <- opt$geno_pca
15 | known_cov_file <- opt$known_covs
16 | topN_pca <- opt$top_N_pca
17 |
18 | cat('Arguments:','\n',
19 | '--pheno_data',apa_res_file,'\n',
20 | '--geno_pca',gtPCA_file,'\n',
21 | '--known_covs',known_cov_file,'\n',
22 | '--top_N_pca',topN_pca,'\n')
23 |
24 | cat('Current directory:')
25 | getwd()
26 |
27 | library(dplyr)
28 | library(peer)
29 | library(impute)
30 | # --------------- prepare covariates -----------------
31 | # load genotype pca
32 | gt_pca <- read.table(gtPCA_file,header=F,sep=" ",stringsAsFactors=F)
33 | N <- as.integer(topN_pca) + 1
34 | gt_pca$V1 <- NULL;gt_pca <- gt_pca[,1:N]
35 | names(gt_pca) <- c("subject_id",paste0("PC_",1:(N-1)))
36 | rm(N)
37 | # add known covariates to the top N genotype PCs if available
38 | if(known_cov_file!="NA"){
39 | known_cov <- read.table(known_cov_file,header=T,sep="\t",stringsAsFactors=F)
40 | dim(known_cov)
41 | N <- dim(known_cov)[2]
42 | for(i in 2:N){
43 | if(class(known_cov[,i])=="character"){
44 | known_cov[,i] <- as.factor(known_cov[,i])
45 | known_cov[,i] <- as.numeric(known_cov[,i])
46 | }
47 | }
48 | col_names <- names(known_cov)[2:N]
49 | names(known_cov) <- c("subject_id",col_names)
50 | gt_pca <- merge(gt_pca,known_cov,by="subject_id")
51 | }
52 |
53 |
54 | # convert the covariates data.frame to a matrix
55 | rownames(gt_pca) <- gt_pca$subject_id;gt_pca <- as.matrix(gt_pca[,-1])
56 | cat('Dimension of gt_pca:',dim(gt_pca),'\n')
57 | rm(known_cov,known_cov_file)
58 |
59 | cat("Start phenotype matrix\n","Open APA results file:",apa_res_file,"\n")
60 | # --------------- prepare phenotype matrix ------------------
61 | pdui_mat <- read.table(apa_res_file, stringsAsFactors=FALSE, header=TRUE,sep="\t",check.names=FALSE)
62 | pdui_mat <- pdui_mat[,-c(2,3,4)]
63 |
64 | pdui_mat.sel <- pdui_mat %>% dplyr::select(all_of(rownames(gt_pca)))
65 | pdui_mat.sel <- as.matrix(pdui_mat.sel)
66 | rownames(pdui_mat.sel) <- pdui_mat[,1]
67 |
68 |
69 | # remove genes with more than 50% of entries missing and individuals with more than 80% missing data
70 | pdui_mat.sel <- pdui_mat.sel[, colMeans(is.na(pdui_mat.sel)) <= 0.8];pdui_mat.sel <- pdui_mat.sel[rowMeans(is.na(pdui_mat.sel)) < 0.5,]
71 | class(pdui_mat.sel) <- 'numeric'
72 |
73 |
74 | # run peer to estimate confounders
75 | cat("Start covariate analysis by peer...")
76 | #save.image(file="run_peer_impute.RData")
77 | model <- PEER()
78 | covs_se <- gt_pca
79 |
80 | PEER_setCovariates(model, covs_se)
81 | dim(PEER_getCovariates(model))
82 | # impute missing values in the PDUI matrix
83 | mat.ds <- pdui_mat.sel
84 | mat_impute <- impute.knn(mat.ds)
85 | # quantile normalization
86 | df_w <- as.data.frame(mat_impute$data)
87 | for(gene in 1:nrow(df_w)){
88 | mat = df_w[gene,]
89 | mat = apply(mat,1,rank,ties.method = "average")
90 | mat = qnorm(mat / (ncol(df_w)+1))
91 | df_w[gene,] = mat
92 | }
93 |
94 | pdui_mat <- cbind(rownames(mat_impute$data),df_w)
95 | y <- colnames(pdui_mat)[-1]
96 | colnames(pdui_mat) <- c("Gene",y)
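# pdui_mat now holds the imputed, quantile-normalised PDUI matrix that is
# written out below as the Matrix-eQTL phenotype input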
97 | id_order <- colnames(pdui_mat)[-1]
98 | cat("Output phenotype matrix:\n")
99 | write.table(pdui_mat,file="./Matrix_eQTL/Phenotype_matrix.txt",row.names=F,col.names=T,quote=F,sep="\t")
100 | rm(y)
101 |
102 | PEER_setPhenoMean(model, t(as.matrix(mat_impute$data)))
103 |
104 | dim(PEER_getPhenoMean(model))
105 |
106 | # set the number of peer factors
107 | ## N < 150: use 15 PEER factors; 150 <= N < 250: use 30; N >= 250: use 35
108 | if (ncol(mat.ds) < 150) {
109 | numcov <- 15
110 | } else if (ncol(mat.ds) < 250) {
111 | numcov <- 30
112 | } else if (ncol(mat.ds) >= 250) {
113 | numcov <- 35
114 | }
115 |
116 | PEER_setNk(model, numcov)
117 | PEER_getNk(model)
118 |
119 | PEER_update(model)
120 |
121 | # diag
122 | pdf('peer.diag.pdf', width=6, height=8)
123 | PEER_plotModel(model)
124 | dev.off()
125 |
126 |
127 | factors = t(PEER_getX(model))
128 | weights = PEER_getW(model)
129 | precision = PEER_getAlpha(model)
130 |
131 | residuals = t(PEER_getResiduals(model))
132 | rownames(residuals) <- rownames(mat.ds)
133 | colnames(residuals) <- colnames(mat.ds)
134 |
135 | rownames(factors) <- c(colnames(gt_pca), paste0("PEER_",1:numcov))
136 | colnames(factors) <- colnames(mat.ds)
137 |
138 | residuals.ds <- residuals
139 |
140 | #png(paste0(loop.pop[i], '.expr.peer.clust.png'), width=8, height=8, res=150, units='in')
141 | #heatmap.2(as.matrix(residuals.ds), distfun=function(x) dist(x,method='euclidian'), hclustfun=function(x) hclust(x,method='ward.D2'),
142 | # trace='none', dendrogram='both', Rowv=TRUE, Colv=TRUE, breaks=pairs.breaks, col=colorRampPalette(myCols), scale='none', symkey=T, na.color='grey', density.info='histogram', cexRow=0.2, cexCol=0.5, main=paste0(TISSUE, '\nexpr clustering'))
143 | #dev.off()
144 |
145 | gz1 <- "pdui.peer.residuals.txt"
146 | write.table(cbind(rownames(residuals), residuals), file=gz1, row.names=FALSE, col.names=c("id",colnames(residuals)), quote=FALSE, sep='\t')
147 | rm(model,mat.ds,mat_impute,weights,precision,residuals,gz1)
148 |
149 | # --------------------- prepare genotype matrix
150 | cat("Load genotype_matrix.bed:\n")
151 | gt_mat <- read.table("./Matrix_eQTL/genotype_matrix.bed",header=T,sep="\t",check.names=FALSE)
152 | dim(gt_mat)
153 | gt_mat.reorder <- gt_mat %>% dplyr::select("id",all_of(id_order))
154 |
155 | cat("Output genotype:\n")
156 | write.table(gt_mat.reorder,file="./Matrix_eQTL/Genotype_matrix.txt",quote=F,sep="\t",row.names=F,col.names=T)
157 |
158 | rm(gt_mat)
159 |
160 | # -------------------- prepare covariates matrix
161 | factors.df <- cbind(rownames(factors),factors)
162 | colnames(factors.df) <- c("id",colnames(factors))
163 | factors.df <- as.data.frame(factors.df)
164 | factors.reorder <- factors.df %>% dplyr::select("id",all_of(id_order))
165 | covariate_file <- "./Matrix_eQTL/Covariate_matrix.txt"
166 | write.table(factors.reorder, file=covariate_file, row.names=FALSE,quote=FALSE, sep='\t',col.names=T)
167 | rm(factors,factors.df,id_order)
--------------------------------------------------------------------------------
/src/prepare_inputs_for_apa_quant.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script takes bam files, a gene annotation (bed), and an ID mapping file (between RefSeq IDs and gene symbols),
3 | # and generates bedgraph files, read depths, and the 3'UTR reference region
4 | # Both the gene annotation file (in bed format) and the RefSeq-to-gene-symbol ID mapping are required to execute this script
5 | # @Xudong Zou, zouxd@szbl.ac.cn
6 | # 2022-04-21 7 | 8 | # -- Usage function 9 | script_name=$0 10 | function usage(){ 11 | echo "#==============================" 12 | echo "Default usage:" 13 | echo "#==============================" 14 | echo "bash $script_name -s <sample_list> -g <gene_annotation.bed> -r <refseq2symbol.txt> -t <threads> -c <coverage_cutoff> -o <config_name>" 15 | echo "Options:" 16 | echo " -s text file,input a text file containing all samples (column 1) and corresponding bam files (column 2)" 17 | echo " -g text file,provide a RefSeq gene annotation file extracted from UCSC" 18 | echo " -r text file,provide a file listing the ID mapping between RefSeq transcripts and gene names" 19 | echo " -t integer,specify the number of threads used to run Dapars2 in parallel,default=8" 20 | echo " -c integer,define the threshold of read coverage at the alternative APA site,default=15" 21 | echo " -o file name,specify a name for the configure file which will be used by DaPars2" 22 | echo " -h print the help information" 23 | exit 1 24 | } 25 | 26 | # define global variables from command parameters 27 | currDir=`pwd` 28 | BAMlist="sample_list.txt" 29 | GeneAnno="" 30 | RefIDmap="" 31 | P=8 32 | Cutoff_Cov=15 33 | Config="Dapars2_running_configure.txt" 34 | 35 | while getopts :s:g:r:t:c:o:h opt 36 | do 37 | case $opt in 38 | s) 39 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found!"; exit;fi 40 | BAMlist="$OPTARG" 41 | ;; 42 | g) 43 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found!";exit;fi 44 | GeneAnno="$OPTARG" 45 | ;; 46 | r) 47 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found!";exit;fi 48 | RefIDmap="$OPTARG" 49 | ;; 50 | t) 51 | P="$OPTARG" 52 | ;; 53 | c) 54 | Cutoff_Cov="$OPTARG" 55 | ;; 56 | o) 57 | Config="$OPTARG" 58 | ;; 59 | h) 60 | echo "Help message:" 61 | usage 62 | ;; 63 | :) 64 | echo "The option -$OPTARG requires an argument." 65 | exit 1 66 | ;; 67 | ?) 68 | echo "Invalid option: $OPTARG" 69 | usage 70 | exit 2 71 | ;; 72 | esac 73 | done 74 | 75 | 76 | # -- Main function -- 77 | function main(){ 78 | echo "Running $script_name with the following parameters:" 79 | echo "*************************************************" 80 | echo "-s: $BAMlist" 81 | echo "-g: $GeneAnno" 82 | echo "-r: $RefIDmap" 83 | echo "-t: $P" 84 | echo "-c: $Cutoff_Cov" 85 | echo "-o: $Config" 86 | echo "*************************************************" 87 | echo "Start preparing inputs for Dapars2 ..." 88 | date 89 | echo "Convert bam to bedgraph format ..." 90 | run_bam2bedgraph $BAMlist $P 91 | echo "Counting total aligned reads by samtools flagstat ..." 92 | run_samtools_flagstat $BAMlist $P 93 | echo "Generating the 3' UTR reference ..." 94 | generate_3utr_reference $GeneAnno $RefIDmap 95 | echo "Generating the wigFile_and_readDepth.txt ..." 96 | generate_wigFileList_with_readDepth $BAMlist 97 | echo "Generating a configure file for Dapars2 ..." 98 | generate_configure_for_dapars2 $Cutoff_Cov $P 99 | wait 100 | echo "Done!" 101 | date 102 | } 103 | 104 | 105 | # -- Other functions -- 106 | function generate_configure_for_dapars2(){ 107 | N_cov=$1 108 | N_threads=$2 109 | python ./src/generate_configure_for_Dapars2.py --annotation_3utr ${currDir}/refseq_3utr_annotation.bed \ 110 | --wigFile_depth ${currDir}/wigFile_and_readDepth.txt \ 111 | --coverage_threshold $N_cov \ 112 | --threads $N_threads \ 113 | --out_config_name ${currDir}/${Config} & 114 | wait 115 | echo "Generated file ${Config} in ${currDir}" 116 | } 117 | function generate_wigFileList_with_readDepth(){ 118 | bamList=$1
119 | if [ ! -f "$bamList" ] 120 | then 121 | echo "File $bamList not found!" 122 | exit 123 | fi 124 | python ./src/extract_read_depth.py --sample_list $bamList --path_wig ${currDir}/wig --output ${currDir}/wigFile_and_readDepth.txt & 125 | wait 126 | echo "Generated file wigFile_and_readDepth.txt" 127 | 128 | } 129 | function generate_3utr_reference(){ 130 | gene_anno=$1 131 | refID2Symbol=$2 132 | python ./src/DaPars_Extract_Anno.py -b ${gene_anno} -s $refID2Symbol -o ${currDir}/refseq_3utr_annotation.bed & 133 | wait 134 | echo "Generated refseq_3utr_annotation.bed" 135 | } 136 | 137 | function run_bam2bedgraph(){ 138 | bamList=$1 139 | N_jobs=$2 140 | if [ ! -d "${currDir}/tmp" ] 141 | then 142 | mkdir -p ${currDir}/tmp 143 | fi 144 | 145 | if [ ! -d "${currDir}/wig" ] 146 | then 147 | mkdir -p ${currDir}/wig 148 | fi 149 | 150 | if [ ! -f "$bamList" ] 151 | then 152 | echo "File $bamList not found!" 153 | exit 154 | fi 155 | 156 | N_samples=`cat $bamList|wc -l` 157 | echo "$N_samples bam files waiting for processing." 158 | if [ $N_jobs -lt $N_samples ] 159 | then 160 | split -l $N_jobs $bamList -d ${currDir}/tmp/task_ & 161 | wait 162 | else 163 | echo -e "Number of parallel jobs exceeds the total number of tasks.\n$N_samples threads will be used!" 164 | cat $bamList > $currDir/tmp/task_00 & 165 | wait 166 | fi 167 | 168 | i_task=1 169 | for task in `ls ${currDir}/tmp/task_*` 170 | do 171 | echo "Start subtask ${i_task}..." 172 | while read line 173 | do 174 | sample=`echo $line | awk '{print $1}'` 175 | bam=`echo $line | awk '{print $2}'` 176 | echo $sample 177 | bedtools genomecov -ibam ${bam} -bga -split -trackline > ${currDir}/wig/${sample}.wig & 178 | done < $task 179 | wait 180 | echo "Subtask $i_task finished!" 181 | (( i_task += 1 )) 182 | done & 183 | wait 184 | 185 | echo "$N_samples bam files processed" 186 | rm ${currDir}/tmp/* 187 | rmdir ${currDir}/tmp 188 | date 189 | } 190 | 191 | function run_samtools_flagstat(){ 192 | bamList=$1 193 | N_jobs=$2 194 | if [ ! -d "${currDir}/tmp" ] 195 | then 196 | mkdir -p ${currDir}/tmp 197 | fi 198 | 199 | if [ ! -f "$bamList" ] 200 | then 201 | echo "File $bamList not found!" 202 | exit 203 | fi 204 | N_samples=`cat $bamList|wc -l` 205 | echo "$N_samples bam files waiting for processing." 206 | if [ $N_jobs -lt $N_samples ] 207 | then 208 | split -l $N_jobs $bamList -d ${currDir}/tmp/task_ & 209 | wait 210 | else 211 | echo -e "Number of parallel jobs exceeds the total number of tasks.\n$N_samples threads will be used!" 212 | cat $bamList > ${currDir}/tmp/task_00 & 213 | wait 214 | fi 215 | 216 | i_task=1 217 | for task in `ls ${currDir}/tmp/task_*` 218 | do 219 | echo "Start subtask ${i_task}..." 220 | while read line 221 | do 222 | sample=`echo $line | awk '{print $1}'` 223 | bam=`echo $line | awk '{print $2}'` 224 | echo $sample 225 | samtools flagstat -@ 2 ${bam} > ${currDir}/tmp/${sample}.flagstat & 226 | done < $task 227 | wait 228 | echo "Subtask $i_task finished!"
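# (added note) each task_* file produced by split holds up to N_jobs samples; the background
# flagstat jobs of a batch are collected by the 'wait' above, so at most N_jobs samples run in parallel.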
229 | (( i_task += 1 )) 230 | done 231 | } 232 | 233 | 234 | # -- run main 235 | main 236 | -------------------------------------------------------------------------------- /src/prepare_inputs_for_3aQTL_mapping.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -- usage function 4 | script_name=$0 5 | function usage(){ 6 | echo "#==============================" 7 | echo "Default usage:" 8 | echo "bash $script_name -g -p -c -s -m <0.05> -n <5>" 9 | echo "Options:" 10 | echo " -g text file,input a text file contains VCF file(s), default=vcf_list.txt" 11 | echo " -p text file,input the merged DaPars2 results, default=Dapars2_res.all_chromosomes.txt" 12 | echo " -c text file,input the known covariates like age and gender, default=NA" 13 | echo " -s text file,input a text file contains the list of samples, default=sample_list.txt" 14 | echo " -m float,minor allele frequency threshold for selecting common genetic variants, default=0.05" 15 | echo " -n integer,the top N genotype PCA components to be used as covariates, default=5" 16 | echo " -h print the help information" 17 | exit 1 18 | 19 | } 20 | # define global variables from command parameters 21 | currDir=`pwd` 22 | VCFLIST="vcf_list.txt" 23 | APA_RES="Dapars2_res.all_chromosomes.txt" 24 | KNOWN_COV="NA" 25 | SAMPLES="sample_list.txt" 26 | MAF="0.05" 27 | TOP_N="5" 28 | 29 | while getopts g:p:c:s:m:n:h opt 30 | do 31 | case $opt in 32 | g) 33 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 34 | VCFLIST="$OPTARG" 35 | ;; 36 | p) 37 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 38 | APA_RES="$OPTARG" 39 | ;; 40 | c) 41 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 42 | KNOWN_COV="$OPTARG" 43 | ;; 44 | s) 45 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 46 | SAMPLES="$OPTARG" 47 | ;; 48 | m) 49 | MAF="$OPTARG" 50 | ;; 51 | n) 52 | TOP_N="$OPTARG" 53 | ;; 54 | h) 55 | echo "Help message:" 56 | usage 57 | ;; 58 | :) 59 | echo "The option -$OPTARG requires an argument." 60 | exit 1 61 | ;; 62 | ?) 63 | echo "Invalid option: $OPTARG" 64 | usage 65 | exit 2 66 | ;; 67 | esac 68 | 69 | done 70 | 71 | # -- Basic settings 72 | if [ ! -d "${currDir}/tmp" ] 73 | then 74 | mkdir -p ${currDir}/tmp 75 | echo "Create a directory called tmp/" 76 | fi 77 | 78 | if [ ! -d "${currDir}/Matrix_eQTL" ] 79 | then 80 | mkdir -p ${currDir}/Matrix_eQTL 81 | echo "Create a directory called Matrix_eQTL/" 82 | fi 83 | 84 | # -- Main function -- 85 | function main(){ 86 | date 87 | echo "Start..." 88 | echo "Convert VCF file into 012 format..." 89 | generate_gt_matrix $VCFLIST $MAF $SAMPLES 90 | echo "PCA analysis on genotype by PLINK1.9 ..." 91 | PCA_on_genotype $VCFLIST $SAMPLES $MAF 92 | echo "Curate phenotype matrix, genotype matrix, and covariate matrix for Matrix-eQTL" 93 | curate_pheno_geno_covariates $APA_RES $KNOWN_COV $TOP_N 94 | echo "extracting 3UTR location and SNP location:" 95 | snp_and_3utr_location $APA_RES 96 | echo "Done!" 97 | date 98 | } 99 | 100 | function snp_and_3utr_location(){ 101 | dapars2_res=$1 102 | python ./src/extract_SNP_location.py --genotype_bed ./Matrix_eQTL/genotype_matrix.bed --output ${currDir}/Matrix_eQTL/snp_location.txt & 103 | wait 104 | python ./src/extract_3UTR_location.py --dapars_res $dapars2_res --output ${currDir}/Matrix_eQTL/3UTR_location.txt & 105 | wait 106 | echo "Two location files: snp_location.txt and 3UTR_location.txt are generated!" 
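# (added note, assumption) snp_location.txt and 3UTR_location.txt are the variant and gene
# position tables, presumably consumed as the snpspos/genepos inputs for cis-QTL mapping
# with Matrix eQTL (see run_3aQTL_mapping.R).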
107 | } 108 | function curate_pheno_geno_covariates(){ 109 | dapars2_res=$1 110 | known_cov=$2 111 | topN=$3 112 | Rscript ./src/curate_pheno_geno_covariates.R -p $dapars2_res -c $known_cov -n $topN 113 | } 114 | # PCA analysis on genotype by PLINK 1.9 115 | function PCA_on_genotype(){ 116 | vcfList=$1 117 | keep_inds=$2 118 | maf=$3 119 | 120 | if [ ! -f "$vcfList" ] 121 | then 122 | echo "$vcfList does not exist!" 123 | exit 124 | fi 125 | 126 | if [ ! -f "$keep_inds" ] 127 | then 128 | echo "File $keep_inds not found!" 129 | exit 130 | fi 131 | 132 | 133 | for vcf in `cat $vcfList` 134 | do 135 | if [ ! -f "$vcf" ] 136 | then 137 | echo "$vcf does not exist!" 138 | exit 139 | else 140 | filename=`basename $vcf .gz` 141 | vcf_body=${filename%.*} 142 | 143 | plink --vcf $vcf --const-fid --out ${currDir}/tmp/${vcf_body}.plink & 144 | wait 145 | 146 | echo "${vcf_body}.plink.bed ${vcf_body}.plink.bim ${vcf_body}.plink.fam" >> ${currDir}/tmp/merge_list.txt 147 | fi 148 | done 149 | sleep 10 150 | 151 | cd ${currDir}/tmp 152 | N=`cat merge_list.txt|wc -l` 153 | 154 | if [ $N -gt 1 ] 155 | then 156 | tmp=`cat merge_list.txt|head -n 1|awk '{print $1}'` 157 | firstVCF=${tmp%.*} 158 | cat merge_list.txt|tail -n+2 > tmp.txt 159 | cat tmp.txt > merge_list.txt 160 | rm tmp.txt 161 | 162 | echo "Merging genotypes of multiple chromosomes into one file: merged_plink.* ..." 163 | plink --bfile $firstVCF --merge-list merge_list.txt --out merged_plink --allow-extra-chr & 164 | wait 165 | 166 | else 167 | cd $currDir 168 | vcf=`cat $vcfList` 169 | filename=`basename $vcf .gz` 170 | vcf_body=${filename%.*} 171 | mv ${currDir}/tmp/${vcf_body}.plink.bed ${currDir}/tmp/merged_plink.bed 172 | mv ${currDir}/tmp/${vcf_body}.plink.bim ${currDir}/tmp/merged_plink.bim 173 | mv ${currDir}/tmp/${vcf_body}.plink.fam ${currDir}/tmp/merged_plink.fam 174 | 175 | fi 176 | 177 | # extract selected samples in plink file 178 | cd ${currDir} 179 | cat $keep_inds | awk '{print "0",$1}' > ${currDir}/tmp/keep.list 180 | cd ${currDir}/tmp 181 | plink --bfile merged_plink --keep keep.list --geno 0.02 --hwe 0.000001 --maf $maf --make-bed --out merged_plink_QC & 182 | wait 183 | sleep 20 184 | 185 | # pca analysis 186 | plink --bfile merged_plink_QC --indep-pairwise 50 5 0.2 --out merged_plink_QC & 187 | wait 188 | plink --bfile merged_plink_QC --extract merged_plink_QC.prune.in --pca 30 --out genotype_pca & 189 | wait 190 | 191 | # move PCA results to the Matrix_eQTL inputs for further analysis 192 | if [ -f "genotype_pca.eigenvec" ] 193 | then 194 | cp genotype_pca.eigenvec ${currDir}/Matrix_eQTL 195 | echo "Copied genotype_pca.eigenvec to ${currDir}/Matrix_eQTL" 196 | 197 | cd $currDir 198 | else 199 | echo "genotype_pca.eigenvec not found!" 200 | exit 201 | fi 202 | 203 | } 204 | 205 | # recode genotype into 012 format from VCF 206 | function generate_gt_matrix(){ 207 | vcfList=$1 208 | maf=$2 209 | keep_inds=$3 210 | 211 | cat $keep_inds |cut -f1 > ${currDir}/tmp/keep_inds.txt 212 | 213 | for vcf in `cat $vcfList` 214 | do 215 | if [ ! -f "$vcf" ] 216 | then 217 | echo "$vcf does not exist!" 218 | exit 219 | else 220 | last_suffix=${vcf##*.} 221 | if [ $last_suffix = "gz" ] 222 | then 223 | filename=`basename $vcf` 224 | tmp=${filename%.*} 225 | vcf_body=${tmp%.*} 226 | echo "Extract genotype from $vcf ..." 227 | vcftools --gzvcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --extract-FORMAT-info GT & 228 | 229 | echo "Extract allele frequency from $vcf ..."
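# (added note) the .frq output is later passed to src/recode_with_012.py together with the
# GT matrix, presumably so alleles can be oriented by frequency when recoding genotypes
# as 0/1/2 alternate-allele counts.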
230 | vcftools --gzvcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --freq & 231 | 232 | wait 233 | echo "$vcf Done!" 234 | elif [ $last_suffix = "vcf" ] 235 | then 236 | filename=`basename $vcf` 237 | vcf_body=${filename%.*} 238 | echo "Extract genotype from $vcf ..." 239 | vcftools --vcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --extract-FORMAT-info GT & 240 | 241 | echo "Extract allele frequence from $vcf ..." 242 | vcftools --vcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --freq & 243 | wait 244 | echo "$vcf Done!" 245 | else 246 | echo "Unrecognized file format." 247 | exit 248 | 249 | fi 250 | 251 | fi 252 | done & 253 | wait 254 | 255 | for frq in `ls ${currDir}/tmp/*.frq` 256 | do 257 | filename=`basename $frq` 258 | sample=${filename%.*} 259 | python ./src/recode_with_012.py --frq $frq --GT ${currDir}/tmp/${sample}.GT.FORMAT --output ${currDir}/tmp/${sample}.GT.bed & 260 | wait 261 | done & 262 | wait 263 | 264 | cat ${currDir}/tmp/*.GT.bed |head -n 1 > ${currDir}/Matrix_eQTL/genotype_matrix.bed & 265 | wait 266 | for bed in `ls ${currDir}/tmp/*.GT.bed` 267 | do 268 | cat $bed |tail -n+2 >> ${currDir}/Matrix_eQTL/genotype_matrix.bed & 269 | wait 270 | done 271 | 272 | echo "Genotype matrix has been generated: ./Matrix_eQTL/genotype_matrix.bed" 273 | rm ${currDir}/tmp/* 274 | 275 | } 276 | 277 | # -- main 278 | main 279 | -------------------------------------------------------------------------------- /src/Dapars2_Multi_Sample.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import datetime 5 | import threading 6 | import scipy as sp 7 | import scipy.stats 8 | from multiprocessing import Pool 9 | from bisect import bisect 10 | 11 | import math 12 | import time 13 | 14 | import multiprocessing 15 | 16 | 17 | def time_now():#return time 18 | curr_time = datetime.datetime.now() 19 | return curr_time.strftime("%c") 20 | 21 | def Convert_wig_into_bp_coverage(extracted_coverage,extracted_3UTR_region,strand_info): 22 | bp_coverage = np.zeros(extracted_3UTR_region[-1] - extracted_3UTR_region[0]) 23 | relative_start = extracted_3UTR_region[0] 24 | for i in range(len(extracted_coverage)): 25 | curr_region_start = extracted_3UTR_region[i] - relative_start 26 | curr_region_end = extracted_3UTR_region[i+1] - relative_start 27 | bp_coverage[curr_region_start:curr_region_end] = extracted_coverage[i] 28 | if strand_info == '-': 29 | bp_coverage = bp_coverage[::-1] 30 | 31 | return bp_coverage 32 | 33 | def parse_cfgfile(cfg_file): 34 | '''Parse configure file 35 | ''' 36 | Aligned_Wig_files='' 37 | output_directory='' 38 | Annotated_3UTR_file='' 39 | Output_result_file='' 40 | Coverage_threshold = 1 41 | Num_threads = 1 42 | sequencing_depth_file = '' 43 | 44 | for line in open(cfg_file, 'r'): 45 | if line[0] == '\n' or line[0] == '#': 46 | comments = line; 47 | else: 48 | line = line.rstrip() 49 | command = line.split('='); 50 | if command[0] == 'Aligned_Wig_files': 51 | Aligned_Wig_files = command[1].split(','); 52 | if command[0] == 'Output_directory': 53 | output_directory = command[1] 54 | if output_directory[-1] != '/': 55 | output_directory += '/' 56 | if command[0] == 'Annotated_3UTR': 57 | Annotated_3UTR_file = command[1] 58 | if command[0] == 
'Output_result_file': 59 | Output_result_file = command[1] 60 | if command[0] == 'sequencing_depth_file': 61 | sequencing_depth_file = command[1] 62 | if command[0] == 'Num_Threads': 63 | Num_threads = int(command[1]) 64 | if command[0] == 'Coverage_threshold': 65 | Coverage_threshold = int(command[1]) 66 | 67 | 68 | if Aligned_Wig_files == '': 69 | print("No aligned BAM file found!", file=sys.stderr) 70 | exit(1) 71 | if output_directory=='': 72 | print("No output directory!", file=sys.stderr) 73 | exit(1) 74 | if Annotated_3UTR_file=='': 75 | print("No annotated 3' UTR file!", file=sys.stderr) 76 | exit(1) 77 | if Output_result_file=='': 78 | print("No result file name!", file=sys.stderr) 79 | exit(1) 80 | if sequencing_depth_file=='': 81 | print("No sequencing depth file!", file=sys.stderr) 82 | exit(1) 83 | 84 | return Aligned_Wig_files, output_directory, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold 85 | 86 | def load_sequencing_depth(depth_file): 87 | seq_depth_list = [] 88 | for line in open(depth_file, 'r'): 89 | fields = line.strip('\n').split('\t') 90 | seq_depth_list.append(int(fields[-1])) 91 | 92 | return np.array(seq_depth_list) 93 | 94 | def De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(argv=None): 95 | '''multiple threads version 96 | ''' 97 | if len(sys.argv) == 1: 98 | print("Please provide the configure file and specify chr name...") 99 | exit(1) 100 | cfg_file = sys.argv[1] 101 | curr_processing_chr = sys.argv[2] 102 | if "chr" not in curr_processing_chr: 103 | curr_processing_chr = "chr" + curr_processing_chr 104 | 105 | print("[%s] Start Analysis ..." % time_now(), file=sys.stderr) 106 | Group1_Tophat_aligned_file, output_directory, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold = parse_cfgfile(cfg_file) 107 | 108 | All_Sample_files = Group1_Tophat_aligned_file[:] 109 | Sample_name = [] 110 | for sample in All_Sample_files: 111 | sample_name = sample.rsplit('.',1)[0] 112 | Sample_name.append(sample_name) 113 | 114 | ##Prepare output directory 115 | output_directory = output_directory.strip('/') + '_' + curr_processing_chr + '/' 116 | d = os.path.dirname(output_directory) 117 | if not os.path.exists(d): 118 | os.makedirs(d) 119 | temp_dir = d + '/tmp/' 120 | if not os.path.exists(temp_dir): 121 | os.makedirs(temp_dir) 122 | 123 | Output_all_prediction_file = output_directory + Output_result_file + '_result_temp.' + curr_processing_chr + '.txt' 124 | Output_result = open(Output_all_prediction_file, 'w') 125 | 126 | num_samples = len(All_Sample_files) 127 | 128 | print("All samples Joint Processing %s ..." % curr_processing_chr, file=sys.stderr) 129 | print("[%s] Loading Coverage ..." % time_now(), file=sys.stderr) 130 | 131 | All_samples_Target_3UTR_coverages, UTR_events_dict = Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Sample_files, Annotated_3UTR_file, Num_threads,curr_processing_chr) 132 | All_samples_sequencing_depths = load_sequencing_depth(sequencing_depth_file) 133 | 134 | print(All_samples_sequencing_depths) 135 | All_sample_coverage_weights = All_samples_sequencing_depths/np.mean(All_samples_sequencing_depths) 136 | 137 | #print All_sample_coverage_weights 138 | print("[%s] Loading Coverage Finished ..." 
% time_now(), file=sys.stderr) 139 | #Write the first line 140 | first_line = ['Gene','fit_value','Predicted_Proximal_APA','Loci'] 141 | for i in range(num_samples): 142 | #curr_long_exp = 'Sample_%s_long_exp' % str(i+1) 143 | #curr_short_exp = 'Sample_%s_short_exp' % str(i+1) 144 | curr_ratio = '%s_PDUI' % str(Sample_name[i]) 145 | #first_line.extend([curr_long_exp,curr_short_exp,curr_ratio]) 146 | first_line.append(curr_ratio) 147 | 148 | Output_result.writelines('\t'.join(first_line) + '\n') 149 | 150 | All_events_ids = list(UTR_events_dict.keys()) 151 | num_threads = Num_threads 152 | Assigned_events_ids_all_threads = Assign_to_different_processor_balance_events(All_events_ids, num_threads) 153 | 154 | num_real_threads = len(Assigned_events_ids_all_threads) 155 | 156 | Output_each_processor_all = [] 157 | for i in range(num_real_threads): 158 | curr_temp_output = temp_dir + 'Each_processor_3UTR_Result_%s.txt' % (str(i+1)) 159 | Output_each_processor_all.append(curr_temp_output) 160 | 161 | processes = [] 162 | for i in range(num_real_threads): 163 | process = multiprocessing.Process(target=Each_Thread_3UTR_estimation_list_version_sample_ids, args=(Assigned_events_ids_all_threads[i], UTR_events_dict, All_sample_coverage_weights, num_samples, Output_each_processor_all[i], All_samples_Target_3UTR_coverages, Coverage_threshold)) 164 | process.start() 165 | processes.append(process) 166 | 167 | for p in processes: 168 | p.join() 169 | 170 | #Combine results 171 | for i in range(num_real_threads): 172 | curr_result = Output_each_processor_all[i] 173 | for line in open(curr_result, 'r'): 174 | Output_result.writelines(line) 175 | Output_result.close() 176 | 177 | #print >> sys.stderr, "[%s] Filtering the Results ..." % time_now() 178 | 179 | #Output_all_filtered_prediction_file = output_directory + Output_result_file + '_results_final.' + curr_processing_chr + '.txt' 180 | #Dapars_Filtering(Output_all_prediction_file, num_samples, Output_all_filtered_prediction_file) 181 | 182 | print("[%s] Finished!" 
% time_now(), file=sys.stderr) 183 | 184 | 185 | def Each_Thread_3UTR_estimation_list_version_sample_ids(curr_thread_UTR_events_ids, UTR_events_dict, All_sample_coverage_weights, num_samples, Output_result_file, All_samples_coverage_shared_dict, Coverage_threshold): 186 | Output_result = open(Output_result_file,'w') 187 | 188 | for curr_3UTR_id in curr_thread_UTR_events_ids: 189 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_id] 190 | region_start = curr_3UTR_structure[1] 191 | region_end = curr_3UTR_structure[2] 192 | curr_strand = curr_3UTR_structure[-2] 193 | UTR_pos = curr_3UTR_structure[-1] 194 | curr_3UTR_all_samples_bp_coverage = [] 195 | 196 | for i in range(num_samples): 197 | curr_sample_curr_3UTR_coverage_wig = All_samples_coverage_shared_dict[curr_3UTR_id, i] 198 | curr_3UTR_curr_sample_bp_coverage = Convert_wig_into_bp_coverage(curr_sample_curr_3UTR_coverage_wig[0], curr_sample_curr_3UTR_coverage_wig[1], curr_strand) 199 | curr_3UTR_all_samples_bp_coverage.append(curr_3UTR_curr_sample_bp_coverage) 200 | 201 | select_mean_squared_error, selected_break_point, UTR_abundances = De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(curr_3UTR_all_samples_bp_coverage, region_start, region_end,curr_strand,All_sample_coverage_weights, Coverage_threshold) 202 | 203 | if str(select_mean_squared_error) != "Na": 204 | num_non_zero = 1 205 | if num_non_zero > 0: 206 | All_long_inclusion_ratios = [] 207 | line_write = [curr_3UTR_id, "%.1f" % select_mean_squared_error, str(selected_break_point), UTR_pos] 208 | 209 | for i in range(num_samples): 210 | if UTR_abundances[0][i] != 'NA': 211 | # long 3'UTR percentage 212 | curr_sample_ratio = float(UTR_abundances[0][i])/(float(UTR_abundances[0][i]) + float(UTR_abundances[1][i])) 213 | All_long_inclusion_ratios.append(curr_sample_ratio) 214 | #line_write.append("%.2f" % UTR_abundances[0][i])#long 3' UTR abundance 215 | #line_write.append("%.2f" % UTR_abundances[1][i])#short 3' UTR abundance 216 | line_write.append("%.2f" % curr_sample_ratio) 217 | else: 218 | line_write.extend(['NA']*1) 219 | 220 | Output_result.writelines( '\t'.join(line_write) + '\n') 221 | 222 | Output_result.close() 223 | 224 | 225 | def De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(All_Samples_curr_3UTR_coverages, UTR_start, UTR_end, curr_strand, weight_for_second_coverage, Coverage_threshold): 226 | coverage_threshold = Coverage_threshold 227 | search_point_start = 150 ##200 228 | search_point_end = int(abs((UTR_end - UTR_start))*0.05) 229 | 230 | num_samples = len(All_Samples_curr_3UTR_coverages) 231 | #Read Coverage 232 | Region_Coverages = [] 233 | Pass_threshold_index = [] 234 | for i in range(num_samples): 235 | curr_Region_Coverage_raw = All_Samples_curr_3UTR_coverages[i] 236 | curr_Region_Coverage = curr_Region_Coverage_raw/weight_for_second_coverage[i]#@xdzou: not modified yet 237 | 238 | curr_first_100_coverage = np.mean(curr_Region_Coverage_raw[0:99]) 239 | if curr_first_100_coverage > coverage_threshold: 240 | Pass_threshold_index.append(i) 241 | Region_Coverages.append(curr_Region_Coverage) 242 | 243 | least_pass_coverage_num = num_samples * least_pass_coverage_percentage 244 | if len(Pass_threshold_index) > least_pass_coverage_num and UTR_end - UTR_start >=150: 245 | if curr_strand == "+": 246 | search_region = list(range(UTR_start+search_point_start, UTR_end-search_point_end+1)) 247 | else: 248 | search_region = list(range(UTR_end - search_point_start, UTR_start+search_point_end-1, -1)) 249 | 250 | search_region_start = 
search_point_start 251 | search_region_end = UTR_end - UTR_start - search_point_end 252 | Mean_squared_error_list = [] 253 | Estimated_3UTR_abundance_list = [] 254 | for curr_point in range(search_region_start, search_region_end+1): 255 | curr_search_point = curr_point 256 | All_samples_result = [[],[],[]] 257 | for curr_sample_region_coverage in Region_Coverages: 258 | Mean_Squared_error, Long_UTR_abun, Short_UTR_abun = Estimation_abundance(curr_sample_region_coverage, curr_search_point) 259 | All_samples_result[0].append(Mean_Squared_error) 260 | All_samples_result[1].append(Long_UTR_abun) 261 | All_samples_result[2].append(Short_UTR_abun) 262 | 263 | Mean_Squared_error = np.mean(np.array(All_samples_result[0])) 264 | Mean_squared_error_list.append(Mean_Squared_error) 265 | Estimated_3UTR_abundance_list.append([All_samples_result[1],All_samples_result[2]]) 266 | 267 | if len(Mean_squared_error_list) > 1: 268 | min_ele_index = Mean_squared_error_list.index(min(Mean_squared_error_list)) 269 | 270 | select_mean_squared_error = Mean_squared_error_list[min_ele_index] 271 | selected_break_point = search_region[min_ele_index] 272 | 273 | UTR_abundances = [['NA']*num_samples, ['NA']*num_samples] 274 | UTR_abundances_passed = Estimated_3UTR_abundance_list[min_ele_index] 275 | for k in range(len(Pass_threshold_index)): 276 | UTR_abundances[0][Pass_threshold_index[k]] = UTR_abundances_passed[0][k] 277 | UTR_abundances[1][Pass_threshold_index[k]] = UTR_abundances_passed[1][k] 278 | 279 | else: 280 | selected_break_point = 'Na' 281 | UTR_abundances = 'Na' 282 | select_mean_squared_error = 'Na' 283 | 284 | else: 285 | selected_break_point = 'Na' 286 | UTR_abundances = 'Na' 287 | select_mean_squared_error = 'Na' 288 | 289 | return select_mean_squared_error, selected_break_point, UTR_abundances 290 | 291 | 292 | def Estimation_abundance(Region_Coverage, break_point): 293 | Long_UTR_abun = np.mean(Region_Coverage[break_point:]) 294 | Short_UTR_abun = np.mean(Region_Coverage[0:break_point] - Long_UTR_abun) 295 | if Short_UTR_abun < 0: 296 | Short_UTR_abun = 0 297 | Coverage_diff = Region_Coverage[0:break_point] - Long_UTR_abun - Short_UTR_abun 298 | Coverage_diff = np.append(Coverage_diff, Region_Coverage[break_point:] - Long_UTR_abun) 299 | Mean_Squared_error = np.mean(Coverage_diff**2) 300 | 301 | return Mean_Squared_error, Long_UTR_abun, Short_UTR_abun 302 | 303 | 304 | def Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Wig_files,UTR_Annotation_file, num_threads,curr_processing_chr): 305 | num_samples = len(All_Wig_files) 306 | UTR_events_dict = {} 307 | for line in open(UTR_Annotation_file, 'r'): 308 | fields = line.strip('\n').split('\t') 309 | curr_chr = fields[0] 310 | if "chr" not in curr_chr: # prefix bare chromosome names so they match curr_processing_chr 311 | curr_chr = "chr" + curr_chr 312 | 313 | if curr_chr == curr_processing_chr: 314 | region_start = fields[1] 315 | region_end = fields[2] 316 | 317 | curr_strand = fields[-1] 318 | UTR_pos = "%s:%s-%s" %(curr_chr, region_start, region_end) 319 | end_shift = int(round(abs(int(region_start) - int(region_end)) * 0.2)) 320 | if curr_strand == "+": 321 | region_end = str(int(region_end) - end_shift) 322 | else: 323 | region_start = str(int(region_start) + end_shift) 324 | region_start = int(region_start) + 1 325 | region_end = int(region_end) - 1 326 | if region_start + 50 < region_end: 327 | UTR_events_dict[fields[3]] = [fields[0],region_start,region_end,fields[-1],UTR_pos] 328 | 329 | Assigned_index = Assign_to_different_processor_balance(num_samples, num_threads) 330 | 331 | manager = 
multiprocessing.Manager() # create only 1 Manager 332 | All_samples_extracted_3UTR_coverage_dict = manager.dict() # create only 1 dict 333 | 334 | processes = [] 335 | Final_assigned_threads_num = len(Assigned_index) 336 | for i in range(Final_assigned_threads_num): 337 | process = multiprocessing.Process(target=load_wig_funct_shared_dict_sampleid_key, args=(All_Wig_files, Assigned_index[i], UTR_events_dict,curr_processing_chr,All_samples_extracted_3UTR_coverage_dict)) 338 | process.start() 339 | processes.append(process) 340 | 341 | for p in processes: 342 | p.join() 343 | 344 | return All_samples_extracted_3UTR_coverage_dict, UTR_events_dict 345 | 346 | 347 | def load_wig_funct_shared_dict_sampleid_key(All_wig_files, assigned_indexes,UTR_events_dict, curr_processing_chr, All_samples_extracted_3UTR_coverage_dict): 348 | ''' 349 | All_samples_extracted_3UTR_coverage_dict: sample id is the key. 350 | ''' 351 | for i in assigned_indexes: 352 | curr_wig_file = All_wig_files[i] 353 | print(curr_wig_file, file=sys.stderr) 354 | curr_sample_All_chroms_coverage_dict = {} 355 | with open(curr_wig_file, 'r') as fin: 356 | for line in fin: 357 | if line[0] != '#' and line[0] != 't': 358 | fields = line.strip('\n').split('\t') 359 | chrom_name = fields[0] 360 | if chrom_name == curr_processing_chr: 361 | region_start = int(fields[1]) 362 | region_end = int(fields[2]) 363 | 364 | 365 | if chrom_name not in curr_sample_All_chroms_coverage_dict: 366 | curr_sample_All_chroms_coverage_dict[chrom_name] = [[0],[0]] 367 | if region_start > curr_sample_All_chroms_coverage_dict[chrom_name][0][-1]: 368 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_start) 369 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(0) 370 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_end) 371 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(int(float(fields[-1]))) 372 | else: 373 | if len(curr_sample_All_chroms_coverage_dict)>0: 374 | break 375 | fin.close() 376 | if curr_processing_chr not in curr_sample_All_chroms_coverage_dict: 377 | print('no wig: ' + curr_wig_file, file=sys.stderr) 378 | else: 379 | curr_sample_All_chroms_coverage_dict[curr_processing_chr][1].append(0) 380 | 381 | curr_sample_coverage_dict = {} 382 | 383 | for curr_3UTR_event_id in UTR_events_dict: 384 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_event_id] 385 | curr_chr_local = curr_3UTR_structure[0] 386 | if curr_chr_local in curr_sample_All_chroms_coverage_dict: 387 | curr_chr_coverage = curr_sample_All_chroms_coverage_dict[curr_chr_local] 388 | region_start = curr_3UTR_structure[1] 389 | region_end = curr_3UTR_structure[2] 390 | left_region_index = bisect(curr_chr_coverage[0],region_start) 391 | right_region_index = bisect(curr_chr_coverage[0],region_end) 392 | 393 | extracted_coverage = curr_chr_coverage[1][left_region_index:right_region_index+1] 394 | extracted_3UTR_region = curr_chr_coverage[0][left_region_index:right_region_index] 395 | extracted_3UTR_region.insert(0,region_start) 396 | extracted_3UTR_region.append(region_end) 397 | 398 | curr_event_info = [extracted_coverage,extracted_3UTR_region] 399 | All_samples_extracted_3UTR_coverage_dict[curr_3UTR_event_id,i] = curr_event_info 400 | 401 | def Assign_to_different_processor_balance(Total_number, num_processors): 402 | Assigned_results = [] 403 | num_each_processor = int(Total_number/num_processors)#@xdzou: add int() to convert the value into a integer othwise float 404 | 405 | if num_each_processor == 0: 406 | for i in 
range(Total_number): 407 | Assigned_results.append([i]) 408 | else: 409 | remain = Total_number - num_processors * num_each_processor 410 | for i in range(remain): 411 | Assigned_results.append(list(range((i)*(num_each_processor + 1), (i+1)*(num_each_processor + 1)))) 412 | for i in range(num_processors-remain): 413 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 414 | 415 | return Assigned_results 416 | 417 | 418 | def Assign_to_different_processor_balance_events(All_events_ids, num_processors): 419 | Assigned_results = [] 420 | Total_number = len(All_events_ids) 421 | num_each_processor = int(Total_number/num_processors) #@xdzou, add int() 422 | 423 | if num_each_processor == 0: 424 | for i in range(Total_number): 425 | Assigned_results.append([i]) 426 | else: 427 | remain = Total_number - num_processors * num_each_processor 428 | for i in range(remain): 429 | Assigned_results.append(list(range((i)*(num_each_processor+1), (i+1)*(num_each_processor+1)))) 430 | 431 | for i in range(num_processors-remain): 432 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 433 | #print assigned Results 434 | Assigned_events = [] 435 | print('#assigned events:') 436 | for curr_processor_inds in Assigned_results: 437 | curr_processor_events = [] 438 | print(len(curr_processor_inds)) 439 | for curr_ele in curr_processor_inds: 440 | curr_processor_events.append(All_events_ids[curr_ele]) 441 | Assigned_events.append(curr_processor_events) 442 | return Assigned_events 443 | 444 | #global parameters 445 | least_pass_coverage_percentage = 0.3 446 | 447 | De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(sys.argv) 448 | -------------------------------------------------------------------------------- /src/DaPars2_Multi_Sample_Multi_Chr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import datetime 5 | import threading 6 | import scipy as sp 7 | import scipy.stats 8 | from multiprocessing import Pool 9 | from bisect import bisect 10 | 11 | 12 | import math 13 | import time 14 | 15 | import multiprocessing 16 | 17 | 18 | def time_now():#return time 19 | curr_time = datetime.datetime.now() 20 | return curr_time.strftime("%c") 21 | 22 | def Convert_wig_into_bp_coverage(extracted_coverage,extracted_3UTR_region,strand_info): 23 | bp_coverage = np.zeros(extracted_3UTR_region[-1] - extracted_3UTR_region[0]) 24 | relative_start = extracted_3UTR_region[0] 25 | for i in range(len(extracted_coverage)): 26 | curr_region_start = extracted_3UTR_region[i] - relative_start 27 | curr_region_end = extracted_3UTR_region[i+1] - relative_start 28 | bp_coverage[curr_region_start:curr_region_end] = extracted_coverage[i] 29 | if strand_info == '-': 30 | bp_coverage = bp_coverage[::-1] 31 | 32 | return bp_coverage 33 | 34 | def parse_cfgfile(cfg_file): 35 | '''Parse configure file 36 | ''' 37 | Aligned_Wig_files='' 38 | output_directory='' 39 | Annotated_3UTR_file='' 40 | Output_result_file='' 41 | Coverage_threshold = 1 42 | Num_threads = 1 43 | sequencing_depth_file = '' 44 | 45 | for line in open(cfg_file, 'r'): 46 | if line[0] == '\n' or line[0] == '#': 47 | comments = line; 48 | else: 49 | line = line.rstrip() 50 | command = line.split('='); 51 | if command[0] == 'Aligned_Wig_files': 52 | 
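# (added note, illustrative) parse_cfgfile() reads a plain key=value text file; a hypothetical
# example covering every key the parser recognizes (values are placeholders):
#   Annotated_3UTR=refseq_3utr_annotation.bed
#   Aligned_Wig_files=sample1.wig,sample2.wig
#   Output_directory=Dapars2_out
#   Output_result_file=Dapars2
#   sequencing_depth_file=wigFile_and_readDepth.txt
#   Num_Threads=8
#   Coverage_threshold=15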
Aligned_Wig_files = command[1].split(','); 53 | if command[0] == 'Output_directory': 54 | output_directory = command[1] 55 | if output_directory[-1] != '/': 56 | output_directory += '/' 57 | if command[0] == 'Annotated_3UTR': 58 | Annotated_3UTR_file = command[1] 59 | if command[0] == 'Output_result_file': 60 | Output_result_file = command[1] 61 | if command[0] == 'sequencing_depth_file': 62 | sequencing_depth_file = command[1] 63 | if command[0] == 'Num_Threads': 64 | Num_threads = int(command[1]) 65 | if command[0] == 'Coverage_threshold': 66 | Coverage_threshold = int(command[1]) 67 | 68 | 69 | if Aligned_Wig_files == '': 70 | print("No aligned BAM file found!", file=sys.stderr) 71 | exit(1) 72 | if output_directory=='': 73 | print("No output directory!", file=sys.stderr) 74 | exit(1) 75 | if Annotated_3UTR_file=='': 76 | print("No annotated 3' UTR file!", file=sys.stderr) 77 | exit(1) 78 | if Output_result_file=='': 79 | print("No result file name!", file=sys.stderr) 80 | exit(1) 81 | if sequencing_depth_file=='': 82 | print("No sequencing depth file!", file=sys.stderr) 83 | exit(1) 84 | 85 | return Aligned_Wig_files, output_directory, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold 86 | 87 | def load_sequencing_depth(depth_file): 88 | seq_depth_list = [] 89 | for line in open(depth_file, 'r'): 90 | fields = line.strip('\n').split('\t') 91 | seq_depth_list.append(int(fields[-1])) 92 | 93 | return np.array(seq_depth_list) 94 | 95 | def De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(argv=None): 96 | '''multiple threads version 97 | ''' 98 | print((len(sys.argv))) 99 | if len(sys.argv) == 1: 100 | print("Please provide the configure file and specify chr name...") 101 | exit(1) 102 | 103 | cfg_file = sys.argv[1] 104 | Group1_Tophat_aligned_file, output_folder, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold = parse_cfgfile(cfg_file) 105 | All_Sample_files = Group1_Tophat_aligned_file[:] 106 | Sample_name = [] 107 | for sample in All_Sample_files: 108 | sample_name = sample.rsplit('.',1)[0] 109 | Sample_name.append(sample_name) 110 | #curr_processing_chr = sys.argv[2] 111 | 112 | #-- @xdzou: extract processing chromosomes from command line 113 | fh=open(sys.argv[2],'r') 114 | All_chroms = [] 115 | for line in fh.readlines(): 116 | line = line.strip() 117 | All_chroms.append(line) 118 | fh.close() 119 | 120 | 121 | for curr_processing_chr in All_chroms: 122 | print("[%s] Start Analysis ..." % time_now(), file=sys.stderr) 123 | 124 | ##Prepare output directory 125 | output_directory = output_folder.rstrip('/') + '_' + curr_processing_chr + '/' #@xdzou,change strip() to rsrip() 126 | d = os.path.dirname(output_directory) 127 | print(d) 128 | if not os.path.exists(d): 129 | os.makedirs(d) 130 | temp_dir = d + '/tmp/' 131 | if not os.path.exists(temp_dir): 132 | os.makedirs(temp_dir) 133 | 134 | Output_all_prediction_file = output_directory + Output_result_file + '_result_temp.' + curr_processing_chr + '.txt' 135 | Output_result = open(Output_all_prediction_file, 'w') 136 | 137 | num_samples = len(All_Sample_files) 138 | 139 | print("All samples Joint Processing %s ..." % curr_processing_chr, file=sys.stderr) 140 | print("[%s] Loading Coverage ..." 
% time_now(), file=sys.stderr) 141 | 142 | All_samples_Target_3UTR_coverages, UTR_events_dict = Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Sample_files, Annotated_3UTR_file, Num_threads,curr_processing_chr) 143 | All_samples_sequencing_depths = load_sequencing_depth(sequencing_depth_file) 144 | 145 | print(All_samples_sequencing_depths) 146 | All_sample_coverage_weights = All_samples_sequencing_depths/np.mean(All_samples_sequencing_depths) 147 | 148 | #print All_sample_coverage_weights 149 | print("[%s] Loading Coverage Finished ..." % time_now(), file=sys.stderr) 150 | #Write the first line 151 | first_line = ['Gene','fit_value','Predicted_Proximal_APA','Loci'] 152 | for i in range(num_samples): 153 | #curr_long_exp = 'Sample_%s_long_exp' % str(i+1) 154 | #curr_short_exp = 'Sample_%s_short_exp' % str(i+1) 155 | curr_ratio = '%s_PDUI' % str(Sample_name[i]) 156 | #first_line.extend([curr_long_exp,curr_short_exp,curr_ratio]) 157 | first_line.append(curr_ratio) 158 | 159 | Output_result.writelines('\t'.join(first_line) + '\n') 160 | 161 | All_events_ids = list(UTR_events_dict.keys()) 162 | num_threads = Num_threads 163 | Assigned_events_ids_all_threads = Assign_to_different_processor_balance_events(All_events_ids, num_threads) 164 | 165 | num_real_threads = len(Assigned_events_ids_all_threads) 166 | 167 | Output_each_processor_all = [] 168 | for i in range(num_real_threads): 169 | curr_temp_output = temp_dir + 'Each_processor_3UTR_Result_%s.txt' % (str(i+1)) 170 | Output_each_processor_all.append(curr_temp_output) 171 | 172 | processes = [] 173 | for i in range(num_real_threads): 174 | process = multiprocessing.Process(target=Each_Thread_3UTR_estimation_list_version_sample_ids, args=(Assigned_events_ids_all_threads[i], UTR_events_dict, All_sample_coverage_weights, num_samples, Output_each_processor_all[i], All_samples_Target_3UTR_coverages, Coverage_threshold)) 175 | process.start() 176 | processes.append(process) 177 | 178 | for p in processes: 179 | p.join() 180 | 181 | #Combine results 182 | for i in range(num_real_threads): 183 | curr_result = Output_each_processor_all[i] 184 | for line in open(curr_result, 'r'): 185 | Output_result.writelines(line) 186 | 187 | Output_result.close() 188 | 189 | 190 | print("[%s] Finished!" 
% time_now(), file=sys.stderr) 191 | 192 | 193 | def Each_Thread_3UTR_estimation_list_version_sample_ids(curr_thread_UTR_events_ids, UTR_events_dict, All_sample_coverage_weights, num_samples, Output_result_file, All_samples_coverage_shared_dict, Coverage_threshold): 194 | Output_result = open(Output_result_file,'w') 195 | 196 | for curr_3UTR_id in curr_thread_UTR_events_ids: 197 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_id] 198 | region_start = curr_3UTR_structure[1] 199 | region_end = curr_3UTR_structure[2] 200 | curr_strand = curr_3UTR_structure[-2] 201 | UTR_pos = curr_3UTR_structure[-1] 202 | curr_3UTR_all_samples_bp_coverage = [] 203 | 204 | for i in range(num_samples): 205 | curr_sample_curr_3UTR_coverage_wig = All_samples_coverage_shared_dict[curr_3UTR_id, i] 206 | curr_3UTR_curr_sample_bp_coverage = Convert_wig_into_bp_coverage(curr_sample_curr_3UTR_coverage_wig[0], curr_sample_curr_3UTR_coverage_wig[1], curr_strand) 207 | curr_3UTR_all_samples_bp_coverage.append(curr_3UTR_curr_sample_bp_coverage) 208 | 209 | select_mean_squared_error, selected_break_point, UTR_abundances = De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(curr_3UTR_all_samples_bp_coverage, region_start, region_end,curr_strand,All_sample_coverage_weights, Coverage_threshold) 210 | 211 | if str(select_mean_squared_error) != "Na": 212 | num_non_zero = 1 213 | if num_non_zero > 0: 214 | All_long_inclusion_ratios = [] 215 | line_write = [curr_3UTR_id, "%.1f" % select_mean_squared_error, str(selected_break_point), UTR_pos] 216 | 217 | for i in range(num_samples): 218 | if UTR_abundances[0][i] != 'NA': 219 | # long 3'UTR percentage 220 | curr_sample_ratio = float(UTR_abundances[0][i])/(float(UTR_abundances[0][i]) + float(UTR_abundances[1][i])) 221 | All_long_inclusion_ratios.append(curr_sample_ratio) 222 | #line_write.append("%.2f" % UTR_abundances[0][i])#long 3' UTR abundance 223 | #line_write.append("%.2f" % UTR_abundances[1][i])#short 3' UTR abundance 224 | line_write.append("%.2f" % curr_sample_ratio) 225 | else: 226 | line_write.extend(['NA']*1) 227 | 228 | Output_result.writelines( '\t'.join(line_write) + '\n') 229 | 230 | Output_result.close() 231 | 232 | 233 | def De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(All_Samples_curr_3UTR_coverages, UTR_start, UTR_end, curr_strand, weight_for_second_coverage, Coverage_threshold): 234 | coverage_threshold = Coverage_threshold 235 | search_point_start = 150 ##200 236 | search_point_end = int(abs((UTR_end - UTR_start))*0.05) 237 | 238 | num_samples = len(All_Samples_curr_3UTR_coverages) 239 | #Read Coverage 240 | Region_Coverages = [] 241 | Pass_threshold_index = [] 242 | for i in range(num_samples): 243 | curr_Region_Coverage_raw = All_Samples_curr_3UTR_coverages[i] 244 | curr_Region_Coverage = curr_Region_Coverage_raw/weight_for_second_coverage[i] 245 | 246 | curr_first_100_coverage = np.mean(curr_Region_Coverage_raw[0:99]) 247 | if curr_first_100_coverage > coverage_threshold: 248 | Pass_threshold_index.append(i) 249 | Region_Coverages.append(curr_Region_Coverage) 250 | 251 | least_pass_coverage_num = num_samples * least_pass_coverage_percentage 252 | if len(Pass_threshold_index) > least_pass_coverage_num and UTR_end - UTR_start >=150: 253 | if curr_strand == "+": 254 | search_region = list(range(UTR_start+search_point_start, UTR_end-search_point_end+1)) 255 | else: 256 | search_region = list(range(UTR_end - search_point_start, UTR_start+search_point_end-1, -1)) 257 | 258 | search_region_start = search_point_start 259 | search_region_end 
= UTR_end - UTR_start - search_point_end 260 | Mean_squared_error_list = [] 261 | Estimated_3UTR_abundance_list = [] 262 | for curr_point in range(search_region_start, search_region_end+1): 263 | curr_search_point = curr_point 264 | All_samples_result = [[],[],[]] 265 | for curr_sample_region_coverage in Region_Coverages: 266 | Mean_Squared_error, Long_UTR_abun, Short_UTR_abun = Estimation_abundance(curr_sample_region_coverage, curr_search_point) 267 | All_samples_result[0].append(Mean_Squared_error) 268 | All_samples_result[1].append(Long_UTR_abun) 269 | All_samples_result[2].append(Short_UTR_abun) 270 | 271 | Mean_Squared_error = np.mean(np.array(All_samples_result[0])) 272 | Mean_squared_error_list.append(Mean_Squared_error) 273 | Estimated_3UTR_abundance_list.append([All_samples_result[1],All_samples_result[2]]) 274 | 275 | if len(Mean_squared_error_list) > 1: 276 | min_ele_index = Mean_squared_error_list.index(min(Mean_squared_error_list)) 277 | 278 | select_mean_squared_error = Mean_squared_error_list[min_ele_index] 279 | selected_break_point = search_region[min_ele_index] 280 | 281 | UTR_abundances = [['NA']*num_samples, ['NA']*num_samples] 282 | UTR_abundances_passed = Estimated_3UTR_abundance_list[min_ele_index] 283 | for k in range(len(Pass_threshold_index)): 284 | UTR_abundances[0][Pass_threshold_index[k]] = UTR_abundances_passed[0][k] 285 | UTR_abundances[1][Pass_threshold_index[k]] = UTR_abundances_passed[1][k] 286 | 287 | else: 288 | selected_break_point = 'Na' 289 | UTR_abundances = 'Na' 290 | select_mean_squared_error = 'Na' 291 | 292 | else: 293 | selected_break_point = 'Na' 294 | UTR_abundances = 'Na' 295 | select_mean_squared_error = 'Na' 296 | 297 | return select_mean_squared_error, selected_break_point, UTR_abundances 298 | 299 | 300 | def Estimation_abundance(Region_Coverage, break_point): 301 | Long_UTR_abun = np.mean(Region_Coverage[break_point:]) 302 | Short_UTR_abun = np.mean(Region_Coverage[0:break_point] - Long_UTR_abun) 303 | if Short_UTR_abun < 0: 304 | Short_UTR_abun = 0 305 | Coverage_diff = Region_Coverage[0:break_point] - Long_UTR_abun - Short_UTR_abun 306 | Coverage_diff= np.append(Coverage_diff, Region_Coverage[break_point:] - Long_UTR_abun) 307 | Mean_Squared_error = np.mean(Coverage_diff**2) 308 | 309 | return Mean_Squared_error, Long_UTR_abun, Short_UTR_abun 310 | 311 | 312 | def Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Wig_files,UTR_Annotation_file, num_threads,curr_processing_chr): 313 | num_samples = len(All_Wig_files) 314 | UTR_events_dict = {} 315 | for line in open(UTR_Annotation_file, 'r'): 316 | fields = line.strip('\n').split('\t') 317 | curr_chr = fields[0] 318 | if curr_chr == curr_processing_chr: 319 | region_start = fields[1] 320 | region_end = fields[2] 321 | 322 | curr_strand = fields[-1] 323 | UTR_pos = "%s:%s-%s" %(curr_chr, region_start, region_end) 324 | end_shift = int(round(abs(int(region_start) - int(region_end)) * 0.2)) 325 | if curr_strand == "+": 326 | region_end = str(int(region_end) - end_shift) 327 | else: 328 | region_start = str(int(region_start) + end_shift) 329 | region_start = int(region_start) + 1 330 | region_end = int(region_end) - 1 331 | if region_start + 50 < region_end: 332 | UTR_events_dict[fields[3]] = [fields[0],region_start,region_end,fields[-1],UTR_pos] 333 | 334 | Assigned_index = Assign_to_different_processor_balance(num_samples, num_threads) 335 | 336 | manager = multiprocessing.Manager() # create only 1 Manager 337 | All_samples_extracted_3UTR_coverage_dict = manager.dict() # 
create only 1 dict 338 | 339 | processes = [] 340 | Final_assigned_threads_num = len(Assigned_index) 341 | for i in range(Final_assigned_threads_num): 342 | process = multiprocessing.Process(target=load_wig_funct_shared_dict_sampleid_key, args=(All_Wig_files, Assigned_index[i], UTR_events_dict,curr_processing_chr,All_samples_extracted_3UTR_coverage_dict)) 343 | process.start() 344 | processes.append(process) 345 | 346 | for p in processes: 347 | p.join() 348 | 349 | return All_samples_extracted_3UTR_coverage_dict, UTR_events_dict 350 | 351 | 352 | def load_wig_funct_shared_dict_sampleid_key(All_wig_files, assigned_indexes,UTR_events_dict, curr_processing_chr, All_samples_extracted_3UTR_coverage_dict): 353 | ''' 354 | All_samples_extracted_3UTR_coverage_dict: sample id is the key. 355 | ''' 356 | for i in assigned_indexes: 357 | curr_wig_file = All_wig_files[i] 358 | print(curr_wig_file, file=sys.stderr) 359 | curr_sample_All_chroms_coverage_dict = {} 360 | with open(curr_wig_file, 'r') as fin: 361 | for line in fin: 362 | if line[0] != '#' and line[0] != 't': 363 | fields = line.strip('\n').split('\t') 364 | chrom_name = fields[0] 365 | if chrom_name == curr_processing_chr: 366 | region_start = int(fields[1]) 367 | region_end = int(fields[2]) 368 | 369 | 370 | if chrom_name not in curr_sample_All_chroms_coverage_dict: 371 | curr_sample_All_chroms_coverage_dict[chrom_name] = [[0],[0]] 372 | if region_start > curr_sample_All_chroms_coverage_dict[chrom_name][0][-1]: 373 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_start) 374 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(0) 375 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_end) 376 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(int(float(fields[-1]))) 377 | else: 378 | if len(curr_sample_All_chroms_coverage_dict)>0: 379 | break 380 | fin.close() 381 | if curr_processing_chr not in curr_sample_All_chroms_coverage_dict: 382 | print('no wig: ' + curr_wig_file, file=sys.stderr) 383 | else: 384 | curr_sample_All_chroms_coverage_dict[curr_processing_chr][1].append(0) 385 | 386 | curr_sample_coverage_dict = {} 387 | 388 | for curr_3UTR_event_id in UTR_events_dict: 389 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_event_id] 390 | curr_chr_local = curr_3UTR_structure[0] 391 | if curr_chr_local in curr_sample_All_chroms_coverage_dict: 392 | curr_chr_coverage = curr_sample_All_chroms_coverage_dict[curr_chr_local] 393 | region_start = curr_3UTR_structure[1] 394 | region_end = curr_3UTR_structure[2] 395 | left_region_index = bisect(curr_chr_coverage[0],region_start) 396 | right_region_index = bisect(curr_chr_coverage[0],region_end) 397 | 398 | extracted_coverage = curr_chr_coverage[1][left_region_index:right_region_index+1] 399 | extracted_3UTR_region = curr_chr_coverage[0][left_region_index:right_region_index] 400 | extracted_3UTR_region.insert(0,region_start) 401 | extracted_3UTR_region.append(region_end) 402 | 403 | curr_event_info = [extracted_coverage,extracted_3UTR_region] 404 | All_samples_extracted_3UTR_coverage_dict[curr_3UTR_event_id,i] = curr_event_info 405 | 406 | def Assign_to_different_processor_balance(Total_number, num_processors): 407 | Assigned_results = [] 408 | num_each_processor = int(Total_number/num_processors)#@xdzou: add int() 409 | 410 | if num_each_processor == 0: 411 | for i in range(Total_number): 412 | Assigned_results.append([i]) 413 | else: 414 | remain = Total_number - num_processors * num_each_processor 415 | for i in range(remain): 416 | 
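# (added note) balanced assignment: the first `remain` processors each receive
# num_each_processor+1 consecutive indexes and the rest receive num_each_processor,
# so per-processor loads differ by at most one item.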
Assigned_results.append(list(range((i)*(num_each_processor + 1), (i+1)*(num_each_processor + 1)))) 417 | for i in range(num_processors-remain): 418 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 419 | 420 | return Assigned_results 421 | 422 | 423 | def Assign_to_different_processor_balance_events(All_events_ids, num_processors): 424 | Assigned_results = [] 425 | Total_number = len(All_events_ids) 426 | num_each_processor = int(Total_number/num_processors) #xdzou: add int() 427 | 428 | if num_each_processor == 0: 429 | for i in range(Total_number): 430 | Assigned_results.append([i]) 431 | else: 432 | remain = Total_number - num_processors * num_each_processor 433 | for i in range(remain): 434 | Assigned_results.append(list(range((i)*(num_each_processor+1), (i+1)*(num_each_processor+1)))) 435 | 436 | for i in range(num_processors-remain): 437 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 438 | #print assigned Results 439 | Assigned_events = [] 440 | print('#assigned events:') 441 | for curr_processor_inds in Assigned_results: 442 | curr_processor_events = [] 443 | print(len(curr_processor_inds)) 444 | for curr_ele in curr_processor_inds: 445 | curr_processor_events.append(All_events_ids[curr_ele]) 446 | Assigned_events.append(curr_processor_events) 447 | return Assigned_events 448 | 449 | #global parameters 450 | least_pass_coverage_percentage = 0.3 451 | 452 | De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(sys.argv) 453 | --------------------------------------------------------------------------------
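Appendix (added, illustrative): how the two DaPars2 driver scripts above appear to be invoked, inferred from their sys.argv handling and parse_cfgfile(); file names and chromosome labels are placeholders, not part of the original repository:

    # one chromosome per run (configure file + chromosome name)
    python src/Dapars2_Multi_Sample.py Dapars2_running_configure.txt chr1
    # all chromosomes in one run (configure file + a text file listing one chromosome per line)
    python src/DaPars2_Multi_Sample_Multi_Chr.py Dapars2_running_configure.txt chromosome_list.txt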