├── src
│   ├── readme
│   ├── extract_SNP_location.py
│   ├── genotype_2_bed.py
│   ├── extract_3UTR_location.py
│   ├── merge_finemap_results.R
│   ├── QTL_plot.R
│   ├── extract_read_depth.py
│   ├── merge_apa_quant_res_by_chr.R
│   ├── finemapping.R
│   ├── generate_configure_for_Dapars2.py
│   ├── prepare_susieR_uniqGene_location.py
│   ├── recode_with_012.py
│   ├── run_fine_mapping.sh
│   ├── run_3aQTL_mapping.R
│   ├── prepare_inputs_for_finemapping.sh
│   ├── DaPars_Extract_Anno.py
│   ├── curate_pheno_geno_covariates.R
│   ├── prepare_inputs_for_apa_quant.sh
│   ├── prepare_inputs_for_3aQTL_mapping.sh
│   ├── Dapars2_Multi_Sample.py
│   └── DaPars2_Multi_Sample_Multi_Chr.py
├── 3aQTL-pipe_Test_Dataset.zip
├── LICENSE
└── README.md
/src/readme:
--------------------------------------------------------------------------------
1 | Updated source code of 3aQTL-pipe
2 |
--------------------------------------------------------------------------------
/3aQTL-pipe_Test_Dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/3UTR/3aQTL-pipe/HEAD/3aQTL-pipe_Test_Dataset.zip
--------------------------------------------------------------------------------
/src/extract_SNP_location.py:
--------------------------------------------------------------------------------
1 | '''
2 | SNP locations can be extracted directly from the processed genotype file (they are embedded in the SNP IDs)
3 | run in a python3 environment
4 | usage: python extract_SNP_location.py --genotype_bed /path/to/genotype_matrix.bed --output /path/to/output/snp_location.txt
5 | '''
6 | import argparse
7 |
8 | parser = argparse.ArgumentParser(description='')
9 | parser.add_argument('--genotype_bed',type=str,help="provide the transformed genotype matrix")
10 | parser.add_argument('--output',type=str, default="snp_location.txt",help="specify the SNP location file")
11 |
12 | args = parser.parse_args()
13 |
14 |
15 | fh = open(args.genotype_bed,'r')
16 | fho = open(args.output,'w')
17 | header = fh.readline()
18 | print("SNP\tChr\tPos",file=fho)
19 | for line in fh.readlines():
20 | line = line.strip()
21 | snp = line.split("\t")[0]
22 | w = snp.split("_")
23 | if len(w)>=2:
24 | # SNP IDs are encoded as chr_pos_ref_alt, so the chromosome and position are the first two fields
25 | chrom,pos = snp.split("_")[0:2]
26 | print("%s\t%s\t%s" % (snp,chrom,pos), file=fho)
27 | else:
28 | print("Error:",snp)
29 | fh.close()
30 | fho.close()
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Xudong Zou
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/genotype_2_bed.py:
--------------------------------------------------------------------------------
1 | # Transform the 012 genotype matrix into a bed file
2 | # by adding three columns (chr start end) as the first three columns
3 | # output the header line of the genotype matrix for further use
4 |
5 | import argparse
6 | # - Main
7 | if __name__ == "__main__":
8 | parser = argparse.ArgumentParser(description="")
9 | parser.add_argument('--genotype', help="provide the genotype matrix used in 3'aQTL mapping")
10 | parser.add_argument('--out_bed', help="specify the output bed file name")
11 | parser.add_argument('--out_header', help="specify a file to store the genotype header")
12 |
13 | args = parser.parse_args()
14 |
15 | fh = open(args.genotype,'r')
16 | fho = open(args.out_bed,'w')
17 | fho_header = open(args.out_header,'w')
18 | header = fh.readline().strip()
19 | print(header, file=fho_header)
20 | fho_header.close()
21 | i = 0
22 | for line in fh.readlines():
23 | i += 1
24 | line = line.strip()
25 | snp = line.split("\t")[0]
26 | chrom,pos,ref,alt = snp.split("_")
27 | pos = int(pos)
28 | print("%s\t%d\t%d\t%s" % (chrom,pos-1,pos,line), file=fho)
29 | # print(i)
30 |
31 | print(i,"SNPs have been processed!")
32 | fh.close()
33 | fho.close()
--------------------------------------------------------------------------------
/src/extract_3UTR_location.py:
--------------------------------------------------------------------------------
1 | '''
2 | @ the 3' UTR location of each transcript can be extracted from the DaPars2 results file,
3 | @ which reports the 3'UTR location in its output (the fourth column)
4 | run in a python3 environment
5 | usage: python extract_3UTR_location.py --dapars_res /path/to/Dapars2_res.all_chromosomes.txt --output /path/to/3utr_location.txt
6 | '''
7 | import argparse
8 |
9 | parser = argparse.ArgumentParser(description='')
10 | parser.add_argument('--dapars_res',type=str,help="provide the output file generated by dapars2; the merged all-chromosomes file is required if dapars2 was run on each chromosome separately")
11 | parser.add_argument('--output',type=str, default="3utr_location.txt",help="specify the 3' UTR location file")
12 |
13 | args = parser.parse_args()
14 |
15 | gene2loc = {} # a dict to record the location
16 | gene_order = [] # store every gene in this list to keep order for output
17 |
18 | # -- load dapars result file and extract gene and location info
19 | fh = open(args.dapars_res,'r')
20 | header = fh.readline()
21 | for line in fh.readlines():
22 | line = line.strip()
23 | w = line.split("\t")
24 | chrom,loc_code = w[3].split(":")
25 | start,end = loc_code.split("-")
26 |
27 | gene2loc[w[0]] = (chrom,start,end)
28 | gene_order.append(w[0])
29 | fh.close()
30 | print(len(gene_order),"genes have been processed!")
31 |
32 | # -- output
33 | fho = open(args.output,'w')
34 | print("Gene\tChr\tStart\tEnd", file=fho)
35 | for gene in gene_order:
36 | if gene in gene2loc:
37 | print("%s\t%s\t%s\t%s" % (gene,gene2loc[gene][0],gene2loc[gene][1],gene2loc[gene][2]), file=fho)
38 | fho.close()
39 |
40 | print("Done!")
--------------------------------------------------------------------------------
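Taken together, the utilities above produce the location tables that Matrix-eQTL consumes. A minimal usage sketch, assuming the default file layout used by the other scripts in this repository:

```bash
# SNP coordinates are parsed out of the chr_pos_ref_alt SNP IDs in column 1
python src/extract_SNP_location.py \
    --genotype_bed ./Matrix_eQTL/Genotype_matrix.txt \
    --output ./Matrix_eQTL/snp_location.txt

# 3'UTR coordinates come from column 4 (Loci) of the merged DaPars2 table
python src/extract_3UTR_location.py \
    --dapars_res Dapars2_res.all_chromosomes.txt \
    --output ./Matrix_eQTL/3UTR_location.txt
```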
/src/merge_finemap_results.R:
--------------------------------------------------------------------------------
1 | library(optparse)
2 |
3 | option_list <- list(
4 | make_option(c("-d","--directory_finemap"),type="character",default="./FineMapping",action="store",help="the base directory of the fine-mapping run, default is ./FineMapping"),
5 | make_option(c("-g","--gene_list"),type="character",default="picked_asso_list.loc_1000000.txt",action="store",help="the file containing the gene list used for fine-mapping, default is picked_asso_list.loc_1000000.txt"))
6 |
7 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
8 |
9 | basedir <- opt$directory_finemap # the base path of the susieR analysis
10 | aGene_file <- opt$gene_list
11 | setwd(basedir)
12 | cat('Options:\n','basedir:',basedir,'\naGene_file:',aGene_file,'\n')
13 | gene_list <- read.table(paste0("./input/",aGene_file),header=F,sep="\t")
14 | gene_list <- gene_list[,1]
15 | independent_snp_count <- c()
16 |
17 | susie_df <- data.frame(locus_id=c(),variant_id=c(),pip=c(),cs=c(),cs_size=c(),cs_purity=c())
18 | for(idx in 1:length(gene_list)){
19 | file_name <- paste0("./output/",gene_list[idx],"/3aQTL.SuSiE.txt")
20 | cat(file_name,"\n")
21 | if (file.exists(file_name)){
22 | df <- read.table(file_name,header=T,sep=" ")
23 | independent_snp_count[idx] <- dim(df)[1]
24 |
25 | if (dim(df)[1]>0){
26 | df$locus_id <- gene_list[idx]
27 | susie_df <- rbind(susie_df,df)
28 | }
29 | }else{
30 | independent_snp_count[idx] <- NA
31 | }
32 | }
33 |
34 | summary_susie <- data.frame(Gene=gene_list,Count=independent_snp_count)
35 |
36 | write.table(susie_df,file="susieR_res.all_genes.txt",quote=F,row.names=F,sep="\t")
37 | write.table(summary_susie,file="susieR_res.stat.txt",quote=F,row.names=F,sep="\t")
--------------------------------------------------------------------------------
/src/QTL_plot.R:
--------------------------------------------------------------------------------
1 | library(optparse)
2 |
3 | option_list <- list(
4 | make_option(c("-s","--snp"),type="character",default="NA",action="store",help="specify a SNP"),
5 | make_option(c("-g","--gene"),type="character",default="NA",action="store",help="specify a gene"),
6 | make_option(c("-G","--genotype"),type="character",default="./Matrix_eQTL/Genotype_matrix.txt",action="store",help="specify the genotype matrix used in 3'aQTL mapping, default is ./Matrix_eQTL/Genotype_matrix.txt"),
7 | make_option(c("-P","--phenotype"),type="character",default="./Matrix_eQTL/Phenotype_matrix.txt",action="store",help="specify the phenotype matrix used in 3'aQTL mapping, default is ./Matrix_eQTL/Phenotype_matrix.txt")
8 | )
9 |
10 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
11 |
12 | # load the genotype matrix and the phenotype matrix
13 | gt <- read.table(opt$genotype,header=T,sep="\t", check.names=FALSE)
14 | pt <- read.table(opt$phenotype,header=T,sep="\t", check.names=FALSE)
15 |
16 | rownames(gt) <- gt[,1]
17 | rownames(pt) <- pt[,1]
18 | gt <- gt[,-1]
19 | pt <- pt[,-1]
20 |
21 | snp <- as.character(opt$snp)
22 | gene <- as.character(opt$gene)
23 | # gene IDs look like RefSeqID|GeneSymbol|chr|strand; use the gene symbol in the output file name
24 | geneName <- strsplit(gene,split="|",fixed=T)[[1]][2]
25 |
26 |
27 | e1 = as.numeric(pt[which(rownames(pt)==gene),])
28 | s1 = as.numeric(gt[which(rownames(gt)==snp),])
29 |
30 | lm1 = lm(e1 ~ s1)
31 | pdf(paste(snp, geneName,"pdf", sep="."))
32 | boxplot(e1 ~ s1, lwd = 2, xaxt="n",xlab="Genotype",ylab="Normalized PDUI",main=paste(snp,gene,sep=" || "))
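# The genotype vector s1 holds 0/1/2 ALT-allele counts (see recode_with_012.py),
# so the three boxes are labelled REF, HET and ALT below.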
33 | axis(1,at=c(1:3),labels=c("REF","HET","ALT"))
34 | stripchart(e1 ~ s1, vertical = TRUE, method = "jitter", add = TRUE, pch = 20, col = c(rgb(102,194,165,max=255),rgb(252,141,98,max=255),rgb(141,160,203,max=255)))
35 | dev.off()
--------------------------------------------------------------------------------
/src/extract_read_depth.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path
3 |
4 | # -- Functions
5 | def load_sample_list(input_sample_list_file):
6 | sample_list = []
7 | for line in open(input_sample_list_file,'r'):
8 | line = line.strip()
9 | sample_id = line.split("\t")[0]
10 | sample_list.append(sample_id)
11 |
12 | return sample_list
13 |
14 | def extract_total_reads(input_flagstat_file):
15 | num_line = 0
16 | total_reads = '-1'
17 | #print input_flagstat_file
18 | # assumes the samtools (<1.13) flagstat layout, in which the 5th line reports the mapped reads
19 | for line in open(input_flagstat_file,'r'):
20 | num_line += 1
21 | if num_line == 5:
22 | total_reads = line.strip().split(' ')[0]
23 | break
24 | return total_reads
25 |
26 |
27 | # -- Main
28 |
29 | if __name__ == '__main__':
30 | parser = argparse.ArgumentParser(description='')
31 | parser.add_argument('--sample_list',help="the file containing all samples")
32 | parser.add_argument('--path_flagstat',default="./tmp",help="the location of the flagstat files")
33 | parser.add_argument('--path_wig',default="./wig",help="the location of the wig files")
34 | parser.add_argument('--output',help="the final output file with the read depth of each sample")
35 |
36 | args = parser.parse_args()
37 |
38 | selected_samples = load_sample_list(os.path.abspath(args.sample_list))
39 | path_wig = os.path.abspath(args.path_wig)
40 | path_flagstat = os.path.abspath(args.path_flagstat)
41 |
42 | if path_wig[-1] != "/":
43 | path_wig += "/"
44 | else:
45 | pass
46 |
47 | if path_flagstat[-1] != "/":
48 | path_flagstat += "/"
49 | else:
50 | pass
51 |
52 | fho = open(args.output,'w')
53 | for sample in selected_samples:
54 | filename_sample = sample + ".flagstat"
55 | read_depth = extract_total_reads(path_flagstat + filename_sample)
56 | wig_location = path_wig + sample + ".wig"
57 | print("%s\t%s" % (wig_location,read_depth),file=fho)
58 |
59 | fho.close()
--------------------------------------------------------------------------------
/src/merge_apa_quant_res_by_chr.R:
--------------------------------------------------------------------------------
1 | # merge Dapars2 output by chromosome
2 | library(optparse)
3 | # -- global variable
4 | option_list <- list(
5 | make_option(c("-d","--dir_prefix"), type = "character", default = "Dapars2_out",
6 | action = "store", help = "Specify the directory prefix of the DaPars2 output"),
7 | make_option(c("-f", "--file_prefix"), type = "character", default = "Dapars2",
8 | action = "store", help = "Specify the file prefix of the DaPars2 output"),
9 | make_option(c("-s", "--sample_list"), type = "character", default = "sample_list.txt",
10 | action = "store", help = "A file containing the sample list"),
11 | make_option(c("-c", "--chr_list"), type = "character", default = "chrList.txt",
12 | action = "store", help = "A file containing the chromosome list"),
13 | make_option(c("-o", "--output"), type = "character", default = "Dapars2_res.all_chromosomes.txt",
14 | action = "store", help = "Specify the output file name")
15 | )
16 |
17 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
18 | dir_pre <- opt$dir_prefix
19 | file_pre <- opt$file_prefix
20 | bamList <- opt$sample_list
21 | chromList <-
opt$chr_list 22 | outFile <- opt$output 23 | cat("dir_pre:",dir_pre,"\nfile_pre:",file_pre,"\nchromList:",chromList,"\n") 24 | # -- functions 25 | load_dapars2_res <- function(chromosome,new_header){ 26 | input_file <- paste0(dir_pre,"_",chromosome,"/",file_pre,"_result_temp.",chromosome,".txt") 27 | dap_res <- read.table(input_file,header=T, sep="\t") 28 | names(dap_res) <- new_header 29 | 30 | return(dap_res) 31 | } 32 | 33 | 34 | # -- main 35 | 36 | # load samples 37 | dat <- read.table(bamList,header=F) 38 | sample_list <- as.character(dat$V1) 39 | col_names <- c("Gene","fit_value","Predicted_Proximal_APA","Loci",sample_list) 40 | chrs_list <- read.table(chromList,header=F) 41 | chrs_vec <- as.character(chrs_list$V1) 42 | rm(chrs_list) 43 | if(substr(chrs_vec[1],1,3)!="chr"){ 44 | chrs_vec <- paste0("chr",chrs_vec) 45 | } 46 | 47 | chrs_vec 48 | res.df <- data.frame() 49 | 50 | for(chr in chrs_vec){ 51 | temp.df <- load_dapars2_res(chr,col_names) 52 | print(paste(chr,dim(temp.df)[1],sep=":")) 53 | res.df <- rbind(res.df,temp.df) 54 | } 55 | 56 | dim(res.df) 57 | write.table(res.df,file=outFile,quote=F,sep="\t",row.names=F) 58 | 59 | 60 | -------------------------------------------------------------------------------- /src/finemapping.R: -------------------------------------------------------------------------------- 1 | # perform fine mapping on one gene 2 | args <- commandArgs(trailingOnly=TRUE) 3 | dir <- args[1] 4 | Lvalue <- as.integer(args[2]) 5 | sp_var <- as.numeric(args[3]) 6 | mPIP <- as.numeric(args[4]) 7 | 8 | setwd(dir) 9 | 10 | cat('Running environment:',getwd(),'\nOptions:\n','Lvalue:',Lvalue,'\nsp_var:',sp_var,'\nmPIP:',mPIP,'\n') 11 | genotype = '3aQTL.vcf' 12 | phenotype = 'expr.phen' 13 | 14 | # output file prefix 15 | prefix = tools::file_path_sans_ext(genotype) 16 | 17 | X = t(read.table(genotype, head=T, row.names=1, quote="'", check.names=FALSE)) 18 | # fill missing values in X with mean 19 | # because susieR does not deal with missing data explicitly for now 20 | for(i in 1:ncol(X)){ 21 | X[is.na(X[,i]), i] <- mean(X[,i], na.rm = TRUE) 22 | } 23 | y = read.table(phenotype, head=F)[,-1] 24 | # Adjust row names for phenotype data convention 25 | #rownames(y) = gsub("-", ".", y[,1]) 26 | rownames(y) = y[,1] 27 | # Obtain intersect of X and y data, and reorder X to match y ordering 28 | x_idx = match(rownames(y), rownames(X)) 29 | y_idx = which(!is.na(x_idx)) 30 | x_idx = x_idx[!is.na(x_idx)] 31 | X = X[x_idx, ] 32 | y = y[y_idx,] 33 | if (!all(rownames(X) == rownames(y))) stop("X and y rownames mismatch") 34 | # Run SuSiE 35 | res = susieR::susie(X, y[,2], L=Lvalue, scaled_prior_variance=sp_var) 36 | # Visualize result 37 | pdf(paste0(prefix, '.SuSiE.pdf'), width=10,height=5) 38 | susieR::susie_plot(res, y = 'PIP') 39 | dev.off() 40 | # Format results focusing only on signals 41 | res$var_names = colnames(X) 42 | get_susie_output = function(unit, res, pip_cutoff = mPIP) { 43 | cs_id = cs_size = cs_purity = rep(NA, length(res$var_names)) 44 | num_cs = length(res$sets$cs) 45 | for(id in 1:num_cs){ 46 | idx = res$sets$cs[[id]] 47 | cs_id[idx] = names(res$sets$cs)[id] 48 | cs_size[idx] = length(res$sets$cs[[id]]) 49 | cs_purity[idx] = res$sets$purity[id,1] 50 | } 51 | out = cbind.data.frame(rep(unit, length(res$var_names)), 52 | res$var_names, 53 | res$pip, cs_id, cs_size, cs_purity) 54 | colnames(out) = c("locus_id", "variant_id", "pip", "cs", "cs_size", "cs_purity") 55 | out[which(out[,3] >= pip_cutoff | !is.na(out[,4])), ] 56 | } 57 | text_output = 
get_susie_output(genotype, res)
58 | # Output to files
59 | write.table(text_output, paste0(prefix, '.SuSiE.txt'), quote=FALSE, row.names=FALSE)
60 | saveRDS(res, paste0(prefix, '.SuSiE.rds'))
--------------------------------------------------------------------------------
/src/generate_configure_for_Dapars2.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path
3 |
4 | # -- Functions
5 | def extract_all_wigs(input_seq_depth_file):
6 | wig_files = []
7 | for line in open(input_seq_depth_file,'r'):
8 | line = line.strip()
9 | w = line.split("\t")
10 | wig_files.append(w[0])
11 |
12 | return wig_files
13 |
14 |
15 | # -- Main
16 |
17 | if __name__ == '__main__':
18 | parser = argparse.ArgumentParser(description='')
19 | parser.add_argument('--annotation_3utr',help="the location of the reference 3'UTR bed file")
20 | parser.add_argument('--wigFile_depth',help="the index file listing all wig files and their read depth")
21 | parser.add_argument('--coverage_threshold',type=str,default="10",help="specify the coverage threshold, default=10")
22 | parser.add_argument('--threads',type=str,default="1",help="specify the number of threads used, default=1")
23 | parser.add_argument('--out_dir_prefix',type=str, default="Dapars2_out",help="specify the directory prefix of the dapars2 output, default='Dapars2_out'")
24 | parser.add_argument('--out_file_prefix',type=str,default="Dapars2",help="specify the result file prefix of the dapars2 output, default='Dapars2'")
25 | parser.add_argument('--out_config_name',type=str,default="Dapars2_running_configure.txt",help="specify the configure file name, default='Dapars2_running_configure.txt'")
26 |
27 | args = parser.parse_args()
28 |
29 | configure_file_name = args.out_config_name
30 | fho = open(configure_file_name,'w')
31 | # print Annotated_3UTR
32 | print("# Specify the reference of 3'UTR region", file=fho)
33 | print("\nAnnotated_3UTR=" + os.path.abspath(args.annotation_3utr), file=fho)
34 |
35 | # print wig files
36 | all_wig_files = extract_all_wigs(os.path.abspath(args.wigFile_depth))
37 | print("\n# A comma separated list of wig files of all samples", file=fho)
38 | print("\nAligned_Wig_files=" + ",".join(all_wig_files), file=fho)
39 |
40 | # specify Output_directory and Output_result_file
41 | print("\nOutput_directory=" + args.out_dir_prefix, file=fho)
42 | print("\nOutput_result_file=" + args.out_file_prefix, file=fho)
43 |
44 | # specify Coverage_threshold
45 | print("\n# Specify Coverage threshold", file=fho)
46 | print("\nCoverage_threshold=" + args.coverage_threshold, file=fho)
47 |
48 | # specify the Num_Threads to process the analysis
49 | print("\n# Specify the number of threads to process the analysis", file=fho)
50 | print("\nNum_Threads=" + args.threads, file=fho)
51 |
52 | # Provide sequencing_depth_file for normalization
53 | print("\n# Provide sequencing depth file for normalization", file=fho)
54 | print("\nsequencing_depth_file=" + os.path.abspath(args.wigFile_depth), file=fho)
55 | fho.close()
--------------------------------------------------------------------------------
/src/prepare_susieR_uniqGene_location.py:
--------------------------------------------------------------------------------
1 | '''
2 | Desc:
3 | Obtain unique and significant aGenes and their extended location (specified by --extend_size, default: 1Mb)
4 | Input files: 3utr_location.txt, Cis_aQTL_all_control_gene_exprs.txt
5 | Output format: Gene\tchr:start-end
6 | '''
7 |
8 | import argparse
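# Expected input layouts, as parsed below: --utr_loc_file has a header and the
# columns Gene/Chr/Start/End; --aQTL_map has a header, with the gene ID in
# column 2 and the FDR in column 6 (the Matrix-eQTL cis output format).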
9 | import os.path
10 | import time
11 |
12 | # - Main
13 | if __name__ == "__main__":
14 | parser = argparse.ArgumentParser(description="")
15 | parser.add_argument('--utr_loc_file',help="input the reference 3'UTR location file, e.g. 3utr_location.txt")
16 | parser.add_argument('--aQTL_map',help="specify the aQTL mapping file")
17 | parser.add_argument('--Max_FDR',type=float,default=0.05,help="specify the maximum FDR for selecting significant aQTL associations")
18 | parser.add_argument('--extend_size',type=int, default=1000000,help="Int, extend N bp (default N=1e6) at both sides")
19 | parser.add_argument('--outdir',help="specify the output dir")
20 | parser.add_argument('--output',help="specify the output file name")
21 |
22 | args = parser.parse_args()
23 |
24 | # -- extract gene and location
25 | print("Start extracting 3'UTR location file...")
26 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
27 |
28 | fh = open(args.utr_loc_file,'r')
29 | gene2loc = {}
30 | head = fh.readline()
31 | ext_size = int(args.extend_size)
32 |
33 | for line in fh.readlines():
34 | line = line.strip()
35 | w = line.split("\t")
36 | start = max([0,int(w[2]) - ext_size])
37 | end = int(w[3]) + ext_size
38 | loc = w[1] + ":" + str(start) + "-" + str(end)
39 | gene2loc[w[0]] = loc
40 | fh.close()
41 |
42 | # -- processing aQTL mapping files
43 | print("Start processing aQTL mapping file...")
44 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
45 | fdr_cutoff = float(args.Max_FDR)
46 | fh = open(args.aQTL_map,'r')
47 | outdict = {}
48 | for line in fh.readlines()[1:]:
49 | line = line.strip()
50 | w = line.split("\t")
51 | gene = w[1]
52 | fdr = float(w[5])
53 | if fdr < fdr_cutoff:
54 | if gene not in outdict:
55 | outdict[gene] = gene2loc[gene]
56 | else:
57 | continue
58 | else:
59 | continue
60 |
61 | fh.close()
62 |
63 | print(len(outdict),"unique aGenes were processed.")
64 |
65 |
66 | output_path = os.path.abspath(args.outdir)
67 | fho = open(output_path + "/" + args.output,'w')
68 | i = 0
69 | for gene in outdict:
70 | i += 1
71 | print(gene + "\t" + outdict[gene], file=fho)
72 | print(i)
73 | fho.close()
74 | print("Done!")
75 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
--------------------------------------------------------------------------------
/src/recode_with_012.py:
--------------------------------------------------------------------------------
1 | '''
2 | after getting the *.frq and *.FORMAT files from vcftools,
3 | this script extracts the allelic genotype info from *.frq and recodes the GT calls in *.FORMAT into 012 format
4 | '''
5 |
6 | import argparse
7 | import time
8 |
9 |
10 | # - Functions
11 | # extracting allelic genotype from frq
12 | def extract_gt_from_frq(frq_file):
13 | snp2gt = {}
14 | fh = open(frq_file,'r')
15 | for line in fh.readlines()[1:]:
16 | line = line.strip()
17 | w = line.split("\t")
18 | if "chr" not in w[0]:
19 | snp = "chr" + w[0] + "_" + w[1]
20 | else:
21 | snp = w[0] + "_" + w[1]
22 | allele_ref = w[4].split(":")[0]
23 | allele_alt = w[5].split(":")[0]
24 |
25 | if snp not in snp2gt:
26 | snp2gt[snp] = allele_ref + "_" + allele_alt
27 | else:
28 | continue
29 | fh.close()
30 | return snp2gt
31 |
32 |
33 |
34 | # recode genotype to 012 code
35 | def recode_with_012(gt):
36 | if "." in gt:
37 | return "NA"
38 | else:
39 | allele_1 = int(gt[0])
40 | allele_2 = int(gt[-1])
41 | return str(allele_1+allele_2)
42 |
43 | # - Main
44 | if __name__ == "__main__":
45 | parser = argparse.ArgumentParser(description="")
46 | parser.add_argument('--frq',help="input the frq file generated by vcftools")
47 | parser.add_argument('--GT',help="specify the GT format file generated by vcftools")
48 | parser.add_argument('--output',help="specify the output file with 012-recoded genotypes")
49 |
50 | args = parser.parse_args()
51 |
52 | # -- extract allelic gt from frq file
53 | print("Start processing frq file...")
54 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
55 |
56 | snv_gt = extract_gt_from_frq(args.frq)
57 | print("Obtained genotypes of %d SNPs" % (len(snv_gt)))
58 |
59 | # -- recode gt in GT file
60 | print("Start processing GT file...")
61 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
62 |
63 | fh = open(args.GT,'r')
64 | fho = open(args.output,'w')
65 | header = fh.readline().strip().split("\t")
66 | print("%s\t%s" % ("id","\t".join(header[2:])), file=fho)
67 | for line in fh.readlines():
68 | line = line.strip()
69 | w = line.split("\t")
70 | if "chr" not in w[0]:
71 | snp = "chr" + w[0] + "_" + w[1]
72 | else:
73 | snp = w[0] + "_" + w[1]
74 |
75 | if snp in snv_gt:
76 | snp = snp + "_" + snv_gt[snp]
77 | else:
78 | snp = snp
79 |
80 | gt_012 = list(map(recode_with_012,w[2:]))
81 |
82 | print("%s\t%s" % (snp,"\t".join(gt_012)), file=fho)
83 |
84 | fh.close()
85 | fho.close()
86 |
87 | print("The recoded gt file has been written to",args.output)
88 | print("Done!")
89 | print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 3'aQTL-pipe
2 |
3 | [![Github Release](https://img.shields.io/badge/release-v1.1-brightgreen)](https://github.com/3UTR/3aQTL-pipe)
4 | [![python Release](https://img.shields.io/badge/python-3.8-brightgreen)](https://www.python.org/downloads/)
5 | [![R Release](https://img.shields.io/badge/R-3.6.3-brightgreen)](https://cran.r-project.org/)
6 | [![DOI](https://zenodo.org/badge/480019097.svg)](https://zenodo.org/badge/latestdoi/480019097)
7 |
8 | **Abbreviations**
9 | * APA: alternative polyadenylation
10 | * 3'aQTL: 3′UTR alternative polyadenylation quantitative trait loci
11 |
12 | This pipeline describes step-by-step methods for analyzing dynamic alternative polyadenylation events across population-scale samples and for performing association analysis between common genetic variants and APA usage, yielding a map of the genetic regulation of APA.
13 |
14 | The scripts in this repository have been tested on 89 samples from the Geuvadis RNA-seq Project and the GTEx Project.
15 | For conditions on reuse of these scripts, please refer to the LICENSE file.
16 |
17 | ## Using this pipeline
18 | Details on how to prepare the environment and use the scripts can be found on the [GitHub wiki](https://github.com/3UTR/3aQTL-pipe/wiki) pages for this repository.
19 |
20 | This pipeline relies on [DaPars2](https://github.com/3UTR/DaPars2) for APA quantification, [Matrix-eQTL](http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/) for association mapping, and [susieR](https://github.com/stephenslab/susieR) for fine-mapping.
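A typical end-to-end run chains the drivers in `src/` roughly as follows (an illustrative sketch; the input file names here are hypothetical, and the wiki walkthrough is authoritative):

```bash
# 1. Prepare wig files, read depths, the 3'UTR reference and the DaPars2 configure file
bash src/prepare_inputs_for_apa_quant.sh -s sample_list.txt -g refseq_anno.bed -r refID2symbol.txt -t 8 -c 15
# 2. Quantify APA with DaPars2, then merge the per-chromosome results
python src/DaPars2_Multi_Sample_Multi_Chr.py Dapars2_running_configure.txt chrList.txt
Rscript src/merge_apa_quant_res_by_chr.R -s sample_list.txt -c chrList.txt
# 3. Build the phenotype/genotype/covariate matrices and map 3'aQTLs with Matrix-eQTL
bash src/prepare_inputs_for_3aQTL_mapping.sh
Rscript src/run_3aQTL_mapping.R
# 4. Prepare per-gene inputs and fine-map significant loci with susieR
bash src/prepare_inputs_for_finemapping.sh
bash src/run_fine_mapping.sh
```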
22 |
23 | ## Authors
24 |
25 | Xudong Zou, Ruofan Ding, Wenyan Chen, Gao Wang, Shumin Cheng, Wei Li, Lei Li
26 |
27 | Institute of Systems and Physical Biology, Shenzhen Bay Laboratory, Shenzhen 518055, China
28 |
29 | ## Citation
30 | * Code and Execution:
31 |
32 | **Using population-scale transcriptomic and genomic data to map 3' UTR alternative polyadenylation quantitative trait loci**
33 |
34 | Xudong Zou, Ruofan Ding, Wenyan Chen, Gao Wang, Shumin Cheng, Qin Wang, Wei Li, Lei Li. ***STAR Protocols***, 3(3):101566 **(2022)**.
35 | DOI: https://doi.org/10.1016/j.xpro.2022.101566
36 | https://www.sciencedirect.com/science/article/pii/S2666166722004464?via%3Dihub
37 |
38 | * The first 3'aQTL atlas of human tissues:
39 |
40 | **An atlas of alternative polyadenylation quantitative trait loci contributing to complex trait and disease heritability**
41 |
42 | Lei Li, Kai-Lieh Huang, Yipeng Gao, Ya Cui, Gao Wang, Nathan D. Elrod, Yumei Li, Yiling Elaine Chen, Ping Ji, Fanglue Peng, William K. Russell, Eric J. Wagner & Wei Li. ***Nature Genetics***, 53, 994-1005 **(2021)**. DOI: https://doi.org/10.1038/s41588-021-00864-5
43 |
44 | https://www.nature.com/articles/s41588-021-00864-5
45 |
46 | ## Contact
47 | For any issues, please create a GitHub Issue.
48 |
49 | ## Funding
50 | This work was supported by the National Natural Science Foundation of China (no. 32100533) and startup funds from Shenzhen Bay Laboratory to L.L.
--------------------------------------------------------------------------------
/src/run_fine_mapping.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This is a sub-pipe of 3'aQTL-pipe; this script performs fine-mapping of the 3'aQTLs detected by Matrix-eQTL, using susieR
3 | # @Xudong Zou, zouxd@szbl.ac.cn
4 | # 2022-03-30
5 |
6 | # -- Usage function
7 | script_name=$0
8 | function usage(){
9 | echo "#=============================="
10 | echo "Default usage:"
11 | echo "#=============================="
12 | echo "bash $script_name"
13 | echo "Options:"
14 | echo " -w integer, set the window size around the genes for fine-mapping"
15 | echo " -p float, specify the minimum PIP for filtering fine-mapped 3'aQTLs"
16 | echo " -L integer, specify the L value for susieR, default 10"
17 | echo " -V float, specify the scaled prior variance used in susieR, default 0.2"
18 | echo " -t integer, set the number of threads to run susieR in parallel, default 1"
19 | echo " -h print the help information"
20 | exit 1
21 | }
22 |
23 | # define global variables from command parameters
24 | currDir=`pwd`
25 | sourceDir="./src"
26 | PIP="0.1"
27 | Variance="0.2"
28 | L="10"
29 | Threads="1"
30 | window=`echo "1e6"|awk '{printf("%d",$0)}'`
31 |
32 | while getopts :w:p:L:V:t:h opt
33 | do
34 | case $opt in
35 | w)
36 | window=`echo "$OPTARG"|awk '{printf("%d",$0)}'`
37 | ;;
38 | p)
39 | PIP="$OPTARG"
40 | ;;
41 | L)
42 | L="$OPTARG"
43 | ;;
44 | V)
45 | Variance="$OPTARG"
46 | ;;
47 | t)
48 | Threads="$OPTARG"
49 | ;;
50 | h)
51 | echo "Help message:"
52 | usage
53 | ;;
54 | :)
55 | echo "The option -$OPTARG requires an argument."
56 | exit 1
57 | ;;
58 | ?)
59 | echo "Invalid option: $OPTARG"
60 | usage
61 | exit 2
62 | ;;
63 | esac
64 | done
65 |
66 | # -- Basic settings
67 | if [ ! -d "${currDir}/FineMapping/input" ]
68 | then
69 | echo "${currDir}/FineMapping/input not found!"
70 | exit
71 | fi
72 |
73 | if [ ! -d "${currDir}/FineMapping/output" ]
74 | then
75 | echo "${currDir}/FineMapping/output not found!"
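# Both FineMapping/input and FineMapping/output are created by
# prepare_inputs_for_finemapping.sh, which should be run first.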
76 | exit
77 | fi
78 |
79 | # -- Main function --
80 | function main(){
81 | echo "Running $script_name with the following parameters:"
82 | echo "*************************************************"
83 | echo "-w: $window"
84 | echo "-p: $PIP"
85 | echo "-L: $L"
86 | echo "-V: $Variance"
87 | echo "-t: $Threads"
88 | echo "*************************************************"
89 | echo "Start 3'aQTL fine mapping ..."
90 | date
91 | echo "Run fine-mapping analysis by susieR"
92 | run_fine_mapping $window $PIP $L $Variance $Threads
93 | echo "Done!"
94 | date
95 | }
96 |
97 |
98 | # -- Other functions --
99 | function run_fine_mapping(){
100 | w=$1
101 | min_PIP=$2
102 | L=$3
103 | Var=$4
104 | threads=$5
105 | if [ ! -f "${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt" ]
106 | then
107 | echo "File ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt not found!"
108 | exit
109 | fi
110 |
111 | if [ $threads -eq 1 ]
112 | then
113 | for gene in `cat ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt|cut -f1`
114 | do
115 | echo "Analyzing $gene"
116 | if [ -d "${currDir}/FineMapping/output/$gene" ]
117 | then
118 | GeneDir=${currDir}/FineMapping/output/$gene
119 | if [ -f "${GeneDir}/3aQTL.vcf" -a -f "${GeneDir}/expr.phen" ]
120 | then
121 | Rscript ${sourceDir}/finemapping.R ${GeneDir} $L $Var $min_PIP &
122 | wait
123 | else
124 | echo "${gene}: files 3aQTL.vcf and expr.phen not found!"
125 | continue
126 | fi
127 | else
128 | echo "${gene} does not exist!"
129 | continue
130 | fi
131 | done
132 | cd $currDir
133 | else
134 | if [ ! -d "${currDir}/FineMapping/output/tmp" ]
135 | then
136 | mkdir -p ${currDir}/FineMapping/output/tmp
137 | fi
138 | split -l $threads -d ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt ${currDir}/FineMapping/output/tmp/finemap_task_ &
139 | wait
140 | for task in `ls ${currDir}/FineMapping/output/tmp/finemap_task_*`
141 | do
142 | for gene in `cat $task |cut -f1`
143 | do
144 | echo "Analyzing $gene"
145 | if [ -d "${currDir}/FineMapping/output/$gene" ]
146 | then
147 | GeneDir=${currDir}/FineMapping/output/$gene
148 | if [ -f "${GeneDir}/3aQTL.vcf" -a -f "${GeneDir}/expr.phen" ]
149 | then
150 | Rscript ${sourceDir}/finemapping.R ${GeneDir} $L $Var $min_PIP &
151 | else
152 | echo "${gene}: files 3aQTL.vcf and expr.phen not found!"
153 | continue
154 | fi
155 | else
156 | echo "${gene} does not exist!"
157 | continue
158 | fi
159 | done
160 | wait
161 | done &
162 | wait
163 | cd $currDir
164 | fi
165 |
166 | }
167 |
168 | # - main
169 | main
--------------------------------------------------------------------------------
/src/run_3aQTL_mapping.R:
--------------------------------------------------------------------------------
1 | #!/opt/app/languages/R-3.6.3/bin/Rscript
2 | library(optparse)
3 | library(MatrixEQTL)
4 |
5 | option_list <- list(
6 | make_option(c("-p","--phenotype"),type="character",default="./Matrix_eQTL/Phenotype_matrix.txt",action="store",help="APA expression data for MatrixEQTL"),
7 | make_option(c("-g","--genotype"),type="character",default="./Matrix_eQTL/Genotype_matrix.txt",action="store",help="Genotype data for MatrixEQTL"),
8 | make_option(c("-c","--covariate"),type="character",default="./Matrix_eQTL/Covariate_matrix.txt",action="store",help="Covariates for MatrixEQTL"),
9 | make_option(c("-s","--snp_location"),type="character",default="./Matrix_eQTL/snp_location.txt",action="store",help="SNP locations"),
10 | make_option(c("-u","--utr_location"),type="character",default="./Matrix_eQTL/3UTR_location.txt",action="store",help="3UTR locations"),
11 | make_option(c("-w","--window"),type="numeric",default=1e6,action="store",help="window size"),
12 | make_option(c("-q","--cis_pvalue"),type="numeric",default=1e-2,action="store",help="p-value threshold for cis-3'aQTLs"),
13 | make_option(c("-Q","--trans_pvalue"),type="numeric",default=1e-5,action="store",help="p-value threshold for trans-3'aQTLs")
14 | )
15 |
16 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
17 |
18 | PHENO <- opt$phenotype
19 | GENO <- opt$genotype
20 | COVARIATE <- opt$covariate
21 | SNPLOC <- opt$snp_location
22 | UTRLOC <- opt$utr_location
23 |
24 | CIS_DISTANCE <- as.numeric(opt$window)
25 | CIS_P_CUTOFF <- as.numeric(opt$cis_pvalue)
26 | TRANS_P_CUTOFF <- as.numeric(opt$trans_pvalue)
27 |
28 | cat('Options:\n','Phenotype:',PHENO,'\n','Genotype:',GENO,'\n','Covariates:',COVARIATE,'\n','CIS_DISTANCE:',CIS_DISTANCE,'\n',
29 | 'CIS_P_CUTOFF:',CIS_P_CUTOFF,'\n','TRANS_P_CUTOFF:',TRANS_P_CUTOFF,'\n')
30 |
31 | # - Use linear model
32 | useModel = modelLINEAR # modelANOVA, modelLINEAR, or modelLINEAR_CROSS
33 |
34 | # - Genotype file name
35 | SNP_file_name = GENO
36 | snps_location_file_name = SNPLOC
37 |
38 | # - APA expression file name
39 | expression_file_name = PHENO
40 | gene_location_file_name = UTRLOC
41 |
42 | # - Covariates file name
43 | covariates_file_name = COVARIATE
44 |
45 | # - output file names
46 | output_file_name_cis = "./Matrix_eQTL/Cis_3aQTL_all_control_gene_exprs.txt"
47 | output_file_name_tra = "./Matrix_eQTL/Trans_3aQTL_all_control_gene_exprs.txt"
48 | output_figure_name_cis = "./Matrix_eQTL/Cis_3aQTL_genotype_info_control_gene_exprs.pdf"
49 | pdf(output_figure_name_cis)
50 |
51 | # - thresholds
52 | pvOutputThreshold_cis = CIS_P_CUTOFF;
53 | pvOutputThreshold_tra = TRANS_P_CUTOFF;
54 |
55 | # - Error covariance matrix
56 | # set to numeric() for identity
57 | errorCovariance = numeric();
58 |
59 | # - Distance for local gene-SNP pairs
60 | cisDist = CIS_DISTANCE;
61 |
62 | # -- load genotype data
63 | snps = SlicedData$new();
64 | snps$fileDelimiter = "\t";
65 | snps$fileOmitCharacters = "NA";
66 | snps$fileSkipRows = 1; # one row of column labels
67 | snps$fileSkipColumns = 1; # one column of row labels
68 | snps$fileSliceSize = 2000; # read file in slices of 2,000 rows
69 | snps$LoadFile(SNP_file_name);
70 |
71 | # -- load apa expression data
72 | gene = SlicedData$new();
73 | gene$fileDelimiter = "\t"; # the TAB character
74 | gene$fileOmitCharacters = "NA"; # denote missing values;
75 | gene$fileSkipRows = 1; # one row of column labels
76 | gene$fileSkipColumns = 1; # one column of row labels
77 | gene$fileSliceSize = 2000; # read file in slices of 2,000 rows
78 | gene$LoadFile(expression_file_name);
79 |
80 | # -- load covariates data
81 | cvrt = SlicedData$new();
82 | cvrt$fileDelimiter = "\t"; # the TAB character
83 | cvrt$fileOmitCharacters = "NA"; # denote missing values;
84 | cvrt$fileSkipRows = 1; # one row of column labels
85 | cvrt$fileSkipColumns = 1; # one column of row labels
86 | if(length(covariates_file_name)>0) {
87 | cvrt$LoadFile(covariates_file_name);
88 | }
89 |
90 |
91 | ## Run the analysis
92 | snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
93 | genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);
94 |
95 | me = Matrix_eQTL_main(
96 | snps = snps,
97 | gene = gene,
98 | cvrt = cvrt,
99 | output_file_name = output_file_name_tra,
100 | pvOutputThreshold = pvOutputThreshold_tra,
101 | useModel = useModel,
102 | errorCovariance = errorCovariance,
103 | verbose = TRUE,
104 | output_file_name.cis = output_file_name_cis,
105 | pvOutputThreshold.cis = pvOutputThreshold_cis,
106 | snpspos = snpspos,
107 | genepos = genepos,
108 | cisDist = cisDist,
109 | pvalue.hist = "qqplot",
110 | min.pv.by.genesnp = TRUE,
111 | noFDRsaveMemory = FALSE);
112 |
113 | gz1 <- "./Matrix_eQTL/data.RData"
114 | save.image(gz1)
115 | #unlink(output_file_name_tra);
116 | #unlink(output_file_name_cis);
117 |
118 | # -- Results
119 | cat('Analysis done in: ', me$time.in.sec, ' seconds', '\n')
120 | cat('Detected local aQTLs:', '\n');
121 | show(me$cis$eqtls)
122 | cat('Detected distant aQTLs:', '\n');
123 | show(me$trans$eqtls)
124 |
125 | write.table(me$cis$min.pv.gene, "./Matrix_eQTL/cis.min.pv.gene.txt")
126 | save(gene,snps,file="./Matrix_eQTL/Gene_SNP.RData")
127 | save(snps,genepos,file="./Matrix_eQTL/permutation.RData")
128 |
129 | # -- plot the Q-Q plot of local and distant p-values
130 | plot(me)
131 | dev.off()
--------------------------------------------------------------------------------
/src/prepare_inputs_for_finemapping.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Prepare input data for fine-mapping by susieR
3 | # @Xudong Zou, zouxd@szbl.ac.cn
4 | # 2022-03-30
5 |
6 | # -- Usage function
7 | script_name=$0
8 | function usage(){
9 | echo "#=============================="
10 | echo "Default usage:"
11 | echo "#=============================="
12 | echo "bash $script_name"
13 | echo "Options:"
14 | echo " -g text file, the genotype matrix used in Matrix_eQTL, default is ./Matrix_eQTL/Genotype_matrix.txt"
15 | echo " -p text file, the phenotype matrix used in Matrix_eQTL, default is ./Matrix_eQTL/Phenotype_matrix.txt"
16 | echo " -a text file, the cis association list returned by Matrix-eQTL, default is ./Matrix_eQTL/Cis_3aQTL_all_control_gene_exprs.txt"
17 | echo " -u text file, the 3'UTR location file, the one also used by Matrix-eQTL, default is ./Matrix_eQTL/3UTR_location.txt"
18 | echo " -w integer, set the window size around aGenes for fine-mapping"
19 | echo " -q float, specify the maximum FDR for filtering significant 3'aQTL associations, default 0.05"
20 | echo " -h print the help information"
21 | exit 1
22 | }
23 |
24 | # define global variables from command parameters
25 | currDir=`pwd`
26 | genotype="./Matrix_eQTL/Genotype_matrix.txt"
27 | phenotype="./Matrix_eQTL/Phenotype_matrix.txt"
28 | qtl_res="./Matrix_eQTL/Cis_3aQTL_all_control_gene_exprs.txt"
29 | utr_loc="./Matrix_eQTL/3UTR_location.txt"
30 | window=`echo "1e6"|awk '{printf("%d",$0)}'`
31 | FDR="0.05"
32 | sourceDir="./src"
33 | while getopts :g:p:a:u:w:q:h opt
34 | do
35 | case $opt in
36 | g)
37 | genotype="$OPTARG"
38 | ;;
39 | p)
40 | phenotype="$OPTARG"
41 | ;;
42 | a)
43 | qtl_res="$OPTARG"
44 | ;;
45 | u)
46 | utr_loc="$OPTARG"
47 | ;;
48 | w)
49 | window=`echo "$OPTARG"|awk '{printf("%d",$0)}'`
50 | ;;
51 | q)
52 | FDR="$OPTARG"
53 | ;;
54 | h)
55 | echo "Help message:"
56 | usage
57 | ;;
58 | :)
59 | echo "The option -$OPTARG requires an argument."
60 | exit 1
61 | ;;
62 | ?)
63 | echo "Invalid option: $OPTARG"
64 | usage
65 | exit 2
66 | ;;
67 | esac
68 | done
69 |
70 | # -- Basic settings
71 | if [ ! -d "${currDir}/FineMapping/input" ]
72 | then
73 | mkdir -p ${currDir}/FineMapping/input
74 | fi
75 |
76 | if [ ! -d "${currDir}/FineMapping/output" ]
77 | then
78 | mkdir -p ${currDir}/FineMapping/output
79 | fi
80 |
81 | if [ ! -d "${currDir}/Matrix_eQTL" ]
82 | then
83 | echo "No Matrix-eQTL output found!"
84 | exit
85 | fi
86 |
87 | # -- Main function --
88 | function main(){
89 | echo "Running $script_name with the following parameters:"
90 | echo "*************************************************"
91 | echo "-g: $genotype"
92 | echo "-p: $phenotype"
93 | echo "-a: $qtl_res"
94 | echo "-u: $utr_loc"
95 | echo "-w: $window"
96 | echo "-q: $FDR"
97 | echo "*************************************************"
98 | date
99 | echo "Prepare input for susieR"
100 | prepare_input $genotype $phenotype $qtl_res $utr_loc $window $FDR
101 | echo "Done!"
102 | date
103 | }
104 |
105 |
106 | # -- Other functions --
107 | function prepare_input(){
108 | geno=$1
109 | pheno=$2
110 | aQTL=$3
111 | utrLoc=$4
112 | w=$5
113 | fdr=$6
114 | if [ ! -f "$utrLoc" ]
115 | then
116 | echo "File ${utrLoc} not found!"
117 | exit
118 | fi
119 |
120 | if [ ! -f "${geno}" ]
121 | then
122 | echo "File ${geno} not found!"
123 | exit
124 | fi
125 |
126 | echo "Prepare unique aGenes set..."
127 | python ${sourceDir}/prepare_susieR_uniqGene_location.py --utr_loc_file ${utrLoc} \
128 | --aQTL_map ${aQTL} \
129 | --extend_size $w \
130 | --Max_FDR $fdr \
131 | --outdir ${currDir}/FineMapping/input \
132 | --output picked_asso_list.loc_${w}.txt &
133 | wait
134 |
135 | echo "Prepare SNP files in bed format..."
136 | python ${sourceDir}/genotype_2_bed.py --genotype ${geno} \
137 | --out_bed ${currDir}/FineMapping/input/Genotype_matrix.bed \
138 | --out_header ${currDir}/FineMapping/input/Header.txt &
139 | wait
140 |
141 | if [ -f "${currDir}/FineMapping/input/Genotype_matrix.bed" ]
142 | then
143 | sort -k1,1 -k2,2n ${currDir}/FineMapping/input/Genotype_matrix.bed > tmp.bed &
144 | wait
145 | mv tmp.bed ${currDir}/FineMapping/input/Genotype_matrix.bed &
146 | wait
147 | else
148 | echo "File ${currDir}/FineMapping/input/Genotype_matrix.bed not found!"
149 | exit
150 | fi
151 | # create a unique workspace for each gene in "picked_asso_list.loc_${w}.txt", and generate an "expr.phen" for each gene
152 | if [ ! -f "${pheno}" ]
153 | then
154 | echo "File ${pheno} not found!"
155 | exit
156 | fi
157 | echo "Make a directory and generate an expr.phen for each gene"
158 | # expr.phen is written in plink phenotype style (FID IID value), the layout finemapping.R expects
159 | for gene in `cat ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt| cut -f1`
160 | do
161 | mkdir -p ${currDir}/FineMapping/output/$gene
162 | cat ${pheno} | awk -v aGENE=$gene -F"\t" 'BEGIN{OFS="\t"} {if(NR==1){for(i=2;i<=NF;i++) id[i]=$i} else if($1==aGENE){for(i=2;i<=NF;i++) print id[i],id[i],$i}}' > ${currDir}/FineMapping/output/$gene/expr.phen &
163 | wait
164 | done
165 |
166 | echo "Select SNPs within a window of ${w}bp around the gene and generate 3aQTL.vcf in the gene's directory"
167 | while read line
168 | do
169 | gene=`echo $line|awk '{print $1}'`
170 | loc=`echo $line|awk '{print $2}'`
171 | cd ${currDir}/FineMapping/output/$gene
172 | CHR=${loc%:*}
173 | COORD=${loc#*:}
174 | S=${COORD%-*}
175 | E=${COORD#*-}
176 | echo -e "$CHR\t$S\t$E" > gene_loc.bed
177 | cat ${currDir}/FineMapping/input/Header.txt > 3aQTL.vcf
178 | bedtools intersect -a ${currDir}/FineMapping/input/Genotype_matrix.bed -b gene_loc.bed -wa |cut -f4- >> 3aQTL.vcf &
179 | wait
180 | rm gene_loc.bed
181 | done < ${currDir}/FineMapping/input/picked_asso_list.loc_${w}.txt
182 |
183 | cd ${currDir}
184 | }
185 |
186 | # - main
187 | main
--------------------------------------------------------------------------------
/src/DaPars_Extract_Anno.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import os
4 | import sys, getopt
5 | import os.path
6 |
7 |
8 | def Annotation_prepar_3UTR_extraction(gene_bed_file, gene_symbol_map_kfXref_file, output_utr_file):
9 |
10 | output_write = open(output_utr_file,'w')
11 |
12 | refseq_trapt_gene_symbol_dict = {}
13 | num_line = 0
14 | for line in open(gene_symbol_map_kfXref_file, 'r'):
15 | if num_line > 0:
16 | fields = line.strip('\n').strip('\r').split('\t')
17 | gene_symbol = fields[1]
18 | refseq_transcript_id = fields[0]
19 | refseq_trapt_gene_symbol_dict[refseq_transcript_id] = gene_symbol
20 | else:
21 | num_line += 1
22 |
23 | scanned_3UTR_list = []
24 | num_saved = 0
25 | for line in open(gene_bed_file,'r'):
26 | fields = line.strip('\n').split('\t')
27 | refseq_id = fields[3]
28 | if '_' not in fields[0]:
29 |
30 | if refseq_id not in refseq_trapt_gene_symbol_dict:
31 | gene_symbol = "NA"
32 | else:
33 | gene_symbol = refseq_trapt_gene_symbol_dict[refseq_id]
34 |
35 | UTR_id = [refseq_id, gene_symbol,fields[0], fields[5]]
36 | UTR_id_new = '|'.join(UTR_id)
37 | curr_strand = fields[5]
38 | if curr_strand == "+":
39 | UTR_end = fields[2]
40 | gene_start = int(fields[1])
41 | UTR_start = str(gene_start + int(fields[-1].strip(',').split(',')[-1])+1) #1-based
42 | elif curr_strand == "-":
43 | gene_start = int(fields[1])
44 | UTR_start = str(gene_start + 1) #1-based
45 | UTR_end = str(gene_start + int(fields[10].split(',')[0])) #1-based, included
46 |
47 | this_UTR = fields[0]+UTR_start+UTR_end+curr_strand
48 | if this_UTR not in scanned_3UTR_list:
49 | write_line = [fields[0], UTR_start, UTR_end,UTR_id_new, '0', curr_strand]
50 | output_write.writelines('\t'.join(write_line) + '\n')
51 | scanned_3UTR_list.append(this_UTR)
52 | num_saved += 1
53 |
54 |
55 | output_write.close()
56 | print("Total extracted 3' UTRs: " + str(num_saved))
57 |
58 |
59 |
60 | def Subtract_different_strand_overlap(input_gene_bed_file,output_utr_file):
61 | def UTRs_subtract_refine(UTRs_all):
62 | strand_info = UTRs_all[0].strip('\n').split('\t')[-1]
63 | if strand_info == '+':
64 | all_pos = []
65 | for curr_line in UTRs_all:
66 | left_pos = curr_line.strip('\n').split('\t')[1]
67 | all_pos.append(int(left_pos))
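# for '+'-strand transcripts, keep the UTR with the smallest left coordinate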
68 | selected_UTR_index = all_pos.index(min(all_pos))
69 | selected_UTR = UTRs_all[selected_UTR_index]
70 | else:
71 | all_pos = []
72 | for curr_line in UTRs_all:
73 | # for '-'-strand transcripts, keep the UTR with the largest right coordinate
74 | right_pos = curr_line.strip('\n').split('\t')[2]
75 | all_pos.append(int(right_pos))
76 | selected_UTR_index = all_pos.index(max(all_pos))
77 | selected_UTR = UTRs_all[selected_UTR_index]
78 | return selected_UTR
79 | temp_file = "overlap_opposite_strand_subtract.bed"
80 | cmd = 'subtractBed -a %s -b %s -S > %s' % (input_gene_bed_file, input_gene_bed_file, temp_file)
81 | os.system(cmd)
82 |
83 | read_subtract_result_dict = {}
84 | for line in open(temp_file,'r'):
85 | transcript_id = line.split('\t')[3].split('|')[0]
86 | if transcript_id not in read_subtract_result_dict:
87 | read_subtract_result_dict[transcript_id] = []
88 | read_subtract_result_dict[transcript_id].append(line)
89 |
90 | output_utr_write = open(output_utr_file,'w')
91 | for curr_trans_id in read_subtract_result_dict:
92 | curr_3UTRs = read_subtract_result_dict[curr_trans_id]
93 | num_3UTRs = len(curr_3UTRs)
94 | if num_3UTRs == 1:
95 | output_utr_write.writelines(curr_3UTRs[0])
96 | else:
97 | selected_UTR = UTRs_subtract_refine(curr_3UTRs)
98 | output_utr_write.writelines(selected_UTR)
99 | output_utr_write.close()
100 |
101 | try:
102 | os.remove(temp_file)
103 | except OSError:
104 | pass
105 |
106 | def Extract_Anno_main(argv):
107 | gene_bed_file = ''
108 | gene_symbol_annotation_file = ''
109 | output_extract_file = 'temp_anno_extracted.bed'
110 | output_final_extract_file = ''
111 |
112 | try:
113 | opts, args = getopt.getopt(argv,"hb:s:o:",["bed=","symbol=","ofile="])
114 | except getopt.GetoptError:
115 | print('python DaPars_Extract_Anno.py -b <gene_bed_file> -s <gene_symbol_file> -o <output_file>')
116 | sys.exit(2)
117 | for opt, arg in opts:
118 | if opt == '-h':
119 | print('python DaPars_Extract_Anno.py -b <gene_bed_file> -s <gene_symbol_file> -o <output_file>')
120 | sys.exit()
121 | elif opt in ("-b", "--bed"):
122 | gene_bed_file = arg
123 | elif opt in ("-s", "--symbol"):
124 | gene_symbol_annotation_file = arg
125 | elif opt in ("-o", "--ofile"):
126 | output_final_extract_file = arg
127 |
128 | if gene_bed_file=='':
129 | print("Error: No gene bed file!", file=sys.stderr)
130 | exit(1)
131 | if gene_symbol_annotation_file=='':
132 | print("Error: No gene symbol file!", file=sys.stderr)
133 | exit(1)
134 |
135 | if output_final_extract_file=='':
136 | print("Error: No output file!", file=sys.stderr)
137 | exit(1)
138 |
139 | print("Generating regions ...")
140 | Annotation_prepar_3UTR_extraction(gene_bed_file, gene_symbol_annotation_file,output_extract_file)
141 | Subtract_different_strand_overlap(output_extract_file,output_final_extract_file)
142 |
143 | try:
144 | os.remove(output_extract_file)
145 | except OSError:
146 | pass
147 |
148 |
149 | print("Finished")
150 |
151 | if __name__ == '__main__':
152 | Extract_Anno_main(sys.argv[1:])
153 |
--------------------------------------------------------------------------------
/src/curate_pheno_geno_covariates.R:
--------------------------------------------------------------------------------
1 | #!/opt/app/languages/R-3.6.3/bin/Rscript
2 | # 2022-01-08
3 | library(optparse)
4 |
5 | # -- global variable
6 | option_list <- list(
7 | make_option(c("-p", "--pheno_data"),type = "character", default = "Dapars2_res.all_chromosomes.txt", action = "store", help = "Provide the merged output from DaPars2, Dapars2_res.all_chromosomes.txt by default"),
8 | make_option(c("-g","--geno_pca"),type = "character", default = "./Matrix_eQTL/genotype_pca.eigenvec", action = "store", help = "the eigenvectors of the genotype PCA, ./Matrix_eQTL/genotype_pca.eigenvec by default"),
9 | make_option(c("-c","--known_covs"),type = "character", default = "NA", action = "store", help = "input a text file containing known covariates if available, NA by default"),
10 | make_option(c("-n","--top_N_pca"),type = "integer", default = "5", action = "store", help = "specify how many of the top genotype PCs to use, default value = 5")
11 | )
12 | opt <- parse_args(OptionParser(option_list=option_list,usage="usage: %prog [options]"))
13 | apa_res_file <- opt$pheno_data
14 | gtPCA_file <- opt$geno_pca
15 | known_cov_file <- opt$known_covs
16 | topN_pca <- opt$top_N_pca
17 |
18 | cat('Arguments:','\n',
19 | '--pheno_data',apa_res_file,'\n',
20 | '--geno_pca',gtPCA_file,'\n',
21 | '--known_covs',known_cov_file,'\n',
22 | '--top_N_pca',topN_pca,'\n')
23 |
24 | cat('Current directory:')
25 | getwd()
26 |
27 | library(dplyr)
28 | library(peer)
29 | library(impute)
30 | # --------------- prepare covariates -----------------
31 | # load genotype pca
32 | gt_pca <- read.table(gtPCA_file,header=F,sep=" ",stringsAsFactors=F)
33 | N <- as.integer(topN_pca) + 1
34 | gt_pca$V1 <- NULL;gt_pca <- gt_pca[,1:N]
35 | names(gt_pca) <- c("subject_id",paste0("PC_",1:(N-1)))
36 | rm(N)
37 | # add known covariates to the top N genotype PCs if available
38 | if(known_cov_file!="NA"){
39 | known_cov <- read.table(known_cov_file,header=T,sep="\t",stringsAsFactors=F)
40 | dim(known_cov)
41 | N <- dim(known_cov)[2]
42 | for(i in 2:N){
43 | if(class(known_cov[,i])=="character"){
44 | known_cov[,i] <- as.factor(known_cov[,i])
45 | known_cov[,i] <- as.numeric(known_cov[,i])
46 | }
47 | }
48 | col_names <- names(known_cov)[2:N]
49 | names(known_cov) <- c("subject_id",col_names)
50 | gt_pca <- merge(gt_pca,known_cov,by="subject_id")
51 | }
52 |
53 |
54 | # convert the covariates data.frame to a matrix
55 | rownames(gt_pca) <- gt_pca$subject_id;gt_pca <- as.matrix(gt_pca[,-1])
56 | cat('Dimension of gt_pca:',dim(gt_pca),'\n')
57 | rm(known_cov,known_cov_file)
58 |
59 | cat("Start phenotype matrix\n","Open APA results file:",apa_res_file,"\n")
60 | # --------------- prepare phenotype matrix ------------------
61 | pdui_mat <- read.table(apa_res_file, stringsAsFactors=FALSE, header=TRUE,sep="\t",check.names=FALSE)
62 | pdui_mat <- pdui_mat[,-c(2,3,4)]
63 |
64 | pdui_mat.sel <- pdui_mat %>% dplyr::select(all_of(rownames(gt_pca)))
65 | pdui_mat.sel <- as.matrix(pdui_mat.sel)
66 | rownames(pdui_mat.sel) <- pdui_mat[,1]
67 |
68 |
69 | # remove genes with more than 50% of entries missing and individuals with more than 80% missing data
70 | pdui_mat.sel <- pdui_mat.sel[, colMeans(is.na(pdui_mat.sel)) <= 0.8];pdui_mat.sel <- pdui_mat.sel[rowMeans(is.na(pdui_mat.sel)) < 0.5,]
71 | class(pdui_mat.sel) <- 'numeric'
72 |
73 |
74 | # run peer to estimate confounders
75 | cat("Start covariate analysis by peer...")
76 | #save.image(file="run_peer_impute.RData")
77 | model <- PEER()
78 | covs_se <- gt_pca
79 |
80 | PEER_setCovariates(model, covs_se)
81 | dim(PEER_getCovariates(model))
82 | # impute missing values in the PDUI matrix
83 | mat.ds <- pdui_mat.sel
84 | mat_impute <- impute.knn(mat.ds)
85 | # quantile normalization
86 | df_w <- as.data.frame(mat_impute$data)
87 | for(gene in 1:nrow(df_w)){
88 | mat = df_w[gene,]
89 | mat = apply(mat,1,rank,ties.method = "average")
90 | mat = qnorm(mat / (ncol(df_w)+1))
91 | df_w[gene,] = mat
92 | }
93 |
94 | pdui_mat <- cbind(rownames(mat_impute$data),df_w)
95 | y <- colnames(pdui_mat)[-1]
96 | colnames(pdui_mat) <- c("Gene",y)
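# pdui_mat now holds the imputed, quantile-normalised PDUI matrix that is
# written out below as the Matrix-eQTL phenotype input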
97 | id_order <- colnames(pdui_mat)[-1]
98 | cat("Output phenotype matrix:\n")
99 | write.table(pdui_mat,file="./Matrix_eQTL/Phenotype_matrix.txt",row.names=F,col.names=T,quote=F,sep="\t")
100 | rm(y)
101 |
102 | PEER_setPhenoMean(model, t(as.matrix(mat_impute$data)))
103 |
104 | dim(PEER_getPhenoMean(model))
105 |
106 | # set the number of peer factors
107 | ## N < 150: use 15 PEER factors; 150 <= N < 250: use 30; N >= 250: use 35
108 | if (ncol(mat.ds) < 150) {
109 | numcov <- 15
110 | } else if (ncol(mat.ds) < 250) {
111 | numcov <- 30
112 | } else if (ncol(mat.ds) >= 250) {
113 | numcov <- 35
114 | }
115 |
116 | PEER_setNk(model, numcov)
117 | PEER_getNk(model)
118 |
119 | PEER_update(model)
120 |
121 | # diag
122 | pdf('peer.diag.pdf', width=6, height=8)
123 | PEER_plotModel(model)
124 | dev.off()
125 |
126 |
127 | factors = t(PEER_getX(model))
128 | weights = PEER_getW(model)
129 | precision = PEER_getAlpha(model)
130 |
131 | residuals = t(PEER_getResiduals(model))
132 | rownames(residuals) <- rownames(mat.ds)
133 | colnames(residuals) <- colnames(mat.ds)
134 |
135 | rownames(factors) <- c(colnames(gt_pca), paste0("PEER_",1:numcov))
136 | colnames(factors) <- colnames(mat.ds)
137 |
138 | residuals.ds <- residuals
139 |
140 | #png(paste0(loop.pop[i], '.expr.peer.clust.png'), width=8, height=8, res=150, units='in')
141 | #heatmap.2(as.matrix(residuals.ds), distfun=function(x) dist(x,method='euclidian'), hclustfun=function(x) hclust(x,method='ward.D2'),
142 | # trace='none', dendrogram='both', Rowv=TRUE, Colv=TRUE, breaks=pairs.breaks, col=colorRampPalette(myCols), scale='none', symkey=T, na.color='grey', density.info='histogram', cexRow=0.2, cexCol=0.5, main=paste0(TISSUE, '\nexpr clustering'))
143 | #dev.off()
144 |
145 | gz1 <- "pdui.peer.residuals.txt"
146 | write.table(cbind(rownames(residuals), residuals), file=gz1, row.names=FALSE, col.names=c("id",colnames(residuals)), quote=FALSE, sep='\t')
147 | rm(model,mat.ds,mat_impute,weights,precision,residuals,gz1)
148 |
149 | # --------------------- prepare genotype matrix
150 | cat("Load genotype_matrix.bed:\n")
151 | gt_mat <- read.table("./Matrix_eQTL/genotype_matrix.bed",header=T,sep="\t",check.names=FALSE)
152 | dim(gt_mat)
153 | gt_mat.reorder <- gt_mat %>% dplyr::select("id",all_of(id_order))
154 |
155 | cat("Output genotype:\n")
156 | write.table(gt_mat.reorder,file="./Matrix_eQTL/Genotype_matrix.txt",quote=F,sep="\t",row.names=F,col.names=T)
157 |
158 | rm(gt_mat)
159 |
160 | # -------------------- prepare covariates matrix
161 | factors.df <- cbind(rownames(factors),factors)
162 | colnames(factors.df) <- c("id",colnames(factors))
163 | factors.df <- as.data.frame(factors.df)
164 | factors.reorder <- factors.df %>% dplyr::select("id",all_of(id_order))
165 | covariate_file <- "./Matrix_eQTL/Covariate_matrix.txt"
166 | write.table(factors.reorder, file=covariate_file, row.names=FALSE,quote=FALSE, sep='\t',col.names=T)
167 | rm(factors,factors.df,id_order)
--------------------------------------------------------------------------------
/src/prepare_inputs_for_apa_quant.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script takes bam files, a gene annotation (bed), and an ID mapping file (between RefSeq IDs and gene symbols),
3 | # and generates bedgraph files, read depths, and the 3'UTR reference region
4 | # Both the gene annotation file (in bed format) and the RefSeq-to-gene-symbol ID mapping are required to execute this script
5 | # @Xudong Zou, zouxd@szbl.ac.cn
6 | # 2022-04-21 7 | 8 | # -- Usage function 9 | script_name=$0 10 | function usage(){ 11 | echo "#==============================" 12 | echo "Default usage:" 13 | echo "#==============================" 14 | echo "bash $script_name -s <sample_list> -g <gene_annotation.bed> -r <refseq2symbol.txt> -t <threads> -c <coverage_cutoff> -o <config_name>" 15 | echo "Options:" 16 | echo " -s text file,input a text file containing all samples (column 1) and corresponding bam files (column 2)" 17 | echo " -g text file,provide a RefSeq gene annotation file extracted from UCSC" 18 | echo " -r text file,provide a file listing the ID mapping between RefSeq transcripts and gene names" 19 | echo " -t integer,specify the number of threads used to run Dapars2 in parallel,default=8" 20 | echo " -c integer,define the threshold of read coverage at the alternative APA site,default=15" 21 | echo " -o file name,specify a name for the configure file which will be used by DaPars2" 22 | echo " -h print the help information" 23 | exit 1 24 | } 25 | 26 | # define global variables from command parameters 27 | currDir=`pwd` 28 | BAMlist="sample_list.txt" 29 | GeneAnno="" 30 | RefIDmap="" 31 | P=8 32 | Cutoff_Cov=15 33 | Config="Dapars2_running_configure.txt" 34 | 35 | while getopts :s:g:r:t:c:o:h opt 36 | do 37 | case $opt in 38 | s) 39 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found!"; exit;fi 40 | BAMlist="$OPTARG" 41 | ;; 42 | g) 43 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found!";exit;fi 44 | GeneAnno="$OPTARG" 45 | ;; 46 | r) 47 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found!";exit;fi 48 | RefIDmap="$OPTARG" 49 | ;; 50 | t) 51 | P="$OPTARG" 52 | ;; 53 | c) 54 | Cutoff_Cov="$OPTARG" 55 | ;; 56 | o) 57 | Config="$OPTARG" 58 | ;; 59 | h) 60 | echo "Help message:" 61 | usage 62 | ;; 63 | :) 64 | echo "The option -$OPTARG requires an argument." 65 | exit 1 66 | ;; 67 | ?) 68 | echo "Invalid option: $OPTARG" 69 | usage 70 | exit 2 71 | ;; 72 | esac 73 | done 74 | 75 | 76 | # -- Main function -- 77 | function main(){ 78 | echo "Running $script_name with the following parameters:" 79 | echo "*************************************************" 80 | echo "-s: $BAMlist" 81 | echo "-g: $GeneAnno" 82 | echo "-r: $RefIDmap" 83 | echo "-t: $P" 84 | echo "-c: $Cutoff_Cov" 85 | echo "-o: $Config" 86 | echo "*************************************************" 87 | echo "Start preparing inputs for Dapars2 ..." 88 | date 89 | echo "Convert bam to bedgraph format ..." 90 | run_bam2bedgraph $BAMlist $P 91 | echo "Counting total aligned reads by samtools flagstat ..." 92 | run_samtools_flagstat $BAMlist $P 93 | echo "Generating the 3' UTR reference ..." 94 | generate_3utr_reference $GeneAnno $RefIDmap 95 | echo "Generating the wigFile_and_readDepth.txt ..." 96 | generate_wigFileList_with_readDepth $BAMlist 97 | echo "Generating a configure file for Dapars2 ..." 98 | generate_configure_for_dapars2 $Cutoff_Cov $P 99 | wait 100 | echo "Done!" 101 | date 102 | } 103 | 104 | 105 | # -- Other functions -- 106 | function generate_configure_for_dapars2(){ 107 | N_cov=$1 108 | N_threads=$2 109 | python ./src/generate_configure_for_Dapars2.py --annotation_3utr ${currDir}/refseq_3utr_annotation.bed \ 110 | --wigFile_depth ${currDir}/wigFile_and_readDepth.txt \ 111 | --coverage_threshold $N_cov \ 112 | --threads $N_threads \ 113 | --out_config_name ${currDir}/${Config} & 114 | wait 115 | echo "Generated file ${Config} in ${currDir}" 116 | } 117 | function generate_wigFileList_with_readDepth(){ 118 | bamList=$1
119 | if [ ! -f "$bamList" ] 120 | then 121 | echo "File $bamList not found!" 122 | exit 123 | fi 124 | python ./src/extract_read_depth.py --sample_list $bamList --path_wig ${currDir}/wig --output ${currDir}/wigFile_and_readDepth.txt & 125 | wait 126 | echo "Generated file wigFile_and_readDepth.txt" 127 | 128 | } 129 | function generate_3utr_reference(){ 130 | gene_anno=$1 131 | refID2Symbol=$2 132 | python ./src/DaPars_Extract_Anno.py -b ${gene_anno} -s $refID2Symbol -o ${currDir}/refseq_3utr_annotation.bed & 133 | wait 134 | echo "Generated refseq_3utr_annotation.bed" 135 | } 136 | 137 | function run_bam2bedgraph(){ 138 | bamList=$1 139 | N_jobs=$2 140 | if [ ! -d "${currDir}/tmp" ] 141 | then 142 | mkdir -p ${currDir}/tmp 143 | fi 144 | 145 | if [ ! -d "${currDir}/wig" ] 146 | then 147 | mkdir -p ${currDir}/wig 148 | fi 149 | 150 | if [ ! -f "$bamList" ] 151 | then 152 | echo "File $bamList not found!" 153 | exit 154 | fi 155 | 156 | N_samples=`cat $bamList|wc -l` 157 | echo "$N_samples bam files waiting for processing." 158 | if [ $N_jobs -lt $N_samples ] 159 | then 160 | split -l $N_jobs $bamList -d ${currDir}/tmp/task_ & 161 | wait 162 | else 163 | echo -e "Number of parallel jobs exceeds the total number of tasks.\n$N_samples threads will be used!" 164 | cat $bamList > $currDir/tmp/task_00 & 165 | wait 166 | fi 167 | 168 | i_task=1 169 | for task in `ls ${currDir}/tmp/task_*` 170 | do 171 | echo "Start subtask ${i_task}..." 172 | while read line 173 | do 174 | sample=`echo $line | awk '{print $1}'` 175 | bam=`echo $line | awk '{print $2}'` 176 | echo $sample 177 | bedtools genomecov -ibam ${bam} -bga -split -trackline > ${currDir}/wig/${sample}.wig & 178 | done < $task 179 | wait 180 | echo "Subtask $i_task finished!" 181 | (( i_task += 1 )) 182 | done & 183 | wait 184 | 185 | echo "$N_samples bam files processed" 186 | rm ${currDir}/tmp/* 187 | rmdir ${currDir}/tmp 188 | date 189 | } 190 | 191 | function run_samtools_flagstat(){ 192 | bamList=$1 193 | N_jobs=$2 194 | if [ ! -d "${currDir}/tmp" ] 195 | then 196 | mkdir -p ${currDir}/tmp 197 | fi 198 | 199 | if [ ! -f "$bamList" ] 200 | then 201 | echo "File $bamList not found!" 202 | exit 203 | fi 204 | N_samples=`cat $bamList|wc -l` 205 | echo "$N_samples bam files waiting for processing." 206 | if [ $N_jobs -lt $N_samples ] 207 | then 208 | split -l $N_jobs $bamList -d ${currDir}/tmp/task_ & 209 | wait 210 | else 211 | echo -e "Number of parallel jobs exceeds the total number of tasks.\n$N_samples threads will be used!" 212 | cat $bamList > ${currDir}/tmp/task_00 & 213 | wait 214 | fi 215 | 216 | i_task=1 217 | for task in `ls ${currDir}/tmp/task_*` 218 | do 219 | echo "Start subtask ${i_task}..." 220 | while read line 221 | do 222 | sample=`echo $line | awk '{print $1}'` 223 | bam=`echo $line | awk '{print $2}'` 224 | echo $sample 225 | samtools flagstat -@ 2 ${bam} > ${currDir}/tmp/${sample}.flagstat & 226 | done < $task 227 | wait 228 | echo "Subtask $i_task finished!"
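# (added note) each task_* file produced by split holds up to N_jobs samples; the background
# flagstat jobs of a batch are collected by the 'wait' above, so at most N_jobs samples run in parallel.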
229 | (( i_task += 1 )) 230 | done 231 | } 232 | 233 | 234 | # -- run main 235 | main 236 | -------------------------------------------------------------------------------- /src/prepare_inputs_for_3aQTL_mapping.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # -- usage function 4 | script_name=$0 5 | function usage(){ 6 | echo "#==============================" 7 | echo "Default usage:" 8 | echo "bash $script_name -g -p -c -s -m <0.05> -n <5>" 9 | echo "Options:" 10 | echo " -g text file,input a text file contains VCF file(s), default=vcf_list.txt" 11 | echo " -p text file,input the merged DaPars2 results, default=Dapars2_res.all_chromosomes.txt" 12 | echo " -c text file,input the known covariates like age and gender, default=NA" 13 | echo " -s text file,input a text file contains the list of samples, default=sample_list.txt" 14 | echo " -m float,minor allele frequency threshold for selecting common genetic variants, default=0.05" 15 | echo " -n integer,the top N genotype PCA components to be used as covariates, default=5" 16 | echo " -h print the help information" 17 | exit 1 18 | 19 | } 20 | # define global variables from command parameters 21 | currDir=`pwd` 22 | VCFLIST="vcf_list.txt" 23 | APA_RES="Dapars2_res.all_chromosomes.txt" 24 | KNOWN_COV="NA" 25 | SAMPLES="sample_list.txt" 26 | MAF="0.05" 27 | TOP_N="5" 28 | 29 | while getopts g:p:c:s:m:n:h opt 30 | do 31 | case $opt in 32 | g) 33 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 34 | VCFLIST="$OPTARG" 35 | ;; 36 | p) 37 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 38 | APA_RES="$OPTARG" 39 | ;; 40 | c) 41 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 42 | KNOWN_COV="$OPTARG" 43 | ;; 44 | s) 45 | if [ ! -f "$OPTARG" ];then echo "File $OPTARG not found";exit 0;fi 46 | SAMPLES="$OPTARG" 47 | ;; 48 | m) 49 | MAF="$OPTARG" 50 | ;; 51 | n) 52 | TOP_N="$OPTARG" 53 | ;; 54 | h) 55 | echo "Help message:" 56 | usage 57 | ;; 58 | :) 59 | echo "The option -$OPTARG requires an argument." 60 | exit 1 61 | ;; 62 | ?) 63 | echo "Invalid option: $OPTARG" 64 | usage 65 | exit 2 66 | ;; 67 | esac 68 | 69 | done 70 | 71 | # -- Basic settings 72 | if [ ! -d "${currDir}/tmp" ] 73 | then 74 | mkdir -p ${currDir}/tmp 75 | echo "Create a directory called tmp/" 76 | fi 77 | 78 | if [ ! -d "${currDir}/Matrix_eQTL" ] 79 | then 80 | mkdir -p ${currDir}/Matrix_eQTL 81 | echo "Create a directory called Matrix_eQTL/" 82 | fi 83 | 84 | # -- Main function -- 85 | function main(){ 86 | date 87 | echo "Start..." 88 | echo "Convert VCF file into 012 format..." 89 | generate_gt_matrix $VCFLIST $MAF $SAMPLES 90 | echo "PCA analysis on genotype by PLINK1.9 ..." 91 | PCA_on_genotype $VCFLIST $SAMPLES $MAF 92 | echo "Curate phenotype matrix, genotype matrix, and covariate matrix for Matrix-eQTL" 93 | curate_pheno_geno_covariates $APA_RES $KNOWN_COV $TOP_N 94 | echo "extracting 3UTR location and SNP location:" 95 | snp_and_3utr_location $APA_RES 96 | echo "Done!" 97 | date 98 | } 99 | 100 | function snp_and_3utr_location(){ 101 | dapars2_res=$1 102 | python ./src/extract_SNP_location.py --genotype_bed ./Matrix_eQTL/genotype_matrix.bed --output ${currDir}/Matrix_eQTL/snp_location.txt & 103 | wait 104 | python ./src/extract_3UTR_location.py --dapars_res $dapars2_res --output ${currDir}/Matrix_eQTL/3UTR_location.txt & 105 | wait 106 | echo "Two location files: snp_location.txt and 3UTR_location.txt are generated!" 
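# (added note, assumption) snp_location.txt and 3UTR_location.txt are the variant and gene
# position tables, presumably consumed as the snpspos/genepos inputs for cis-QTL mapping
# with Matrix eQTL (see run_3aQTL_mapping.R).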
107 | } 108 | function curate_pheno_geno_covariates(){ 109 | dapars2_res=$1 110 | known_cov=$2 111 | topN=$3 112 | Rscript ./src/curate_pheno_geno_covariates.R -p $dapars2_res -c $known_cov -n $topN 113 | } 114 | # PCA analysis on genotype by PLINK 1.9 115 | function PCA_on_genotype(){ 116 | vcfList=$1 117 | keep_inds=$2 118 | maf=$3 119 | 120 | if [ ! -f "$vcfList" ] 121 | then 122 | echo "$vcfList does not exist!" 123 | exit 124 | fi 125 | 126 | if [ ! -f "$keep_inds" ] 127 | then 128 | echo "File $keep_inds not found!" 129 | exit 130 | fi 131 | 132 | 133 | for vcf in `cat $vcfList` 134 | do 135 | if [ ! -f "$vcf" ] 136 | then 137 | echo "$vcf does not exist!" 138 | exit 139 | else 140 | filename=`basename $vcf .gz` 141 | vcf_body=${filename%.*} 142 | 143 | plink --vcf $vcf --const-fid --out ${currDir}/tmp/${vcf_body}.plink & 144 | wait 145 | 146 | echo "${vcf_body}.plink.bed ${vcf_body}.plink.bim ${vcf_body}.plink.fam" >> ${currDir}/tmp/merge_list.txt 147 | fi 148 | done 149 | sleep 10 150 | 151 | cd ${currDir}/tmp 152 | N=`cat merge_list.txt|wc -l` 153 | 154 | if [ $N -gt 1 ] 155 | then 156 | tmp=`cat merge_list.txt|head -n 1|awk '{print $1}'` 157 | firstVCF=${tmp%.*} 158 | cat merge_list.txt|tail -n+2 > tmp.txt 159 | cat tmp.txt > merge_list.txt 160 | rm tmp.txt 161 | 162 | echo "Merging genotypes of multiple chromosomes into one file: merged_plink.* ..." 163 | plink --bfile $firstVCF --merge-list merge_list.txt --out merged_plink --allow-extra-chr & 164 | wait 165 | 166 | else 167 | cd $currDir 168 | vcf=`cat $vcfList` 169 | filename=`basename $vcf .gz` 170 | vcf_body=${filename%.*} 171 | mv ${currDir}/tmp/${vcf_body}.plink.bed ${currDir}/tmp/merged_plink.bed 172 | mv ${currDir}/tmp/${vcf_body}.plink.bim ${currDir}/tmp/merged_plink.bim 173 | mv ${currDir}/tmp/${vcf_body}.plink.fam ${currDir}/tmp/merged_plink.fam 174 | 175 | fi 176 | 177 | # extract selected samples in plink file 178 | cd ${currDir} 179 | cat $keep_inds | awk '{print "0",$1}' > ${currDir}/tmp/keep.list 180 | cd ${currDir}/tmp 181 | plink --bfile merged_plink --keep keep.list --geno 0.02 --hwe 0.000001 --maf $maf --make-bed --out merged_plink_QC & 182 | wait 183 | sleep 20 184 | 185 | # pca analysis 186 | plink --bfile merged_plink_QC --indep-pairwise 50 5 0.2 --out merged_plink_QC & 187 | wait 188 | plink --bfile merged_plink_QC --extract merged_plink_QC.prune.in --pca 30 --out genotype_pca & 189 | wait 190 | 191 | # move PCA results to the Matrix_eQTL inputs for further analysis 192 | if [ -f "genotype_pca.eigenvec" ] 193 | then 194 | cp genotype_pca.eigenvec ${currDir}/Matrix_eQTL 195 | echo "Copied genotype_pca.eigenvec to ${currDir}/Matrix_eQTL" 196 | 197 | cd $currDir 198 | else 199 | echo "genotype_pca.eigenvec not found!" 200 | exit 201 | fi 202 | 203 | } 204 | 205 | # recode genotype into 012 format from VCF 206 | function generate_gt_matrix(){ 207 | vcfList=$1 208 | maf=$2 209 | keep_inds=$3 210 | 211 | cat $keep_inds |cut -f1 > ${currDir}/tmp/keep_inds.txt 212 | 213 | for vcf in `cat $vcfList` 214 | do 215 | if [ ! -f "$vcf" ] 216 | then 217 | echo "$vcf does not exist!" 218 | exit 219 | else 220 | last_suffix=${vcf##*.} 221 | if [ $last_suffix = "gz" ] 222 | then 223 | filename=`basename $vcf` 224 | tmp=${filename%.*} 225 | vcf_body=${tmp%.*} 226 | echo "Extract genotype from $vcf ..." 227 | vcftools --gzvcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --extract-FORMAT-info GT & 228 | 229 | echo "Extract allele frequency from $vcf ..."
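# (added note) the .frq output is later passed to src/recode_with_012.py together with the
# GT matrix, presumably so alleles can be oriented by frequency when recoding genotypes
# as 0/1/2 alternate-allele counts.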
230 | vcftools --gzvcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --freq & 231 | 232 | wait 233 | echo "$vcf Done!" 234 | elif [ $last_suffix = "vcf" ] 235 | then 236 | filename=`basename $vcf` 237 | vcf_body=${filename%.*} 238 | echo "Extract genotype from $vcf ..." 239 | vcftools --vcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --extract-FORMAT-info GT & 240 | 241 | echo "Extract allele frequence from $vcf ..." 242 | vcftools --vcf $vcf --out ${currDir}/tmp/${vcf_body}.gt_filtering --remove-filtered-all --keep $keep_inds --maf $maf --max-missing-count 10 --freq & 243 | wait 244 | echo "$vcf Done!" 245 | else 246 | echo "Unrecognized file format." 247 | exit 248 | 249 | fi 250 | 251 | fi 252 | done & 253 | wait 254 | 255 | for frq in `ls ${currDir}/tmp/*.frq` 256 | do 257 | filename=`basename $frq` 258 | sample=${filename%.*} 259 | python ./src/recode_with_012.py --frq $frq --GT ${currDir}/tmp/${sample}.GT.FORMAT --output ${currDir}/tmp/${sample}.GT.bed & 260 | wait 261 | done & 262 | wait 263 | 264 | cat ${currDir}/tmp/*.GT.bed |head -n 1 > ${currDir}/Matrix_eQTL/genotype_matrix.bed & 265 | wait 266 | for bed in `ls ${currDir}/tmp/*.GT.bed` 267 | do 268 | cat $bed |tail -n+2 >> ${currDir}/Matrix_eQTL/genotype_matrix.bed & 269 | wait 270 | done 271 | 272 | echo "Genotype matrix has been generated: ./Matrix_eQTL/genotype_matrix.bed" 273 | rm ${currDir}/tmp/* 274 | 275 | } 276 | 277 | # -- main 278 | main 279 | -------------------------------------------------------------------------------- /src/Dapars2_Multi_Sample.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import datetime 5 | import threading 6 | import scipy as sp 7 | import scipy.stats 8 | from multiprocessing import Pool 9 | from bisect import bisect 10 | 11 | import math 12 | import time 13 | 14 | import multiprocessing 15 | 16 | 17 | def time_now():#return time 18 | curr_time = datetime.datetime.now() 19 | return curr_time.strftime("%c") 20 | 21 | def Convert_wig_into_bp_coverage(extracted_coverage,extracted_3UTR_region,strand_info): 22 | bp_coverage = np.zeros(extracted_3UTR_region[-1] - extracted_3UTR_region[0]) 23 | relative_start = extracted_3UTR_region[0] 24 | for i in range(len(extracted_coverage)): 25 | curr_region_start = extracted_3UTR_region[i] - relative_start 26 | curr_region_end = extracted_3UTR_region[i+1] - relative_start 27 | bp_coverage[curr_region_start:curr_region_end] = extracted_coverage[i] 28 | if strand_info == '-': 29 | bp_coverage = bp_coverage[::-1] 30 | 31 | return bp_coverage 32 | 33 | def parse_cfgfile(cfg_file): 34 | '''Parse configure file 35 | ''' 36 | Aligned_Wig_files='' 37 | output_directory='' 38 | Annotated_3UTR_file='' 39 | Output_result_file='' 40 | Coverage_threshold = 1 41 | Num_threads = 1 42 | sequencing_depth_file = '' 43 | 44 | for line in open(cfg_file, 'r'): 45 | if line[0] == '\n' or line[0] == '#': 46 | comments = line; 47 | else: 48 | line = line.rstrip() 49 | command = line.split('='); 50 | if command[0] == 'Aligned_Wig_files': 51 | Aligned_Wig_files = command[1].split(','); 52 | if command[0] == 'Output_directory': 53 | output_directory = command[1] 54 | if output_directory[-1] != '/': 55 | output_directory += '/' 56 | if command[0] == 'Annotated_3UTR': 57 | Annotated_3UTR_file = command[1] 58 | if command[0] == 
'Output_result_file': 59 | Output_result_file = command[1] 60 | if command[0] == 'sequencing_depth_file': 61 | sequencing_depth_file = command[1] 62 | if command[0] == 'Num_Threads': 63 | Num_threads = int(command[1]) 64 | if command[0] == 'Coverage_threshold': 65 | Coverage_threshold = int(command[1]) 66 | 67 | 68 | if Aligned_Wig_files == '': 69 | print("No aligned BAM file found!", file=sys.stderr) 70 | exit(1) 71 | if output_directory=='': 72 | print("No output directory!", file=sys.stderr) 73 | exit(1) 74 | if Annotated_3UTR_file=='': 75 | print("No annotated 3' UTR file!", file=sys.stderr) 76 | exit(1) 77 | if Output_result_file=='': 78 | print("No result file name!", file=sys.stderr) 79 | exit(1) 80 | if sequencing_depth_file=='': 81 | print("No sequencing depth file!", file=sys.stderr) 82 | exit(1) 83 | 84 | return Aligned_Wig_files, output_directory, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold 85 | 86 | def load_sequencing_depth(depth_file): 87 | seq_depth_list = [] 88 | for line in open(depth_file, 'r'): 89 | fields = line.strip('\n').split('\t') 90 | seq_depth_list.append(int(fields[-1])) 91 | 92 | return np.array(seq_depth_list) 93 | 94 | def De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(argv=None): 95 | '''multiple threads version 96 | ''' 97 | if len(sys.argv) == 1: 98 | print("Please provide the configure file and specify chr name...") 99 | exit(1) 100 | cfg_file = sys.argv[1] 101 | curr_processing_chr = sys.argv[2] 102 | if "chr" not in curr_processing_chr: 103 | curr_processing_chr = "chr" + curr_processing_chr 104 | 105 | print("[%s] Start Analysis ..." % time_now(), file=sys.stderr) 106 | Group1_Tophat_aligned_file, output_directory, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold = parse_cfgfile(cfg_file) 107 | 108 | All_Sample_files = Group1_Tophat_aligned_file[:] 109 | Sample_name = [] 110 | for sample in All_Sample_files: 111 | sample_name = sample.rsplit('.',1)[0] 112 | Sample_name.append(sample_name) 113 | 114 | ##Prepare output directory 115 | output_directory = output_directory.strip('/') + '_' + curr_processing_chr + '/' 116 | d = os.path.dirname(output_directory) 117 | if not os.path.exists(d): 118 | os.makedirs(d) 119 | temp_dir = d + '/tmp/' 120 | if not os.path.exists(temp_dir): 121 | os.makedirs(temp_dir) 122 | 123 | Output_all_prediction_file = output_directory + Output_result_file + '_result_temp.' + curr_processing_chr + '.txt' 124 | Output_result = open(Output_all_prediction_file, 'w') 125 | 126 | num_samples = len(All_Sample_files) 127 | 128 | print("All samples Joint Processing %s ..." % curr_processing_chr, file=sys.stderr) 129 | print("[%s] Loading Coverage ..." % time_now(), file=sys.stderr) 130 | 131 | All_samples_Target_3UTR_coverages, UTR_events_dict = Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Sample_files, Annotated_3UTR_file, Num_threads,curr_processing_chr) 132 | All_samples_sequencing_depths = load_sequencing_depth(sequencing_depth_file) 133 | 134 | print(All_samples_sequencing_depths) 135 | All_sample_coverage_weights = All_samples_sequencing_depths/np.mean(All_samples_sequencing_depths) 136 | 137 | #print All_sample_coverage_weights 138 | print("[%s] Loading Coverage Finished ..." 
% time_now(), file=sys.stderr) 139 | #Write the first line 140 | first_line = ['Gene','fit_value','Predicted_Proximal_APA','Loci'] 141 | for i in range(num_samples): 142 | #curr_long_exp = 'Sample_%s_long_exp' % str(i+1) 143 | #curr_short_exp = 'Sample_%s_short_exp' % str(i+1) 144 | curr_ratio = '%s_PDUI' % str(Sample_name[i]) 145 | #first_line.extend([curr_long_exp,curr_short_exp,curr_ratio]) 146 | first_line.append(curr_ratio) 147 | 148 | Output_result.writelines('\t'.join(first_line) + '\n') 149 | 150 | All_events_ids = list(UTR_events_dict.keys()) 151 | num_threads = Num_threads 152 | Assigned_events_ids_all_threads = Assign_to_different_processor_balance_events(All_events_ids, num_threads) 153 | 154 | num_real_threads = len(Assigned_events_ids_all_threads) 155 | 156 | Output_each_processor_all = [] 157 | for i in range(num_real_threads): 158 | curr_temp_output = temp_dir + 'Each_processor_3UTR_Result_%s.txt' % (str(i+1)) 159 | Output_each_processor_all.append(curr_temp_output) 160 | 161 | processes = [] 162 | for i in range(num_real_threads): 163 | process = multiprocessing.Process(target=Each_Thread_3UTR_estimation_list_version_sample_ids, args=(Assigned_events_ids_all_threads[i], UTR_events_dict, All_sample_coverage_weights, num_samples, Output_each_processor_all[i], All_samples_Target_3UTR_coverages, Coverage_threshold)) 164 | process.start() 165 | processes.append(process) 166 | 167 | for p in processes: 168 | p.join() 169 | 170 | #Combine results 171 | for i in range(num_real_threads): 172 | curr_result = Output_each_processor_all[i] 173 | for line in open(curr_result, 'r'): 174 | Output_result.writelines(line) 175 | Output_result.close() 176 | 177 | #print >> sys.stderr, "[%s] Filtering the Results ..." % time_now() 178 | 179 | #Output_all_filtered_prediction_file = output_directory + Output_result_file + '_results_final.' + curr_processing_chr + '.txt' 180 | #Dapars_Filtering(Output_all_prediction_file, num_samples, Output_all_filtered_prediction_file) 181 | 182 | print("[%s] Finished!" 
% time_now(), file=sys.stderr) 183 | 184 | 185 | def Each_Thread_3UTR_estimation_list_version_sample_ids(curr_thread_UTR_events_ids, UTR_events_dict, All_sample_coverage_weights, num_samples, Output_result_file, All_samples_coverage_shared_dict, Coverage_threshold): 186 | Output_result = open(Output_result_file,'w') 187 | 188 | for curr_3UTR_id in curr_thread_UTR_events_ids: 189 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_id] 190 | region_start = curr_3UTR_structure[1] 191 | region_end = curr_3UTR_structure[2] 192 | curr_strand = curr_3UTR_structure[-2] 193 | UTR_pos = curr_3UTR_structure[-1] 194 | curr_3UTR_all_samples_bp_coverage = [] 195 | 196 | for i in range(num_samples): 197 | curr_sample_curr_3UTR_coverage_wig = All_samples_coverage_shared_dict[curr_3UTR_id, i] 198 | curr_3UTR_curr_sample_bp_coverage = Convert_wig_into_bp_coverage(curr_sample_curr_3UTR_coverage_wig[0], curr_sample_curr_3UTR_coverage_wig[1], curr_strand) 199 | curr_3UTR_all_samples_bp_coverage.append(curr_3UTR_curr_sample_bp_coverage) 200 | 201 | select_mean_squared_error, selected_break_point, UTR_abundances = De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(curr_3UTR_all_samples_bp_coverage, region_start, region_end,curr_strand,All_sample_coverage_weights, Coverage_threshold) 202 | 203 | if str(select_mean_squared_error) != "Na": 204 | num_non_zero = 1 205 | if num_non_zero > 0: 206 | All_long_inclusion_ratios = [] 207 | line_write = [curr_3UTR_id, "%.1f" % select_mean_squared_error, str(selected_break_point), UTR_pos] 208 | 209 | for i in range(num_samples): 210 | if UTR_abundances[0][i] != 'NA': 211 | # long 3'UTR percentage 212 | curr_sample_ratio = float(UTR_abundances[0][i])/(float(UTR_abundances[0][i]) + float(UTR_abundances[1][i])) 213 | All_long_inclusion_ratios.append(curr_sample_ratio) 214 | #line_write.append("%.2f" % UTR_abundances[0][i])#long 3' UTR abundance 215 | #line_write.append("%.2f" % UTR_abundances[1][i])#short 3' UTR abundance 216 | line_write.append("%.2f" % curr_sample_ratio) 217 | else: 218 | line_write.extend(['NA']*1) 219 | 220 | Output_result.writelines( '\t'.join(line_write) + '\n') 221 | 222 | Output_result.close() 223 | 224 | 225 | def De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(All_Samples_curr_3UTR_coverages, UTR_start, UTR_end, curr_strand, weight_for_second_coverage, Coverage_threshold): 226 | coverage_threshold = Coverage_threshold 227 | search_point_start = 150 ##200 228 | search_point_end = int(abs((UTR_end - UTR_start))*0.05) 229 | 230 | num_samples = len(All_Samples_curr_3UTR_coverages) 231 | #Read Coverage 232 | Region_Coverages = [] 233 | Pass_threshold_index = [] 234 | for i in range(num_samples): 235 | curr_Region_Coverage_raw = All_Samples_curr_3UTR_coverages[i] 236 | curr_Region_Coverage = curr_Region_Coverage_raw/weight_for_second_coverage[i]#@xdzou: not modified yet 237 | 238 | curr_first_100_coverage = np.mean(curr_Region_Coverage_raw[0:99]) 239 | if curr_first_100_coverage > coverage_threshold: 240 | Pass_threshold_index.append(i) 241 | Region_Coverages.append(curr_Region_Coverage) 242 | 243 | least_pass_coverage_num = num_samples * least_pass_coverage_percentage 244 | if len(Pass_threshold_index) > least_pass_coverage_num and UTR_end - UTR_start >=150: 245 | if curr_strand == "+": 246 | search_region = list(range(UTR_start+search_point_start, UTR_end-search_point_end+1)) 247 | else: 248 | search_region = list(range(UTR_end - search_point_start, UTR_start+search_point_end-1, -1)) 249 | 250 | search_region_start = 
search_point_start 251 | search_region_end = UTR_end - UTR_start - search_point_end 252 | Mean_squared_error_list = [] 253 | Estimated_3UTR_abundance_list = [] 254 | for curr_point in range(search_region_start, search_region_end+1): 255 | curr_search_point = curr_point 256 | All_samples_result = [[],[],[]] 257 | for curr_sample_region_coverage in Region_Coverages: 258 | Mean_Squared_error, Long_UTR_abun, Short_UTR_abun = Estimation_abundance(curr_sample_region_coverage, curr_search_point) 259 | All_samples_result[0].append(Mean_Squared_error) 260 | All_samples_result[1].append(Long_UTR_abun) 261 | All_samples_result[2].append(Short_UTR_abun) 262 | 263 | Mean_Squared_error = np.mean(np.array(All_samples_result[0])) 264 | Mean_squared_error_list.append(Mean_Squared_error) 265 | Estimated_3UTR_abundance_list.append([All_samples_result[1],All_samples_result[2]]) 266 | 267 | if len(Mean_squared_error_list) > 1: 268 | min_ele_index = Mean_squared_error_list.index(min(Mean_squared_error_list)) 269 | 270 | select_mean_squared_error = Mean_squared_error_list[min_ele_index] 271 | selected_break_point = search_region[min_ele_index] 272 | 273 | UTR_abundances = [['NA']*num_samples, ['NA']*num_samples] 274 | UTR_abundances_passed = Estimated_3UTR_abundance_list[min_ele_index] 275 | for k in range(len(Pass_threshold_index)): 276 | UTR_abundances[0][Pass_threshold_index[k]] = UTR_abundances_passed[0][k] 277 | UTR_abundances[1][Pass_threshold_index[k]] = UTR_abundances_passed[1][k] 278 | 279 | else: 280 | selected_break_point = 'Na' 281 | UTR_abundances = 'Na' 282 | select_mean_squared_error = 'Na' 283 | 284 | else: 285 | selected_break_point = 'Na' 286 | UTR_abundances = 'Na' 287 | select_mean_squared_error = 'Na' 288 | 289 | return select_mean_squared_error, selected_break_point, UTR_abundances 290 | 291 | 292 | def Estimation_abundance(Region_Coverage, break_point): 293 | Long_UTR_abun = np.mean(Region_Coverage[break_point:]) 294 | Short_UTR_abun = np.mean(Region_Coverage[0:break_point] - Long_UTR_abun) 295 | if Short_UTR_abun < 0: 296 | Short_UTR_abun = 0 297 | Coverage_diff = Region_Coverage[0:break_point] - Long_UTR_abun - Short_UTR_abun 298 | Coverage_diff = np.append(Coverage_diff, Region_Coverage[break_point:] - Long_UTR_abun) 299 | Mean_Squared_error = np.mean(Coverage_diff**2) 300 | 301 | return Mean_Squared_error, Long_UTR_abun, Short_UTR_abun 302 | 303 | 304 | def Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Wig_files,UTR_Annotation_file, num_threads,curr_processing_chr): 305 | num_samples = len(All_Wig_files) 306 | UTR_events_dict = {} 307 | for line in open(UTR_Annotation_file, 'r'): 308 | fields = line.strip('\n').split('\t') 309 | curr_chr = fields[0] 310 | if "chr" not in curr_chr: # prefix bare chromosome names so they match curr_processing_chr 311 | curr_chr = "chr" + curr_chr 312 | 313 | if curr_chr == curr_processing_chr: 314 | region_start = fields[1] 315 | region_end = fields[2] 316 | 317 | curr_strand = fields[-1] 318 | UTR_pos = "%s:%s-%s" %(curr_chr, region_start, region_end) 319 | end_shift = int(round(abs(int(region_start) - int(region_end)) * 0.2)) 320 | if curr_strand == "+": 321 | region_end = str(int(region_end) - end_shift) 322 | else: 323 | region_start = str(int(region_start) + end_shift) 324 | region_start = int(region_start) + 1 325 | region_end = int(region_end) - 1 326 | if region_start + 50 < region_end: 327 | UTR_events_dict[fields[3]] = [fields[0],region_start,region_end,fields[-1],UTR_pos] 328 | 329 | Assigned_index = Assign_to_different_processor_balance(num_samples, num_threads) 330 | 331 | manager = 
multiprocessing.Manager() # create only 1 Manager 332 | All_samples_extracted_3UTR_coverage_dict = manager.dict() # create only 1 dict 333 | 334 | processes = [] 335 | Final_assigned_threads_num = len(Assigned_index) 336 | for i in range(Final_assigned_threads_num): 337 | process = multiprocessing.Process(target=load_wig_funct_shared_dict_sampleid_key, args=(All_Wig_files, Assigned_index[i], UTR_events_dict,curr_processing_chr,All_samples_extracted_3UTR_coverage_dict)) 338 | process.start() 339 | processes.append(process) 340 | 341 | for p in processes: 342 | p.join() 343 | 344 | return All_samples_extracted_3UTR_coverage_dict, UTR_events_dict 345 | 346 | 347 | def load_wig_funct_shared_dict_sampleid_key(All_wig_files, assigned_indexes,UTR_events_dict, curr_processing_chr, All_samples_extracted_3UTR_coverage_dict): 348 | ''' 349 | All_samples_extracted_3UTR_coverage_dict: sample id is the key. 350 | ''' 351 | for i in assigned_indexes: 352 | curr_wig_file = All_wig_files[i] 353 | print(curr_wig_file, file=sys.stderr) 354 | curr_sample_All_chroms_coverage_dict = {} 355 | with open(curr_wig_file, 'r') as fin: 356 | for line in fin: 357 | if line[0] != '#' and line[0] != 't': 358 | fields = line.strip('\n').split('\t') 359 | chrom_name = fields[0] 360 | if chrom_name == curr_processing_chr: 361 | region_start = int(fields[1]) 362 | region_end = int(fields[2]) 363 | 364 | 365 | if chrom_name not in curr_sample_All_chroms_coverage_dict: 366 | curr_sample_All_chroms_coverage_dict[chrom_name] = [[0],[0]] 367 | if region_start > curr_sample_All_chroms_coverage_dict[chrom_name][0][-1]: 368 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_start) 369 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(0) 370 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_end) 371 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(int(float(fields[-1]))) 372 | else: 373 | if len(curr_sample_All_chroms_coverage_dict)>0: 374 | break 375 | fin.close() 376 | if curr_processing_chr not in curr_sample_All_chroms_coverage_dict: 377 | print('no wig: ' + curr_wig_file, file=sys.stderr) 378 | else: 379 | curr_sample_All_chroms_coverage_dict[curr_processing_chr][1].append(0) 380 | 381 | curr_sample_coverage_dict = {} 382 | 383 | for curr_3UTR_event_id in UTR_events_dict: 384 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_event_id] 385 | curr_chr_local = curr_3UTR_structure[0] 386 | if curr_chr_local in curr_sample_All_chroms_coverage_dict: 387 | curr_chr_coverage = curr_sample_All_chroms_coverage_dict[curr_chr_local] 388 | region_start = curr_3UTR_structure[1] 389 | region_end = curr_3UTR_structure[2] 390 | left_region_index = bisect(curr_chr_coverage[0],region_start) 391 | right_region_index = bisect(curr_chr_coverage[0],region_end) 392 | 393 | extracted_coverage = curr_chr_coverage[1][left_region_index:right_region_index+1] 394 | extracted_3UTR_region = curr_chr_coverage[0][left_region_index:right_region_index] 395 | extracted_3UTR_region.insert(0,region_start) 396 | extracted_3UTR_region.append(region_end) 397 | 398 | curr_event_info = [extracted_coverage,extracted_3UTR_region] 399 | All_samples_extracted_3UTR_coverage_dict[curr_3UTR_event_id,i] = curr_event_info 400 | 401 | def Assign_to_different_processor_balance(Total_number, num_processors): 402 | Assigned_results = [] 403 | num_each_processor = int(Total_number/num_processors)#@xdzou: add int() to convert the value into a integer othwise float 404 | 405 | if num_each_processor == 0: 406 | for i in 
range(Total_number): 407 | Assigned_results.append([i]) 408 | else: 409 | remain = Total_number - num_processors * num_each_processor 410 | for i in range(remain): 411 | Assigned_results.append(list(range((i)*(num_each_processor + 1), (i+1)*(num_each_processor + 1)))) 412 | for i in range(num_processors-remain): 413 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 414 | 415 | return Assigned_results 416 | 417 | 418 | def Assign_to_different_processor_balance_events(All_events_ids, num_processors): 419 | Assigned_results = [] 420 | Total_number = len(All_events_ids) 421 | num_each_processor = int(Total_number/num_processors) #@xdzou, add int() 422 | 423 | if num_each_processor == 0: 424 | for i in range(Total_number): 425 | Assigned_results.append([i]) 426 | else: 427 | remain = Total_number - num_processors * num_each_processor 428 | for i in range(remain): 429 | Assigned_results.append(list(range((i)*(num_each_processor+1), (i+1)*(num_each_processor+1)))) 430 | 431 | for i in range(num_processors-remain): 432 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 433 | #print assigned Results 434 | Assigned_events = [] 435 | print('#assigned events:') 436 | for curr_processor_inds in Assigned_results: 437 | curr_processor_events = [] 438 | print(len(curr_processor_inds)) 439 | for curr_ele in curr_processor_inds: 440 | curr_processor_events.append(All_events_ids[curr_ele]) 441 | Assigned_events.append(curr_processor_events) 442 | return Assigned_events 443 | 444 | #global parameters 445 | least_pass_coverage_percentage = 0.3 446 | 447 | De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(sys.argv) 448 | -------------------------------------------------------------------------------- /src/DaPars2_Multi_Sample_Multi_Chr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import datetime 5 | import threading 6 | import scipy as sp 7 | import scipy.stats 8 | from multiprocessing import Pool 9 | from bisect import bisect 10 | 11 | 12 | import math 13 | import time 14 | 15 | import multiprocessing 16 | 17 | 18 | def time_now():#return time 19 | curr_time = datetime.datetime.now() 20 | return curr_time.strftime("%c") 21 | 22 | def Convert_wig_into_bp_coverage(extracted_coverage,extracted_3UTR_region,strand_info): 23 | bp_coverage = np.zeros(extracted_3UTR_region[-1] - extracted_3UTR_region[0]) 24 | relative_start = extracted_3UTR_region[0] 25 | for i in range(len(extracted_coverage)): 26 | curr_region_start = extracted_3UTR_region[i] - relative_start 27 | curr_region_end = extracted_3UTR_region[i+1] - relative_start 28 | bp_coverage[curr_region_start:curr_region_end] = extracted_coverage[i] 29 | if strand_info == '-': 30 | bp_coverage = bp_coverage[::-1] 31 | 32 | return bp_coverage 33 | 34 | def parse_cfgfile(cfg_file): 35 | '''Parse configure file 36 | ''' 37 | Aligned_Wig_files='' 38 | output_directory='' 39 | Annotated_3UTR_file='' 40 | Output_result_file='' 41 | Coverage_threshold = 1 42 | Num_threads = 1 43 | sequencing_depth_file = '' 44 | 45 | for line in open(cfg_file, 'r'): 46 | if line[0] == '\n' or line[0] == '#': 47 | comments = line; 48 | else: 49 | line = line.rstrip() 50 | command = line.split('='); 51 | if command[0] == 'Aligned_Wig_files': 52 | 
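# (added note, illustrative) parse_cfgfile() reads a plain key=value text file; a hypothetical
# example covering every key the parser recognizes (values are placeholders):
#   Annotated_3UTR=refseq_3utr_annotation.bed
#   Aligned_Wig_files=sample1.wig,sample2.wig
#   Output_directory=Dapars2_out
#   Output_result_file=Dapars2
#   sequencing_depth_file=wigFile_and_readDepth.txt
#   Num_Threads=8
#   Coverage_threshold=15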
Aligned_Wig_files = command[1].split(','); 53 | if command[0] == 'Output_directory': 54 | output_directory = command[1] 55 | if output_directory[-1] != '/': 56 | output_directory += '/' 57 | if command[0] == 'Annotated_3UTR': 58 | Annotated_3UTR_file = command[1] 59 | if command[0] == 'Output_result_file': 60 | Output_result_file = command[1] 61 | if command[0] == 'sequencing_depth_file': 62 | sequencing_depth_file = command[1] 63 | if command[0] == 'Num_Threads': 64 | Num_threads = int(command[1]) 65 | if command[0] == 'Coverage_threshold': 66 | Coverage_threshold = int(command[1]) 67 | 68 | 69 | if Aligned_Wig_files == '': 70 | print("No aligned BAM file found!", file=sys.stderr) 71 | exit(1) 72 | if output_directory=='': 73 | print("No output directory!", file=sys.stderr) 74 | exit(1) 75 | if Annotated_3UTR_file=='': 76 | print("No annotated 3' UTR file!", file=sys.stderr) 77 | exit(1) 78 | if Output_result_file=='': 79 | print("No result file name!", file=sys.stderr) 80 | exit(1) 81 | if sequencing_depth_file=='': 82 | print("No sequencing depth file!", file=sys.stderr) 83 | exit(1) 84 | 85 | return Aligned_Wig_files, output_directory, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold 86 | 87 | def load_sequencing_depth(depth_file): 88 | seq_depth_list = [] 89 | for line in open(depth_file, 'r'): 90 | fields = line.strip('\n').split('\t') 91 | seq_depth_list.append(int(fields[-1])) 92 | 93 | return np.array(seq_depth_list) 94 | 95 | def De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(argv=None): 96 | '''multiple threads version 97 | ''' 98 | print((len(sys.argv))) 99 | if len(sys.argv) == 1: 100 | print("Please provide the configure file and specify chr name...") 101 | exit(1) 102 | 103 | cfg_file = sys.argv[1] 104 | Group1_Tophat_aligned_file, output_folder, Annotated_3UTR_file, Output_result_file, sequencing_depth_file, Num_threads, Coverage_threshold = parse_cfgfile(cfg_file) 105 | All_Sample_files = Group1_Tophat_aligned_file[:] 106 | Sample_name = [] 107 | for sample in All_Sample_files: 108 | sample_name = sample.rsplit('.',1)[0] 109 | Sample_name.append(sample_name) 110 | #curr_processing_chr = sys.argv[2] 111 | 112 | #-- @xdzou: extract processing chromosomes from command line 113 | fh=open(sys.argv[2],'r') 114 | All_chroms = [] 115 | for line in fh.readlines(): 116 | line = line.strip() 117 | All_chroms.append(line) 118 | fh.close() 119 | 120 | 121 | for curr_processing_chr in All_chroms: 122 | print("[%s] Start Analysis ..." % time_now(), file=sys.stderr) 123 | 124 | ##Prepare output directory 125 | output_directory = output_folder.rstrip('/') + '_' + curr_processing_chr + '/' #@xdzou,change strip() to rsrip() 126 | d = os.path.dirname(output_directory) 127 | print(d) 128 | if not os.path.exists(d): 129 | os.makedirs(d) 130 | temp_dir = d + '/tmp/' 131 | if not os.path.exists(temp_dir): 132 | os.makedirs(temp_dir) 133 | 134 | Output_all_prediction_file = output_directory + Output_result_file + '_result_temp.' + curr_processing_chr + '.txt' 135 | Output_result = open(Output_all_prediction_file, 'w') 136 | 137 | num_samples = len(All_Sample_files) 138 | 139 | print("All samples Joint Processing %s ..." % curr_processing_chr, file=sys.stderr) 140 | print("[%s] Loading Coverage ..." 
% time_now(), file=sys.stderr) 141 | 142 | All_samples_Target_3UTR_coverages, UTR_events_dict = Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Sample_files, Annotated_3UTR_file, Num_threads,curr_processing_chr) 143 | All_samples_sequencing_depths = load_sequencing_depth(sequencing_depth_file) 144 | 145 | print(All_samples_sequencing_depths) 146 | All_sample_coverage_weights = All_samples_sequencing_depths/np.mean(All_samples_sequencing_depths) 147 | 148 | #print All_sample_coverage_weights 149 | print("[%s] Loading Coverage Finished ..." % time_now(), file=sys.stderr) 150 | #Write the first line 151 | first_line = ['Gene','fit_value','Predicted_Proximal_APA','Loci'] 152 | for i in range(num_samples): 153 | #curr_long_exp = 'Sample_%s_long_exp' % str(i+1) 154 | #curr_short_exp = 'Sample_%s_short_exp' % str(i+1) 155 | curr_ratio = '%s_PDUI' % str(Sample_name[i]) 156 | #first_line.extend([curr_long_exp,curr_short_exp,curr_ratio]) 157 | first_line.append(curr_ratio) 158 | 159 | Output_result.writelines('\t'.join(first_line) + '\n') 160 | 161 | All_events_ids = list(UTR_events_dict.keys()) 162 | num_threads = Num_threads 163 | Assigned_events_ids_all_threads = Assign_to_different_processor_balance_events(All_events_ids, num_threads) 164 | 165 | num_real_threads = len(Assigned_events_ids_all_threads) 166 | 167 | Output_each_processor_all = [] 168 | for i in range(num_real_threads): 169 | curr_temp_output = temp_dir + 'Each_processor_3UTR_Result_%s.txt' % (str(i+1)) 170 | Output_each_processor_all.append(curr_temp_output) 171 | 172 | processes = [] 173 | for i in range(num_real_threads): 174 | process = multiprocessing.Process(target=Each_Thread_3UTR_estimation_list_version_sample_ids, args=(Assigned_events_ids_all_threads[i], UTR_events_dict, All_sample_coverage_weights, num_samples, Output_each_processor_all[i], All_samples_Target_3UTR_coverages, Coverage_threshold)) 175 | process.start() 176 | processes.append(process) 177 | 178 | for p in processes: 179 | p.join() 180 | 181 | #Combine results 182 | for i in range(num_real_threads): 183 | curr_result = Output_each_processor_all[i] 184 | for line in open(curr_result, 'r'): 185 | Output_result.writelines(line) 186 | 187 | Output_result.close() 188 | 189 | 190 | print("[%s] Finished!" 
% time_now(), file=sys.stderr) 191 | 192 | 193 | def Each_Thread_3UTR_estimation_list_version_sample_ids(curr_thread_UTR_events_ids, UTR_events_dict, All_sample_coverage_weights, num_samples, Output_result_file, All_samples_coverage_shared_dict, Coverage_threshold): 194 | Output_result = open(Output_result_file,'w') 195 | 196 | for curr_3UTR_id in curr_thread_UTR_events_ids: 197 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_id] 198 | region_start = curr_3UTR_structure[1] 199 | region_end = curr_3UTR_structure[2] 200 | curr_strand = curr_3UTR_structure[-2] 201 | UTR_pos = curr_3UTR_structure[-1] 202 | curr_3UTR_all_samples_bp_coverage = [] 203 | 204 | for i in range(num_samples): 205 | curr_sample_curr_3UTR_coverage_wig = All_samples_coverage_shared_dict[curr_3UTR_id, i] 206 | curr_3UTR_curr_sample_bp_coverage = Convert_wig_into_bp_coverage(curr_sample_curr_3UTR_coverage_wig[0], curr_sample_curr_3UTR_coverage_wig[1], curr_strand) 207 | curr_3UTR_all_samples_bp_coverage.append(curr_3UTR_curr_sample_bp_coverage) 208 | 209 | select_mean_squared_error, selected_break_point, UTR_abundances = De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(curr_3UTR_all_samples_bp_coverage, region_start, region_end,curr_strand,All_sample_coverage_weights, Coverage_threshold) 210 | 211 | if str(select_mean_squared_error) != "Na": 212 | num_non_zero = 1 213 | if num_non_zero > 0: 214 | All_long_inclusion_ratios = [] 215 | line_write = [curr_3UTR_id, "%.1f" % select_mean_squared_error, str(selected_break_point), UTR_pos] 216 | 217 | for i in range(num_samples): 218 | if UTR_abundances[0][i] != 'NA': 219 | # long 3'UTR percentage 220 | curr_sample_ratio = float(UTR_abundances[0][i])/(float(UTR_abundances[0][i]) + float(UTR_abundances[1][i])) 221 | All_long_inclusion_ratios.append(curr_sample_ratio) 222 | #line_write.append("%.2f" % UTR_abundances[0][i])#long 3' UTR abundance 223 | #line_write.append("%.2f" % UTR_abundances[1][i])#short 3' UTR abundance 224 | line_write.append("%.2f" % curr_sample_ratio) 225 | else: 226 | line_write.extend(['NA']*1) 227 | 228 | Output_result.writelines( '\t'.join(line_write) + '\n') 229 | 230 | Output_result.close() 231 | 232 | 233 | def De_Novo_3UTR_Coverage_estimation_Genome_for_multiple_samples(All_Samples_curr_3UTR_coverages, UTR_start, UTR_end, curr_strand, weight_for_second_coverage, Coverage_threshold): 234 | coverage_threshold = Coverage_threshold 235 | search_point_start = 150 ##200 236 | search_point_end = int(abs((UTR_end - UTR_start))*0.05) 237 | 238 | num_samples = len(All_Samples_curr_3UTR_coverages) 239 | #Read Coverage 240 | Region_Coverages = [] 241 | Pass_threshold_index = [] 242 | for i in range(num_samples): 243 | curr_Region_Coverage_raw = All_Samples_curr_3UTR_coverages[i] 244 | curr_Region_Coverage = curr_Region_Coverage_raw/weight_for_second_coverage[i] 245 | 246 | curr_first_100_coverage = np.mean(curr_Region_Coverage_raw[0:99]) 247 | if curr_first_100_coverage > coverage_threshold: 248 | Pass_threshold_index.append(i) 249 | Region_Coverages.append(curr_Region_Coverage) 250 | 251 | least_pass_coverage_num = num_samples * least_pass_coverage_percentage 252 | if len(Pass_threshold_index) > least_pass_coverage_num and UTR_end - UTR_start >=150: 253 | if curr_strand == "+": 254 | search_region = list(range(UTR_start+search_point_start, UTR_end-search_point_end+1)) 255 | else: 256 | search_region = list(range(UTR_end - search_point_start, UTR_start+search_point_end-1, -1)) 257 | 258 | search_region_start = search_point_start 259 | search_region_end 
= UTR_end - UTR_start - search_point_end 260 | Mean_squared_error_list = [] 261 | Estimated_3UTR_abundance_list = [] 262 | for curr_point in range(search_region_start, search_region_end+1): 263 | curr_search_point = curr_point 264 | All_samples_result = [[],[],[]] 265 | for curr_sample_region_coverage in Region_Coverages: 266 | Mean_Squared_error, Long_UTR_abun, Short_UTR_abun = Estimation_abundance(curr_sample_region_coverage, curr_search_point) 267 | All_samples_result[0].append(Mean_Squared_error) 268 | All_samples_result[1].append(Long_UTR_abun) 269 | All_samples_result[2].append(Short_UTR_abun) 270 | 271 | Mean_Squared_error = np.mean(np.array(All_samples_result[0])) 272 | Mean_squared_error_list.append(Mean_Squared_error) 273 | Estimated_3UTR_abundance_list.append([All_samples_result[1],All_samples_result[2]]) 274 | 275 | if len(Mean_squared_error_list) > 1: 276 | min_ele_index = Mean_squared_error_list.index(min(Mean_squared_error_list)) 277 | 278 | select_mean_squared_error = Mean_squared_error_list[min_ele_index] 279 | selected_break_point = search_region[min_ele_index] 280 | 281 | UTR_abundances = [['NA']*num_samples, ['NA']*num_samples] 282 | UTR_abundances_passed = Estimated_3UTR_abundance_list[min_ele_index] 283 | for k in range(len(Pass_threshold_index)): 284 | UTR_abundances[0][Pass_threshold_index[k]] = UTR_abundances_passed[0][k] 285 | UTR_abundances[1][Pass_threshold_index[k]] = UTR_abundances_passed[1][k] 286 | 287 | else: 288 | selected_break_point = 'Na' 289 | UTR_abundances = 'Na' 290 | select_mean_squared_error = 'Na' 291 | 292 | else: 293 | selected_break_point = 'Na' 294 | UTR_abundances = 'Na' 295 | select_mean_squared_error = 'Na' 296 | 297 | return select_mean_squared_error, selected_break_point, UTR_abundances 298 | 299 | 300 | def Estimation_abundance(Region_Coverage, break_point): 301 | Long_UTR_abun = np.mean(Region_Coverage[break_point:]) 302 | Short_UTR_abun = np.mean(Region_Coverage[0:break_point] - Long_UTR_abun) 303 | if Short_UTR_abun < 0: 304 | Short_UTR_abun = 0 305 | Coverage_diff = Region_Coverage[0:break_point] - Long_UTR_abun - Short_UTR_abun 306 | Coverage_diff= np.append(Coverage_diff, Region_Coverage[break_point:] - Long_UTR_abun) 307 | Mean_Squared_error = np.mean(Coverage_diff**2) 308 | 309 | return Mean_Squared_error, Long_UTR_abun, Short_UTR_abun 310 | 311 | 312 | def Load_Target_Wig_files_Multiple_threads_shared_dict_sampleid_key(All_Wig_files,UTR_Annotation_file, num_threads,curr_processing_chr): 313 | num_samples = len(All_Wig_files) 314 | UTR_events_dict = {} 315 | for line in open(UTR_Annotation_file, 'r'): 316 | fields = line.strip('\n').split('\t') 317 | curr_chr = fields[0] 318 | if curr_chr == curr_processing_chr: 319 | region_start = fields[1] 320 | region_end = fields[2] 321 | 322 | curr_strand = fields[-1] 323 | UTR_pos = "%s:%s-%s" %(curr_chr, region_start, region_end) 324 | end_shift = int(round(abs(int(region_start) - int(region_end)) * 0.2)) 325 | if curr_strand == "+": 326 | region_end = str(int(region_end) - end_shift) 327 | else: 328 | region_start = str(int(region_start) + end_shift) 329 | region_start = int(region_start) + 1 330 | region_end = int(region_end) - 1 331 | if region_start + 50 < region_end: 332 | UTR_events_dict[fields[3]] = [fields[0],region_start,region_end,fields[-1],UTR_pos] 333 | 334 | Assigned_index = Assign_to_different_processor_balance(num_samples, num_threads) 335 | 336 | manager = multiprocessing.Manager() # create only 1 Manager 337 | All_samples_extracted_3UTR_coverage_dict = manager.dict() # 
create only 1 dict 338 | 339 | processes = [] 340 | Final_assigned_threads_num = len(Assigned_index) 341 | for i in range(Final_assigned_threads_num): 342 | process = multiprocessing.Process(target=load_wig_funct_shared_dict_sampleid_key, args=(All_Wig_files, Assigned_index[i], UTR_events_dict,curr_processing_chr,All_samples_extracted_3UTR_coverage_dict)) 343 | process.start() 344 | processes.append(process) 345 | 346 | for p in processes: 347 | p.join() 348 | 349 | return All_samples_extracted_3UTR_coverage_dict, UTR_events_dict 350 | 351 | 352 | def load_wig_funct_shared_dict_sampleid_key(All_wig_files, assigned_indexes,UTR_events_dict, curr_processing_chr, All_samples_extracted_3UTR_coverage_dict): 353 | ''' 354 | All_samples_extracted_3UTR_coverage_dict: sample id is the key. 355 | ''' 356 | for i in assigned_indexes: 357 | curr_wig_file = All_wig_files[i] 358 | print(curr_wig_file, file=sys.stderr) 359 | curr_sample_All_chroms_coverage_dict = {} 360 | with open(curr_wig_file, 'r') as fin: 361 | for line in fin: 362 | if line[0] != '#' and line[0] != 't': 363 | fields = line.strip('\n').split('\t') 364 | chrom_name = fields[0] 365 | if chrom_name == curr_processing_chr: 366 | region_start = int(fields[1]) 367 | region_end = int(fields[2]) 368 | 369 | 370 | if chrom_name not in curr_sample_All_chroms_coverage_dict: 371 | curr_sample_All_chroms_coverage_dict[chrom_name] = [[0],[0]] 372 | if region_start > curr_sample_All_chroms_coverage_dict[chrom_name][0][-1]: 373 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_start) 374 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(0) 375 | curr_sample_All_chroms_coverage_dict[chrom_name][0].append(region_end) 376 | curr_sample_All_chroms_coverage_dict[chrom_name][1].append(int(float(fields[-1]))) 377 | else: 378 | if len(curr_sample_All_chroms_coverage_dict)>0: 379 | break 380 | fin.close() 381 | if curr_processing_chr not in curr_sample_All_chroms_coverage_dict: 382 | print('no wig: ' + curr_wig_file, file=sys.stderr) 383 | else: 384 | curr_sample_All_chroms_coverage_dict[curr_processing_chr][1].append(0) 385 | 386 | curr_sample_coverage_dict = {} 387 | 388 | for curr_3UTR_event_id in UTR_events_dict: 389 | curr_3UTR_structure = UTR_events_dict[curr_3UTR_event_id] 390 | curr_chr_local = curr_3UTR_structure[0] 391 | if curr_chr_local in curr_sample_All_chroms_coverage_dict: 392 | curr_chr_coverage = curr_sample_All_chroms_coverage_dict[curr_chr_local] 393 | region_start = curr_3UTR_structure[1] 394 | region_end = curr_3UTR_structure[2] 395 | left_region_index = bisect(curr_chr_coverage[0],region_start) 396 | right_region_index = bisect(curr_chr_coverage[0],region_end) 397 | 398 | extracted_coverage = curr_chr_coverage[1][left_region_index:right_region_index+1] 399 | extracted_3UTR_region = curr_chr_coverage[0][left_region_index:right_region_index] 400 | extracted_3UTR_region.insert(0,region_start) 401 | extracted_3UTR_region.append(region_end) 402 | 403 | curr_event_info = [extracted_coverage,extracted_3UTR_region] 404 | All_samples_extracted_3UTR_coverage_dict[curr_3UTR_event_id,i] = curr_event_info 405 | 406 | def Assign_to_different_processor_balance(Total_number, num_processors): 407 | Assigned_results = [] 408 | num_each_processor = int(Total_number/num_processors)#@xdzou: add int() 409 | 410 | if num_each_processor == 0: 411 | for i in range(Total_number): 412 | Assigned_results.append([i]) 413 | else: 414 | remain = Total_number - num_processors * num_each_processor 415 | for i in range(remain): 416 | 
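# (added note) balanced assignment: the first `remain` processors each receive
# num_each_processor+1 consecutive indexes and the rest receive num_each_processor,
# so per-processor loads differ by at most one item.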
Assigned_results.append(list(range((i)*(num_each_processor + 1), (i+1)*(num_each_processor + 1)))) 417 | for i in range(num_processors-remain): 418 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 419 | 420 | return Assigned_results 421 | 422 | 423 | def Assign_to_different_processor_balance_events(All_events_ids, num_processors): 424 | Assigned_results = [] 425 | Total_number = len(All_events_ids) 426 | num_each_processor = int(Total_number/num_processors) #xdzou: add int() 427 | 428 | if num_each_processor == 0: 429 | for i in range(Total_number): 430 | Assigned_results.append([i]) 431 | else: 432 | remain = Total_number - num_processors * num_each_processor 433 | for i in range(remain): 434 | Assigned_results.append(list(range((i)*(num_each_processor+1), (i+1)*(num_each_processor+1)))) 435 | 436 | for i in range(num_processors-remain): 437 | Assigned_results.append(list(range(i*num_each_processor+remain*(num_each_processor+1), (i+1)*num_each_processor+remain*(num_each_processor+1)))) 438 | #print assigned Results 439 | Assigned_events = [] 440 | print('#assigned events:') 441 | for curr_processor_inds in Assigned_results: 442 | curr_processor_events = [] 443 | print(len(curr_processor_inds)) 444 | for curr_ele in curr_processor_inds: 445 | curr_processor_events.append(All_events_ids[curr_ele]) 446 | Assigned_events.append(curr_processor_events) 447 | return Assigned_events 448 | 449 | #global parameters 450 | least_pass_coverage_percentage = 0.3 451 | 452 | De_Novo_3UTR_Identification_Loading_Target_Wig_for_TCGA_Multiple_Samples_Multiple_threads_Main3_shared_list(sys.argv) 453 | --------------------------------------------------------------------------------
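Appendix (added, illustrative): how the two DaPars2 driver scripts above appear to be invoked, inferred from their sys.argv handling and parse_cfgfile(); file names and chromosome labels are placeholders, not part of the original repository:

    # one chromosome per run (configure file + chromosome name)
    python src/Dapars2_Multi_Sample.py Dapars2_running_configure.txt chr1
    # all chromosomes in one run (configure file + a text file listing one chromosome per line)
    python src/DaPars2_Multi_Sample_Multi_Chr.py Dapars2_running_configure.txt chromosome_list.txt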