├── test ├── single │ ├── data │ │ ├── gene_test.bed │ │ ├── rice_leaf_DHpeaks_test.bed │ │ ├── rice_callus_DHpeaks_test.bed │ │ ├── rice_H3K27ac_test.bw │ │ ├── rice_leaf_DHS_test.bw │ │ ├── rice_callus_DHS_test.bw │ │ ├── rice.chrom.sizes │ │ ├── genopheno_test.bed │ │ ├── editing_results_test.bed │ │ └── genes_motifs_JASPAR_test.bed │ └── config.ini └── batch │ ├── data │ └── README.md │ └── config.ini ├── requirements.txt ├── .gitignore ├── config.ini ├── lib ├── cores.py ├── genopheno.py ├── misc.py └── features.py ├── README.md ├── single.py └── batch.py /test/single/data/gene_test.bed: -------------------------------------------------------------------------------- 1 | Chr1 4003659 4004888 LOC_Os01g08220 . - 2 | -------------------------------------------------------------------------------- /test/single/data/rice_leaf_DHpeaks_test.bed: -------------------------------------------------------------------------------- 1 | Chr1 4000759 4001568 2 | Chr1 4004878 4005453 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scipy 4 | tqdm 5 | biopython 6 | pyBigWig 7 | pybedtools -------------------------------------------------------------------------------- /test/single/data/rice_callus_DHpeaks_test.bed: -------------------------------------------------------------------------------- 1 | Chr1 4000853 4001184 2 | Chr1 4002860 4003235 3 | Chr1 4004230 4004515 4 | -------------------------------------------------------------------------------- /test/single/data/rice_H3K27ac_test.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_H3K27ac_test.bw -------------------------------------------------------------------------------- /test/single/data/rice_leaf_DHS_test.bw: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_leaf_DHS_test.bw -------------------------------------------------------------------------------- /test/single/data/rice_callus_DHS_test.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangtaolab/CAPE/main/test/single/data/rice_callus_DHS_test.bw -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | node_modules 3 | .vscode 4 | .idea 5 | .env 6 | .envrc 7 | .venv 8 | *.local 9 | *.log* 10 | logs 11 | .DS_Store -------------------------------------------------------------------------------- /test/single/data/rice.chrom.sizes: -------------------------------------------------------------------------------- 1 | Chr1 43270923 2 | Chr2 35937250 3 | Chr3 36413819 4 | Chr4 35502694 5 | Chr5 29958434 6 | Chr6 31248787 7 | Chr7 29697621 8 | Chr8 28443022 9 | Chr9 23012720 10 | Chr10 23207287 11 | Chr11 29021106 12 | Chr12 27531856 13 | ChrSy 592136 14 | ChrUn 633585 15 | ChrC 134525 16 | ChrM 490520 17 | -------------------------------------------------------------------------------- /test/batch/data/README.md: -------------------------------------------------------------------------------- 1 | For genome-wide analysis, omics data can be downloaded from several database: 2 | 3 | (1) [PlantDHS](http://plantdhs.org/): DNase-seq data 4 | (2) [PlantRegMap](http://plantregmap.gao-lab.org/): TF motifs, sequence conservation (CNSs) 5 | (3) [MBKbase](http://www.mbkbase.org/rice): Genomic variation and phenotypes 6 | 7 | Then, data files are put in the data folder. 
-------------------------------------------------------------------------------- /test/batch/config.ini: -------------------------------------------------------------------------------- 1 | [General] 2 | workdir = results 3 | binsize = 10 4 | step = 10 5 | upstream = 2000 6 | slop = 200 7 | withutr = 0 8 | threads = 64 9 | 10 | [Features] 11 | ocfiles = data/rice_leaf_DHS_test.bw,data/rice_callus_DHS_test.bw 12 | ocpeaks = data/rice_leaf_DHpeaks_test.bed,data/rice_callus_DHpeaks_test.bed 13 | ptmfiles = data/rice_H3K27ac_test.bw 14 | motifs = data/genome_wide_motifs_JASPAR_test.bed 15 | cnss = data/genome_wide_PhastCons_test.bedGraph 16 | genopheno = 17 | phenodata = 18 | 19 | [Genes] 20 | gene_file = 21 | gff_file = data/annotation.gff3 22 | chrom_sizes = data/rice.chrom.sizes 23 | -------------------------------------------------------------------------------- /test/single/config.ini: -------------------------------------------------------------------------------- 1 | [General] 2 | workdir = results 3 | binsize = 10 4 | step = 10 5 | upstream = 2000 6 | slop = 200 7 | withutr = 0 8 | threads = 8 9 | 10 | [Features] 11 | ocfiles = data/rice_leaf_DHS_test.bw,data/rice_callus_DHS_test.bw 12 | ocpeaks = data/rice_leaf_DHpeaks_test.bed,data/rice_callus_DHpeaks_test.bed 13 | ptmfiles = data/rice_H3K27ac_test.bw 14 | motifs = data/genes_motifs_JASPAR_test.bed 15 | cnss = data/genes_PhastCons_test.bedGraph 16 | genopheno = data/genopheno_test.bed 17 | phenodata = data/editing_results_test.bed 18 | 19 | [Genes] 20 | gene_file = data/gene_test.bed 21 | gff_file = 22 | chrom_sizes = data/rice.chrom.sizes 23 | -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [General] 2 | # Work directory ( also known as output directory ) 3 | workdir = results 4 | # binsize and sliding step, not recommend to change 5 | binsize = 10 6 | step = 10 7 | # Promoter 
length defined as sequence upstream of the TSS 8 | upstream = 2000 9 | # Extended length for generating raw scores of each feature ( Useful for genome browser visualization ) 10 | slop = 200 11 | # Whether to include the 5'-UTR in the analysis ( 0: Not included; 1: promoter + 5'-UTR ) 12 | withutr = 0 13 | # Threads for batch mode (simultaneously process n genes) 14 | threads = 8 15 | 16 | [Features] 17 | # Features with 1-bp resolution are recommended. 18 | # If a feature file is unavailable, leave the value blank. 19 | # Multiple files are separated by commas. 20 | 21 | # Open chromatin BigWig files ( from ATAC-seq/DNase-seq/MNase-seq/etc. ) 22 | ocfiles = ATAC_profile.bw 23 | # Open chromatin peaks ( from MACS2/Genrich/Popera/etc. ) 24 | ocpeaks = ATAC_peaks.bed 25 | # Histone modification BigWig files ( H3K27ac from ChIP-seq ) 26 | ptmfiles = H3K27ac.bw 27 | # TF binding motifs ( from PlantTFDB/JASPAR motifs called by FIMO ) 28 | motifs = genome_wide_motifs_JASPAR.bed 29 | # Conserved non-coding sequences ( from PhastCons/mVISTA scores ) 30 | cnss = PhastCons.bedGraph 31 | # Genotype and phenotype files directory ( from MBKbase/etc. ) 32 | genopheno = 33 | # Phenotypes for evaluation ( Phenodata measured after gene-editing ) 34 | phenodata = 35 | 36 | [Genes] 37 | # Gene for single mode (BED format: chr start end genename .
def get_scores(geneinfo, scorefile, regionfile):
    """Group the scores of the bins that overlap each region.

    Every interval of ``scorefile`` is intersected with ``regionfile``
    (``intersect(..., wo=True)``) and its score is appended to the list of
    the region it overlaps.

    Args:
        geneinfo: Gene description object. Not used here; kept so that the
            signature matches the other functions of this module.
        scorefile: Path of the score file. Column 4 is read as the score and
            columns 6-7 as the region start/end, which matches a 4-column
            bedGraph intersected with a BED file
            (NOTE(review): confirm against the file the caller passes in).
        regionfile: Path of the BED file holding the regions.

    Returns:
        dict: ``"<chrom>_<start>_<end>"`` of a region -> list of float
        scores, in the order the intersection reports them.
    """
    score_bed = BedTool(scorefile)
    region_bed = BedTool(regionfile)
    scores = {}
    for interval in score_bed.intersect(region_bed, wo=True):
        info = str(interval).rstrip().split("\t")
        chrom = info[0]
        score = float(info[3])
        region_start = info[5]
        region_end = info[6]
        name = "_".join([chrom, region_start, region_end])
        scores.setdefault(name, []).append(score)
    return scores


def get_cores(geneinfo, scores):
    """Select the high-scoring bins of each region and merge them.

    For a region with at least five bins the cutoff is the mean score of
    that region; shorter regions use a cutoff of 0. The i-th score of a
    region is taken to describe the bin that starts at
    ``region_start + binsize * i``.

    Args:
        geneinfo: Object providing ``binsize``.
        scores: Mapping produced by ``get_scores``.

    Returns:
        BedTool: The selected bins after ``merge()``.
    """
    binsize = geneinfo.binsize
    core_bins = []
    for region, values in scores.items():
        if len(values) >= 5:
            cutoff = np.average(values)
        else:
            cutoff = 0
        # Split from the right: the chromosome name itself may contain
        # underscores (e.g. "scaffold_12"), the two coordinates never do.
        chrom, rstart, rend = region.rsplit("_", 2)
        region_start = int(rstart)
        for i, score in enumerate(values):
            if score >= cutoff:
                start = region_start + int(binsize * i)
                end = start + binsize
                core_bins.append("\t".join([chrom, str(start), str(end)]) + "\n")
    core_regions = BedTool("".join(core_bins), from_string=True).merge()
    return core_regions


def output_cores(geneinfo, scorefile, regionfile, minlen = 2, outfile = ""):
    """Write the core regions of a gene to a BED file and return them.

    Args:
        geneinfo: Object providing ``binsize``.
        scorefile: Path of the score file (see ``get_scores``).
        regionfile: Path of the merged key-region BED file.
        minlen: Minimum length of a core region, expressed in bins.
        outfile: Output path. When empty it is derived from ``regionfile``
            by replacing ``"key_regions_merged"`` with ``"core_regions"``.

    Returns:
        list: ``[chrom, start, end]`` for every core region that spans at
        least ``binsize * minlen`` bases.

    Raises:
        ValueError: If no ``outfile`` is given and none can be derived, so
            that the output would overwrite ``regionfile``.
    """
    scores = get_scores(geneinfo, scorefile, regionfile)
    cores = get_cores(geneinfo, scores)
    min_span = geneinfo.binsize * minlen
    if not outfile:
        outfile = regionfile.replace("key_regions_merged", "core_regions")
        if outfile == regionfile:
            # Nothing was replaced: writing would destroy the input file.
            raise ValueError(
                "Cannot derive an output name from '%s'; pass outfile explicitly"
                % regionfile
            )
    core_regions = []
    # The context manager closes the file even if a record fails to parse.
    with open(outfile, "w") as outf:
        for interval in cores:
            info = str(interval).rstrip().split("\t")
            chrom = info[0]
            start = int(info[1])
            end = int(info[2])
            if end - start >= min_span:
                core_regions.append([chrom, start, end])
                print(chrom, start, end, sep="\t", file=outf)
    return core_regions
0.0999744502365751 19 | Chr1 4005872 4005873 29.350441454008816 4.232847455651902e-07 6.373367382450242 20 | Chr1 4005888 4005889 2.708257829770254 0.09982996124023241 1.000739097505115 21 | Chr1 4005917 4005918 177.4827977726221 2.8846966307842555e-39 38.539899852700266 22 | Chr1 4005956 4005957 11.643805566694892 0.0006441668288457089 3.1910016427888794 23 | Chr1 4005978 4005979 12.852202910120727 0.0016187493160071459 2.7908204020831735 24 | Chr1 4006010 4006011 11.643805566694892 0.0006441668288457089 3.1910016427888794 25 | Chr1 4006040 4006041 110.69254779348225 9.19218280830204e-25 24.036581347260658 26 | Chr1 4006055 4006056 60.973148779748776 5.752398579227342e-14 13.240151029655443 27 | Chr1 4006137 4006138 15.137103054769431 0.0005164399562436108 3.2869801643436105 28 | Chr1 4006467 4006468 16.791333717316366 0.00022584382019740588 3.6461917886132573 29 | Chr1 4006638 4006639 177.2643953648572 3.2175523937172934e-39 38.492474372436924 30 | Chr1 4006648 4006649 1.1183907309091496 0.2902652352336645 0.5372049760535923 31 | Chr1 4006756 4006757 164.24334090511462 1.3384517804049304e-37 36.8733972702606 32 | Chr1 4006758 4006759 110.42334562249201 7.914747797493608e-26 25.101562919323662 33 | Chr1 4006787 4006788 29.288540872193153 6.23633981593134e-08 7.205070228287081 34 | Chr1 4006817 4006818 134.09712513211326 5.2027049613596005e-31 30.28377080168162 35 | Chr1 4006835 4006836 11.643805566694892 0.0006441668288457089 3.1910016427888794 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CAPE 2 | 3 | The computational pipeline of CAPE (CRISPR-Cas12a promoter editing) 4 | 5 | 6 | ## Prerequisition 7 | 8 | 1. Python >= 3.5 9 | 2. Open chromatin data (profiles in BigWig format, peaks in BED format) 10 | 3. TF binding motifs (identified by FIMO, matrix files are from PlantTFDB or JARSPR) 11 | 4. 
Sequence conservation (Scores are from PhastCons/mVISTA, or manually calculate with PHAST package) 12 | 5. Genome annotation file (in GFF3 format) and chromosome sizes file 13 | 6. (Optional) H3K27ac histone modification profile (BigWig format), genomic variations and phenotypes from rice3K/RFGB/MBKBase/etc. 14 | 15 | ## Install 16 | 17 | ```bash 18 | # Install CAPE dependencies 19 | git clone https://github.com/zhangtaolab/CAPE.git 20 | cd CAPE 21 | pip install -r requirements.txt 22 | 23 | # Run test for single gene 24 | cd test/single 25 | python ../../single.py config.ini 26 | ``` 27 | 28 | ### Run the pipeline for single gene 29 | 30 | ```bash 31 | # Modify the config.ini file 32 | [General] 33 | workdir = results 34 | binsize = 10 35 | step = 10 36 | upstream = 2000 37 | slop = 200 38 | withutr = 0 39 | threads = 16 40 | 41 | [Features] 42 | ocfiles = Rice_leaf_DNase.bw,Rice_callus_DNase.bw 43 | ocpeaks = TIGR7_DHSs.bed 44 | ptmfiles = rice_H3K27ac.bw 45 | motifs = genome_wide_motifs_JASPAR.bed 46 | cnss = Osj_PhastCons.bedGraph 47 | genopheno = 48 | phenodata = 49 | 50 | [Genes] 51 | gene_file = gene.bed 52 | gff_file = 53 | chrom_sizes = osativa_7.chrom.sizes 54 | ``` 55 | 56 | ```bash 57 | # Run the pipeline 58 | python single.py config.ini 59 | ``` 60 | 61 | ### Run the pipeline for whole genome genes 62 | 63 | ```bash 64 | # Modify the config.ini file 65 | [General] 66 | workdir = results 67 | binsize = 10 68 | step = 10 69 | upstream = 2000 70 | slop = 200 71 | withutr = 0 72 | threads = 16 73 | 74 | [Features] 75 | ocfiles = Rice_leaf_DNase.bw,Rice_callus_DNase.bw 76 | ocpeaks = TIGR7_DHSs.bed 77 | ptmfiles = rice_H3K27ac.bw 78 | motifs = genome_wide_motifs_JASPAR.bed 79 | cnss = Osj_PhastCons.bedGraph 80 | genopheno = 81 | phenodata = 82 | 83 | [Genes] 84 | gene_file = 85 | gff_file = TIGR7_all.gff3 86 | chrom_sizes = osativa_7.chrom.sizes 87 | ``` 88 | 89 | ```bash 90 | # Run the pipeline 91 | python batch.py config.ini 92 | ``` 93 | 94 | ## Input 
(Feature data processing) 95 | 96 | Instructions for generating the feature data used in the calculation: 97 | 1. Open chromatin data: 98 | (1) Raw sequencing data (from DNase-seq/ATAC-seq/MNase-seq) are first aligned to the reference genome with BWA/Bowtie2; 99 | (2) Call peaks from the alignment using MACS2/Genrich/F-seq2/Popera; 100 | (3) Generate profiles from the alignment (BigWig format, using DeepTools/F-seq2/Popera). 101 | 2. TF binding motifs: 102 | (1) Download the TF PFM data from a database (PlantTFDB/JASPAR/CisBP); 103 | (2) Find the occurrences of TF motifs in the genome with FIMO; 104 | (3) Merge the results of all TF motifs (BED format; TFs from the same family can be merged into one). 105 | 3. Sequence conservation: 106 | (1) Pre-calculated sequence conservation of plant genomes can be retrieved from the PlantRegMap database; 107 | (2) If no pre-calculated result exists for the target genome, calculate conservation scores from multiple closely related genomes with PHAST/mVISTA. 108 | 4. H3K27ac histone modification: 109 | (1) Raw sequencing data (from ChIP-seq) are first aligned to the reference genome with BWA/Bowtie2; 110 | (2) Generate profiles from the alignment (BigWig format, using DeepTools). 111 | 5. Relationships between genomic variations and phenotypes (GenoPheno): 112 | (1) Get the genotype data from a public database, in FASTA format (for rice, using rice3K/RFGB/MBKBase/etc); 113 | (2) Get the corresponding phenotype data from a public database. 114 | (two-column tab-separated format: the first column is Genotype_ID, the second is Phenotype_Values separated by commas) 115 | 6. A genome annotation file (BED/GFF3 format) is required for getting the promoter of the target gene. 116 | 7. A chromosome sizes file is required for converting the input file formats. 117 | (two-column tab-separated format: the first column is the chromosome name, the second is the chromosome length) 118 | 119 | \* Note that H3K27ac and GenoPheno data are optional for the analysis.
120 | 121 | ## Output 122 | 123 | All output files are stored in the workdir defined in the config.ini file. 124 | A folder will be created for each gene analyzed. 125 | In the output gene folder, several files are generated: 126 | 1. analysis_region.bed (File records the analyzed regions in the genome for this gene) 127 | 2. OCpeaks_*_raw.bed (Open chromatin regions overlap with the analysis region) 128 | 3. OCscores*.bedGraph (Open chromatin scores for the analysis region, suffix 'raw' means raw scores from BigWig file, others are normalized in range 0 to 1) 129 | 4. motifs*.bedGraph (Raw file contains motifs identified in the analysis region, another file is the normalized motifs scores) 130 | 5. CNS*.bedGraph (Raw file contains raw conserved score in the analysis region, another file is the normalized CNS scores) 131 | 6. PTM*.bedGraph (H3K27ac profile for the analysis region, scores from BigWig file, others are normalized in range 0 to 1) 132 | 7. aggregate.bedGraph (The aggregate scores (AS) calculated from all above features) 133 | 8. key_regions_*.bed (Merged file means merged key regions when two key regions are adjacent) 134 | 9. core_regions.bed (Core regions which have high AS within the key regions) 135 | ( **Optional:** if CRISPR edited phenotype data are provided, also export the statistical analysis results. ) 136 | 10. phenoscores_*.bedGraph (phenotype scores, measured by kmeans-like method) 137 | 11. scores_by_sample.txt (Features scores and aggregate scores for each CRISPR edited sample) 138 | 12. plot_scores.txt (Comparison between phenotype difference and estimated scores) 139 | 13. 
def load_fasta(seqfile):
    """Read haplotype sequences and find the sample identical to the reference.

    Headers are parsed as ``>NAME|key:COUNT``; the record named ``REF`` is
    the reference and gets a count of 0. Sequences may be wrapped over
    several lines and blank lines are ignored.

    Args:
        seqfile: Path of the FASTA file.

    Returns:
        tuple: ``(seqinfo, refid)`` where ``seqinfo`` maps each record name
        to ``{"seq": str, "num": int}`` and ``refid`` is the name of the
        last sample whose sequence equals the reference sequence, or
        ``None`` when there is no such sample.

    Raises:
        ValueError: If sequence data appears before the first header.
    """
    seqinfo = {}
    name = None
    sample_num = 0
    new_record = False
    with open(seqfile, "r") as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                # A blank line must not overwrite the current sequence.
                continue
            if line.startswith(">"):
                info = line.split("|")
                name = info[0][1:]
                if name == "REF":
                    sample_num = 0
                else:
                    sample_num = int(info[1].split(":")[1])
                new_record = True
            elif name is None:
                raise ValueError(
                    "Sequence data found before the first FASTA header in " + seqfile
                )
            elif new_record:
                seqinfo[name] = {"seq": line, "num": sample_num}
                new_record = False
            else:
                # Continuation line of a wrapped sequence.
                seqinfo[name]["seq"] += line
    refid = None
    if "REF" in seqinfo:
        refseq = seqinfo["REF"]["seq"]
        for sample, record in seqinfo.items():
            if sample != "REF" and record["seq"] == refseq:
                refid = sample
    return seqinfo, refid


def parse_alignment(alignment):
    """Extract the differences between two aligned sequences.

    Args:
        alignment: Sequence whose first two items are the gapped reference
            and the gapped alternative string (same length, ``-`` for gaps).

    Returns:
        dict: position -> ``{"ref": str, "alt": str}``. Positions are
        alignment columns minus the number of reference gap columns that
        precede them.
    """
    aligninfo = {}
    refseq = alignment[0]
    altseq = alignment[1]
    indels = re.compile(r'-+')
    # inspos[col] = number of reference gap columns located before alignment
    # column `col`. It is looked up with alignment columns, so it has to
    # cover the gapped length; sizing it by the ungapped length raised
    # KeyError for any difference located after an insertion.
    inspos = {i: 0 for i in range(len(refseq))}
    for m in indels.finditer(refseq):
        start, end = m.span()
        for j in range(start + 1, len(inspos)):
            inspos[j] += end - start
        key = start - inspos[start]
        # NOTE(review): the strings below are gapped but are sliced with the
        # shifted index `key`; this only equals the inserted bases for the
        # first insertion of an alignment. Left unchanged, confirm the intent.
        aligninfo[key] = {}
        aligninfo[key]["ref"] = refseq[key]
        aligninfo[key]["alt"] = altseq[key:end]
    for m in indels.finditer(altseq):
        start, end = m.span()
        key = start - inspos[start]
        aligninfo[key] = {}
        aligninfo[key]["ref"] = refseq[key:end]
        aligninfo[key]["alt"] = altseq[key]
    for i, (refbase, altbase) in enumerate(zip(refseq, altseq)):
        # Plain substitutions only; gap columns were handled above.
        if refbase != altbase and refbase != "-" and altbase != "-":
            key = i - inspos[i]
            aligninfo[key] = {}
            aligninfo[key]["ref"] = refbase
            aligninfo[key]["alt"] = altbase
    return aligninfo


def pairwise_alignment(seqfile):
    """Globally align every sample of ``seqfile`` against the reference.

    Args:
        seqfile: Path of the FASTA file (see ``load_fasta``).

    Returns:
        tuple: ``(mutinfo, refid)`` where ``mutinfo`` maps each sample name
        to ``{"ratio": float, "alignment": dict}``; ``ratio`` is the sample
        count divided by the sum of all counts (rounded to 4 digits) and
        ``alignment`` is the result of ``parse_alignment`` for the first
        alignment returned.
    """
    seqinfo, refid = load_fasta(seqfile)
    mutinfo = {}
    refseq = seqinfo["REF"]["seq"]
    total_num = sum(seqinfo[x]["num"] for x in seqinfo)
    for sample in tqdm(seqinfo, desc="Finding mutations"):
        if sample == "REF":
            continue
        altseq = seqinfo[sample]["seq"]
        num = seqinfo[sample]["num"]
        # All counts can be zero; avoid dividing by zero in that case.
        ratio = round(num / total_num, 4) if total_num else 0.0
        # Scores: match 2, mismatch -1, gap open -1.5, gap extend -0.5.
        alignments = pairwise2.align.globalms(refseq, altseq, 2, -1, -1.5, -.5)
        mutinfo[sample] = {}
        mutinfo[sample]["ratio"] = ratio
        mutinfo[sample]["alignment"] = parse_alignment(alignments[0])
    return mutinfo, refid


def mut2pos(seqfile):
    """Re-index the per-sample mutations of ``seqfile`` by position.

    Args:
        seqfile: Path of the FASTA file (see ``load_fasta``).

    Returns:
        tuple: ``(vcfinfo, refid)`` where ``vcfinfo`` maps a position to
        ``{"ref": str, "alt": {alt: {sample: [ratio, homozygous]}}}``.
        ``ref`` is taken from the first sample seen at that position. A
        single lower-case base (a/c/g/t/n) is upper-cased and flagged with
        ``homozygous = 1``; every other allele gets 0.
    """
    mutinfo, refid = pairwise_alignment(seqfile)
    vcfinfo = {}
    for sample, record in mutinfo.items():
        ratio = record["ratio"]
        for pos, change in record["alignment"].items():
            refbase = change["ref"]
            altbase = change["alt"]
            if altbase in ["a", "c", "g", "t", "n"]:
                altbase = altbase.upper()
                homozygous = 1
            else:
                homozygous = 0
            if pos not in vcfinfo:
                vcfinfo[pos] = {"ref": refbase, "alt": {}}
            vcfinfo[pos]["alt"].setdefault(altbase, {})[sample] = [ratio, homozygous]
    return vcfinfo, refid


def load_phenodata(phenodata):
    """Load the phenotype values of every genotype.

    The file is tab separated: genotype ID, then the comma-separated
    phenotype values (spaces around the commas are accepted). A line that
    starts with ``Genotype_ID`` is treated as the header.

    Args:
        phenodata: Path of the phenotype file.

    Returns:
        dict: genotype ID -> list of float values. Genotypes with fewer
        than two values are left out.
    """
    gid_info = {}
    with open(phenodata, "r") as infile:
        for line in infile:
            if line.startswith("Genotype_ID"):
                continue
            info = line.rstrip().split("\t")
            sample = info[0]
            if len(info) > 1:
                # float() ignores surrounding blanks, so both "1,2" and
                # "1, 2" are read; empty fields are skipped.
                values = [float(x) for x in info[1].split(",") if x.strip()]
                if len(values) > 1:
                    gid_info[sample] = values
    return gid_info


def link_genopheno(genoinfo, seqfile, phenodata):
    """Write one row per (variant, phenotype value) pair.

    For every position with at least one alternative allele that has
    phenotype data, the values of the samples carrying each allele are
    written, followed by the values of the reference sample (when it has
    phenotype data).

    Args:
        genoinfo: Object providing ``start``, added to every position.
        seqfile: Path of the FASTA file (see ``load_fasta``).
        phenodata: Path of the phenotype file (see ``load_phenodata``).

    Returns:
        str: Path of the table written, ``seqfile`` with ``".fasta"``
        replaced by ``"_geno_pheno.txt"`` (or with that suffix appended when
        the path contains no ``".fasta"``).
    """
    posinfo, refid = mut2pos(seqfile)
    phenoinfo = load_phenodata(phenodata)
    startpos = genoinfo.start
    outfile = seqfile.replace(".fasta", "_geno_pheno.txt")
    if outfile == seqfile:
        # No ".fasta" in the path: never write over the input file.
        outfile = seqfile + "_geno_pheno.txt"
    # The reference sample may be unknown or lack phenotype values.
    ref_values = phenoinfo.get(refid)
    with open(outfile, "w") as outf:
        print("name", "pos", "ref", "alt", "value", "avg", "sd", sep="\t", file=outf)
        for pos in sorted(posinfo):
            pos_abs = pos + startpos
            ref = posinfo[pos]["ref"]
            flag = 0
            for alt, samples in posinfo[pos]["alt"].items():
                input_lst = [phenoinfo[x] for x in samples if x in phenoinfo]
                values = list(chain(*input_lst))
                name = str(pos_abs) + "_" + ref + "/" + alt
                if values:
                    flag = 1
                    avg_value = round(np.average(values), 4)
                    sd = round(np.std(values), 4)
                    for value in values:
                        print(name, pos_abs, ref, alt, value, avg_value, sd, sep="\t", file=outf)
            if flag and ref_values:
                ref_avg = round(np.average(ref_values), 4)
                ref_sd = round(np.std(ref_values), 4)
                ref_name = str(pos_abs) + "_" + ref + "/" + ref
                for value in ref_values:
                    print(ref_name, pos_abs, ref, ref, value, ref_avg, ref_sd, sep="\t", file=outf)
    return outfile


def output_genopheno(genoinfo, seqfile, phenodata, outfile = "", startpos = 0):
    """Run a Kruskal-Wallis test per position and write the result as BED.

    Args:
        genoinfo: Object providing ``chrom`` (and ``start``, used by
            ``link_genopheno``).
        seqfile: Path of the FASTA file (see ``load_fasta``).
        phenodata: Path of the phenotype file (see ``load_phenodata``).
        outfile: Output path; defaults to the ``link_genopheno`` table path
            with ``".txt"`` replaced by ``".bed"``.
        startpos: Extra offset added to every position. The positions read
            from the table already include ``genoinfo.start``.

    Returns:
        str: Path of the file written. Each line holds chrom, start, end,
        test statistic, p-value and ``-log10(p-value)`` for a position with
        at least two allele groups.
    """
    infile = link_genopheno(genoinfo, seqfile, phenodata)
    geno_pheno = pd.read_table(infile)
    chrom = genoinfo.chrom
    if not outfile:
        outfile = infile.replace(".txt", ".bed")
    with open(outfile, "w") as outf:
        for pos in pd.unique(geno_pheno.pos):
            at_pos = geno_pheno[geno_pheno.pos == pos]
            # NOTE(review): `ref` is the array of distinct reference alleles
            # at this position, so len(ref) counts alleles rather than bases.
            # Kept as is; confirm whether the allele length was intended.
            ref = pd.unique(at_pos.ref)
            value_lst = [
                at_pos[at_pos.alt == alt].value.tolist()
                for alt in pd.unique(at_pos.alt)
            ]
            if len(value_lst) > 1:
                kruskal = stats.kruskal(*value_lst)
                statistic = kruskal[0]
                pvalue1 = kruskal[1]
                pvalue2 = -np.log10(pvalue1)
                real_pos = pos + startpos
                print(chrom, real_pos, real_pos+len(ref), statistic, pvalue1, pvalue2, sep="\t", file=outf)
    return outfile
pZJP078-05-1-1-1 95.5 1.0 23 | Chr1 4006487 4006514 pZJP078-05-2-1-2 85.2 1.5033296378372907 24 | Chr1 4006091 4006099 pZJP078-05-2-1-2 85.2 1.5033296378372907 25 | Chr1 4005474 4005481 pZJP078-05-2-1-2 85.2 1.5033296378372907 26 | Chr1 4005246 4005256 pZJP078-05-2-1-2 85.2 1.5033296378372907 27 | Chr1 4006704 4006725 pZJP078-07-2-1-1 90.0 1.224744871391589 28 | Chr1 4006507 4006511 pZJP078-07-2-1-1 90.0 1.224744871391589 29 | Chr1 4006088 4006097 pZJP078-07-2-1-1 90.0 1.224744871391589 30 | Chr1 4005465 4005482 pZJP078-07-2-1-1 90.0 1.224744871391589 31 | Chr1 4005236 4005256 pZJP078-07-2-1-1 90.0 1.224744871391589 32 | Chr1 4006704 4006725 pZJP078-07-1-2-3 87.1 0.66332495807108 33 | Chr1 4006507 4006511 pZJP078-07-1-2-3 87.1 0.66332495807108 34 | Chr1 4006088 4006097 pZJP078-07-1-2-3 87.1 0.66332495807108 35 | Chr1 4005465 4005482 pZJP078-07-1-2-3 87.1 0.66332495807108 36 | Chr1 4005236 4005256 pZJP078-07-1-2-3 87.1 0.66332495807108 37 | Chr1 4006506 4006512 pZJP078-08-1-1-1 81.7 1.0770329614269007 38 | Chr1 4005744 4006469 pZJP078-08-1-1-1 81.7 1.0770329614269007 39 | Chr1 4005474 4005482 pZJP078-08-1-1-1 81.7 1.0770329614269007 40 | Chr1 4005245 4005253 pZJP078-08-1-1-1 81.7 1.0770329614269007 41 | Chr1 4006506 4006512 pZJP078-08-2-1-1 83.5 1.0 42 | Chr1 4005744 4006469 pZJP078-08-2-1-1 83.5 1.0 43 | Chr1 4005474 4005482 pZJP078-08-2-1-1 83.5 1.0 44 | Chr1 4005245 4005253 pZJP078-08-2-1-1 83.5 1.0 45 | Chr1 4006506 4006512 pZJP078-08-2-2-2 82.7 2.6758176320519302 46 | Chr1 4005744 4006469 pZJP078-08-2-2-2 82.7 2.6758176320519302 47 | Chr1 4005474 4005482 pZJP078-08-2-2-2 82.7 2.6758176320519302 48 | Chr1 4005245 4005253 pZJP078-08-2-2-2 82.7 2.6758176320519302 49 | Chr1 4006706 4006714 pZJP078-08-1-2-1 73.1 1.019803902718557 50 | Chr1 4006503 4006505 pZJP078-08-1-2-1 73.1 1.019803902718557 51 | Chr1 4006092 4006100 pZJP078-08-1-2-1 73.1 1.019803902718557 52 | Chr1 4005553 4005615 pZJP078-08-1-2-1 73.1 1.019803902718557 53 | Chr1 4005263 4005552 pZJP078-08-1-2-1 
73.1 1.019803902718557 54 | Chr1 4005246 4005262 pZJP078-08-1-2-1 73.1 1.019803902718557 55 | Chr1 4006706 4006714 pZJP078-08-3-1-3 74.9 1.2 56 | Chr1 4006503 4006505 pZJP078-08-3-1-3 74.9 1.2 57 | Chr1 4006092 4006100 pZJP078-08-3-1-3 74.9 1.2 58 | Chr1 4005553 4005615 pZJP078-08-3-1-3 74.9 1.2 59 | Chr1 4005263 4005552 pZJP078-08-3-1-3 74.9 1.2 60 | Chr1 4005246 4005262 pZJP078-08-3-1-3 74.9 1.2 61 | Chr1 4006503 4006505 pZJP078-09-1-2-1 87.2 0.9273618495495702 62 | Chr1 4006090 4006100 pZJP078-09-1-2-1 87.2 0.9273618495495702 63 | Chr1 4005474 4005480 pZJP078-09-1-2-1 87.2 0.9273618495495702 64 | Chr1 4005240 4005350 pZJP078-09-1-2-1 87.2 0.9273618495495702 65 | Chr1 4006709 4006715 pZJP078-10-1-1-2 95.5 1.0 66 | Chr1 4006505 4006510 pZJP078-10-1-1-2 95.5 1.0 67 | Chr1 4006091 4006100 pZJP078-10-1-1-2 95.5 1.0 68 | Chr1 4005474 4005480 pZJP078-10-1-1-2 95.5 1.0 69 | Chr1 4005251 4005253 pZJP078-10-1-1-2 95.5 1.0 70 | Chr1 4006506 4006713 pZJP078-10-1-2-1 87.0 1.3038404810405297 71 | Chr1 4006095 4006100 pZJP078-10-1-2-1 87.0 1.3038404810405297 72 | Chr1 4005475 4005482 pZJP078-10-1-2-1 87.0 1.3038404810405297 73 | Chr1 4005239 4005257 pZJP078-10-1-2-1 87.0 1.3038404810405297 74 | Chr1 4006709 4006711 pZJP078-10-5-1-1 87.6 1.019803902718557 75 | Chr1 4006504 4006514 pZJP078-10-5-1-1 87.6 1.019803902718557 76 | Chr1 4006095 4006100 pZJP078-10-5-1-1 87.6 1.019803902718557 77 | Chr1 4005473 4005481 pZJP078-10-5-1-1 87.6 1.019803902718557 78 | Chr1 4005250 4005255 pZJP078-10-5-1-1 87.6 1.019803902718557 79 | Chr1 4006504 4006717 pZJP078-12-2-1-1 84.9 0.9165151389911681 80 | Chr1 4006092 4006097 pZJP078-12-2-1-1 84.9 0.9165151389911681 81 | Chr1 4005474 4005484 pZJP078-12-2-1-1 84.9 0.9165151389911681 82 | Chr1 4005246 4005257 pZJP078-12-2-1-1 84.9 0.9165151389911681 83 | Chr1 4006504 4006717 pZJP078-12-2-1-2 83.5 1.0 84 | Chr1 4006092 4006097 pZJP078-12-2-1-2 83.5 1.0 85 | Chr1 4005474 4005484 pZJP078-12-2-1-2 83.5 1.0 86 | Chr1 4005246 4005257 pZJP078-12-2-1-2 83.5 
1.0 87 | Chr1 4006504 4006715 pZJP078-12-3-1-1 80.1 1.42828568570857 88 | Chr1 4005473 4006102 pZJP078-12-3-1-1 80.1 1.42828568570857 89 | Chr1 4005250 4005253 pZJP078-12-3-1-1 80.1 1.42828568570857 90 | Chr1 4006158 4006165 pZJP079-1-1-01-2 91.9 1.5620499351813308 91 | Chr1 4005990 4005994 pZJP079-1-1-01-2 91.9 1.5620499351813308 92 | Chr1 4005168 4005177 pZJP079-1-1-01-2 91.9 1.5620499351813308 93 | Chr1 4006835 4006852 pZJP079-1-1-02-1 80.9 1.2806248474865698 94 | Chr1 4006159 4006167 pZJP079-1-1-02-1 80.9 1.2806248474865698 95 | Chr1 4005945 4006001 pZJP079-1-1-02-1 80.9 1.2806248474865698 96 | Chr1 4005171 4005351 pZJP079-1-1-02-1 80.9 1.2806248474865698 97 | Chr1 4006835 4006852 pZJP079-1-1-01-1 80.4 0.8602325267042626 98 | Chr1 4006159 4006167 pZJP079-1-1-01-1 80.4 0.8602325267042626 99 | Chr1 4005945 4006001 pZJP079-1-1-01-1 80.4 0.8602325267042626 100 | Chr1 4005171 4005351 pZJP079-1-1-01-1 80.4 0.8602325267042626 101 | Chr1 4006611 4006864 pZJP079-5-1-02-2 80.1 1.8 102 | Chr1 4006159 4006206 pZJP079-5-1-02-2 80.1 1.8 103 | Chr1 4005989 4006024 pZJP079-5-1-02-2 80.1 1.8 104 | Chr1 4005330 4005366 pZJP079-5-1-02-2 80.1 1.8 105 | Chr1 4005171 4005174 pZJP079-5-1-02-2 80.1 1.8 106 | Chr1 4005348 4006852 pZJP079-6-3-01-3 66.2 1.4696938456699067 107 | Chr1 4005171 4005174 pZJP079-6-3-01-3 66.2 1.4696938456699067 108 | Chr1 4006163 4006166 pZJP079-7-1-01-2 68.5 1.1832159566199232 109 | Chr1 4005168 4005997 pZJP079-7-1-01-2 68.5 1.1832159566199232 110 | Chr1 4006163 4006166 pZJP079-7-2-03-2 68.8 1.5033296378372907 111 | Chr1 4005168 4005997 pZJP079-7-2-03-2 68.8 1.5033296378372907 112 | Chr1 4005987 4006558 pZJP079-8-1-01-2 74.3 1.0770329614269007 113 | Chr1 4005287 4005378 pZJP079-8-1-01-2 74.3 1.0770329614269007 114 | Chr1 4005171 4005174 pZJP079-8-1-01-2 74.3 1.0770329614269007 115 | Chr1 4006833 4006852 pZJP079-7-2-23-1 69.6 2.0591260281974 116 | Chr1 4006156 4006166 pZJP079-7-2-23-1 69.6 2.0591260281974 117 | Chr1 4005171 4005993 pZJP079-7-2-23-1 69.6 
def get_chrom_sizes(file):
    """
    Load chromosome lengths from a tab-separated sizes file.

    Lines starting with "#" and blank lines are skipped. A two-column line
    is read as "chrom<TAB>length"; any other line takes the length from its
    third column.

    Mandatory parameters:
    1. file - Path of the chromosome sizes file

    Returns a dict that maps each chromosome name to its length (int).
    """

    chrlens = {}
    with open(file) as infile:
        for line in infile:
            # A blank line (e.g. an empty last line) used to fall into the
            # three-column branch and raise IndexError
            if line.startswith("#") or not line.strip():
                continue
            info = line.rstrip().split("\t")
            chrom = info[0]
            length = info[1] if len(info) == 2 else info[2]
            chrlens[chrom] = int(length)
    return chrlens
def fimo_filter(gfffile, matrixinfo, geneinfo, outfile, pcut = 1e-5, qcut = 1):
    """
    Filter FIMO results with p-value and q-value cutoffs.

    Mandatory parameters:
    1. gfffile - FIMO output in GFF format; start/end columns are shifted by
       geneinfo.start before they are written
    2. matrixinfo - Motif annotation file. A file whose name starts with
       "JASPAR" is read as "MOTIF <id> <name>" lines, any other file as
       "<gene name> <family>" lines (PlantTFDB)
    3. geneinfo - Object providing the attributes chrom and start
    4. outfile - Path of the filtered output (tab-separated, sorted)

    Alternative parameters:
    1. pcut - Keep motifs with p-value <= pcut
    2. qcut - Keep motifs with q-value <= qcut

    Raises KeyError when a motif in gfffile is missing from matrixinfo.
    """

    motif_family = {}
    # Decide on the file name, so that a JASPAR file given with a directory
    # prefix is not parsed as a PlantTFDB table
    is_jaspar = (matrixinfo.startswith("JASPAR")
                 or os.path.basename(matrixinfo).startswith("JASPAR"))
    with open(matrixinfo, "r") as infile:
        for line in infile:
            # Matrix from JASPAR
            if is_jaspar:
                if line.startswith("MOTIF"):
                    info = line.rstrip().split()
                    motif_family[info[1]] = info[2]
            # Matrix from PlantTFDB
            elif not line.startswith("#") and line.strip():
                info = line.rstrip().split()
                motif_family[info[0]] = info[1]
    # Get gene info
    chrom = geneinfo.chrom
    begin = geneinfo.start
    # Collect filtered motifs
    motif_list = []
    with open(gfffile, "r") as infile:
        for line in infile:
            if line.startswith("#") or not line.strip():
                continue
            info = line.rstrip().split("\t")
            start = int(info[3]) + begin
            end = int(info[4]) + begin
            strand = info[6]
            desc = info[8].split(";")
            motif_id = desc[0].split("=")[1]
            motif_name = motif_family[motif_id]
            pvalue = float(desc[2].split("=")[1])
            # float() ignores surrounding blanks, so both "qvalue= 0.5"
            # (the only form accepted before) and "qvalue=0.5" are parsed
            qvalue = float(desc[3].split("=", 1)[1])
            if pvalue <= pcut and qvalue <= qcut:
                motif_list.append([chrom, start, end, motif_name, ".", strand, pvalue, qvalue])
    # Output filtered motifs; the handle is closed even if a write fails
    with open(outfile, "w") as outf:
        for lst in sorted(motif_list):
            print("\t".join(map(str, lst)), file=outf)
def smooth_scores_fill(info, posinfo):
    """
    Make the discrete score values smoothly (fill zero scores).

    Zero bins are replaced before smoothing. A floor value ("bottom") is
    chosen first: 0.1 when the smallest non-zero score is more than 10% of
    the largest score, otherwise the smallest non-zero score. A run of zeros
    is then filled with values stepping from the score on its left towards
    the floor and from the floor towards the next non-zero score on its
    right. The filled list is handed to smooth_scores2(keep_tails=False).

    Mandatory parameters:
    1. info - A list contains scores in different bins
    2. posinfo - Position information of each bin

    Returns the dict built by smooth_scores2, or an empty dict when the
    non-zero scores sum to zero.
    """

    new_info = info.copy()
    nonzero = [x for x in new_info if x]
    if not sum(nonzero):
        return {}
    minscore = min(nonzero)
    maxscore = max(new_info)
    # Set the minimum fill score
    if minscore / maxscore > 0.1:
        bottom = 0.1
    else:
        bottom = minscore
    last = len(new_info) - 1
    for i in range(len(new_info)):
        if new_info[i]:
            continue
        if i == 0:
            # No left neighbour to ramp down from
            new_info[i] = bottom
            continue
        score0 = new_info[i-1]
        # Search the next non-zero score on the right. j and score2 start
        # from "nothing on the right": for a zero in the last bin the search
        # loop is empty, and the values left over from an earlier gap (or no
        # values at all) used to be reused, which crashed or overwrote a bin
        # that had already been filled.
        j = i
        score2 = bottom
        for j in range(i+1, len(new_info)):
            score2 = new_info[j]
            if score2:
                break
        if j == last and score2 == 0:
            # Zeros run up to the end of the list
            score2 = bottom
        ranges = j - i
        diff1 = abs(score0 - bottom)
        diff2 = abs(score2 - bottom)
        total = diff1 + diff2
        # Position of the floor inside the gap, weighted by both drops
        mid = int(ranges * diff1 / total) if total else 0
        if ranges > 1:
            for k in range(mid):
                new_info[i+k] = score0 - diff1 * (k+1)/(mid+1)
            for k in range(mid+1, ranges):
                new_info[i+k] = bottom + diff2 * (k-mid)/(ranges-mid)
        new_info[i+mid] = bottom
        # mid can point past bin i (one-bin gap whose right score equals the
        # floor); make sure the current bin never stays at zero
        if not new_info[i]:
            new_info[i] = bottom

    smooth_info = smooth_scores2(new_info, posinfo, keep_tails=False)

    return smooth_info
def smooth_scores2(info, posinfo, keep_tails=False):
    """
    Make the discrete score values smoothly.

    Each bin between the leading and the trailing run of zeros is replaced
    by the mean of itself and its direct neighbours (in place, left to
    right). Unless keep_tails is set, the zero tails next to the scored
    region are averaged as well. The result is divided by its maximum.

    Mandatory parameters:
    1. info - A list contains scores in different bins
    2. posinfo - Position information of each bin

    Alternative parameters:
    1. keep_tails - Whether or not to keep the missing values in the two tails

    Returns a dict mapping posinfo[i] to the scaled score of bin i, or an
    empty dict when the largest score is zero.
    """

    # In case original score info be modified
    new_info = info.copy()
    smooth_info = {}
    if not max(new_info):
        return smooth_info
    score_num = len(new_info)
    begin = 0
    end = score_num
    # Find the two tails: begin is moved while everything before bin i
    # averages to zero, end is the first bin from which the rest does
    for i in range(score_num):
        if i:
            begin_avg = np.average(new_info[:i])
        else:
            begin_avg = new_info[i]
        if i == score_num-1:
            end_avg = new_info[i]
        else:
            end_avg = np.average(new_info[i:])
        if begin_avg == 0:
            begin = i
        if end_avg == 0:
            end = i
            break
    # Get average value in adjacent scores
    if not keep_tails:
        # NOTE: the range stops at bin 1, so bin 0 is never averaged here;
        # the branch that handled i == 0 could not be reached and is gone
        for i in range(begin, 0, -1):
            if begin == score_num-1:
                score = new_info[i]
            else:
                score = (new_info[i-1] + new_info[i] + new_info[i+1]) / 3
            new_info[i] = score
        for i in range(end, score_num):
            if i < score_num - 1:
                score = (new_info[i-1] + new_info[i] + new_info[i+1]) / 3
            else:
                score = (new_info[i-1] + new_info[i]) / 2
            new_info[i] = score
    for i in range(begin, end):
        if i == begin:
            if begin == score_num-1:
                score = new_info[i]
            else:
                score = (new_info[i] + new_info[i+1]) / 2
        elif i == end - 1:
            score = (new_info[i-1] + new_info[i]) / 2
        else:
            score = (new_info[i-1] + new_info[i] + new_info[i+1]) / 3
        new_info[i] = score
    # The scores no longer change, so the maximum is taken once instead of
    # once per bin
    top = max(new_info)
    for i in posinfo:
        # provide real positions for smoothed scores
        smooth_info[posinfo[i]] = new_info[i] / top

    return smooth_info
def calc_importance(phenotypes, scorelist, namelist, geneinfo, outdir="./", side="none"):
    """
    Calculate the correlation between phenodata and scores from different features.

    For every feature the phenotype intervals are intersected with the score
    bins; each sample accumulates the bin scores weighted by the overlapped
    fraction of the bin, and its phenotype is taken relative to the "WT"
    row. Feature scores are min-max scaled, the Pearson correlation with
    the phenotypes is printed, and all per-sample values are written to
    <outdir>/<gene alias>/scores_by_sample.txt.

    Mandatory parameters:
    1. phenotypes - BED-like file: sample name in column 4, phenotype value
       in column 5; the wild type row is named "WT"
    2. scorelist - List of dicts (bin start position -> score), one per feature
    3. namelist - Feature names, paired with scorelist by position
    4. geneinfo - Object providing gene, alias, chrom and binsize

    Alternative parameters:
    1. outdir - Output directory
    2. side - "none" uses the absolute difference to WT (not rescaled),
       any other value uses the signed difference, min-max scaled

    Raises ValueError when no "WT" row overlaps the score bins.
    """

    gene_alias = geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias
    binsize = geneinfo.binsize

    def rescale(value, low, high):
        # Min-max scaling; a constant series has no spread, so it is mapped
        # to 0 instead of dividing by zero
        if high == low:
            return 0.0
        return (value - low) / (high - low)

    pheno_bed = BedTool(phenotypes)
    sample_scores = {}
    wt_value = None
    for scores, name in zip(scorelist, namelist):
        score_bed = BedTool("\n".join(["\t".join(map(str, [geneinfo.chrom, x, x+binsize, scores[x]]))
                                       for x in scores]),
                            from_string=True)
        rows = [str(interval).rstrip().split("\t")
                for interval in pheno_bed.intersect(score_bed, wo=True)]
        # Read the wild type value before any sample, so that the result
        # does not depend on where the WT row sits in the file
        for info in rows:
            if info[3] == "WT":
                wt_value = float(info[4])
                break
        if wt_value is None:
            raise ValueError("No WT sample overlaps the scored region in %s" % phenotypes)
        fscores = {}
        for info in rows:
            sample = info[3]
            if sample == "WT":
                continue
            # Last two columns: bin score and overlapped bases (-wo output)
            ratio = int(info[-1]) / binsize
            if sample in fscores:
                fscores[sample]["feature"] += float(info[-2]) * ratio
            else:
                fscores[sample] = {}
                if side == "none":
                    fscores[sample]["pheno"] = abs(float(info[4]) - wt_value)
                else:
                    fscores[sample]["pheno"] = float(info[4]) - wt_value
                fscores[sample]["feature"] = float(info[-2]) * ratio
        if not fscores:
            print(name, "has no overlapping samples, skipped.")
            continue
        min_score = min([fscores[x]["feature"] for x in fscores])
        max_score = max([fscores[x]["feature"] for x in fscores])
        avg_pheno = np.average([fscores[x]["pheno"] for x in fscores])
        # Flip the sign when the edits lower the phenotype on average
        if avg_pheno < 0:
            for s in fscores:
                fscores[s]["pheno"] *= -1
        min_pheno = min([fscores[x]["pheno"] for x in fscores])
        max_pheno = max([fscores[x]["pheno"] for x in fscores])
        feature_scores = []
        pheno_scores = []
        for s in fscores:
            score1 = rescale(fscores[s]["feature"], min_score, max_score)
            feature_scores.append(score1)
            if side == "none":
                score2 = fscores[s]["pheno"]
            else:
                score2 = rescale(fscores[s]["pheno"], min_pheno, max_pheno)
            pheno_scores.append(score2)
            if s not in sample_scores:
                sample_scores[s] = {}
            sample_scores[s]["pheno"] = score2
            sample_scores[s][name] = score1
        pearson = stats.pearsonr(feature_scores, pheno_scores)
        print(name, "Pearson correlation:", pearson[0])

    outfile = outdir + "/" + gene_alias + "/scores_by_sample.txt"
    # The alias directory can differ from the gene directory created earlier
    os.makedirs(os.path.dirname(os.path.abspath(outfile)), exist_ok=True)
    df = pd.DataFrame(sample_scores).T
    df.index.name = "sample"
    df.to_csv(outfile, sep="\t")
def get_gene_info(gene_file):
    """
    Load the target gene from a BED6 file (Single mode).

    Only the first record is used; comment lines ("#") and blank lines
    before it are skipped.

    Mandatory parameters:
    1. gene_file - BED file with chrom, start, end, gene name, score, strand

    Returns a dict {gene name: [chrom, start, end, strand]} with at most
    one entry.
    """

    genes_info = {}
    with open(gene_file) as infile:
        for line in infile:
            # A line holding only blanks used to be parsed as a record and
            # raised IndexError
            if line.startswith("#") or not line.strip():
                continue
            info = line.rstrip().split("\t")
            chrom = info[0]
            start = int(info[1])
            end = int(info[2])
            gene_name = info[3]
            strand = info[5]
            genes_info[gene_name] = [chrom, start, end, strand]
            # Single mode analyses one gene only
            break

    print("Genes infomation loaded.\n")

    return genes_info
def generate_features_from_large(inputfile, genes_info, upstream, slop, workdir, feature):
    """
    Split a large BED/bedGraph file into one raw feature file per gene.

    The input is read once. A record is assigned to every gene whose window
    (upstream region plus slop on both sides) contains the record's
    reference position: its start for the "CNS" feature, its midpoint
    otherwise. Records are written to
    <workdir>/<gene>/<feature>_raw.bedGraph. Genes that already have such a
    file larger than 10 bytes are left untouched.

    Mandatory parameters:
    1. inputfile - Tab-separated file with chrom, start, end in the first columns
    2. genes_info - Dict {gene: [chrom, start, end, strand, ...]}
    3. upstream - Length of the analysed region upstream of the gene
    4. slop - Extension added on both sides of the region
    5. workdir - Output directory
    6. feature - Feature name used for the output file name

    Returns the number of genes that received at least one record.
    """

    # Upper bound of simultaneously open output files
    max_open = 500

    # Map every base of a gene window to the genes covering it
    basemap = {}
    existed = set()
    genelens = len(genes_info)
    for num, gene in enumerate(genes_info):
        outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
        if os.path.exists(outfile) and os.path.getsize(outfile) > 10:
            existed.add(gene)
        chrom, start, end, strand = genes_info[gene][:4]
        chrom_map = basemap.setdefault(chrom, {})
        if strand == "+":
            window = range(max(0, start-upstream-slop), start+slop+1)
        else:
            window = range(end-slop, end+upstream+slop+1)
        for i in window:
            chrom_map.setdefault(i, []).append(gene)
        print("%s / %s genes processed, %s existed genes." % (num, genelens, len(existed)),
              end="\r")
    print("Load genes completed.", " "*30)

    total_num = max(1, genelens-len(existed))
    handles = {}     # gene -> open handle, oldest first
    written = set()  # genes whose file was created during this call
    tmp_cnt = 0
    try:
        with open(inputfile) as infile:
            for line in infile:
                fields = line.rstrip().split("\t")
                if len(fields) < 3:
                    # Blank or truncated line
                    continue
                chrom, start, end = fields[:3]
                if chrom not in basemap:
                    continue
                if feature == "CNS":
                    s = int(start)
                else:
                    s = int((int(start) + int(end)) / 2)
                for gene in basemap[chrom].get(s, ()):
                    if gene in existed:
                        continue
                    outf = handles.get(gene)
                    if outf is None:
                        if len(handles) >= max_open:
                            # Closed handles are dropped from the table, so
                            # a later record for that gene reopens the file
                            # instead of writing to a closed one
                            oldest = next(iter(handles))
                            handles.pop(oldest).close()
                        outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph")
                        os.makedirs(os.path.dirname(outfile), exist_ok=True)
                        # Append when the file was started earlier in this call
                        outf = open(outfile, "a" if gene in written else "w")
                        handles[gene] = outf
                        written.add(gene)
                    print(line.rstrip(), file=outf)
                cnt = len(written)
                if tmp_cnt != cnt:
                    pct = round(cnt * 100 / total_num, 2)
                    print(pct, "%", " output.", end="\r")
                    tmp_cnt = cnt
    finally:
        for outf in handles.values():
            outf.close()
    print("All files output.")

    # Also defined when no record matched (used to raise UnboundLocalError)
    return len(written)
"key_regions_merged.bed") 201 | # if os.path.exists(check): 202 | # return (gene, 0) 203 | check = os.path.join(workdir, gene, "aggregate.bedGraph") 204 | if os.path.exists(check): 205 | filesize = os.path.getsize(check) 206 | if filesize > 10: 207 | return (gene, 0) 208 | 209 | # Open chromatin 210 | ocscores = glob(os.path.join(workdir, gene, "OCscores*_raw.bedGraph")) 211 | ocpeaks = glob(os.path.join(workdir, gene, "OCpeaks*_raw.bed")) 212 | # Calculate scores 213 | ocscorelist = [] 214 | for idx, ocscorefile in enumerate(ocscores): 215 | if idx + 1 > len(ocpeaks): 216 | ocpeakfile = "" 217 | else: 218 | ocpeakfile = ocpeaks[idx] 219 | if len(ocscores) > 1: 220 | ocname = os.path.basename(ocscorefile).split("_raw")[0] 221 | else: 222 | ocname = "OCscores" 223 | scores_oc1 = openchromatin_scores(geneinfo, ocscorefile, ocpeakfile, 224 | samplename = ocname, outdir = workdir) 225 | ocscorelist.append(scores_oc1) 226 | if len(ocscores) > 1: 227 | scores_oc = merge_reps(geneinfo, ocscorelist, samplename = "OCscores", outdir = workdir) 228 | else: 229 | scores_oc = scores_oc1 230 | 231 | # Histone modification 232 | ptmfiles = glob(os.path.join(workdir, gene, "PTM*_raw.bedGraph")) 233 | # Calculate scores 234 | ptmscorelist = [] 235 | for ptmscorefile in ptmfiles: 236 | if len(ptmfiles) > 1: 237 | ptmname = os.path.basename(ptmscorefile).split("_raw")[0] 238 | else: 239 | ptmname = "PTMscores" 240 | scores_ptm1 = ptm_scores(geneinfo, ptmscorefile, ocname="OCscores", 241 | samplename = ptmname, outdir = workdir) 242 | ptmscorelist.append(scores_ptm1) 243 | if len(ptmfiles) > 1: 244 | scores_ptm = merge_reps(geneinfo, ptmscorelist, samplename = "PTMscores", outdir = workdir) 245 | else: 246 | scores_ptm = scores_ptm1 247 | 248 | # TF motifs 249 | motiffile = os.path.join(workdir, gene, "motifs_raw.bedGraph") 250 | # Calculate scores 251 | scores_motif = motif_scores(geneinfo, motiffile, outdir = workdir) 252 | 253 | # Conserved sequences 254 | cnsfile = 
def check_options(config):
    """
    Print the options in use, validate them and clamp the numeric limits.

    The working directory is made absolute (default "results"), every file
    listed in the [Features] section must exist, and the [General] values
    are capped: threads at the CPU count, slop at 50000, upstream at 10000,
    binsize at half of upstream, step at binsize.

    Mandatory parameters:
    1. config - configparser.ConfigParser with the sections General,
       Features and Genes

    Returns the same config object. Exits with status 1 when a feature file
    is missing.
    """

    general = config["General"]
    print("# Using the following options:")
    if general["workdir"]:
        general["workdir"] = os.path.abspath(general["workdir"])
    else:
        general["workdir"] = "results"
    misc.check_outdir(general["workdir"])
    for section in config.sections():
        for param in config.options(section):
            values = config[section][param]
            if section == "Features":
                # Several files can be given as a comma-separated list
                if "," in values:
                    values = values.split(",")
                    files = values
                else:
                    files = [values]
                for file in files:
                    if file and not os.path.exists(file):
                        print("# Error, cannot find the %s: %s" % (param, file))
                        sys.exit(1)
            print("%s: %s" % (param, values))

    def clamp(option, upper):
        # ConfigParser only stores strings: assigning the int/float limit
        # directly raises TypeError, and a value such as "50000.0" could not
        # be parsed with int() afterwards
        if int(general[option]) > upper:
            general[option] = str(int(upper))

    # os.cpu_count() returns None when the count cannot be determined
    clamp("threads", os.cpu_count() or 1)
    clamp("slop", 50000)
    clamp("upstream", 10000)
    clamp("binsize", int(general["upstream"]) // 2)
    clamp("step", int(general["binsize"]))
    if config["Genes"]["gene_file"]:
        print("\n# Using Single mode.\n")

    return config
int(config["General"]["slop"]) 361 | upstream = int(config["General"]["upstream"]) 362 | binsize = int(config["General"]["binsize"]) 363 | step = int(config["General"]["step"]) 364 | gene_file = config["Genes"]["gene_file"] 365 | chrom_sizes = config["Genes"]["chrom_sizes"] 366 | 367 | # Load genes 368 | if gene_file: 369 | genes_info = get_gene_info(gene_file) 370 | else: 371 | print("No gene annotation file found, stop!") 372 | sys.exit(1) 373 | 374 | # Define the input numbers of multiprocessing list 375 | inputnum = 512 376 | if inputnum < threads: 377 | inputnum = threads 378 | else: 379 | roundnum = (inputnum // threads) * threads 380 | inputnum = int(max(roundnum, threads*4)) 381 | 382 | # Load chromosome sizes 383 | chrlens = misc.get_chrom_sizes(chrom_sizes) 384 | 385 | # Define features information 386 | feature_map = {"ocfiles":"OCscores", "ocpeaks":"OCpeaks", "ptmfiles":"PTM", 387 | "motifs":"motifs", "cnss":"CNS", "genopheno":"genopheno", 388 | "phenodata":"phenoscores"} 389 | for item in config["Features"]: 390 | feature_files = config["Features"][item] 391 | if not feature_files: 392 | continue 393 | filelist = feature_files.split(",") 394 | count = 1 395 | for file in filelist: 396 | feature_infos = [] 397 | num = 1 398 | for gene in genes_info: 399 | chrom = genes_info[gene][0] 400 | start = genes_info[gene][1] 401 | end = genes_info[gene][2] 402 | strand = genes_info[gene][3] 403 | feature_info = Features_info() 404 | feature_info.workdir = workdir 405 | feature_info.slop = slop 406 | feature_info.config = config 407 | feature_info.idx = num 408 | feature_info.geneinfo = Geneinfo() 409 | feature_info.geneinfo.gene = gene 410 | feature_info.geneinfo.chrom = chrom 411 | feature_info.geneinfo.strand = strand 412 | if strand == "+": 413 | feature_info.geneinfo.start = start - upstream 414 | feature_info.geneinfo.end = start - 1 415 | else: 416 | feature_info.geneinfo.start = end 417 | feature_info.geneinfo.end = end + upstream - 1 418 | 
feature_info.geneinfo.binsize = binsize 419 | feature_info.geneinfo.step = step 420 | num += 1 421 | # Output analyzed gene regions 422 | generate_regions(feature_info.geneinfo, workdir, gene, chrlens) 423 | feature_info.feature = file 424 | if len(filelist) > 1: 425 | outname = feature_map[item] + "_" + str(count) 426 | else: 427 | outname = feature_map[item] 428 | feature_info.outname = outname 429 | feature_info.chrlens = chrlens 430 | feature_infos.append(feature_info) 431 | count += 1 432 | # Generate features file 433 | time_st = time() 434 | file_suffix = file.split(".")[-1].lower() 435 | filesize = os.path.getsize(file) 436 | if file_suffix in ["bed", "bedgraph", "txt"] and filesize > 1e8: 437 | results = generate_features_from_large(file, genes_info, upstream, slop, 438 | workdir, outname) 439 | else: 440 | # Multiprocessing 441 | results = generate_features(feature_infos[0]) 442 | time_ed = time() 443 | time_elapse = round(time_ed - time_st) 444 | print("Generate %s features files finished.\nUsing %ss" % (outname, time_elapse)) 445 | 446 | # Perform analysis 447 | time_st = time() 448 | result = run_analysis(feature_infos[0]) 449 | if result[1]: 450 | time_total = round(time() - time_st, 2) 451 | print("\nGene analysis finished using %ss. %s\n" % (time_total, " "*30)) 452 | 453 | print("All the processes completed.", " "*10) 454 | 455 | 456 | 457 | if __name__ == '__main__': 458 | 459 | try: 460 | main() 461 | 462 | except KeyboardInterrupt: 463 | sys.stderr.write("User interrupt\n") 464 | sys.exit(0) 465 | 466 | -------------------------------------------------------------------------------- /batch.py: -------------------------------------------------------------------------------- 1 | #! 
def get_gene_info(gff_file):
    """
    Load gene positions from a GFF/GFF3/GTF annotation file (Batch mode).

    "gene" rows take the gene name from the value of the first attribute,
    "transcript" rows from the first double-quoted value. CDS rows extend
    the coding span of the gene or transcript seen last.

    Mandatory parameters:
    1. gff_file - Annotation file ending in .gtf, .gff or .gff3

    Returns a dict {gene: [chrom, start, end, strand, dist1, dist2]} where
    dist1/dist2 are the distances between the gene ends and the coding span
    (0, 0 when the gene has no CDS or the CDS exceeds the gene). Exits with
    status 1 when the file suffix is not supported.
    """

    suffix = gff_file.split(".")[-1].lower()

    if suffix not in ["gtf", "gff", "gff3"]:
        print("Input gene annotataion file is not in GFF/GFF3 format.\nPlease check the file.")
        sys.exit(1)

    genes_info = {}
    cds_info = {}
    # Gene or transcript the following CDS rows belong to
    gene_name = None
    with open(gff_file) as infile:
        for line in infile:
            if line.startswith("#") or not line.strip():
                continue
            info = line.rstrip().split("\t")
            if len(info) < 9:
                # Not a complete 9-column annotation row
                continue
            chrom = info[0]
            category = info[2].lower()
            start = int(info[3])
            end = int(info[4])
            strand = info[6]
            desc = info[8].split(";")
            if category == "gene":
                gene_name = desc[0].split("=")[-1]
                genes_info[gene_name] = [chrom, start, end, strand]
            elif category == "transcript":
                gene_name = desc[0].split("\"")[1]
                genes_info[gene_name] = [chrom, start, end, strand]
            elif category == "cds":
                if gene_name is None:
                    # CDS listed before any gene/transcript: there is
                    # nothing to attach it to (used to raise an error)
                    continue
                if gene_name in cds_info:
                    cds_info[gene_name] = [min(cds_info[gene_name][0], start),
                                           max(cds_info[gene_name][1], end)]
                else:
                    cds_info[gene_name] = [start, end]

    for gene in genes_info:
        dists = [0, 0]
        if gene in cds_info:
            s, e = cds_info[gene]
            dist1 = s - genes_info[gene][1]
            dist2 = genes_info[gene][2] - e
            if dist1 >= 0 and dist2 >= 0:
                dists = [dist1, dist2]
        genes_info[gene].extend(dists)

    print("%s genes found in the annotation file.\n" % len(genes_info))

    return genes_info
generate_features_from_large(inputfile, genes_info, upstream, slop, workdir, feature): 136 | 137 | basemap = {} 138 | existed = set() 139 | num = 0 140 | genelens = len(genes_info) 141 | for gene in genes_info: 142 | outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph") 143 | if os.path.exists(outfile): 144 | filesize = os.path.getsize(outfile) 145 | if filesize > 10: 146 | existed.add(gene) 147 | chrom, start, end, strand = genes_info[gene][:4] 148 | if chrom not in basemap: 149 | basemap[chrom] = {} 150 | if strand == "+": 151 | for i in range(max(0, start-upstream-slop), start+slop+1): 152 | if i in basemap[chrom]: 153 | basemap[chrom][i].append(gene) 154 | else: 155 | basemap[chrom][i] = [gene] 156 | else: 157 | for i in range(end-slop, end+upstream+slop+1): 158 | if i in basemap[chrom]: 159 | basemap[chrom][i].append(gene) 160 | else: 161 | basemap[chrom][i] = [gene] 162 | print("%s / %s genes processed, %s existed genes." % (num, genelens, len(existed)), 163 | end="\r") 164 | num += 1 165 | print("Load genes completed.", " "*30) 166 | genenums = [] 167 | total_num = max(1, genelens-len(existed)) 168 | outf = {} 169 | split = 500 170 | kept = split * 0.9 171 | tmp_cnt = 0 172 | tmp_mod = 0 173 | num = 0 174 | with open(inputfile) as infile: 175 | for line in infile: 176 | chrom, start, end = line.rstrip().split("\t")[:3] 177 | if chrom not in basemap: 178 | continue 179 | if feature == "CNS": 180 | s = int(start) 181 | else: 182 | s = int((int(start) + int(end)) / 2) 183 | if s in basemap[chrom]: 184 | genes = basemap[chrom][s] 185 | for gene in genes: 186 | if gene in existed: 187 | continue 188 | else: 189 | outfile = os.path.join(workdir, gene, feature+"_raw.bedGraph") 190 | if gene not in outf: 191 | outf[gene] = open(outfile, "w") 192 | # try: 193 | # outf[gene] = open(outfile, "w") 194 | # except: 195 | # opened = len(outf) 196 | # print("Processing %s, %s genes opened." 
% (gene, opened)) 197 | # outf[gene] = open(outfile, "w") 198 | print(line.rstrip(), file=outf[gene]) 199 | if gene not in genenums: 200 | genenums.append(gene) 201 | cnt = len(genenums) 202 | remain = cnt % split 203 | mod = cnt // split 204 | if mod - tmp_mod > 0: 205 | st = max(0, int(split * (mod - 1) - kept - 1)) 206 | ed = int(split * mod - kept) 207 | # print("#"*100+"\n", tmp_cnt, cnt, tmp_mod, mod, st, ed, genes, sep=", ") 208 | for j in genenums[st:ed]: 209 | outf[j].close() 210 | if tmp_cnt != cnt: 211 | pct = round(cnt * 100 / total_num, 2) 212 | print(pct, "%", " output.", end="\r") 213 | tmp_cnt = cnt 214 | tmp_mod = mod 215 | print("All files output.") 216 | 217 | for gene in outf: 218 | outf[gene].close() 219 | 220 | return cnt 221 | 222 | 223 | def run_analysis(feature_info): 224 | 225 | workdir = feature_info.workdir 226 | geneinfo = feature_info.geneinfo 227 | gene = feature_info.geneinfo.gene 228 | 229 | # Check if calculated 230 | # check = os.path.join(workdir, gene, "key_regions_merged.bed") 231 | # if os.path.exists(check): 232 | # return (gene, 0) 233 | check = os.path.join(workdir, gene, "aggregate.bedGraph") 234 | if os.path.exists(check): 235 | filesize = os.path.getsize(check) 236 | if filesize > 10: 237 | return (gene, 0) 238 | 239 | # Open chromatin 240 | ocscores = glob(os.path.join(workdir, gene, "OCscores*_raw.bedGraph")) 241 | ocpeaks = glob(os.path.join(workdir, gene, "OCpeaks*_raw.bed")) 242 | # Calculate scores 243 | ocscorelist = [] 244 | for idx, ocscorefile in enumerate(ocscores): 245 | if idx + 1 > len(ocpeaks): 246 | ocpeakfile = "" 247 | else: 248 | ocpeakfile = ocpeaks[idx] 249 | if len(ocscores) > 1: 250 | ocname = os.path.basename(ocscorefile).split("_raw")[0] 251 | else: 252 | ocname = "OCscores" 253 | scores_oc1 = openchromatin_scores(geneinfo, ocscorefile, ocpeakfile, 254 | samplename = ocname, outdir = workdir) 255 | ocscorelist.append(scores_oc1) 256 | if len(ocscores) > 1: 257 | scores_oc = merge_reps(geneinfo, 
ocscorelist, samplename = "OCscores", outdir = workdir) 258 | else: 259 | scores_oc = scores_oc1 260 | 261 | # Histone modification 262 | ptmfiles = glob(os.path.join(workdir, gene, "PTM*_raw.bedGraph")) 263 | # Calculate scores 264 | ptmscorelist = [] 265 | for ptmscorefile in ptmfiles: 266 | if len(ptmfiles) > 1: 267 | ptmname = os.path.basename(ptmscorefile).split("_raw")[0] 268 | else: 269 | ptmname = "PTMscores" 270 | scores_ptm1 = ptm_scores(geneinfo, ptmscorefile, ocname="OCscores", 271 | samplename = ptmname, outdir = workdir) 272 | ptmscorelist.append(scores_ptm1) 273 | if len(ptmfiles) > 1: 274 | scores_ptm = merge_reps(geneinfo, ptmscorelist, samplename = "PTMscores", outdir = workdir) 275 | else: 276 | scores_ptm = scores_ptm1 277 | 278 | # TF motifs 279 | motiffile = os.path.join(workdir, gene, "motifs_raw.bedGraph") 280 | # Calculate scores 281 | scores_motif = motif_scores(geneinfo, motiffile, outdir = workdir) 282 | 283 | # Conserved sequences 284 | cnsfile = os.path.join(workdir, gene, "CNS_raw.bedGraph") 285 | # Calculate scores 286 | scores_cns = cns_scores(geneinfo, cnsfile, outdir = workdir) 287 | 288 | # Genotype versus Phenotype (MBKbase) 289 | genopheno = os.path.join(workdir, gene, "genopheno_raw.bedGraph") 290 | # Calculate scores 291 | scores_genopheno = genopheno_scores(geneinfo, genopheno, outdir = workdir) 292 | 293 | # Aggregate scores 294 | if scores_genopheno: 295 | scorelist = [scores_oc, scores_motif, scores_cns, scores_ptm, scores_genopheno] 296 | weightlist = [0.25, 0.2, 0.3, 0.1, 0.05] 297 | else: 298 | scorelist = [scores_oc, scores_motif, scores_cns, scores_ptm] 299 | weightlist = [0.25, 0.2, 0.3, 0.1] 300 | scores_aggregate = aggregate_scores(geneinfo, scorelist, weightlist, outdir = workdir) 301 | 302 | # Load phenodata from CRISPR-edited results 303 | phenodata = os.path.join(workdir, gene, "phenoscores_raw.bedGraph") 304 | # Calculate scores 305 | if os.path.exists(phenodata): 306 | scores_phenodata = 
phenodata_scores(geneinfo, phenodata, method = "kmeans2", 307 | outdir = workdir) 308 | else: 309 | scores_phenodata = {} 310 | 311 | # Find the feature importance 312 | if scores_phenodata: 313 | namelist = ["DHS", "H3K27ac", "TF motif", "CNS", "GenoPheno", "Aggregate"] 314 | misc.calc_importance(phenodata, scorelist+[scores_aggregate], 315 | namelist, geneinfo, side="both", outdir = workdir) 316 | 317 | # Define key regions 318 | key_regions = define_key_regions(geneinfo, scores_aggregate, phenodata, 319 | outdir = workdir) 320 | 321 | # Get the core of key regions 322 | scorefile = os.path.join(workdir, gene, "aggregate.bedGraph") 323 | regionfile = os.path.join(workdir, gene, "key_regions_merged.bed") 324 | core_regions = output_cores(geneinfo, scorefile, regionfile) 325 | 326 | cleanup() 327 | 328 | return (gene, 1) 329 | 330 | 331 | def check_options(config): 332 | 333 | print("# Using the following options:") 334 | if config["General"]["workdir"]: 335 | config["General"]["workdir"] = os.path.abspath(config["General"]["workdir"]) 336 | else: 337 | config["General"]["workdir"] = "results" 338 | misc.check_outdir(config["General"]["workdir"]) 339 | for section in config.sections(): 340 | for param in config.options(section): 341 | values = config[section][param] 342 | if section == "Features": 343 | if "," in values: 344 | values = values.split(",") 345 | for file in values: 346 | if file and not os.path.exists(file): 347 | print("# Error, cannot find the %s: %s" % (param, file)) 348 | sys.exit(1) 349 | else: 350 | file = values 351 | if file and not os.path.exists(file): 352 | print("# Error, cannot find the %s: %s" % (param, file)) 353 | sys.exit(1) 354 | print("%s: %s" % (param, values)) 355 | if int(config["General"]["threads"]) > os.cpu_count(): 356 | config["General"]["threads"] = os.cpu_count() 357 | if int(config["General"]["slop"]) > 5e4: 358 | config["General"]["slop"] = 5e4 359 | if int(config["General"]["upstream"]) > 1e4: 360 | 
config["General"]["upstream"] = 1e4 361 | if int(config["General"]["binsize"]) > int(config["General"]["upstream"]) / 2: 362 | config["General"]["binsize"] = int(config["General"]["upstream"]) / 2 363 | if int(config["General"]["step"]) > int(config["General"]["binsize"]): 364 | config["General"]["step"] = int(config["General"]["binsize"]) 365 | if config["Genes"]["gff_file"]: 366 | print("\n# Using Batch mode.\n") 367 | 368 | return config 369 | 370 | 371 | def main(): 372 | 373 | # Load configs 374 | config = configparser.ConfigParser() 375 | if len(sys.argv) == 1: 376 | config_file = "config.ini" 377 | elif len(sys.argv) == 2: 378 | config_file = sys.argv[1] 379 | else: 380 | print("Usage:\n python batch.py [configfile]\n") 381 | sys.exit(1) 382 | config.read(config_file) 383 | 384 | config = check_options(config) 385 | workdir = config["General"]["workdir"] 386 | threads = int(config["General"]["threads"]) 387 | slop = int(config["General"]["slop"]) 388 | upstream = int(config["General"]["upstream"]) 389 | binsize = int(config["General"]["binsize"]) 390 | step = int(config["General"]["step"]) 391 | withutr = int(config["General"]["withutr"]) 392 | gff_file = config["Genes"]["gff_file"] 393 | chrom_sizes = config["Genes"]["chrom_sizes"] 394 | 395 | # Load genes 396 | if gff_file: 397 | genes_info = get_gene_info(gff_file) 398 | total_genes = len(genes_info) 399 | else: 400 | print("No genome annotation file found, stop!") 401 | sys.exit(1) 402 | 403 | # Define the input numbers of multiprocessing list 404 | inputnum = 512 405 | if inputnum < threads: 406 | inputnum = threads 407 | else: 408 | roundnum = (inputnum // threads) * threads 409 | inputnum = int(max(roundnum, threads*4)) 410 | 411 | # Load chromosome sizes 412 | chrlens = misc.get_chrom_sizes(chrom_sizes) 413 | 414 | # Define features information 415 | feature_map = {"ocfiles":"OCscores", "ocpeaks":"OCpeaks", "ptmfiles":"PTM", 416 | "motifs":"motifs", "cnss":"CNS", "genopheno":"genopheno", 417 | 
"phenodata":"phenoscores"} 418 | for item in config["Features"]: 419 | feature_files = config["Features"][item] 420 | if not feature_files: 421 | continue 422 | filelist = feature_files.split(",") 423 | count = 1 424 | for file in filelist: 425 | feature_infos = [] 426 | num = 1 427 | for gene in genes_info: 428 | chrom = genes_info[gene][0] 429 | start = genes_info[gene][1] 430 | end = genes_info[gene][2] 431 | strand = genes_info[gene][3] 432 | utrst = genes_info[gene][4] 433 | utred = genes_info[gene][5] 434 | feature_info = Features_info() 435 | feature_info.workdir = workdir 436 | feature_info.slop = slop 437 | feature_info.config = config 438 | feature_info.idx = num 439 | feature_info.geneinfo = Geneinfo() 440 | feature_info.geneinfo.gene = gene 441 | feature_info.geneinfo.chrom = chrom 442 | feature_info.geneinfo.strand = strand 443 | if strand == "+": 444 | feature_info.geneinfo.start = start - upstream 445 | if withutr: 446 | feature_info.geneinfo.end = start + utrst 447 | else: 448 | feature_info.geneinfo.end = start 449 | else: 450 | if withutr: 451 | feature_info.geneinfo.start = end - utred 452 | else: 453 | feature_info.geneinfo.start = end 454 | feature_info.geneinfo.end = end + upstream 455 | feature_info.geneinfo.binsize = binsize 456 | feature_info.geneinfo.step = step 457 | num += 1 458 | # Output analyzed gene regions 459 | generate_regions(feature_info.geneinfo, workdir, gene, chrlens) 460 | feature_info.feature = file 461 | if len(filelist) > 1: 462 | outname = feature_map[item] + "_" + str(count) 463 | else: 464 | outname = feature_map[item] 465 | feature_info.outname = outname 466 | feature_info.chrlens = chrlens 467 | feature_infos.append(feature_info) 468 | count += 1 469 | # continue # Skip generate features raw data 470 | # Generate features file 471 | time_st = time() 472 | file_suffix = file.split(".")[-1].lower() 473 | filesize = os.path.getsize(file) 474 | if file_suffix in ["bed", "bedgraph", "txt"] and filesize > 1e8: 475 | 
results = generate_features_from_large(file, genes_info, upstream, slop, 476 | workdir, outname) 477 | else: 478 | # Multiprocessing 479 | for i in range(0, total_genes, inputnum): 480 | # Set Pool size 481 | pool = Pool(threads) 482 | if i + inputnum < total_genes: 483 | inputlist = feature_infos[i:i+inputnum] 484 | else: 485 | inputlist = feature_infos[i:] 486 | results = pool.map(generate_features, inputlist) 487 | pool.close() 488 | pool.join() 489 | print("Round %s finished." % round(i/inputnum)) 490 | time_ed = time() 491 | time_elapse = round(time_ed - time_st) 492 | print("Generate %s features files finished.\nUsing %ss" % (outname, time_elapse)) 493 | 494 | # Perform analysis 495 | time_st = time() 496 | cnt = 1 497 | new = 0 498 | for i in range(0, total_genes, inputnum): 499 | # Set Pool size 500 | pool = Pool(threads) 501 | if i + inputnum < total_genes: 502 | inputlist = feature_infos[i:i+inputnum] 503 | else: 504 | inputlist = feature_infos[i:] 505 | for result in pool.imap_unordered(run_analysis, inputlist): 506 | if result[1]: 507 | new += 1 508 | time_ed = time() 509 | if new: 510 | speed = round((time_ed - time_st) / new, 2) 511 | else: 512 | time_st = time() 513 | speed = 0.0 514 | print("%s / %s Gene (%s) analyzed (speed %s s)." % (cnt, total_genes, result[0], speed)) 515 | cnt += 1 516 | pool.close() 517 | pool.join() 518 | print("Round %s finished." % round(i/inputnum)) 519 | time_total = round(time() - time_st, 2) 520 | print("\nAll the genes analysis finished using %ss. 
%s\n" % (time_total, " "*30)) 521 | 522 | print("All the processes completed.", " "*10) 523 | 524 | 525 | 526 | if __name__ == '__main__': 527 | 528 | try: 529 | main() 530 | 531 | except KeyboardInterrupt: 532 | sys.stderr.write("User interrupt\n") 533 | sys.exit(0) 534 | 535 | -------------------------------------------------------------------------------- /test/single/data/genes_motifs_JASPAR_test.bed: -------------------------------------------------------------------------------- 1 | Chr1 4001197 4001217 AT2G28810 . - 2.22e-07 0.00793 2 | Chr1 4001197 4001217 AT5G02460 . + 1.71e-07 0.0044 3 | Chr1 4001198 4001218 OBP1 . - 3.96e-07 0.00957 4 | Chr1 4001236 4001248 O2 . - 9.03e-06 0.301 5 | Chr1 4001296 4001305 ERF6 . + 9.61e-06 0.159 6 | Chr1 4001395 4001406 bZIP42 . - 7.49e-06 0.417 7 | Chr1 4002087 4002097 bHLH80 . + 2.49e-07 0.183 8 | Chr1 4002208 4002218 LEC2 . + 4.49e-06 0.403 9 | Chr1 4002257 4002275 AT3G45610 . - 5.07e-07 0.0322 10 | Chr1 4002257 4002277 COG1 . + 4.99e-07 0.0211 11 | Chr1 4002260 4002280 Adof1 . + 8.69e-08 0.00468 12 | Chr1 4002264 4002282 AT3G45610 . - 1.84e-06 0.0522 13 | Chr1 4002265 4002275 AT3G52440 . + 2.77e-06 0.412 14 | Chr1 4002265 4002275 DAG2 . + 4.51e-07 0.244 15 | Chr1 4002265 4002278 OBP4 . + 2.42e-06 0.132 16 | Chr1 4002301 4002315 AT3G46070 . + 2.75e-06 0.305 17 | Chr1 4002446 4002474 AT5G66940 . - 6.68e-10 0.000144 18 | Chr1 4002447 4002467 Adof1 . + 4.23e-08 0.0033 19 | Chr1 4002448 4002466 dof4.2 . + 2.98e-07 0.0263 20 | Chr1 4002448 4002468 OBP3 . - 2.29e-07 0.0043 21 | Chr1 4002448 4002474 AT1G69570 . - 7.81e-11 8.63e-05 22 | Chr1 4002449 4002469 OBP3 . - 3.85e-07 0.00558 23 | Chr1 4002450 4002464 IDD2 . + 4.71e-06 0.238 24 | Chr1 4002450 4002469 AT1G14580 . - 7.57e-06 0.243 25 | Chr1 4002450 4002470 OBP3 . - 8.67e-08 0.0026 26 | Chr1 4002451 4002471 OBP3 . - 1.48e-07 0.00347 27 | Chr1 4002452 4002472 AT2G28810 . - 1.09e-09 0.000736 28 | Chr1 4002452 4002472 OBP3 . 
- 3.65e-10 0.000258 29 | Chr1 4002452 4002472 AT5G02460 . + 2.32e-10 0.000203 30 | Chr1 4002452 4002480 AT5G66940 . - 1.64e-06 0.0126 31 | Chr1 4002453 4002473 OBP1 . - 2.45e-09 0.0007 32 | Chr1 4002453 4002481 AT5G66940 . - 8.62e-07 0.00878 33 | Chr1 4002454 4002472 AT3G45610 . - 2.38e-06 0.0569 34 | Chr1 4002454 4002474 FLC . + 9.09e-06 0.238 35 | Chr1 4002454 4002474 COG1 . + 2.91e-08 0.00648 36 | Chr1 4002454 4002482 AT5G66940 . - 1.31e-06 0.0111 37 | Chr1 4002455 4002483 AT5G66940 . - 3.34e-07 0.00507 38 | Chr1 4002456 4002482 AT1G69570 . - 7.97e-11 8.63e-05 39 | Chr1 4002456 4002484 AT5G66940 . - 2.41e-13 4.58e-06 40 | Chr1 4002457 4002477 Adof1 . + 1.76e-08 0.00217 41 | Chr1 4002458 4002476 dof4.2 . + 7.77e-07 0.0374 42 | Chr1 4002458 4002478 OBP3 . - 2.98e-07 0.0049 43 | Chr1 4002459 4002472 PI . + 3.44e-06 0.121 44 | Chr1 4002459 4002479 OBP3 . - 4.05e-08 0.00179 45 | Chr1 4002460 4002474 IDD2 . + 4.71e-06 0.238 46 | Chr1 4002460 4002479 AT1G14580 . - 8.53e-06 0.243 47 | Chr1 4002460 4002480 OBP3 . - 3.01e-11 0.000105 48 | Chr1 4002462 4002480 AT3G45610 . - 1.37e-07 0.0213 49 | Chr1 4002462 4002482 COG1 . + 1.7e-08 0.00523 50 | Chr1 4002462 4002482 AT2G28810 . - 5.06e-10 0.000736 51 | Chr1 4002462 4002482 AT5G02460 . + 4.9e-11 0.000151 52 | Chr1 4002463 4002483 OBP1 . - 6.9e-10 0.000466 53 | Chr1 4002464 4002492 AT5G66940 . - 3.37e-10 0.000104 54 | Chr1 4002465 4002485 OBP3 . - 1.5e-07 0.00349 55 | Chr1 4002465 4002485 Adof1 . + 2.53e-09 0.000917 56 | Chr1 4002466 4002484 dof4.2 . + 3.04e-07 0.0263 57 | Chr1 4002466 4002486 OBP3 . - 2.03e-07 0.00408 58 | Chr1 4002466 4002492 AT1G69570 . - 8.94e-08 0.00258 59 | Chr1 4002467 4002480 PI . + 3.73e-06 0.121 60 | Chr1 4002467 4002487 OBP3 . - 3.69e-07 0.00546 61 | Chr1 4002468 4002488 OBP3 . - 4.62e-07 0.00611 62 | Chr1 4002469 4002489 OBP3 . - 6.01e-07 0.00699 63 | Chr1 4002470 4002490 AT2G28810 . - 6.95e-08 0.00437 64 | Chr1 4002470 4002490 OBP3 . - 3.72e-08 0.0017 65 | Chr1 4002470 4002490 AT5G02460 . 
+ 2.51e-08 0.00146 66 | Chr1 4002471 4002491 OBP1 . - 5.85e-08 0.00332 67 | Chr1 4002472 4002490 AT3G45610 . - 9.86e-06 0.0968 68 | Chr1 4002574 4002588 ATHB34 . - 4.56e-06 0.167 69 | Chr1 4002662 4002675 OBP4 . - 1.92e-06 0.121 70 | Chr1 4002663 4002683 COG1 . - 3.89e-06 0.0492 71 | Chr1 4002665 4002675 AT3G52440 . - 3.89e-06 0.413 72 | Chr1 4002665 4002675 DAG2 . - 1.35e-06 0.244 73 | Chr1 4002665 4002683 AT3G45610 . + 6.95e-06 0.0849 74 | Chr1 4003265 4003285 COG1 . - 6.22e-06 0.0592 75 | Chr1 4003265 4003291 AT1G69570 . + 6.17e-07 0.00778 76 | Chr1 4003267 4003285 AT3G45610 . + 9.86e-06 0.0968 77 | Chr1 4003273 4003293 Adof1 . - 5.11e-07 0.0113 78 | Chr1 4003360 4003380 ATHB40 . + 3.79e-06 0.371 79 | Chr1 4003363 4003373 ATHB53 . + 3.89e-06 0.497 80 | Chr1 4003363 4003373 ATHB20 . + 3.27e-06 0.401 81 | Chr1 4003363 4003373 ATHB13 . + 1.82e-06 0.435 82 | Chr1 4003366 4003375 ATHB23 . + 7.3e-06 0.138 83 | Chr1 4003367 4003379 ZHD1 . + 8.36e-06 0.148 84 | Chr1 4003368 4003377 ATHB23 . - 2.43e-06 0.0816 85 | Chr1 4003439 4003451 ZHD1 . + 5.09e-06 0.145 86 | Chr1 4003440 4003452 ZHD1 . - 2.47e-06 0.124 87 | Chr1 4003442 4003451 ATHB23 . + 2.43e-06 0.0816 88 | Chr1 4003442 4003456 ATHB34 . + 1.68e-07 0.0503 89 | Chr1 4003443 4003455 ZHD1 . + 4.49e-07 0.0757 90 | Chr1 4003443 4003457 ATHB34 . - 1.68e-07 0.0503 91 | Chr1 4003444 4003453 ATHB23 . - 2.43e-06 0.0816 92 | Chr1 4003444 4003456 ZHD1 . - 4.49e-07 0.0757 93 | Chr1 4003446 4003455 ATHB23 . + 2.43e-06 0.0816 94 | Chr1 4003446 4003460 ATHB34 . + 1.68e-07 0.0503 95 | Chr1 4003447 4003459 ZHD1 . + 4.49e-07 0.0757 96 | Chr1 4003447 4003461 ATHB34 . - 5.3e-07 0.0728 97 | Chr1 4003447 4003461 ZHD6 . - 5.02e-06 0.19 98 | Chr1 4003448 4003457 ATHB23 . - 2.43e-06 0.0816 99 | Chr1 4003448 4003460 ZHD1 . - 4.49e-07 0.0757 100 | Chr1 4003450 4003459 ATHB23 . + 2.43e-06 0.0816 101 | Chr1 4003452 4003461 ATHB23 . - 7.3e-06 0.138 102 | Chr1 4003579 4003599 OBP3 . + 1.77e-07 0.0038 103 | Chr1 4003579 4003599 Adof1 . 
- 2.54e-08 0.00256 104 | Chr1 4003580 4003598 dof4.2 . - 7.74e-06 0.0818 105 | Chr1 4003582 4003608 AT1G69570 . + 5.08e-07 0.00696 106 | Chr1 4003584 4003604 OBP3 . + 1.72e-07 0.00374 107 | Chr1 4003600 4003628 AT5G66940 . + 1.12e-06 0.0102 108 | Chr1 4003602 4003622 AT5G02460 . - 1.49e-06 0.0153 109 | Chr1 4003607 4003627 Adof1 . - 5.57e-07 0.0118 110 | Chr1 4003610 4003630 COG1 . - 7.99e-06 0.0649 111 | Chr1 4003664 4003684 RAP212 . + 3.07e-08 0.000483 112 | Chr1 4003664 4003684 ERF9 . + 1.45e-08 0.000157 113 | Chr1 4003665 4003685 LEP . - 4.68e-09 7.99e-05 114 | Chr1 4003667 4003687 RAP212 . + 5.88e-08 0.00073 115 | Chr1 4003667 4003687 ERF9 . + 5.02e-09 7.55e-05 116 | Chr1 4003668 4003679 CBF1 . - 2.11e-06 0.413 117 | Chr1 4003668 4003686 ABR1 . + 7.15e-08 0.00108 118 | Chr1 4003668 4003688 LEP . - 1.78e-09 4.03e-05 119 | Chr1 4003669 4003681 RAP21 . + 2.92e-06 0.0754 120 | Chr1 4003669 4003683 AT4G16750 . - 1.84e-07 0.0115 121 | Chr1 4003670 4003684 AT5G67000 . + 9.22e-07 0.0127 122 | Chr1 4003670 4003684 CEJ1 . - 8.85e-07 0.015 123 | Chr1 4003670 4003684 AT1G44830 . - 4.22e-07 0.00624 124 | Chr1 4003670 4003684 AT1G75490 . - 3.01e-07 0.003 125 | Chr1 4003670 4003688 ESE3 . + 4.53e-08 0.000528 126 | Chr1 4003670 4003690 RAP212 . + 1.98e-07 0.00165 127 | Chr1 4003671 4003684 AT1G36060 . + 8.82e-06 0.121 128 | Chr1 4003671 4003685 AT5G18450 . + 4.52e-07 0.00453 129 | Chr1 4003671 4003689 ERF104 . - 4.57e-09 6.88e-05 130 | Chr1 4003671 4003690 DREB26 . - 7.25e-07 0.0108 131 | Chr1 4003671 4003690 AT4G28140 . + 6.38e-09 0.00014 132 | Chr1 4003672 4003686 AT4G16750 . - 1.06e-06 0.0159 133 | Chr1 4003672 4003686 ERF15 . - 1.88e-07 0.00152 134 | Chr1 4003672 4003686 ERF105 . + 1.04e-08 0.000197 135 | Chr1 4003673 4003683 ERF118 . - 7.65e-08 0.000844 136 | Chr1 4003673 4003684 CRF4 . - 3.82e-08 0.000617 137 | Chr1 4003673 4003687 AT1G44830 . - 5.84e-07 0.00709 138 | Chr1 4003673 4003687 RAP211 . - 2.18e-07 0.00205 139 | Chr1 4003673 4003687 AT1G75490 . 
- 1.54e-07 0.00218 140 | Chr1 4003673 4003687 RAP26 . + 3.11e-08 0.000521 141 | Chr1 4003673 4003687 PUCHI . - 1.64e-08 0.000302 142 | Chr1 4003673 4003687 ERF087 . + 1.23e-08 0.000228 143 | Chr1 4003673 4003687 ERF5 . - 4.47e-09 0.000101 144 | Chr1 4003673 4003687 ESE1 . + 2.71e-09 6.43e-05 145 | Chr1 4003673 4003693 ERF2 . - 1.84e-09 3.48e-05 146 | Chr1 4003674 4003684 AT3G57600 . - 2.83e-07 0.00301 147 | Chr1 4003674 4003692 ERF104 . - 6.75e-10 1.67e-05 148 | Chr1 4003674 4003694 ERF10 . - 5.11e-09 7.85e-05 149 | Chr1 4003675 4003685 ERF3 . - 7.65e-08 0.000947 150 | Chr1 4003675 4003685 AT2G33710 . - 7.65e-08 0.000947 151 | Chr1 4003675 4003689 AT4G16750 . - 1.31e-06 0.0173 152 | Chr1 4003675 4003689 ERF105 . + 4.06e-08 0.000499 153 | Chr1 4003675 4003689 ERF15 . - 5.85e-09 0.000126 154 | Chr1 4003675 4003691 AT4G18450 . + 5.98e-09 0.000101 155 | Chr1 4003676 4003686 ERF118 . - 7.65e-08 0.000844 156 | Chr1 4003676 4003690 RAP211 . - 3.47e-07 0.00286 157 | Chr1 4003697 4003711 ERF021 . - 5.97e-06 0.132 158 | Chr1 4003697 4003717 RAP212 . + 1.98e-07 0.00165 159 | Chr1 4003698 4003711 AT1G36060 . + 4.95e-06 0.113 160 | Chr1 4003698 4003718 LEP . - 9.25e-09 0.000126 161 | Chr1 4003699 4003711 RAP21 . + 3.24e-06 0.0756 162 | Chr1 4003699 4003713 AT4G16750 . - 8.99e-08 0.0115 163 | Chr1 4003700 4003714 AT5G67000 . + 9.22e-07 0.0127 164 | Chr1 4003700 4003714 CEJ1 . - 8.85e-07 0.015 165 | Chr1 4003700 4003714 AT1G44830 . - 4.22e-07 0.00624 166 | Chr1 4003700 4003714 AT1G75490 . - 3.01e-07 0.003 167 | Chr1 4003700 4003718 ESE3 . + 8.12e-09 0.000172 168 | Chr1 4003700 4003720 RAP212 . + 7.38e-09 0.00019 169 | Chr1 4003700 4003720 ERF2 . - 2.75e-09 4.69e-05 170 | Chr1 4003700 4003720 ERF9 . + 3.89e-10 1.26e-05 171 | Chr1 4003701 4003714 AT1G36060 . + 8.82e-06 0.121 172 | Chr1 4003701 4003715 AT5G18450 . + 4.52e-07 0.00453 173 | Chr1 4003701 4003719 ABR1 . + 5.63e-09 0.000235 174 | Chr1 4003701 4003719 ERF104 . - 2.99e-10 9.2e-06 175 | Chr1 4003701 4003720 DREB26 . 
- 1.51e-07 0.00495 176 | Chr1 4003701 4003720 AT4G28140 . + 4.85e-10 2.32e-05 177 | Chr1 4003701 4003721 ERF10 . - 6.61e-09 9.44e-05 178 | Chr1 4003701 4003721 LEP . - 1.04e-09 2.74e-05 179 | Chr1 4003702 4003716 AT4G16750 . - 1.06e-06 0.0159 180 | Chr1 4003702 4003716 ERF15 . - 1.88e-07 0.00152 181 | Chr1 4003702 4003716 ERF105 . + 1.04e-08 0.000197 182 | Chr1 4003702 4003718 AT4G18450 . + 3.24e-09 6.39e-05 183 | Chr1 4003703 4003713 ERF118 . - 7.65e-08 0.000844 184 | Chr1 4003703 4003714 CRF4 . - 3.82e-08 0.000617 185 | Chr1 4003703 4003717 CEJ1 . - 1.04e-06 0.015 186 | Chr1 4003703 4003717 AT5G67000 . + 2.14e-07 0.00625 187 | Chr1 4003703 4003717 AT1G44830 . - 1.31e-07 0.00309 188 | Chr1 4003703 4003717 RAP211 . - 1.06e-08 0.000349 189 | Chr1 4003703 4003717 RAP26 . + 1.67e-09 6.56e-05 190 | Chr1 4003703 4003717 ERF087 . + 1.02e-09 4.07e-05 191 | Chr1 4003703 4003717 PUCHI . - 7.31e-10 2.93e-05 192 | Chr1 4003703 4003717 AT1G75490 . - 1.24e-08 0.000337 193 | Chr1 4003703 4003717 ESE1 . + 1.98e-10 9.3e-06 194 | Chr1 4003703 4003717 ERF5 . - 1.98e-10 9.29e-06 195 | Chr1 4003703 4003721 ESE3 . + 1.43e-08 0.000248 196 | Chr1 4003703 4003723 RAP212 . + 2.09e-08 0.00037 197 | Chr1 4003703 4003723 ERF2 . - 8.05e-12 6.1e-07 198 | Chr1 4003703 4003723 ERF9 . + 1.56e-09 3.3e-05 199 | Chr1 4003704 4003714 AT3G57600 . - 2.83e-07 0.00301 200 | Chr1 4003704 4003718 AT5G18450 . + 2.56e-08 0.000707 201 | Chr1 4003704 4003718 AT1G22810 . + 2.22e-08 0.000651 202 | Chr1 4003704 4003722 ERF104 . - 7.24e-11 3.51e-06 203 | Chr1 4003704 4003723 AT4G28140 . + 3.3e-10 1.73e-05 204 | Chr1 4003704 4003723 DREB26 . - 3.14e-07 0.00693 205 | Chr1 4003704 4003724 LEP . - 2.37e-08 0.00024 206 | Chr1 4003704 4003724 ERF10 . - 2.74e-11 1.87e-06 207 | Chr1 4003705 4003715 ERF3 . - 7.65e-08 0.000947 208 | Chr1 4003705 4003715 AT2G33710 . - 7.65e-08 0.000947 209 | Chr1 4003705 4003719 AT4G16750 . - 5.16e-07 0.0115 210 | Chr1 4003705 4003719 ERF105 . 
+ 4.38e-10 1.66e-05 211 | Chr1 4003705 4003719 ERF15 . - 1.98e-10 8.39e-06 212 | Chr1 4003705 4003721 AT4G18450 . + 7.11e-10 2.28e-05 213 | Chr1 4003706 4003716 ERF118 . - 7.65e-08 0.000844 214 | Chr1 4003706 4003717 CRF4 . - 3.82e-08 0.000617 215 | Chr1 4003706 4003720 AT1G75490 . - 3.68e-07 0.00332 216 | Chr1 4003706 4003720 RAP26 . + 4.91e-08 0.000723 217 | Chr1 4003706 4003720 PUCHI . - 2.43e-08 0.00038 218 | Chr1 4003706 4003720 ESE1 . + 9.34e-09 0.000168 219 | Chr1 4003706 4003720 RAP211 . - 2.01e-07 0.00205 220 | Chr1 4003706 4003720 ERF087 . + 1.57e-08 0.000268 221 | Chr1 4003706 4003720 ERF5 . - 1.59e-08 0.000252 222 | Chr1 4003706 4003726 ERF2 . - 5.52e-11 2.61e-06 223 | Chr1 4003707 4003717 AT3G57600 . - 2.83e-07 0.00301 224 | Chr1 4003707 4003725 ERF104 . - 2.99e-10 9.2e-06 225 | Chr1 4003707 4003726 DREB26 . - 2.6e-07 0.00644 226 | Chr1 4003707 4003726 AT4G28140 . + 6.29e-10 2.82e-05 227 | Chr1 4003707 4003727 AT1G77640 . + 2.68e-10 0.000394 228 | Chr1 4003707 4003727 ERF10 . - 4.95e-11 2.8e-06 229 | Chr1 4003708 4003718 ERF3 . - 7.65e-08 0.000947 230 | Chr1 4003708 4003718 AT2G33710 . - 7.65e-08 0.000947 231 | Chr1 4003708 4003722 ERF15 . - 2.86e-08 0.00041 232 | Chr1 4003708 4003724 AT4G18450 . + 2.07e-09 4.75e-05 233 | Chr1 4003709 4003719 ERF118 . - 7.65e-08 0.000844 234 | Chr1 4003709 4003723 AT1G75490 . - 4.69e-07 0.00399 235 | Chr1 4003709 4003723 RAP211 . - 4.12e-07 0.00318 236 | Chr1 4003709 4003723 ERF5 . - 3.5e-08 0.000401 237 | Chr1 4003709 4003729 ERF2 . - 5.71e-09 7.86e-05 238 | Chr1 4003710 4003729 DREB26 . - 8.17e-07 0.0115 239 | Chr1 4003710 4003729 AT4G28140 . + 9.14e-09 0.000179 240 | Chr1 4003712 4003732 DREB2 . + 8.63e-06 0.113 241 | Chr1 4003713 4003727 AT1G01250 . + 3.39e-06 0.434 242 | Chr1 4003713 4003727 AT1G22810 . + 1.24e-08 0.000651 243 | Chr1 4003714 4003728 RAP2-1 . - 8.85e-06 0.405 244 | Chr1 4003714 4003728 TINY . - 1.41e-06 0.435 245 | Chr1 4003714 4003728 ERF019 . - 4e-07 0.171 246 | Chr1 4003715 4003729 AT2G44940 . 
+ 1.9e-06 0.468 247 | Chr1 4003716 4003727 ERF015 . - 8.81e-06 0.435 248 | Chr1 4003716 4003729 AT1G36060 . + 7.16e-06 0.118 249 | Chr1 4003717 4003729 RAP21 . + 1.69e-06 0.0581 250 | Chr1 4003789 4003798 PIF3 . + 1.36e-06 0.427 251 | Chr1 4003789 4003800 ABF4 . + 9.55e-06 0.37 252 | Chr1 4003789 4003800 HYH . - 6.67e-06 0.454 253 | Chr1 4003790 4003797 PIF4 . - 9.9e-06 0.439 254 | Chr1 4003988 4004007 DREB26 . + 1.08e-06 0.0135 255 | Chr1 4003994 4004008 AT1G44830 . + 3.45e-08 0.00309 256 | Chr1 4003994 4004008 AT1G75490 . + 1.3e-08 0.000348 257 | Chr1 4003994 4004008 CEJ1 . + 1.25e-07 0.0118 258 | Chr1 4003997 4004007 AT3G57600 . + 9.31e-08 0.00301 259 | Chr1 4003997 4004011 RAP211 . + 2.4e-08 0.000683 260 | Chr1 4003997 4004016 AT4G28140 . - 2.68e-10 1.47e-05 261 | Chr1 4003997 4004016 DREB26 . + 1.57e-08 0.00198 262 | Chr1 4003997 4004017 ERF9 . - 1.79e-08 0.000181 263 | Chr1 4003999 4004013 AT5G18450 . - 2.85e-07 0.0039 264 | Chr1 4003999 4004019 AT1G77640 . - 1.81e-07 0.0145 265 | Chr1 4004003 4004012 ERF6 . - 4.62e-06 0.159 266 | Chr1 4004003 4004017 CEJ1 . + 5.77e-07 0.015 267 | Chr1 4004003 4004017 AT1G44830 . + 1.21e-07 0.00309 268 | Chr1 4004003 4004017 AT1G75490 . + 5.05e-07 0.00422 269 | Chr1 4004003 4004023 DREB2 . - 1.59e-06 0.0611 270 | Chr1 4004006 4004018 RAP21 . - 1.88e-06 0.0606 271 | Chr1 4004006 4004019 AT1G36060 . - 7.55e-06 0.12 272 | Chr1 4004007 4004021 ERF019 . + 7.8e-06 0.216 273 | Chr1 4004158 4004178 AT1G77640 . - 4.17e-06 0.0683 274 | Chr1 4004164 4004175 ERF4 . + 3.71e-06 0.0782 275 | Chr1 4004164 4004175 ERF11 . + 2.41e-06 0.0748 276 | Chr1 4004257 4004270 ARF7 . - 4.3e-06 0.479 277 | Chr1 4004374 4004384 LEC2 . - 4.99e-07 0.28 278 | Chr1 4004534 4004541 ERF008 . - 6.67e-06 0.101 279 | Chr1 4004593 4004602 ERF6 . - 1.33e-06 0.159 280 | Chr1 4004644 4004651 RAP2-3 . + 6.67e-06 0.07 281 | Chr1 4004644 4004651 RAP2-6 . + 6.67e-06 0.07 282 | Chr1 4004644 4004651 ERF109 . + 6.67e-06 0.0701 283 | Chr1 4004644 4004652 ERF069 . 
+ 5.17e-06 0.0506 284 | Chr1 4004647 4004656 ERF6 . - 4.62e-06 0.159 285 | Chr1 4004693 4004700 ERF008 . + 6.67e-06 0.101 286 | Chr1 4004710 4004717 RAP2-3 . + 6.67e-06 0.07 287 | Chr1 4004710 4004717 RAP2-6 . + 6.67e-06 0.07 288 | Chr1 4004710 4004717 ERF109 . + 6.67e-06 0.0701 289 | Chr1 4004710 4004718 ERF069 . + 5.17e-06 0.0506 290 | Chr1 4004716 4004724 CMTA3 . + 1.83e-06 0.202 291 | Chr1 4004751 4004771 ERF2 . + 8.69e-09 0.000108 292 | Chr1 4004752 4004770 ERF104 . + 4.9e-09 7.24e-05 293 | Chr1 4004753 4004773 ERF10 . + 1.49e-09 3.27e-05 294 | Chr1 4004754 4004773 DREB26 . + 1.06e-06 0.0134 295 | Chr1 4004754 4004773 AT4G28140 . - 3.82e-09 0.0001 296 | Chr1 4004754 4004774 RAP212 . - 1.35e-07 0.00128 297 | Chr1 4004754 4004774 ERF2 . + 3.33e-10 9.84e-06 298 | Chr1 4004755 4004769 ERF15 . + 4.67e-08 0.000571 299 | Chr1 4004755 4004773 ERF104 . + 2.43e-10 8.01e-06 300 | Chr1 4004756 4004772 AT4G18450 . - 2.29e-09 4.99e-05 301 | Chr1 4004756 4004774 ESE3 . - 1.67e-09 6.11e-05 302 | Chr1 4004756 4004776 ERF10 . + 1.72e-11 1.31e-06 303 | Chr1 4004756 4004776 LEP . + 1.64e-11 1.45e-06 304 | Chr1 4004757 4004771 AT1G44830 . + 5.84e-07 0.00709 305 | Chr1 4004757 4004771 RAP211 . + 2.18e-07 0.00205 306 | Chr1 4004757 4004771 AT1G75490 . + 1.54e-07 0.00218 307 | Chr1 4004757 4004771 RAP26 . - 3.11e-08 0.000521 308 | Chr1 4004757 4004771 PUCHI . + 1.64e-08 0.000302 309 | Chr1 4004757 4004771 ERF087 . - 1.23e-08 0.000228 310 | Chr1 4004757 4004771 ERF5 . + 4.47e-09 0.000101 311 | Chr1 4004757 4004771 ESE1 . - 2.71e-09 6.43e-05 312 | Chr1 4004757 4004776 AT4G28140 . - 8.21e-11 6.99e-06 313 | Chr1 4004757 4004776 DREB26 . + 1.59e-07 0.00495 314 | Chr1 4004757 4004777 RAP212 . - 1.53e-10 1.82e-05 315 | Chr1 4004757 4004777 ERF2 . + 1.03e-12 1.47e-07 316 | Chr1 4004757 4004777 ERF9 . - 4.22e-13 9.89e-08 317 | Chr1 4004758 4004768 ERF118 . + 7.65e-08 0.000844 318 | Chr1 4004758 4004772 AT4G16750 . + 5.16e-07 0.0115 319 | Chr1 4004758 4004772 ERF105 . 
- 4.38e-10 1.66e-05 320 | Chr1 4004758 4004772 ERF15 . + 1.98e-10 8.39e-06 321 | Chr1 4004758 4004776 ABR1 . - 3.36e-10 3.92e-05 322 | Chr1 4004758 4004776 ERF104 . + 5.1e-13 8.58e-08 323 | Chr1 4004759 4004769 ERF3 . + 7.65e-08 0.000947 324 | Chr1 4004759 4004769 AT2G33710 . + 7.65e-08 0.000947 325 | Chr1 4004759 4004773 AT5G18450 . - 2.56e-08 0.000707 326 | Chr1 4004759 4004773 AT1G22810 . - 2.22e-08 0.000651 327 | Chr1 4004759 4004775 AT4G18450 . - 1e-11 9.64e-07 328 | Chr1 4004759 4004777 ESE3 . - 5.72e-11 6.48e-06 329 | Chr1 4004759 4004779 AT1G77640 . - 4.82e-06 0.0722 330 | Chr1 4004759 4004779 ERF10 . + 4.96e-12 5.43e-07 331 | Chr1 4004759 4004779 LEP . + 3.52e-12 5.17e-07 332 | Chr1 4004760 4004770 AT3G57600 . + 2.83e-07 0.00301 333 | Chr1 4004760 4004771 CRF4 . + 3.82e-08 0.000617 334 | Chr1 4004760 4004774 CEJ1 . + 1.04e-06 0.015 335 | Chr1 4004760 4004774 AT5G67000 . - 2.14e-07 0.00625 336 | Chr1 4004760 4004774 AT1G44830 . + 1.31e-07 0.00309 337 | Chr1 4004760 4004774 RAP211 . + 1.06e-08 0.000349 338 | Chr1 4004760 4004774 RAP26 . - 1.67e-09 6.56e-05 339 | Chr1 4004760 4004774 ERF087 . - 1.02e-09 4.07e-05 340 | Chr1 4004760 4004774 PUCHI . + 7.31e-10 2.93e-05 341 | Chr1 4004760 4004774 AT1G75490 . + 1.24e-08 0.000337 342 | Chr1 4004760 4004774 ERF5 . + 1.98e-10 9.29e-06 343 | Chr1 4004760 4004774 ESE1 . - 1.98e-10 9.3e-06 344 | Chr1 4004760 4004779 DREB26 . + 2.04e-07 0.00564 345 | Chr1 4004760 4004779 AT4G28140 . - 3.77e-11 4.5e-06 346 | Chr1 4004760 4004780 RAP212 . - 1.32e-09 6.24e-05 347 | Chr1 4004760 4004780 ERF2 . + 1.65e-10 6.05e-06 348 | Chr1 4004760 4004780 ERF9 . - 4.09e-11 2.53e-06 349 | Chr1 4004761 4004771 ERF118 . + 7.65e-08 0.000844 350 | Chr1 4004761 4004775 AT4G16750 . + 5.16e-07 0.0115 351 | Chr1 4004761 4004775 ERF105 . - 4.38e-10 1.66e-05 352 | Chr1 4004761 4004775 ERF15 . + 1.98e-10 8.39e-06 353 | Chr1 4004761 4004779 ERF104 . + 3.62e-12 4.16e-07 354 | Chr1 4004761 4004779 ABR1 . 
- 2.51e-09 0.000135 355 | Chr1 4004762 4004772 ERF3 . + 7.65e-08 0.000947 356 | Chr1 4004762 4004772 AT2G33710 . + 7.65e-08 0.000947 357 | Chr1 4004762 4004776 AT5G18450 . - 2.56e-08 0.000707 358 | Chr1 4004762 4004776 AT1G22810 . - 2.22e-08 0.000651 359 | Chr1 4004762 4004778 AT4G18450 . - 4.67e-11 3.4e-06 360 | Chr1 4004762 4004780 ESE3 . - 7.68e-09 0.000166 361 | Chr1 4004762 4004782 LEP . + 9.02e-10 2.51e-05 362 | Chr1 4004763 4004773 AT3G57600 . + 2.83e-07 0.00301 363 | Chr1 4004763 4004774 CRF4 . + 3.82e-08 0.000617 364 | Chr1 4004763 4004777 CEJ1 . + 1.04e-06 0.015 365 | Chr1 4004763 4004777 AT5G67000 . - 2.14e-07 0.00625 366 | Chr1 4004763 4004777 AT1G44830 . + 1.31e-07 0.00309 367 | Chr1 4004763 4004777 RAP211 . + 1.06e-08 0.000349 368 | Chr1 4004763 4004777 RAP26 . - 1.67e-09 6.56e-05 369 | Chr1 4004763 4004777 ERF087 . - 1.02e-09 4.07e-05 370 | Chr1 4004763 4004777 PUCHI . + 7.31e-10 2.93e-05 371 | Chr1 4004763 4004777 AT1G75490 . + 1.24e-08 0.000337 372 | Chr1 4004763 4004777 ERF5 . + 1.98e-10 9.29e-06 373 | Chr1 4004763 4004777 ESE1 . - 1.98e-10 9.3e-06 374 | Chr1 4004763 4004783 RAP212 . - 3.23e-08 0.000499 375 | Chr1 4004763 4004783 ERF9 . - 1.12e-08 0.00013 376 | Chr1 4004764 4004774 ERF118 . + 7.65e-08 0.000844 377 | Chr1 4004764 4004778 AT4G16750 . + 1.75e-06 0.02 378 | Chr1 4004764 4004778 ERF105 . - 6.12e-09 0.000131 379 | Chr1 4004764 4004778 ERF15 . + 2.81e-09 6.94e-05 380 | Chr1 4004764 4004782 ABR1 . - 8.01e-08 0.00116 381 | Chr1 4004765 4004775 ERF3 . + 7.65e-08 0.000947 382 | Chr1 4004765 4004775 AT2G33710 . + 7.65e-08 0.000947 383 | Chr1 4004765 4004779 AT5G18450 . - 3.55e-07 0.00402 384 | Chr1 4004765 4004779 AT1G22810 . - 1.88e-07 0.00284 385 | Chr1 4004766 4004776 AT3G57600 . + 2.83e-07 0.00301 386 | Chr1 4004766 4004777 CRF4 . + 3.82e-08 0.000617 387 | Chr1 4004766 4004780 RAP211 . + 2.68e-08 0.000729 388 | Chr1 4004767 4004777 ERF118 . + 7.65e-08 0.000844 389 | Chr1 4004768 4004778 ERF3 . 
+ 4.49e-07 0.0035 390 | Chr1 4004768 4004778 AT2G33710 . + 3.56e-07 0.00302 391 | Chr1 4004769 4004782 AT1G36060 . - 1.49e-06 0.1 392 | Chr1 4004769 4004782 ERF017 . - 3.65e-08 0.064 393 | Chr1 4004834 4004841 ERF008 . + 6.67e-06 0.101 394 | Chr1 4005108 4005128 COG1 . + 2.8e-06 0.0424 395 | Chr1 4005169 4005181 ZHD1 . + 4.72e-06 0.14 396 | Chr1 4005172 4005181 ATHB23 . + 4.87e-06 0.124 397 | Chr1 4005222 4005236 AT3G46070 . - 7.32e-06 0.429 398 | Chr1 4005552 4005566 ZHD6 . - 1.56e-07 0.084 399 | Chr1 4005553 4005562 ATHB23 . - 7.3e-06 0.138 400 | Chr1 4005776 4005785 ATHB23 . - 9.73e-06 0.153 401 | Chr1 4005779 4005793 ATHB34 . - 5.15e-06 0.167 402 | Chr1 4005859 4005868 WRKY40 . + 5.98e-06 0.352 403 | Chr1 4005941 4005969 AT5G66940 . + 1.72e-06 0.013 404 | Chr1 4006116 4006126 REF6 . + 6.08e-07 0.182 405 | Chr1 4006183 4006212 BPC5 . + 3.23e-10 6.32e-06 406 | Chr1 4006187 4006216 BPC5 . + 1.09e-09 1.79e-05 407 | Chr1 4006188 4006211 BPC1 . + 6.88e-09 7.67e-05 408 | Chr1 4006189 4006218 BPC5 . + 9.8e-13 3.88e-08 409 | Chr1 4006191 4006220 BPC5 . + 8.75e-15 5.32e-10 410 | Chr1 4006192 4006212 BPC6 . - 4.25e-08 0.000425 411 | Chr1 4006192 4006215 BPC1 . + 4.19e-12 1.37e-07 412 | Chr1 4006193 4006222 BPC5 . + 4.26e-16 3.18e-11 413 | Chr1 4006194 4006214 BPC6 . - 7.6e-12 2.74e-07 414 | Chr1 4006194 4006217 BPC1 . + 1.88e-13 8.38e-09 415 | Chr1 4006195 4006224 BPC5 . + 4.25e-16 3.18e-11 416 | Chr1 4006196 4006216 BPC6 . - 1.28e-09 2.35e-05 417 | Chr1 4006196 4006219 BPC1 . + 1.36e-13 6.14e-09 418 | Chr1 4006197 4006226 BPC5 . + 4.26e-16 3.18e-11 419 | Chr1 4006198 4006211 RAMOSA1 . + 3.63e-08 0.00049 420 | Chr1 4006198 4006218 BPC6 . - 3.01e-11 8.79e-07 421 | Chr1 4006198 4006221 BPC1 . + 1.05e-13 4.88e-09 422 | Chr1 4006199 4006228 BPC5 . + 7.5e-19 8.4e-14 423 | Chr1 4006200 4006213 RAMOSA1 . + 3.48e-09 7.3e-05 424 | Chr1 4006200 4006220 BPC6 . - 1.86e-13 8.8e-09 425 | Chr1 4006200 4006223 BPC1 . + 3.16e-15 2.04e-10 426 | Chr1 4006201 4006230 BPC5 . 
+ 7.5e-19 8.4e-14 427 | Chr1 4006202 4006215 RAMOSA1 . + 3.48e-09 7.3e-05 428 | Chr1 4006202 4006222 BPC6 . - 1.86e-13 8.8e-09 429 | Chr1 4006202 4006225 BPC1 . + 3.16e-15 2.04e-10 430 | Chr1 4006203 4006232 BPC5 . + 7.5e-19 8.4e-14 431 | Chr1 4006204 4006217 RAMOSA1 . + 3.48e-09 7.3e-05 432 | Chr1 4006204 4006224 BPC6 . - 1.86e-13 8.8e-09 433 | Chr1 4006204 4006227 BPC1 . + 3.16e-15 2.04e-10 434 | Chr1 4006205 4006234 BPC5 . + 2.05e-15 1.41e-10 435 | Chr1 4006206 4006219 RAMOSA1 . + 3.48e-09 7.3e-05 436 | Chr1 4006206 4006226 BPC6 . - 1.86e-13 8.8e-09 437 | Chr1 4006206 4006229 BPC1 . + 3.16e-15 2.04e-10 438 | Chr1 4006207 4006236 BPC5 . + 5.12e-12 1.72e-07 439 | Chr1 4006208 4006221 RAMOSA1 . + 3.48e-09 7.3e-05 440 | Chr1 4006208 4006228 BPC6 . - 1.86e-13 8.8e-09 441 | Chr1 4006208 4006231 BPC1 . + 3.16e-15 2.04e-10 442 | Chr1 4006209 4006238 BPC5 . + 1.51e-13 7.3e-09 443 | Chr1 4006210 4006223 RAMOSA1 . + 3.48e-09 7.3e-05 444 | Chr1 4006210 4006230 BPC6 . - 1.86e-13 8.8e-09 445 | Chr1 4006210 4006233 BPC1 . + 2.64e-14 1.5e-09 446 | Chr1 4006212 4006225 RAMOSA1 . + 3.48e-09 7.3e-05 447 | Chr1 4006212 4006232 BPC6 . - 1.86e-13 8.8e-09 448 | Chr1 4006212 4006235 BPC1 . + 8.32e-12 2.49e-07 449 | Chr1 4006214 4006227 RAMOSA1 . + 3.48e-09 7.3e-05 450 | Chr1 4006214 4006234 BPC6 . - 8.29e-12 2.9e-07 451 | Chr1 4006214 4006237 BPC1 . + 1.12e-10 2.43e-06 452 | Chr1 4006216 4006229 RAMOSA1 . + 3.48e-09 7.3e-05 453 | Chr1 4006216 4006236 BPC6 . - 1.32e-09 2.42e-05 454 | Chr1 4006218 4006231 RAMOSA1 . + 3.48e-09 7.3e-05 455 | Chr1 4006220 4006233 RAMOSA1 . + 6.34e-09 0.000117 456 | Chr1 4006312 4006321 ERF7 . - 2.34e-06 0.113 457 | Chr1 4006312 4006321 ERF8 . - 2.34e-06 0.115 458 | Chr1 4006312 4006323 ERF11 . - 6.47e-06 0.0878 459 | Chr1 4006312 4006323 ERF4 . - 5.49e-06 0.0839 460 | Chr1 4006314 4006321 ERF1B . - 8.13e-06 0.12 461 | Chr1 4006314 4006321 ERF13 . - 8.13e-06 0.12 462 | Chr1 4006327 4006334 ERF008 . - 6.67e-06 0.101 463 | Chr1 4006339 4006348 ERF7 . 
- 9.64e-06 0.118 464 | Chr1 4006432 4006442 AT3G57600 . + 2.83e-07 0.00301 465 | Chr1 4006432 4006445 AT1G36060 . - 9.53e-06 0.124 466 | Chr1 4006436 4006443 ERF008 . + 6.67e-06 0.101 467 | Chr1 4006500 4006509 abi4 . + 1.84e-06 0.0682 468 | Chr1 4006502 4006509 RAP2-3 . + 6.67e-06 0.07 469 | Chr1 4006502 4006509 RAP2-6 . + 6.67e-06 0.07 470 | Chr1 4006502 4006509 ERF109 . + 6.67e-06 0.0701 471 | Chr1 4006503 4006510 ERF008 . + 6.67e-06 0.101 472 | Chr1 4006620 4006634 At5g05790 . + 5.6e-06 0.347 473 | Chr1 4006679 4006699 TRP2 . - 6.24e-06 0.298 474 | Chr1 4006688 4006706 dof4.2 . - 8.21e-06 0.0838 475 | Chr1 4006862 4006888 AT1G69570 . - 1.16e-07 0.003 476 | Chr1 4006868 4006888 COG1 . + 4.79e-08 0.00795 477 | Chr1 4006870 4006898 AT5G66940 . - 8.61e-08 0.00226 478 | Chr1 4006871 4006891 Adof1 . + 3.38e-07 0.00921 479 | Chr1 4006872 4006890 dof4.2 . + 1.95e-07 0.0235 480 | Chr1 4006872 4006892 Adof1 . + 1.01e-06 0.0163 481 | Chr1 4006873 4006893 OBP3 . - 1.05e-07 0.00287 482 | Chr1 4006874 4006894 OBP3 . - 2.09e-07 0.00412 483 | Chr1 4006875 4006888 PI . + 8e-06 0.155 484 | Chr1 4006876 4006889 OBP4 . + 9.69e-07 0.0923 485 | Chr1 4006876 4006896 AT5G02460 . + 4.76e-07 0.00809 486 | Chr1 4006876 4006896 AT2G28810 . - 1.2e-07 0.00572 487 | Chr1 4006877 4006897 OBP1 . - 1.25e-07 0.00512 488 | Chr1 4007043 4007055 GBF2 . - 8.58e-06 0.447 489 | Chr1 4007045 4007056 HYH . - 4.13e-06 0.359 490 | Chr1 4007046 4007053 PIF4 . - 9.9e-06 0.439 491 | Chr1 4007283 4007302 DREB26 . + 1.14e-06 0.0139 492 | Chr1 4007289 4007303 CEJ1 . + 4.58e-07 0.015 493 | Chr1 4007289 4007303 AT1G44830 . + 8.94e-08 0.00309 494 | Chr1 4007289 4007303 AT1G75490 . + 5.64e-08 0.00117 495 | Chr1 4007347 4007361 AT5G18450 . - 5.01e-07 0.00483 496 | Chr1 4007347 4007367 AT1G77640 . - 4.46e-06 0.0705 497 | Chr1 4007348 4007361 AT1G36060 . - 1.45e-06 0.1 498 | Chr1 4007352 4007359 ERF008 . 
"""Per-bin feature scores for a target gene region, written as bedGraph/BED files."""
import os
import random
import pandas as pd
import numpy as np
from scipy import stats
from pybedtools import BedTool

from lib import misc


class Geneinfo():
    """
    Provide gene information and the binning parameters used by every scorer.

    Attributes (defaults are placeholders that callers overwrite):
        gene    - Gene identifier
        alias   - Gene alias; "NA" means "no alias, use the gene identifier"
        chrom   - Chromosome name
        start   - Start coordinate of the scored region
        end     - End coordinate of the scored region (exclusive bound of the bin range)
        strand  - Strand of the gene
        binsize - Width of each bin
        step    - Distance between consecutive bin starts
    """
    def __init__(self):
        self.gene = "Gene"
        self.alias = "NA"
        self.chrom = "chrom"
        self.start = 0
        self.end = 1000
        self.strand = "+"
        self.binsize = 10
        self.step = 10


def _gene_alias(geneinfo):
    # Name of the per-gene output sub-directory: the alias unless it is "NA"
    return geneinfo.gene if geneinfo.alias == "NA" else geneinfo.alias


def _bin_positions(geneinfo):
    # Map bin index -> bin start coordinate over range(start, end, step)
    return dict(enumerate(range(geneinfo.start, geneinfo.end, geneinfo.step)))


def _bin_bed(chromosome, pos, binsize):
    # Single-interval BedTool covering the bin [pos, pos+binsize)
    return BedTool("\t".join([chromosome, str(pos), str(pos+binsize)])+"\n",
                   from_string=True)


def _output_path(outdir, gene_alias, filename):
    # Path of an output file inside the per-gene sub-directory
    return outdir + "/" + gene_alias + "/" + filename


def _fields(interval):
    # Whitespace-separated columns of one BedTool interval
    return str(interval).split()


def _mean_or_zero(values):
    # Average of the values; 0 when there are none or when they sum to zero
    return np.average(values) if sum(values) else 0


def _scale_in_place(scores):
    # Divide every value by the maximum; values are left untouched when the
    # maximum is 0 (or the mapping is empty) so no division by zero can occur
    peak = max(scores.values(), default=0)
    if peak:
        for pos in scores:
            scores[pos] = scores[pos] / peak
    return scores


def _write_bedgraph(path, chromosome, binsize, scores):
    # Write a {bin start: score} mapping as tab-separated bedGraph lines
    with open(path, "w") as outf:
        for pos, score in scores.items():
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)


def kmeans_like_diff(wt, control):
    """
    Calculate the k-means-like phenotype difference between mutants and WT.
    The length of the mutations is not considered.

    Mandatory parameters:
        1. wt - Phenotype values of the WT samples
        2. control - Phenotype values of the mutant samples

    Returns:
        Mean absolute deviation of the mutant values from the WT average
        (0 when there are no mutant values).
    """
    if len(control) == 0:
        return 0
    y_avg = np.average(wt)
    return sum(abs(x - y_avg) for x in control) / len(control)


def kmeans_like_diff2(wt, control, binsize):
    """
    Calculate the k-means-like phenotype difference between mutants and WT,
    weighting every mutant by its length relative to the bin size.

    Mandatory parameters:
        1. wt - Phenotype values of the WT samples
        2. control - Mutant records; column 1/2 are start/end, column 4 is the value
        3. binsize - Bin width used to express the mutation length

    Returns:
        Mean length-weighted absolute deviation from the WT average
        (0 when there are no mutant records).
    """
    if len(control) == 0:
        return 0
    y_avg = np.average(wt)
    diff = 0
    for x in control:
        diff += abs(float(x[4]) - y_avg) * ((int(x[2]) - int(x[1]))/binsize)
    return diff / len(control)


def openchromatin_scores(geneinfo, bedfile, peakfile = "", outdir = "./", samplename = "openchromatin"):
    """
    Generate the open chromatin feature in specific bins.
    (Alternative data: ATAC-seq, DNase-seq, MNase-seq)

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. bedfile - Open chromatin values in bedGraph format

    Optional parameters:
        1. peakfile - Enrichment regions called from open chromatin data in BED format;
                      bins overlapping a peak keep full weight, other bins are halved
        2. outdir - Output directory for saving the scores file (bedGraph format)
        3. samplename - Basename of the output bedGraph file

    Returns:
        {bin start: score} scaled by the highest bin average. An empty dict is
        returned (and no file is written) when bedfile does not exist or when
        no bin carries any signal.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)
    if not os.path.exists(bedfile):
        return {}

    # Load bedGraph file (and the optional peak file) as bed files
    oc_peak = BedTool(peakfile) if peakfile else None
    oc_score = BedTool(bedfile)

    # Calculate scores
    posinfo = _bin_positions(geneinfo)
    oc_info = []
    overlap_list = []
    for pos in posinfo.values():
        binbed = _bin_bed(chromosome, pos, binsize)
        oc_info.append(_mean_or_zero([float(_fields(x)[3]) for x in oc_score.intersect(binbed)]))
        # Weight 1 for bins that touch a peak, 0.5 otherwise (also without a peak file)
        if oc_peak is not None and any(True for _ in oc_peak.intersect(binbed)):
            overlap_list.append(1)
        else:
            overlap_list.append(0.5)

    # Scale the scores
    max_score = max(oc_info, default=0)
    if not max_score:
        return {}
    smooth_openchromatin = {pos: oc_info[i]*overlap_list[i]/max_score
                            for i, pos in posinfo.items()}
    _write_bedgraph(_output_path(outdir, gene_alias, samplename + ".bedGraph"),
                    chromosome, binsize, smooth_openchromatin)

    return smooth_openchromatin


def ptm_scores(geneinfo, bedfile, ocname, outdir = "./", samplename = "PTM", minratio = 0.2):
    """
    Generate the histone modification feature in specific bins.
    (Alternative data: ChIP-seq)

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. bedfile - histone modification values in bedGraph format
        3. ocname - Basename of the open chromatin bedGraph previously written to
                    outdir/<gene>/ (read back as per-bin ratios)

    Optional parameters:
        1. outdir - Output directory for saving the scores file (bedGraph format)
        2. samplename - Basename of the output bedGraph file
        3. minratio - Lower bound applied to the open chromatin ratios

    Returns:
        {bin start: score}, where score is (1 - smoothed PTM) * open chromatin
        ratio, scaled by its maximum.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bedGraph file as bed file
    ptm_score = BedTool(bedfile)

    # Calculate scores
    posinfo = _bin_positions(geneinfo)
    ptm_info = []
    for pos in posinfo.values():
        binbed = _bin_bed(chromosome, pos, binsize)
        values = [float(_fields(x)[3]) for x in ptm_score.intersect(binbed)]
        # Empty bins score 0 directly instead of averaging an empty list
        score = np.average(values) if values else 0
        if pd.isna(score):
            score = 0
        ptm_info.append(score)

    # Without any signal the scores stay at 0 instead of dividing by zero
    max_score = max(ptm_info, default=0)
    if max_score:
        ptm_info = [x/max_score for x in ptm_info]

    # Get ratios from open chromatin results
    ocfile = _output_path(outdir, gene_alias, ocname + ".bedGraph")
    oc_ratios = {}
    for interval in BedTool(ocfile):
        chrom, start, end, score = str(interval).rstrip().split("\t")
        if float(score) > minratio:
            oc_ratios[int(start)] = float(score)
        else:
            oc_ratios[int(start)] = minratio

    # Smooth the scores
    smooth_ptm = misc.smooth_scores_fill2(ptm_info, posinfo)
    weighted = {pos: (1 - smooth_ptm[pos]) * oc_ratios[pos] for pos in smooth_ptm}
    max_score2 = max(weighted.values(), default=0)
    for pos, value in weighted.items():
        smooth_ptm[pos] = value / max_score2 if max_score2 else value
    _write_bedgraph(_output_path(outdir, gene_alias, samplename + ".bedGraph"),
                    chromosome, binsize, smooth_ptm)

    return smooth_ptm


def merge_reps(geneinfo, feature_list, outdir = "./", samplename = "merged"):
    """
    Merge the NGS feature in specific bins.
    (Alternative data: DNase-seq, ATAC-seq, ChIP-seq)

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. feature_list - A list contains features need to be merged
                          (each a {bin start: score} dict; empty ones are skipped)

    Optional parameters:
        1. outdir - Output directory for saving the scores file (bedGraph format)
        2. samplename - Basename of the output bedGraph file

    Returns:
        {bin start: summed score scaled by the maximum sum}; an empty dict
        (and no file) when every feature is empty.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Merge the feature scores
    features = [feature for feature in feature_list if feature]
    scores_merge = {}
    if not features:
        return scores_merge
    # Positions come from the first non-empty feature, so an empty leading
    # replicate no longer discards the merged result
    positions = list(features[0])
    scorelist = [features[0][pos] for pos in positions]
    for feature in features[1:]:
        # Replicates are combined by bin order, not by coordinate
        for i, pos in enumerate(feature):
            scorelist[i] += feature[pos]
    max_score = max(scorelist)
    for i, pos in enumerate(positions):
        scores_merge[pos] = scorelist[i] / max_score if max_score else scorelist[i]
    _write_bedgraph(_output_path(outdir, gene_alias, samplename + ".bedGraph"),
                    chromosome, binsize, scores_merge)

    return scores_merge


def motif_scores(geneinfo, bedfile, outdir = "./", flanking = 3):
    """
    Generate the TF motifs feature in specific bins.
    (Alternative data: Motif sites calculated by FIMO with PlantTFDB/JASPAR PWM files)

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. bedfile - Motif positions in BED format

    Optional parameters:
        1. outdir - Output directory for saving the scores file (bedGraph format)
        2. flanking - Number of bins on each side included in the moving average

    Returns:
        {bin start: motif density score scaled by its maximum}.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    tf_motif = BedTool(bedfile)

    # Calculate the span covered by motifs inside every bin
    posinfo = _bin_positions(geneinfo)
    motif_density = []
    for pos in posinfo.values():
        binbed = _bin_bed(chromosome, pos, binsize)
        edges = []
        for x in tf_motif.intersect(binbed):
            columns = _fields(x)
            edges.extend((int(columns[1]), int(columns[2])))
        if edges:
            motif_density.append((max(edges) - min(edges)) / binsize)
        else:
            motif_density.append(0)

    # Moving average over +/- flanking bins; the window is truncated at both
    # ends of the region and always divided by the number of bins it holds
    motif_info = []
    for i in range(len(motif_density)):
        window = motif_density[max(i-flanking, 0):i+flanking+1]
        motif_info.append(sum(window) / len(window))

    # Smooth the scores
    smooth_motif = misc.smooth_scores_fill2(motif_info, posinfo, minratio=1)
    _scale_in_place(smooth_motif)
    _write_bedgraph(_output_path(outdir, gene_alias, "motifs.bedGraph"),
                    chromosome, binsize, smooth_motif)

    return smooth_motif


def cns_scores(geneinfo, bedfile, outdir = "./"):
    """
    Generate the conservation feature in specific bins.
    (Alternative data: Phastcons scores)

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. bedfile - Conservation scores in BED format (score in the 4th column)

    Optional parameters:
        1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
        {bin start: smoothed conservation score}; an empty dict (and no file)
        when bedfile does not exist.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)
    if not os.path.exists(bedfile):
        return {}

    # Load bed file
    cns = BedTool(bedfile)

    # Calculate scores
    posinfo = _bin_positions(geneinfo)
    cns_info = []
    for pos in posinfo.values():
        binbed = _bin_bed(chromosome, pos, binsize)
        cns_info.append(_mean_or_zero([float(_fields(x)[3]) for x in cns.intersect(binbed)]))

    # Smooth the scores
    smooth_cns = misc.smooth_scores2(cns_info, posinfo)
    _write_bedgraph(_output_path(outdir, gene_alias, "CNS.bedGraph"),
                    chromosome, binsize, smooth_cns)

    return smooth_cns


def genopheno_scores(geneinfo, bedfile, outdir = "./"):
    """
    Generate the genotype and phenotype relationship feature in specific bins.
    (Alternative data: SNPs&Indels and Phenotype data)

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. bedfile - genotype and phenotype relationship scores in BED format
                     (score in the last column)

    Optional parameters:
        1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
        {bin start: smoothed score scaled by its maximum}.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    genopheno = BedTool(bedfile)

    # Calculate scores: sum of all values that fall inside the bin
    posinfo = _bin_positions(geneinfo)
    genopheno_info = []
    for pos in posinfo.values():
        binbed = _bin_bed(chromosome, pos, binsize)
        genopheno_info.append(sum(float(_fields(x)[-1]) for x in genopheno.intersect(binbed)))

    # Smooth the scores
    smooth_genopheno = misc.smooth_scores_fill2(genopheno_info, posinfo)
    _scale_in_place(smooth_genopheno)
    _write_bedgraph(_output_path(outdir, gene_alias, "genopheno.bedGraph"),
                    chromosome, binsize, smooth_genopheno)

    return smooth_genopheno


def aggregate_scores(geneinfo, scorelist, weightlist, outdir = "./"):
    """
    Generate the aggregate score in specific bins.

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. scorelist - A list host multiple feature scores from different data
        3. weightlist - A list contains different weights assigned to different features
                        (Should have the same order and numbers as scorelist)

    Optional parameters:
        1. outdir - Output directory for saving the scores file (bedGraph format)

    Returns:
        {bin start: weighted average of the feature scores}. The returned
        values are NOT scaled; only the bedGraph file holds the values divided
        by their maximum. The file is left empty when there are no scores or
        when the maximum is 0.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Calculate scores; the context manager closes the file on every exit path
    with open(_output_path(outdir, gene_alias, "aggregate.bedGraph"), "w") as outf:
        aggregate_info = {}
        total = sum(weightlist)
        for feature, weight in zip(scorelist, weightlist):
            for pos in feature:
                aggregate = feature[pos] * weight / total
                if pos in aggregate_info:
                    aggregate_info[pos] += aggregate
                else:
                    aggregate_info[pos] = aggregate
        if not aggregate_info:
            print(gene_alias)
            return aggregate_info
        max_score = max(aggregate_info.values())
        if not max_score:
            return aggregate_info
        for pos in aggregate_info:
            score = aggregate_info[pos] / max_score
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outf)

    return aggregate_info


def phenodata_scores(geneinfo, bedfile, method = "kmeans1", outdir = "./", randbg = 0.02):
    """
    Calculate the average phenodata value from multiple samples in specific bins.

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. bedfile - Phenotype data of mutants in BED format
                     chrom start end samplename avg_value
                     (rows whose samplename is "WT" are the wild type)

    Optional parameters:
        1. method - Methods used for calculating phenotype difference between WT and mutants
                    ["ratio", "stdev", "utest", "kmeans1", "kmeans2"]
        2. outdir - Output directory for saving the scores file (bedGraph format)
        3. randbg - Half-width of the uniform noise added to every non-zero raw score

    Returns:
        {bin start: smoothed score, min-max scaled over the non-zero bins}.

    Raises:
        ValueError - when method is not one of the available methods
    """

    methods = ["ratio", "stdev", "utest", "kmeans1", "kmeans2"]
    # Fail early: an unknown method used to leave the bin score undefined
    if method not in methods:
        raise ValueError("Cannot find this method. Available methods are: %s" % methods)

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Load bed file
    phenodata = BedTool(bedfile)

    # Calculate scores
    posinfo = _bin_positions(geneinfo)
    phenoinfo = []
    for pos in posinfo.values():
        binbed = _bin_bed(chromosome, pos, binsize)
        records = [_fields(x)[:5] for x in phenodata.intersect(binbed)]
        mutant_records = [rec for rec in records if rec[3] != "WT"]
        mutant_phenos = [float(rec[4]) for rec in mutant_records]
        wt_phenos = [float(rec[4]) for rec in records if rec[3] == "WT"]
        if not mutant_phenos:
            score = 0
        elif method == "ratio":
            score = np.average(mutant_phenos) / np.average(wt_phenos)
        elif method == "stdev":
            score = np.std(wt_phenos + mutant_phenos)
        elif method == "utest":
            score = -np.log10(stats.mannwhitneyu(wt_phenos, mutant_phenos)[1])
        elif method == "kmeans1":
            score = kmeans_like_diff(wt_phenos, mutant_phenos)
        else:
            # "kmeans2" needs the coordinates as well as the values
            score = kmeans_like_diff2(wt_phenos, mutant_records, binsize)
        phenoinfo.append(score)

    # Scale the raw scores and add a small reproducible background noise
    max_score = max(phenoinfo, default=0)
    # NOTE(review): this reseeds the module-level RNG on every call; kept as is
    # because code running afterwards may rely on it - confirm before changing
    random.seed(81)
    phenoinfo = [max(x/max_score+random.uniform(-randbg, randbg), 0) if x else x for x in phenoinfo]
    # Output raw scores of phenotypes
    _write_bedgraph(_output_path(outdir, gene_alias, "phenoscores_" + method + "_raw.bedGraph"),
                    chromosome, binsize, {pos: phenoinfo[i] for i, pos in posinfo.items()})
    # Output smooth and gap-filled scores of phenotypes
    smooth_phenos = misc.smooth_scores1(phenoinfo, posinfo)
    nonzero = [x for x in smooth_phenos.values() if x]
    # With no signal, or a single distinct value, every bin scores 0 instead of
    # failing on an empty min() or a zero-width range
    if nonzero:
        min_score = min(nonzero)
        span = max(smooth_phenos.values()) - min_score
    else:
        min_score = 0
        span = 0
    for pos in smooth_phenos:
        if smooth_phenos[pos] and span:
            smooth_phenos[pos] = (smooth_phenos[pos] - min_score) / span
        else:
            smooth_phenos[pos] = 0
    _write_bedgraph(_output_path(outdir, gene_alias, "phenoscores_" + method + ".bedGraph"),
                    chromosome, binsize, smooth_phenos)

    return smooth_phenos


def define_key_regions(geneinfo, aggregate, phenodata, threshold = 0, outdir = "./"):
    """
    Define the key regions of the target site.

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. aggregate - Aggregate scores ({bin start: score})
        3. phenodata - Path of the phenotype BED file; the statistics are skipped
                       when the path does not exist

    Optional parameters:
        1. threshold - Bin with score above the threshold is defined as a key region
                       (Default: average of aggregate scores)
        2. outdir - Output directory for saving the scores file (bedGraph format)

    Output files (in outdir/<gene>/):
        1. key_regions_raw.bed - Every bin at or above the cutoff
        2. key_regions_merged.bed - Regions returned by misc.merge_regions
        3. plot_scores.txt - Per-sample ratio and difference for R/ggplot2
                             (only with phenotype data)
        4. statistics.txt - Cutoff, differential ratio and significance tests
                            (only with phenotype data)

    Returns:
        List of [bin start, score] for the bins at or above the cutoff.
    """

    # Get gene info
    gene_alias = _gene_alias(geneinfo)
    chromosome = geneinfo.chrom
    binsize = geneinfo.binsize

    # Check output directory
    misc.check_outdir(outdir)

    # Define the cutoff (a threshold of 0 means "use the average")
    if threshold:
        cutoff = threshold
        cutoff_dev = 0
    else:
        cutoff_dev = np.std(list(aggregate.values()))
        cutoff = np.average(list(aggregate.values()))

    # Classify key regions and other regions
    key_regions = [[pos, score] for pos, score in aggregate.items() if score >= cutoff]

    # Output key regions info
    merged_regions = misc.merge_regions(key_regions, geneinfo)
    with open(_output_path(outdir, gene_alias, "key_regions_raw.bed"), "w") as outregion1:
        for pos, score in key_regions:
            print(chromosome, pos, pos+binsize, score, sep="\t", file=outregion1)
    # The merged file must be closed before new_stats() reads it back
    with open(_output_path(outdir, gene_alias, "key_regions_merged.bed"), "w") as outregion2:
        for lst in merged_regions:
            print("\t".join(list(map(str, lst))), file=outregion2)

    if not os.path.exists(phenodata):
        print("No Phenotype data detected, output key regions.")
        return key_regions

    # Calculate statistical values
    with open(_output_path(outdir, gene_alias, "plot_scores.txt"), "w") as outf, \
            open(_output_path(outdir, gene_alias, "statistics.txt"), "w") as outstat:
        print("sample", "group", "ratio", "difference", sep="\t", file=outf)
        # Cutoff of key regions definition
        print("Cutoff for defining key regions: %s" % cutoff, file=outstat)
        print("Cutoff deviation: %s" % cutoff_dev, file=outstat)
        # Calculate difference
        pheno_all = new_stats(geneinfo, phenodata, outdir = outdir)
        ratios = [x[1] for x in pheno_all]
        mean_ratio = np.average(ratios)
        min_ratio = min(ratios)
        max_ratio = max(ratios)
        high_edited = []
        high_edited2 = []
        low_edited = []
        low_edited2 = []
        for diff, ratio, sample in pheno_all:
            # NOTE(review): the difference is rescaled with the bounds of the
            # ratios, not of the differences - looks unintended, confirm
            rescaled = (diff-min_ratio)/(max_ratio-min_ratio)
            if ratio > mean_ratio:
                high_edited.append(diff)
                high_edited2.append(rescaled)
                print(sample, "high", ratio, diff, sep="\t", file=outf)
            else:
                low_edited.append(diff)
                low_edited2.append(rescaled)
                print(sample, "low", ratio, diff, sep="\t", file=outf)

        phe_ratio = np.average(high_edited2) / np.average(low_edited2)
        phe_pvalue = stats.mannwhitneyu(low_edited, high_edited)
        phe_pvalue2 = stats.ks_2samp(low_edited, high_edited, alternative="greater")
        phe_pvalue3 = stats.f_oneway(low_edited, high_edited)
        report = [("Phenotype differential ratio:", phe_ratio),
                  ("Phenotype significance (U test):", phe_pvalue[1]),
                  ("Phenotype significance (KS test):", phe_pvalue2[1]),
                  ("Phenotype significance (ANOVA):", phe_pvalue3[1])]
        # Same lines go to the console first and then to the statistics file
        for label, value in report:
            print(label, value)
        for label, value in report:
            print(label, value, file=outstat)

    return key_regions


def new_stats(geneinfo, phenodata, outdir = "./", side="both"):
    """
    Relate every mutant's phenotype change to how much of the key regions it covers.

    Mandatory parameters:
        1. geneinfo - A class that defines the information of target gene
        2. phenodata - Phenotype data in BED format
                       chrom start end samplename avg_value
                       (the row whose samplename is "WT" gives the baseline)

    Optional parameters:
        1. outdir - Output directory that already holds <gene>/key_regions_merged.bed
        2. side - "none" uses the absolute phenotype difference; any other value
                  keeps the sign (and flips all signs when the mean is negative)

    Returns:
        List of (difference, ratio, sample) sorted by ratio, highest first.
        ratio is the covered fraction of the key regions divided by the
        largest such fraction.

    Raises:
        ValueError - when the phenotype data holds no "WT" row
    """

    pheno_bed = BedTool(phenodata)
    gene_alias = _gene_alias(geneinfo)
    key_regions = _output_path(outdir, gene_alias, "key_regions_merged.bed")
    region_bed = BedTool(key_regions)
    regionlens = sum([int(_fields(x)[2])-int(_fields(x)[1]) for x in region_bed])
    # wao=True appends the number of overlapping bases as the last column
    records = [str(interval).rstrip().split("\t")
               for interval in pheno_bed.intersect(region_bed, wao=True)]

    # Mutant rows listed before the WT row use the first WT value as baseline
    # (they used to fail on an unassigned variable)
    wt_value = next((float(info[4]) for info in records if info[3] == "WT"), None)
    if wt_value is None:
        raise ValueError("No WT record found in phenotype data: %s" % phenodata)

    sample_values = {}
    for info in records:
        sample = info[3]
        pheno = float(info[4])
        if sample == "WT":
            wt_value = pheno
            continue
        length = int(info[-1])
        if sample not in sample_values:
            if side == "none":
                phenoscore = abs(pheno - wt_value)
            else:
                phenoscore = pheno - wt_value
            sample_values[sample] = [phenoscore, 0]
        sample_values[sample][1] += length / regionlens
    max_ratio = max([x[1] for x in sample_values.values()])
    mean_pheno = np.average([x[0] for x in sample_values.values()])
    # Report the differences in the direction of the overall effect
    if mean_pheno < 0:
        for s in sample_values:
            sample_values[s][0] *= -1
    scores_list = sorted([(sample_values[s][0], sample_values[s][1]/max_ratio, s) for s in sample_values],
                         key=lambda x:x[1], reverse=True)

    return scores_list